mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-12-31 05:39:06 +03:00
Release 5.4.12 (#341)
Multiple changes since last release, this will be the last 100% ABI and API compatible with Hyperscan release. Next versions will include major refactors and API extensions, it will be mostly backwards compatible however. Without particular order, platform support is now: * Linux (x86, Arm, Power) * FreeBSD 14 (x86, Arm, Power) * MacOS 14+ (x86, Arm) In total more than 200 configurations in the CI are tested for every PR. Other features: - Fat Runtime supported for Arm as well (ASIMD/SVE/SVE2). - Initial implementations for Arm SVE/SVE2 algorithms added, thanks to Yoan Picchi from Arm. - SIMDe support added, used as an alternative backend for existing platforms, but mostly interesting for allowing Vectorscan to build in new platforms without a supported SIMD engine. - Various speedups and optimizations. - Cppcheck and clang-tidy fixes throughout the code, both have been added to CI for multiple configurations, but only cppcheck triggers a build failure for now. Various bugfixes, most important listed: - Speed up truffle with 256b TBL instructions (#290) - Fix Clang Tidy warnings (#295) - Clang 17+ is more restrictive on rebind<T> on MacOS/Boost, remove warning (#332) - partial_load_u64 will fail if buf == NULL/c_len == 0 (#331) - Bugfix/fix avx512vbmi regressions (#335) - fix missing hs_version.h header (closes #198) - hs_valid_platform: Fix check for SSE4.2 (#310) - Fixed out of bounds read in AVX512VBMI version of fdr_exec_fat_teddy … (#333) - Fix noodle SVE2 off by one bug (#313) - Make vectorscan accept \0 starting pattern (#312) - Fix 5.4.11's config step regression (#327) - Fix double shufti's vector end false positive (#325)
This commit is contained in:
11
.clang-tidy
Normal file
11
.clang-tidy
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
#unit/gtest/gtest-all.cc,build/src/parser/Parser.cpp,build/src/parser/control_verbs.cpp
|
||||||
|
#Dont change first comment ignores specific files from clang-tidy
|
||||||
|
|
||||||
|
|
||||||
|
Checks: 'clang-analyzer-*,-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,performance-*,-performance-unnecessary-value-param,-performance-avoid-endl'
|
||||||
|
WarningsAsErrors: ''
|
||||||
|
HeaderFilterRegex: '.*'
|
||||||
|
SystemHeaders: false
|
||||||
|
FormatStyle: none
|
||||||
|
InheritParentConfig: true
|
||||||
|
User: user
|
||||||
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
[submodule "simde"]
|
||||||
|
path = simde
|
||||||
|
url = https://github.com/simd-everywhere/simde.git
|
||||||
@@ -2,6 +2,39 @@
|
|||||||
|
|
||||||
This is a list of notable changes to Vectorscan, in reverse chronological order. For Hyperscan Changelog, check CHANGELOG.md
|
This is a list of notable changes to Vectorscan, in reverse chronological order. For Hyperscan Changelog, check CHANGELOG.md
|
||||||
|
|
||||||
|
## [5.4.12] 2025-07-21
|
||||||
|
|
||||||
|
Multiple changes since last release, this will be the last 100% ABI and API compatible with Hyperscan release.
|
||||||
|
Next versions will include major refactors and API extensions, it will be mostly backwards compatible however.
|
||||||
|
Without particular order, platform support is now:
|
||||||
|
|
||||||
|
* Linux (x86, Arm, Power)
|
||||||
|
* FreeBSD 14 (x86, Arm, Power)
|
||||||
|
* MacOS 14+ (x86, Arm)
|
||||||
|
|
||||||
|
In total more than 200 configurations in the CI are tested for every PR.
|
||||||
|
|
||||||
|
Other features:
|
||||||
|
- Fat Runtime supported for Arm as well (ASIMD/SVE/SVE2).
|
||||||
|
- Initial implementations for Arm SVE/SVE2 algorithms added, thanks to Yoan Picchi from Arm.
|
||||||
|
- SIMDe support added, used as an alternative backend for existing platforms, but mostly interesting for allowing Vectorscan to build in new platforms without a supported SIMD engine.
|
||||||
|
- Various speedups and optimizations.
|
||||||
|
- Cppcheck and clang-tidy fixes throughout the code, both have been added to CI for multiple configurations, but only cppcheck triggers a build failure for now.
|
||||||
|
|
||||||
|
Various bugfixes, most important listed:
|
||||||
|
- Speed up truffle with 256b TBL instructions (#290)
|
||||||
|
- Fix Clang Tidy warnings (#295)
|
||||||
|
- Clang 17+ is more restrictive on rebind<T> on MacOS/Boost, remove warning (#332)
|
||||||
|
- partial_load_u64 will fail if buf == NULL/c_len == 0 (#331)
|
||||||
|
- Bugfix/fix avx512vbmi regressions (#335)
|
||||||
|
- fix missing hs_version.h header (closes #198)
|
||||||
|
- hs_valid_platform: Fix check for SSE4.2 (#310)
|
||||||
|
- Fixed out of bounds read in AVX512VBMI version of fdr_exec_fat_teddy … (#333)
|
||||||
|
- Fix noodle SVE2 off by one bug (#313)
|
||||||
|
- Make vectorscan accept \0 starting pattern (#312)
|
||||||
|
- Fix 5.4.11's config step regression (#327)
|
||||||
|
- Fix double shufti's vector end false positive (#325)
|
||||||
|
|
||||||
## [5.4.11] 2023-11-19
|
## [5.4.11] 2023-11-19
|
||||||
|
|
||||||
- Refactor CMake build system to be much more modular.
|
- Refactor CMake build system to be much more modular.
|
||||||
|
|||||||
180
CMakeLists.txt
180
CMakeLists.txt
@@ -4,7 +4,7 @@ project (vectorscan C CXX)
|
|||||||
|
|
||||||
set (HS_MAJOR_VERSION 5)
|
set (HS_MAJOR_VERSION 5)
|
||||||
set (HS_MINOR_VERSION 4)
|
set (HS_MINOR_VERSION 4)
|
||||||
set (HS_PATCH_VERSION 11)
|
set (HS_PATCH_VERSION 12)
|
||||||
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
|
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
|
||||||
|
|
||||||
string (TIMESTAMP BUILD_DATE "%Y-%m-%d")
|
string (TIMESTAMP BUILD_DATE "%Y-%m-%d")
|
||||||
@@ -23,11 +23,10 @@ INCLUDE (CheckLibraryExists)
|
|||||||
INCLUDE (CheckSymbolExists)
|
INCLUDE (CheckSymbolExists)
|
||||||
include (CMakeDependentOption)
|
include (CMakeDependentOption)
|
||||||
include (GNUInstallDirs)
|
include (GNUInstallDirs)
|
||||||
include (${CMAKE_MODULE_PATH}/platform.cmake)
|
|
||||||
include (${CMAKE_MODULE_PATH}/boost.cmake)
|
include (${CMAKE_MODULE_PATH}/boost.cmake)
|
||||||
include (${CMAKE_MODULE_PATH}/ragel.cmake)
|
include (${CMAKE_MODULE_PATH}/ragel.cmake)
|
||||||
|
|
||||||
find_package(PkgConfig REQUIRED)
|
find_package(PkgConfig QUIET)
|
||||||
|
|
||||||
find_program(RAGEL ragel)
|
find_program(RAGEL ragel)
|
||||||
|
|
||||||
@@ -35,6 +34,13 @@ if(${RAGEL} STREQUAL "RAGEL-NOTFOUND")
|
|||||||
message(FATAL_ERROR "Ragel state machine compiler not found")
|
message(FATAL_ERROR "Ragel state machine compiler not found")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Add ccache to speed builds
|
||||||
|
find_program(CCACHE_FOUND ccache)
|
||||||
|
if(CCACHE_FOUND)
|
||||||
|
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
|
||||||
|
set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache)
|
||||||
|
endif(CCACHE_FOUND)
|
||||||
|
|
||||||
# Build type check
|
# Build type check
|
||||||
|
|
||||||
if (NOT CMAKE_BUILD_TYPE)
|
if (NOT CMAKE_BUILD_TYPE)
|
||||||
@@ -116,18 +122,33 @@ if (RELEASE_BUILD)
|
|||||||
add_definitions(-DNDEBUG)
|
add_definitions(-DNDEBUG)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Architecture detection
|
||||||
|
|
||||||
|
include (${CMAKE_MODULE_PATH}/platform.cmake)
|
||||||
|
|
||||||
# Detect OS and if Fat Runtime is available
|
# Detect OS and if Fat Runtime is available
|
||||||
include (${CMAKE_MODULE_PATH}/osdetection.cmake)
|
include (${CMAKE_MODULE_PATH}/osdetection.cmake)
|
||||||
|
|
||||||
if (ARCH_IA32 OR ARCH_X86_64)
|
if(ARCH_X86_64 AND BUILD_SSE2_SIMDE AND NOT FAT_RUNTIME)
|
||||||
|
set(SIMDE_BACKEND True)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(SIMDE_BACKEND)
|
||||||
|
include (${CMAKE_MODULE_PATH}/simde.cmake)
|
||||||
|
elseif (ARCH_IA32 OR ARCH_X86_64)
|
||||||
include (${CMAKE_MODULE_PATH}/cflags-x86.cmake)
|
include (${CMAKE_MODULE_PATH}/cflags-x86.cmake)
|
||||||
set(ARCH_FLAG march)
|
|
||||||
elseif (ARCH_ARM32 OR ARCH_AARCH64)
|
elseif (ARCH_ARM32 OR ARCH_AARCH64)
|
||||||
include (${CMAKE_MODULE_PATH}/cflags-arm.cmake)
|
include (${CMAKE_MODULE_PATH}/cflags-arm.cmake)
|
||||||
set(ARCH_FLAG march)
|
|
||||||
elseif (ARCH_PPC64EL)
|
elseif (ARCH_PPC64EL)
|
||||||
include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake)
|
include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake)
|
||||||
|
else ()
|
||||||
|
message(FATAL_ERROR "Unsupported platform")
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if (ARCH_PPC64EL)
|
||||||
set(ARCH_FLAG mcpu)
|
set(ARCH_FLAG mcpu)
|
||||||
|
else ()
|
||||||
|
set(ARCH_FLAG march)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
# Detect Native arch flags if requested
|
# Detect Native arch flags if requested
|
||||||
@@ -139,9 +160,11 @@ include (${CMAKE_MODULE_PATH}/sanitize.cmake)
|
|||||||
|
|
||||||
if (NOT FAT_RUNTIME)
|
if (NOT FAT_RUNTIME)
|
||||||
if (GNUCC_TUNE)
|
if (GNUCC_TUNE)
|
||||||
|
message(STATUS "GNUCC_TUNE is set")
|
||||||
set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE}")
|
set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE}")
|
||||||
set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE}")
|
set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE}")
|
||||||
else()
|
else()
|
||||||
|
message(STATUS "GNUCC_TUNE is not set")
|
||||||
set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_C_FLAGS}")
|
set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_C_FLAGS}")
|
||||||
set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_CXX_FLAGS}")
|
set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_CXX_FLAGS}")
|
||||||
endif()
|
endif()
|
||||||
@@ -207,6 +230,19 @@ set_source_files_properties(
|
|||||||
|
|
||||||
ragelmaker(src/parser/control_verbs.rl)
|
ragelmaker(src/parser/control_verbs.rl)
|
||||||
|
|
||||||
|
# BSD has the _np funcs in a _np header
|
||||||
|
CHECK_INCLUDE_FILE_CXX(pthread_np.h HAVE_PTHREAD_NP_H)
|
||||||
|
if (HAVE_PTHREAD_NP_H)
|
||||||
|
set (PTHREAD_NP_INC pthread_np.h)
|
||||||
|
else ()
|
||||||
|
set (PTHREAD_NP_INC pthread.h)
|
||||||
|
endif ()
|
||||||
|
CHECK_CXX_SYMBOL_EXISTS(pthread_setaffinity_np ${PTHREAD_NP_INC} HAVE_DECL_PTHREAD_SETAFFINITY_NP)
|
||||||
|
|
||||||
|
CHECK_FUNCTION_EXISTS(malloc_info HAVE_MALLOC_INFO)
|
||||||
|
CHECK_FUNCTION_EXISTS(shmget HAVE_SHMGET)
|
||||||
|
set(HAVE_SHMGET ${HAVE_SHMGET} CACHE BOOL "shmget()")
|
||||||
|
|
||||||
# do substitutions
|
# do substitutions
|
||||||
configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h)
|
configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h)
|
||||||
configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h)
|
configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h)
|
||||||
@@ -239,8 +275,11 @@ set (hs_exec_common_SRCS
|
|||||||
src/util/arch/common/cpuid_flags.h
|
src/util/arch/common/cpuid_flags.h
|
||||||
src/util/multibit.c
|
src/util/multibit.c
|
||||||
)
|
)
|
||||||
|
if (SIMDE_BACKEND)
|
||||||
if (ARCH_IA32 OR ARCH_X86_64)
|
set (hs_exec_common_SRCS
|
||||||
|
${hs_exec_common_SRCS}
|
||||||
|
src/util/arch/simde/cpuid_flags.c)
|
||||||
|
elseif (ARCH_IA32 OR ARCH_X86_64)
|
||||||
set (hs_exec_common_SRCS
|
set (hs_exec_common_SRCS
|
||||||
${hs_exec_common_SRCS}
|
${hs_exec_common_SRCS}
|
||||||
src/util/arch/x86/cpuid_flags.c
|
src/util/arch/x86/cpuid_flags.c
|
||||||
@@ -275,7 +314,7 @@ set (hs_exec_SRCS
|
|||||||
src/fdr/fdr_confirm_runtime.h
|
src/fdr/fdr_confirm_runtime.h
|
||||||
src/fdr/flood_runtime.h
|
src/fdr/flood_runtime.h
|
||||||
src/fdr/fdr_loadval.h
|
src/fdr/fdr_loadval.h
|
||||||
src/fdr/teddy.c
|
src/fdr/teddy.cpp
|
||||||
src/fdr/teddy.h
|
src/fdr/teddy.h
|
||||||
src/fdr/teddy_internal.h
|
src/fdr/teddy_internal.h
|
||||||
src/fdr/teddy_runtime_common.h
|
src/fdr/teddy_runtime_common.h
|
||||||
@@ -398,7 +437,12 @@ set (hs_exec_SRCS
|
|||||||
src/database.h
|
src/database.h
|
||||||
)
|
)
|
||||||
|
|
||||||
if (ARCH_IA32 OR ARCH_X86_64)
|
if (SIMDE_BACKEND)
|
||||||
|
set (hs_exec_SRCS
|
||||||
|
${hs_exec_SRCS}
|
||||||
|
src/nfa/vermicelli_simd.cpp
|
||||||
|
src/util/supervector/arch/x86/impl.cpp)
|
||||||
|
elseif (ARCH_IA32 OR ARCH_X86_64)
|
||||||
set (hs_exec_SRCS
|
set (hs_exec_SRCS
|
||||||
${hs_exec_SRCS}
|
${hs_exec_SRCS}
|
||||||
src/nfa/vermicelli_simd.cpp
|
src/nfa/vermicelli_simd.cpp
|
||||||
@@ -414,9 +458,11 @@ set (hs_exec_SRCS
|
|||||||
src/util/supervector/arch/ppc64el/impl.cpp)
|
src/util/supervector/arch/ppc64el/impl.cpp)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
if (ARCH_IA32 OR ARCH_X86_64)
|
if (ARCH_IA32 OR ARCH_X86_64)
|
||||||
set (hs_exec_avx2_SRCS
|
set (hs_exec_avx2_SRCS
|
||||||
src/fdr/teddy_avx2.c
|
src/fdr/teddy.cpp
|
||||||
|
src/fdr/teddy_fat.cpp
|
||||||
src/util/arch/x86/masked_move.c
|
src/util/arch/x86/masked_move.c
|
||||||
src/util/arch/x86/masked_move.h
|
src/util/arch/x86/masked_move.h
|
||||||
)
|
)
|
||||||
@@ -918,16 +964,47 @@ else ()
|
|||||||
if (NOT BUILD_AVX512VBMI)
|
if (NOT BUILD_AVX512VBMI)
|
||||||
set (DISPATCHER_DEFINE "${DISPATCHER_DEFINE} -DDISABLE_AVX512VBMI_DISPATCH")
|
set (DISPATCHER_DEFINE "${DISPATCHER_DEFINE} -DDISABLE_AVX512VBMI_DISPATCH")
|
||||||
endif (NOT BUILD_AVX512VBMI)
|
endif (NOT BUILD_AVX512VBMI)
|
||||||
|
if(BUILD_SSE2_SIMDE)
|
||||||
|
set (DISPATCHER_DEFINE "${DISPATCHER_DEFINE} -DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2")
|
||||||
|
endif(BUILD_SSE2_SIMDE)
|
||||||
set_source_files_properties(src/dispatcher.c PROPERTIES
|
set_source_files_properties(src/dispatcher.c PROPERTIES
|
||||||
COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function ${DISPATCHER_DEFINE}")
|
COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function ${DISPATCHER_DEFINE}")
|
||||||
|
if(BUILD_SSE2_SIMDE AND NOT BUILD_AVX2 AND NOT BUILD_AVX512 AND NOT BUILD_AVX512VBMI)
|
||||||
|
set_source_files_properties(src/crc32.c PROPERTIES
|
||||||
|
COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2")
|
||||||
|
set_source_files_properties(src/hs.cpp PROPERTIES
|
||||||
|
COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2")
|
||||||
|
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
|
||||||
|
endif()
|
||||||
|
|
||||||
if (BUILD_STATIC_LIBS)
|
if (BUILD_STATIC_LIBS)
|
||||||
add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS})
|
|
||||||
list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_core2>)
|
if (BUILD_SSE2_SIMDE)
|
||||||
set_target_properties(hs_exec_core2 PROPERTIES
|
add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS})
|
||||||
COMPILE_FLAGS "-march=core2 -msse4.2"
|
list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_core2>)
|
||||||
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
|
message("Building SIMDE SSE2 version..")
|
||||||
)
|
include_directories(${PROJECT_SOURCE_DIR}/simde)
|
||||||
|
if (CMAKE_COMPILER_IS_CLANG)
|
||||||
|
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
set_target_properties(hs_exec_core2 PROPERTIES
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
|
||||||
|
COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2"
|
||||||
|
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
|
||||||
|
)
|
||||||
|
else()
|
||||||
|
add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS})
|
||||||
|
list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_core2>)
|
||||||
|
set_target_properties(hs_exec_core2 PROPERTIES
|
||||||
|
COMPILE_FLAGS "-march=core2 -msse4.2"
|
||||||
|
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
|
||||||
|
)
|
||||||
|
endif(BUILD_SSE2_SIMDE)
|
||||||
|
|
||||||
add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS})
|
add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS})
|
||||||
list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_corei7>)
|
list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_corei7>)
|
||||||
@@ -980,19 +1057,59 @@ else ()
|
|||||||
$<TARGET_OBJECTS:hs_compile>
|
$<TARGET_OBJECTS:hs_compile>
|
||||||
$<TARGET_OBJECTS:hs_exec_common>
|
$<TARGET_OBJECTS:hs_exec_common>
|
||||||
${RUNTIME_LIBS})
|
${RUNTIME_LIBS})
|
||||||
|
|
||||||
|
if (BUILD_SSE2_SIMDE)
|
||||||
|
set_target_properties(hs_compile PROPERTIES
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
|
||||||
|
COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2"
|
||||||
|
)
|
||||||
|
set_target_properties(hs PROPERTIES
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
|
||||||
|
COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2"
|
||||||
|
)
|
||||||
|
endif (BUILD_SSE2_SIMDE)
|
||||||
|
|
||||||
endif (BUILD_STATIC_LIBS)
|
endif (BUILD_STATIC_LIBS)
|
||||||
|
|
||||||
if (BUILD_SHARED_LIBS)
|
if (BUILD_SHARED_LIBS)
|
||||||
# build shared libs
|
# build shared libs
|
||||||
add_library(hs_compile_shared OBJECT ${hs_compile_SRCS})
|
add_library(hs_compile_shared OBJECT ${hs_compile_SRCS})
|
||||||
set_target_properties(hs_compile_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
|
set_target_properties(hs_compile_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
|
||||||
add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS})
|
|
||||||
list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_core2>)
|
if (BUILD_SSE2_SIMDE)
|
||||||
set_target_properties(hs_exec_shared_core2 PROPERTIES
|
message("Building SIMDE SSE2 version..")
|
||||||
COMPILE_FLAGS "-march=core2 -msse4.2"
|
add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS})
|
||||||
POSITION_INDEPENDENT_CODE TRUE
|
list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_core2>)
|
||||||
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
|
|
||||||
)
|
include_directories(${PROJECT_SOURCE_DIR}/simde)
|
||||||
|
if (CMAKE_COMPILER_IS_CLANG)
|
||||||
|
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
|
||||||
|
endif()
|
||||||
|
set_target_properties(hs_exec_shared_core2 PROPERTIES
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
|
||||||
|
COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2"
|
||||||
|
POSITION_INDEPENDENT_CODE TRUE
|
||||||
|
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
|
||||||
|
)
|
||||||
|
set_target_properties(hs_compile_shared PROPERTIES
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||||
|
string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
|
||||||
|
COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2"
|
||||||
|
)
|
||||||
|
else()
|
||||||
|
add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS})
|
||||||
|
list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_core2>)
|
||||||
|
set_target_properties(hs_exec_shared_core2 PROPERTIES
|
||||||
|
COMPILE_FLAGS "-march=core2 -msse4.2"
|
||||||
|
POSITION_INDEPENDENT_CODE TRUE
|
||||||
|
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
|
||||||
|
)
|
||||||
|
endif (BUILD_SSE2_SIMDE)
|
||||||
|
|
||||||
add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS})
|
add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS})
|
||||||
list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_corei7>)
|
list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_corei7>)
|
||||||
set_target_properties(hs_exec_shared_corei7 PROPERTIES
|
set_target_properties(hs_exec_shared_corei7 PROPERTIES
|
||||||
@@ -1194,11 +1311,17 @@ if (NOT BUILD_STATIC_LIBS)
|
|||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
add_subdirectory(util)
|
add_subdirectory(util)
|
||||||
add_subdirectory(unit)
|
|
||||||
|
|
||||||
if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt)
|
option(BUILD_UNIT "Build Hyperscan unit tests (default TRUE)" TRUE)
|
||||||
|
if(BUILD_UNIT)
|
||||||
|
add_subdirectory(unit)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
option(BUILD_TOOLS "Build Hyperscan tools (default TRUE)" TRUE)
|
||||||
|
if(EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt AND BUILD_TOOLS)
|
||||||
add_subdirectory(tools)
|
add_subdirectory(tools)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (EXISTS ${CMAKE_SOURCE_DIR}/chimera/CMakeLists.txt AND BUILD_CHIMERA)
|
if (EXISTS ${CMAKE_SOURCE_DIR}/chimera/CMakeLists.txt AND BUILD_CHIMERA)
|
||||||
add_subdirectory(chimera)
|
add_subdirectory(chimera)
|
||||||
endif()
|
endif()
|
||||||
@@ -1213,4 +1336,7 @@ if(BUILD_BENCHMARKS)
|
|||||||
add_subdirectory(benchmarks)
|
add_subdirectory(benchmarks)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_subdirectory(doc/dev-reference)
|
option(BUILD_DOC "Build the Hyperscan documentation (default TRUE)" TRUE)
|
||||||
|
if(BUILD_DOC)
|
||||||
|
add_subdirectory(doc/dev-reference)
|
||||||
|
endif()
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
394 Konstantinos Margaritis <konstantinos@vectorcamp.gr>
|
549 Konstantinos Margaritis <konstantinos@vectorcamp.gr>
|
||||||
|
78 George Economou <george.economou@vectorcamp.gr>
|
||||||
|
71 George Tsoulkanakis <george.tsoulkanakis@vectorcamp.gr>
|
||||||
59 apostolos <apostolos.tapsas@vectorcamp.gr>
|
59 apostolos <apostolos.tapsas@vectorcamp.gr>
|
||||||
25 Hong, Yang A <yang.a.hong@intel.com>
|
25 Hong, Yang A <yang.a.hong@intel.com>
|
||||||
19 George Wort <george.wort@arm.com>
|
19 George Wort <george.wort@arm.com>
|
||||||
@@ -6,20 +8,29 @@
|
|||||||
7 Danila Kutenin <danilak@google.com>
|
7 Danila Kutenin <danilak@google.com>
|
||||||
7 Wang Xiang W <xiang.w.wang@intel.com>
|
7 Wang Xiang W <xiang.w.wang@intel.com>
|
||||||
6 Alex Bondarev <abondarev84@gmail.com>
|
6 Alex Bondarev <abondarev84@gmail.com>
|
||||||
5 Konstantinos Margaritis <konma@vectorcamp.gr>
|
6 Yoan Picchi <yoan.picchi@arm.com>
|
||||||
|
5 Jeremy Linton <jeremy.linton@arm.com>
|
||||||
3 Duncan Bellamy <dunk@denkimushi.com>
|
3 Duncan Bellamy <dunk@denkimushi.com>
|
||||||
2 Azat Khuzhin <a3at.mail@gmail.com>
|
2 Azat Khuzhin <a3at.mail@gmail.com>
|
||||||
2 Jan Henning <jan.thilo.henning@sap.com>
|
2 Jan Henning <jan.thilo.henning@sap.com>
|
||||||
1 BigRedEye <mail@bigredeye.me>
|
1 BigRedEye <mail@bigredeye.me>
|
||||||
|
1 Brad Larsen <bradford.larsen@praetorian.com>
|
||||||
|
1 Chrysovalantis - Michail Liakopoulos <valadis.liakopoulos@vectorcamp.gr>
|
||||||
1 Daniel Kutenin <kutdanila@yandex.ru>
|
1 Daniel Kutenin <kutdanila@yandex.ru>
|
||||||
1 Danila Kutenin <kutdanila@yandex.ru>
|
1 Danila Kutenin <kutdanila@yandex.ru>
|
||||||
|
1 HelixHexagon <60048780+HelixHexagon@users.noreply.github.com>
|
||||||
|
1 Jingbo Chen <cj@yanhuangdata.com>
|
||||||
1 Liu Zixian <hdu_sdlzx@163.com>
|
1 Liu Zixian <hdu_sdlzx@163.com>
|
||||||
|
1 Matthias Gliwka <matthias@gliwka.eu>
|
||||||
|
1 Michael Tremer <michael.tremer@ipfire.org>
|
||||||
1 Mitchell Wasson <miwasson@cisco.com>
|
1 Mitchell Wasson <miwasson@cisco.com>
|
||||||
1 Piotr Skamruk <piotr.skamruk@gmail.com>
|
1 Piotr Skamruk <piotr.skamruk@gmail.com>
|
||||||
|
1 Rafał Dowgird <dowgird@gmail.com>
|
||||||
1 Robbie Williamson <robbie.williamson@arm.com>
|
1 Robbie Williamson <robbie.williamson@arm.com>
|
||||||
1 Robert Schulze <robert@clickhouse.com>
|
1 Robert Schulze <robert@clickhouse.com>
|
||||||
1 Walt Stoneburner <wls@wwco.com>
|
1 Walt Stoneburner <wls@wwco.com>
|
||||||
1 Zhu,Wenjun <wenjun.zhu@intel.com>
|
1 Zhu,Wenjun <wenjun.zhu@intel.com>
|
||||||
1 hongyang7 <yang.a.hong@intel.com>
|
1 hongyang7 <yang.a.hong@intel.com>
|
||||||
|
1 ibrkas01arm <ibrahim.kashif@arm.com>
|
||||||
1 jplaisance <jeffplaisance@gmail.com>
|
1 jplaisance <jeffplaisance@gmail.com>
|
||||||
1 liquidaty <info@liquidaty.com>
|
1 liquidaty <info@liquidaty.com>
|
||||||
|
|||||||
76
README.md
76
README.md
@@ -1,8 +1,12 @@
|
|||||||
# About Vectorscan
|
# About Vectorscan
|
||||||
|
|
||||||
A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD
|
A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD
|
||||||
is 100% functional, and Power VSX are in development. ARM SVE2 support is in ongoing with
|
and Power VSX are 100% functional. ARM SVE2 support is in ongoing with
|
||||||
access to hardware now. More platforms will follow in the future.
|
access to hardware now. More platforms will follow in the future.
|
||||||
|
Further more, starting 5.4.12 there is now a [SIMDe](https://github.com/simd-everywhere/simde)
|
||||||
|
port, which can be either used for platforms without official SIMD support,
|
||||||
|
as SIMDe can emulate SIMD instructions, or as an alternative backend for existing architectures,
|
||||||
|
for reference and comparison purposes.
|
||||||
|
|
||||||
Vectorscan will follow Intel's API and internal algorithms where possible, but will not
|
Vectorscan will follow Intel's API and internal algorithms where possible, but will not
|
||||||
hesitate to make code changes where it is thought of giving better performance or better
|
hesitate to make code changes where it is thought of giving better performance or better
|
||||||
@@ -94,7 +98,7 @@ some small but necessary changes were made that might break compatibility with h
|
|||||||
In order to build on Debian/Ubuntu make sure you install the following build-dependencies
|
In order to build on Debian/Ubuntu make sure you install the following build-dependencies
|
||||||
|
|
||||||
```
|
```
|
||||||
$ sudo apt build-essential cmake ragel pkg-config libsqlite3-dev libpcap-dev
|
$ sudo apt install build-essential cmake ragel pkg-config libsqlite3-dev libpcap-dev
|
||||||
```
|
```
|
||||||
|
|
||||||
### Other distributions
|
### Other distributions
|
||||||
@@ -109,6 +113,69 @@ Assuming an existing HomeBrew installation:
|
|||||||
% brew install boost cmake gcc libpcap pkg-config ragel sqlite
|
% brew install boost cmake gcc libpcap pkg-config ragel sqlite
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### *BSD
|
||||||
|
In NetBSD you will almost certainly need to have a newer compiler installed.
|
||||||
|
Also you will need to install cmake, sqlite, boost and ragel.
|
||||||
|
Also, libpcap is necessary for some of the benchmarks, so let's install that
|
||||||
|
as well.
|
||||||
|
When using pkgsrc, you would typically do this using something
|
||||||
|
similar to
|
||||||
|
```
|
||||||
|
pkg_add gcc12-12.3.0.tgz
|
||||||
|
pkg_add boost-headers-1.83.0.tgz boost-jam-1.83.0.tgz boost-libs-1.83.0nb1.tgz
|
||||||
|
pkg_add ragel-6.10.tgz
|
||||||
|
pkg_add cmake-3.28.1.tgz
|
||||||
|
pkg_add sqlite3-3.44.2.tgz
|
||||||
|
pkg_add libpcap-1.10.4.tgz
|
||||||
|
```
|
||||||
|
Version numbers etc will of course vary. One would either download the
|
||||||
|
binary packages or build them using pkgsrc. There exist some NetBSD pkg
|
||||||
|
tools like ```pkgin``` which help download e.g. dependencies as binary packages,
|
||||||
|
but overall NetBSD leaves a lot of detail exposed to the user.
|
||||||
|
The main package system used in NetBSD is pkgsrc and one will probably
|
||||||
|
want to read up more about it than is in the scope of this document.
|
||||||
|
See https://www.netbsd.org/docs/software/packages.html for more information.
|
||||||
|
|
||||||
|
This will not replace the compiler in the standard base distribution, and
|
||||||
|
cmake will probably find the base dist's compiler when it checks automatically.
|
||||||
|
Using the example of gcc12 from pkgsrc, one will need to set two
|
||||||
|
environment variables before starting:
|
||||||
|
```
|
||||||
|
export CC="/usr/pkg/gcc12/bin/cc"
|
||||||
|
export CXX="/usr/pkg/gcc12/bin/g++"
|
||||||
|
```
|
||||||
|
|
||||||
|
In FreeBSD similarly, you might want to install a different compiler.
|
||||||
|
If you want to use gcc, it is recommended to use gcc12.
|
||||||
|
You will also, as in NetBSD, need to install cmake, sqlite, boost and ragel packages.
|
||||||
|
Using the example of gcc12 from pkg:
|
||||||
|
installing the desired compiler:
|
||||||
|
```
|
||||||
|
pkg install gcc12
|
||||||
|
pkg install boost-all
|
||||||
|
pkg install ragel
|
||||||
|
pkg install cmake
|
||||||
|
pkg install sqlite
|
||||||
|
pkg install libpcap
|
||||||
|
pkg install ccache
|
||||||
|
```
|
||||||
|
and then before beginning the cmake and build process, set
|
||||||
|
the environment variables to point to this compiler:
|
||||||
|
```
|
||||||
|
export CC="/usr/local/bin/gcc"
|
||||||
|
export CXX="/usr/local/bin/g++"
|
||||||
|
```
|
||||||
|
A further note in FreeBSD, on the PowerPC and ARM platforms,
|
||||||
|
the gcc12 package installs to a slightly different name, on FreeBSD/ppc,
|
||||||
|
gcc12 will be found using:
|
||||||
|
```
|
||||||
|
export CC="/usr/local/bin/gcc12"
|
||||||
|
export CXX="/usr/local/bin/g++12"
|
||||||
|
```
|
||||||
|
|
||||||
|
Then continue with the build as below.
|
||||||
|
|
||||||
|
|
||||||
## Configure & build
|
## Configure & build
|
||||||
|
|
||||||
In order to configure with `cmake` first create and cd into a build directory:
|
In order to configure with `cmake` first create and cd into a build directory:
|
||||||
@@ -148,6 +215,11 @@ Common options for Cmake are:
|
|||||||
|
|
||||||
* `SANITIZE=[address|memory|undefined]` (experimental) Use `libasan` sanitizer to detect possible bugs. For now only `address` is tested. This will eventually be integrated in the CI.
|
* `SANITIZE=[address|memory|undefined]` (experimental) Use `libasan` sanitizer to detect possible bugs. For now only `address` is tested. This will eventually be integrated in the CI.
|
||||||
|
|
||||||
|
## SIMDe options
|
||||||
|
|
||||||
|
* `SIMDE_BACKEND=[On|Off]` Enable SIMDe backend. If this is chosen all native (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be disabled and a SIMDe SSE4.2 emulation backend will be enabled. This will enable Vectorscan to build and run on architectures without SIMD.
|
||||||
|
* `SIMDE_NATIVE=[On|Off]` Enable SIMDe native emulation of x86 SSE4.2 intrinsics on the building platform. That is, SSE4.2 intrinsics will be emulated using Neon on an Arm platform, or VSX on a Power platform, etc.
|
||||||
|
|
||||||
## Build
|
## Build
|
||||||
|
|
||||||
If `cmake` has completed successfully you can run `make` in the same directory, if you have a multi-core system with `N` cores, running
|
If `cmake` has completed successfully you can run `make` in the same directory, if you have a multi-core system with `N` cores, running
|
||||||
|
|||||||
@@ -1,4 +1,7 @@
|
|||||||
if (NOT FAT_RUNTIME AND (BUILD_STATIC_AND_SHARED OR BUILD_STATIC_LIBS))
|
include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR})
|
||||||
|
include_directories(${PROJECT_SOURCE_DIR})
|
||||||
|
|
||||||
|
if (NOT FAT_RUNTIME AND (BUILD_SHARED_LIBS OR BUILD_STATIC_LIBS))
|
||||||
add_executable(benchmarks benchmarks.cpp)
|
add_executable(benchmarks benchmarks.cpp)
|
||||||
set_source_files_properties(benchmarks.cpp PROPERTIES COMPILE_FLAGS
|
set_source_files_properties(benchmarks.cpp PROPERTIES COMPILE_FLAGS
|
||||||
"-Wall -Wno-unused-variable")
|
"-Wall -Wno-unused-variable")
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2020, 2021, VectorCamp PC
|
* Copyright (c) 2020, 2021, VectorCamp PC
|
||||||
|
* Copyright (c) 2023, 2024, Arm Limited
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@@ -26,32 +27,31 @@
|
|||||||
* POSSIBILITY OF SUCH DAMAGE.
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
|
#include <cstdlib>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <cstdlib>
|
|
||||||
#include <memory>
|
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
#include <iostream>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
#include "util/arch.h"
|
||||||
#include "benchmarks.hpp"
|
#include "benchmarks.hpp"
|
||||||
|
|
||||||
#define MAX_LOOPS 1000000000
|
#define MAX_LOOPS 1000000000
|
||||||
#define MAX_MATCHES 5
|
#define MAX_MATCHES 5
|
||||||
#define N 8
|
#define N 8
|
||||||
|
|
||||||
struct hlmMatchEntry {
|
struct hlmMatchEntry {
|
||||||
size_t to;
|
size_t to;
|
||||||
u32 id;
|
u32 id;
|
||||||
hlmMatchEntry(size_t end, u32 identifier) :
|
hlmMatchEntry(size_t end, u32 identifier) : to(end), id(identifier) {}
|
||||||
to(end), id(identifier) {}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<hlmMatchEntry> ctxt;
|
std::vector<hlmMatchEntry> ctxt;
|
||||||
|
|
||||||
static
|
static hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id,
|
||||||
hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id,
|
UNUSED struct hs_scratch *scratch) { // cppcheck-suppress constParameterCallback
|
||||||
UNUSED struct hs_scratch *scratch) {
|
|
||||||
DEBUG_PRINTF("match @%zu = %u\n", to, id);
|
DEBUG_PRINTF("match @%zu = %u\n", to, id);
|
||||||
|
|
||||||
ctxt.push_back(hlmMatchEntry(to, id));
|
ctxt.push_back(hlmMatchEntry(to, id));
|
||||||
@@ -59,40 +59,42 @@ hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id,
|
|||||||
return HWLM_CONTINUE_MATCHING;
|
return HWLM_CONTINUE_MATCHING;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename InitFunc, typename BenchFunc>
|
template <typename InitFunc, typename BenchFunc>
|
||||||
static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse, MicroBenchmark &bench, InitFunc &&init, BenchFunc &&func) {
|
static void run_benchmarks(int size, int loops, int max_matches,
|
||||||
|
bool is_reverse, MicroBenchmark &bench,
|
||||||
|
InitFunc &&init, BenchFunc &&func) {
|
||||||
init(bench);
|
init(bench);
|
||||||
double total_sec = 0.0;
|
double total_sec = 0.0;
|
||||||
u64a total_size = 0;
|
|
||||||
double bw = 0.0;
|
|
||||||
double avg_bw = 0.0;
|
|
||||||
double max_bw = 0.0;
|
double max_bw = 0.0;
|
||||||
double avg_time = 0.0;
|
double avg_time = 0.0;
|
||||||
if (max_matches) {
|
if (max_matches) {
|
||||||
|
double avg_bw = 0.0;
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
for(int j = 0; j < max_matches - 1; j++) {
|
for (int j = 0; j < max_matches - 1; j++) {
|
||||||
bench.buf[pos] = 'b';
|
bench.buf[pos] = 'b';
|
||||||
pos = (j+1) *size / max_matches ;
|
pos = (j + 1) * size / max_matches;
|
||||||
bench.buf[pos] = 'a';
|
bench.buf[pos] = 'a';
|
||||||
u64a actual_size = 0;
|
u64a actual_size = 0;
|
||||||
auto start = std::chrono::steady_clock::now();
|
auto start = std::chrono::steady_clock::now();
|
||||||
for(int i = 0; i < loops; i++) {
|
for (int i = 0; i < loops; i++) {
|
||||||
const u8 *res = func(bench);
|
const u8 *res = func(bench);
|
||||||
if (is_reverse)
|
if (is_reverse)
|
||||||
actual_size += bench.buf.data() + size - res;
|
actual_size += bench.buf.data() + size - res;
|
||||||
else
|
else
|
||||||
actual_size += res - bench.buf.data();
|
actual_size += res - bench.buf.data();
|
||||||
}
|
}
|
||||||
auto end = std::chrono::steady_clock::now();
|
auto end = std::chrono::steady_clock::now();
|
||||||
double dt = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
|
double dt = std::chrono::duration_cast<std::chrono::microseconds>(
|
||||||
|
end - start)
|
||||||
|
.count();
|
||||||
total_sec += dt;
|
total_sec += dt;
|
||||||
/*convert microseconds to seconds*/
|
/*convert microseconds to seconds*/
|
||||||
/*calculate bandwidth*/
|
/*calculate bandwidth*/
|
||||||
bw = (actual_size / dt) * 1000000.0 / 1048576.0;
|
double bw = (actual_size / dt) * 1000000.0 / 1048576.0;
|
||||||
/*std::cout << "act_size = " << act_size << std::endl;
|
/*std::cout << "act_size = " << act_size << std::endl;
|
||||||
std::cout << "dt = " << dt << std::endl;
|
std::cout << "dt = " << dt << std::endl;
|
||||||
std::cout << "bw = " << bw << std::endl;*/
|
std::cout << "bw = " << bw << std::endl;*/
|
||||||
avg_bw += bw;
|
avg_bw += bw;
|
||||||
/*convert to MB/s*/
|
/*convert to MB/s*/
|
||||||
max_bw = std::max(bw, max_bw);
|
max_bw = std::max(bw, max_bw);
|
||||||
/*calculate average time*/
|
/*calculate average time*/
|
||||||
@@ -100,20 +102,22 @@ static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse
|
|||||||
}
|
}
|
||||||
avg_time /= max_matches;
|
avg_time /= max_matches;
|
||||||
avg_bw /= max_matches;
|
avg_bw /= max_matches;
|
||||||
total_sec /= 1000000.0;
|
total_sec /= 1000000.0;
|
||||||
/*convert average time to us*/
|
/*convert average time to us*/
|
||||||
printf(KMAG "%s: %u matches, %u * %u iterations," KBLU " total elapsed time =" RST " %.3f s, "
|
printf("%-18s, %-12d, %-10d, %-6d, %-10.3f, %-9.3f, %-8.3f, %-7.3f\n",
|
||||||
KBLU "average time per call =" RST " %.3f μs," KBLU " max bandwidth = " RST " %.3f MB/s," KBLU " average bandwidth =" RST " %.3f MB/s \n",
|
|
||||||
bench.label, max_matches, size ,loops, total_sec, avg_time, max_bw, avg_bw);
|
bench.label, max_matches, size ,loops, total_sec, avg_time, max_bw, avg_bw);
|
||||||
} else {
|
} else {
|
||||||
|
u64a total_size = 0;
|
||||||
auto start = std::chrono::steady_clock::now();
|
auto start = std::chrono::steady_clock::now();
|
||||||
for (int i = 0; i < loops; i++) {
|
for (int i = 0; i < loops; i++) {
|
||||||
const u8 *res = func(bench);
|
func(bench);
|
||||||
}
|
}
|
||||||
auto end = std::chrono::steady_clock::now();
|
auto end = std::chrono::steady_clock::now();
|
||||||
total_sec += std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
|
total_sec +=
|
||||||
|
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
|
||||||
|
.count();
|
||||||
/*calculate transferred size*/
|
/*calculate transferred size*/
|
||||||
total_size = size * loops;
|
total_size = (u64a)size * (u64a)loops;
|
||||||
/*calculate average time*/
|
/*calculate average time*/
|
||||||
avg_time = total_sec / loops;
|
avg_time = total_sec / loops;
|
||||||
/*convert microseconds to seconds*/
|
/*convert microseconds to seconds*/
|
||||||
@@ -122,130 +126,182 @@ static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse
|
|||||||
max_bw = total_size / total_sec;
|
max_bw = total_size / total_sec;
|
||||||
/*convert to MB/s*/
|
/*convert to MB/s*/
|
||||||
max_bw /= 1048576.0;
|
max_bw /= 1048576.0;
|
||||||
printf(KMAG "%s: no matches, %u * %u iterations," KBLU " total elapsed time =" RST " %.3f s, "
|
printf("%-18s, %-12s, %-10d, %-6d, %-10.3f, %-9.3f, %-8.3f, %-7s\n",
|
||||||
KBLU "average time per call =" RST " %.3f μs ," KBLU " bandwidth = " RST " %.3f MB/s \n",
|
bench.label, "0", size, loops, total_sec, avg_time, max_bw, "0");
|
||||||
bench.label, size ,loops, total_sec, avg_time, max_bw );
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(){
|
int main(){
|
||||||
int matches[] = {0, MAX_MATCHES};
|
const int matches[] = {0, MAX_MATCHES};
|
||||||
std::vector<size_t> sizes;
|
std::vector<size_t> sizes;
|
||||||
for (size_t i = 0; i < N; i++) sizes.push_back(16000 << i*2);
|
for (size_t i = 0; i < N; i++)
|
||||||
|
sizes.push_back(16000 << i * 2);
|
||||||
const char charset[] = "aAaAaAaAAAaaaaAAAAaaaaAAAAAAaaaAAaaa";
|
const char charset[] = "aAaAaAaAAAaaaaAAAAaaaaAAAAAAaaaAAaaa";
|
||||||
|
printf("%-18s, %-12s, %-10s, %-6s, %-10s, %-9s, %-8s, %-7s\n", "Matcher",
|
||||||
|
"max_matches", "size", "loops", "total_sec", "avg_time", "max_bw",
|
||||||
|
"avg_bw");
|
||||||
for (int m = 0; m < 2; m++) {
|
for (int m = 0; m < 2; m++) {
|
||||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||||
MicroBenchmark bench("Shufti", sizes[i]);
|
MicroBenchmark bench("Shufti", sizes[i]);
|
||||||
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
run_benchmarks(
|
||||||
|
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark &b) {
|
||||||
b.chars.set('a');
|
b.chars.set('a');
|
||||||
ue2::shuftiBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
|
ue2::shuftiBuildMasks(b.chars,
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||||
memset(b.buf.data(), 'b', b.size);
|
memset(b.buf.data(), 'b', b.size);
|
||||||
},
|
},
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark const &b) {
|
||||||
return shuftiExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
|
return shuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
|
||||||
}
|
b.buf.data() + b.size);
|
||||||
);
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||||
MicroBenchmark bench("Reverse Shufti", sizes[i]);
|
MicroBenchmark bench("Reverse Shufti", sizes[i]);
|
||||||
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
|
run_benchmarks(
|
||||||
|
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark &b) {
|
||||||
b.chars.set('a');
|
b.chars.set('a');
|
||||||
ue2::shuftiBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
|
ue2::shuftiBuildMasks(b.chars,
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||||
memset(b.buf.data(), 'b', b.size);
|
memset(b.buf.data(), 'b', b.size);
|
||||||
},
|
},
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark const &b) {
|
||||||
return rshuftiExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
|
return rshuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
|
||||||
}
|
b.buf.data() + b.size);
|
||||||
);
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||||
MicroBenchmark bench("Truffle", sizes[i]);
|
MicroBenchmark bench("Truffle", sizes[i]);
|
||||||
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
run_benchmarks(
|
||||||
|
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark &b) {
|
||||||
b.chars.set('a');
|
b.chars.set('a');
|
||||||
ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
|
ue2::truffleBuildMasks(b.chars,
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||||
memset(b.buf.data(), 'b', b.size);
|
memset(b.buf.data(), 'b', b.size);
|
||||||
},
|
},
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark const &b) {
|
||||||
return truffleExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
|
return truffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
|
||||||
}
|
b.buf.data() + b.size);
|
||||||
);
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||||
MicroBenchmark bench("Reverse Truffle", sizes[i]);
|
MicroBenchmark bench("Reverse Truffle", sizes[i]);
|
||||||
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
|
run_benchmarks(
|
||||||
|
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark &b) {
|
||||||
b.chars.set('a');
|
b.chars.set('a');
|
||||||
ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
|
ue2::truffleBuildMasks(b.chars,
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||||
memset(b.buf.data(), 'b', b.size);
|
memset(b.buf.data(), 'b', b.size);
|
||||||
},
|
},
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark const &b) {
|
||||||
return rtruffleExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
|
return rtruffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
|
||||||
}
|
b.buf.data() + b.size);
|
||||||
);
|
});
|
||||||
}
|
}
|
||||||
|
#ifdef CAN_USE_WIDE_TRUFFLE
|
||||||
|
if(CAN_USE_WIDE_TRUFFLE) {
|
||||||
|
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||||
|
MicroBenchmark bench("Truffle Wide", sizes[i]);
|
||||||
|
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
||||||
|
[&](MicroBenchmark &b) {
|
||||||
|
b.chars.set('a');
|
||||||
|
ue2::truffleBuildMasksWide(b.chars, reinterpret_cast<u8 *>(&b.truffle_mask));
|
||||||
|
memset(b.buf.data(), 'b', b.size);
|
||||||
|
},
|
||||||
|
[&](MicroBenchmark const &b) {
|
||||||
|
return truffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||||
|
MicroBenchmark bench("Reverse Truffle Wide", sizes[i]);
|
||||||
|
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
|
||||||
|
[&](MicroBenchmark &b) {
|
||||||
|
b.chars.set('a');
|
||||||
|
ue2::truffleBuildMasksWide(b.chars, reinterpret_cast<u8 *>(&b.truffle_mask));
|
||||||
|
memset(b.buf.data(), 'b', b.size);
|
||||||
|
},
|
||||||
|
[&](MicroBenchmark const &b) {
|
||||||
|
return rtruffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||||
MicroBenchmark bench("Vermicelli", sizes[i]);
|
MicroBenchmark bench("Vermicelli", sizes[i]);
|
||||||
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
run_benchmarks(
|
||||||
|
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark &b) {
|
||||||
b.chars.set('a');
|
b.chars.set('a');
|
||||||
ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
|
ue2::truffleBuildMasks(b.chars,
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||||
memset(b.buf.data(), 'b', b.size);
|
memset(b.buf.data(), 'b', b.size);
|
||||||
},
|
},
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark const &b) {
|
||||||
return vermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size);
|
return vermicelliExec('a', 'b', b.buf.data(),
|
||||||
}
|
b.buf.data() + b.size);
|
||||||
);
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||||
MicroBenchmark bench("Reverse Vermicelli", sizes[i]);
|
MicroBenchmark bench("Reverse Vermicelli", sizes[i]);
|
||||||
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
|
run_benchmarks(
|
||||||
|
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark &b) {
|
||||||
b.chars.set('a');
|
b.chars.set('a');
|
||||||
ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
|
ue2::truffleBuildMasks(b.chars,
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||||
|
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||||
memset(b.buf.data(), 'b', b.size);
|
memset(b.buf.data(), 'b', b.size);
|
||||||
},
|
},
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark const &b) {
|
||||||
return rvermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size);
|
return rvermicelliExec('a', 'b', b.buf.data(),
|
||||||
}
|
b.buf.data() + b.size);
|
||||||
);
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||||
//we imitate the noodle unit tests
|
// we imitate the noodle unit tests
|
||||||
std::string str;
|
std::string str;
|
||||||
const size_t char_len = 5;
|
const size_t char_len = 5;
|
||||||
str.resize(char_len + 1);
|
str.resize(char_len + 2);
|
||||||
for (size_t j=0; j < char_len; j++) {
|
for (size_t j = 0; j < char_len; j++) {
|
||||||
srand (time(NULL));
|
srand(time(NULL));
|
||||||
int key = rand() % + 36 ;
|
int key = rand() % +36;
|
||||||
str[char_len] = charset[key];
|
str[char_len] = charset[key];
|
||||||
str[char_len + 1] = '\0';
|
str[char_len + 1] = '\0';
|
||||||
}
|
}
|
||||||
|
|
||||||
MicroBenchmark bench("Noodle", sizes[i]);
|
MicroBenchmark bench("Noodle", sizes[i]);
|
||||||
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
run_benchmarks(
|
||||||
|
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark &b) {
|
||||||
ctxt.clear();
|
ctxt.clear();
|
||||||
memset(b.buf.data(), 'a', b.size);
|
memset(b.buf.data(), 'a', b.size);
|
||||||
u32 id = 1000;
|
u32 id = 1000;
|
||||||
ue2::hwlmLiteral lit(str, true, id);
|
ue2::hwlmLiteral lit(str, true, id);
|
||||||
b.nt = ue2::noodBuildTable(lit);
|
b.nt = ue2::noodBuildTable(lit);
|
||||||
assert(b.nt != nullptr);
|
assert(b.nt.get() != nullptr);
|
||||||
},
|
},
|
||||||
[&](MicroBenchmark &b) {
|
[&](MicroBenchmark &b) { // cppcheck-suppress constParameterReference
|
||||||
noodExec(b.nt.get(), b.buf.data(), b.size, 0, hlmSimpleCallback, &b.scratch);
|
noodExec(b.nt.get(), b.buf.data(), b.size, 0,
|
||||||
|
hlmSimpleCallback, &b.scratch);
|
||||||
return b.buf.data() + b.size;
|
return b.buf.data() + b.size;
|
||||||
}
|
});
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2020, 2021, VectorCamp PC
|
* Copyright (c) 2020, 2021, VectorCamp PC
|
||||||
|
* Copyright (c) 2024, Arm Limited
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@@ -26,44 +27,41 @@
|
|||||||
* POSSIBILITY OF SUCH DAMAGE.
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "hwlm/hwlm_literal.h"
|
||||||
|
#include "hwlm/noodle_build.h"
|
||||||
|
#include "hwlm/noodle_engine.h"
|
||||||
|
#include "hwlm/noodle_internal.h"
|
||||||
#include "nfa/shufti.h"
|
#include "nfa/shufti.h"
|
||||||
#include "nfa/shufticompile.h"
|
#include "nfa/shufticompile.h"
|
||||||
#include "nfa/truffle.h"
|
#include "nfa/truffle.h"
|
||||||
#include "nfa/trufflecompile.h"
|
#include "nfa/trufflecompile.h"
|
||||||
#include "nfa/vermicelli.hpp"
|
#include "nfa/vermicelli.hpp"
|
||||||
#include "hwlm/noodle_build.h"
|
|
||||||
#include "hwlm/noodle_engine.h"
|
|
||||||
#include "hwlm/noodle_internal.h"
|
|
||||||
#include "hwlm/hwlm_literal.h"
|
|
||||||
#include "util/bytecode_ptr.h"
|
|
||||||
#include "scratch.h"
|
#include "scratch.h"
|
||||||
|
#include "util/bytecode_ptr.h"
|
||||||
|
|
||||||
/*define colour control characters*/
|
class MicroBenchmark {
|
||||||
#define RST "\x1B[0m"
|
|
||||||
#define KRED "\x1B[31m"
|
|
||||||
#define KGRN "\x1B[32m"
|
|
||||||
#define KYEL "\x1B[33m"
|
|
||||||
#define KBLU "\x1B[34m"
|
|
||||||
#define KMAG "\x1B[35m"
|
|
||||||
#define KCYN "\x1B[36m"
|
|
||||||
#define KWHT "\x1B[37m"
|
|
||||||
|
|
||||||
class MicroBenchmark
|
|
||||||
{
|
|
||||||
public:
|
public:
|
||||||
char const *label;
|
struct hs_scratch scratch{};
|
||||||
size_t size;
|
char const *label;
|
||||||
|
size_t size;
|
||||||
|
std::vector<u8> buf;
|
||||||
|
ue2::bytecode_ptr<noodTable> nt;
|
||||||
|
ue2::CharReach chars;
|
||||||
|
|
||||||
// Shufti/Truffle
|
// Shufti/Truffle
|
||||||
m128 lo, hi;
|
union {
|
||||||
ue2::CharReach chars;
|
m256 truffle_mask;
|
||||||
std::vector<u8> buf;
|
struct {
|
||||||
|
#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
|
||||||
|
m128 truffle_mask_lo;
|
||||||
|
m128 truffle_mask_hi;
|
||||||
|
#else
|
||||||
|
m128 truffle_mask_hi;
|
||||||
|
m128 truffle_mask_lo;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
// Noodle
|
MicroBenchmark(char const *label_, size_t size_)
|
||||||
struct hs_scratch scratch;
|
: label(label_), size(size_), buf(size_){};
|
||||||
ue2::bytecode_ptr<noodTable> nt;
|
|
||||||
|
|
||||||
MicroBenchmark(char const *label_, size_t size_)
|
|
||||||
:label(label_), size(size_), buf(size_) {
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -9,22 +9,35 @@ if (USE_CPU_NATIVE)
|
|||||||
# the flag), so use that for tune.
|
# the flag), so use that for tune.
|
||||||
|
|
||||||
set(TUNE_FLAG "mtune")
|
set(TUNE_FLAG "mtune")
|
||||||
set(GNUCC_TUNE "")
|
|
||||||
|
# set the default fallback values for the arch and tune to native, in case we can't parse them properly later
|
||||||
|
set(GNUCC_ARCH "native")
|
||||||
|
set(GNUCC_TUNE "native")
|
||||||
message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ")
|
message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ")
|
||||||
|
|
||||||
# arg1 might exist if using ccache
|
# arg1 might exist if using ccache
|
||||||
string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
|
string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
|
||||||
set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=native -${TUNE_FLAG}=native)
|
set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE})
|
||||||
execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
|
execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
|
||||||
OUTPUT_VARIABLE _GCC_OUTPUT)
|
OUTPUT_VARIABLE _GCC_OUTPUT)
|
||||||
set(_GCC_OUTPUT_TUNE ${_GCC_OUTPUT})
|
set(_GCC_OUTPUT_TUNE ${_GCC_OUTPUT})
|
||||||
string(FIND "${_GCC_OUTPUT}" "${ARCH_FLAG}=" POS)
|
string(FIND "${_GCC_OUTPUT}" "${ARCH_FLAG}=" POS)
|
||||||
string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)
|
string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)
|
||||||
string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}")
|
string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" _GNUCC_ARCH "${_GCC_OUTPUT}")
|
||||||
|
|
||||||
|
# Only overwrite arch if non-empty
|
||||||
|
if(NOT _GNUCC_ARCH STREQUAL "")
|
||||||
|
set(GNUCC_ARCH ${_GNUCC_ARCH})
|
||||||
|
endif()
|
||||||
|
|
||||||
string(FIND "${_GCC_OUTPUT_TUNE}" "${TUNE_FLAG}=" POS_TUNE)
|
string(FIND "${_GCC_OUTPUT_TUNE}" "${TUNE_FLAG}=" POS_TUNE)
|
||||||
string(SUBSTRING "${_GCC_OUTPUT_TUNE}" ${POS_TUNE} -1 _GCC_OUTPUT_TUNE)
|
string(SUBSTRING "${_GCC_OUTPUT_TUNE}" ${POS_TUNE} -1 _GCC_OUTPUT_TUNE)
|
||||||
string(REGEX REPLACE "${TUNE_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_TUNE "${_GCC_OUTPUT_TUNE}")
|
string(REGEX REPLACE "${TUNE_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" _GNUCC_TUNE "${_GCC_OUTPUT_TUNE}")
|
||||||
|
|
||||||
|
# Only overwrite tune if non-empty
|
||||||
|
if (NOT _GNUCC_TUNE STREQUAL "")
|
||||||
|
set(GNUCC_TUNE ${_GNUCC_TUNE})
|
||||||
|
endif()
|
||||||
|
|
||||||
message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ")
|
message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ")
|
||||||
|
|
||||||
@@ -44,7 +57,7 @@ if (USE_CPU_NATIVE)
|
|||||||
endif()
|
endif()
|
||||||
elseif (CMAKE_COMPILER_IS_CLANG)
|
elseif (CMAKE_COMPILER_IS_CLANG)
|
||||||
if (ARCH_IA32 OR ARCH_X86_64)
|
if (ARCH_IA32 OR ARCH_X86_64)
|
||||||
set(GNUCC_ARCH x86_64_v2)
|
set(GNUCC_ARCH x86-64-v2)
|
||||||
set(TUNE_FLAG generic)
|
set(TUNE_FLAG generic)
|
||||||
elseif(ARCH_AARCH64)
|
elseif(ARCH_AARCH64)
|
||||||
if (BUILD_SVE2_BITPERM)
|
if (BUILD_SVE2_BITPERM)
|
||||||
@@ -67,8 +80,25 @@ if (USE_CPU_NATIVE)
|
|||||||
message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
|
message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
if (ARCH_IA32 OR ARCH_X86_64)
|
if (SIMDE_BACKEND)
|
||||||
set(GNUCC_ARCH native)
|
if (ARCH_IA32 OR ARCH_X86_64)
|
||||||
|
set(GNUCC_ARCH x86-64-v2)
|
||||||
|
set(TUNE_FLAG generic)
|
||||||
|
elseif(ARCH_AARCH64)
|
||||||
|
set(GNUCC_ARCH armv8-a)
|
||||||
|
set(TUNE_FLAG generic)
|
||||||
|
elseif(ARCH_ARM32)
|
||||||
|
set(GNUCC_ARCH armv7a)
|
||||||
|
set(TUNE_FLAG generic)
|
||||||
|
elseif(ARCH_PPC64EL)
|
||||||
|
set(GNUCC_ARCH power8)
|
||||||
|
set(TUNE_FLAG power8)
|
||||||
|
else()
|
||||||
|
set(GNUCC_ARCH x86-64-v2)
|
||||||
|
set(TUNE_FLAG generic)
|
||||||
|
endif()
|
||||||
|
elseif (ARCH_IA32 OR ARCH_X86_64)
|
||||||
|
set(GNUCC_ARCH ${X86_ARCH})
|
||||||
set(TUNE_FLAG generic)
|
set(TUNE_FLAG generic)
|
||||||
elseif(ARCH_AARCH64)
|
elseif(ARCH_AARCH64)
|
||||||
if (BUILD_SVE2_BITPERM)
|
if (BUILD_SVE2_BITPERM)
|
||||||
@@ -84,8 +114,11 @@ else()
|
|||||||
elseif(ARCH_ARM32)
|
elseif(ARCH_ARM32)
|
||||||
set(GNUCC_ARCH armv7a)
|
set(GNUCC_ARCH armv7a)
|
||||||
set(TUNE_FLAG generic)
|
set(TUNE_FLAG generic)
|
||||||
|
elseif(ARCH_PPC64EL)
|
||||||
|
set(GNUCC_ARCH power8)
|
||||||
|
set(TUNE_FLAG power8)
|
||||||
else()
|
else()
|
||||||
set(GNUCC_ARCH power9)
|
set(GNUCC_ARCH native)
|
||||||
set(TUNE_FLAG power9)
|
set(TUNE_FLAG native)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
@@ -15,13 +15,21 @@ SYMSFILE=$(mktemp -p /tmp ${PREFIX}_rename.syms.XXXXX)
|
|||||||
KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX)
|
KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX)
|
||||||
# find the libc used by gcc
|
# find the libc used by gcc
|
||||||
LIBC_SO=$("$@" --print-file-name=libc.so.6)
|
LIBC_SO=$("$@" --print-file-name=libc.so.6)
|
||||||
|
NM_FLAG="-f"
|
||||||
|
if [ `uname` = "FreeBSD" ]; then
|
||||||
|
# for freebsd, we will specify the name,
|
||||||
|
# we will leave it work as is in linux
|
||||||
|
LIBC_SO=/lib/libc.so.7
|
||||||
|
# also, in BSD, the nm flag -F corresponds to the -f flag in linux.
|
||||||
|
NM_FLAG="-F"
|
||||||
|
fi
|
||||||
cp ${KEEPSYMS_IN} ${KEEPSYMS}
|
cp ${KEEPSYMS_IN} ${KEEPSYMS}
|
||||||
# get all symbols from libc and turn them into patterns
|
# get all symbols from libc and turn them into patterns
|
||||||
nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS}
|
nm ${NM_FLAG} p -g -D ${LIBC_SO} | sed 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS}
|
||||||
# build the object
|
# build the object
|
||||||
"$@"
|
"$@"
|
||||||
# rename the symbols in the object
|
# rename the symbols in the object
|
||||||
nm -f p -g ${OUT} | cut -f1 -d' ' | grep -v -f ${KEEPSYMS} | sed -e "s/\(.*\)/\1\ ${PREFIX}_\1/" >> ${SYMSFILE}
|
nm ${NM_FLAG} p -g ${OUT} | cut -f1 -d' ' | grep -v -f ${KEEPSYMS} | sed -e "s/\(.*\)/\1\ ${PREFIX}_\1/" >> ${SYMSFILE}
|
||||||
if test -s ${SYMSFILE}
|
if test -s ${SYMSFILE}
|
||||||
then
|
then
|
||||||
objcopy --redefine-syms=${SYMSFILE} ${OUT}
|
objcopy --redefine-syms=${SYMSFILE} ${OUT}
|
||||||
|
|||||||
@@ -1,22 +1,13 @@
|
|||||||
# set compiler flags - more are tested and added later
|
# set compiler flags - more are tested and added later
|
||||||
set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing")
|
set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra ")
|
||||||
set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing")
|
set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra ")
|
||||||
if (NOT CMAKE_COMPILER_IS_CLANG)
|
if (NOT CMAKE_COMPILER_IS_CLANG)
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-new-ttp-matching")
|
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-new-ttp-matching")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (NOT RELEASE_BUILD)
|
# Always use -Werror *also during release builds
|
||||||
# -Werror is most useful during development, don't potentially break
|
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wall -Werror")
|
||||||
# release builds
|
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wall -Werror")
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror")
|
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror")
|
|
||||||
if (CMAKE_COMPILER_IS_CLANG)
|
|
||||||
if (CMAKE_C_COMPILER_VERSION VERSION_GREATER "13.0")
|
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-unused-but-set-variable")
|
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-but-set-variable")
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (DISABLE_ASSERTS)
|
if (DISABLE_ASSERTS)
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG")
|
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG")
|
||||||
@@ -25,28 +16,32 @@ endif()
|
|||||||
|
|
||||||
if(CMAKE_COMPILER_IS_GNUCC)
|
if(CMAKE_COMPILER_IS_GNUCC)
|
||||||
# spurious warnings?
|
# spurious warnings?
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized")
|
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds ") #-Wno-maybe-uninitialized")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized")
|
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized -Wno-uninitialized")
|
||||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
|
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0")
|
|
||||||
endif ()
|
|
||||||
# don't complain about abi
|
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi")
|
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (NOT(ARCH_IA32 AND RELEASE_BUILD))
|
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer")
|
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer")
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
|
CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
|
||||||
CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
|
CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
|
||||||
CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
|
CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
|
||||||
|
|
||||||
|
if(FREEBSD OR NETBSD)
|
||||||
|
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -gdwarf-4")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(NETBSD)
|
||||||
|
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DHAVE_BUILTIN_POPCOUNT")
|
||||||
|
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DHAVE_BUILTIN_POPCOUNT")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(MACOSX)
|
||||||
|
# Boost headers cause such complains on MacOS
|
||||||
|
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-deprecated-declarations -Wno-unused-parameter")
|
||||||
|
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-deprecated-declarations -Wno-unused-parameter")
|
||||||
|
endif()
|
||||||
|
|
||||||
# these end up in the config file
|
# these end up in the config file
|
||||||
CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN)
|
CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN)
|
||||||
CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN)
|
CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN)
|
||||||
@@ -71,94 +66,41 @@ if (NOT CMAKE_COMPILER_IS_CLANG)
|
|||||||
CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
|
CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(C_FLAGS_TO_CHECK
|
|
||||||
# Variable length arrays are way bad, most especially at run time
|
|
||||||
"-Wvla"
|
|
||||||
# Pointer arith on void pointers is doing it wrong.
|
|
||||||
"-Wpointer-arith"
|
|
||||||
# Build our C code with -Wstrict-prototypes -Wmissing-prototypes
|
|
||||||
"-Wstrict-prototypes"
|
|
||||||
"-Wmissing-prototypes"
|
|
||||||
)
|
|
||||||
foreach (FLAG ${C_FLAGS_TO_CHECK})
|
|
||||||
# munge the name so it doesn't break things
|
|
||||||
string(REPLACE "-" "_" FNAME C_FLAG${FLAG})
|
|
||||||
CHECK_C_COMPILER_FLAG("${FLAG}" ${FNAME})
|
|
||||||
if (${FNAME})
|
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} ${FLAG}")
|
|
||||||
endif()
|
|
||||||
endforeach()
|
|
||||||
|
|
||||||
# self-assign should be thrown away, but clang whinges
|
|
||||||
CHECK_C_COMPILER_FLAG("-Wself-assign" CC_SELF_ASSIGN)
|
|
||||||
if (CC_SELF_ASSIGN)
|
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-self-assign")
|
|
||||||
endif()
|
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wself-assign" CXX_SELF_ASSIGN)
|
|
||||||
if (CXX_SELF_ASSIGN)
|
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-self-assign")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# clang gets up in our face for going paren crazy with macros
|
|
||||||
CHECK_C_COMPILER_FLAG("-Wparentheses-equality" CC_PAREN_EQUALITY)
|
|
||||||
if (CC_PAREN_EQUALITY)
|
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-parentheses-equality")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# clang complains about unused const vars in our Ragel-generated code.
|
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wunused-const-variable" CXX_UNUSED_CONST_VAR)
|
|
||||||
if (CXX_UNUSED_CONST_VAR)
|
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-const-variable")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# clang-14 complains about unused-but-set variable.
|
# clang-14 complains about unused-but-set variable.
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wunused-but-set-variable" CXX_UNUSED_BUT_SET_VAR)
|
CHECK_CXX_COMPILER_FLAG("-Wunused-but-set-variable" CXX_UNUSED_BUT_SET_VAR)
|
||||||
if (CXX_UNUSED_BUT_SET_VAR)
|
if (CXX_UNUSED_BUT_SET_VAR)
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-but-set-variable")
|
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-but-set-variable")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# clang-14 complains about using bitwise operator instead of logical ones.
|
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wbitwise-instead-of-logical" CXX_BITWISE_INSTEAD_OF_LOGICAL)
|
|
||||||
if (CXX_BITWISE_INSTEAD_OF_LOGICAL)
|
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-bitwise-instead-of-logical")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# clang-14 complains about using bitwise operator instead of logical ones.
|
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wbitwise-instead-of-logical" CXX_BITWISE_INSTEAD_OF_LOGICAL)
|
|
||||||
if (CXX_BITWISE_INSTEAD_OF_LOGICAL)
|
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-bitwise-instead-of-logical")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_IGNORED_ATTR)
|
CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_IGNORED_ATTR)
|
||||||
if (CXX_IGNORED_ATTR)
|
if(CMAKE_COMPILER_IS_GNUCC)
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes")
|
if (CXX_IGNORED_ATTR)
|
||||||
|
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# gcc 9 complains about redundant move for returned variable
|
CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_NON_NULL)
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wredundant-move" CXX_REDUNDANT_MOVE)
|
if(CMAKE_COMPILER_IS_GNUCC)
|
||||||
if (CXX_REDUNDANT_MOVE)
|
if (CXX_NON_NULL)
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-redundant-move")
|
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-nonnull")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# note this for later, g++ doesn't have this flag but clang does
|
# note this for later, g++ doesn't have this flag but clang does
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wweak-vtables" CXX_WEAK_VTABLES)
|
CHECK_CXX_COMPILER_FLAG("-Wweak-vtables" CXX_WEAK_VTABLES)
|
||||||
if (CXX_WEAK_VTABLES)
|
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wweak-vtables")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wmissing-declarations" CXX_MISSING_DECLARATIONS)
|
CHECK_CXX_COMPILER_FLAG("-Wmissing-declarations" CXX_MISSING_DECLARATIONS)
|
||||||
if (CXX_MISSING_DECLARATIONS)
|
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wmissing-declarations")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wunused-local-typedefs" CXX_UNUSED_LOCAL_TYPEDEFS)
|
CHECK_CXX_COMPILER_FLAG("-Wunused-local-typedefs" CXX_UNUSED_LOCAL_TYPEDEFS)
|
||||||
|
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE)
|
CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE)
|
||||||
|
|
||||||
# gcc 10 complains about this
|
# gcc complains about this
|
||||||
CHECK_C_COMPILER_FLAG("-Wstringop-overflow" CC_STRINGOP_OVERFLOW)
|
if(CMAKE_COMPILER_IS_GNUCC)
|
||||||
CHECK_CXX_COMPILER_FLAG("-Wstringop-overflow" CXX_STRINGOP_OVERFLOW)
|
CHECK_C_COMPILER_FLAG("-Wstringop-overflow" CC_STRINGOP_OVERFLOW)
|
||||||
if(CC_STRINGOP_OVERFLOW OR CXX_STRINGOP_OVERFLOW)
|
CHECK_CXX_COMPILER_FLAG("-Wstringop-overflow" CXX_STRINGOP_OVERFLOW)
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-stringop-overflow")
|
if(CC_STRINGOP_OVERFLOW OR CXX_STRINGOP_OVERFLOW)
|
||||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-stringop-overflow")
|
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-stringop-overflow -Wno-stringop-overread")
|
||||||
|
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-stringop-overflow -Wno-stringop-overread")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
@@ -16,3 +16,12 @@ int main() {
|
|||||||
if (NOT HAVE_VSX)
|
if (NOT HAVE_VSX)
|
||||||
message(FATAL_ERROR "VSX support required for Power support")
|
message(FATAL_ERROR "VSX support required for Power support")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
# fix unit-internal seg fault for freebsd and gcc13
|
||||||
|
if (FREEBSD AND CMAKE_COMPILER_IS_GNUCXX)
|
||||||
|
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "13")
|
||||||
|
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
|
||||||
|
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libgcc -static-libstdc++")
|
||||||
|
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
|
|||||||
@@ -1,33 +1,42 @@
|
|||||||
option(BUILD_AVX512 "Enabling support for AVX512" OFF)
|
option(BUILD_AVX512 "Enabling support for AVX512" OFF)
|
||||||
option(BUILD_AVX512VBMI "Enabling support for AVX512VBMI" OFF)
|
option(BUILD_AVX512VBMI "Enabling support for AVX512VBMI" OFF)
|
||||||
|
|
||||||
set(SKYLAKE_FLAG "-march=skylake-avx512")
|
set(SKYLAKE_ARCH "skylake-avx512")
|
||||||
set(ICELAKE_FLAG "-march=icelake-server")
|
set(ICELAKE_ARCH "icelake-server")
|
||||||
|
set(SKYLAKE_FLAG "-march=${SKYLAKE_ARCH}")
|
||||||
|
set(ICELAKE_FLAG "-march=${ICELAKE_ARCH}")
|
||||||
|
|
||||||
if (NOT FAT_RUNTIME)
|
if (NOT FAT_RUNTIME)
|
||||||
if (BUILD_AVX512VBMI)
|
if (BUILD_AVX512VBMI)
|
||||||
message (STATUS "AVX512VBMI implies AVX512, enabling BUILD_AVX512")
|
message (STATUS "AVX512VBMI implies AVX512, enabling BUILD_AVX512")
|
||||||
set(BUILD_AVX512 ON)
|
set(BUILD_AVX512 ON)
|
||||||
|
set(BUILD_AVX2 ON)
|
||||||
set(ARCH_C_FLAGS "${ICELAKE_FLAG}")
|
set(ARCH_C_FLAGS "${ICELAKE_FLAG}")
|
||||||
set(ARCH_CXX_FLAGS "${ICELAKE_FLAG}")
|
set(ARCH_CXX_FLAGS "${ICELAKE_FLAG}")
|
||||||
endif ()
|
set(X86_ARCH "${ICELAKE_ARCH}")
|
||||||
if (BUILD_AVX512)
|
elseif (BUILD_AVX512)
|
||||||
message (STATUS "AVX512 implies AVX2, enabling BUILD_AVX2")
|
message (STATUS "AVX512 implies AVX2, enabling BUILD_AVX2")
|
||||||
set(BUILD_AVX2 ON)
|
set(BUILD_AVX2 ON)
|
||||||
set(ARCH_C_FLAGS "${SKYLAKE_FLAG}")
|
set(ARCH_C_FLAGS "${SKYLAKE_FLAG}")
|
||||||
set(ARCH_CXX_FLAGS "${SKYLAKE_FLAG}")
|
set(ARCH_CXX_FLAGS "${SKYLAKE_FLAG}")
|
||||||
endif ()
|
set(X86_ARCH "${SKYLAKE_ARCH}")
|
||||||
if (BUILD_AVX2)
|
elseif (BUILD_AVX2)
|
||||||
message (STATUS "Enabling BUILD_AVX2")
|
message (STATUS "Enabling BUILD_AVX2")
|
||||||
set(ARCH_C_FLAGS "-mavx2")
|
set(ARCH_C_FLAGS "-mavx2")
|
||||||
set(ARCH_CXX_FLAGS "-mavx2")
|
set(ARCH_CXX_FLAGS "-mavx2")
|
||||||
|
set(X86_ARCH "core-avx2")
|
||||||
else()
|
else()
|
||||||
set(ARCH_C_FLAGS "-msse4.2")
|
set(ARCH_C_FLAGS "-msse4.2")
|
||||||
set(ARCH_CXX_FLAGS "-msse4.2")
|
set(ARCH_CXX_FLAGS "-msse4.2")
|
||||||
|
set(X86_ARCH "x86-64-v2")
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
|
set(BUILD_AVX512VBMI ON)
|
||||||
|
set(BUILD_AVX512 ON)
|
||||||
|
set(BUILD_AVX2 ON)
|
||||||
set(ARCH_C_FLAGS "-msse4.2")
|
set(ARCH_C_FLAGS "-msse4.2")
|
||||||
set(ARCH_CXX_FLAGS "-msse4.2")
|
set(ARCH_CXX_FLAGS "-msse4.2")
|
||||||
|
set(X86_ARCH "x86-64-v2")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(CMAKE_REQUIRED_FLAGS "${ARCH_C_FLAGS}")
|
set(CMAKE_REQUIRED_FLAGS "${ARCH_C_FLAGS}")
|
||||||
@@ -129,5 +138,3 @@ else (NOT FAT_RUNTIME)
|
|||||||
message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required")
|
message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
|||||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS CLANGCXX_MINVER)
|
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS CLANGCXX_MINVER)
|
||||||
message(FATAL_ERROR "A minimum of clang++ ${CLANGCXX_MINVER} is required for C++17 support")
|
message(FATAL_ERROR "A minimum of clang++ ${CLANGCXX_MINVER} is required for C++17 support")
|
||||||
endif()
|
endif()
|
||||||
|
string (REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)$" "\\1" CLANG_MAJOR_VERSION "${CMAKE_CXX_COMPILER_VERSION}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# compiler version checks TODO: test more compilers
|
# compiler version checks TODO: test more compilers
|
||||||
|
|||||||
@@ -4,27 +4,43 @@ endif(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
|||||||
|
|
||||||
if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
|
if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
|
||||||
set(FREEBSD true)
|
set(FREEBSD true)
|
||||||
|
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
|
||||||
|
#FIXME: find a nicer and more general way of doing this
|
||||||
|
if(CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc13")
|
||||||
|
set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc13")
|
||||||
|
elseif(ARCH_AARCH64 AND (CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc12"))
|
||||||
|
set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc12")
|
||||||
|
endif()
|
||||||
endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
|
endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
|
||||||
|
|
||||||
option(FAT_RUNTIME "Build a library that supports multiple microarchitectures" OFF)
|
if(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
|
||||||
message("Checking Fat Runtime Requirements...")
|
set(NETBSD true)
|
||||||
if (FAT_RUNTIME AND NOT LINUX)
|
endif(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
|
||||||
message(FATAL_ERROR "Fat runtime is only supported on Linux OS")
|
|
||||||
|
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
|
||||||
|
set(MACOSX TRUE)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (USE_CPU_NATIVE AND FAT_RUNTIME)
|
if (ARCH_IA32 OR ARCH_X86_64)
|
||||||
message(FATAL_ERROR "Fat runtime is not compatible with Native CPU detection")
|
option(FAT_RUNTIME "Build a library that supports multiple microarchitectures" ON)
|
||||||
|
else()
|
||||||
|
option(FAT_RUNTIME "Build a library that supports multiple microarchitectures" OFF)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (FAT_RUNTIME AND LINUX)
|
if (FAT_RUNTIME)
|
||||||
|
message("Checking Fat Runtime Requirements...")
|
||||||
|
if (USE_CPU_NATIVE AND FAT_RUNTIME)
|
||||||
|
message(FATAL_ERROR "Fat runtime is not compatible with Native CPU detection")
|
||||||
|
endif()
|
||||||
|
|
||||||
if (NOT (ARCH_IA32 OR ARCH_X86_64 OR ARCH_AARCH64))
|
if (NOT (ARCH_IA32 OR ARCH_X86_64 OR ARCH_AARCH64))
|
||||||
message(FATAL_ERROR "Fat runtime is only supported on Intel and Aarch64 architectures")
|
message(FATAL_ERROR "Fat runtime is only supported on Intel and Aarch64 architectures")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Building Fat runtime for multiple microarchitectures")
|
message(STATUS "Building Fat runtime for multiple microarchitectures")
|
||||||
message(STATUS "generator is ${CMAKE_GENERATOR}")
|
message(STATUS "generator is ${CMAKE_GENERATOR}")
|
||||||
if (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR
|
if (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR
|
||||||
(CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja")))
|
(CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja")))
|
||||||
message (FATAL_ERROR "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher")
|
message (FATAL_ERROR "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher")
|
||||||
else()
|
else()
|
||||||
include (${CMAKE_MODULE_PATH}/attrib.cmake)
|
include (${CMAKE_MODULE_PATH}/attrib.cmake)
|
||||||
if (NOT HAS_C_ATTR_IFUNC)
|
if (NOT HAS_C_ATTR_IFUNC)
|
||||||
@@ -36,5 +52,3 @@ if (FAT_RUNTIME AND LINUX)
|
|||||||
message(FATAL_ERROR "Fat runtime is only built on Release builds")
|
message(FATAL_ERROR "Fat runtime is only built on Release builds")
|
||||||
endif()
|
endif()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ if (PCRE_BUILD_SOURCE)
|
|||||||
#if PCRE_MAJOR != ${PCRE_REQUIRED_MAJOR_VERSION} || PCRE_MINOR < ${PCRE_REQUIRED_MINOR_VERSION}
|
#if PCRE_MAJOR != ${PCRE_REQUIRED_MAJOR_VERSION} || PCRE_MINOR < ${PCRE_REQUIRED_MINOR_VERSION}
|
||||||
#error Incorrect pcre version
|
#error Incorrect pcre version
|
||||||
#endif
|
#endif
|
||||||
main() {}" CORRECT_PCRE_VERSION)
|
int main(void) {return 0;}" CORRECT_PCRE_VERSION)
|
||||||
set (CMAKE_REQUIRED_INCLUDES "${saved_INCLUDES}")
|
set (CMAKE_REQUIRED_INCLUDES "${saved_INCLUDES}")
|
||||||
|
|
||||||
if (NOT CORRECT_PCRE_VERSION)
|
if (NOT CORRECT_PCRE_VERSION)
|
||||||
|
|||||||
40
cmake/simde.cmake
Normal file
40
cmake/simde.cmake
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
LIST(APPEND CMAKE_REQUIRED_INCLUDES ${PROJECT_SOURCE_DIR}/simde)
|
||||||
|
|
||||||
|
CHECK_INCLUDE_FILES(simde/x86/sse4.2.h SIMDE_SSE42_H_FOUND)
|
||||||
|
|
||||||
|
if (SIMDE_SSE42_H_FOUND)
|
||||||
|
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_BACKEND")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_BACKEND")
|
||||||
|
include_directories(${PROJECT_SOURCE_DIR}/simde)
|
||||||
|
|
||||||
|
if (CMAKE_COMPILER_IS_CLANG)
|
||||||
|
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
|
||||||
|
if (ARCH_PPC64EL)
|
||||||
|
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecated-altivec-src-compat")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-altivec-src-compat")
|
||||||
|
if (CLANG_MAJOR_VERSION EQUAL 15)
|
||||||
|
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecate-lax-vec-conv-all")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecate-lax-vec-conv-all")
|
||||||
|
endif ()
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (BUILD_SSE2_SIMDE)
|
||||||
|
message("using BUILD_SSE2_SIMDE..")
|
||||||
|
set(SIMDE_NATIVE true)
|
||||||
|
set(ARCH_C_FLAGS "-msse2")
|
||||||
|
set(ARCH_CXX_FLAGS "-msse2")
|
||||||
|
set(X86_ARCH "x86-64")
|
||||||
|
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DVS_SIMDE_BACKEND")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DVS_SIMDE_BACKEND")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (SIMDE_NATIVE AND NOT BUILD_SSE2_SIMDE)
|
||||||
|
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
else()
|
||||||
|
message(FATAL_ERROR "SIMDe backend requested but SIMDe is not available on the system")
|
||||||
|
endif()
|
||||||
@@ -1,15 +1,9 @@
|
|||||||
#
|
#
|
||||||
# a lot of noise to find sqlite
|
# sqlite is only used in hsbench, no need to special case its build, depend only on OS installations using pkg-config
|
||||||
#
|
#
|
||||||
|
|
||||||
option(SQLITE_PREFER_STATIC "Build sqlite3 statically instead of using an installed lib" OFF)
|
|
||||||
|
|
||||||
if(NOT SQLITE_PREFER_STATIC)
|
|
||||||
find_package(PkgConfig QUIET)
|
|
||||||
|
|
||||||
# first check for sqlite on the system
|
# first check for sqlite on the system
|
||||||
pkg_check_modules(SQLITE3 sqlite3)
|
pkg_check_modules(SQLITE3 sqlite3)
|
||||||
endif()
|
|
||||||
|
|
||||||
# now do version checks
|
# now do version checks
|
||||||
if (SQLITE3_FOUND)
|
if (SQLITE3_FOUND)
|
||||||
@@ -17,20 +11,9 @@ if (SQLITE3_FOUND)
|
|||||||
if (SQLITE_VERSION LESS "3.8.10")
|
if (SQLITE_VERSION LESS "3.8.10")
|
||||||
message(FATAL_ERROR "sqlite3 is broken from 3.8.7 to 3.8.10 - please find a working version")
|
message(FATAL_ERROR "sqlite3 is broken from 3.8.7 to 3.8.10 - please find a working version")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
|
||||||
|
|
||||||
if (NOT SQLITE3_BUILD_SOURCE)
|
|
||||||
set(_SAVED_FLAGS ${CMAKE_REQUIRED_FLAGS})
|
|
||||||
list(INSERT CMAKE_REQUIRED_LIBRARIES 0 ${SQLITE3_LDFLAGS})
|
list(INSERT CMAKE_REQUIRED_LIBRARIES 0 ${SQLITE3_LDFLAGS})
|
||||||
CHECK_SYMBOL_EXISTS(sqlite3_open_v2 sqlite3.h HAVE_SQLITE3_OPEN_V2)
|
CHECK_SYMBOL_EXISTS(sqlite3_open_v2 sqlite3.h HAVE_SQLITE3_OPEN_V2)
|
||||||
list(REMOVE_ITEM CMAKE_REQUIRED_INCLUDES "${SQLITE3_INCLUDE_DIRS}")
|
list(REMOVE_ITEM CMAKE_REQUIRED_INCLUDES "${SQLITE3_INCLUDE_DIRS}")
|
||||||
list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES ${SQLITE3_LDFLAGS})
|
list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES ${SQLITE3_LDFLAGS})
|
||||||
else()
|
|
||||||
if (NOT TARGET sqlite3_static)
|
|
||||||
# build sqlite as a static lib to compile into our test programs
|
|
||||||
add_library(sqlite3_static STATIC "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c")
|
|
||||||
set_target_properties(sqlite3_static PROPERTIES COMPILE_FLAGS "-Wno-error -Wno-extra -Wno-unused -Wno-cast-qual -DSQLITE_OMIT_LOAD_EXTENSION")
|
|
||||||
endif()
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# that's enough about sqlite
|
|
||||||
|
|||||||
15
cppcheck-suppression-list.txt
Normal file
15
cppcheck-suppression-list.txt
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
unknownMacro:*gtest-all.cc
|
||||||
|
knownConditionTrueFalse:*Parser.rl
|
||||||
|
knownConditionTrueFalse:*Parser.cpp
|
||||||
|
variableScope:*Parser.rl
|
||||||
|
duplicateBreak:*.rl
|
||||||
|
unreadVariable:*control_verbs.cpp
|
||||||
|
unreachableCode:*rose_build_dump.cpp
|
||||||
|
*:*simde/*
|
||||||
|
assertWithSideEffect
|
||||||
|
syntaxError
|
||||||
|
internalError
|
||||||
|
checkersReport
|
||||||
|
missingInclude
|
||||||
|
missingIncludeSystem
|
||||||
|
unmatchedSuppression
|
||||||
@@ -19,6 +19,7 @@ else()
|
|||||||
set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
|
set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
|
||||||
set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
|
set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
|
||||||
set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
|
set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
|
||||||
|
set(SPHINX_MAN_DIR "${CMAKE_CURRENT_BINARY_DIR}/man")
|
||||||
|
|
||||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
|
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/conf.py" @ONLY)
|
"${CMAKE_CURRENT_BINARY_DIR}/conf.py" @ONLY)
|
||||||
@@ -32,4 +33,14 @@ add_custom_target(dev-reference
|
|||||||
"${SPHINX_HTML_DIR}"
|
"${SPHINX_HTML_DIR}"
|
||||||
DEPENDS dev-reference-doxygen
|
DEPENDS dev-reference-doxygen
|
||||||
COMMENT "Building HTML dev reference with Sphinx")
|
COMMENT "Building HTML dev reference with Sphinx")
|
||||||
|
|
||||||
|
add_custom_target(dev-reference-man
|
||||||
|
${SPHINX_BUILD}
|
||||||
|
-b man
|
||||||
|
-c "${CMAKE_CURRENT_BINARY_DIR}"
|
||||||
|
-d "${SPHINX_CACHE_DIR}"
|
||||||
|
"${CMAKE_CURRENT_SOURCE_DIR}"
|
||||||
|
"${SPHINX_MAN_DIR}"
|
||||||
|
DEPENDS dev-reference-doxygen
|
||||||
|
COMMENT "Building man page reference with Sphinx")
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
@@ -11,10 +11,10 @@ Introduction
|
|||||||
************
|
************
|
||||||
|
|
||||||
Chimera is a software regular expression matching engine that is a hybrid of
|
Chimera is a software regular expression matching engine that is a hybrid of
|
||||||
Hyperscan and PCRE. The design goals of Chimera are to fully support PCRE
|
Vectorscan and PCRE. The design goals of Chimera are to fully support PCRE
|
||||||
syntax as well as to take advantage of the high performance nature of Hyperscan.
|
syntax as well as to take advantage of the high performance nature of Vectorscan.
|
||||||
|
|
||||||
Chimera inherits the design guideline of Hyperscan with C APIs for compilation
|
Chimera inherits the design guideline of Vectorscan with C APIs for compilation
|
||||||
and scanning.
|
and scanning.
|
||||||
|
|
||||||
The Chimera API itself is composed of two major components:
|
The Chimera API itself is composed of two major components:
|
||||||
@@ -65,13 +65,13 @@ For a given database, Chimera provides several guarantees:
|
|||||||
.. note:: Chimera is designed to have the same matching behavior as PCRE,
|
.. note:: Chimera is designed to have the same matching behavior as PCRE,
|
||||||
including greedy/ungreedy, capturing, etc. Chimera reports both
|
including greedy/ungreedy, capturing, etc. Chimera reports both
|
||||||
**start offset** and **end offset** for each match like PCRE. Different
|
**start offset** and **end offset** for each match like PCRE. Different
|
||||||
from the fashion of reporting all matches in Hyperscan, Chimera only reports
|
from the fashion of reporting all matches in Vectorscan, Chimera only reports
|
||||||
non-overlapping matches. For example, the pattern :regexp:`/foofoo/` will
|
non-overlapping matches. For example, the pattern :regexp:`/foofoo/` will
|
||||||
match ``foofoofoofoo`` at offsets (0, 6) and (6, 12).
|
match ``foofoofoofoo`` at offsets (0, 6) and (6, 12).
|
||||||
|
|
||||||
.. note:: Since Chimera is a hybrid of Hyperscan and PCRE in order to support
|
.. note:: Since Chimera is a hybrid of Vectorscan and PCRE in order to support
|
||||||
full PCRE syntax, there will be extra performance overhead compared to
|
full PCRE syntax, there will be extra performance overhead compared to
|
||||||
Hyperscan-only solution. Please always use Hyperscan for better performance
|
Vectorscan-only solution. Please always use Vectorscan for better performance
|
||||||
unless you must need full PCRE syntax support.
|
unless you must need full PCRE syntax support.
|
||||||
|
|
||||||
See :ref:`chruntime` for more details
|
See :ref:`chruntime` for more details
|
||||||
@@ -83,12 +83,12 @@ Requirements
|
|||||||
The PCRE library (http://pcre.org/) version 8.41 is required for Chimera.
|
The PCRE library (http://pcre.org/) version 8.41 is required for Chimera.
|
||||||
|
|
||||||
.. note:: Since Chimera needs to reference PCRE internal function, please place PCRE source
|
.. note:: Since Chimera needs to reference PCRE internal function, please place PCRE source
|
||||||
directory under Hyperscan root directory in order to build Chimera.
|
directory under Vectorscan root directory in order to build Chimera.
|
||||||
|
|
||||||
Beside this, both hardware and software requirements of Chimera are the same to Hyperscan.
|
Beside this, both hardware and software requirements of Chimera are the same to Vectorscan.
|
||||||
See :ref:`hardware` and :ref:`software` for more details.
|
See :ref:`hardware` and :ref:`software` for more details.
|
||||||
|
|
||||||
.. note:: Building Hyperscan will automatically generate Chimera library.
|
.. note:: Building Vectorscan will automatically generate Chimera library.
|
||||||
Currently only static library is supported for Chimera, so please
|
Currently only static library is supported for Chimera, so please
|
||||||
use static build type when configure CMake build options.
|
use static build type when configure CMake build options.
|
||||||
|
|
||||||
@@ -119,7 +119,7 @@ databases:
|
|||||||
|
|
||||||
Compilation allows the Chimera library to analyze the given pattern(s) and
|
Compilation allows the Chimera library to analyze the given pattern(s) and
|
||||||
pre-determine how to scan for these patterns in an optimized fashion using
|
pre-determine how to scan for these patterns in an optimized fashion using
|
||||||
Hyperscan and PCRE.
|
Vectorscan and PCRE.
|
||||||
|
|
||||||
===============
|
===============
|
||||||
Pattern Support
|
Pattern Support
|
||||||
@@ -134,7 +134,7 @@ Semantics
|
|||||||
=========
|
=========
|
||||||
|
|
||||||
Chimera supports the exact same semantics of PCRE library. Moreover, it supports
|
Chimera supports the exact same semantics of PCRE library. Moreover, it supports
|
||||||
multiple simultaneous pattern matching like Hyperscan and the multiple matches
|
multiple simultaneous pattern matching like Vectorscan and the multiple matches
|
||||||
will be reported in order by end offset.
|
will be reported in order by end offset.
|
||||||
|
|
||||||
.. _chruntime:
|
.. _chruntime:
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ Compiling Patterns
|
|||||||
Building a Database
|
Building a Database
|
||||||
*******************
|
*******************
|
||||||
|
|
||||||
The Hyperscan compiler API accepts regular expressions and converts them into a
|
The Vectorscan compiler API accepts regular expressions and converts them into a
|
||||||
compiled pattern database that can then be used to scan data.
|
compiled pattern database that can then be used to scan data.
|
||||||
|
|
||||||
The API provides three functions that compile regular expressions into
|
The API provides three functions that compile regular expressions into
|
||||||
@@ -24,7 +24,7 @@ databases:
|
|||||||
#. :c:func:`hs_compile_ext_multi`: compiles an array of expressions as above,
|
#. :c:func:`hs_compile_ext_multi`: compiles an array of expressions as above,
|
||||||
but allows :ref:`extparam` to be specified for each expression.
|
but allows :ref:`extparam` to be specified for each expression.
|
||||||
|
|
||||||
Compilation allows the Hyperscan library to analyze the given pattern(s) and
|
Compilation allows the Vectorscan library to analyze the given pattern(s) and
|
||||||
pre-determine how to scan for these patterns in an optimized fashion that would
|
pre-determine how to scan for these patterns in an optimized fashion that would
|
||||||
be far too expensive to compute at run-time.
|
be far too expensive to compute at run-time.
|
||||||
|
|
||||||
@@ -48,10 +48,10 @@ To compile patterns to be used in streaming mode, the ``mode`` parameter of
|
|||||||
block mode requires the use of :c:member:`HS_MODE_BLOCK` and vectored mode
|
block mode requires the use of :c:member:`HS_MODE_BLOCK` and vectored mode
|
||||||
requires the use of :c:member:`HS_MODE_VECTORED`. A pattern database compiled
|
requires the use of :c:member:`HS_MODE_VECTORED`. A pattern database compiled
|
||||||
for one mode (streaming, block or vectored) can only be used in that mode. The
|
for one mode (streaming, block or vectored) can only be used in that mode. The
|
||||||
version of Hyperscan used to produce a compiled pattern database must match the
|
version of Vectorscan used to produce a compiled pattern database must match the
|
||||||
version of Hyperscan used to scan with it.
|
version of Vectorscan used to scan with it.
|
||||||
|
|
||||||
Hyperscan provides support for targeting a database at a particular CPU
|
Vectorscan provides support for targeting a database at a particular CPU
|
||||||
platform; see :ref:`instr_specialization` for details.
|
platform; see :ref:`instr_specialization` for details.
|
||||||
|
|
||||||
=====================
|
=====================
|
||||||
@@ -75,14 +75,14 @@ characters exist in regular grammar like ``[``, ``]``, ``(``, ``)``, ``{``,
|
|||||||
While in pure literal case, all these meta characters lost extra meanings
|
While in pure literal case, all these meta characters lost extra meanings
|
||||||
expect for that they are just common ASCII codes.
|
expect for that they are just common ASCII codes.
|
||||||
|
|
||||||
Hyperscan is initially designed to process common regular expressions. It is
|
Vectorscan is initially designed to process common regular expressions. It is
|
||||||
hence embedded with a complex parser to do comprehensive regular grammar
|
hence embedded with a complex parser to do comprehensive regular grammar
|
||||||
interpretation. Particularly, the identification of above meta characters is the
|
interpretation. Particularly, the identification of above meta characters is the
|
||||||
basic step for the interpretation of far more complex regular grammars.
|
basic step for the interpretation of far more complex regular grammars.
|
||||||
|
|
||||||
However in real cases, patterns may not always be regular expressions. They
|
However in real cases, patterns may not always be regular expressions. They
|
||||||
could just be pure literals. Problem will come if the pure literals contain
|
could just be pure literals. Problem will come if the pure literals contain
|
||||||
regular meta characters. Supposing fed directly into traditional Hyperscan
|
regular meta characters. Supposing fed directly into traditional Vectorscan
|
||||||
compile API, all these meta characters will be interpreted in predefined ways,
|
compile API, all these meta characters will be interpreted in predefined ways,
|
||||||
which is unnecessary and the result is totally out of expectation. To avoid
|
which is unnecessary and the result is totally out of expectation. To avoid
|
||||||
such misunderstanding by traditional API, users have to preprocess these
|
such misunderstanding by traditional API, users have to preprocess these
|
||||||
@@ -90,7 +90,7 @@ literal patterns by converting the meta characters into some other formats:
|
|||||||
either by adding a backslash ``\`` before certain meta characters, or by
|
either by adding a backslash ``\`` before certain meta characters, or by
|
||||||
converting all the characters into a hexadecimal representation.
|
converting all the characters into a hexadecimal representation.
|
||||||
|
|
||||||
In ``v5.2.0``, Hyperscan introduces 2 new compile APIs for pure literal patterns:
|
In ``v5.2.0``, Vectorscan introduces 2 new compile APIs for pure literal patterns:
|
||||||
|
|
||||||
#. :c:func:`hs_compile_lit`: compiles a single pure literal into a pattern
|
#. :c:func:`hs_compile_lit`: compiles a single pure literal into a pattern
|
||||||
database.
|
database.
|
||||||
@@ -106,7 +106,7 @@ content directly into these APIs without worrying about writing regular meta
|
|||||||
characters in their patterns. No preprocessing work is needed any more.
|
characters in their patterns. No preprocessing work is needed any more.
|
||||||
|
|
||||||
For new APIs, the ``length`` of each literal pattern is a newly added parameter.
|
For new APIs, the ``length`` of each literal pattern is a newly added parameter.
|
||||||
Hyperscan needs to locate the end position of the input expression via clearly
|
Vectorscan needs to locate the end position of the input expression via clearly
|
||||||
knowing each literal's length, not by simply identifying character ``\0`` of a
|
knowing each literal's length, not by simply identifying character ``\0`` of a
|
||||||
string.
|
string.
|
||||||
|
|
||||||
@@ -127,19 +127,19 @@ Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_SINGLEMATCH`,
|
|||||||
Pattern Support
|
Pattern Support
|
||||||
***************
|
***************
|
||||||
|
|
||||||
Hyperscan supports the pattern syntax used by the PCRE library ("libpcre"),
|
Vectorscan supports the pattern syntax used by the PCRE library ("libpcre"),
|
||||||
described at <http://www.pcre.org/>. However, not all constructs available in
|
described at <http://www.pcre.org/>. However, not all constructs available in
|
||||||
libpcre are supported. The use of unsupported constructs will result in
|
libpcre are supported. The use of unsupported constructs will result in
|
||||||
compilation errors.
|
compilation errors.
|
||||||
|
|
||||||
The version of PCRE used to validate Hyperscan's interpretation of this syntax
|
The version of PCRE used to validate Vectorscan's interpretation of this syntax
|
||||||
is 8.41 or above.
|
is 8.41 or above.
|
||||||
|
|
||||||
====================
|
====================
|
||||||
Supported Constructs
|
Supported Constructs
|
||||||
====================
|
====================
|
||||||
|
|
||||||
The following regex constructs are supported by Hyperscan:
|
The following regex constructs are supported by Vectorscan:
|
||||||
|
|
||||||
* Literal characters and strings, with all libpcre quoting and character
|
* Literal characters and strings, with all libpcre quoting and character
|
||||||
escapes.
|
escapes.
|
||||||
@@ -177,7 +177,7 @@ The following regex constructs are supported by Hyperscan:
|
|||||||
:c:member:`HS_FLAG_SINGLEMATCH` flag is on for that pattern.
|
:c:member:`HS_FLAG_SINGLEMATCH` flag is on for that pattern.
|
||||||
|
|
||||||
* Lazy modifiers (:regexp:`?` appended to another quantifier, e.g.
|
* Lazy modifiers (:regexp:`?` appended to another quantifier, e.g.
|
||||||
:regexp:`\\w+?`) are supported but ignored (as Hyperscan reports all
|
:regexp:`\\w+?`) are supported but ignored (as Vectorscan reports all
|
||||||
matches).
|
matches).
|
||||||
|
|
||||||
* Parenthesization, including the named and unnamed capturing and
|
* Parenthesization, including the named and unnamed capturing and
|
||||||
@@ -219,15 +219,15 @@ The following regex constructs are supported by Hyperscan:
|
|||||||
.. note:: At this time, not all patterns can be successfully compiled with the
|
.. note:: At this time, not all patterns can be successfully compiled with the
|
||||||
:c:member:`HS_FLAG_SOM_LEFTMOST` flag, which enables per-pattern support for
|
:c:member:`HS_FLAG_SOM_LEFTMOST` flag, which enables per-pattern support for
|
||||||
:ref:`som`. The patterns that support this flag are a subset of patterns that
|
:ref:`som`. The patterns that support this flag are a subset of patterns that
|
||||||
can be successfully compiled with Hyperscan; notably, many bounded repeat
|
can be successfully compiled with Vectorscan; notably, many bounded repeat
|
||||||
forms that can be compiled with Hyperscan without the Start of Match flag
|
forms that can be compiled with Vectorscan without the Start of Match flag
|
||||||
enabled cannot be compiled with the flag enabled.
|
enabled cannot be compiled with the flag enabled.
|
||||||
|
|
||||||
======================
|
======================
|
||||||
Unsupported Constructs
|
Unsupported Constructs
|
||||||
======================
|
======================
|
||||||
|
|
||||||
The following regex constructs are not supported by Hyperscan:
|
The following regex constructs are not supported by Vectorscan:
|
||||||
|
|
||||||
* Backreferences and capturing sub-expressions.
|
* Backreferences and capturing sub-expressions.
|
||||||
* Arbitrary zero-width assertions.
|
* Arbitrary zero-width assertions.
|
||||||
@@ -246,32 +246,32 @@ The following regex constructs are not supported by Hyperscan:
|
|||||||
Semantics
|
Semantics
|
||||||
*********
|
*********
|
||||||
|
|
||||||
While Hyperscan follows libpcre syntax, it provides different semantics. The
|
While Vectorscan follows libpcre syntax, it provides different semantics. The
|
||||||
major departures from libpcre semantics are motivated by the requirements of
|
major departures from libpcre semantics are motivated by the requirements of
|
||||||
streaming and multiple simultaneous pattern matching.
|
streaming and multiple simultaneous pattern matching.
|
||||||
|
|
||||||
The major departures from libpcre semantics are:
|
The major departures from libpcre semantics are:
|
||||||
|
|
||||||
#. **Multiple pattern matching**: Hyperscan allows matches to be reported for
|
#. **Multiple pattern matching**: Vectorscan allows matches to be reported for
|
||||||
several patterns simultaneously. This is not equivalent to separating the
|
several patterns simultaneously. This is not equivalent to separating the
|
||||||
patterns by :regexp:`|` in libpcre, which evaluates alternations
|
patterns by :regexp:`|` in libpcre, which evaluates alternations
|
||||||
left-to-right.
|
left-to-right.
|
||||||
|
|
||||||
#. **Lack of ordering**: the multiple matches that Hyperscan produces are not
|
#. **Lack of ordering**: the multiple matches that Vectorscan produces are not
|
||||||
guaranteed to be ordered, although they will always fall within the bounds of
|
guaranteed to be ordered, although they will always fall within the bounds of
|
||||||
the current scan.
|
the current scan.
|
||||||
|
|
||||||
#. **End offsets only**: Hyperscan's default behaviour is only to report the end
|
#. **End offsets only**: Vectorscan's default behaviour is only to report the end
|
||||||
offset of a match. Reporting of the start offset can be enabled with
|
offset of a match. Reporting of the start offset can be enabled with
|
||||||
per-expression flags at pattern compile time. See :ref:`som` for details.
|
per-expression flags at pattern compile time. See :ref:`som` for details.
|
||||||
|
|
||||||
#. **"All matches" reported**: scanning :regexp:`/foo.*bar/` against
|
#. **"All matches" reported**: scanning :regexp:`/foo.*bar/` against
|
||||||
``fooxyzbarbar`` will return two matches from Hyperscan -- at the points
|
``fooxyzbarbar`` will return two matches from Vectorscan -- at the points
|
||||||
corresponding to the ends of ``fooxyzbar`` and ``fooxyzbarbar``. In contrast,
|
corresponding to the ends of ``fooxyzbar`` and ``fooxyzbarbar``. In contrast,
|
||||||
libpcre semantics by default would report only one match at ``fooxyzbarbar``
|
libpcre semantics by default would report only one match at ``fooxyzbarbar``
|
||||||
(greedy semantics) or, if non-greedy semantics were switched on, one match at
|
(greedy semantics) or, if non-greedy semantics were switched on, one match at
|
||||||
``fooxyzbar``. This means that switching between greedy and non-greedy
|
``fooxyzbar``. This means that switching between greedy and non-greedy
|
||||||
semantics is a no-op in Hyperscan.
|
semantics is a no-op in Vectorscan.
|
||||||
|
|
||||||
To support libpcre quantifier semantics while accurately reporting streaming
|
To support libpcre quantifier semantics while accurately reporting streaming
|
||||||
matches at the time they occur is impossible. For example, consider the pattern
|
matches at the time they occur is impossible. For example, consider the pattern
|
||||||
@@ -299,7 +299,7 @@ as in block 3 -- which would constitute a better match for the pattern.
|
|||||||
Start of Match
|
Start of Match
|
||||||
==============
|
==============
|
||||||
|
|
||||||
In standard operation, Hyperscan will only provide the end offset of a match
|
In standard operation, Vectorscan will only provide the end offset of a match
|
||||||
when the match callback is called. If the :c:member:`HS_FLAG_SOM_LEFTMOST` flag
|
when the match callback is called. If the :c:member:`HS_FLAG_SOM_LEFTMOST` flag
|
||||||
is specified for a particular pattern, then the same set of matches is
|
is specified for a particular pattern, then the same set of matches is
|
||||||
returned, but each match will also provide the leftmost possible start offset
|
returned, but each match will also provide the leftmost possible start offset
|
||||||
@@ -308,7 +308,7 @@ corresponding to its end offset.
|
|||||||
Using the SOM flag entails a number of trade-offs and limitations:
|
Using the SOM flag entails a number of trade-offs and limitations:
|
||||||
|
|
||||||
* Reduced pattern support: For many patterns, tracking SOM is complex and can
|
* Reduced pattern support: For many patterns, tracking SOM is complex and can
|
||||||
result in Hyperscan failing to compile a pattern with a "Pattern too
|
result in Vectorscan failing to compile a pattern with a "Pattern too
|
||||||
large" error, even if the pattern is supported in normal operation.
|
large" error, even if the pattern is supported in normal operation.
|
||||||
* Increased stream state: At scan time, state space is required to track
|
* Increased stream state: At scan time, state space is required to track
|
||||||
potential SOM offsets, and this must be stored in persistent stream state in
|
potential SOM offsets, and this must be stored in persistent stream state in
|
||||||
@@ -316,20 +316,20 @@ Using the SOM flag entails a number of trade-offs and limitations:
|
|||||||
required to match a pattern.
|
required to match a pattern.
|
||||||
* Performance overhead: Similarly, there is generally a performance cost
|
* Performance overhead: Similarly, there is generally a performance cost
|
||||||
associated with tracking SOM.
|
associated with tracking SOM.
|
||||||
* Incompatible features: Some other Hyperscan pattern flags (such as
|
* Incompatible features: Some other Vectorscan pattern flags (such as
|
||||||
:c:member:`HS_FLAG_SINGLEMATCH` and :c:member:`HS_FLAG_PREFILTER`) can not be
|
:c:member:`HS_FLAG_SINGLEMATCH` and :c:member:`HS_FLAG_PREFILTER`) can not be
|
||||||
used in combination with SOM. Specifying them together with
|
used in combination with SOM. Specifying them together with
|
||||||
:c:member:`HS_FLAG_SOM_LEFTMOST` will result in a compilation error.
|
:c:member:`HS_FLAG_SOM_LEFTMOST` will result in a compilation error.
|
||||||
|
|
||||||
In streaming mode, the amount of precision delivered by SOM can be controlled
|
In streaming mode, the amount of precision delivered by SOM can be controlled
|
||||||
with the SOM horizon flags. These instruct Hyperscan to deliver accurate SOM
|
with the SOM horizon flags. These instruct Vectorscan to deliver accurate SOM
|
||||||
information within a certain distance of the end offset, and return a special
|
information within a certain distance of the end offset, and return a special
|
||||||
start offset of :c:member:`HS_OFFSET_PAST_HORIZON` otherwise. Specifying a
|
start offset of :c:member:`HS_OFFSET_PAST_HORIZON` otherwise. Specifying a
|
||||||
small or medium SOM horizon will usually reduce the stream state required for a
|
small or medium SOM horizon will usually reduce the stream state required for a
|
||||||
given database.
|
given database.
|
||||||
|
|
||||||
.. note:: In streaming mode, the start offset returned for a match may refer to
|
.. note:: In streaming mode, the start offset returned for a match may refer to
|
||||||
a point in the stream *before* the current block being scanned. Hyperscan
|
a point in the stream *before* the current block being scanned. Vectorscan
|
||||||
provides no facility for accessing earlier blocks; if the calling application
|
provides no facility for accessing earlier blocks; if the calling application
|
||||||
needs to inspect historical data, then it must store it itself.
|
needs to inspect historical data, then it must store it itself.
|
||||||
|
|
||||||
@@ -341,7 +341,7 @@ Extended Parameters
|
|||||||
|
|
||||||
In some circumstances, more control over the matching behaviour of a pattern is
|
In some circumstances, more control over the matching behaviour of a pattern is
|
||||||
required than can be specified easily using regular expression syntax. For
|
required than can be specified easily using regular expression syntax. For
|
||||||
these scenarios, Hyperscan provides the :c:func:`hs_compile_ext_multi` function
|
these scenarios, Vectorscan provides the :c:func:`hs_compile_ext_multi` function
|
||||||
that allows a set of "extended parameters" to be set on a per-pattern basis.
|
that allows a set of "extended parameters" to be set on a per-pattern basis.
|
||||||
|
|
||||||
Extended parameters are specified using an :c:type:`hs_expr_ext_t` structure,
|
Extended parameters are specified using an :c:type:`hs_expr_ext_t` structure,
|
||||||
@@ -383,18 +383,18 @@ section.
|
|||||||
Prefiltering Mode
|
Prefiltering Mode
|
||||||
=================
|
=================
|
||||||
|
|
||||||
Hyperscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
|
Vectorscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
|
||||||
be used to implement a prefilter for a pattern than Hyperscan would not
|
be used to implement a prefilter for a pattern than Vectorscan would not
|
||||||
ordinarily support.
|
ordinarily support.
|
||||||
|
|
||||||
This flag instructs Hyperscan to compile an "approximate" version of this
|
This flag instructs Vectorscan to compile an "approximate" version of this
|
||||||
pattern for use in a prefiltering application, even if Hyperscan does not
|
pattern for use in a prefiltering application, even if Vectorscan does not
|
||||||
support the pattern in normal operation.
|
support the pattern in normal operation.
|
||||||
|
|
||||||
The set of matches returned when this flag is used is guaranteed to be a
|
The set of matches returned when this flag is used is guaranteed to be a
|
||||||
superset of the matches specified by the non-prefiltering expression.
|
superset of the matches specified by the non-prefiltering expression.
|
||||||
|
|
||||||
If the pattern contains pattern constructs not supported by Hyperscan (such as
|
If the pattern contains pattern constructs not supported by Vectorscan (such as
|
||||||
zero-width assertions, back-references or conditional references) these
|
zero-width assertions, back-references or conditional references) these
|
||||||
constructs will be replaced internally with broader constructs that may match
|
constructs will be replaced internally with broader constructs that may match
|
||||||
more often.
|
more often.
|
||||||
@@ -404,7 +404,7 @@ back-reference :regexp:`\\1`. In prefiltering mode, this pattern might be
|
|||||||
approximated by having its back-reference replaced with its referent, forming
|
approximated by having its back-reference replaced with its referent, forming
|
||||||
:regexp:`/\\w+ again \\w+/`.
|
:regexp:`/\\w+ again \\w+/`.
|
||||||
|
|
||||||
Furthermore, in prefiltering mode Hyperscan may simplify a pattern that would
|
Furthermore, in prefiltering mode Vectorscan may simplify a pattern that would
|
||||||
otherwise return a "Pattern too large" error at compile time, or for performance
|
otherwise return a "Pattern too large" error at compile time, or for performance
|
||||||
reasons (subject to the matching guarantee above).
|
reasons (subject to the matching guarantee above).
|
||||||
|
|
||||||
@@ -422,22 +422,22 @@ matches for the pattern.
|
|||||||
Instruction Set Specialization
|
Instruction Set Specialization
|
||||||
******************************
|
******************************
|
||||||
|
|
||||||
Hyperscan is able to make use of several modern instruction set features found
|
Vectorscan is able to make use of several modern instruction set features found
|
||||||
on x86 processors to provide improvements in scanning performance.
|
on x86 processors to provide improvements in scanning performance.
|
||||||
|
|
||||||
Some of these features are selected when the library is built; for example,
|
Some of these features are selected when the library is built; for example,
|
||||||
Hyperscan will use the native ``POPCNT`` instruction on processors where it is
|
Vectorscan will use the native ``POPCNT`` instruction on processors where it is
|
||||||
available and the library has been optimized for the host architecture.
|
available and the library has been optimized for the host architecture.
|
||||||
|
|
||||||
.. note:: By default, the Hyperscan runtime is built with the ``-march=native``
|
.. note:: By default, the Vectorscan runtime is built with the ``-march=native``
|
||||||
compiler flag and (where possible) will make use of all instructions known by
|
compiler flag and (where possible) will make use of all instructions known by
|
||||||
the host's C compiler.
|
the host's C compiler.
|
||||||
|
|
||||||
To use some instruction set features, however, Hyperscan must build a
|
To use some instruction set features, however, Vectorscan must build a
|
||||||
specialized database to support them. This means that the target platform must
|
specialized database to support them. This means that the target platform must
|
||||||
be specified at pattern compile time.
|
be specified at pattern compile time.
|
||||||
|
|
||||||
The Hyperscan compiler API functions all accept an optional
|
The Vectorscan compiler API functions all accept an optional
|
||||||
:c:type:`hs_platform_info_t` argument, which describes the target platform
|
:c:type:`hs_platform_info_t` argument, which describes the target platform
|
||||||
for the database to be built. If this argument is NULL, the database will be
|
for the database to be built. If this argument is NULL, the database will be
|
||||||
targeted at the current host platform.
|
targeted at the current host platform.
|
||||||
@@ -467,7 +467,7 @@ See :ref:`api_constants` for the full list of CPU tuning and feature flags.
|
|||||||
Approximate matching
|
Approximate matching
|
||||||
********************
|
********************
|
||||||
|
|
||||||
Hyperscan provides an experimental approximate matching mode, which will match
|
Vectorscan provides an experimental approximate matching mode, which will match
|
||||||
patterns within a given edit distance. The exact matching behavior is defined as
|
patterns within a given edit distance. The exact matching behavior is defined as
|
||||||
follows:
|
follows:
|
||||||
|
|
||||||
@@ -492,7 +492,7 @@ follows:
|
|||||||
|
|
||||||
Here are a few examples of approximate matching:
|
Here are a few examples of approximate matching:
|
||||||
|
|
||||||
* Pattern :regexp:`/foo/` can match ``foo`` when using regular Hyperscan
|
* Pattern :regexp:`/foo/` can match ``foo`` when using regular Vectorscan
|
||||||
matching behavior. With approximate matching within edit distance 2, the
|
matching behavior. With approximate matching within edit distance 2, the
|
||||||
pattern will produce matches when scanned against ``foo``, ``foooo``, ``f00``,
|
pattern will produce matches when scanned against ``foo``, ``foooo``, ``f00``,
|
||||||
``f``, and anything else that lies within edit distance 2 of matching corpora
|
``f``, and anything else that lies within edit distance 2 of matching corpora
|
||||||
@@ -513,7 +513,7 @@ matching support. Here they are, in a nutshell:
|
|||||||
* Reduced pattern support:
|
* Reduced pattern support:
|
||||||
|
|
||||||
* For many patterns, approximate matching is complex and can result in
|
* For many patterns, approximate matching is complex and can result in
|
||||||
Hyperscan failing to compile a pattern with a "Pattern too large" error,
|
Vectorscan failing to compile a pattern with a "Pattern too large" error,
|
||||||
even if the pattern is supported in normal operation.
|
even if the pattern is supported in normal operation.
|
||||||
* Additionally, some patterns cannot be approximately matched because they
|
* Additionally, some patterns cannot be approximately matched because they
|
||||||
reduce to so-called "vacuous" patterns (patterns that match everything). For
|
reduce to so-called "vacuous" patterns (patterns that match everything). For
|
||||||
@@ -548,7 +548,7 @@ Logical Combinations
|
|||||||
********************
|
********************
|
||||||
|
|
||||||
For situations when a user requires behaviour that depends on the presence or
|
For situations when a user requires behaviour that depends on the presence or
|
||||||
absence of matches from groups of patterns, Hyperscan provides support for the
|
absence of matches from groups of patterns, Vectorscan provides support for the
|
||||||
logical combination of patterns in a given pattern set, with three operators:
|
logical combination of patterns in a given pattern set, with three operators:
|
||||||
``NOT``, ``AND`` and ``OR``.
|
``NOT``, ``AND`` and ``OR``.
|
||||||
|
|
||||||
@@ -561,7 +561,7 @@ offset is *true* if the expression it refers to is *false* at this offset.
|
|||||||
For example, ``NOT 101`` means that expression 101 has not yet matched at this
|
For example, ``NOT 101`` means that expression 101 has not yet matched at this
|
||||||
offset.
|
offset.
|
||||||
|
|
||||||
A logical combination is passed to Hyperscan at compile time as an expression.
|
A logical combination is passed to Vectorscan at compile time as an expression.
|
||||||
This combination expression will raise matches at every offset where one of its
|
This combination expression will raise matches at every offset where one of its
|
||||||
sub-expressions matches and the logical value of the whole expression is *true*.
|
sub-expressions matches and the logical value of the whole expression is *true*.
|
||||||
|
|
||||||
@@ -603,7 +603,7 @@ In a logical combination expression:
|
|||||||
* Whitespace is ignored.
|
* Whitespace is ignored.
|
||||||
|
|
||||||
To use a logical combination expression, it must be passed to one of the
|
To use a logical combination expression, it must be passed to one of the
|
||||||
Hyperscan compile functions (:c:func:`hs_compile_multi`,
|
Vectorscan compile functions (:c:func:`hs_compile_multi`,
|
||||||
:c:func:`hs_compile_ext_multi`) along with the :c:member:`HS_FLAG_COMBINATION` flag,
|
:c:func:`hs_compile_ext_multi`) along with the :c:member:`HS_FLAG_COMBINATION` flag,
|
||||||
which identifies the pattern as a logical combination expression. The patterns
|
which identifies the pattern as a logical combination expression. The patterns
|
||||||
referred to in the logical combination expression must be compiled together in
|
referred to in the logical combination expression must be compiled together in
|
||||||
@@ -613,7 +613,7 @@ When an expression has the :c:member:`HS_FLAG_COMBINATION` flag set, it ignores
|
|||||||
all other flags except the :c:member:`HS_FLAG_SINGLEMATCH` flag and the
|
all other flags except the :c:member:`HS_FLAG_SINGLEMATCH` flag and the
|
||||||
:c:member:`HS_FLAG_QUIET` flag.
|
:c:member:`HS_FLAG_QUIET` flag.
|
||||||
|
|
||||||
Hyperscan will accept logical combination expressions at compile time that
|
Vectorscan will accept logical combination expressions at compile time that
|
||||||
evaluate to *true* when no patterns have matched, and report the match for
|
evaluate to *true* when no patterns have matched, and report the match for
|
||||||
combination at end of data if no patterns have matched; for example: ::
|
combination at end of data if no patterns have matched; for example: ::
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
#
|
#
|
||||||
# Hyperscan documentation build configuration file, created by
|
# Vectorscan documentation build configuration file, created by
|
||||||
# sphinx-quickstart on Tue Sep 29 15:59:19 2015.
|
# sphinx-quickstart on Tue Sep 29 15:59:19 2015.
|
||||||
#
|
#
|
||||||
# This file is execfile()d with the current directory set to its
|
# This file is execfile()d with the current directory set to its
|
||||||
@@ -43,8 +43,8 @@ source_suffix = '.rst'
|
|||||||
master_doc = 'index'
|
master_doc = 'index'
|
||||||
|
|
||||||
# General information about the project.
|
# General information about the project.
|
||||||
project = u'Hyperscan'
|
project = u'Vectorscan'
|
||||||
copyright = u'2015-2018, Intel Corporation'
|
copyright = u'2015-2020, Intel Corporation; 2020-2024, VectorCamp; and other contributors'
|
||||||
|
|
||||||
# The version info for the project you're documenting, acts as replacement for
|
# The version info for the project you're documenting, acts as replacement for
|
||||||
# |version| and |release|, also used in various other places throughout the
|
# |version| and |release|, also used in various other places throughout the
|
||||||
@@ -202,7 +202,7 @@ latex_elements = {
|
|||||||
# (source start file, target name, title,
|
# (source start file, target name, title,
|
||||||
# author, documentclass [howto, manual, or own class]).
|
# author, documentclass [howto, manual, or own class]).
|
||||||
latex_documents = [
|
latex_documents = [
|
||||||
('index', 'Hyperscan.tex', u'Hyperscan Documentation',
|
('index', 'Hyperscan.tex', u'Vectorscan Documentation',
|
||||||
u'Intel Corporation', 'manual'),
|
u'Intel Corporation', 'manual'),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -232,8 +232,8 @@ latex_documents = [
|
|||||||
# One entry per manual page. List of tuples
|
# One entry per manual page. List of tuples
|
||||||
# (source start file, name, description, authors, manual section).
|
# (source start file, name, description, authors, manual section).
|
||||||
man_pages = [
|
man_pages = [
|
||||||
('index', 'hyperscan', u'Hyperscan Documentation',
|
('index', 'vectorscan', u'Vectorscan Documentation',
|
||||||
[u'Intel Corporation'], 1)
|
[u'Intel Corporation'], 7)
|
||||||
]
|
]
|
||||||
|
|
||||||
# If true, show URL addresses after external links.
|
# If true, show URL addresses after external links.
|
||||||
@@ -246,8 +246,8 @@ man_pages = [
|
|||||||
# (source start file, target name, title, author,
|
# (source start file, target name, title, author,
|
||||||
# dir menu entry, description, category)
|
# dir menu entry, description, category)
|
||||||
texinfo_documents = [
|
texinfo_documents = [
|
||||||
('index', 'Hyperscan', u'Hyperscan Documentation',
|
('index', 'Vectorscan', u'Vectorscan Documentation',
|
||||||
u'Intel Corporation', 'Hyperscan', 'High-performance regular expression matcher.',
|
u'Intel Corporation; VectorCamp', 'Vectorscan', 'High-performance regular expression matcher.',
|
||||||
'Miscellaneous'),
|
'Miscellaneous'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -7,43 +7,41 @@ Getting Started
|
|||||||
Very Quick Start
|
Very Quick Start
|
||||||
****************
|
****************
|
||||||
|
|
||||||
#. Clone Hyperscan ::
|
#. Clone Vectorscan ::
|
||||||
|
|
||||||
cd <where-you-want-hyperscan-source>
|
cd <where-you-want-vectorscan-source>
|
||||||
git clone git://github.com/intel/hyperscan
|
git clone https://github.com/VectorCamp/vectorscan
|
||||||
|
|
||||||
#. Configure Hyperscan
|
#. Configure Vectorscan
|
||||||
|
|
||||||
Ensure that you have the correct :ref:`dependencies <software>` present,
|
Ensure that you have the correct :ref:`dependencies <software>` present,
|
||||||
and then:
|
and then:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
cd <where-you-want-to-build-hyperscan>
|
cd <where-you-want-to-build-vectorscan>
|
||||||
mkdir <build-dir>
|
mkdir <build-dir>
|
||||||
cd <build-dir>
|
cd <build-dir>
|
||||||
cmake [-G <generator>] [options] <hyperscan-source-path>
|
cmake [-G <generator>] [options] <vectorscan-source-path>
|
||||||
|
|
||||||
Known working generators:
|
Known working generators:
|
||||||
* ``Unix Makefiles`` --- make-compatible makefiles (default on Linux/FreeBSD/Mac OS X)
|
* ``Unix Makefiles`` --- make-compatible makefiles (default on Linux/FreeBSD/Mac OS X)
|
||||||
* ``Ninja`` --- `Ninja <http://martine.github.io/ninja/>`_ build files.
|
* ``Ninja`` --- `Ninja <http://martine.github.io/ninja/>`_ build files.
|
||||||
* ``Visual Studio 15 2017`` --- Visual Studio projects
|
|
||||||
|
|
||||||
Generators that might work include:
|
Unsupported generators that might work include:
|
||||||
* ``Xcode`` --- OS X Xcode projects.
|
* ``Xcode`` --- OS X Xcode projects.
|
||||||
|
|
||||||
#. Build Hyperscan
|
#. Build Vectorscan
|
||||||
|
|
||||||
Depending on the generator used:
|
Depending on the generator used:
|
||||||
* ``cmake --build .`` --- will build everything
|
* ``cmake --build .`` --- will build everything
|
||||||
* ``make -j<jobs>`` --- use makefiles in parallel
|
* ``make -j<jobs>`` --- use makefiles in parallel
|
||||||
* ``ninja`` --- use Ninja build
|
* ``ninja`` --- use Ninja build
|
||||||
* ``MsBuild.exe`` --- use Visual Studio MsBuild
|
|
||||||
* etc.
|
* etc.
|
||||||
|
|
||||||
#. Check Hyperscan
|
#. Check Vectorscan
|
||||||
|
|
||||||
Run the Hyperscan unit tests: ::
|
Run the Vectorscan unit tests: ::
|
||||||
|
|
||||||
bin/unit-hyperscan
|
bin/unit-hyperscan
|
||||||
|
|
||||||
@@ -55,20 +53,23 @@ Requirements
|
|||||||
Hardware
|
Hardware
|
||||||
========
|
========
|
||||||
|
|
||||||
Hyperscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
|
Vectorscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
|
||||||
32-bit (IA-32 Architecture) modes.
|
32-bit (IA-32 Architecture) modes as well as Arm v8.0+ aarch64, and POWER 8+ ppc64le
|
||||||
|
machines.
|
||||||
|
|
||||||
Hyperscan is a high performance software library that takes advantage of recent
|
Hyperscan is a high performance software library that takes advantage of recent
|
||||||
Intel architecture advances. At a minimum, support for Supplemental Streaming
|
architecture advances.
|
||||||
SIMD Extensions 3 (SSSE3) is required, which should be available on any modern
|
|
||||||
x86 processor.
|
|
||||||
|
|
||||||
Additionally, Hyperscan can make use of:
|
Additionally, Vectorscan can make use of:
|
||||||
|
|
||||||
* Intel Streaming SIMD Extensions 4.2 (SSE4.2)
|
* Intel Streaming SIMD Extensions 4.2 (SSE4.2)
|
||||||
* the POPCNT instruction
|
* the POPCNT instruction
|
||||||
* Bit Manipulation Instructions (BMI, BMI2)
|
* Bit Manipulation Instructions (BMI, BMI2)
|
||||||
* Intel Advanced Vector Extensions 2 (Intel AVX2)
|
* Intel Advanced Vector Extensions 2 (Intel AVX2)
|
||||||
|
* Arm NEON
|
||||||
|
* Arm SVE and SVE2
|
||||||
|
* Arm SVE2 BITPERM
|
||||||
|
* IBM Power8/Power9 VSX
|
||||||
|
|
||||||
if present.
|
if present.
|
||||||
|
|
||||||
@@ -79,40 +80,34 @@ These can be determined at library compile time, see :ref:`target_arch`.
|
|||||||
Software
|
Software
|
||||||
========
|
========
|
||||||
|
|
||||||
As a software library, Hyperscan doesn't impose any particular runtime
|
As a software library, Vectorscan doesn't impose any particular runtime
|
||||||
software requirements, however to build the Hyperscan library we require a
|
software requirements, however to build the Vectorscan library we require a
|
||||||
modern C and C++ compiler -- in particular, Hyperscan requires C99 and C++11
|
modern C and C++ compiler -- in particular, Vectorscan requires C99 and C++17
|
||||||
compiler support. The supported compilers are:
|
compiler support. The supported compilers are:
|
||||||
|
|
||||||
* GCC, v4.8.1 or higher
|
* GCC, v9 or higher
|
||||||
* Clang, v3.4 or higher (with libstdc++ or libc++)
|
* Clang, v5 or higher (with libstdc++ or libc++)
|
||||||
* Intel C++ Compiler v15 or higher
|
|
||||||
* Visual C++ 2017 Build Tools
|
|
||||||
|
|
||||||
Examples of operating systems that Hyperscan is known to work on include:
|
Examples of operating systems that Vectorscan is known to work on include:
|
||||||
|
|
||||||
Linux:
|
Linux:
|
||||||
|
|
||||||
* Ubuntu 14.04 LTS or newer
|
* Ubuntu 20.04 LTS or newer
|
||||||
* RedHat/CentOS 7 or newer
|
* RedHat/CentOS 7 or newer
|
||||||
|
* Fedora 38 or newer
|
||||||
|
* Debian 10
|
||||||
|
|
||||||
FreeBSD:
|
FreeBSD:
|
||||||
|
|
||||||
* 10.0 or newer
|
* 10.0 or newer
|
||||||
|
|
||||||
Windows:
|
|
||||||
|
|
||||||
* 8 or newer
|
|
||||||
|
|
||||||
Mac OS X:
|
Mac OS X:
|
||||||
|
|
||||||
* 10.8 or newer, using XCode/Clang
|
* 10.8 or newer, using XCode/Clang
|
||||||
|
|
||||||
Hyperscan *may* compile and run on other platforms, but there is no guarantee.
|
Vectorscan *may* compile and run on other platforms, but there is no guarantee.
|
||||||
We currently have experimental support for Windows using Intel C++ Compiler
|
|
||||||
or Visual Studio 2017.
|
|
||||||
|
|
||||||
In addition, the following software is required for compiling the Hyperscan library:
|
In addition, the following software is required for compiling the Vectorscan library:
|
||||||
|
|
||||||
======================================================= =========== ======================================
|
======================================================= =========== ======================================
|
||||||
Dependency Version Notes
|
Dependency Version Notes
|
||||||
@@ -132,20 +127,20 @@ Ragel, you may use Cygwin to build it from source.
|
|||||||
Boost Headers
|
Boost Headers
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
Compiling Hyperscan depends on a recent version of the Boost C++ header
|
Compiling Vectorscan depends on a recent version of the Boost C++ header
|
||||||
library. If the Boost libraries are installed on the build machine in the
|
library. If the Boost libraries are installed on the build machine in the
|
||||||
usual paths, CMake will find them. If the Boost libraries are not installed,
|
usual paths, CMake will find them. If the Boost libraries are not installed,
|
||||||
the location of the Boost source tree can be specified during the CMake
|
the location of the Boost source tree can be specified during the CMake
|
||||||
configuration step using the ``BOOST_ROOT`` variable (described below).
|
configuration step using the ``BOOST_ROOT`` variable (described below).
|
||||||
|
|
||||||
Another alternative is to put a copy of (or a symlink to) the boost
|
Another alternative is to put a copy of (or a symlink to) the boost
|
||||||
subdirectory in ``<hyperscan-source-path>/include/boost``.
|
subdirectory in ``<vectorscan-source-path>/include/boost``.
|
||||||
|
|
||||||
For example: for the Boost-1.59.0 release: ::
|
For example: for the Boost-1.59.0 release: ::
|
||||||
|
|
||||||
ln -s boost_1_59_0/boost <hyperscan-source-path>/include/boost
|
ln -s boost_1_59_0/boost <vectorscan-source-path>/include/boost
|
||||||
|
|
||||||
As Hyperscan uses the header-only parts of Boost, it is not necessary to
|
As Vectorscan uses the header-only parts of Boost, it is not necessary to
|
||||||
compile the Boost libraries.
|
compile the Boost libraries.
|
||||||
|
|
||||||
CMake Configuration
|
CMake Configuration
|
||||||
@@ -168,11 +163,12 @@ Common options for CMake include:
|
|||||||
| | Valid options are Debug, Release, RelWithDebInfo, |
|
| | Valid options are Debug, Release, RelWithDebInfo, |
|
||||||
| | and MinSizeRel. Default is RelWithDebInfo. |
|
| | and MinSizeRel. Default is RelWithDebInfo. |
|
||||||
+------------------------+----------------------------------------------------+
|
+------------------------+----------------------------------------------------+
|
||||||
| BUILD_SHARED_LIBS | Build Hyperscan as a shared library instead of |
|
| BUILD_SHARED_LIBS | Build Vectorscan as a shared library instead of |
|
||||||
| | the default static library. |
|
| | the default static library. |
|
||||||
|
| | Default: Off |
|
||||||
+------------------------+----------------------------------------------------+
|
+------------------------+----------------------------------------------------+
|
||||||
| BUILD_STATIC_AND_SHARED| Build both static and shared Hyperscan libs. |
|
| BUILD_STATIC_LIBS | Build Vectorscan as a static library. |
|
||||||
| | Default off. |
|
| | Default: On |
|
||||||
+------------------------+----------------------------------------------------+
|
+------------------------+----------------------------------------------------+
|
||||||
| BOOST_ROOT | Location of Boost source tree. |
|
| BOOST_ROOT | Location of Boost source tree. |
|
||||||
+------------------------+----------------------------------------------------+
|
+------------------------+----------------------------------------------------+
|
||||||
@@ -180,12 +176,64 @@ Common options for CMake include:
|
|||||||
+------------------------+----------------------------------------------------+
|
+------------------------+----------------------------------------------------+
|
||||||
| FAT_RUNTIME | Build the :ref:`fat runtime<fat_runtime>`. Default |
|
| FAT_RUNTIME | Build the :ref:`fat runtime<fat_runtime>`. Default |
|
||||||
| | true on Linux, not available elsewhere. |
|
| | true on Linux, not available elsewhere. |
|
||||||
|
| | Default: Off |
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
| USE_CPU_NATIVE | Native CPU detection is off by default, however it |
|
||||||
|
| | is possible to build a performance-oriented non-fat|
|
||||||
|
| | library tuned to your CPU. |
|
||||||
|
| | Default: Off |
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
| SANITIZE               | Use a sanitizer to detect possible bugs.           |
|
||||||
|
| | Valid options are address, memory and undefined. |
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
| SIMDE_BACKEND | Enable SIMDe backend. If this is chosen all native |
|
||||||
|
| | (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be |
|
||||||
|
| | disabled and a SIMDe SSE4.2 emulation backend will |
|
||||||
|
| | be enabled. This will enable Vectorscan to build |
|
||||||
|
| | and run on architectures without SIMD. |
|
||||||
|
| | Default: Off |
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
| SIMDE_NATIVE | Enable SIMDe native emulation of x86 SSE4.2 |
|
||||||
|
| | intrinsics on the building platform. That is, |
|
||||||
|
| | SSE4.2 intrinsics will be emulated using Neon on |
|
||||||
|
| | an Arm platform, or VSX on a Power platform, etc. |
|
||||||
|
| | Default: Off |
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
|
||||||
|
X86 platform specific options include:
|
||||||
|
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
| Variable | Description |
|
||||||
|
+========================+====================================================+
|
||||||
|
| BUILD_AVX2 | Enable code for AVX2. |
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
| BUILD_AVX512 | Enable code for AVX512. Implies BUILD_AVX2. |
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
| BUILD_AVX512VBMI | Enable code for AVX512 with VBMI extension. Implies|
|
||||||
|
| | BUILD_AVX512. |
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
|
||||||
|
Arm platform specific options include:
|
||||||
|
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
| Variable | Description |
|
||||||
|
+========================+====================================================+
|
||||||
|
| BUILD_SVE | Enable code for SVE, like on AWS Graviton3 CPUs. |
|
||||||
|
|                        | Not much code is ported just for SVE, but enabling |
|
||||||
|
|                        | SVE code production does improve code generation,  |
|
||||||
|
| | see Benchmarks. |
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
| BUILD_SVE2 | Enable code for SVE2, implies BUILD_SVE. Most |
|
||||||
|
| | non-Neon code is written for SVE2. |
|
||||||
|
+------------------------+----------------------------------------------------+
|
||||||
|
| BUILD_SVE2_BITPERM     | Enable code for SVE2_BITPERM hardware feature,     |
|
||||||
|
| | implies BUILD_SVE2. |
|
||||||
+------------------------+----------------------------------------------------+
|
+------------------------+----------------------------------------------------+
|
||||||
|
|
||||||
For example, to generate a ``Debug`` build: ::
|
For example, to generate a ``Debug`` build: ::
|
||||||
|
|
||||||
cd <build-dir>
|
cd <build-dir>
|
||||||
cmake -DCMAKE_BUILD_TYPE=Debug <hyperscan-source-path>
|
cmake -DCMAKE_BUILD_TYPE=Debug <vectorscan-source-path>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -193,7 +241,7 @@ Build Type
|
|||||||
----------
|
----------
|
||||||
|
|
||||||
CMake determines a number of features for a build based on the Build Type.
|
CMake determines a number of features for a build based on the Build Type.
|
||||||
Hyperscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
|
Vectorscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
|
||||||
information". This is a performance optimized build without runtime assertions
|
information". This is a performance optimized build without runtime assertions
|
||||||
but with debug symbols enabled.
|
but with debug symbols enabled.
|
||||||
|
|
||||||
@@ -201,7 +249,7 @@ The other types of builds are:
|
|||||||
|
|
||||||
* ``Release``: as above, but without debug symbols
|
* ``Release``: as above, but without debug symbols
|
||||||
* ``MinSizeRel``: a stripped release build
|
* ``MinSizeRel``: a stripped release build
|
||||||
* ``Debug``: used when developing Hyperscan. Includes runtime assertions
|
* ``Debug``: used when developing Vectorscan. Includes runtime assertions
|
||||||
(which has a large impact on runtime performance), and will also enable
|
(which has a large impact on runtime performance), and will also enable
|
||||||
some other build features like building internal unit
|
some other build features like building internal unit
|
||||||
tests.
|
tests.
|
||||||
@@ -211,7 +259,7 @@ The other types of builds are:
|
|||||||
Target Architecture
|
Target Architecture
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
Unless using the :ref:`fat runtime<fat_runtime>`, by default Hyperscan will be
|
Unless using the :ref:`fat runtime<fat_runtime>`, by default Vectorscan will be
|
||||||
compiled to target the instruction set of the processor of the machine that
|
compiled to target the instruction set of the processor of the machine that
|
||||||
being used for compilation. This is done via the use of ``-march=native``. The
|
being used for compilation. This is done via the use of ``-march=native``. The
|
||||||
result of this means that a library built on one machine may not work on a
|
result of this means that a library built on one machine may not work on a
|
||||||
@@ -223,7 +271,7 @@ CMake, or ``CMAKE_C_FLAGS`` and ``CMAKE_CXX_FLAGS`` on the CMake command line. F
|
|||||||
example, to set the instruction subsets up to ``SSE4.2`` using GCC 4.8: ::
|
example, to set the instruction subsets up to ``SSE4.2`` using GCC 4.8: ::
|
||||||
|
|
||||||
cmake -DCMAKE_C_FLAGS="-march=corei7" \
|
cmake -DCMAKE_C_FLAGS="-march=corei7" \
|
||||||
-DCMAKE_CXX_FLAGS="-march=corei7" <hyperscan-source-path>
|
-DCMAKE_CXX_FLAGS="-march=corei7" <vectorscan-source-path>
|
||||||
|
|
||||||
For more information, refer to :ref:`instr_specialization`.
|
For more information, refer to :ref:`instr_specialization`.
|
||||||
|
|
||||||
@@ -232,17 +280,17 @@ For more information, refer to :ref:`instr_specialization`.
|
|||||||
Fat Runtime
|
Fat Runtime
|
||||||
-----------
|
-----------
|
||||||
|
|
||||||
A feature introduced in Hyperscan v4.4 is the ability for the Hyperscan
|
A feature introduced in Hyperscan v4.4 is the ability for the Vectorscan
|
||||||
library to dispatch the most appropriate runtime code for the host processor.
|
library to dispatch the most appropriate runtime code for the host processor.
|
||||||
This feature is called the "fat runtime", as a single Hyperscan library
|
This feature is called the "fat runtime", as a single Vectorscan library
|
||||||
contains multiple copies of the runtime code for different instruction sets.
|
contains multiple copies of the runtime code for different instruction sets.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
The fat runtime feature is only available on Linux. Release builds of
|
The fat runtime feature is only available on Linux. Release builds of
|
||||||
Hyperscan will default to having the fat runtime enabled where supported.
|
Vectorscan will default to having the fat runtime enabled where supported.
|
||||||
|
|
||||||
When building the library with the fat runtime, the Hyperscan runtime code
|
When building the library with the fat runtime, the Vectorscan runtime code
|
||||||
will be compiled multiple times for these different instruction sets, and
|
will be compiled multiple times for these different instruction sets, and
|
||||||
these compiled objects are combined into one library. There are no changes to
|
these compiled objects are combined into one library. There are no changes to
|
||||||
how user applications are built against this library.
|
how user applications are built against this library.
|
||||||
@@ -254,11 +302,11 @@ resolved so that the right version of each API function is used. There is no
|
|||||||
impact on function call performance, as this check and resolution is performed
|
impact on function call performance, as this check and resolution is performed
|
||||||
by the ELF loader once when the binary is loaded.
|
by the ELF loader once when the binary is loaded.
|
||||||
|
|
||||||
If the Hyperscan library is used on x86 systems without ``SSSE3``, the runtime
|
If the Vectorscan library is used on x86 systems without ``SSE4.2``, the runtime
|
||||||
API functions will resolve to functions that return :c:member:`HS_ARCH_ERROR`
|
API functions will resolve to functions that return :c:member:`HS_ARCH_ERROR`
|
||||||
instead of potentially executing illegal instructions. The API function
|
instead of potentially executing illegal instructions. The API function
|
||||||
:c:func:`hs_valid_platform` can be used by application writers to determine if
|
:c:func:`hs_valid_platform` can be used by application writers to determine if
|
||||||
the current platform is supported by Hyperscan.
|
the current platform is supported by Vectorscan.
|
||||||
|
|
||||||
As of this release, the variants of the runtime that are built, and the CPU
|
As of this release, the variants of the runtime that are built, and the CPU
|
||||||
capability that is required, are the following:
|
capability that is required, are the following:
|
||||||
@@ -299,6 +347,11 @@ capability that is required, are the following:
|
|||||||
|
|
||||||
cmake -DBUILD_AVX512VBMI=on <...>
|
cmake -DBUILD_AVX512VBMI=on <...>
|
||||||
|
|
||||||
|
Vectorscan adds support for Arm processors and SVE, SVE2 and SVE2_BITPERM.
|
||||||
|
example: ::
|
||||||
|
|
||||||
|
cmake -DBUILD_SVE=ON -DBUILD_SVE2=ON -DBUILD_SVE2_BITPERM=ON <...>
|
||||||
|
|
||||||
As the fat runtime requires compiler, libc, and binutils support, at this time
|
As the fat runtime requires compiler, libc, and binutils support, at this time
|
||||||
it will only be enabled for Linux builds where the compiler supports the
|
it will only be enabled for Linux builds where the compiler supports the
|
||||||
`indirect function "ifunc" function attribute
|
`indirect function "ifunc" function attribute
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
###############################################
|
###############################################
|
||||||
Hyperscan |version| Developer's Reference Guide
|
Vectorscan |version| Developer's Reference Guide
|
||||||
###############################################
|
###############################################
|
||||||
|
|
||||||
-------
|
-------
|
||||||
|
|||||||
@@ -5,11 +5,11 @@
|
|||||||
Introduction
|
Introduction
|
||||||
############
|
############
|
||||||
|
|
||||||
Hyperscan is a software regular expression matching engine designed with
|
Vectorscan is a software regular expression matching engine designed with
|
||||||
high performance and flexibility in mind. It is implemented as a library that
|
high performance and flexibility in mind. It is implemented as a library that
|
||||||
exposes a straightforward C API.
|
exposes a straightforward C API.
|
||||||
|
|
||||||
The Hyperscan API itself is composed of two major components:
|
The Vectorscan API itself is composed of two major components:
|
||||||
|
|
||||||
***********
|
***********
|
||||||
Compilation
|
Compilation
|
||||||
@@ -17,7 +17,7 @@ Compilation
|
|||||||
|
|
||||||
These functions take a group of regular expressions, along with identifiers and
|
These functions take a group of regular expressions, along with identifiers and
|
||||||
option flags, and compile them into an immutable database that can be used by
|
option flags, and compile them into an immutable database that can be used by
|
||||||
the Hyperscan scanning API. This compilation process performs considerable
|
the Vectorscan scanning API. This compilation process performs considerable
|
||||||
analysis and optimization work in order to build a database that will match the
|
analysis and optimization work in order to build a database that will match the
|
||||||
given expressions efficiently.
|
given expressions efficiently.
|
||||||
|
|
||||||
@@ -36,8 +36,8 @@ See :ref:`compilation` for more detail.
|
|||||||
Scanning
|
Scanning
|
||||||
********
|
********
|
||||||
|
|
||||||
Once a Hyperscan database has been created, it can be used to scan data in
|
Once a Vectorscan database has been created, it can be used to scan data in
|
||||||
memory. Hyperscan provides several scanning modes, depending on whether the
|
memory. Vectorscan provides several scanning modes, depending on whether the
|
||||||
data to be scanned is available as a single contiguous block, whether it is
|
data to be scanned is available as a single contiguous block, whether it is
|
||||||
distributed amongst several blocks in memory at the same time, or whether it is
|
distributed amongst several blocks in memory at the same time, or whether it is
|
||||||
to be scanned as a sequence of blocks in a stream.
|
to be scanned as a sequence of blocks in a stream.
|
||||||
@@ -45,7 +45,7 @@ to be scanned as a sequence of blocks in a stream.
|
|||||||
Matches are delivered to the application via a user-supplied callback function
|
Matches are delivered to the application via a user-supplied callback function
|
||||||
that is called synchronously for each match.
|
that is called synchronously for each match.
|
||||||
|
|
||||||
For a given database, Hyperscan provides several guarantees:
|
For a given database, Vectorscan provides several guarantees:
|
||||||
|
|
||||||
* No memory allocations occur at runtime with the exception of two
|
* No memory allocations occur at runtime with the exception of two
|
||||||
fixed-size allocations, both of which should be done ahead of time for
|
fixed-size allocations, both of which should be done ahead of time for
|
||||||
@@ -56,7 +56,7 @@ For a given database, Hyperscan provides several guarantees:
|
|||||||
call.
|
call.
|
||||||
- **Stream state**: in streaming mode only, some state space is required to
|
- **Stream state**: in streaming mode only, some state space is required to
|
||||||
store data that persists between scan calls for each stream. This allows
|
store data that persists between scan calls for each stream. This allows
|
||||||
Hyperscan to track matches that span multiple blocks of data.
|
Vectorscan to track matches that span multiple blocks of data.
|
||||||
|
|
||||||
* The sizes of the scratch space and stream state (in streaming mode) required
|
* The sizes of the scratch space and stream state (in streaming mode) required
|
||||||
for a given database are fixed and determined at database compile time. This
|
for a given database are fixed and determined at database compile time. This
|
||||||
@@ -64,7 +64,7 @@ For a given database, Hyperscan provides several guarantees:
|
|||||||
time, and these structures can be pre-allocated if required for performance
|
time, and these structures can be pre-allocated if required for performance
|
||||||
reasons.
|
reasons.
|
||||||
|
|
||||||
* Any pattern that has successfully been compiled by the Hyperscan compiler can
|
* Any pattern that has successfully been compiled by the Vectorscan compiler can
|
||||||
be scanned against any input. There are no internal resource limits or other
|
be scanned against any input. There are no internal resource limits or other
|
||||||
limitations at runtime that could cause a scan call to return an error.
|
limitations at runtime that could cause a scan call to return an error.
|
||||||
|
|
||||||
@@ -74,12 +74,12 @@ See :ref:`runtime` for more detail.
|
|||||||
Tools
|
Tools
|
||||||
*****
|
*****
|
||||||
|
|
||||||
Some utilities for testing and benchmarking Hyperscan are included with the
|
Some utilities for testing and benchmarking Vectorscan are included with the
|
||||||
library. See :ref:`tools` for more information.
|
library. See :ref:`tools` for more information.
|
||||||
|
|
||||||
************
|
************
|
||||||
Example Code
|
Example Code
|
||||||
************
|
************
|
||||||
|
|
||||||
Some simple example code demonstrating the use of the Hyperscan API is
|
Some simple example code demonstrating the use of the Vectorscan API is
|
||||||
available in the ``examples/`` subdirectory of the Hyperscan distribution.
|
available in the ``examples/`` subdirectory of the Vectorscan distribution.
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
Performance Considerations
|
Performance Considerations
|
||||||
##########################
|
##########################
|
||||||
|
|
||||||
Hyperscan supports a wide range of patterns in all three scanning modes. It is
|
Vectorscan supports a wide range of patterns in all three scanning modes. It is
|
||||||
capable of extremely high levels of performance, but certain patterns can
|
capable of extremely high levels of performance, but certain patterns can
|
||||||
reduce performance markedly.
|
reduce performance markedly.
|
||||||
|
|
||||||
@@ -25,7 +25,7 @@ For example, caseless matching of :regexp:`/abc/` can be written as:
|
|||||||
* :regexp:`/(?i)abc(?-i)/`
|
* :regexp:`/(?i)abc(?-i)/`
|
||||||
* :regexp:`/abc/i`
|
* :regexp:`/abc/i`
|
||||||
|
|
||||||
Hyperscan is capable of handling all these constructs. Unless there is a
|
Vectorscan is capable of handling all these constructs. Unless there is a
|
||||||
specific reason otherwise, do not rewrite patterns from one form to another.
|
specific reason otherwise, do not rewrite patterns from one form to another.
|
||||||
|
|
||||||
As another example, matching of :regexp:`/foo(bar|baz)(frotz)?/` can be
|
As another example, matching of :regexp:`/foo(bar|baz)(frotz)?/` can be
|
||||||
@@ -41,24 +41,24 @@ Library usage
|
|||||||
|
|
||||||
.. tip:: Do not hand-optimize library usage.
|
.. tip:: Do not hand-optimize library usage.
|
||||||
|
|
||||||
The Hyperscan library is capable of dealing with small writes, unusually large
|
The Vectorscan library is capable of dealing with small writes, unusually large
|
||||||
and small pattern sets, etc. Unless there is a specific performance problem
|
and small pattern sets, etc. Unless there is a specific performance problem
|
||||||
with some usage of the library, it is best to use Hyperscan in a simple and
|
with some usage of the library, it is best to use Vectorscan in a simple and
|
||||||
direct fashion. For example, it is unlikely for there to be much benefit in
|
direct fashion. For example, it is unlikely for there to be much benefit in
|
||||||
buffering input to the library into larger blocks unless streaming writes are
|
buffering input to the library into larger blocks unless streaming writes are
|
||||||
tiny (say, 1-2 bytes at a time).
|
tiny (say, 1-2 bytes at a time).
|
||||||
|
|
||||||
Unlike many other pattern matching products, Hyperscan will run faster with
|
Unlike many other pattern matching products, Vectorscan will run faster with
|
||||||
small numbers of patterns and slower with large numbers of patterns in a smooth
|
small numbers of patterns and slower with large numbers of patterns in a smooth
|
||||||
fashion (as opposed to, typically, running at a moderate speed up to some fixed
|
fashion (as opposed to, typically, running at a moderate speed up to some fixed
|
||||||
limit then either breaking or running half as fast).
|
limit then either breaking or running half as fast).
|
||||||
|
|
||||||
Hyperscan also provides high-throughput matching with a single thread of
|
Vectorscan also provides high-throughput matching with a single thread of
|
||||||
control per core; if a database runs at 3.0 Gbps in Hyperscan it means that a
|
control per core; if a database runs at 3.0 Gbps in Vectorscan it means that a
|
||||||
3000-bit block of data will be scanned in 1 microsecond in a single thread of
|
3000-bit block of data will be scanned in 1 microsecond in a single thread of
|
||||||
control, not that it is required to scan 22 3000-bit blocks of data in 22
|
control, not that it is required to scan 22 3000-bit blocks of data in 22
|
||||||
microseconds. Thus, it is not usually necessary to buffer data to supply
|
microseconds. Thus, it is not usually necessary to buffer data to supply
|
||||||
Hyperscan with available parallelism.
|
Vectorscan with available parallelism.
|
||||||
|
|
||||||
********************
|
********************
|
||||||
Block-based matching
|
Block-based matching
|
||||||
@@ -72,7 +72,7 @@ accumulated before processing, it should be scanned in block rather than in
|
|||||||
streaming mode.
|
streaming mode.
|
||||||
|
|
||||||
Unnecessary use of streaming mode reduces the number of optimizations that can
|
Unnecessary use of streaming mode reduces the number of optimizations that can
|
||||||
be applied in Hyperscan and may make some patterns run slower.
|
be applied in Vectorscan and may make some patterns run slower.
|
||||||
|
|
||||||
If there is a mixture of 'block' and 'streaming' mode patterns, these should be
|
If there is a mixture of 'block' and 'streaming' mode patterns, these should be
|
||||||
scanned in separate databases except in the case that the streaming patterns
|
scanned in separate databases except in the case that the streaming patterns
|
||||||
@@ -107,7 +107,7 @@ Allocate scratch ahead of time
|
|||||||
|
|
||||||
Scratch allocation is not necessarily a cheap operation. Since it is the first
|
Scratch allocation is not necessarily a cheap operation. Since it is the first
|
||||||
time (after compilation or deserialization) that a pattern database is used,
|
time (after compilation or deserialization) that a pattern database is used,
|
||||||
Hyperscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
|
Vectorscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
|
||||||
must also allocate memory.
|
must also allocate memory.
|
||||||
|
|
||||||
Therefore, it is important to ensure that :c:func:`hs_alloc_scratch` is not
|
Therefore, it is important to ensure that :c:func:`hs_alloc_scratch` is not
|
||||||
@@ -329,7 +329,7 @@ Consequently, :regexp:`/foo.*bar/L` with a check on start of match values after
|
|||||||
the callback is considerably more expensive and general than
|
the callback is considerably more expensive and general than
|
||||||
:regexp:`/foo.{300}bar/`.
|
:regexp:`/foo.{300}bar/`.
|
||||||
|
|
||||||
Similarly, the :c:member:`hs_expr_ext::min_length` extended parameter can be
|
Similarly, the :cpp:member:`hs_expr_ext::min_length` extended parameter can be
|
||||||
used to specify a lower bound on the length of the matches for a pattern. Using
|
used to specify a lower bound on the length of the matches for a pattern. Using
|
||||||
this facility may be more lightweight in some circumstances than using the SOM
|
this facility may be more lightweight in some circumstances than using the SOM
|
||||||
flag and post-confirming match length in the calling application.
|
flag and post-confirming match length in the calling application.
|
||||||
|
|||||||
@@ -6,35 +6,35 @@ Preface
|
|||||||
Overview
|
Overview
|
||||||
********
|
********
|
||||||
|
|
||||||
Hyperscan is a regular expression engine designed to offer high performance, the
|
Vectorscan is a regular expression engine designed to offer high performance, the
|
||||||
ability to match multiple expressions simultaneously and flexibility in
|
ability to match multiple expressions simultaneously and flexibility in
|
||||||
scanning operation.
|
scanning operation.
|
||||||
|
|
||||||
Patterns are provided to a compilation interface which generates an immutable
|
Patterns are provided to a compilation interface which generates an immutable
|
||||||
pattern database. The scan interface then can be used to scan a target data
|
pattern database. The scan interface then can be used to scan a target data
|
||||||
buffer for the given patterns, returning any matching results from that data
|
buffer for the given patterns, returning any matching results from that data
|
||||||
buffer. Hyperscan also provides a streaming mode, in which matches that span
|
buffer. Vectorscan also provides a streaming mode, in which matches that span
|
||||||
several blocks in a stream are detected.
|
several blocks in a stream are detected.
|
||||||
|
|
||||||
This document is designed to facilitate code-level integration of the Hyperscan
|
This document is designed to facilitate code-level integration of the Vectorscan
|
||||||
library with existing or new applications.
|
library with existing or new applications.
|
||||||
|
|
||||||
:ref:`intro` is a short overview of the Hyperscan library, with more detail on
|
:ref:`intro` is a short overview of the Vectorscan library, with more detail on
|
||||||
the Hyperscan API provided in the subsequent sections: :ref:`compilation` and
|
the Vectorscan API provided in the subsequent sections: :ref:`compilation` and
|
||||||
:ref:`runtime`.
|
:ref:`runtime`.
|
||||||
|
|
||||||
:ref:`perf` provides details on various factors which may impact the
|
:ref:`perf` provides details on various factors which may impact the
|
||||||
performance of a Hyperscan integration.
|
performance of a Vectorscan integration.
|
||||||
|
|
||||||
:ref:`api_constants` and :ref:`api_files` provides a detailed summary of the
|
:ref:`api_constants` and :ref:`api_files` provides a detailed summary of the
|
||||||
Hyperscan Application Programming Interface (API).
|
Vectorscan Application Programming Interface (API).
|
||||||
|
|
||||||
********
|
********
|
||||||
Audience
|
Audience
|
||||||
********
|
********
|
||||||
|
|
||||||
This guide is aimed at developers interested in integrating Hyperscan into an
|
This guide is aimed at developers interested in integrating Vectorscan into an
|
||||||
application. For information on building the Hyperscan library, see the Quick
|
application. For information on building the Vectorscan library, see the Quick
|
||||||
Start Guide.
|
Start Guide.
|
||||||
|
|
||||||
***********
|
***********
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
Scanning for Patterns
|
Scanning for Patterns
|
||||||
#####################
|
#####################
|
||||||
|
|
||||||
Hyperscan provides three different scanning modes, each with its own scan
|
Vectorscan provides three different scanning modes, each with its own scan
|
||||||
function beginning with ``hs_scan``. In addition, streaming mode has a number
|
function beginning with ``hs_scan``. In addition, streaming mode has a number
|
||||||
of other API functions for managing stream state.
|
of other API functions for managing stream state.
|
||||||
|
|
||||||
@@ -33,8 +33,8 @@ See :c:type:`match_event_handler` for more information.
|
|||||||
Streaming Mode
|
Streaming Mode
|
||||||
**************
|
**************
|
||||||
|
|
||||||
The core of the Hyperscan streaming runtime API consists of functions to open,
|
The core of the Vectorscan streaming runtime API consists of functions to open,
|
||||||
scan, and close Hyperscan data streams:
|
scan, and close Vectorscan data streams:
|
||||||
|
|
||||||
* :c:func:`hs_open_stream`: allocates and initializes a new stream for scanning.
|
* :c:func:`hs_open_stream`: allocates and initializes a new stream for scanning.
|
||||||
|
|
||||||
@@ -57,14 +57,14 @@ will return immediately with :c:member:`HS_SCAN_TERMINATED`. The caller must
|
|||||||
still call :c:func:`hs_close_stream` to complete the clean-up process for that
|
still call :c:func:`hs_close_stream` to complete the clean-up process for that
|
||||||
stream.
|
stream.
|
||||||
|
|
||||||
Streams exist in the Hyperscan library so that pattern matching state can be
|
Streams exist in the Vectorscan library so that pattern matching state can be
|
||||||
maintained across multiple blocks of target data -- without maintaining this
|
maintained across multiple blocks of target data -- without maintaining this
|
||||||
state, it would not be possible to detect patterns that span these blocks of
|
state, it would not be possible to detect patterns that span these blocks of
|
||||||
data. This, however, does come at the cost of requiring an amount of storage
|
data. This, however, does come at the cost of requiring an amount of storage
|
||||||
per-stream (the size of this storage is fixed at compile time), and a slight
|
per-stream (the size of this storage is fixed at compile time), and a slight
|
||||||
performance penalty in some cases to manage the state.
|
performance penalty in some cases to manage the state.
|
||||||
|
|
||||||
While Hyperscan does always support a strict ordering of multiple matches,
|
While Vectorscan does always support a strict ordering of multiple matches,
|
||||||
streaming matches will not be delivered at offsets before the current stream
|
streaming matches will not be delivered at offsets before the current stream
|
||||||
write, with the exception of zero-width asserts, where constructs such as
|
write, with the exception of zero-width asserts, where constructs such as
|
||||||
:regexp:`\\b` and :regexp:`$` can cause a match on the final character of a
|
:regexp:`\\b` and :regexp:`$` can cause a match on the final character of a
|
||||||
@@ -76,7 +76,7 @@ Stream Management
|
|||||||
=================
|
=================
|
||||||
|
|
||||||
In addition to :c:func:`hs_open_stream`, :c:func:`hs_scan_stream`, and
|
In addition to :c:func:`hs_open_stream`, :c:func:`hs_scan_stream`, and
|
||||||
:c:func:`hs_close_stream`, the Hyperscan API provides a number of other
|
:c:func:`hs_close_stream`, the Vectorscan API provides a number of other
|
||||||
functions for the management of streams:
|
functions for the management of streams:
|
||||||
|
|
||||||
* :c:func:`hs_reset_stream`: resets a stream to its initial state; this is
|
* :c:func:`hs_reset_stream`: resets a stream to its initial state; this is
|
||||||
@@ -98,10 +98,10 @@ A stream object is allocated as a fixed size region of memory which has been
|
|||||||
sized to ensure that no memory allocations are required during scan
|
sized to ensure that no memory allocations are required during scan
|
||||||
operations. When the system is under memory pressure, it may be useful to reduce
|
operations. When the system is under memory pressure, it may be useful to reduce
|
||||||
the memory consumed by streams that are not expected to be used soon. The
|
the memory consumed by streams that are not expected to be used soon. The
|
||||||
Hyperscan API provides calls for translating a stream to and from a compressed
|
Vectorscan API provides calls for translating a stream to and from a compressed
|
||||||
representation for this purpose. The compressed representation differs from the
|
representation for this purpose. The compressed representation differs from the
|
||||||
full stream object as it does not reserve space for components which are not
|
full stream object as it does not reserve space for components which are not
|
||||||
required given the current stream state. The Hyperscan API functions for this
|
required given the current stream state. The Vectorscan API functions for this
|
||||||
functionality are:
|
functionality are:
|
||||||
|
|
||||||
* :c:func:`hs_compress_stream`: fills the provided buffer with a compressed
|
* :c:func:`hs_compress_stream`: fills the provided buffer with a compressed
|
||||||
@@ -157,7 +157,7 @@ scanned in block mode.
|
|||||||
Scratch Space
|
Scratch Space
|
||||||
*************
|
*************
|
||||||
|
|
||||||
While scanning data, Hyperscan needs a small amount of temporary memory to store
|
While scanning data, Vectorscan needs a small amount of temporary memory to store
|
||||||
on-the-fly internal data. This amount is unfortunately too large to fit on the
|
on-the-fly internal data. This amount is unfortunately too large to fit on the
|
||||||
stack, particularly for embedded applications, and allocating memory dynamically
|
stack, particularly for embedded applications, and allocating memory dynamically
|
||||||
is too expensive, so a pre-allocated "scratch" space must be provided to the
|
is too expensive, so a pre-allocated "scratch" space must be provided to the
|
||||||
@@ -170,7 +170,7 @@ databases, only a single scratch region is necessary: in this case, calling
|
|||||||
will ensure that the scratch space is large enough to support scanning against
|
will ensure that the scratch space is large enough to support scanning against
|
||||||
any of the given databases.
|
any of the given databases.
|
||||||
|
|
||||||
While the Hyperscan library is re-entrant, the use of scratch spaces is not.
|
While the Vectorscan library is re-entrant, the use of scratch spaces is not.
|
||||||
For example, if by design it is deemed necessary to run recursive or nested
|
For example, if by design it is deemed necessary to run recursive or nested
|
||||||
scanning (say, from the match callback function), then an additional scratch
|
scanning (say, from the match callback function), then an additional scratch
|
||||||
space is required for that context.
|
space is required for that context.
|
||||||
@@ -219,11 +219,11 @@ For example:
|
|||||||
Custom Allocators
|
Custom Allocators
|
||||||
*****************
|
*****************
|
||||||
|
|
||||||
By default, structures used by Hyperscan at runtime (scratch space, stream
|
By default, structures used by Vectorscan at runtime (scratch space, stream
|
||||||
state, etc) are allocated with the default system allocators, usually
|
state, etc) are allocated with the default system allocators, usually
|
||||||
``malloc()`` and ``free()``.
|
``malloc()`` and ``free()``.
|
||||||
|
|
||||||
The Hyperscan API provides a facility for changing this behaviour to support
|
The Vectorscan API provides a facility for changing this behaviour to support
|
||||||
applications that use custom memory allocators.
|
applications that use custom memory allocators.
|
||||||
|
|
||||||
These functions are:
|
These functions are:
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
Serialization
|
Serialization
|
||||||
#############
|
#############
|
||||||
|
|
||||||
For some applications, compiling Hyperscan pattern databases immediately prior
|
For some applications, compiling Vectorscan pattern databases immediately prior
|
||||||
to use is not an appropriate design. Some users may wish to:
|
to use is not an appropriate design. Some users may wish to:
|
||||||
|
|
||||||
* Compile pattern databases on a different host;
|
* Compile pattern databases on a different host;
|
||||||
@@ -14,9 +14,9 @@ to use is not an appropriate design. Some users may wish to:
|
|||||||
|
|
||||||
* Control the region of memory in which the compiled database is located.
|
* Control the region of memory in which the compiled database is located.
|
||||||
|
|
||||||
Hyperscan pattern databases are not completely flat in memory: they contain
|
Vectorscan pattern databases are not completely flat in memory: they contain
|
||||||
pointers and have specific alignment requirements. Therefore, they cannot be
|
pointers and have specific alignment requirements. Therefore, they cannot be
|
||||||
copied (or otherwise relocated) directly. To enable these use cases, Hyperscan
|
copied (or otherwise relocated) directly. To enable these use cases, Vectorscan
|
||||||
provides functionality for serializing and deserializing compiled pattern
|
provides functionality for serializing and deserializing compiled pattern
|
||||||
databases.
|
databases.
|
||||||
|
|
||||||
@@ -40,10 +40,10 @@ The API provides the following functions:
|
|||||||
returns a string containing information about the database. This call is
|
returns a string containing information about the database. This call is
|
||||||
analogous to :c:func:`hs_database_info`.
|
analogous to :c:func:`hs_database_info`.
|
||||||
|
|
||||||
.. note:: Hyperscan performs both version and platform compatibility checks
|
.. note:: Vectorscan performs both version and platform compatibility checks
|
||||||
upon deserialization. The :c:func:`hs_deserialize_database` and
|
upon deserialization. The :c:func:`hs_deserialize_database` and
|
||||||
:c:func:`hs_deserialize_database_at` functions will only permit the
|
:c:func:`hs_deserialize_database_at` functions will only permit the
|
||||||
deserialization of databases compiled with (a) the same version of Hyperscan
|
deserialization of databases compiled with (a) the same version of Vectorscan
|
||||||
and (b) platform features supported by the current host platform. See
|
and (b) platform features supported by the current host platform. See
|
||||||
:ref:`instr_specialization` for more information on platform specialization.
|
:ref:`instr_specialization` for more information on platform specialization.
|
||||||
|
|
||||||
@@ -51,17 +51,17 @@ The API provides the following functions:
|
|||||||
The Runtime Library
|
The Runtime Library
|
||||||
===================
|
===================
|
||||||
|
|
||||||
The main Hyperscan library (``libhs``) contains both the compiler and runtime
|
The main Vectorscan library (``libhs``) contains both the compiler and runtime
|
||||||
portions of the library. This means that in order to support the Hyperscan
|
portions of the library. This means that in order to support the Vectorscan
|
||||||
compiler, which is written in C++, it requires C++ linkage and has a
|
compiler, which is written in C++, it requires C++ linkage and has a
|
||||||
dependency on the C++ standard library.
|
dependency on the C++ standard library.
|
||||||
|
|
||||||
Many embedded applications require only the scanning ("runtime") portion of the
|
Many embedded applications require only the scanning ("runtime") portion of the
|
||||||
Hyperscan library. In these cases, pattern compilation generally takes place on
|
Vectorscan library. In these cases, pattern compilation generally takes place on
|
||||||
another host, and serialized pattern databases are delivered to the application
|
another host, and serialized pattern databases are delivered to the application
|
||||||
for use.
|
for use.
|
||||||
|
|
||||||
To support these applications without requiring the C++ dependency, a
|
To support these applications without requiring the C++ dependency, a
|
||||||
runtime-only version of the Hyperscan library, called ``libhs_runtime``, is also
|
runtime-only version of the Vectorscan library, called ``libhs_runtime``, is also
|
||||||
distributed. This library does not depend on the C++ standard library and
|
distributed. This library does not depend on the C++ standard library and
|
||||||
provides all Hyperscan functions other that those used to compile databases.
|
provides all Vectorscan functions other than those used to compile databases.
|
||||||
|
|||||||
@@ -4,14 +4,14 @@
|
|||||||
Tools
|
Tools
|
||||||
#####
|
#####
|
||||||
|
|
||||||
This section describes the set of utilities included with the Hyperscan library.
|
This section describes the set of utilities included with the Vectorscan library.
|
||||||
|
|
||||||
********************
|
********************
|
||||||
Quick Check: hscheck
|
Quick Check: hscheck
|
||||||
********************
|
********************
|
||||||
|
|
||||||
The ``hscheck`` tool allows the user to quickly check whether Hyperscan supports
|
The ``hscheck`` tool allows the user to quickly check whether Vectorscan supports
|
||||||
a group of patterns. If a pattern is rejected by Hyperscan's compiler, the
|
a group of patterns. If a pattern is rejected by Vectorscan's compiler, the
|
||||||
compile error is provided on standard output.
|
compile error is provided on standard output.
|
||||||
|
|
||||||
For example, given the following three patterns (the last of which contains a
|
For example, given the following three patterns (the last of which contains a
|
||||||
@@ -34,7 +34,7 @@ syntax error) in a file called ``/tmp/test``::
|
|||||||
Benchmarker: hsbench
|
Benchmarker: hsbench
|
||||||
********************
|
********************
|
||||||
|
|
||||||
The ``hsbench`` tool provides an easy way to measure Hyperscan's performance
|
The ``hsbench`` tool provides an easy way to measure Vectorscan's performance
|
||||||
for a particular set of patterns and corpus of data to be scanned.
|
for a particular set of patterns and corpus of data to be scanned.
|
||||||
|
|
||||||
Patterns are supplied in the format described below in
|
Patterns are supplied in the format described below in
|
||||||
@@ -44,7 +44,7 @@ easy control of how a corpus is broken into blocks and streams.
|
|||||||
|
|
||||||
.. note:: A group of Python scripts for constructing corpora databases from
|
.. note:: A group of Python scripts for constructing corpora databases from
|
||||||
various input types, such as PCAP network traffic captures or text files, can
|
various input types, such as PCAP network traffic captures or text files, can
|
||||||
be found in the Hyperscan source tree in ``tools/hsbench/scripts``.
|
be found in the Vectorscan source tree in ``tools/hsbench/scripts``.
|
||||||
|
|
||||||
Running hsbench
|
Running hsbench
|
||||||
===============
|
===============
|
||||||
@@ -56,7 +56,7 @@ produce output like this::
|
|||||||
$ hsbench -e /tmp/patterns -c /tmp/corpus.db
|
$ hsbench -e /tmp/patterns -c /tmp/corpus.db
|
||||||
|
|
||||||
Signatures: /tmp/patterns
|
Signatures: /tmp/patterns
|
||||||
Hyperscan info: Version: 4.3.1 Features: AVX2 Mode: STREAM
|
Vectorscan info: Version: 5.4.11 Features: AVX2 Mode: STREAM
|
||||||
Expression count: 200
|
Expression count: 200
|
||||||
Bytecode size: 342,540 bytes
|
Bytecode size: 342,540 bytes
|
||||||
Database CRC: 0x6cd6b67c
|
Database CRC: 0x6cd6b67c
|
||||||
@@ -77,7 +77,7 @@ takes to perform all twenty scans. The number of repeats can be changed with the
|
|||||||
``-n`` argument, and the results of each scan will be displayed if the
|
``-n`` argument, and the results of each scan will be displayed if the
|
||||||
``--per-scan`` argument is specified.
|
``--per-scan`` argument is specified.
|
||||||
|
|
||||||
To benchmark Hyperscan on more than one core, you can supply a list of cores
|
To benchmark Vectorscan on more than one core, you can supply a list of cores
|
||||||
with the ``-T`` argument, which will instruct ``hsbench`` to start one
|
with the ``-T`` argument, which will instruct ``hsbench`` to start one
|
||||||
benchmark thread per core given and compute the throughput from the time taken
|
benchmark thread per core given and compute the throughput from the time taken
|
||||||
to complete all of them.
|
to complete all of them.
|
||||||
@@ -91,17 +91,17 @@ Correctness Testing: hscollider
|
|||||||
*******************************
|
*******************************
|
||||||
|
|
||||||
The ``hscollider`` tool, or Pattern Collider, provides a way to verify
|
The ``hscollider`` tool, or Pattern Collider, provides a way to verify
|
||||||
Hyperscan's matching behaviour. It does this by compiling and scanning patterns
|
Vectorscan's matching behaviour. It does this by compiling and scanning patterns
|
||||||
(either singly or in groups) against known corpora and comparing the results
|
(either singly or in groups) against known corpora and comparing the results
|
||||||
against another engine (the "ground truth"). Two sources of ground truth for
|
against another engine (the "ground truth"). Two sources of ground truth for
|
||||||
comparison are available:
|
comparison are available:
|
||||||
|
|
||||||
* The PCRE library (http://pcre.org/).
|
* The PCRE library (http://pcre.org/).
|
||||||
* An NFA simulation run on Hyperscan's compile-time graph representation. This
|
* An NFA simulation run on Vectorscan's compile-time graph representation. This
|
||||||
is used if PCRE cannot support the pattern or if PCRE execution fails due to
|
is used if PCRE cannot support the pattern or if PCRE execution fails due to
|
||||||
a resource limit.
|
a resource limit.
|
||||||
|
|
||||||
Much of Hyperscan's testing infrastructure is built on ``hscollider``, and the
|
Much of Vectorscan's testing infrastructure is built on ``hscollider``, and the
|
||||||
tool is designed to take advantage of multiple cores and provide considerable
|
tool is designed to take advantage of multiple cores and provide considerable
|
||||||
flexibility in controlling the test. These options are described in the help
|
flexibility in controlling the test. These options are described in the help
|
||||||
(``hscollider -h``) and include:
|
(``hscollider -h``) and include:
|
||||||
@@ -116,11 +116,11 @@ flexibility in controlling the test. These options are described in the help
|
|||||||
Using hscollider to debug a pattern
|
Using hscollider to debug a pattern
|
||||||
===================================
|
===================================
|
||||||
|
|
||||||
One common use-case for ``hscollider`` is to determine whether Hyperscan will
|
One common use-case for ``hscollider`` is to determine whether Vectorscan will
|
||||||
match a pattern in the expected location, and whether this accords with PCRE's
|
match a pattern in the expected location, and whether this accords with PCRE's
|
||||||
behaviour for the same case.
|
behaviour for the same case.
|
||||||
|
|
||||||
Here is an example. We put our pattern in a file in Hyperscan's pattern
|
Here is an example. We put our pattern in a file in Vectorscan's pattern
|
||||||
format::
|
format::
|
||||||
|
|
||||||
$ cat /tmp/pat
|
$ cat /tmp/pat
|
||||||
@@ -172,7 +172,7 @@ individual matches are displayed in the output::
|
|||||||
|
|
||||||
Total elapsed time: 0.00522815 secs.
|
Total elapsed time: 0.00522815 secs.
|
||||||
|
|
||||||
We can see from this output that both PCRE and Hyperscan find matches ending at
|
We can see from this output that both PCRE and Vectorscan find matches ending at
|
||||||
offset 33 and 45, and so ``hscollider`` considers this test case to have
|
offset 33 and 45, and so ``hscollider`` considers this test case to have
|
||||||
passed.
|
passed.
|
||||||
|
|
||||||
@@ -180,13 +180,13 @@ passed.
|
|||||||
corpus alignment 0, and ``-T 1`` instructs us to only use one thread.)
|
corpus alignment 0, and ``-T 1`` instructs us to only use one thread.)
|
||||||
|
|
||||||
.. note:: In default operation, PCRE produces only one match for a scan, unlike
|
.. note:: In default operation, PCRE produces only one match for a scan, unlike
|
||||||
Hyperscan's automata semantics. The ``hscollider`` tool uses libpcre's
|
Vectorscan's automata semantics. The ``hscollider`` tool uses libpcre's
|
||||||
"callout" functionality to match Hyperscan's semantics.
|
"callout" functionality to match Vectorscan's semantics.
|
||||||
|
|
||||||
Running a larger scan test
|
Running a larger scan test
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
A set of patterns for testing purposes are distributed with Hyperscan, and these
|
A set of patterns for testing purposes are distributed with Vectorscan, and these
|
||||||
can be tested via ``hscollider`` on an in-tree build. Two CMake targets are
|
can be tested via ``hscollider`` on an in-tree build. Two CMake targets are
|
||||||
provided to do this easily:
|
provided to do this easily:
|
||||||
|
|
||||||
@@ -202,10 +202,10 @@ Debugging: hsdump
|
|||||||
*****************
|
*****************
|
||||||
|
|
||||||
When built in debug mode (using the CMake directive ``CMAKE_BUILD_TYPE`` set to
|
When built in debug mode (using the CMake directive ``CMAKE_BUILD_TYPE`` set to
|
||||||
``Debug``), Hyperscan includes support for dumping information about its
|
``Debug``), Vectorscan includes support for dumping information about its
|
||||||
internals during pattern compilation with the ``hsdump`` tool.
|
internals during pattern compilation with the ``hsdump`` tool.
|
||||||
|
|
||||||
This information is mostly of use to Hyperscan developers familiar with the
|
This information is mostly of use to Vectorscan developers familiar with the
|
||||||
library's internal structure, but can be used to diagnose issues with patterns
|
library's internal structure, but can be used to diagnose issues with patterns
|
||||||
and provide more information in bug reports.
|
and provide more information in bug reports.
|
||||||
|
|
||||||
@@ -215,7 +215,7 @@ and provide more information in bug reports.
|
|||||||
Pattern Format
|
Pattern Format
|
||||||
**************
|
**************
|
||||||
|
|
||||||
All of the Hyperscan tools accept patterns in the same format, read from plain
|
All of the Vectorscan tools accept patterns in the same format, read from plain
|
||||||
text files with one pattern per line. Each line looks like this:
|
text files with one pattern per line. Each line looks like this:
|
||||||
|
|
||||||
* ``<integer id>:/<regex>/<flags>``
|
* ``<integer id>:/<regex>/<flags>``
|
||||||
@@ -227,12 +227,12 @@ For example::
|
|||||||
3:/^.{10,20}hatstand/m
|
3:/^.{10,20}hatstand/m
|
||||||
|
|
||||||
The integer ID is the value that will be reported when a match is found by
|
The integer ID is the value that will be reported when a match is found by
|
||||||
Hyperscan and must be unique.
|
Vectorscan and must be unique.
|
||||||
|
|
||||||
The pattern itself is a regular expression in PCRE syntax; see
|
The pattern itself is a regular expression in PCRE syntax; see
|
||||||
:ref:`compilation` for more information on supported features.
|
:ref:`compilation` for more information on supported features.
|
||||||
|
|
||||||
The flags are single characters that map to Hyperscan flags as follows:
|
The flags are single characters that map to Vectorscan flags as follows:
|
||||||
|
|
||||||
========= ================================= ===========
|
========= ================================= ===========
|
||||||
Character API Flag Description
|
Character API Flag Description
|
||||||
@@ -256,7 +256,7 @@ between braces, separated by commas. For example::
|
|||||||
|
|
||||||
1:/hatstand.*teakettle/s{min_offset=50,max_offset=100}
|
1:/hatstand.*teakettle/s{min_offset=50,max_offset=100}
|
||||||
|
|
||||||
All Hyperscan tools will accept a pattern file (or a directory containing
|
All Vectorscan tools will accept a pattern file (or a directory containing
|
||||||
pattern files) with the ``-e`` argument. If no further arguments constraining
|
pattern files) with the ``-e`` argument. If no further arguments constraining
|
||||||
the pattern set are given, all patterns in those files are used.
|
the pattern set are given, all patterns in those files are used.
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2015-2017, Intel Corporation
|
* Copyright (c) 2015-2017, Intel Corporation
|
||||||
|
* Copyright (c) 2024, VectorCamp PC
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@@ -134,7 +135,12 @@
|
|||||||
#include <netinet/tcp.h>
|
#include <netinet/tcp.h>
|
||||||
#include <netinet/udp.h>
|
#include <netinet/udp.h>
|
||||||
#include <netinet/ip_icmp.h>
|
#include <netinet/ip_icmp.h>
|
||||||
|
#ifdef __NetBSD__
|
||||||
|
#include <net/ethertypes.h>
|
||||||
|
#include <net/if_ether.h>
|
||||||
|
#else
|
||||||
#include <net/ethernet.h>
|
#include <net/ethernet.h>
|
||||||
|
#endif /* __NetBSD__ */
|
||||||
#include <arpa/inet.h>
|
#include <arpa/inet.h>
|
||||||
|
|
||||||
#include <pcap.h>
|
#include <pcap.h>
|
||||||
@@ -196,15 +202,15 @@ struct FiveTuple {
|
|||||||
unsigned int dstPort;
|
unsigned int dstPort;
|
||||||
|
|
||||||
// Construct a FiveTuple from a TCP or UDP packet.
|
// Construct a FiveTuple from a TCP or UDP packet.
|
||||||
FiveTuple(const struct ip *iphdr) {
|
explicit FiveTuple(const struct ip *iphdr) {
|
||||||
// IP fields
|
// IP fields
|
||||||
protocol = iphdr->ip_p;
|
protocol = iphdr->ip_p;
|
||||||
srcAddr = iphdr->ip_src.s_addr;
|
srcAddr = iphdr->ip_src.s_addr;
|
||||||
dstAddr = iphdr->ip_dst.s_addr;
|
dstAddr = iphdr->ip_dst.s_addr;
|
||||||
|
|
||||||
// UDP/TCP ports
|
// UDP/TCP ports
|
||||||
const struct udphdr *uh = (const struct udphdr *)
|
const struct udphdr *uh = reinterpret_cast<const struct udphdr *>
|
||||||
(((const char *)iphdr) + (iphdr->ip_hl * 4));
|
((reinterpret_cast<const char *>(iphdr)) + (iphdr->ip_hl * 4));
|
||||||
srcPort = uh->uh_sport;
|
srcPort = uh->uh_sport;
|
||||||
dstPort = uh->uh_dport;
|
dstPort = uh->uh_dport;
|
||||||
}
|
}
|
||||||
@@ -233,7 +239,7 @@ static
|
|||||||
int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
|
int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
|
||||||
unsigned int flags, void *ctx) {
|
unsigned int flags, void *ctx) {
|
||||||
// Our context points to a size_t storing the match count
|
// Our context points to a size_t storing the match count
|
||||||
size_t *matches = (size_t *)ctx;
|
size_t *matches = static_cast<size_t *>(ctx);
|
||||||
(*matches)++;
|
(*matches)++;
|
||||||
return 0; // continue matching
|
return 0; // continue matching
|
||||||
}
|
}
|
||||||
@@ -295,7 +301,7 @@ public:
|
|||||||
// database.
|
// database.
|
||||||
hs_error_t err = hs_alloc_scratch(db, &scratch);
|
hs_error_t err = hs_alloc_scratch(db, &scratch);
|
||||||
if (err != HS_SUCCESS) {
|
if (err != HS_SUCCESS) {
|
||||||
cerr << "ERROR: could not allocate scratch space. Exiting." << endl;
|
cerr << "ERROR: could not allocate scratch space. Exiting.\n";
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -307,8 +313,7 @@ public:
|
|||||||
size_t scratch_size;
|
size_t scratch_size;
|
||||||
hs_error_t err = hs_scratch_size(scratch, &scratch_size);
|
hs_error_t err = hs_scratch_size(scratch, &scratch_size);
|
||||||
if (err != HS_SUCCESS) {
|
if (err != HS_SUCCESS) {
|
||||||
cerr << "ERROR: could not query scratch space size. Exiting."
|
cerr << "ERROR: could not query scratch space size. Exiting.\n";
|
||||||
<< endl;
|
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
return scratch_size;
|
return scratch_size;
|
||||||
@@ -334,9 +339,9 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Valid TCP or UDP packet
|
// Valid TCP or UDP packet
|
||||||
const struct ip *iphdr = (const struct ip *)(pktData
|
const struct ip *iphdr = reinterpret_cast<const struct ip *>(pktData
|
||||||
+ sizeof(struct ether_header));
|
+ sizeof(struct ether_header));
|
||||||
const char *payload = (const char *)pktData + offset;
|
const char *payload = reinterpret_cast<const char *>(pktData) + offset;
|
||||||
|
|
||||||
size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
|
size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
|
||||||
stream_map.size())).first->second;
|
stream_map.size())).first->second;
|
||||||
@@ -352,9 +357,8 @@ public:
|
|||||||
// Return the number of bytes scanned
|
// Return the number of bytes scanned
|
||||||
size_t bytes() const {
|
size_t bytes() const {
|
||||||
size_t sum = 0;
|
size_t sum = 0;
|
||||||
for (const auto &packet : packets) {
|
auto packs = [](size_t z, const string &packet) { return z + packet.size(); };
|
||||||
sum += packet.size();
|
sum += std::accumulate(packets.begin(), packets.end(), 0, packs);
|
||||||
}
|
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -374,7 +378,7 @@ public:
|
|||||||
for (auto &stream : streams) {
|
for (auto &stream : streams) {
|
||||||
hs_error_t err = hs_open_stream(db, 0, &stream);
|
hs_error_t err = hs_open_stream(db, 0, &stream);
|
||||||
if (err != HS_SUCCESS) {
|
if (err != HS_SUCCESS) {
|
||||||
cerr << "ERROR: Unable to open stream. Exiting." << endl;
|
cerr << "ERROR: Unable to open stream. Exiting.\n";
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -383,11 +387,11 @@ public:
|
|||||||
// Close all open Hyperscan streams (potentially generating any
|
// Close all open Hyperscan streams (potentially generating any
|
||||||
// end-anchored matches)
|
// end-anchored matches)
|
||||||
void closeStreams() {
|
void closeStreams() {
|
||||||
for (auto &stream : streams) {
|
for (const auto &stream : streams) {
|
||||||
hs_error_t err =
|
hs_error_t err =
|
||||||
hs_close_stream(stream, scratch, onMatch, &matchCount);
|
hs_close_stream(stream, scratch, onMatch, &matchCount);
|
||||||
if (err != HS_SUCCESS) {
|
if (err != HS_SUCCESS) {
|
||||||
cerr << "ERROR: Unable to close stream. Exiting." << endl;
|
cerr << "ERROR: Unable to close stream. Exiting.\n";
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -402,7 +406,7 @@ public:
|
|||||||
pkt.c_str(), pkt.length(), 0,
|
pkt.c_str(), pkt.length(), 0,
|
||||||
scratch, onMatch, &matchCount);
|
scratch, onMatch, &matchCount);
|
||||||
if (err != HS_SUCCESS) {
|
if (err != HS_SUCCESS) {
|
||||||
cerr << "ERROR: Unable to scan packet. Exiting." << endl;
|
cerr << "ERROR: Unable to scan packet. Exiting.\n";
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -416,7 +420,7 @@ public:
|
|||||||
hs_error_t err = hs_scan(db, pkt.c_str(), pkt.length(), 0,
|
hs_error_t err = hs_scan(db, pkt.c_str(), pkt.length(), 0,
|
||||||
scratch, onMatch, &matchCount);
|
scratch, onMatch, &matchCount);
|
||||||
if (err != HS_SUCCESS) {
|
if (err != HS_SUCCESS) {
|
||||||
cerr << "ERROR: Unable to scan packet. Exiting." << endl;
|
cerr << "ERROR: Unable to scan packet. Exiting.\n";
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -436,7 +440,7 @@ class Sigdata {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
Sigdata() {}
|
Sigdata() {}
|
||||||
Sigdata(const char *filename) {
|
explicit Sigdata(const char *filename) {
|
||||||
parseFile(filename, patterns, flags, ids, originals);
|
parseFile(filename, patterns, flags, ids, originals);
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -454,9 +458,8 @@ public:
|
|||||||
// dynamic storage.)
|
// dynamic storage.)
|
||||||
vector<const char *> cstrPatterns;
|
vector<const char *> cstrPatterns;
|
||||||
cstrPatterns.reserve(patterns.size());
|
cstrPatterns.reserve(patterns.size());
|
||||||
for (const auto &pattern : patterns) {
|
auto pstr = [](const string &pattern) { return pattern.c_str(); };
|
||||||
cstrPatterns.push_back(pattern.c_str());
|
std::transform(patterns.begin(), patterns.end(), std::back_inserter(cstrPatterns), pstr);
|
||||||
}
|
|
||||||
|
|
||||||
Clock clock;
|
Clock clock;
|
||||||
clock.start();
|
clock.start();
|
||||||
@@ -505,29 +508,29 @@ public:
|
|||||||
|
|
||||||
static
|
static
|
||||||
void usage(const char *) {
|
void usage(const char *) {
|
||||||
cerr << "Usage:" << endl << endl;
|
cerr << "Usage:\n\n";
|
||||||
cerr << " patbench [-n repeats] [ -G generations] [ -C criterion ]" << endl
|
cerr << " patbench [-n repeats] [ -G generations] [ -C criterion ]\n"
|
||||||
<< " [ -F factor_group_size ] [ -N | -S ] "
|
<< " [ -F factor_group_size ] [ -N | -S ] "
|
||||||
<< "<pattern file> <pcap file>" << endl << endl
|
<< "<pattern file> <pcap file>\n\n"
|
||||||
<< " -n repeats sets the number of times the PCAP is repeatedly "
|
<< " -n repeats sets the number of times the PCAP is repeatedly "
|
||||||
"scanned" << endl << " with the pattern." << endl
|
"scanned\n" << " with the pattern.\n"
|
||||||
<< " -G generations sets the number of generations that the "
|
<< " -G generations sets the number of generations that the "
|
||||||
"algorithm is" << endl << " run for." << endl
|
"algorithm is\n" << " run for.\n"
|
||||||
<< " -N sets non-streaming mode, -S sets streaming mode (default)."
|
<< " -N sets non-streaming mode, -S sets streaming mode (default)."
|
||||||
<< endl << " -F sets the factor group size (must be >0); this "
|
<< endl << " -F sets the factor group size (must be >0); this "
|
||||||
"allows the detection" << endl
|
"allows the detection\n"
|
||||||
<< " of multiple interacting factors." << endl << "" << endl
|
<< " of multiple interacting factors.\n" << "\n"
|
||||||
<< " -C sets the 'criterion', which can be either:" << endl
|
<< " -C sets the 'criterion', which can be either:\n"
|
||||||
<< " t throughput (the default) - this requires a pcap file"
|
<< " t throughput (the default) - this requires a pcap file"
|
||||||
<< endl << " r scratch size" << endl
|
<< endl << " r scratch size\n"
|
||||||
<< " s stream state size" << endl
|
<< " s stream state size\n"
|
||||||
<< " c compile time" << endl << " b bytecode size"
|
<< " c compile time\n" << " b bytecode size"
|
||||||
<< endl << endl
|
<< endl << endl
|
||||||
<< "We recommend the use of a utility like 'taskset' on "
|
<< "We recommend the use of a utility like 'taskset' on "
|
||||||
"multiprocessor hosts to" << endl
|
"multiprocessor hosts to\n"
|
||||||
<< "lock execution to a single processor: this will remove processor "
|
<< "lock execution to a single processor: this will remove processor "
|
||||||
"migration" << endl
|
"migration\n"
|
||||||
<< "by the scheduler as a source of noise in the results." << endl;
|
<< "by the scheduler as a source of noise in the results.\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
@@ -559,7 +562,7 @@ double measure_block_time(Benchmark &bench, unsigned int repeatCount) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
|
double eval_set(Benchmark &bench, const Sigdata &sigs, unsigned int mode,
|
||||||
unsigned repeatCount, Criterion criterion,
|
unsigned repeatCount, Criterion criterion,
|
||||||
bool diagnose = true) {
|
bool diagnose = true) {
|
||||||
double compileTime = 0;
|
double compileTime = 0;
|
||||||
@@ -570,7 +573,7 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
|
|||||||
size_t dbSize;
|
size_t dbSize;
|
||||||
hs_error_t err = hs_database_size(bench.getDatabase(), &dbSize);
|
hs_error_t err = hs_database_size(bench.getDatabase(), &dbSize);
|
||||||
if (err != HS_SUCCESS) {
|
if (err != HS_SUCCESS) {
|
||||||
cerr << "ERROR: could not retrieve bytecode size" << endl;
|
cerr << "ERROR: could not retrieve bytecode size\n";
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
return dbSize;
|
return dbSize;
|
||||||
@@ -581,7 +584,7 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
|
|||||||
size_t streamStateSize;
|
size_t streamStateSize;
|
||||||
hs_error_t err = hs_stream_size(bench.getDatabase(), &streamStateSize);
|
hs_error_t err = hs_stream_size(bench.getDatabase(), &streamStateSize);
|
||||||
if (err != HS_SUCCESS) {
|
if (err != HS_SUCCESS) {
|
||||||
cerr << "ERROR: could not retrieve stream state size" << endl;
|
cerr << "ERROR: could not retrieve stream state size\n";
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
return streamStateSize;
|
return streamStateSize;
|
||||||
@@ -599,8 +602,9 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
|
|||||||
scan_time = measure_stream_time(bench, repeatCount);
|
scan_time = measure_stream_time(bench, repeatCount);
|
||||||
}
|
}
|
||||||
size_t bytes = bench.bytes();
|
size_t bytes = bench.bytes();
|
||||||
size_t matches = bench.matches();
|
|
||||||
if (diagnose) {
|
if (diagnose) {
|
||||||
|
size_t matches = bench.matches();
|
||||||
std::ios::fmtflags f(cout.flags());
|
std::ios::fmtflags f(cout.flags());
|
||||||
cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time
|
cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time
|
||||||
<< " sec, Scanned " << bytes * repeatCount << " bytes, Throughput "
|
<< " sec, Scanned " << bytes * repeatCount << " bytes, Throughput "
|
||||||
@@ -679,14 +683,13 @@ int main(int argc, char **argv) {
|
|||||||
Benchmark bench;
|
Benchmark bench;
|
||||||
if (criterion == CRITERION_THROUGHPUT) {
|
if (criterion == CRITERION_THROUGHPUT) {
|
||||||
if (!bench.readStreams(pcapFile)) {
|
if (!bench.readStreams(pcapFile)) {
|
||||||
cerr << "Unable to read packets from PCAP file. Exiting." << endl;
|
cerr << "Unable to read packets from PCAP file. Exiting.\n";
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((criterion == CRITERION_STREAM_STATE) && (mode != HS_MODE_STREAM)) {
|
if ((criterion == CRITERION_STREAM_STATE) && (mode != HS_MODE_STREAM)) {
|
||||||
cerr << "Cannot evaluate stream state for block mode compile. Exiting."
|
cerr << "Cannot evaluate stream state for block mode compile. Exiting.\n";
|
||||||
<< endl;
|
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -724,7 +727,7 @@ int main(int argc, char **argv) {
|
|||||||
unsigned generations = min(gen_max, (sigs.size() - 1) / factor_max);
|
unsigned generations = min(gen_max, (sigs.size() - 1) / factor_max);
|
||||||
|
|
||||||
cout << "Cutting signatures cumulatively for " << generations
|
cout << "Cutting signatures cumulatively for " << generations
|
||||||
<< " generations" << endl;
|
<< " generations\n";
|
||||||
for (unsigned gen = 0; gen < generations; ++gen) {
|
for (unsigned gen = 0; gen < generations; ++gen) {
|
||||||
cout << "Generation " << gen << " ";
|
cout << "Generation " << gen << " ";
|
||||||
set<unsigned> s(work_sigs.begin(), work_sigs.end());
|
set<unsigned> s(work_sigs.begin(), work_sigs.end());
|
||||||
@@ -768,7 +771,7 @@ int main(int argc, char **argv) {
|
|||||||
cout << "Performance: ";
|
cout << "Performance: ";
|
||||||
print_criterion(criterion, best);
|
print_criterion(criterion, best);
|
||||||
cout << " (" << std::fixed << std::setprecision(3) << (best / score_base)
|
cout << " (" << std::fixed << std::setprecision(3) << (best / score_base)
|
||||||
<< "x) after cutting:" << endl;
|
<< "x) after cutting:\n";
|
||||||
cout.flags(out_f);
|
cout.flags(out_f);
|
||||||
|
|
||||||
// s now has factor_max signatures
|
// s now has factor_max signatures
|
||||||
@@ -791,7 +794,7 @@ int main(int argc, char **argv) {
|
|||||||
static
|
static
|
||||||
bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
||||||
unsigned int *length) {
|
unsigned int *length) {
|
||||||
const ip *iph = (const ip *)(pkt_data + sizeof(ether_header));
|
const ip *iph = reinterpret_cast<const ip *>(pkt_data + sizeof(ether_header));
|
||||||
const tcphdr *th = nullptr;
|
const tcphdr *th = nullptr;
|
||||||
|
|
||||||
// Ignore packets that aren't IPv4
|
// Ignore packets that aren't IPv4
|
||||||
@@ -810,7 +813,7 @@ bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
|||||||
|
|
||||||
switch (iph->ip_p) {
|
switch (iph->ip_p) {
|
||||||
case IPPROTO_TCP:
|
case IPPROTO_TCP:
|
||||||
th = (const tcphdr *)((const char *)iph + ihlen);
|
th = reinterpret_cast<const tcphdr *>(reinterpret_cast<const char *>(iph) + ihlen);
|
||||||
thlen = th->th_off * 4;
|
thlen = th->th_off * 4;
|
||||||
break;
|
break;
|
||||||
case IPPROTO_UDP:
|
case IPPROTO_UDP:
|
||||||
@@ -847,7 +850,7 @@ static unsigned parseFlags(const string &flagsStr) {
|
|||||||
case '\r': // stray carriage-return
|
case '\r': // stray carriage-return
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
cerr << "Unsupported flag \'" << c << "\'" << endl;
|
cerr << "Unsupported flag \'" << c << "\'\n";
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -859,7 +862,7 @@ static void parseFile(const char *filename, vector<string> &patterns,
|
|||||||
vector<string> &originals) {
|
vector<string> &originals) {
|
||||||
ifstream inFile(filename);
|
ifstream inFile(filename);
|
||||||
if (!inFile.good()) {
|
if (!inFile.good()) {
|
||||||
cerr << "ERROR: Can't open pattern file \"" << filename << "\"" << endl;
|
cerr << "ERROR: Can't open pattern file \"" << filename << "\"\n";
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -889,7 +892,7 @@ static void parseFile(const char *filename, vector<string> &patterns,
|
|||||||
|
|
||||||
size_t flagsStart = expr.find_last_of('/');
|
size_t flagsStart = expr.find_last_of('/');
|
||||||
if (flagsStart == string::npos) {
|
if (flagsStart == string::npos) {
|
||||||
cerr << "ERROR: no trailing '/' char" << endl;
|
cerr << "ERROR: no trailing '/' char\n";
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2015-2016, Intel Corporation
|
* Copyright (c) 2015-2016, Intel Corporation
|
||||||
|
* Copyright (c) 2024, VectorCamp PC
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@@ -54,6 +55,7 @@
|
|||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <numeric>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@@ -68,7 +70,12 @@
|
|||||||
#include <netinet/tcp.h>
|
#include <netinet/tcp.h>
|
||||||
#include <netinet/udp.h>
|
#include <netinet/udp.h>
|
||||||
#include <netinet/ip_icmp.h>
|
#include <netinet/ip_icmp.h>
|
||||||
|
#ifdef __NetBSD__
|
||||||
|
#include <net/ethertypes.h>
|
||||||
|
#include <net/if_ether.h>
|
||||||
|
#else
|
||||||
#include <net/ethernet.h>
|
#include <net/ethernet.h>
|
||||||
|
#endif /* __NetBSD__ */
|
||||||
#include <arpa/inet.h>
|
#include <arpa/inet.h>
|
||||||
|
|
||||||
#include <pcap.h>
|
#include <pcap.h>
|
||||||
@@ -93,15 +100,15 @@ struct FiveTuple {
|
|||||||
unsigned int dstPort;
|
unsigned int dstPort;
|
||||||
|
|
||||||
// Construct a FiveTuple from a TCP or UDP packet.
|
// Construct a FiveTuple from a TCP or UDP packet.
|
||||||
FiveTuple(const struct ip *iphdr) {
|
explicit FiveTuple(const struct ip *iphdr) {
|
||||||
// IP fields
|
// IP fields
|
||||||
protocol = iphdr->ip_p;
|
protocol = iphdr->ip_p;
|
||||||
srcAddr = iphdr->ip_src.s_addr;
|
srcAddr = iphdr->ip_src.s_addr;
|
||||||
dstAddr = iphdr->ip_dst.s_addr;
|
dstAddr = iphdr->ip_dst.s_addr;
|
||||||
|
|
||||||
// UDP/TCP ports
|
// UDP/TCP ports
|
||||||
const struct udphdr *uh =
|
const char * iphdr_base = reinterpret_cast<const char *>(iphdr);
|
||||||
(const struct udphdr *)(((const char *)iphdr) + (iphdr->ip_hl * 4));
|
const struct udphdr *uh = reinterpret_cast<const struct udphdr *>(iphdr_base + (iphdr->ip_hl * 4));
|
||||||
srcPort = uh->uh_sport;
|
srcPort = uh->uh_sport;
|
||||||
dstPort = uh->uh_dport;
|
dstPort = uh->uh_dport;
|
||||||
}
|
}
|
||||||
@@ -130,7 +137,7 @@ static
|
|||||||
int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
|
int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
|
||||||
unsigned int flags, void *ctx) {
|
unsigned int flags, void *ctx) {
|
||||||
// Our context points to a size_t storing the match count
|
// Our context points to a size_t storing the match count
|
||||||
size_t *matches = (size_t *)ctx;
|
size_t *matches = static_cast<size_t *>(ctx);
|
||||||
(*matches)++;
|
(*matches)++;
|
||||||
return 0; // continue matching
|
return 0; // continue matching
|
||||||
}
|
}
|
||||||
@@ -226,9 +233,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Valid TCP or UDP packet
|
// Valid TCP or UDP packet
|
||||||
const struct ip *iphdr = (const struct ip *)(pktData
|
const struct ip *iphdr = reinterpret_cast<const struct ip *>(pktData + sizeof(struct ether_header));
|
||||||
+ sizeof(struct ether_header));
|
const char *payload = reinterpret_cast<const char *>(pktData) + offset;
|
||||||
const char *payload = (const char *)pktData + offset;
|
|
||||||
|
|
||||||
size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
|
size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
|
||||||
stream_map.size())).first->second;
|
stream_map.size())).first->second;
|
||||||
@@ -244,9 +250,8 @@ public:
|
|||||||
// Return the number of bytes scanned
|
// Return the number of bytes scanned
|
||||||
size_t bytes() const {
|
size_t bytes() const {
|
||||||
size_t sum = 0;
|
size_t sum = 0;
|
||||||
for (const auto &packet : packets) {
|
auto packs = [](size_t z, const string &packet) { return z + packet.size(); };
|
||||||
sum += packet.size();
|
sum += std::accumulate(packets.begin(), packets.end(), 0, packs);
|
||||||
}
|
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -275,7 +280,7 @@ public:
|
|||||||
// Close all open Hyperscan streams (potentially generating any
|
// Close all open Hyperscan streams (potentially generating any
|
||||||
// end-anchored matches)
|
// end-anchored matches)
|
||||||
void closeStreams() {
|
void closeStreams() {
|
||||||
for (auto &stream : streams) {
|
for (const auto &stream : streams) {
|
||||||
hs_error_t err = hs_close_stream(stream, scratch, onMatch,
|
hs_error_t err = hs_close_stream(stream, scratch, onMatch,
|
||||||
&matchCount);
|
&matchCount);
|
||||||
if (err != HS_SUCCESS) {
|
if (err != HS_SUCCESS) {
|
||||||
@@ -427,7 +432,8 @@ static void databasesFromFile(const char *filename,
|
|||||||
// storage.)
|
// storage.)
|
||||||
vector<const char*> cstrPatterns;
|
vector<const char*> cstrPatterns;
|
||||||
for (const auto &pattern : patterns) {
|
for (const auto &pattern : patterns) {
|
||||||
cstrPatterns.push_back(pattern.c_str());
|
// cppcheck-suppress useStlAlgorithm
|
||||||
|
cstrPatterns.push_back(pattern.c_str()); //NOLINT (performance-inefficient-vector-operation)
|
||||||
}
|
}
|
||||||
|
|
||||||
cout << "Compiling Hyperscan databases with " << patterns.size()
|
cout << "Compiling Hyperscan databases with " << patterns.size()
|
||||||
@@ -568,7 +574,8 @@ int main(int argc, char **argv) {
|
|||||||
*/
|
*/
|
||||||
static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
||||||
unsigned int *length) {
|
unsigned int *length) {
|
||||||
const ip *iph = (const ip *)(pkt_data + sizeof(ether_header));
|
const ip *iph = reinterpret_cast<const ip *>(pkt_data + sizeof(ether_header));
|
||||||
|
const char *iph_base = reinterpret_cast<const char *>(iph);
|
||||||
const tcphdr *th = nullptr;
|
const tcphdr *th = nullptr;
|
||||||
|
|
||||||
// Ignore packets that aren't IPv4
|
// Ignore packets that aren't IPv4
|
||||||
@@ -587,7 +594,7 @@ static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
|||||||
|
|
||||||
switch (iph->ip_p) {
|
switch (iph->ip_p) {
|
||||||
case IPPROTO_TCP:
|
case IPPROTO_TCP:
|
||||||
th = (const tcphdr *)((const char *)iph + ihlen);
|
th = reinterpret_cast<const tcphdr *>(iph_base + ihlen);
|
||||||
thlen = th->th_off * 4;
|
thlen = th->th_off * 4;
|
||||||
break;
|
break;
|
||||||
case IPPROTO_UDP:
|
case IPPROTO_UDP:
|
||||||
|
|||||||
@@ -67,7 +67,7 @@
|
|||||||
* to pass in the pattern that was being searched for so we can print it out.
|
* to pass in the pattern that was being searched for so we can print it out.
|
||||||
*/
|
*/
|
||||||
static int eventHandler(unsigned int id, unsigned long long from,
|
static int eventHandler(unsigned int id, unsigned long long from,
|
||||||
unsigned long long to, unsigned int flags, void *ctx) {
|
unsigned long long to, unsigned int flags, void *ctx) { // cppcheck-suppress constParameterCallback
|
||||||
printf("Match for pattern \"%s\" at offset %llu\n", (char *)ctx, to);
|
printf("Match for pattern \"%s\" at offset %llu\n", (char *)ctx, to);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -150,7 +150,7 @@ int main(int argc, char *argv[]) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
char *pattern = argv[1];
|
char *pattern = argv[1];
|
||||||
char *inputFN = argv[2];
|
const char *inputFN = argv[2];
|
||||||
|
|
||||||
/* First, we attempt to compile the pattern provided on the command line.
|
/* First, we attempt to compile the pattern provided on the command line.
|
||||||
* We assume 'DOTALL' semantics, meaning that the '.' meta-character will
|
* We assume 'DOTALL' semantics, meaning that the '.' meta-character will
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ libdir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@
|
|||||||
includedir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@
|
includedir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@
|
||||||
|
|
||||||
Name: libhs
|
Name: libhs
|
||||||
Description: Intel(R) Hyperscan Library
|
Description: A portable fork of the high-performance regular expression matching library
|
||||||
Version: @HS_VERSION@
|
Version: @HS_VERSION@
|
||||||
Libs: -L${libdir} -lhs
|
Libs: -L${libdir} -lhs
|
||||||
Cflags: -I${includedir}/hs
|
Cflags: -I${includedir}/hs
|
||||||
|
|||||||
53
scripts/change_command.py
Normal file
53
scripts/change_command.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2020-2023, VectorCamp PC
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# * Redistributions of source code must retain the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer.
|
||||||
|
# * Redistributions in binary form must reproduce the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer in the
|
||||||
|
# documentation and/or other materials provided with the distribution.
|
||||||
|
# * Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
# may be used to endorse or promote products derived from this software
|
||||||
|
# without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
#
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
|
||||||
|
#reads from the clang-tidy config file the first comment to ignore specific files
|
||||||
|
# Get the paths from the command-line arguments
|
||||||
|
# python3 ../source/scripts/change_command.py ../source/.clang-tidy ./compile_commands.json
|
||||||
|
clang_tidy_config_path = sys.argv[1]
|
||||||
|
compile_commands_path = sys.argv[2]
|
||||||
|
|
||||||
|
# Load the data from the file
|
||||||
|
with open(compile_commands_path, 'r') as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
# Open the clang-tidy config file and read the first comment
|
||||||
|
with open(clang_tidy_config_path, 'r') as f:
|
||||||
|
for line in f:
|
||||||
|
if line.startswith('#'):
|
||||||
|
ignore_files = line[1:].strip().split(',')
|
||||||
|
break
|
||||||
|
|
||||||
|
# Filter out the entries for the ignored files
|
||||||
|
data = [entry for entry in data if not any(ignore_file in entry['file'] for ignore_file in ignore_files)]
|
||||||
|
|
||||||
|
# Write the result to the same file
|
||||||
|
with open(compile_commands_path, 'w') as f:
|
||||||
|
json.dump(data, f, indent=2)
|
||||||
1
simde
Submodule
1
simde
Submodule
Submodule simde added at 416091ebdb
@@ -176,7 +176,8 @@ void replaceAssertVertex(NGHolder &g, NFAVertex t, const ExpressionInfo &expr,
|
|||||||
auto ecit = edge_cache.find(cache_key);
|
auto ecit = edge_cache.find(cache_key);
|
||||||
if (ecit == edge_cache.end()) {
|
if (ecit == edge_cache.end()) {
|
||||||
DEBUG_PRINTF("adding edge %zu %zu\n", g[u].index, g[v].index);
|
DEBUG_PRINTF("adding edge %zu %zu\n", g[u].index, g[v].index);
|
||||||
NFAEdge e = add_edge(u, v, g);
|
NFAEdge e;
|
||||||
|
std::tie(e, std::ignore) = add_edge(u, v, g);
|
||||||
edge_cache.emplace(cache_key, e);
|
edge_cache.emplace(cache_key, e);
|
||||||
g[e].assert_flags = flags;
|
g[e].assert_flags = flags;
|
||||||
if (++assert_edge_count > MAX_ASSERT_EDGES) {
|
if (++assert_edge_count > MAX_ASSERT_EDGES) {
|
||||||
@@ -229,11 +230,12 @@ void checkForMultilineStart(ReportManager &rm, NGHolder &g,
|
|||||||
|
|
||||||
/* we need to interpose a dummy dot vertex between v and accept if
|
/* we need to interpose a dummy dot vertex between v and accept if
|
||||||
* required so that ^ doesn't match trailing \n */
|
* required so that ^ doesn't match trailing \n */
|
||||||
for (const auto &e : out_edges_range(v, g)) {
|
auto deads = [&g=g](const NFAEdge &e) {
|
||||||
if (target(e, g) == g.accept) {
|
return (target(e, g) == g.accept);
|
||||||
dead.emplace_back(e);
|
};
|
||||||
}
|
const auto &er = out_edges_range(v, g);
|
||||||
}
|
std::copy_if(begin(er), end(er), std::back_inserter(dead), deads);
|
||||||
|
|
||||||
/* assert has been resolved; clear flag */
|
/* assert has been resolved; clear flag */
|
||||||
g[v].assert_flags &= ~POS_FLAG_MULTILINE_START;
|
g[v].assert_flags &= ~POS_FLAG_MULTILINE_START;
|
||||||
}
|
}
|
||||||
@@ -251,6 +253,7 @@ void checkForMultilineStart(ReportManager &rm, NGHolder &g,
|
|||||||
|
|
||||||
static
|
static
|
||||||
bool hasAssertVertices(const NGHolder &g) {
|
bool hasAssertVertices(const NGHolder &g) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
for (auto v : vertices_range(g)) {
|
for (auto v : vertices_range(g)) {
|
||||||
int flags = g[v].assert_flags;
|
int flags = g[v].assert_flags;
|
||||||
if (flags & WORDBOUNDARY_FLAGS) {
|
if (flags & WORDBOUNDARY_FLAGS) {
|
||||||
|
|||||||
@@ -417,7 +417,7 @@ void addLitExpression(NG &ng, unsigned index, const char *expression,
|
|||||||
"HS_FLAG_SOM_LEFTMOST are supported in literal API.");
|
"HS_FLAG_SOM_LEFTMOST are supported in literal API.");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!strcmp(expression, "")) {
|
if (expLength == 0) {
|
||||||
throw CompileError("Pure literal API doesn't support empty string.");
|
throw CompileError("Pure literal API doesn't support empty string.");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -443,7 +443,7 @@ bytecode_ptr<RoseEngine> generateRoseEngine(NG &ng) {
|
|||||||
if (!rose) {
|
if (!rose) {
|
||||||
DEBUG_PRINTF("error building rose\n");
|
DEBUG_PRINTF("error building rose\n");
|
||||||
assert(0);
|
assert(0);
|
||||||
return nullptr;
|
return bytecode_ptr<RoseEngine>(nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
dumpReportManager(ng.rm, ng.cc.grey);
|
dumpReportManager(ng.rm, ng.cc.grey);
|
||||||
@@ -478,7 +478,7 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
|
|||||||
DEBUG_PRINTF("db size %zu\n", db_len);
|
DEBUG_PRINTF("db size %zu\n", db_len);
|
||||||
DEBUG_PRINTF("db platform %llx\n", platform);
|
DEBUG_PRINTF("db platform %llx\n", platform);
|
||||||
|
|
||||||
struct hs_database *db = (struct hs_database *)hs_database_alloc(db_len);
|
struct hs_database *db = static_cast<struct hs_database *>(hs_database_alloc(db_len));
|
||||||
if (hs_check_alloc(db) != HS_SUCCESS) {
|
if (hs_check_alloc(db) != HS_SUCCESS) {
|
||||||
hs_database_free(db);
|
hs_database_free(db);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@@ -492,7 +492,7 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
|
|||||||
DEBUG_PRINTF("shift is %zu\n", shift);
|
DEBUG_PRINTF("shift is %zu\n", shift);
|
||||||
|
|
||||||
db->bytecode = offsetof(struct hs_database, bytes) - shift;
|
db->bytecode = offsetof(struct hs_database, bytes) - shift;
|
||||||
char *bytecode = (char *)db + db->bytecode;
|
char *bytecode = reinterpret_cast<char *>(db) + db->bytecode;
|
||||||
assert(ISALIGNED_CL(bytecode));
|
assert(ISALIGNED_CL(bytecode));
|
||||||
|
|
||||||
db->magic = HS_DB_MAGIC;
|
db->magic = HS_DB_MAGIC;
|
||||||
@@ -525,7 +525,7 @@ struct hs_database *build(NG &ng, unsigned int *length, u8 pureFlag) {
|
|||||||
throw CompileError("Internal error.");
|
throw CompileError("Internal error.");
|
||||||
}
|
}
|
||||||
|
|
||||||
const char *bytecode = (const char *)(rose.get());
|
const char *bytecode = reinterpret_cast<const char *>(rose.get());
|
||||||
const platform_t p = target_to_platform(ng.cc.target_info);
|
const platform_t p = target_to_platform(ng.cc.target_info);
|
||||||
struct hs_database *db = dbCreate(bytecode, *length, p);
|
struct hs_database *db = dbCreate(bytecode, *length, p);
|
||||||
if (!db) {
|
if (!db) {
|
||||||
|
|||||||
@@ -57,15 +57,14 @@ extern const hs_compile_error_t hs_badalloc = {
|
|||||||
namespace ue2 {
|
namespace ue2 {
|
||||||
|
|
||||||
hs_compile_error_t *generateCompileError(const string &err, int expression) {
|
hs_compile_error_t *generateCompileError(const string &err, int expression) {
|
||||||
hs_compile_error_t *ret =
|
hs_compile_error_t *ret = static_cast<struct hs_compile_error *>(hs_misc_alloc(sizeof(hs_compile_error_t)));
|
||||||
(struct hs_compile_error *)hs_misc_alloc(sizeof(hs_compile_error_t));
|
|
||||||
if (ret) {
|
if (ret) {
|
||||||
hs_error_t e = hs_check_alloc(ret);
|
hs_error_t e = hs_check_alloc(ret);
|
||||||
if (e != HS_SUCCESS) {
|
if (e != HS_SUCCESS) {
|
||||||
hs_misc_free(ret);
|
hs_misc_free(ret);
|
||||||
return const_cast<hs_compile_error_t *>(&hs_badalloc);
|
return const_cast<hs_compile_error_t *>(&hs_badalloc);
|
||||||
}
|
}
|
||||||
char *msg = (char *)hs_misc_alloc(err.size() + 1);
|
char *msg = static_cast<char *>(hs_misc_alloc(err.size() + 1));
|
||||||
if (msg) {
|
if (msg) {
|
||||||
e = hs_check_alloc(msg);
|
e = hs_check_alloc(msg);
|
||||||
if (e != HS_SUCCESS) {
|
if (e != HS_SUCCESS) {
|
||||||
|
|||||||
@@ -542,14 +542,13 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
|
|||||||
|
|
||||||
// Main aligned loop, processes eight bytes at a time.
|
// Main aligned loop, processes eight bytes at a time.
|
||||||
|
|
||||||
u32 term1, term2;
|
|
||||||
for (size_t li = 0; li < running_length/8; li++) {
|
for (size_t li = 0; li < running_length/8; li++) {
|
||||||
u32 block = *(const u32 *)p_buf;
|
u32 block = *(const u32 *)p_buf;
|
||||||
crc ^= block;
|
crc ^= block;
|
||||||
p_buf += 4;
|
p_buf += 4;
|
||||||
term1 = crc_tableil8_o88[crc & 0x000000FF] ^
|
u32 term1 = crc_tableil8_o88[crc & 0x000000FF] ^
|
||||||
crc_tableil8_o80[(crc >> 8) & 0x000000FF];
|
crc_tableil8_o80[(crc >> 8) & 0x000000FF];
|
||||||
term2 = crc >> 16;
|
u32 term2 = crc >> 16;
|
||||||
crc = term1 ^
|
crc = term1 ^
|
||||||
crc_tableil8_o72[term2 & 0x000000FF] ^
|
crc_tableil8_o72[term2 & 0x000000FF] ^
|
||||||
crc_tableil8_o64[(term2 >> 8) & 0x000000FF];
|
crc_tableil8_o64[(term2 >> 8) & 0x000000FF];
|
||||||
|
|||||||
@@ -79,21 +79,18 @@ static UNUSED
|
|||||||
const platform_t hs_current_platform_no_avx2 = {
|
const platform_t hs_current_platform_no_avx2 = {
|
||||||
HS_PLATFORM_NOAVX2 |
|
HS_PLATFORM_NOAVX2 |
|
||||||
HS_PLATFORM_NOAVX512 |
|
HS_PLATFORM_NOAVX512 |
|
||||||
HS_PLATFORM_NOAVX512VBMI |
|
HS_PLATFORM_NOAVX512VBMI
|
||||||
0,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static UNUSED
|
static UNUSED
|
||||||
const platform_t hs_current_platform_no_avx512 = {
|
const platform_t hs_current_platform_no_avx512 = {
|
||||||
HS_PLATFORM_NOAVX512 |
|
HS_PLATFORM_NOAVX512 |
|
||||||
HS_PLATFORM_NOAVX512VBMI |
|
HS_PLATFORM_NOAVX512VBMI
|
||||||
0,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static UNUSED
|
static UNUSED
|
||||||
const platform_t hs_current_platform_no_avx512vbmi = {
|
const platform_t hs_current_platform_no_avx512vbmi = {
|
||||||
HS_PLATFORM_NOAVX512VBMI |
|
HS_PLATFORM_NOAVX512VBMI
|
||||||
0,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -115,6 +112,7 @@ struct hs_database {
|
|||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
const void *hs_get_bytecode(const struct hs_database *db) {
|
const void *hs_get_bytecode(const struct hs_database *db) {
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
return ((const char *)db + db->bytecode);
|
return ((const char *)db + db->bytecode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
249
src/dispatcher.c
249
src/dispatcher.c
@@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2016-2020, Intel Corporation
|
* Copyright (c) 2016-2020, Intel Corporation
|
||||||
|
* Copyright (c) 2024, VectorCamp PC
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@@ -30,6 +31,39 @@
|
|||||||
#include "hs_common.h"
|
#include "hs_common.h"
|
||||||
#include "hs_runtime.h"
|
#include "hs_runtime.h"
|
||||||
#include "ue2common.h"
|
#include "ue2common.h"
|
||||||
|
|
||||||
|
/* Streamlining the dispatch to eliminate runtime checking/branching:
|
||||||
|
* What we want to do is, first call to the function will run the resolve
|
||||||
|
* code and set the static resolved/dispatch pointer to point to the
|
||||||
|
* correct function. Subsequent calls to the function will go directly to
|
||||||
|
* the resolved ptr. The simplest way to accomplish this is, to
|
||||||
|
* initially set the pointer to the resolve function.
|
||||||
|
* To accomplish this in a manner invisible to the user,
|
||||||
|
* we do involve some rather ugly/confusing macros in here.
|
||||||
|
* There are four macros that assemble the code for each function
|
||||||
|
* we want to dispatch in this manner:
|
||||||
|
* CREATE_DISPATCH
|
||||||
|
* this generates the declarations for the candidate target functions,
|
||||||
|
* for the fat_dispatch function pointer, for the resolve_ function,
|
||||||
|
* points the function pointer to the resolve function, and contains
|
||||||
|
* most of the definition of the resolve function. The very end of the
|
||||||
|
* resolve function is completed by the next macro, because in the
|
||||||
|
* CREATE_DISPATCH macro we have the argument list with the arg declarations,
|
||||||
|
* which is needed to generate correct function signatures, but we
|
||||||
|
* can't generate from this, in a macro, a _call_ to one of those functions.
|
||||||
|
* CONNECT_ARGS_1
|
||||||
|
* this macro fills in the actual call at the end of the resolve function,
|
||||||
|
* with the correct arg list. hence the name connect args.
|
||||||
|
* CONNECT_DISPATCH_2
|
||||||
|
* this macro likewise gives up the beginning of the definition of the
|
||||||
|
* actual entry point function (the 'real name' that's called by the user)
|
||||||
|
* but again in the pass-through call, cannot invoke the target without
|
||||||
|
* getting the arg list , which is supplied by the final macro,
|
||||||
|
* CONNECT_ARGS_3
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||||
#include "util/arch/x86/cpuid_inline.h"
|
#include "util/arch/x86/cpuid_inline.h"
|
||||||
#include "util/join.h"
|
#include "util/join.h"
|
||||||
@@ -57,30 +91,38 @@
|
|||||||
return (RTYPE)HS_ARCH_ERROR; \
|
return (RTYPE)HS_ARCH_ERROR; \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
/* resolver */ \
|
/* dispatch routing pointer for this function */ \
|
||||||
static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \
|
/* initially point it at the resolve function */ \
|
||||||
if (check_avx512vbmi()) { \
|
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__); \
|
||||||
return JOIN(avx512vbmi_, NAME); \
|
static RTYPE (* JOIN(fat_dispatch_, NAME))(__VA_ARGS__) = \
|
||||||
} \
|
&JOIN(resolve_, NAME); \
|
||||||
if (check_avx512()) { \
|
|
||||||
return JOIN(avx512_, NAME); \
|
|
||||||
} \
|
|
||||||
if (check_avx2()) { \
|
|
||||||
return JOIN(avx2_, NAME); \
|
|
||||||
} \
|
|
||||||
if (check_sse42() && check_popcnt()) { \
|
|
||||||
return JOIN(corei7_, NAME); \
|
|
||||||
} \
|
|
||||||
if (check_ssse3()) { \
|
|
||||||
return JOIN(core2_, NAME); \
|
|
||||||
} \
|
|
||||||
/* anything else is fail */ \
|
|
||||||
return JOIN(error_, NAME); \
|
|
||||||
} \
|
|
||||||
\
|
\
|
||||||
/* function */ \
|
/* resolver */ \
|
||||||
HS_PUBLIC_API \
|
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__) { \
|
||||||
RTYPE NAME(__VA_ARGS__) __attribute__((ifunc("resolve_" #NAME)))
|
if (check_avx512vbmi()) { \
|
||||||
|
fat_dispatch_ ## NAME = &JOIN(avx512vbmi_, NAME); \
|
||||||
|
} \
|
||||||
|
else if (check_avx512()) { \
|
||||||
|
fat_dispatch_ ## NAME = &JOIN(avx512_, NAME); \
|
||||||
|
} \
|
||||||
|
else if (check_avx2()) { \
|
||||||
|
fat_dispatch_ ## NAME = &JOIN(avx2_, NAME); \
|
||||||
|
} \
|
||||||
|
else if (check_sse42() && check_popcnt()) { \
|
||||||
|
fat_dispatch_ ## NAME = &JOIN(corei7_, NAME); \
|
||||||
|
} \
|
||||||
|
else if (check_ssse3()) { \
|
||||||
|
fat_dispatch_ ## NAME = &JOIN(core2_, NAME); \
|
||||||
|
} else { \
|
||||||
|
/* anything else is fail */ \
|
||||||
|
fat_dispatch_ ## NAME = &JOIN(error_, NAME); \
|
||||||
|
} \
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* the rest of the function is completed in the CONNECT_ARGS_1 macro. */
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#elif defined(ARCH_AARCH64)
|
#elif defined(ARCH_AARCH64)
|
||||||
#include "util/arch/arm/cpuid_inline.h"
|
#include "util/arch/arm/cpuid_inline.h"
|
||||||
@@ -97,99 +139,226 @@
|
|||||||
return (RTYPE)HS_ARCH_ERROR; \
|
return (RTYPE)HS_ARCH_ERROR; \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
/* resolver */ \
|
/* dispatch routing pointer for this function */ \
|
||||||
static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \
|
/* initially point it at the resolve function */ \
|
||||||
if (check_sve2()) { \
|
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__); \
|
||||||
return JOIN(sve2_, NAME); \
|
static RTYPE (* JOIN(fat_dispatch_, NAME))(__VA_ARGS__) = \
|
||||||
} \
|
&JOIN(resolve_, NAME); \
|
||||||
if (check_sve()) { \
|
|
||||||
return JOIN(sve_, NAME); \
|
|
||||||
} \
|
|
||||||
if (check_neon()) { \
|
|
||||||
return JOIN(neon_, NAME); \
|
|
||||||
} \
|
|
||||||
/* anything else is fail */ \
|
|
||||||
return JOIN(error_, NAME); \
|
|
||||||
} \
|
|
||||||
\
|
\
|
||||||
/* function */ \
|
/* resolver */ \
|
||||||
HS_PUBLIC_API \
|
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__) { \
|
||||||
RTYPE NAME(__VA_ARGS__) __attribute__((ifunc("resolve_" #NAME)))
|
if (check_sve2()) { \
|
||||||
|
fat_dispatch_ ## NAME = &JOIN(sve2_, NAME); \
|
||||||
|
} \
|
||||||
|
else if (check_sve()) { \
|
||||||
|
fat_dispatch_ ## NAME = &JOIN(sve_, NAME); \
|
||||||
|
} \
|
||||||
|
else if (check_neon()) { \
|
||||||
|
fat_dispatch_ ## NAME = &JOIN(neon_, NAME); \
|
||||||
|
} else { \
|
||||||
|
/* anything else is fail */ \
|
||||||
|
fat_dispatch_ ## NAME = &JOIN(error_, NAME); \
|
||||||
|
} \
|
||||||
|
|
||||||
|
|
||||||
|
/* the rest of the function is completed in the CONNECT_ARGS_1 macro. */
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#define CONNECT_ARGS_1(RTYPE, NAME, ...) \
|
||||||
|
return (*fat_dispatch_ ## NAME)(__VA_ARGS__); \
|
||||||
|
} \
|
||||||
|
|
||||||
|
|
||||||
|
#define CONNECT_DISPATCH_2(RTYPE, NAME, ...) \
|
||||||
|
/* new function */ \
|
||||||
|
HS_PUBLIC_API \
|
||||||
|
RTYPE NAME(__VA_ARGS__) { \
|
||||||
|
|
||||||
|
|
||||||
|
#define CONNECT_ARGS_3(RTYPE, NAME, ...) \
|
||||||
|
return (*fat_dispatch_ ## NAME)(__VA_ARGS__); \
|
||||||
|
} \
|
||||||
|
|
||||||
|
|
||||||
#pragma GCC diagnostic push
|
#pragma GCC diagnostic push
|
||||||
#pragma GCC diagnostic ignored "-Wunused-parameter"
|
#pragma GCC diagnostic ignored "-Wunused-parameter"
|
||||||
#pragma GCC diagnostic push
|
#pragma GCC diagnostic push
|
||||||
#pragma GCC diagnostic ignored "-Wunused-function"
|
#pragma GCC diagnostic ignored "-Wunused-function"
|
||||||
|
|
||||||
|
/* this gets a bit ugly to compose the static redirect functions,
|
||||||
|
* as we necessarily need first the typed arg list and then just the arg
|
||||||
|
* names, twice in a row, to define the redirect function and the
|
||||||
|
* dispatch function call */
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_scan, const hs_database_t *db, const char *data,
|
CREATE_DISPATCH(hs_error_t, hs_scan, const hs_database_t *db, const char *data,
|
||||||
unsigned length, unsigned flags, hs_scratch_t *scratch,
|
unsigned length, unsigned flags, hs_scratch_t *scratch,
|
||||||
match_event_handler onEvent, void *userCtx);
|
match_event_handler onEvent, void *userCtx);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_scan, db, data, length, flags, scratch, onEvent, userCtx);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_scan, const hs_database_t *db, const char *data,
|
||||||
|
unsigned length, unsigned flags, hs_scratch_t *scratch,
|
||||||
|
match_event_handler onEvent, void *userCtx);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_scan, db, data, length, flags, scratch, onEvent, userCtx);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_stream_size, const hs_database_t *database,
|
CREATE_DISPATCH(hs_error_t, hs_stream_size, const hs_database_t *database,
|
||||||
size_t *stream_size);
|
size_t *stream_size);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_stream_size, database, stream_size);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_stream_size, const hs_database_t *database,
|
||||||
|
size_t *stream_size);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_stream_size, database, stream_size);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_database_size, const hs_database_t *db,
|
CREATE_DISPATCH(hs_error_t, hs_database_size, const hs_database_t *db,
|
||||||
size_t *size);
|
size_t *size);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_database_size, db, size);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_database_size, const hs_database_t *db,
|
||||||
|
size_t *size);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_database_size, db, size);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, dbIsValid, const hs_database_t *db);
|
CREATE_DISPATCH(hs_error_t, dbIsValid, const hs_database_t *db);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, dbIsValid, db);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, dbIsValid, const hs_database_t *db);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, dbIsValid, db);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_free_database, hs_database_t *db);
|
CREATE_DISPATCH(hs_error_t, hs_free_database, hs_database_t *db);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_free_database, db);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_free_database, hs_database_t *db);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_free_database, db);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_open_stream, const hs_database_t *db,
|
CREATE_DISPATCH(hs_error_t, hs_open_stream, const hs_database_t *db,
|
||||||
unsigned int flags, hs_stream_t **stream);
|
unsigned int flags, hs_stream_t **stream);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_open_stream, db, flags, stream);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_open_stream, const hs_database_t *db,
|
||||||
|
unsigned int flags, hs_stream_t **stream);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_open_stream, db, flags, stream);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data,
|
CREATE_DISPATCH(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data,
|
||||||
unsigned int length, unsigned int flags, hs_scratch_t *scratch,
|
unsigned int length, unsigned int flags, hs_scratch_t *scratch,
|
||||||
match_event_handler onEvent, void *ctxt);
|
match_event_handler onEvent, void *ctxt);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_scan_stream, id, data, length, flags, scratch, onEvent, ctxt);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data,
|
||||||
|
unsigned int length, unsigned int flags, hs_scratch_t *scratch,
|
||||||
|
match_event_handler onEvent, void *ctxt);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_scan_stream, id, data, length, flags, scratch, onEvent, ctxt);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_close_stream, hs_stream_t *id,
|
CREATE_DISPATCH(hs_error_t, hs_close_stream, hs_stream_t *id,
|
||||||
hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt);
|
hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_close_stream, id, scratch, onEvent, ctxt);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_close_stream, hs_stream_t *id,
|
||||||
|
hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_close_stream, id, scratch, onEvent, ctxt);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_scan_vector, const hs_database_t *db,
|
CREATE_DISPATCH(hs_error_t, hs_scan_vector, const hs_database_t *db,
|
||||||
const char *const *data, const unsigned int *length,
|
const char *const *data, const unsigned int *length,
|
||||||
unsigned int count, unsigned int flags, hs_scratch_t *scratch,
|
unsigned int count, unsigned int flags, hs_scratch_t *scratch,
|
||||||
match_event_handler onevent, void *context);
|
match_event_handler onevent, void *context);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_scan_vector, db, data, length, count, flags, scratch, onevent, context);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_scan_vector, const hs_database_t *db,
|
||||||
|
const char *const *data, const unsigned int *length,
|
||||||
|
unsigned int count, unsigned int flags, hs_scratch_t *scratch,
|
||||||
|
match_event_handler onevent, void *context);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_scan_vector, db, data, length, count, flags, scratch, onevent, context);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_database_info, const hs_database_t *db, char **info);
|
CREATE_DISPATCH(hs_error_t, hs_database_info, const hs_database_t *db, char **info);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_database_info, db, info);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_database_info, const hs_database_t *db, char **info);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_database_info, db, info);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_copy_stream, hs_stream_t **to_id,
|
CREATE_DISPATCH(hs_error_t, hs_copy_stream, hs_stream_t **to_id,
|
||||||
const hs_stream_t *from_id);
|
const hs_stream_t *from_id);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_copy_stream, to_id, from_id);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_copy_stream, hs_stream_t **to_id,
|
||||||
|
const hs_stream_t *from_id);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_copy_stream, to_id, from_id);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_reset_stream, hs_stream_t *id,
|
CREATE_DISPATCH(hs_error_t, hs_reset_stream, hs_stream_t *id,
|
||||||
unsigned int flags, hs_scratch_t *scratch,
|
unsigned int flags, hs_scratch_t *scratch,
|
||||||
match_event_handler onEvent, void *context);
|
match_event_handler onEvent, void *context);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_reset_stream, id, flags, scratch, onEvent, context);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_reset_stream, hs_stream_t *id,
|
||||||
|
unsigned int flags, hs_scratch_t *scratch,
|
||||||
|
match_event_handler onEvent, void *context);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_reset_stream, id, flags, scratch, onEvent, context);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id,
|
CREATE_DISPATCH(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id,
|
||||||
const hs_stream_t *from_id, hs_scratch_t *scratch,
|
const hs_stream_t *from_id, hs_scratch_t *scratch,
|
||||||
match_event_handler onEvent, void *context);
|
match_event_handler onEvent, void *context);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_reset_and_copy_stream, to_id, from_id, scratch, onEvent, context);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id,
|
||||||
|
const hs_stream_t *from_id, hs_scratch_t *scratch,
|
||||||
|
match_event_handler onEvent, void *context);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_reset_and_copy_stream, to_id, from_id, scratch, onEvent, context);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_serialize_database, const hs_database_t *db,
|
CREATE_DISPATCH(hs_error_t, hs_serialize_database, const hs_database_t *db,
|
||||||
char **bytes, size_t *length);
|
char **bytes, size_t *length);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_serialize_database, db, bytes, length);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_serialize_database, const hs_database_t *db,
|
||||||
|
char **bytes, size_t *length);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_serialize_database, db, bytes, length);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_deserialize_database, const char *bytes,
|
CREATE_DISPATCH(hs_error_t, hs_deserialize_database, const char *bytes,
|
||||||
const size_t length, hs_database_t **db);
|
const size_t length, hs_database_t **db);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_deserialize_database, bytes, length, db);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_deserialize_database, const char *bytes,
|
||||||
|
const size_t length, hs_database_t **db);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_deserialize_database, bytes, length, db);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_deserialize_database_at, const char *bytes,
|
CREATE_DISPATCH(hs_error_t, hs_deserialize_database_at, const char *bytes,
|
||||||
const size_t length, hs_database_t *db);
|
const size_t length, hs_database_t *db);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_deserialize_database_at, bytes, length, db);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_deserialize_database_at, const char *bytes,
|
||||||
|
const size_t length, hs_database_t *db);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_deserialize_database_at, bytes, length, db);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_serialized_database_info, const char *bytes,
|
CREATE_DISPATCH(hs_error_t, hs_serialized_database_info, const char *bytes,
|
||||||
size_t length, char **info);
|
size_t length, char **info);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_serialized_database_info, bytes, length, info);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_serialized_database_info, const char *bytes,
|
||||||
|
size_t length, char **info);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_serialized_database_info, bytes, length, info);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_serialized_database_size, const char *bytes,
|
CREATE_DISPATCH(hs_error_t, hs_serialized_database_size, const char *bytes,
|
||||||
const size_t length, size_t *deserialized_size);
|
const size_t length, size_t *deserialized_size);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_serialized_database_size, bytes, length, deserialized_size);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_serialized_database_size, const char *bytes,
|
||||||
|
const size_t length, size_t *deserialized_size);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_serialized_database_size, bytes, length, deserialized_size);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
|
CREATE_DISPATCH(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
|
||||||
char *buf, size_t buf_space, size_t *used_space);
|
char *buf, size_t buf_space, size_t *used_space);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_compress_stream, stream,
|
||||||
|
buf, buf_space, used_space);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
|
||||||
|
char *buf, size_t buf_space, size_t *used_space);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_compress_stream, stream,
|
||||||
|
buf, buf_space, used_space);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_expand_stream, const hs_database_t *db,
|
CREATE_DISPATCH(hs_error_t, hs_expand_stream, const hs_database_t *db,
|
||||||
hs_stream_t **stream, const char *buf,size_t buf_size);
|
hs_stream_t **stream, const char *buf,size_t buf_size);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_expand_stream, db, stream, buf,buf_size);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_expand_stream, const hs_database_t *db,
|
||||||
|
hs_stream_t **stream, const char *buf,size_t buf_size);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_expand_stream, db, stream, buf,buf_size);
|
||||||
|
|
||||||
CREATE_DISPATCH(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
|
CREATE_DISPATCH(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
|
||||||
const char *buf, size_t buf_size, hs_scratch_t *scratch,
|
const char *buf, size_t buf_size, hs_scratch_t *scratch,
|
||||||
match_event_handler onEvent, void *context);
|
match_event_handler onEvent, void *context);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_reset_and_expand_stream, to_stream,
|
||||||
|
buf, buf_size, scratch, onEvent, context);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
|
||||||
|
const char *buf, size_t buf_size, hs_scratch_t *scratch,
|
||||||
|
match_event_handler onEvent, void *context);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_reset_and_expand_stream, to_stream,
|
||||||
|
buf, buf_size, scratch, onEvent, context);
|
||||||
|
|
||||||
/** INTERNALS **/
|
/** INTERNALS **/
|
||||||
|
|
||||||
CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
|
CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
|
||||||
|
CONNECT_ARGS_1(u32, Crc32c_ComputeBuf, inCrc32, buf, bufLen);
|
||||||
|
CONNECT_DISPATCH_2(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
|
||||||
|
CONNECT_ARGS_3(u32, Crc32c_ComputeBuf, inCrc32, buf, bufLen);
|
||||||
|
|
||||||
#pragma GCC diagnostic pop
|
#pragma GCC diagnostic pop
|
||||||
#pragma GCC diagnostic pop
|
#pragma GCC diagnostic pop
|
||||||
|
|
||||||
|
|||||||
@@ -298,7 +298,7 @@ void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
|||||||
static really_inline
|
static really_inline
|
||||||
void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
|
void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
|
||||||
const u32 *confBase, const struct FDR_Runtime_Args *a,
|
const u32 *confBase, const struct FDR_Runtime_Args *a,
|
||||||
const u8 *ptr, u32 *last_match_id, struct zone *z) {
|
const u8 *ptr, u32 *last_match_id, const struct zone *z) {
|
||||||
const u8 bucket = 8;
|
const u8 bucket = 8;
|
||||||
|
|
||||||
if (likely(!*conf)) {
|
if (likely(!*conf)) {
|
||||||
@@ -308,7 +308,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
|
|||||||
/* ptr is currently referring to a location in the zone's buffer, we also
|
/* ptr is currently referring to a location in the zone's buffer, we also
|
||||||
* need a pointer in the original, main buffer for the final string compare.
|
* need a pointer in the original, main buffer for the final string compare.
|
||||||
*/
|
*/
|
||||||
const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust);
|
const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr)
|
||||||
|
|
||||||
const u8 *confLoc = ptr;
|
const u8 *confLoc = ptr;
|
||||||
|
|
||||||
@@ -333,7 +333,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) {
|
void dumpZoneInfo(UNUSED const struct zone *z, UNUSED size_t zone_id) {
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
|
DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
|
||||||
DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
|
DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
|
||||||
|
|||||||
@@ -127,7 +127,7 @@ void andMask(u8 *dest, const u8 *a, const u8 *b, u32 num_bytes) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void FDRCompiler::createInitialState(FDR *fdr) {
|
void FDRCompiler::createInitialState(FDR *fdr) {
|
||||||
u8 *start = (u8 *)&fdr->start;
|
u8 *start = reinterpret_cast<u8 *>(&fdr->start);
|
||||||
|
|
||||||
/* initial state should to be 1 in each slot in the bucket up to bucket
|
/* initial state should to be 1 in each slot in the bucket up to bucket
|
||||||
* minlen - 1, and 0 thereafter */
|
* minlen - 1, and 0 thereafter */
|
||||||
@@ -136,6 +136,7 @@ void FDRCompiler::createInitialState(FDR *fdr) {
|
|||||||
const vector<LiteralIndex> &bucket_lits = bucketToLits[b];
|
const vector<LiteralIndex> &bucket_lits = bucketToLits[b];
|
||||||
u32 min_len = ~0U;
|
u32 min_len = ~0U;
|
||||||
for (const LiteralIndex &lit_idx : bucket_lits) {
|
for (const LiteralIndex &lit_idx : bucket_lits) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
min_len = min(min_len, verify_u32(lits[lit_idx].s.length()));
|
min_len = min(min_len, verify_u32(lits[lit_idx].s.length()));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -175,7 +176,7 @@ bytecode_ptr<FDR> FDRCompiler::setupFDR() {
|
|||||||
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
|
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
|
||||||
assert(fdr); // otherwise would have thrown std::bad_alloc
|
assert(fdr); // otherwise would have thrown std::bad_alloc
|
||||||
|
|
||||||
u8 *fdr_base = (u8 *)fdr.get();
|
u8 *fdr_base = reinterpret_cast<u8 *>(fdr.get());
|
||||||
|
|
||||||
// Write header.
|
// Write header.
|
||||||
fdr->size = size;
|
fdr->size = size;
|
||||||
@@ -205,7 +206,6 @@ bytecode_ptr<FDR> FDRCompiler::setupFDR() {
|
|||||||
assert(ISALIGNED_CL(ptr));
|
assert(ISALIGNED_CL(ptr));
|
||||||
fdr->floodOffset = verify_u32(ptr - fdr_base);
|
fdr->floodOffset = verify_u32(ptr - fdr_base);
|
||||||
memcpy(ptr, floodTable.get(), floodTable.size());
|
memcpy(ptr, floodTable.get(), floodTable.size());
|
||||||
ptr += floodTable.size(); // last write, no need to round up
|
|
||||||
|
|
||||||
return fdr;
|
return fdr;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ namespace ue2 {
|
|||||||
size_t maxLen(const vector<hwlmLiteral> &lits) {
|
size_t maxLen(const vector<hwlmLiteral> &lits) {
|
||||||
size_t rv = 0;
|
size_t rv = 0;
|
||||||
for (const auto &lit : lits) {
|
for (const auto &lit : lits) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
rv = max(rv, lit.s.size());
|
rv = max(rv, lit.s.size());
|
||||||
}
|
}
|
||||||
return rv;
|
return rv;
|
||||||
|
|||||||
@@ -84,9 +84,10 @@ struct FDRConfirm {
|
|||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
const u32 *getConfirmLitIndex(const struct FDRConfirm *fdrc) {
|
const u32 *getConfirmLitIndex(const struct FDRConfirm *fdrc) {
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
const u8 *base = (const u8 *)fdrc;
|
const u8 *base = (const u8 *)fdrc;
|
||||||
const u32 *litIndex =
|
// cppcheck-suppress cstyleCast
|
||||||
(const u32 *)(base + ROUNDUP_N(sizeof(*fdrc), alignof(u32)));
|
const u32 *litIndex =(const u32 *)(base + ROUNDUP_N(sizeof(*fdrc), alignof(u32)));
|
||||||
assert(ISALIGNED(litIndex));
|
assert(ISALIGNED(litIndex));
|
||||||
return litIndex;
|
return litIndex;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ u64a make_u64a_mask(const vector<u8> &v) {
|
|||||||
u64a mask = 0;
|
u64a mask = 0;
|
||||||
size_t vlen = v.size();
|
size_t vlen = v.size();
|
||||||
size_t len = std::min(vlen, sizeof(mask));
|
size_t len = std::min(vlen, sizeof(mask));
|
||||||
unsigned char *m = (unsigned char *)&mask;
|
u8 *m = reinterpret_cast<u8 *>(&mask);
|
||||||
memcpy(m + sizeof(mask) - len, &v[vlen - len], len);
|
memcpy(m + sizeof(mask) - len, &v[vlen - len], len);
|
||||||
return mask;
|
return mask;
|
||||||
}
|
}
|
||||||
@@ -159,7 +159,7 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
|
|||||||
map<u32, vector<LiteralIndex> > res2lits;
|
map<u32, vector<LiteralIndex> > res2lits;
|
||||||
hwlm_group_t gm = 0;
|
hwlm_group_t gm = 0;
|
||||||
for (LiteralIndex i = 0; i < lits.size(); i++) {
|
for (LiteralIndex i = 0; i < lits.size(); i++) {
|
||||||
LitInfo & li = tmpLitInfo[i];
|
const LitInfo & li = tmpLitInfo[i];
|
||||||
u32 hash = CONF_HASH_CALL(li.v, andmsk, mult, nBits);
|
u32 hash = CONF_HASH_CALL(li.v, andmsk, mult, nBits);
|
||||||
DEBUG_PRINTF("%016llx --> %u\n", li.v, hash);
|
DEBUG_PRINTF("%016llx --> %u\n", li.v, hash);
|
||||||
res2lits[hash].emplace_back(i);
|
res2lits[hash].emplace_back(i);
|
||||||
@@ -245,10 +245,10 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
|
|||||||
fdrc->groups = gm;
|
fdrc->groups = gm;
|
||||||
|
|
||||||
// After the FDRConfirm, we have the lit index array.
|
// After the FDRConfirm, we have the lit index array.
|
||||||
u8 *fdrc_base = (u8 *)fdrc.get();
|
u8 *fdrc_base = reinterpret_cast<u8 *>(fdrc.get());
|
||||||
u8 *ptr = fdrc_base + sizeof(*fdrc);
|
u8 *ptr = fdrc_base + sizeof(*fdrc);
|
||||||
ptr = ROUNDUP_PTR(ptr, alignof(u32));
|
ptr = ROUNDUP_PTR(ptr, alignof(u32));
|
||||||
u32 *bitsToLitIndex = (u32 *)ptr;
|
u32 *bitsToLitIndex = reinterpret_cast<u32 *>(ptr);
|
||||||
ptr += bitsToLitIndexSize;
|
ptr += bitsToLitIndexSize;
|
||||||
|
|
||||||
// After the lit index array, we have the LitInfo structures themselves,
|
// After the lit index array, we have the LitInfo structures themselves,
|
||||||
@@ -265,7 +265,7 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
|
|||||||
LiteralIndex litIdx = *i;
|
LiteralIndex litIdx = *i;
|
||||||
|
|
||||||
// Write LitInfo header.
|
// Write LitInfo header.
|
||||||
LitInfo &finalLI = *(LitInfo *)ptr;
|
LitInfo &finalLI = *(reinterpret_cast<LitInfo *>(ptr));
|
||||||
finalLI = tmpLitInfo[litIdx];
|
finalLI = tmpLitInfo[litIdx];
|
||||||
|
|
||||||
ptr += sizeof(LitInfo); // String starts directly after LitInfo.
|
ptr += sizeof(LitInfo); // String starts directly after LitInfo.
|
||||||
@@ -294,15 +294,13 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
|
|||||||
const EngineDescription &eng,
|
const EngineDescription &eng,
|
||||||
const map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
|
const map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
|
||||||
bool make_small) {
|
bool make_small) {
|
||||||
unique_ptr<TeddyEngineDescription> teddyDescr =
|
|
||||||
getTeddyDescription(eng.getID());
|
|
||||||
|
|
||||||
BC2CONF bc2Conf;
|
BC2CONF bc2Conf;
|
||||||
u32 totalConfirmSize = 0;
|
u32 totalConfirmSize = 0;
|
||||||
for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
|
for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
|
||||||
if (contains(bucketToLits, b)) {
|
if (contains(bucketToLits, b)) {
|
||||||
vector<hwlmLiteral> vl;
|
vector<hwlmLiteral> vl;
|
||||||
for (const LiteralIndex &lit_idx : bucketToLits.at(b)) {
|
for (const LiteralIndex &lit_idx : bucketToLits.at(b)) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
vl.emplace_back(lits[lit_idx]);
|
vl.emplace_back(lits[lit_idx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -320,7 +318,7 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
|
|||||||
auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 64);
|
auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 64);
|
||||||
assert(buf); // otherwise would have thrown std::bad_alloc
|
assert(buf); // otherwise would have thrown std::bad_alloc
|
||||||
|
|
||||||
u32 *confBase = (u32 *)buf.get();
|
u32 *confBase = reinterpret_cast<u32 *>(buf.get());
|
||||||
u8 *ptr = buf.get() + totalConfSwitchSize;
|
u8 *ptr = buf.get() + totalConfSwitchSize;
|
||||||
assert(ISALIGNED_CL(ptr));
|
assert(ISALIGNED_CL(ptr));
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2015-2019, Intel Corporation
|
* Copyright (c) 2015-2019, Intel Corporation
|
||||||
|
* Copyright (c) 2024, VectorCamp PC
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@@ -54,9 +55,14 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
|
|||||||
if (likely(!start)) {
|
if (likely(!start)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
// these cplusplus checks are needed because this is included in both fdr.c and teddy.cpp
|
||||||
|
#ifdef __cplusplus
|
||||||
|
const struct LitInfo *li
|
||||||
|
= reinterpret_cast<const struct LitInfo *>(reinterpret_cast<const u8 *>(fdrc) + start);
|
||||||
|
#else
|
||||||
const struct LitInfo *li
|
const struct LitInfo *li
|
||||||
= (const struct LitInfo *)((const u8 *)fdrc + start);
|
= (const struct LitInfo *)((const u8 *)fdrc + start);
|
||||||
|
#endif
|
||||||
|
|
||||||
struct hs_scratch *scratch = a->scratch;
|
struct hs_scratch *scratch = a->scratch;
|
||||||
assert(!scratch->fdr_conf);
|
assert(!scratch->fdr_conf);
|
||||||
@@ -74,18 +80,20 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
const u8 *loc = buf + i - li->size + 1;
|
do{ // this do while is to block off the line below from the goto
|
||||||
|
const u8 *loc = buf + i - li->size + 1;
|
||||||
|
|
||||||
if (loc < buf) {
|
if (loc < buf) {
|
||||||
u32 full_overhang = buf - loc;
|
u32 full_overhang = buf - loc;
|
||||||
size_t len_history = a->len_history;
|
size_t len_history = a->len_history;
|
||||||
|
|
||||||
// can't do a vectored confirm either if we don't have
|
// can't do a vectored confirm either if we don't have
|
||||||
// the bytes
|
// the bytes
|
||||||
if (full_overhang > len_history) {
|
if (full_overhang > len_history) {
|
||||||
goto out;
|
goto out;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}while(0);
|
||||||
assert(li->size <= sizeof(CONF_TYPE));
|
assert(li->size <= sizeof(CONF_TYPE));
|
||||||
|
|
||||||
if (unlikely(!(li->groups & *control))) {
|
if (unlikely(!(li->groups & *control))) {
|
||||||
|
|||||||
@@ -74,9 +74,9 @@ void dumpLitIndex(const FDRConfirm *fdrc, FILE *f) {
|
|||||||
static
|
static
|
||||||
void dumpConfirms(const void *fdr_base, u32 conf_offset, u32 num_confirms,
|
void dumpConfirms(const void *fdr_base, u32 conf_offset, u32 num_confirms,
|
||||||
FILE *f) {
|
FILE *f) {
|
||||||
const u32 *conf = (const u32 *)((const char *)fdr_base + conf_offset);
|
const u32 *conf = reinterpret_cast<const u32 *>(reinterpret_cast<const char *>(fdr_base) + conf_offset);
|
||||||
for (u32 i = 0; i < num_confirms; i++) {
|
for (u32 i = 0; i < num_confirms; i++) {
|
||||||
const auto *fdrc = (const FDRConfirm *)((const char *)conf + conf[i]);
|
const auto *fdrc = reinterpret_cast<const FDRConfirm *>(reinterpret_cast<const char *>(conf) + conf[i]);
|
||||||
fprintf(f, " confirm %u\n", i);
|
fprintf(f, " confirm %u\n", i);
|
||||||
fprintf(f, " andmsk 0x%016llx\n", fdrc->andmsk);
|
fprintf(f, " andmsk 0x%016llx\n", fdrc->andmsk);
|
||||||
fprintf(f, " mult 0x%016llx\n", fdrc->mult);
|
fprintf(f, " mult 0x%016llx\n", fdrc->mult);
|
||||||
@@ -113,7 +113,7 @@ void dumpTeddyDupMasks(const u8 *dmsk, u32 numMasks, FILE *f) {
|
|||||||
u32 maskWidth = 2;
|
u32 maskWidth = 2;
|
||||||
fprintf(f, " dup nibble masks:\n");
|
fprintf(f, " dup nibble masks:\n");
|
||||||
for (u32 i = 0; i < numMasks * 2; i++) {
|
for (u32 i = 0; i < numMasks * 2; i++) {
|
||||||
fprintf(f, " -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
|
fprintf(f, " -%u%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
|
||||||
for (u32 j = 0; j < 16 * maskWidth * 2; j++) {
|
for (u32 j = 0; j < 16 * maskWidth * 2; j++) {
|
||||||
u8 val = dmsk[i * 16 * maskWidth * 2 + j];
|
u8 val = dmsk[i * 16 * maskWidth * 2 + j];
|
||||||
for (u32 k = 0; k < 8; k++) {
|
for (u32 k = 0; k < 8; k++) {
|
||||||
@@ -131,7 +131,7 @@ void dumpTeddyMasks(const u8 *baseMsk, u32 numMasks, u32 maskWidth, FILE *f) {
|
|||||||
// dump nibble masks
|
// dump nibble masks
|
||||||
fprintf(f, " nibble masks:\n");
|
fprintf(f, " nibble masks:\n");
|
||||||
for (u32 i = 0; i < numMasks * 2; i++) {
|
for (u32 i = 0; i < numMasks * 2; i++) {
|
||||||
fprintf(f, " -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
|
fprintf(f, " -%u%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
|
||||||
for (u32 j = 0; j < 16 * maskWidth; j++) {
|
for (u32 j = 0; j < 16 * maskWidth; j++) {
|
||||||
u8 val = baseMsk[i * 16 * maskWidth + j];
|
u8 val = baseMsk[i * 16 * maskWidth + j];
|
||||||
for (u32 k = 0; k < 8; k++) {
|
for (u32 k = 0; k < 8; k++) {
|
||||||
@@ -157,7 +157,7 @@ void dumpTeddy(const Teddy *teddy, FILE *f) {
|
|||||||
fprintf(f, " buckets %u\n", des->getNumBuckets());
|
fprintf(f, " buckets %u\n", des->getNumBuckets());
|
||||||
fprintf(f, " packed %s\n", des->packed ? "true" : "false");
|
fprintf(f, " packed %s\n", des->packed ? "true" : "false");
|
||||||
fprintf(f, " strings %u\n", teddy->numStrings);
|
fprintf(f, " strings %u\n", teddy->numStrings);
|
||||||
fprintf(f, " size %zu bytes\n", fdrSize((const FDR *)teddy));
|
fprintf(f, " size %zu bytes\n", fdrSize(reinterpret_cast<const FDR *>(teddy)));
|
||||||
fprintf(f, " max length %u\n", teddy->maxStringLen);
|
fprintf(f, " max length %u\n", teddy->maxStringLen);
|
||||||
fprintf(f, " floodoff %u (%x)\n", teddy->floodOffset,
|
fprintf(f, " floodoff %u (%x)\n", teddy->floodOffset,
|
||||||
teddy->floodOffset);
|
teddy->floodOffset);
|
||||||
@@ -165,7 +165,7 @@ void dumpTeddy(const Teddy *teddy, FILE *f) {
|
|||||||
|
|
||||||
u32 maskWidth = des->getNumBuckets() / 8;
|
u32 maskWidth = des->getNumBuckets() / 8;
|
||||||
size_t headerSize = sizeof(Teddy);
|
size_t headerSize = sizeof(Teddy);
|
||||||
const u8 *teddy_base = (const u8 *)teddy;
|
const u8 *teddy_base = reinterpret_cast<const u8 *>(teddy);
|
||||||
const u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
|
const u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
|
||||||
dumpTeddyMasks(baseMsk, des->numMasks, maskWidth, f);
|
dumpTeddyMasks(baseMsk, des->numMasks, maskWidth, f);
|
||||||
size_t maskLen = des->numMasks * 16 * 2 * maskWidth;
|
size_t maskLen = des->numMasks * 16 * 2 * maskWidth;
|
||||||
@@ -201,7 +201,7 @@ void dumpFDR(const FDR *fdr, FILE *f) {
|
|||||||
|
|
||||||
void fdrPrintStats(const FDR *fdr, FILE *f) {
|
void fdrPrintStats(const FDR *fdr, FILE *f) {
|
||||||
if (fdrIsTeddy(fdr)) {
|
if (fdrIsTeddy(fdr)) {
|
||||||
dumpTeddy((const Teddy *)fdr, f);
|
dumpTeddy(reinterpret_cast<const Teddy *>(fdr), f);
|
||||||
} else {
|
} else {
|
||||||
dumpFDR(fdr, f);
|
dumpFDR(fdr, f);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) {
|
|||||||
} else if (num_lits < 5000) {
|
} else if (num_lits < 5000) {
|
||||||
// for larger but not huge sizes, go to stride 2 only if we have at
|
// for larger but not huge sizes, go to stride 2 only if we have at
|
||||||
// least minlen 3
|
// least minlen 3
|
||||||
desiredStride = MIN(min_len - 1, 2);
|
desiredStride = std::min(min_len - 1, 2UL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -208,8 +208,8 @@ bytecode_ptr<u8> setupFDRFloodControl(const vector<hwlmLiteral> &lits,
|
|||||||
auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 16);
|
auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 16);
|
||||||
assert(buf); // otherwise would have thrown std::bad_alloc
|
assert(buf); // otherwise would have thrown std::bad_alloc
|
||||||
|
|
||||||
u32 *floodHeader = (u32 *)buf.get();
|
u32 *floodHeader = reinterpret_cast<u32 *>(buf.get());
|
||||||
FDRFlood *layoutFlood = (FDRFlood *)(buf.get() + floodHeaderSize);
|
FDRFlood *layoutFlood = reinterpret_cast<FDRFlood *>(buf.get() + floodHeaderSize);
|
||||||
|
|
||||||
u32 currentFloodIndex = 0;
|
u32 currentFloodIndex = 0;
|
||||||
for (const auto &m : flood2chars) {
|
for (const auto &m : flood2chars) {
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2015-2017, Intel Corporation
|
* Copyright (c) 2015-2017, Intel Corporation
|
||||||
|
* Copyright (c) 2024, VectorCamp PC
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@@ -37,6 +38,13 @@
|
|||||||
#define FLOOD_MINIMUM_SIZE 256
|
#define FLOOD_MINIMUM_SIZE 256
|
||||||
#define FLOOD_BACKOFF_START 32
|
#define FLOOD_BACKOFF_START 32
|
||||||
|
|
||||||
|
// this is because this file is included in both fdr.c and teddy.cpp
|
||||||
|
#if defined __cplusplus
|
||||||
|
#define CU64A_P_CAST(X) reinterpret_cast<const u64a*>(X)
|
||||||
|
#else
|
||||||
|
#define CU64A_P_CAST(X) (const u64a *)(X)
|
||||||
|
#endif
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
|
const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
|
||||||
// if we don't have a flood at either the start or end,
|
// if we don't have a flood at either the start or end,
|
||||||
@@ -47,18 +55,18 @@ const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
|
|||||||
|
|
||||||
/* entry points in runtime.c prefetch relevant data */
|
/* entry points in runtime.c prefetch relevant data */
|
||||||
#ifndef FLOOD_32
|
#ifndef FLOOD_32
|
||||||
u64a x11 = *(const u64a *)ROUNDUP_PTR(buf, 8);
|
u64a x11 = *CU64A_P_CAST(ROUNDUP_PTR(buf, 8));
|
||||||
u64a x12 = *(const u64a *)ROUNDUP_PTR(buf+8, 8);
|
u64a x12 = *CU64A_P_CAST(ROUNDUP_PTR(buf+8, 8));
|
||||||
if (x11 == x12) {
|
if (x11 == x12) {
|
||||||
return buf + floodBackoff;
|
return buf + floodBackoff;
|
||||||
}
|
}
|
||||||
u64a x21 = *(const u64a *)ROUNDUP_PTR(buf + len/2, 8);
|
u64a x21 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len/2, 8));
|
||||||
u64a x22 = *(const u64a *)ROUNDUP_PTR(buf + len/2 + 8, 8);
|
u64a x22 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len/2 + 8, 8));
|
||||||
if (x21 == x22) {
|
if (x21 == x22) {
|
||||||
return buf + floodBackoff;
|
return buf + floodBackoff;
|
||||||
}
|
}
|
||||||
u64a x31 = *(const u64a *)ROUNDUP_PTR(buf + len - 24, 8);
|
u64a x31 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len - 24, 8));
|
||||||
u64a x32 = *(const u64a *)ROUNDUP_PTR(buf + len - 16, 8);
|
u64a x32 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len - 16, 8));
|
||||||
if (x31 == x32) {
|
if (x31 == x32) {
|
||||||
return buf + floodBackoff;
|
return buf + floodBackoff;
|
||||||
}
|
}
|
||||||
@@ -106,9 +114,15 @@ const u8 * floodDetect(const struct FDR * fdr,
|
|||||||
|
|
||||||
// go from c to our FDRFlood structure
|
// go from c to our FDRFlood structure
|
||||||
u8 c = buf[i];
|
u8 c = buf[i];
|
||||||
|
#ifdef __cplusplus
|
||||||
|
const u8 * fBase = (reinterpret_cast<const u8 *>(fdr)) + fdr->floodOffset;
|
||||||
|
u32 fIdx = (reinterpret_cast<const u32 *>(fBase))[c];
|
||||||
|
const struct FDRFlood * fsb = reinterpret_cast<const struct FDRFlood *>(fBase + sizeof(u32) * 256);
|
||||||
|
#else
|
||||||
const u8 * fBase = ((const u8 *)fdr) + fdr->floodOffset;
|
const u8 * fBase = ((const u8 *)fdr) + fdr->floodOffset;
|
||||||
u32 fIdx = ((const u32 *)fBase)[c];
|
u32 fIdx = ((const u32 *)fBase)[c];
|
||||||
const struct FDRFlood * fsb = (const struct FDRFlood *)(fBase + sizeof(u32) * 256);
|
const struct FDRFlood * fsb = (const struct FDRFlood *)(fBase + sizeof(u32) * 256);
|
||||||
|
#endif
|
||||||
const struct FDRFlood * fl = &fsb[fIdx];
|
const struct FDRFlood * fl = &fsb[fIdx];
|
||||||
|
|
||||||
#ifndef FLOOD_32
|
#ifndef FLOOD_32
|
||||||
@@ -116,7 +130,7 @@ const u8 * floodDetect(const struct FDR * fdr,
|
|||||||
cmpVal |= cmpVal << 8;
|
cmpVal |= cmpVal << 8;
|
||||||
cmpVal |= cmpVal << 16;
|
cmpVal |= cmpVal << 16;
|
||||||
cmpVal |= cmpVal << 32;
|
cmpVal |= cmpVal << 32;
|
||||||
u64a probe = *(const u64a *)ROUNDUP_PTR(buf+i, 8);
|
u64a probe = *CU64A_P_CAST(ROUNDUP_PTR(buf+i, 8));
|
||||||
#else
|
#else
|
||||||
u32 cmpVal = c;
|
u32 cmpVal = c;
|
||||||
cmpVal |= cmpVal << 8;
|
cmpVal |= cmpVal << 8;
|
||||||
@@ -139,16 +153,16 @@ const u8 * floodDetect(const struct FDR * fdr,
|
|||||||
#ifndef FLOOD_32
|
#ifndef FLOOD_32
|
||||||
j -= (u32)((uintptr_t)buf + j) & 0x7; // push j back to yield 8-aligned addrs
|
j -= (u32)((uintptr_t)buf + j) & 0x7; // push j back to yield 8-aligned addrs
|
||||||
for (; j + 32 < mainLoopLen; j += 32) {
|
for (; j + 32 < mainLoopLen; j += 32) {
|
||||||
u64a v = *(const u64a *)(buf + j);
|
u64a v = *CU64A_P_CAST(buf + j);
|
||||||
u64a v2 = *(const u64a *)(buf + j + 8);
|
u64a v2 = *CU64A_P_CAST(buf + j + 8);
|
||||||
u64a v3 = *(const u64a *)(buf + j + 16);
|
u64a v3 = *CU64A_P_CAST(buf + j + 16);
|
||||||
u64a v4 = *(const u64a *)(buf + j + 24);
|
u64a v4 = *CU64A_P_CAST(buf + j + 24);
|
||||||
if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) {
|
if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (; j + 8 < mainLoopLen; j += 8) {
|
for (; j + 8 < mainLoopLen; j += 8) {
|
||||||
u64a v = *(const u64a *)(buf + j);
|
u64a v = *CU64A_P_CAST(buf + j);
|
||||||
if (v != cmpVal) {
|
if (v != cmpVal) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -172,7 +186,11 @@ const u8 * floodDetect(const struct FDR * fdr,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
for (; j < mainLoopLen; j++) {
|
for (; j < mainLoopLen; j++) {
|
||||||
|
#ifdef __cplusplus
|
||||||
|
u8 v = *(reinterpret_cast<const u8 *>(buf + j));
|
||||||
|
#else
|
||||||
u8 v = *(const u8 *)(buf + j);
|
u8 v = *(const u8 *)(buf + j);
|
||||||
|
#endif
|
||||||
if (v != c) {
|
if (v != c) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|||||||
1116
src/fdr/teddy.c
1116
src/fdr/teddy.c
File diff suppressed because it is too large
Load Diff
862
src/fdr/teddy.cpp
Normal file
862
src/fdr/teddy.cpp
Normal file
@@ -0,0 +1,862 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2015-2020, Intel Corporation
|
||||||
|
* Copyright (c) 2024, VectorCamp PC
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** \file
|
||||||
|
* \brief Teddy literal matcher: SSSE3 engine runtime.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "fdr_internal.h"
|
||||||
|
#include "flood_runtime.h"
|
||||||
|
#include "teddy.h"
|
||||||
|
#include "teddy_internal.h"
|
||||||
|
#include "teddy_runtime_common.h"
|
||||||
|
#include "util/arch.h"
|
||||||
|
#include "util/simd_utils.h"
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef ARCH_64_BIT
|
||||||
|
static really_inline
|
||||||
|
hwlm_error_t conf_chunk_64(u64a chunk, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *pt,
|
||||||
|
const u32* confBase,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(chunk != ones_u64a)) {
|
||||||
|
chunk = ~chunk;
|
||||||
|
do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
|
||||||
|
control, last_match);
|
||||||
|
// adapted from CHECK_HWLM_TERMINATE_MATCHING
|
||||||
|
if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
|
||||||
|
return HWLM_TERMINATED;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define CONF_CHUNK_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
|
||||||
|
if(conf_chunk_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||||
|
|
||||||
|
#else // 32/64
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
hwlm_error_t conf_chunk_32(u32 chunk, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *pt,
|
||||||
|
const u32* confBase,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(chunk != ones_u32)) {
|
||||||
|
chunk = ~chunk;
|
||||||
|
do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
|
||||||
|
control, last_match);
|
||||||
|
// adapted from CHECK_HWLM_TERMINATE_MATCHING
|
||||||
|
if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
|
||||||
|
return HWLM_TERMINATED;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define CONF_CHUNK_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
|
||||||
|
if(conf_chunk_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX512VBMI) || defined(HAVE_AVX512) // common to both 512b's
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
|
||||||
|
return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
|
||||||
|
+ ROUNDUP_CL(2 * numMask * sizeof(m256)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef ARCH_64_BIT
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
hwlm_error_t confirm_teddy_64_512(m512 var, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *ptr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
const u32* confBase, hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(diff512(var, ones512()))) {
|
||||||
|
m128 p128_0 = extract128from512(var, 0);
|
||||||
|
m128 p128_1 = extract128from512(var, 1);
|
||||||
|
m128 p128_2 = extract128from512(var, 2);
|
||||||
|
m128 p128_3 = extract128from512(var, 3);
|
||||||
|
u64a part1 = movq(p128_0);
|
||||||
|
u64a part2 = movq(rshiftbyte_m128(p128_0, 8));
|
||||||
|
u64a part3 = movq(p128_1);
|
||||||
|
u64a part4 = movq(rshiftbyte_m128(p128_1, 8));
|
||||||
|
u64a part5 = movq(p128_2);
|
||||||
|
u64a part6 = movq(rshiftbyte_m128(p128_2, 8));
|
||||||
|
u64a part7 = movq(p128_3);
|
||||||
|
u64a part8 = movq(rshiftbyte_m128(p128_3, 8));
|
||||||
|
CONF_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_64(part2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_64(part3, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_64(part4, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_64(part5, bucket, offset + 32, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_64(part6, bucket, offset + 40, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_64(part7, bucket, offset + 48, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_64(part8, bucket, offset + 56, reason, ptr, confBase, a, control, last_match);
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define confirm_teddy_512_f confirm_teddy_64_512
|
||||||
|
|
||||||
|
#else // 32/64
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
hwlm_error_t confirm_teddy_32_512(m512 var, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *ptr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
const u32* confBase, hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(diff512(var, ones512()))) {
|
||||||
|
m128 p128_0 = extract128from512(var, 0);
|
||||||
|
m128 p128_1 = extract128from512(var, 1);
|
||||||
|
m128 p128_2 = extract128from512(var, 2);
|
||||||
|
m128 p128_3 = extract128from512(var, 3);
|
||||||
|
u32 part1 = movd(p128_0);
|
||||||
|
u32 part2 = movd(rshiftbyte_m128(p128_0, 4));
|
||||||
|
u32 part3 = movd(rshiftbyte_m128(p128_0, 8));
|
||||||
|
u32 part4 = movd(rshiftbyte_m128(p128_0, 12));
|
||||||
|
u32 part5 = movd(p128_1);
|
||||||
|
u32 part6 = movd(rshiftbyte_m128(p128_1, 4));
|
||||||
|
u32 part7 = movd(rshiftbyte_m128(p128_1, 8));
|
||||||
|
u32 part8 = movd(rshiftbyte_m128(p128_1, 12));
|
||||||
|
u32 part9 = movd(p128_2);
|
||||||
|
u32 part10 = movd(rshiftbyte_m128(p128_2, 4));
|
||||||
|
u32 part11 = movd(rshiftbyte_m128(p128_2, 8));
|
||||||
|
u32 part12 = movd(rshiftbyte_m128(p128_2, 12));
|
||||||
|
u32 part13 = movd(p128_3);
|
||||||
|
u32 part14 = movd(rshiftbyte_m128(p128_3, 4));
|
||||||
|
u32 part15 = movd(rshiftbyte_m128(p128_3, 8));
|
||||||
|
u32 part16 = movd(rshiftbyte_m128(p128_3, 12));
|
||||||
|
CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part9, bucket, offset + 32, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part10, bucket, offset + 36, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part11, bucket, offset + 40, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part12, bucket, offset + 44, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part13, bucket, offset + 48, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part14, bucket, offset + 52, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part15, bucket, offset + 56, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part16, bucket, offset + 60, reason, ptr, confBase, a, control, last_match);
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define confirm_teddy_512_f confirm_teddy_32_512
|
||||||
|
|
||||||
|
|
||||||
|
#endif // 32/64
|
||||||
|
|
||||||
|
#define CONFIRM_TEDDY_512(...) if(confirm_teddy_512_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||||
|
|
||||||
|
#endif // AVX512VBMI or AVX512
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX512VBMI) // VBMI strong teddy
|
||||||
|
|
||||||
|
#define TEDDY_VBMI_SL1_MASK 0xfffffffffffffffeULL
|
||||||
|
#define TEDDY_VBMI_SL2_MASK 0xfffffffffffffffcULL
|
||||||
|
#define TEDDY_VBMI_SL3_MASK 0xfffffffffffffff8ULL
|
||||||
|
|
||||||
|
template<int NMSK>
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_teddy_512vbmi_templ(const m512 *lo_mask, const m512 *dup_mask,
|
||||||
|
const m512 *sl_msk, const m512 val) {
|
||||||
|
m512 lo = and512(val, *lo_mask);
|
||||||
|
m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
|
||||||
|
m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),
|
||||||
|
pshufb_m512(dup_mask[1], hi));
|
||||||
|
|
||||||
|
if constexpr (NMSK == 1) return shuf_or_b0;
|
||||||
|
m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),
|
||||||
|
pshufb_m512(dup_mask[3], hi));
|
||||||
|
m512 sl1 = maskz_vpermb512(TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
|
||||||
|
if constexpr (NMSK == 2) return (or512(sl1, shuf_or_b0));
|
||||||
|
m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),
|
||||||
|
pshufb_m512(dup_mask[5], hi));
|
||||||
|
m512 sl2 = maskz_vpermb512(TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
|
||||||
|
if constexpr (NMSK == 3) return (or512(sl2, or512(sl1, shuf_or_b0)));
|
||||||
|
m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),
|
||||||
|
pshufb_m512(dup_mask[7], hi));
|
||||||
|
m512 sl3 = maskz_vpermb512(TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
|
||||||
|
return (or512(sl3, or512(sl2, or512(sl1, shuf_or_b0))));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define TEDDY_VBMI_SL1_POS 15
|
||||||
|
#define TEDDY_VBMI_SL2_POS 14
|
||||||
|
#define TEDDY_VBMI_SL3_POS 13
|
||||||
|
|
||||||
|
#define TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffffffffffULL >> n_sh)
|
||||||
|
#define TEDDY_VBMI_CONF_MASK_FULL (0xffffffffffffffffULL << n_sh)
|
||||||
|
#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap)
|
||||||
|
#define TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffffffffffULL >> (64 - n_sh))
|
||||||
|
|
||||||
|
template<int NMSK>
|
||||||
|
hwlm_error_t fdr_exec_teddy_512vbmi_templ(const struct FDR *fdr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t control) {
|
||||||
|
const u8 *buf_end = a->buf + a->len;
|
||||||
|
const u8 *ptr = a->buf + a->start_offset;
|
||||||
|
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||||
|
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||||
|
u32 last_match = ones_u32;
|
||||||
|
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||||
|
const size_t iterBytes = 64;
|
||||||
|
u32 n_sh = NMSK - 1;
|
||||||
|
const size_t loopBytes = 64 - n_sh;
|
||||||
|
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||||
|
a->buf, a->len, a->start_offset);
|
||||||
|
|
||||||
|
const m128 *maskBase = getMaskBase(teddy);
|
||||||
|
|
||||||
|
m512 lo_mask = set1_64x8(0xf);
|
||||||
|
m512 dup_mask[NMSK * 2];
|
||||||
|
m512 sl_msk[NMSK - 1];
|
||||||
|
dup_mask[0] = set1_4x128(maskBase[0]);
|
||||||
|
dup_mask[1] = set1_4x128(maskBase[1]);
|
||||||
|
if constexpr (NMSK > 1){
|
||||||
|
dup_mask[2] = set1_4x128(maskBase[2]);
|
||||||
|
dup_mask[3] = set1_4x128(maskBase[3]);
|
||||||
|
sl_msk[0] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL1_POS);
|
||||||
|
}
|
||||||
|
if constexpr (NMSK > 2){
|
||||||
|
dup_mask[4] = set1_4x128(maskBase[4]);
|
||||||
|
dup_mask[5] = set1_4x128(maskBase[5]);
|
||||||
|
sl_msk[1] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL2_POS);
|
||||||
|
}
|
||||||
|
if constexpr (NMSK > 3){
|
||||||
|
dup_mask[6] = set1_4x128(maskBase[6]);
|
||||||
|
dup_mask[7] = set1_4x128(maskBase[7]);
|
||||||
|
sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS);
|
||||||
|
}
|
||||||
|
const u32 *confBase = getConfBase(teddy);
|
||||||
|
|
||||||
|
u64a k = TEDDY_VBMI_CONF_MASK_FULL;
|
||||||
|
m512 p_mask = set_mask_m512(~k);
|
||||||
|
u32 overlap = 0;
|
||||||
|
u64a patch = 0;
|
||||||
|
if (likely(ptr + loopBytes <= buf_end)) {
|
||||||
|
m512 p_mask0 = set_mask_m512(~TEDDY_VBMI_CONF_MASK_HEAD);
|
||||||
|
m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, loadu512(ptr));
|
||||||
|
r_0 = or512(r_0, p_mask0);
|
||||||
|
CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
|
||||||
|
ptr += loopBytes;
|
||||||
|
overlap = n_sh;
|
||||||
|
patch = TEDDY_VBMI_LOAD_MASK_PATCH;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {
|
||||||
|
__builtin_prefetch(ptr - n_sh + (64 * 2));
|
||||||
|
CHECK_FLOOD;
|
||||||
|
m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, loadu512(ptr - n_sh));
|
||||||
|
r_0 = or512(r_0, p_mask);
|
||||||
|
CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr - n_sh);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(ptr + loopBytes > buf_end);
|
||||||
|
if (ptr < buf_end) {
|
||||||
|
u32 left = (u32)(buf_end - ptr);
|
||||||
|
u64a k1 = TEDDY_VBMI_CONF_MASK_VAR(left);
|
||||||
|
m512 p_mask1 = set_mask_m512(~k1);
|
||||||
|
m512 val_0 = loadu_maskz_m512(k1 | patch, ptr - overlap);
|
||||||
|
m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, val_0);
|
||||||
|
r_0 = or512(r_0, p_mask1);
|
||||||
|
CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr - overlap);
|
||||||
|
}
|
||||||
|
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_512vbmi_templ
|
||||||
|
|
||||||
|
#elif defined(HAVE_AVX512) // AVX512 reinforced teddy
|
||||||
|
|
||||||
|
/* both 512b versions use the same confirm teddy */
|
||||||
|
|
||||||
|
template <int NMSK>
|
||||||
|
static inline
|
||||||
|
m512 shift_or_512_templ(const m512 *dup_mask, m512 lo, m512 hi) {
|
||||||
|
return or512(lshift128_m512(or512(pshufb_m512(dup_mask[(NMSK - 1) * 2], lo),
|
||||||
|
pshufb_m512(dup_mask[(NMSK * 2) - 1], hi)),
|
||||||
|
NMSK - 1), shift_or_512_templ<NMSK - 1>(dup_mask, lo, hi));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
m512 shift_or_512_templ<1>(const m512 *dup_mask, m512 lo, m512 hi){
|
||||||
|
return or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int NMSK>
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_teddy_no_reinforcement_512_templ(const m512 *lo_mask,
|
||||||
|
const m512 *dup_mask,
|
||||||
|
const m512 val) {
|
||||||
|
m512 lo = and512(val, *lo_mask);
|
||||||
|
m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
|
||||||
|
return shift_or_512_templ<NMSK>(dup_mask, lo, hi);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int NMSK>
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_teddy_512_templ(const m512 *lo_mask, const m512 *dup_mask,
|
||||||
|
const u8 *ptr, const u64a *r_msk_base,
|
||||||
|
u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
|
||||||
|
m512 lo = and512(load512(ptr), *lo_mask);
|
||||||
|
m512 hi = and512(rshift64_m512(load512(ptr), 4), *lo_mask);
|
||||||
|
*c_16 = *(ptr + 15);
|
||||||
|
*c_32 = *(ptr + 31);
|
||||||
|
*c_48 = *(ptr + 47);
|
||||||
|
m512 r_msk = set8x64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],
|
||||||
|
0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);
|
||||||
|
*c_0 = *(ptr + 63);
|
||||||
|
return or512(shift_or_512_templ<NMSK>(dup_mask, lo, hi), r_msk);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define PREP_CONF_FN_512(ptr, n) \
|
||||||
|
prep_conf_teddy_512_templ<n>(&lo_mask, dup_mask, ptr, r_msk_base, \
|
||||||
|
&c_0, &c_16, &c_32, &c_48)
|
||||||
|
|
||||||
|
/**
 * \brief AVX512 Teddy scan entry point, templated on the number of masks.
 *
 * Scans [a->buf + a->start_offset, a->buf + a->len) in 64-byte vectors,
 * two vectors per main-loop iteration (iterBytes == 128). Head and tail
 * fragments use cautious (partial, history-aware) loads; the main loop uses
 * aligned full-width loads. Matches are reported through the confirm
 * machinery invoked by CONFIRM_TEDDY_512, which returns HWLM_TERMINATED
 * early if the caller's callback asks to stop.
 */
template <int NMSK>
hwlm_error_t fdr_exec_teddy_512_templ(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = ones_u32;
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 128;
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m128 *maskBase = getMaskBase(teddy);

    // Broadcast each 128-bit nibble mask to all four lanes of a 512-bit
    // register; two entries (lo nibble, hi nibble) per mask stage.
    m512 lo_mask = set1_64x8(0xf);
    m512 dup_mask[NMSK * 2];

    dup_mask[0] = set1_4x128(maskBase[0]);
    dup_mask[1] = set1_4x128(maskBase[1]);
    if constexpr (NMSK > 1){
        dup_mask[2] = set1_4x128(maskBase[2]);
        dup_mask[3] = set1_4x128(maskBase[3]);
    }
    if constexpr (NMSK > 2){
        dup_mask[4] = set1_4x128(maskBase[4]);
        dup_mask[5] = set1_4x128(maskBase[5]);
    }
    if constexpr (NMSK > 3){
        dup_mask[6] = set1_4x128(maskBase[6]);
        dup_mask[7] = set1_4x128(maskBase[7]);
    }
    const u32 *confBase = getConfBase(teddy);

    // Reinforcement carry bytes: the byte preceding each 16-byte lane of the
    // next chunk. 0x100 is the out-of-band "no previous byte" index into
    // r_msk_base used before any real byte has been seen.
    const u64a *r_msk_base = getReinforcedMaskBase(teddy, NMSK);
    u32 c_0 = 0x100;
    u32 c_16 = 0x100;
    u32 c_32 = 0x100;
    u32 c_48 = 0x100;
    const u8 *mainStart = ROUNDUP_PTR(ptr, 64);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
    if (ptr < mainStart) {
        // Head: one cautious, possibly partial load to reach 64B alignment.
        ptr = mainStart - 64;
        m512 p_mask;
        m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset,
                                     a->buf, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m512 r_0 = prep_conf_teddy_no_reinforcement_512_templ<NMSK>(&lo_mask, dup_mask, val_0);
        r_0 = or512(r_0, p_mask);
        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
        ptr += 64;
    }

    // One single-vector step so the double-vector main loop starts with a
    // full 128-byte runway.
    if (ptr + 64 <= buf_end) {
        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
        ptr += 64;
    }

    // Main loop: aligned full loads, two vectors per iteration; CHECK_FLOOD
    // throttles pathological repeat-heavy inputs.
    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes * 4));
        CHECK_FLOOD;
        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
        CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        m512 r_1 = PREP_CONF_FN_512(ptr + 64, NMSK);
        CONFIRM_TEDDY_512(r_1, 8, 64, NOT_CAUTIOUS, ptr);
    }

    // At most one more full vector may remain after the main loop.
    if (ptr + 64 <= buf_end) {
        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
        CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        ptr += 64;
    }

    // Tail: fewer than 64 bytes remain; cautious partial load again.
    assert(ptr + 64 > buf_end);
    if (ptr < buf_end) {
        m512 p_mask;
        m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m512 r_0 = prep_conf_teddy_no_reinforcement_512_templ<NMSK>(&lo_mask, dup_mask,val_0);
        r_0 = or512(r_0, p_mask);
        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
    }

    return HWLM_SUCCESS;
}
|
||||||
|
|
||||||
|
|
||||||
|
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_512_templ
|
||||||
|
|
||||||
|
/* NOTE: the former AVX512 vs AVX512VBMI #if split ended here; the code is
 * back to the original, fully exclusive #if/#elif arch selection. */
|
||||||
|
|
||||||
|
#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
|
||||||
|
|
||||||
|
#ifdef ARCH_64_BIT
|
||||||
|
|
||||||
|
hwlm_error_t confirm_teddy_64_256(m256 var, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *ptr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
const u32* confBase, hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(diff256(var, ones256()))) {
|
||||||
|
m128 lo = movdq_lo(var);
|
||||||
|
m128 hi = movdq_hi(var);
|
||||||
|
u64a part1 = movq(lo);
|
||||||
|
u64a part2 = movq(rshiftbyte_m128(lo, 8));
|
||||||
|
u64a part3 = movq(hi);
|
||||||
|
u64a part4 = movq(rshiftbyte_m128(hi, 8));
|
||||||
|
CONF_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_64(part2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_64(part3, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_64(part4, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define confirm_teddy_256_f confirm_teddy_64_256
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
hwlm_error_t confirm_teddy_32_256(m256 var, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *ptr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
const u32* confBase, hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(diff256(var, ones256()))) {
|
||||||
|
m128 lo = movdq_lo(var);
|
||||||
|
m128 hi = movdq_hi(var);
|
||||||
|
u32 part1 = movd(lo);
|
||||||
|
u32 part2 = movd(rshiftbyte_m128(lo, 4));
|
||||||
|
u32 part3 = movd(rshiftbyte_m128(lo, 8));
|
||||||
|
u32 part4 = movd(rshiftbyte_m128(lo, 12));
|
||||||
|
u32 part5 = movd(hi);
|
||||||
|
u32 part6 = movd(rshiftbyte_m128(hi, 4));
|
||||||
|
u32 part7 = movd(rshiftbyte_m128(hi, 8));
|
||||||
|
u32 part8 = movd(rshiftbyte_m128(hi, 12));
|
||||||
|
CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_CHUNK_32(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define confirm_teddy_256_f confirm_teddy_32_256
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Runs the confirm stage on a 256-bit candidate state and aborts the whole
 * scan (propagates HWLM_TERMINATED to the enclosing function) if the
 * user callback asked to stop. Relies on locals a/confBase/control/last_match. */
#define CONFIRM_TEDDY_256(...) if(confirm_teddy_256_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||||
|
|
||||||
|
/*
|
||||||
|
static really_inline
|
||||||
|
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||||
|
const u8 *lo, const u8 *hi,
|
||||||
|
const u8 *buf_history, size_t len_history,
|
||||||
|
const u32 nMasks) {
|
||||||
|
m128 p_mask128;
|
||||||
|
m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
|
||||||
|
buf_history, len_history, nMasks));
|
||||||
|
*p_mask = set1_2x128(p_mask128);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Recursive template: OR together the pshufb results of all NMSK mask
 * stages. Stage NMSK's lo/hi shuffle result is byte-shifted left by
 * (NMSK-1) within each 128-bit lane so every stage lines up on the same
 * end position, then combined with the (NMSK-1)-stage recursion.
 * Recursion depth is fixed at compile time; the <1> specialisation below
 * is the base case. */
template <int NMSK>
static inline
m256 shift_or_256_templ(const m256 *dup_mask, m256 lo, m256 hi){
    return or256(lshift128_m256(or256(pshufb_m256(dup_mask[(NMSK-1)*2], lo),
                                      pshufb_m256(dup_mask[(NMSK*2)-1], hi)),
                 (NMSK-1)), shift_or_256_templ<NMSK-1>(dup_mask, lo, hi));
}
|
||||||
|
|
||||||
|
/* Base case of the recursion above: a single mask stage needs no shift,
 * just the OR of the lo-nibble and hi-nibble shuffles. */
template<>
m256 shift_or_256_templ<1>(const m256 *dup_mask, m256 lo, m256 hi){
    return or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi));
}
|
||||||
|
|
||||||
|
/* Nibble-split an already-loaded (possibly partially-masked) 256-bit value
 * and run the stage shuffle/OR cascade, without reinforcement masks. Used
 * for cautious head/tail loads where the bytes preceding each lane are not
 * available from a plain in-bounds load. */
template <int NMSK>
static really_inline
m256 prep_conf_teddy_no_reinforcement_256_templ(const m256 *lo_mask,
                                                const m256 *dup_mask,
                                                const m256 val) {
    m256 lo = and256(val, *lo_mask);
    m256 hi = and256(rshift64_m256(val, 4), *lo_mask);
    return shift_or_256_templ<NMSK>(dup_mask, lo, hi);
}
|
||||||
|
|
||||||
|
/* Full-chunk prep with reinforcement for an in-bounds 32-byte load at `ptr`.
 * *c_0 carries the last byte of the previous chunk and *c_128 is set to the
 * byte preceding the chunk's high 128-bit lane (ptr[15]); each indexes
 * r_msk_base to build the reinforcement mask OR-ed into the shuffle result.
 * On return *c_0 holds ptr[31], ready for the next chunk. */
template <int NMSK>
static really_inline
m256 prep_conf_teddy_256_templ(const m256 *lo_mask, const m256 *dup_mask,
                               const u8 *ptr, const u64a *r_msk_base,
                               u32 *c_0, u32 *c_128) {
    m256 lo = and256(load256(ptr), *lo_mask);
    m256 hi = and256(rshift64_m256(load256(ptr), 4), *lo_mask);
    *c_128 = *(ptr + 15);
    m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]);
    *c_0 = *(ptr + 31);
    return or256(shift_or_256_templ<NMSK>(dup_mask, lo, hi), r_msk);
}
|
||||||
|
|
||||||
|
/* Prep on an already-loaded (possibly masked) value: no reinforcement. */
#define PREP_CONF_FN_256_NO_REINFORCEMENT(val, n)                             \
    prep_conf_teddy_no_reinforcement_256_templ<n>(&lo_mask, dup_mask, val)

/* Prep on a full in-bounds 32-byte chunk at `ptr`, with reinforcement;
 * captures the enclosing function's locals. */
#define PREP_CONF_FN_256(ptr, n)                                              \
    prep_conf_teddy_256_templ<n>(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
|
||||||
|
|
||||||
|
/**
 * \brief AVX2 Teddy scan entry point, templated on the number of masks.
 *
 * Same shape as the 512-bit variant above, scaled to 32-byte vectors with
 * two vectors per main-loop iteration (iterBytes == 64): cautious head load
 * up to 32B alignment, one single-vector step, aligned double-vector main
 * loop with flood checking, one trailing vector, then a cautious tail load.
 */
template <int NMSK>
hwlm_error_t fdr_exec_teddy_256_templ(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = ones_u32;
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 64;
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m128 *maskBase = getMaskBase(teddy);
    //PREPARE_MASKS_256;

    // Duplicate each 128-bit nibble mask across both lanes; two entries
    // (lo nibble, hi nibble) per mask stage.
    m256 lo_mask = set1_32x8(0xf);
    m256 dup_mask[NMSK * 2];
    dup_mask[0] = set1_2x128(maskBase[0]);
    dup_mask[1] = set1_2x128(maskBase[1]);
    if constexpr (NMSK > 1){
        dup_mask[2] = set1_2x128(maskBase[2]);
        dup_mask[3] = set1_2x128(maskBase[3]);
    }
    if constexpr (NMSK > 2){
        dup_mask[4] = set1_2x128(maskBase[4]);
        dup_mask[5] = set1_2x128(maskBase[5]);
    }
    if constexpr (NMSK > 3){
        dup_mask[6] = set1_2x128(maskBase[6]);
        dup_mask[7] = set1_2x128(maskBase[7]);
    }
    const u32 *confBase = getConfBase(teddy);

    // Reinforcement carry bytes; 0x100 is the "no previous byte" sentinel
    // index into r_msk_base.
    const u64a *r_msk_base = getReinforcedMaskBase(teddy, NMSK);
    u32 c_0 = 0x100;
    u32 c_128 = 0x100;
    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
    if (ptr < mainStart) {
        // Head: one cautious, possibly partial load to reach 32B alignment.
        ptr = mainStart - 32;
        m256 p_mask;
        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset,
                                     a->buf, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m256 r_0 = PREP_CONF_FN_256_NO_REINFORCEMENT(val_0, NMSK);
        r_0 = or256(r_0, p_mask);
        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
        ptr += 32;
    }

    if (ptr + 32 <= buf_end) {
        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
        ptr += 32;
    }

    // Main loop: aligned full loads, two vectors per iteration.
    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes * 4));
        CHECK_FLOOD;
        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
        CONFIRM_TEDDY_256(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        m256 r_1 = PREP_CONF_FN_256(ptr + 32, NMSK);
        CONFIRM_TEDDY_256(r_1, 8, 32, NOT_CAUTIOUS, ptr);
    }

    if (ptr + 32 <= buf_end) {
        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
        CONFIRM_TEDDY_256(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        ptr += 32;
    }

    // Tail: fewer than 32 bytes remain; cautious partial load again.
    assert(ptr + 32 > buf_end);
    if (ptr < buf_end) {
        m256 p_mask;
        m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m256 r_0 = PREP_CONF_FN_256_NO_REINFORCEMENT(val_0, NMSK);
        r_0 = or256(r_0, p_mask);
        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
    }

    return HWLM_SUCCESS;
}
|
||||||
|
|
||||||
|
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_256_templ
|
||||||
|
|
||||||
|
#else // not defined HAVE_AVX2
|
||||||
|
|
||||||
|
#ifdef ARCH_64_BIT
|
||||||
|
/**
 * \brief Confirm stage for the SSE/NEON path on 64-bit builds.
 *
 * If any position in `var` signals a candidate, spill the 128-bit state to
 * an aligned array and run the confirm machinery on each 64-bit half.
 *
 * Fix: removed the dead `lo = 0; hi = 0;` initialisers — both variables were
 * unconditionally overwritten from `vec` before first use, so the zero
 * stores were pure noise (flagged by static analysers as redundant stores).
 */
static really_inline
hwlm_error_t confirm_teddy_64_128(m128 var, u8 bucket, u8 offset,
                                  CautionReason reason, const u8 *ptr,
                                  const struct FDR_Runtime_Args *a,
                                  const u32* confBase, hwlm_group_t *control,
                                  u32 *last_match) {
    if (unlikely(diff128(var, ones128()))) {
        u64a __attribute__((aligned(16))) vec[2];
        store128(vec, var);
        u64a lo = vec[0];
        u64a hi = vec[1];
        CONF_CHUNK_64(lo, bucket, offset, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_64(hi, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
    }
    return HWLM_SUCCESS;
}
|
||||||
|
|
||||||
|
#define confirm_teddy_128_f confirm_teddy_64_128
|
||||||
|
|
||||||
|
#else // 32/64
|
||||||
|
|
||||||
|
/* Confirm stage for the SSE/NEON path on 32-bit builds: if any position in
 * `var` signals a candidate (any bit differs from all-ones), split the
 * 128-bit state into four 32-bit chunks and run the confirm machinery on
 * each, lowest offset first. */
static really_inline
hwlm_error_t confirm_teddy_32_128(m128 var, u8 bucket, u8 offset,
                                  CautionReason reason, const u8 *ptr,
                                  const struct FDR_Runtime_Args *a,
                                  const u32* confBase, hwlm_group_t *control,
                                  u32 *last_match) {
    if (unlikely(diff128(var, ones128()))) {
        u32 part1 = movd(var);
        u32 part2 = movd(rshiftbyte_m128(var, 4));
        u32 part3 = movd(rshiftbyte_m128(var, 8));
        u32 part4 = movd(rshiftbyte_m128(var, 12));
        CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
    }
    return HWLM_SUCCESS;
}
|
||||||
|
#define confirm_teddy_128_f confirm_teddy_32_128
|
||||||
|
|
||||||
|
#endif // 32/64
|
||||||
|
|
||||||
|
|
||||||
|
/* Runs the confirm stage on a 128-bit candidate state and aborts the whole
 * scan (propagates HWLM_TERMINATED) if the user callback asked to stop. */
#define CONFIRM_TEDDY_128(...) if(confirm_teddy_128_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||||
|
|
||||||
|
/* Split each input byte of `val` into lo/hi nibbles and shuffle them through
 * the per-stage bucket masks (maskBase holds two m128s per stage: lo-nibble
 * then hi-nibble). Each later stage's result is shifted forward by one more
 * byte via palignr so all stages align on the same end position, then OR-ed
 * in. Unrolled at compile time: `if constexpr` exits early for NMSK < 4. */
template <int NMSK>
static really_inline
m128 prep_conf_teddy_128_templ(const m128 *maskBase, m128 val) {
    m128 mask = set1_16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift64_m128(val, 4), mask);
    m128 r1 = or128(pshufb_m128(maskBase[0 * 2], lo),
                    pshufb_m128(maskBase[0 * 2 + 1], hi));
    if constexpr (NMSK == 1) return r1;
    m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo),
                       pshufb_m128(maskBase[1 * 2 + 1], hi));

    // old_1 stands in for the (zero) state carried in from before this
    // block; palignr(res, old, 16 - k) shifts res forward by k bytes.
    m128 old_1 = zeroes128();
    m128 res_shifted_1 = palignr(res_1, old_1, 16 - 1);
    m128 r2 = or128(r1, res_shifted_1);
    if constexpr (NMSK == 2) return r2;
    m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo),
                       pshufb_m128(maskBase[2 * 2 + 1], hi));
    m128 res_shifted_2 = palignr(res_2, old_1, 16 - 2);
    m128 r3 = or128(r2, res_shifted_2);
    if constexpr (NMSK == 3) return r3;
    m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo),
                       pshufb_m128(maskBase[3 * 2 + 1], hi));
    m128 res_shifted_3 = palignr(res_3, old_1, 16 - 3);
    return or128(r3, res_shifted_3);
}
|
||||||
|
|
||||||
|
/**
 * \brief Baseline 128-bit Teddy scan entry point, templated on the number
 * of masks (SSE/NEON/VSX path; no reinforcement masks at this width).
 *
 * Same shape as the wider variants: cautious head load up to 16B alignment,
 * one single-vector step, aligned double-vector main loop (iterBytes == 32)
 * with flood checking, one trailing vector, then a cautious tail load.
 */
template <int NMSK>
hwlm_error_t fdr_exec_teddy_128_templ(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = ones_u32;
    const struct Teddy *teddy = reinterpret_cast<const struct Teddy *>(fdr);
    const size_t iterBytes = 32;
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m128 *maskBase = getMaskBase(teddy);
    const u32 *confBase = getConfBase(teddy);

    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
    if (ptr < mainStart) {
        // Head: one cautious, possibly partial load to reach 16B alignment.
        ptr = mainStart - 16;
        m128 p_mask;
        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset,
                                     a->buf, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, val_0);
        r_0 = or128(r_0, p_mask);
        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
        ptr += 16;
    }

    if (ptr + 16 <= buf_end) {
        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
        ptr += 16;
    }

    // Main loop: aligned full loads, two vectors per iteration.
    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes * 4));
        CHECK_FLOOD;
        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
        CONFIRM_TEDDY_128(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        m128 r_1 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr + 16));
        CONFIRM_TEDDY_128(r_1, 8, 16, NOT_CAUTIOUS, ptr);
    }

    if (ptr + 16 <= buf_end) {
        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
        CONFIRM_TEDDY_128(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        ptr += 16;
    }

    // Tail: fewer than 16 bytes remain; cautious partial load again.
    assert(ptr + 16 > buf_end);
    if (ptr < buf_end) {
        m128 p_mask;
        m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, val_0);
        r_0 = or128(r_0, p_mask);
        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
    }

    return HWLM_SUCCESS;
}
|
||||||
|
|
||||||
|
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_128_templ
|
||||||
|
|
||||||
|
|
||||||
|
#endif // HAVE_AVX2 HAVE_AVX512
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
extern "C" {

/* C-linkage entry points dispatched from the FDR runtime tables. Each one
 * instantiates whichever width-appropriate scanner FDR_EXEC_TEDDY_FN was
 * #defined to above (512/256/128-bit) for a fixed mask count 1..4.
 *
 * NOTE(review): the "_pck" (packed-confirm) variants are byte-identical to
 * the plain ones here; presumably the packed/unpacked difference lives
 * entirely in the confirm tables built at compile time — confirm against
 * the Teddy compiler before relying on this. */

hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<1>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<1>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<2>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<2>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<3>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<3>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<4>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<4>(fdr, a, control);
}

} // extern
|
||||||
|
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2016-2017, Intel Corporation
|
* Copyright (c) 2016-2017, Intel Corporation
|
||||||
|
* Copyright (c) 2024, VectorCamp PC
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@@ -39,6 +40,10 @@
|
|||||||
struct FDR; // forward declaration from fdr_internal.h
|
struct FDR; // forward declaration from fdr_internal.h
|
||||||
struct FDR_Runtime_Args;
|
struct FDR_Runtime_Args;
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
|
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
|
||||||
const struct FDR_Runtime_Args *a,
|
const struct FDR_Runtime_Args *a,
|
||||||
hwlm_group_t control);
|
hwlm_group_t control);
|
||||||
@@ -106,5 +111,8 @@ hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
|
|||||||
hwlm_group_t control);
|
hwlm_group_t control);
|
||||||
|
|
||||||
#endif /* HAVE_AVX2 */
|
#endif /* HAVE_AVX2 */
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif /* TEDDY_H_ */
|
#endif /* TEDDY_H_ */
|
||||||
|
|||||||
@@ -1,709 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2016-2020, Intel Corporation
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are permitted provided that the following conditions are met:
|
|
||||||
*
|
|
||||||
* * Redistributions of source code must retain the above copyright notice,
|
|
||||||
* this list of conditions and the following disclaimer.
|
|
||||||
* * Redistributions in binary form must reproduce the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer in the
|
|
||||||
* documentation and/or other materials provided with the distribution.
|
|
||||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
|
||||||
* may be used to endorse or promote products derived from this software
|
|
||||||
* without specific prior written permission.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
||||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
||||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
||||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
||||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
||||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
||||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
||||||
* POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/** \file
|
|
||||||
* \brief Teddy literal matcher: AVX2 engine runtime.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "fdr_internal.h"
|
|
||||||
#include "flood_runtime.h"
|
|
||||||
#include "teddy.h"
|
|
||||||
#include "teddy_internal.h"
|
|
||||||
#include "teddy_runtime_common.h"
|
|
||||||
#include "util/arch.h"
|
|
||||||
#include "util/simd_utils.h"
|
|
||||||
|
|
||||||
#if defined(HAVE_AVX2)
|
|
||||||
|
|
||||||
const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
|
|
||||||
};
|
|
||||||
|
|
||||||
#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy

/*
 * Confirm one 64-bit chunk of shuffle results for fat teddy.
 *
 * 'chunk' arrives in inverted sense: all-ones means "no candidate matches",
 * so the body is skipped in the common case. On a candidate the chunk is
 * re-inverted (set bits now mark candidate positions) and passed to
 * 'conf_fn' for exact confirmation against the literal tables. 'pt' is the
 * data pointer corresponding to this chunk's positions.
 *
 * NOTE: expands names from the enclosing scan loop: confBase, a, control,
 * last_match, and CHECK_HWLM_TERMINATE_MATCHING (which may return early
 * when the caller has asked matching to stop).
 */
#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn) \
do {                                                               \
    if (unlikely(chunk != ones_u64a)) {                            \
        chunk = ~chunk;                                            \
        conf_fn(&chunk, bucket, off, confBase, reason, a, pt,      \
                &control, &last_match);                            \
        CHECK_HWLM_TERMINATE_MATCHING;                             \
    }                                                              \
} while(0)

/*
 * 32-bit-build variant of CONF_FAT_CHUNK_64: identical contract but
 * operates on a 32-bit chunk (compared against ones_u32).
 */
#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn) \
do {                                                               \
    if (unlikely(chunk != ones_u32)) {                             \
        chunk = ~chunk;                                            \
        conf_fn(&chunk, bucket, off, confBase, reason, a, pt,      \
                &control, &last_match);                            \
        CHECK_HWLM_TERMINATE_MATCHING;                             \
    }                                                              \
} while(0)
|
|
||||||
|
|
||||||
static really_inline
|
|
||||||
const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
|
|
||||||
return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
|
|
||||||
+ ROUNDUP_CL(2 * numMask * sizeof(m256)));
|
|
||||||
}
|
|
||||||
|
|
||||||
#else

/*
 * AVX2 (non-VBMI) fat teddy confirm for one 64-bit chunk. Same inverted
 * sense as the VBMI variant — all-ones means "no candidates" — but the
 * data pointer handed to 'conf_fn' is always the scan loop's 'ptr',
 * captured from the expansion site rather than passed as a parameter.
 *
 * NOTE: expands names from the enclosing scan loop: confBase, a, ptr,
 * control, last_match, CHECK_HWLM_TERMINATE_MATCHING.
 */
#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn)     \
do {                                                               \
    if (unlikely(chunk != ones_u64a)) {                            \
        chunk = ~chunk;                                            \
        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,     \
                &control, &last_match);                            \
        CHECK_HWLM_TERMINATE_MATCHING;                             \
    }                                                              \
} while(0)

/* 32-bit-build variant of the AVX2 CONF_FAT_CHUNK_64 above. */
#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn)     \
do {                                                               \
    if (unlikely(chunk != ones_u32)) {                             \
        chunk = ~chunk;                                            \
        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,     \
                &control, &last_match);                            \
        CHECK_HWLM_TERMINATE_MATCHING;                             \
    }                                                              \
} while(0)
|
|
||||||
|
|
||||||
static really_inline
|
|
||||||
const m256 *getMaskBase_fat(const struct Teddy *teddy) {
|
|
||||||
return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy

/*
 * vpermb index table used by CONFIRM_FAT_TEDDY to interleave the two
 * 32-byte halves of a 512-bit shuffle result: output byte 2i comes from
 * the low half (buckets 0..7) and byte 2i+1 from the high half (buckets
 * 8..15), producing the 2-bytes-per-input-position layout that the
 * confirmation chunk extraction below expects.
 */
const u8 ALIGN_AVX_DIRECTIVE p_mask_interleave[64] = {
    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
    8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
    24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
};
|
|
||||||
|
|
||||||
#ifdef ARCH_64_BIT
/*
 * VBMI fat teddy confirm step (64-bit build). If 'var' is not all-ones
 * (i.e. some position has a candidate), interleave its two halves via
 * vpermb so each input position occupies 2 adjacent bytes, then carve the
 * 512-bit result into eight 64-bit chunks (4 positions each) and confirm
 * each chunk. Chunk offsets advance by 4 positions apiece.
 */
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn)       \
do {                                                                      \
    if (unlikely(diff512(var, ones512()))) {                              \
        m512 msk_interleave = load512(p_mask_interleave);                 \
        m512 r = vpermb512(msk_interleave, var);                          \
        m128 r0 = extract128from512(r, 0);                                \
        m128 r1 = extract128from512(r, 1);                                \
        m128 r2 = extract128from512(r, 2);                                \
        m128 r3 = extract128from512(r, 3);                                \
        u64a part1 = movq(r0);                                            \
        u64a part2 = extract64from128(r0, 1);                             \
        u64a part3 = movq(r1);                                            \
        u64a part4 = extract64from128(r1, 1);                             \
        u64a part5 = movq(r2);                                            \
        u64a part6 = extract64from128(r2, 1);                             \
        u64a part7 = movq(r3);                                            \
        u64a part8 = extract64from128(r3, 1);                             \
        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn);    \
        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, pt, conf_fn); \
    }                                                                     \
} while(0)
#else
/*
 * VBMI fat teddy confirm step (32-bit build): same interleave, but the
 * result is carved into sixteen 32-bit chunks (2 positions each), so chunk
 * offsets advance by 2 positions apiece.
 */
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn)       \
do {                                                                      \
    if (unlikely(diff512(var, ones512()))) {                              \
        m512 msk_interleave = load512(p_mask_interleave);                 \
        m512 r = vpermb512(msk_interleave, var);                          \
        m128 r0 = extract128from512(r, 0);                                \
        m128 r1 = extract128from512(r, 1);                                \
        m128 r2 = extract128from512(r, 2);                                \
        m128 r3 = extract128from512(r, 3);                                \
        u32 part1 = movd(r0);                                             \
        u32 part2 = extract32from128(r0, 1);                              \
        u32 part3 = extract32from128(r0, 2);                              \
        u32 part4 = extract32from128(r0, 3);                              \
        u32 part5 = movd(r1);                                             \
        u32 part6 = extract32from128(r1, 1);                              \
        u32 part7 = extract32from128(r1, 2);                              \
        u32 part8 = extract32from128(r1, 3);                              \
        u32 part9 = movd(r2);                                             \
        u32 part10 = extract32from128(r2, 1);                             \
        u32 part11 = extract32from128(r2, 2);                             \
        u32 part12 = extract32from128(r2, 3);                             \
        u32 part13 = movd(r3);                                            \
        u32 part14 = extract32from128(r3, 1);                             \
        u32 part15 = extract32from128(r3, 2);                             \
        u32 part16 = extract32from128(r3, 3);                             \
        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn);    \
        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, pt, conf_fn); \
        CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, pt, conf_fn); \
    }                                                                     \
} while(0)
#endif
|
|
||||||
|
|
||||||
/*
 * Split 'val' into per-byte low and high nibbles using *lo_mask (0x0f in
 * every byte). Declares 'lo' and 'hi' at the expansion site; the
 * PSHUFB_OR macros below consume them.
 */
#define PREP_FAT_SHUF_MASK                                       \
    m512 lo = and512(val, *lo_mask);                             \
    m512 hi = and512(rshift64_m512(val, 4), *lo_mask)

/*
 * FAT_TEDDY_VBMI_PSHUFB_OR_Mn: shuffle the duplicated mask tables by the
 * lo/hi nibbles and OR the pair together, one result per teddy mask
 * stage. Each Mn expands M(n-1) first, so stage results shuf_or_b0..b(n-1)
 * are all declared in order at the expansion site.
 */
#define FAT_TEDDY_VBMI_PSHUFB_OR_M1                              \
    m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),        \
                            pshufb_m512(dup_mask[1], hi));

#define FAT_TEDDY_VBMI_PSHUFB_OR_M2                              \
    FAT_TEDDY_VBMI_PSHUFB_OR_M1                                  \
    m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),        \
                            pshufb_m512(dup_mask[3], hi));

#define FAT_TEDDY_VBMI_PSHUFB_OR_M3                              \
    FAT_TEDDY_VBMI_PSHUFB_OR_M2                                  \
    m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),        \
                            pshufb_m512(dup_mask[5], hi));

#define FAT_TEDDY_VBMI_PSHUFB_OR_M4                              \
    FAT_TEDDY_VBMI_PSHUFB_OR_M3                                  \
    m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),        \
                            pshufb_m512(dup_mask[7], hi));

/*
 * maskz_vpermb512 write-masks for the shifted stage results: lane 0 of
 * each 256-bit half is zeroed when shifting by 1/2/3 positions so bytes
 * do not leak across the half boundary.
 */
#define FAT_TEDDY_VBMI_SL1_MASK   0xfffffffefffffffeULL
#define FAT_TEDDY_VBMI_SL2_MASK   0xfffffffcfffffffcULL
#define FAT_TEDDY_VBMI_SL3_MASK   0xfffffff8fffffff8ULL

/*
 * FAT_TEDDY_VBMI_SHIFT_Mn: shift stage result n left by n positions (via a
 * masked vpermb with the precomputed sl_msk permutations) so all stages
 * align on the same input position. M1 needs no shift and is empty.
 */
#define FAT_TEDDY_VBMI_SHIFT_M1

#define FAT_TEDDY_VBMI_SHIFT_M2                                  \
    FAT_TEDDY_VBMI_SHIFT_M1                                      \
    m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);

#define FAT_TEDDY_VBMI_SHIFT_M3                                  \
    FAT_TEDDY_VBMI_SHIFT_M2                                      \
    m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);

#define FAT_TEDDY_VBMI_SHIFT_M4                                  \
    FAT_TEDDY_VBMI_SHIFT_M3                                      \
    m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);

/*
 * FAT_SHIFT_OR_Mn: final OR-combine of the aligned stage results; a
 * position survives (stays candidate) only if every stage flagged it.
 */
#define FAT_SHIFT_OR_M1                                          \
    shuf_or_b0

#define FAT_SHIFT_OR_M2                                          \
    or512(sl1, FAT_SHIFT_OR_M1)

#define FAT_SHIFT_OR_M3                                          \
    or512(sl2, FAT_SHIFT_OR_M2)

#define FAT_SHIFT_OR_M4                                          \
    or512(sl3, FAT_SHIFT_OR_M3)
|
|
||||||
|
|
||||||
/*
 * prep_conf_fat_teddy_mN (VBMI): compute the combined shuffle result for an
 * N-mask fat teddy over one 512-bit input 'val'. Built entirely from the
 * PREP/PSHUFB_OR/SHIFT/SHIFT_OR macro families above, which communicate via
 * locals declared at expansion. sl_msk is unused for N == 1 (no shift
 * needed, hence UNUSED).
 */
static really_inline
m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
                            UNUSED const m512 *sl_msk, const m512 val) {
    PREP_FAT_SHUF_MASK;
    FAT_TEDDY_VBMI_PSHUFB_OR_M1;
    FAT_TEDDY_VBMI_SHIFT_M1;
    return FAT_SHIFT_OR_M1;
}

/* 2-mask variant: one extra shuffle stage, shifted by 1 position. */
static really_inline
m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
                            const m512 *sl_msk, const m512 val) {
    PREP_FAT_SHUF_MASK;
    FAT_TEDDY_VBMI_PSHUFB_OR_M2;
    FAT_TEDDY_VBMI_SHIFT_M2;
    return FAT_SHIFT_OR_M2;
}

/* 3-mask variant: stages shifted by 1 and 2 positions. */
static really_inline
m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
                            const m512 *sl_msk, const m512 val) {
    PREP_FAT_SHUF_MASK;
    FAT_TEDDY_VBMI_PSHUFB_OR_M3;
    FAT_TEDDY_VBMI_SHIFT_M3;
    return FAT_SHIFT_OR_M3;
}

/* 4-mask variant: stages shifted by 1, 2 and 3 positions. */
static really_inline
m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
                            const m512 *sl_msk, const m512 val) {
    PREP_FAT_SHUF_MASK;
    FAT_TEDDY_VBMI_PSHUFB_OR_M4;
    FAT_TEDDY_VBMI_SHIFT_M4;
    return FAT_SHIFT_OR_M4;
}
|
|
||||||
|
|
||||||
/* Dispatch to the N-mask prep function; expects lo_mask and sl_msk to be
 * declared at the expansion site (by PREPARE_FAT_MASKS). */
#define PREP_CONF_FAT_FN(val, n)                                 \
    prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, sl_msk, val)

/*
 * Offsets into the shift-mask byte array (p_sh_mask_arr) from which the
 * vpermb shift permutations for left shifts of 1/2/3 positions are loaded.
 */
#define FAT_TEDDY_VBMI_SL1_POS 15
#define FAT_TEDDY_VBMI_SL2_POS 14
#define FAT_TEDDY_VBMI_SL3_POS 13

/*
 * FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_Mn: populate sl_msk[0..n-2] with the
 * shift permutations needed by an n-mask teddy. M1 loads nothing.
 */
#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1

#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2                        \
    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1                            \
    sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS);

#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3                        \
    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2                            \
    sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS);

#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M4                        \
    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3                            \
    sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS);

/*
 * In FAT teddy, it needs 2 bytes to represent result of each position,
 * so each nibble's(for example, lo nibble of last byte) FAT teddy mask
 * has 16x2 bytes:
 * |----------------------------------|----------------------------------|
 * 16bytes (bucket 0..7 in each byte) 16bytes (bucket 8..15 in each byte)
 *                 A                                  B
 * at runtime FAT teddy reads 16 bytes once and duplicate them to 32 bytes:
 * |----------------------------------|----------------------------------|
 *  16bytes input data (lo nibbles)    16bytes duplicated data (lo nibbles)
 *                 X                                  X
 * then do pshufb_m256(AB, XX).
 *
 * In AVX512 reinforced FAT teddy, it reads 32 bytes once and duplicate them
 * to 64 bytes:
 * |----------------|----------------|----------------|----------------|
 *         X               Y                X               Y
 * in this case we need DUP_FAT_MASK to construct AABB:
 * |----------------|----------------|----------------|----------------|
 *         A               A                B               B
 * then do pshufb_m512(AABB, XYXY).
 */

/*
 * Declare and initialise the locals PREP_CONF_FAT_FN relies on: the
 * per-byte 0x0f nibble mask and the n-1 shift permutations. Note that for
 * n == 1 this declares a zero-length sl_msk array (GCC/Clang extension);
 * it is never read in that case.
 */
#define PREPARE_FAT_MASKS(n)                                     \
    m512 lo_mask = set1_64x8(0xf);                               \
    m512 sl_msk[n - 1];                                          \
    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M##n

/*
 * 32-bit confirm masks for the scan loop (n_sh = n_msk - 1 and overlap are
 * expansion-site locals): HEAD masks off the first n_sh positions of the
 * initial block, FULL masks off the n_sh overlap positions of steady-state
 * blocks, VAR builds a mask for the final partial block, and LOAD_MASK_PATCH
 * re-enables the overlap bytes for the final masked load.
 */
#define FAT_TEDDY_VBMI_CONF_MASK_HEAD   (0xffffffffULL >> n_sh)
#define FAT_TEDDY_VBMI_CONF_MASK_FULL   ((0xffffffffULL << n_sh) & 0xffffffffULL)
#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap)
#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH  (0xffffffffULL >> (32 - n_sh))
|
|
||||||
|
|
||||||
/*
 * VBMI fat teddy scan loop. Processes the buffer in 32-byte reads
 * (duplicated to 64 bytes via set2x256), advancing loopBytes = 32 - n_sh
 * per iteration so consecutive blocks overlap by n_sh = n_msk - 1 bytes.
 * Three phases: an initial cautious block (head positions masked off), the
 * steady-state loop (overlap positions masked off via p_mask), and a final
 * masked-load block for the sub-32-byte tail. Expands inside a function
 * returning hwlm_error_t: returns HWLM_SUCCESS at the end, and the confirm
 * macros may return early on termination.
 */
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn)                 \
do {                                                                        \
    const u8 *buf_end = a->buf + a->len;                                    \
    const u8 *ptr = a->buf + a->start_offset;                               \
    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
    u32 last_match = ones_u32;                                              \
    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
    const size_t iterBytes = 32;                                            \
    u32 n_sh = n_msk - 1;                                                   \
    const size_t loopBytes = 32 - n_sh;                                     \
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
                 a->buf, a->len, a->start_offset);                          \
                                                                            \
    const m512 *dup_mask = getDupMaskBase(teddy, n_msk);                    \
    PREPARE_FAT_MASKS(n_msk);                                               \
    const u32 *confBase = getConfBase(teddy);                               \
                                                                            \
    u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL;                                 \
    m512 p_mask = set_mask_m512(~((k << 32) | k));                          \
    u32 overlap = 0;                                                        \
    u64a patch = 0;                                                         \
    if (likely(ptr + loopBytes <= buf_end)) {                               \
        u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD;                            \
        m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0));                   \
        m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr)), n_msk);        \
        r_0 = or512(r_0, p_mask0);                                          \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr, conf_fn);             \
        ptr += loopBytes;                                                   \
        overlap = n_sh;                                                     \
        patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH;                             \
    }                                                                       \
                                                                            \
    for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {                  \
        CHECK_FLOOD;                                                        \
        m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr - n_sh)), n_msk); \
        r_0 = or512(r_0, p_mask);                                           \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn);   \
    }                                                                       \
                                                                            \
    assert(ptr + loopBytes > buf_end);                                      \
    if (ptr < buf_end) {                                                    \
        u32 left = (u32)(buf_end - ptr);                                    \
        u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left);                       \
        m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1));                   \
        m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap)); \
        m512 r_0 = PREP_CONF_FAT_FN(val_0, n_msk);                          \
        r_0 = or512(r_0, p_mask1);                                          \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr - overlap, conf_fn);   \
    }                                                                       \
                                                                            \
    return HWLM_SUCCESS;                                                    \
} while(0)
|
|
||||||
|
|
||||||
#else // !HAVE_AVX512VBMI, AVX2 normal fat teddy

#ifdef ARCH_64_BIT
/*
 * AVX2 fat teddy confirm step (64-bit build). If 'var' is not all-ones,
 * interleave its 128-bit halves (swap128in256 + interleave lo/hi) so each
 * input position occupies 2 adjacent bytes, then confirm the result as
 * four 64-bit chunks with offsets advancing by 4 positions each.
 */
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)           \
do {                                                                      \
    if (unlikely(diff256(var, ones256()))) {                              \
        m256 swap = swap128in256(var);                                    \
        m256 r = interleave256lo(var, swap);                              \
        u64a part1 = extractlow64from256(r);                              \
        u64a part2 = extract64from256(r, 1);                              \
        r = interleave256hi(var, swap);                                   \
        u64a part3 = extractlow64from256(r);                              \
        u64a part4 = extract64from256(r, 1);                              \
        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn);        \
        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn);    \
        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn);    \
        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn);   \
    }                                                                     \
} while(0)
#else
/*
 * AVX2 fat teddy confirm step (32-bit build): same interleave, carved
 * into eight 32-bit chunks with offsets advancing by 2 positions each.
 */
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)           \
do {                                                                      \
    if (unlikely(diff256(var, ones256()))) {                              \
        m256 swap = swap128in256(var);                                    \
        m256 r = interleave256lo(var, swap);                              \
        u32 part1 = extractlow32from256(r);                               \
        u32 part2 = extract32from256(r, 1);                               \
        u32 part3 = extract32from256(r, 2);                               \
        u32 part4 = extract32from256(r, 3);                               \
        r = interleave256hi(var, swap);                                   \
        u32 part5 = extractlow32from256(r);                               \
        u32 part6 = extract32from256(r, 1);                               \
        u32 part7 = extract32from256(r, 2);                               \
        u32 part8 = extract32from256(r, 3);                               \
        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn);        \
        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn);    \
        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn);    \
        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn);    \
        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn);    \
        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn);   \
        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn);   \
        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn);   \
    }                                                                     \
} while(0)
#endif
|
|
||||||
|
|
||||||
static really_inline
|
|
||||||
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
|
||||||
const u8 *lo, const u8 *hi,
|
|
||||||
const u8 *buf_history, size_t len_history,
|
|
||||||
const u32 nMasks) {
|
|
||||||
m128 p_mask128;
|
|
||||||
m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
|
|
||||||
buf_history, len_history, nMasks));
|
|
||||||
*p_mask = set1_2x128(p_mask128);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static really_inline
|
|
||||||
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
|
|
||||||
m256 mask = set1_32x8(0xf);
|
|
||||||
m256 lo = and256(val, mask);
|
|
||||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
|
||||||
return or256(pshufb_m256(maskBase[0 * 2], lo),
|
|
||||||
pshufb_m256(maskBase[0 * 2 + 1], hi));
|
|
||||||
}
|
|
||||||
|
|
||||||
static really_inline
|
|
||||||
m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
|
|
||||||
m256 mask = set1_32x8(0xf);
|
|
||||||
m256 lo = and256(val, mask);
|
|
||||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
|
||||||
m256 r = prep_conf_fat_teddy_m1(maskBase, val);
|
|
||||||
|
|
||||||
m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
|
|
||||||
pshufb_m256(maskBase[1 * 2 + 1], hi));
|
|
||||||
m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
|
|
||||||
*old_1 = res_1;
|
|
||||||
return or256(r, res_shifted_1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static really_inline
|
|
||||||
m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
|
|
||||||
m256 val) {
|
|
||||||
m256 mask = set1_32x8(0xf);
|
|
||||||
m256 lo = and256(val, mask);
|
|
||||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
|
||||||
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
|
|
||||||
|
|
||||||
m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
|
|
||||||
pshufb_m256(maskBase[2 * 2 + 1], hi));
|
|
||||||
m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
|
|
||||||
*old_2 = res_2;
|
|
||||||
return or256(r, res_shifted_2);
|
|
||||||
}
|
|
||||||
|
|
||||||
static really_inline
|
|
||||||
m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
|
|
||||||
m256 *old_3, m256 val) {
|
|
||||||
m256 mask = set1_32x8(0xf);
|
|
||||||
m256 lo = and256(val, mask);
|
|
||||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
|
||||||
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
|
|
||||||
|
|
||||||
m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
|
|
||||||
pshufb_m256(maskBase[3 * 2 + 1], hi));
|
|
||||||
m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
|
|
||||||
*old_3 = res_3;
|
|
||||||
return or256(r, res_shifted_3);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * FDR_EXEC_FAT_TEDDY_RES_OLD_n: declare the carried-over stage results
 * (res_old_1..res_old_(n-1)) that the n-mask prep functions thread between
 * loop iterations. The 1-mask case carries no state.
 */
#define FDR_EXEC_FAT_TEDDY_RES_OLD_1                             \
do {                                                             \
} while(0)

#define FDR_EXEC_FAT_TEDDY_RES_OLD_2                             \
    m256 res_old_1 = zeroes256();

#define FDR_EXEC_FAT_TEDDY_RES_OLD_3                             \
    m256 res_old_1 = zeroes256();                                \
    m256 res_old_2 = zeroes256();

#define FDR_EXEC_FAT_TEDDY_RES_OLD_4                             \
    m256 res_old_1 = zeroes256();                                \
    m256 res_old_2 = zeroes256();                                \
    m256 res_old_3 = zeroes256();

#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n

/*
 * PREP_CONF_FAT_FN_n: dispatch to the n-mask prep function, wiring up the
 * res_old_* locals declared by FDR_EXEC_FAT_TEDDY_RES_OLD(n).
 */
#define PREP_CONF_FAT_FN_1(mask_base, val)                       \
    prep_conf_fat_teddy_m1(mask_base, val)

#define PREP_CONF_FAT_FN_2(mask_base, val)                       \
    prep_conf_fat_teddy_m2(mask_base, &res_old_1, val)

#define PREP_CONF_FAT_FN_3(mask_base, val)                       \
    prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val)

#define PREP_CONF_FAT_FN_4(mask_base, val)                       \
    prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)

#define PREP_CONF_FAT_FN(mask_base, val, n)                      \
    PREP_CONF_FAT_FN_##n(mask_base, val)
|
|
||||||
|
|
||||||
/*
 * AVX2 fat teddy scan loop. Aligns ptr up to 16 bytes with a cautious
 * (vectored) initial load, then processes the aligned body 32 bytes per
 * iteration as two 16-byte half-blocks, with a single-16-byte step before
 * and after the main loop, and finishes with a cautious vectored load for
 * the sub-16-byte tail. Expands inside a function returning hwlm_error_t:
 * returns HWLM_SUCCESS at the end, and the confirm macros may return early
 * on termination.
 */
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn)                 \
do {                                                                        \
    const u8 *buf_end = a->buf + a->len;                                    \
    const u8 *ptr = a->buf + a->start_offset;                               \
    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
    u32 last_match = ones_u32;                                              \
    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
    const size_t iterBytes = 32;                                            \
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
                 a->buf, a->len, a->start_offset);                          \
                                                                            \
    const m256 *maskBase = getMaskBase_fat(teddy);                          \
    const u32 *confBase = getConfBase(teddy);                               \
                                                                            \
    FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk);                                      \
    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);                             \
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
    if (ptr < mainStart) {                                                  \
        ptr = mainStart - 16;                                               \
        m256 p_mask;                                                        \
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset,       \
                                       a->buf, buf_end,                     \
                                       a->buf_history, a->len_history,      \
                                       n_msk);                              \
        m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk);                \
        r_0 = or256(r_0, p_mask);                                           \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    if (ptr + 16 <= buf_end) {                                              \
        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {                 \
        __builtin_prefetch(ptr + (iterBytes * 4));                          \
        CHECK_FLOOD;                                                        \
        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
        m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk);  \
        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn);              \
    }                                                                       \
                                                                            \
    if (ptr + 16 <= buf_end) {                                              \
        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    assert(ptr + 16 > buf_end);                                             \
    if (ptr < buf_end) {                                                    \
        m256 p_mask;                                                        \
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end,       \
                                       a->buf_history, a->len_history,      \
                                       n_msk);                              \
        m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk);                \
        r_0 = or256(r_0, p_mask);                                           \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
    }                                                                       \
                                                                            \
    return HWLM_SUCCESS;                                                    \
} while(0)
|
|
||||||
|
|
||||||
#endif // HAVE_AVX512VBMI
|
|
||||||
|
|
||||||
/*
 * Public fat teddy entry points, one per mask count (1..4), each a single
 * expansion of FDR_EXEC_FAT_TEDDY (the macro contains the function body
 * including its return). Both the plain and _pck variants currently use
 * the same confirm function, do_confWithBit_teddy.
 */
hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}
|
|
||||||
|
|
||||||
#endif // HAVE_AVX2
|
|
||||||
@@ -328,7 +328,7 @@ bool pack(const vector<hwlmLiteral> &lits,
|
|||||||
|
|
||||||
static
|
static
|
||||||
void initReinforcedTable(u8 *rmsk) {
|
void initReinforcedTable(u8 *rmsk) {
|
||||||
u64a *mask = (u64a *)rmsk;
|
u64a *mask = reinterpret_cast<u64a *>(rmsk);
|
||||||
fill_n(mask, N_CHARS, 0x00ffffffffffffffULL);
|
fill_n(mask, N_CHARS, 0x00ffffffffffffffULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -576,8 +576,8 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
|
|||||||
|
|
||||||
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
|
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
|
||||||
assert(fdr); // otherwise would have thrown std::bad_alloc
|
assert(fdr); // otherwise would have thrown std::bad_alloc
|
||||||
Teddy *teddy = (Teddy *)fdr.get(); // ugly
|
Teddy *teddy = reinterpret_cast<Teddy *>(fdr.get()); // ugly
|
||||||
u8 *teddy_base = (u8 *)teddy;
|
u8 *teddy_base = reinterpret_cast<u8 *>(teddy);
|
||||||
|
|
||||||
// Write header.
|
// Write header.
|
||||||
teddy->size = size;
|
teddy->size = size;
|
||||||
@@ -597,7 +597,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
|
|||||||
assert(ISALIGNED_CL(ptr));
|
assert(ISALIGNED_CL(ptr));
|
||||||
teddy->floodOffset = verify_u32(ptr - teddy_base);
|
teddy->floodOffset = verify_u32(ptr - teddy_base);
|
||||||
memcpy(ptr, floodTable.get(), floodTable.size());
|
memcpy(ptr, floodTable.get(), floodTable.size());
|
||||||
ptr += floodTable.size();
|
|
||||||
|
|
||||||
// Write teddy masks.
|
// Write teddy masks.
|
||||||
u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
|
u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
|
||||||
@@ -622,7 +622,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
|
|||||||
static
|
static
|
||||||
bool assignStringsToBuckets(
|
bool assignStringsToBuckets(
|
||||||
const vector<hwlmLiteral> &lits,
|
const vector<hwlmLiteral> &lits,
|
||||||
TeddyEngineDescription &eng,
|
const TeddyEngineDescription &eng,
|
||||||
map<BucketIndex, vector<LiteralIndex>> &bucketToLits) {
|
map<BucketIndex, vector<LiteralIndex>> &bucketToLits) {
|
||||||
assert(eng.numMasks <= MAX_NUM_MASKS);
|
assert(eng.numMasks <= MAX_NUM_MASKS);
|
||||||
if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
|
if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
|
||||||
|
|||||||
@@ -52,14 +52,14 @@ u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {
|
|||||||
|
|
||||||
void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
|
void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
|
||||||
static const TeddyEngineDef defns[] = {
|
static const TeddyEngineDef defns[] = {
|
||||||
{ 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false },
|
{ 3, HS_CPU_FEATURES_AVX2, 1, 16, false },
|
||||||
{ 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true },
|
{ 4, HS_CPU_FEATURES_AVX2, 1, 16, true },
|
||||||
{ 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false },
|
{ 5, HS_CPU_FEATURES_AVX2, 2, 16, false },
|
||||||
{ 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true },
|
{ 6, HS_CPU_FEATURES_AVX2, 2, 16, true },
|
||||||
{ 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false },
|
{ 7, HS_CPU_FEATURES_AVX2, 3, 16, false },
|
||||||
{ 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true },
|
{ 8, HS_CPU_FEATURES_AVX2, 3, 16, true },
|
||||||
{ 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false },
|
{ 9, HS_CPU_FEATURES_AVX2, 4, 16, false },
|
||||||
{ 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true },
|
{ 10, HS_CPU_FEATURES_AVX2, 4, 16, true },
|
||||||
{ 11, 0, 1, 8, false },
|
{ 11, 0, 1, 8, false },
|
||||||
{ 12, 0, 1, 8, true },
|
{ 12, 0, 1, 8, true },
|
||||||
{ 13, 0, 2, 8, false },
|
{ 13, 0, 2, 8, false },
|
||||||
@@ -71,6 +71,7 @@ void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
|
|||||||
};
|
};
|
||||||
out->clear();
|
out->clear();
|
||||||
for (const auto &def : defns) {
|
for (const auto &def : defns) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
out->emplace_back(def);
|
out->emplace_back(def);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -123,6 +124,7 @@ bool isAllowed(const vector<hwlmLiteral> &vl, const TeddyEngineDescription &eng,
|
|||||||
u32 n_small_lits = 0;
|
u32 n_small_lits = 0;
|
||||||
for (const auto &lit : vl) {
|
for (const auto &lit : vl) {
|
||||||
if (lit.s.length() < eng.numMasks) {
|
if (lit.s.length() < eng.numMasks) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
n_small_lits++;
|
n_small_lits++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -204,6 +206,7 @@ unique_ptr<TeddyEngineDescription> getTeddyDescription(u32 engineID) {
|
|||||||
getTeddyDescriptions(&descs);
|
getTeddyDescriptions(&descs);
|
||||||
|
|
||||||
for (const auto &desc : descs) {
|
for (const auto &desc : descs) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
if (desc.getID() == engineID) {
|
if (desc.getID() == engineID) {
|
||||||
return std::make_unique<TeddyEngineDescription>(desc);
|
return std::make_unique<TeddyEngineDescription>(desc);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ namespace ue2 {
|
|||||||
|
|
||||||
#define TEDDY_BUCKET_LOAD 6
|
#define TEDDY_BUCKET_LOAD 6
|
||||||
|
|
||||||
struct TeddyEngineDef {
|
struct TeddyEngineDef { //NOLINT (clang-analyzer-optin.performance.Padding)
|
||||||
u32 id;
|
u32 id;
|
||||||
u64a cpu_features;
|
u64a cpu_features;
|
||||||
u32 numMasks;
|
u32 numMasks;
|
||||||
|
|||||||
570
src/fdr/teddy_fat.cpp
Normal file
570
src/fdr/teddy_fat.cpp
Normal file
@@ -0,0 +1,570 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2015-2020, Intel Corporation
|
||||||
|
* Copyright (c) 2024, VectorCamp PC
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* fat teddy for AVX2 and AVX512VBMI */
|
||||||
|
|
||||||
|
#include "fdr_internal.h"
|
||||||
|
#include "flood_runtime.h"
|
||||||
|
#include "teddy.h"
|
||||||
|
#include "teddy_internal.h"
|
||||||
|
#include "teddy_runtime_common.h"
|
||||||
|
#include "util/arch.h"
|
||||||
|
#include "util/simd_utils.h"
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX2)
|
||||||
|
|
||||||
|
#ifdef ARCH_64_BIT
|
||||||
|
static really_inline
|
||||||
|
hwlm_error_t conf_chunk_64(u64a chunk, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *pt,
|
||||||
|
const u32* confBase,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(chunk != ones_u64a)) {
|
||||||
|
chunk = ~chunk;
|
||||||
|
do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
|
||||||
|
control, last_match);
|
||||||
|
// adapted from CHECK_HWLM_TERMINATE_MATCHING
|
||||||
|
if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
|
||||||
|
return HWLM_TERMINATED;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
|
||||||
|
if(conf_chunk_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||||
|
#else
|
||||||
|
static really_inline
|
||||||
|
hwlm_error_t conf_chunk_32(u32 chunk, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *pt,
|
||||||
|
const u32* confBase,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(chunk != ones_u32)) {
|
||||||
|
chunk = ~chunk;
|
||||||
|
do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
|
||||||
|
control, last_match);
|
||||||
|
// adapted from CHECK_HWLM_TERMINATE_MATCHING
|
||||||
|
if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
|
||||||
|
return HWLM_TERMINATED;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
|
||||||
|
if(conf_chunk_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX512VBMI) // VBMI strong teddy
|
||||||
|
|
||||||
|
// fat 512 teddy is only with vbmi
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
|
||||||
|
return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
|
||||||
|
+ ROUNDUP_CL(2 * numMask * sizeof(m256)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const u8 ALIGN_CL_DIRECTIVE p_mask_interleave[64] = {
|
||||||
|
0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
|
||||||
|
8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
|
||||||
|
16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
|
||||||
|
24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef ARCH_64_BIT
|
||||||
|
hwlm_error_t confirm_fat_teddy_64_512(m512 var, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *ptr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
const u32* confBase, hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(diff512(var, ones512()))) {
|
||||||
|
m512 msk_interleave = load512(p_mask_interleave);
|
||||||
|
m512 r = vpermb512(msk_interleave, var);
|
||||||
|
m128 r0 = extract128from512(r, 0);
|
||||||
|
m128 r1 = extract128from512(r, 1);
|
||||||
|
m128 r2 = extract128from512(r, 2);
|
||||||
|
m128 r3 = extract128from512(r, 3);
|
||||||
|
u64a part1 = movq(r0);
|
||||||
|
u64a part2 = extract64from128(r0, 1);
|
||||||
|
u64a part3 = movq(r1);
|
||||||
|
u64a part4 = extract64from128(r1, 1);
|
||||||
|
u64a part5 = movq(r2);
|
||||||
|
u64a part6 = extract64from128(r2, 1);
|
||||||
|
u64a part7 = movq(r3);
|
||||||
|
u64a part8 = extract64from128(r3, 1);
|
||||||
|
CONF_FAT_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
#define confirm_fat_teddy_512_f confirm_fat_teddy_64_512
|
||||||
|
#else // 32-64
|
||||||
|
|
||||||
|
hwlm_error_t confirm_fat_teddy_32_512(m512 var, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *ptr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
const u32* confBase, hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(diff512(var, ones512()))) {
|
||||||
|
m512 msk_interleave = load512(p_mask_interleave);
|
||||||
|
m512 r = vpermb512(msk_interleave, var);
|
||||||
|
m128 r0 = extract128from512(r, 0);
|
||||||
|
m128 r1 = extract128from512(r, 1);
|
||||||
|
m128 r2 = extract128from512(r, 2);
|
||||||
|
m128 r3 = extract128from512(r, 3);
|
||||||
|
u32 part1 = movd(r0);
|
||||||
|
u32 part2 = extract32from128(r0, 1);
|
||||||
|
u32 part3 = extract32from128(r0, 2);
|
||||||
|
u32 part4 = extract32from128(r0, 3);
|
||||||
|
u32 part5 = movd(r1);
|
||||||
|
u32 part6 = extract32from128(r1, 1);
|
||||||
|
u32 part7 = extract32from128(r1, 2);
|
||||||
|
u32 part8 = extract32from128(r1, 3);
|
||||||
|
u32 part9 = movd(r2);
|
||||||
|
u32 part10 = extract32from128(r2, 1);
|
||||||
|
u32 part11 = extract32from128(r2, 2);
|
||||||
|
u32 part12 = extract32from128(r2, 3);
|
||||||
|
u32 part13 = movd(r3);
|
||||||
|
u32 part14 = extract32from128(r3, 1);
|
||||||
|
u32 part15 = extract32from128(r3, 2);
|
||||||
|
u32 part16 = extract32from128(r3, 3);
|
||||||
|
CONF_FAT_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, ptr, confBase, a, control, last_match);
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
#define confirm_fat_teddy_512_f confirm_fat_teddy_32_512
|
||||||
|
#endif // 32/64
|
||||||
|
|
||||||
|
#define CONFIRM_FAT_TEDDY_512(...) if(confirm_fat_teddy_512_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||||
|
|
||||||
|
#define TEDDY_VBMI_SL1_MASK 0xfffffffffffffffeULL
|
||||||
|
#define TEDDY_VBMI_SL2_MASK 0xfffffffffffffffcULL
|
||||||
|
#define TEDDY_VBMI_SL3_MASK 0xfffffffffffffff8ULL
|
||||||
|
|
||||||
|
#define FAT_TEDDY_VBMI_SL1_MASK 0xfffffffefffffffeULL
|
||||||
|
#define FAT_TEDDY_VBMI_SL2_MASK 0xfffffffcfffffffcULL
|
||||||
|
#define FAT_TEDDY_VBMI_SL3_MASK 0xfffffff8fffffff8ULL
|
||||||
|
|
||||||
|
#define FAT_TEDDY_VBMI_SL1_POS 15
|
||||||
|
#define FAT_TEDDY_VBMI_SL2_POS 14
|
||||||
|
#define FAT_TEDDY_VBMI_SL3_POS 13
|
||||||
|
|
||||||
|
#define FAT_TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffULL >> n_sh)
|
||||||
|
#define FAT_TEDDY_VBMI_CONF_MASK_FULL ((0xffffffffULL << n_sh) & 0xffffffffULL)
|
||||||
|
#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap)
|
||||||
|
#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffULL >> (32 - n_sh))
|
||||||
|
|
||||||
|
template<int NMSK>
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_fat_teddy_512vbmi_templ(const m512 *lo_mask, const m512 *dup_mask,
|
||||||
|
const m512 *sl_msk, const m512 val) {
|
||||||
|
m512 lo = and512(val, *lo_mask);
|
||||||
|
m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
|
||||||
|
m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),
|
||||||
|
pshufb_m512(dup_mask[1], hi));
|
||||||
|
|
||||||
|
if constexpr (NMSK == 1) return shuf_or_b0;
|
||||||
|
m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),
|
||||||
|
pshufb_m512(dup_mask[3], hi));
|
||||||
|
m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
|
||||||
|
if constexpr (NMSK == 2) return (or512(sl1, shuf_or_b0));
|
||||||
|
m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),
|
||||||
|
pshufb_m512(dup_mask[5], hi));
|
||||||
|
m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
|
||||||
|
if constexpr (NMSK == 3) return (or512(sl2, or512(sl1, shuf_or_b0)));
|
||||||
|
m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),
|
||||||
|
pshufb_m512(dup_mask[7], hi));
|
||||||
|
m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
|
||||||
|
return (or512(sl3, or512(sl2, or512(sl1, shuf_or_b0))));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define TEDDY_VBMI_SL1_POS 15
|
||||||
|
#define TEDDY_VBMI_SL2_POS 14
|
||||||
|
#define TEDDY_VBMI_SL3_POS 13
|
||||||
|
|
||||||
|
#define TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffffffffffULL >> n_sh)
|
||||||
|
#define TEDDY_VBMI_CONF_MASK_FULL (0xffffffffffffffffULL << n_sh)
|
||||||
|
#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap)
|
||||||
|
#define TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffffffffffULL >> (64 - n_sh))
|
||||||
|
|
||||||
|
template<int NMSK>
|
||||||
|
hwlm_error_t fdr_exec_fat_teddy_512vbmi_templ(const struct FDR *fdr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t control) {
|
||||||
|
const u8 *buf_end = a->buf + a->len;
|
||||||
|
const u8 *ptr = a->buf + a->start_offset;
|
||||||
|
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||||
|
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||||
|
u32 last_match = ones_u32;
|
||||||
|
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||||
|
const size_t iterBytes = 32;
|
||||||
|
u32 n_sh = NMSK - 1;
|
||||||
|
const size_t loopBytes = 32 - n_sh;
|
||||||
|
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||||
|
a->buf, a->len, a->start_offset);
|
||||||
|
|
||||||
|
const m512 *dup_mask = getDupMaskBase(teddy, NMSK);
|
||||||
|
m512 lo_mask = set1_64x8(0xf);
|
||||||
|
m512 sl_msk[NMSK - 1];
|
||||||
|
if constexpr (NMSK > 1){
|
||||||
|
sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS);
|
||||||
|
}
|
||||||
|
if constexpr (NMSK > 2){
|
||||||
|
sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS);
|
||||||
|
}
|
||||||
|
if constexpr (NMSK > 3){
|
||||||
|
sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS);
|
||||||
|
}
|
||||||
|
|
||||||
|
const u32 *confBase = getConfBase(teddy);
|
||||||
|
|
||||||
|
u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL;
|
||||||
|
m512 p_mask = set_mask_m512(~((k << 32) | k));
|
||||||
|
u32 overlap = 0;
|
||||||
|
u64a patch = 0;
|
||||||
|
if (likely(ptr + loopBytes <= buf_end)) {
|
||||||
|
u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD;
|
||||||
|
m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0));
|
||||||
|
m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, set2x256(loadu_maskz_m256(k0, ptr)));
|
||||||
|
|
||||||
|
r_0 = or512(r_0, p_mask0);
|
||||||
|
CONFIRM_FAT_TEDDY_512(r_0, 16, 0, VECTORING, ptr);
|
||||||
|
ptr += loopBytes;
|
||||||
|
overlap = n_sh;
|
||||||
|
patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {
|
||||||
|
CHECK_FLOOD;
|
||||||
|
m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, set2x256(loadu256(ptr - n_sh)));
|
||||||
|
r_0 = or512(r_0, p_mask);
|
||||||
|
CONFIRM_FAT_TEDDY_512(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(ptr + loopBytes > buf_end);
|
||||||
|
if (ptr < buf_end) {
|
||||||
|
u32 left = (u32)(buf_end - ptr);
|
||||||
|
u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left);
|
||||||
|
m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1));
|
||||||
|
m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap));
|
||||||
|
m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, val_0);
|
||||||
|
|
||||||
|
r_0 = or512(r_0, p_mask1);
|
||||||
|
CONFIRM_FAT_TEDDY_512(r_0, 16, 0, VECTORING, ptr - overlap);
|
||||||
|
}
|
||||||
|
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define FDR_EXEC_FAT_TEDDY_FN fdr_exec_fat_teddy_512vbmi_templ
|
||||||
|
|
||||||
|
|
||||||
|
#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef ARCH_64_BIT
|
||||||
|
extern "C" {
|
||||||
|
hwlm_error_t confirm_fat_teddy_64_256(m256 var, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *ptr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
const u32* confBase, hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(diff256(var, ones256()))) {
|
||||||
|
m256 swap = swap128in256(var);
|
||||||
|
m256 r = interleave256lo(var, swap);
|
||||||
|
u64a part1 = extractlow64from256(r);
|
||||||
|
u64a part2 = extract64from256(r, 1);
|
||||||
|
r = interleave256hi(var, swap);
|
||||||
|
u64a part3 = extractlow64from256(r);
|
||||||
|
u64a part4 = extract64from256(r, 1);
|
||||||
|
CONF_FAT_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
} // extern C
|
||||||
|
|
||||||
|
#define confirm_fat_teddy_256_f confirm_fat_teddy_64_256
|
||||||
|
|
||||||
|
#else
|
||||||
|
extern "C" {
|
||||||
|
hwlm_error_t confirm_fat_teddy_32_256(m256 var, u8 bucket, u8 offset,
|
||||||
|
CautionReason reason, const u8 *ptr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
const u32* confBase, hwlm_group_t *control,
|
||||||
|
u32 *last_match) {
|
||||||
|
if (unlikely(diff256(var, ones256()))) {
|
||||||
|
m256 swap = swap128in256(var);
|
||||||
|
m256 r = interleave256lo(var, swap);
|
||||||
|
u32 part1 = extractlow32from256(r);
|
||||||
|
u32 part2 = extract32from256(r, 1);
|
||||||
|
u32 part3 = extract32from256(r, 2);
|
||||||
|
u32 part4 = extract32from256(r, 3);
|
||||||
|
r = interleave256hi(var, swap);
|
||||||
|
u32 part5 = extractlow32from256(r);
|
||||||
|
u32 part6 = extract32from256(r, 1);
|
||||||
|
u32 part7 = extract32from256(r, 2);
|
||||||
|
u32 part8 = extract32from256(r, 3);
|
||||||
|
CONF_FAT_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
|
||||||
|
CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, ptr, confBase, a, control, last_match);
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // extern C
|
||||||
|
|
||||||
|
#define confirm_fat_teddy_256_f confirm_fat_teddy_32_256
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define CONFIRM_FAT_TEDDY_256(...) if(confirm_fat_teddy_256_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
const m256 *getMaskBase_fat(const struct Teddy *teddy) {
|
||||||
|
return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||||
|
const u8 *lo, const u8 *hi,
|
||||||
|
const u8 *buf_history, size_t len_history,
|
||||||
|
const u32 nMasks) {
|
||||||
|
m128 p_mask128;
|
||||||
|
m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
|
||||||
|
buf_history, len_history, nMasks));
|
||||||
|
*p_mask = set1_2x128(p_mask128);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<int NMSK>
|
||||||
|
static really_inline
|
||||||
|
m256 prep_conf_fat_teddy_256_templ(const m256 *maskBase, m256 val,
|
||||||
|
m256* old_1, m256* old_2, m256* old_3){
|
||||||
|
m256 mask = set1_32x8(0xf);
|
||||||
|
m256 lo = and256(val, mask);
|
||||||
|
m256 hi = and256(rshift64_m256(val, 4), mask);
|
||||||
|
m256 r = or256(pshufb_m256(maskBase[0 * 2], lo),
|
||||||
|
pshufb_m256(maskBase[0 * 2 + 1], hi));
|
||||||
|
if constexpr (NMSK == 1) return r;
|
||||||
|
m256 res_1 = or256(pshufb_m256(maskBase[(NMSK-1) * 2], lo),
|
||||||
|
pshufb_m256(maskBase[(NMSK-1) * 2 + 1], hi));
|
||||||
|
m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - (NMSK-1));
|
||||||
|
*old_1 = res_1;
|
||||||
|
r = or256(r, res_shifted_1);
|
||||||
|
if constexpr (NMSK == 2) return r;
|
||||||
|
m256 res_2 = or256(pshufb_m256(maskBase[(NMSK-1) * 2], lo),
|
||||||
|
pshufb_m256(maskBase[(NMSK-1) * 2 + 1], hi));
|
||||||
|
m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - (NMSK-1));
|
||||||
|
*old_2 = res_2;
|
||||||
|
r = or256(r, res_shifted_2);
|
||||||
|
if constexpr (NMSK == 3) return r;
|
||||||
|
m256 res_3 = or256(pshufb_m256(maskBase[(NMSK-1) * 2], lo),
|
||||||
|
pshufb_m256(maskBase[(NMSK-1) * 2 + 1], hi));
|
||||||
|
m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - (NMSK-1));
|
||||||
|
*old_3 = res_3;
|
||||||
|
return or256(r, res_shifted_3);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<int NMSK>
|
||||||
|
hwlm_error_t fdr_exec_fat_teddy_256_templ(const struct FDR *fdr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t control) {
|
||||||
|
const u8 *buf_end = a->buf + a->len;
|
||||||
|
const u8 *ptr = a->buf + a->start_offset;
|
||||||
|
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||||
|
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||||
|
u32 last_match = ones_u32;
|
||||||
|
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||||
|
const size_t iterBytes = 32;
|
||||||
|
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||||
|
a->buf, a->len, a->start_offset);
|
||||||
|
|
||||||
|
const m256 *maskBase = getMaskBase_fat(teddy);
|
||||||
|
const u32 *confBase = getConfBase(teddy);
|
||||||
|
|
||||||
|
m256 res_old_1 = zeroes256();
|
||||||
|
m256 res_old_2 = zeroes256();
|
||||||
|
m256 res_old_3 = zeroes256();
|
||||||
|
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||||
|
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||||
|
if (ptr < mainStart) {
|
||||||
|
ptr = mainStart - 16;
|
||||||
|
m256 p_mask;
|
||||||
|
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset,
|
||||||
|
a->buf, buf_end,
|
||||||
|
a->buf_history, a->len_history,
|
||||||
|
NMSK);
|
||||||
|
m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, val_0, &res_old_1, &res_old_2, &res_old_3);
|
||||||
|
r_0 = or256(r_0, p_mask);
|
||||||
|
CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
|
||||||
|
ptr += 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ptr + 16 <= buf_end) {
|
||||||
|
m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
|
||||||
|
CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
|
||||||
|
ptr += 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||||
|
__builtin_prefetch(ptr + (iterBytes * 4));
|
||||||
|
CHECK_FLOOD;
|
||||||
|
m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
|
||||||
|
CONFIRM_FAT_TEDDY_256(r_0, 16, 0, NOT_CAUTIOUS, ptr);
|
||||||
|
m256 r_1 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr + 16), &res_old_1, &res_old_2, &res_old_3);
|
||||||
|
CONFIRM_FAT_TEDDY_256(r_1, 16, 16, NOT_CAUTIOUS, ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ptr + 16 <= buf_end) {
|
||||||
|
m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
|
||||||
|
CONFIRM_FAT_TEDDY_256(r_0, 16, 0, NOT_CAUTIOUS, ptr);
|
||||||
|
ptr += 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(ptr + 16 > buf_end);
|
||||||
|
if (ptr < buf_end) {
|
||||||
|
m256 p_mask;
|
||||||
|
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end,
|
||||||
|
a->buf_history, a->len_history,
|
||||||
|
NMSK);
|
||||||
|
m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, val_0, &res_old_1, &res_old_2, &res_old_3);
|
||||||
|
r_0 = or256(r_0, p_mask);
|
||||||
|
CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
|
||||||
|
}
|
||||||
|
return HWLM_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
// this check is because it is possible to build with both AVX512VBMI and AVX2 defined,
|
||||||
|
// to replicate the behaviour of the original flow of control we give preference
|
||||||
|
// to the former. If we're building for both then this will be compiled multiple times
|
||||||
|
// with the desired variant defined by itself.
|
||||||
|
#ifndef FDR_EXEC_FAT_TEDDY_FN
|
||||||
|
#define FDR_EXEC_FAT_TEDDY_FN fdr_exec_fat_teddy_256_templ
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // HAVE_AVX2 for fat teddy
|
||||||
|
|
||||||
|
/* we only have fat teddy in these two modes */
|
||||||
|
// #if (defined(HAVE_AVX2) || defined(HAVE_AVX512VBMI)) && defined(FDR_EXEC_FAT_TEDDY_FN)
|
||||||
|
// #if defined(FDR_EXEC_FAT_TEDDY_FN)
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t control) {
|
||||||
|
return FDR_EXEC_FAT_TEDDY_FN<1>(fdr, a, control);
|
||||||
|
}
|
||||||
|
|
||||||
|
hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t control) {
|
||||||
|
return FDR_EXEC_FAT_TEDDY_FN<1>(fdr, a, control);
|
||||||
|
}
|
||||||
|
|
||||||
|
hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t control) {
|
||||||
|
return FDR_EXEC_FAT_TEDDY_FN<2>(fdr, a, control);
|
||||||
|
}
|
||||||
|
|
||||||
|
hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t control) {
|
||||||
|
return FDR_EXEC_FAT_TEDDY_FN<2>(fdr, a, control);
|
||||||
|
}
|
||||||
|
|
||||||
|
hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t control) {
|
||||||
|
return FDR_EXEC_FAT_TEDDY_FN<3>(fdr, a, control);
|
||||||
|
}
|
||||||
|
|
||||||
|
hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t control) {
|
||||||
|
return FDR_EXEC_FAT_TEDDY_FN<3>(fdr, a, control);
|
||||||
|
}
|
||||||
|
|
||||||
|
hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t control) {
|
||||||
|
return FDR_EXEC_FAT_TEDDY_FN<4>(fdr, a, control);
|
||||||
|
}
|
||||||
|
|
||||||
|
hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
|
||||||
|
const struct FDR_Runtime_Args *a,
|
||||||
|
hwlm_group_t control) {
|
||||||
|
return FDR_EXEC_FAT_TEDDY_FN<4>(fdr, a, control);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // extern c
|
||||||
|
|
||||||
|
#endif // HAVE_AVX2 from the beginning
|
||||||
|
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2016-2020, Intel Corporation
|
* Copyright (c) 2016-2020, Intel Corporation
|
||||||
|
* Copyright (c) 2024, VectorCamp PC
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@@ -40,10 +41,6 @@
|
|||||||
#include "util/simd_utils.h"
|
#include "util/simd_utils.h"
|
||||||
#include "util/uniform_ops.h"
|
#include "util/uniform_ops.h"
|
||||||
|
|
||||||
extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
|
|
||||||
#if defined(HAVE_AVX2)
|
|
||||||
extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(HAVE_AVX512VBMI)
|
#if defined(HAVE_AVX512VBMI)
|
||||||
static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
|
static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
|
||||||
@@ -142,6 +139,37 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
|
|||||||
// |----------|-------|----------------|............|
|
// |----------|-------|----------------|............|
|
||||||
// 0 start start+offset end(<=16)
|
// 0 start start+offset end(<=16)
|
||||||
// p_mask ffff.....ffffff..ff0000...........00ffff..........
|
// p_mask ffff.....ffffff..ff0000...........00ffff..........
|
||||||
|
|
||||||
|
// replace the p_mask_arr table.
|
||||||
|
// m is the length of the zone of bytes==0 , n is
|
||||||
|
// the offset where that zone begins. more specifically, there are
|
||||||
|
// 16-n bytes of 1's before the zone begins.
|
||||||
|
// m,n 4,7 - 4 bytes of 0s, and 16-7 bytes of 1's before that.
|
||||||
|
// 00 00 00 00 ff..ff
|
||||||
|
// ff ff ff ff ff ff ff ff 00 00 00 00 ff..ff
|
||||||
|
// m,n 15,15 - 15 bytes of 0s , f's high, but also with 16-15=1 byte of 1s
|
||||||
|
// in the beginning - which push the ff at the end off the high end , leaving
|
||||||
|
// ff 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
|
||||||
|
// m,n 15,16 - 15 bytes of 0s, ff high , with 16-16 = 0 ones on the low end
|
||||||
|
// before that, so,
|
||||||
|
// 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ff
|
||||||
|
// so to get the one part, with the f's high, we start out with 1's and
|
||||||
|
// shift them up (right) by m+n.
|
||||||
|
// now to fill in any ones that belong on the low end we have to take
|
||||||
|
// some 1's and shift them down. the ones zone there needs to be 16-n long,
|
||||||
|
// meaning shifted down by 16-(16-n) , or of course just n.
|
||||||
|
// then we should be able to or these together.
|
||||||
|
static really_inline
|
||||||
|
m128 p_mask_gen(u8 m, u8 n){
|
||||||
|
m128 a = ones128();
|
||||||
|
m128 b = ones128();
|
||||||
|
m%=17; n%=17;
|
||||||
|
m+=(16-n); m%=17;
|
||||||
|
a = rshiftbyte_m128(a, n);
|
||||||
|
b = lshiftbyte_m128(b, m);
|
||||||
|
return or128(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
|
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||||
const u8 *lo, const u8 *hi,
|
const u8 *lo, const u8 *hi,
|
||||||
@@ -161,13 +189,11 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
|
|||||||
uintptr_t avail = (uintptr_t)(hi - ptr);
|
uintptr_t avail = (uintptr_t)(hi - ptr);
|
||||||
if (avail >= 16) {
|
if (avail >= 16) {
|
||||||
assert(start_offset - start <= 16);
|
assert(start_offset - start <= 16);
|
||||||
*p_mask = loadu128(p_mask_arr[16 - start_offset + start]
|
*p_mask = p_mask_gen(16 - start_offset + start, 16 - start_offset + start);
|
||||||
+ 16 - start_offset + start);
|
|
||||||
return loadu128(ptr);
|
return loadu128(ptr);
|
||||||
}
|
}
|
||||||
assert(start_offset - start <= avail);
|
assert(start_offset - start <= avail);
|
||||||
*p_mask = loadu128(p_mask_arr[avail - start_offset + start]
|
*p_mask = p_mask_gen(avail - start_offset + start, 16 - start_offset + start);
|
||||||
+ 16 - start_offset + start);
|
|
||||||
copy_start = 0;
|
copy_start = 0;
|
||||||
copy_len = avail;
|
copy_len = avail;
|
||||||
} else { // start zone
|
} else { // start zone
|
||||||
@@ -180,8 +206,7 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
|
|||||||
}
|
}
|
||||||
uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
|
uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
|
||||||
assert(start + start_offset <= end);
|
assert(start + start_offset <= end);
|
||||||
*p_mask = loadu128(p_mask_arr[end - start - start_offset]
|
*p_mask = p_mask_gen(end - start - start_offset, 16 - start - start_offset);
|
||||||
+ 16 - start - start_offset);
|
|
||||||
copy_start = start;
|
copy_start = start;
|
||||||
copy_len = end - start;
|
copy_len = end - start;
|
||||||
}
|
}
|
||||||
@@ -270,6 +295,20 @@ void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
|
|||||||
// |----------|-------|----------------|............|
|
// |----------|-------|----------------|............|
|
||||||
// 0 start start+offset end(<=32)
|
// 0 start start+offset end(<=32)
|
||||||
// p_mask ffff.....ffffff..ff0000...........00ffff..........
|
// p_mask ffff.....ffffff..ff0000...........00ffff..........
|
||||||
|
|
||||||
|
// like the pmask gen above this replaces the large array.
|
||||||
|
static really_inline
|
||||||
|
m256 fat_pmask_gen(u8 m, u8 n){
|
||||||
|
m256 a=ones256();
|
||||||
|
m256 b=ones256();
|
||||||
|
m%=33; n%=33;
|
||||||
|
m+=(32-n); m%=33;
|
||||||
|
|
||||||
|
a = rshift_byte_m256(a, m);
|
||||||
|
b = lshift_byte_m256(b, n);
|
||||||
|
return or256(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||||
const u8 *lo, const u8 *hi,
|
const u8 *lo, const u8 *hi,
|
||||||
@@ -289,13 +328,11 @@ m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
|||||||
uintptr_t avail = (uintptr_t)(hi - ptr);
|
uintptr_t avail = (uintptr_t)(hi - ptr);
|
||||||
if (avail >= 32) {
|
if (avail >= 32) {
|
||||||
assert(start_offset - start <= 32);
|
assert(start_offset - start <= 32);
|
||||||
*p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
|
*p_mask = fat_pmask_gen(32 - start_offset + start, 32 - start_offset + start);
|
||||||
+ 32 - start_offset + start);
|
|
||||||
return loadu256(ptr);
|
return loadu256(ptr);
|
||||||
}
|
}
|
||||||
assert(start_offset - start <= avail);
|
assert(start_offset - start <= avail);
|
||||||
*p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
|
*p_mask = fat_pmask_gen(avail - start_offset + start, 32 - start_offset + start);
|
||||||
+ 32 - start_offset + start);
|
|
||||||
copy_start = 0;
|
copy_start = 0;
|
||||||
copy_len = avail;
|
copy_len = avail;
|
||||||
} else { //start zone
|
} else { //start zone
|
||||||
@@ -308,8 +345,7 @@ m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
|||||||
}
|
}
|
||||||
uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
|
uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
|
||||||
assert(start + start_offset <= end);
|
assert(start + start_offset <= end);
|
||||||
*p_mask = loadu256(p_mask_arr256[end - start - start_offset]
|
*p_mask = fat_pmask_gen(end - start - start_offset, 32 - start - start_offset);
|
||||||
+ 32 - start - start_offset);
|
|
||||||
copy_start = start;
|
copy_start = start;
|
||||||
copy_len = end - start;
|
copy_len = end - start;
|
||||||
}
|
}
|
||||||
@@ -428,8 +464,13 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
|
|||||||
if (!cf) {
|
if (!cf) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
#ifdef __cplusplus
|
||||||
|
const struct FDRConfirm *fdrc = reinterpret_cast<const struct FDRConfirm *>
|
||||||
|
(reinterpret_cast<const u8 *>(confBase) + cf);
|
||||||
|
#else
|
||||||
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
|
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
|
||||||
((const u8 *)confBase + cf);
|
((const u8 *)confBase + cf);
|
||||||
|
#endif
|
||||||
if (!(fdrc->groups & *control)) {
|
if (!(fdrc->groups & *control)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -442,18 +483,31 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
|
|||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
const m128 *getMaskBase(const struct Teddy *teddy) {
|
const m128 *getMaskBase(const struct Teddy *teddy) {
|
||||||
|
#ifdef __cplusplus
|
||||||
|
return reinterpret_cast<const m128 *>(reinterpret_cast<const u8 *>(teddy) + ROUNDUP_CL(sizeof(struct Teddy)));
|
||||||
|
#else
|
||||||
return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
|
return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
|
const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
|
||||||
|
#ifdef __cplusplus
|
||||||
|
return reinterpret_cast<const u64a *>(reinterpret_cast<const u8 *>(getMaskBase(teddy))
|
||||||
|
+ ROUNDUP_CL(2 * numMask * sizeof(m128)));
|
||||||
|
#else
|
||||||
return (const u64a *)((const u8 *)getMaskBase(teddy)
|
return (const u64a *)((const u8 *)getMaskBase(teddy)
|
||||||
+ ROUNDUP_CL(2 * numMask * sizeof(m128)));
|
+ ROUNDUP_CL(2 * numMask * sizeof(m128)));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
const u32 *getConfBase(const struct Teddy *teddy) {
|
const u32 *getConfBase(const struct Teddy *teddy) {
|
||||||
|
#ifdef __cplusplus
|
||||||
|
return reinterpret_cast<const u32 *>(reinterpret_cast<const u8 *>(teddy) + teddy->confOffset);
|
||||||
|
#else
|
||||||
return (const u32 *)((const u8 *)teddy + teddy->confOffset);
|
return (const u32 *)((const u8 *)teddy + teddy->confOffset);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* TEDDY_RUNTIME_COMMON_H_ */
|
#endif /* TEDDY_RUNTIME_COMMON_H_ */
|
||||||
|
|||||||
@@ -589,7 +589,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
|
|||||||
return HS_COMPILER_ERROR;
|
return HS_COMPILER_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
hs_expr_info *rv = (hs_expr_info *)hs_misc_alloc(sizeof(*rv));
|
hs_expr_info *rv = static_cast<hs_expr_info *>(hs_misc_alloc(sizeof(*rv)));
|
||||||
if (!rv) {
|
if (!rv) {
|
||||||
*error = const_cast<hs_compile_error_t *>(&hs_enomem);
|
*error = const_cast<hs_compile_error_t *>(&hs_enomem);
|
||||||
return HS_COMPILER_ERROR;
|
return HS_COMPILER_ERROR;
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2016-2017, Intel Corporation
|
* Copyright (c) 2016-2017, Intel Corporation
|
||||||
|
* Copyright (c) 2020-2023, VectorCamp PC
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@@ -29,28 +30,33 @@
|
|||||||
#include "config.h"
|
#include "config.h"
|
||||||
#include "hs_common.h"
|
#include "hs_common.h"
|
||||||
#include "ue2common.h"
|
#include "ue2common.h"
|
||||||
|
#if !defined(VS_SIMDE_BACKEND)
|
||||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||||
#include "util/arch/x86/cpuid_inline.h"
|
#include "util/arch/x86/cpuid_inline.h"
|
||||||
#elif defined(ARCH_AARCH64)
|
#elif defined(ARCH_AARCH64)
|
||||||
#include "util/arch/arm/cpuid_inline.h"
|
#include "util/arch/arm/cpuid_inline.h"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
HS_PUBLIC_API
|
HS_PUBLIC_API
|
||||||
hs_error_t HS_CDECL hs_valid_platform(void) {
|
hs_error_t HS_CDECL hs_valid_platform(void) {
|
||||||
/* Hyperscan requires SSSE3, anything else is a bonus */
|
/* Vectorscan requires SSE4.2, anything else is a bonus */
|
||||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_IA32) || defined(ARCH_X86_64))
|
||||||
if (check_ssse3()) {
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
|
if (check_sse42()) {
|
||||||
return HS_SUCCESS;
|
return HS_SUCCESS;
|
||||||
} else {
|
} else {
|
||||||
return HS_ARCH_ERROR;
|
return HS_ARCH_ERROR;
|
||||||
}
|
}
|
||||||
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
|
#elif !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
|
||||||
|
//check_neon returns true for now
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (check_neon()) {
|
if (check_neon()) {
|
||||||
return HS_SUCCESS;
|
return HS_SUCCESS;
|
||||||
} else {
|
} else {
|
||||||
return HS_ARCH_ERROR;
|
return HS_ARCH_ERROR;
|
||||||
}
|
}
|
||||||
#elif defined(ARCH_PPC64EL)
|
#elif defined(ARCH_PPC64EL) || defined(VS_SIMDE_BACKEND)
|
||||||
return HS_SUCCESS;
|
return HS_SUCCESS;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -73,7 +73,12 @@ const u8 *run_hwlm_accel(const union AccelAux *aux, const u8 *ptr,
|
|||||||
return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end);
|
return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end);
|
||||||
case ACCEL_TRUFFLE:
|
case ACCEL_TRUFFLE:
|
||||||
DEBUG_PRINTF("truffle\n");
|
DEBUG_PRINTF("truffle\n");
|
||||||
return truffleExec(aux->truffle.mask1, aux->truffle.mask2, ptr, end);
|
return truffleExec(aux->truffle.mask_lo, aux->truffle.mask_hi, ptr, end);
|
||||||
|
#ifdef CAN_USE_WIDE_TRUFFLE
|
||||||
|
case ACCEL_TRUFFLE_WIDE:
|
||||||
|
DEBUG_PRINTF("truffle wide\n");
|
||||||
|
return truffleExecWide(aux->truffle.mask, ptr, end);
|
||||||
|
#endif // CAN_USE_WIDE_TRUFFLE
|
||||||
default:
|
default:
|
||||||
/* no acceleration, fall through and return current ptr */
|
/* no acceleration, fall through and return current ptr */
|
||||||
DEBUG_PRINTF("no accel; %u\n", (int)aux->accel_type);
|
DEBUG_PRINTF("no accel; %u\n", (int)aux->accel_type);
|
||||||
@@ -170,8 +175,7 @@ void do_accel_streaming(const union AccelAux *aux, const u8 *hbuf, size_t hlen,
|
|||||||
DEBUG_PRINTF("got %zu/%zu in 2nd buffer\n", delta, len);
|
DEBUG_PRINTF("got %zu/%zu in 2nd buffer\n", delta, len);
|
||||||
*start += delta;
|
*start += delta;
|
||||||
} else if (hlen) {
|
} else if (hlen) {
|
||||||
UNUSED size_t remaining = offset + ptr2 - found;
|
DEBUG_PRINTF("got %zu/%zu remaining in 1st buffer\n", offset + ptr2 - found, hlen);
|
||||||
DEBUG_PRINTF("got %zu/%zu remaining in 1st buffer\n", remaining, hlen);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -93,6 +93,7 @@ void dumpLits(UNUSED const vector<hwlmLiteral> &lits) {
|
|||||||
// Called by an assertion.
|
// Called by an assertion.
|
||||||
static
|
static
|
||||||
bool everyoneHasGroups(const vector<hwlmLiteral> &lits) {
|
bool everyoneHasGroups(const vector<hwlmLiteral> &lits) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
for (const auto &lit : lits) {
|
for (const auto &lit : lits) {
|
||||||
if (!lit.groups) {
|
if (!lit.groups) {
|
||||||
return false;
|
return false;
|
||||||
@@ -143,7 +144,7 @@ bytecode_ptr<HWLM> hwlmBuild(const HWLMProto &proto, const CompileContext &cc,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!eng) {
|
if (!eng) {
|
||||||
return nullptr;
|
return bytecode_ptr<HWLM>(nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(engSize);
|
assert(engSize);
|
||||||
@@ -155,6 +156,7 @@ bytecode_ptr<HWLM> hwlmBuild(const HWLMProto &proto, const CompileContext &cc,
|
|||||||
auto h = make_zeroed_bytecode_ptr<HWLM>(hwlm_len, 64);
|
auto h = make_zeroed_bytecode_ptr<HWLM>(hwlm_len, 64);
|
||||||
|
|
||||||
h->type = proto.engType;
|
h->type = proto.engType;
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
memcpy(HWLM_DATA(h.get()), eng.get(), engSize);
|
memcpy(HWLM_DATA(h.get()), eng.get(), engSize);
|
||||||
|
|
||||||
return h;
|
return h;
|
||||||
@@ -218,10 +220,12 @@ size_t hwlmSize(const HWLM *h) {
|
|||||||
|
|
||||||
switch (h->type) {
|
switch (h->type) {
|
||||||
case HWLM_ENGINE_NOOD:
|
case HWLM_ENGINE_NOOD:
|
||||||
engSize = noodSize((const noodTable *)HWLM_C_DATA(h));
|
// cppcheck-suppress cstyleCast
|
||||||
|
engSize = noodSize(reinterpret_cast<const noodTable *>(HWLM_C_DATA(h)));
|
||||||
break;
|
break;
|
||||||
case HWLM_ENGINE_FDR:
|
case HWLM_ENGINE_FDR:
|
||||||
engSize = fdrSize((const FDR *)HWLM_C_DATA(h));
|
// cppcheck-suppress cstyleCast
|
||||||
|
engSize = fdrSize(reinterpret_cast<const FDR *>(HWLM_C_DATA(h)));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -53,10 +53,12 @@ void hwlmGenerateDumpFiles(const HWLM *h, const string &base) {
|
|||||||
|
|
||||||
switch (h->type) {
|
switch (h->type) {
|
||||||
case HWLM_ENGINE_NOOD:
|
case HWLM_ENGINE_NOOD:
|
||||||
noodPrintStats((const noodTable *)HWLM_C_DATA(h), f);
|
// cppcheck-suppress cstyleCast
|
||||||
|
noodPrintStats(reinterpret_cast<const noodTable *>(HWLM_C_DATA(h)), f);
|
||||||
break;
|
break;
|
||||||
case HWLM_ENGINE_FDR:
|
case HWLM_ENGINE_FDR:
|
||||||
fdrPrintStats((const FDR *)HWLM_C_DATA(h), f);
|
// cppcheck-suppress cstyleCast
|
||||||
|
fdrPrintStats(reinterpret_cast<const FDR *>(HWLM_C_DATA(h)), f);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
fprintf(f, "<unknown hwlm subengine>\n");
|
fprintf(f, "<unknown hwlm subengine>\n");
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ u64a make_u64a_mask(const vector<u8> &v) {
|
|||||||
|
|
||||||
u64a mask = 0;
|
u64a mask = 0;
|
||||||
size_t len = v.size();
|
size_t len = v.size();
|
||||||
unsigned char *m = (unsigned char *)&mask;
|
u8 *m = reinterpret_cast<u8 *>(&mask);
|
||||||
DEBUG_PRINTF("making mask len %zu\n", len);
|
DEBUG_PRINTF("making mask len %zu\n", len);
|
||||||
memcpy(m, &v[0], len);
|
memcpy(m, &v[0], len);
|
||||||
return mask;
|
return mask;
|
||||||
@@ -156,7 +156,7 @@ void noodPrintStats(const noodTable *n, FILE *f) {
|
|||||||
n->msk_len);
|
n->msk_len);
|
||||||
fprintf(f, "String: ");
|
fprintf(f, "String: ");
|
||||||
for (u32 i = 0; i < n->msk_len; i++) {
|
for (u32 i = 0; i < n->msk_len; i++) {
|
||||||
const u8 *m = (const u8 *)&n->cmp;
|
const u8 *m = reinterpret_cast<const u8 *>(&n->cmp);
|
||||||
if (isgraph(m[i]) && m[i] != '\\') {
|
if (isgraph(m[i]) && m[i] != '\\') {
|
||||||
fprintf(f, "%c", m[i]);
|
fprintf(f, "%c", m[i]);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -148,15 +148,14 @@ hwlm_error_t doubleCheckMatched(const struct noodTable *n, const u8 *buf,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
svbool_t doubleMatched(svuint16_t chars, const u8 *d,
|
svbool_t doubleMatchedLoop(svuint16_t chars, const u8 *d,
|
||||||
svbool_t pg, svbool_t pg_rot,
|
|
||||||
svbool_t * const matched, svbool_t * const matched_rot) {
|
svbool_t * const matched, svbool_t * const matched_rot) {
|
||||||
svuint16_t vec = svreinterpret_u16(svld1_u8(pg, d));
|
svuint16_t vec = svreinterpret_u16(svld1_u8(svptrue_b8(), d));
|
||||||
// d - 1 won't underflow as the first position in buf has been dealt
|
// d - 1 won't underflow as the first position in buf has been dealt
|
||||||
// with meaning that d > buf
|
// with meaning that d > buf
|
||||||
svuint16_t vec_rot = svreinterpret_u16(svld1_u8(pg_rot, d - 1));
|
svuint16_t vec_rot = svreinterpret_u16(svld1_u8(svptrue_b8(), d - 1));
|
||||||
*matched = svmatch(pg, vec, chars);
|
*matched = svmatch(svptrue_b8(), vec, chars);
|
||||||
*matched_rot = svmatch(pg_rot, vec_rot, chars);
|
*matched_rot = svmatch(svptrue_b8(), vec_rot, chars);
|
||||||
return svorr_z(svptrue_b8(), *matched, *matched_rot);
|
return svorr_z(svptrue_b8(), *matched, *matched_rot);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -167,10 +166,34 @@ hwlm_error_t scanDoubleOnce(const struct noodTable *n, const u8 *buf,
|
|||||||
DEBUG_PRINTF("start %p end %p\n", d, e);
|
DEBUG_PRINTF("start %p end %p\n", d, e);
|
||||||
assert(d < e);
|
assert(d < e);
|
||||||
assert(d > buf);
|
assert(d > buf);
|
||||||
svbool_t pg = svwhilelt_b8_s64(0, e - d);
|
const ptrdiff_t size = e - d;
|
||||||
svbool_t pg_rot = svwhilelt_b8_s64(0, e - d + 1);
|
svbool_t pg = svwhilelt_b8_s64(0, size);
|
||||||
svbool_t matched, matched_rot;
|
svbool_t pg_rot = svwhilelt_b8_s64(0, size + 1);
|
||||||
svbool_t any = doubleMatched(svreinterpret_u16(chars), d, pg, pg_rot, &matched, &matched_rot);
|
|
||||||
|
svuint16_t vec = svreinterpret_u16(svld1_u8(pg, d));
|
||||||
|
// d - 1 won't underflow as the first position in buf has been dealt
|
||||||
|
// with meaning that d > buf
|
||||||
|
svuint16_t vec_rot = svreinterpret_u16(svld1_u8(pg_rot, d - 1));
|
||||||
|
|
||||||
|
// we reuse u8 predicates for u16 lanes. This means that we will check against one
|
||||||
|
// extra \0 character at the end of the vector.
|
||||||
|
if(unlikely(n->key1 == '\0')) {
|
||||||
|
if (size % 2) {
|
||||||
|
// if odd, vec has an odd number of lanes and has the spurious \0
|
||||||
|
svbool_t lane_to_disable = svrev_b8(svpfirst(svrev_b8(pg), svpfalse()));
|
||||||
|
pg = sveor_z(svptrue_b8(), pg, lane_to_disable);
|
||||||
|
} else {
|
||||||
|
// if even, vec_rot has an odd number of lanes and has the spurious \0
|
||||||
|
// we need to disable the last active lane as well, but we know pg is
|
||||||
|
// the same as pg_rot without the last lane
|
||||||
|
pg_rot = pg;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
svbool_t matched = svmatch(pg, vec, svreinterpret_u16(chars));
|
||||||
|
svbool_t matched_rot = svmatch(pg_rot, vec_rot, svreinterpret_u16(chars));
|
||||||
|
svbool_t any = svorr_z(svptrue_b8(), matched, matched_rot);
|
||||||
|
|
||||||
return doubleCheckMatched(n, buf, len, cbi, d, matched, matched_rot, any);
|
return doubleCheckMatched(n, buf, len, cbi, d, matched, matched_rot, any);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -187,8 +210,7 @@ hwlm_error_t scanDoubleLoop(const struct noodTable *n, const u8 *buf,
|
|||||||
for (size_t i = 0; i < loops; i++, d += svcntb()) {
|
for (size_t i = 0; i < loops; i++, d += svcntb()) {
|
||||||
DEBUG_PRINTF("d %p \n", d);
|
DEBUG_PRINTF("d %p \n", d);
|
||||||
svbool_t matched, matched_rot;
|
svbool_t matched, matched_rot;
|
||||||
svbool_t any = doubleMatched(svreinterpret_u16(chars), d, svptrue_b8(), svptrue_b8(),
|
svbool_t any = doubleMatchedLoop(svreinterpret_u16(chars), d, &matched, &matched_rot);
|
||||||
&matched, &matched_rot);
|
|
||||||
hwlm_error_t rv = doubleCheckMatched(n, buf, len, cbi, d,
|
hwlm_error_t rv = doubleCheckMatched(n, buf, len, cbi, d,
|
||||||
matched, matched_rot, any);
|
matched, matched_rot, any);
|
||||||
RETURN_IF_TERMINATED(rv);
|
RETURN_IF_TERMINATED(rv);
|
||||||
|
|||||||
@@ -142,9 +142,18 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
rv = truffleExec(accel->truffle.mask1, accel->truffle.mask2, c, c_end);
|
rv = truffleExec(accel->truffle.mask_lo, accel->truffle.mask_hi, c, c_end);
|
||||||
break;
|
break;
|
||||||
|
#ifdef CAN_USE_WIDE_TRUFFLE
|
||||||
|
case ACCEL_TRUFFLE_WIDE:
|
||||||
|
DEBUG_PRINTF("accel Truffle Wide %p %p\n", c, c_end);
|
||||||
|
if (c + 15 >= c_end) {
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
rv = truffleExecWide(accel->truffle.mask, c, c_end);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
case ACCEL_DSHUFTI:
|
case ACCEL_DSHUFTI:
|
||||||
DEBUG_PRINTF("accel dshufti %p %p\n", c, c_end);
|
DEBUG_PRINTF("accel dshufti %p %p\n", c, c_end);
|
||||||
if (c + 15 + 1 >= c_end) {
|
if (c + 15 + 1 >= c_end) {
|
||||||
|
|||||||
@@ -66,6 +66,7 @@ enum AccelType {
|
|||||||
ACCEL_VERM16,
|
ACCEL_VERM16,
|
||||||
ACCEL_DVERM16,
|
ACCEL_DVERM16,
|
||||||
ACCEL_DVERM16_MASKED,
|
ACCEL_DVERM16_MASKED,
|
||||||
|
ACCEL_TRUFFLE_WIDE,
|
||||||
};
|
};
|
||||||
|
|
||||||
/** \brief Structure for accel framework. */
|
/** \brief Structure for accel framework. */
|
||||||
@@ -136,8 +137,18 @@ union AccelAux {
|
|||||||
struct {
|
struct {
|
||||||
u8 accel_type;
|
u8 accel_type;
|
||||||
u8 offset;
|
u8 offset;
|
||||||
m128 mask1;
|
union {
|
||||||
m128 mask2;
|
m256 mask;
|
||||||
|
struct {
|
||||||
|
#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
|
||||||
|
m128 mask_lo;
|
||||||
|
m128 mask_hi;
|
||||||
|
#else
|
||||||
|
m128 mask_hi;
|
||||||
|
m128 mask_lo;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
};
|
||||||
} truffle;
|
} truffle;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -182,6 +182,7 @@ vector<vector<CharReach>> generate_paths(const raw_dfa &rdfa,
|
|||||||
vector<vector<CharReach>> rv;
|
vector<vector<CharReach>> rv;
|
||||||
rv.reserve(paths.size());
|
rv.reserve(paths.size());
|
||||||
for (auto &p : paths) {
|
for (auto &p : paths) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
rv.emplace_back(vector<CharReach>(std::make_move_iterator(p.reach.begin()),
|
rv.emplace_back(vector<CharReach>(std::make_move_iterator(p.reach.begin()),
|
||||||
std::make_move_iterator(p.reach.end())));
|
std::make_move_iterator(p.reach.end())));
|
||||||
}
|
}
|
||||||
@@ -426,10 +427,11 @@ void
|
|||||||
accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
||||||
const AccelScheme &info,
|
const AccelScheme &info,
|
||||||
void *accel_out) {
|
void *accel_out) {
|
||||||
AccelAux *accel = (AccelAux *)accel_out;
|
AccelAux *accel = reinterpret_cast<AccelAux *>(accel_out);
|
||||||
|
|
||||||
DEBUG_PRINTF("accelerations scheme has offset s%u/d%u\n", info.offset,
|
DEBUG_PRINTF("accelerations scheme has offset s%u/d%u\n", info.offset,
|
||||||
info.double_offset);
|
info.double_offset);
|
||||||
|
// cppcheck-suppress redundantInitialization
|
||||||
accel->generic.offset = verify_u8(info.offset);
|
accel->generic.offset = verify_u8(info.offset);
|
||||||
|
|
||||||
if (double_byte_ok(info) && info.double_cr.none() &&
|
if (double_byte_ok(info) && info.double_cr.none() &&
|
||||||
@@ -473,7 +475,8 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
|||||||
u8 c1 = info.double_byte.begin()->first & m1;
|
u8 c1 = info.double_byte.begin()->first & m1;
|
||||||
u8 c2 = info.double_byte.begin()->second & m2;
|
u8 c2 = info.double_byte.begin()->second & m2;
|
||||||
#ifdef HAVE_SVE2
|
#ifdef HAVE_SVE2
|
||||||
if (vermicelliDoubleMasked16Build(c1, c2, m1, m2, (u8 *)&accel->mdverm16.mask)) {
|
if (vermicelliDoubleMasked16Build(c1, c2, m1, m2,
|
||||||
|
reinterpret_cast<u8 *>(&accel->mdverm16.mask))) {
|
||||||
accel->accel_type = ACCEL_DVERM16_MASKED;
|
accel->accel_type = ACCEL_DVERM16_MASKED;
|
||||||
accel->mdverm16.offset = verify_u8(info.double_offset);
|
accel->mdverm16.offset = verify_u8(info.double_offset);
|
||||||
accel->mdverm16.c1 = c1;
|
accel->mdverm16.c1 = c1;
|
||||||
@@ -482,8 +485,9 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
|||||||
c1, c2);
|
c1, c2);
|
||||||
return;
|
return;
|
||||||
} else if (info.double_byte.size() <= 8 &&
|
} else if (info.double_byte.size() <= 8 &&
|
||||||
vermicelliDouble16Build(info.double_byte, (u8 *)&accel->dverm16.mask,
|
vermicelliDouble16Build(info.double_byte,
|
||||||
(u8 *)&accel->dverm16.firsts)) {
|
reinterpret_cast<u8 *>(&accel->dverm16.mask),
|
||||||
|
reinterpret_cast<u8 *>(&accel->dverm16.firsts))) {
|
||||||
accel->accel_type = ACCEL_DVERM16;
|
accel->accel_type = ACCEL_DVERM16;
|
||||||
accel->dverm16.offset = verify_u8(info.double_offset);
|
accel->dverm16.offset = verify_u8(info.double_offset);
|
||||||
DEBUG_PRINTF("building double16-vermicelli\n");
|
DEBUG_PRINTF("building double16-vermicelli\n");
|
||||||
@@ -503,8 +507,9 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
|||||||
}
|
}
|
||||||
#ifdef HAVE_SVE2
|
#ifdef HAVE_SVE2
|
||||||
if (info.double_byte.size() <= 8 &&
|
if (info.double_byte.size() <= 8 &&
|
||||||
vermicelliDouble16Build(info.double_byte, (u8 *)&accel->dverm16.mask,
|
vermicelliDouble16Build(info.double_byte,
|
||||||
(u8 *)&accel->dverm16.firsts)) {
|
reinterpret_cast<u8 *>(&accel->dverm16.mask),
|
||||||
|
reinterpret_cast<u8 *>(&accel->dverm16.firsts))) {
|
||||||
accel->accel_type = ACCEL_DVERM16;
|
accel->accel_type = ACCEL_DVERM16;
|
||||||
accel->dverm16.offset = verify_u8(info.double_offset);
|
accel->dverm16.offset = verify_u8(info.double_offset);
|
||||||
DEBUG_PRINTF("building double16-vermicelli\n");
|
DEBUG_PRINTF("building double16-vermicelli\n");
|
||||||
@@ -515,9 +520,11 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
|||||||
|
|
||||||
if (double_byte_ok(info) &&
|
if (double_byte_ok(info) &&
|
||||||
shuftiBuildDoubleMasks(
|
shuftiBuildDoubleMasks(
|
||||||
info.double_cr, info.double_byte, (u8 *)&accel->dshufti.lo1,
|
info.double_cr, info.double_byte,
|
||||||
(u8 *)&accel->dshufti.hi1, (u8 *)&accel->dshufti.lo2,
|
reinterpret_cast<u8 *>(&accel->dshufti.lo1),
|
||||||
(u8 *)&accel->dshufti.hi2)) {
|
reinterpret_cast<u8 *>(&accel->dshufti.hi1),
|
||||||
|
reinterpret_cast<u8 *>(&accel->dshufti.lo2),
|
||||||
|
reinterpret_cast<u8 *>(&accel->dshufti.hi2))) {
|
||||||
accel->accel_type = ACCEL_DSHUFTI;
|
accel->accel_type = ACCEL_DSHUFTI;
|
||||||
accel->dshufti.offset = verify_u8(info.double_offset);
|
accel->dshufti.offset = verify_u8(info.double_offset);
|
||||||
DEBUG_PRINTF("state %hu is double shufti\n", this_idx);
|
DEBUG_PRINTF("state %hu is double shufti\n", this_idx);
|
||||||
@@ -549,7 +556,7 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
|||||||
#ifdef HAVE_SVE2
|
#ifdef HAVE_SVE2
|
||||||
if (info.cr.count() <= 16) {
|
if (info.cr.count() <= 16) {
|
||||||
accel->accel_type = ACCEL_VERM16;
|
accel->accel_type = ACCEL_VERM16;
|
||||||
vermicelli16Build(info.cr, (u8 *)&accel->verm16.mask);
|
vermicelli16Build(info.cr, reinterpret_cast<u8 *>(&accel->verm16.mask));
|
||||||
DEBUG_PRINTF("state %hu is vermicelli16\n", this_idx);
|
DEBUG_PRINTF("state %hu is vermicelli16\n", this_idx);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -562,16 +569,27 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
|||||||
}
|
}
|
||||||
|
|
||||||
accel->accel_type = ACCEL_SHUFTI;
|
accel->accel_type = ACCEL_SHUFTI;
|
||||||
if (-1 != shuftiBuildMasks(info.cr, (u8 *)&accel->shufti.lo,
|
if (-1 != shuftiBuildMasks(info.cr,
|
||||||
(u8 *)&accel->shufti.hi)) {
|
reinterpret_cast<u8 *>(&accel->shufti.lo),
|
||||||
|
reinterpret_cast<u8 *>(&accel->shufti.hi))) {
|
||||||
DEBUG_PRINTF("state %hu is shufti\n", this_idx);
|
DEBUG_PRINTF("state %hu is shufti\n", this_idx);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(!info.cr.none());
|
assert(!info.cr.none());
|
||||||
accel->accel_type = ACCEL_TRUFFLE;
|
#if defined(CAN_USE_WIDE_TRUFFLE)
|
||||||
truffleBuildMasks(info.cr, (u8 *)&accel->truffle.mask1,
|
if(CAN_USE_WIDE_TRUFFLE) {
|
||||||
(u8 *)&accel->truffle.mask2);
|
accel->accel_type = ACCEL_TRUFFLE_WIDE;
|
||||||
|
truffleBuildMasksWide(info.cr,
|
||||||
|
reinterpret_cast<u8 *>(&accel->truffle.mask));
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
accel->accel_type = ACCEL_TRUFFLE;
|
||||||
|
truffleBuildMasks(info.cr,
|
||||||
|
reinterpret_cast<u8 *>(&accel->truffle.mask_lo),
|
||||||
|
reinterpret_cast<u8 *>(&accel->truffle.mask_hi));
|
||||||
|
}
|
||||||
DEBUG_PRINTF("state %hu is truffle\n", this_idx);
|
DEBUG_PRINTF("state %hu is truffle\n", this_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -93,6 +93,8 @@ const char *accelName(u8 accel_type) {
|
|||||||
return "double-shufti";
|
return "double-shufti";
|
||||||
case ACCEL_TRUFFLE:
|
case ACCEL_TRUFFLE:
|
||||||
return "truffle";
|
return "truffle";
|
||||||
|
case ACCEL_TRUFFLE_WIDE:
|
||||||
|
return "truffle wide";
|
||||||
case ACCEL_RED_TAPE:
|
case ACCEL_RED_TAPE:
|
||||||
return "red tape";
|
return "red tape";
|
||||||
default:
|
default:
|
||||||
@@ -178,6 +180,13 @@ void dumpTruffleCharReach(FILE *f, const u8 *hiset, const u8 *hiclear) {
|
|||||||
describeClass(cr).c_str());
|
describeClass(cr).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
void dumpWideTruffleCharReach(FILE *f, const u8 *mask) {
|
||||||
|
CharReach cr = truffle2crWide(mask);
|
||||||
|
fprintf(f, "count %zu class %s\n", cr.count(),
|
||||||
|
describeClass(cr).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
void dumpTruffleMasks(FILE *f, const u8 *hiset, const u8 *hiclear) {
|
void dumpTruffleMasks(FILE *f, const u8 *hiset, const u8 *hiclear) {
|
||||||
fprintf(f, "lo %s\n", dumpMask(hiset, 128).c_str());
|
fprintf(f, "lo %s\n", dumpMask(hiset, 128).c_str());
|
||||||
@@ -210,31 +219,38 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) {
|
|||||||
break;
|
break;
|
||||||
case ACCEL_SHUFTI: {
|
case ACCEL_SHUFTI: {
|
||||||
fprintf(f, "\n");
|
fprintf(f, "\n");
|
||||||
dumpShuftiMasks(f, (const u8 *)&accel.shufti.lo,
|
dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.shufti.lo),
|
||||||
(const u8 *)&accel.shufti.hi);
|
reinterpret_cast<const u8 *>(&accel.shufti.hi));
|
||||||
dumpShuftiCharReach(f, (const u8 *)&accel.shufti.lo,
|
dumpShuftiCharReach(f, reinterpret_cast<const u8 *>(&accel.shufti.lo),
|
||||||
(const u8 *)&accel.shufti.hi);
|
reinterpret_cast<const u8 *>(&accel.shufti.hi));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ACCEL_DSHUFTI:
|
case ACCEL_DSHUFTI:
|
||||||
fprintf(f, "\n");
|
fprintf(f, "\n");
|
||||||
fprintf(f, "mask 1\n");
|
fprintf(f, "mask 1\n");
|
||||||
dumpShuftiMasks(f, (const u8 *)&accel.dshufti.lo1,
|
dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo1),
|
||||||
(const u8 *)&accel.dshufti.hi1);
|
reinterpret_cast<const u8 *>(&accel.dshufti.hi1));
|
||||||
fprintf(f, "mask 2\n");
|
fprintf(f, "mask 2\n");
|
||||||
dumpShuftiMasks(f, (const u8 *)&accel.dshufti.lo2,
|
dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo2),
|
||||||
(const u8 *)&accel.dshufti.hi2);
|
reinterpret_cast<const u8 *>(&accel.dshufti.hi2));
|
||||||
dumpDShuftiCharReach(f, (const u8 *)&accel.dshufti.lo1,
|
dumpDShuftiCharReach(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo1),
|
||||||
(const u8 *)&accel.dshufti.hi1,
|
reinterpret_cast<const u8 *>(&accel.dshufti.hi1),
|
||||||
(const u8 *)&accel.dshufti.lo2,
|
reinterpret_cast<const u8 *>(&accel.dshufti.lo2),
|
||||||
(const u8 *)&accel.dshufti.hi2);
|
reinterpret_cast<const u8 *>(&accel.dshufti.hi2));
|
||||||
break;
|
break;
|
||||||
case ACCEL_TRUFFLE: {
|
case ACCEL_TRUFFLE: {
|
||||||
fprintf(f, "\n");
|
fprintf(f, "\n");
|
||||||
dumpTruffleMasks(f, (const u8 *)&accel.truffle.mask1,
|
dumpTruffleMasks(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo),
|
||||||
(const u8 *)&accel.truffle.mask2);
|
reinterpret_cast<const u8 *>(&accel.truffle.mask_hi));
|
||||||
dumpTruffleCharReach(f, (const u8 *)&accel.truffle.mask1,
|
dumpTruffleCharReach(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo),
|
||||||
(const u8 *)&accel.truffle.mask2);
|
reinterpret_cast<const u8 *>(&accel.truffle.mask_hi));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case ACCEL_TRUFFLE_WIDE: {
|
||||||
|
fprintf(f, "\n");
|
||||||
|
dumpTruffleMasks(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo),
|
||||||
|
reinterpret_cast<const u8 *>(&accel.truffle.mask_hi));
|
||||||
|
dumpWideTruffleCharReach(f, reinterpret_cast<const u8 *>(&accel.truffle.mask));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
|||||||
@@ -84,8 +84,9 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
DEBUG_PRINTF("attempting shufti for %zu chars\n", outs);
|
DEBUG_PRINTF("attempting shufti for %zu chars\n", outs);
|
||||||
if (-1 != shuftiBuildMasks(info.single_stops, (u8 *)&aux->shufti.lo,
|
if (-1 != shuftiBuildMasks(info.single_stops,
|
||||||
(u8 *)&aux->shufti.hi)) {
|
reinterpret_cast<u8 *>(&aux->shufti.lo),
|
||||||
|
reinterpret_cast<u8 *>(&aux->shufti.hi))) {
|
||||||
aux->accel_type = ACCEL_SHUFTI;
|
aux->accel_type = ACCEL_SHUFTI;
|
||||||
aux->shufti.offset = offset;
|
aux->shufti.offset = offset;
|
||||||
DEBUG_PRINTF("shufti built OK\n");
|
DEBUG_PRINTF("shufti built OK\n");
|
||||||
@@ -96,10 +97,20 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) {
|
|||||||
|
|
||||||
if (outs <= ACCEL_MAX_STOP_CHAR) {
|
if (outs <= ACCEL_MAX_STOP_CHAR) {
|
||||||
DEBUG_PRINTF("building Truffle for %zu chars\n", outs);
|
DEBUG_PRINTF("building Truffle for %zu chars\n", outs);
|
||||||
aux->accel_type = ACCEL_TRUFFLE;
|
|
||||||
aux->truffle.offset = offset;
|
aux->truffle.offset = offset;
|
||||||
truffleBuildMasks(info.single_stops, (u8 *)&aux->truffle.mask1,
|
#if defined(CAN_USE_WIDE_TRUFFLE)
|
||||||
(u8 *)&aux->truffle.mask2);
|
if(CAN_USE_WIDE_TRUFFLE) {
|
||||||
|
aux->accel_type = ACCEL_TRUFFLE_WIDE;
|
||||||
|
truffleBuildMasksWide(info.single_stops,
|
||||||
|
reinterpret_cast<u8 *>(&aux->truffle.mask));
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
aux->accel_type = ACCEL_TRUFFLE;
|
||||||
|
truffleBuildMasks(info.single_stops,
|
||||||
|
reinterpret_cast<u8 *>(&aux->truffle.mask_lo),
|
||||||
|
reinterpret_cast<u8 *>(&aux->truffle.mask_hi));
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -219,8 +230,9 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
|
|||||||
c1, c2);
|
c1, c2);
|
||||||
return;
|
return;
|
||||||
} else if (outs2 <= 8 &&
|
} else if (outs2 <= 8 &&
|
||||||
vermicelliDouble16Build(info.double_stop2, (u8 *)&aux->dverm16.mask,
|
vermicelliDouble16Build(info.double_stop2,
|
||||||
(u8 *)&aux->dverm16.firsts)) {
|
reinterpret_cast<u8 *>(&aux->dverm16.mask),
|
||||||
|
reinterpret_cast<u8 *>(&aux->dverm16.firsts))) {
|
||||||
aux->accel_type = ACCEL_DVERM16;
|
aux->accel_type = ACCEL_DVERM16;
|
||||||
aux->dverm16.offset = offset;
|
aux->dverm16.offset = offset;
|
||||||
DEBUG_PRINTF("building double16-vermicelli\n");
|
DEBUG_PRINTF("building double16-vermicelli\n");
|
||||||
@@ -254,9 +266,11 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
|
|||||||
aux->accel_type = ACCEL_DSHUFTI;
|
aux->accel_type = ACCEL_DSHUFTI;
|
||||||
aux->dshufti.offset = offset;
|
aux->dshufti.offset = offset;
|
||||||
if (shuftiBuildDoubleMasks(
|
if (shuftiBuildDoubleMasks(
|
||||||
info.double_stop1, info.double_stop2, (u8 *)&aux->dshufti.lo1,
|
info.double_stop1, info.double_stop2,
|
||||||
(u8 *)&aux->dshufti.hi1, (u8 *)&aux->dshufti.lo2,
|
reinterpret_cast<u8 *>(&aux->dshufti.lo1),
|
||||||
(u8 *)&aux->dshufti.hi2)) {
|
reinterpret_cast<u8 *>(&aux->dshufti.hi1),
|
||||||
|
reinterpret_cast<u8 *>(&aux->dshufti.lo2),
|
||||||
|
reinterpret_cast<u8 *>(&aux->dshufti.hi2))) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask
|
|||||||
|
|
||||||
template <uint16_t S>
|
template <uint16_t S>
|
||||||
static really_inline
|
static really_inline
|
||||||
SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars) {
|
SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> *inout_t1, SuperVector<S> chars) {
|
||||||
|
|
||||||
const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
|
const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
|
||||||
SuperVector<S> chars_lo = chars & low4bits;
|
SuperVector<S> chars_lo = chars & low4bits;
|
||||||
@@ -57,18 +57,25 @@ SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi,
|
|||||||
c1_lo.print8("c1_lo");
|
c1_lo.print8("c1_lo");
|
||||||
SuperVector<S> c1_hi = mask1_hi.template pshufb<true>(chars_hi);
|
SuperVector<S> c1_hi = mask1_hi.template pshufb<true>(chars_hi);
|
||||||
c1_hi.print8("c1_hi");
|
c1_hi.print8("c1_hi");
|
||||||
SuperVector<S> t1 = c1_lo | c1_hi;
|
SuperVector<S> new_t1 = c1_lo | c1_hi;
|
||||||
t1.print8("t1");
|
// t1 is the match mask for the first char of the patterns
|
||||||
|
new_t1.print8("t1");
|
||||||
|
|
||||||
SuperVector<S> c2_lo = mask2_lo.template pshufb<true>(chars_lo);
|
SuperVector<S> c2_lo = mask2_lo.template pshufb<true>(chars_lo);
|
||||||
c2_lo.print8("c2_lo");
|
c2_lo.print8("c2_lo");
|
||||||
SuperVector<S> c2_hi = mask2_hi.template pshufb<true>(chars_hi);
|
SuperVector<S> c2_hi = mask2_hi.template pshufb<true>(chars_hi);
|
||||||
c2_hi.print8("c2_hi");
|
c2_hi.print8("c2_hi");
|
||||||
SuperVector<S> t2 = c2_lo | c2_hi;
|
SuperVector<S> t2 = c2_lo | c2_hi;
|
||||||
|
// t2 is the match mask for the second char of the patterns
|
||||||
t2.print8("t2");
|
t2.print8("t2");
|
||||||
t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)");
|
|
||||||
SuperVector<S> t = t1 | (t2.template vshr_128_imm<1>());
|
// offset t1 so it aligns with t2. The hole created by the offset is filled
|
||||||
|
// with the last elements of the previous t1 so no info is lost.
|
||||||
|
// Bits set to 0 lining up indicate a match.
|
||||||
|
SuperVector<S> t = (new_t1.alignr(*inout_t1, S-1)) | t2;
|
||||||
t.print8("t");
|
t.print8("t");
|
||||||
|
|
||||||
|
*inout_t1 = new_t1;
|
||||||
|
|
||||||
return !t.eq(SuperVector<S>::Ones());
|
return !t.eq(SuperVector<S>::Ones());
|
||||||
}
|
}
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2015-2017, Intel Corporation
|
* Copyright (c) 2015-2017, Intel Corporation
|
||||||
* Copyright (c) 2020-2021, VectorCamp PC
|
* Copyright (c) 2020-2021, VectorCamp PC
|
||||||
|
* Copyright (c) 2023, Arm Limited
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@@ -32,6 +33,204 @@
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#ifdef HAVE_SVE
|
||||||
|
#ifdef HAVE_SVE2
|
||||||
|
|
||||||
|
/*
|
||||||
|
* blockSingleMask takes in a character set (as masks) and a string and return for each character
|
||||||
|
* of the string wether or not it is part of the set.
|
||||||
|
*
|
||||||
|
* 'shuf_mask_lo_highclear' and 'shuf_mask_lo_highset' are 128-bit masks where each bit
|
||||||
|
* represents whether or not a character is in the character set. The 'highclear' and
|
||||||
|
* 'highset' in the name refers to the MSb of the byte of the character (allowing two
|
||||||
|
* 128-bit masks to cover all 256 values).
|
||||||
|
*
|
||||||
|
* The mask is an array of 32 bytes and is encoded this way:
|
||||||
|
* Let C be a character in the set. The bit describing that character is at byte[C%32] and
|
||||||
|
* within that byte, it's at bit[C/32]
|
||||||
|
* As example, 'a' = 0x61, so the resulting mask will be: 0x00 0x08 0x00 0x00 0x00 ...
|
||||||
|
*
|
||||||
|
* Assume the mask is in one of those configurations:
|
||||||
|
* - both masks are exactly 128b wide
|
||||||
|
* - the first mask is exactly 256b wide and the second is zeroed.
|
||||||
|
* - the first mask is more than 256b wide, with bits past the 256th being zero, and the second mask is zeroed.
|
||||||
|
*/
|
||||||
|
static really_inline
|
||||||
|
svuint8_t blockSingleMaskWideSVE2(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
|
||||||
|
const svuint8_t pshub_mask = svdup_u8(0x1f);
|
||||||
|
const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201));
|
||||||
|
svuint8x2_t shuf_mask_32 = svcreate2(shuf_mask_lo_highclear, shuf_mask_lo_highset);
|
||||||
|
/*
|
||||||
|
* svtbl2 does a table lookup. Each byte in the second argument indexes into the array of bytes
|
||||||
|
* in shuf_mask_32 and saves the result in the corresponding byte of byte_select.
|
||||||
|
* We mask the chars so that we are using the low nibble of char as the index.
|
||||||
|
*/
|
||||||
|
svuint8_t byte_select = svtbl2(shuf_mask_32, svand_x(svptrue_b8(), chars, pshub_mask));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We now have selected the byte that contain the bit corresponding to the char. We need to
|
||||||
|
* further filter it, otherwise we'd get a match for any character % 32 to a searched character
|
||||||
|
*
|
||||||
|
* The low nibble was used previously to select the byte out of the mask. The high nibble is
|
||||||
|
* used to select the bit out of the byte. So we shift everything right by 5.
|
||||||
|
*
|
||||||
|
* Using svtbl, we can make an array where each element is a different bit. Using the high
|
||||||
|
* nibble we can get a mask selecting only the bit out of a byte that may have the relevant
|
||||||
|
* charset char.
|
||||||
|
*/
|
||||||
|
svuint8_t char_high_nibble = svlsr_x(svptrue_b8(), chars, 5);
|
||||||
|
svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_nibble);
|
||||||
|
/*
|
||||||
|
* We apply the bit_select mask onto the selected byte. What is left is the bit in the charset
|
||||||
|
* encoding the character in char. A non zero value means the char was in the charset
|
||||||
|
*
|
||||||
|
* The _x suffix only works if we process a full char vector. If we were to use a partial
|
||||||
|
* vector, then _z and a mask would be required on this svand only. Otherwise, the disabled
|
||||||
|
* lanes may have arbitrary values
|
||||||
|
*/
|
||||||
|
return svand_x(svptrue_b8(), byte_select, bit_select);
|
||||||
|
}
|
||||||
|
#endif //HAVE_SVE2
|
||||||
|
|
||||||
|
/*
|
||||||
|
* blockSingleMask takes in a character set (as masks) and a string and return for each character
|
||||||
|
* of the string wether or not it is part of the set.
|
||||||
|
*
|
||||||
|
* 'shuf_mask_lo_highclear' and 'shuf_mask_lo_highset' are 128-bit masks where each bit
|
||||||
|
* represents whether or not a character is in the character set. The 'highclear' and
|
||||||
|
* 'highset' in the name refers to the MSb of the byte of the character (allowing two
|
||||||
|
* 128-bit masks to cover all 256 values).
|
||||||
|
*
|
||||||
|
* The masks are arrays of 16 bytes each and are encoded this way:
|
||||||
|
* Let C be a character in the set. The bit describing that character is at byte[C%16] and
|
||||||
|
* within that byte, it's at bit[C/16]
|
||||||
|
* As example, 'a' = 0x61, so the resulting mask will be: 0x00 0x40 0x00 0x00 0x00 ...
|
||||||
|
*
|
||||||
|
* Assume both mask are 128b wide. If they are larger, the additional bits must be zero
|
||||||
|
*/
|
||||||
|
static really_inline
|
||||||
|
svuint8_t blockSingleMaskSVE(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
|
||||||
|
|
||||||
|
const svuint8_t highconst = svdup_u8(0x80);
|
||||||
|
const svuint8_t pshub_mask = svdup_u8(0x8f);
|
||||||
|
const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* svtbl does a table lookup. Each byte in the second argument indexes into the array of bytes
|
||||||
|
* in shuf_mask_lo_highclear and saves the result in the corresponding byte of byte_select_low.
|
||||||
|
* We mask the chars so that we are using the low nibble of char as the index but we keep the
|
||||||
|
* MSb so that high characters (not represented by the highclear mask) become an index out of
|
||||||
|
* bounds and result in a 0.
|
||||||
|
*/
|
||||||
|
svuint8_t byte_select_low = svtbl(shuf_mask_lo_highclear, svand_x(svptrue_b8(), chars, pshub_mask));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We flip the MSb of the chars and do the same table lookup with the highset mask.
|
||||||
|
* This way it's the characters with MSb cleared that will result in out of bands indexes.
|
||||||
|
* This allows us to cover the full range (0-127 and 128-255)
|
||||||
|
*/
|
||||||
|
svuint8_t char_high_flipped = sveor_x(svptrue_b8(), chars, highconst);
|
||||||
|
svuint8_t byte_select_high = svtbl(shuf_mask_lo_highset, svand_x(svptrue_b8(), char_high_flipped, pshub_mask));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We now have selected the byte that contain the bit corresponding to the char. We need to
|
||||||
|
* further filter it, otherwise we'd get a match for any character % 16 to a searched character
|
||||||
|
*
|
||||||
|
* The low nibble was used previously to select the byte out of the mask. The high nibble is
|
||||||
|
* used to select the bit out of the byte. So we shift everything right by 4.
|
||||||
|
*
|
||||||
|
* Using svtbl, we can make an array where each element is a different bit. Using the high
|
||||||
|
* nibble we can get a mask selecting only the bit out of a byte that may have the relevant
|
||||||
|
* charset char.
|
||||||
|
*/
|
||||||
|
svuint8_t char_high_nibble = svlsr_x(svptrue_b8(), chars, 4);
|
||||||
|
svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_nibble);
|
||||||
|
/*
|
||||||
|
* For every lane, only one of the byte selected may have a value, so we can OR them. We
|
||||||
|
* then apply the bit_select mask. What is left is the bit in the charset encoding the
|
||||||
|
* character in char. A non zero value means the char was in the charset
|
||||||
|
*
|
||||||
|
* The _x suffix only works if we process a full char vector. If we were to use a partial
|
||||||
|
* vector, then _z and a mask would be required on this svand only. Otherwise, the disabled
|
||||||
|
* lanes may have arbitrary values
|
||||||
|
*/
|
||||||
|
return svand_x(svptrue_b8(), svorr_x(svptrue_b8(), byte_select_low, byte_select_high), bit_select);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* blockSingleMask takes in a character set (as masks) and a string and return for each character
|
||||||
|
* of the string wether or not it is part of the set.
|
||||||
|
*
|
||||||
|
* 'shuf_mask_32' is a 256-bit masks where each bit represents whether or not a character is in
|
||||||
|
* the character set.
|
||||||
|
*
|
||||||
|
* The mask is an array of 32 bytes and is encoded this way:
|
||||||
|
* Let C be a character in the set. The bit describing that character is at byte[C%32] and
|
||||||
|
* within that byte, it's at bit[C/32]
|
||||||
|
* As example, 'a' = 0x61, so the resulting mask will be: 0x00 0x08 0x00 0x00 0x00 ...
|
||||||
|
*
|
||||||
|
* Assume both mask are 128b wide. If they are larger, the additional bits must be zero
|
||||||
|
*/
|
||||||
|
static really_inline
|
||||||
|
svuint8_t blockSingleMaskWideSVE(svuint8_t shuf_mask_32, svuint8_t chars) {//TODO I might have issues with the type
|
||||||
|
|
||||||
|
const svuint8_t pshub_mask = svdup_u8(0x1f);
|
||||||
|
const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* svtbl does a table lookup. Each byte in the second argument indexes into the array of bytes
|
||||||
|
* in shuf_mask_32 and saves the result in the corresponding byte of byte_select.
|
||||||
|
* We mask the chars so that we are using the low nibble of char as the index.
|
||||||
|
*/
|
||||||
|
svuint8_t byte_select = svtbl(shuf_mask_32, svand_x(svptrue_b8(), chars, pshub_mask));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We now have selected the byte that contain the bit corresponding to the char. We need to
|
||||||
|
* further filter it, otherwise we'd get a match for any character % 32 to a searched character
|
||||||
|
*
|
||||||
|
* The low nibble was used previously to select the byte out of the mask. The high nibble is
|
||||||
|
* used to select the bit out of the byte. So we shift everything right by 5.
|
||||||
|
*
|
||||||
|
* Using svtbl, we can make an array where each element is a different bit. Using the high
|
||||||
|
* nibble we can get a mask selecting only the bit out of a byte that may have the relevant
|
||||||
|
* charset char.
|
||||||
|
*/
|
||||||
|
svuint8_t char_high_nibble = svlsr_x(svptrue_b8(), chars, 5);
|
||||||
|
svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_nibble);
|
||||||
|
/*
|
||||||
|
* We apply the bit_select mask onto the selected byte. What is left is the bit in the charset
|
||||||
|
* encoding the character in char. A non zero value means the char was in the charset
|
||||||
|
*
|
||||||
|
* The _x suffix only works if we process a full char vector. If we were to use a partial
|
||||||
|
* vector, then _z and a mask would be required on this svand only. Otherwise, the disabled
|
||||||
|
* lanes may have arbitrary values
|
||||||
|
*/
|
||||||
|
return svand_x(svptrue_b8(), byte_select, bit_select);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* require normal truffle compilation. The 256b mask is split between the two parameters */
|
||||||
|
static really_inline
|
||||||
|
svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
|
||||||
|
return blockSingleMaskSVE(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* require wide truffle compilation. The 256b mask is fully contained in the first parameter */
|
||||||
|
static really_inline
|
||||||
|
svuint8_t blockSingleMaskWide32(svuint8_t shuf_mask_32, svuint8_t chars) {
|
||||||
|
return blockSingleMaskWideSVE(shuf_mask_32, chars);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef HAVE_SVE2
|
||||||
|
/* require wide truffle compilation. The 256b mask is split between the two parameters if the vector is 128b,
|
||||||
|
* or fully contained in the first parameter is it's 256b and more*/
|
||||||
|
static really_inline
|
||||||
|
svuint8_t blockSingleMaskWide(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
|
||||||
|
return blockSingleMaskWideSVE2(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
|
||||||
|
}
|
||||||
|
#endif //HAVE_SVE2
|
||||||
|
#endif //HAVE_SVE
|
||||||
|
|
||||||
|
/* require normal truffle compilation. The 256b mask is split between the two parameters */
|
||||||
template <uint16_t S>
|
template <uint16_t S>
|
||||||
static really_inline
|
static really_inline
|
||||||
const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars) {
|
const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars) {
|
||||||
|
|||||||
@@ -94,8 +94,8 @@ char subCastleReportCurrent(const struct Castle *c, struct mq *q,
|
|||||||
const struct SubCastle *sub = getSubCastle(c, subIdx);
|
const struct SubCastle *sub = getSubCastle(c, subIdx);
|
||||||
const struct RepeatInfo *info = getRepeatInfo(sub);
|
const struct RepeatInfo *info = getRepeatInfo(sub);
|
||||||
|
|
||||||
union RepeatControl *rctrl = getControl(q->state, sub);
|
const union RepeatControl *rctrl = getControl(q->state, sub);
|
||||||
char *rstate = (char *)q->streamState + sub->streamStateOffset +
|
const char *rstate = (char *)q->streamState + sub->streamStateOffset +
|
||||||
info->packedCtrlSize;
|
info->packedCtrlSize;
|
||||||
enum RepeatMatch match =
|
enum RepeatMatch match =
|
||||||
repeatHasMatch(info, rctrl, rstate, offset);
|
repeatHasMatch(info, rctrl, rstate, offset);
|
||||||
@@ -118,10 +118,10 @@ int castleReportCurrent(const struct Castle *c, struct mq *q) {
|
|||||||
|
|
||||||
if (c->exclusive) {
|
if (c->exclusive) {
|
||||||
u8 *active = (u8 *)q->streamState;
|
u8 *active = (u8 *)q->streamState;
|
||||||
u8 *groups = active + c->groupIterOffset;
|
const u8 *groups = active + c->groupIterOffset;
|
||||||
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
||||||
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
||||||
u8 *cur = active + i * c->activeIdxSize;
|
const u8 *cur = active + i * c->activeIdxSize;
|
||||||
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
||||||
DEBUG_PRINTF("subcastle %u\n", activeIdx);
|
DEBUG_PRINTF("subcastle %u\n", activeIdx);
|
||||||
if (subCastleReportCurrent(c, q,
|
if (subCastleReportCurrent(c, q,
|
||||||
@@ -156,8 +156,8 @@ char subCastleInAccept(const struct Castle *c, struct mq *q,
|
|||||||
}
|
}
|
||||||
const struct RepeatInfo *info = getRepeatInfo(sub);
|
const struct RepeatInfo *info = getRepeatInfo(sub);
|
||||||
|
|
||||||
union RepeatControl *rctrl = getControl(q->state, sub);
|
const union RepeatControl *rctrl = getControl(q->state, sub);
|
||||||
char *rstate = (char *)q->streamState + sub->streamStateOffset +
|
const char *rstate = (char *)q->streamState + sub->streamStateOffset +
|
||||||
info->packedCtrlSize;
|
info->packedCtrlSize;
|
||||||
enum RepeatMatch match =
|
enum RepeatMatch match =
|
||||||
repeatHasMatch(info, rctrl, rstate, offset);
|
repeatHasMatch(info, rctrl, rstate, offset);
|
||||||
@@ -180,10 +180,10 @@ char castleInAccept(const struct Castle *c, struct mq *q,
|
|||||||
|
|
||||||
if (c->exclusive) {
|
if (c->exclusive) {
|
||||||
u8 *active = (u8 *)q->streamState;
|
u8 *active = (u8 *)q->streamState;
|
||||||
u8 *groups = active + c->groupIterOffset;
|
const u8 *groups = active + c->groupIterOffset;
|
||||||
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
||||||
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
||||||
u8 *cur = active + i * c->activeIdxSize;
|
const u8 *cur = active + i * c->activeIdxSize;
|
||||||
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
||||||
DEBUG_PRINTF("subcastle %u\n", activeIdx);
|
DEBUG_PRINTF("subcastle %u\n", activeIdx);
|
||||||
if (subCastleInAccept(c, q, report, offset, activeIdx)) {
|
if (subCastleInAccept(c, q, report, offset, activeIdx)) {
|
||||||
@@ -213,8 +213,8 @@ void subCastleDeactivateStaleSubs(const struct Castle *c, const u64a offset,
|
|||||||
const struct SubCastle *sub = getSubCastle(c, subIdx);
|
const struct SubCastle *sub = getSubCastle(c, subIdx);
|
||||||
const struct RepeatInfo *info = getRepeatInfo(sub);
|
const struct RepeatInfo *info = getRepeatInfo(sub);
|
||||||
|
|
||||||
union RepeatControl *rctrl = getControl(full_state, sub);
|
const union RepeatControl *rctrl = getControl(full_state, sub);
|
||||||
char *rstate = (char *)stream_state + sub->streamStateOffset +
|
const char *rstate = (char *)stream_state + sub->streamStateOffset +
|
||||||
info->packedCtrlSize;
|
info->packedCtrlSize;
|
||||||
|
|
||||||
if (repeatHasMatch(info, rctrl, rstate, offset) == REPEAT_STALE) {
|
if (repeatHasMatch(info, rctrl, rstate, offset) == REPEAT_STALE) {
|
||||||
@@ -242,10 +242,10 @@ void castleDeactivateStaleSubs(const struct Castle *c, const u64a offset,
|
|||||||
|
|
||||||
if (c->exclusive) {
|
if (c->exclusive) {
|
||||||
u8 *active = (u8 *)stream_state;
|
u8 *active = (u8 *)stream_state;
|
||||||
u8 *groups = active + c->groupIterOffset;
|
const u8 *groups = active + c->groupIterOffset;
|
||||||
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
||||||
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
||||||
u8 *cur = active + i * c->activeIdxSize;
|
const u8 *cur = active + i * c->activeIdxSize;
|
||||||
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
||||||
DEBUG_PRINTF("subcastle %u\n", activeIdx);
|
DEBUG_PRINTF("subcastle %u\n", activeIdx);
|
||||||
subCastleDeactivateStaleSubs(c, offset, full_state,
|
subCastleDeactivateStaleSubs(c, offset, full_state,
|
||||||
@@ -329,8 +329,8 @@ void subCastleFindMatch(const struct Castle *c, const u64a begin,
|
|||||||
size_t *mloc, char *found, const u32 subIdx) {
|
size_t *mloc, char *found, const u32 subIdx) {
|
||||||
const struct SubCastle *sub = getSubCastle(c, subIdx);
|
const struct SubCastle *sub = getSubCastle(c, subIdx);
|
||||||
const struct RepeatInfo *info = getRepeatInfo(sub);
|
const struct RepeatInfo *info = getRepeatInfo(sub);
|
||||||
union RepeatControl *rctrl = getControl(full_state, sub);
|
const union RepeatControl *rctrl = getControl(full_state, sub);
|
||||||
char *rstate = (char *)stream_state + sub->streamStateOffset +
|
const char *rstate = (char *)stream_state + sub->streamStateOffset +
|
||||||
info->packedCtrlSize;
|
info->packedCtrlSize;
|
||||||
|
|
||||||
u64a match = repeatNextMatch(info, rctrl, rstate, begin);
|
u64a match = repeatNextMatch(info, rctrl, rstate, begin);
|
||||||
@@ -374,10 +374,10 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,
|
|||||||
|
|
||||||
if (c->exclusive) {
|
if (c->exclusive) {
|
||||||
u8 *active = (u8 *)stream_state;
|
u8 *active = (u8 *)stream_state;
|
||||||
u8 *groups = active + c->groupIterOffset;
|
const u8 *groups = active + c->groupIterOffset;
|
||||||
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
||||||
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
||||||
u8 *cur = active + i * c->activeIdxSize;
|
const u8 *cur = active + i * c->activeIdxSize;
|
||||||
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
||||||
DEBUG_PRINTF("subcastle %u\n", activeIdx);
|
DEBUG_PRINTF("subcastle %u\n", activeIdx);
|
||||||
subCastleFindMatch(c, begin, end, full_state, stream_state, mloc,
|
subCastleFindMatch(c, begin, end, full_state, stream_state, mloc,
|
||||||
@@ -386,7 +386,7 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (c->exclusive != PURE_EXCLUSIVE) {
|
if (c->exclusive != PURE_EXCLUSIVE) {
|
||||||
u8 *active = (u8 *)stream_state + c->activeOffset;
|
const u8 *active = (u8 *)stream_state + c->activeOffset;
|
||||||
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
|
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
|
||||||
i != MMB_INVALID;
|
i != MMB_INVALID;
|
||||||
i = mmbit_iterate(active, c->numRepeats, i)) {
|
i = mmbit_iterate(active, c->numRepeats, i)) {
|
||||||
@@ -400,8 +400,8 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
u64a subCastleNextMatch(const struct Castle *c, void *full_state,
|
u64a subCastleNextMatch(const struct Castle *c, const void *full_state,
|
||||||
void *stream_state, const u64a loc,
|
const void *stream_state, const u64a loc,
|
||||||
const u32 subIdx) {
|
const u32 subIdx) {
|
||||||
DEBUG_PRINTF("subcastle %u\n", subIdx);
|
DEBUG_PRINTF("subcastle %u\n", subIdx);
|
||||||
const struct SubCastle *sub = getSubCastle(c, subIdx);
|
const struct SubCastle *sub = getSubCastle(c, subIdx);
|
||||||
@@ -489,15 +489,14 @@ char castleMatchLoop(const struct Castle *c, const u64a begin, const u64a end,
|
|||||||
// full_state (scratch).
|
// full_state (scratch).
|
||||||
|
|
||||||
u64a offset = end; // min offset of next match
|
u64a offset = end; // min offset of next match
|
||||||
u32 activeIdx = 0;
|
|
||||||
mmbit_clear(matching, c->numRepeats);
|
mmbit_clear(matching, c->numRepeats);
|
||||||
if (c->exclusive) {
|
if (c->exclusive) {
|
||||||
u8 *active = (u8 *)stream_state;
|
u8 *active = (u8 *)stream_state;
|
||||||
u8 *groups = active + c->groupIterOffset;
|
u8 *groups = active + c->groupIterOffset;
|
||||||
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
||||||
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
||||||
u8 *cur = active + i * c->activeIdxSize;
|
const u8 *cur = active + i * c->activeIdxSize;
|
||||||
activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
||||||
u64a match = subCastleNextMatch(c, full_state, stream_state,
|
u64a match = subCastleNextMatch(c, full_state, stream_state,
|
||||||
loc, activeIdx);
|
loc, activeIdx);
|
||||||
set_matching(c, match, groups, matching, c->numGroups, i,
|
set_matching(c, match, groups, matching, c->numGroups, i,
|
||||||
@@ -797,7 +796,7 @@ char nfaExecCastle_Q_i(const struct NFA *n, struct mq *q, s64a end,
|
|||||||
|
|
||||||
char found = 0;
|
char found = 0;
|
||||||
if (c->exclusive) {
|
if (c->exclusive) {
|
||||||
u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
|
const u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
|
||||||
found = mmbit_any(groups, c->numGroups);
|
found = mmbit_any(groups, c->numGroups);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -864,7 +863,7 @@ char nfaExecCastle_Q_i(const struct NFA *n, struct mq *q, s64a end,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (c->exclusive) {
|
if (c->exclusive) {
|
||||||
u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
|
const u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
|
||||||
if (mmbit_any_precise(groups, c->numGroups)) {
|
if (mmbit_any_precise(groups, c->numGroups)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@@ -884,7 +883,7 @@ char nfaExecCastle_Q2(const struct NFA *n, struct mq *q, s64a end) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
s64a castleLastKillLoc(const struct Castle *c, struct mq *q) {
|
s64a castleLastKillLoc(const struct Castle *c, const struct mq *q) {
|
||||||
assert(q_cur_type(q) == MQE_START);
|
assert(q_cur_type(q) == MQE_START);
|
||||||
assert(q_last_type(q) == MQE_END);
|
assert(q_last_type(q) == MQE_END);
|
||||||
s64a sp = q_cur_loc(q);
|
s64a sp = q_cur_loc(q);
|
||||||
@@ -907,7 +906,6 @@ s64a castleLastKillLoc(const struct Castle *c, struct mq *q) {
|
|||||||
if (castleRevScan(c, q->history, sp + hlen, ep + hlen, &loc)) {
|
if (castleRevScan(c, q->history, sp + hlen, ep + hlen, &loc)) {
|
||||||
return (s64a)loc - hlen;
|
return (s64a)loc - hlen;
|
||||||
}
|
}
|
||||||
ep = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return sp - 1; /* the repeats are never killed */
|
return sp - 1; /* the repeats are never killed */
|
||||||
@@ -959,7 +957,7 @@ char nfaExecCastle_QR(const struct NFA *n, struct mq *q, ReportID report) {
|
|||||||
|
|
||||||
char found = 0;
|
char found = 0;
|
||||||
if (c->exclusive) {
|
if (c->exclusive) {
|
||||||
u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
|
const u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
|
||||||
found = mmbit_any_precise(groups, c->numGroups);
|
found = mmbit_any_precise(groups, c->numGroups);
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -1007,10 +1005,10 @@ char nfaExecCastle_inAnyAccept(const struct NFA *n, struct mq *q) {
|
|||||||
|
|
||||||
if (c->exclusive) {
|
if (c->exclusive) {
|
||||||
u8 *active = (u8 *)q->streamState;
|
u8 *active = (u8 *)q->streamState;
|
||||||
u8 *groups = active + c->groupIterOffset;
|
const u8 *groups = active + c->groupIterOffset;
|
||||||
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
||||||
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
||||||
u8 *cur = active + i * c->activeIdxSize;
|
const u8 *cur = active + i * c->activeIdxSize;
|
||||||
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
||||||
DEBUG_PRINTF("subcastle %u\n", activeIdx);
|
DEBUG_PRINTF("subcastle %u\n", activeIdx);
|
||||||
const struct SubCastle *sub = getSubCastle(c, activeIdx);
|
const struct SubCastle *sub = getSubCastle(c, activeIdx);
|
||||||
@@ -1079,7 +1077,7 @@ void subCastleQueueCompressState(const struct Castle *c, const u32 subIdx,
|
|||||||
const struct mq *q, const u64a offset) {
|
const struct mq *q, const u64a offset) {
|
||||||
const struct SubCastle *sub = getSubCastle(c, subIdx);
|
const struct SubCastle *sub = getSubCastle(c, subIdx);
|
||||||
const struct RepeatInfo *info = getRepeatInfo(sub);
|
const struct RepeatInfo *info = getRepeatInfo(sub);
|
||||||
union RepeatControl *rctrl = getControl(q->state, sub);
|
const union RepeatControl *rctrl = getControl(q->state, sub);
|
||||||
char *packed = (char *)q->streamState + sub->streamStateOffset;
|
char *packed = (char *)q->streamState + sub->streamStateOffset;
|
||||||
DEBUG_PRINTF("sub %u next match %llu\n", subIdx,
|
DEBUG_PRINTF("sub %u next match %llu\n", subIdx,
|
||||||
repeatNextMatch(info, rctrl,
|
repeatNextMatch(info, rctrl,
|
||||||
@@ -1100,10 +1098,10 @@ char nfaExecCastle_queueCompressState(const struct NFA *n, const struct mq *q,
|
|||||||
DEBUG_PRINTF("offset=%llu\n", offset);
|
DEBUG_PRINTF("offset=%llu\n", offset);
|
||||||
if (c->exclusive) {
|
if (c->exclusive) {
|
||||||
u8 *active = (u8 *)q->streamState;
|
u8 *active = (u8 *)q->streamState;
|
||||||
u8 *groups = active + c->groupIterOffset;
|
const u8 *groups = active + c->groupIterOffset;
|
||||||
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
|
||||||
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
|
||||||
u8 *cur = active + i * c->activeIdxSize;
|
const u8 *cur = active + i * c->activeIdxSize;
|
||||||
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
|
||||||
DEBUG_PRINTF("packing state for sub %u\n", activeIdx);
|
DEBUG_PRINTF("packing state for sub %u\n", activeIdx);
|
||||||
subCastleQueueCompressState(c, activeIdx, q, offset);
|
subCastleQueueCompressState(c, activeIdx, q, offset);
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ namespace ue2 {
|
|||||||
static
|
static
|
||||||
void dumpTextSubCastle(const SubCastle &sub, FILE *f) {
|
void dumpTextSubCastle(const SubCastle &sub, FILE *f) {
|
||||||
const RepeatInfo *info =
|
const RepeatInfo *info =
|
||||||
(const RepeatInfo *)((const char *)&sub + sub.repeatInfoOffset);
|
reinterpret_cast<const RepeatInfo *>(reinterpret_cast<const char *>(&sub) + sub.repeatInfoOffset);
|
||||||
fprintf(f, " repeat model: %s\n", repeatTypeName(info->type));
|
fprintf(f, " repeat model: %s\n", repeatTypeName(info->type));
|
||||||
fprintf(f, " repeat bounds: {%u, %u}\n", info->repeatMin,
|
fprintf(f, " repeat bounds: {%u, %u}\n", info->repeatMin,
|
||||||
info->repeatMax);
|
info->repeatMax);
|
||||||
@@ -69,7 +69,7 @@ void dumpTextSubCastle(const SubCastle &sub, FILE *f) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void nfaExecCastle_dump(const struct NFA *nfa, const string &base) {
|
void nfaExecCastle_dump(const struct NFA *nfa, const string &base) {
|
||||||
const Castle *c = (const Castle *)getImplNfa(nfa);
|
const Castle *c = reinterpret_cast<const Castle *>(getImplNfa(nfa));
|
||||||
|
|
||||||
StdioFile f(base + ".txt", "w");
|
StdioFile f(base + ".txt", "w");
|
||||||
|
|
||||||
@@ -88,15 +88,15 @@ void nfaExecCastle_dump(const struct NFA *nfa, const string &base) {
|
|||||||
fprintf(f, "negated verm, scanning for 0x%02x\n", c->u.verm.c);
|
fprintf(f, "negated verm, scanning for 0x%02x\n", c->u.verm.c);
|
||||||
break;
|
break;
|
||||||
case CASTLE_SHUFTI: {
|
case CASTLE_SHUFTI: {
|
||||||
const CharReach cr = shufti2cr((const u8 *)&c->u.shuf.mask_lo,
|
const CharReach cr = shufti2cr(reinterpret_cast<const u8 *>(&c->u.shuf.mask_lo),
|
||||||
(const u8 *)&c->u.shuf.mask_hi);
|
reinterpret_cast<const u8 *>(&c->u.shuf.mask_hi));
|
||||||
fprintf(f, "shufti, scanning for %s (%zu chars)\n",
|
fprintf(f, "shufti, scanning for %s (%zu chars)\n",
|
||||||
describeClass(cr).c_str(), cr.count());
|
describeClass(cr).c_str(), cr.count());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CASTLE_TRUFFLE: {
|
case CASTLE_TRUFFLE: {
|
||||||
const CharReach cr = truffle2cr((const u8 *)&c->u.truffle.mask1,
|
const CharReach cr = truffle2cr(reinterpret_cast<const u8 *>(&c->u.truffle.mask1),
|
||||||
(const u8 *)&c->u.truffle.mask2);
|
reinterpret_cast<const u8 *>(&c->u.truffle.mask2));
|
||||||
fprintf(f, "truffle, scanning for %s (%zu chars)\n",
|
fprintf(f, "truffle, scanning for %s (%zu chars)\n",
|
||||||
describeClass(cr).c_str(), cr.count());
|
describeClass(cr).c_str(), cr.count());
|
||||||
break;
|
break;
|
||||||
@@ -112,7 +112,7 @@ void nfaExecCastle_dump(const struct NFA *nfa, const string &base) {
|
|||||||
fprintf(f, "\n");
|
fprintf(f, "\n");
|
||||||
|
|
||||||
const SubCastle *sub =
|
const SubCastle *sub =
|
||||||
(const SubCastle *)((const char *)c + sizeof(Castle));
|
reinterpret_cast<const SubCastle *>(reinterpret_cast<const char *>(c) + sizeof(Castle));
|
||||||
for (u32 i = 0; i < c->numRepeats; i++) {
|
for (u32 i = 0; i < c->numRepeats; i++) {
|
||||||
fprintf(f, "Sub %u:\n", i);
|
fprintf(f, "Sub %u:\n", i);
|
||||||
dumpTextSubCastle(sub[i], f);
|
dumpTextSubCastle(sub[i], f);
|
||||||
|
|||||||
@@ -106,25 +106,27 @@ void writeCastleScanEngine(const CharReach &cr, Castle *c) {
|
|||||||
#ifdef HAVE_SVE2
|
#ifdef HAVE_SVE2
|
||||||
if (cr.count() <= 16) {
|
if (cr.count() <= 16) {
|
||||||
c->type = CASTLE_NVERM16;
|
c->type = CASTLE_NVERM16;
|
||||||
vermicelli16Build(cr, (u8 *)&c->u.verm16.mask);
|
vermicelli16Build(cr, reinterpret_cast<u8 *>(&c->u.verm16.mask));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (negated.count() <= 16) {
|
if (negated.count() <= 16) {
|
||||||
c->type = CASTLE_VERM16;
|
c->type = CASTLE_VERM16;
|
||||||
vermicelli16Build(negated, (u8 *)&c->u.verm16.mask);
|
vermicelli16Build(negated, reinterpret_cast<u8 *>(&c->u.verm16.mask));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif // HAVE_SVE2
|
#endif // HAVE_SVE2
|
||||||
|
|
||||||
if (shuftiBuildMasks(negated, (u8 *)&c->u.shuf.mask_lo,
|
if (shuftiBuildMasks(negated,
|
||||||
(u8 *)&c->u.shuf.mask_hi) != -1) {
|
reinterpret_cast<u8 *>(&c->u.shuf.mask_lo),
|
||||||
|
reinterpret_cast<u8 *>(&c->u.shuf.mask_hi)) != -1) {
|
||||||
c->type = CASTLE_SHUFTI;
|
c->type = CASTLE_SHUFTI;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
c->type = CASTLE_TRUFFLE;
|
c->type = CASTLE_TRUFFLE;
|
||||||
truffleBuildMasks(negated, (u8 *)(u8 *)&c->u.truffle.mask1,
|
truffleBuildMasks(negated,
|
||||||
(u8 *)&c->u.truffle.mask2);
|
reinterpret_cast<u8 *>(&c->u.truffle.mask1),
|
||||||
|
reinterpret_cast<u8 *>(&c->u.truffle.mask2));
|
||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
@@ -227,11 +229,13 @@ vector<u32> removeClique(CliqueGraph &cg) {
|
|||||||
while (!graph_empty(cg)) {
|
while (!graph_empty(cg)) {
|
||||||
const vector<u32> &c = cliquesVec.back();
|
const vector<u32> &c = cliquesVec.back();
|
||||||
vector<CliqueVertex> dead;
|
vector<CliqueVertex> dead;
|
||||||
for (const auto &v : vertices_range(cg)) {
|
|
||||||
if (find(c.begin(), c.end(), cg[v].stateId) != c.end()) {
|
auto deads = [&c=c, &cg=cg](const CliqueVertex &v) {
|
||||||
dead.emplace_back(v);
|
return (find(c.begin(), c.end(), cg[v].stateId) != c.end());
|
||||||
}
|
};
|
||||||
}
|
const auto &vr = vertices_range(cg);
|
||||||
|
std::copy_if(begin(vr), end(vr), std::back_inserter(dead), deads);
|
||||||
|
|
||||||
for (const auto &v : dead) {
|
for (const auto &v : dead) {
|
||||||
clear_vertex(v, cg);
|
clear_vertex(v, cg);
|
||||||
remove_vertex(v, cg);
|
remove_vertex(v, cg);
|
||||||
@@ -294,7 +298,7 @@ vector<vector<u32>> checkExclusion(u32 &streamStateSize,
|
|||||||
size_t lower = 0;
|
size_t lower = 0;
|
||||||
size_t total = 0;
|
size_t total = 0;
|
||||||
while (lower < trigSize) {
|
while (lower < trigSize) {
|
||||||
vector<CliqueVertex> vertices;
|
vector<CliqueVertex> clvertices;
|
||||||
unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
|
unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
|
||||||
|
|
||||||
vector<vector<size_t>> min_reset_dist;
|
vector<vector<size_t>> min_reset_dist;
|
||||||
@@ -302,7 +306,7 @@ vector<vector<u32>> checkExclusion(u32 &streamStateSize,
|
|||||||
// get min reset distance for each repeat
|
// get min reset distance for each repeat
|
||||||
for (size_t i = lower; i < upper; i++) {
|
for (size_t i = lower; i < upper; i++) {
|
||||||
CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
|
CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
|
||||||
vertices.emplace_back(v);
|
clvertices.emplace_back(v);
|
||||||
|
|
||||||
const vector<size_t> &tmp_dist =
|
const vector<size_t> &tmp_dist =
|
||||||
minResetDistToEnd(triggers[i], cr);
|
minResetDistToEnd(triggers[i], cr);
|
||||||
@@ -311,11 +315,11 @@ vector<vector<u32>> checkExclusion(u32 &streamStateSize,
|
|||||||
|
|
||||||
// find exclusive pair for each repeat
|
// find exclusive pair for each repeat
|
||||||
for (size_t i = lower; i < upper; i++) {
|
for (size_t i = lower; i < upper; i++) {
|
||||||
CliqueVertex s = vertices[i - lower];
|
CliqueVertex s = clvertices[i - lower];
|
||||||
for (size_t j = i + 1; j < upper; j++) {
|
for (size_t j = i + 1; j < upper; j++) {
|
||||||
if (findExclusivePair(i, j, lower, min_reset_dist,
|
if (findExclusivePair(i, j, lower, min_reset_dist,
|
||||||
triggers)) {
|
triggers)) {
|
||||||
CliqueVertex d = vertices[j - lower];
|
CliqueVertex d = clvertices[j - lower];
|
||||||
add_edge(s, d, *cg);
|
add_edge(s, d, *cg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -600,9 +604,9 @@ buildCastle(const CastleProto &proto,
|
|||||||
nfa->minWidth = verify_u32(minWidth);
|
nfa->minWidth = verify_u32(minWidth);
|
||||||
nfa->maxWidth = maxWidth.is_finite() ? verify_u32(maxWidth) : 0;
|
nfa->maxWidth = maxWidth.is_finite() ? verify_u32(maxWidth) : 0;
|
||||||
|
|
||||||
char * const base_ptr = (char *)nfa.get() + sizeof(NFA);
|
char * const base_ptr = reinterpret_cast<char *>(nfa.get()) + sizeof(NFA);
|
||||||
char *ptr = base_ptr;
|
char *ptr = base_ptr;
|
||||||
Castle *c = (Castle *)ptr;
|
Castle *c = reinterpret_cast<Castle *>(ptr);
|
||||||
c->numRepeats = verify_u32(subs.size());
|
c->numRepeats = verify_u32(subs.size());
|
||||||
c->numGroups = exclusiveInfo.numGroups;
|
c->numGroups = exclusiveInfo.numGroups;
|
||||||
c->exclusive = verify_s8(exclusive);
|
c->exclusive = verify_s8(exclusive);
|
||||||
@@ -613,7 +617,7 @@ buildCastle(const CastleProto &proto,
|
|||||||
writeCastleScanEngine(cr, c);
|
writeCastleScanEngine(cr, c);
|
||||||
|
|
||||||
ptr += sizeof(Castle);
|
ptr += sizeof(Castle);
|
||||||
SubCastle *subCastles = ((SubCastle *)(ROUNDUP_PTR(ptr, alignof(u32))));
|
SubCastle *subCastles = reinterpret_cast<SubCastle *>(ROUNDUP_PTR(ptr, alignof(u32)));
|
||||||
copy(subs.begin(), subs.end(), subCastles);
|
copy(subs.begin(), subs.end(), subCastles);
|
||||||
|
|
||||||
u32 length = 0;
|
u32 length = 0;
|
||||||
@@ -623,16 +627,17 @@ buildCastle(const CastleProto &proto,
|
|||||||
SubCastle *sub = &subCastles[i];
|
SubCastle *sub = &subCastles[i];
|
||||||
sub->repeatInfoOffset = offset;
|
sub->repeatInfoOffset = offset;
|
||||||
|
|
||||||
ptr = (char *)sub + offset;
|
ptr = reinterpret_cast<char *>(sub) + offset;
|
||||||
memcpy(ptr, &infos[i], sizeof(RepeatInfo));
|
memcpy(ptr, &infos[i], sizeof(RepeatInfo));
|
||||||
|
|
||||||
if (patchSize[i]) {
|
if (patchSize[i]) {
|
||||||
RepeatInfo *info = (RepeatInfo *)ptr;
|
RepeatInfo *info = reinterpret_cast<RepeatInfo *>(ptr);
|
||||||
u64a *table = ((u64a *)(ROUNDUP_PTR(((char *)(info) +
|
char *info_base = reinterpret_cast<char *>(info);
|
||||||
sizeof(*info)), alignof(u64a))));
|
u64a *table = reinterpret_cast<u64a *>(ROUNDUP_PTR(info_base +
|
||||||
|
sizeof(*info), alignof(u64a)));
|
||||||
copy(tables.begin() + tableIdx,
|
copy(tables.begin() + tableIdx,
|
||||||
tables.begin() + tableIdx + patchSize[i], table);
|
tables.begin() + tableIdx + patchSize[i], table);
|
||||||
u32 diff = (char *)table - (char *)info +
|
u32 diff = reinterpret_cast<char *>(table) - info_base +
|
||||||
sizeof(u64a) * patchSize[i];
|
sizeof(u64a) * patchSize[i];
|
||||||
info->length = diff;
|
info->length = diff;
|
||||||
length += diff;
|
length += diff;
|
||||||
@@ -655,7 +660,6 @@ buildCastle(const CastleProto &proto,
|
|||||||
if (!stale_iter.empty()) {
|
if (!stale_iter.empty()) {
|
||||||
c->staleIterOffset = verify_u32(ptr - base_ptr);
|
c->staleIterOffset = verify_u32(ptr - base_ptr);
|
||||||
copy_bytes(ptr, stale_iter);
|
copy_bytes(ptr, stale_iter);
|
||||||
ptr += byte_length(stale_iter);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return nfa;
|
return nfa;
|
||||||
@@ -672,6 +676,7 @@ set<ReportID> all_reports(const CastleProto &proto) {
|
|||||||
depth findMinWidth(const CastleProto &proto) {
|
depth findMinWidth(const CastleProto &proto) {
|
||||||
depth min_width(depth::infinity());
|
depth min_width(depth::infinity());
|
||||||
for (const PureRepeat &pr : proto.repeats | map_values) {
|
for (const PureRepeat &pr : proto.repeats | map_values) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
min_width = min(min_width, pr.bounds.min);
|
min_width = min(min_width, pr.bounds.min);
|
||||||
}
|
}
|
||||||
return min_width;
|
return min_width;
|
||||||
@@ -680,6 +685,7 @@ depth findMinWidth(const CastleProto &proto) {
|
|||||||
depth findMaxWidth(const CastleProto &proto) {
|
depth findMaxWidth(const CastleProto &proto) {
|
||||||
depth max_width(0);
|
depth max_width(0);
|
||||||
for (const PureRepeat &pr : proto.repeats | map_values) {
|
for (const PureRepeat &pr : proto.repeats | map_values) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
max_width = max(max_width, pr.bounds.max);
|
max_width = max(max_width, pr.bounds.max);
|
||||||
}
|
}
|
||||||
return max_width;
|
return max_width;
|
||||||
@@ -746,6 +752,7 @@ u32 CastleProto::merge(const PureRepeat &pr) {
|
|||||||
|
|
||||||
// First, see if this repeat is already in this castle.
|
// First, see if this repeat is already in this castle.
|
||||||
for (const auto &m : repeats) {
|
for (const auto &m : repeats) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
if (m.second == pr) {
|
if (m.second == pr) {
|
||||||
DEBUG_PRINTF("repeat already present, with top %u\n", m.first);
|
DEBUG_PRINTF("repeat already present, with top %u\n", m.first);
|
||||||
return m.first;
|
return m.first;
|
||||||
@@ -919,7 +926,7 @@ void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) {
|
|||||||
u32 min_bound = pr.bounds.min; // always finite
|
u32 min_bound = pr.bounds.min; // always finite
|
||||||
if (min_bound == 0) { // Vacuous case, we can only do this once.
|
if (min_bound == 0) { // Vacuous case, we can only do this once.
|
||||||
assert(!edge(g.start, g.accept, g).second);
|
assert(!edge(g.start, g.accept, g).second);
|
||||||
NFAEdge e = add_edge(g.start, g.accept, g);
|
NFAEdge e = add_edge(g.start, g.accept, g).first;
|
||||||
g[e].tops.insert(top);
|
g[e].tops.insert(top);
|
||||||
g[u].reports.insert(pr.reports.begin(), pr.reports.end());
|
g[u].reports.insert(pr.reports.begin(), pr.reports.end());
|
||||||
min_bound = 1;
|
min_bound = 1;
|
||||||
@@ -928,7 +935,7 @@ void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) {
|
|||||||
for (u32 i = 0; i < min_bound; i++) {
|
for (u32 i = 0; i < min_bound; i++) {
|
||||||
NFAVertex v = add_vertex(g);
|
NFAVertex v = add_vertex(g);
|
||||||
g[v].char_reach = pr.reach;
|
g[v].char_reach = pr.reach;
|
||||||
NFAEdge e = add_edge(u, v, g);
|
NFAEdge e = add_edge(u, v, g).first;
|
||||||
if (u == g.start) {
|
if (u == g.start) {
|
||||||
g[e].tops.insert(top);
|
g[e].tops.insert(top);
|
||||||
}
|
}
|
||||||
@@ -947,7 +954,7 @@ void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) {
|
|||||||
if (head != u) {
|
if (head != u) {
|
||||||
add_edge(head, v, g);
|
add_edge(head, v, g);
|
||||||
}
|
}
|
||||||
NFAEdge e = add_edge(u, v, g);
|
NFAEdge e = add_edge(u, v, g).first;
|
||||||
if (u == g.start) {
|
if (u == g.start) {
|
||||||
g[e].tops.insert(top);
|
g[e].tops.insert(top);
|
||||||
}
|
}
|
||||||
@@ -970,6 +977,7 @@ void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) {
|
|||||||
static
|
static
|
||||||
bool hasZeroMinBound(const CastleProto &proto) {
|
bool hasZeroMinBound(const CastleProto &proto) {
|
||||||
const depth zero(0);
|
const depth zero(0);
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
for (const PureRepeat &pr : proto.repeats | map_values) {
|
for (const PureRepeat &pr : proto.repeats | map_values) {
|
||||||
if (pr.bounds.min == zero) {
|
if (pr.bounds.min == zero) {
|
||||||
return true;
|
return true;
|
||||||
|
|||||||
@@ -263,6 +263,7 @@ void mapping_new_states(const HopcroftInfo &info,
|
|||||||
new_states.reserve(num_partitions);
|
new_states.reserve(num_partitions);
|
||||||
|
|
||||||
for (const auto &m : ordering) {
|
for (const auto &m : ordering) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
new_states.emplace_back(rdfa.states[m.first]);
|
new_states.emplace_back(rdfa.states[m.first]);
|
||||||
}
|
}
|
||||||
rdfa.states = std::move(new_states);
|
rdfa.states = std::move(new_states);
|
||||||
@@ -304,6 +305,7 @@ void minimize_hopcroft(raw_dfa &rdfa, const Grey &grey) {
|
|||||||
DEBUG_PRINTF("dfa is empty\n");
|
DEBUG_PRINTF("dfa is empty\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cppcheck-suppress unreadVariable
|
||||||
UNUSED const size_t states_before = rdfa.states.size();
|
UNUSED const size_t states_before = rdfa.states.size();
|
||||||
|
|
||||||
HopcroftInfo info(rdfa);
|
HopcroftInfo info(rdfa);
|
||||||
|
|||||||
@@ -978,14 +978,14 @@ char nfaExecGough16_initCompressedState(const struct NFA *nfa, u64a offset,
|
|||||||
char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q) {
|
char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q) {
|
||||||
const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n);
|
const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n);
|
||||||
NfaCallback cb = q->cb;
|
NfaCallback cb = q->cb;
|
||||||
void *ctxt = q->context;
|
|
||||||
u8 s = *(u8 *)q->state;
|
u8 s = *(u8 *)q->state;
|
||||||
u64a offset = q_cur_offset(q);
|
u64a offset = q_cur_offset(q);
|
||||||
struct gough_som_info *som = getSomInfo(q->state);
|
const struct gough_som_info *som = getSomInfo(q->state);
|
||||||
assert(q_cur_type(q) == MQE_START);
|
assert(q_cur_type(q) == MQE_START);
|
||||||
assert(s);
|
assert(s);
|
||||||
|
|
||||||
if (s >= m->accept_limit_8) {
|
if (s >= m->accept_limit_8) {
|
||||||
|
void *ctxt = q->context;
|
||||||
u32 cached_accept_id = 0;
|
u32 cached_accept_id = 0;
|
||||||
u16 cached_accept_state = 0;
|
u16 cached_accept_state = 0;
|
||||||
u32 cached_accept_som = 0;
|
u32 cached_accept_som = 0;
|
||||||
@@ -1000,16 +1000,16 @@ char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q) {
|
|||||||
char nfaExecGough16_reportCurrent(const struct NFA *n, struct mq *q) {
|
char nfaExecGough16_reportCurrent(const struct NFA *n, struct mq *q) {
|
||||||
const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n);
|
const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n);
|
||||||
NfaCallback cb = q->cb;
|
NfaCallback cb = q->cb;
|
||||||
void *ctxt = q->context;
|
|
||||||
u16 s = *(u16 *)q->state;
|
u16 s = *(u16 *)q->state;
|
||||||
const struct mstate_aux *aux = get_aux(m, s);
|
const struct mstate_aux *aux = get_aux(m, s);
|
||||||
u64a offset = q_cur_offset(q);
|
u64a offset = q_cur_offset(q);
|
||||||
struct gough_som_info *som = getSomInfo(q->state);
|
const struct gough_som_info *som = getSomInfo(q->state);
|
||||||
assert(q_cur_type(q) == MQE_START);
|
assert(q_cur_type(q) == MQE_START);
|
||||||
DEBUG_PRINTF("state %hu\n", s);
|
DEBUG_PRINTF("state %hu\n", s);
|
||||||
assert(s);
|
assert(s);
|
||||||
|
|
||||||
if (aux->accept) {
|
if (aux->accept) {
|
||||||
|
void *ctxt = q->context;
|
||||||
u32 cached_accept_id = 0;
|
u32 cached_accept_id = 0;
|
||||||
u16 cached_accept_state = 0;
|
u16 cached_accept_state = 0;
|
||||||
u32 cached_accept_som = 0;
|
u32 cached_accept_som = 0;
|
||||||
|
|||||||
@@ -92,6 +92,7 @@ struct gough_info {
|
|||||||
static really_inline
|
static really_inline
|
||||||
const struct gough_info *get_gough(const struct mcclellan *m) {
|
const struct gough_info *get_gough(const struct mcclellan *m) {
|
||||||
assert(m->haig_offset);
|
assert(m->haig_offset);
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
const char *n = (const char *)m - sizeof(struct NFA);
|
const char *n = (const char *)m - sizeof(struct NFA);
|
||||||
return (const struct gough_info *)(n + m->haig_offset);
|
return (const struct gough_info *)(n + m->haig_offset);
|
||||||
}
|
}
|
||||||
@@ -102,6 +103,7 @@ const u32 *get_gough_top_offsets(const struct mcclellan *m) {
|
|||||||
if (!g->top_prog_offset) {
|
if (!g->top_prog_offset) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
const char *n = (const char *)m - sizeof(struct NFA);
|
const char *n = (const char *)m - sizeof(struct NFA);
|
||||||
return (const u32 *)(n + g->top_prog_offset);
|
return (const u32 *)(n + g->top_prog_offset);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -132,7 +132,7 @@ void GoughSSAVarMin::replace_input(GoughSSAVar *old_v, GoughSSAVar *new_v) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
void translateRawReports(UNUSED GoughGraph &cfg, UNUSED const raw_som_dfa &raw,
|
void translateRawReports(UNUSED const GoughGraph &cfg, UNUSED const raw_som_dfa &raw,
|
||||||
const flat_map<u32, GoughSSAVarJoin *> &joins_at_s,
|
const flat_map<u32, GoughSSAVarJoin *> &joins_at_s,
|
||||||
UNUSED GoughVertex s,
|
UNUSED GoughVertex s,
|
||||||
const set<som_report> &reports_in,
|
const set<som_report> &reports_in,
|
||||||
@@ -206,10 +206,6 @@ void makeCFG_top_edge(GoughGraph &cfg, const vector<GoughVertex> &vertices,
|
|||||||
assert(contains(src_slots, slot_id));
|
assert(contains(src_slots, slot_id));
|
||||||
|
|
||||||
shared_ptr<GoughSSAVarMin> vmin = make_shared<GoughSSAVarMin>();
|
shared_ptr<GoughSSAVarMin> vmin = make_shared<GoughSSAVarMin>();
|
||||||
if (!vmin) {
|
|
||||||
assert(0);
|
|
||||||
throw std::bad_alloc();
|
|
||||||
}
|
|
||||||
cfg[e].vars.emplace_back(vmin);
|
cfg[e].vars.emplace_back(vmin);
|
||||||
final_var = vmin.get();
|
final_var = vmin.get();
|
||||||
|
|
||||||
@@ -321,10 +317,6 @@ void makeCFG_edge(GoughGraph &cfg, const map<u32, u32> &som_creators,
|
|||||||
DEBUG_PRINTF("bypassing min on join %u\n", slot_id);
|
DEBUG_PRINTF("bypassing min on join %u\n", slot_id);
|
||||||
} else {
|
} else {
|
||||||
shared_ptr<GoughSSAVarMin> vmin = make_shared<GoughSSAVarMin>();
|
shared_ptr<GoughSSAVarMin> vmin = make_shared<GoughSSAVarMin>();
|
||||||
if (!vmin) {
|
|
||||||
assert(0);
|
|
||||||
throw std::bad_alloc();
|
|
||||||
}
|
|
||||||
cfg[e].vars.emplace_back(vmin);
|
cfg[e].vars.emplace_back(vmin);
|
||||||
final_var = vmin.get();
|
final_var = vmin.get();
|
||||||
|
|
||||||
@@ -441,10 +433,11 @@ unique_ptr<GoughGraph> makeCFG(const raw_som_dfa &raw) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
|
// cppcheck-suppress constParameterReference
|
||||||
void copy_propagate_report_set(vector<pair<ReportID, GoughSSAVar *> > &rep) {
|
void copy_propagate_report_set(vector<pair<ReportID, GoughSSAVar *> > &rep) {
|
||||||
vector<pair<ReportID, GoughSSAVar *> >::iterator it = rep.begin();
|
vector<pair<ReportID, GoughSSAVar *> >::iterator it = rep.begin();
|
||||||
while (it != rep.end()) {
|
while (it != rep.end()) {
|
||||||
GoughSSAVar *var = it->second;
|
const GoughSSAVar *var = it->second;
|
||||||
if (!var) {
|
if (!var) {
|
||||||
++it;
|
++it;
|
||||||
continue;
|
continue;
|
||||||
@@ -546,7 +539,7 @@ void remove_dead(GoughGraph &g) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
while (!queue.empty()) {
|
while (!queue.empty()) {
|
||||||
GoughSSAVar *v = queue.back();
|
const GoughSSAVar *v = queue.back();
|
||||||
queue.pop_back();
|
queue.pop_back();
|
||||||
for (GoughSSAVar *var : v->get_inputs()) {
|
for (GoughSSAVar *var : v->get_inputs()) {
|
||||||
if (var->seen) {
|
if (var->seen) {
|
||||||
@@ -602,6 +595,7 @@ void GoughSSAVarNew::generate(vector<gough_ins> *out) const {
|
|||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
template<typename C, typename K>
|
template<typename C, typename K>
|
||||||
bool contains_loose(const C &container, const K &key) {
|
bool contains_loose(const C &container, const K &key) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
for (const auto &elem : container) {
|
for (const auto &elem : container) {
|
||||||
if (elem == key) {
|
if (elem == key) {
|
||||||
return true;
|
return true;
|
||||||
@@ -650,6 +644,7 @@ void GoughSSAVarJoin::generate(UNUSED vector<gough_ins> *out) const {
|
|||||||
|
|
||||||
GoughSSAVar *GoughSSAVarJoin::get_input(const GoughEdge &prev) const {
|
GoughSSAVar *GoughSSAVarJoin::get_input(const GoughEdge &prev) const {
|
||||||
for (const auto &var_edge : input_map) {
|
for (const auto &var_edge : input_map) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
if (contains(var_edge.second, prev)) {
|
if (contains(var_edge.second, prev)) {
|
||||||
return var_edge.first;
|
return var_edge.first;
|
||||||
}
|
}
|
||||||
@@ -658,8 +653,8 @@ GoughSSAVar *GoughSSAVarJoin::get_input(const GoughEdge &prev) const {
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
const flat_set<GoughEdge> &GoughSSAVarJoin::get_edges_for_input(
|
// cppcheck-suppress constParameterPointer
|
||||||
GoughSSAVar *input) const {
|
const flat_set<GoughEdge> &GoughSSAVarJoin::get_edges_for_input(GoughSSAVar *input) const {
|
||||||
return input_map.at(input);
|
return input_map.at(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -810,7 +805,7 @@ private:
|
|||||||
|
|
||||||
static
|
static
|
||||||
void prep_joins_for_generation(const GoughGraph &g, GoughVertex v,
|
void prep_joins_for_generation(const GoughGraph &g, GoughVertex v,
|
||||||
map<GoughEdge, edge_join_info> *edge_info) {
|
map<GoughEdge, edge_join_info> &edge_info) {
|
||||||
DEBUG_PRINTF("writing out joins for %u\n", g[v].state_id);
|
DEBUG_PRINTF("writing out joins for %u\n", g[v].state_id);
|
||||||
for (const auto &var : g[v].vars) {
|
for (const auto &var : g[v].vars) {
|
||||||
u32 dest_slot = var->slot;
|
u32 dest_slot = var->slot;
|
||||||
@@ -821,7 +816,7 @@ void prep_joins_for_generation(const GoughGraph &g, GoughVertex v,
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (const GoughEdge &incoming_edge : var_edges.second) {
|
for (const GoughEdge &incoming_edge : var_edges.second) {
|
||||||
(*edge_info)[incoming_edge].insert(input, dest_slot);
|
edge_info[incoming_edge].insert(input, dest_slot);
|
||||||
DEBUG_PRINTF("need %u<-%u\n", dest_slot, input);
|
DEBUG_PRINTF("need %u<-%u\n", dest_slot, input);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -919,7 +914,7 @@ void build_blocks(const GoughGraph &g,
|
|||||||
}
|
}
|
||||||
|
|
||||||
map<GoughEdge, edge_join_info> eji;
|
map<GoughEdge, edge_join_info> eji;
|
||||||
prep_joins_for_generation(g, t, &eji);
|
prep_joins_for_generation(g, t, eji);
|
||||||
|
|
||||||
for (auto &m : eji) {
|
for (auto &m : eji) {
|
||||||
vector<gough_ins> &block = (*blocks)[gough_edge_id(g, m.first)];
|
vector<gough_ins> &block = (*blocks)[gough_edge_id(g, m.first)];
|
||||||
@@ -1017,7 +1012,7 @@ void update_accel_prog_offset(const gough_build_strat &gbs,
|
|||||||
verts[gbs.gg[v].state_id] = v;
|
verts[gbs.gg[v].state_id] = v;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto &m : gbs.built_accel) {
|
for (const auto &m : gbs.built_accel) {
|
||||||
gough_accel *ga = m.first;
|
gough_accel *ga = m.first;
|
||||||
assert(!ga->prog_offset);
|
assert(!ga->prog_offset);
|
||||||
GoughVertex v = verts[m.second];
|
GoughVertex v = verts[m.second];
|
||||||
@@ -1050,7 +1045,7 @@ bytecode_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
|
|||||||
|| !cc.streaming);
|
|| !cc.streaming);
|
||||||
|
|
||||||
if (!cc.grey.allowGough) {
|
if (!cc.grey.allowGough) {
|
||||||
return nullptr;
|
return bytecode_ptr<NFA>(nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
DEBUG_PRINTF("hello world\n");
|
DEBUG_PRINTF("hello world\n");
|
||||||
@@ -1081,11 +1076,11 @@ bytecode_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
|
|||||||
auto basic_dfa = mcclellanCompile_i(raw, gbs, cc);
|
auto basic_dfa = mcclellanCompile_i(raw, gbs, cc);
|
||||||
assert(basic_dfa);
|
assert(basic_dfa);
|
||||||
if (!basic_dfa) {
|
if (!basic_dfa) {
|
||||||
return nullptr;
|
return bytecode_ptr<NFA>(nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
u8 alphaShift
|
const auto nfa = static_cast<const mcclellan *>(getImplNfa(basic_dfa.get()));
|
||||||
= ((const mcclellan *)getImplNfa(basic_dfa.get()))->alphaShift;
|
u8 alphaShift = nfa->alphaShift;
|
||||||
u32 edge_count = (1U << alphaShift) * raw.states.size();
|
u32 edge_count = (1U << alphaShift) * raw.states.size();
|
||||||
|
|
||||||
u32 curr_offset = ROUNDUP_N(basic_dfa->length, 4);
|
u32 curr_offset = ROUNDUP_N(basic_dfa->length, 4);
|
||||||
@@ -1126,8 +1121,8 @@ bytecode_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
|
|||||||
u32 gough_size = ROUNDUP_N(curr_offset, 16);
|
u32 gough_size = ROUNDUP_N(curr_offset, 16);
|
||||||
auto gough_dfa = make_zeroed_bytecode_ptr<NFA>(gough_size);
|
auto gough_dfa = make_zeroed_bytecode_ptr<NFA>(gough_size);
|
||||||
|
|
||||||
memcpy(gough_dfa.get(), basic_dfa.get(), basic_dfa->length);
|
memcpy(reinterpret_cast<char *>(gough_dfa.get()), basic_dfa.get(), basic_dfa->length);
|
||||||
memcpy((char *)gough_dfa.get() + haig_offset, &gi, sizeof(gi));
|
memcpy(reinterpret_cast<char *>(gough_dfa.get()) + haig_offset, &gi, sizeof(gi));
|
||||||
if (gough_dfa->type == MCCLELLAN_NFA_16) {
|
if (gough_dfa->type == MCCLELLAN_NFA_16) {
|
||||||
gough_dfa->type = GOUGH_NFA_16;
|
gough_dfa->type = GOUGH_NFA_16;
|
||||||
} else {
|
} else {
|
||||||
@@ -1140,18 +1135,18 @@ bytecode_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
|
|||||||
gough_dfa->streamStateSize = base_state_size + slot_count * somPrecision;
|
gough_dfa->streamStateSize = base_state_size + slot_count * somPrecision;
|
||||||
gough_dfa->scratchStateSize = (u32)(16 + scratch_slot_count * sizeof(u64a));
|
gough_dfa->scratchStateSize = (u32)(16 + scratch_slot_count * sizeof(u64a));
|
||||||
|
|
||||||
mcclellan *m = (mcclellan *)getMutableImplNfa(gough_dfa.get());
|
auto *m = reinterpret_cast<mcclellan *>(getMutableImplNfa(gough_dfa.get()));
|
||||||
m->haig_offset = haig_offset;
|
m->haig_offset = haig_offset;
|
||||||
|
|
||||||
/* update nfa length, haig_info offset (leave mcclellan length alone) */
|
/* update nfa length, haig_info offset (leave mcclellan length alone) */
|
||||||
gough_dfa->length = gough_size;
|
gough_dfa->length = gough_size;
|
||||||
|
|
||||||
/* copy in blocks */
|
/* copy in blocks */
|
||||||
copy_bytes((u8 *)gough_dfa.get() + edge_prog_offset, edge_blocks);
|
copy_bytes(reinterpret_cast<u8 *>(gough_dfa.get()) + edge_prog_offset, edge_blocks);
|
||||||
if (top_prog_offset) {
|
if (top_prog_offset) {
|
||||||
copy_bytes((u8 *)gough_dfa.get() + top_prog_offset, top_blocks);
|
copy_bytes(reinterpret_cast<u8 *>(gough_dfa.get()) + top_prog_offset, top_blocks);
|
||||||
}
|
}
|
||||||
copy_bytes((u8 *)gough_dfa.get() + prog_base_offset, temp_blocks);
|
copy_bytes(reinterpret_cast<u8 *>(gough_dfa.get()) + prog_base_offset, temp_blocks);
|
||||||
|
|
||||||
return gough_dfa;
|
return gough_dfa;
|
||||||
}
|
}
|
||||||
@@ -1184,7 +1179,7 @@ AccelScheme gough_build_strat::find_escape_strings(dstate_id_t this_idx) const {
|
|||||||
void gough_build_strat::buildAccel(dstate_id_t this_idx, const AccelScheme &info,
|
void gough_build_strat::buildAccel(dstate_id_t this_idx, const AccelScheme &info,
|
||||||
void *accel_out) {
|
void *accel_out) {
|
||||||
assert(mcclellan_build_strat::accelSize() == sizeof(AccelAux));
|
assert(mcclellan_build_strat::accelSize() == sizeof(AccelAux));
|
||||||
gough_accel *accel = (gough_accel *)accel_out;
|
gough_accel *accel = reinterpret_cast<gough_accel *>(accel_out);
|
||||||
/* build a plain accelaux so we can work out where we can get to */
|
/* build a plain accelaux so we can work out where we can get to */
|
||||||
mcclellan_build_strat::buildAccel(this_idx, info, &accel->accel);
|
mcclellan_build_strat::buildAccel(this_idx, info, &accel->accel);
|
||||||
DEBUG_PRINTF("state %hu is accel with type %hhu\n", this_idx,
|
DEBUG_PRINTF("state %hu is accel with type %hhu\n", this_idx,
|
||||||
@@ -1299,7 +1294,7 @@ unique_ptr<raw_report_info> gough_build_strat::gatherReports(
|
|||||||
*arbReport = MO_INVALID_IDX;
|
*arbReport = MO_INVALID_IDX;
|
||||||
assert(!ri->rl.empty()); /* all components should be able to generate
|
assert(!ri->rl.empty()); /* all components should be able to generate
|
||||||
reports */
|
reports */
|
||||||
return std::move(ri);
|
return ri;
|
||||||
}
|
}
|
||||||
|
|
||||||
u32 raw_gough_report_info_impl::getReportListSize() const {
|
u32 raw_gough_report_info_impl::getReportListSize() const {
|
||||||
@@ -1322,7 +1317,8 @@ void raw_gough_report_info_impl::fillReportLists(NFA *n, size_t base_offset,
|
|||||||
for (const raw_gough_report_list &r : rl) {
|
for (const raw_gough_report_list &r : rl) {
|
||||||
ro.emplace_back(base_offset);
|
ro.emplace_back(base_offset);
|
||||||
|
|
||||||
gough_report_list *p = (gough_report_list *)((char *)n + base_offset);
|
u8 * n_ptr = reinterpret_cast<u8 *>(n);
|
||||||
|
gough_report_list *p = reinterpret_cast<gough_report_list *>(n_ptr + base_offset);
|
||||||
u32 i = 0;
|
u32 i = 0;
|
||||||
|
|
||||||
for (const som_report &sr : r.reports) {
|
for (const som_report &sr : r.reports) {
|
||||||
|
|||||||
@@ -146,6 +146,7 @@ bool verify_neighbour(const GoughGraph &g, GoughVertex u,
|
|||||||
const map<gough_edge_id, vector<gough_ins> > &blocks,
|
const map<gough_edge_id, vector<gough_ins> > &blocks,
|
||||||
const set<GoughVertex> &succs,
|
const set<GoughVertex> &succs,
|
||||||
const vector<gough_ins> &block_sl) {
|
const vector<gough_ins> &block_sl) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
for (const auto &e : out_edges_range(u, g)) {
|
for (const auto &e : out_edges_range(u, g)) {
|
||||||
if (!g[e].reach.any()) { /* ignore top edges */
|
if (!g[e].reach.any()) { /* ignore top edges */
|
||||||
continue;
|
continue;
|
||||||
@@ -172,6 +173,7 @@ static
|
|||||||
bool verify_neighbour_no_block(const GoughGraph &g, GoughVertex u,
|
bool verify_neighbour_no_block(const GoughGraph &g, GoughVertex u,
|
||||||
const map<gough_edge_id, vector<gough_ins> > &blocks,
|
const map<gough_edge_id, vector<gough_ins> > &blocks,
|
||||||
const set<GoughVertex> &succs) {
|
const set<GoughVertex> &succs) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
for (const auto &e : out_edges_range(u, g)) {
|
for (const auto &e : out_edges_range(u, g)) {
|
||||||
if (!g[e].reach.any()) { /* ignore top edges */
|
if (!g[e].reach.any()) { /* ignore top edges */
|
||||||
continue;
|
continue;
|
||||||
@@ -229,6 +231,7 @@ bool allow_two_byte_accel(const GoughGraph &g,
|
|||||||
succs.insert(target(e, g));
|
succs.insert(target(e, g));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
for (auto w : adjacent_vertices_range(v, g)) {
|
for (auto w : adjacent_vertices_range(v, g)) {
|
||||||
if (w != v && !verify_neighbour(g, w, blocks, succs, block_sl)) {
|
if (w != v && !verify_neighbour(g, w, blocks, succs, block_sl)) {
|
||||||
return false;
|
return false;
|
||||||
@@ -249,6 +252,7 @@ bool allow_two_byte_accel(const GoughGraph &g,
|
|||||||
}
|
}
|
||||||
succs.insert(target(e, g));
|
succs.insert(target(e, g));
|
||||||
|
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
for (auto w : adjacent_vertices_range(v, g)) {
|
for (auto w : adjacent_vertices_range(v, g)) {
|
||||||
if (w != v && !verify_neighbour_no_block(g, w, blocks, succs)) {
|
if (w != v && !verify_neighbour_no_block(g, w, blocks, succs)) {
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@@ -145,7 +145,8 @@ void dump_var_mapping(const GoughGraph &g, const string &base,
|
|||||||
fprintf(f, "\tuses:");
|
fprintf(f, "\tuses:");
|
||||||
vector<u32> used_id;
|
vector<u32> used_id;
|
||||||
for (const GoughSSAVar *var : used) {
|
for (const GoughSSAVar *var : used) {
|
||||||
used_id.emplace_back(var->slot);
|
// cppcheck-suppress useStlAlgorithm
|
||||||
|
used_id.emplace_back(var->slot); //NOLINT (performance-inefficient-vector-operation)
|
||||||
}
|
}
|
||||||
for (const u32 &id : used_id) {
|
for (const u32 &id : used_id) {
|
||||||
fprintf(f, " %u", id);
|
fprintf(f, " %u", id);
|
||||||
@@ -167,7 +168,8 @@ void dump_var_mapping(const GoughGraph &g, const string &base,
|
|||||||
fprintf(f, "\tuses:");
|
fprintf(f, "\tuses:");
|
||||||
vector<u32> used_id;
|
vector<u32> used_id;
|
||||||
for (const GoughSSAVar *var : used) {
|
for (const GoughSSAVar *var : used) {
|
||||||
used_id.emplace_back(var->slot);
|
// cppcheck-suppress useStlAlgorithm
|
||||||
|
used_id.emplace_back(var->slot); //NOLINT (performance-inefficient-vector-operation)
|
||||||
}
|
}
|
||||||
for (const u32 &id : used_id) {
|
for (const u32 &id : used_id) {
|
||||||
fprintf(f, " %u", id);
|
fprintf(f, " %u", id);
|
||||||
|
|||||||
@@ -51,6 +51,7 @@ namespace ue2 {
|
|||||||
template<typename VarP, typename VarQ>
|
template<typename VarP, typename VarQ>
|
||||||
void emplace_back_all_raw(vector<VarP> *out, const vector<VarQ> &in) {
|
void emplace_back_all_raw(vector<VarP> *out, const vector<VarQ> &in) {
|
||||||
for (const auto &var : in) {
|
for (const auto &var : in) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
out->emplace_back(var.get());
|
out->emplace_back(var.get());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -194,7 +195,7 @@ void handle_pending_vars(GoughSSAVar *def, const GoughGraph &g,
|
|||||||
if (contains(aux.containing_v, var)) {
|
if (contains(aux.containing_v, var)) {
|
||||||
/* def is used by join vertex, value only needs to be live on some
|
/* def is used by join vertex, value only needs to be live on some
|
||||||
* incoming edges */
|
* incoming edges */
|
||||||
GoughSSAVarJoin *vj = (GoughSSAVarJoin *)var;
|
const GoughSSAVarJoin *vj = reinterpret_cast<const GoughSSAVarJoin *>(var);
|
||||||
const flat_set<GoughEdge> &live_edges
|
const flat_set<GoughEdge> &live_edges
|
||||||
= vj->get_edges_for_input(def);
|
= vj->get_edges_for_input(def);
|
||||||
for (const auto &e : live_edges) {
|
for (const auto &e : live_edges) {
|
||||||
@@ -278,7 +279,7 @@ set<const GoughSSAVar *> live_during(GoughSSAVar *def, const GoughGraph &g,
|
|||||||
|
|
||||||
template<typename VarP>
|
template<typename VarP>
|
||||||
void set_initial_slots(const vector<VarP> &vars, u32 *next_slot) {
|
void set_initial_slots(const vector<VarP> &vars, u32 *next_slot) {
|
||||||
for (auto &var : vars) {
|
for (const auto &var : vars) {
|
||||||
assert(var->slot == INVALID_SLOT);
|
assert(var->slot == INVALID_SLOT);
|
||||||
var->slot = (*next_slot)++;
|
var->slot = (*next_slot)++;
|
||||||
}
|
}
|
||||||
@@ -380,6 +381,7 @@ template<typename VarP>
|
|||||||
void add_to_dom_ordering(const vector<VarP> &vars,
|
void add_to_dom_ordering(const vector<VarP> &vars,
|
||||||
vector<GoughSSAVar *> *out) {
|
vector<GoughSSAVar *> *out) {
|
||||||
for (const auto &var : vars) {
|
for (const auto &var : vars) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
out->emplace_back(var.get());
|
out->emplace_back(var.get());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -438,7 +440,7 @@ void create_slot_mapping(const GoughGraph &cfg, UNUSED u32 old_slot_count,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
void update_local_slots(GoughGraph &g, set<GoughSSAVar *> &locals,
|
void update_local_slots(GoughGraph &g, const set<GoughSSAVar *> &locals,
|
||||||
u32 local_base) {
|
u32 local_base) {
|
||||||
DEBUG_PRINTF("%zu local variables\n", locals.size());
|
DEBUG_PRINTF("%zu local variables\n", locals.size());
|
||||||
/* local variables only occur on edges (joins are never local) */
|
/* local variables only occur on edges (joins are never local) */
|
||||||
|
|||||||
@@ -59,14 +59,14 @@ namespace ue2 {
|
|||||||
static
|
static
|
||||||
void goughGetTransitions(const NFA *n, u16 s, u16 *t) {
|
void goughGetTransitions(const NFA *n, u16 s, u16 *t) {
|
||||||
assert(isGoughType(n->type));
|
assert(isGoughType(n->type));
|
||||||
const mcclellan *m = (const mcclellan *)getImplNfa(n);
|
const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(n));
|
||||||
const mstate_aux *aux = getAux(n, s);
|
const mstate_aux *aux = getAux(n, s);
|
||||||
const u32 as = m->alphaShift;
|
const u32 as = m->alphaShift;
|
||||||
const char *sher_base
|
const char *sher_base
|
||||||
= (const char *)m - sizeof(struct NFA) + m->sherman_offset;
|
= reinterpret_cast<const char *>(m) - sizeof(struct NFA) + m->sherman_offset;
|
||||||
|
|
||||||
if (n->type == GOUGH_NFA_8) {
|
if (n->type == GOUGH_NFA_8) {
|
||||||
const u8 *succ_table = (const u8 *)((const char *)m + sizeof(mcclellan));
|
const u8 *succ_table = reinterpret_cast<const u8 *>(reinterpret_cast<const char *>(m) + sizeof(mcclellan));
|
||||||
for (u16 c = 0; c < N_CHARS; c++) {
|
for (u16 c = 0; c < N_CHARS; c++) {
|
||||||
t[c] = succ_table[((u32)s << as) + m->remap[c]];
|
t[c] = succ_table[((u32)s << as) + m->remap[c]];
|
||||||
}
|
}
|
||||||
@@ -76,14 +76,14 @@ void goughGetTransitions(const NFA *n, u16 s, u16 *t) {
|
|||||||
if (s >= m->sherman_limit) {
|
if (s >= m->sherman_limit) {
|
||||||
const char *state_base
|
const char *state_base
|
||||||
= findShermanState(m, sher_base, m->sherman_limit, s);
|
= findShermanState(m, sher_base, m->sherman_limit, s);
|
||||||
base_s = *(const u16 *)(state_base + SHERMAN_DADDY_OFFSET);
|
base_s = *(reinterpret_cast<const u16 *>(state_base + SHERMAN_DADDY_OFFSET));
|
||||||
}
|
}
|
||||||
|
|
||||||
const u16 *succ_table = (const u16 *)((const char *)m
|
const u16 *succ_table = reinterpret_cast<const u16 *>(reinterpret_cast<const char *>(m)
|
||||||
+ sizeof(mcclellan));
|
+ sizeof(mcclellan));
|
||||||
for (u16 c = 0; c < N_CHARS; c++) {
|
for (u16 c = 0; c < N_CHARS; c++) {
|
||||||
const u8 *addr
|
const u8 *addr
|
||||||
= (const u8*)(succ_table + (((u32)base_s << as) + m->remap[c]));
|
= reinterpret_cast<const u8*>(succ_table + (((u32)base_s << as) + m->remap[c]));
|
||||||
t[c] = unaligned_load_u16(addr);
|
t[c] = unaligned_load_u16(addr);
|
||||||
t[c] &= STATE_MASK;
|
t[c] &= STATE_MASK;
|
||||||
}
|
}
|
||||||
@@ -91,15 +91,15 @@ void goughGetTransitions(const NFA *n, u16 s, u16 *t) {
|
|||||||
if (s >= m->sherman_limit) {
|
if (s >= m->sherman_limit) {
|
||||||
const char *state_base
|
const char *state_base
|
||||||
= findShermanState(m, sher_base, m->sherman_limit, s);
|
= findShermanState(m, sher_base, m->sherman_limit, s);
|
||||||
u8 len = *(const u8 *)(SHERMAN_LEN_OFFSET + state_base);
|
u8 len = *(reinterpret_cast<const u8 *>(SHERMAN_LEN_OFFSET + state_base));
|
||||||
const u8 *chars = (const u8 *)state_base + SHERMAN_CHARS_OFFSET;
|
const u8 *chars = reinterpret_cast<const u8 *>(state_base) + SHERMAN_CHARS_OFFSET;
|
||||||
const u16 *states
|
const u16 *states
|
||||||
= (const u16 *)(state_base + SHERMAN_STATES_OFFSET(len));
|
= reinterpret_cast<const u16 *>(state_base + SHERMAN_STATES_OFFSET(len));
|
||||||
|
|
||||||
for (u8 i = 0; i < len; i++) {
|
for (u8 i = 0; i < len; i++) {
|
||||||
for (u16 c = 0; c < N_CHARS; c++) {
|
for (u16 c = 0; c < N_CHARS; c++) {
|
||||||
if (m->remap[c] != chars[i]) {
|
if (m->remap[c] != chars[i]) {
|
||||||
t[c] = unaligned_load_u16((const u8*)&states[i])
|
t[c] = unaligned_load_u16(reinterpret_cast<const u8*>(&states[i]))
|
||||||
& STATE_MASK;
|
& STATE_MASK;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -116,14 +116,14 @@ void describeNode(const NFA *n, const mcclellan *m, u16 i, FILE *f) {
|
|||||||
|
|
||||||
bool isSherman = m->sherman_limit && i >= m->sherman_limit;
|
bool isSherman = m->sherman_limit && i >= m->sherman_limit;
|
||||||
const char *sher_base
|
const char *sher_base
|
||||||
= (const char *)m - sizeof(NFA) + m->sherman_offset;
|
= reinterpret_cast<const char *>(m) - sizeof(NFA) + m->sherman_offset;
|
||||||
|
|
||||||
fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
|
fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
|
||||||
"label = \"%u%s\" ]; \n", i, i, isSherman ? "w":"");
|
"label = \"%u%s\" ]; \n", i, i, isSherman ? "w":"");
|
||||||
|
|
||||||
if (aux->accel_offset) {
|
if (aux->accel_offset) {
|
||||||
dumpAccelDot(f, i,
|
dumpAccelDot(f, i,
|
||||||
&((const gough_accel *)((const char *)m + aux->accel_offset))->accel);
|
&(reinterpret_cast<const gough_accel *>(reinterpret_cast<const char *>(m) + aux->accel_offset))->accel);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (aux->accept_eod) {
|
if (aux->accept_eod) {
|
||||||
@@ -151,7 +151,7 @@ void describeNode(const NFA *n, const mcclellan *m, u16 i, FILE *f) {
|
|||||||
const char *sherman_state
|
const char *sherman_state
|
||||||
= findShermanState(m, sher_base, m->sherman_limit, i);
|
= findShermanState(m, sher_base, m->sherman_limit, i);
|
||||||
fprintf(f, "%u [ fillcolor = lightblue style=filled ];\n", i);
|
fprintf(f, "%u [ fillcolor = lightblue style=filled ];\n", i);
|
||||||
u16 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET);
|
u16 daddy = *(reinterpret_cast<const u16 *>(sherman_state + SHERMAN_DADDY_OFFSET));
|
||||||
if (daddy) {
|
if (daddy) {
|
||||||
fprintf(f, "%u -> %u [ color=royalblue style=dashed weight=0.1]\n",
|
fprintf(f, "%u -> %u [ color=royalblue style=dashed weight=0.1]\n",
|
||||||
i, daddy);
|
i, daddy);
|
||||||
@@ -197,7 +197,7 @@ void dump_programs(FILE *f, const NFA *nfa,
|
|||||||
for (set<pair<pair<u32, u32>, u32 > >::const_iterator it
|
for (set<pair<pair<u32, u32>, u32 > >::const_iterator it
|
||||||
= prog_dump.begin(); it != prog_dump.end(); ++it) {
|
= prog_dump.begin(); it != prog_dump.end(); ++it) {
|
||||||
assert(it->second);
|
assert(it->second);
|
||||||
const gough_ins *p = (const gough_ins *)((const u8 *)nfa + it->second);
|
const gough_ins *p = reinterpret_cast<const gough_ins *>(reinterpret_cast<const u8 *>(nfa) + it->second);
|
||||||
dump_program(f, it->first, p);
|
dump_program(f, it->first, p);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -205,17 +205,17 @@ void dump_programs(FILE *f, const NFA *nfa,
|
|||||||
static
|
static
|
||||||
void dumpTransitions(const NFA *nfa, FILE *f,
|
void dumpTransitions(const NFA *nfa, FILE *f,
|
||||||
set<pair<pair<u32, u32>, u32 > > *prog_dump) {
|
set<pair<pair<u32, u32>, u32 > > *prog_dump) {
|
||||||
const mcclellan *m = (const mcclellan *)getImplNfa(nfa);
|
const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(nfa));
|
||||||
const gough_info *g = get_gough(m);
|
const gough_info *g = get_gough(m);
|
||||||
u32 alphaSize = 1U << m->alphaShift;
|
u32 alphaSize = 1U << m->alphaShift;
|
||||||
const u32 *prog_offset_table = (const u32 *)(g + 1);
|
const u32 *prog_offset_table = reinterpret_cast<const u32 *>(g + 1);
|
||||||
|
|
||||||
for (u16 i = 0; i < m->state_count; i++) {
|
for (u16 i = 0; i < m->state_count; i++) {
|
||||||
fprintf(f, "%05hu", i);
|
fprintf(f, "%05hu", i);
|
||||||
const mstate_aux *aux = getAux(nfa, i);
|
const mstate_aux *aux = getAux(nfa, i);
|
||||||
|
|
||||||
if (aux->accel_offset) {
|
if (aux->accel_offset) {
|
||||||
dumpAccelText(f, (const union AccelAux *)((const char *)m +
|
dumpAccelText(f, reinterpret_cast<const union AccelAux *>(reinterpret_cast<const char *>(m) +
|
||||||
aux->accel_offset));
|
aux->accel_offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -263,7 +263,7 @@ void dumpTransitions(const NFA *nfa, FILE *f,
|
|||||||
static
|
static
|
||||||
void nfaExecGough8_dumpDot(const struct NFA *nfa, FILE *f) {
|
void nfaExecGough8_dumpDot(const struct NFA *nfa, FILE *f) {
|
||||||
assert(nfa->type == GOUGH_NFA_8);
|
assert(nfa->type == GOUGH_NFA_8);
|
||||||
const mcclellan *m = (const mcclellan *)getImplNfa(nfa);
|
const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(nfa));
|
||||||
|
|
||||||
dumpDotPreambleDfa(f);
|
dumpDotPreambleDfa(f);
|
||||||
|
|
||||||
@@ -284,7 +284,7 @@ static
|
|||||||
void nfaExecGough8_dumpText(const struct NFA *nfa, FILE *f) {
|
void nfaExecGough8_dumpText(const struct NFA *nfa, FILE *f) {
|
||||||
|
|
||||||
assert(nfa->type == GOUGH_NFA_8);
|
assert(nfa->type == GOUGH_NFA_8);
|
||||||
const mcclellan *m = (const mcclellan *)getImplNfa(nfa);
|
const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(nfa));
|
||||||
|
|
||||||
fprintf(f, "gough 8\n");
|
fprintf(f, "gough 8\n");
|
||||||
fprintf(f, "report: %u, states %u, length %u\n", m->arb_report,
|
fprintf(f, "report: %u, states %u, length %u\n", m->arb_report,
|
||||||
@@ -308,7 +308,7 @@ void nfaExecGough8_dumpText(const struct NFA *nfa, FILE *f) {
|
|||||||
static
|
static
|
||||||
void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f) {
|
void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f) {
|
||||||
assert(nfa->type == GOUGH_NFA_16);
|
assert(nfa->type == GOUGH_NFA_16);
|
||||||
const mcclellan *m = (const mcclellan *)getImplNfa(nfa);
|
const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(nfa));
|
||||||
|
|
||||||
dumpDotPreambleDfa(f);
|
dumpDotPreambleDfa(f);
|
||||||
|
|
||||||
@@ -328,7 +328,7 @@ void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f) {
|
|||||||
static
|
static
|
||||||
void nfaExecGough16_dumpText(const struct NFA *nfa, FILE *f) {
|
void nfaExecGough16_dumpText(const struct NFA *nfa, FILE *f) {
|
||||||
assert(nfa->type == GOUGH_NFA_16);
|
assert(nfa->type == GOUGH_NFA_16);
|
||||||
const mcclellan *m = (const mcclellan *)getImplNfa(nfa);
|
const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(nfa));
|
||||||
// const gough_info *h = get_gough(m);
|
// const gough_info *h = get_gough(m);
|
||||||
|
|
||||||
fprintf(f, "gough 16\n");
|
fprintf(f, "gough 16\n");
|
||||||
@@ -336,7 +336,7 @@ void nfaExecGough16_dumpText(const struct NFA *nfa, FILE *f) {
|
|||||||
m->state_count, m->length);
|
m->state_count, m->length);
|
||||||
fprintf(f, "astart: %hu, fstart: %hu\n", m->start_anchored,
|
fprintf(f, "astart: %hu, fstart: %hu\n", m->start_anchored,
|
||||||
m->start_floating);
|
m->start_floating);
|
||||||
fprintf(f, "single accept: %d\n", !!(int)m->flags & MCCLELLAN_FLAG_SINGLE);
|
fprintf(f, "single accept: %d\n", !!(m->flags & MCCLELLAN_FLAG_SINGLE));
|
||||||
fprintf(f, "sherman_limit: %u, sherman_end: %u\n", m->sherman_limit,
|
fprintf(f, "sherman_limit: %u, sherman_end: %u\n", m->sherman_limit,
|
||||||
m->sherman_end);
|
m->sherman_end);
|
||||||
|
|
||||||
|
|||||||
@@ -307,7 +307,7 @@ char lbrMatchLoop(const struct lbr_common *l, const u64a begin, const u64a end,
|
|||||||
static really_inline
|
static really_inline
|
||||||
char lbrRevScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf,
|
char lbrRevScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf,
|
||||||
UNUSED size_t begin, UNUSED size_t end,
|
UNUSED size_t begin, UNUSED size_t end,
|
||||||
UNUSED size_t *loc) {
|
UNUSED const size_t *loc) {
|
||||||
assert(begin <= end);
|
assert(begin <= end);
|
||||||
assert(nfa->type == LBR_NFA_DOT);
|
assert(nfa->type == LBR_NFA_DOT);
|
||||||
// Nothing can kill a dot!
|
// Nothing can kill a dot!
|
||||||
@@ -413,7 +413,7 @@ char lbrRevScanTruf(const struct NFA *nfa, const u8 *buf,
|
|||||||
static really_inline
|
static really_inline
|
||||||
char lbrFwdScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf,
|
char lbrFwdScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf,
|
||||||
UNUSED size_t begin, UNUSED size_t end,
|
UNUSED size_t begin, UNUSED size_t end,
|
||||||
UNUSED size_t *loc) {
|
UNUSED const size_t *loc) {
|
||||||
assert(begin <= end);
|
assert(begin <= end);
|
||||||
assert(nfa->type == LBR_NFA_DOT);
|
assert(nfa->type == LBR_NFA_DOT);
|
||||||
// Nothing can kill a dot!
|
// Nothing can kill a dot!
|
||||||
|
|||||||
@@ -180,7 +180,7 @@ found_top:;
|
|||||||
|
|
||||||
u64a ep = MIN(MIN(end, (s64a)q->length) + offset, first_match);
|
u64a ep = MIN(MIN(end, (s64a)q->length) + offset, first_match);
|
||||||
if (ep > sp && sp >= offset) {
|
if (ep > sp && sp >= offset) {
|
||||||
size_t eloc;
|
size_t eloc = 0;
|
||||||
DEBUG_PRINTF("rev b%llu e%llu/%zu\n", sp - offset, ep - offset,
|
DEBUG_PRINTF("rev b%llu e%llu/%zu\n", sp - offset, ep - offset,
|
||||||
q->length);
|
q->length);
|
||||||
assert(ep - offset <= q->length);
|
assert(ep - offset <= q->length);
|
||||||
@@ -279,6 +279,7 @@ char JOIN(ENGINE_EXEC_NAME, _Q_i)(const struct NFA *nfa, struct mq *q,
|
|||||||
assert(rv == MO_CONTINUE_MATCHING);
|
assert(rv == MO_CONTINUE_MATCHING);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (escape_found) {
|
if (escape_found) {
|
||||||
DEBUG_PRINTF("clearing repeat due to escape\n");
|
DEBUG_PRINTF("clearing repeat due to escape\n");
|
||||||
clearRepeat(info, lstate);
|
clearRepeat(info, lstate);
|
||||||
@@ -355,6 +356,7 @@ void JOIN(ENGINE_EXEC_NAME, _StreamSilent)(const struct NFA *nfa, struct mq *q,
|
|||||||
|
|
||||||
size_t eloc = 0;
|
size_t eloc = 0;
|
||||||
char escaped = FWDSCAN_FN(nfa, buf, 0, length, &eloc);
|
char escaped = FWDSCAN_FN(nfa, buf, 0, length, &eloc);
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (escaped) {
|
if (escaped) {
|
||||||
assert(eloc < length);
|
assert(eloc < length);
|
||||||
DEBUG_PRINTF("escape found at %zu, clearing repeat\n", eloc);
|
DEBUG_PRINTF("escape found at %zu, clearing repeat\n", eloc);
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ namespace ue2 {
|
|||||||
static
|
static
|
||||||
void lbrDumpCommon(const lbr_common *lc, FILE *f) {
|
void lbrDumpCommon(const lbr_common *lc, FILE *f) {
|
||||||
const RepeatInfo *info
|
const RepeatInfo *info
|
||||||
= (const RepeatInfo *)((const char *)lc + lc->repeatInfoOffset);
|
= reinterpret_cast<const RepeatInfo *>(reinterpret_cast<const char *>(lc) + lc->repeatInfoOffset);
|
||||||
fprintf(f, "Limited Bounded Repeat\n");
|
fprintf(f, "Limited Bounded Repeat\n");
|
||||||
fprintf(f, "\n");
|
fprintf(f, "\n");
|
||||||
fprintf(f, "repeat model: %s\n", repeatTypeName(info->type));
|
fprintf(f, "repeat model: %s\n", repeatTypeName(info->type));
|
||||||
@@ -70,7 +70,7 @@ void lbrDumpCommon(const lbr_common *lc, FILE *f) {
|
|||||||
void nfaExecLbrDot_dump(const NFA *nfa, const string &base) {
|
void nfaExecLbrDot_dump(const NFA *nfa, const string &base) {
|
||||||
assert(nfa);
|
assert(nfa);
|
||||||
assert(nfa->type == LBR_NFA_DOT);
|
assert(nfa->type == LBR_NFA_DOT);
|
||||||
const lbr_dot *ld = (const lbr_dot *)getImplNfa(nfa);
|
const lbr_dot *ld = reinterpret_cast<const lbr_dot *>(getImplNfa(nfa));
|
||||||
StdioFile f(base + ".txt", "w");
|
StdioFile f(base + ".txt", "w");
|
||||||
lbrDumpCommon(&ld->common, f);
|
lbrDumpCommon(&ld->common, f);
|
||||||
fprintf(f, "DOT model\n");
|
fprintf(f, "DOT model\n");
|
||||||
@@ -81,7 +81,7 @@ void nfaExecLbrDot_dump(const NFA *nfa, const string &base) {
|
|||||||
void nfaExecLbrVerm_dump(const NFA *nfa, const string &base) {
|
void nfaExecLbrVerm_dump(const NFA *nfa, const string &base) {
|
||||||
assert(nfa);
|
assert(nfa);
|
||||||
assert(nfa->type == LBR_NFA_VERM);
|
assert(nfa->type == LBR_NFA_VERM);
|
||||||
const lbr_verm *lv = (const lbr_verm *)getImplNfa(nfa);
|
const lbr_verm *lv = reinterpret_cast<const lbr_verm *>(getImplNfa(nfa));
|
||||||
StdioFile f(base + ".txt", "w");
|
StdioFile f(base + ".txt", "w");
|
||||||
lbrDumpCommon(&lv->common, f);
|
lbrDumpCommon(&lv->common, f);
|
||||||
fprintf(f, "VERM model, scanning for 0x%02x\n", lv->c);
|
fprintf(f, "VERM model, scanning for 0x%02x\n", lv->c);
|
||||||
@@ -92,7 +92,7 @@ void nfaExecLbrVerm_dump(const NFA *nfa, const string &base) {
|
|||||||
void nfaExecLbrNVerm_dump(const NFA *nfa, const string &base) {
|
void nfaExecLbrNVerm_dump(const NFA *nfa, const string &base) {
|
||||||
assert(nfa);
|
assert(nfa);
|
||||||
assert(nfa->type == LBR_NFA_NVERM);
|
assert(nfa->type == LBR_NFA_NVERM);
|
||||||
const lbr_verm *lv = (const lbr_verm *)getImplNfa(nfa);
|
const lbr_verm *lv = reinterpret_cast<const lbr_verm *>(getImplNfa(nfa));
|
||||||
StdioFile f(base + ".txt", "w");
|
StdioFile f(base + ".txt", "w");
|
||||||
lbrDumpCommon(&lv->common, f);
|
lbrDumpCommon(&lv->common, f);
|
||||||
fprintf(f, "NEGATED VERM model, scanning for 0x%02x\n", lv->c);
|
fprintf(f, "NEGATED VERM model, scanning for 0x%02x\n", lv->c);
|
||||||
@@ -106,11 +106,11 @@ void nfaExecLbrShuf_dump(const NFA *nfa, const string &base) {
|
|||||||
|
|
||||||
StdioFile f(base + ".txt", "w");
|
StdioFile f(base + ".txt", "w");
|
||||||
|
|
||||||
const lbr_shuf *ls = (const lbr_shuf *)getImplNfa(nfa);
|
const lbr_shuf *ls = reinterpret_cast<const lbr_shuf *>(getImplNfa(nfa));
|
||||||
lbrDumpCommon(&ls->common, f);
|
lbrDumpCommon(&ls->common, f);
|
||||||
|
|
||||||
CharReach cr = shufti2cr((const u8 *)&ls->mask_lo,
|
CharReach cr = shufti2cr(reinterpret_cast<const u8 *>(&ls->mask_lo),
|
||||||
(const u8 *)&ls->mask_hi);
|
reinterpret_cast<const u8 *>(&ls->mask_hi));
|
||||||
fprintf(f, "SHUF model, scanning for: %s (%zu chars)\n",
|
fprintf(f, "SHUF model, scanning for: %s (%zu chars)\n",
|
||||||
describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count());
|
describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count());
|
||||||
fprintf(f, "\n");
|
fprintf(f, "\n");
|
||||||
@@ -123,11 +123,11 @@ void nfaExecLbrTruf_dump(const NFA *nfa, const string &base) {
|
|||||||
|
|
||||||
StdioFile f(base + ".txt", "w");
|
StdioFile f(base + ".txt", "w");
|
||||||
|
|
||||||
const lbr_truf *lt = (const lbr_truf *)getImplNfa(nfa);
|
const lbr_truf *lt = reinterpret_cast<const lbr_truf *>(getImplNfa(nfa));
|
||||||
lbrDumpCommon(<->common, f);
|
lbrDumpCommon(<->common, f);
|
||||||
|
|
||||||
CharReach cr = truffle2cr((const u8 *)<->mask1,
|
CharReach cr = truffle2cr(reinterpret_cast<const u8 *>(<->mask1),
|
||||||
(const u8 *)<->mask2);
|
reinterpret_cast<const u8 *>(<->mask2));
|
||||||
fprintf(f, "TRUFFLE model, scanning for: %s (%zu chars)\n",
|
fprintf(f, "TRUFFLE model, scanning for: %s (%zu chars)\n",
|
||||||
describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count());
|
describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count());
|
||||||
fprintf(f, "\n");
|
fprintf(f, "\n");
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ extern "C"
|
|||||||
char gf_name##_Q(const struct NFA *n, struct mq *q, s64a end); \
|
char gf_name##_Q(const struct NFA *n, struct mq *q, s64a end); \
|
||||||
char gf_name##_Q2(const struct NFA *n, struct mq *q, s64a end); \
|
char gf_name##_Q2(const struct NFA *n, struct mq *q, s64a end); \
|
||||||
char gf_name##_QR(const struct NFA *n, struct mq *q, ReportID report); \
|
char gf_name##_QR(const struct NFA *n, struct mq *q, ReportID report); \
|
||||||
char gf_name##_reportCurrent(const struct NFA *n, struct mq *q); \
|
char gf_name##_reportCurrent(const struct NFA *n, const struct mq *q); \
|
||||||
char gf_name##_inAccept(const struct NFA *n, ReportID report, \
|
char gf_name##_inAccept(const struct NFA *n, ReportID report, \
|
||||||
struct mq *q); \
|
struct mq *q); \
|
||||||
char gf_name##_inAnyAccept(const struct NFA *n, struct mq *q); \
|
char gf_name##_inAnyAccept(const struct NFA *n, struct mq *q); \
|
||||||
|
|||||||
@@ -125,6 +125,7 @@ char PROCESS_ACCEPTS_IMPL_FN(const IMPL_NFA_T *limex, const STATE_T *s,
|
|||||||
const STATE_T accept_mask = *acceptMask;
|
const STATE_T accept_mask = *acceptMask;
|
||||||
STATE_T accepts = AND_STATE(*s, accept_mask);
|
STATE_T accepts = AND_STATE(*s, accept_mask);
|
||||||
|
|
||||||
|
DEBUG_PRINTF("sizeof(STATE_T): %ld, sizeof(CHUNK_T): %ld, NUM_STATE_CHUNKS: %ld\n", sizeof(STATE_T), sizeof(CHUNK_T), NUM_STATE_CHUNKS);
|
||||||
// Caller must ensure that we have at least one accept state on.
|
// Caller must ensure that we have at least one accept state on.
|
||||||
assert(ISNONZERO_STATE(accepts));
|
assert(ISNONZERO_STATE(accepts));
|
||||||
|
|
||||||
@@ -135,6 +136,7 @@ char PROCESS_ACCEPTS_IMPL_FN(const IMPL_NFA_T *limex, const STATE_T *s,
|
|||||||
memcpy(mask_chunks, &accept_mask, sizeof(accept_mask));
|
memcpy(mask_chunks, &accept_mask, sizeof(accept_mask));
|
||||||
|
|
||||||
u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk.
|
u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk.
|
||||||
|
// cppcheck-suppress unsignedLessThanZero
|
||||||
for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) {
|
for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) {
|
||||||
CHUNK_T chunk = chunks[i];
|
CHUNK_T chunk = chunks[i];
|
||||||
while (chunk != 0) {
|
while (chunk != 0) {
|
||||||
@@ -332,7 +334,7 @@ void EXPIRE_ESTATE_FN(const IMPL_NFA_T *limex, struct CONTEXT_T *ctx,
|
|||||||
// UE-1636) need to guard cyclic tug-accepts as well.
|
// UE-1636) need to guard cyclic tug-accepts as well.
|
||||||
static really_inline
|
static really_inline
|
||||||
char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
|
char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
|
||||||
union RepeatControl *repeat_ctrl, char *repeat_state,
|
const union RepeatControl *repeat_ctrl, const char *repeat_state,
|
||||||
u64a offset, ReportID report) {
|
u64a offset, ReportID report) {
|
||||||
assert(limex);
|
assert(limex);
|
||||||
|
|
||||||
@@ -358,6 +360,7 @@ char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
|
|||||||
memcpy(mask_chunks, &accept_mask, sizeof(accept_mask));
|
memcpy(mask_chunks, &accept_mask, sizeof(accept_mask));
|
||||||
|
|
||||||
u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk.
|
u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk.
|
||||||
|
// cppcheck-suppress unsignedLessThanZero
|
||||||
for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) {
|
for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) {
|
||||||
CHUNK_T chunk = chunks[i];
|
CHUNK_T chunk = chunks[i];
|
||||||
while (chunk != 0) {
|
while (chunk != 0) {
|
||||||
@@ -382,7 +385,7 @@ char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
|
|||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
|
char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
|
||||||
union RepeatControl *repeat_ctrl, char *repeat_state,
|
const union RepeatControl *repeat_ctrl, const char *repeat_state,
|
||||||
u64a offset) {
|
u64a offset) {
|
||||||
assert(limex);
|
assert(limex);
|
||||||
|
|
||||||
|
|||||||
@@ -140,6 +140,7 @@ reindexByStateId(const unordered_map<NFAVertex, NFAStateSet> &in,
|
|||||||
for (size_t i = m.second.find_first(); i != m.second.npos;
|
for (size_t i = m.second.find_first(); i != m.second.npos;
|
||||||
i = m.second.find_next(i)) {
|
i = m.second.find_next(i)) {
|
||||||
u32 state_id = indexToState[i];
|
u32 state_id = indexToState[i];
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (state_id == NO_STATE) {
|
if (state_id == NO_STATE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -269,7 +270,7 @@ void maskClear(Mask &m) {
|
|||||||
template<class Mask>
|
template<class Mask>
|
||||||
u8 *maskGetByte(Mask &m, u32 bit) {
|
u8 *maskGetByte(Mask &m, u32 bit) {
|
||||||
assert(bit < sizeof(m)*8);
|
assert(bit < sizeof(m)*8);
|
||||||
u8 *m8 = (u8 *)&m;
|
u8 *m8 = reinterpret_cast<u8 *>(&m);
|
||||||
|
|
||||||
return m8 + bit/8;
|
return m8 + bit/8;
|
||||||
}
|
}
|
||||||
@@ -290,7 +291,7 @@ void maskSetBits(Mask &m, const NFAStateSet &bits) {
|
|||||||
|
|
||||||
template<class Mask>
|
template<class Mask>
|
||||||
bool isMaskZero(Mask &m) {
|
bool isMaskZero(Mask &m) {
|
||||||
u8 *m8 = (u8 *)&m;
|
const u8 *m8 = reinterpret_cast<u8 *>(&m);
|
||||||
for (u32 i = 0; i < sizeof(m); i++) {
|
for (u32 i = 0; i < sizeof(m); i++) {
|
||||||
if (m8[i]) {
|
if (m8[i]) {
|
||||||
return false;
|
return false;
|
||||||
@@ -303,7 +304,7 @@ bool isMaskZero(Mask &m) {
|
|||||||
template<class Mask>
|
template<class Mask>
|
||||||
void maskSetByte(Mask &m, const unsigned int idx, const char val) {
|
void maskSetByte(Mask &m, const unsigned int idx, const char val) {
|
||||||
assert(idx < sizeof(m));
|
assert(idx < sizeof(m));
|
||||||
char *m8 = (char *)&m;
|
char *m8 = reinterpret_cast<char *>(&m);
|
||||||
char &byte = m8[idx];
|
char &byte = m8[idx];
|
||||||
byte = val;
|
byte = val;
|
||||||
}
|
}
|
||||||
@@ -329,11 +330,12 @@ void buildReachMapping(const build_info &args, vector<NFAStateSet> &reach,
|
|||||||
// Build a list of vertices with a state index assigned.
|
// Build a list of vertices with a state index assigned.
|
||||||
vector<NFAVertex> verts;
|
vector<NFAVertex> verts;
|
||||||
verts.reserve(args.num_states);
|
verts.reserve(args.num_states);
|
||||||
for (auto v : vertices_range(h)) {
|
auto sidat = [&state_ids=state_ids](const NFAVertex &v) {
|
||||||
if (state_ids.at(v) != NO_STATE) {
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
verts.emplace_back(v);
|
return (state_ids.at(v) != NO_STATE);
|
||||||
}
|
};
|
||||||
}
|
const auto &vr = vertices_range(h);
|
||||||
|
std::copy_if(begin(vr), end(vr), std::back_inserter(verts), sidat);
|
||||||
|
|
||||||
// Build a mapping from set-of-states -> reachability.
|
// Build a mapping from set-of-states -> reachability.
|
||||||
map<NFAStateSet, CharReach> mapping;
|
map<NFAStateSet, CharReach> mapping;
|
||||||
@@ -482,6 +484,7 @@ bool allow_wide_accel(NFAVertex v, const NGHolder &g, NFAVertex sds_or_proxy) {
|
|||||||
static
|
static
|
||||||
bool allow_wide_accel(const vector<NFAVertex> &vv, const NGHolder &g,
|
bool allow_wide_accel(const vector<NFAVertex> &vv, const NGHolder &g,
|
||||||
NFAVertex sds_or_proxy) {
|
NFAVertex sds_or_proxy) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
for (auto v : vv) {
|
for (auto v : vv) {
|
||||||
if (allow_wide_accel(v, g, sds_or_proxy)) {
|
if (allow_wide_accel(v, g, sds_or_proxy)) {
|
||||||
return true;
|
return true;
|
||||||
@@ -555,7 +558,8 @@ void filterAccelStates(NGHolder &g, const map<u32, set<NFAVertex>> &tops,
|
|||||||
|
|
||||||
// Similarly, connect (start, startDs) if necessary.
|
// Similarly, connect (start, startDs) if necessary.
|
||||||
if (!edge(g.start, g.startDs, g).second) {
|
if (!edge(g.start, g.startDs, g).second) {
|
||||||
NFAEdge e = add_edge(g.start, g.startDs, g);
|
NFAEdge e;
|
||||||
|
std::tie(e, std::ignore) = add_edge(g.start, g.startDs, g);
|
||||||
tempEdges.emplace_back(e); // Remove edge later.
|
tempEdges.emplace_back(e); // Remove edge later.
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -584,6 +588,7 @@ bool containsBadSubset(const limex_accel_info &accel,
|
|||||||
subset = state_set;
|
subset = state_set;
|
||||||
subset.reset(j);
|
subset.reset(j);
|
||||||
|
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (effective_sds != NO_STATE && subset.count() == 1 &&
|
if (effective_sds != NO_STATE && subset.count() == 1 &&
|
||||||
subset.test(effective_sds)) {
|
subset.test(effective_sds)) {
|
||||||
continue;
|
continue;
|
||||||
@@ -623,7 +628,8 @@ void fillAccelInfo(build_info &bi) {
|
|||||||
|
|
||||||
vector<NFAVertex> astates;
|
vector<NFAVertex> astates;
|
||||||
for (const auto &m : accel_map) {
|
for (const auto &m : accel_map) {
|
||||||
astates.emplace_back(m.first);
|
// cppcheck-suppress useStlAlgorithm
|
||||||
|
astates.emplace_back(m.first); //NOLINT (performance-inefficient-vector-operation)
|
||||||
}
|
}
|
||||||
|
|
||||||
NFAStateSet useful(num_states);
|
NFAStateSet useful(num_states);
|
||||||
@@ -799,12 +805,14 @@ u32 getEffectiveAccelStates(const build_info &args,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
for (const auto &s_mask : args.squashMap | map_values) {
|
for (const auto &s_mask : args.squashMap | map_values) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
if (!s_mask.test(state_id)) {
|
if (!s_mask.test(state_id)) {
|
||||||
may_turn_off |= 1U << accel_id;
|
may_turn_off |= 1U << accel_id;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (const auto &s_mask : args.reportSquashMap | map_values) {
|
for (const auto &s_mask : args.reportSquashMap | map_values) {
|
||||||
|
// cppcheck-suppress useStlAlgorithm
|
||||||
if (!s_mask.test(state_id)) {
|
if (!s_mask.test(state_id)) {
|
||||||
may_turn_off |= 1U << accel_id;
|
may_turn_off |= 1U << accel_id;
|
||||||
break;
|
break;
|
||||||
@@ -914,11 +922,13 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask,
|
|||||||
|
|
||||||
// Start with the NONE case.
|
// Start with the NONE case.
|
||||||
auxvec.emplace_back(AccelAux());
|
auxvec.emplace_back(AccelAux());
|
||||||
|
// cppcheck-suppress memsetClassFloat
|
||||||
memset(&auxvec[0], 0, sizeof(AccelAux));
|
memset(&auxvec[0], 0, sizeof(AccelAux));
|
||||||
auxvec[0].accel_type = ACCEL_NONE; // no states on.
|
auxvec[0].accel_type = ACCEL_NONE; // no states on.
|
||||||
|
|
||||||
AccelAux aux;
|
AccelAux aux;
|
||||||
for (u32 i = 1; i < accelCount; i++) {
|
for (u32 i = 1; i < accelCount; i++) {
|
||||||
|
// cppcheck-suppress memsetClassFloat
|
||||||
memset(&aux, 0, sizeof(aux));
|
memset(&aux, 0, sizeof(aux));
|
||||||
|
|
||||||
NFAStateSet effective_states(args.num_states);
|
NFAStateSet effective_states(args.num_states);
|
||||||
@@ -1064,7 +1074,7 @@ void buildAcceptsList(const build_info &args, ReportListCache &reports_cache,
|
|||||||
a.reports = addReports(h[v].reports, reports, reports_cache);
|
a.reports = addReports(h[v].reports, reports, reports_cache);
|
||||||
}
|
}
|
||||||
a.squash = addSquashMask(args, v, squash);
|
a.squash = addSquashMask(args, v, squash);
|
||||||
accepts.emplace_back(std::move(a));
|
accepts.emplace_back(a);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1083,6 +1093,7 @@ void buildAccepts(const build_info &args, ReportListCache &reports_cache,
|
|||||||
for (auto v : vertices_range(h)) {
|
for (auto v : vertices_range(h)) {
|
||||||
u32 state_id = args.state_ids.at(v);
|
u32 state_id = args.state_ids.at(v);
|
||||||
|
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (state_id == NO_STATE || !is_match_vertex(v, h)) {
|
if (state_id == NO_STATE || !is_match_vertex(v, h)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1142,6 +1153,7 @@ u32 compressedStateSize(const NGHolder &h, const NFAStateSet &maskedStates,
|
|||||||
|
|
||||||
for (auto v : vertices_range(h)) {
|
for (auto v : vertices_range(h)) {
|
||||||
u32 i = state_ids.at(v);
|
u32 i = state_ids.at(v);
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (i == NO_STATE || maskedStates.test(i)) {
|
if (i == NO_STATE || maskedStates.test(i)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1167,6 +1179,7 @@ bool hasSquashableInitDs(const build_info &args) {
|
|||||||
|
|
||||||
NFAStateSet initDs(args.num_states);
|
NFAStateSet initDs(args.num_states);
|
||||||
u32 sds_state = args.state_ids.at(h.startDs);
|
u32 sds_state = args.state_ids.at(h.startDs);
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (sds_state == NO_STATE) {
|
if (sds_state == NO_STATE) {
|
||||||
DEBUG_PRINTF("no states in initds\n");
|
DEBUG_PRINTF("no states in initds\n");
|
||||||
return false;
|
return false;
|
||||||
@@ -1208,10 +1221,11 @@ bool hasSquashableInitDs(const build_info &args) {
|
|||||||
static
|
static
|
||||||
bool hasInitDsStates(const NGHolder &h,
|
bool hasInitDsStates(const NGHolder &h,
|
||||||
const unordered_map<NFAVertex, u32> &state_ids) {
|
const unordered_map<NFAVertex, u32> &state_ids) {
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (state_ids.at(h.startDs) != NO_STATE) {
|
if (state_ids.at(h.startDs) != NO_STATE) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (is_triggered(h) && state_ids.at(h.start) != NO_STATE) {
|
if (is_triggered(h) && state_ids.at(h.start) != NO_STATE) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -1227,6 +1241,7 @@ void findMaskedCompressionStates(const build_info &args,
|
|||||||
// Rose leftfixes can mask out initds, which is worth doing if it will
|
// Rose leftfixes can mask out initds, which is worth doing if it will
|
||||||
// stay on forever (i.e. it's not squashable).
|
// stay on forever (i.e. it's not squashable).
|
||||||
u32 sds_i = args.state_ids.at(h.startDs);
|
u32 sds_i = args.state_ids.at(h.startDs);
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (sds_i != NO_STATE && !hasSquashableInitDs(args)) {
|
if (sds_i != NO_STATE && !hasSquashableInitDs(args)) {
|
||||||
maskedStates.set(sds_i);
|
maskedStates.set(sds_i);
|
||||||
DEBUG_PRINTF("masking out initds state\n");
|
DEBUG_PRINTF("masking out initds state\n");
|
||||||
@@ -1242,6 +1257,7 @@ void findMaskedCompressionStates(const build_info &args,
|
|||||||
for (const auto &e : edges_range(h)) {
|
for (const auto &e : edges_range(h)) {
|
||||||
u32 from = args.state_ids.at(source(e, h));
|
u32 from = args.state_ids.at(source(e, h));
|
||||||
u32 to = args.state_ids.at(target(e, h));
|
u32 to = args.state_ids.at(target(e, h));
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (from == NO_STATE) {
|
if (from == NO_STATE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1249,6 +1265,7 @@ void findMaskedCompressionStates(const build_info &args,
|
|||||||
// We cannot mask out EOD accepts, as they have to perform an
|
// We cannot mask out EOD accepts, as they have to perform an
|
||||||
// action after they're switched on that may be delayed until the
|
// action after they're switched on that may be delayed until the
|
||||||
// next stream write.
|
// next stream write.
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (to == NO_STATE && target(e, h) != h.acceptEod) {
|
if (to == NO_STATE && target(e, h) != h.acceptEod) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1399,6 +1416,7 @@ u32 buildExceptionMap(const build_info &args, ReportListCache &reports_cache,
|
|||||||
for (auto v : vertices_range(h)) {
|
for (auto v : vertices_range(h)) {
|
||||||
const u32 i = args.state_ids.at(v);
|
const u32 i = args.state_ids.at(v);
|
||||||
|
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (i == NO_STATE) {
|
if (i == NO_STATE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1481,6 +1499,8 @@ u32 buildExceptionMap(const build_info &args, ReportListCache &reports_cache,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
u32 j = args.state_ids.at(w);
|
u32 j = args.state_ids.at(w);
|
||||||
|
// j can be NO_STATE if args.state_ids.at(w) returns NO_STATE
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (j == NO_STATE) {
|
if (j == NO_STATE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1553,6 +1573,7 @@ u32 findMaxVarShift(const build_info &args, u32 nShifts) {
|
|||||||
for (const auto &e : edges_range(h)) {
|
for (const auto &e : edges_range(h)) {
|
||||||
u32 from = args.state_ids.at(source(e, h));
|
u32 from = args.state_ids.at(source(e, h));
|
||||||
u32 to = args.state_ids.at(target(e, h));
|
u32 to = args.state_ids.at(target(e, h));
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (from == NO_STATE || to == NO_STATE) {
|
if (from == NO_STATE || to == NO_STATE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1572,7 +1593,7 @@ u32 findMaxVarShift(const build_info &args, u32 nShifts) {
|
|||||||
static
|
static
|
||||||
int getLimexScore(const build_info &args, u32 nShifts) {
|
int getLimexScore(const build_info &args, u32 nShifts) {
|
||||||
const NGHolder &h = args.h;
|
const NGHolder &h = args.h;
|
||||||
u32 maxVarShift = nShifts;
|
u32 maxVarShift;
|
||||||
int score = 0;
|
int score = 0;
|
||||||
|
|
||||||
score += SHIFT_COST * nShifts;
|
score += SHIFT_COST * nShifts;
|
||||||
@@ -1582,6 +1603,7 @@ int getLimexScore(const build_info &args, u32 nShifts) {
|
|||||||
for (const auto &e : edges_range(h)) {
|
for (const auto &e : edges_range(h)) {
|
||||||
u32 from = args.state_ids.at(source(e, h));
|
u32 from = args.state_ids.at(source(e, h));
|
||||||
u32 to = args.state_ids.at(target(e, h));
|
u32 to = args.state_ids.at(target(e, h));
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (from == NO_STATE || to == NO_STATE) {
|
if (from == NO_STATE || to == NO_STATE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1678,7 +1700,7 @@ static
|
|||||||
bool cannotDie(const build_info &args) {
|
bool cannotDie(const build_info &args) {
|
||||||
const auto &h = args.h;
|
const auto &h = args.h;
|
||||||
const auto &state_ids = args.state_ids;
|
const auto &state_ids = args.state_ids;
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
// If we have a startDs we're actually using, we can't die.
|
// If we have a startDs we're actually using, we can't die.
|
||||||
if (state_ids.at(h.startDs) != NO_STATE) {
|
if (state_ids.at(h.startDs) != NO_STATE) {
|
||||||
DEBUG_PRINTF("is using startDs\n");
|
DEBUG_PRINTF("is using startDs\n");
|
||||||
@@ -1700,7 +1722,7 @@ struct Factory {
|
|||||||
static
|
static
|
||||||
void allocState(NFA *nfa, u32 repeatscratchStateSize,
|
void allocState(NFA *nfa, u32 repeatscratchStateSize,
|
||||||
u32 repeatStreamState) {
|
u32 repeatStreamState) {
|
||||||
implNFA_t *limex = (implNFA_t *)getMutableImplNfa(nfa);
|
const implNFA_t *limex = reinterpret_cast<implNFA_t *>(getMutableImplNfa(nfa));
|
||||||
|
|
||||||
// LimEx NFAs now store the following in state:
|
// LimEx NFAs now store the following in state:
|
||||||
// 1. state bitvector (always present)
|
// 1. state bitvector (always present)
|
||||||
@@ -1766,7 +1788,7 @@ struct Factory {
|
|||||||
u32 tableOffset, tugMaskOffset;
|
u32 tableOffset, tugMaskOffset;
|
||||||
size_t len = repeatAllocSize(br, &tableOffset, &tugMaskOffset);
|
size_t len = repeatAllocSize(br, &tableOffset, &tugMaskOffset);
|
||||||
auto info = make_zeroed_bytecode_ptr<NFARepeatInfo>(len);
|
auto info = make_zeroed_bytecode_ptr<NFARepeatInfo>(len);
|
||||||
char *info_ptr = (char *)info.get();
|
char *info_ptr = reinterpret_cast<char *>(info.get());
|
||||||
|
|
||||||
// Collect state space info.
|
// Collect state space info.
|
||||||
RepeatStateInfo rsi(br.type, br.repeatMin, br.repeatMax, br.minPeriod);
|
RepeatStateInfo rsi(br.type, br.repeatMin, br.repeatMax, br.minPeriod);
|
||||||
@@ -1781,8 +1803,7 @@ struct Factory {
|
|||||||
info->tugMaskOffset = tugMaskOffset;
|
info->tugMaskOffset = tugMaskOffset;
|
||||||
|
|
||||||
// Fill the RepeatInfo structure.
|
// Fill the RepeatInfo structure.
|
||||||
RepeatInfo *repeat =
|
RepeatInfo *repeat = reinterpret_cast<RepeatInfo *>(info_ptr + sizeof(NFARepeatInfo));
|
||||||
(RepeatInfo *)(info_ptr + sizeof(NFARepeatInfo));
|
|
||||||
repeat->type = br.type;
|
repeat->type = br.type;
|
||||||
repeat->repeatMin = depth_to_u32(br.repeatMin);
|
repeat->repeatMin = depth_to_u32(br.repeatMin);
|
||||||
repeat->repeatMax = depth_to_u32(br.repeatMax);
|
repeat->repeatMax = depth_to_u32(br.repeatMax);
|
||||||
@@ -1808,7 +1829,7 @@ struct Factory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Fill the tug mask.
|
// Fill the tug mask.
|
||||||
tableRow_t *tugMask = (tableRow_t *)(info_ptr + tugMaskOffset);
|
tableRow_t *tugMask = reinterpret_cast<tableRow_t *>(info_ptr + tugMaskOffset);
|
||||||
for (auto v : br.tug_triggers) {
|
for (auto v : br.tug_triggers) {
|
||||||
u32 state_id = args.state_ids.at(v);
|
u32 state_id = args.state_ids.at(v);
|
||||||
assert(state_id != NO_STATE);
|
assert(state_id != NO_STATE);
|
||||||
@@ -1831,6 +1852,7 @@ struct Factory {
|
|||||||
u32 s_i = args.state_ids.at(h.start);
|
u32 s_i = args.state_ids.at(h.start);
|
||||||
u32 sds_i = args.state_ids.at(h.startDs);
|
u32 sds_i = args.state_ids.at(h.startDs);
|
||||||
|
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (s_i != NO_STATE) {
|
if (s_i != NO_STATE) {
|
||||||
maskSetBit(limex->init, s_i);
|
maskSetBit(limex->init, s_i);
|
||||||
if (is_triggered(h)) {
|
if (is_triggered(h)) {
|
||||||
@@ -1838,6 +1860,7 @@ struct Factory {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (sds_i != NO_STATE) {
|
if (sds_i != NO_STATE) {
|
||||||
maskSetBit(limex->init, sds_i);
|
maskSetBit(limex->init, sds_i);
|
||||||
maskSetBit(limex->initDS, sds_i);
|
maskSetBit(limex->initDS, sds_i);
|
||||||
@@ -1873,6 +1896,7 @@ struct Factory {
|
|||||||
for (const auto &e : edges_range(h)) {
|
for (const auto &e : edges_range(h)) {
|
||||||
u32 from = args.state_ids.at(source(e, h));
|
u32 from = args.state_ids.at(source(e, h));
|
||||||
u32 to = args.state_ids.at(target(e, h));
|
u32 to = args.state_ids.at(target(e, h));
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (from == NO_STATE || to == NO_STATE) {
|
if (from == NO_STATE || to == NO_STATE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1911,6 +1935,7 @@ struct Factory {
|
|||||||
for (const auto &e : edges_range(h)) {
|
for (const auto &e : edges_range(h)) {
|
||||||
u32 from = args.state_ids.at(source(e, h));
|
u32 from = args.state_ids.at(source(e, h));
|
||||||
u32 to = args.state_ids.at(target(e, h));
|
u32 to = args.state_ids.at(target(e, h));
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (from == NO_STATE || to == NO_STATE) {
|
if (from == NO_STATE || to == NO_STATE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1929,7 +1954,7 @@ struct Factory {
|
|||||||
const u32 reportListOffset) {
|
const u32 reportListOffset) {
|
||||||
DEBUG_PRINTF("exceptionsOffset=%u\n", exceptionsOffset);
|
DEBUG_PRINTF("exceptionsOffset=%u\n", exceptionsOffset);
|
||||||
|
|
||||||
exception_t *etable = (exception_t *)((char *)limex + exceptionsOffset);
|
exception_t *etable = reinterpret_cast<exception_t *>(reinterpret_cast<char *>(limex) + exceptionsOffset);
|
||||||
assert(ISALIGNED(etable));
|
assert(ISALIGNED(etable));
|
||||||
|
|
||||||
map<u32, ExceptionProto> exception_by_state;
|
map<u32, ExceptionProto> exception_by_state;
|
||||||
@@ -1977,10 +2002,10 @@ struct Factory {
|
|||||||
limex->exceptionCount = ecount;
|
limex->exceptionCount = ecount;
|
||||||
|
|
||||||
if (args.num_states > 64 && args.cc.target_info.has_avx512vbmi()) {
|
if (args.num_states > 64 && args.cc.target_info.has_avx512vbmi()) {
|
||||||
const u8 *exceptionMask = (const u8 *)(&limex->exceptionMask);
|
const u8 *exceptionMask = reinterpret_cast<const u8 *>(&limex->exceptionMask);
|
||||||
u8 *shufMask = (u8 *)&limex->exceptionShufMask;
|
u8 *shufMask = reinterpret_cast<u8 *>(&limex->exceptionShufMask);
|
||||||
u8 *bitMask = (u8 *)&limex->exceptionBitMask;
|
u8 *bitMask = reinterpret_cast<u8 *>(&limex->exceptionBitMask);
|
||||||
u8 *andMask = (u8 *)&limex->exceptionAndMask;
|
u8 *andMask = reinterpret_cast<u8 *>(&limex->exceptionAndMask);
|
||||||
|
|
||||||
u32 tot_cnt = 0;
|
u32 tot_cnt = 0;
|
||||||
u32 pos = 0;
|
u32 pos = 0;
|
||||||
@@ -2040,7 +2065,7 @@ struct Factory {
|
|||||||
copy(reachMap.begin(), reachMap.end(), &limex->reachMap[0]);
|
copy(reachMap.begin(), reachMap.end(), &limex->reachMap[0]);
|
||||||
|
|
||||||
// Reach table is right after the LimEx structure.
|
// Reach table is right after the LimEx structure.
|
||||||
tableRow_t *reachMask = (tableRow_t *)((char *)limex + reachOffset);
|
tableRow_t *reachMask = reinterpret_cast<tableRow_t *>(reinterpret_cast<char *>(limex) + reachOffset);
|
||||||
assert(ISALIGNED(reachMask));
|
assert(ISALIGNED(reachMask));
|
||||||
for (size_t i = 0, end = reach.size(); i < end; i++) {
|
for (size_t i = 0, end = reach.size(); i < end; i++) {
|
||||||
maskSetBits(reachMask[i], reach[i]);
|
maskSetBits(reachMask[i], reach[i]);
|
||||||
@@ -2054,7 +2079,7 @@ struct Factory {
|
|||||||
DEBUG_PRINTF("topsOffset=%u\n", topsOffset);
|
DEBUG_PRINTF("topsOffset=%u\n", topsOffset);
|
||||||
|
|
||||||
limex->topOffset = topsOffset;
|
limex->topOffset = topsOffset;
|
||||||
tableRow_t *topMasks = (tableRow_t *)((char *)limex + topsOffset);
|
tableRow_t *topMasks = reinterpret_cast<tableRow_t *>(reinterpret_cast<char *>(limex) + topsOffset);
|
||||||
assert(ISALIGNED(topMasks));
|
assert(ISALIGNED(topMasks));
|
||||||
|
|
||||||
for (size_t i = 0, end = tops.size(); i < end; i++) {
|
for (size_t i = 0, end = tops.size(); i < end; i++) {
|
||||||
@@ -2066,8 +2091,8 @@ struct Factory {
|
|||||||
|
|
||||||
static
|
static
|
||||||
void writeAccelSsse3Masks(const NFAStateSet &accelMask, implNFA_t *limex) {
|
void writeAccelSsse3Masks(const NFAStateSet &accelMask, implNFA_t *limex) {
|
||||||
char *perm_base = (char *)&limex->accelPermute;
|
char *perm_base = reinterpret_cast<char *>(&limex->accelPermute);
|
||||||
char *comp_base = (char *)&limex->accelCompare;
|
char *comp_base = reinterpret_cast<char *>(&limex->accelCompare);
|
||||||
|
|
||||||
u32 num = 0; // index in accel table.
|
u32 num = 0; // index in accel table.
|
||||||
for (size_t i = accelMask.find_first(); i != accelMask.npos;
|
for (size_t i = accelMask.find_first(); i != accelMask.npos;
|
||||||
@@ -2078,8 +2103,8 @@ struct Factory {
|
|||||||
// PSHUFB permute and compare masks
|
// PSHUFB permute and compare masks
|
||||||
size_t mask_idx = sizeof(u_128) * (state_id / 128U);
|
size_t mask_idx = sizeof(u_128) * (state_id / 128U);
|
||||||
DEBUG_PRINTF("mask_idx=%zu\n", mask_idx);
|
DEBUG_PRINTF("mask_idx=%zu\n", mask_idx);
|
||||||
u_128 *perm = (u_128 *)(perm_base + mask_idx);
|
u_128 *perm = reinterpret_cast<u_128 *>(perm_base + mask_idx);
|
||||||
u_128 *comp = (u_128 *)(comp_base + mask_idx);
|
u_128 *comp = reinterpret_cast<u_128 *>(comp_base + mask_idx);
|
||||||
maskSetByte(*perm, num, ((state_id % 128U) / 8U));
|
maskSetByte(*perm, num, ((state_id % 128U) / 8U));
|
||||||
maskSetByte(*comp, num, ~(1U << (state_id % 8U)));
|
maskSetByte(*comp, num, ~(1U << (state_id % 8U)));
|
||||||
}
|
}
|
||||||
@@ -2097,11 +2122,11 @@ struct Factory {
|
|||||||
// Write accel lookup table.
|
// Write accel lookup table.
|
||||||
limex->accelTableOffset = accelTableOffset;
|
limex->accelTableOffset = accelTableOffset;
|
||||||
copy(accelTable.begin(), accelTable.end(),
|
copy(accelTable.begin(), accelTable.end(),
|
||||||
(u8 *)((char *)limex + accelTableOffset));
|
reinterpret_cast<u8 *>(reinterpret_cast<char *>(limex) + accelTableOffset));
|
||||||
|
|
||||||
// Write accel aux structures.
|
// Write accel aux structures.
|
||||||
limex->accelAuxOffset = accelAuxOffset;
|
limex->accelAuxOffset = accelAuxOffset;
|
||||||
AccelAux *auxTable = (AccelAux *)((char *)limex + accelAuxOffset);
|
AccelAux *auxTable = reinterpret_cast<AccelAux *>(reinterpret_cast<char *>(limex) + accelAuxOffset);
|
||||||
assert(ISALIGNED(auxTable));
|
assert(ISALIGNED(auxTable));
|
||||||
copy(accelAux.begin(), accelAux.end(), auxTable);
|
copy(accelAux.begin(), accelAux.end(), auxTable);
|
||||||
|
|
||||||
@@ -2131,7 +2156,7 @@ struct Factory {
|
|||||||
const vector<NFAStateSet> &squash, implNFA_t *limex,
|
const vector<NFAStateSet> &squash, implNFA_t *limex,
|
||||||
const u32 acceptsOffset, const u32 acceptsEodOffset,
|
const u32 acceptsOffset, const u32 acceptsEodOffset,
|
||||||
const u32 squashOffset, const u32 reportListOffset) {
|
const u32 squashOffset, const u32 reportListOffset) {
|
||||||
char *limex_base = (char *)limex;
|
char *limex_base = reinterpret_cast<char *>(limex);
|
||||||
|
|
||||||
DEBUG_PRINTF("acceptsOffset=%u, acceptsEodOffset=%u, squashOffset=%u\n",
|
DEBUG_PRINTF("acceptsOffset=%u, acceptsEodOffset=%u, squashOffset=%u\n",
|
||||||
acceptsOffset, acceptsEodOffset, squashOffset);
|
acceptsOffset, acceptsEodOffset, squashOffset);
|
||||||
@@ -2154,7 +2179,7 @@ struct Factory {
|
|||||||
limex->acceptOffset = acceptsOffset;
|
limex->acceptOffset = acceptsOffset;
|
||||||
limex->acceptCount = verify_u32(accepts.size());
|
limex->acceptCount = verify_u32(accepts.size());
|
||||||
DEBUG_PRINTF("NFA has %zu accepts\n", accepts.size());
|
DEBUG_PRINTF("NFA has %zu accepts\n", accepts.size());
|
||||||
NFAAccept *acceptsTable = (NFAAccept *)(limex_base + acceptsOffset);
|
NFAAccept *acceptsTable = reinterpret_cast<NFAAccept *>(limex_base + acceptsOffset);
|
||||||
assert(ISALIGNED(acceptsTable));
|
assert(ISALIGNED(acceptsTable));
|
||||||
transform(accepts.begin(), accepts.end(), acceptsTable,
|
transform(accepts.begin(), accepts.end(), acceptsTable,
|
||||||
transform_offset_fn);
|
transform_offset_fn);
|
||||||
@@ -2163,7 +2188,7 @@ struct Factory {
|
|||||||
limex->acceptEodOffset = acceptsEodOffset;
|
limex->acceptEodOffset = acceptsEodOffset;
|
||||||
limex->acceptEodCount = verify_u32(acceptsEod.size());
|
limex->acceptEodCount = verify_u32(acceptsEod.size());
|
||||||
DEBUG_PRINTF("NFA has %zu EOD accepts\n", acceptsEod.size());
|
DEBUG_PRINTF("NFA has %zu EOD accepts\n", acceptsEod.size());
|
||||||
NFAAccept *acceptsEodTable = (NFAAccept *)(limex_base + acceptsEodOffset);
|
NFAAccept *acceptsEodTable = reinterpret_cast<NFAAccept *>(limex_base + acceptsEodOffset);
|
||||||
assert(ISALIGNED(acceptsEodTable));
|
assert(ISALIGNED(acceptsEodTable));
|
||||||
transform(acceptsEod.begin(), acceptsEod.end(), acceptsEodTable,
|
transform(acceptsEod.begin(), acceptsEod.end(), acceptsEodTable,
|
||||||
transform_offset_fn);
|
transform_offset_fn);
|
||||||
@@ -2172,7 +2197,7 @@ struct Factory {
|
|||||||
limex->squashCount = verify_u32(squash.size());
|
limex->squashCount = verify_u32(squash.size());
|
||||||
limex->squashOffset = squashOffset;
|
limex->squashOffset = squashOffset;
|
||||||
DEBUG_PRINTF("NFA has %zu report squash masks\n", squash.size());
|
DEBUG_PRINTF("NFA has %zu report squash masks\n", squash.size());
|
||||||
tableRow_t *mask = (tableRow_t *)(limex_base + squashOffset);
|
tableRow_t *mask = reinterpret_cast<tableRow_t *>(limex_base + squashOffset);
|
||||||
assert(ISALIGNED(mask));
|
assert(ISALIGNED(mask));
|
||||||
for (size_t i = 0, end = squash.size(); i < end; i++) {
|
for (size_t i = 0, end = squash.size(); i < end; i++) {
|
||||||
maskSetBits(mask[i], squash[i]);
|
maskSetBits(mask[i], squash[i]);
|
||||||
@@ -2194,13 +2219,13 @@ struct Factory {
|
|||||||
for (u32 i = 0; i < num_repeats; i++) {
|
for (u32 i = 0; i < num_repeats; i++) {
|
||||||
repeatOffsets[i] = offset;
|
repeatOffsets[i] = offset;
|
||||||
assert(repeats[i]);
|
assert(repeats[i]);
|
||||||
memcpy((char *)limex + offset, repeats[i].get(), repeats[i].size());
|
memcpy(reinterpret_cast<char *>(limex) + offset, repeats[i].get(), repeats[i].size());
|
||||||
offset += repeats[i].size();
|
offset += repeats[i].size();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write repeat offset lookup table.
|
// Write repeat offset lookup table.
|
||||||
assert(ISALIGNED_N((char *)limex + repeatOffsetsOffset, alignof(u32)));
|
assert(ISALIGNED_N(reinterpret_cast<char *>(limex) + repeatOffsetsOffset, alignof(u32)));
|
||||||
copy_bytes((char *)limex + repeatOffsetsOffset, repeatOffsets);
|
copy_bytes(reinterpret_cast<char *>(limex) + repeatOffsetsOffset, repeatOffsets);
|
||||||
|
|
||||||
limex->repeatOffset = repeatOffsetsOffset;
|
limex->repeatOffset = repeatOffsetsOffset;
|
||||||
limex->repeatCount = num_repeats;
|
limex->repeatCount = num_repeats;
|
||||||
@@ -2210,15 +2235,15 @@ struct Factory {
|
|||||||
void writeReportList(const vector<ReportID> &reports, implNFA_t *limex,
|
void writeReportList(const vector<ReportID> &reports, implNFA_t *limex,
|
||||||
const u32 reportListOffset) {
|
const u32 reportListOffset) {
|
||||||
DEBUG_PRINTF("reportListOffset=%u\n", reportListOffset);
|
DEBUG_PRINTF("reportListOffset=%u\n", reportListOffset);
|
||||||
assert(ISALIGNED_N((char *)limex + reportListOffset,
|
assert(ISALIGNED_N(reinterpret_cast<char *>(limex) + reportListOffset,
|
||||||
alignof(ReportID)));
|
alignof(ReportID)));
|
||||||
copy_bytes((char *)limex + reportListOffset, reports);
|
copy_bytes(reinterpret_cast<char *>(limex) + reportListOffset, reports);
|
||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
bytecode_ptr<NFA> generateNfa(const build_info &args) {
|
bytecode_ptr<NFA> generateNfa(const build_info &args) {
|
||||||
if (args.num_states > NFATraits<dtype>::maxStates) {
|
if (args.num_states > NFATraits<dtype>::maxStates) {
|
||||||
return nullptr;
|
return bytecode_ptr<NFA>(nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build bounded repeat structures.
|
// Build bounded repeat structures.
|
||||||
@@ -2321,7 +2346,7 @@ struct Factory {
|
|||||||
auto nfa = make_zeroed_bytecode_ptr<NFA>(nfaSize);
|
auto nfa = make_zeroed_bytecode_ptr<NFA>(nfaSize);
|
||||||
assert(nfa); // otherwise we would have thrown std::bad_alloc
|
assert(nfa); // otherwise we would have thrown std::bad_alloc
|
||||||
|
|
||||||
implNFA_t *limex = (implNFA_t *)getMutableImplNfa(nfa.get());
|
implNFA_t *limex = reinterpret_cast<implNFA_t *>(getMutableImplNfa(nfa.get()));
|
||||||
assert(ISALIGNED(limex));
|
assert(ISALIGNED(limex));
|
||||||
|
|
||||||
writeReachMapping(reach, reachMap, limex, reachOffset);
|
writeReachMapping(reach, reachMap, limex, reachOffset);
|
||||||
@@ -2453,6 +2478,7 @@ bool isSane(const NGHolder &h, const map<u32, set<NFAVertex>> &tops,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const u32 i = state_ids.at(v);
|
const u32 i = state_ids.at(v);
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (i == NO_STATE) {
|
if (i == NO_STATE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -2533,6 +2559,7 @@ bool isFast(const build_info &args) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
u32 j = args.state_ids.at(w);
|
u32 j = args.state_ids.at(w);
|
||||||
|
// cppcheck-suppress knownConditionTrueFalse
|
||||||
if (j == NO_STATE) {
|
if (j == NO_STATE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -2577,7 +2604,7 @@ bytecode_ptr<NFA> generate(NGHolder &h,
|
|||||||
|
|
||||||
if (!cc.grey.allowLimExNFA) {
|
if (!cc.grey.allowLimExNFA) {
|
||||||
DEBUG_PRINTF("limex not allowed\n");
|
DEBUG_PRINTF("limex not allowed\n");
|
||||||
return nullptr;
|
return bytecode_ptr<NFA>(nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If you ask for a particular type, it had better be an NFA.
|
// If you ask for a particular type, it had better be an NFA.
|
||||||
@@ -2612,7 +2639,7 @@ bytecode_ptr<NFA> generate(NGHolder &h,
|
|||||||
|
|
||||||
if (scores.empty()) {
|
if (scores.empty()) {
|
||||||
DEBUG_PRINTF("No NFA returned a valid score for this case.\n");
|
DEBUG_PRINTF("No NFA returned a valid score for this case.\n");
|
||||||
return nullptr;
|
return bytecode_ptr<NFA>(nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort acceptable models in priority order, lowest score first.
|
// Sort acceptable models in priority order, lowest score first.
|
||||||
@@ -2631,7 +2658,7 @@ bytecode_ptr<NFA> generate(NGHolder &h,
|
|||||||
}
|
}
|
||||||
|
|
||||||
DEBUG_PRINTF("NFA build failed.\n");
|
DEBUG_PRINTF("NFA build failed.\n");
|
||||||
return nullptr;
|
return bytecode_ptr<NFA>(nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
u32 countAccelStates(NGHolder &h,
|
u32 countAccelStates(NGHolder &h,
|
||||||
|
|||||||
@@ -108,14 +108,14 @@ void dumpRepeats(const limex_type *limex, u32 model_size, FILE *f) {
|
|||||||
fprintf(f, "\n");
|
fprintf(f, "\n");
|
||||||
fprintf(f, "%u bounded repeats.\n", limex->repeatCount);
|
fprintf(f, "%u bounded repeats.\n", limex->repeatCount);
|
||||||
|
|
||||||
const char *base = (const char *)limex;
|
const char *base = reinterpret_cast<const char *>(limex);
|
||||||
const u32 *repeatOffset = (const u32 *)(base + limex->repeatOffset);
|
const u32 *repeatOffset = reinterpret_cast<const u32 *>(base + limex->repeatOffset);
|
||||||
|
|
||||||
for (u32 i = 0; i < limex->repeatCount; i++) {
|
for (u32 i = 0; i < limex->repeatCount; i++) {
|
||||||
const NFARepeatInfo *info =
|
const NFARepeatInfo *info =
|
||||||
(const NFARepeatInfo *)(base + repeatOffset[i]);
|
reinterpret_cast<const NFARepeatInfo *>(base + repeatOffset[i]);
|
||||||
const RepeatInfo *repeat =
|
const RepeatInfo *repeat =
|
||||||
(const RepeatInfo *)((const char *)info + sizeof(*info));
|
reinterpret_cast<const RepeatInfo *>(reinterpret_cast<const char *>(info) + sizeof(*info));
|
||||||
fprintf(f, " repeat %u: %s {%u,%u} packedCtrlSize=%u, "
|
fprintf(f, " repeat %u: %s {%u,%u} packedCtrlSize=%u, "
|
||||||
"stateSize=%u\n",
|
"stateSize=%u\n",
|
||||||
i, repeatTypeName(repeat->type), repeat->repeatMin,
|
i, repeatTypeName(repeat->type), repeat->repeatMin,
|
||||||
@@ -123,7 +123,7 @@ void dumpRepeats(const limex_type *limex, u32 model_size, FILE *f) {
|
|||||||
fprintf(f, " nfa state: stream offset %u\n", info->stateOffset);
|
fprintf(f, " nfa state: stream offset %u\n", info->stateOffset);
|
||||||
fprintf(f, " ");
|
fprintf(f, " ");
|
||||||
|
|
||||||
const u8 *tug_mask = (const u8 *)info + info->tugMaskOffset;
|
const u8 *tug_mask = reinterpret_cast<const u8 *>(info) + info->tugMaskOffset;
|
||||||
dumpMask(f, "tugs", tug_mask, model_size);
|
dumpMask(f, "tugs", tug_mask, model_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -136,7 +136,7 @@ void dumpLimexReachMasks(u32 model_size, const u8 *reach, u32 reachCount,
|
|||||||
for (u32 i = 0; i < reachCount; i++) {
|
for (u32 i = 0; i < reachCount; i++) {
|
||||||
char tmp_common[100];
|
char tmp_common[100];
|
||||||
const u8 *row = reach + (i * (model_size/8));
|
const u8 *row = reach + (i * (model_size/8));
|
||||||
sprintf(tmp_common, "reach mask %u ", i);
|
snprintf(tmp_common, sizeof(tmp_common), "reach mask %u ", i);
|
||||||
dumpMask(f, tmp_common, row, model_size);
|
dumpMask(f, tmp_common, row, model_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -157,7 +157,7 @@ void dumpLimexReachMap(const u8 *reachMap, FILE *f) {
|
|||||||
template<typename limex_type>
|
template<typename limex_type>
|
||||||
static
|
static
|
||||||
const NFA *limex_to_nfa(const limex_type *limex) {
|
const NFA *limex_to_nfa(const limex_type *limex) {
|
||||||
return (const NFA *)((const char *)limex - sizeof(NFA));
|
return reinterpret_cast<const NFA *>(reinterpret_cast<const char *>(limex) - sizeof(NFA));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename limex_type>
|
template<typename limex_type>
|
||||||
@@ -172,8 +172,8 @@ void dumpAccel(const limex_type *limex, FILE *f) {
|
|||||||
|
|
||||||
u32 tableOffset = limex->accelTableOffset;
|
u32 tableOffset = limex->accelTableOffset;
|
||||||
u32 auxOffset = limex->accelAuxOffset;
|
u32 auxOffset = limex->accelAuxOffset;
|
||||||
const u8 *accelTable = (const u8 *)((const char *)limex + tableOffset);
|
const u8 *accelTable = reinterpret_cast<const u8 *>(reinterpret_cast<const char *>(limex) + tableOffset);
|
||||||
const AccelAux *aux = (const AccelAux *)((const char *)limex + auxOffset);
|
const AccelAux *aux = reinterpret_cast<const AccelAux *>(reinterpret_cast<const char *>(limex) + auxOffset);
|
||||||
|
|
||||||
for (u32 i = 0; i < limex->accelCount; i++) {
|
for (u32 i = 0; i < limex->accelCount; i++) {
|
||||||
fprintf(f, " accel %u (aux entry %u): ", i, accelTable[i]);
|
fprintf(f, " accel %u (aux entry %u): ", i, accelTable[i]);
|
||||||
@@ -191,7 +191,7 @@ void dumpAcceptList(const char *limex_base, const struct NFAAccept *accepts,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
fprintf(f, " idx %u fires report list %u:", i, a.reports);
|
fprintf(f, " idx %u fires report list %u:", i, a.reports);
|
||||||
const ReportID *report = (const ReportID *)(limex_base + a.reports);
|
const ReportID *report = reinterpret_cast<const ReportID *>(limex_base + a.reports);
|
||||||
for (; *report != MO_INVALID_IDX; report++) {
|
for (; *report != MO_INVALID_IDX; report++) {
|
||||||
fprintf(f, " %u", *report);
|
fprintf(f, " %u", *report);
|
||||||
}
|
}
|
||||||
@@ -202,18 +202,18 @@ void dumpAcceptList(const char *limex_base, const struct NFAAccept *accepts,
|
|||||||
template<typename limex_type>
|
template<typename limex_type>
|
||||||
static
|
static
|
||||||
void dumpAccepts(const limex_type *limex, FILE *f) {
|
void dumpAccepts(const limex_type *limex, FILE *f) {
|
||||||
const char *limex_base = (const char *)limex;
|
const char *limex_base = reinterpret_cast<const char *>(limex);
|
||||||
|
|
||||||
const u32 acceptCount = limex->acceptCount;
|
const u32 acceptCount = limex->acceptCount;
|
||||||
const u32 acceptEodCount = limex->acceptEodCount;
|
const u32 acceptEodCount = limex->acceptEodCount;
|
||||||
|
|
||||||
fprintf(f, "\n%u accepts.\n", acceptCount);
|
fprintf(f, "\n%u accepts.\n", acceptCount);
|
||||||
const auto *accepts =
|
const auto *accepts =
|
||||||
(const struct NFAAccept *)(limex_base + limex->acceptOffset);
|
reinterpret_cast<const struct NFAAccept *>(limex_base + limex->acceptOffset);
|
||||||
dumpAcceptList(limex_base, accepts, acceptCount, f);
|
dumpAcceptList(limex_base, accepts, acceptCount, f);
|
||||||
fprintf(f, "\n%u accepts at EOD.\n", acceptEodCount);
|
fprintf(f, "\n%u accepts at EOD.\n", acceptEodCount);
|
||||||
const auto *accepts_eod =
|
const auto *accepts_eod =
|
||||||
(const struct NFAAccept *)(limex_base + limex->acceptEodOffset);
|
reinterpret_cast<const struct NFAAccept *>(limex_base + limex->acceptEodOffset);
|
||||||
dumpAcceptList(limex_base, accepts_eod, acceptEodCount, f);
|
dumpAcceptList(limex_base, accepts_eod, acceptEodCount, f);
|
||||||
fprintf(f, "\n");
|
fprintf(f, "\n");
|
||||||
}
|
}
|
||||||
@@ -224,7 +224,7 @@ void dumpSquash(const limex_type *limex, FILE *f) {
|
|||||||
u32 size = limex_traits<limex_type>::size;
|
u32 size = limex_traits<limex_type>::size;
|
||||||
|
|
||||||
// Dump squash masks, if there are any.
|
// Dump squash masks, if there are any.
|
||||||
const u8 *squashMask = (const u8 *)limex + limex->squashOffset;
|
const u8 *squashMask = reinterpret_cast<const u8 *>(limex) + limex->squashOffset;
|
||||||
for (u32 i = 0; i < limex->squashCount; i++) {
|
for (u32 i = 0; i < limex->squashCount; i++) {
|
||||||
std::ostringstream name;
|
std::ostringstream name;
|
||||||
name << "squash_" << i;
|
name << "squash_" << i;
|
||||||
@@ -238,7 +238,7 @@ static
|
|||||||
const typename limex_traits<limex_type>::exception_type *
|
const typename limex_traits<limex_type>::exception_type *
|
||||||
getExceptionTable(const limex_type *limex) {
|
getExceptionTable(const limex_type *limex) {
|
||||||
return (const typename limex_traits<limex_type>::exception_type *)
|
return (const typename limex_traits<limex_type>::exception_type *)
|
||||||
((const char *)limex + limex->exceptionOffset);
|
(reinterpret_cast<const char *>(limex) + limex->exceptionOffset);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename limex_type>
|
template<typename limex_type>
|
||||||
@@ -248,7 +248,7 @@ void dumpLimexExceptions(const limex_type *limex, FILE *f) {
|
|||||||
getExceptionTable(limex);
|
getExceptionTable(limex);
|
||||||
const u32 size = limex_traits<limex_type>::size;
|
const u32 size = limex_traits<limex_type>::size;
|
||||||
|
|
||||||
const char *limex_base = (const char *)limex;
|
const char *limex_base = reinterpret_cast<const char *>(limex);
|
||||||
|
|
||||||
fprintf(f, "\n");
|
fprintf(f, "\n");
|
||||||
for (u32 i = 0; i < limex->exceptionCount; i++) {
|
for (u32 i = 0; i < limex->exceptionCount; i++) {
|
||||||
@@ -259,13 +259,13 @@ void dumpLimexExceptions(const limex_type *limex, FILE *f) {
|
|||||||
case LIMEX_TRIGGER_POS: fprintf(f, " trigger: POS\n"); break;
|
case LIMEX_TRIGGER_POS: fprintf(f, " trigger: POS\n"); break;
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
dumpMask(f, "succ", (const u8 *)&e[i].successors, size);
|
dumpMask(f, "succ", reinterpret_cast<const u8 *>(&e[i].successors), size);
|
||||||
dumpMask(f, "squash", (const u8 *)&e[i].squash, size);
|
dumpMask(f, "squash", reinterpret_cast<const u8 *>(&e[i].squash), size);
|
||||||
fprintf(f, "reports: ");
|
fprintf(f, "reports: ");
|
||||||
if (e[i].reports == MO_INVALID_IDX) {
|
if (e[i].reports == MO_INVALID_IDX) {
|
||||||
fprintf(f, " <none>\n");
|
fprintf(f, " <none>\n");
|
||||||
} else {
|
} else {
|
||||||
const ReportID *r = (const ReportID *)(limex_base + e[i].reports);
|
const ReportID *r = reinterpret_cast<const ReportID *>(limex_base + e[i].reports);
|
||||||
while (*r != MO_INVALID_IDX) {
|
while (*r != MO_INVALID_IDX) {
|
||||||
fprintf(f, " %u", *r++);
|
fprintf(f, " %u", *r++);
|
||||||
}
|
}
|
||||||
@@ -282,7 +282,7 @@ void dumpLimexShifts(const limex_type *limex, FILE *f) {
|
|||||||
fprintf(f, "Shift Masks:\n");
|
fprintf(f, "Shift Masks:\n");
|
||||||
for(u32 i = 0; i < limex->shiftCount; i++) {
|
for(u32 i = 0; i < limex->shiftCount; i++) {
|
||||||
fprintf(f, "\t Shift %u(%hhu)\t\tMask: %s\n", i, limex->shiftAmount[i],
|
fprintf(f, "\t Shift %u(%hhu)\t\tMask: %s\n", i, limex->shiftAmount[i],
|
||||||
dumpMask((const u8 *)&limex->shift[i], size).c_str());
|
dumpMask(reinterpret_cast<const u8 *>(&limex->shift[i]), size).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<typename limex_type>
|
template<typename limex_type>
|
||||||
@@ -304,20 +304,20 @@ void dumpLimexText(const limex_type *limex, FILE *f) {
|
|||||||
}
|
}
|
||||||
fprintf(f, "\n\n");
|
fprintf(f, "\n\n");
|
||||||
|
|
||||||
dumpMask(f, "init", (const u8 *)&limex->init, size);
|
dumpMask(f, "init", reinterpret_cast<const u8 *>(&limex->init), size);
|
||||||
dumpMask(f, "init_dot_star", (const u8 *)&limex->initDS, size);
|
dumpMask(f, "init_dot_star", reinterpret_cast<const u8 *>(&limex->initDS), size);
|
||||||
dumpMask(f, "accept", (const u8 *)&limex->accept, size);
|
dumpMask(f, "accept", reinterpret_cast<const u8 *>(&limex->accept), size);
|
||||||
dumpMask(f, "accept_at_eod", (const u8 *)&limex->acceptAtEOD, size);
|
dumpMask(f, "accept_at_eod", reinterpret_cast<const u8 *>(&limex->acceptAtEOD), size);
|
||||||
dumpMask(f, "accel", (const u8 *)&limex->accel, size);
|
dumpMask(f, "accel", reinterpret_cast<const u8 *>(&limex->accel), size);
|
||||||
dumpMask(f, "accel_and_friends", (const u8 *)&limex->accel_and_friends,
|
dumpMask(f, "accel_and_friends", reinterpret_cast<const u8 *>(&limex->accel_and_friends),
|
||||||
size);
|
size);
|
||||||
dumpMask(f, "compress_mask", (const u8 *)&limex->compressMask, size);
|
dumpMask(f, "compress_mask", reinterpret_cast<const u8 *>(&limex->compressMask), size);
|
||||||
dumpMask(f, "emask", (const u8 *)&limex->exceptionMask, size);
|
dumpMask(f, "emask", reinterpret_cast<const u8 *>(&limex->exceptionMask), size);
|
||||||
dumpMask(f, "zombie", (const u8 *)&limex->zombieMask, size);
|
dumpMask(f, "zombie", reinterpret_cast<const u8 *>(&limex->zombieMask), size);
|
||||||
|
|
||||||
// Dump top masks, if there are any.
|
// Dump top masks, if there are any.
|
||||||
u32 topCount = limex->topCount;
|
u32 topCount = limex->topCount;
|
||||||
const u8 *topMask = (const u8 *)limex + limex->topOffset;
|
const u8 *topMask = reinterpret_cast<const u8 *>(limex) + limex->topOffset;
|
||||||
for (u32 i = 0; i < topCount; i++) {
|
for (u32 i = 0; i < topCount; i++) {
|
||||||
std::ostringstream name;
|
std::ostringstream name;
|
||||||
name << "top_" << i;
|
name << "top_" << i;
|
||||||
@@ -331,7 +331,7 @@ void dumpLimexText(const limex_type *limex, FILE *f) {
|
|||||||
dumpSquash(limex, f);
|
dumpSquash(limex, f);
|
||||||
|
|
||||||
dumpLimexReachMap(limex->reachMap, f);
|
dumpLimexReachMap(limex->reachMap, f);
|
||||||
dumpLimexReachMasks(size, (const u8 *)limex + sizeof(*limex) /* reach*/,
|
dumpLimexReachMasks(size, reinterpret_cast<const u8 *>(limex) + sizeof(*limex) /* reach*/,
|
||||||
limex->reachSize, f);
|
limex->reachSize, f);
|
||||||
|
|
||||||
dumpAccepts(limex, f);
|
dumpAccepts(limex, f);
|
||||||
@@ -378,7 +378,7 @@ struct limex_labeller : public nfa_labeller {
|
|||||||
void label_state(FILE *f, u32 state) const override {
|
void label_state(FILE *f, u32 state) const override {
|
||||||
const typename limex_traits<limex_type>::exception_type *exceptions
|
const typename limex_traits<limex_type>::exception_type *exceptions
|
||||||
= getExceptionTable(limex);
|
= getExceptionTable(limex);
|
||||||
if (!testbit((const u8 *)&limex->exceptionMask,
|
if (!testbit(reinterpret_cast<const u8 *>(&limex->exceptionMask),
|
||||||
limex_traits<limex_type>::size, state)) {
|
limex_traits<limex_type>::size, state)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -404,11 +404,11 @@ static
|
|||||||
void dumpVertexDotInfo(const limex_type *limex, u32 state_count, FILE *f,
|
void dumpVertexDotInfo(const limex_type *limex, u32 state_count, FILE *f,
|
||||||
const nfa_labeller &labeller) {
|
const nfa_labeller &labeller) {
|
||||||
u32 size = sizeof(limex->init) * 8;
|
u32 size = sizeof(limex->init) * 8;
|
||||||
const u8 *reach = (const u8 *)limex + sizeof(*limex);
|
const u8 *reach = reinterpret_cast<const u8 *>(limex) + sizeof(*limex);
|
||||||
vector<CharReach> perStateReach;
|
vector<CharReach> perStateReach;
|
||||||
setupReach(limex->reachMap, reach, size, state_count, &perStateReach);
|
setupReach(limex->reachMap, reach, size, state_count, &perStateReach);
|
||||||
|
|
||||||
const u8 *topMask = (const u8 *)limex + limex->topOffset;
|
const u8 *topMask = reinterpret_cast<const u8 *>(limex) + limex->topOffset;
|
||||||
|
|
||||||
for (u32 state = 0; state < state_count; state++) {
|
for (u32 state = 0; state < state_count; state++) {
|
||||||
fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
|
fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
|
||||||
@@ -419,15 +419,15 @@ void dumpVertexDotInfo(const limex_type *limex, u32 state_count, FILE *f,
|
|||||||
// bung in another couple lines to push char class (the widest thing) up a bit
|
// bung in another couple lines to push char class (the widest thing) up a bit
|
||||||
fprintf(f, "\\n\\n\" ];\n");
|
fprintf(f, "\\n\\n\" ];\n");
|
||||||
|
|
||||||
if (testbit((const u8 *)&limex->acceptAtEOD, size, state)) {
|
if (testbit(reinterpret_cast<const u8 *>(&limex->acceptAtEOD), size, state)) {
|
||||||
fprintf(f, "%u [ shape = box ];\n", state);
|
fprintf(f, "%u [ shape = box ];\n", state);
|
||||||
} else if (testbit((const u8 *)&limex->accept, size, state)) {
|
} else if (testbit(reinterpret_cast<const u8 *>(&limex->accept), size, state)) {
|
||||||
fprintf(f, "%u [ shape = doublecircle ];\n", state);
|
fprintf(f, "%u [ shape = doublecircle ];\n", state);
|
||||||
}
|
}
|
||||||
if (testbit((const u8 *)&limex->accel, size, state)) {
|
if (testbit(reinterpret_cast<const u8 *>(&limex->accel), size, state)) {
|
||||||
fprintf(f, "%u [ color = red style = diagonals];\n", state);
|
fprintf(f, "%u [ color = red style = diagonals];\n", state);
|
||||||
}
|
}
|
||||||
if (testbit((const u8 *)&limex->init, size, state)) {
|
if (testbit(reinterpret_cast<const u8 *>(&limex->init), size, state)) {
|
||||||
fprintf(f, "START -> %u [ color = grey ];\n", state);
|
fprintf(f, "START -> %u [ color = grey ];\n", state);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -447,7 +447,7 @@ template<typename limex_type>
|
|||||||
static
|
static
|
||||||
void dumpExDotInfo(const limex_type *limex, u32 state, FILE *f) {
|
void dumpExDotInfo(const limex_type *limex, u32 state, FILE *f) {
|
||||||
u32 size = limex_traits<limex_type>::size;
|
u32 size = limex_traits<limex_type>::size;
|
||||||
if (!testbit((const u8 *)&limex->exceptionMask, size, state)) {
|
if (!testbit(reinterpret_cast<const u8 *>(&limex->exceptionMask), size, state)) {
|
||||||
return; /* not exceptional */
|
return; /* not exceptional */
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -461,10 +461,10 @@ void dumpExDotInfo(const limex_type *limex, u32 state, FILE *f) {
|
|||||||
u32 state_count = limex_to_nfa(limex)->nPositions;
|
u32 state_count = limex_to_nfa(limex)->nPositions;
|
||||||
|
|
||||||
for (u32 j = 0; j < state_count; j++) {
|
for (u32 j = 0; j < state_count; j++) {
|
||||||
if (testbit((const u8 *)&e->successors, size, j)) {
|
if (testbit(reinterpret_cast<const u8 *>(&e->successors), size, j)) {
|
||||||
fprintf(f, "%u -> %u [color = blue];\n", state, j);
|
fprintf(f, "%u -> %u [color = blue];\n", state, j);
|
||||||
}
|
}
|
||||||
if (!testbit((const u8 *)&e->squash, size, j)) {
|
if (!testbit(reinterpret_cast<const u8 *>(&e->squash), size, j)) {
|
||||||
fprintf(f, "%u -> %u [color = grey style = dashed];\n", state, j);
|
fprintf(f, "%u -> %u [color = grey style = dashed];\n", state, j);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -480,7 +480,7 @@ static
|
|||||||
void dumpLimDotInfo(const limex_type *limex, u32 state, FILE *f) {
|
void dumpLimDotInfo(const limex_type *limex, u32 state, FILE *f) {
|
||||||
for (u32 j = 0; j < limex->shiftCount; j++) {
|
for (u32 j = 0; j < limex->shiftCount; j++) {
|
||||||
const u32 shift_amount = limex->shiftAmount[j];
|
const u32 shift_amount = limex->shiftAmount[j];
|
||||||
if (testbit((const u8 *)&limex->shift[j],
|
if (testbit(reinterpret_cast<const u8 *>(&limex->shift[j]),
|
||||||
limex_traits<limex_type>::size, state)) {
|
limex_traits<limex_type>::size, state)) {
|
||||||
fprintf(f, "%u -> %u;\n", state, state + shift_amount);
|
fprintf(f, "%u -> %u;\n", state, state + shift_amount);
|
||||||
}
|
}
|
||||||
@@ -502,7 +502,7 @@ void dumpLimexDot(const NFA *nfa, const limex_type *limex, FILE *f) {
|
|||||||
|
|
||||||
#define LIMEX_DUMP_FN(size) \
|
#define LIMEX_DUMP_FN(size) \
|
||||||
void nfaExecLimEx##size##_dump(const NFA *nfa, const string &base) { \
|
void nfaExecLimEx##size##_dump(const NFA *nfa, const string &base) { \
|
||||||
auto limex = (const LimExNFA##size *)getImplNfa(nfa); \
|
auto limex = reinterpret_cast<const LimExNFA##size *>(getImplNfa(nfa)); \
|
||||||
dumpLimexText(limex, StdioFile(base + ".txt", "w")); \
|
dumpLimexText(limex, StdioFile(base + ".txt", "w")); \
|
||||||
dumpLimexDot(nfa, limex, StdioFile(base + ".dot", "w")); \
|
dumpLimexDot(nfa, limex, StdioFile(base + ".dot", "w")); \
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -302,8 +302,8 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ,
|
|||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
// A copy of the estate as an array of GPR-sized chunks.
|
// A copy of the estate as an array of GPR-sized chunks.
|
||||||
CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
|
CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; // cppcheck-suppress duplicateExpression
|
||||||
CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
|
CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; // cppcheck-suppress duplicateExpression
|
||||||
#ifdef ESTATE_ON_STACK
|
#ifdef ESTATE_ON_STACK
|
||||||
memcpy(chunks, &estate, sizeof(STATE_T));
|
memcpy(chunks, &estate, sizeof(STATE_T));
|
||||||
#else
|
#else
|
||||||
@@ -311,7 +311,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ,
|
|||||||
#endif
|
#endif
|
||||||
memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T));
|
memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T));
|
||||||
|
|
||||||
u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)];
|
u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; // cppcheck-suppress duplicateExpression
|
||||||
base_index[0] = 0;
|
base_index[0] = 0;
|
||||||
for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) {
|
for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) {
|
||||||
base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]);
|
base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]);
|
||||||
@@ -322,6 +322,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ,
|
|||||||
#ifdef ARCH_64_BIT
|
#ifdef ARCH_64_BIT
|
||||||
t >>= 1; // Due to diffmask64, which leaves holes in the bitmask.
|
t >>= 1; // Due to diffmask64, which leaves holes in the bitmask.
|
||||||
#endif
|
#endif
|
||||||
|
// cppcheck-suppress unsignedLessThanZero
|
||||||
assert(t < ARRAY_LENGTH(chunks));
|
assert(t < ARRAY_LENGTH(chunks));
|
||||||
CHUNK_T word = chunks[t];
|
CHUNK_T word = chunks[t];
|
||||||
assert(word != 0);
|
assert(word != 0);
|
||||||
|
|||||||
@@ -163,12 +163,12 @@ struct LimExNFA##size { \
|
|||||||
m512 exceptionAndMask; /**< exception and mask */ \
|
m512 exceptionAndMask; /**< exception and mask */ \
|
||||||
};
|
};
|
||||||
|
|
||||||
CREATE_NFA_LIMEX(32)
|
CREATE_NFA_LIMEX(32) //NOLINT (clang-analyzer-optin.performance.Padding)
|
||||||
CREATE_NFA_LIMEX(64)
|
CREATE_NFA_LIMEX(64) //NOLINT (clang-analyzer-optin.performance.Padding)
|
||||||
CREATE_NFA_LIMEX(128)
|
CREATE_NFA_LIMEX(128) //NOLINT (clang-analyzer-optin.performance.Padding)
|
||||||
CREATE_NFA_LIMEX(256)
|
CREATE_NFA_LIMEX(256) //NOLINT (clang-analyzer-optin.performance.Padding)
|
||||||
CREATE_NFA_LIMEX(384)
|
CREATE_NFA_LIMEX(384) //NOLINT (clang-analyzer-optin.performance.Padding)
|
||||||
CREATE_NFA_LIMEX(512)
|
CREATE_NFA_LIMEX(512) //NOLINT (clang-analyzer-optin.performance.Padding)
|
||||||
|
|
||||||
/** \brief Structure describing a bounded repeat within the LimEx NFA.
|
/** \brief Structure describing a bounded repeat within the LimEx NFA.
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -927,7 +927,7 @@ char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state,
|
|||||||
context);
|
context);
|
||||||
}
|
}
|
||||||
|
|
||||||
char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, struct mq *q) {
|
char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, const struct mq *q) {
|
||||||
const IMPL_NFA_T *limex = getImplNfa(n);
|
const IMPL_NFA_T *limex = getImplNfa(n);
|
||||||
REPORTCURRENT_FN(limex, q);
|
REPORTCURRENT_FN(limex, q);
|
||||||
return 1;
|
return 1;
|
||||||
@@ -984,9 +984,9 @@ char JOIN(LIMEX_API_ROOT, _inAccept)(const struct NFA *nfa,
|
|||||||
assert(q->state && q->streamState);
|
assert(q->state && q->streamState);
|
||||||
|
|
||||||
const IMPL_NFA_T *limex = getImplNfa(nfa);
|
const IMPL_NFA_T *limex = getImplNfa(nfa);
|
||||||
union RepeatControl *repeat_ctrl =
|
const union RepeatControl *repeat_ctrl =
|
||||||
getRepeatControlBase(q->state, sizeof(STATE_T));
|
getRepeatControlBase(q->state, sizeof(STATE_T));
|
||||||
char *repeat_state = q->streamState + limex->stateSize;
|
const char *repeat_state = q->streamState + limex->stateSize;
|
||||||
STATE_T state = *(STATE_T *)q->state;
|
STATE_T state = *(STATE_T *)q->state;
|
||||||
u64a offset = q->offset + q_last_loc(q) + 1;
|
u64a offset = q->offset + q_last_loc(q) + 1;
|
||||||
|
|
||||||
@@ -999,9 +999,9 @@ char JOIN(LIMEX_API_ROOT, _inAnyAccept)(const struct NFA *nfa, struct mq *q) {
|
|||||||
assert(q->state && q->streamState);
|
assert(q->state && q->streamState);
|
||||||
|
|
||||||
const IMPL_NFA_T *limex = getImplNfa(nfa);
|
const IMPL_NFA_T *limex = getImplNfa(nfa);
|
||||||
union RepeatControl *repeat_ctrl =
|
const union RepeatControl *repeat_ctrl =
|
||||||
getRepeatControlBase(q->state, sizeof(STATE_T));
|
getRepeatControlBase(q->state, sizeof(STATE_T));
|
||||||
char *repeat_state = q->streamState + limex->stateSize;
|
const char *repeat_state = q->streamState + limex->stateSize;
|
||||||
STATE_T state = *(STATE_T *)q->state;
|
STATE_T state = *(STATE_T *)q->state;
|
||||||
u64a offset = q->offset + q_last_loc(q) + 1;
|
u64a offset = q->offset + q_last_loc(q) + 1;
|
||||||
|
|
||||||
@@ -1020,9 +1020,9 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)(
|
|||||||
|
|
||||||
if (limex->repeatCount) {
|
if (limex->repeatCount) {
|
||||||
u64a offset = q->offset + loc + 1;
|
u64a offset = q->offset + loc + 1;
|
||||||
union RepeatControl *repeat_ctrl =
|
const union RepeatControl *repeat_ctrl =
|
||||||
getRepeatControlBase(q->state, sizeof(STATE_T));
|
getRepeatControlBase(q->state, sizeof(STATE_T));
|
||||||
char *repeat_state = q->streamState + limex->stateSize;
|
const char *repeat_state = q->streamState + limex->stateSize;
|
||||||
SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &state);
|
SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -134,7 +134,6 @@ u16 doWide16(const char *wide_entry, const u8 **c_inout, const u8 *end,
|
|||||||
len_c -= 16;
|
len_c -= 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
pos = 0;
|
|
||||||
// at least one in (0, 16).
|
// at least one in (0, 16).
|
||||||
u32 loadLength_w = MIN(len_w, 16);
|
u32 loadLength_w = MIN(len_w, 16);
|
||||||
u32 loadLength_c = MIN(len_c, 16);
|
u32 loadLength_c = MIN(len_c, 16);
|
||||||
|
|||||||
@@ -106,9 +106,10 @@ static really_inline
|
|||||||
const char *findShermanState(UNUSED const struct mcclellan *m,
|
const char *findShermanState(UNUSED const struct mcclellan *m,
|
||||||
const char *sherman_base_offset, u32 sherman_base,
|
const char *sherman_base_offset, u32 sherman_base,
|
||||||
u32 s) {
|
u32 s) {
|
||||||
const char *rv
|
const char *rv = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base);
|
||||||
= sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base);
|
// cppcheck-suppress cstyleCast
|
||||||
assert(rv < (const char *)m + m->length - sizeof(struct NFA));
|
assert(rv < (const char *)m + m->length - sizeof(struct NFA));
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET);
|
UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET);
|
||||||
assert(type == SHERMAN_STATE);
|
assert(type == SHERMAN_STATE);
|
||||||
return rv;
|
return rv;
|
||||||
@@ -123,13 +124,15 @@ char *findMutableShermanState(char *sherman_base_offset, u16 sherman_base,
|
|||||||
static really_inline
|
static really_inline
|
||||||
const char *findWideEntry8(UNUSED const struct mcclellan *m,
|
const char *findWideEntry8(UNUSED const struct mcclellan *m,
|
||||||
const char *wide_base, u32 wide_limit, u32 s) {
|
const char *wide_base, u32 wide_limit, u32 s) {
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
UNUSED u8 type = *(const u8 *)wide_base;
|
UNUSED u8 type = *(const u8 *)wide_base;
|
||||||
assert(type == WIDE_STATE);
|
assert(type == WIDE_STATE);
|
||||||
const u32 entry_offset
|
// cppcheck-suppress cstyleCast
|
||||||
= *(const u32 *)(wide_base
|
const u32 entry_offset = *(const u32 *)(wide_base
|
||||||
+ WIDE_ENTRY_OFFSET8((s - wide_limit) * sizeof(u32)));
|
+ WIDE_ENTRY_OFFSET8((s - wide_limit) * sizeof(u32)));
|
||||||
|
|
||||||
const char *rv = wide_base + entry_offset;
|
const char *rv = wide_base + entry_offset;
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
assert(rv < (const char *)m + m->length - sizeof(struct NFA));
|
assert(rv < (const char *)m + m->length - sizeof(struct NFA));
|
||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
@@ -137,21 +140,23 @@ const char *findWideEntry8(UNUSED const struct mcclellan *m,
|
|||||||
static really_inline
|
static really_inline
|
||||||
const char *findWideEntry16(UNUSED const struct mcclellan *m,
|
const char *findWideEntry16(UNUSED const struct mcclellan *m,
|
||||||
const char *wide_base, u32 wide_limit, u32 s) {
|
const char *wide_base, u32 wide_limit, u32 s) {
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
UNUSED u8 type = *(const u8 *)wide_base;
|
UNUSED u8 type = *(const u8 *)wide_base;
|
||||||
assert(type == WIDE_STATE);
|
assert(type == WIDE_STATE);
|
||||||
const u32 entry_offset
|
// cppcheck-suppress cstyleCast
|
||||||
= *(const u32 *)(wide_base
|
const u32 entry_offset = *(const u32 *)(wide_base
|
||||||
+ WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32)));
|
+ WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32)));
|
||||||
|
|
||||||
const char *rv = wide_base + entry_offset;
|
const char *rv = wide_base + entry_offset;
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
assert(rv < (const char *)m + m->length - sizeof(struct NFA));
|
assert(rv < (const char *)m + m->length - sizeof(struct NFA));
|
||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
char *findMutableWideEntry16(char *wide_base, u32 wide_limit, u32 s) {
|
char *findMutableWideEntry16(char *wide_base, u32 wide_limit, u32 s) {
|
||||||
u32 entry_offset
|
// cppcheck-suppress cstyleCast
|
||||||
= *(const u32 *)(wide_base
|
u32 entry_offset = *(const u32 *)(wide_base
|
||||||
+ WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32)));
|
+ WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32)));
|
||||||
|
|
||||||
return wide_base + entry_offset;
|
return wide_base + entry_offset;
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user