Merge branch develop into master

This commit is contained in:
Matthew Barr 2016-06-01 11:09:05 +10:00
commit e3e0a0fab0
270 changed files with 21472 additions and 15494 deletions

4
.gitignore vendored
View File

@ -46,10 +46,6 @@ sqlite3
src/config.h src/config.h
src/config.h.in src/config.h.in
src/hs_version.h src/hs_version.h
src/fdr/fdr_autogen.c
src/fdr/fdr_autogen_compiler.cpp
src/fdr/teddy_autogen.c
src/fdr/teddy_autogen_compiler.cpp
src/parser/Parser.cpp src/parser/Parser.cpp
# Generated PCRE files # Generated PCRE files

View File

@ -2,6 +2,40 @@
This is a list of notable changes to Hyperscan, in reverse chronological order. This is a list of notable changes to Hyperscan, in reverse chronological order.
## [4.2.0] 2016-05-31
- Introduce an interpreter for many complex actions to replace the use of
internal reports within the core of Hyperscan (the "Rose" engine). This
improves scanning performance and reduces database size for many pattern
sets.
- Many enhancements to the acceleration framework used by NFA and DFA engines,
including more flexible multibyte implementations and more AVX2 support. This
improves scanning performance for many pattern sets.
- Improved prefiltering support for complex patterns containing very large
bounded repeats (`R{M,N}` with large `N`).
- Improve scanning performance of pattern sets with a very large number of
EOD-anchored patterns.
- Improve scanning performance of large pattern sets that use the
`HS_FLAG_SINGLEMATCH` flag.
- Improve scanning performance of pattern sets that contain a single literal by
improving the "Noodle" literal matcher.
- Small reductions in total stream state for many pattern sets.
- Improve runtime detection of AVX2 support.
- Disable -Werror for release builds, in order to behave better for packagers
and users with different compiler combinations than those that we test.
- Improve support for building on Windows with MSVC 2015 (github issue #14).
Support for Hyperscan on Windows is still experimental.
- Small updates to fix warnings identified by Coverity.
- Remove Python codegen for the "FDR" and "Teddy" literal matchers. These are
now implemented directly in C code.
- Remove the specialist "Sidecar" engine in favour of using our more general
repeat engines.
- New API function: add the `hs_expression_ext_info()` function. This is a
variant of `hs_expression_info()` that can accept patterns with extended
parameters.
- New API error value: add the `HS_SCRATCH_IN_USE` error, which is returned
when Hyperscan detects that a scratch region is already in use on entry to an
API function.
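To make the `hs_expression_ext_info()` entry above concrete, here is a hedged usage sketch; the `hs_expr_ext_t` fields and `HS_EXT_FLAG_*` constants shown are assumed from the 4.2.0 `hs_compile.h` and are illustrative only:

#include <string.h>
#include <stdlib.h>
#include <hs/hs.h>

/* Sketch: query expression info for a pattern with extended parameters.
 * The hs_expr_ext_t fields used here (min_offset/max_offset) are assumed
 * from the 4.2.0 API; error handling is abbreviated. */
static void query_ext_info(void) {
    hs_expr_ext_t ext;
    memset(&ext, 0, sizeof(ext));
    ext.flags = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET;
    ext.min_offset = 16;
    ext.max_offset = 1024;

    hs_expr_info_t *info = NULL;
    hs_compile_error_t *compile_err = NULL;
    if (hs_expression_ext_info("foo.*bar", HS_FLAG_DOTALL, &ext, &info,
                               &compile_err) != HS_SUCCESS) {
        hs_free_compile_error(compile_err);
        return;
    }
    /* info->min_width and info->max_width are now available. */
    free(info); /* allocated with the misc allocator (malloc by default) */
}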
## [4.1.0] 2015-12-18 ## [4.1.0] 2015-12-18
- Update version of PCRE used by testing tools as a syntax and semantic - Update version of PCRE used by testing tools as a syntax and semantic
reference to PCRE 8.38. reference to PCRE 8.38.

View File

@ -2,7 +2,7 @@ cmake_minimum_required (VERSION 2.8.11)
project (Hyperscan C CXX) project (Hyperscan C CXX)
set (HS_MAJOR_VERSION 4) set (HS_MAJOR_VERSION 4)
set (HS_MINOR_VERSION 1) set (HS_MINOR_VERSION 2)
set (HS_PATCH_VERSION 0) set (HS_PATCH_VERSION 0)
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
@ -75,7 +75,7 @@ if(NOT Boost_FOUND)
set(BOOST_INCLUDEDIR "${PROJECT_SOURCE_DIR}/include") set(BOOST_INCLUDEDIR "${PROJECT_SOURCE_DIR}/include")
find_package(Boost ${BOOST_MINVERSION}) find_package(Boost ${BOOST_MINVERSION})
if(NOT Boost_FOUND) if(NOT Boost_FOUND)
message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system pacakges if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.") message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system packages if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.")
endif() endif()
endif() endif()
@ -115,7 +115,9 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
endif() endif()
#for config #for config
set(HS_OPTIMIZE OPTIMISE) if (OPTIMISE)
set(HS_OPTIMIZE ON)
endif()
CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in release builds" ON "NOT RELEASE_BUILD" OFF) CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in release builds" ON "NOT RELEASE_BUILD" OFF)
@ -171,8 +173,14 @@ else()
endif() endif()
# set compiler flags - more are tested and added later # set compiler flags - more are tested and added later
set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual -Werror") set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual")
set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Werror -Wno-shadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor") set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Wno-shadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor")
if (NOT RELEASE_BUILD)
# -Werror is most useful during development, don't potentially break
# release builds
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror")
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror")
endif()
if (NOT CMAKE_C_FLAGS MATCHES .*march.*) if (NOT CMAKE_C_FLAGS MATCHES .*march.*)
message(STATUS "Building for current host CPU") message(STATUS "Building for current host CPU")
@ -229,6 +237,9 @@ if (RELEASE_BUILD)
endif() endif()
endif() endif()
# ensure we are building for the right target arch
include (${CMAKE_MODULE_PATH}/arch.cmake)
# testing a builtin takes a little more work # testing a builtin takes a little more work
CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED) CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
@ -332,7 +343,7 @@ endif()
add_subdirectory(util) add_subdirectory(util)
add_subdirectory(unit) add_subdirectory(unit)
add_subdirectory(doc/dev-reference) add_subdirectory(doc/dev-reference)
if (EXISTS ${CMAKE_SOURCE_DIR}/tools) if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt)
add_subdirectory(tools) add_subdirectory(tools)
endif() endif()
@ -340,8 +351,15 @@ endif()
configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h) configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h)
configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h) configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h)
if (PKG_CONFIG_FOUND) if (NOT WIN32)
# we really only need to do this if we have pkg-config # expand out library names for pkgconfig static link info
foreach (LIB ${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES})
# this is fragile, but protects us from toolchain specific files
if (NOT EXISTS ${LIB})
set(PRIVATE_LIBS "${PRIVATE_LIBS} -l${LIB}")
endif()
endforeach()
configure_file(libhs.pc.in libhs.pc @ONLY) # only replace @ quoted vars configure_file(libhs.pc.in libhs.pc @ONLY) # only replace @ quoted vars
install(FILES ${CMAKE_BINARY_DIR}/libhs.pc install(FILES ${CMAKE_BINARY_DIR}/libhs.pc
DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig") DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig")
@ -352,11 +370,6 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
# include the autogen targets
add_subdirectory(src/fdr)
include_directories(${PROJECT_BINARY_DIR}/src/fdr)
if(NOT WIN32) if(NOT WIN32)
set(RAGEL_C_FLAGS "-Wno-unused") set(RAGEL_C_FLAGS "-Wno-unused")
endif() endif()
@ -376,14 +389,13 @@ SET(hs_HEADERS
) )
install(FILES ${hs_HEADERS} DESTINATION include/hs) install(FILES ${hs_HEADERS} DESTINATION include/hs)
set(fdr_autogen_targets autogen_runtime autogen_teddy_runtime)
set (hs_exec_SRCS set (hs_exec_SRCS
${hs_HEADERS} ${hs_HEADERS}
src/hs_version.h src/hs_version.h
src/ue2common.h src/ue2common.h
src/alloc.c src/alloc.c
src/allocator.h src/allocator.h
src/report.h
src/runtime.c src/runtime.c
src/fdr/fdr.c src/fdr/fdr.c
src/fdr/fdr.h src/fdr/fdr.h
@ -394,7 +406,9 @@ set (hs_exec_SRCS
src/fdr/flood_runtime.h src/fdr/flood_runtime.h
src/fdr/fdr_loadval.h src/fdr/fdr_loadval.h
src/fdr/teddy.c src/fdr/teddy.c
src/fdr/teddy.h
src/fdr/teddy_internal.h src/fdr/teddy_internal.h
src/fdr/teddy_runtime_common.h
src/hwlm/hwlm.c src/hwlm/hwlm.c
src/hwlm/hwlm.h src/hwlm/hwlm.h
src/hwlm/hwlm_internal.h src/hwlm/hwlm_internal.h
@ -437,6 +451,25 @@ set (hs_exec_SRCS
src/nfa/mpv.h src/nfa/mpv.h
src/nfa/mpv.c src/nfa/mpv.c
src/nfa/mpv_internal.h src/nfa/mpv_internal.h
src/nfa/multiaccel_common.h
src/nfa/multiaccel_doubleshift.h
src/nfa/multiaccel_doubleshiftgrab.h
src/nfa/multiaccel_long.h
src/nfa/multiaccel_longgrab.h
src/nfa/multiaccel_shift.h
src/nfa/multiaccel_shiftgrab.h
src/nfa/multishufti.c
src/nfa/multishufti_avx2.h
src/nfa/multishufti_sse.h
src/nfa/multishufti.h
src/nfa/multitruffle.c
src/nfa/multitruffle_avx2.h
src/nfa/multitruffle_sse.h
src/nfa/multitruffle.h
src/nfa/multivermicelli.c
src/nfa/multivermicelli.h
src/nfa/multivermicelli_sse.h
src/nfa/multivermicelli_avx2.h
src/nfa/nfa_api.h src/nfa/nfa_api.h
src/nfa/nfa_api_dispatch.c src/nfa/nfa_api_dispatch.c
src/nfa/nfa_internal.h src/nfa/nfa_internal.h
@ -444,20 +477,17 @@ set (hs_exec_SRCS
src/nfa/repeat.c src/nfa/repeat.c
src/nfa/repeat.h src/nfa/repeat.h
src/nfa/repeat_internal.h src/nfa/repeat_internal.h
src/nfa/shufti_common.h
src/nfa/shufti.c src/nfa/shufti.c
src/nfa/shufti.h src/nfa/shufti.h
src/nfa/truffle_common.h
src/nfa/truffle.c src/nfa/truffle.c
src/nfa/truffle.h src/nfa/truffle.h
src/nfa/vermicelli.h src/nfa/vermicelli.h
src/nfa/vermicelli_run.h src/nfa/vermicelli_run.h
src/nfa/vermicelli_sse.h src/nfa/vermicelli_sse.h
src/sidecar/sidecar.c
src/sidecar/sidecar.h
src/sidecar/sidecar_generic.h
src/sidecar/sidecar_internal.h
src/sidecar/sidecar_shufti.c
src/sidecar/sidecar_shufti.h
src/som/som.h src/som/som.h
src/som/som_operation.h
src/som/som_runtime.h src/som/som_runtime.h
src/som/som_runtime.c src/som/som_runtime.c
src/som/som_stream.c src/som/som_stream.c
@ -473,10 +503,11 @@ set (hs_exec_SRCS
src/rose/match.h src/rose/match.h
src/rose/match.c src/rose/match.c
src/rose/miracle.h src/rose/miracle.h
src/rose/program_runtime.h
src/rose/runtime.h src/rose/runtime.h
src/rose/rose_sidecar_runtime.h
src/rose/rose.h src/rose/rose.h
src/rose/rose_internal.h src/rose/rose_internal.h
src/rose/rose_program.h
src/rose/rose_types.h src/rose/rose_types.h
src/rose/rose_common.h src/rose/rose_common.h
src/util/bitutils.h src/util/bitutils.h
@ -484,7 +515,6 @@ set (hs_exec_SRCS
src/util/fatbit.h src/util/fatbit.h
src/util/fatbit.c src/util/fatbit.c
src/util/join.h src/util/join.h
src/util/masked_move.c
src/util/masked_move.h src/util/masked_move.h
src/util/multibit.h src/util/multibit.h
src/util/multibit_internal.h src/util/multibit_internal.h
@ -498,6 +528,7 @@ set (hs_exec_SRCS
src/util/shuffle_ssse3.h src/util/shuffle_ssse3.h
src/util/simd_utils.h src/util/simd_utils.h
src/util/simd_utils_ssse3.h src/util/simd_utils_ssse3.h
src/util/simd_utils_ssse3.c
src/util/state_compress.h src/util/state_compress.h
src/util/state_compress.c src/util/state_compress.c
src/util/unaligned.h src/util/unaligned.h
@ -510,6 +541,14 @@ set (hs_exec_SRCS
src/database.h src/database.h
) )
if (HAVE_AVX2)
set (hs_exec_SRCS
${hs_exec_SRCS}
src/fdr/teddy_avx2.c
src/util/masked_move.c
)
endif ()
SET (hs_SRCS SET (hs_SRCS
${hs_HEADERS} ${hs_HEADERS}
@ -574,6 +613,8 @@ SET (hs_SRCS
src/nfa/mcclellan_internal.h src/nfa/mcclellan_internal.h
src/nfa/mcclellancompile.cpp src/nfa/mcclellancompile.cpp
src/nfa/mcclellancompile.h src/nfa/mcclellancompile.h
src/nfa/mcclellancompile_accel.cpp
src/nfa/mcclellancompile_accel.h
src/nfa/mcclellancompile_util.cpp src/nfa/mcclellancompile_util.cpp
src/nfa/mcclellancompile_util.h src/nfa/mcclellancompile_util.h
src/nfa/limex_compile.cpp src/nfa/limex_compile.cpp
@ -583,6 +624,8 @@ SET (hs_SRCS
src/nfa/mpv_internal.h src/nfa/mpv_internal.h
src/nfa/mpvcompile.cpp src/nfa/mpvcompile.cpp
src/nfa/mpvcompile.h src/nfa/mpvcompile.h
src/nfa/multiaccel_compilehelper.cpp
src/nfa/multiaccel_compilehelper.h
src/nfa/nfa_api.h src/nfa/nfa_api.h
src/nfa/nfa_api_queue.h src/nfa/nfa_api_queue.h
src/nfa/nfa_api_util.h src/nfa/nfa_api_util.h
@ -762,8 +805,6 @@ SET (hs_SRCS
src/parser/unsupported.h src/parser/unsupported.h
src/parser/utf8_validate.h src/parser/utf8_validate.h
src/parser/utf8_validate.cpp src/parser/utf8_validate.cpp
src/sidecar/sidecar_compile.cpp
src/sidecar/sidecar_compile.h
src/smallwrite/smallwrite_build.cpp src/smallwrite/smallwrite_build.cpp
src/smallwrite/smallwrite_build.h src/smallwrite/smallwrite_build.h
src/smallwrite/smallwrite_internal.h src/smallwrite/smallwrite_internal.h
@ -771,6 +812,7 @@ SET (hs_SRCS
src/som/slot_manager.h src/som/slot_manager.h
src/som/slot_manager_internal.h src/som/slot_manager_internal.h
src/som/som.h src/som/som.h
src/som/som_operation.h
src/rose/rose_build.h src/rose/rose_build.h
src/rose/rose_build_add.cpp src/rose/rose_build_add.cpp
src/rose/rose_build_add_internal.h src/rose/rose_build_add_internal.h
@ -778,6 +820,8 @@ SET (hs_SRCS
src/rose/rose_build_anchored.cpp src/rose/rose_build_anchored.cpp
src/rose/rose_build_anchored.h src/rose/rose_build_anchored.h
src/rose/rose_build_bytecode.cpp src/rose/rose_build_bytecode.cpp
src/rose/rose_build_castle.h
src/rose/rose_build_castle.cpp
src/rose/rose_build_compile.cpp src/rose/rose_build_compile.cpp
src/rose/rose_build_convert.cpp src/rose/rose_build_convert.cpp
src/rose/rose_build_convert.h src/rose/rose_build_convert.h
@ -786,6 +830,8 @@ SET (hs_SRCS
src/rose/rose_build_infix.h src/rose/rose_build_infix.h
src/rose/rose_build_lookaround.cpp src/rose/rose_build_lookaround.cpp
src/rose/rose_build_lookaround.h src/rose/rose_build_lookaround.h
src/rose/rose_build_matchers.cpp
src/rose/rose_build_matchers.h
src/rose/rose_build_merge.cpp src/rose/rose_build_merge.cpp
src/rose/rose_build_merge.h src/rose/rose_build_merge.h
src/rose/rose_build_misc.cpp src/rose/rose_build_misc.cpp
@ -799,6 +845,7 @@ SET (hs_SRCS
src/rose/rose_in_graph.h src/rose/rose_in_graph.h
src/rose/rose_in_util.cpp src/rose/rose_in_util.cpp
src/rose/rose_in_util.h src/rose/rose_in_util.h
src/util/accel_scheme.h
src/util/alloc.cpp src/util/alloc.cpp
src/util/alloc.h src/util/alloc.h
src/util/bitfield.h src/util/bitfield.h
@ -820,7 +867,6 @@ SET (hs_SRCS
src/util/dump_mask.cpp src/util/dump_mask.cpp
src/util/dump_mask.h src/util/dump_mask.h
src/util/graph.h src/util/graph.h
src/util/internal_report.h
src/util/multibit_build.cpp src/util/multibit_build.cpp
src/util/multibit_build.h src/util/multibit_build.h
src/util/order_check.h src/util/order_check.h
@ -828,7 +874,6 @@ SET (hs_SRCS
src/util/partitioned_set.h src/util/partitioned_set.h
src/util/popcount.h src/util/popcount.h
src/util/queue_index_factory.h src/util/queue_index_factory.h
src/util/report.cpp
src/util/report.h src/util/report.h
src/util/report_manager.cpp src/util/report_manager.cpp
src/util/report_manager.h src/util/report_manager.h
@ -874,8 +919,6 @@ set(hs_dump_SRCS
src/parser/dump.cpp src/parser/dump.cpp
src/parser/dump.h src/parser/dump.h
src/parser/position_dump.h src/parser/position_dump.h
src/sidecar/sidecar_dump.cpp
src/sidecar/sidecar_dump.h
src/smallwrite/smallwrite_dump.cpp src/smallwrite/smallwrite_dump.cpp
src/smallwrite/smallwrite_dump.h src/smallwrite/smallwrite_dump.h
src/som/slot_manager_dump.cpp src/som/slot_manager_dump.cpp
@ -901,11 +944,9 @@ set (LIB_VERSION ${HS_VERSION})
set (LIB_SOVERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}) set (LIB_SOVERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION})
add_library(hs_exec OBJECT ${hs_exec_SRCS}) add_library(hs_exec OBJECT ${hs_exec_SRCS})
add_dependencies(hs_exec ${fdr_autogen_targets})
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
add_library(hs_exec_shared OBJECT ${hs_exec_SRCS}) add_library(hs_exec_shared OBJECT ${hs_exec_SRCS})
add_dependencies(hs_exec_shared ${fdr_autogen_targets})
set_target_properties(hs_exec_shared PROPERTIES set_target_properties(hs_exec_shared PROPERTIES
POSITION_INDEPENDENT_CODE TRUE) POSITION_INDEPENDENT_CODE TRUE)
endif() endif()
@ -929,14 +970,16 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
OUTPUT_NAME hs_runtime OUTPUT_NAME hs_runtime
MACOSX_RPATH ON MACOSX_RPATH ON
LINKER_LANGUAGE C) LINKER_LANGUAGE C)
install(TARGETS hs_runtime_shared DESTINATION lib) install(TARGETS hs_runtime_shared
RUNTIME DESTINATION bin
ARCHIVE DESTINATION lib
LIBRARY DESTINATION lib)
endif() endif()
# we want the static lib for testing # we want the static lib for testing
add_library(hs STATIC ${hs_SRCS} $<TARGET_OBJECTS:hs_exec>) add_library(hs STATIC ${hs_SRCS} $<TARGET_OBJECTS:hs_exec>)
add_dependencies(hs ragel_Parser) add_dependencies(hs ragel_Parser)
add_dependencies(hs autogen_compiler autogen_teddy_compiler)
if (NOT BUILD_SHARED_LIBS) if (NOT BUILD_SHARED_LIBS)
install(TARGETS hs DESTINATION lib) install(TARGETS hs DESTINATION lib)
@ -945,13 +988,15 @@ endif()
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
add_library(hs_shared SHARED ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_shared>) add_library(hs_shared SHARED ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_shared>)
add_dependencies(hs_shared ragel_Parser) add_dependencies(hs_shared ragel_Parser)
add_dependencies(hs_shared autogen_compiler autogen_teddy_compiler)
set_target_properties(hs_shared PROPERTIES set_target_properties(hs_shared PROPERTIES
OUTPUT_NAME hs OUTPUT_NAME hs
VERSION ${LIB_VERSION} VERSION ${LIB_VERSION}
SOVERSION ${LIB_SOVERSION} SOVERSION ${LIB_SOVERSION}
MACOSX_RPATH ON) MACOSX_RPATH ON)
install(TARGETS hs_shared DESTINATION lib) install(TARGETS hs_shared
RUNTIME DESTINATION bin
ARCHIVE DESTINATION lib
LIBRARY DESTINATION lib)
endif() endif()
if(NOT WIN32) if(NOT WIN32)

42
cmake/arch.cmake Normal file
View File

@ -0,0 +1,42 @@
# detect architecture features
#
# must be called after determining where compiler intrinsics are defined
if (HAVE_C_X86INTRIN_H)
set (INTRIN_INC_H "x86intrin.h")
elseif (HAVE_C_INTRIN_H)
set (INTRIN_INC_H "intrin.h")
else ()
message (FATAL_ERROR "No intrinsics header found")
endif ()
set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
int main() {
__m128i a = _mm_set1_epi8(1);
(void)_mm_shuffle_epi8(a, a);
}" HAVE_SSSE3)
if (NOT HAVE_SSSE3)
message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
endif ()
# now look for AVX2
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX2__)
#error no avx2
#endif
int main(){
__m256i z = _mm256_setzero_si256();
(void)_mm256_xor_si256(z, z);
}" HAVE_AVX2)
if (NOT HAVE_AVX2)
message(STATUS "Building without AVX2 support")
endif ()
unset (CMAKE_REQUIRED_FLAGS)
unset (INTRIN_INC_H)

View File

@ -15,9 +15,6 @@
/* internal build, switch on dump support. */ /* internal build, switch on dump support. */
#cmakedefine DUMP_SUPPORT #cmakedefine DUMP_SUPPORT
/* Build tools with threading support */
#cmakedefine ENABLE_TOOLS_THREADS
/* Define to 1 if `backtrace' works. */ /* Define to 1 if `backtrace' works. */
#cmakedefine HAVE_BACKTRACE #cmakedefine HAVE_BACKTRACE
@ -39,10 +36,6 @@
/* C compiler has intrin.h */ /* C compiler has intrin.h */
#cmakedefine HAVE_C_INTRIN_H #cmakedefine HAVE_C_INTRIN_H
/* Define to 1 if you have the declaration of `pthread_barrier_init', and to 0
if you don't. */
#cmakedefine HAVE_DECL_PTHREAD_BARRIER_INIT
/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
0 if you don't. */ 0 if you don't. */
#cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP
@ -59,9 +52,6 @@
/* Define to 1 if `posix_memalign' works. */ /* Define to 1 if `posix_memalign' works. */
#cmakedefine HAVE_POSIX_MEMALIGN #cmakedefine HAVE_POSIX_MEMALIGN
/* Define to 1 if you have the <pthread.h> header file. */
#cmakedefine HAVE_PTHREAD_H
/* Define to 1 if you have the `setrlimit' function. */ /* Define to 1 if you have the `setrlimit' function. */
#cmakedefine HAVE_SETRLIMIT #cmakedefine HAVE_SETRLIMIT

View File

@ -119,12 +119,21 @@ The following regex constructs are supported by Hyperscan:
* The anchors :regexp:`^`, :regexp:`$`, :regexp:`\\A`, :regexp:`\\Z` and * The anchors :regexp:`^`, :regexp:`$`, :regexp:`\\A`, :regexp:`\\Z` and
:regexp:`\\z`. :regexp:`\\z`.
* Option modifiers for: * Option modifiers:
* Case-sensitivity: :regexp:`(?i)` and :regexp:`(?-i)` These allow behaviour to be switched on (with :regexp:`(?<option>)`) and off
* Multi-line: :regexp:`(?m)` and :regexp:`(?-m)` (with :regexp:`(?-<option>)`) for a sub-pattern. The supported options are:
* Dot-all: :regexp:`(?s)` and :regexp:`(?-s)`
* Extended syntax: :regexp:`(?s)` and :regexp:`(?-s)` * :regexp:`i`: Case-insensitive matching, as per
:c:member:`HS_FLAG_CASELESS`.
* :regexp:`m`: Multi-line matching, as per :c:member:`HS_FLAG_MULTILINE`.
* :regexp:`s`: Interpret ``.`` as "any character", as per
:c:member:`HS_FLAG_DOTALL`.
* :regexp:`x`: Extended syntax, which will ignore most whitespace in the
pattern for compatibility with libpcre's ``PCRE_EXTENDED`` option.
For example, the expression :regexp:`foo(?i)bar(?-i)baz` will switch on
case-insensitive matching *only* for the ``bar`` portion of the match.
* The :regexp:`\\b` and :regexp:`\\B` zero-width assertions (word boundary and * The :regexp:`\\b` and :regexp:`\\B` zero-width assertions (word boundary and
'not word boundary', respectively). 'not word boundary', respectively).
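As a hedged illustration of the option modifiers described above, the sketch below compiles the example expression ``foo(?i)bar(?-i)baz`` in block mode; the helper function name is invented for illustration and error handling is abbreviated::

#include <stdio.h>
#include <hs/hs.h>

/* Sketch: the inline (?i)/(?-i) modifiers make only "bar" caseless, whereas
 * passing HS_FLAG_CASELESS to hs_compile() would apply to the whole pattern. */
static hs_database_t *compile_example(void) {
    hs_database_t *db = NULL;
    hs_compile_error_t *compile_err = NULL;
    if (hs_compile("foo(?i)bar(?-i)baz", 0 /* no global flags */,
                   HS_MODE_BLOCK, NULL, &db, &compile_err) != HS_SUCCESS) {
        fprintf(stderr, "compile failed: %s\n", compile_err->message);
        hs_free_compile_error(compile_err);
        return NULL;
    }
    return db;
}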

View File

@ -44,7 +44,7 @@ master_doc = 'index'
# General information about the project. # General information about the project.
project = u'Hyperscan' project = u'Hyperscan'
copyright = u'2015, Intel Corporation' copyright = u'2015-2016, Intel Corporation'
# The version info for the project you're documenting, acts as replacement for # The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the # |version| and |release|, also used in various other places throughout the

View File

@ -30,4 +30,4 @@ and/or other countries.
\*Other names and brands may be claimed as the property of others. \*Other names and brands may be claimed as the property of others.
Copyright |copy| 2015, Intel Corporation. All rights reserved. Copyright |copy| 2015-2016, Intel Corporation. All rights reserved.

View File

@ -15,6 +15,7 @@ Hyperscan |version| Developer's Reference Guide
getting_started getting_started
compilation compilation
runtime runtime
serialization
performance performance
api_constants api_constants
api_files api_files

View File

@ -124,13 +124,19 @@ databases, only a single scratch region is necessary: in this case, calling
will ensure that the scratch space is large enough to support scanning against will ensure that the scratch space is large enough to support scanning against
any of the given databases. any of the given databases.
Importantly, only one such space is required per thread and can (and indeed While the Hyperscan library is re-entrant, the use of scratch spaces is not.
should) be allocated before data scanning is to commence. In a scenario where a For example, if by design it is deemed necessary to run recursive or nested
set of expressions are compiled by a single "master" thread and data will be scanning (say, from the match callback function), then an additional scratch
scanned by multiple "worker" threads, the convenience function space is required for that context.
:c:func:`hs_clone_scratch` allows multiple copies of an existing scratch space
to be made for each thread (rather than forcing the caller to pass all the In the absence of recursive scanning, only one such space is required per thread
compiled databases through :c:func:`hs_alloc_scratch` multiple times). and can (and indeed should) be allocated before data scanning is to commence.
In a scenario where a set of expressions are compiled by a single "master"
thread and data will be scanned by multiple "worker" threads, the convenience
function :c:func:`hs_clone_scratch` allows multiple copies of an existing
scratch space to be made for each thread (rather than forcing the caller to pass
all the compiled databases through :c:func:`hs_alloc_scratch` multiple times).
For example: For example:
@ -163,14 +169,6 @@ For example:
/* Now two threads can both scan against database db, /* Now two threads can both scan against database db,
each with its own scratch space. */ each with its own scratch space. */
While the Hyperscan library is re-entrant, the use of scratch spaces is not.
For example, if by design it is deemed necessary to run recursive or nested
scanning (say, from the match callback function), then an additional scratch
space is required for that context.
The easiest way to achieve this is to build up a single scratch space as a
prototype, then clone it for each context:
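A minimal sketch of that prototype-and-clone approach (assuming ``db`` is an existing compiled database; error handling abbreviated)::

#include <hs/hs.h>

/* Sketch: build one scratch space as a prototype, then clone it for each
 * additional context (e.g. per worker thread). *prototype and *per_thread
 * must be NULL on entry; error handling is abbreviated. */
static hs_error_t make_thread_scratch(const hs_database_t *db,
                                      hs_scratch_t **prototype,
                                      hs_scratch_t **per_thread) {
    hs_error_t err = hs_alloc_scratch(db, prototype);
    if (err != HS_SUCCESS) {
        return err;
    }
    /* Each additional context gets its own clone of the prototype. */
    return hs_clone_scratch(*prototype, per_thread);
}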
***************** *****************
Custom Allocators Custom Allocators
***************** *****************

View File

@ -0,0 +1,67 @@
.. _serialization:
#############
Serialization
#############
For some applications, compiling Hyperscan pattern databases immediately prior
to use is not an appropriate design. Some users may wish to:
* Compile pattern databases on a different host;
* Persist compiled databases to storage and only re-compile pattern databases
when the patterns change;
* Control the region of memory in which the compiled database is located.
Hyperscan pattern databases are not completely flat in memory: they contain
pointers and have specific alignment requirements. Therefore, they cannot be
copied (or otherwise relocated) directly. To enable these use cases, Hyperscan
provides functionality for serializing and deserializing compiled pattern
databases.
The API provides the following functions:
#. :c:func:`hs_serialize_database`: serializes a pattern database into a
flat relocatable buffer of bytes.
#. :c:func:`hs_deserialize_database`: reconstructs a newly allocated pattern
database from the output of :c:func:`hs_serialize_database`.
#. :c:func:`hs_deserialize_database_at`: reconstructs a pattern
database at a given memory location from the output of
:c:func:`hs_serialize_database`.
#. :c:func:`hs_serialized_database_size`: given a serialized pattern database,
returns the size of the memory block required by the database when
deserialized.
#. :c:func:`hs_serialized_database_info`: given a serialized pattern database,
returns a string containing information about the database. This call is
analogous to :c:func:`hs_database_info`.
.. note:: Hyperscan performs both version and platform compatibility checks
upon deserialization. The :c:func:`hs_deserialize_database` and
:c:func:`hs_deserialize_database_at` functions will only permit the
deserialization of databases compiled with (a) the same version of Hyperscan
and (b) platform features supported by the current host platform. See
:ref:`instr_specialization` for more information on platform specialization.
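A hedged sketch of the serialize/deserialize round trip described above (assuming ``db`` is a compiled database; error handling abbreviated, and the serialized buffer is assumed to come from the default ``malloc``-based misc allocator)::

#include <stdlib.h>
#include <hs/hs.h>

/* Sketch: serialize a compiled database to a flat buffer, then rebuild a
 * new database from those bytes. Error handling is abbreviated. */
static hs_database_t *round_trip(const hs_database_t *db) {
    char *bytes = NULL;
    size_t length = 0;
    if (hs_serialize_database(db, &bytes, &length) != HS_SUCCESS) {
        return NULL;
    }
    /* (bytes, length) could now be written to disk or sent to another host. */

    hs_database_t *rebuilt = NULL;
    if (hs_deserialize_database(bytes, length, &rebuilt) != HS_SUCCESS) {
        rebuilt = NULL;
    }
    free(bytes); /* serialized buffer uses the misc allocator (malloc by default) */
    return rebuilt;
}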
===================
The Runtime Library
===================
The main Hyperscan library (``libhs``) contains both the compiler and runtime
portions of the library. Because the Hyperscan compiler is written in C++, this
library requires C++ linkage and has a dependency on the C++ standard library.
Many embedded applications require only the scanning ("runtime") portion of the
Hyperscan library. In these cases, pattern compilation generally takes place on
another host, and serialized pattern databases are delivered to the application
for use.
To support these applications without requiring the C++ dependency, a
runtime-only version of the Hyperscan library, called ``libhs_runtime``, is also
distributed. This library does not depend on the C++ standard library and
provides all Hyperscan functions other than those used to compile databases.
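As a hedged sketch of such a runtime-only consumer (the header split shown and the linking note are assumptions; error handling abbreviated)::

#include <stddef.h>
#include <stdio.h>
#include <hs/hs_common.h>
#include <hs/hs_runtime.h>

/* Sketch of a runtime-only consumer: rebuild a received database and scan a
 * block of data with it. Only runtime-side functions are used, so linking
 * against libhs_runtime should suffice; error handling is abbreviated. */
static int on_match(unsigned int id, unsigned long long from,
                    unsigned long long to, unsigned int flags, void *ctx) {
    (void)from; (void)flags; (void)ctx;
    printf("match for pattern %u ending at offset %llu\n", id, to);
    return 0; /* continue matching */
}

static void scan_serialized(const char *bytes, size_t length,
                            const char *data, unsigned int data_len) {
    hs_database_t *db = NULL;
    hs_scratch_t *scratch = NULL;
    if (hs_deserialize_database(bytes, length, &db) != HS_SUCCESS) {
        return;
    }
    if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS) {
        hs_free_database(db);
        return;
    }
    hs_scan(db, data, data_len, 0, scratch, on_match, NULL);
    hs_free_scratch(scratch);
    hs_free_database(db);
}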

View File

@ -7,4 +7,5 @@ Name: libhs
Description: Intel(R) Hyperscan Library Description: Intel(R) Hyperscan Library
Version: @HS_VERSION@ Version: @HS_VERSION@
Libs: -L${libdir} -lhs Libs: -L${libdir} -lhs
Libs.private: @PRIVATE_LIBS@
Cflags: -I${includedir}/hs Cflags: -I${includedir}/hs

View File

@ -1,39 +0,0 @@
# The set of rules and other nastiness for generating FDR/Teddy source
# we need to add these as explicit dependencies
set(AUTOGEN_PY_FILES
arch.py
autogen.py
autogen_utils.py
base_autogen.py
fdr_autogen.py
teddy_autogen.py
)
function(fdr_autogen type out)
add_custom_command (
COMMENT "AUTOGEN ${out}"
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${out}"
COMMAND ${PYTHON} "${CMAKE_CURRENT_SOURCE_DIR}/autogen.py" ${type} > "${CMAKE_CURRENT_BINARY_DIR}/${out}"
DEPENDS ${AUTOGEN_PY_FILES}
)
add_custom_target(autogen_${type} DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/${out}")
endfunction(fdr_autogen)
#now build the functions
fdr_autogen(runtime fdr_autogen.c)
fdr_autogen(compiler fdr_autogen_compiler.cpp)
fdr_autogen(teddy_runtime teddy_autogen.c)
fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp)
set(fdr_GENERATED_SRC
${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c
${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp
${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c
${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
PARENT_SCOPE)
set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE)
include_directories(${CMAKE_CURRENT_BINARY_DIR})

View File

@ -1,58 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import autogen_utils
# wrapper for architectures
class Arch:
def __init__(self, name, extensions = []):
self.name = name
self.extensions = extensions
self.target = None
def get_guard(self):
# these defines definitely fall into the "belt-and-suspenders"
# category of paranoia
if (self.guard_list == []):
return "#if 1"
return "#if " + " && ".join(self.guard_list)
class X86Arch(Arch):
def __init__(self, name, extensions = []):
Arch.__init__(self, name, extensions)
self.guard_list = [ ]
self.target = "0"
if "AVX2" in extensions:
self.target += " | HS_CPU_FEATURES_AVX2"
self.guard_list += [ "defined(__AVX2__)" ]
arch_x86_64 = X86Arch("x86_64", extensions = [ ])
arch_x86_64_avx2 = X86Arch("x86_64_avx2", extensions = [ "AVX2" ])

View File

@ -1,154 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
from autogen_utils import *
from fdr_autogen import *
from teddy_autogen import *
from arch import *
# FDR setup
# these are either produced - if the guard succeeds, or #defined to zeroes.
# either the function or the zero is fine in our array of function pointers
def produce_fdr_runtimes(l):
for m in l:
m.produce_code()
def produce_fdr_compiles(l):
print "void getFdrDescriptions(vector<FDREngineDescription> *out) {"
print " static const FDREngineDef defns[] = {"
for m in l:
m.produce_compile_call()
print " };"
print " out->clear();"
print " for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
print " out->push_back(FDREngineDescription(defns[i]));"
print " }"
print "}"
def build_fdr_matchers():
all_matchers = [ ]
strides = [ 1, 2, 4 ]
common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
for s in strides:
all_matchers += [ M3(stride = s, **common) ]
return all_matchers
# teddy setup
def build_teddy_matchers():
all_matchers = [ ]
# AVX2
all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = False) ]
all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = True) ]
for n_msk in range(1, 5):
all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = False, num_masks = n_msk, num_buckets = 16) ]
all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = True, num_masks = n_msk, num_buckets = 16) ]
# SSE/SSE2/SSSE3
for n_msk in range(1, 5):
all_matchers += [ MT(arch = arch_x86_64, packed = False, num_masks = n_msk, num_buckets = 8) ]
all_matchers += [ MT(arch = arch_x86_64, packed = True, num_masks = n_msk, num_buckets = 8) ]
return all_matchers
def produce_teddy_compiles(l):
print "void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {"
print " static const TeddyEngineDef defns[] = {"
for m in l:
m.produce_compile_call()
print " };"
print " out->clear();"
print " for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
print " out->push_back(TeddyEngineDescription(defns[i]));"
print " }"
print "}"
# see below - we don't produce our 'zeros' at the point of the teddy runtimes as they
# are linked. So we either generate the function or we don't - then at the point of the
# header in fdr_autogen.c we either generate the header or we #define the zero.
def produce_teddy_runtimes(l):
# Since we're using -Wmissing-prototypes, we need headers first.
for m in l:
m.produce_guard()
print m.produce_header(visible = True, header_only = True)
m.close_guard()
for m in l:
m.produce_guard()
m.produce_code()
m.close_guard()
# see produce_teddy_runtimes() comment for the rationale
def produce_teddy_headers(l):
for m in l:
m.produce_guard()
print m.produce_header(visible = True, header_only = True)
m.produce_zero_alternative()
# general utilities
def make_fdr_function_pointers(matcher_list):
print """
typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
static FDRFUNCTYPE funcs[] = {
"""
all_funcs = ",\n".join([ " %s" % m.get_name() for m in matcher_list ])
print all_funcs
print """
};
"""
def assign_ids(matcher_list, next_id):
for m in matcher_list:
m.id = next_id
next_id += 1
return next_id
# Main entry point
m = build_fdr_matchers()
next_id = assign_ids(m, 0)
tm = build_teddy_matchers()
next_id = assign_ids(tm, next_id)
if sys.argv[1] == "compiler":
produce_fdr_compiles(m)
elif sys.argv[1] == "runtime":
produce_fdr_runtimes(m)
produce_teddy_headers(tm)
make_fdr_function_pointers(m+tm)
elif sys.argv[1] == "teddy_runtime":
produce_teddy_runtimes(tm)
elif sys.argv[1] == "teddy_compiler":
produce_teddy_compiles(tm)

View File

@ -1,285 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
def fail_out(msg = ""):
print >>sys.stderr, "Internal failure in autogen.py: " + msg
sys.exit(1)
class IntegerType:
def __init__(self, size):
self.size = size
def get_name(self):
return { 256: "m256", 128 : "m128", 64 : "u64a", 32 : "u32" , 16 : "u16", 8 : "u8"}[self.size]
def size_in_bytes(self):
return self.size / 8
def isSIMDOnIntel(self):
return False
def zero_expression(self):
return "0"
def constant_to_string(self, n):
if self.size == 64:
suffix = "ULL"
else:
suffix = ""
return "0x%x%s" % (n & ((1 << self.size) - 1), suffix)
def lowbits(self, n):
return (1 << n) - 1
def highbits(self, n):
return ~(self.lowbits(self.size - n))
def lowbit_mask(self, n):
return self.constant_to_string(self.lowbits(n))
def highbit_mask(self, n):
return self.constant_to_string(self.highbits(n))
def lowbit_extract_expr(self, expr_string, n):
return "(%s & %s)" % ( expr_string, self.lowbit_mask(n))
def highbit_extract_expr(self, expr_string, n):
return "(%s >> %d)" % (expr_string, self.size - n)
def flip_lowbits_expr(self, expr_string, n):
return "(%s ^ %s)" % ( expr_string, self.lowbit_mask(n))
def bit_extract_expr(self, expr_string, low, high):
lbm = self.lowbit_mask(high - low)
return "((%s >> %d) & %s)" % (expr_string, low, lbm)
# shifts are +ve if left and -ve if right
def shift_expr(self, expr_string, n):
if n <= -self.size or n >= self.size:
return self.zero_expression()
elif (n > 0):
return "(%s << %d)" % (expr_string, n)
elif (n < 0):
return "(%s >> %d)" % (expr_string, -n)
else:
return "(%s)" % (expr_string)
# code is:
# "normal" (always between buf and len) - the default
# "aligned" (means normal + aligned to a natural boundary)
# "cautious_forward" (means may go off the end of buf+len)
# "cautious_backwards" (means may go off the start of buf)
# "cautious_everywhere" (means may go off both)
def load_expr_data(self, offset = 0, code = "normal",
base_string = "ptr", bounds_lo = "buf", bounds_hi = "buf + len"):
if code is "normal":
return "lv_%s(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
elif code is "aligned":
if self.size is 8:
fail_out("no aligned byte loads")
return "lv_%s_a(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
elif code is "cautious_forward":
return "lv_%s_cf(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
elif code is "cautious_backward":
return "lv_%s_cb(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
elif code is "cautious_everywhere":
return "lv_%s_ce(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
class SIMDIntegerType(IntegerType):
def __init__(self, size):
IntegerType.__init__(self, size)
def isSIMDOnIntel(self):
return True
def zero_expression(self):
return "zeroes128()"
def lowbit_extract_expr(self, expr_string, n):
if (n <= 32):
tmpType = IntegerType(32)
tmpExpr = "movd(%s)" % expr_string
elif (32 < n <= 64):
tmpType = IntegerType(64)
tmpExpr = "movq(%s)" % expr_string
return tmpType.lowbit_extract_expr(tmpExpr, n)
def highbit_extract_expr(self, expr_string, n):
fail_out("Unimplemented high bit extract on m128")
def bit_extract_expr(self, expr_string, low, high, flip):
fail_out("Unimplemented bit extract on m128")
def shift_expr(self, expr_string, n):
if n % 8 != 0:
fail_out("Trying to shift a m128 by a bit granular value")
# should check that n is divisible by 8
if n <= -self.size or n >= self.size:
return self.zero_expression()
elif (n > 0):
return "_mm_slli_si128(%s, %s)" % (expr_string, n / 8)
elif (n < 0):
return "_mm_srli_si128(%s, %s)" % (expr_string, -n / 8)
else:
return "(%s)" % (expr_string)
def lowbit_mask(self, n):
if n % 8 != 0:
fail_out("Trying to make a lowbit mask in a m128 by a bit granular value")
return self.shift_expr("ones128()", -(128 - n))
def getRequiredType(bits):
if bits == 128:
return SIMDIntegerType(bits)
for b in [ 8, 16, 32, 64]:
if (bits <= b):
return IntegerType(b)
return None
class IntegerVariable:
def __init__(self, name, type):
self.name = name
self.type = type
def gen_initializer_stmt(self, initialization_string = None):
if initialization_string:
return "%s %s = %s;" % (self.type.get_name(), self.name, initialization_string)
else:
return "%s %s;" % (self.type.get_name(), self.name)
class Step:
def __init__(self, context, offset = 0):
self.context = context
self.matcher = context.matcher
self.offset = offset
self.latency = 1
self.dependency_list = []
self.latest = None
self.context.add_step(self)
# return a string, complete with indentation
def emit(self):
indent = " " * (self.offset*2 + self.matcher.default_body_indent)
s = "\n".join( [ indent + line for line in self.val.split("\n")] )
if self.latest:
s += " // " + str(self.debug_step) + " L" + str(self.latency) + " LTST:%d" % self.latest
if self.dependency_list:
s += " Derps: "
for (d,l) in self.dependency_list:
s += "%d/%d " % (d.debug_step,l)
return s
def add_dependency(self, step, anti_dependency = False, output_dependency = False):
if anti_dependency or output_dependency:
self.dependency_list += [ (step, 1) ]
else:
self.dependency_list += [ (step, step.latency) ]
def nv(self, type, var_name):
return self.context.new_var(self, type, var_name)
def gv(self, var_name, reader = True, writer = False):
return self.context.get_var(self, var_name, reader = reader, writer = writer)
# utility steps, generic
class LabelStep(Step):
def __init__(self, context, offset = 0, label_prefix = "off"):
Step.__init__(self, context, offset)
self.val = "%s%d: UNUSED;" % (label_prefix, offset)
class OpenScopeStep(Step):
def __init__(self, context, offset = 0):
Step.__init__(self, context, offset)
self.val = "{"
class CloseScopeStep(Step):
def __init__(self, context, offset = 0):
Step.__init__(self, context, offset)
self.val = "}"
class CodeGenContext:
def __init__(self, matcher):
self.vars = {}
self.steps = []
self.ctr = 0
self.matcher = matcher
self.var_writer = {} # var to a single writer
self.var_readers = {} # var to a list of all the readers that read the last value
def new_var(self, step, type, var_name):
var = IntegerVariable(var_name, type)
self.vars[var_name] = var
self.var_writer[var_name] = step
return var
def get_var(self, step, var_name, reader = True, writer = False):
if reader:
writer_step = self.var_writer[var_name]
if writer_step:
step.add_dependency(writer_step)
self.var_readers.setdefault(var_name, []).append(step)
if writer and not reader:
if self.var_writer[var_name]:
step.add_dependency(self.var_writer[var_name], output_dependency = True)
if writer:
if self.var_readers.has_key(var_name):
for reader in [ r for r in self.var_readers[var_name] if r is not step ]:
step.add_dependency(reader, anti_dependency = True)
self.var_readers[var_name] = []
self.var_writer[var_name] = step
return self.vars[var_name]
def add_step(self, step):
self.steps += [ step ]
step.debug_step = self.ctr
self.ctr += 1
def dontschedule(self, finals):
return "\n".join( [ s.emit() for s in self.steps ] )
def schedule(self, finals):
for f in finals:
f.latest = f.latency
worklist = finals
while worklist:
current = worklist[0]
worklist = worklist[1:]
for (dep, lat) in current.dependency_list:
if dep.latest is None or dep.latest < (current.latest + dep.latency):
dep.latest = current.latest + lat
if dep not in worklist:
worklist += [ dep ]
self.steps.sort(reverse = True, key = lambda s : s.latest)
return "\n".join( [ s.emit() for s in self.steps ] )

View File

@ -1,167 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
from autogen_utils import *
from base_autogen import *
from string import Template
class MatcherBase:
def __init__(self):
pass
def get_name(self):
return "fdr_exec_%03d" % self.id
def produce_header(self, visible, header_only = False):
s = ""
if not visible:
s += "static never_inline"
s += """
hwlm_error_t %s(UNUSED const struct FDR *fdr,
UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
if header_only:
s += ";"
else:
s += "{"
s += "\n"
return s
def produce_guard(self):
print self.arch.get_guard()
def produce_zero_alternative(self):
print """
#else
#define %s 0
#endif
""" % self.get_name()
# trivial function for documentation/modularity
def close_guard(self):
print "#endif"
def produce_common_declarations(self):
return """
const u8 * buf = a->buf;
const size_t len = a->len;
const u8 * ptr = buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t * control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 * tryFloodDetect = a->firstFloodDetect;
UNUSED u32 bit, bitRem, confSplit, idx;
u32 byte, cf;
const struct FDRConfirm *fdrc;
u32 last_match = (u32)-1;
"""
def produce_continue_check(self):
return """if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
*a->groups = controlVal;
return HWLM_TERMINATED;
}
"""
def produce_flood_check(self):
return """
if (P0(ptr > tryFloodDetect)) {
tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
*a->groups = controlVal;
return HWLM_TERMINATED;
}
}
"""
def produce_footer(self):
return """
*a->groups = controlVal;
return HWLM_SUCCESS;
}
"""
def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False):
if cautious:
caution_string = "VECTORING"
else:
caution_string = "NOT_CAUTIOUS"
conf_split_mask = IntegerType(32).constant_to_string(
self.conf_top_level_split - 1)
if enable_confirmless:
quick_check_string = """
if (!fdrc->mult) {
u32 id = fdrc->nBitsOrSoleID;
if ((last_match == id) && (fdrc->flags & NoRepeat))
continue;
last_match = id;
controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt);
continue;
} """
else:
quick_check_string = ""
if do_bailout:
bailout_string = """
if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;"""
else:
bailout_string = ""
return Template("""
if (P0(!!$CONFVAR)) {
do {
bit = findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR);
byte = bit / $NUM_BUCKETS + $OFFSET;
bitRem = bit % $NUM_BUCKETS;
$BAILOUT_STRING
confSplit = *(ptr+byte) & $SPLIT_MASK;
idx = confSplit * $NUM_BUCKETS + bitRem;
cf = confBase[idx];
if (!cf)
continue;
fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);
if (!(fdrc->groups & *control))
continue;
$QUICK_CHECK_STRING
confWithBit(fdrc, a, ptr - buf + byte, $CAUTION_STRING, $CONF_PULL_BACK, control, &last_match);
} while(P0(!!$CONFVAR));
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
*a->groups = controlVal;
return HWLM_TERMINATED;
}
}""").substitute(CONFVAR = conf_var_name,
CONFVAR_SIZE = conf_var_size,
NUM_BUCKETS = self.num_buckets,
OFFSET = offset,
SPLIT_MASK = conf_split_mask,
QUICK_CHECK_STRING = quick_check_string,
BAILOUT_STRING = bailout_string,
CAUTION_STRING = caution_string,
CONF_PULL_BACK = self.conf_pull_back)
def indent(block, depth):
return "\n".join([ (" " * (4*depth)) + line for line in block.splitlines() ] )

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -55,6 +55,7 @@ public:
u32 getNumBuckets() const { return numBuckets; } u32 getNumBuckets() const { return numBuckets; }
u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; } u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; }
u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; } u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; }
void setConfirmTopLevelSplit(u32 split) { confirmTopLevelSplit = split; }
bool isValidOnTarget(const target_t &target_in) const; bool isValidOnTarget(const target_t &target_in) const;
virtual u32 getDefaultFloodSuffixLength() const = 0; virtual u32 getDefaultFloodSuffixLength() const = 0;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -26,34 +26,790 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "util/simd_utils.h"
#define P0(cnd) unlikely(cnd)
#include "fdr.h" #include "fdr.h"
#include "fdr_internal.h"
#include "teddy_internal.h"
#include "flood_runtime.h"
#include "fdr_confirm.h" #include "fdr_confirm.h"
#include "fdr_confirm_runtime.h" #include "fdr_confirm_runtime.h"
#include "fdr_streaming_runtime.h" #include "fdr_internal.h"
#include "fdr_loadval.h" #include "fdr_loadval.h"
#include "fdr_autogen.c" #include "fdr_streaming_runtime.h"
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h"
/** \brief number of bytes processed in each iteration */
#define ITER_BYTES 16
/** \brief total zone buffer size */
#define ZONE_TOTAL_SIZE 64
/** \brief maximum number of allowed zones */
#define ZONE_MAX 3
/** \brief zone information.
*
* Zone represents a region of data to scan in FDR.
*
* The incoming buffer is split into multiple zones to ensure two properties:
* 1: that we can read 8? bytes behind to generate a hash safely
* 2: that we can read the byte after the current byte (domain > 8)
*/
struct zone {
/** \brief copied buffer, used only when it is a boundary zone. */
u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];
/** \brief shift amount for fdr state to avoid unwanted match. */
u8 shift;
/** \brief if boundary zone, start points into the zone buffer after the
* pre-padding. Otherwise, points to the main buffer, appropriately. */
const u8 *start;
/** \brief if boundary zone, end points to the end of zone. Otherwise,
* pointer to the main buffer, appropriately. */
const u8 *end;
/** \brief the amount to adjust to go from a pointer in the zones region
* (between start and end) to a pointer in the original data buffer. */
ptrdiff_t zone_pointer_adjust;
/** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
* otherwise end of the zone buf. floodPtr always points inside the same
* buffer as the start pointer. */
const u8 *floodPtr;
};
static
const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
};
/* Generates an initial state mask based on the last byte or so of history
* rather than being all accepting. If there is no history to consider, the
* state is generated based on the minimum length of each bucket, in order to
* prevent spurious confirms.
*/
static really_inline
m128 getInitState(const struct FDR *fdr, u8 len_history, const u8 *ft,
const struct zone *z) {
m128 s;
if (len_history) {
/* +1: the zones ensure that we can read the byte at z->end */
u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1);
tmp &= fdr->domainMask;
s = *((const m128 *)ft + tmp);
s = shiftRight8Bits(s);
} else {
s = fdr->start;
}
return s;
}
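/* Editor's illustrative sketch (not part of the original source; helper name
 * is hypothetical): the 16-bit load above straddles the last byte of history
 * (or previously scanned data) and the first byte the zone will report from,
 * so the initial state comes from the reach-table entry for that
 * boundary-spanning value rather than the all-accepting fdr->start. A scalar
 * model of the index construction, assuming a little-endian host as the load
 * helpers do: */
static really_inline
u32 boundary_domain_index(u8 prev_byte, u8 next_byte, u32 domainMask) {
    u32 v = (u32)prev_byte | ((u32)next_byte << 8); /* lv_u16 equivalent */
    return v & domainMask;                          /* m128 index into ft */
}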
static really_inline
void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
u64a *conf8, m128 *s) {
/* +1: the zones ensure that we can read the byte at z->end */
u64a current_data_0;
u64a current_data_8;
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
u64a v7 = (lv_u16(itPtr + 7, start_ptr, end_ptr + 1) << 1) &
domain_mask_adjusted;
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
u64a v1 = (current_data_0 >> 7) & domain_mask_adjusted;
u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted;
u64a v3 = (current_data_0 >> 23) & domain_mask_adjusted;
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
u64a v5 = (current_data_0 >> 39) & domain_mask_adjusted;
u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted;
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
u64a v15 = (lv_u16(itPtr + 15, start_ptr, end_ptr + 1) << 1) &
domain_mask_adjusted;
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
u64a v9 = (current_data_8 >> 7) & domain_mask_adjusted;
u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted;
u64a v11 = (current_data_8 >> 23) & domain_mask_adjusted;
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
u64a v13 = (current_data_8 >> 39) & domain_mask_adjusted;
u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted;
m128 st0 = *(const m128 *)(ft + v0*8);
m128 st1 = *(const m128 *)(ft + v1*8);
m128 st2 = *(const m128 *)(ft + v2*8);
m128 st3 = *(const m128 *)(ft + v3*8);
m128 st4 = *(const m128 *)(ft + v4*8);
m128 st5 = *(const m128 *)(ft + v5*8);
m128 st6 = *(const m128 *)(ft + v6*8);
m128 st7 = *(const m128 *)(ft + v7*8);
m128 st8 = *(const m128 *)(ft + v8*8);
m128 st9 = *(const m128 *)(ft + v9*8);
m128 st10 = *(const m128 *)(ft + v10*8);
m128 st11 = *(const m128 *)(ft + v11*8);
m128 st12 = *(const m128 *)(ft + v12*8);
m128 st13 = *(const m128 *)(ft + v13*8);
m128 st14 = *(const m128 *)(ft + v14*8);
m128 st15 = *(const m128 *)(ft + v15*8);
st1 = byteShiftLeft128(st1, 1);
st2 = byteShiftLeft128(st2, 2);
st3 = byteShiftLeft128(st3, 3);
st4 = byteShiftLeft128(st4, 4);
st5 = byteShiftLeft128(st5, 5);
st6 = byteShiftLeft128(st6, 6);
st7 = byteShiftLeft128(st7, 7);
st9 = byteShiftLeft128(st9, 1);
st10 = byteShiftLeft128(st10, 2);
st11 = byteShiftLeft128(st11, 3);
st12 = byteShiftLeft128(st12, 4);
st13 = byteShiftLeft128(st13, 5);
st14 = byteShiftLeft128(st14, 6);
st15 = byteShiftLeft128(st15, 7);
*s = or128(*s, st0);
*s = or128(*s, st1);
*s = or128(*s, st2);
*s = or128(*s, st3);
*s = or128(*s, st4);
*s = or128(*s, st5);
*s = or128(*s, st6);
*s = or128(*s, st7);
*conf0 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st9);
*s = or128(*s, st10);
*s = or128(*s, st11);
*s = or128(*s, st12);
*s = or128(*s, st13);
*s = or128(*s, st14);
*s = or128(*s, st15);
*conf8 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf8 ^= ~0ULL;
}
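/* Editor's illustrative sketch (not part of the original source; helper name
 * is hypothetical): the load/shift/mask sequence above is equivalent to
 * taking, for each byte position i of the iteration, the low `domain` bits of
 * a little-endian 16-bit load at itPtr + i, pre-doubled so that multiplying
 * by 8 indexes the 16-byte reach-table rows directly. */
static really_inline
u64a scalar_domain_index(const u8 *itPtr, u32 i, u64a domain_mask_adjusted) {
    u64a v = (u64a)itPtr[i] | ((u64a)itPtr[i + 1] << 8); /* u16 at itPtr + i */
    return (v << 1) & domain_mask_adjusted;              /* same as v0..v15 */
}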
static really_inline
void get_conf_stride_2(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
u64a *conf8, m128 *s) {
u64a current_data_0;
u64a current_data_8;
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted;
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted;
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted;
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted;
m128 st0 = *(const m128 *)(ft + v0*8);
m128 st2 = *(const m128 *)(ft + v2*8);
m128 st4 = *(const m128 *)(ft + v4*8);
m128 st6 = *(const m128 *)(ft + v6*8);
m128 st8 = *(const m128 *)(ft + v8*8);
m128 st10 = *(const m128 *)(ft + v10*8);
m128 st12 = *(const m128 *)(ft + v12*8);
m128 st14 = *(const m128 *)(ft + v14*8);
st2 = byteShiftLeft128(st2, 2);
st4 = byteShiftLeft128(st4, 4);
st6 = byteShiftLeft128(st6, 6);
st10 = byteShiftLeft128(st10, 2);
st12 = byteShiftLeft128(st12, 4);
st14 = byteShiftLeft128(st14, 6);
*s = or128(*s, st0);
*s = or128(*s, st2);
*s = or128(*s, st4);
*s = or128(*s, st6);
*conf0 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st10);
*s = or128(*s, st12);
*s = or128(*s, st14);
*conf8 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf8 ^= ~0ULL;
}
static really_inline
void get_conf_stride_4(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
u64a *conf8, m128 *s) {
u64a current_data_0;
u64a current_data_8;
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
m128 st0 = *(const m128 *)(ft + v0*8);
m128 st4 = *(const m128 *)(ft + v4*8);
m128 st8 = *(const m128 *)(ft + v8*8);
m128 st12 = *(const m128 *)(ft + v12*8);
st4 = byteShiftLeft128(st4, 4);
st12 = byteShiftLeft128(st12, 4);
*s = or128(*s, st0);
*s = or128(*s, st4);
*conf0 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st12);
*conf8 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf8 ^= ~0ULL;
}
static really_inline
void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *controlVal,
const u32 *confBase, const struct FDR_Runtime_Args *a,
const u8 *ptr, hwlmcb_rv_t *control, u32 *last_match_id,
struct zone *z) {
const u8 bucket = 8;
const u8 pullback = 1;
if (likely(!*conf)) {
return;
}
/* ptr is currently referring to a location in the zone's buffer, we also
* need a pointer in the original, main buffer for the final string compare.
*/
const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust);
const u8 *confLoc = ptr;
do {
u32 bit = findAndClearLSB_64(conf);
u32 byte = bit / bucket + offset;
u32 bitRem = bit % bucket;
u32 confSplit = *(ptr + byte);
u32 idx = confSplit * bucket + bitRem;
u32 cf = confBase[idx];
if (!cf) {
continue;
}
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
if (!(fdrc->groups & *control)) {
continue;
}
if (!fdrc->mult) {
u32 id = fdrc->nBitsOrSoleID;
if ((*last_match_id == id) && (fdrc->flags & NoRepeat)) {
continue;
}
*last_match_id = id;
*controlVal = a->cb(ptr_main + byte - a->buf,
ptr_main + byte - a->buf, id, a->ctxt);
continue;
}
u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a));
confWithBit(fdrc, a, ptr_main - a->buf + byte, pullback,
control, last_match_id, confVal);
} while (unlikely(!!*conf));
}
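/* Editor's illustrative sketch (not part of the original source; helper name
 * is hypothetical): the loop above visits the set bits of a 64-bit confirm
 * word via findAndClearLSB_64(); bit / 8 selects the byte position within
 * this 8-byte half of the iteration and bit % 8 selects the bucket. A
 * standalone model using a compiler builtin in place of that helper: */
static really_inline
void for_each_conf_bit(u64a conf) {
    while (conf) {
        u32 bit = (u32)__builtin_ctzll(conf); /* index of lowest set bit */
        conf &= conf - 1;                     /* clear it */
        u32 byte = bit / 8;                   /* byte position in this half */
        u32 bucket = bit % 8;                 /* bucket within that byte */
        (void)byte;
        (void)bucket;
    }
}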
static really_inline
void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) {
#ifdef DEBUG
DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
z->start, z->end, z->shift);
DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n",
z->zone_pointer_adjust, z->floodPtr);
DEBUG_PRINTF("zone buf:");
for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) {
if (i % 8 == 0) {
printf("_");
}
if (z->buf[i]) {
printf("%02x", z->buf[i]);
} else {
printf("..");
}
}
printf("\n");
#endif
}
/**
* \brief Updates attributes for non-boundary region zone.
*/
static really_inline
void createMainZone(const u8 *flood, const u8 *begin, const u8 *end,
struct zone *z) {
z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */
z->start = begin;
z->end = end;
z->floodPtr = flood;
z->shift = 0;
}
/**
* \brief Create zone for short cases (<= ITER_BYTES).
*
* For this case we need to copy everything into the zone's internal buffer.
*
* We need to ensure that we run over real data if it exists (in history or
* before zone begin). We also need to ensure 8 bytes before any data being
* matched can be read (to perform a conf hash).
*
* We also need to ensure that the data at z->end can be read.
*
* Hence, the zone consists of:
* 16 bytes of history,
 * 1 - 24 bytes of data from the buffer (ending at end),
* 1 byte of final padding
*/
static really_inline
void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin,
const u8 *end, struct zone *z) {
/* the floodPtr for boundary zones is set to the end of the zone buf so that
* the flood-detection checks are skipped inside boundary zones. */
z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
ptrdiff_t z_len = end - begin;
assert(z_len > 0);
assert(z_len <= ITER_BYTES);
z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */
static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */
/* we are guaranteed to always have 16 initialised bytes at the end of
* the history buffer (they may be garbage coming from the stream state
* preceding hbuf, but bytes that don't correspond to actual history
* shouldn't affect computations). */
*(m128 *)z->buf = loadu128(hend - sizeof(m128));
/* The amount of data we have to copy from main buffer. */
size_t copy_len = MIN((size_t)(end - buf),
ITER_BYTES + sizeof(CONF_TYPE));
u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET;
switch (copy_len) {
case 1:
*zone_data = *(end - 1);
break;
case 2:
*(u16 *)zone_data = unaligned_load_u16(end - 2);
break;
case 3:
*(u16 *)zone_data = unaligned_load_u16(end - 3);
*(zone_data + 2) = *(end - 1);
break;
case 4:
*(u32 *)zone_data = unaligned_load_u32(end - 4);
break;
case 5:
case 6:
case 7:
/* perform copy with 2 overlapping 4-byte chunks from buf. */
*(u32 *)zone_data = unaligned_load_u32(end - copy_len);
unaligned_store_u32(zone_data + copy_len - sizeof(u32),
unaligned_load_u32(end - sizeof(u32)));
break;
case 8:
*(u64a *)zone_data = unaligned_load_u64a(end - 8);
break;
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
/* perform copy with 2 overlapping 8-byte chunks from buf. */
*(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
unaligned_store_u64a(zone_data + copy_len - sizeof(u64a),
unaligned_load_u64a(end - sizeof(u64a)));
break;
case 16:
/* copy 16-bytes from buf. */
*(m128 *)zone_data = loadu128(end - 16);
break;
default:
assert(copy_len <= sizeof(m128) + sizeof(u64a));
/* perform copy with (potentially overlapping) 8-byte and 16-byte chunks.
*/
*(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
storeu128(zone_data + copy_len - sizeof(m128),
loadu128(end - sizeof(m128)));
break;
}
/* set the start and end location of the zone buf
* to be scanned */
u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len;
assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES);
/* copy the post-padding byte; this is required for domain > 8 due to
* overhang */
*z_end = 0;
z->end = z_end;
z->start = z_end - ITER_BYTES;
z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
assert(z->start + z->shift == z_end - z_len);
}
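/* Editor's illustrative sketch (not part of the original source; helper name
 * is hypothetical): the 5-7 and 9-15 byte cases above use the
 * overlapping-chunk copy idiom, shown standalone here for 9-15 bytes. The two
 * 8-byte moves may overlap in the middle but together cover exactly `len`
 * bytes, avoiding a per-byte loop. */
static really_inline
void copy_overlapping_u64a(u8 *dst, const u8 *src_end, size_t len) {
    assert(len > sizeof(u64a) && len < 2 * sizeof(u64a));
    unaligned_store_u64a(dst, unaligned_load_u64a(src_end - len));
    unaligned_store_u64a(dst + len - sizeof(u64a),
                         unaligned_load_u64a(src_end - sizeof(u64a)));
}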
/**
* \brief Create a zone for the start region.
*
* This function requires that there is > ITER_BYTES of data in the buffer to
* scan. The start zone itself is always responsible for scanning exactly
* ITER_BYTES of data - there are no warmup/junk bytes scanned.
*
* This zone ensures that the byte at z->end can be read and corresponds to
* the next byte of data.
*
* 8 bytes of history data are provided before z->start to allow proper hash
 * generation in streaming mode. If buf != begin, up to 8 bytes of data
 * prior to begin are also provided.
*
* Although we are not interested in bare literals which start before begin
* if buf != begin, lookarounds associated with the literal may require
* the data prior to begin for hash purposes.
*/
static really_inline
void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin,
struct zone *z) {
assert(ITER_BYTES == sizeof(m128));
assert(sizeof(CONF_TYPE) == 8);
static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE);
const u8 *end = begin + ITER_BYTES;
/* set floodPtr to the end of zone buf to avoid checks in start zone */
z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
z->shift = 0; /* we are processing ITER_BYTES of real data */
/* we are guaranteed to always have 16 initialised bytes at the end of the
* history buffer (they may be garbage coming from the stream state
* preceding hbuf, but bytes that don't correspond to actual history
* shouldn't affect computations). However, for start zones, history is only
* required for conf hash purposes so we only need 8 bytes */
unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a)));
/* The amount of data we have to copy from main buffer. */
size_t copy_len = MIN((size_t)(end - buf),
ITER_BYTES + sizeof(CONF_TYPE));
assert(copy_len >= 16);
/* copy the post-padding byte; this is required for domain > 8 due to
 * overhang. The start zone requires that there is data after the zone, so it
 * is safe to dereference end. */
z->buf[ZONE_START_BEGIN + copy_len] = *end;
/* set the start and end location of the zone buf to be scanned */
u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len;
z->end = z_end;
z->start = z_end - ITER_BYTES;
/* copy the first 8 bytes of the valid region */
unaligned_store_u64a(z->buf + ZONE_START_BEGIN,
unaligned_load_u64a(end - copy_len));
/* copy the last 16 bytes, may overlap with the previous 8 byte write */
storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
}
/**
* \brief Create a zone for the end region.
*
* This function requires that there is > ITER_BYTES of data in the buffer to
 * scan. The end zone, however, is only responsible for scanning the <=
* ITER_BYTES rump of data. The end zone is required to handle a full ITER_BYTES
* iteration as the main loop cannot handle the last byte of the buffer.
*
* This zone ensures that the byte at z->end can be read by filling it with a
* padding character.
*
 * Up to 8 bytes of data prior to begin are also provided for the purposes of
 * generating hashes. History is not copied, as all locations which require
 * history for generating a hash are the responsibility of the start zone.
*/
static really_inline
void createEndZone(const u8 *buf, const u8 *begin, const u8 *end,
struct zone *z) {
/* the floodPtr for boundary zones is set to the end of the zone buf so that
* the flood-detection checks are skipped inside boundary zones. */
z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
ptrdiff_t z_len = end - begin;
assert(z_len > 0);
assert(z_len <= ITER_BYTES);
z->shift = ITER_BYTES - z_len;
/* The amount of data we have to copy from main buffer. */
size_t copy_len = MIN((size_t)(end - buf),
ITER_BYTES + sizeof(CONF_TYPE));
assert(copy_len >= 16);
/* copy the post-padding byte; this is required for domain > 8 due to
* overhang */
z->buf[copy_len] = 0;
/* set the start and end location of the zone buf
* to be scanned */
u8 *z_end = z->buf + copy_len;
z->end = z_end;
z->start = z_end - ITER_BYTES;
assert(z->start + z->shift == z_end - z_len);
/* copy the first 8 bytes of the valid region */
unaligned_store_u64a(z->buf, unaligned_load_u64a(end - copy_len));
/* copy the last 16 bytes, may overlap with the previous 8 byte write */
storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
}
/**
* \brief Prepare zones.
*
 * This function prepares zones over the actual buffer plus some padding bytes.
 * The actual ITER_BYTES of data in each zone are preceded by bytes from the
 * main buf and/or history buf, and followed by padding bytes possibly taken
 * from the main buf, if available.
*/
static really_inline
size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
size_t start, const u8 *flood, struct zone *zoneArr) {
const u8 *ptr = buf + start;
size_t remaining = len - start;
if (remaining <= ITER_BYTES) {
/* enough bytes to make only one zone */
createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]);
return 1;
}
/* enough bytes to make more than one zone */
size_t numZone = 0;
createStartZone(buf, hend, ptr, &zoneArr[numZone++]);
ptr += ITER_BYTES;
assert(ptr < buf + len);
/* find maximum buffer location that the main zone can scan
* - must be a multiple of ITER_BYTES, and
* - cannot contain the last byte (due to overhang)
*/
const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 1, ITER_BYTES);
assert(main_end >= ptr);
/* create a zone if multiple of ITER_BYTES are found */
if (main_end != ptr) {
createMainZone(flood, ptr, main_end, &zoneArr[numZone++]);
ptr = main_end;
}
/* create a zone with rest of the data from the main buffer */
createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]);
return numZone;
}
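/* Editor's worked example (hypothetical numbers and helper, not part of the
 * original source): with ITER_BYTES == 16, a 100-byte buffer and start == 0,
 * prepareZones() yields three zones: a start zone over [0, 16), a main zone
 * over [16, 96) since ROUNDDOWN_N(99, 16) == 96, and an end zone over
 * [96, 100) scanned with shift == 12 so that only the final 4 bytes can
 * report. The helper below models that arithmetic for the multi-zone path
 * (len - start > ITER_BYTES). */
static void example_zone_bounds(size_t len, size_t start, size_t *main_begin,
                                size_t *main_end, size_t *end_shift) {
    *main_begin = start + ITER_BYTES;                 /* after the start zone */
    *main_end = start + ((len - start - 1) & ~(size_t)(ITER_BYTES - 1));
    size_t end_begin = *main_end > *main_begin ? *main_end : *main_begin;
    *end_shift = ITER_BYTES - (len - end_begin);      /* rescanned bytes */
}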
#define INVALID_MATCH_ID (~0U)
#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \
do { \
const u8 *tryFloodDetect = zz->floodPtr; \
const u8 *start_ptr = zz->start; \
const u8 *end_ptr = zz->end; \
\
for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
itPtr += ITER_BYTES) { \
if (unlikely(itPtr > tryFloodDetect)) { \
tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\
&floodBackoff, &controlVal, \
ITER_BYTES); \
if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \
return HWLM_TERMINATED; \
} \
} \
__builtin_prefetch(itPtr + (ITER_BYTES*4)); \
u64a conf0; \
u64a conf8; \
get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_adjusted, \
ft, &conf0, &conf8, &s); \
do_confirm_fdr(&conf0, 0, &controlVal, confBase, a, itPtr, \
control, &last_match_id, zz); \
do_confirm_fdr(&conf8, 8, &controlVal, confBase, a, itPtr, \
control, &last_match_id, zz); \
if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \
return HWLM_TERMINATED; \
} \
} /* end for loop */ \
} while (0) \
static never_inline
hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
u32 last_match_id = INVALID_MATCH_ID;
u64a domain_mask_adjusted = fdr->domainMask << 1;
u8 stride = fdr->stride;
const u8 *ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
const u32 *confBase = (const u32 *)(ft + fdr->tabSize);
struct zone zones[ZONE_MAX];
assert(fdr->domain > 8 && fdr->domain < 16);
size_t numZone = prepareZones(a->buf, a->len,
a->buf_history + a->len_history,
a->start_offset, a->firstFloodDetect, zones);
assert(numZone <= ZONE_MAX);
m128 state = getInitState(fdr, a->len_history, ft, &zones[0]);
for (size_t curZone = 0; curZone < numZone; curZone++) {
struct zone *z = &zones[curZone];
dumpZoneInfo(z, curZone);
/* When a zone contains less data than is processed in an iteration
* of FDR_MAIN_LOOP(), we need to scan over some extra data.
*
* We have chosen to scan this extra data at the start of the
 * iteration. The extra data is either data we have already scanned or
 * garbage (if it is earlier than offset 0).
*
* As a result we need to shift the incoming state back so that it will
* properly line up with the data being scanned.
*
* We also need to forbid reporting any matches in the data being
* rescanned as they have already been reported (or are over garbage but
* later stages should also provide that safety guarantee).
*/
u8 shift = z->shift;
state = variable_byte_shift_m128(state, shift);
state = or128(state, load128(zone_or_mask[shift]));
switch (stride) {
case 1:
FDR_MAIN_LOOP(z, state, get_conf_stride_1);
break;
case 2:
FDR_MAIN_LOOP(z, state, get_conf_stride_2);
break;
case 4:
FDR_MAIN_LOOP(z, state, get_conf_stride_4);
break;
default:
break;
}
}
return HWLM_SUCCESS;
}
#if defined(__AVX2__)
#define ONLY_AVX2(func) func
#else
#define ONLY_AVX2(func) NULL
#endif
typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
static const FDRFUNCTYPE funcs[] = {
fdr_engine_exec,
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fast),
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fast),
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks2_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks2_pck_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks3_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks3_pck_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks4_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks4_pck_fat),
fdr_exec_teddy_msks1,
fdr_exec_teddy_msks1_pck,
fdr_exec_teddy_msks2,
fdr_exec_teddy_msks2_pck,
fdr_exec_teddy_msks3,
fdr_exec_teddy_msks3_pck,
fdr_exec_teddy_msks4,
fdr_exec_teddy_msks4_pck,
};
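/* Editor's note (assumption, not shown in this hunk; helper name is
 * hypothetical): the table above is expected to be indexed by the engine
 * identifier stored in the FDR header, roughly as sketched below.
 * fdr->engineID and the dispatch shape are assumed from elsewhere in the
 * tree rather than from this diff. */
static really_inline
hwlm_error_t fdr_dispatch(const struct FDR *fdr,
                          const struct FDR_Runtime_Args *a) {
    return funcs[fdr->engineID](fdr, a);
}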
#define FAKE_HISTORY_SIZE 16
static const u8 fake_history[FAKE_HISTORY_SIZE];

-hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, size_t start,
-                     HWLMCallback cb, void *ctxt, hwlm_group_t groups) {
+hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
+                     size_t start, HWLMCallback cb, void *ctxt,
+                     hwlm_group_t groups) {
+    // We guarantee (for safezone construction) that it is safe to read 16
+    // bytes before the end of the history buffer.
+    const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE;
+
    const struct FDR_Runtime_Args a = {
        buf,
        len,
-        fake_history,
+        hbuf,
        0,
-        fake_history, // nocase
+        hbuf, // nocase
        0,
        start,
        cb,
@ -73,7 +829,7 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, size_t st
hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
                              size_t hlen, const u8 *buf, size_t len,
                              size_t start, HWLMCallback cb, void *ctxt,
-                              hwlm_group_t groups, u8 * stream_state) {
+                              hwlm_group_t groups, u8 *stream_state) {
    struct FDR_Runtime_Args a = {
        buf,
        len,
@ -86,9 +842,9 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
        ctxt,
        &groups,
        nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
-        hbuf ? CONF_LOADVAL_CALL_CAUTIOUS(hbuf + hlen - 8, hbuf, hbuf + hlen)
-             : (u64a)0
+        /* we are guaranteed to always have 16 initialised bytes at the end of
+         * the history buffer (they may be garbage). */
+        hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0
    };
    fdrUnpackState(fdr, &a, stream_state);

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -43,9 +43,6 @@ extern "C" {
struct FDR;

-/** \brief Returns size in bytes of the given FDR engine. */
-size_t fdrSize(const struct FDR *fdr);
-
/** \brief Returns non-zero if the contents of the stream state indicate that
 * there is active FDR history beyond the regularly used history. */
u32 fdrStreamStateActive(const struct FDR *fdr, const u8 *stream_state);

View File

@ -1,564 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
from autogen_utils import *
from base_autogen import *
from string import Template
class OrStep(Step):
def __init__(self, context, offset, width):
Step.__init__(self, context, offset)
s_var = self.gv("st%d" % offset)
if width < 128:
self.val = "s |= %s;" % s_var.name
else:
self.val = "s = or%d(s, %s);" % (width, s_var.name)
class ShiftStateStep(Step):
def __init__(self, context, offset = 0, stride_used = 1):
Step.__init__(self, context, offset)
m = self.matcher
state = m.state_variable
shift_distance = -1 * stride_used * m.num_buckets
self.val = "%s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance))
class BulkLoadStep(Step):
def __init__(self, context, offset, size, define_var = True, aligned = True):
Step.__init__(self, context, offset)
m = self.matcher
self.latency = 4
blt = m.bulk_load_type
if aligned:
init_string = blt.load_expr_data(self.offset, code = "aligned")
else:
init_string = blt.load_expr_data(self.offset)
var_name = "current_data_%d" % offset
if define_var:
lb_var = self.nv(blt, var_name)
self.val = lb_var.gen_initializer_stmt(init_string)
else:
lb_var = self.gv(var_name, reader = False, writer = True)
self.val = "%s = %s;" % (var_name, init_string)
class ValueExtractStep(Step):
def __init__(self, context, offset, sub_load_cautious = False):
Step.__init__(self, context, offset)
m = self.matcher
self.latency = 2
dsb = m.datasize_bytes
modval = offset % dsb
if modval == dsb - 1:
# Case 1: reading more than one byte over the end of the bulk load
self.latency = 4
if sub_load_cautious:
code_string = "cautious_forward"
else:
code_string = "normal"
load_string = m.single_load_type.load_expr_data(self.offset, code_string)
temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust)
else:
# Case 2: reading a value that can be found entirely in the current register
if m.fdr2_force_naive_load:
load_string = m.single_load_type.load_expr_data(self.offset, "normal")
temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust)
else:
lb_var = self.gv("current_data_%d" % (offset - modval))
if modval == 0:
# Case 2a: value is at LSB end of the register and must be left-
# shifted into place if there is a "reach_shift_adjust" required
temp_string = "(%s << %d)" % (lb_var.name, m.reach_shift_adjust)
else:
# Case 2b: value is in the middle of the register and will be
# right-shifted into place (adjusted by "reach_shift_adjust")
temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust)
v_var = self.nv(m.value_extract_type, "v%d" % offset)
self.val = v_var.gen_initializer_stmt(init_string)
class TableLookupStep(Step):
def __init__(self, context, reach_multiplier, offset = 0):
Step.__init__(self, context, offset)
m = self.matcher
self.latency = 4
v_var = self.gv("v%d" % offset)
s_var = self.nv(m.state_type, "st%d" % offset)
init_string = "*(const %s *)(ft + %s*%dU)" % ( m.state_type.get_name(),
v_var.name, reach_multiplier)
self.val = s_var.gen_initializer_stmt(init_string)
class ShiftReachMaskStep(Step):
def __init__(self, context, offset):
Step.__init__(self, context, offset)
m = self.matcher
extr = m.extract_frequency
modval = offset % extr
s_var = self.gv("st%d" % offset, writer = True)
self.val = "%s = %s;" % (s_var.name, s_var.type.shift_expr(s_var.name, modval * m.num_buckets))
class ConfExtractStep(Step):
def __init__(self, context, offset):
Step.__init__(self, context, offset)
m = self.matcher
if m.state_type.isSIMDOnIntel():
self.latency = 2
init_string = m.state_type.lowbit_extract_expr("s", m.extract_size)
extr_var = self.nv(m.extr_type, "extr%d" % offset)
self.val = extr_var.gen_initializer_stmt(init_string)
class ConfAccumulateStep(Step):
def __init__(self, context, extract_offset, conf_offset, define_var = True):
Step.__init__(self, context, extract_offset)
m = self.matcher
extr_var = self.gv("extr%d" % extract_offset)
extr_var_cast = "((%s)%s)" % (m.conf_type.get_name(), extr_var.name)
if extract_offset == conf_offset:
# create conf_var as a straight copy of extr
if define_var:
conf_var = self.nv(m.conf_type, "conf%d" % conf_offset)
self.val = conf_var.gen_initializer_stmt(extr_var_cast)
else:
conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True)
self.val = "%s = %s;" % (conf_var.name, extr_var_cast)
else:
# shift extr_var and insert/OR it in conf_var
conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True)
shift_dist = (extract_offset - conf_offset) * m.num_buckets
self.val = "%s |= %s;" % (conf_var.name, m.conf_type.shift_expr(extr_var_cast, shift_dist))
self.latency = 2
class ConfirmFlipStep(Step):
def __init__(self, context, offset):
Step.__init__(self, context, offset)
m = self.matcher
conf_var = self.gv("conf%d" % self.offset, writer = True)
self.val = "%s = %s;" % (conf_var.name,
conf_var.type.flip_lowbits_expr(conf_var.name, self.matcher.confirm_frequency * m.num_buckets))
class ConfirmStep(Step):
def __init__(self, context, offset, cautious = False):
Step.__init__(self, context, offset)
m = self.matcher
conf_var = self.gv("conf%d" % offset, writer = True)
self.val = m.produce_confirm_base(conf_var.name, conf_var.type.size, offset, cautious,
enable_confirmless = m.stride == 1, do_bailout = False)
class M3(MatcherBase):
def produce_compile_call(self):
print " { %d, %d, %d, %d, %s, %d, %d }," % (
self.id, self.state_width, self.num_buckets,
self.stride,
self.arch.target, self.conf_pull_back, self.conf_top_level_split)
def produce_main_loop(self, switch_variant = False):
stride_offsets = xrange(0, self.loop_bytes, self.stride)
stride_offsetSet = set(stride_offsets)
so_steps_last_block = []
sh = None
last_confirm = None
ctxt = CodeGenContext(self)
if switch_variant:
print " ptr -= (iterBytes - dist);"
print " { " # need an extra scope around switch variant to stop its globals escaping
else:
print " if (doMainLoop) {"
print " for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
print self.produce_flood_check()
print " __builtin_prefetch(ptr + (iterBytes*4));"
print " assert(((size_t)ptr % START_MOD) == 0);"
# just do globally for now
if switch_variant:
subsidiary_load_cautious = True
confirm_cautious = True
else:
subsidiary_load_cautious = False
confirm_cautious = False
if not self.fdr2_force_naive_load:
bulk_load_steps = [ off for off in range(self.loop_bytes)
if off % self.datasize_bytes == 0 and
(set(range(off, off + self.datasize_bytes - 1)) & stride_offsetSet)]
else:
bulk_load_steps = []
confirm_steps = [ off for off in range(self.loop_bytes) if off % self.confirm_frequency == 0 ]
for off in bulk_load_steps:
lb_var = ctxt.new_var(None, self.bulk_load_type, "current_data_%d" % off)
print " " + lb_var.gen_initializer_stmt()
for off in confirm_steps:
var_name = "conf%d" % off
conf_def_var = ctxt.new_var(None, self.conf_type, var_name)
if switch_variant:
init_string = "(%s)-1" % self.conf_type.get_name()
else:
init_string = ""
print " " + conf_def_var.gen_initializer_stmt(init_string)
if switch_variant:
print " switch(iterBytes - dist) {"
for i in range(0, self.loop_bytes):
print " case %d:" % i
# init and poison conf; over-precise but harmless
conf_id = (i / self.confirm_frequency) * self.confirm_frequency
if i % self.confirm_frequency:
conf_fixup_bits = self.conf_type.size - (self.num_buckets * (i % self.confirm_frequency))
print " conf%d >>= %d;" % (conf_id, conf_fixup_bits)
else:
print " conf%d = 0;" % conf_id
# init state
state_fixup = i % self.extract_frequency
state = self.state_variable
shift_distance = self.num_buckets * state_fixup
if state_fixup:
print " %s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance))
if self.state_width < 128:
print " %s |= %s;" % (state.name, state.type.lowbit_mask(shift_distance))
else:
print " %s = or%d(%s, %s);" % (state.name, self.state_width, state.name, state.type.lowbit_mask(shift_distance))
if not self.fdr2_force_naive_load:
# init current_data (could poison it in some cases)
load_mod = i % self.datasize_bytes
load_offset = i - load_mod
if load_mod:
# not coming in on an even boundary means having to do a load var
# actually, there are a bunch of things we can do on this bulk load
# to avoid having to be 'cautious_backwards' but I'm not completely
# sure they are good ideas
init_string = self.bulk_load_type.load_expr_data(load_offset,
code = "cautious_backward")
var_name = "current_data_%d" % load_offset
lb_var = ctxt.get_var(None, var_name, reader = False, writer = True)
print " %s = %s;" % (lb_var.name, init_string)
print " goto off%d;" % i
print " case %d: goto skipSwitch;" % self.loop_bytes
print " }"
print " {"
for off in range(self.loop_bytes):
# X_mod is the offset we're up to relative to the last X operation
# X_offset is which of the last X operations matches this iteration
if (switch_variant):
LabelStep(ctxt, off)
if off in bulk_load_steps:
if not self.fdr2_force_naive_load:
BulkLoadStep(ctxt, off, self.datasize, define_var = False, aligned = not switch_variant)
if off in stride_offsets:
if switch_variant:
OpenScopeStep(ctxt, off)
ValueExtractStep(ctxt, off, sub_load_cautious = subsidiary_load_cautious)
TableLookupStep(ctxt, self.reach_mult, off)
if off % self.extract_frequency:
ShiftReachMaskStep(ctxt, off)
so = OrStep(ctxt, off, self.state_width)
if switch_variant:
CloseScopeStep(ctxt, off)
if sh != None:
so.add_dependency(sh)
so_steps_last_block += [ so ]
extract_mod = off % self.extract_frequency
extract_offset = off - extract_mod
extract_ready = extract_mod == self.extract_frequency - 1
if extract_ready:
if switch_variant:
OpenScopeStep(ctxt, off)
ex = ConfExtractStep(ctxt, extract_offset)
ConfAccumulateStep(ctxt, extract_offset, confirm_offset, define_var = False)
for so_step in so_steps_last_block:
ex.add_dependency(so_step)
if switch_variant:
CloseScopeStep(ctxt, off)
so_steps_last_block = []
sh = ShiftStateStep(ctxt, extract_offset, stride_used = self.extract_frequency)
sh.add_dependency(ex)
confirm_mod = off % self.confirm_frequency
confirm_offset = off - confirm_mod
confirm_ready = confirm_mod == self.confirm_frequency - 1
if confirm_ready:
cflip = ConfirmFlipStep(ctxt, confirm_offset)
cf = ConfirmStep(ctxt, confirm_offset, cautious = confirm_cautious )
if last_confirm:
cf.add_dependency(last_confirm)
last_confirm = cf
if not switch_variant:
print ctxt.schedule([ last_confirm, sh ])
else:
print ctxt.dontschedule([ last_confirm, sh ])
if switch_variant:
print "skipSwitch:;"
print " ptr += iterBytes;"
print " }" # close extra scope around switch variant
print " }"
def produce_init_state(self):
state = self.state_variable
s_type = self.state_type
shift_distance = -1 * self.num_buckets
shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
s = Template("""
$TYPENAME s;
if (a->len_history) {
u32 tmp = 0;
if (a->start_offset == 0) {
tmp = a->buf_history[a->len_history - 1];
tmp |= (a->buf[0] << 8);
} else {
tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
}
tmp &= fdr->domainMask;
s = *((const $TYPENAME *)ft + tmp);
$SHIFT_EXPR;
} else {
s = *(const $TYPENAME *)&fdr->start;
}
""").substitute(TYPENAME = s_type.get_name(),
ZERO_EXPR = s_type.zero_expression(),
SHIFT_EXPR = shift_expr)
return s
def produce_code(self):
loop_read_behind = 0
loop_read_ahead = self.loop_bytes + 1
# we set up mask and shift stuff for extracting our masks from registers
#
# we have a choice as to whether to mask out the value early or
# extract the value (shift first) then mask it
#
# Intel has a free scaling factor from 1/2/4/8 so we want to combine
# the extra needed shift for SSE registers with the mask operation
ssb = self.state_type.size / 8 # state size in bytes
# Intel path
if ssb == 16:
# obscure corner - we don't have the room in the register to
# do this for all values so we don't. domain==16 is pretty
# bad anyhow, of course
self.reach_mult = 8
else:
self.reach_mult = ssb
shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
print self.produce_header(visible = False)
print "// ",
print " Arch: " + self.arch.name,
print " State type: " + self.state_type.get_name(),
print " Num buckets: %d" % self.num_buckets,
print " Stride: %d" % self.stride
print self.produce_common_declarations()
print " assert(fdr->domain > 8 && fdr->domain < 16);"
print
print " u64a domain_mask = fdr->domainMask;"
print " const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));"
print " const u32 * confBase = (const u32 *)(ft + fdr->tabSize);"
print self.produce_init_state()
print " const size_t iterBytes = %d;" % self.loop_bytes
print " const size_t START_MOD = %d;" % self.datasize_bytes
print " const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
print """
while (ptr < buf + len) {
u8 doMainLoop = 1;
size_t remaining = len - (ptr - buf);
size_t dist;
if (remaining <= iterBytes) {
dist = remaining; // once through the switch and we're done
} else if (remaining < 2 * iterBytes) {
// nibble some stuff off the front, skip the main loop,
// then come back here
dist = iterBytes; // maybe could be cleverer
} else {
// now, we need to see if we can make it to a main loop iteration
// if so, we need to ensure that the main loop iteration is aligned
// to a START_MOD boundary and i >= 8 so we can read ptr + i - 8
// see if we can do it - if not, just switch the main loop off,
// eat iterBytes in cautious mode, and come back to this loop
const u8 * target = MAX(buf + 8, ptr);
target = ROUNDUP_PTR(target, START_MOD);
dist = target - ptr;
if (dist > iterBytes) {
doMainLoop = 0;
dist = iterBytes;
}
}
"""
self.produce_main_loop(switch_variant = True)
self.produce_main_loop(switch_variant = False)
print """
}
"""
print self.produce_footer()
def get_name(self):
return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width)
def __init__(self, state_width, stride,
arch,
table_state_width = None,
num_buckets = 8,
extract_frequency = None,
confirm_frequency = None):
# First - set up the values that are fundamental to how this matcher will operate
self.arch = arch
# get the width of the state width on which we operate internally
if state_width not in [ 128 ]:
fail_out("Unknown state width: %d" % state_width)
self.state_width = state_width
self.state_type = getRequiredType(self.state_width)
self.state_variable = IntegerVariable("s", self.state_type)
table_state_width = state_width
self.table_state_width = state_width
self.table_state_type = getRequiredType(self.table_state_width)
# this is the load type required for domain [9:15] if we want to
# load it one at a time
self.single_load_type = IntegerType(16)
# stride is the frequency with which we make data-driven
# accesses to our reach table
if stride not in [ 1, 2, 4, 8]:
fail_out("Unsupported stride: %d" % stride)
if stride * num_buckets > state_width:
fail_out("Stride %d is too big for the number of buckets %d given state width %d\n" % (stride, num_buckets, state_width))
self.stride = stride
if num_buckets != 8:
fail_out("Unsupported number of buckets: %d" % num_buckets)
if state_width % num_buckets and state_width == 128:
fail_out("Bucket scheme requires bit-shifts on m128 (failing)")
self.num_buckets = num_buckets
# Second - set up derived or optimization values - these can be
# overridden by arguments that are passed in
self.datasize = 64
self.bulk_load_type = IntegerType(self.datasize)
self.datasize_bytes = self.datasize/8
self.value_extract_type = IntegerType(self.datasize)
self.fdr2_force_naive_load = False # disable everywhere for trunk
# extract frequency is how frequently (in bytes) we destructively shift
# our state value after having pulled out that many bytes into a
# confirm register (of one sort or another).
# none means a default value - datasize, our biggest easily available GPR
if extract_frequency is None:
extract_frequency = self.datasize_bytes
self.extract_frequency = extract_frequency
self.extract_size = self.extract_frequency*self.num_buckets
if extract_frequency < stride:
fail_out("Can't extract at extract frequency %d with stride %d" % (extract_frequency, stride))
if extract_frequency not in [ None, 1, 2, 4, 8, 16]:
fail_out("Weird extract frequency: %d" % extract_frequency)
if self.extract_size <= 32:
self.extr_type = IntegerType(32)
elif self.extract_size <= 64:
self.extr_type = IntegerType(64)
else:
fail_out("Implausible size %d required for confirm extract step" % size)
# extract_frequency is how often we pull out our state and place
# it somewhere in a lossless fashion
# confirm_frequency, on the other hand, is how frequently we
# take the state extracted by extract_frequency and cobble it
# together into a matching loop
# confirm_frequency must be a multiple of extract_frequency
# and must fit into a fast register; for now; we're going to
# stay in the GPR domain
if confirm_frequency is None:
confirm_frequency = self.extract_frequency
self.confirm_frequency = confirm_frequency
if confirm_frequency % self.extract_frequency:
fail_out("Confirm frequency %d must be evenly divisible by extract_frequency %d" % (confirm_frequency, self.extract_frequency))
self.conf_size = self.confirm_frequency * self.num_buckets
if self.conf_size <= 32:
self.conf_type = IntegerType(32)
elif self.conf_size <= 64:
self.conf_type = IntegerType(64)
else:
fail_out("Implausible size %d required for confirm accumulate step" % self.conf_size)
# how many bytes in flight at once
self.loop_bytes = 16
# confirm configuration
# how many entries in the top-level confirm table - 256 means
# complete split on the last character
self.conf_top_level_split = 256
# how much we 'pull back' in confirm - this is obviously related
# to the first level conf but we will keep two separate paramters
# for this to avoid the risk of conflating these
self.conf_pull_back = 1
if self.conf_pull_back > 0 and self.conf_top_level_split < 256:
fail_out("Pull back distance %d not supported by top level split %d" % (self.conf_pull_back, self.conf_top_level_split))
# minor stuff
self.default_body_indent = 8

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -29,7 +29,7 @@
/** \file
 * \brief FDR literal matcher: build API.
 */

-#include "fdr.h"
#include "fdr_internal.h"
#include "fdr_compile.h"
#include "fdr_confirm.h"
@ -187,9 +187,9 @@ aligned_unique_ptr<FDR> FDRCompiler::setupFDR(pair<u8 *, size_t> link) {
    /* we are allowing domains 9 to 15 only */
    assert(eng.bits > 8 && eng.bits < 16);
    fdr->domain = eng.bits;
-    fdr->schemeWidthByte = eng.schemeWidth / 8;
    fdr->domainMask = (1 << eng.bits) - 1;
-    fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte;
+    fdr->tabSize = (1 << eng.bits) * (eng.schemeWidth / 8);
+    fdr->stride = eng.stride;

    if (link.first) {
        fdr->link = verify_u32(ptr - fdr_base);
@ -544,6 +544,7 @@ fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
    // temporary hack for unit testing
    if (hint != HINT_INVALID) {
        des->bits = 9;
+        des->stride = 1;
    }

    FDRCompiler fc(lits, *des, make_small);
@ -571,10 +572,9 @@ fdrBuildTableHinted(const vector<hwlmLiteral> &lits, bool make_small, u32 hint,
#endif

-} // namespace ue2
-// FIXME: should be compile-time only
size_t fdrSize(const FDR *fdr) {
    assert(fdr);
    return fdr->size;
}
+} // namespace ue2

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -61,6 +61,9 @@ fdrBuildTableHinted(const std::vector<hwlmLiteral> &lits, bool make_small,
#endif

+/** \brief Returns size in bytes of the given FDR engine. */
+size_t fdrSize(const struct FDR *fdr);
+
} // namespace ue2

#endif

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -26,7 +26,6 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

-#include "fdr.h"
#include "fdr_internal.h"
#include "fdr_compile_internal.h"
#include "fdr_confirm.h"

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -36,144 +36,121 @@
#include "util/bitutils.h"
#include "util/compare.h"

-#define CONF_LOADVAL_CALL lv_u64a
-#define CONF_LOADVAL_CALL_CAUTIOUS lv_u64a_ce
-
// this is ordinary confirmation function which runs through
// the whole confirmation procedure
static really_inline
-void confWithBit(const struct FDRConfirm * fdrc,
-                 const struct FDR_Runtime_Args * a,
-                 size_t i,
-                 CautionReason r,
-                 u32 pullBackAmount,
-                 hwlmcb_rv_t *control,
-                 u32 * last_match) {
-    assert(i < a->len);
-    assert(ISALIGNED(fdrc));
-
-    const u8 * buf = a->buf;
-    const size_t len = a->len;
-    CONF_TYPE v;
-    const u8 * confirm_loc = buf + i - pullBackAmount - 7;
-    if (likely(r == NOT_CAUTIOUS || confirm_loc >= buf)) {
-        v = CONF_LOADVAL_CALL(confirm_loc, buf, buf + len);
-    } else { // r == VECTORING, confirm_loc < buf
-        u64a histBytes = a->histBytes;
-        v = CONF_LOADVAL_CALL_CAUTIOUS(confirm_loc, buf, buf + len);
-        // stitch together v (which doesn't move) and history (which does)
-        u32 overhang = buf - confirm_loc;
-        histBytes >>= 64 - (overhang * 8);
-        v |= histBytes;
-    }
-    u32 c = CONF_HASH_CALL(v, fdrc->andmsk, fdrc->mult, fdrc->nBitsOrSoleID);
-    u32 start = getConfirmLitIndex(fdrc)[c];
-    if (P0(start)) {
-        const struct LitInfo *l =
-            (const struct LitInfo *)((const u8 *)fdrc + start);
-        u8 oldNext; // initialized in loop
-        do {
-            assert(ISALIGNED(l));
-            if (P0( (v & l->msk) != l->v)) {
-                goto out;
-            }
-            if ((*last_match == l->id) && (l->flags & NoRepeat)) {
-                goto out;
-            }
-            const u8 * loc = buf + i - l->size + 1 - pullBackAmount;
-            u8 caseless = l->flags & Caseless;
-            if (loc < buf) {
-                u32 full_overhang = buf - loc;
-                const u8 * history = (caseless) ?
-                    a->buf_history_nocase : a->buf_history;
-                size_t len_history = (caseless) ?
-                    a->len_history_nocase : a->len_history;
-                // can't do a vectored confirm either if we don't have
-                // the bytes
-                if (full_overhang > len_history) {
-                    goto out;
-                }
-                // as for the regular case, no need to do a full confirm if
-                // we're a short literal
-                if (unlikely(l->size > sizeof(CONF_TYPE))) {
-                    const u8 * s1 = l->s;
-                    const u8 * s2 = s1 + full_overhang;
-                    const u8 * loc1 = history + len_history - full_overhang;
-                    const u8 * loc2 = buf;
-                    size_t size1 = MIN(full_overhang,
-                                       l->size - sizeof(CONF_TYPE));
-                    size_t wind_size2_back = sizeof(CONF_TYPE) +
-                                             full_overhang;
-                    size_t size2 = wind_size2_back > l->size ?
-                                   0 : l->size - wind_size2_back;
-                    if (cmpForward(loc1, s1, size1, caseless)) {
-                        goto out;
-                    }
-                    if (cmpForward(loc2, s2, size2, caseless)) {
-                        goto out;
-                    }
-                }
-            } else { // NON-VECTORING PATH
-                // if string < conf_type we don't need regular string cmp
-                if (unlikely(l->size > sizeof(CONF_TYPE))) {
-                    if (cmpForward(loc, l->s, l->size - sizeof(CONF_TYPE), caseless)) {
-                        goto out;
-                    }
-                }
-            }
-            if (P0(!(l->groups & *control))) {
-                goto out;
-            }
-            if (unlikely(l->flags & ComplexConfirm)) {
-                const u8 * loc2 = buf + i - l->extended_size + 1 - pullBackAmount;
-                if (loc2 < buf) {
-                    u32 full_overhang = buf - loc2;
-                    size_t len_history = (caseless) ?
-                        a->len_history_nocase : a->len_history;
-                    if (full_overhang > len_history) {
-                        goto out;
-                    }
-                }
-            }
-            *last_match = l->id;
-            *control = a->cb(loc - buf, i, l->id, a->ctxt);
-        out:
-            oldNext = l->next; // oldNext is either 0 or an 'adjust' value
-            l = (const struct LitInfo*)((const u8 *)l + oldNext + l->size);
-        } while (oldNext);
-    }
-}
+void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a,
+                 size_t i, u32 pullBackAmount, hwlmcb_rv_t *control,
+                 u32 *last_match, u64a conf_key) {
+    assert(i < a->len);
+    assert(ISALIGNED(fdrc));
+
+    const u8 * buf = a->buf;
+    u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult,
+                           fdrc->nBitsOrSoleID);
+    u32 start = getConfirmLitIndex(fdrc)[c];
+    if (likely(!start)) {
+        return;
+    }
+
+    const struct LitInfo *li
+        = (const struct LitInfo *)((const u8 *)fdrc + start);
+
+    u8 oldNext; // initialized in loop
+    do {
+        assert(ISALIGNED(li));
+
+        if (unlikely((conf_key & li->msk) != li->v)) {
+            goto out;
+        }
+        if ((*last_match == li->id) && (li->flags & NoRepeat)) {
+            goto out;
+        }
+
+        const u8 *loc = buf + i - li->size + 1 - pullBackAmount;
+        u8 caseless = li->flags & Caseless;
+
+        if (loc < buf) {
+            u32 full_overhang = buf - loc;
+
+            const u8 *history = caseless ? a->buf_history_nocase
+                                         : a->buf_history;
+            size_t len_history = caseless ? a->len_history_nocase
+                                          : a->len_history;
+
+            // can't do a vectored confirm either if we don't have
+            // the bytes
+            if (full_overhang > len_history) {
+                goto out;
+            }
+
+            // as for the regular case, no need to do a full confirm if
+            // we're a short literal
+            if (unlikely(li->size > sizeof(CONF_TYPE))) {
+                const u8 *s1 = li->s;
+                const u8 *s2 = s1 + full_overhang;
+                const u8 *loc1 = history + len_history - full_overhang;
+                const u8 *loc2 = buf;
+                size_t size1 = MIN(full_overhang, li->size - sizeof(CONF_TYPE));
+                size_t wind_size2_back = sizeof(CONF_TYPE) + full_overhang;
+                size_t size2 = wind_size2_back > li->size ?
+                             0 : li->size - wind_size2_back;
+
+                if (cmpForward(loc1, s1, size1, caseless)) {
+                    goto out;
+                }
+                if (cmpForward(loc2, s2, size2, caseless)) {
+                    goto out;
+                }
+            }
+        } else { // NON-VECTORING PATH
+            // if string < conf_type we don't need regular string cmp
+            if (unlikely(li->size > sizeof(CONF_TYPE))) {
+                if (cmpForward(loc, li->s, li->size - sizeof(CONF_TYPE),
+                               caseless)) {
+                    goto out;
+                }
+            }
+        }
+
+        if (unlikely(!(li->groups & *control))) {
+            goto out;
+        }
+
+        if (unlikely(li->flags & ComplexConfirm)) {
+            const u8 *loc2 = buf + i - li->extended_size + 1 - pullBackAmount;
+            if (loc2 < buf) {
+                u32 full_overhang = buf - loc2;
+                size_t len_history = caseless ? a->len_history_nocase
+                                              : a->len_history;
+                if (full_overhang > len_history) {
+                    goto out;
+                }
+            }
+        }
+
+        *last_match = li->id;
+        *control = a->cb(loc - buf, i, li->id, a->ctxt);
+out:
+        oldNext = li->next; // oldNext is either 0 or an 'adjust' value
+        li = (const struct LitInfo *)((const u8 *)li + oldNext + li->size);
+    } while (oldNext);
+}
// 'light-weight' confirmation function which is used by 1-mask Teddy;
// in the 'confirmless' case it simply calls callback function,
// otherwise it calls 'confWithBit' function for the full confirmation procedure
static really_inline
-void confWithBit1(const struct FDRConfirm * fdrc,
-                  const struct FDR_Runtime_Args * a,
-                  size_t i,
-                  CautionReason r,
-                  hwlmcb_rv_t *control,
-                  u32 * last_match) {
+void confWithBit1(const struct FDRConfirm *fdrc,
+                  const struct FDR_Runtime_Args *a, size_t i,
+                  hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
    assert(i < a->len);
    assert(ISALIGNED(fdrc));

    if (unlikely(fdrc->mult)) {
-        confWithBit(fdrc, a, i, r, 0, control, last_match);
+        confWithBit(fdrc, a, i, 0, control, last_match, conf_key);
        return;
    } else {
        u32 id = fdrc->nBitsOrSoleID;
@ -190,12 +167,9 @@ void confWithBit1(const struct FDRConfirm * fdrc,
// In the 'confirmless' case it makes fast 32-bit comparison,
// otherwise it calls 'confWithBit' function for the full confirmation procedure
static really_inline
-void confWithBitMany(const struct FDRConfirm * fdrc,
-                     const struct FDR_Runtime_Args * a,
-                     size_t i,
-                     CautionReason r,
-                     hwlmcb_rv_t *control,
-                     u32 * last_match) {
+void confWithBitMany(const struct FDRConfirm *fdrc,
+                     const struct FDR_Runtime_Args *a, size_t i, CautionReason r,
+                     hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
    assert(i < a->len);
    assert(ISALIGNED(fdrc));
@ -204,7 +178,7 @@ void confWithBitMany(const struct FDRConfirm * fdrc,
    }

    if (unlikely(fdrc->mult)) {
-        confWithBit(fdrc, a, i, r, 0, control, last_match);
+        confWithBit(fdrc, a, i, 0, control, last_match, conf_key);
        return;
    } else {
        const u32 id = fdrc->nBitsOrSoleID;
@ -215,7 +189,7 @@ void confWithBitMany(const struct FDRConfirm * fdrc,
        }

        if (r == VECTORING && len > i - a->start_offset) {
-            if (len > (i + a->len_history)) {
+            if (len > i + a->len_history) {
                return;
            }

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -28,11 +28,11 @@
#include "config.h"

-#include "fdr.h"
-#include "fdr_internal.h"
+#include "fdr_compile.h"
#include "fdr_compile_internal.h"
#include "fdr_dump.h"
#include "fdr_engine_description.h"
+#include "fdr_internal.h"
#include "teddy_engine_description.h"
#include "ue2common.h"
@ -68,8 +68,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
    }

    if (isTeddy) {
-        unique_ptr<TeddyEngineDescription> des =
-            getTeddyDescription(fdr->engineID);
+        auto des = getTeddyDescription(fdr->engineID);
        if (des) {
            fprintf(f, " masks %u\n", des->numMasks);
            fprintf(f, " buckets %u\n", des->getNumBuckets());
@ -78,16 +77,8 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
            fprintf(f, " <unknown engine>\n");
        }
    } else {
-        unique_ptr<FDREngineDescription> des =
-            getFdrDescription(fdr->engineID);
-        if (des) {
-            fprintf(f, " domain %u\n", des->bits);
-            fprintf(f, " stride %u\n", des->stride);
-            fprintf(f, " buckets %u\n", des->getNumBuckets());
-            fprintf(f, " width %u\n", des->schemeWidth);
-        } else {
-            fprintf(f, " <unknown engine>\n");
-        }
+        fprintf(f, " domain %u\n", fdr->domain);
+        fprintf(f, " stride %u\n", fdr->stride);
    }

    fprintf(f, " strings ???\n");


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -42,13 +42,11 @@ using namespace std;
namespace ue2 { namespace ue2 {
#include "fdr_autogen_compiler.cpp"
FDREngineDescription::FDREngineDescription(const FDREngineDef &def) FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
: EngineDescription(def.id, targetByArchFeatures(def.cpu_features), : EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
def.numBuckets, def.confirmPullBackDistance, def.numBuckets, def.confirmPullBackDistance,
def.confirmTopLevelSplit), def.confirmTopLevelSplit),
schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {} schemeWidth(def.schemeWidth), stride(0), bits(0) {}
u32 FDREngineDescription::getDefaultFloodSuffixLength() const { u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
// rounding up, so that scheme width 32 and 6 buckets is 6 not 5! // rounding up, so that scheme width 32 and 6 buckets is 6 not 5!
@ -56,6 +54,12 @@ u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
return ((getSchemeWidth() + getNumBuckets() - 1) / getNumBuckets()) + 1; return ((getSchemeWidth() + getNumBuckets() - 1) / getNumBuckets()) + 1;
} }
void getFdrDescriptions(vector<FDREngineDescription> *out) {
static const FDREngineDef def = {0, 128, 8, 0, 1, 256};
out->clear();
out->emplace_back(def);
}
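Illustrative reading only (not part of the patch): with the stride field dropped from FDREngineDef, the aggregate above maps onto the remaining fields as shown in the fdr_engine_description.h change below. The comments in this sketch are inferred from that header, not code from the source:

    static const FDREngineDef def = {
        0,   /* id */
        128, /* schemeWidth */
        8,   /* numBuckets */
        0,   /* cpu_features */
        1,   /* confirmPullBackDistance */
        256, /* confirmTopLevelSplit */
    };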
static static
u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) { u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) {
u32 desiredStride = 1; // always our safe fallback u32 desiredStride = 1; // always our safe fallback
@ -108,32 +112,33 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
FDREngineDescription *best = nullptr; FDREngineDescription *best = nullptr;
u32 best_score = 0; u32 best_score = 0;
FDREngineDescription &eng = allDescs[0];
for (u32 domain = 9; domain <= 15; domain++) { for (u32 domain = 9; domain <= 15; domain++) {
for (size_t engineID = 0; engineID < allDescs.size(); engineID++) { for (size_t stride = 1; stride <= 4; stride *= 2) {
// to make sure that domains >=14 have stride 1 according to origin // to make sure that domains >=14 have stride 1 according to origin
if (domain > 13 && engineID > 0) { if (domain > 13 && stride > 1) {
continue; continue;
} }
FDREngineDescription &eng = allDescs[engineID];
if (!eng.isValidOnTarget(target)) { if (!eng.isValidOnTarget(target)) {
continue; continue;
} }
if (msl < eng.stride) { if (msl < stride) {
continue; continue;
} }
u32 score = 100; u32 score = 100;
score -= absdiff(desiredStride, eng.stride); score -= absdiff(desiredStride, stride);
if (eng.stride <= desiredStride) { if (stride <= desiredStride) {
score += eng.stride; score += stride;
} }
u32 effLits = vl.size(); /* * desiredStride;*/ u32 effLits = vl.size(); /* * desiredStride;*/
u32 ideal; u32 ideal;
if (effLits < eng.getNumBuckets()) { if (effLits < eng.getNumBuckets()) {
if (eng.stride == 1) { if (stride == 1) {
ideal = 8; ideal = 8;
} else { } else {
ideal = 10; ideal = 10;
@ -158,27 +163,28 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
ideal -= 2; ideal -= 2;
} }
if (eng.stride > 1) { if (stride > 1) {
ideal++; ideal++;
} }
DEBUG_PRINTF("effLits %u\n", effLits); DEBUG_PRINTF("effLits %u\n", effLits);
if (target.is_atom_class() && !make_small && effLits < 4000) { if (target.is_atom_class() && !make_small && effLits < 4000) {
/* Unless it is a very heavy case, we want to build smaller tables /* Unless it is a very heavy case, we want to build smaller
* on lightweight machines due to their small caches. */ * tables on lightweight machines due to their small caches. */
ideal -= 2; ideal -= 2;
} }
score -= absdiff(ideal, domain); score -= absdiff(ideal, domain);
DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u " DEBUG_PRINTF("fdr %u: width=%u, domain=%u, buckets=%u, stride=%zu "
"-> score=%u\n", "-> score=%u\n",
eng.getID(), eng.schemeWidth, eng.bits, eng.getID(), eng.schemeWidth, domain,
eng.getNumBuckets(), eng.stride, score); eng.getNumBuckets(), stride, score);
if (!best || score > best_score) { if (!best || score > best_score) {
eng.bits = domain; eng.bits = domain;
eng.stride = stride;
best = &eng; best = &eng;
best_score = score; best_score = score;
} }


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -42,7 +42,6 @@ struct FDREngineDef {
u32 id; u32 id;
u32 schemeWidth; u32 schemeWidth;
u32 numBuckets; u32 numBuckets;
u32 stride;
u64a cpu_features; u64a cpu_features;
u32 confirmPullBackDistance; u32 confirmPullBackDistance;
u32 confirmTopLevelSplit; u32 confirmTopLevelSplit;
@ -73,7 +72,6 @@ chooseEngine(const target_t &target, const std::vector<hwlmLiteral> &vl,
bool make_small); bool make_small);
std::unique_ptr<FDREngineDescription> getFdrDescription(u32 engineID); std::unique_ptr<FDREngineDescription> getFdrDescription(u32 engineID);
void getFdrDescriptions(std::vector<FDREngineDescription> *out); void getFdrDescriptions(std::vector<FDREngineDescription> *out);
} // namespace ue2 } // namespace ue2
#endif #endif


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -76,17 +76,17 @@ struct FDR {
* structures (spillover strings and hash table) if we're a secondary * structures (spillover strings and hash table) if we're a secondary
* structure. */ * structure. */
u32 link; u32 link;
u8 domain; /* dynamic domain info */
u8 schemeWidthByte; /* scheme width in bytes */
u16 domainMask; /* pre-computed domain mask */
u32 tabSize; /* pre-computed hashtable size in bytes */
u32 pad1;
union {
    u32 s_u32;
    u64a s_u64a;
    m128 s_m128;
} start;
u8 stride; /* stride - how frequently the data is consulted by the first
            * stage matcher */
u8 domain; /* number of bits used to index into the main FDR table. This value
            * is used only for debugging/asserts. */
u16 domainMask; /* pre-computed domain mask */
u32 tabSize; /* pre-computed hashtable size in bytes */
u32 pad;
m128 start; /* initial start state to use at offset 0. The state has been set
             * up based on the min length of buckets to reduce the need for
             * pointless confirms. */
}; };
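Illustrative sketch only (not from the patch): the pre-computed domainMask is the natural companion of the domain bit count; a hypothetical helper showing the usual relationship:

    static inline u16 make_domain_mask(u8 domain_bits) {
        /* hypothetical helper, not in the source; e.g. domain 13 -> 0x1fff */
        return (u16)((1u << domain_bits) - 1);
    }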
/** \brief FDR runtime arguments. /** \brief FDR runtime arguments.


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -29,188 +29,43 @@
#ifndef FDR_LOADVAL_H #ifndef FDR_LOADVAL_H
#define FDR_LOADVAL_H #define FDR_LOADVAL_H
#include "fdr_internal.h"
#include "ue2common.h" #include "ue2common.h"
#include "util/unaligned.h" #include "util/unaligned.h"
#include "util/simd_utils.h"
#define MAKE_LOADVAL(type, name) \
    static really_inline type name (const u8 * ptr, UNUSED const u8 * lo, UNUSED const u8 * hi)

#define NORMAL_SAFE(type) assert(ptr >= lo && (ptr + sizeof(type) - 1) < hi)
#define ALIGNED_SAFE(type) NORMAL_SAFE(type); assert(((size_t)ptr % sizeof(type)) == 0);
// these ones need asserts to test the property that we're not handling dynamically
#define CAUTIOUS_FORWARD_SAFE(type) assert(ptr >= lo)
#define CAUTIOUS_BACKWARD_SAFE(type) assert((ptr + sizeof(type) - 1) < hi)

#define CF_INDEX_CHECK (ptr + i < hi)
#define CB_INDEX_CHECK (lo <= ptr + i)
#define CE_INDEX_CHECK (lo <= ptr + i) && (ptr + i < hi)

#define MAKE_LOOP(TYPE, COND, SHIFT_FIDDLE) \
    TYPE v = 0; \
    for (TYPE i = 0; i < sizeof(TYPE); i++) { \
        if (COND) { \
            v += (TYPE)ptr[i] << ((SHIFT_FIDDLE)*8); \
        } \
    } \
    return v;

#define MAKE_LOOP_BE(TYPE, COND) \
    MAKE_LOOP(TYPE, COND, sizeof(TYPE)-i-1)
#define MAKE_LOOP_LE(TYPE, COND) \
    MAKE_LOOP(TYPE, COND, i)

#define MAKE_LOOP_BE_CF(TYPE) CAUTIOUS_FORWARD_SAFE(TYPE); MAKE_LOOP_BE(TYPE, CF_INDEX_CHECK)
#define MAKE_LOOP_BE_CB(TYPE) CAUTIOUS_BACKWARD_SAFE(TYPE); MAKE_LOOP_BE(TYPE, CB_INDEX_CHECK)
#define MAKE_LOOP_BE_CE(TYPE) MAKE_LOOP_BE(TYPE, CE_INDEX_CHECK)
#define MAKE_LOOP_LE_CF(TYPE) CAUTIOUS_FORWARD_SAFE(TYPE); MAKE_LOOP_LE(TYPE, CF_INDEX_CHECK)
#define MAKE_LOOP_LE_CB(TYPE) CAUTIOUS_BACKWARD_SAFE(TYPE); MAKE_LOOP_LE(TYPE, CB_INDEX_CHECK)
#define MAKE_LOOP_LE_CE(TYPE) MAKE_LOOP_LE(TYPE, CE_INDEX_CHECK)

#define MAKE_LOADVAL(type, name) \
    static really_inline \
    type name(const u8 *ptr, UNUSED const u8 *lo, UNUSED const u8 *hi)

#define NORMAL_SAFE(type) \
    do { \
        assert(ptr >= lo); \
        assert(ptr + sizeof(type) - 1 < hi); \
    } while(0)

#define MAKE_LOOP_CE(TYPE) \
    TYPE v = 0; \
    for (TYPE i = 0; i < sizeof(TYPE); i++) { \
        if ((lo <= ptr + i) && (ptr + i < hi)) { \
            v += (TYPE)ptr[i] << (i*8); \
        } \
    } \
    return v;
// no suffix = normal (unaligned) // no suffix = normal (unaligned)
// _a = aligned
// _cf = cautious forwards, base is always in bounds, but may read over the end of the buffer (test against hi)
// _cb = cautious backwards, final byte is always in bounds, but may read over the start of the buffer (test against lo)
// _ce = cautious everywhere (in both directions); test against hi and lo // _ce = cautious everywhere (in both directions); test against hi and lo
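Illustrative sketch only (not from the patch): a cautious-everywhere load reads byte by byte and substitutes zero for any byte outside [lo, hi), which is what the new MAKE_LOOP_CE macro above expands to. A standalone version, assuming the u8/u32 typedefs from ue2common.h:

    static inline u32 load_u32_cautious_everywhere(const u8 *ptr, const u8 *lo,
                                                   const u8 *hi) {
        u32 v = 0;
        for (u32 i = 0; i < sizeof(u32); i++) {
            if (lo <= ptr + i && ptr + i < hi) { /* out-of-bounds bytes read as 0 */
                v += (u32)ptr[i] << (i * 8);
            }
        }
        return v;
    }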
// u8 loadvals
MAKE_LOADVAL(u8, lv_u8) {
NORMAL_SAFE(u8);
return *ptr;
}
MAKE_LOADVAL(u8, lv_u8_cf) {
CAUTIOUS_FORWARD_SAFE(u8);
if (ptr < hi) {
return *ptr;
} else {
return 0;
}
}
MAKE_LOADVAL(u8, lv_u8_cb) {
CAUTIOUS_BACKWARD_SAFE(u8);
if (lo <= ptr) {
return *ptr;
} else {
return 0;
}
}
MAKE_LOADVAL(u8, lv_u8_ce) {
if ((lo <= ptr) && (ptr < hi)) {
return *ptr;
} else {
return 0;
}
}
MAKE_LOADVAL(u16, lv_u16) { MAKE_LOADVAL(u16, lv_u16) {
NORMAL_SAFE(u16); NORMAL_SAFE(u16);
return unaligned_load_u16(ptr); return unaligned_load_u16(ptr);
} }
MAKE_LOADVAL(u16, lv_u16_a) {
ALIGNED_SAFE(u16);
return *(const u16 *)ptr;
}
MAKE_LOADVAL(u32, lv_u32) {
NORMAL_SAFE(u32);
return unaligned_load_u32(ptr);
}
MAKE_LOADVAL(u32, lv_u32_a) {
ALIGNED_SAFE(u32);
return *(const u32 *)ptr;
}
MAKE_LOADVAL(u64a, lv_u64a) { MAKE_LOADVAL(u64a, lv_u64a) {
NORMAL_SAFE(u32); NORMAL_SAFE(u32);
return unaligned_load_u64a(ptr); return unaligned_load_u64a(ptr);
} }
MAKE_LOADVAL(u16, lv_u16_ce) { MAKE_LOOP_CE(u16); }
MAKE_LOADVAL(u64a, lv_u64a_ce) { MAKE_LOOP_CE(u64a); }

MAKE_LOADVAL(u64a, lv_u64a_a) {
    ALIGNED_SAFE(u64a);
    return *(const u64a *)ptr;
}

MAKE_LOADVAL(u16, lv_u16_cf) { MAKE_LOOP_LE_CF(u16); }
MAKE_LOADVAL(u16, lv_u16_cb) { MAKE_LOOP_LE_CB(u16); }
MAKE_LOADVAL(u16, lv_u16_ce) { MAKE_LOOP_LE_CE(u16); }
MAKE_LOADVAL(u32, lv_u32_cf) { MAKE_LOOP_LE_CF(u32); }
MAKE_LOADVAL(u32, lv_u32_cb) { MAKE_LOOP_LE_CB(u32); }
MAKE_LOADVAL(u32, lv_u32_ce) { MAKE_LOOP_LE_CE(u32); }
MAKE_LOADVAL(u64a, lv_u64a_cf) { MAKE_LOOP_LE_CF(u64a); }
MAKE_LOADVAL(u64a, lv_u64a_cb) { MAKE_LOOP_LE_CB(u64a); }
MAKE_LOADVAL(u64a, lv_u64a_ce) { MAKE_LOOP_LE_CE(u64a); }
MAKE_LOADVAL(m128, lv_m128) {
NORMAL_SAFE(m128);
return loadu128(ptr);
}
MAKE_LOADVAL(m128, lv_m128_a) {
ALIGNED_SAFE(m128);
assert((size_t)ptr % sizeof(m128) == 0);
return *(const m128 *)ptr;
}
// m128 cases need to be manually created
MAKE_LOADVAL(m128, lv_m128_cf) {
CAUTIOUS_FORWARD_SAFE(m128);
union {
u8 val8[16];
m128 val128;
} u;
for (u32 i = 0; i < 16; i++) {
if (ptr + i < hi) {
u.val8[i] = ptr[i];
} else {
u.val8[i] = 0;
}
}
return u.val128;
}
MAKE_LOADVAL(m128, lv_m128_cb) {
CAUTIOUS_BACKWARD_SAFE(m128);
union {
u8 val8[16];
m128 val128;
} u;
for (u32 i = 0; i < 16; i++) {
if (lo <= ptr + i) {
u.val8[i] = ptr[i];
} else {
u.val8[i] = 0;
}
}
return u.val128;
}
MAKE_LOADVAL(m128, lv_m128_ce) {
union {
u8 val8[16];
m128 val128;
} u;
for (u32 i = 0; i < 16; i++) {
if ((lo <= ptr + i) && (ptr + i < hi)) {
u.val8[i] = ptr[i];
} else {
u.val8[i] = 0;
}
}
return u.val128;
}
#endif #endif


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -26,7 +26,6 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "fdr.h"
#include "fdr_internal.h" #include "fdr_internal.h"
#include "fdr_streaming_internal.h" #include "fdr_streaming_internal.h"
#include "fdr_compile_internal.h" #include "fdr_compile_internal.h"


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -32,6 +32,8 @@
#include "fdr_streaming_internal.h" #include "fdr_streaming_internal.h"
#include "util/partial_store.h" #include "util/partial_store.h"
#include <string.h>
static really_inline static really_inline
const struct FDRSTableHeader * getSHDR(const struct FDR * fdr) { const struct FDRSTableHeader * getSHDR(const struct FDR * fdr) {
const u8 * linkPtr = ((const u8 *)fdr) + fdr->link; const u8 * linkPtr = ((const u8 *)fdr) + fdr->link;


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -26,7 +26,6 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "fdr.h"
#include "fdr_internal.h" #include "fdr_internal.h"
#include "fdr_confirm.h" #include "fdr_confirm.h"
#include "fdr_compile_internal.h" #include "fdr_compile_internal.h"


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -26,11 +26,19 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "config.h" /** \file
* \brief Teddy literal matcher: SSSE3 engine runtime.
*/
#include "fdr_internal.h"
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "teddy_runtime_common.h"
#include "util/simd_utils.h" #include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h" #include "util/simd_utils_ssse3.h"
static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@ -67,178 +75,584 @@ static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
}; };
// Note: p_mask is an output param that initialises a poison mask.
UNUSED static really_inline
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
        u8 val8[16];
        m128 val128;
    } u;
    u.val128 = zeroes128();

#ifdef ARCH_64_BIT
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(isnonzero128(var))) {                                      \
        u64a lo = movq(var);                                                \
        u64a hi = movq(byteShiftRight128(var, 8));                          \
        if (unlikely(lo)) {                                                 \
            conf_fn(&lo, bucket, offset, confBase, reason, a, ptr,          \
                    control, &last_match);                                  \
            CHECK_HWLM_TERMINATE_MATCHING;                                  \
        }                                                                   \
        if (unlikely(hi)) {                                                 \
            conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr,      \
                    control, &last_match);                                  \
            CHECK_HWLM_TERMINATE_MATCHING;                                  \
        }                                                                   \
    }                                                                       \
} while (0);
#else
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
do { \
if (unlikely(isnonzero128(var))) { \
u32 part1 = movd(var); \
u32 part2 = movd(byteShiftRight128(var, 4)); \
u32 part3 = movd(byteShiftRight128(var, 8)); \
u32 part4 = movd(byteShiftRight128(var, 12)); \
if (unlikely(part1)) { \
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part2)) { \
conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part3)) { \
conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part4)) { \
conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} \
} while (0);
#endif
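Illustrative sketch only (not from the patch): once CONFIRM_TEDDY has extracted a nonzero chunk of the result vector, each set bit names a candidate match; with 8 buckets, bit b corresponds to bucket (b % 8) at byte (b / 8) within that chunk. A hedged sketch of the walk, where chunk_offset (0 or 8 above) and the confirm lookup are placeholders:

    u64a bits = lo; /* a nonzero chunk of the result vector, e.g. the 'lo' half */
    while (bits) {
        u32 bit    = findAndClearLSB_64(&bits); /* helper from util/bitutils.h */
        u32 byte   = bit / 8 + chunk_offset;    /* byte position in this iteration */
        u32 bucket = bit % 8;                   /* which literal bucket fired */
        /* look up the confirm structure for 'bucket' and run it at 'byte' */
    }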
    if (ptr >= lo) {
        u32 avail = (u32)(hi - ptr);
        if (avail >= 16) {
            *p_mask = load128((const void*)(p_mask_arr[16] + 16));
            return loadu128(ptr);
        }
        *p_mask = load128((const void*)(p_mask_arr[avail] + 16));
        for (u32 i = 0; i < avail; i++) {
            u.val8[i] = ptr[i];
        }
    } else {
        u32 need = MIN((u32)(lo - ptr), MIN(len_history, nMasks - 1));
        u32 start = (u32)(lo - ptr);
        u32 i;
        for (i = start - need; ptr + i < lo; i++) {
            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
        }
        u32 end = MIN(16, (u32)(hi - ptr));
        *p_mask = loadu128((const void*)(p_mask_arr[end - start] + 16 - start));
        for (; i < end; i++) {
            u.val8[i] = ptr[i];
        }
    }

static really_inline
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift2x64(val, 4), mask);
    return and128(and128(pshufb(maskBase[0*2], lo),
                         pshufb(maskBase[0*2+1], hi)), p_mask);
}

static really_inline
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 p_mask,
                        m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift2x64(val, 4), mask);
    m128 r = prep_conf_teddy_m1(maskBase, p_mask, val);

    m128 res_1 = and128(pshufb(maskBase[1*2], lo),
                        pshufb(maskBase[1*2+1], hi));
    m128 res_shifted_1 = palignr(res_1, *old_1, 16-1);
    *old_1 = res_1;
    return and128(and128(r, p_mask), res_shifted_1);
}
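Illustrative sketch only (not from the patch): the prep_conf_teddy_* helpers above implement a per-byte nibble shuffle. Each input byte is split into its low and high 4-bit nibbles; each nibble selects a bucket bitmask from a 16-entry table (via pshufb), and the two results are ANDed, so a bucket bit survives only if both nibbles are compatible with some literal in that bucket. A scalar model with hypothetical table names:

    /* hypothetical scalar model of one byte; lo_table/hi_table hold per-nibble
     * bucket masks, one bit per bucket */
    static inline u8 bucket_hits(u8 c, const u8 lo_table[16], const u8 hi_table[16]) {
        return (u8)(lo_table[c & 0xf] & hi_table[c >> 4]);
    }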
static really_inline
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 p_mask, m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift2x64(val, 4), mask);
m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val);
m128 res_2 = and128(pshufb(maskBase[2*2], lo),
pshufb(maskBase[2*2+1], hi));
m128 res_shifted_2 = palignr(res_2, *old_2, 16-2);
*old_2 = res_2;
return and128(r, res_shifted_2);
}
static really_inline
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 *old_3, m128 p_mask, m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift2x64(val, 4), mask);
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, p_mask, val);
m128 res_3 = and128(pshufb(maskBase[3*2], lo),
pshufb(maskBase[3*2+1], hi));
m128 res_shifted_3 = palignr(res_3, *old_3, 16-3);
*old_3 = res_3;
return and128(r, res_shifted_3);
}
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 1);
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1);
m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
ptr += 16;
} }
    if (ptr + 16 < buf_end) {
        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
        ptr += 16;
    }

    return u.val128;
}
#if defined(__AVX2__)
UNUSED static really_inline
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
m128 p_mask128;
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history, len_history, nMasks));
*p_mask = set2x128(p_mask128);
return ret;
}
static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
};
UNUSED static really_inline
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history) {
union {
u8 val8[32];
m256 val256;
} u;
if (ptr >= lo) {
u32 avail = (u32)(hi - ptr);
if (avail >= 32) {
*p_mask = load256((const void*)(p_mask_arr256[32] + 32));
return loadu256(ptr);
}
*p_mask = load256((const void*)(p_mask_arr256[avail] + 32));
for (u32 i = 0; i < avail; i++) {
u.val8[i] = ptr[i];
}
} else {
// need contains "how many chars to pull from history"
// calculate based on what we need, what we have in the buffer
// and only what we need to make primary confirm work
u32 start = (u32)(lo - ptr);
u32 i;
for (i = start; ptr + i < lo; i++) {
u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
}
u32 end = MIN(32, (u32)(hi - ptr));
*p_mask = loadu256((const void*)(p_mask_arr256[end - start] + 32 - start));
for (; i < end; i++) {
u.val8[i] = ptr[i];
}
} }
    return u.val256;
}

    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes*4));
        CHECK_FLOOD;
        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
        m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16));
        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
    }

    for (; ptr < buf_end; ptr += 16) {
        m128 p_mask;
        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
                                     a->buf_history, a->len_history, 1);
        m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
    }

    *a->groups = controlVal;
    return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
    const m128 *maskBase = getMaskBase(teddy);
    const u32 *confBase = getConfBase(teddy, 1);

    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
    if (ptr < mainStart) {
        ptr = mainStart - 16;
        m128 p_mask;
        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
                                     a->buf_history, a->len_history, 1);
        m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
        ptr += 16;
    }

    if (ptr + 16 < buf_end) {
        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
        ptr += 16;
    }

    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes*4));
        CHECK_FLOOD;
        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
        m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16));
        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
    }

    for (; ptr < buf_end; ptr += 16) {
        m128 p_mask;
        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
                                     a->buf_history, a->len_history, 1);
        m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
    }

    *a->groups = controlVal;
    return HWLM_SUCCESS;
}

#endif // __AVX2__

#define P0(cnd) unlikely(cnd)

#include "fdr.h"
#include "fdr_internal.h"
#include "flood_runtime.h"

#include "fdr_confirm.h"
#include "fdr_confirm_runtime.h"

#include "fdr_loadval.h"
#include "util/bitutils.h"
#include "teddy_internal.h"

#include "teddy_autogen.c"

hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 2);
m128 res_old_1 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 2);
m128 res_old_1 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 3);
m128 res_old_1 = ones128();
m128 res_old_2 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 3);
m128 res_old_1 = ones128();
m128 res_old_2 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 4);
m128 res_old_1 = ones128();
m128 res_old_2 = ones128();
m128 res_old_3 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 4);
m128 res_old_1 = ones128();
m128 res_old_2 = ones128();
m128 res_old_3 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}

src/fdr/teddy.h (new file, 97 lines)

@ -0,0 +1,97 @@
/*
* Copyright (c) 2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Teddy literal matcher: function declarations.
*/
#ifndef TEDDY_H_
#define TEDDY_H_
struct FDR; // forward declaration from fdr_internal.h
struct FDR_Runtime_Args;
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
#if defined(__AVX2__)
hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
#endif /* __AVX2__ */
#endif /* TEDDY_H_ */
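Illustrative sketch only (not from this header): all of these routines share one signature, so one plausible way to select among them is a small table keyed by mask count and confirm packing. The table below is hypothetical, not code from the library:

    typedef hwlm_error_t (*teddy_exec_fn)(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a);

    static const teddy_exec_fn teddy_exec[4][2] = {
        { fdr_exec_teddy_msks1, fdr_exec_teddy_msks1_pck },
        { fdr_exec_teddy_msks2, fdr_exec_teddy_msks2_pck },
        { fdr_exec_teddy_msks3, fdr_exec_teddy_msks3_pck },
        { fdr_exec_teddy_msks4, fdr_exec_teddy_msks4_pck },
    };
    /* usage (hypothetical): teddy_exec[numMasks - 1][packed](fdr, a); */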


@ -1,545 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
from autogen_utils import *
from base_autogen import *
from string import Template
class MT(MatcherBase):
def produce_confirm(self, iter, var_name, offset, bits, cautious = True):
if self.packed:
print self.produce_confirm_base(var_name, bits, iter*16 + offset, cautious, enable_confirmless = False, do_bailout = False)
else:
if self.num_masks == 1:
conf_func = "confWithBit1"
else:
conf_func = "confWithBitMany"
if cautious:
caution_string = "VECTORING"
else:
caution_string = "NOT_CAUTIOUS"
print " if (P0(!!%s)) {" % var_name
print " do {"
if bits == 64:
print " bit = findAndClearLSB_64(&%s);" % (var_name)
else:
print " bit = findAndClearLSB_32(&%s);" % (var_name)
print " byte = bit / %d + %d;" % (self.num_buckets, iter*16 + offset)
print " idx = bit %% %d;" % self.num_buckets
print " cf = confBase[idx];"
print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
print " if (!(fdrc->groups & *control))"
print " continue;"
print " %s(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % (conf_func, caution_string)
print " } while(P0(!!%s));" % var_name
print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
print " *a->groups = controlVal;"
print " return HWLM_TERMINATED;"
print " }"
print " }"
def produce_needed_temporaries(self, max_iterations):
print " m128 p_mask;"
for iter in range(0, max_iterations):
print " m128 val_%d;" % iter
print " m128 val_%d_lo;" % iter
print " m128 val_%d_hi;" % iter
for x in range(self.num_masks):
print " m128 res_%d_%d;" % (iter, x)
if x != 0:
print " m128 res_shifted_%d_%d;" % (iter, x)
print " m128 r_%d;" % iter
print "#ifdef ARCH_64_BIT"
print " u64a r_%d_lopart;" % iter
print " u64a r_%d_hipart;" % iter
print "#else"
print " u32 r_%d_part1;" % iter
print " u32 r_%d_part2;" % iter
print " u32 r_%d_part3;" % iter
print " u32 r_%d_part4;" % iter
print "#endif"
def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
cautious, save_old):
if cautious:
print " val_%d = vectoredLoad128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
else:
print " val_%d = load128(ptr + %d);" % (iter, iter*16)
print " val_%d_lo = and128(val_%d, lomask);" % (iter, iter)
print " val_%d_hi = rshift2x64(val_%d, 4);" % (iter, iter)
print " val_%d_hi = and128(val_%d_hi, lomask);" % (iter, iter)
print
for x in range(self.num_masks):
print Template("""
res_${ITER}_${X} = and128(pshufb(maskBase[${X}*2] , val_${ITER}_lo),
pshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
if x != 0:
if iter == 0:
print " res_shifted_%d_%d = palignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x, iter, x, x, x)
else:
print " res_shifted_%d_%d = palignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x, iter, x, iter-1, x, x)
if x != 0 and iter == effective_num_iterations - 1 and save_old:
print " res_old_%d = res_%d_%d;" % (x, iter, x)
print
if cautious:
print " r_%d = and128(res_%d_0, p_mask);" % (iter, iter)
else:
print " r_%d = res_%d_0;" % (iter, iter)
for x in range(1, self.num_masks):
print " r_%d = and128(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
print
def produce_one_iteration_confirm(self, iter, confirmCautious):
setup64 = [ (0, "r_%d_lopart" % iter, "movq(r_%d)" % iter),
(8, "r_%d_hipart" % iter, "movq(byteShiftRight128(r_%d, 8))" % iter) ]
setup32 = [ (0, "r_%d_part1" % iter, "movd(r_%d)" % iter),
(4, "r_%d_part2" % iter, "movd(byteShiftRight128(r_%d, 4))" % iter),
(8, "r_%d_part3" % iter, "movd(byteShiftRight128(r_%d, 8))" % iter),
(12, "r_%d_part4" % iter, "movd(byteShiftRight128(r_%d, 12))" % iter) ]
print " if (P0(isnonzero128(r_%d))) {" % (iter)
print "#ifdef ARCH_64_BIT"
for (off, val, init) in setup64:
print " %s = %s;" % (val, init)
for (off, val, init) in setup64:
self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
print "#else"
for (off, val, init) in setup32:
print " %s = %s;" % (val, init)
for (off, val, init) in setup32:
self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
print "#endif"
print " }"
def produce_one_iteration(self, iter, effective_num_iterations, cautious = False,
confirmCautious = True, save_old = True):
self.produce_one_iteration_state_calc(iter, effective_num_iterations, cautious, save_old)
self.produce_one_iteration_confirm(iter, confirmCautious)
def produce_code(self):
print self.produce_header(visible = True, header_only = False)
print self.produce_common_declarations()
print
self.produce_needed_temporaries(self.num_iterations)
print
print " const struct Teddy * teddy = (const struct Teddy *)fdr;"
print " const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32));" % self.num_masks
print " const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
print " const size_t iterBytes = %d;" % (self.num_iterations * 16)
print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
' buf, len, a->start_offset);'
print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
' mainStart);'
for x in range(self.num_masks):
if (x != 0):
print " m128 res_old_%d = ones128();" % x
print " m128 lomask = set16x8(0xf);"
print " if (ptr < mainStart) {"
print " ptr = mainStart - 16;"
self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
print " ptr += 16;"
print " }"
print " if (ptr + 16 < buf + len) {"
self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
print " ptr += 16;"
print " }"
print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
print " __builtin_prefetch(ptr + (iterBytes*4));"
print self.produce_flood_check()
for iter in range(self.num_iterations):
self.produce_one_iteration(iter, self.num_iterations, cautious = False, confirmCautious = False)
print " }"
print " for (; ptr < buf + len; ptr += 16) {"
self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
print " }"
print self.produce_footer()
def produce_compile_call(self):
packed_str = { False : "false", True : "true"}[self.packed]
print " { %d, %s, %d, %d, %s, %d, %d }," % (
self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
self.conf_pull_back, self.conf_top_level_split)
def get_name(self):
if self.packed:
pck_string = "_pck"
else:
pck_string = ""
if self.num_buckets == 16:
type_string = "_fat"
else:
type_string = ""
return "fdr_exec_teddy_%s_msks%d%s%s" % (self.arch.name, self.num_masks, pck_string, type_string)
def __init__(self, arch, packed = False, num_masks = 1, num_buckets = 8):
self.arch = arch
self.packed = packed
self.num_masks = num_masks
self.num_buckets = num_buckets
self.num_iterations = 2
if packed:
self.conf_top_level_split = 32
else:
self.conf_top_level_split = 1
self.conf_pull_back = 0
class MTFat(MT):
def produce_needed_temporaries(self, max_iterations):
print " m256 p_mask;"
for iter in range(0, max_iterations):
print " m256 val_%d;" % iter
print " m256 val_%d_lo;" % iter
print " m256 val_%d_hi;" % iter
for x in range(self.num_masks):
print " m256 res_%d_%d;" % (iter, x)
if x != 0:
print " m256 res_shifted_%d_%d;" % (iter, x)
print " m256 r_%d;" % iter
print "#ifdef ARCH_64_BIT"
print " u64a r_%d_part1;" % iter
print " u64a r_%d_part2;" % iter
print " u64a r_%d_part3;" % iter
print " u64a r_%d_part4;" % iter
print "#else"
print " u32 r_%d_part1;" % iter
print " u32 r_%d_part2;" % iter
print " u32 r_%d_part3;" % iter
print " u32 r_%d_part4;" % iter
print " u32 r_%d_part5;" % iter
print " u32 r_%d_part6;" % iter
print " u32 r_%d_part7;" % iter
print " u32 r_%d_part8;" % iter
print "#endif"
def produce_code(self):
print self.produce_header(visible = True, header_only = False)
print self.produce_common_declarations()
print
self.produce_needed_temporaries(self.num_iterations)
print
print " const struct Teddy * teddy = (const struct Teddy *)fdr;"
print " const m256 * maskBase = (const m256 *)((const u8 *)fdr + sizeof(struct Teddy));"
print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32*2));" % self.num_masks
print " const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
print " const size_t iterBytes = %d;" % (self.num_iterations * 16)
print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
' buf, len, a->start_offset);'
print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
' mainStart);'
for x in range(self.num_masks):
if (x != 0):
print " m256 res_old_%d = ones256();" % x
print " m256 lomask = set32x8(0xf);"
print " if (ptr < mainStart) {"
print " ptr = mainStart - 16;"
self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
print " ptr += 16;"
print " }"
print " if (ptr + 16 < buf + len) {"
self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
print " ptr += 16;"
print " }"
print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
print " __builtin_prefetch(ptr + (iterBytes*4));"
print self.produce_flood_check()
for iter in range(self.num_iterations):
self.produce_one_iteration(iter, self.num_iterations, False, confirmCautious = False)
print " }"
print " for (; ptr < buf + len; ptr += 16) {"
self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
print " }"
print self.produce_footer()
def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
cautious, save_old):
if cautious:
print " val_%d = vectoredLoad2x128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
else:
print " val_%d = load2x128(ptr + %d);" % (iter, iter*16)
print " val_%d_lo = and256(val_%d, lomask);" % (iter, iter)
print " val_%d_hi = rshift4x64(val_%d, 4);" % (iter, iter)
print " val_%d_hi = and256(val_%d_hi, lomask);" % (iter, iter)
print
for x in range(self.num_masks):
print Template("""
res_${ITER}_${X} = and256(vpshufb(maskBase[${X}*2] , val_${ITER}_lo),
vpshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
if x != 0:
if iter == 0:
print " res_shifted_%d_%d = vpalignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x, iter, x, x, x)
else:
print " res_shifted_%d_%d = vpalignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x, iter, x, iter-1, x, x)
if x != 0 and iter == effective_num_iterations - 1 and save_old:
print " res_old_%d = res_%d_%d;" % (x, iter, x)
print
if cautious:
print " r_%d = and256(res_%d_0, p_mask);" % (iter, iter)
else:
print " r_%d = res_%d_0;" % (iter, iter)
for x in range(1, self.num_masks):
print " r_%d = and256(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
print
def produce_one_iteration_confirm(self, iter, confirmCautious):
setup64 = [ (0, "r_%d_part1" % iter, "extractlow64from256(r)"),
(4, "r_%d_part2" % iter, "extract64from256(r, 1);\n r = interleave256hi(r_%d, r_swap)" % (iter)),
(8, "r_%d_part3" % iter, "extractlow64from256(r)"),
(12, "r_%d_part4" % iter, "extract64from256(r, 1)") ]
setup32 = [ (0, "r_%d_part1" % iter, "extractlow32from256(r)"),
(2, "r_%d_part2" % iter, "extract32from256(r, 1)"),
(4, "r_%d_part3" % iter, "extract32from256(r, 2)"),
(6, "r_%d_part4" % iter, "extract32from256(r, 3);\n r = interleave256hi(r_%d, r_swap)" % (iter)),
(8, "r_%d_part5" % iter, "extractlow32from256(r)"),
(10, "r_%d_part6" % iter, "extract32from256(r, 1)"),
(12, "r_%d_part7" % iter, "extract32from256(r, 2)"),
(14, "r_%d_part8" % iter, "extract32from256(r, 3)") ]
print " if (P0(isnonzero256(r_%d))) {" % (iter)
print " m256 r_swap = swap128in256(r_%d);" % (iter)
print " m256 r = interleave256lo(r_%d, r_swap);" % (iter)
print "#ifdef ARCH_64_BIT"
for (off, val, init) in setup64:
print " %s = %s;" % (val, init)
for (off, val, init) in setup64:
self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
print "#else"
for (off, val, init) in setup32:
print " %s = %s;" % (val, init)
for (off, val, init) in setup32:
self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
print "#endif"
print " }"
class MTFast(MatcherBase):
def produce_confirm(self, cautious):
if cautious:
cautious_str = "VECTORING"
else:
cautious_str = "NOT_CAUTIOUS"
print " for (u32 i = 0; i < arrCnt; i++) {"
print " byte = bitArr[i] / 8;"
if self.packed:
conf_split_mask = IntegerType(32).constant_to_string(
self.conf_top_level_split - 1)
print " bitRem = bitArr[i] % 8;"
print " confSplit = *(ptr+byte) & 0x1f;"
print " idx = confSplit * %d + bitRem;" % self.num_buckets
print " cf = confBase[idx];"
print " if (!cf)"
print " continue;"
print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
print " if (!(fdrc->groups & *control))"
print " continue;"
print " confWithBit(fdrc, a, ptr - buf + byte, %s, 0, control, &last_match);" % cautious_str
else:
print " cf = confBase[bitArr[i] % 8];"
print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
print " confWithBit1(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % cautious_str
print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
print " *a->groups = controlVal;"
print " return HWLM_TERMINATED;"
print " }"
print " }"
def produce_needed_temporaries(self, max_iterations):
print " u32 arrCnt;"
print " u16 bitArr[512];"
print " m256 p_mask;"
print " m256 val_0;"
print " m256 val_0_lo;"
print " m256 val_0_hi;"
print " m256 res_0;"
print " m256 res_1;"
print " m128 lo_part;"
print " m128 hi_part;"
print "#ifdef ARCH_64_BIT"
print " u64a r_0_part;"
print "#else"
print " u32 r_0_part;"
print "#endif"
def produce_bit_scan(self, offset, bits):
print " while (P0(!!r_0_part)) {"
if bits == 64:
print " bitArr[arrCnt++] = (u16)findAndClearLSB_64(&r_0_part) + 64 * %d;" % (offset)
else:
print " bitArr[arrCnt++] = (u16)findAndClearLSB_32(&r_0_part) + 32 * %d;" % (offset)
print " }"
def produce_bit_check_128(self, var_name, offset):
print " if (P0(isnonzero128(%s))) {" % (var_name)
print "#ifdef ARCH_64_BIT"
print " r_0_part = movq(%s);" % (var_name)
self.produce_bit_scan(offset, 64)
print " r_0_part = movq(byteShiftRight128(%s, 8));" % (var_name)
self.produce_bit_scan(offset + 1, 64)
print "#else"
print " r_0_part = movd(%s);" % (var_name)
self.produce_bit_scan(offset * 2, 32)
for step in range(1, 4):
print " r_0_part = movd(byteShiftRight128(%s, %d));" % (var_name, step * 4)
self.produce_bit_scan(offset * 2 + step, 32)
print "#endif"
print " }"
def produce_bit_check_256(self, iter, single_iter, cautious):
print " if (P0(isnonzero256(res_%d))) {" % (iter)
if single_iter:
print " arrCnt = 0;"
print " lo_part = cast256to128(res_%d);" % (iter)
print " hi_part = cast256to128(swap128in256(res_%d));" % (iter)
self.produce_bit_check_128("lo_part", iter * 4)
self.produce_bit_check_128("hi_part", iter * 4 + 2)
if single_iter:
self.produce_confirm(cautious)
print " }"
def produce_one_iteration_state_calc(self, iter, cautious):
if cautious:
print " val_0 = vectoredLoad256(&p_mask, ptr + %d, buf+a->start_offset, buf+len, a->buf_history, a->len_history);" % (iter * 32)
else:
print " val_0 = load256(ptr + %d);" % (iter * 32)
print " val_0_lo = and256(val_0, lomask);"
print " val_0_hi = rshift4x64(val_0, 4);"
print " val_0_hi = and256(val_0_hi, lomask);"
print " res_%d = and256(vpshufb(maskLo , val_0_lo), vpshufb(maskHi, val_0_hi));" % (iter)
if cautious:
print " res_%d = and256(res_%d, p_mask);" % (iter, iter)
def produce_code(self):
print self.produce_header(visible = True, header_only = False)
print self.produce_common_declarations()
print
self.produce_needed_temporaries(self.num_iterations)
print " const struct Teddy * teddy = (const struct Teddy *)fdr;"
print " const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
print " const m256 maskLo = set2x128(maskBase[0]);"
print " const m256 maskHi = set2x128(maskBase[1]);"
print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + 32);"
print " const u8 * mainStart = ROUNDUP_PTR(ptr, 32);"
print " const size_t iterBytes = %d;" % (self.num_iterations * 32)
print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
' buf, len, a->start_offset);'
print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
' mainStart);'
print " const m256 lomask = set32x8(0xf);"
print " if (ptr < mainStart) {"
print " ptr = mainStart - 32;"
self.produce_one_iteration_state_calc(iter = 0, cautious = True)
self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
print " ptr += 32;"
print " }"
print " if (ptr + 32 < buf + len) {"
self.produce_one_iteration_state_calc(iter = 0, cautious = False)
self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
print " ptr += 32;"
print " }"
print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
print " __builtin_prefetch(ptr + (iterBytes*4));"
print self.produce_flood_check()
for iter in range (0, self.num_iterations):
self.produce_one_iteration_state_calc(iter = iter, cautious = False)
print " arrCnt = 0;"
for iter in range (0, self.num_iterations):
self.produce_bit_check_256(iter = iter, single_iter = False, cautious = False)
self.produce_confirm(cautious = False)
print " }"
print " for (; ptr < buf + len; ptr += 32) {"
self.produce_one_iteration_state_calc(iter = 0, cautious = True)
self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
print " }"
print self.produce_footer()
def get_name(self):
if self.packed:
pck_string = "_pck"
else:
pck_string = ""
return "fdr_exec_teddy_%s_msks%d%s_fast" % (self.arch.name, self.num_masks, pck_string)
def produce_compile_call(self):
packed_str = { False : "false", True : "true"}[self.packed]
print " { %d, %s, %d, %d, %s, %d, %d }," % (
self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
self.conf_pull_back, self.conf_top_level_split)
def __init__(self, arch, packed = False):
self.arch = arch
self.packed = packed
self.num_masks = 1
self.num_buckets = 8
self.num_iterations = 2
self.conf_top_level_split = 1
self.conf_pull_back = 0
if packed:
self.conf_top_level_split = 32
else:
self.conf_top_level_split = 1
self.conf_pull_back = 0

src/fdr/teddy_avx2.c (new file, 1110 lines): diff suppressed because it is too large.


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -26,7 +26,6 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */
-#include "fdr.h"
#include "fdr_internal.h"
#include "fdr_compile_internal.h"
#include "fdr_confirm.h"
@ -65,7 +64,32 @@ bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const
    return false;
}
-#include "teddy_autogen_compiler.cpp"
void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
static const TeddyEngineDef defns[] = {
{ 1, 0 | HS_CPU_FEATURES_AVX2, 1, 8, false, 0, 1 },
{ 2, 0 | HS_CPU_FEATURES_AVX2, 1, 8, true, 0, 32 },
{ 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false, 0, 1 },
{ 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true, 0, 32 },
{ 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false, 0, 1 },
{ 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true, 0, 32 },
{ 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false, 0, 1 },
{ 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true, 0, 32 },
{ 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false, 0, 1 },
{ 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true, 0, 32 },
{ 11, 0, 1, 8, false, 0, 1 },
{ 12, 0, 1, 8, true, 0, 32 },
{ 13, 0, 2, 8, false, 0, 1 },
{ 14, 0, 2, 8, true, 0, 32 },
{ 15, 0, 3, 8, false, 0, 1 },
{ 16, 0, 3, 8, true, 0, 32 },
{ 17, 0, 4, 8, false, 0, 1 },
{ 18, 0, 4, 8, true, 0, 32 },
};
out->clear();
for (const auto &def : defns) {
out->emplace_back(def);
}
}
static
size_t maxFloodTailLen(const vector<hwlmLiteral> &vl) {


@ -0,0 +1,256 @@
/*
* Copyright (c) 2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Teddy literal matcher: common runtime procedures.
*/
#ifndef TEDDY_RUNTIME_COMMON_H_
#define TEDDY_RUNTIME_COMMON_H_
#include "fdr_confirm.h"
#include "fdr_confirm_runtime.h"
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
#ifdef ARCH_64_BIT
#define TEDDY_CONF_TYPE u64a
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
#else
#define TEDDY_CONF_TYPE u32
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
#endif
#define CHECK_HWLM_TERMINATE_MATCHING \
do { \
if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \
*a->groups = controlVal; \
return HWLM_TERMINATED; \
} \
} while (0);
#define CHECK_FLOOD \
do { \
if (unlikely(ptr > tryFloodDetect)) { \
tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, \
&floodBackoff, &controlVal, \
iterBytes); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} while (0);
/*
* \brief Copy a block of [0,15] bytes efficiently.
*
* This function is a workaround intended to stop some compilers from
* synthesizing a memcpy function call out of the copy of a small number of
* bytes that we do in vectoredLoad128.
*/
static really_inline
void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
switch (len) {
case 0:
break;
case 1:
*dst = *src;
break;
case 2:
unaligned_store_u16(dst, unaligned_load_u16(src));
break;
case 3:
unaligned_store_u16(dst, unaligned_load_u16(src));
dst[2] = src[2];
break;
case 4:
unaligned_store_u32(dst, unaligned_load_u32(src));
break;
case 5:
case 6:
case 7:
/* Perform copy with two overlapping 4-byte chunks. */
unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
unaligned_store_u32(dst, unaligned_load_u32(src));
break;
case 8:
unaligned_store_u64a(dst, unaligned_load_u64a(src));
break;
default:
/* Perform copy with two overlapping 8-byte chunks. */
assert(len < 16);
unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
unaligned_store_u64a(dst, unaligned_load_u64a(src));
break;
}
}
// Note: p_mask is an output param that initialises a poison mask.
static really_inline
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
union {
u8 val8[16];
m128 val128;
} u;
u.val128 = zeroes128();
uintptr_t copy_start;
uintptr_t copy_len;
if (ptr >= lo) {
uintptr_t avail = (uintptr_t)(hi - ptr);
if (avail >= 16) {
*p_mask = load128(p_mask_arr[16] + 16);
return loadu128(ptr);
}
*p_mask = load128(p_mask_arr[avail] + 16);
copy_start = 0;
copy_len = avail;
} else {
uintptr_t need = MIN((uintptr_t)(lo - ptr),
MIN(len_history, nMasks - 1));
uintptr_t start = (uintptr_t)(lo - ptr);
uintptr_t i;
for (i = start - need; ptr + i < lo; i++) {
u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
}
uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
*p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
copy_start = i;
copy_len = end - i;
}
// Runt block from the buffer.
copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len);
return u.val128;
}
static really_inline
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
CautionReason reason) {
u64a confVal = 0;
const u8 *buf = a->buf;
size_t len = a->len;
const u8 *confirm_loc = ptr + byte - 7;
if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
confVal = lv_u64a(confirm_loc, buf, buf + len);
} else { // r == VECTORING, confirm_loc < buf
u64a histBytes = a->histBytes;
confVal = lv_u64a_ce(confirm_loc, buf, buf + len);
// stitch together confVal and history
u32 overhang = buf - confirm_loc;
histBytes >>= 64 - (overhang * 8);
confVal |= histBytes;
}
return confVal;
}
static really_inline
void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
const u32 *confBase, CautionReason reason,
const struct FDR_Runtime_Args *a, const u8 *ptr,
hwlmcb_rv_t *control, u32 *last_match) {
do {
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
u32 byte = bit / bucket + offset;
u32 bitRem = bit % bucket;
u32 confSplit = *(ptr+byte) & 0x1f;
u32 idx = confSplit * bucket + bitRem;
u32 cf = confBase[idx];
if (!cf) {
continue;
}
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
if (!(fdrc->groups & *control)) {
continue;
}
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBit(fdrc, a, ptr - a->buf + byte, 0, control,
last_match, confVal);
} while (unlikely(*conf));
}
static really_inline
void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
const u32 *confBase, CautionReason reason,
const struct FDR_Runtime_Args *a, const u8 *ptr,
hwlmcb_rv_t *control, u32 *last_match) {
do {
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
u32 byte = bit / bucket + offset;
u32 idx = bit % bucket;
u32 cf = confBase[idx];
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
if (!(fdrc->groups & *control)) {
continue;
}
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match,
confVal);
} while (unlikely(*conf));
}
static really_inline
void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
const u32 *confBase, CautionReason reason,
const struct FDR_Runtime_Args *a, const u8 *ptr,
hwlmcb_rv_t *control, u32 *last_match) {
do {
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
u32 byte = bit / bucket + offset;
u32 idx = bit % bucket;
u32 cf = confBase[idx];
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
if (!(fdrc->groups & *control)) {
continue;
}
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control,
last_match, confVal);
} while (unlikely(*conf));
}
static really_inline
const m128 * getMaskBase(const struct Teddy *teddy) {
return (const m128 *)((const u8 *)teddy + sizeof(struct Teddy));
}
static really_inline
const u32 * getConfBase(const struct Teddy *teddy, u8 numMask) {
return (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) +
(numMask*32));
}
#endif /* TEDDY_RUNTIME_COMMON_H_ */


@ -54,7 +54,6 @@ Grey::Grey(void) :
    allowRose(true),
    allowExtendedNFA(true), /* bounded repeats of course */
    allowLimExNFA(true),
-   allowSidecar(true),
    allowAnchoredAcyclic(true),
    allowSmallLiteralSet(true),
    allowCastle(true),
@ -207,7 +206,6 @@ void applyGreyOverrides(Grey *g, const string &s) {
    G_UPDATE(allowRose);
    G_UPDATE(allowExtendedNFA);
    G_UPDATE(allowLimExNFA);
-   G_UPDATE(allowSidecar);
    G_UPDATE(allowAnchoredAcyclic);
    G_UPDATE(allowSmallLiteralSet);
    G_UPDATE(allowCastle);


@ -54,7 +54,6 @@ struct Grey {
    bool allowRose;
    bool allowExtendedNFA;
    bool allowLimExNFA;
-   bool allowSidecar;
    bool allowAnchoredAcyclic;
    bool allowSmallLiteralSet;
    bool allowCastle;


@ -39,6 +39,7 @@
#include "compiler/error.h"
#include "nfagraph/ng.h"
#include "nfagraph/ng_expr_info.h"
+#include "nfagraph/ng_extparam.h"
#include "parser/parse_error.h"
#include "parser/Parser.h"
#include "parser/prefilter.h"
@ -310,7 +311,8 @@ hs_error_t hs_compile_ext_multi(const char * const *expressions,
static
hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
-                                  unsigned int mode, hs_expr_info_t **info,
+                                  const hs_expr_ext_t *ext, unsigned int mode,
+                                  hs_expr_info_t **info,
                                  hs_compile_error_t **error) {
    if (!error) {
        // nowhere to write an error, but we can still return an error code.
@ -347,7 +349,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
    }

    ReportManager rm(cc.grey);
-    ParsedExpression pe(0, expression, flags, 0);
+    ParsedExpression pe(0, expression, flags, 0, ext);
    assert(pe.component);

    // Apply prefiltering transformations if desired.
@ -362,6 +364,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
            throw ParseError("Internal error.");
        }

+        handleExtendedParams(rm, *g, cc);
        fillExpressionInfo(rm, *g, &local_info);
    }
    catch (const CompileError &e) {
@ -394,7 +397,16 @@ extern "C" HS_PUBLIC_API
hs_error_t hs_expression_info(const char *expression, unsigned int flags,
                              hs_expr_info_t **info,
                              hs_compile_error_t **error) {
-    return hs_expression_info_int(expression, flags, HS_MODE_BLOCK, info,
+    return hs_expression_info_int(expression, flags, nullptr, HS_MODE_BLOCK,
+                                  info, error);
+}
+
+extern "C" HS_PUBLIC_API
+hs_error_t hs_expression_ext_info(const char *expression, unsigned int flags,
+                                  const hs_expr_ext_t *ext,
+                                  hs_expr_info_t **info,
+                                  hs_compile_error_t **error) {
+    return hs_expression_info_int(expression, flags, ext, HS_MODE_BLOCK, info,
                                  error);
}


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -500,6 +500,25 @@ const char *hs_version(void);
 */
#define HS_BAD_ALLOC (-9)
/**
* The scratch region was already in use.
*
* This error is returned when Hyperscan is able to detect that the scratch
* region given is already in use by another Hyperscan API call.
*
* A separate scratch region, allocated with @ref hs_alloc_scratch() or @ref
* hs_clone_scratch(), is required for every concurrent caller of the Hyperscan
* API.
*
* For example, this error might be returned when @ref hs_scan() has been
* called inside a callback delivered by a currently-executing @ref hs_scan()
* call using the same scratch region.
*
* Note: Not all concurrent uses of scratch regions may be detected. This error
* is intended as a best-effort debugging tool, not a guarantee.
*/
#define HS_SCRATCH_IN_USE (-10)
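To illustrate the scenario described above, here is a hedged sketch (not part of this header): the hs_scan() call, the callback signature and the error value are from the public API, while the scan_ctx structure and its field names are invented for the example. The callback re-enters the scanner with the scratch region that is already driving it, which is expected to fail with HS_SCRATCH_IN_USE:

#include <hs.h>

struct scan_ctx {
    const hs_database_t *db; /* hypothetical context carried via 'context' */
    const char *data;
    unsigned int len;
    hs_scratch_t *scratch;   /* the scratch already in use by the outer scan */
};

static int onMatch(unsigned int id, unsigned long long from,
                   unsigned long long to, unsigned int flags, void *context) {
    struct scan_ctx *sc = (struct scan_ctx *)context;
    /* Re-entering the API with the same scratch region is detected
     * (best-effort) and rejected with HS_SCRATCH_IN_USE. */
    hs_error_t err = hs_scan(sc->db, sc->data, sc->len, 0, sc->scratch,
                             onMatch, context);
    if (err == HS_SCRATCH_IN_USE) {
        /* Allocate a separate scratch (hs_alloc_scratch/hs_clone_scratch)
         * for nested or concurrent calls instead. */
    }
    return 0; /* continue matching */
}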
/** @} */
#ifdef __cplusplus


@ -158,7 +158,7 @@ typedef struct hs_platform_info {
/**
 * A type containing information related to an expression that is returned by
- * @ref hs_expression_info().
+ * @ref hs_expression_info() or @ref hs_expression_ext_info.
 */
typedef struct hs_expr_info {
    /**
@ -201,7 +201,8 @@ typedef struct hs_expr_info {
/**
 * A structure containing additional parameters related to an expression,
- * passed in at build time to @ref hs_compile_ext_multi().
+ * passed in at build time to @ref hs_compile_ext_multi() or @ref
+ * hs_expression_ext_info.
 *
 * These parameters allow the set of matches produced by a pattern to be
 * constrained at compile time, rather than relying on the application to
@ -401,7 +402,7 @@ hs_error_t hs_compile_multi(const char *const *expressions,
                            hs_database_t **db, hs_compile_error_t **error);

/**
- * The multiple regular expression compiler with extended pattern support.
+ * The multiple regular expression compiler with extended parameter support.
 *
 * This function call compiles a group of expressions into a database in the
 * same way as @ref hs_compile_multi(), but allows additional parameters to be
@ -550,6 +551,62 @@ hs_error_t hs_expression_info(const char *expression, unsigned int flags,
                              hs_expr_info_t **info,
                              hs_compile_error_t **error);
/**
* Utility function providing information about a regular expression, with
* extended parameter support. The information provided in @ref hs_expr_info_t
* includes the minimum and maximum width of a pattern match.
*
* @param expression
* The NULL-terminated expression to parse. Note that this string must
* represent ONLY the pattern to be matched, with no delimiters or flags;
* any global flags should be specified with the @a flags argument. For
* example, the expression `/abc?def/i` should be compiled by providing
* `abc?def` as the @a expression, and @ref HS_FLAG_CASELESS as the @a
* flags.
*
* @param flags
* Flags which modify the behaviour of the expression. Multiple flags may
* be used by ORing them together. Valid values are:
* - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
* - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - HS_FLAG_SINGLEMATCH - Only one match will be generated by the
* expression per stream.
* - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
* empty string, such as `.*`.
* - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - HS_FLAG_UCP - Use Unicode properties for character classes.
* - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
* when a match is found.
*
* @param ext
* A pointer to a filled @ref hs_expr_ext_t structure that defines
* extended behaviour for this pattern. NULL may be specified if no
* extended parameters are needed.
*
* @param info
* On success, a pointer to the pattern information will be returned in
* this parameter, or NULL on failure. This structure is allocated using
* the allocator supplied in @ref hs_set_allocator() (or malloc() if no
* allocator was set) and should be freed by the caller.
*
* @param error
* If the call fails, a pointer to a @ref hs_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* hs_free_compile_error() function.
*
* @return
* @ref HS_SUCCESS is returned on successful compilation; @ref
* HS_COMPILER_ERROR on failure, with details provided in the error
* parameter.
*/
hs_error_t hs_expression_ext_info(const char *expression, unsigned int flags,
const hs_expr_ext_t *ext,
hs_expr_info_t **info,
hs_compile_error_t **error);
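A hedged usage sketch of the new function (not part of this header; the pattern, flag and extended-parameter values are arbitrary, and the hs_expr_ext_t flag/field names used are the ones defined elsewhere in this header for hs_compile_ext_multi()):

#include <hs.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
    hs_expr_ext_t ext;
    memset(&ext, 0, sizeof(ext));
    ext.flags = HS_EXT_FLAG_MIN_OFFSET; /* only min_offset below is honoured */
    ext.min_offset = 16;                /* matches must end at offset >= 16 */

    hs_expr_info_t *info = NULL;
    hs_compile_error_t *compile_err = NULL;
    if (hs_expression_ext_info("foo.*bar", HS_FLAG_DOTALL, &ext, &info,
                               &compile_err) != HS_SUCCESS) {
        fprintf(stderr, "pattern rejected: %s\n", compile_err->message);
        hs_free_compile_error(compile_err);
        return 1;
    }
    printf("min width %u, max width %u\n", info->min_width, info->max_width);
    free(info); /* allocated with the configured allocator (malloc by default) */
    return 0;
}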
/**
 * Populates the platform information based on the current host.
 *


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -37,7 +37,6 @@
#include "noodle_build.h"
#include "ue2common.h"
#include "fdr/fdr_compile.h"
-#include "fdr/fdr.h"
#include "nfa/shufticompile.h"
#include "util/alloc.h"
#include "util/bitutils.h"
@ -526,8 +525,7 @@ aligned_unique_ptr<HWLM> hwlmBuild(const vector<hwlmLiteral> &lits,
        DEBUG_PRINTF("build noodle table\n");
        engType = HWLM_ENGINE_NOOD;
        const hwlmLiteral &lit = lits.front();
-        auto noodle = noodBuildTable((const u8 *)lit.s.c_str(), lit.s.length(),
-                                     lit.nocase, lit.id);
+        auto noodle = noodBuildTable(lit);
        if (noodle) {
            engSize = noodSize(noodle.get());
        }


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -34,13 +34,11 @@
#include "util/compare.h" // for ourisalpha
#include "util/ue2string.h" // for escapeString

-#include <algorithm>
#include <iomanip>
#include <sstream>

-#include <boost/algorithm/cxx11/all_of.hpp>
-
using namespace std;
-using namespace boost::algorithm;

namespace ue2 {
@ -91,10 +89,17 @@ hwlmLiteral::hwlmLiteral(const std::string &s_in, bool nocase_in,
    assert(msk.size() <= HWLM_MASKLEN);
    assert(msk.size() == cmp.size());

-    DEBUG_PRINTF("literal '%s', msk=%s, cmp=%s\n",
-                 escapeString(s).c_str(), dumpMask(msk).c_str(),
+    // If we've been handed a nocase literal, all letter characters must be
+    // upper-case.
+    if (nocase) {
+        upperString(s);
+    }
+
+    DEBUG_PRINTF("literal '%s'%s, msk=%s, cmp=%s\n", escapeString(s).c_str(),
+                 nocase ? " (nocase)" : "", dumpMask(msk).c_str(),
                 dumpMask(cmp).c_str());

    // Mask and compare vectors MUST be the same size.
    assert(msk.size() == cmp.size());
@ -102,7 +107,7 @@ hwlmLiteral::hwlmLiteral(const std::string &s_in, bool nocase_in,
    assert(maskIsConsistent(s, nocase, msk, cmp));

    // In the name of good hygiene, zap msk/cmp if msk is all zeroes.
-    if (all_of_equal(msk.begin(), msk.end(), 0)) {
+    if (all_of(begin(msk), end(msk), [](u8 val) { return val == 0; })) {
        msk.clear();
        cmp.clear();
    }


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -95,11 +95,6 @@ struct hwlmLiteral {
     */
    std::vector<u8> cmp;

-    /** \brief Simple constructor: no group information, no msk/cmp. */
-    hwlmLiteral(const std::string &s_in, bool nocase_in, u32 id_in)
-        : s(s_in), id(id_in), nocase(nocase_in), noruns(false),
-          groups(HWLM_ALL_GROUPS), msk(0), cmp(0) {}
-
    /** \brief Complete constructor, takes group information and msk/cmp.
     *
     * This constructor takes a msk/cmp pair. Both must be vectors of length <=
@ -107,6 +102,10 @@ struct hwlmLiteral {
    hwlmLiteral(const std::string &s_in, bool nocase_in, bool noruns_in,
                u32 id_in, hwlm_group_t groups_in,
                const std::vector<u8> &msk_in, const std::vector<u8> &cmp_in);
+
+    /** \brief Simple constructor: no group information, no msk/cmp. */
+    hwlmLiteral(const std::string &s_in, bool nocase_in, u32 id_in)
+        : hwlmLiteral(s_in, nocase_in, false, id_in, HWLM_ALL_GROUPS, {}, {}) {}
};
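For illustration only (not part of the diff; the literal text, id and include path are arbitrary examples): with the delegating constructor above, the short form and the fully specified form below build equivalent literals.

#include "hwlm/hwlm_literal.h" // path as laid out in the Hyperscan source tree

using namespace ue2;

// Short form: no group information, no msk/cmp; delegates to the complete
// constructor with noruns=false, HWLM_ALL_GROUPS and empty msk/cmp vectors.
hwlmLiteral lit_simple("abc", /*nocase=*/false, /*id=*/1);

// Equivalent fully specified form.
hwlmLiteral lit_full("abc", /*nocase=*/false, /*noruns=*/false, /*id=*/1,
                     HWLM_ALL_GROUPS, {}, {});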
/**


@ -26,28 +26,35 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

-/** \file
+/**
+ * \file
 * \brief Noodle literal matcher: build code.
 */

-#include <cstring> // for memcpy
-
#include "noodle_build.h"
+
+#include "hwlm_literal.h"
#include "noodle_internal.h"
+#include "ue2common.h"
#include "util/alloc.h"
#include "util/compare.h"
#include "util/verify_types.h"
-#include "ue2common.h"
+
+#include <cstring> // for memcpy

namespace ue2 {

static
-size_t findNoodFragOffset(const u8 *lit, size_t len, bool nocase) {
+size_t findNoodFragOffset(const hwlmLiteral &lit) {
+    const auto &s = lit.s;
+    const size_t len = lit.s.length();
+
    size_t offset = 0;
    for (size_t i = 0; i + 1 < len; i++) {
        int diff = 0;
-        const char c = lit[i];
-        const char d = lit[i + 1];
-        if (nocase && ourisalpha(c)) {
+        const char c = s[i];
+        const char d = s[i + 1];
+        if (lit.nocase && ourisalpha(c)) {
            diff = (mytoupper(c) != mytoupper(d));
        } else {
            diff = (c != d);
@ -60,21 +67,24 @@ size_t findNoodFragOffset(const u8 *lit, size_t len, bool nocase) {
    return offset;
}

-/** \brief Construct a Noodle matcher for the given literal. */
-aligned_unique_ptr<noodTable> noodBuildTable(const u8 *lit, size_t len,
-                                             bool nocase, u32 id) {
-    size_t noodle_len = sizeof(noodTable) + len;
-    aligned_unique_ptr<noodTable> n =
-        aligned_zmalloc_unique<noodTable>(noodle_len);
+aligned_unique_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit) {
+    if (!lit.msk.empty()) {
+        DEBUG_PRINTF("noodle can't handle supplementary masks\n");
+        return nullptr;
+    }
+
+    const auto &s = lit.s;
+    size_t noodle_len = sizeof(noodTable) + s.length();
+    auto n = aligned_zmalloc_unique<noodTable>(noodle_len);
    assert(n);

-    size_t key_offset = findNoodFragOffset(lit, len, nocase);
+    size_t key_offset = findNoodFragOffset(lit);

-    n->id = id;
-    n->len = verify_u32(len);
+    n->id = lit.id;
+    n->len = verify_u32(s.length());
    n->key_offset = verify_u32(key_offset);
-    n->nocase = nocase ? 1 : 0;
-    memcpy(n->str, lit, len);
+    n->nocase = lit.nocase ? 1 : 0;
+    memcpy(n->str, s.c_str(), s.length());

    return n;
}


@ -40,9 +40,10 @@ struct noodTable;
namespace ue2 {

+struct hwlmLiteral;
+
/** \brief Construct a Noodle matcher for the given literal. */
-ue2::aligned_unique_ptr<noodTable> noodBuildTable(const u8 *lit, size_t len,
-                                                  bool nocase, u32 id);
+ue2::aligned_unique_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit);

size_t noodSize(const noodTable *n);


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -37,6 +37,7 @@
#include "util/compare.h"
#include "util/masked_move.h"
#include "util/simd_utils.h"
+#include "util/simd_utils_ssse3.h"
#include <ctype.h>
#include <stdbool.h>


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -221,7 +221,7 @@ hwlm_error_t scanDoubleFast(const u8 *buf, size_t len, const u8 *key,
        u32 z0 = movemask256(eq256(mask1, v));
        u32 z1 = movemask256(eq256(mask2, v));
        u32 z = (lastz0 | (z0 << 1)) & z1;
-        lastz0 = (z0 & 0x80000000) >> 31;
+        lastz0 = z0 >> 31;
        // On large packet buffers, this prefetch appears to get us about 2%.
        __builtin_prefetch(d + 128);


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -190,8 +190,8 @@ hwlm_error_t scanDoubleFast(const u8 *buf, size_t len, const u8 *key,
        m128 v = noCase ? and128(load128(d), caseMask) : load128(d);
        m128 z1 = eq128(mask1, v);
        m128 z2 = eq128(mask2, v);
-        u32 z = movemask128(and128(or128(lastz1, shiftLeft8Bits(z1)), z2));
-        lastz1 = _mm_srli_si128(z1, 15);
+        u32 z = movemask128(and128(palignr(z1, lastz1, 15), z2));
+        lastz1 = z1;
        // On large packet buffers, this prefetch appears to get us about 2%.
        __builtin_prefetch(d + 128);


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -30,6 +30,9 @@
#include "shufti.h"
#include "truffle.h"
#include "vermicelli.h"
+#include "multishufti.h"
+#include "multitruffle.h"
+#include "multivermicelli.h"
#include "ue2common.h"

const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
@ -81,6 +84,18 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
                               c_end - 1);
        break;
case ACCEL_DVERM_MASKED:
DEBUG_PRINTF("accel dverm masked %p %p\n", c, c_end);
if (c + 16 + 1 >= c_end) {
return c;
}
/* need to stop one early to get an accurate end state */
rv = vermicelliDoubleMaskedExec(accel->dverm.c1, accel->dverm.c2,
accel->dverm.m1, accel->dverm.m2,
c, c_end - 1);
break;
    case ACCEL_SHUFTI:
        DEBUG_PRINTF("accel shufti %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
@ -117,6 +132,221 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
        rv = c_end;
        break;
/* multibyte matchers */
case ACCEL_MLVERM:
DEBUG_PRINTF("accel mlverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MLVERM_NOCASE:
DEBUG_PRINTF("accel mlverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MLGVERM:
DEBUG_PRINTF("accel mlgverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MLGVERM_NOCASE:
DEBUG_PRINTF("accel mlgverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MSVERM:
DEBUG_PRINTF("accel msverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MSVERM_NOCASE:
DEBUG_PRINTF("accel msverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MSGVERM:
DEBUG_PRINTF("accel msgverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MSGVERM_NOCASE:
DEBUG_PRINTF("accel msgverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MDSVERM:
DEBUG_PRINTF("accel mdsverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_vermicelliExec(accel->mdverm.c, 0, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MDSVERM_NOCASE:
DEBUG_PRINTF("accel mdsverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_vermicelliExec(accel->mdverm.c, 1, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MDSGVERM:
DEBUG_PRINTF("accel mdsgverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 0, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MDSGVERM_NOCASE:
DEBUG_PRINTF("accel mdsgverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 1, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MLSHUFTI:
DEBUG_PRINTF("accel mlshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MLGSHUFTI:
DEBUG_PRINTF("accel mlgshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MSSHUFTI:
DEBUG_PRINTF("accel msshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MSGSHUFTI:
DEBUG_PRINTF("accel msgshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MDSSHUFTI:
DEBUG_PRINTF("accel mdsshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end,
accel->mdshufti.len1, accel->mdshufti.len2);
break;
case ACCEL_MDSGSHUFTI:
DEBUG_PRINTF("accel msgshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end,
accel->mdshufti.len1, accel->mdshufti.len2);
break;
case ACCEL_MLTRUFFLE:
DEBUG_PRINTF("accel mltruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MLGTRUFFLE:
DEBUG_PRINTF("accel mlgtruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MSTRUFFLE:
DEBUG_PRINTF("accel mstruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MSGTRUFFLE:
DEBUG_PRINTF("accel msgtruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MDSTRUFFLE:
DEBUG_PRINTF("accel mdstruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_truffleExec(accel->mdtruffle.mask1,
accel->mdtruffle.mask2, c, c_end,
accel->mdtruffle.len1,
accel->mdtruffle.len2);
break;
case ACCEL_MDSGTRUFFLE:
DEBUG_PRINTF("accel mdsgtruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_truffleExec(accel->mdtruffle.mask1,
accel->mdtruffle.mask2, c, c_end,
accel->mdtruffle.len1,
accel->mdtruffle.len2);
break;
    default:
        assert(!"not here");
        return c;
@ -127,5 +357,7 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
    rv = MAX(c + accel->generic.offset, rv);
    rv -= accel->generic.offset;
+    DEBUG_PRINTF("advanced %zd\n", rv - c);

    return rv;
}


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -60,7 +60,37 @@ enum AccelType {
    ACCEL_SHUFTI,
    ACCEL_DSHUFTI,
    ACCEL_TRUFFLE,
-    ACCEL_RED_TAPE
+    ACCEL_RED_TAPE,
/* multibyte vermicellis */
ACCEL_MLVERM,
ACCEL_MLVERM_NOCASE,
ACCEL_MLGVERM,
ACCEL_MLGVERM_NOCASE,
ACCEL_MSVERM,
ACCEL_MSVERM_NOCASE,
ACCEL_MSGVERM,
ACCEL_MSGVERM_NOCASE,
ACCEL_MDSVERM,
ACCEL_MDSVERM_NOCASE,
ACCEL_MDSGVERM,
ACCEL_MDSGVERM_NOCASE,
/* multibyte shuftis */
ACCEL_MLSHUFTI,
ACCEL_MLGSHUFTI,
ACCEL_MSSHUFTI,
ACCEL_MSGSHUFTI,
ACCEL_MDSSHUFTI,
ACCEL_MDSGSHUFTI,
/* multibyte truffles */
ACCEL_MLTRUFFLE,
ACCEL_MLGTRUFFLE,
ACCEL_MSTRUFFLE,
ACCEL_MSGTRUFFLE,
ACCEL_MDSTRUFFLE,
ACCEL_MDSGTRUFFLE,
/* masked dverm */
ACCEL_DVERM_MASKED,
};

/** \brief Structure for accel framework. */
@ -80,7 +110,22 @@ union AccelAux {
        u8 offset;
        u8 c1; // uppercase if nocase
        u8 c2; // uppercase if nocase
u8 m1; // masked variant
u8 m2; // masked variant
    } dverm;
struct {
u8 accel_type;
u8 offset;
u8 c; // uppercase if nocase
u8 len;
} mverm;
struct {
u8 accel_type;
u8 offset;
u8 c; // uppercase if nocase
u8 len1;
u8 len2;
} mdverm;
struct { struct {
u8 accel_type; u8 accel_type;
u8 offset; u8 offset;
@ -95,12 +140,42 @@ union AccelAux {
m128 lo2; m128 lo2;
m128 hi2; m128 hi2;
} dshufti; } dshufti;
struct {
u8 accel_type;
u8 offset;
m128 lo;
m128 hi;
u8 len;
} mshufti;
struct {
u8 accel_type;
u8 offset;
m128 lo;
m128 hi;
u8 len1;
u8 len2;
} mdshufti;
struct { struct {
u8 accel_type; u8 accel_type;
u8 offset; u8 offset;
m128 mask1; m128 mask1;
m128 mask2; m128 mask2;
} truffle; } truffle;
struct {
u8 accel_type;
u8 offset;
m128 mask1;
m128 mask2;
u8 len;
} mtruffle;
struct {
u8 accel_type;
u8 offset;
m128 mask1;
m128 mask2;
u8 len1;
u8 len2;
} mdtruffle;
}; };
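
Every member added to AccelAux here keeps accel_type as its first byte, so the union remains discriminated by that one byte: the runtime reads accel_type and then interprets the rest of the state through the matching member. A small hypothetical accessor, assuming this header is included:

/* Hypothetical helper, not part of the library: read the multibyte long
 * vermicelli length only when accel_type says that member is the live one. */
static unsigned multibyte_long_verm_len(const union AccelAux *aux) {
    if (aux->accel_type == ACCEL_MLVERM || aux->accel_type == ACCEL_MLVERM_NOCASE) {
        return aux->mverm.len;
    }
    return 0;
}
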
/** /**

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -37,16 +37,21 @@
#include "shufticompile.h" #include "shufticompile.h"
#include "trufflecompile.h" #include "trufflecompile.h"
#include "ue2common.h" #include "ue2common.h"
#include "util/bitutils.h"
#include "util/charreach.h" #include "util/charreach.h"
#include "util/dump_charclass.h" #include "util/dump_charclass.h"
#include "util/dump_mask.h" #include "util/dump_mask.h"
#include "util/simd_utils.h"
#include <cstdio> #include <cstdio>
#include <vector>
#ifndef DUMP_SUPPORT #ifndef DUMP_SUPPORT
#error No dump support! #error No dump support!
#endif #endif
using namespace std;
namespace ue2 { namespace ue2 {
static static
@ -62,6 +67,8 @@ const char *accelName(u8 accel_type) {
return "double-vermicelli"; return "double-vermicelli";
case ACCEL_DVERM_NOCASE: case ACCEL_DVERM_NOCASE:
return "double-vermicelli nocase"; return "double-vermicelli nocase";
case ACCEL_DVERM_MASKED:
return "double-vermicelli masked";
case ACCEL_RVERM: case ACCEL_RVERM:
return "reverse vermicelli"; return "reverse vermicelli";
case ACCEL_RVERM_NOCASE: case ACCEL_RVERM_NOCASE:
@ -86,11 +93,144 @@ const char *accelName(u8 accel_type) {
return "truffle"; return "truffle";
case ACCEL_RED_TAPE: case ACCEL_RED_TAPE:
return "red tape"; return "red tape";
case ACCEL_MLVERM:
return "multibyte long vermicelli";
case ACCEL_MLVERM_NOCASE:
return "multibyte long vermicelli nocase";
case ACCEL_MLGVERM:
return "multibyte long-grab vermicelli";
case ACCEL_MLGVERM_NOCASE:
return "multibyte long-grab vermicelli nocase";
case ACCEL_MSVERM:
return "multibyte shift vermicelli";
case ACCEL_MSVERM_NOCASE:
return "multibyte shift vermicelli nocase";
case ACCEL_MSGVERM:
return "multibyte shift-grab vermicelli";
case ACCEL_MSGVERM_NOCASE:
return "multibyte shift-grab vermicelli nocase";
case ACCEL_MDSVERM:
return "multibyte doubleshift vermicelli";
case ACCEL_MDSVERM_NOCASE:
return "multibyte doubleshift vermicelli nocase";
case ACCEL_MDSGVERM:
return "multibyte doubleshift-grab vermicelli";
case ACCEL_MDSGVERM_NOCASE:
return "multibyte doubleshift-grab vermicelli nocase";
case ACCEL_MLSHUFTI:
return "multibyte long shufti";
case ACCEL_MLGSHUFTI:
return "multibyte long-grab shufti";
case ACCEL_MSSHUFTI:
return "multibyte shift shufti";
case ACCEL_MSGSHUFTI:
return "multibyte shift-grab shufti";
case ACCEL_MDSSHUFTI:
return "multibyte doubleshift shufti";
case ACCEL_MDSGSHUFTI:
return "multibyte doubleshift-grab shufti";
case ACCEL_MLTRUFFLE:
return "multibyte long truffle";
case ACCEL_MLGTRUFFLE:
return "multibyte long-grab truffle";
case ACCEL_MSTRUFFLE:
return "multibyte shift truffle";
case ACCEL_MSGTRUFFLE:
return "multibyte shift-grab truffle";
case ACCEL_MDSTRUFFLE:
return "multibyte doubleshift truffle";
case ACCEL_MDSGTRUFFLE:
return "multibyte doubleshift-grab truffle";
default: default:
return "unknown!"; return "unknown!";
} }
} }
static
void dumpShuftiCharReach(FILE *f, const m128 &lo, const m128 &hi) {
CharReach cr = shufti2cr(lo, hi);
fprintf(f, "count %zu class %s\n", cr.count(),
describeClass(cr).c_str());
}
static
vector<CharReach> shufti2cr_array(const m128 lo_in, const m128 hi_in) {
const u8 *lo = (const u8 *)&lo_in;
const u8 *hi = (const u8 *)&hi_in;
vector<CharReach> crs(8);
for (u32 i = 0; i < 256; i++) {
u32 combined = lo[(u8)i & 0xf] & hi[(u8)i >> 4];
while (combined) {
u32 j = findAndClearLSB_32(&combined);
crs.at(j).set(i);
}
}
return crs;
}
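
shufti2cr_array inverts the shufti encoding: each of the eight bucket bits corresponds to a character class, and a byte belongs to bucket j when bit j is set in both the low-nibble and high-nibble mask bytes. The forward membership test being undone looks roughly like this (illustrative sketch, not library code):

/* Illustrative: does byte b fall into shufti bucket 'bucket' for these masks? */
static int shufti_bucket_has_byte(const unsigned char lo[16],
                                  const unsigned char hi[16],
                                  unsigned char b, unsigned bucket) {
    unsigned combined = lo[b & 0xf] & hi[b >> 4];
    return (combined >> bucket) & 1;
}
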
static
void dumpDShuftiCharReach(FILE *f, const m128 &lo1, const m128 &hi1,
const m128 &lo2, const m128 &hi2) {
vector<CharReach> cr1 = shufti2cr_array(not128(lo1), not128(hi1));
vector<CharReach> cr2 = shufti2cr_array(not128(lo2), not128(hi2));
map<CharReach, set<u32> > cr1_group;
assert(cr1.size() == 8 && cr2.size() == 8);
for (u32 i = 0; i < 8; i++) {
if (!cr1[i].any()) {
continue;
}
cr1_group[cr1[i]].insert(i);
}
map<CharReach, CharReach> rev;
for (const auto &e : cr1_group) {
CharReach rhs;
for (u32 r : e.second) {
rhs |= cr2.at(r);
}
rev[rhs] |= e.first;
}
fprintf(f, "escapes: {");
for (auto it = rev.begin(); it != rev.end(); ++it) {
const auto &e = *it;
if (it != rev.begin()) {
fprintf(f, ", ");
}
if (e.first.all()) {
fprintf(f, "%s", describeClass(e.second).c_str());
} else {
fprintf(f, "%s%s", describeClass(e.second).c_str(),
describeClass(e.first).c_str());
}
}
fprintf(f, "}\n");
}
static
void dumpShuftiMasks(FILE *f, const m128 &lo, const m128 &hi) {
fprintf(f, "lo %s\n",
dumpMask((const u8 *)&lo, 128).c_str());
fprintf(f, "hi %s\n",
dumpMask((const u8 *)&hi, 128).c_str());
}
static
void dumpTruffleCharReach(FILE *f, const m128 &hiset, const m128 &hiclear) {
CharReach cr = truffle2cr(hiset, hiclear);
fprintf(f, "count %zu class %s\n", cr.count(),
describeClass(cr).c_str());
}
static
void dumpTruffleMasks(FILE *f, const m128 &hiset, const m128 &hiclear) {
fprintf(f, "lo %s\n",
dumpMask((const u8 *)&hiset, 128).c_str());
fprintf(f, "hi %s\n",
dumpMask((const u8 *)&hiclear, 128).c_str());
}
void dumpAccelInfo(FILE *f, const AccelAux &accel) { void dumpAccelInfo(FILE *f, const AccelAux &accel) {
fprintf(f, " %s", accelName(accel.accel_type)); fprintf(f, " %s", accelName(accel.accel_type));
if (accel.generic.offset) { if (accel.generic.offset) {
@ -110,39 +250,76 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) {
case ACCEL_RDVERM_NOCASE: case ACCEL_RDVERM_NOCASE:
fprintf(f, " [\\x%02hhx\\x%02hhx]\n", accel.dverm.c1, accel.dverm.c2); fprintf(f, " [\\x%02hhx\\x%02hhx]\n", accel.dverm.c1, accel.dverm.c2);
break; break;
case ACCEL_DVERM_MASKED:
fprintf(f, " [\\x%02hhx\\x%02hhx] & [\\x%02hhx\\x%02hhx]\n",
accel.dverm.c1, accel.dverm.c2, accel.dverm.m1, accel.dverm.m2);
break;
case ACCEL_SHUFTI: { case ACCEL_SHUFTI: {
fprintf(f, "\n"); fprintf(f, "\n");
fprintf(f, "lo %s\n", dumpShuftiMasks(f, accel.shufti.lo, accel.shufti.hi);
dumpMask((const u8 *)&accel.shufti.lo, 128).c_str()); dumpShuftiCharReach(f, accel.shufti.lo, accel.shufti.hi);
fprintf(f, "hi %s\n",
dumpMask((const u8 *)&accel.shufti.hi, 128).c_str());
CharReach cr = shufti2cr(accel.shufti.lo, accel.shufti.hi);
fprintf(f, "count %zu class %s\n", cr.count(),
describeClass(cr).c_str());
break; break;
} }
case ACCEL_DSHUFTI: case ACCEL_DSHUFTI:
fprintf(f, "\n"); fprintf(f, "\n");
fprintf(f, "lo1 %s\n", fprintf(f, "mask 1\n");
dumpMask((const u8 *)&accel.dshufti.lo1, 128).c_str()); dumpShuftiMasks(f, accel.dshufti.lo1, accel.dshufti.hi1);
fprintf(f, "hi1 %s\n", fprintf(f, "mask 2\n");
dumpMask((const u8 *)&accel.dshufti.hi1, 128).c_str()); dumpShuftiMasks(f, accel.dshufti.lo2, accel.dshufti.hi2);
fprintf(f, "lo2 %s\n", dumpDShuftiCharReach(f, accel.dshufti.lo1, accel.dshufti.hi1,
dumpMask((const u8 *)&accel.dshufti.lo2, 128).c_str()); accel.dshufti.lo2, accel.dshufti.hi2);
fprintf(f, "hi2 %s\n",
dumpMask((const u8 *)&accel.dshufti.hi2, 128).c_str());
break; break;
case ACCEL_TRUFFLE: { case ACCEL_TRUFFLE: {
fprintf(f, "\n"); fprintf(f, "\n");
fprintf(f, "lo %s\n", dumpTruffleMasks(f, accel.truffle.mask1, accel.truffle.mask2);
dumpMask((const u8 *)&accel.truffle.mask1, 128).c_str()); dumpTruffleCharReach(f, accel.truffle.mask1, accel.truffle.mask2);
fprintf(f, "hi %s\n",
dumpMask((const u8 *)&accel.truffle.mask2, 128).c_str());
CharReach cr = truffle2cr(accel.truffle.mask1, accel.truffle.mask2);
fprintf(f, "count %zu class %s\n", cr.count(),
describeClass(cr).c_str());
break; break;
} }
case ACCEL_MLVERM:
case ACCEL_MLVERM_NOCASE:
case ACCEL_MLGVERM:
case ACCEL_MLGVERM_NOCASE:
case ACCEL_MSVERM:
case ACCEL_MSVERM_NOCASE:
case ACCEL_MSGVERM:
case ACCEL_MSGVERM_NOCASE:
fprintf(f, " [\\x%02hhx] len:%u\n", accel.mverm.c, accel.mverm.len);
break;
case ACCEL_MDSVERM:
case ACCEL_MDSVERM_NOCASE:
case ACCEL_MDSGVERM:
case ACCEL_MDSGVERM_NOCASE:
fprintf(f, " [\\x%02hhx] len1:%u len2:%u\n", accel.mdverm.c, accel.mdverm.len1,
accel.mdverm.len2);
break;
case ACCEL_MLSHUFTI:
case ACCEL_MLGSHUFTI:
case ACCEL_MSSHUFTI:
case ACCEL_MSGSHUFTI:
fprintf(f, " len:%u\n", accel.mshufti.len);
dumpShuftiMasks(f, accel.mshufti.lo, accel.mshufti.hi);
dumpShuftiCharReach(f, accel.mshufti.lo, accel.mshufti.hi);
break;
case ACCEL_MDSSHUFTI:
case ACCEL_MDSGSHUFTI:
fprintf(f, " len1:%u len2:%u\n", accel.mdshufti.len1, accel.mdshufti.len2);
dumpShuftiMasks(f, accel.mdshufti.lo, accel.mdshufti.hi);
dumpShuftiCharReach(f, accel.mdshufti.lo, accel.mdshufti.hi);
break;
case ACCEL_MLTRUFFLE:
case ACCEL_MLGTRUFFLE:
case ACCEL_MSTRUFFLE:
case ACCEL_MSGTRUFFLE:
fprintf(f, " len:%u\n", accel.mtruffle.len);
dumpTruffleMasks(f, accel.mtruffle.mask1, accel.mtruffle.mask2);
dumpTruffleCharReach(f, accel.mtruffle.mask1, accel.mtruffle.mask2);
break;
case ACCEL_MDSTRUFFLE:
case ACCEL_MDSGTRUFFLE:
fprintf(f, " len1:%u len2:%u\n", accel.mdtruffle.len1, accel.mdtruffle.len2);
dumpTruffleMasks(f, accel.mdtruffle.mask1, accel.mdtruffle.mask2);
dumpTruffleCharReach(f, accel.mdtruffle.mask1, accel.mdtruffle.mask2);
break;
default: default:
fprintf(f, "\n"); fprintf(f, "\n");
break; break;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -94,6 +94,47 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) {
DEBUG_PRINTF("unable to accelerate case with %zu outs\n", outs); DEBUG_PRINTF("unable to accelerate case with %zu outs\n", outs);
} }
bool buildDvermMask(const flat_set<pair<u8, u8>> &escape_set, u8 *m1_out,
u8 *m2_out) {
u8 a1 = 0xff;
u8 a2 = 0xff;
u8 b1 = 0xff;
u8 b2 = 0xff;
for (const auto &e : escape_set) {
DEBUG_PRINTF("%0hhx %0hhx\n", e.first, e.second);
a1 &= e.first;
b1 &= ~e.first;
a2 &= e.second;
b2 &= ~e.second;
}
u8 m1 = a1 | b1;
u8 m2 = a2 | b2;
u32 holes1 = 8 - popcount32(m1);
u32 holes2 = 8 - popcount32(m2);
DEBUG_PRINTF("aaaa %0hhx %0hhx\n", a1, a2);
DEBUG_PRINTF("bbbb %0hhx %0hhx\n", b1, b2);
DEBUG_PRINTF("mask %0hhx %0hhx\n", m1, m2);
assert(holes1 <= 8 && holes2 <= 8);
assert(escape_set.size() <= 1U << (holes1 + holes2));
if (escape_set.size() != 1U << (holes1 + holes2)) {
return false;
}
if (m1_out) {
*m1_out = m1;
}
if (m2_out) {
*m2_out = m2;
}
return true;
}
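
buildDvermMask tries to express the whole two-byte escape set as one (value, mask) pair per byte position: a1/a2 accumulate the bits set in every escape byte, b1/b2 the bits clear in every escape byte, so m1/m2 end up set exactly where all escapes agree, and the clear "holes" are the bit positions left free. The representation is only exact when the escape set is the full cross product over those free bits, which is what the 2^(holes1 + holes2) size check enforces. A self-contained check of that arithmetic for the classic caseless pair, as a hypothetical plain-C example:

/* Worked check for the escape set {"az", "Az", "aZ", "AZ"}; plain C,
 * independent of the ue2 types used above. */
#include <assert.h>

int main(void) {
    unsigned a1 = 'a' & 'A', b1 = ~'a' & ~'A' & 0xff;
    unsigned a2 = 'z' & 'Z', b2 = ~'z' & ~'Z' & 0xff;
    unsigned m1 = a1 | b1, m2 = a2 | b2;
    assert(m1 == 0xdf && m2 == 0xdf);  /* only the 0x20 case bit is free */
    /* one hole per mask, so 2^(1 + 1) == 4 == |escape set|: a masked
     * double-vermicelli with c1 = 'a' & m1 and c2 = 'z' & m2 is exact. */
    return 0;
}
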
static static
bool isCaselessDouble(const flat_set<pair<u8, u8>> &stop) { bool isCaselessDouble(const flat_set<pair<u8, u8>> &stop) {
// test for vector containing <A,Z> <A,z> <a,Z> <a,z> // test for vector containing <A,Z> <A,z> <a,Z> <a,z>
@ -149,17 +190,31 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
return; return;
} }
if (outs1 + outs2 <= 8) { if (outs1 == 0) {
if (outs1 < outs2 && outs1 <= 2) { // Heuristic from UE-438. u8 m1;
DEBUG_PRINTF("building double-shufti for %zu one-byte and %zu" u8 m2;
" two-byte literals\n", outs1, outs2);
aux->accel_type = ACCEL_DSHUFTI; if (buildDvermMask(info.double_stop2, &m1, &m2)) {
aux->dshufti.offset = offset; aux->accel_type = ACCEL_DVERM_MASKED;
shuftiBuildDoubleMasks(info.double_stop1, info.double_stop2, aux->dverm.offset = offset;
&aux->dshufti.lo1, aux->dverm.c1 = info.double_stop2.begin()->first & m1;
&aux->dshufti.hi1, aux->dverm.c2 = info.double_stop2.begin()->second & m2;
&aux->dshufti.lo2, aux->dverm.m1 = m1;
&aux->dshufti.hi2); aux->dverm.m2 = m2;
DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n",
aux->dverm.c1, aux->dverm.c2);
return;
}
}
if (outs1 < outs2 && outs1 <= 2) { // Heuristic from UE-438.
DEBUG_PRINTF("building double-shufti for %zu one-byte and %zu"
" two-byte literals\n", outs1, outs2);
aux->accel_type = ACCEL_DSHUFTI;
aux->dshufti.offset = offset;
if (shuftiBuildDoubleMasks(info.double_stop1, info.double_stop2,
&aux->dshufti.lo1, &aux->dshufti.hi1,
&aux->dshufti.lo2, &aux->dshufti.hi2)) {
return; return;
} }
} }
@ -169,13 +224,285 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
aux->accel_type = ACCEL_NONE; aux->accel_type = ACCEL_NONE;
} }
static
void buildAccelMulti(const AccelInfo &info, AccelAux *aux) {
if (info.ma_type == MultibyteAccelInfo::MAT_NONE) {
DEBUG_PRINTF("no multimatch for us :(");
return;
}
u32 offset = info.multiaccel_offset;
const CharReach &stops = info.multiaccel_stops;
assert(aux->accel_type == ACCEL_NONE);
if (stops.all()) {
return;
}
size_t outs = stops.count();
DEBUG_PRINTF("%zu outs\n", outs);
assert(outs && outs < 256);
switch (info.ma_type) {
case MultibyteAccelInfo::MAT_LONG:
if (outs == 1) {
aux->accel_type = ACCEL_MLVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MLVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_LONGGRAB:
if (outs == 1) {
aux->accel_type = ACCEL_MLGVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MLGVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_SHIFT:
if (outs == 1) {
aux->accel_type = ACCEL_MSVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MSVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
if (outs == 1) {
aux->accel_type = ACCEL_MSGVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MSGVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_DSHIFT:
if (outs == 1) {
aux->accel_type = ACCEL_MDSVERM;
aux->mdverm.offset = offset;
aux->mdverm.c = stops.find_first();
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MDSVERM_NOCASE;
aux->mdverm.offset = offset;
aux->mdverm.c = stops.find_first() & CASE_CLEAR;
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
if (outs == 1) {
aux->accel_type = ACCEL_MDSGVERM;
aux->mdverm.offset = offset;
aux->mdverm.c = stops.find_first();
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MDSGVERM_NOCASE;
aux->mdverm.offset = offset;
aux->mdverm.c = stops.find_first() & CASE_CLEAR;
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
default:
// shouldn't happen
assert(0);
return;
}
DEBUG_PRINTF("attempting shufti for %zu chars\n", outs);
switch (info.ma_type) {
case MultibyteAccelInfo::MAT_LONG:
if (shuftiBuildMasks(stops, &aux->mshufti.lo,
&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MLSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_LONGGRAB:
if (shuftiBuildMasks(stops, &aux->mshufti.lo,
&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MLGSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_SHIFT:
if (shuftiBuildMasks(stops, &aux->mshufti.lo,
&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MSSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
if (shuftiBuildMasks(stops, &aux->mshufti.lo,
&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MSGSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_DSHIFT:
if (shuftiBuildMasks(stops, &aux->mdshufti.lo,
&aux->mdshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MDSSHUFTI;
aux->mdshufti.offset = offset;
aux->mdshufti.len1 = info.ma_len1;
aux->mdshufti.len2 = info.ma_len2;
return;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
if (shuftiBuildMasks(stops, &aux->mdshufti.lo,
&aux->mdshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MDSGSHUFTI;
aux->mdshufti.offset = offset;
aux->mdshufti.len1 = info.ma_len1;
aux->mdshufti.len2 = info.ma_len2;
return;
default:
// shouldn't happen
assert(0);
return;
}
DEBUG_PRINTF("shufti build failed, falling through\n");
if (outs <= ACCEL_MAX_STOP_CHAR) {
DEBUG_PRINTF("building Truffle for %zu chars\n", outs);
switch (info.ma_type) {
case MultibyteAccelInfo::MAT_LONG:
aux->accel_type = ACCEL_MLTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_LONGGRAB:
aux->accel_type = ACCEL_MLGTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_SHIFT:
aux->accel_type = ACCEL_MSTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
aux->accel_type = ACCEL_MSGTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_DSHIFT:
aux->accel_type = ACCEL_MDSTRUFFLE;
aux->mdtruffle.offset = offset;
aux->mdtruffle.len1 = info.ma_len1;
aux->mdtruffle.len2 = info.ma_len2;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mdtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
aux->accel_type = ACCEL_MDSGTRUFFLE;
aux->mdtruffle.offset = offset;
aux->mdtruffle.len1 = info.ma_len1;
aux->mdtruffle.len2 = info.ma_len2;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mdtruffle.mask2);
break;
default:
// shouldn't happen
assert(0);
return;
}
return;
}
DEBUG_PRINTF("unable to accelerate multibyte case with %zu outs\n", outs);
}
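
The long switch above follows the same fallback order as the single- and double-byte builders: prefer a vermicelli form when the stop set is a single character (or a caseless pair), then try shufti masks, and finally fall back to truffle while the stop count stays within ACCEL_MAX_STOP_CHAR. A schematic of that ordering only, as a hypothetical helper; the real logic also dispatches on ma_type and checks stops.isCaselessChar():

static const char *pick_multibyte_family(size_t outs, int shufti_built) {
    if (outs == 1 || outs == 2) {
        return "vermicelli";    /* single stop char, or a caseless pair */
    }
    if (shufti_built) {
        return "shufti";        /* shuftiBuildMasks() did not fail */
    }
    return outs <= ACCEL_MAX_STOP_CHAR ? "truffle" : "none";
}
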
bool buildAccelAux(const AccelInfo &info, AccelAux *aux) { bool buildAccelAux(const AccelInfo &info, AccelAux *aux) {
assert(aux->accel_type == ACCEL_NONE); assert(aux->accel_type == ACCEL_NONE);
if (info.single_stops.none()) { if (info.single_stops.none()) {
DEBUG_PRINTF("picked red tape\n"); DEBUG_PRINTF("picked red tape\n");
aux->accel_type = ACCEL_RED_TAPE; aux->accel_type = ACCEL_RED_TAPE;
aux->generic.offset = info.single_offset; aux->generic.offset = info.single_offset;
} else { }
if (aux->accel_type == ACCEL_NONE) {
buildAccelMulti(info, aux);
}
if (aux->accel_type == ACCEL_NONE) {
buildAccelDouble(info, aux); buildAccelDouble(info, aux);
} }
if (aux->accel_type == ACCEL_NONE) { if (aux->accel_type == ACCEL_NONE) {

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -37,9 +37,30 @@ union AccelAux;
namespace ue2 { namespace ue2 {
struct MultibyteAccelInfo {
/* multibyte accel schemes, ordered by strength */
enum multiaccel_type {
MAT_SHIFT,
MAT_SHIFTGRAB,
MAT_DSHIFT,
MAT_DSHIFTGRAB,
MAT_LONG,
MAT_LONGGRAB,
MAT_MAX,
MAT_NONE = MAT_MAX
};
CharReach cr;
u32 offset = 0;
u32 len1 = 0;
u32 len2 = 0;
multiaccel_type type = MAT_NONE;
};
struct AccelInfo { struct AccelInfo {
AccelInfo() : single_offset(0U), double_offset(0U), AccelInfo() : single_offset(0U), double_offset(0U),
single_stops(CharReach::dot()) {} single_stops(CharReach::dot()),
multiaccel_offset(0), ma_len1(0), ma_len2(0),
ma_type(MultibyteAccelInfo::MAT_NONE) {}
u32 single_offset; /**< offset correction to apply to single schemes */ u32 single_offset; /**< offset correction to apply to single schemes */
u32 double_offset; /**< offset correction to apply to double schemes */ u32 double_offset; /**< offset correction to apply to double schemes */
CharReach double_stop1; /**< single-byte accel stop literals for double CharReach double_stop1; /**< single-byte accel stop literals for double
@ -47,10 +68,19 @@ struct AccelInfo {
flat_set<std::pair<u8, u8>> double_stop2; /**< double-byte accel stop flat_set<std::pair<u8, u8>> double_stop2; /**< double-byte accel stop
* literals */ * literals */
CharReach single_stops; /**< escapes for single byte acceleration */ CharReach single_stops; /**< escapes for single byte acceleration */
u32 multiaccel_offset; /**< offset correction to apply to multibyte schemes */
CharReach multiaccel_stops; /**< escapes for multibyte acceleration */
u32 ma_len1; /**< multiaccel len1 */
u32 ma_len2; /**< multiaccel len2 */
MultibyteAccelInfo::multiaccel_type ma_type; /**< multiaccel type */
}; };
bool buildAccelAux(const AccelInfo &info, AccelAux *aux); bool buildAccelAux(const AccelInfo &info, AccelAux *aux);
/* returns true if the escape set can be handled with a masked double_verm */
bool buildDvermMask(const flat_set<std::pair<u8, u8>> &escape_set,
u8 *m1_out = nullptr, u8 *m2_out = nullptr);
} // namespace ue2 } // namespace ue2
#endif #endif

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -96,7 +96,8 @@ char subCastleReportCurrent(const struct Castle *c, struct mq *q,
repeatHasMatch(info, rctrl, rstate, offset); repeatHasMatch(info, rctrl, rstate, offset);
DEBUG_PRINTF("repeatHasMatch returned %d\n", match); DEBUG_PRINTF("repeatHasMatch returned %d\n", match);
if (match == REPEAT_MATCH) { if (match == REPEAT_MATCH) {
DEBUG_PRINTF("firing match at %llu for sub %u\n", offset, subIdx); DEBUG_PRINTF("firing match at %llu for sub %u, report %u\n", offset,
subIdx, sub->report);
if (q->cb(offset, sub->report, q->context) == MO_HALT_MATCHING) { if (q->cb(offset, sub->report, q->context) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING; return MO_HALT_MATCHING;
} }
@ -111,17 +112,22 @@ int castleReportCurrent(const struct Castle *c, struct mq *q) {
DEBUG_PRINTF("offset=%llu\n", offset); DEBUG_PRINTF("offset=%llu\n", offset);
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *active = (u8 *)q->streamState;
c->activeIdxSize); u8 *groups = active + c->groupIterOffset;
DEBUG_PRINTF("subcastle %u\n", activeIdx); for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
if (activeIdx < c->numRepeats && subCastleReportCurrent(c, q, i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
offset, activeIdx) == MO_HALT_MATCHING) { u8 *cur = active + i * c->activeIdxSize;
return MO_HALT_MATCHING; const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
DEBUG_PRINTF("subcastle %u\n", activeIdx);
if (subCastleReportCurrent(c, q,
offset, activeIdx) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
const u8 *active = (const u8 *)q->streamState + c->activeIdxSize; const u8 *active = (const u8 *)q->streamState + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) {
DEBUG_PRINTF("subcastle %u\n", i); DEBUG_PRINTF("subcastle %u\n", i);
@ -162,11 +168,18 @@ static really_inline
char castleInAccept(const struct Castle *c, struct mq *q, char castleInAccept(const struct Castle *c, struct mq *q,
const ReportID report, const u64a offset) { const ReportID report, const u64a offset) {
DEBUG_PRINTF("offset=%llu\n", offset); DEBUG_PRINTF("offset=%llu\n", offset);
/* ignore when just catching up due to full queue */
if (report == MO_INVALID_IDX) {
return 0;
}
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *active = (u8 *)q->streamState;
c->activeIdxSize); u8 *groups = active + c->groupIterOffset;
if (activeIdx < c->numRepeats) { for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
u8 *cur = active + i * c->activeIdxSize;
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
DEBUG_PRINTF("subcastle %u\n", activeIdx); DEBUG_PRINTF("subcastle %u\n", activeIdx);
if (subCastleInAccept(c, q, report, offset, activeIdx)) { if (subCastleInAccept(c, q, report, offset, activeIdx)) {
return 1; return 1;
@ -174,11 +187,10 @@ char castleInAccept(const struct Castle *c, struct mq *q,
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
const u8 *active = (const u8 *)q->streamState + c->activeIdxSize; const u8 *active = (const u8 *)q->streamState + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) {
i = mmbit_iterate(active, c->numRepeats, i)) {
DEBUG_PRINTF("subcastle %u\n", i); DEBUG_PRINTF("subcastle %u\n", i);
if (subCastleInAccept(c, q, report, offset, i)) { if (subCastleInAccept(c, q, report, offset, i)) {
return 1; return 1;
@ -193,7 +205,6 @@ static really_inline
void subCastleDeactivateStaleSubs(const struct Castle *c, const u64a offset, void subCastleDeactivateStaleSubs(const struct Castle *c, const u64a offset,
void *full_state, void *stream_state, void *full_state, void *stream_state,
const u32 subIdx) { const u32 subIdx) {
u8 *active = (u8 *)stream_state;
const struct SubCastle *sub = getSubCastle(c, subIdx); const struct SubCastle *sub = getSubCastle(c, subIdx);
const struct RepeatInfo *info = getRepeatInfo(sub); const struct RepeatInfo *info = getRepeatInfo(sub);
@ -203,10 +214,13 @@ void subCastleDeactivateStaleSubs(const struct Castle *c, const u64a offset,
if (repeatHasMatch(info, rctrl, rstate, offset) == REPEAT_STALE) { if (repeatHasMatch(info, rctrl, rstate, offset) == REPEAT_STALE) {
DEBUG_PRINTF("sub %u is stale at offset %llu\n", subIdx, offset); DEBUG_PRINTF("sub %u is stale at offset %llu\n", subIdx, offset);
if (sub->exclusive) { if (sub->exclusiveId < c->numRepeats) {
partial_store_u32(stream_state, c->numRepeats, c->activeIdxSize); u8 *active = (u8 *)stream_state;
u8 *groups = active + c->groupIterOffset;
mmbit_unset(groups, c->numGroups, sub->exclusiveId);
} else { } else {
mmbit_unset(active + c->activeIdxSize, c->numRepeats, subIdx); u8 *active = (u8 *)stream_state + c->activeOffset;
mmbit_unset(active, c->numRepeats, subIdx);
} }
} }
} }
@ -216,30 +230,47 @@ void castleDeactivateStaleSubs(const struct Castle *c, const u64a offset,
void *full_state, void *stream_state) { void *full_state, void *stream_state) {
DEBUG_PRINTF("offset=%llu\n", offset); DEBUG_PRINTF("offset=%llu\n", offset);
if (!c->staleIterOffset) {
DEBUG_PRINTF("{no repeats can go stale}\n");
return; /* no subcastle can ever go stale */
}
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(stream_state, c->activeIdxSize); u8 *active = (u8 *)stream_state;
if (activeIdx < c->numRepeats) { u8 *groups = active + c->groupIterOffset;
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
u8 *cur = active + i * c->activeIdxSize;
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
DEBUG_PRINTF("subcastle %u\n", activeIdx); DEBUG_PRINTF("subcastle %u\n", activeIdx);
subCastleDeactivateStaleSubs(c, offset, full_state, subCastleDeactivateStaleSubs(c, offset, full_state,
stream_state, activeIdx); stream_state, activeIdx);
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
const u8 *active = (const u8 *)stream_state + c->activeIdxSize; const u8 *active = (const u8 *)stream_state + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); const struct mmbit_sparse_iter *it
i != MMB_INVALID; = (const void *)((const char *)c + c->staleIterOffset);
i = mmbit_iterate(active, c->numRepeats, i)) {
struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES];
u32 numRepeats = c->numRepeats;
u32 idx = 0;
u32 i = mmbit_sparse_iter_begin(active, numRepeats, &idx, it, si_state);
while (i != MMB_INVALID) {
DEBUG_PRINTF("subcastle %u\n", i); DEBUG_PRINTF("subcastle %u\n", i);
subCastleDeactivateStaleSubs(c, offset, full_state, subCastleDeactivateStaleSubs(c, offset, full_state, stream_state, i);
stream_state, i); i = mmbit_sparse_iter_next(active, numRepeats, i, &idx, it,
si_state);
} }
} }
} }
static really_inline static really_inline
void castleProcessTop(const struct Castle *c, const u32 top, const u64a offset, void castleProcessTop(const struct Castle *c, const u32 top, const u64a offset,
void *full_state, void *stream_state) { void *full_state, void *stream_state,
UNUSED char stale_checked) {
assert(top < c->numRepeats); assert(top < c->numRepeats);
const struct SubCastle *sub = getSubCastle(c, top); const struct SubCastle *sub = getSubCastle(c, top);
@ -249,12 +280,20 @@ void castleProcessTop(const struct Castle *c, const u32 top, const u64a offset,
info->packedCtrlSize; info->packedCtrlSize;
char is_alive = 0; char is_alive = 0;
if (sub->exclusive) { u8 *active = (u8 *)stream_state;
const u32 activeIdx = partial_load_u32(stream_state, c->activeIdxSize); if (sub->exclusiveId < c->numRepeats) {
is_alive = (activeIdx == top); u8 *groups = active + c->groupIterOffset;
partial_store_u32(stream_state, top, c->activeIdxSize); active += sub->exclusiveId * c->activeIdxSize;
if (mmbit_set(groups, c->numGroups, sub->exclusiveId)) {
const u32 activeIdx = partial_load_u32(active, c->activeIdxSize);
is_alive = (activeIdx == top);
}
if (!is_alive) {
partial_store_u32(active, top, c->activeIdxSize);
}
} else { } else {
u8 *active = (u8 *)stream_state + c->activeIdxSize; active += c->activeOffset;
is_alive = mmbit_set(active, c->numRepeats, top); is_alive = mmbit_set(active, c->numRepeats, top);
} }
@ -263,8 +302,8 @@ void castleProcessTop(const struct Castle *c, const u32 top, const u64a offset,
} else { } else {
DEBUG_PRINTF("repeat %u is already alive\n", top); DEBUG_PRINTF("repeat %u is already alive\n", top);
// Caller should ensure we're not stale. // Caller should ensure we're not stale.
assert(repeatHasMatch(info, rctrl, rstate, offset) != assert(!stale_checked
REPEAT_STALE); || repeatHasMatch(info, rctrl, rstate, offset) != REPEAT_STALE);
// Ignore duplicate top events. // Ignore duplicate top events.
u64a last = repeatLastTop(info, rctrl, rstate); u64a last = repeatLastTop(info, rctrl, rstate);
@ -292,11 +331,11 @@ void subCastleFindMatch(const struct Castle *c, const u64a begin,
u64a match = repeatNextMatch(info, rctrl, rstate, begin); u64a match = repeatNextMatch(info, rctrl, rstate, begin);
if (match == 0) { if (match == 0) {
DEBUG_PRINTF("no more matches for sub %u\n", subIdx); DEBUG_PRINTF("no more matches for sub %u\n", subIdx);
if (sub->exclusive) { if (sub->exclusiveId < c->numRepeats) {
partial_store_u32(stream_state, c->numRepeats, u8 *groups = (u8 *)stream_state + c->groupIterOffset;
c->activeIdxSize); mmbit_unset(groups, c->numGroups, sub->exclusiveId);
} else { } else {
u8 *active = (u8 *)stream_state + c->activeIdxSize; u8 *active = (u8 *)stream_state + c->activeOffset;
mmbit_unset(active, c->numRepeats, subIdx); mmbit_unset(active, c->numRepeats, subIdx);
} }
return; return;
@ -329,16 +368,20 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,
*mloc = 0; *mloc = 0;
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(stream_state, c->activeIdxSize); u8 *active = (u8 *)stream_state;
if (activeIdx < c->numRepeats) { u8 *groups = active + c->groupIterOffset;
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
u8 *cur = active + i * c->activeIdxSize;
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
DEBUG_PRINTF("subcastle %u\n", activeIdx); DEBUG_PRINTF("subcastle %u\n", activeIdx);
subCastleFindMatch(c, begin, end, full_state, stream_state, mloc, subCastleFindMatch(c, begin, end, full_state, stream_state, mloc,
&found, activeIdx); &found, activeIdx);
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
u8 *active = (u8 *)stream_state + c->activeIdxSize; u8 *active = (u8 *)stream_state + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i != MMB_INVALID;
i = mmbit_iterate(active, c->numRepeats, i)) { i = mmbit_iterate(active, c->numRepeats, i)) {
@ -367,31 +410,38 @@ u64a subCastleNextMatch(const struct Castle *c, void *full_state,
return repeatNextMatch(info, rctrl, rstate, loc); return repeatNextMatch(info, rctrl, rstate, loc);
} }
static really_inline
void set_matching(const struct Castle *c, const u64a match, u8 *active,
u8 *matching, const u32 active_size, const u32 active_id,
const u32 matching_id, u64a *offset, const u64a end) {
if (match == 0) {
DEBUG_PRINTF("no more matches\n");
mmbit_unset(active, active_size, active_id);
} else if (match > end) {
// If we had a local copy of the active mmbit, we could skip
// looking at this repeat again. But we don't, so we just move
// on.
} else if (match == *offset) {
mmbit_set(matching, c->numRepeats, matching_id);
} else if (match < *offset) {
// New minimum offset.
*offset = match;
mmbit_clear(matching, c->numRepeats);
mmbit_set(matching, c->numRepeats, matching_id);
}
}
static really_inline static really_inline
void subCastleMatchLoop(const struct Castle *c, void *full_state, void subCastleMatchLoop(const struct Castle *c, void *full_state,
void *stream_state, const u64a end, void *stream_state, const u64a end,
const u64a loc, u64a *offset) { const u64a loc, u64a *offset) {
u8 *active = (u8 *)stream_state + c->activeIdxSize; u8 *active = (u8 *)stream_state + c->activeOffset;
u8 *matching = full_state; u8 *matching = full_state;
mmbit_clear(matching, c->numRepeats);
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) {
u64a match = subCastleNextMatch(c, full_state, stream_state, loc, i); u64a match = subCastleNextMatch(c, full_state, stream_state, loc, i);
if (match == 0) { set_matching(c, match, active, matching, c->numRepeats, i,
DEBUG_PRINTF("no more matches\n"); i, offset, end);
mmbit_unset(active, c->numRepeats, i);
} else if (match > end) {
// If we had a local copy of the active mmbit, we could skip
// looking at this repeat again. But we don't, so we just move
// on.
} else if (match == *offset) {
mmbit_set(matching, c->numRepeats, i);
} else if (match < *offset) {
// New minimum offset.
*offset = match;
mmbit_clear(matching, c->numRepeats);
mmbit_set(matching, c->numRepeats, i);
}
} }
} }
@ -434,61 +484,37 @@ char castleMatchLoop(const struct Castle *c, const u64a begin, const u64a end,
// full_state (scratch). // full_state (scratch).
u64a offset = end; // min offset of next match u64a offset = end; // min offset of next match
char found = 0;
u32 activeIdx = 0; u32 activeIdx = 0;
mmbit_clear(matching, c->numRepeats);
if (c->exclusive) { if (c->exclusive) {
activeIdx = partial_load_u32(stream_state, c->activeIdxSize); u8 *active = (u8 *)stream_state;
if (activeIdx < c->numRepeats) { u8 *groups = active + c->groupIterOffset;
u32 i = activeIdx; for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
DEBUG_PRINTF("subcastle %u\n", i); i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
u8 *cur = active + i * c->activeIdxSize;
activeIdx = partial_load_u32(cur, c->activeIdxSize);
u64a match = subCastleNextMatch(c, full_state, stream_state, u64a match = subCastleNextMatch(c, full_state, stream_state,
loc, i); loc, activeIdx);
set_matching(c, match, groups, matching, c->numGroups, i,
if (match == 0) { activeIdx, &offset, end);
DEBUG_PRINTF("no more matches\n");
partial_store_u32(stream_state, c->numRepeats,
c->activeIdxSize);
} else if (match > end) {
// If we had a local copy of the active mmbit, we could skip
// looking at this repeat again. But we don't, so we just move
// on.
} else if (match <= offset) {
if (match < offset) {
// New minimum offset.
offset = match;
}
found = 1;
}
} }
} }
const char hasMatch = found; if (c->exclusive != PURE_EXCLUSIVE) {
u64a newOffset = offset;
if (!c->pureExclusive) {
subCastleMatchLoop(c, full_state, stream_state, subCastleMatchLoop(c, full_state, stream_state,
end, loc, &newOffset); end, loc, &offset);
DEBUG_PRINTF("offset=%llu\n", newOffset);
if (mmbit_any(matching, c->numRepeats)) {
found = 1;
if (subCastleFireMatch(c, full_state, stream_state,
cb, ctx, newOffset) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
} }
DEBUG_PRINTF("offset=%llu\n", offset);
if (!found) { if (!mmbit_any(matching, c->numRepeats)) {
DEBUG_PRINTF("no more matches\n");
break; break;
} else if (hasMatch && offset == newOffset) {
const struct SubCastle *sub = getSubCastle(c, activeIdx);
DEBUG_PRINTF("firing match at %llu for sub %u\n", offset, activeIdx);
if (cb(offset, sub->report, ctx) == MO_HALT_MATCHING) {
DEBUG_PRINTF("caller told us to halt\n");
return MO_HALT_MATCHING;
}
} }
loc = newOffset;
if (subCastleFireMatch(c, full_state, stream_state,
cb, ctx, offset) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
loc = offset;
} }
return MO_CONTINUE_MATCHING; return MO_CONTINUE_MATCHING;
@ -547,7 +573,8 @@ char castleScanShufti(const struct Castle *c, const u8 *buf, const size_t begin,
static really_inline static really_inline
char castleScanTruffle(const struct Castle *c, const u8 *buf, const size_t begin, char castleScanTruffle(const struct Castle *c, const u8 *buf, const size_t begin,
const size_t end, size_t *loc) { const size_t end, size_t *loc) {
const u8 *ptr = truffleExec(c->u.truffle.mask1, c->u.truffle.mask2, buf + begin, buf + end); const u8 *ptr = truffleExec(c->u.truffle.mask1, c->u.truffle.mask2,
buf + begin, buf + end);
if (ptr == buf + end) { if (ptr == buf + end) {
DEBUG_PRINTF("no escape found\n"); DEBUG_PRINTF("no escape found\n");
return 0; return 0;
@ -589,7 +616,103 @@ char castleScan(const struct Castle *c, const u8 *buf, const size_t begin,
} }
static really_inline static really_inline
void castleHandleEvent(const struct Castle *c, struct mq *q, const u64a sp) { char castleRevScanVerm(const struct Castle *c, const u8 *buf,
const size_t begin, const size_t end, size_t *loc) {
const u8 *ptr = rvermicelliExec(c->u.verm.c, 0, buf + begin, buf + end);
if (ptr == buf + begin - 1) {
DEBUG_PRINTF("no escape found\n");
return 0;
}
assert(loc);
assert(ptr >= buf && ptr < buf + end);
*loc = (size_t)(ptr - buf);
DEBUG_PRINTF("escape found at offset %zu\n", *loc);
return 1;
}
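
castleRevScanVerm and the reverse scanners that follow rely on the convention of the rvermicelliExec family: the return value points at the escape byte that was found, and buf + begin - 1 signals that nothing was found, hence the sentinel comparison ahead of the asserts. A scalar illustration of the same convention (hypothetical helper, single escape byte):

/* Illustrative only: reverse scan with the "begin - 1 means not found"
 * convention used by the castleRevScan* functions. */
static const unsigned char *rev_find_byte(unsigned char esc,
                                          const unsigned char *buf,
                                          size_t begin, size_t end) {
    for (size_t i = end; i > begin; i--) {
        if (buf[i - 1] == esc) {
            return buf + i - 1;     /* last occurrence in the range */
        }
    }
    return buf + begin - 1;         /* sentinel: no escape found */
}
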
static really_inline
char castleRevScanNVerm(const struct Castle *c, const u8 *buf,
const size_t begin, const size_t end, size_t *loc) {
const u8 *ptr = rnvermicelliExec(c->u.verm.c, 0, buf + begin, buf + end);
if (ptr == buf + begin - 1) {
DEBUG_PRINTF("no escape found\n");
return 0;
}
assert(loc);
assert(ptr >= buf && ptr < buf + end);
*loc = (size_t)(ptr - buf);
DEBUG_PRINTF("escape found at offset %zu\n", *loc);
return 1;
}
static really_inline
char castleRevScanShufti(const struct Castle *c, const u8 *buf,
const size_t begin, const size_t end, size_t *loc) {
const m128 mask_lo = c->u.shuf.mask_lo;
const m128 mask_hi = c->u.shuf.mask_hi;
const u8 *ptr = rshuftiExec(mask_lo, mask_hi, buf + begin, buf + end);
if (ptr == buf + begin - 1) {
DEBUG_PRINTF("no escape found\n");
return 0;
}
assert(loc);
assert(ptr >= buf && ptr < buf + end);
*loc = (size_t)(ptr - buf);
DEBUG_PRINTF("escape found at offset %zu\n", *loc);
return 1;
}
static really_inline
char castleRevScanTruffle(const struct Castle *c, const u8 *buf,
const size_t begin, const size_t end, size_t *loc) {
const u8 *ptr = rtruffleExec(c->u.truffle.mask1, c->u.truffle.mask2,
buf + begin, buf + end);
if (ptr == buf + begin - 1) {
DEBUG_PRINTF("no escape found\n");
return 0;
}
assert(loc);
assert(ptr >= buf && ptr < buf + end);
*loc = (size_t)(ptr - buf);
DEBUG_PRINTF("escape found at offset %zu\n", *loc);
return 1;
}
static really_inline
char castleRevScan(const struct Castle *c, const u8 *buf, const size_t begin,
const size_t end, size_t *loc) {
assert(begin <= end);
DEBUG_PRINTF("scanning backwards over (%zu,%zu]\n", begin, end);
if (begin == end) {
return 0;
}
switch (c->type) {
case CASTLE_DOT:
// Nothing can stop a dot scan!
return 0;
case CASTLE_VERM:
return castleRevScanVerm(c, buf, begin, end, loc);
case CASTLE_NVERM:
return castleRevScanNVerm(c, buf, begin, end, loc);
case CASTLE_SHUFTI:
return castleRevScanShufti(c, buf, begin, end, loc);
case CASTLE_TRUFFLE:
return castleRevScanTruffle(c, buf, begin, end, loc);
default:
DEBUG_PRINTF("unknown scan type!\n");
assert(0);
return 0;
}
}
static really_inline
void castleHandleEvent(const struct Castle *c, struct mq *q, const u64a sp,
char stale_checked) {
const u32 event = q->items[q->cur].type; const u32 event = q->items[q->cur].type;
switch (event) { switch (event) {
case MQE_TOP: case MQE_TOP:
@ -603,11 +726,24 @@ void castleHandleEvent(const struct Castle *c, struct mq *q, const u64a sp) {
assert(event < MQE_INVALID); assert(event < MQE_INVALID);
u32 top = event - MQE_TOP_FIRST; u32 top = event - MQE_TOP_FIRST;
DEBUG_PRINTF("top %u at offset %llu\n", top, sp); DEBUG_PRINTF("top %u at offset %llu\n", top, sp);
castleProcessTop(c, top, sp, q->state, q->streamState); castleProcessTop(c, top, sp, q->state, q->streamState, stale_checked);
break; break;
} }
} }
static really_inline
void clear_repeats(const struct Castle *c, const struct mq *q, u8 *active) {
DEBUG_PRINTF("clearing active repeats due to escape\n");
if (c->exclusive) {
u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
mmbit_clear(groups, c->numGroups);
}
if (c->exclusive != PURE_EXCLUSIVE) {
mmbit_clear(active, c->numRepeats);
}
}
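
clear_repeats and the other runtime entry points all address the castle stream state the same way: each exclusive group has a packed active-subcastle-id slot, the set of live exclusive groups is a multibit at groupIterOffset, and non-exclusive subcastles use a multibit at activeOffset, consulted only when exclusive != PURE_EXCLUSIVE. A compact sketch of that addressing with a stand-in struct; field meanings are taken from castle_internal.h in this change, and the real offsets are laid out by the compile side:

/* Stand-in layout, for illustration only. */
struct castle_layout {
    unsigned numRepeats;      /* total subcastles */
    unsigned numGroups;       /* exclusive groups */
    unsigned activeIdxSize;   /* bytes per packed active-subcastle-id slot */
    unsigned groupIterOffset; /* offset of the exclusive-groups multibit */
    unsigned activeOffset;    /* offset of the non-exclusive multibit */
};

static unsigned char *group_id_slot(const struct castle_layout *c,
                                    unsigned char *state, unsigned group) {
    return state + group * c->activeIdxSize;  /* id of the live subcastle */
}

static unsigned char *group_multibit(const struct castle_layout *c,
                                     unsigned char *state) {
    return state + c->groupIterOffset;        /* which groups are live */
}

static unsigned char *repeat_multibit(const struct castle_layout *c,
                                      unsigned char *state) {
    return state + c->activeOffset;           /* live non-exclusive repeats */
}
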
static really_inline static really_inline
char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end, char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end,
enum MatchMode mode) { enum MatchMode mode) {
@ -630,7 +766,7 @@ char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end,
return 1; return 1;
} }
u8 *active = (u8 *)q->streamState + c->activeIdxSize; // active multibit u8 *active = (u8 *)q->streamState + c->activeOffset; // active multibit
assert(q->cur + 1 < q->end); // require at least two items assert(q->cur + 1 < q->end); // require at least two items
assert(q_cur_type(q) == MQE_START); assert(q_cur_type(q) == MQE_START);
@ -644,14 +780,8 @@ char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end,
char found = 0; char found = 0;
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
c->activeIdxSize); found = mmbit_any(groups, c->numGroups);
if (activeIdx < c->numRepeats) {
found = 1;
} else if (c->pureExclusive) {
DEBUG_PRINTF("castle is dead\n");
goto scan_done;
}
} }
if (!found && !mmbit_any(active, c->numRepeats)) { if (!found && !mmbit_any(active, c->numRepeats)) {
@ -698,15 +828,7 @@ char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end,
} }
if (escape_found) { if (escape_found) {
DEBUG_PRINTF("clearing active repeats due to escape\n"); clear_repeats(c, q, active);
if (c->exclusive) {
partial_store_u32(q->streamState, c->numRepeats,
c->activeIdxSize);
}
if (!c->pureExclusive) {
mmbit_clear(active, c->numRepeats);
}
} }
} }
@ -720,15 +842,14 @@ char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end,
} }
sp = q_cur_offset(q); sp = q_cur_offset(q);
castleHandleEvent(c, q, sp); castleHandleEvent(c, q, sp, 1);
q->cur++; q->cur++;
} }
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
c->activeIdxSize); if (mmbit_any_precise(groups, c->numGroups)) {
if (c->pureExclusive || activeIdx < c->numRepeats) { return 1;
return activeIdx < c->numRepeats;
} }
} }
@ -745,28 +866,34 @@ char nfaExecCastle0_Q2(const struct NFA *n, struct mq *q, s64a end) {
return nfaExecCastle0_Q_i(n, q, end, STOP_AT_MATCH); return nfaExecCastle0_Q_i(n, q, end, STOP_AT_MATCH);
} }
static really_inline static
void castleStreamSilent(const struct Castle *c, u8 *active, const u8 *buf, s64a castleLastKillLoc(const struct Castle *c, struct mq *q) {
size_t length) { assert(q_cur_type(q) == MQE_START);
DEBUG_PRINTF("entry\n"); assert(q_last_type(q) == MQE_END);
s64a sp = q_cur_loc(q);
s64a ep = q_last_loc(q);
// This call doesn't produce matches, so we elide the castleMatchLoop call DEBUG_PRINTF("finding final squash in (%lld, %lld]\n", sp, ep);
// entirely and just do escape scans to maintain the repeat.
size_t eloc = 0; size_t loc;
char escaped = castleScan(c, buf, 0, length, &eloc);
if (escaped) { if (ep > 0) {
assert(eloc < length); if (castleRevScan(c, q->buffer, sp > 0 ? sp : 0, ep, &loc)) {
DEBUG_PRINTF("escape found at %zu, clearing castle\n", eloc); return (s64a)loc;
if (c->exclusive) {
partial_store_u32(active - c->activeIdxSize,
c->numRepeats, c->activeIdxSize);
}
if (!c->pureExclusive) {
mmbit_clear(active, c->numRepeats);
} }
ep = 0;
} }
if (sp < 0) {
s64a hlen = q->hlength;
if (castleRevScan(c, q->history, sp + hlen, ep + hlen, &loc)) {
return (s64a)loc - hlen;
}
ep = 0;
}
return sp - 1; /* the repeats are never killed */
} }
char nfaExecCastle0_QR(const struct NFA *n, struct mq *q, ReportID report) { char nfaExecCastle0_QR(const struct NFA *n, struct mq *q, ReportID report) {
@ -780,85 +907,44 @@ char nfaExecCastle0_QR(const struct NFA *n, struct mq *q, ReportID report) {
assert(q->cur + 1 < q->end); /* require at least two items */ assert(q->cur + 1 < q->end); /* require at least two items */
assert(q_cur_type(q) == MQE_START); assert(q_cur_type(q) == MQE_START);
u64a sp = q_cur_offset(q);
q->cur++;
DEBUG_PRINTF("sp=%llu\n", sp);
const struct Castle *c = getImplNfa(n); const struct Castle *c = getImplNfa(n);
u8 *active = (u8 *)q->streamState + c->activeIdxSize; u8 *active = (u8 *)q->streamState + c->activeOffset;
char found = 0;
u64a end_offset = q_last_loc(q) + q->offset;
s64a last_kill_loc = castleLastKillLoc(c, q);
DEBUG_PRINTF("all repeats killed at %lld (exec range %lld, %lld)\n",
last_kill_loc, q_cur_loc(q), q_last_loc(q));
assert(last_kill_loc < q_last_loc(q));
if (last_kill_loc != q_cur_loc(q) - 1) {
clear_repeats(c, q, active);
}
q->cur++; /* skip start event */
/* skip events prior to the repeats being squashed */
while (q_cur_loc(q) <= last_kill_loc) {
DEBUG_PRINTF("skipping moot event at %lld\n", q_cur_loc(q));
q->cur++;
assert(q->cur < q->end);
}
while (q->cur < q->end) { while (q->cur < q->end) {
DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q), DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q),
q_cur_offset(q)); q_cur_offset(q));
found = 0; u64a sp = q_cur_offset(q);
if (c->exclusive) { castleHandleEvent(c, q, sp, 0);
const u32 activeIdx = partial_load_u32(q->streamState,
c->activeIdxSize);
if (activeIdx < c->numRepeats) {
found = 1;
} else if (c->pureExclusive) {
DEBUG_PRINTF("castle is dead\n");
goto scan_done;
}
}
if (!found && !mmbit_any(active, c->numRepeats)) {
DEBUG_PRINTF("castle is dead\n");
goto scan_done;
}
u64a ep = q_cur_offset(q);
if (sp < q->offset) {
DEBUG_PRINTF("HISTORY BUFFER SCAN\n");
assert(q->offset - sp <= q->hlength);
u64a local_ep = MIN(q->offset, ep);
const u8 *ptr = q->history + q->hlength + sp - q->offset;
castleStreamSilent(c, active, ptr, local_ep - sp);
sp = local_ep;
}
found = 0;
if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState,
c->activeIdxSize);
if (activeIdx < c->numRepeats) {
found = 1;
} else if (c->pureExclusive) {
DEBUG_PRINTF("castle is dead\n");
goto scan_done;
}
}
if (!found && !mmbit_any(active, c->numRepeats)) {
DEBUG_PRINTF("castle is dead\n");
goto scan_done;
}
if (sp < ep) {
DEBUG_PRINTF("MAIN BUFFER SCAN\n");
assert(ep - q->offset <= q->length);
const u8 *ptr = q->buffer + sp - q->offset;
castleStreamSilent(c, active, ptr, ep - sp);
}
scan_done:
sp = q_cur_offset(q);
castleDeactivateStaleSubs(c, sp, q->state, q->streamState);
castleHandleEvent(c, q, sp);
q->cur++; q->cur++;
} }
found = 0; castleDeactivateStaleSubs(c, end_offset, q->state, q->streamState);
char found = 0;
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
c->activeIdxSize); found = mmbit_any_precise(groups, c->numGroups);
if (activeIdx < c->numRepeats) {
found = 1;
} else if (c->pureExclusive) {
DEBUG_PRINTF("castle is dead\n");
return 0;
}
} }
if (!found && !mmbit_any_precise(active, c->numRepeats)) { if (!found && !mmbit_any_precise(active, c->numRepeats)) {
@ -866,7 +952,7 @@ scan_done:
return 0; return 0;
} }
if (castleInAccept(c, q, report, sp)) { if (castleInAccept(c, q, report, end_offset)) {
return MO_MATCHES_PENDING; return MO_MATCHES_PENDING;
} }
@ -901,11 +987,12 @@ char nfaExecCastle0_queueInitState(UNUSED const struct NFA *n, struct mq *q) {
const struct Castle *c = getImplNfa(n); const struct Castle *c = getImplNfa(n);
assert(q->streamState); assert(q->streamState);
if (c->exclusive) { if (c->exclusive) {
partial_store_u32(q->streamState, c->numRepeats, c->activeIdxSize); u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
mmbit_clear(groups, c->numGroups);
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
u8 *active = (u8 *)q->streamState + c->activeIdxSize; u8 *active = (u8 *)q->streamState + c->activeOffset;
mmbit_clear(active, c->numRepeats); mmbit_clear(active, c->numRepeats);
} }
return 0; return 0;
@ -919,11 +1006,12 @@ char nfaExecCastle0_initCompressedState(const struct NFA *n, UNUSED u64a offset,
const struct Castle *c = getImplNfa(n); const struct Castle *c = getImplNfa(n);
if (c->exclusive) { if (c->exclusive) {
partial_store_u32(state, c->numRepeats, c->activeIdxSize); u8 *groups = (u8 *)state + c->groupIterOffset;
mmbit_clear(groups, c->numGroups);
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
u8 *active = (u8 *)state + c->activeIdxSize; u8 *active = (u8 *)state + c->activeOffset;
mmbit_clear(active, c->numRepeats); mmbit_clear(active, c->numRepeats);
} }
return 0; return 0;
@ -954,16 +1042,19 @@ char nfaExecCastle0_queueCompressState(const struct NFA *n, const struct mq *q,
const u64a offset = q->offset + loc; const u64a offset = q->offset + loc;
DEBUG_PRINTF("offset=%llu\n", offset); DEBUG_PRINTF("offset=%llu\n", offset);
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *active = (u8 *)q->streamState;
c->activeIdxSize); u8 *groups = active + c->groupIterOffset;
if (activeIdx < c->numRepeats) { for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
u8 *cur = active + i * c->activeIdxSize;
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
DEBUG_PRINTF("packing state for sub %u\n", activeIdx); DEBUG_PRINTF("packing state for sub %u\n", activeIdx);
subCastleQueueCompressState(c, activeIdx, q, offset); subCastleQueueCompressState(c, activeIdx, q, offset);
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
const u8 *active = (const u8 *)q->streamState + c->activeIdxSize; const u8 *active = (const u8 *)q->streamState + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) {
DEBUG_PRINTF("packing state for sub %u\n", i); DEBUG_PRINTF("packing state for sub %u\n", i);
@ -997,15 +1088,19 @@ char nfaExecCastle0_expandState(const struct NFA *n, void *dest,
const struct Castle *c = getImplNfa(n); const struct Castle *c = getImplNfa(n);
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(src, c->activeIdxSize); const u8 *active = (const u8 *)src;
if (activeIdx < c->numRepeats) { const u8 *groups = active + c->groupIterOffset;
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
const u8 *cur = active + i * c->activeIdxSize;
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
subCastleExpandState(c, activeIdx, dest, src, offset); subCastleExpandState(c, activeIdx, dest, src, offset);
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
// Unpack state for all active repeats. // Unpack state for all active repeats.
const u8 *active = (const u8 *)src + c->activeIdxSize; const u8 *active = (const u8 *)src + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) {
subCastleExpandState(c, i, dest, src, offset); subCastleExpandState(c, i, dest, src, offset);
@ -1013,4 +1108,3 @@ char nfaExecCastle0_expandState(const struct NFA *n, void *dest,
} }
return 0; return 0;
} }

View File

@ -100,6 +100,7 @@ void nfaExecCastle0_dumpText(const struct NFA *nfa, FILE *f) {
fprintf(f, "unknown type %u\n", c->type); fprintf(f, "unknown type %u\n", c->type);
break; break;
} }
fprintf(f, "Stale Iter Offset: %u\n", c->staleIterOffset);
fprintf(f, "\n"); fprintf(f, "\n");
dumpTextReverse(nfa, f); dumpTextReverse(nfa, f);

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -42,7 +42,9 @@ struct SubCastle {
u32 streamStateOffset; //!< offset within stream state u32 streamStateOffset; //!< offset within stream state
u32 repeatInfoOffset; //!< offset of RepeatInfo structure u32 repeatInfoOffset; //!< offset of RepeatInfo structure
// relative to the start of SubCastle // relative to the start of SubCastle
char exclusive; //!< exclusive info of this SubCastle u32 exclusiveId; //!< exclusive group id of this SubCastle,
// set to the number of SubCastles in Castle
// if it is not exclusive
}; };
#define CASTLE_DOT 0 #define CASTLE_DOT 0
@ -51,6 +53,12 @@ struct SubCastle {
#define CASTLE_SHUFTI 3 #define CASTLE_SHUFTI 3
#define CASTLE_TRUFFLE 4 #define CASTLE_TRUFFLE 4
enum ExclusiveType {
NOT_EXCLUSIVE, //!< no subcastles are exclusive
EXCLUSIVE, //!< a subset of subcastles are exclusive
PURE_EXCLUSIVE //!< all subcastles are exclusive
};
/** /**
* \brief Castle engine structure. * \brief Castle engine structure.
* *
@ -63,26 +71,60 @@ struct SubCastle {
* - struct Castle * - struct Castle
* - struct SubCastle[numRepeats] * - struct SubCastle[numRepeats]
* - tables for sparse model repeats * - tables for sparse model repeats
* - sparse iterator for subcastles that may be stale
* *
* Castle stores an "active repeats" multibit in stream state, followed by the * Castle stores an "active repeats" multibit in stream state, followed by the
* packed repeat state for each SubCastle. If all SubCastles are mutual * packed repeat state for each SubCastle. If there are both exclusive and
* exclusive, we store current active SubCastle id instead of "active repeats" * non-exclusive SubCastle groups, we use an active id for each exclusive group
* multibit in stream state. If there are both exclusive and non-exclusive * and a multibit for the non-exclusive group. We also store an "active
* SubCastle groups, we use an active id for the exclusive group and a multibit * exclusive groups" multibit for exclusive groups. If all SubCastles are mutual
* for the non-exclusive group. * exclusive, we remove "active repeats" multibit from stream state.
* * Castle stream state:
* *
* * |---|
* * | | active subengine id for exclusive group 1
* * |---|
* * | | active subengine id for exclusive group 2(if necessary)
* * |---|
* * ...
* * |---|
* * | | "active repeats" multibit for non-exclusive subcastles
* * | | (if not all subcastles are exclusive)
* * |---|
* * | | active multibit for exclusive groups
* * | |
* * |---|
* * ||-|| common pool of stream state for exclusive group 1
* * ||-||
* * |---|
* * ||-|| common pool of stream state for exclusive group 2(if necessary)
* * ||-||
* * |---|
* * ...
* * |---|
* * | | stream state for each non-exclusive subcastles
* * ...
* * | |
* * |---|
* *
* In full state (stored in scratch space) it stores a temporary multibit over * In full state (stored in scratch space) it stores a temporary multibit over
* the repeats (used by \ref castleMatchLoop), followed by the repeat control * the repeats (used by \ref castleMatchLoop), followed by the repeat control
* blocks for each SubCastle. If all SubCastles are mutual exclusive, we only * blocks for each SubCastle.
* need to store the repeat control blocks for each SubCastle.
*/ */
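A small sketch may help tie the layout above to the offsets the runtime uses; the names here are illustrative, not Hyperscan's API. It mirrors how the compress/expand code addresses stream state: packed active ids at the base, the non-exclusive multibit at activeOffset, and the exclusive-group multibit at groupIterOffset.

    #include <stdint.h>

    /* Illustrative sketch: locate the three leading regions of Castle
     * stream state described in the layout above. */
    struct castle_stream_view {
        const uint8_t *group_ids;    /* packed active id per exclusive group */
        const uint8_t *nonexcl_mmb;  /* "active repeats" multibit (non-exclusive) */
        const uint8_t *group_mmb;    /* aliveness multibit for exclusive groups */
    };

    static struct castle_stream_view
    castle_stream_view_of(const uint8_t *stream, uint32_t activeOffset,
                          uint32_t groupIterOffset) {
        struct castle_stream_view v;
        v.group_ids   = stream;                   /* ids sit at the base */
        v.nonexcl_mmb = stream + activeOffset;
        v.group_mmb   = stream + groupIterOffset;
        return v;
    }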
struct ALIGN_AVX_DIRECTIVE Castle { struct ALIGN_AVX_DIRECTIVE Castle {
u32 numRepeats; u32 numRepeats; //!< number of repeats in Castle
u8 type; //!< tells us which scanning mechanism (below) to use u32 numGroups; //!< number of exclusive groups
char exclusive; //!< tells us if there are mutual exclusive SubCastles u8 type; //!< tells us which scanning mechanism (below) to use
char pureExclusive; //!< tells us if all SubCastles are mutual exclusive u8 exclusive; //!< tells us if there are mutual exclusive SubCastles
u8 activeIdxSize; //!< number of bytes in stream state to store u8 activeIdxSize; //!< number of bytes in stream state to store
// active SubCastle id for exclusive mode // active SubCastle id for exclusive mode
u32 activeOffset; //!< offset to active multibit for non-exclusive
// SubCastles
u32 staleIterOffset; //!< offset to a sparse iterator to check for stale
// sub castles
u32 groupIterOffset; //!< offset to a iterator to check the aliveness of
// exclusive groups
union { union {
struct { struct {
char c; char c;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -32,6 +32,7 @@
#include "castlecompile.h" #include "castlecompile.h"
#include "castle_internal.h" #include "castle_internal.h"
#include "limex_limits.h"
#include "nfa_internal.h" #include "nfa_internal.h"
#include "repeatcompile.h" #include "repeatcompile.h"
#include "shufticompile.h" #include "shufticompile.h"
@ -47,7 +48,9 @@
#include "util/dump_charclass.h" #include "util/dump_charclass.h"
#include "util/graph.h" #include "util/graph.h"
#include "util/make_unique.h" #include "util/make_unique.h"
#include "util/multibit_build.h"
#include "util/multibit_internal.h" #include "util/multibit_internal.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h" #include "util/ue2_containers.h"
#include "util/verify_types.h" #include "util/verify_types.h"
#include "grey.h" #include "grey.h"
@ -63,7 +66,6 @@ using boost::adaptors::map_values;
namespace ue2 { namespace ue2 {
#define CASTLE_MAX_TOPS 32
#define CLIQUE_GRAPH_MAX_SIZE 1000 #define CLIQUE_GRAPH_MAX_SIZE 1000
static static
@ -204,7 +206,7 @@ bool graph_empty(const Graph &g) {
static static
vector<u32> removeClique(CliqueGraph &cg) { vector<u32> removeClique(CliqueGraph &cg) {
vector<vector<u32>> cliquesVec(1); vector<vector<u32>> cliquesVec(1);
DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg)); DEBUG_PRINTF("graph size:%zu\n", num_vertices(cg));
findCliqueGroup(cg, cliquesVec[0]); findCliqueGroup(cg, cliquesVec[0]);
while (!graph_empty(cg)) { while (!graph_empty(cg)) {
const vector<u32> &c = cliquesVec.back(); const vector<u32> &c = cliquesVec.back();
@ -236,7 +238,7 @@ vector<u32> removeClique(CliqueGraph &cg) {
} }
} }
DEBUG_PRINTF("clique size:%lu\n", cliquesVec[id].size()); DEBUG_PRINTF("clique size:%zu\n", cliquesVec[id].size());
return cliquesVec[id]; return cliquesVec[id];
} }
@ -244,17 +246,18 @@ vector<u32> removeClique(CliqueGraph &cg) {
// the end locations where it overlaps with other literals, // the end locations where it overlaps with other literals,
// then the literals are mutual exclusive // then the literals are mutual exclusive
static static
bool findExclusivePair(const u32 id1, const u32 id2, bool findExclusivePair(const size_t id1, const size_t id2,
const size_t lower,
const vector<vector<size_t>> &min_reset_dist, const vector<vector<size_t>> &min_reset_dist,
const vector<vector<vector<CharReach>>> &triggers) { const vector<vector<vector<CharReach>>> &triggers) {
const auto &triggers1 = triggers[id1]; const auto &triggers1 = triggers[id1];
const auto &triggers2 = triggers[id2]; const auto &triggers2 = triggers[id2];
for (u32 i = 0; i < triggers1.size(); ++i) { for (size_t i = 0; i < triggers1.size(); ++i) {
for (u32 j = 0; j < triggers2.size(); ++j) { for (size_t j = 0; j < triggers2.size(); ++j) {
if (!literalOverlap(triggers1[i], triggers2[j], if (!literalOverlap(triggers1[i], triggers2[j],
min_reset_dist[id2][j]) || min_reset_dist[id2 - lower][j]) ||
!literalOverlap(triggers2[j], triggers1[i], !literalOverlap(triggers2[j], triggers1[i],
min_reset_dist[id1][i])) { min_reset_dist[id1 - lower][i])) {
return false; return false;
} }
} }
@ -263,40 +266,75 @@ bool findExclusivePair(const u32 id1, const u32 id2,
} }
static static
vector<u32> checkExclusion(const CharReach &cr, vector<vector<u32>> checkExclusion(u32 &streamStateSize,
const vector<vector<vector<CharReach>>> &triggers) { const CharReach &cr,
vector<u32> group; const vector<vector<vector<CharReach>>> &triggers,
if (!triggers.size() || triggers.size() == 1) { enum ExclusiveType &exclusive,
return group; const size_t numRepeats) {
} vector<vector<u32>> groups;
size_t trigSize = triggers.size();
DEBUG_PRINTF("trigSize %zu\n", trigSize);
vector<vector<size_t>> min_reset_dist; size_t lower = 0;
// get min reset distance for each repeat size_t total = 0;
for (auto it = triggers.begin(); it != triggers.end(); it++) { while (lower < trigSize) {
const vector<size_t> &tmp_dist = minResetDistToEnd(*it, cr); vector<CliqueVertex> vertices;
min_reset_dist.push_back(tmp_dist); unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
}
vector<CliqueVertex> vertices; vector<vector<size_t>> min_reset_dist;
unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>(); size_t upper = min(lower + CLIQUE_GRAPH_MAX_SIZE, trigSize);
for (u32 i = 0; i < triggers.size(); ++i) { // get min reset distance for each repeat
CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg); for (size_t i = lower; i < upper; i++) {
vertices.push_back(v); CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
} vertices.push_back(v);
// find exclusive pair for each repeat const vector<size_t> &tmp_dist =
for (u32 i = 0; i < triggers.size(); ++i) { minResetDistToEnd(triggers[i], cr);
CliqueVertex s = vertices[i]; min_reset_dist.push_back(tmp_dist);
for (u32 j = i + 1; j < triggers.size(); ++j) { }
if (findExclusivePair(i, j, min_reset_dist, triggers)) {
CliqueVertex d = vertices[j]; // find exclusive pair for each repeat
add_edge(s, d, *cg); for (size_t i = lower; i < upper; i++) {
CliqueVertex s = vertices[i - lower];
for (size_t j = i + 1; j < upper; j++) {
if (findExclusivePair(i, j, lower, min_reset_dist,
triggers)) {
CliqueVertex d = vertices[j - lower];
add_edge(s, d, *cg);
}
} }
} }
}
// find the largest exclusive group // find the largest exclusive group
return removeClique(*cg); auto clique = removeClique(*cg);
size_t cliqueSize = clique.size();
if (cliqueSize > 1) {
groups.push_back(clique);
exclusive = EXCLUSIVE;
total += cliqueSize;
}
lower += CLIQUE_GRAPH_MAX_SIZE;
}
DEBUG_PRINTF("clique size %zu, num of repeats %zu\n",
total, numRepeats);
if (total == numRepeats) {
exclusive = PURE_EXCLUSIVE;
streamStateSize = 0;
};
return groups;
}
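One detail worth noting in the chunked clique check above: candidate repeats are processed in windows of at most CLIQUE_GRAPH_MAX_SIZE, and per-window tables such as min_reset_dist are indexed relative to the window base, which is why findExclusivePair() subtracts lower. A minimal sketch of that windowing follows; WINDOW stands in for CLIQUE_GRAPH_MAX_SIZE (1000 above) and the callback name is hypothetical.

    #include <stddef.h>

    #define WINDOW 1000

    /* Illustrative only: per-window data is indexed by (id - lower). */
    static void for_each_window(size_t total,
                                void (*fn)(size_t lower, size_t upper)) {
        for (size_t lower = 0; lower < total; lower += WINDOW) {
            size_t upper = lower + WINDOW < total ? lower + WINDOW : total;
            fn(lower, upper);
        }
    }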
namespace {
struct ExclusiveInfo {
/** Mapping between top and exclusive group id */
map<u32, u32> groupId;
/** Number of exclusive groups */
u32 numGroups = 0;
};
} }
static static
@ -305,10 +343,15 @@ void buildSubcastles(const CastleProto &proto, vector<SubCastle> &subs,
const vector<pair<depth, bool>> &repeatInfoPair, const vector<pair<depth, bool>> &repeatInfoPair,
u32 &scratchStateSize, u32 &streamStateSize, u32 &scratchStateSize, u32 &streamStateSize,
u32 &tableSize, vector<u64a> &tables, u32 &sparseRepeats, u32 &tableSize, vector<u64a> &tables, u32 &sparseRepeats,
const set<u32> &exclusiveGroup) { const ExclusiveInfo &exclusiveInfo,
vector<u32> &may_stale, const ReportManager &rm) {
const bool remap_reports = has_managed_reports(proto.kind);
u32 i = 0; u32 i = 0;
u32 maxStreamSize = 0; const auto &groupId = exclusiveInfo.groupId;
bool exclusive = exclusiveGroup.size() > 1; const auto &numGroups = exclusiveInfo.numGroups;
vector<u32> maxStreamSize(numGroups, 0);
for (auto it = proto.repeats.begin(), ite = proto.repeats.end(); for (auto it = proto.repeats.begin(), ite = proto.repeats.end();
it != ite; ++it, ++i) { it != ite; ++it, ++i) {
const PureRepeat &pr = it->second; const PureRepeat &pr = it->second;
@ -316,33 +359,35 @@ void buildSubcastles(const CastleProto &proto, vector<SubCastle> &subs,
bool is_reset = repeatInfoPair[i].second; bool is_reset = repeatInfoPair[i].second;
enum RepeatType rtype = chooseRepeatType(pr.bounds.min, pr.bounds.max, enum RepeatType rtype = chooseRepeatType(pr.bounds.min, pr.bounds.max,
min_period, is_reset); min_period, is_reset, true);
RepeatStateInfo rsi(rtype, pr.bounds.min, pr.bounds.max, min_period); RepeatStateInfo rsi(rtype, pr.bounds.min, pr.bounds.max, min_period);
DEBUG_PRINTF("sub %u: selected %s model for %s repeat\n", i, DEBUG_PRINTF("sub %u: selected %s model for %s repeat\n", i,
repeatTypeName(rtype), pr.bounds.str().c_str()); repeatTypeName(rtype), pr.bounds.str().c_str());
u32 subScratchStateSize;
u32 subStreamStateSize;
SubCastle &sub = subs[i]; SubCastle &sub = subs[i];
RepeatInfo &info = infos[i]; RepeatInfo &info = infos[i];
// handle exclusive case differently info.packedCtrlSize = rsi.packedCtrlSize;
if (exclusive && exclusiveGroup.find(i) != exclusiveGroup.end()) { u32 subStreamStateSize = verify_u32(rsi.packedCtrlSize + rsi.stateSize);
maxStreamSize = MAX(maxStreamSize, rsi.packedCtrlSize);
} else {
subScratchStateSize = verify_u32(sizeof(RepeatControl));
subStreamStateSize = verify_u32(rsi.packedCtrlSize + rsi.stateSize);
info.packedCtrlSize = rsi.packedCtrlSize; // Handle stream/scratch space alloc for exclusive case differently.
if (contains(groupId, i)) {
u32 id = groupId.at(i);
maxStreamSize[id] = max(maxStreamSize[id], subStreamStateSize);
// SubCastle full/stream state offsets are written in for the group
// below.
} else {
sub.fullStateOffset = scratchStateSize; sub.fullStateOffset = scratchStateSize;
sub.streamStateOffset = streamStateSize; sub.streamStateOffset = streamStateSize;
scratchStateSize += verify_u32(sizeof(RepeatControl));
scratchStateSize += subScratchStateSize;
streamStateSize += subStreamStateSize; streamStateSize += subStreamStateSize;
} }
if (pr.bounds.max.is_finite()) {
may_stale.push_back(i);
}
info.type = verify_u8(rtype); info.type = verify_u8(rtype);
info.repeatMin = depth_to_u32(pr.bounds.min); info.repeatMin = depth_to_u32(pr.bounds.min);
info.repeatMax = depth_to_u32(pr.bounds.max); info.repeatMax = depth_to_u32(pr.bounds.max);
@ -358,35 +403,44 @@ void buildSubcastles(const CastleProto &proto, vector<SubCastle> &subs,
info.encodingSize = rsi.encodingSize; info.encodingSize = rsi.encodingSize;
info.patchesOffset = rsi.patchesOffset; info.patchesOffset = rsi.patchesOffset;
sub.report = *pr.reports.begin(); assert(pr.reports.size() == 1);
ReportID id = *pr.reports.begin();
sub.report = remap_reports ? rm.getProgramOffset(id) : id;
if (rtype == REPEAT_SPARSE_OPTIMAL_P) { if (rtype == REPEAT_SPARSE_OPTIMAL_P) {
for (u32 j = 0; j < rsi.patchSize; j++) { for (u32 j = 0; j < rsi.patchSize; j++) {
tables.push_back(rsi.table[j]); tables.push_back(rsi.table[j]);
} }
sparseRepeats++; sparseRepeats++;
patchSize[i] = rsi.patchSize; patchSize[i] = rsi.patchSize;
tableSize += rsi.patchSize; tableSize += rsi.patchSize;
} }
} }
if (exclusive) { vector<u32> scratchOffset(numGroups, 0);
for (auto k : exclusiveGroup) { vector<u32> streamOffset(numGroups, 0);
SubCastle &sub = subs[k]; for (const auto &j : groupId) {
RepeatInfo &info = infos[k]; u32 top = j.first;
info.packedCtrlSize = maxStreamSize; u32 id = j.second;
SubCastle &sub = subs[top];
if (!scratchOffset[id]) {
sub.fullStateOffset = scratchStateSize; sub.fullStateOffset = scratchStateSize;
sub.streamStateOffset = streamStateSize; sub.streamStateOffset = streamStateSize;
scratchOffset[id] = scratchStateSize;
streamOffset[id] = streamStateSize;
scratchStateSize += verify_u32(sizeof(RepeatControl));
streamStateSize += maxStreamSize[id];
} else {
sub.fullStateOffset = scratchOffset[id];
sub.streamStateOffset = streamOffset[id];
} }
scratchStateSize += verify_u32(sizeof(RepeatControl));
streamStateSize += maxStreamSize;
} }
} }
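The per-group bookkeeping in buildSubcastles() above encodes a simple space optimisation: members of one exclusive group can never be live at the same time, so they share a single stream-state slot sized for the group's largest member (maxStreamSize[id]) and a single RepeatControl block in scratch. A sketch of that sizing rule, with assumed parameter names:

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative only: compute one shared stream-state slot per exclusive
     * group, sized for the group's largest member. */
    static uint32_t shared_group_bytes(const uint32_t *member_size,
                                       const uint32_t *group_of, size_t n,
                                       uint32_t *max_per_group,
                                       uint32_t groups) {
        uint32_t total = 0;
        for (uint32_t g = 0; g < groups; g++) {
            max_per_group[g] = 0;
        }
        for (size_t i = 0; i < n; i++) {
            uint32_t g = group_of[i];
            if (member_size[i] > max_per_group[g]) {
                max_per_group[g] = member_size[i];
            }
        }
        for (uint32_t g = 0; g < groups; g++) {
            total += max_per_group[g]; /* one slot per exclusive group */
        }
        return total;
    }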
aligned_unique_ptr<NFA> aligned_unique_ptr<NFA>
buildCastle(const CastleProto &proto, buildCastle(const CastleProto &proto,
const map<u32, vector<vector<CharReach>>> &triggers, const map<u32, vector<vector<CharReach>>> &triggers,
const CompileContext &cc) { const CompileContext &cc, const ReportManager &rm) {
assert(cc.grey.allowCastle); assert(cc.grey.allowCastle);
const size_t numRepeats = proto.repeats.size(); const size_t numRepeats = proto.repeats.size();
@ -418,8 +472,9 @@ buildCastle(const CastleProto &proto,
depth maxWidth(0); depth maxWidth(0);
u32 i = 0; u32 i = 0;
vector<u32> candidateRepeats; ExclusiveInfo exclusiveInfo;
vector<vector<vector<CharReach>>> candidateTriggers; vector<vector<vector<CharReach>>> candidateTriggers;
vector<u32> candidateRepeats;
vector<pair<depth, bool>> repeatInfoPair; vector<pair<depth, bool>> repeatInfoPair;
for (auto it = proto.repeats.begin(), ite = proto.repeats.end(); for (auto it = proto.repeats.begin(), ite = proto.repeats.end();
it != ite; ++it, ++i) { it != ite; ++it, ++i) {
@ -454,49 +509,60 @@ buildCastle(const CastleProto &proto,
repeatInfoPair.push_back(make_pair(min_period, is_reset)); repeatInfoPair.push_back(make_pair(min_period, is_reset));
if (is_reset && candidateRepeats.size() < CLIQUE_GRAPH_MAX_SIZE) { candidateTriggers.push_back(triggers.at(top));
candidateTriggers.push_back(triggers.at(top)); candidateRepeats.push_back(i);
candidateRepeats.push_back(i);
}
} }
// Case 1: exclusive repeats // Case 1: exclusive repeats
bool exclusive = false; enum ExclusiveType exclusive = NOT_EXCLUSIVE;
bool pureExclusive = false;
u32 activeIdxSize = 0; u32 activeIdxSize = 0;
set<u32> exclusiveGroup; u32 groupIterOffset = 0;
if (cc.grey.castleExclusive) { if (cc.grey.castleExclusive) {
vector<u32> tmpGroup = checkExclusion(cr, candidateTriggers); auto cliqueGroups =
const u32 exclusiveSize = tmpGroup.size(); checkExclusion(streamStateSize, cr, candidateTriggers,
if (exclusiveSize > 1) { exclusive, numRepeats);
// Case 1: mutual exclusive repeats group found, initialize state for (const auto &group : cliqueGroups) {
// sizes // mutual exclusive repeats group found,
exclusive = true; // update state sizes
activeIdxSize = calcPackedBytes(numRepeats + 1); activeIdxSize = calcPackedBytes(numRepeats + 1);
if (exclusiveSize == numRepeats) {
pureExclusive = true;
streamStateSize = 0;
scratchStateSize = 0;
}
streamStateSize += activeIdxSize; streamStateSize += activeIdxSize;
// replace with top values // replace with top values
for (const auto &val : tmpGroup) { for (const auto &val : group) {
exclusiveGroup.insert(candidateRepeats[val]); const u32 top = candidateRepeats[val];
exclusiveInfo.groupId[top] = exclusiveInfo.numGroups;
} }
exclusiveInfo.numGroups++;
} }
if (exclusive) {
groupIterOffset = streamStateSize;
streamStateSize += mmbit_size(exclusiveInfo.numGroups);
}
DEBUG_PRINTF("num of groups:%u\n", exclusiveInfo.numGroups);
} }
candidateRepeats.clear();
DEBUG_PRINTF("reach %s exclusive %u\n", describeClass(cr).c_str(), DEBUG_PRINTF("reach %s exclusive %u\n", describeClass(cr).c_str(),
exclusive); exclusive);
u32 tableSize = 0; u32 tableSize = 0;
u32 sparseRepeats = 0; u32 sparseRepeats = 0;
vector<u32> may_stale; /* sub castles that may go stale */
buildSubcastles(proto, subs, infos, patchSize, repeatInfoPair, buildSubcastles(proto, subs, infos, patchSize, repeatInfoPair,
scratchStateSize, streamStateSize, tableSize, scratchStateSize, streamStateSize, tableSize,
tables, sparseRepeats, exclusiveGroup); tables, sparseRepeats, exclusiveInfo, may_stale, rm);
const size_t total_size = DEBUG_PRINTF("%zu subcastles may go stale\n", may_stale.size());
vector<mmbit_sparse_iter> stale_iter;
if (!may_stale.empty()) {
mmbBuildSparseIterator(stale_iter, may_stale, numRepeats);
}
size_t total_size =
sizeof(NFA) + // initial NFA structure sizeof(NFA) + // initial NFA structure
sizeof(Castle) + // Castle structure sizeof(Castle) + // Castle structure
sizeof(SubCastle) * subs.size() + // SubCastles themselves sizeof(SubCastle) * subs.size() + // SubCastles themselves
@ -506,6 +572,9 @@ buildCastle(const CastleProto &proto,
sizeof(u64a) * sparseRepeats; // paddings for sizeof(u64a) * sparseRepeats; // paddings for
// REPEAT_SPARSE_OPTIMAL_P tables // REPEAT_SPARSE_OPTIMAL_P tables
total_size = ROUNDUP_N(total_size, alignof(mmbit_sparse_iter));
total_size += byte_length(stale_iter); // stale sparse iter
aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size); aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size);
nfa->type = verify_u8(CASTLE_NFA_0); nfa->type = verify_u8(CASTLE_NFA_0);
nfa->length = verify_u32(total_size); nfa->length = verify_u32(total_size);
@ -515,12 +584,15 @@ buildCastle(const CastleProto &proto,
nfa->minWidth = verify_u32(minWidth); nfa->minWidth = verify_u32(minWidth);
nfa->maxWidth = maxWidth.is_finite() ? verify_u32(maxWidth) : 0; nfa->maxWidth = maxWidth.is_finite() ? verify_u32(maxWidth) : 0;
char *ptr = (char *)nfa.get() + sizeof(NFA); char * const base_ptr = (char *)nfa.get() + sizeof(NFA);
char *ptr = base_ptr;
Castle *c = (Castle *)ptr; Castle *c = (Castle *)ptr;
c->numRepeats = verify_u32(subs.size()); c->numRepeats = verify_u32(subs.size());
c->exclusive = exclusive; c->numGroups = exclusiveInfo.numGroups;
c->pureExclusive = pureExclusive; c->exclusive = verify_s8(exclusive);
c->activeIdxSize = verify_u8(activeIdxSize); c->activeIdxSize = verify_u8(activeIdxSize);
c->activeOffset = verify_u32(c->numGroups * activeIdxSize);
c->groupIterOffset = groupIterOffset;
writeCastleScanEngine(cr, c); writeCastleScanEngine(cr, c);
@ -554,12 +626,22 @@ buildCastle(const CastleProto &proto,
} }
// set exclusive group info // set exclusive group info
if (exclusiveGroup.find(i) != exclusiveGroup.end()) { if (contains(exclusiveInfo.groupId, i)) {
sub->exclusive = 1; sub->exclusiveId = exclusiveInfo.groupId[i];
} else { } else {
sub->exclusive = 0; sub->exclusiveId = numRepeats;
} }
} }
ptr = base_ptr + total_size - sizeof(NFA) - byte_length(stale_iter);
assert(ptr + byte_length(stale_iter) == base_ptr + total_size - sizeof(NFA));
if (!stale_iter.empty()) {
c->staleIterOffset = verify_u32(ptr - base_ptr);
copy_bytes(ptr, stale_iter);
ptr += byte_length(stale_iter);
}
return nfa; return nfa;
} }
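The stale-iterator placement at the end of buildCastle() relies on rounding the running engine size up to the iterator's alignment before appending it (the ROUNDUP_N call above). For reference, a minimal sketch of that rounding, assuming a power-of-two alignment:

    #include <stddef.h>
    #include <assert.h>

    /* Illustrative only: round n up to the next multiple of align, where
     * align must be a power of two, as ROUNDUP_N is used above. */
    static size_t roundup_pow2(size_t n, size_t align) {
        assert((align & (align - 1)) == 0);
        return (n + align - 1) & ~(align - 1);
    }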
@ -603,7 +685,7 @@ depth findMaxWidth(const CastleProto &proto, u32 top) {
return proto.repeats.at(top).bounds.max; return proto.repeats.at(top).bounds.max;
} }
CastleProto::CastleProto(const PureRepeat &pr) { CastleProto::CastleProto(nfa_kind k, const PureRepeat &pr) : kind(k) {
assert(pr.reach.any()); assert(pr.reach.any());
assert(pr.reports.size() == 1); assert(pr.reports.size() == 1);
u32 top = 0; u32 top = 0;
@ -665,6 +747,7 @@ u32 CastleProto::merge(const PureRepeat &pr) {
bool mergeCastle(CastleProto &c1, const CastleProto &c2, bool mergeCastle(CastleProto &c1, const CastleProto &c2,
map<u32, u32> &top_map) { map<u32, u32> &top_map) {
assert(&c1 != &c2); assert(&c1 != &c2);
assert(c1.kind == c2.kind);
DEBUG_PRINTF("c1 has %zu repeats, c2 has %zu repeats\n", c1.repeats.size(), DEBUG_PRINTF("c1 has %zu repeats, c2 has %zu repeats\n", c1.repeats.size(),
c2.repeats.size()); c2.repeats.size());
@ -738,6 +821,7 @@ bool is_equal(const CastleProto &c1, ReportID report1, const CastleProto &c2,
ReportID report2) { ReportID report2) {
assert(!c1.repeats.empty()); assert(!c1.repeats.empty());
assert(!c2.repeats.empty()); assert(!c2.repeats.empty());
assert(c1.kind == c2.kind);
if (c1.reach() != c2.reach()) { if (c1.reach() != c2.reach()) {
DEBUG_PRINTF("different reach\n"); DEBUG_PRINTF("different reach\n");
@ -784,6 +868,7 @@ bool is_equal(const CastleProto &c1, ReportID report1, const CastleProto &c2,
bool is_equal(const CastleProto &c1, const CastleProto &c2) { bool is_equal(const CastleProto &c1, const CastleProto &c2) {
assert(!c1.repeats.empty()); assert(!c1.repeats.empty());
assert(!c2.repeats.empty()); assert(!c2.repeats.empty());
assert(c1.kind == c2.kind);
if (c1.reach() != c2.reach()) { if (c1.reach() != c2.reach()) {
DEBUG_PRINTF("different reach\n"); DEBUG_PRINTF("different reach\n");
@ -877,7 +962,7 @@ bool hasZeroMinBound(const CastleProto &proto) {
return false; return false;
} }
unique_ptr<NGHolder> makeHolder(const CastleProto &proto, nfa_kind kind, unique_ptr<NGHolder> makeHolder(const CastleProto &proto,
const CompileContext &cc) { const CompileContext &cc) {
assert(!proto.repeats.empty()); assert(!proto.repeats.empty());
@ -890,10 +975,10 @@ unique_ptr<NGHolder> makeHolder(const CastleProto &proto, nfa_kind kind,
} }
} }
unique_ptr<NGHolder> g = ue2::make_unique<NGHolder>(kind); auto g = ue2::make_unique<NGHolder>(proto.kind);
for (const auto &m : proto.repeats) { for (const auto &m : proto.repeats) {
if (m.first >= CASTLE_MAX_TOPS) { if (m.first >= NFA_MAX_TOP_MASKS) {
DEBUG_PRINTF("top %u too big for an NFA\n", m.first); DEBUG_PRINTF("top %u too big for an NFA\n", m.first);
return nullptr; return nullptr;
} }


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -51,6 +51,7 @@ namespace ue2 {
class CharReach; class CharReach;
class NGHolder; class NGHolder;
class ReportManager;
struct CompileContext; struct CompileContext;
/** /**
@ -65,7 +66,7 @@ struct CompileContext;
*/ */
struct CastleProto { struct CastleProto {
static constexpr size_t max_occupancy = 65536; // arbitrary limit static constexpr size_t max_occupancy = 65536; // arbitrary limit
explicit CastleProto(const PureRepeat &pr); CastleProto(nfa_kind k, const PureRepeat &pr);
const CharReach &reach() const; const CharReach &reach() const;
/** \brief Add a new repeat. */ /** \brief Add a new repeat. */
@ -94,6 +95,9 @@ struct CastleProto {
* so we track this explicitly instead of using repeats.size(). * so we track this explicitly instead of using repeats.size().
*/ */
u32 next_top = 1; u32 next_top = 1;
/** \brief Kind for this engine. */
nfa_kind kind;
}; };
std::set<ReportID> all_reports(const CastleProto &proto); std::set<ReportID> all_reports(const CastleProto &proto);
@ -119,7 +123,7 @@ void remapCastleTops(CastleProto &proto, std::map<u32, u32> &top_map);
ue2::aligned_unique_ptr<NFA> ue2::aligned_unique_ptr<NFA>
buildCastle(const CastleProto &proto, buildCastle(const CastleProto &proto,
const std::map<u32, std::vector<std::vector<CharReach>>> &triggers, const std::map<u32, std::vector<std::vector<CharReach>>> &triggers,
const CompileContext &cc); const CompileContext &cc, const ReportManager &rm);
/** /**
* \brief Merge two CastleProto prototypes together, if possible. * \brief Merge two CastleProto prototypes together, if possible.
@ -155,7 +159,7 @@ bool requiresDedupe(const CastleProto &proto,
/** /**
* \brief Build an NGHolder from a CastleProto. * \brief Build an NGHolder from a CastleProto.
*/ */
std::unique_ptr<NGHolder> makeHolder(const CastleProto &castle, nfa_kind kind, std::unique_ptr<NGHolder> makeHolder(const CastleProto &castle,
const CompileContext &cc); const CompileContext &cc);
} // namespace ue2 } // namespace ue2


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -1049,15 +1049,16 @@ char nfaExecGough16_inAccept(const struct NFA *n, ReportID report,
} }
static static
void goughCheckEOD(const struct NFA *nfa, u16 s, char goughCheckEOD(const struct NFA *nfa, u16 s,
const struct gough_som_info *som, const struct gough_som_info *som,
u64a offset, SomNfaCallback cb, void *ctxt) { u64a offset, SomNfaCallback cb, void *ctxt) {
const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa);
const struct mstate_aux *aux = get_aux(m, s); const struct mstate_aux *aux = get_aux(m, s);
if (aux->accept_eod) { if (!aux->accept_eod) {
doReports(cb, ctxt, m, som, s, offset, 1, NULL, NULL, NULL); return MO_CONTINUE_MATCHING;
} }
return doReports(cb, ctxt, m, som, s, offset, 1, NULL, NULL, NULL);
} }
char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state, char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state,
@ -1065,8 +1066,8 @@ char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state,
UNUSED NfaCallback callback, UNUSED NfaCallback callback,
SomNfaCallback som_callback, void *context) { SomNfaCallback som_callback, void *context) {
const struct gough_som_info *som = getSomInfoConst(state); const struct gough_som_info *som = getSomInfoConst(state);
goughCheckEOD(nfa, *(const u8 *)state, som, offset, som_callback, context); return goughCheckEOD(nfa, *(const u8 *)state, som, offset, som_callback,
return 0; context);
} }
char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state, char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state,
@ -1075,8 +1076,8 @@ char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state,
SomNfaCallback som_callback, void *context) { SomNfaCallback som_callback, void *context) {
assert(ISALIGNED_N(state, 8)); assert(ISALIGNED_N(state, 8));
const struct gough_som_info *som = getSomInfoConst(state); const struct gough_som_info *som = getSomInfoConst(state);
goughCheckEOD(nfa, *(const u16 *)state, som, offset, som_callback, context); return goughCheckEOD(nfa, *(const u16 *)state, som, offset, som_callback,
return 0; context);
} }
char nfaExecGough8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { char nfaExecGough8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) {


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -41,8 +41,9 @@
#include "util/graph_range.h" #include "util/graph_range.h"
#include "util/make_unique.h" #include "util/make_unique.h"
#include "util/order_check.h" #include "util/order_check.h"
#include "util/verify_types.h" #include "util/report_manager.h"
#include "util/ue2_containers.h" #include "util/ue2_containers.h"
#include "util/verify_types.h"
#include "ue2common.h" #include "ue2common.h"
@ -77,18 +78,20 @@ namespace {
class gough_build_strat : public mcclellan_build_strat { class gough_build_strat : public mcclellan_build_strat {
public: public:
gough_build_strat(raw_som_dfa &r, const GoughGraph &g, gough_build_strat(
const map<dstate_id_t, gough_accel_state_info> &accel_info) raw_som_dfa &r, const GoughGraph &g, const ReportManager &rm,
: mcclellan_build_strat(r), rdfa(r), gg(g), const map<dstate_id_t, gough_accel_state_info> &accel_info)
: mcclellan_build_strat(r, rm), rdfa(r), gg(g),
accel_gough_info(accel_info) {} accel_gough_info(accel_info) {}
unique_ptr<raw_report_info> gatherReports(vector<u32> &reports /* out */, unique_ptr<raw_report_info> gatherReports(vector<u32> &reports /* out */,
vector<u32> &reports_eod /* out */, vector<u32> &reports_eod /* out */,
u8 *isSingleReport /* out */, u8 *isSingleReport /* out */,
ReportID *arbReport /* out */) const override; ReportID *arbReport /* out */) const override;
void find_escape_strings(dstate_id_t this_idx, AccelScheme find_escape_strings(dstate_id_t this_idx) const override;
escape_info *out) const override;
size_t accelSize(void) const override { return sizeof(gough_accel); } size_t accelSize(void) const override { return sizeof(gough_accel); }
void buildAccel(dstate_id_t this_idx, void *accel_out) override; void buildAccel(dstate_id_t this_idx, const AccelScheme &info,
void *accel_out) override;
u32 max_allowed_offset_accel() const override { return 0; }
raw_som_dfa &rdfa; raw_som_dfa &rdfa;
const GoughGraph &gg; const GoughGraph &gg;
@ -1034,7 +1037,8 @@ void update_accel_prog_offset(const gough_build_strat &gbs,
} }
aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision, aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
const CompileContext &cc) { const CompileContext &cc,
const ReportManager &rm) {
assert(somPrecision == 2 || somPrecision == 4 || somPrecision == 8 assert(somPrecision == 2 || somPrecision == 4 || somPrecision == 8
|| !cc.streaming); || !cc.streaming);
@ -1066,7 +1070,7 @@ aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
map<dstate_id_t, gough_accel_state_info> accel_allowed; map<dstate_id_t, gough_accel_state_info> accel_allowed;
find_allowed_accel_states(*cfg, blocks, &accel_allowed); find_allowed_accel_states(*cfg, blocks, &accel_allowed);
gough_build_strat gbs(raw, *cfg, accel_allowed); gough_build_strat gbs(raw, *cfg, rm, accel_allowed);
aligned_unique_ptr<NFA> basic_dfa = mcclellanCompile_i(raw, gbs, cc); aligned_unique_ptr<NFA> basic_dfa = mcclellanCompile_i(raw, gbs, cc);
assert(basic_dfa); assert(basic_dfa);
if (!basic_dfa) { if (!basic_dfa) {
@ -1145,32 +1149,44 @@ aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
return gough_dfa; return gough_dfa;
} }
void gough_build_strat::find_escape_strings(dstate_id_t this_idx, AccelScheme gough_build_strat::find_escape_strings(dstate_id_t this_idx) const {
escape_info *out) const { AccelScheme rv;
if (!contains(accel_gough_info, this_idx)) { if (!contains(accel_gough_info, this_idx)) {
out->outs = CharReach::dot(); rv.cr = CharReach::dot();
out->outs2_broken = true; rv.double_byte.clear();
return; return rv;
} }
mcclellan_build_strat::find_escape_strings(this_idx, out); rv = mcclellan_build_strat::find_escape_strings(this_idx);
if (!accel_gough_info.at(this_idx).two_byte) { assert(!rv.offset || rv.cr.all()); /* should have been limited by strat */
out->outs2_broken = true; if (rv.offset) {
rv.cr = CharReach::dot();
rv.double_byte.clear();
return rv;
} }
if (rv.double_offset
|| !accel_gough_info.at(this_idx).two_byte) {
rv.double_byte.clear();
}
return rv;
} }
void gough_build_strat::buildAccel(dstate_id_t this_idx, void *accel_out) { void gough_build_strat::buildAccel(dstate_id_t this_idx, const AccelScheme &info,
void *accel_out) {
assert(mcclellan_build_strat::accelSize() == sizeof(AccelAux)); assert(mcclellan_build_strat::accelSize() == sizeof(AccelAux));
gough_accel *accel = (gough_accel *)accel_out; gough_accel *accel = (gough_accel *)accel_out;
/* build a plain accelaux so we can work out where we can get to */ /* build a plain accelaux so we can work out where we can get to */
mcclellan_build_strat::buildAccel(this_idx, &accel->accel); mcclellan_build_strat::buildAccel(this_idx, info, &accel->accel);
DEBUG_PRINTF("state %hu is accel with type %hhu\n", this_idx, DEBUG_PRINTF("state %hu is accel with type %hhu\n", this_idx,
accel->accel.accel_type); accel->accel.accel_type);
if (accel->accel.accel_type == ACCEL_NONE) { if (accel->accel.accel_type == ACCEL_NONE) {
return; return;
} }
assert(!accel->accel.generic.offset);
assert(contains(accel_gough_info, this_idx)); assert(contains(accel_gough_info, this_idx));
accel->margin_dist = verify_u8(accel_gough_info.at(this_idx).margin); accel->margin_dist = verify_u8(accel_gough_info.at(this_idx).margin);
built_accel[accel] = this_idx; built_accel[accel] = this_idx;
@ -1182,10 +1198,11 @@ namespace {
struct raw_gough_report_list { struct raw_gough_report_list {
set<som_report> reports; set<som_report> reports;
explicit raw_gough_report_list( raw_gough_report_list(
const vector<pair<ReportID, GoughSSAVar *>> &raw_reports) { const vector<pair<ReportID, GoughSSAVar *>> &raw_reports,
const ReportManager &rm, bool do_remap) {
for (const auto &m : raw_reports) { for (const auto &m : raw_reports) {
ReportID r = m.first; ReportID r = do_remap ? rm.getProgramOffset(m.first) : m.first;
u32 impl_slot = INVALID_SLOT; u32 impl_slot = INVALID_SLOT;
if (m.second) { if (m.second) {
impl_slot = m.second->slot; impl_slot = m.second->slot;
@ -1214,11 +1231,13 @@ unique_ptr<raw_report_info> gough_build_strat::gatherReports(
vector<u32> &reports_eod, vector<u32> &reports_eod,
u8 *isSingleReport, u8 *isSingleReport,
ReportID *arbReport) const { ReportID *arbReport) const {
unique_ptr<raw_gough_report_info_impl> ri =
ue2::make_unique<raw_gough_report_info_impl>();
map<raw_gough_report_list, u32> rev;
DEBUG_PRINTF("gathering reports\n"); DEBUG_PRINTF("gathering reports\n");
const bool remap_reports = has_managed_reports(rdfa.kind);
auto ri = ue2::make_unique<raw_gough_report_info_impl>();
map<raw_gough_report_list, u32> rev;
assert(!rdfa.states.empty()); assert(!rdfa.states.empty());
vector<GoughVertex> verts(rdfa.states.size()); vector<GoughVertex> verts(rdfa.states.size());
@ -1237,7 +1256,7 @@ unique_ptr<raw_report_info> gough_build_strat::gatherReports(
continue; continue;
} }
raw_gough_report_list rrl(gg[v].reports); raw_gough_report_list rrl(gg[v].reports, rm, remap_reports);
DEBUG_PRINTF("non empty r %zu\n", reports.size()); DEBUG_PRINTF("non empty r %zu\n", reports.size());
if (rev.find(rrl) != rev.end()) { if (rev.find(rrl) != rev.end()) {
reports.push_back(rev[rrl]); reports.push_back(rev[rrl]);
@ -1256,7 +1275,7 @@ unique_ptr<raw_report_info> gough_build_strat::gatherReports(
} }
DEBUG_PRINTF("non empty r eod\n"); DEBUG_PRINTF("non empty r eod\n");
raw_gough_report_list rrl(gg[v].reports_eod); raw_gough_report_list rrl(gg[v].reports_eod, rm, remap_reports);
if (rev.find(rrl) != rev.end()) { if (rev.find(rrl) != rev.end()) {
reports_eod.push_back(rev[rrl]); reports_eod.push_back(rev[rrl]);
continue; continue;


@ -89,7 +89,8 @@ struct raw_som_dfa : public raw_dfa {
}; };
aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision, aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
const CompileContext &cc); const CompileContext &cc,
const ReportManager &rm);
} // namespace ue2 } // namespace ue2


@ -130,6 +130,9 @@ char repeatIsDead(const struct RepeatInfo *info,
return lstate->ctrl.ring.offset == REPEAT_DEAD; return lstate->ctrl.ring.offset == REPEAT_DEAD;
case REPEAT_TRAILER: case REPEAT_TRAILER:
return lstate->ctrl.trailer.offset == REPEAT_DEAD; return lstate->ctrl.trailer.offset == REPEAT_DEAD;
case REPEAT_ALWAYS:
assert(!"REPEAT_ALWAYS should only be used by Castle");
return 0;
} }
assert(0); assert(0);


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -65,14 +65,13 @@ extern "C"
void *state, u8 key); \ void *state, u8 key); \
char gf_name##_B_Reverse(const struct NFA *n, u64a offset, const u8 *buf, \ char gf_name##_B_Reverse(const struct NFA *n, u64a offset, const u8 *buf, \
size_t buflen, const u8 *hbuf, size_t hlen, \ size_t buflen, const u8 *hbuf, size_t hlen, \
struct hs_scratch *scratch, NfaCallback cb, \ NfaCallback cb, void *context); \
void *context); \
char gf_name##_queueCompressState(const struct NFA *nfa, \ char gf_name##_queueCompressState(const struct NFA *nfa, \
const struct mq *q, s64a loc); \ const struct mq *q, s64a loc); \
char gf_name##_expandState(const struct NFA *nfa, void *dest, \ char gf_name##_expandState(const struct NFA *nfa, void *dest, \
const void *src, u64a offset, u8 key); \ const void *src, u64a offset, u8 key); \
enum nfa_zombie_status gf_name##_zombie_status(const struct NFA *nfa, \ enum nfa_zombie_status gf_name##_zombie_status(const struct NFA *nfa, \
struct mq *q, s64a loc); \ struct mq *q, s64a loc); \
GENERATE_NFA_DUMP_DECL(gf_name) GENERATE_NFA_DUMP_DECL(gf_name)
GENERATE_NFA_DECL(nfaExecLimEx32_1) GENERATE_NFA_DECL(nfaExecLimEx32_1)


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -38,6 +38,9 @@
#include "nfa_internal.h" #include "nfa_internal.h"
#include "shufti.h" #include "shufti.h"
#include "truffle.h" #include "truffle.h"
#include "multishufti.h"
#include "multitruffle.h"
#include "multivermicelli.h"
#include "ue2common.h" #include "ue2common.h"
#include "vermicelli.h" #include "vermicelli.h"
#include "util/bitutils.h" #include "util/bitutils.h"
@ -46,74 +49,6 @@
#include "util/simd_utils_ssse3.h" #include "util/simd_utils_ssse3.h"
#include "util/shuffle_ssse3.h" #include "util/shuffle_ssse3.h"
static
const u8 *accelScan(const union AccelAux *aux, const u8 *ptr, const u8 *end) {
assert(ISALIGNED(aux)); // must be SIMD aligned for shufti
assert(end > ptr);
assert(end - ptr >= 16); // must be at least 16 bytes to scan
const u8 *start = ptr;
u8 offset;
switch (aux->accel_type) {
case ACCEL_VERM:
DEBUG_PRINTF("single vermicelli for 0x%02hhx\n", aux->verm.c);
offset = aux->verm.offset;
ptr = vermicelliExec(aux->verm.c, 0, ptr, end);
break;
case ACCEL_VERM_NOCASE:
DEBUG_PRINTF("single vermicelli-nocase for 0x%02hhx\n", aux->verm.c);
offset = aux->verm.offset;
ptr = vermicelliExec(aux->verm.c, 1, ptr, end);
break;
case ACCEL_DVERM:
DEBUG_PRINTF("double vermicelli for 0x%02hhx%02hhx\n",
aux->dverm.c1, aux->dverm.c2);
offset = aux->dverm.offset;
ptr = vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 0, ptr, end);
break;
case ACCEL_DVERM_NOCASE:
DEBUG_PRINTF("double vermicelli-nocase for 0x%02hhx%02hhx\n",
aux->dverm.c1, aux->dverm.c2);
offset = aux->dverm.offset;
ptr = vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2,
1, ptr, end);
break;
case ACCEL_SHUFTI:
DEBUG_PRINTF("single shufti\n");
offset = aux->shufti.offset;
ptr = shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end);
break;
case ACCEL_DSHUFTI:
DEBUG_PRINTF("double shufti\n");
offset = aux->dshufti.offset;
ptr = shuftiDoubleExec(aux->dshufti.lo1, aux->dshufti.hi1,
aux->dshufti.lo2, aux->dshufti.hi2, ptr, end);
break;
case ACCEL_TRUFFLE:
DEBUG_PRINTF("truffle shuffle\n");
offset = aux->truffle.offset;
ptr = truffleExec(aux->truffle.mask1, aux->truffle.mask2, ptr, end);
break;
case ACCEL_RED_TAPE:
ptr = end; /* there is no escape */
offset = aux->generic.offset;
break;
default:
/* no acceleration, fall through and return current ptr */
offset = 0;
break;
}
if (offset) {
ptr -= offset;
if (ptr < start) {
return start;
}
}
return ptr;
}
static really_inline static really_inline
size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux, size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux,
const u8 *input, u32 idx, size_t i, size_t end) { const u8 *input, u32 idx, size_t i, size_t end) {
@ -134,7 +69,7 @@ size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux,
} }
aux = aux + aux_idx; aux = aux + aux_idx;
const u8 *ptr = accelScan(aux, &input[i], &input[end]); const u8 *ptr = run_accel(aux, &input[i], &input[end]);
assert(ptr >= &input[i]); assert(ptr >= &input[i]);
size_t j = (size_t)(ptr - input); size_t j = (size_t)(ptr - input);
DEBUG_PRINTF("accel skipped %zu of %zu chars\n", (j - i), (end - i)); DEBUG_PRINTF("accel skipped %zu of %zu chars\n", (j - i), (end - i));


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -80,9 +80,11 @@ struct precalcAccel {
CharReach double_cr; CharReach double_cr;
flat_set<pair<u8, u8>> double_lits; /* double-byte accel stop literals */ flat_set<pair<u8, u8>> double_lits; /* double-byte accel stop literals */
u32 double_offset; u32 double_offset;
MultibyteAccelInfo ma_info;
}; };
struct meteor_accel_info { struct limex_accel_info {
ue2::unordered_set<NFAVertex> accelerable; ue2::unordered_set<NFAVertex> accelerable;
map<NFAStateSet, precalcAccel> precalc; map<NFAStateSet, precalcAccel> precalc;
ue2::unordered_map<NFAVertex, flat_set<NFAVertex> > friends; ue2::unordered_map<NFAVertex, flat_set<NFAVertex> > friends;
@ -162,7 +164,7 @@ struct build_info {
bool stateCompression; bool stateCompression;
const CompileContext &cc; const CompileContext &cc;
u32 num_states; u32 num_states;
meteor_accel_info accel; limex_accel_info accel;
}; };
// Constants for scoring mechanism // Constants for scoring mechanism
@ -334,12 +336,16 @@ void buildReachMapping(const build_info &args, vector<NFAStateSet> &reach,
} }
struct AccelBuild { struct AccelBuild {
AccelBuild() : v(NFAGraph::null_vertex()), state(0), offset(0) {} AccelBuild() : v(NFAGraph::null_vertex()), state(0), offset(0), ma_len1(0),
ma_len2(0), ma_type(MultibyteAccelInfo::MAT_NONE) {}
NFAVertex v; NFAVertex v;
u32 state; u32 state;
u32 offset; // offset correction to apply u32 offset; // offset correction to apply
CharReach stop1; // single-byte accel stop literals CharReach stop1; // single-byte accel stop literals
flat_set<pair<u8, u8>> stop2; // double-byte accel stop literals flat_set<pair<u8, u8>> stop2; // double-byte accel stop literals
u32 ma_len1; // multiaccel len1
u32 ma_len2; // multiaccel len2
MultibyteAccelInfo::multiaccel_type ma_type; // multiaccel type
}; };
static static
@ -354,7 +360,12 @@ void findStopLiterals(const build_info &bi, NFAVertex v, AccelBuild &build) {
build.stop1 = CharReach::dot(); build.stop1 = CharReach::dot();
} else { } else {
const precalcAccel &precalc = bi.accel.precalc.at(ss); const precalcAccel &precalc = bi.accel.precalc.at(ss);
if (precalc.double_lits.empty()) { unsigned ma_len = precalc.ma_info.len1 + precalc.ma_info.len2;
if (ma_len >= MULTIACCEL_MIN_LEN) {
build.ma_len1 = precalc.ma_info.len1;
build.stop1 = precalc.ma_info.cr;
build.offset = precalc.ma_info.offset;
} else if (precalc.double_lits.empty()) {
build.stop1 = precalc.single_cr; build.stop1 = precalc.single_cr;
build.offset = precalc.single_offset; build.offset = precalc.single_offset;
} else { } else {
@ -534,7 +545,7 @@ void filterAccelStates(NGHolder &g, const map<u32, NFAVertex> &tops,
} }
static static
bool containsBadSubset(const meteor_accel_info &accel, bool containsBadSubset(const limex_accel_info &accel,
const NFAStateSet &state_set, const u32 effective_sds) { const NFAStateSet &state_set, const u32 effective_sds) {
NFAStateSet subset(state_set.size()); NFAStateSet subset(state_set.size());
for (size_t j = state_set.find_first(); j != state_set.npos; for (size_t j = state_set.find_first(); j != state_set.npos;
@ -555,11 +566,29 @@ bool containsBadSubset(const meteor_accel_info &accel,
} }
static static
void doAccelCommon(NGHolder &g, bool is_too_wide(const AccelScheme &as) {
ue2::unordered_map<NFAVertex, AccelScheme> &accel_map, return as.cr.count() > MAX_MERGED_ACCEL_STOPS;
const ue2::unordered_map<NFAVertex, u32> &state_ids, }
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
const u32 num_states, meteor_accel_info *accel) { static
void fillAccelInfo(build_info &bi) {
if (!bi.do_accel) {
return;
}
NGHolder &g = bi.h;
limex_accel_info &accel = bi.accel;
unordered_map<NFAVertex, AccelScheme> &accel_map = accel.accel_map;
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic = bi.br_cyclic;
const CompileContext &cc = bi.cc;
const unordered_map<NFAVertex, u32> &state_ids = bi.state_ids;
const u32 num_states = bi.num_states;
nfaFindAccelSchemes(g, br_cyclic, &accel_map);
filterAccelStates(g, bi.tops, &accel_map);
assert(accel_map.size() <= NFA_MAX_ACCEL_STATES);
vector<CharReach> refined_cr = reduced_cr(g, br_cyclic); vector<CharReach> refined_cr = reduced_cr(g, br_cyclic);
vector<NFAVertex> astates; vector<NFAVertex> astates;
@ -590,7 +619,7 @@ void doAccelCommon(NGHolder &g,
} }
} }
if (containsBadSubset(*accel, state_set, effective_sds)) { if (containsBadSubset(accel, state_set, effective_sds)) {
DEBUG_PRINTF("accel %u has bad subset\n", i); DEBUG_PRINTF("accel %u has bad subset\n", i);
continue; /* if a subset failed to build we would too */ continue; /* if a subset failed to build we would too */
} }
@ -598,30 +627,37 @@ void doAccelCommon(NGHolder &g,
const bool allow_wide = allow_wide_accel(states, g, sds_or_proxy); const bool allow_wide = allow_wide_accel(states, g, sds_or_proxy);
AccelScheme as = nfaFindAccel(g, states, refined_cr, br_cyclic, AccelScheme as = nfaFindAccel(g, states, refined_cr, br_cyclic,
allow_wide); allow_wide, true);
if (as.cr.count() > MAX_MERGED_ACCEL_STOPS) { if (is_too_wide(as)) {
DEBUG_PRINTF("accel %u too wide (%zu, %d)\n", i, DEBUG_PRINTF("accel %u too wide (%zu, %d)\n", i,
as.cr.count(), MAX_MERGED_ACCEL_STOPS); as.cr.count(), MAX_MERGED_ACCEL_STOPS);
continue; continue;
} }
DEBUG_PRINTF("accel %u ok with offset %u\n", i, as.offset); DEBUG_PRINTF("accel %u ok with offset s%u, d%u\n", i, as.offset,
as.double_offset);
precalcAccel &pa = accel->precalc[state_set]; // try multibyte acceleration first
pa.single_offset = as.offset; MultibyteAccelInfo mai = nfaCheckMultiAccel(g, states, cc);
pa.single_cr = as.cr;
precalcAccel &pa = accel.precalc[state_set];
useful |= state_set; useful |= state_set;
if (states.size() == 1) { // if we successfully built a multibyte accel scheme, use that
DoubleAccelInfo b = findBestDoubleAccelInfo(g, states.front()); if (mai.type != MultibyteAccelInfo::MAT_NONE) {
if (pa.single_cr.count() > b.stop1.count()) { pa.ma_info = mai;
/* insert this information into the precalc accel info as it is
* better than the single scheme */ DEBUG_PRINTF("multibyte acceleration!\n");
pa.double_offset = b.offset; continue;
pa.double_lits = b.stop2;
pa.double_cr = b.stop1;
}
} }
pa.single_offset = as.offset;
pa.single_cr = as.cr;
if (as.double_byte.size() != 0) {
pa.double_offset = as.double_offset;
pa.double_lits = as.double_byte;
pa.double_cr = as.double_cr;
};
} }
for (const auto &m : accel_map) { for (const auto &m : accel_map) {
@ -638,31 +674,22 @@ void doAccelCommon(NGHolder &g,
state_set.reset(); state_set.reset();
state_set.set(state_id); state_set.set(state_id);
auto p_it = accel->precalc.find(state_set); bool is_multi = false;
if (p_it != accel->precalc.end()) { auto p_it = accel.precalc.find(state_set);
if (p_it != accel.precalc.end()) {
const precalcAccel &pa = p_it->second; const precalcAccel &pa = p_it->second;
offset = max(pa.double_offset, pa.single_offset); offset = max(pa.double_offset, pa.single_offset);
is_multi = pa.ma_info.type != MultibyteAccelInfo::MAT_NONE;
assert(offset <= MAX_ACCEL_DEPTH); assert(offset <= MAX_ACCEL_DEPTH);
} }
accel->accelerable.insert(v); accel.accelerable.insert(v);
findAccelFriends(g, v, br_cyclic, offset, &accel->friends[v]); if (!is_multi) {
findAccelFriends(g, v, br_cyclic, offset, &accel.friends[v]);
}
} }
} }
static
void fillAccelInfo(build_info &bi) {
if (!bi.do_accel) {
return;
}
nfaFindAccelSchemes(bi.h, bi.br_cyclic, &bi.accel.accel_map);
filterAccelStates(bi.h, bi.tops, &bi.accel.accel_map);
assert(bi.accel.accel_map.size() <= NFA_MAX_ACCEL_STATES);
doAccelCommon(bi.h, bi.accel.accel_map, bi.state_ids, bi.br_cyclic,
bi.num_states, &bi.accel);
}
/** The AccelAux structure has large alignment specified, and this makes some /** The AccelAux structure has large alignment specified, and this makes some
* compilers do odd things unless we specify a custom allocator. */ * compilers do odd things unless we specify a custom allocator. */
typedef vector<AccelAux, AlignedAllocator<AccelAux, alignof(AccelAux)> > typedef vector<AccelAux, AlignedAllocator<AccelAux, alignof(AccelAux)> >
@ -672,7 +699,7 @@ static
void buildAccel(const build_info &args, NFAStateSet &accelMask, void buildAccel(const build_info &args, NFAStateSet &accelMask,
NFAStateSet &accelFriendsMask, AccelAuxVector &auxvec, NFAStateSet &accelFriendsMask, AccelAuxVector &auxvec,
vector<u8> &accelTable) { vector<u8> &accelTable) {
const meteor_accel_info &accel = args.accel; const limex_accel_info &accel = args.accel;
// Init, all zeroes. // Init, all zeroes.
accelMask.resize(args.num_states); accelMask.resize(args.num_states);
@ -737,8 +764,16 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask,
if (contains(accel.precalc, states)) { if (contains(accel.precalc, states)) {
const precalcAccel &precalc = accel.precalc.at(states); const precalcAccel &precalc = accel.precalc.at(states);
ainfo.single_offset = precalc.single_offset; if (precalc.ma_info.type != MultibyteAccelInfo::MAT_NONE) {
ainfo.single_stops = precalc.single_cr; ainfo.ma_len1 = precalc.ma_info.len1;
ainfo.ma_len2 = precalc.ma_info.len2;
ainfo.multiaccel_offset = precalc.ma_info.offset;
ainfo.multiaccel_stops = precalc.ma_info.cr;
ainfo.ma_type = precalc.ma_info.type;
} else {
ainfo.single_offset = precalc.single_offset;
ainfo.single_stops = precalc.single_cr;
}
} }
buildAccelAux(ainfo, &aux); buildAccelAux(ainfo, &aux);
@ -2152,7 +2187,7 @@ u32 countAccelStates(NGHolder &h,
if (!cc.grey.allowLimExNFA) { if (!cc.grey.allowLimExNFA) {
DEBUG_PRINTF("limex not allowed\n"); DEBUG_PRINTF("limex not allowed\n");
return NFA_MAX_ACCEL_STATES + 1; return 0;
} }
// Sanity check the input data. // Sanity check the input data.
@ -2166,11 +2201,11 @@ u32 countAccelStates(NGHolder &h,
do_accel, state_compression, cc, num_states); do_accel, state_compression, cc, num_states);
// Acceleration analysis. // Acceleration analysis.
fillAccelInfo(bi); nfaFindAccelSchemes(bi.h, bi.br_cyclic, &bi.accel.accel_map);
u32 num_accel = verify_u32(bi.accel.accelerable.size()); u32 num_accel = verify_u32(bi.accel.accel_map.size());
DEBUG_PRINTF("found %u accel states\n", num_accel); DEBUG_PRINTF("found %u accel states\n", num_accel);
return min(num_accel, (u32)NFA_MAX_ACCEL_STATES); return num_accel;
} }
} // namespace ue2 } // namespace ue2


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -79,11 +79,10 @@ aligned_unique_ptr<NFA> generate(NGHolder &g,
const CompileContext &cc); const CompileContext &cc);
/** /**
* \brief For a given graph, count the number of accel states it will have in * \brief For a given graph, count the number of accelerable states it has.
* an implementation.
* *
* \return the number of accel states, or NFA_MAX_ACCEL_STATES + 1 if an * Note that this number may be greater than the number that are actually
* implementation would not be constructible. * implementable.
*/ */
u32 countAccelStates(NGHolder &h, u32 countAccelStates(NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &states, const ue2::unordered_map<NFAVertex, u32> &states,


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -42,7 +42,6 @@
#include "limex_internal.h" #include "limex_internal.h"
#include "nfa_api_util.h" #include "nfa_api_util.h"
#include "nfa_internal.h" #include "nfa_internal.h"
#include "scratch.h"
#include "util/uniform_ops.h" #include "util/uniform_ops.h"
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -176,8 +176,6 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length,
const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex);
const ReportID *exReports = getExReports(limex); const ReportID *exReports = getExReports(limex);
const u32 *exceptionMap = limex->exceptionMap; const u32 *exceptionMap = limex->exceptionMap;
assert(ISALIGNED_CL(ctx));
assert(ISALIGNED_CL(&ctx->s));
STATE_T s = LOAD_STATE(&ctx->s); STATE_T s = LOAD_STATE(&ctx->s);
/* assert(ISALIGNED_16(exceptions)); */ /* assert(ISALIGNED_16(exceptions)); */
@ -533,17 +531,17 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) {
assert(q->cur + 1 < q->end); /* require at least two items */ assert(q->cur + 1 < q->end); /* require at least two items */
struct CONTEXT_T *ctx = q->scratch->nfaContext; struct CONTEXT_T ctx;
assert(ISALIGNED_CL(ctx)); ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T));
ctx->repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); ctx.repeat_state = q->streamState + limex->stateSize;
ctx->repeat_state = q->streamState + limex->stateSize; ctx.callback = q->cb;
ctx->callback = q->cb; ctx.context = q->context;
ctx->context = q->context; STORE_STATE(&ctx.cached_estate, ZERO_STATE);
STORE_STATE(&ctx->cached_estate, ZERO_STATE); ctx.cached_br = 0;
assert(q->items[q->cur].location >= 0); assert(q->items[q->cur].location >= 0);
DEBUG_PRINTF("LOAD STATE\n"); DEBUG_PRINTF("LOAD STATE\n");
STORE_STATE(&ctx->s, LOAD_STATE(q->state)); STORE_STATE(&ctx.s, LOAD_STATE(q->state));
assert(q->items[q->cur].type == MQE_START); assert(q->items[q->cur].type == MQE_START);
u64a offset = q->offset; u64a offset = q->offset;
@ -565,7 +563,7 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) {
/* do main buffer region */ /* do main buffer region */
DEBUG_PRINTF("MAIN BUFFER SCAN\n"); DEBUG_PRINTF("MAIN BUFFER SCAN\n");
assert(ep - offset <= q->length); assert(ep - offset <= q->length);
if (STREAMCB_FN(limex, q->buffer + sp - offset, ep - sp, ctx, sp) if (STREAMCB_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp)
== MO_HALT_MATCHING) { == MO_HALT_MATCHING) {
STORE_STATE(q->state, ZERO_STATE); STORE_STATE(q->state, ZERO_STATE);
return 0; return 0;
@ -584,19 +582,19 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) {
q->items[q->cur].type = MQE_START; q->items[q->cur].type = MQE_START;
q->items[q->cur].location = sp - offset; q->items[q->cur].location = sp - offset;
DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end); DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end);
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
return MO_ALIVE; return MO_ALIVE;
} }
JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, ctx, sp); JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp);
q->cur++; q->cur++;
} }
EXPIRE_ESTATE_FN(limex, ctx, sp); EXPIRE_ESTATE_FN(limex, &ctx, sp);
DEBUG_PRINTF("END\n"); DEBUG_PRINTF("END\n");
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
if (q->cur != q->end) { if (q->cur != q->end) {
q->cur--; q->cur--;
@ -605,7 +603,7 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) {
return MO_ALIVE; return MO_ALIVE;
} }
return ISNONZERO_STATE(LOAD_STATE(&ctx->s)); return ISNONZERO_STATE(LOAD_STATE(&ctx.s));
} }
/* used by suffix execution in Rose */ /* used by suffix execution in Rose */
@ -628,16 +626,16 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
assert(q->cur + 1 < q->end); /* require at least two items */ assert(q->cur + 1 < q->end); /* require at least two items */
struct CONTEXT_T *ctx = q->scratch->nfaContext; struct CONTEXT_T ctx;
assert(ISALIGNED_CL(ctx)); ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T));
ctx->repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); ctx.repeat_state = q->streamState + limex->stateSize;
ctx->repeat_state = q->streamState + limex->stateSize; ctx.callback = q->cb;
ctx->callback = q->cb; ctx.context = q->context;
ctx->context = q->context; STORE_STATE(&ctx.cached_estate, ZERO_STATE);
STORE_STATE(&ctx->cached_estate, ZERO_STATE); ctx.cached_br = 0;
DEBUG_PRINTF("LOAD STATE\n"); DEBUG_PRINTF("LOAD STATE\n");
STORE_STATE(&ctx->s, LOAD_STATE(q->state)); STORE_STATE(&ctx.s, LOAD_STATE(q->state));
assert(q->items[q->cur].type == MQE_START); assert(q->items[q->cur].type == MQE_START);
u64a offset = q->offset; u64a offset = q->offset;
@ -661,7 +659,7 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
/* do main buffer region */ /* do main buffer region */
u64a final_look = 0; u64a final_look = 0;
assert(ep - offset <= q->length); assert(ep - offset <= q->length);
if (STREAMFIRST_FN(limex, q->buffer + sp - offset, ep - sp, ctx, sp, if (STREAMFIRST_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp,
&final_look) == MO_HALT_MATCHING) { &final_look) == MO_HALT_MATCHING) {
DEBUG_PRINTF("final_look:%llu sp:%llu end_abs:%llu offset:%llu\n", DEBUG_PRINTF("final_look:%llu sp:%llu end_abs:%llu offset:%llu\n",
final_look, sp, end_abs, offset); final_look, sp, end_abs, offset);
@ -669,7 +667,7 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
q->cur--; q->cur--;
q->items[q->cur].type = MQE_START; q->items[q->cur].type = MQE_START;
q->items[q->cur].location = sp + final_look - offset; q->items[q->cur].location = sp + final_look - offset;
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
return MO_MATCHES_PENDING; return MO_MATCHES_PENDING;
} }
@ -685,19 +683,19 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
q->items[q->cur].type = MQE_START; q->items[q->cur].type = MQE_START;
q->items[q->cur].location = sp - offset; q->items[q->cur].location = sp - offset;
DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end); DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end);
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
return MO_ALIVE; return MO_ALIVE;
} }
JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, ctx, sp); JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp);
q->cur++; q->cur++;
} }
EXPIRE_ESTATE_FN(limex, ctx, sp); EXPIRE_ESTATE_FN(limex, &ctx, sp);
DEBUG_PRINTF("END\n"); DEBUG_PRINTF("END\n");
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
if (q->cur != q->end) { if (q->cur != q->end) {
q->cur--; q->cur--;
@ -706,7 +704,7 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
return MO_ALIVE; return MO_ALIVE;
} }
return ISNONZERO_STATE(LOAD_STATE(&ctx->s)); return ISNONZERO_STATE(LOAD_STATE(&ctx.s));
} }
// Used for execution Rose prefix/infixes. // Used for execution Rose prefix/infixes.
@ -720,15 +718,16 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q,
assert(q->cur + 1 < q->end); /* require at least two items */ assert(q->cur + 1 < q->end); /* require at least two items */
struct CONTEXT_T *ctx = q->scratch->nfaContext; struct CONTEXT_T ctx;
ctx->repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T));
ctx->repeat_state = q->streamState + limex->stateSize; ctx.repeat_state = q->streamState + limex->stateSize;
ctx->callback = NULL; ctx.callback = NULL;
ctx->context = NULL; ctx.context = NULL;
STORE_STATE(&ctx->cached_estate, ZERO_STATE); STORE_STATE(&ctx.cached_estate, ZERO_STATE);
ctx.cached_br = 0;
DEBUG_PRINTF("LOAD STATE\n"); DEBUG_PRINTF("LOAD STATE\n");
STORE_STATE(&ctx->s, LOAD_STATE(q->state)); STORE_STATE(&ctx.s, LOAD_STATE(q->state));
assert(q->items[q->cur].type == MQE_START); assert(q->items[q->cur].type == MQE_START);
u64a offset = q->offset; u64a offset = q->offset;
@ -740,7 +739,7 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q,
if (n->maxWidth) { if (n->maxWidth) {
if (ep - sp > n->maxWidth) { if (ep - sp > n->maxWidth) {
sp = ep - n->maxWidth; sp = ep - n->maxWidth;
STORE_STATE(&ctx->s, INITIAL_FN(limex, !!sp)); STORE_STATE(&ctx.s, INITIAL_FN(limex, !!sp));
} }
} }
assert(ep >= sp); assert(ep >= sp);
@ -751,7 +750,7 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q,
u64a local_ep = MIN(offset, ep); u64a local_ep = MIN(offset, ep);
/* we are starting inside the history buffer */ /* we are starting inside the history buffer */
STREAMSILENT_FN(limex, q->history + q->hlength + sp - offset, STREAMSILENT_FN(limex, q->history + q->hlength + sp - offset,
local_ep - sp, ctx, sp); local_ep - sp, &ctx, sp);
sp = local_ep; sp = local_ep;
} }
@ -763,30 +762,30 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q,
/* do main buffer region */ /* do main buffer region */
DEBUG_PRINTF("MAIN BUFFER SCAN\n"); DEBUG_PRINTF("MAIN BUFFER SCAN\n");
assert(ep - offset <= q->length); assert(ep - offset <= q->length);
STREAMSILENT_FN(limex, q->buffer + sp - offset, ep - sp, ctx, sp); STREAMSILENT_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp);
DEBUG_PRINTF("SCAN DONE\n"); DEBUG_PRINTF("SCAN DONE\n");
scan_done: scan_done:
sp = ep; sp = ep;
JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, ctx, sp); JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp);
q->cur++; q->cur++;
} }
EXPIRE_ESTATE_FN(limex, ctx, sp); EXPIRE_ESTATE_FN(limex, &ctx, sp);
DEBUG_PRINTF("END, nfa is %s\n", DEBUG_PRINTF("END, nfa is %s\n",
ISNONZERO_STATE(ctx->s) ? "still alive" : "dead"); ISNONZERO_STATE(ctx.s) ? "still alive" : "dead");
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
if (JOIN(limexInAccept, SIZE)(limex, LOAD_STATE(&ctx->s), ctx->repeat_ctrl, if (JOIN(limexInAccept, SIZE)(limex, LOAD_STATE(&ctx.s), ctx.repeat_ctrl,
ctx->repeat_state, sp + 1, report)) { ctx.repeat_state, sp + 1, report)) {
return MO_MATCHES_PENDING; return MO_MATCHES_PENDING;
} }
return ISNONZERO_STATE(LOAD_STATE(&ctx->s)); return ISNONZERO_STATE(LOAD_STATE(&ctx.s));
} }
char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state, char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state,
@ -813,42 +812,40 @@ char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, struct mq *q) {
// Block mode reverse scan. // Block mode reverse scan.
char JOIN(LIMEX_API_ROOT, _B_Reverse)(const struct NFA *n, u64a offset, char JOIN(LIMEX_API_ROOT, _B_Reverse)(const struct NFA *n, u64a offset,
const u8 *buf, size_t buflen, const u8 *buf, size_t buflen,
const u8 *hbuf, size_t hlen, const u8 *hbuf, size_t hlen,
struct hs_scratch *scratch, NfaCallback cb, void *context) {
NfaCallback cb, void *context) {
assert(buf || hbuf); assert(buf || hbuf);
assert(buflen || hlen); assert(buflen || hlen);
/* This may be called INSIDE another NFA, so we need a separate struct CONTEXT_T ctx;
* context --> Hence the nfaContextSom */ ctx.repeat_ctrl = NULL;
struct CONTEXT_T *ctx = scratch->nfaContextSom; ctx.repeat_state = NULL;
ctx->repeat_ctrl = NULL; ctx.callback = cb;
ctx->repeat_state = NULL; ctx.context = context;
ctx->callback = cb; STORE_STATE(&ctx.cached_estate, ZERO_STATE);
ctx->context = context; ctx.cached_br = 0;
STORE_STATE(&ctx->cached_estate, ZERO_STATE);
const IMPL_NFA_T *limex = getImplNfa(n); const IMPL_NFA_T *limex = getImplNfa(n);
STORE_STATE(&ctx->s, INITIAL_FN(limex, 0)); // always anchored STORE_STATE(&ctx.s, INITIAL_FN(limex, 0)); // always anchored
// 'buf' may be null, for example when we're scanning at EOD time. // 'buf' may be null, for example when we're scanning at EOD time.
if (buflen) { if (buflen) {
assert(buf); assert(buf);
DEBUG_PRINTF("MAIN BUFFER SCAN, %zu bytes\n", buflen); DEBUG_PRINTF("MAIN BUFFER SCAN, %zu bytes\n", buflen);
offset -= buflen; offset -= buflen;
REV_STREAM_FN(limex, buf, buflen, ctx, offset); REV_STREAM_FN(limex, buf, buflen, &ctx, offset);
} }
if (hlen) { if (hlen) {
assert(hbuf); assert(hbuf);
DEBUG_PRINTF("HISTORY BUFFER SCAN, %zu bytes\n", hlen); DEBUG_PRINTF("HISTORY BUFFER SCAN, %zu bytes\n", hlen);
offset -= hlen; offset -= hlen;
REV_STREAM_FN(limex, hbuf, hlen, ctx, offset); REV_STREAM_FN(limex, hbuf, hlen, &ctx, offset);
} }
if (offset == 0 && ISNONZERO_STATE(LOAD_STATE(&ctx->s))) { if (offset == 0 && ISNONZERO_STATE(LOAD_STATE(&ctx.s))) {
TESTEOD_REV_FN(limex, &ctx->s, offset, cb, context); TESTEOD_REV_FN(limex, &ctx.s, offset, cb, context);
} }
// NOTE: return value is unused. // NOTE: return value is unused.
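The queue entry points above now build their LimEx context on the stack instead of borrowing scratch->nfaContext, so a nested NFA invocation (the case the old nfaContextSom comment worked around) gets a private context, and the cache-line alignment asserts are no longer needed. A minimal sketch of the pattern with simplified stand-in types, not the real CONTEXT_T/STATE_T machinery:

```cpp
#include <cstdint>

// Simplified stand-in for the repeat-control/callback plumbing in CONTEXT_T.
struct ToyContext {
    char *repeat_ctrl;
    char *repeat_state;
    int (*callback)(uint64_t offset, uint32_t id, void *user);
    void *context;
    uint64_t cached_estate; // stands in for the cached exception state
    uint32_t cached_br;
};

static void runQueueSketch(char *streamState, uint32_t stateSize,
                           int (*cb)(uint64_t, uint32_t, void *), void *user) {
    ToyContext ctx;                              // lives only for this call
    ctx.repeat_ctrl = streamState;               // control block at the front
    ctx.repeat_state = streamState + stateSize;  // repeat state follows it
    ctx.callback = cb;
    ctx.context = user;
    ctx.cached_estate = 0;
    ctx.cached_br = 0;
    // ... scan with &ctx, then store the live state back to the queue ...
    (void)ctx;
}
```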

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -445,14 +445,15 @@ char mcclellanExec8_i_ni(const struct mcclellan *m, u8 *state, const u8 *buf,
} }
static really_inline static really_inline
void mcclellanCheckEOD(const struct NFA *nfa, u16 s, u64a offset, char mcclellanCheckEOD(const struct NFA *nfa, u16 s, u64a offset,
NfaCallback cb, void *ctxt) { NfaCallback cb, void *ctxt) {
const struct mcclellan *m = getImplNfa(nfa); const struct mcclellan *m = getImplNfa(nfa);
const struct mstate_aux *aux = get_aux(m, s); const struct mstate_aux *aux = get_aux(m, s);
if (aux->accept_eod) { if (!aux->accept_eod) {
doComplexReport(cb, ctxt, m, s, offset, 1, NULL, NULL); return MO_CONTINUE_MATCHING;
} }
return doComplexReport(cb, ctxt, m, s, offset, 1, NULL, NULL);
} }
static really_inline static really_inline
@ -1019,42 +1020,44 @@ void nfaExecMcClellan8_SimpStream(const struct NFA *nfa, char *state,
const u8 *buf, char top, size_t start_off, const u8 *buf, char top, size_t start_off,
size_t len, NfaCallback cb, void *ctxt) { size_t len, NfaCallback cb, void *ctxt) {
const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa);
if (top) {
*(u8 *)state = m->start_anchored; u8 s = top ? m->start_anchored : *(u8 *)state;
}
if (m->flags & MCCLELLAN_FLAG_SINGLE) { if (m->flags & MCCLELLAN_FLAG_SINGLE) {
mcclellanExec8_i(m, (u8 *)state, buf + start_off, len - start_off, mcclellanExec8_i(m, &s, buf + start_off, len - start_off,
start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT); start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT);
} else { } else {
mcclellanExec8_i(m, (u8 *)state, buf + start_off, len - start_off, mcclellanExec8_i(m, &s, buf + start_off, len - start_off,
start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT); start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT);
} }
*(u8 *)state = s;
} }
void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state, void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state,
const u8 *buf, char top, size_t start_off, const u8 *buf, char top, size_t start_off,
size_t len, NfaCallback cb, void *ctxt) { size_t len, NfaCallback cb, void *ctxt) {
const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa);
if (top) {
*(u16 *)state = m->start_anchored; u16 s = top ? m->start_anchored : unaligned_load_u16(state);
}
if (m->flags & MCCLELLAN_FLAG_SINGLE) { if (m->flags & MCCLELLAN_FLAG_SINGLE) {
mcclellanExec16_i(m, (u16 *)state, buf + start_off, len - start_off, mcclellanExec16_i(m, &s, buf + start_off, len - start_off,
start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT); start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT);
} else { } else {
mcclellanExec16_i(m, (u16 *)state, buf + start_off, len - start_off, mcclellanExec16_i(m, &s, buf + start_off, len - start_off,
start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT); start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT);
} }
unaligned_store_u16(state, s);
} }
char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state, char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state,
UNUSED const char *streamState, UNUSED const char *streamState,
u64a offset, NfaCallback callback, u64a offset, NfaCallback callback,
UNUSED SomNfaCallback som_cb, void *context) { UNUSED SomNfaCallback som_cb, void *context) {
mcclellanCheckEOD(nfa, *(const u8 *)state, offset, callback, context); return mcclellanCheckEOD(nfa, *(const u8 *)state, offset, callback,
return 0; context);
} }
char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state, char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state,
@ -1062,8 +1065,8 @@ char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state,
u64a offset, NfaCallback callback, u64a offset, NfaCallback callback,
UNUSED SomNfaCallback som_cb, void *context) { UNUSED SomNfaCallback som_cb, void *context) {
assert(ISALIGNED_N(state, 2)); assert(ISALIGNED_N(state, 2));
mcclellanCheckEOD(nfa, *(const u16 *)state, offset, callback, context); return mcclellanCheckEOD(nfa, *(const u16 *)state, offset, callback,
return 0; context);
} }
char nfaExecMcClellan8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { char nfaExecMcClellan8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) {
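Two things change in the stream helpers above: mcclellanCheckEOD now returns the callback's continue/halt status instead of swallowing it, and nfaExecMcClellan16_SimpStream stops writing the 16-bit state through a cast pointer, round-tripping it with unaligned load/store helpers since the stream-state buffer carries no alignment guarantee. A self-contained sketch of that second pattern (memcpy being the portable spelling of the unaligned helpers used here):

```cpp
#include <cstdint>
#include <cstring>

static inline uint16_t load_u16_unaligned(const void *p) {
    uint16_t v;
    std::memcpy(&v, p, sizeof(v)); // safe regardless of the buffer's alignment
    return v;
}

static inline void store_u16_unaligned(void *p, uint16_t v) {
    std::memcpy(p, &v, sizeof(v));
}

static void simpStreamSketch(char *state, bool top, uint16_t start_anchored) {
    // On a top event the DFA restarts from its anchored start state;
    // otherwise the previous state is read back out of the stream buffer.
    uint16_t s = top ? start_anchored : load_u16_unaligned(state);
    // ... run the DFA over the buffer here, updating s ...
    store_u16_unaligned(state, s);
}
```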

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -29,8 +29,11 @@
#include "mcclellancompile.h" #include "mcclellancompile.h"
#include "accel.h" #include "accel.h"
#include "accelcompile.h"
#include "grey.h" #include "grey.h"
#include "mcclellan_internal.h" #include "mcclellan_internal.h"
#include "mcclellancompile_accel.h"
#include "mcclellancompile_util.h"
#include "nfa_internal.h" #include "nfa_internal.h"
#include "shufticompile.h" #include "shufticompile.h"
#include "trufflecompile.h" #include "trufflecompile.h"
@ -43,6 +46,7 @@
#include "util/container.h" #include "util/container.h"
#include "util/make_unique.h" #include "util/make_unique.h"
#include "util/order_check.h" #include "util/order_check.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h" #include "util/ue2_containers.h"
#include "util/unaligned.h" #include "util/unaligned.h"
#include "util/verify_types.h" #include "util/verify_types.h"
@ -56,25 +60,18 @@
#include <set> #include <set>
#include <vector> #include <vector>
#include <boost/range/adaptor/map.hpp>
using namespace std; using namespace std;
using boost::adaptors::map_keys;
namespace ue2 { namespace ue2 {
/* compile time accel defs */
#define ACCEL_MAX_STOP_CHAR 160 /* larger than nfa, as we don't have a budget
and the nfa cheats on stop characters for
sets of states */
#define ACCEL_MAX_FLOATING_STOP_CHAR 192 /* accelerating sds is important */
namespace /* anon */ { namespace /* anon */ {
struct dstate_extra { struct dstate_extra {
u16 daddytaken; u16 daddytaken = 0;
bool shermanState; bool shermanState = false;
bool accelerable;
dstate_extra(void) : daddytaken(0), shermanState(false),
accelerable(false) {}
}; };
struct dfa_info { struct dfa_info {
@ -105,10 +102,6 @@ struct dfa_info {
return extra[raw_id].shermanState; return extra[raw_id].shermanState;
} }
bool is_accel(dstate_id_t raw_id) const {
return extra[raw_id].accelerable;
}
size_t size(void) const { return states.size(); } size_t size(void) const { return states.size(); }
}; };
@ -135,6 +128,13 @@ mstate_aux *getAux(NFA *n, dstate_id_t i) {
return aux; return aux;
} }
static
bool double_byte_ok(const AccelScheme &info) {
return !info.double_byte.empty()
&& info.double_cr.count() < info.double_byte.size()
&& info.double_cr.count() <= 2 && !info.double_byte.empty();
}
static static
void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) { void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) {
assert((size_t)succ_table % 2 == 0); assert((size_t)succ_table % 2 == 0);
@ -186,75 +186,45 @@ void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) {
} }
} }
void mcclellan_build_strat::find_escape_strings(dstate_id_t this_idx, u32 mcclellan_build_strat::max_allowed_offset_accel() const {
escape_info *out) const { return ACCEL_DFA_MAX_OFFSET_DEPTH;
const dstate &raw = rdfa.states[this_idx]; }
const auto &alpha_remap = rdfa.alpha_remap;
flat_set<pair<u8, u8>> outs2_local; AccelScheme mcclellan_build_strat::find_escape_strings(dstate_id_t this_idx)
for (unsigned i = 0; i < N_CHARS; i++) { const {
outs2_local.clear(); return find_mcclellan_escape_info(rdfa, this_idx,
max_allowed_offset_accel());
if (raw.next[alpha_remap[i]] != this_idx) {
out->outs.set(i);
DEBUG_PRINTF("next is %hu\n", raw.next[alpha_remap[i]]);
const dstate &raw_next = rdfa.states[raw.next[alpha_remap[i]]];
if (!raw_next.reports.empty() && generates_callbacks(rdfa.kind)) {
DEBUG_PRINTF("leads to report\n");
out->outs2_broken = true; /* cannot accelerate over reports */
}
for (unsigned j = 0; !out->outs2_broken && j < N_CHARS; j++) {
if (raw_next.next[alpha_remap[j]] == raw.next[alpha_remap[j]]) {
continue;
}
DEBUG_PRINTF("adding %02x %02x -> %hu to 2 \n", i, j,
raw_next.next[alpha_remap[j]]);
outs2_local.emplace((u8)i, (u8)j);
}
if (outs2_local.size() > 8) {
DEBUG_PRINTF("adding %02x to outs2_single\n", i);
out->outs2_single.set(i);
} else {
insert(&out->outs2, outs2_local);
}
if (out->outs2.size() > 8) {
DEBUG_PRINTF("outs2 too big\n");
out->outs2_broken = true;
}
}
}
} }
/** builds acceleration schemes for states */ /** builds acceleration schemes for states */
void mcclellan_build_strat::buildAccel(dstate_id_t this_idx, void *accel_out) { void mcclellan_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
const AccelScheme &info,
void *accel_out) {
AccelAux *accel = (AccelAux *)accel_out; AccelAux *accel = (AccelAux *)accel_out;
escape_info out;
find_escape_strings(this_idx, &out); DEBUG_PRINTF("acceleration scheme has offset s%u/d%u\n", info.offset,
info.double_offset);
accel->generic.offset = verify_u8(info.offset);
if (!out.outs2_broken && out.outs2_single.none() if (double_byte_ok(info) && info.double_cr.none()
&& out.outs2.size() == 1) { && info.double_byte.size() == 1) {
accel->accel_type = ACCEL_DVERM; accel->accel_type = ACCEL_DVERM;
accel->dverm.c1 = out.outs2.begin()->first; accel->dverm.c1 = info.double_byte.begin()->first;
accel->dverm.c2 = out.outs2.begin()->second; accel->dverm.c2 = info.double_byte.begin()->second;
accel->dverm.offset = verify_u8(info.double_offset);
DEBUG_PRINTF("state %hu is double vermicelli\n", this_idx); DEBUG_PRINTF("state %hu is double vermicelli\n", this_idx);
return; return;
} }
if (!out.outs2_broken && out.outs2_single.none() if (double_byte_ok(info) && info.double_cr.none()
&& (out.outs2.size() == 2 || out.outs2.size() == 4)) { && (info.double_byte.size() == 2 || info.double_byte.size() == 4)) {
bool ok = true; bool ok = true;
assert(!out.outs2.empty()); assert(!info.double_byte.empty());
u8 firstC = out.outs2.begin()->first & CASE_CLEAR; u8 firstC = info.double_byte.begin()->first & CASE_CLEAR;
u8 secondC = out.outs2.begin()->second & CASE_CLEAR; u8 secondC = info.double_byte.begin()->second & CASE_CLEAR;
for (const pair<u8, u8> &p : out.outs2) { for (const pair<u8, u8> &p : info.double_byte) {
if ((p.first & CASE_CLEAR) != firstC if ((p.first & CASE_CLEAR) != firstC
|| (p.second & CASE_CLEAR) != secondC) { || (p.second & CASE_CLEAR) != secondC) {
ok = false; ok = false;
@ -266,185 +236,76 @@ void mcclellan_build_strat::buildAccel(dstate_id_t this_idx, void *accel_out) {
accel->accel_type = ACCEL_DVERM_NOCASE; accel->accel_type = ACCEL_DVERM_NOCASE;
accel->dverm.c1 = firstC; accel->dverm.c1 = firstC;
accel->dverm.c2 = secondC; accel->dverm.c2 = secondC;
accel->dverm.offset = verify_u8(info.double_offset);
DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx); DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx);
return; return;
} }
u8 m1;
u8 m2;
if (buildDvermMask(info.double_byte, &m1, &m2)) {
accel->accel_type = ACCEL_DVERM_MASKED;
accel->dverm.offset = verify_u8(info.double_offset);
accel->dverm.c1 = info.double_byte.begin()->first & m1;
accel->dverm.c2 = info.double_byte.begin()->second & m2;
accel->dverm.m1 = m1;
accel->dverm.m2 = m2;
DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n",
accel->dverm.c1, accel->dverm.c2);
return;
}
} }
if (!out.outs2_broken && if (double_byte_ok(info)
(out.outs2_single.count() + out.outs2.size()) <= 8 && && shuftiBuildDoubleMasks(info.double_cr, info.double_byte,
out.outs2_single.count() < out.outs2.size() && &accel->dshufti.lo1, &accel->dshufti.hi1,
out.outs2_single.count() <= 2 && !out.outs2.empty()) { &accel->dshufti.lo2, &accel->dshufti.hi2)) {
accel->accel_type = ACCEL_DSHUFTI; accel->accel_type = ACCEL_DSHUFTI;
shuftiBuildDoubleMasks(out.outs2_single, out.outs2, accel->dshufti.offset = verify_u8(info.double_offset);
&accel->dshufti.lo1,
&accel->dshufti.hi1,
&accel->dshufti.lo2,
&accel->dshufti.hi2);
DEBUG_PRINTF("state %hu is double shufti\n", this_idx); DEBUG_PRINTF("state %hu is double shufti\n", this_idx);
return; return;
} }
if (out.outs.none()) { if (info.cr.none()) {
accel->accel_type = ACCEL_RED_TAPE; accel->accel_type = ACCEL_RED_TAPE;
DEBUG_PRINTF("state %hu is a dead end full of bureaucratic red tape" DEBUG_PRINTF("state %hu is a dead end full of bureaucratic red tape"
" from which there is no escape\n", this_idx); " from which there is no escape\n", this_idx);
return; return;
} }
if (out.outs.count() == 1) { if (info.cr.count() == 1) {
accel->accel_type = ACCEL_VERM; accel->accel_type = ACCEL_VERM;
accel->verm.c = out.outs.find_first(); accel->verm.c = info.cr.find_first();
DEBUG_PRINTF("state %hu is vermicelli\n", this_idx); DEBUG_PRINTF("state %hu is vermicelli\n", this_idx);
return; return;
} }
if (out.outs.count() == 2 && out.outs.isCaselessChar()) { if (info.cr.count() == 2 && info.cr.isCaselessChar()) {
accel->accel_type = ACCEL_VERM_NOCASE; accel->accel_type = ACCEL_VERM_NOCASE;
accel->verm.c = out.outs.find_first() & CASE_CLEAR; accel->verm.c = info.cr.find_first() & CASE_CLEAR;
DEBUG_PRINTF("state %hu is caseless vermicelli\n", this_idx); DEBUG_PRINTF("state %hu is caseless vermicelli\n", this_idx);
return; return;
} }
if (out.outs.count() > ACCEL_MAX_FLOATING_STOP_CHAR) { if (info.cr.count() > ACCEL_DFA_MAX_FLOATING_STOP_CHAR) {
accel->accel_type = ACCEL_NONE; accel->accel_type = ACCEL_NONE;
DEBUG_PRINTF("state %hu is too broad\n", this_idx); DEBUG_PRINTF("state %hu is too broad\n", this_idx);
return; return;
} }
accel->accel_type = ACCEL_SHUFTI; accel->accel_type = ACCEL_SHUFTI;
if (-1 != shuftiBuildMasks(out.outs, &accel->shufti.lo, if (-1 != shuftiBuildMasks(info.cr, &accel->shufti.lo,
&accel->shufti.hi)) { &accel->shufti.hi)) {
DEBUG_PRINTF("state %hu is shufti\n", this_idx); DEBUG_PRINTF("state %hu is shufti\n", this_idx);
return; return;
} }
assert(!out.outs.none()); assert(!info.cr.none());
accel->accel_type = ACCEL_TRUFFLE; accel->accel_type = ACCEL_TRUFFLE;
truffleBuildMasks(out.outs, &accel->truffle.mask1, &accel->truffle.mask2); truffleBuildMasks(info.cr, &accel->truffle.mask1, &accel->truffle.mask2);
DEBUG_PRINTF("state %hu is truffle\n", this_idx); DEBUG_PRINTF("state %hu is truffle\n", this_idx);
} }
static
bool is_accel(const raw_dfa &raw, dstate_id_t sds_or_proxy,
dstate_id_t this_idx) {
if (!this_idx /* dead state is not accelerable */) {
return false;
}
/* Note on report acceleration states: While we can't accelerate while we
* are spamming out callbacks, the QR code paths don't raise reports
* during scanning so they can accelerate report states. */
if (generates_callbacks(raw.kind)
&& !raw.states[this_idx].reports.empty()) {
return false;
}
size_t single_limit = this_idx == sds_or_proxy ?
ACCEL_MAX_FLOATING_STOP_CHAR : ACCEL_MAX_STOP_CHAR;
DEBUG_PRINTF("inspecting %hu/%hu: %zu\n", this_idx, sds_or_proxy,
single_limit);
CharReach out;
for (u32 i = 0; i < N_CHARS; i++) {
if (raw.states[this_idx].next[raw.alpha_remap[i]] != this_idx) {
out.set(i);
}
}
if (out.count() <= single_limit) {
DEBUG_PRINTF("state %hu should be accelerable %zu\n", this_idx,
out.count());
return true;
}
DEBUG_PRINTF("state %hu is not accelerable has %zu\n", this_idx,
out.count());
return false;
}
static
bool has_self_loop(dstate_id_t s, const raw_dfa &raw) {
u16 top_remap = raw.alpha_remap[TOP];
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
if (i != top_remap && raw.states[s].next[i] == s) {
return true;
}
}
return false;
}
static
dstate_id_t get_sds_or_proxy(const raw_dfa &raw) {
if (raw.start_floating != DEAD_STATE) {
DEBUG_PRINTF("has floating start\n");
return raw.start_floating;
}
DEBUG_PRINTF("looking for SDS proxy\n");
dstate_id_t s = raw.start_anchored;
if (has_self_loop(s, raw)) {
return s;
}
u16 top_remap = raw.alpha_remap[TOP];
ue2::unordered_set<dstate_id_t> seen;
while (true) {
seen.insert(s);
DEBUG_PRINTF("basis %hu\n", s);
/* check if we are connected to a state with a self loop */
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
dstate_id_t t = raw.states[s].next[i];
if (i != top_remap && t != DEAD_STATE && has_self_loop(t, raw)) {
return t;
}
}
/* find a neighbour to use as a basis for looking for the sds proxy */
dstate_id_t t = DEAD_STATE;
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
dstate_id_t tt = raw.states[s].next[i];
if (i != top_remap && tt != DEAD_STATE && !contains(seen, tt)) {
t = tt;
break;
}
}
if (t == DEAD_STATE) {
/* we were unable to find a state to use as a SDS proxy */
return DEAD_STATE;
}
s = t;
seen.insert(t);
}
}
static
void populateAccelerationInfo(dfa_info &info, u32 *ac, const Grey &grey) {
*ac = 0; /* number of accelerable states */
if (!grey.accelerateDFA) {
return;
}
dstate_id_t sds_proxy = get_sds_or_proxy(info.raw);
DEBUG_PRINTF("sds %hu\n", sds_proxy);
for (size_t i = 0; i < info.size(); i++) {
if (is_accel(info.raw, sds_proxy, i)) {
++*ac;
info.extra[i].accelerable = true;
}
}
}
static static
void populateBasicInfo(size_t state_size, const dfa_info &info, void populateBasicInfo(size_t state_size, const dfa_info &info,
u32 total_size, u32 aux_offset, u32 accel_offset, u32 total_size, u32 aux_offset, u32 accel_offset,
@ -496,8 +357,16 @@ namespace {
struct raw_report_list { struct raw_report_list {
flat_set<ReportID> reports; flat_set<ReportID> reports;
explicit raw_report_list(const flat_set<ReportID> &reports_in) raw_report_list(const flat_set<ReportID> &reports_in,
: reports(reports_in) {} const ReportManager &rm, bool do_remap) {
if (do_remap) {
for (auto &id : reports_in) {
reports.insert(rm.getProgramOffset(id));
}
} else {
reports = reports_in;
}
}
bool operator<(const raw_report_list &b) const { bool operator<(const raw_report_list &b) const {
return reports < b.reports; return reports < b.reports;
@ -520,6 +389,8 @@ unique_ptr<raw_report_info> mcclellan_build_strat::gatherReports(
ReportID *arbReport) const { ReportID *arbReport) const {
DEBUG_PRINTF("gathering reports\n"); DEBUG_PRINTF("gathering reports\n");
const bool remap_reports = has_managed_reports(rdfa.kind);
auto ri = ue2::make_unique<raw_report_info_impl>(); auto ri = ue2::make_unique<raw_report_info_impl>();
map<raw_report_list, u32> rev; map<raw_report_list, u32> rev;
@ -529,7 +400,7 @@ unique_ptr<raw_report_info> mcclellan_build_strat::gatherReports(
continue; continue;
} }
raw_report_list rrl(s.reports); raw_report_list rrl(s.reports, rm, remap_reports);
DEBUG_PRINTF("non empty r\n"); DEBUG_PRINTF("non empty r\n");
if (rev.find(rrl) != rev.end()) { if (rev.find(rrl) != rev.end()) {
reports.push_back(rev[rrl]); reports.push_back(rev[rrl]);
@ -548,7 +419,7 @@ unique_ptr<raw_report_info> mcclellan_build_strat::gatherReports(
} }
DEBUG_PRINTF("non empty r eod\n"); DEBUG_PRINTF("non empty r eod\n");
raw_report_list rrl(s.reports_eod); raw_report_list rrl(s.reports_eod, rm, remap_reports);
if (rev.find(rrl) != rev.end()) { if (rev.find(rrl) != rev.end()) {
reports_eod.push_back(rev[rrl]); reports_eod.push_back(rev[rrl]);
continue; continue;
@ -625,6 +496,14 @@ void raw_report_info_impl::fillReportLists(NFA *n, size_t base_offset,
} }
} }
static
void fillAccelOut(const map<dstate_id_t, AccelScheme> &accel_escape_info,
set<dstate_id_t> *accel_states) {
for (dstate_id_t i : accel_escape_info | map_keys) {
accel_states->insert(i);
}
}
static static
size_t calcShermanRegionSize(const dfa_info &info) { size_t calcShermanRegionSize(const dfa_info &info) {
size_t rv = 0; size_t rv = 0;
@ -692,14 +571,14 @@ int allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) {
static static
aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info, aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
const CompileContext &cc) { const CompileContext &cc,
set<dstate_id_t> *accel_states) {
DEBUG_PRINTF("building mcclellan 16\n"); DEBUG_PRINTF("building mcclellan 16\n");
vector<u32> reports; /* index in ri for the appropriate report list */ vector<u32> reports; /* index in ri for the appropriate report list */
vector<u32> reports_eod; /* as above */ vector<u32> reports_eod; /* as above */
ReportID arb; ReportID arb;
u8 single; u8 single;
u32 accelCount;
u8 alphaShift = info.getAlphaShift(); u8 alphaShift = info.getAlphaShift();
assert(alphaShift <= 8); assert(alphaShift <= 8);
@ -711,9 +590,9 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
return nullptr; return nullptr;
} }
unique_ptr<raw_report_info> ri auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
= info.strat.gatherReports(reports, reports_eod, &single, &arb); map<dstate_id_t, AccelScheme> accel_escape_info
populateAccelerationInfo(info, &accelCount, cc.grey); = populateAccelerationInfo(info.raw, info.strat, cc.grey);
size_t tran_size = (1 << info.getAlphaShift()) size_t tran_size = (1 << info.getAlphaShift())
* sizeof(u16) * count_real_states; * sizeof(u16) * count_real_states;
@ -721,7 +600,7 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
size_t aux_size = sizeof(mstate_aux) * info.size(); size_t aux_size = sizeof(mstate_aux) * info.size();
size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcclellan) + tran_size); size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcclellan) + tran_size);
size_t accel_size = info.strat.accelSize() * accelCount; size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
size_t accel_offset = ROUNDUP_N(aux_offset + aux_size size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+ ri->getReportListSize(), 32); + ri->getReportListSize(), 32);
size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size); size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size);
@ -736,7 +615,7 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
char *nfa_base = (char *)nfa.get(); char *nfa_base = (char *)nfa.get();
populateBasicInfo(sizeof(u16), info, total_size, aux_offset, accel_offset, populateBasicInfo(sizeof(u16), info, total_size, aux_offset, accel_offset,
accelCount, arb, single, nfa.get()); accel_escape_info.size(), arb, single, nfa.get());
vector<u32> reportOffsets; vector<u32> reportOffsets;
@ -769,12 +648,12 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
fillInAux(&aux[fs], i, info, reports, reports_eod, reportOffsets); fillInAux(&aux[fs], i, info, reports, reports_eod, reportOffsets);
if (info.is_accel(i)) { if (contains(accel_escape_info, i)) {
this_aux->accel_offset = accel_offset; this_aux->accel_offset = accel_offset;
accel_offset += info.strat.accelSize(); accel_offset += info.strat.accelSize();
assert(accel_offset + sizeof(NFA) <= sherman_offset); assert(accel_offset + sizeof(NFA) <= sherman_offset);
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
info.strat.buildAccel(i, info.strat.buildAccel(i, accel_escape_info.at(i),
(void *)((char *)m + this_aux->accel_offset)); (void *)((char *)m + this_aux->accel_offset));
} }
} }
@ -798,12 +677,12 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
fillInAux(this_aux, i, info, reports, reports_eod, reportOffsets); fillInAux(this_aux, i, info, reports, reports_eod, reportOffsets);
if (info.is_accel(i)) { if (contains(accel_escape_info, i)) {
this_aux->accel_offset = accel_offset; this_aux->accel_offset = accel_offset;
accel_offset += info.strat.accelSize(); accel_offset += info.strat.accelSize();
assert(accel_offset + sizeof(NFA) <= sherman_offset); assert(accel_offset + sizeof(NFA) <= sherman_offset);
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
info.strat.buildAccel(i, info.strat.buildAccel(i, accel_escape_info.at(i),
(void *)((char *)m + this_aux->accel_offset)); (void *)((char *)m + this_aux->accel_offset));
} }
@ -836,6 +715,10 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
markEdges(nfa.get(), succ_table, info); markEdges(nfa.get(), succ_table, info);
if (accel_states && nfa) {
fillAccelOut(accel_escape_info, accel_states);
}
return nfa; return nfa;
} }
@ -874,7 +757,9 @@ void fillInBasicState8(const dfa_info &info, mstate_aux *aux, u8 *succ_table,
} }
static static
void allocateFSN8(dfa_info &info, u16 *accel_limit, u16 *accept_limit) { void allocateFSN8(dfa_info &info,
const map<dstate_id_t, AccelScheme> &accel_escape_info,
u16 *accel_limit, u16 *accept_limit) {
info.states[0].impl_id = 0; /* dead is always 0 */ info.states[0].impl_id = 0; /* dead is always 0 */
vector<dstate_id_t> norm; vector<dstate_id_t> norm;
@ -886,7 +771,7 @@ void allocateFSN8(dfa_info &info, u16 *accel_limit, u16 *accept_limit) {
for (u32 i = 1; i < info.size(); i++) { for (u32 i = 1; i < info.size(); i++) {
if (!info.states[i].reports.empty()) { if (!info.states[i].reports.empty()) {
accept.push_back(i); accept.push_back(i);
} else if (info.is_accel(i)) { } else if (contains(accel_escape_info, i)) {
accel.push_back(i); accel.push_back(i);
} else { } else {
norm.push_back(i); norm.push_back(i);
@ -915,23 +800,23 @@ void allocateFSN8(dfa_info &info, u16 *accel_limit, u16 *accept_limit) {
static static
aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info, aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info,
const CompileContext &cc) { const CompileContext &cc,
set<dstate_id_t> *accel_states) {
DEBUG_PRINTF("building mcclellan 8\n"); DEBUG_PRINTF("building mcclellan 8\n");
vector<u32> reports; vector<u32> reports;
vector<u32> reports_eod; vector<u32> reports_eod;
ReportID arb; ReportID arb;
u8 single; u8 single;
u32 accelCount;
unique_ptr<raw_report_info> ri auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
= info.strat.gatherReports(reports, reports_eod, &single, &arb); map<dstate_id_t, AccelScheme> accel_escape_info
populateAccelerationInfo(info, &accelCount, cc.grey); = populateAccelerationInfo(info.raw, info.strat, cc.grey);
size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * info.size(); size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * info.size();
size_t aux_size = sizeof(mstate_aux) * info.size(); size_t aux_size = sizeof(mstate_aux) * info.size();
size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcclellan) + tran_size); size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcclellan) + tran_size);
size_t accel_size = info.strat.accelSize() * accelCount; size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
size_t accel_offset = ROUNDUP_N(aux_offset + aux_size size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+ ri->getReportListSize(), 32); + ri->getReportListSize(), 32);
size_t total_size = accel_offset + accel_size; size_t total_size = accel_offset + accel_size;
@ -951,9 +836,9 @@ aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info,
mcclellan *m = (mcclellan *)getMutableImplNfa(nfa.get()); mcclellan *m = (mcclellan *)getMutableImplNfa(nfa.get());
allocateFSN8(info, &m->accel_limit_8, &m->accept_limit_8); allocateFSN8(info, accel_escape_info, &m->accel_limit_8, &m->accept_limit_8);
populateBasicInfo(sizeof(u8), info, total_size, aux_offset, accel_offset, populateBasicInfo(sizeof(u8), info, total_size, aux_offset, accel_offset,
accelCount, arb, single, nfa.get()); accel_escape_info.size(), arb, single, nfa.get());
vector<u32> reportOffsets; vector<u32> reportOffsets;
@ -964,13 +849,14 @@ aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info,
mstate_aux *aux = (mstate_aux *)(nfa_base + aux_offset); mstate_aux *aux = (mstate_aux *)(nfa_base + aux_offset);
for (size_t i = 0; i < info.size(); i++) { for (size_t i = 0; i < info.size(); i++) {
if (info.is_accel(i)) { if (contains(accel_escape_info, i)) {
u32 j = info.implId(i); u32 j = info.implId(i);
aux[j].accel_offset = accel_offset; aux[j].accel_offset = accel_offset;
accel_offset += info.strat.accelSize(); accel_offset += info.strat.accelSize();
info.strat.buildAccel(i, (void *)((char *)m + aux[j].accel_offset)); info.strat.buildAccel(i, accel_escape_info.at(i),
(void *)((char *)m + aux[j].accel_offset));
} }
fillInBasicState8(info, aux, succ_table, reportOffsets, reports, fillInBasicState8(info, aux, succ_table, reportOffsets, reports,
@ -981,6 +867,10 @@ aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info,
DEBUG_PRINTF("rl size %zu\n", ri->size()); DEBUG_PRINTF("rl size %zu\n", ri->size());
if (accel_states && nfa) {
fillAccelOut(accel_escape_info, accel_states);
}
return nfa; return nfa;
} }
@ -1163,15 +1053,6 @@ bool is_cyclic_near(const raw_dfa &raw, dstate_id_t root) {
return false; return false;
} }
static
void fillAccelOut(const dfa_info &info, set<dstate_id_t> *accel_states) {
for (size_t i = 0; i < info.size(); i++) {
if (info.is_accel(i)) {
accel_states->insert(i);
}
}
}
aligned_unique_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, dfa_build_strat &strat, aligned_unique_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, dfa_build_strat &strat,
const CompileContext &cc, const CompileContext &cc,
set<dstate_id_t> *accel_states) { set<dstate_id_t> *accel_states) {
@ -1200,26 +1081,23 @@ aligned_unique_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, dfa_build_strat &strat,
aligned_unique_ptr<NFA> nfa; aligned_unique_ptr<NFA> nfa;
if (!using8bit) { if (!using8bit) {
nfa = mcclellanCompile16(info, cc); nfa = mcclellanCompile16(info, cc, accel_states);
} else { } else {
nfa = mcclellanCompile8(info, cc); nfa = mcclellanCompile8(info, cc, accel_states);
} }
if (has_eod_reports) { if (has_eod_reports) {
nfa->flags |= NFA_ACCEPTS_EOD; nfa->flags |= NFA_ACCEPTS_EOD;
} }
if (accel_states && nfa) {
fillAccelOut(info, accel_states);
}
DEBUG_PRINTF("compile done\n"); DEBUG_PRINTF("compile done\n");
return nfa; return nfa;
} }
aligned_unique_ptr<NFA> mcclellanCompile(raw_dfa &raw, const CompileContext &cc, aligned_unique_ptr<NFA> mcclellanCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm,
set<dstate_id_t> *accel_states) { set<dstate_id_t> *accel_states) {
mcclellan_build_strat mbs(raw); mcclellan_build_strat mbs(raw, rm);
return mcclellanCompile_i(raw, mbs, cc, accel_states); return mcclellanCompile_i(raw, mbs, cc, accel_states);
} }
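The compile path above now performs acceleration analysis once, producing a map from state id to AccelScheme, and both the 8-bit and 16-bit builders simply look states up in that map and hand the scheme to buildAccel. A toy sketch of the flow, with simplified stand-in types rather than the real dfa_info/AccelScheme:

```cpp
#include <cstdint>
#include <map>
#include <set>

struct ToyScheme { unsigned escape_count; };
constexpr unsigned kMaxStopChars = 160; // stand-in for ACCEL_DFA_MAX_STOP_CHAR

// Analysis pass: one scheme per state that stays under the escape budget.
static std::map<uint16_t, ToyScheme>
analyseAccel(const std::map<uint16_t, unsigned> &escapesPerState) {
    std::map<uint16_t, ToyScheme> out;
    for (const auto &e : escapesPerState) {
        if (e.second <= kMaxStopChars) {
            out[e.first] = ToyScheme{e.second};
        }
    }
    return out;
}

// Build pass: only states present in the map get an accel structure; the same
// map is what fillAccelOut() copies into the caller's accel_states set.
static void buildSketch(const std::map<uint16_t, ToyScheme> &schemes,
                        std::set<uint16_t> *accel_states) {
    for (const auto &e : schemes) {
        // buildAccel(e.first, e.second, ...) would emit the AccelAux here.
        if (accel_states) {
            accel_states->insert(e.first);
        }
    }
}
```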

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -31,6 +31,7 @@
#include "rdfa.h" #include "rdfa.h"
#include "ue2common.h" #include "ue2common.h"
#include "util/accel_scheme.h"
#include "util/alloc.h" #include "util/alloc.h"
#include "util/charreach.h" #include "util/charreach.h"
#include "util/ue2_containers.h" #include "util/ue2_containers.h"
@ -43,6 +44,7 @@ struct NFA;
namespace ue2 { namespace ue2 {
class ReportManager;
struct CompileContext; struct CompileContext;
struct raw_report_info { struct raw_report_info {
@ -54,15 +56,9 @@ struct raw_report_info {
std::vector<u32> &ro /* out */) const = 0; std::vector<u32> &ro /* out */) const = 0;
}; };
struct escape_info {
CharReach outs;
CharReach outs2_single;
flat_set<std::pair<u8, u8>> outs2;
bool outs2_broken = false;
};
class dfa_build_strat { class dfa_build_strat {
public: public:
explicit dfa_build_strat(const ReportManager &rm_in) : rm(rm_in) {}
virtual ~dfa_build_strat(); virtual ~dfa_build_strat();
virtual raw_dfa &get_raw() const = 0; virtual raw_dfa &get_raw() const = 0;
virtual std::unique_ptr<raw_report_info> gatherReports( virtual std::unique_ptr<raw_report_info> gatherReports(
@ -70,25 +66,29 @@ public:
std::vector<u32> &reports_eod /* out */, std::vector<u32> &reports_eod /* out */,
u8 *isSingleReport /* out */, u8 *isSingleReport /* out */,
ReportID *arbReport /* out */) const = 0; ReportID *arbReport /* out */) const = 0;
virtual void find_escape_strings(dstate_id_t this_idx, virtual AccelScheme find_escape_strings(dstate_id_t this_idx) const = 0;
escape_info *out) const = 0;
virtual size_t accelSize(void) const = 0; virtual size_t accelSize(void) const = 0;
virtual void buildAccel(dstate_id_t this_idx, void *accel_out) = 0; virtual void buildAccel(dstate_id_t this_idx, const AccelScheme &info,
void *accel_out) = 0;
protected:
const ReportManager &rm;
}; };
class mcclellan_build_strat : public dfa_build_strat { class mcclellan_build_strat : public dfa_build_strat {
public: public:
explicit mcclellan_build_strat(raw_dfa &r) : rdfa(r) {} mcclellan_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in)
: dfa_build_strat(rm_in), rdfa(rdfa_in) {}
raw_dfa &get_raw() const override { return rdfa; } raw_dfa &get_raw() const override { return rdfa; }
std::unique_ptr<raw_report_info> gatherReports( std::unique_ptr<raw_report_info> gatherReports(
std::vector<u32> &reports /* out */, std::vector<u32> &reports /* out */,
std::vector<u32> &reports_eod /* out */, std::vector<u32> &reports_eod /* out */,
u8 *isSingleReport /* out */, u8 *isSingleReport /* out */,
ReportID *arbReport /* out */) const override; ReportID *arbReport /* out */) const override;
void find_escape_strings(dstate_id_t this_idx, AccelScheme find_escape_strings(dstate_id_t this_idx) const override;
escape_info *out) const override;
size_t accelSize(void) const override; size_t accelSize(void) const override;
void buildAccel(dstate_id_t this_idx, void *accel_out) override; void buildAccel(dstate_id_t this_idx, const AccelScheme &info,
void *accel_out) override;
virtual u32 max_allowed_offset_accel() const;
private: private:
raw_dfa &rdfa; raw_dfa &rdfa;
@ -98,6 +98,7 @@ private:
* states */ * states */
ue2::aligned_unique_ptr<NFA> ue2::aligned_unique_ptr<NFA>
mcclellanCompile(raw_dfa &raw, const CompileContext &cc, mcclellanCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm,
std::set<dstate_id_t> *accel_states = nullptr); std::set<dstate_id_t> *accel_states = nullptr);
/* used internally by mcclellan/haig/gough compile process */ /* used internally by mcclellan/haig/gough compile process */

View File

@ -0,0 +1,422 @@
/*
* Copyright (c) 2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "mcclellancompile_accel.h"
#include "mcclellancompile_util.h"
#include "grey.h"
#include "nfagraph/ng_limex_accel.h"
#include "util/charreach.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include <vector>
#include <sstream>
#define PATHS_LIMIT 500
using namespace std;
namespace ue2 {
namespace {
struct path {
vector<CharReach> reach;
dstate_id_t dest = DEAD_STATE;
explicit path(dstate_id_t base) : dest(base) {}
};
}
static UNUSED
string describeClasses(const vector<CharReach> &v) {
std::ostringstream oss;
for (const auto &cr : v) {
describeClass(oss, cr);
}
return oss.str();
}
static
void dump_paths(const vector<path> &paths) {
for (UNUSED const auto &p : paths) {
DEBUG_PRINTF("[%s] -> %u\n", describeClasses(p.reach).c_str(), p.dest);
}
DEBUG_PRINTF("%zu paths\n", paths.size());
}
static
bool is_useful_path(const vector<path> &good, const path &p) {
for (const auto &g : good) {
assert(g.dest == p.dest);
assert(g.reach.size() <= p.reach.size());
auto git = g.reach.rbegin();
auto pit = p.reach.rbegin();
for (; git != g.reach.rend(); ++git, ++pit) {
if (!pit->isSubsetOf(*git)) {
goto next;
}
}
DEBUG_PRINTF("better: [%s] -> %u\n",
describeClasses(g.reach).c_str(), g.dest);
return false;
next:;
}
return true;
}
static
path append(const path &orig, const CharReach &cr, u32 new_dest) {
path p(new_dest);
p.reach = orig.reach;
p.reach.push_back(cr);
return p;
}
static
void extend(const raw_dfa &rdfa, const path &p,
map<u32, vector<path> > &all,
vector<path> &out) {
dstate s = rdfa.states[p.dest];
if (!p.reach.empty() && p.reach.back().none()) {
out.push_back(p);
return;
}
if (!s.reports.empty()) {
if (generates_callbacks(rdfa.kind)) {
out.push_back(p);
return;
} else {
path pp = append(p, CharReach(), p.dest);
all[p.dest].push_back(pp);
out.push_back(pp);
}
}
if (!s.reports_eod.empty()) {
path pp = append(p, CharReach(), p.dest);
all[p.dest].push_back(pp);
out.push_back(pp);
}
map<u32, CharReach> dest;
for (unsigned i = 0; i < N_CHARS; i++) {
u32 succ = s.next[rdfa.alpha_remap[i]];
dest[succ].set(i);
}
for (const auto &e : dest) {
path pp = append(p, e.second, e.first);
if (!is_useful_path(all[e.first], pp)) {
DEBUG_PRINTF("not useful: [%s] -> %u\n",
describeClasses(pp.reach).c_str(), pp.dest);
continue;
}
DEBUG_PRINTF("----good: [%s] -> %u\n",
describeClasses(pp.reach).c_str(), pp.dest);
all[e.first].push_back(pp);
out.push_back(pp);
}
}
static
vector<vector<CharReach> > generate_paths(const raw_dfa &rdfa, dstate_id_t base,
u32 len) {
vector<path> paths{ path(base) };
map<u32, vector<path> > all;
all[base].push_back(path(base));
for (u32 i = 0; i < len && paths.size() < PATHS_LIMIT; i++) {
vector<path> next_gen;
for (const auto &p : paths) {
extend(rdfa, p, all, next_gen);
}
paths = move(next_gen);
}
dump_paths(paths);
vector<vector<CharReach> > rv;
for (auto &p : paths) {
rv.push_back(move(p.reach));
}
return rv;
}
static
AccelScheme look_for_offset_accel(const raw_dfa &rdfa, dstate_id_t base,
u32 max_allowed_accel_offset) {
DEBUG_PRINTF("looking for accel for %hu\n", base);
vector<vector<CharReach> > paths = generate_paths(rdfa, base,
max_allowed_accel_offset + 1);
AccelScheme as = findBestAccelScheme(paths, CharReach(), true);
DEBUG_PRINTF("found %s + %u\n", describeClass(as.cr).c_str(), as.offset);
return as;
}
static
vector<u16> find_nonexit_symbols(const raw_dfa &rdfa,
const CharReach &escape) {
set<u16> rv;
CharReach nonexit = ~escape;
for (auto i = nonexit.find_first(); i != CharReach::npos;
i = nonexit.find_next(i)) {
rv.insert(rdfa.alpha_remap[i]);
}
return vector<u16>(rv.begin(), rv.end());
}
static
set<dstate_id_t> find_region(const raw_dfa &rdfa, dstate_id_t base,
const AccelScheme &ei) {
DEBUG_PRINTF("looking for region around %hu\n", base);
set<dstate_id_t> region = {base};
if (!ei.double_byte.empty()) {
return region;
}
DEBUG_PRINTF("accel %s+%u\n", describeClass(ei.cr).c_str(), ei.offset);
const CharReach &escape = ei.cr;
auto nonexit_symbols = find_nonexit_symbols(rdfa, escape);
vector<dstate_id_t> pending = {base};
while (!pending.empty()) {
dstate_id_t curr = pending.back();
pending.pop_back();
for (auto s : nonexit_symbols) {
dstate_id_t t = rdfa.states[curr].next[s];
if (contains(region, t)) {
continue;
}
DEBUG_PRINTF(" %hu is in region\n", t);
region.insert(t);
pending.push_back(t);
}
}
return region;
}
static
bool better(const AccelScheme &a, const AccelScheme &b) {
if (!a.double_byte.empty() && b.double_byte.empty()) {
return true;
}
if (!b.double_byte.empty()) {
return false;
}
return a.cr.count() < b.cr.count();
}
static
vector<CharReach> reverse_alpha_remapping(const raw_dfa &rdfa) {
vector<CharReach> rv(rdfa.alpha_size - 1); /* TOP not required */
for (u32 i = 0; i < N_CHARS; i++) {
rv.at(rdfa.alpha_remap[i]).set(i);
}
return rv;
}
map<dstate_id_t, AccelScheme> populateAccelerationInfo(const raw_dfa &rdfa,
const dfa_build_strat &strat,
const Grey &grey) {
map<dstate_id_t, AccelScheme> rv;
if (!grey.accelerateDFA) {
return rv;
}
dstate_id_t sds_proxy = get_sds_or_proxy(rdfa);
DEBUG_PRINTF("sds %hu\n", sds_proxy);
for (size_t i = 0; i < rdfa.states.size(); i++) {
if (i == DEAD_STATE) {
continue;
}
/* Note on report acceleration states: While we can't accelerate while we
* are spamming out callbacks, the QR code paths don't raise reports
* during scanning so they can accelerate report states. */
if (generates_callbacks(rdfa.kind) && !rdfa.states[i].reports.empty()) {
continue;
}
size_t single_limit = i == sds_proxy ? ACCEL_DFA_MAX_FLOATING_STOP_CHAR
: ACCEL_DFA_MAX_STOP_CHAR;
DEBUG_PRINTF("inspecting %zu/%hu: %zu\n", i, sds_proxy, single_limit);
AccelScheme ei = strat.find_escape_strings(i);
if (ei.cr.count() > single_limit) {
DEBUG_PRINTF("state %zu is not accelerable has %zu\n", i,
ei.cr.count());
continue;
}
DEBUG_PRINTF("state %zu should be accelerable %zu\n",
i, ei.cr.count());
rv[i] = ei;
}
/* provide acceleration states to states in the region of sds */
if (contains(rv, sds_proxy)) {
AccelScheme sds_ei = rv[sds_proxy];
sds_ei.double_byte.clear(); /* region based on single byte scheme
* may differ from double byte */
DEBUG_PRINTF("looking to expand offset accel to nearby states, %zu\n",
sds_ei.cr.count());
auto sds_region = find_region(rdfa, sds_proxy, sds_ei);
for (auto s : sds_region) {
if (!contains(rv, s) || better(sds_ei, rv[s])) {
rv[s] = sds_ei;
}
}
}
return rv;
}
static
bool double_byte_ok(const AccelScheme &info) {
return !info.double_byte.empty()
&& info.double_cr.count() < info.double_byte.size()
&& info.double_cr.count() <= 2 && !info.double_byte.empty();
}
AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, dstate_id_t this_idx,
u32 max_allowed_accel_offset) {
AccelScheme rv;
rv.cr.clear();
rv.offset = 0;
const dstate &raw = rdfa.states[this_idx];
const vector<CharReach> rev_map = reverse_alpha_remapping(rdfa);
bool outs2_broken = false;
map<dstate_id_t, CharReach> succs;
for (u32 i = 0; i < rev_map.size(); i++) {
if (raw.next[i] == this_idx) {
continue;
}
const CharReach &cr_i = rev_map.at(i);
rv.cr |= cr_i;
dstate_id_t next_id = raw.next[i];
DEBUG_PRINTF("next is %hu\n", next_id);
const dstate &raw_next = rdfa.states[next_id];
if (outs2_broken) {
continue;
}
if (!raw_next.reports.empty() && generates_callbacks(rdfa.kind)) {
DEBUG_PRINTF("leads to report\n");
outs2_broken = true; /* cannot accelerate over reports */
continue;
}
succs[next_id] |= cr_i;
}
if (!outs2_broken) {
for (const auto &e : succs) {
const CharReach &cr_i = e.second;
const dstate &raw_next = rdfa.states[e.first];
CharReach cr_all_j;
for (u32 j = 0; j < rev_map.size(); j++) {
if (raw_next.next[j] == raw.next[j]) {
continue;
}
DEBUG_PRINTF("state %hu: adding sym %u -> %hu to 2 \n", e.first,
j, raw_next.next[j]);
cr_all_j |= rev_map.at(j);
}
if (cr_i.count() * cr_all_j.count() > 8) {
DEBUG_PRINTF("adding %zu to double_cr\n", cr_i.count());
rv.double_cr |= cr_i;
} else {
for (auto ii = cr_i.find_first(); ii != CharReach::npos;
ii = cr_i.find_next(ii)) {
for (auto jj = cr_all_j.find_first(); jj != CharReach::npos;
jj = cr_all_j.find_next(jj)) {
rv.double_byte.emplace((u8)ii, (u8)jj);
}
}
}
}
if (rv.double_byte.size() > 8) {
DEBUG_PRINTF("outs2 too big\n");
outs2_broken = true;
}
if (outs2_broken) {
rv.double_byte.clear();
}
}
DEBUG_PRINTF("this %u, sds proxy %hu\n", this_idx, get_sds_or_proxy(rdfa));
DEBUG_PRINTF("broken %d\n", outs2_broken);
if (!double_byte_ok(rv) && !is_triggered(rdfa.kind)
&& this_idx == rdfa.start_floating
&& this_idx != DEAD_STATE) {
DEBUG_PRINTF("looking for offset accel at %u\n", this_idx);
auto offset = look_for_offset_accel(rdfa, this_idx,
max_allowed_accel_offset);
DEBUG_PRINTF("width %zu vs %zu\n", offset.cr.count(),
rv.cr.count());
if (double_byte_ok(offset) || offset.cr.count() < rv.cr.count()) {
DEBUG_PRINTF("using offset accel\n");
rv = offset;
}
}
return rv;
}
}


@ -0,0 +1,61 @@
/*
* Copyright (c) 2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MCCLELLANCOMPILE_ACCEL_H
#define MCCLELLANCOMPILE_ACCEL_H
#include "mcclellancompile.h"
#include <map>
namespace ue2 {
struct Grey;
#define ACCEL_DFA_MAX_OFFSET_DEPTH 4
/** Maximum tolerated number of escape characters from an accel state.
* This is larger than for the NFA, as we don't have a budget and the NFA
* cheats on stop characters for sets of states. */
#define ACCEL_DFA_MAX_STOP_CHAR 160
/** Maximum tolerated number of escape characters from an SDS accel state.
* Larger than for normal states, as accelerating SDS is important. Matches
* the NFA value. */
#define ACCEL_DFA_MAX_FLOATING_STOP_CHAR 192
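/*
 * Illustrative sketch only (mirrors populateAccelerationInfo() above; the
 * names "escapes" and "is_sds_proxy" are assumptions): a state is only
 * considered accelerable if its escape set stays under the relevant limit.
 *
 *     size_t limit = is_sds_proxy ? ACCEL_DFA_MAX_FLOATING_STOP_CHAR
 *                                 : ACCEL_DFA_MAX_STOP_CHAR;
 *     if (escapes.count() > limit) {
 *         // too many stop characters; leave the state unaccelerated
 *     }
 */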
std::map<dstate_id_t, AccelScheme> populateAccelerationInfo(const raw_dfa &rdfa,
const dfa_build_strat &strat,
const Grey &grey);
AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa,
dstate_id_t this_idx,
u32 max_allowed_accel_offset);
}
#endif


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -228,13 +228,13 @@ void calc_min_dist_to_accept(const raw_dfa &raw,
}
}
- void prune_overlong(raw_dfa &raw, u32 max_offset) {
+ bool prune_overlong(raw_dfa &raw, u32 max_offset) {
DEBUG_PRINTF("pruning to at most %u\n", max_offset);
vector<u32> bob_dist;
u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist);
if (max_min_dist_bob <= max_offset) {
-     return;
+     return false;
}
vector<vector<dstate_id_t> > in_edges;
@ -282,6 +282,8 @@ void prune_overlong(raw_dfa &raw, u32 max_offset) {
/* update specials */
raw.start_floating = new_ids[raw.start_floating];
raw.start_anchored = new_ids[raw.start_anchored];
+ return true;
}
set<ReportID> all_reports(const raw_dfa &rdfa) {
@ -334,4 +336,63 @@ size_t hash_dfa(const raw_dfa &rdfa) {
return v;
}
static
bool has_self_loop(dstate_id_t s, const raw_dfa &raw) {
u16 top_remap = raw.alpha_remap[TOP];
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
if (i != top_remap && raw.states[s].next[i] == s) {
return true;
}
}
return false;
}
dstate_id_t get_sds_or_proxy(const raw_dfa &raw) {
if (raw.start_floating != DEAD_STATE) {
DEBUG_PRINTF("has floating start\n");
return raw.start_floating;
}
DEBUG_PRINTF("looking for SDS proxy\n");
dstate_id_t s = raw.start_anchored;
if (has_self_loop(s, raw)) {
return s;
}
u16 top_remap = raw.alpha_remap[TOP];
ue2::unordered_set<dstate_id_t> seen;
while (true) {
seen.insert(s);
DEBUG_PRINTF("basis %hu\n", s);
/* check if we are connected to a state with a self loop */
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
dstate_id_t t = raw.states[s].next[i];
if (i != top_remap && t != DEAD_STATE && has_self_loop(t, raw)) {
return t;
}
}
/* find a neighbour to use as a basis for looking for the sds proxy */
dstate_id_t t = DEAD_STATE;
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
dstate_id_t tt = raw.states[s].next[i];
if (i != top_remap && tt != DEAD_STATE && !contains(seen, tt)) {
t = tt;
break;
}
}
if (t == DEAD_STATE) {
/* we were unable to find a state to use as a SDS proxy */
return DEAD_STATE;
}
s = t;
}
}
} // namespace ue2


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -29,16 +29,21 @@
#ifndef MCCLELLAN_COMPILE_UTIL_H
#define MCCLELLAN_COMPILE_UTIL_H
+ #include "rdfa.h"
#include "ue2common.h"
#include <set>
namespace ue2 {
- struct raw_dfa;
u32 remove_leading_dots(raw_dfa &raw);
- void prune_overlong(raw_dfa &raw, u32 max_offset);
+ /**
+  * Prunes any states which cannot be reached within max_offset from the start
+  * of the stream. Returns false if no changes are made to the rdfa.
+  */
+ bool prune_overlong(raw_dfa &raw, u32 max_offset);
std::set<ReportID> all_reports(const raw_dfa &rdfa);
bool has_eod_accepts(const raw_dfa &rdfa);
bool has_non_eod_accepts(const raw_dfa &rdfa);
@ -50,6 +55,8 @@ size_t hash_dfa_no_reports(const raw_dfa &rdfa);
/** \brief Compute a simple hash of this raw_dfa, including its reports. */
size_t hash_dfa(const raw_dfa &rdfa);
+ dstate_id_t get_sds_or_proxy(const raw_dfa &raw);
} // namespace ue2
#endif
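/*
 * Illustrative sketch (assumed usage, not part of this diff): a caller that
 * only re-derives DFA properties when prune_overlong() reports a change. The
 * minimize_hopcroft() call and the "grey" variable are assumptions about the
 * surrounding build code.
 */
if (prune_overlong(rdfa, max_offset)) {
    /* states were removed: anything cached about the old automaton
     * (hashes, accept sets) is now stale and must be recomputed */
    minimize_hopcroft(rdfa, grey);
}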


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -124,7 +124,7 @@ char processReports(const struct mpv *m, u8 *reporters,
DEBUG_PRINTF("report %u at %llu\n", curr->report,
report_offset);
- if (curr->unbounded) {
+ if (curr->unbounded && !curr->simple_exhaust) {
assert(rl_count < m->puffette_count);
*rl = curr->report;
++rl;
@ -176,7 +176,9 @@ char processReportsForRange(const struct mpv *m, u8 *reporters,
return MO_CONTINUE_MATCHING;
}
- for (u32 i = 2; i <= length; i++) {
+ DEBUG_PRINTF("length=%zu, rl_count=%u\n", length, rl_count);
+ for (size_t i = 2; i <= length; i++) {
for (u32 j = 0; j < rl_count; j++) {
if (cb(first_offset + i, rl[j], ctxt) == MO_HALT_MATCHING) {
DEBUG_PRINTF("bailing\n");


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -108,6 +108,9 @@ void dumpKilo(FILE *f, const mpv *m, const mpv_kilopuff *k) {
fprintf(f, " Puffette %u\n", i);
fprintf(f, " repeats: %u%s\n", p[i].repeats,
p[i].unbounded ? "," : "");
+ if (p[i].simple_exhaust) {
+     fprintf(f, " simple exhaustible\n");
+ }
fprintf(f, " report id: %u\n", p[i].report);
}


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -40,6 +40,15 @@
struct mpv_puffette {
u32 repeats;
char unbounded;
+ /**
+  * \brief Report is simple-exhaustible.
+  *
+  * If this is true, we do best-effort suppression of runs of reports, only
+  * delivering the first one.
+  */
+ char simple_exhaust;
ReportID report;
};
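/*
 * Illustrative sketch (mirrors the mpv.c hunk above; curr, rl and rl_count
 * are the locals of processReports()): a simple-exhaustible puffette is not
 * kept on the reporter list, so only its first report is delivered.
 */
if (curr->unbounded && !curr->simple_exhaust) {
    assert(rl_count < m->puffette_count);
    *rl = curr->report; /* keep firing this report on later bytes */
    ++rl;
}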


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -36,6 +36,7 @@
#include "util/alloc.h"
#include "util/multibit_internal.h"
#include "util/order_check.h"
+ #include "util/report_manager.h"
#include "util/verify_types.h"
#include <algorithm>
@ -53,10 +54,8 @@ namespace ue2 {
namespace {
struct pcomp {
bool operator()(const raw_puff &a, const raw_puff &b) const {
- ORDER_CHECK(repeats);
- ORDER_CHECK(unbounded);
- ORDER_CHECK(report);
- return false;
+ return tie(a.repeats, a.unbounded, a.simple_exhaust, a.report) <
+        tie(b.repeats, b.unbounded, b.simple_exhaust, b.report);
}
};
@ -84,12 +83,21 @@ struct ClusterKey {
} // namespace
static
- void writePuffette(mpv_puffette *out, const raw_puff &rp) {
+ void writePuffette(mpv_puffette *out, const raw_puff &rp,
+                    const ReportManager &rm) {
DEBUG_PRINTF("outputting %u %d %u to %p\n", rp.repeats, (int)rp.unbounded,
rp.report, out);
out->repeats = rp.repeats;
out->unbounded = rp.unbounded;
- out->report = rp.report;
+ out->simple_exhaust = rp.simple_exhaust;
+ out->report = rm.getProgramOffset(rp.report);
}
+ static
+ void writeSentinel(mpv_puffette *out) {
+     DEBUG_PRINTF("outputting sentinel to %p\n", out);
+     memset(out, 0, sizeof(*out));
+     out->report = INVALID_REPORT;
+ }
static
@ -148,8 +156,8 @@ void populateClusters(const vector<raw_puff> &puffs_in,
static
void writeKiloPuff(const map<ClusterKey, vector<raw_puff>>::const_iterator &it,
- u32 counter_offset, mpv *m, mpv_kilopuff *kp,
- mpv_puffette **pa) {
+ const ReportManager &rm, u32 counter_offset, mpv *m,
+ mpv_kilopuff *kp, mpv_puffette **pa) {
const CharReach &reach = it->first.reach;
const vector<raw_puff> &puffs = it->second;
@ -182,11 +190,11 @@ void writeKiloPuff(const map<ClusterKey, vector<raw_puff>>::const_iterator &it,
kp->puffette_offset = verify_u32((char *)*pa - (char *)m);
for (size_t i = 0; i < puffs.size(); i++) {
assert(!it->first.auto_restart || puffs[i].unbounded);
- writePuffette(*pa + i, puffs[i]);
+ writePuffette(*pa + i, puffs[i], rm);
}
*pa += puffs.size();
- writePuffette(*pa, raw_puff(0U, false, INVALID_REPORT, CharReach()));
+ writeSentinel(*pa);
++*pa;
writeDeadPoint(kp, puffs);
@ -301,7 +309,8 @@ const mpv_counter_info &findCounter(const vector<mpv_counter_info> &counters,
}
aligned_unique_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in,
- const vector<raw_puff> &triggered_puffs) {
+ const vector<raw_puff> &triggered_puffs,
+ const ReportManager &rm) {
assert(!puffs_in.empty() || !triggered_puffs.empty());
u32 puffette_count = puffs_in.size() + triggered_puffs.size();
@ -341,7 +350,7 @@ aligned_unique_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in,
+ sizeof(mpv_counter_info) * counters.size());
mpv_puffette *pa = pa_base;
- writePuffette(pa, raw_puff(0U, false, INVALID_REPORT, CharReach()));
+ writeSentinel(pa);
++pa; /* skip init sentinel */
@ -367,8 +376,9 @@ aligned_unique_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in,
mpv_kilopuff *kp_begin = (mpv_kilopuff *)(m + 1);
mpv_kilopuff *kp = kp_begin;
for (auto it = puff_clusters.begin(); it != puff_clusters.end(); ++it) {
- writeKiloPuff(it, findCounter(counters, kp - kp_begin).counter_offset,
-               m, kp, &pa);
+ writeKiloPuff(it, rm,
+               findCounter(counters, kp - kp_begin).counter_offset, m,
+               kp, &pa);
++kp;
}
assert((char *)pa == (char *)nfa.get() + len);


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -40,14 +40,19 @@ struct NFA;
namespace ue2 {
+ class ReportManager;
struct raw_puff {
raw_puff(u32 repeats_in, bool unbounded_in, ReportID report_in,
- const CharReach &reach_in, bool auto_restart_in = false)
+ const CharReach &reach_in, bool auto_restart_in = false,
+ bool simple_exhaust_in = false)
: repeats(repeats_in), unbounded(unbounded_in),
- auto_restart(auto_restart_in), report(report_in), reach(reach_in) {}
+ auto_restart(auto_restart_in), simple_exhaust(simple_exhaust_in),
+ report(report_in), reach(reach_in) {}
u32 repeats; /**< report match after this many matching bytes */
bool unbounded; /**< keep producing matches after repeats are reached */
bool auto_restart; /**< for /[^X]{n}/ type patterns */
+ bool simple_exhaust; /* first report will exhaust us */
ReportID report;
CharReach reach; /**< = ~escapes */
};
@ -56,9 +61,9 @@ struct raw_puff {
* puffs in the triggered_puffs vector are enabled when an TOP_N event is
* delivered corresponding to their index in the vector
*/
- aligned_unique_ptr<NFA>
- mpvCompile(const std::vector<raw_puff> &puffs,
-            const std::vector<raw_puff> &triggered_puffs);
+ aligned_unique_ptr<NFA> mpvCompile(const std::vector<raw_puff> &puffs,
+                                    const std::vector<raw_puff> &triggered_puffs,
+                                    const ReportManager &rm);
} // namespace ue2

src/nfa/multiaccel_common.h (new file, 265 lines)

@ -0,0 +1,265 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_COMMON_H_
#define MULTIACCEL_COMMON_H_
#include "config.h"
#include "ue2common.h"
#include "util/join.h"
#include "util/bitutils.h"
/*
* When doing shifting, remember that the total number of shifts should be n-1
*/
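/*
 * Worked example (illustrative only): to test for a run of 3 set bits,
 * apply n - 1 = 2 single-bit shift-and-ANDs. Starting from
 * z = 0b0111010 (matches at positions 1, 3, 4 and 5):
 *
 *     z &= z >> 1;   // 0b0011000  (positions with a set neighbour above)
 *     z &= z >> 1;   // 0b0001000  (position 3 starts a run of >= 3)
 *
 * Any surviving bit marks the start of a sufficiently long run.
 */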
#define VARISHIFT(src, dst, len) \
do { \
(dst) &= (src) >> (len); \
} while (0)
#define STATIC_SHIFT1(x) \
do { \
(x) &= (x) >> 1; \
} while (0)
#define STATIC_SHIFT2(x) \
do { \
(x) &= (x) >> 2;\
} while (0)
#define STATIC_SHIFT4(x) \
do { \
(x) &= (x) >> 4; \
} while (0)
#define STATIC_SHIFT8(x) \
do { \
(x) &= (x) >> 8; \
} while (0)
#define SHIFT1(x) \
do {} while (0)
#define SHIFT2(x) \
do { \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT3(x) \
do { \
STATIC_SHIFT1(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT4(x) \
do { \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT5(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT6(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT7(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT8(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT9(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT10(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT11(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT12(x); \
do { \
SHIFT8(x);\
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT13(x); \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT14(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT15(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT16(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT8(x); \
} while (0)
#define SHIFT17(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT18(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT19(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT20(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT21(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT22(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT23(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT24(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT8(x); \
} while (0)
#define SHIFT25(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT26(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT27(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT28(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT29(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT30(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT31(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT32(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT8(x); \
} while (0)
/*
* This function is used by 32-bit multiaccel matchers. 32-bit matchers accept
* a 32-bit integer as a match mask, where the low 16 bits are the movemask
* result and the high 16 bits are "don't care" values. The match position
* returned is not expected to be higher than 16.
*/
static really_inline
const u8 *match32(const u8 *buf, const u32 z) {
if (unlikely(z != 0)) {
u32 pos = ctz32(z);
assert(pos < 16);
return buf + pos;
}
return NULL;
}
/*
* This function is used by 64-bit multiaccel matchers. 64-bit matchers accept
* a 64-bit integer as a match mask, where the low 32 bits are the movemask
* result and the high 32 bits are "don't care" values. The match position
* returned is not expected to be higher than 32.
*/
static really_inline
const u8 *match64(const u8 *buf, const u64a z) {
if (unlikely(z != 0)) {
u32 pos = ctz64(z);
assert(pos < 32);
return buf + pos;
}
return NULL;
}
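/*
 * Typical caller shape (sketch only; the SIMD helpers are assumed to be the
 * ones from util/simd_utils.h, and "chars" is an assumed broadcast of the
 * accelerated character):
 *
 *     m128 data = loadu128(buf);
 *     u32 z = movemask128(eq128(chars, data)); // low 16 bits valid
 *     const u8 *loc = match32(buf, z);         // NULL if no match
 */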
#endif /* MULTIACCEL_COMMON_H_ */


@ -0,0 +1,439 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "multiaccel_compilehelper.h"
using namespace std;
using namespace ue2;
#ifdef DEBUG
static const char* state_to_str[] = {
"FIRST_RUN",
"SECOND_RUN",
"WAITING_FOR_GRAB",
"FIRST_TAIL",
"SECOND_TAIL",
"STOPPED",
"INVALID"
};
static const char* type_to_str[] = {
"SHIFT",
"SHIFTGRAB",
"DOUBLESHIFT",
"DOUBLESHIFTGRAB",
"LONG",
"LONGGRAB",
"NONE"
};
static
void dumpMultiaccelState(const accel_data &d) {
DEBUG_PRINTF("type: %s state: %s len1: %u tlen1: %u len2: %u tlen2: %u\n",
type_to_str[(unsigned) d.type],
state_to_str[(unsigned) d.state],
d.len1, d.tlen1, d.len2, d.tlen2);
}
#endif
/* stop all the matching. this may render most schemes invalid. */
static
void stop(accel_data &d) {
switch (d.state) {
case STATE_STOPPED:
case STATE_INVALID:
break;
case STATE_FIRST_TAIL:
case STATE_SECOND_RUN:
/*
* Shift matchers are special case, because they have "tails".
* When shift matcher reaches a mid/endpoint, tail mode is
* activated, which looks for more matches to extend the match.
*
* For example, consider the pattern /a{5}ba{3}/. Under normal circumstances,
* the long-grab matcher will be picked for this pattern (matching a run of
* a's, followed by a not-a), because the doubleshift matcher would be
* confused by consecutive a's and would parse the pattern as a.{0}a.{0}a
* (two shifts by 1) and throw out the rest of the pattern.
*
* With tails, we defer ending the run until we actually run out of
* matching characters, so the above pattern will now be parsed by the
* doubleshift matcher as /a.{3}a.{3}a/ (two shifts by 4).
*
* So if we are stopping shift matchers, we should check whether we are in
* the process of matching the first tail or the second run. If we are, we
* can't finish the second run as we are stopping, but we can try and split
* the first tail instead to obtain a valid second run.
*/
if ((d.type == MultibyteAccelInfo::MAT_DSHIFT ||
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.tlen1 == 0) {
// can't split an empty void...
d.state = STATE_INVALID;
break;
}
d.len2 = 0;
d.state = STATE_STOPPED;
break;
case STATE_SECOND_TAIL:
d.state = STATE_STOPPED;
break;
case STATE_WAITING_FOR_GRAB:
case STATE_FIRST_RUN:
if (d.type == MultibyteAccelInfo::MAT_LONG) {
d.state = STATE_STOPPED;
} else {
d.state = STATE_INVALID;
}
break;
}
}
static
void validate(accel_data &d, unsigned max_len) {
// try and fit in all our tails
if (d.len1 + d.tlen1 + d.len2 + d.tlen2 < max_len && d.len2 > 0) {
// case 1: everything fits in
d.len1 += d.tlen1;
d.len2 += d.tlen2;
d.tlen1 = 0;
d.tlen2 = 0;
} else if (d.len1 + d.tlen1 + d.len2 < max_len && d.len2 > 0) {
// case 2: everything but the second tail fits in
d.len1 += d.tlen1;
d.tlen1 = 0;
// try going for a partial tail
if (d.tlen2 != 0) {
int new_tlen2 = max_len - 1 - d.len1 - d.len2;
if (new_tlen2 > 0) {
d.len2 += new_tlen2;
}
d.tlen2 = 0;
}
} else if (d.len1 + d.tlen1 < max_len) {
// case 3: first run and its tail fits in
if (d.type == MultibyteAccelInfo::MAT_DSHIFT ||
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) {
// split the tail into a second run
d.len2 = d.tlen1;
} else {
d.len1 += d.tlen1;
d.len2 = 0;
}
d.tlen1 = 0;
d.tlen2 = 0;
} else if (d.len1 < max_len) {
// case 4: nothing but the first run fits in
// try going for a partial tail
if (d.tlen1 != 0) {
int new_tlen1 = max_len - 1 - d.len1;
if (new_tlen1 > 0) {
d.len1 += new_tlen1;
}
d.tlen1 = 0;
}
d.len2 = 0;
d.tlen2 = 0;
}
// if we removed our second run, doubleshift matchers are no longer valid
if ((d.type == MultibyteAccelInfo::MAT_DSHIFT ||
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.len2 == 0) {
d.state = STATE_INVALID;
} else if ((d.type == MultibyteAccelInfo::MAT_LONG) && d.len1 >= max_len) {
// long matchers can just stop whenever they want to
d.len1 = max_len - 1;
}
// now, general sanity checks
if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) >= max_len) {
d.state = STATE_INVALID;
}
if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) < MULTIACCEL_MIN_LEN) {
d.state = STATE_INVALID;
}
}
static
void match(accel_data &d, const CharReach &ref_cr, const CharReach &cur_cr) {
switch (d.type) {
case MultibyteAccelInfo::MAT_LONG:
{
/*
* For long matcher, we want lots of consecutive same-or-subset
* char-reaches
*/
if ((ref_cr & cur_cr) == cur_cr) {
d.len1++;
} else {
d.state = STATE_STOPPED;
}
}
break;
case MultibyteAccelInfo::MAT_LONGGRAB:
{
/*
* For long-grab matcher, we want lots of consecutive same-or-subset
* char-reaches with a negative match in the end.
*/
if ((ref_cr & cur_cr) == cur_cr) {
d.len1++;
} else if (!(ref_cr & cur_cr).any()) {
/* we grabbed, stop immediately */
d.state = STATE_STOPPED;
} else {
/* our run-n-grab was interrupted; mark as invalid */
d.state = STATE_INVALID;
}
}
break;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
{
/*
* For shift-grab matcher, we want two matches separated by anything;
* however the second vertex *must* be a negative (non-overlapping) match.
*
* Shiftgrab matcher is identical to shift except for presence of grab.
*/
if (d.state == STATE_WAITING_FOR_GRAB) {
if ((ref_cr & cur_cr).any()) {
d.state = STATE_INVALID;
} else {
d.state = STATE_FIRST_RUN;
d.len1++;
}
return;
}
}
/* no break, falling through */
case MultibyteAccelInfo::MAT_SHIFT:
{
/*
* For shift-matcher, we want two matches separated by anything.
*/
if (ref_cr == cur_cr) {
// keep matching tail
switch (d.state) {
case STATE_FIRST_RUN:
d.state = STATE_FIRST_TAIL;
break;
case STATE_FIRST_TAIL:
d.tlen1++;
break;
default:
// shouldn't happen
assert(0);
}
} else {
switch (d.state) {
case STATE_FIRST_RUN:
// simply advance
d.len1++;
break;
case STATE_FIRST_TAIL:
// we found a non-matching char after tail, so stop
d.state = STATE_STOPPED;
break;
default:
// shouldn't happen
assert(0);
}
}
}
break;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
{
/*
* For double shift-grab matcher, we want two matches separated by
* either negative matches or dots; however the second vertex *must*
* be a negative match.
*
* Doubleshiftgrab matcher is identical to doubleshift except for
* presence of grab.
*/
if (d.state == STATE_WAITING_FOR_GRAB) {
if ((ref_cr & cur_cr).any()) {
d.state = STATE_INVALID;
} else {
d.state = STATE_FIRST_RUN;
d.len1++;
}
return;
}
}
/* no break, falling through */
case MultibyteAccelInfo::MAT_DSHIFT:
{
/*
* For double shift matcher, we want three matches, each separated
* by a lot of anything.
*
* Doubleshift matcher is complicated by presence of tails.
*/
if (ref_cr == cur_cr) {
// decide if we are activating second shift or matching tails
switch (d.state) {
case STATE_FIRST_RUN:
d.state = STATE_FIRST_TAIL;
d.len2 = 1; // we're now ready for our second run
break;
case STATE_FIRST_TAIL:
d.tlen1++;
break;
case STATE_SECOND_RUN:
d.state = STATE_SECOND_TAIL;
break;
case STATE_SECOND_TAIL:
d.tlen2++;
break;
default:
// shouldn't happen
assert(0);
}
} else {
switch (d.state) {
case STATE_FIRST_RUN:
d.len1++;
break;
case STATE_FIRST_TAIL:
// start second run
d.state = STATE_SECOND_RUN;
d.len2++;
break;
case STATE_SECOND_RUN:
d.len2++;
break;
case STATE_SECOND_TAIL:
// stop
d.state = STATE_STOPPED;
break;
default:
// shouldn't happen
assert(0);
}
}
}
break;
default:
// shouldn't happen
assert(0);
break;
}
}
MultiaccelCompileHelper::MultiaccelCompileHelper(const CharReach &ref_cr, u32 off,
unsigned max_len) :
cr(ref_cr), offset(off), max_len(max_len) {
int accel_num = (int) MultibyteAccelInfo::MAT_MAX;
accels.resize(accel_num);
// mark everything as valid
for (int i = 0; i < accel_num; i++) {
accel_data &ad = accels[i];
ad.len1 = 1;
ad.type = (MultibyteAccelInfo::multiaccel_type) i;
/* for shift-grab matchers, we are waiting for the grab right at the start */
if (ad.type == MultibyteAccelInfo::MAT_SHIFTGRAB
|| ad.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) {
ad.state = STATE_WAITING_FOR_GRAB;
} else {
ad.state = STATE_FIRST_RUN;
}
}
}
bool MultiaccelCompileHelper::canAdvance() {
for (const accel_data &ad : accels) {
if (ad.state != STATE_STOPPED && ad.state != STATE_INVALID) {
return true;
}
}
return false;
}
void MultiaccelCompileHelper::advance(const CharReach &cur_cr) {
for (accel_data &ad : accels) {
if (ad.state == STATE_STOPPED || ad.state == STATE_INVALID) {
continue;
}
match(ad, cr, cur_cr);
#ifdef DEBUG
dumpMultiaccelState(ad);
#endif
}
}
MultibyteAccelInfo MultiaccelCompileHelper::getBestScheme() {
int best_len = 0;
accel_data best;
DEBUG_PRINTF("Stopping multiaccel compile\n");
for (accel_data &ad : accels) {
// stop our matching
stop(ad);
validate(ad, max_len);
#ifdef DEBUG
dumpMultiaccelState(ad);
#endif
// skip invalid schemes
if (ad.state == STATE_INVALID) {
continue;
}
DEBUG_PRINTF("Marking as viable\n");
// TODO: relative strengths of accel schemes? e.g. a shorter long match
// might in some cases be preferable to a longer double shift match,
// depending on length.
int as_len = ad.len1 + ad.len2;
if (as_len >= best_len) {
DEBUG_PRINTF("Marking as best\n");
best_len = as_len;
best = ad;
}
}
// if we found at least one accel scheme, return it
if (best.state != STATE_INVALID) {
#ifdef DEBUG
DEBUG_PRINTF("Picked best multiaccel state:\n");
dumpMultiaccelState(best);
#endif
MultibyteAccelInfo info;
info.cr = cr;
info.offset = offset;
info.len1 = best.len1;
info.len2 = best.len2;
info.type = best.type;
return info;
}
return MultibyteAccelInfo();
}


@ -26,44 +26,50 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
- #include "internal_report.h"
- #include "report.h"
- #include "report_manager.h"
- #include "ue2common.h"
+ #ifndef MULTIACCELCOMPILE_H_
+ #define MULTIACCELCOMPILE_H_
+ #include "nfagraph/ng_limex_accel.h"
+ #include <vector>
namespace ue2 {
- void writeInternalReport(const Report &report, const ReportManager &rm,
-                          internal_report *ir) {
-     assert(ir);
-     assert(ISALIGNED(ir));
-     ir->type = report.type;
-     ir->hasBounds = report.hasBounds() ? 1 : 0;
-     ir->quashSom = report.quashSom ? 1 : 0;
-     ir->minOffset = report.minOffset;
-     ir->maxOffset = report.maxOffset;
-     ir->minLength = report.minLength;
-     ir->ekey = report.ekey;
-     ir->offsetAdjust = report.offsetAdjust;
-     ir->onmatch = report.onmatch;
-     switch (report.type) {
-     case INTERNAL_ROSE_CHAIN:
-         ir->aux.topSquashDistance = report.topSquashDistance;
-         break;
-     case EXTERNAL_CALLBACK_SOM_REV_NFA:
-     case INTERNAL_SOM_LOC_SET_SOM_REV_NFA:
-     case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_UNSET:
-     case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE:
-         ir->aux.revNfaIndex = report.revNfaIndex;
-         break;
-     default:
-         ir->aux.somDistance = report.somDistance;
-         break;
-     }
-     // Dedupe keys are managed by ReportManager.
-     ir->dkey = rm.getDkey(report);
- }
- } // namespace ue2
+ /* accel scheme state machine */
+ enum accel_scheme_state {
+     STATE_FIRST_RUN,
+     STATE_SECOND_RUN,
+     STATE_WAITING_FOR_GRAB,
+     STATE_FIRST_TAIL,
+     STATE_SECOND_TAIL,
+     STATE_STOPPED,
+     STATE_INVALID
+ };
+ struct accel_data {
+     MultibyteAccelInfo::multiaccel_type type = MultibyteAccelInfo::MAT_NONE;
+     accel_scheme_state state = STATE_INVALID;
+     unsigned len1 = 0; /* length of first run */
+     unsigned len2 = 0; /* length of second run, if present */
+     unsigned tlen1 = 0; /* first tail length */
+     unsigned tlen2 = 0; /* second tail length */
+ };
+ class MultiaccelCompileHelper {
+ private:
+     const CharReach &cr;
+     u32 offset;
+     std::vector<accel_data> accels;
+     unsigned max_len;
+ public:
+     MultiaccelCompileHelper(const CharReach &cr, u32 off, unsigned max_len);
+     bool canAdvance();
+     MultibyteAccelInfo getBestScheme();
+     void advance(const ue2::CharReach &cr);
+ };
+ }; // namespace
+ #endif /* MULTIACCELCOMPILE_H_ */
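/*
 * Illustrative sketch of the intended driver loop (assumed usage, not part of
 * this diff; the per-position "reaches" sequence and the max_len bound are
 * assumptions, and the real caller lives in the limex acceleration code):
 */
MultiaccelCompileHelper mac(ref_cr, offset, max_len);
for (const CharReach &cur_cr : reaches) {
    if (!mac.canAdvance()) {
        break; /* every candidate scheme has stopped or become invalid */
    }
    mac.advance(cur_cr);
}
MultibyteAccelInfo mai = mac.getBestScheme();
if (mai.type != MultibyteAccelInfo::MAT_NONE) {
    /* a viable multibyte acceleration scheme was found */
}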


@ -0,0 +1,149 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_DOUBLESHIFT_H_
#define MULTIACCEL_DOUBLESHIFT_H_
#include "multiaccel_common.h"
#define DOUBLESHIFT_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(doubleshiftMatch_, match_sz, _, len)(const u8 *buf, match_t z, u32 len2) {\
if (unlikely(z)) { \
match_t tmp = z; \
z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
tmp |= ((match_t) (1 << (len + len2)) - 1) << (match_sz / 2); \
VARISHIFT(z, z, len); \
VARISHIFT(tmp, tmp, len2); \
VARISHIFT(tmp, z, len); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
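/*
 * Reading of the macro above (illustrative): since VARISHIFT(src, dst, n)
 * is dst &= src >> n, a bit survives in z at position i only when the
 * movemask had matches at i, i + len and i + len + len2, i.e. three matches
 * separated by the two shift distances. The masks OR'd into the upper
 * ("don't care") half stop runs that extend past the valid low half of the
 * movemask from being cut short at the block boundary.
 */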
#define DOUBLESHIFT_MATCH_32_DEF(n) \
DOUBLESHIFT_MATCH(n, u32, 32)
#define DOUBLESHIFT_MATCH_64_DEF(n) \
DOUBLESHIFT_MATCH(n, u64a, 64)
#define DOUBLESHIFT_MATCH_DEF(n) \
DOUBLESHIFT_MATCH_32_DEF(n) \
DOUBLESHIFT_MATCH_64_DEF(n)
DOUBLESHIFT_MATCH_DEF(1)
DOUBLESHIFT_MATCH_DEF(2)
DOUBLESHIFT_MATCH_DEF(3)
DOUBLESHIFT_MATCH_DEF(4)
DOUBLESHIFT_MATCH_DEF(5)
DOUBLESHIFT_MATCH_DEF(6)
DOUBLESHIFT_MATCH_DEF(7)
DOUBLESHIFT_MATCH_DEF(8)
DOUBLESHIFT_MATCH_DEF(9)
DOUBLESHIFT_MATCH_DEF(10)
DOUBLESHIFT_MATCH_DEF(11)
DOUBLESHIFT_MATCH_DEF(12)
DOUBLESHIFT_MATCH_DEF(13)
DOUBLESHIFT_MATCH_DEF(14)
DOUBLESHIFT_MATCH_DEF(15)
DOUBLESHIFT_MATCH_64_DEF(16)
DOUBLESHIFT_MATCH_64_DEF(17)
DOUBLESHIFT_MATCH_64_DEF(18)
DOUBLESHIFT_MATCH_64_DEF(19)
DOUBLESHIFT_MATCH_64_DEF(20)
DOUBLESHIFT_MATCH_64_DEF(21)
DOUBLESHIFT_MATCH_64_DEF(22)
DOUBLESHIFT_MATCH_64_DEF(23)
DOUBLESHIFT_MATCH_64_DEF(24)
DOUBLESHIFT_MATCH_64_DEF(25)
DOUBLESHIFT_MATCH_64_DEF(26)
DOUBLESHIFT_MATCH_64_DEF(27)
DOUBLESHIFT_MATCH_64_DEF(28)
DOUBLESHIFT_MATCH_64_DEF(29)
DOUBLESHIFT_MATCH_64_DEF(30)
DOUBLESHIFT_MATCH_64_DEF(31)
static
const UNUSED u8 * (*doubleshift_match_funcs_32[])(const u8 *buf, u32 z, u32 len2) =
{
// skip the first
0,
&doubleshiftMatch_32_1,
&doubleshiftMatch_32_2,
&doubleshiftMatch_32_3,
&doubleshiftMatch_32_4,
&doubleshiftMatch_32_5,
&doubleshiftMatch_32_6,
&doubleshiftMatch_32_7,
&doubleshiftMatch_32_8,
&doubleshiftMatch_32_9,
&doubleshiftMatch_32_10,
&doubleshiftMatch_32_11,
&doubleshiftMatch_32_12,
&doubleshiftMatch_32_13,
&doubleshiftMatch_32_14,
&doubleshiftMatch_32_15,
};
static
const UNUSED u8 * (*doubleshift_match_funcs_64[])(const u8 *buf, u64a z, u32 len2) =
{
// skip the first
0,
&doubleshiftMatch_64_1,
&doubleshiftMatch_64_2,
&doubleshiftMatch_64_3,
&doubleshiftMatch_64_4,
&doubleshiftMatch_64_5,
&doubleshiftMatch_64_6,
&doubleshiftMatch_64_7,
&doubleshiftMatch_64_8,
&doubleshiftMatch_64_9,
&doubleshiftMatch_64_10,
&doubleshiftMatch_64_11,
&doubleshiftMatch_64_12,
&doubleshiftMatch_64_13,
&doubleshiftMatch_64_14,
&doubleshiftMatch_64_15,
&doubleshiftMatch_64_16,
&doubleshiftMatch_64_17,
&doubleshiftMatch_64_18,
&doubleshiftMatch_64_19,
&doubleshiftMatch_64_20,
&doubleshiftMatch_64_21,
&doubleshiftMatch_64_22,
&doubleshiftMatch_64_23,
&doubleshiftMatch_64_24,
&doubleshiftMatch_64_25,
&doubleshiftMatch_64_26,
&doubleshiftMatch_64_27,
&doubleshiftMatch_64_28,
&doubleshiftMatch_64_29,
&doubleshiftMatch_64_30,
&doubleshiftMatch_64_31,
};
#endif /* MULTIACCEL_DOUBLESHIFT_H_ */


@ -0,0 +1,152 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_DOUBLESHIFTGRAB_H_
#define MULTIACCEL_DOUBLESHIFTGRAB_H_
#include "multiaccel_common.h"
#define DOUBLESHIFTGRAB_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(doubleshiftgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z, u32 len2) {\
if (unlikely(z)) { \
match_t neg = ~z; \
match_t tmp = z; \
z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
tmp |= ((match_t) (1 << (len + len2)) - 1) << (match_sz / 2); \
neg |= ((match_t) (1 << len) - 1) << (match_sz / 2); \
VARISHIFT(z, z, len); \
VARISHIFT(tmp, tmp, len2); \
VARISHIFT(neg, z, 1); \
VARISHIFT(tmp, z, len); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define DOUBLESHIFTGRAB_MATCH_32_DEF(n) \
DOUBLESHIFTGRAB_MATCH(n, u32, 32)
#define DOUBLESHIFTGRAB_MATCH_64_DEF(n) \
DOUBLESHIFTGRAB_MATCH(n, u64a, 64)
#define DOUBLESHIFTGRAB_MATCH_DEF(n) \
DOUBLESHIFTGRAB_MATCH_32_DEF(n) \
DOUBLESHIFTGRAB_MATCH_64_DEF(n)
DOUBLESHIFTGRAB_MATCH_DEF(1)
DOUBLESHIFTGRAB_MATCH_DEF(2)
DOUBLESHIFTGRAB_MATCH_DEF(3)
DOUBLESHIFTGRAB_MATCH_DEF(4)
DOUBLESHIFTGRAB_MATCH_DEF(5)
DOUBLESHIFTGRAB_MATCH_DEF(6)
DOUBLESHIFTGRAB_MATCH_DEF(7)
DOUBLESHIFTGRAB_MATCH_DEF(8)
DOUBLESHIFTGRAB_MATCH_DEF(9)
DOUBLESHIFTGRAB_MATCH_DEF(10)
DOUBLESHIFTGRAB_MATCH_DEF(11)
DOUBLESHIFTGRAB_MATCH_DEF(12)
DOUBLESHIFTGRAB_MATCH_DEF(13)
DOUBLESHIFTGRAB_MATCH_DEF(14)
DOUBLESHIFTGRAB_MATCH_DEF(15)
DOUBLESHIFTGRAB_MATCH_64_DEF(16)
DOUBLESHIFTGRAB_MATCH_64_DEF(17)
DOUBLESHIFTGRAB_MATCH_64_DEF(18)
DOUBLESHIFTGRAB_MATCH_64_DEF(19)
DOUBLESHIFTGRAB_MATCH_64_DEF(20)
DOUBLESHIFTGRAB_MATCH_64_DEF(21)
DOUBLESHIFTGRAB_MATCH_64_DEF(22)
DOUBLESHIFTGRAB_MATCH_64_DEF(23)
DOUBLESHIFTGRAB_MATCH_64_DEF(24)
DOUBLESHIFTGRAB_MATCH_64_DEF(25)
DOUBLESHIFTGRAB_MATCH_64_DEF(26)
DOUBLESHIFTGRAB_MATCH_64_DEF(27)
DOUBLESHIFTGRAB_MATCH_64_DEF(28)
DOUBLESHIFTGRAB_MATCH_64_DEF(29)
DOUBLESHIFTGRAB_MATCH_64_DEF(30)
DOUBLESHIFTGRAB_MATCH_64_DEF(31)
static
const UNUSED u8 * (*doubleshiftgrab_match_funcs_32[])(const u8 *buf, u32 z, u32 len2) =
{
// skip the first
0,
&doubleshiftgrabMatch_32_1,
&doubleshiftgrabMatch_32_2,
&doubleshiftgrabMatch_32_3,
&doubleshiftgrabMatch_32_4,
&doubleshiftgrabMatch_32_5,
&doubleshiftgrabMatch_32_6,
&doubleshiftgrabMatch_32_7,
&doubleshiftgrabMatch_32_8,
&doubleshiftgrabMatch_32_9,
&doubleshiftgrabMatch_32_10,
&doubleshiftgrabMatch_32_11,
&doubleshiftgrabMatch_32_12,
&doubleshiftgrabMatch_32_13,
&doubleshiftgrabMatch_32_14,
&doubleshiftgrabMatch_32_15,
};
static
const UNUSED u8 * (*doubleshiftgrab_match_funcs_64[])(const u8 *buf, u64a z, u32 len2) =
{
// skip the first
0,
&doubleshiftgrabMatch_64_1,
&doubleshiftgrabMatch_64_2,
&doubleshiftgrabMatch_64_3,
&doubleshiftgrabMatch_64_4,
&doubleshiftgrabMatch_64_5,
&doubleshiftgrabMatch_64_6,
&doubleshiftgrabMatch_64_7,
&doubleshiftgrabMatch_64_8,
&doubleshiftgrabMatch_64_9,
&doubleshiftgrabMatch_64_10,
&doubleshiftgrabMatch_64_11,
&doubleshiftgrabMatch_64_12,
&doubleshiftgrabMatch_64_13,
&doubleshiftgrabMatch_64_14,
&doubleshiftgrabMatch_64_15,
&doubleshiftgrabMatch_64_16,
&doubleshiftgrabMatch_64_17,
&doubleshiftgrabMatch_64_18,
&doubleshiftgrabMatch_64_19,
&doubleshiftgrabMatch_64_20,
&doubleshiftgrabMatch_64_21,
&doubleshiftgrabMatch_64_22,
&doubleshiftgrabMatch_64_23,
&doubleshiftgrabMatch_64_24,
&doubleshiftgrabMatch_64_25,
&doubleshiftgrabMatch_64_26,
&doubleshiftgrabMatch_64_27,
&doubleshiftgrabMatch_64_28,
&doubleshiftgrabMatch_64_29,
&doubleshiftgrabMatch_64_30,
&doubleshiftgrabMatch_64_31,
};
#endif /* MULTIACCEL_DOUBLESHIFTGRAB_H_ */

src/nfa/multiaccel_long.h (new file, 145 lines)

@ -0,0 +1,145 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_LONG_H_
#define MULTIACCEL_LONG_H_
#include "multiaccel_common.h"
#define LONG_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(longMatch_, match_sz, _, len)(const u8 *buf, match_t z) { \
if (unlikely(z)) { \
z |= ((match_t) (1 << (len - 1)) - 1) << (match_sz / 2); \
JOIN(SHIFT, len)(z); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define LONG_MATCH_32_DEF(n) \
LONG_MATCH(n, u32, 32)
#define LONG_MATCH_64_DEF(n) \
LONG_MATCH(n, u64a, 64)
#define LONG_MATCH_DEF(n) \
LONG_MATCH_32_DEF(n) \
LONG_MATCH_64_DEF(n)
LONG_MATCH_DEF(1)
LONG_MATCH_DEF(2)
LONG_MATCH_DEF(3)
LONG_MATCH_DEF(4)
LONG_MATCH_DEF(5)
LONG_MATCH_DEF(6)
LONG_MATCH_DEF(7)
LONG_MATCH_DEF(8)
LONG_MATCH_DEF(9)
LONG_MATCH_DEF(10)
LONG_MATCH_DEF(11)
LONG_MATCH_DEF(12)
LONG_MATCH_DEF(13)
LONG_MATCH_DEF(14)
LONG_MATCH_DEF(15)
LONG_MATCH_64_DEF(16)
LONG_MATCH_64_DEF(17)
LONG_MATCH_64_DEF(18)
LONG_MATCH_64_DEF(19)
LONG_MATCH_64_DEF(20)
LONG_MATCH_64_DEF(21)
LONG_MATCH_64_DEF(22)
LONG_MATCH_64_DEF(23)
LONG_MATCH_64_DEF(24)
LONG_MATCH_64_DEF(25)
LONG_MATCH_64_DEF(26)
LONG_MATCH_64_DEF(27)
LONG_MATCH_64_DEF(28)
LONG_MATCH_64_DEF(29)
LONG_MATCH_64_DEF(30)
LONG_MATCH_64_DEF(31)
static
const UNUSED u8 *(*long_match_funcs_32[])(const u8 *buf, u32 z) =
{
// skip the first
0,
&longMatch_32_1,
&longMatch_32_2,
&longMatch_32_3,
&longMatch_32_4,
&longMatch_32_5,
&longMatch_32_6,
&longMatch_32_7,
&longMatch_32_8,
&longMatch_32_9,
&longMatch_32_10,
&longMatch_32_11,
&longMatch_32_12,
&longMatch_32_13,
&longMatch_32_14,
&longMatch_32_15,
};
static
const UNUSED u8 *(*long_match_funcs_64[])(const u8 *buf, u64a z) =
{
// skip the first
0,
&longMatch_64_1,
&longMatch_64_2,
&longMatch_64_3,
&longMatch_64_4,
&longMatch_64_5,
&longMatch_64_6,
&longMatch_64_7,
&longMatch_64_8,
&longMatch_64_9,
&longMatch_64_10,
&longMatch_64_11,
&longMatch_64_12,
&longMatch_64_13,
&longMatch_64_14,
&longMatch_64_15,
&longMatch_64_16,
&longMatch_64_17,
&longMatch_64_18,
&longMatch_64_19,
&longMatch_64_20,
&longMatch_64_21,
&longMatch_64_22,
&longMatch_64_23,
&longMatch_64_24,
&longMatch_64_25,
&longMatch_64_26,
&longMatch_64_27,
&longMatch_64_28,
&longMatch_64_29,
&longMatch_64_30,
&longMatch_64_31,
};
#endif /* MULTIACCEL_LONG_H_ */


@ -0,0 +1,148 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_LONGGRAB_H_
#define MULTIACCEL_LONGGRAB_H_
#include "multiaccel_common.h"
#define LONGGRAB_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(longgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z) { \
if (unlikely(z)) { \
match_t tmp = ~z; \
tmp |= ((match_t) (1 << len) - 1) << (match_sz / 2); \
z |= ((match_t) (1 << (len - 1)) - 1) << (match_sz / 2); \
JOIN(SHIFT, len)(z); \
VARISHIFT(tmp, z, len); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define LONGGRAB_MATCH_32_DEF(n) \
LONGGRAB_MATCH(n, u32, 32)
#define LONGGRAB_MATCH_64_DEF(n) \
LONGGRAB_MATCH(n, u64a, 64)
#define LONGGRAB_MATCH_DEF(n) \
LONGGRAB_MATCH_32_DEF(n) \
LONGGRAB_MATCH_64_DEF(n)
LONGGRAB_MATCH_DEF(1)
LONGGRAB_MATCH_DEF(2)
LONGGRAB_MATCH_DEF(3)
LONGGRAB_MATCH_DEF(4)
LONGGRAB_MATCH_DEF(5)
LONGGRAB_MATCH_DEF(6)
LONGGRAB_MATCH_DEF(7)
LONGGRAB_MATCH_DEF(8)
LONGGRAB_MATCH_DEF(9)
LONGGRAB_MATCH_DEF(10)
LONGGRAB_MATCH_DEF(11)
LONGGRAB_MATCH_DEF(12)
LONGGRAB_MATCH_DEF(13)
LONGGRAB_MATCH_DEF(14)
LONGGRAB_MATCH_DEF(15)
LONGGRAB_MATCH_64_DEF(16)
LONGGRAB_MATCH_64_DEF(17)
LONGGRAB_MATCH_64_DEF(18)
LONGGRAB_MATCH_64_DEF(19)
LONGGRAB_MATCH_64_DEF(20)
LONGGRAB_MATCH_64_DEF(21)
LONGGRAB_MATCH_64_DEF(22)
LONGGRAB_MATCH_64_DEF(23)
LONGGRAB_MATCH_64_DEF(24)
LONGGRAB_MATCH_64_DEF(25)
LONGGRAB_MATCH_64_DEF(26)
LONGGRAB_MATCH_64_DEF(27)
LONGGRAB_MATCH_64_DEF(28)
LONGGRAB_MATCH_64_DEF(29)
LONGGRAB_MATCH_64_DEF(30)
LONGGRAB_MATCH_64_DEF(31)
static
const UNUSED u8 *(*longgrab_match_funcs_32[])(const u8 *buf, u32 z) =
{
// skip the first
0,
&longgrabMatch_32_1,
&longgrabMatch_32_2,
&longgrabMatch_32_3,
&longgrabMatch_32_4,
&longgrabMatch_32_5,
&longgrabMatch_32_6,
&longgrabMatch_32_7,
&longgrabMatch_32_8,
&longgrabMatch_32_9,
&longgrabMatch_32_10,
&longgrabMatch_32_11,
&longgrabMatch_32_12,
&longgrabMatch_32_13,
&longgrabMatch_32_14,
&longgrabMatch_32_15,
};
static
const UNUSED u8 *(*longgrab_match_funcs_64[])(const u8 *buf, u64a z) =
{
// skip the first
0,
&longgrabMatch_64_1,
&longgrabMatch_64_2,
&longgrabMatch_64_3,
&longgrabMatch_64_4,
&longgrabMatch_64_5,
&longgrabMatch_64_6,
&longgrabMatch_64_7,
&longgrabMatch_64_8,
&longgrabMatch_64_9,
&longgrabMatch_64_10,
&longgrabMatch_64_11,
&longgrabMatch_64_12,
&longgrabMatch_64_13,
&longgrabMatch_64_14,
&longgrabMatch_64_15,
&longgrabMatch_64_16,
&longgrabMatch_64_17,
&longgrabMatch_64_18,
&longgrabMatch_64_19,
&longgrabMatch_64_20,
&longgrabMatch_64_21,
&longgrabMatch_64_22,
&longgrabMatch_64_23,
&longgrabMatch_64_24,
&longgrabMatch_64_25,
&longgrabMatch_64_26,
&longgrabMatch_64_27,
&longgrabMatch_64_28,
&longgrabMatch_64_29,
&longgrabMatch_64_30,
&longgrabMatch_64_31,
};
#endif /* MULTIACCEL_LONGGRAB_H_ */

src/nfa/multiaccel_shift.h (new file, 145 lines)

@ -0,0 +1,145 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_SHIFT_H_
#define MULTIACCEL_SHIFT_H_
#include "multiaccel_common.h"
#define SHIFT_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(shiftMatch_, match_sz, _, len)(const u8 *buf, match_t z) {\
if (unlikely(z)) { \
z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
VARISHIFT(z, z, len); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define SHIFT_MATCH_32_DEF(n) \
SHIFT_MATCH(n, u32, 32)
#define SHIFT_MATCH_64_DEF(n) \
SHIFT_MATCH(n, u64a, 64)
#define SHIFT_MATCH_DEF(n) \
SHIFT_MATCH_32_DEF(n) \
SHIFT_MATCH_64_DEF(n)
SHIFT_MATCH_DEF(1)
SHIFT_MATCH_DEF(2)
SHIFT_MATCH_DEF(3)
SHIFT_MATCH_DEF(4)
SHIFT_MATCH_DEF(5)
SHIFT_MATCH_DEF(6)
SHIFT_MATCH_DEF(7)
SHIFT_MATCH_DEF(8)
SHIFT_MATCH_DEF(9)
SHIFT_MATCH_DEF(10)
SHIFT_MATCH_DEF(11)
SHIFT_MATCH_DEF(12)
SHIFT_MATCH_DEF(13)
SHIFT_MATCH_DEF(14)
SHIFT_MATCH_DEF(15)
SHIFT_MATCH_64_DEF(16)
SHIFT_MATCH_64_DEF(17)
SHIFT_MATCH_64_DEF(18)
SHIFT_MATCH_64_DEF(19)
SHIFT_MATCH_64_DEF(20)
SHIFT_MATCH_64_DEF(21)
SHIFT_MATCH_64_DEF(22)
SHIFT_MATCH_64_DEF(23)
SHIFT_MATCH_64_DEF(24)
SHIFT_MATCH_64_DEF(25)
SHIFT_MATCH_64_DEF(26)
SHIFT_MATCH_64_DEF(27)
SHIFT_MATCH_64_DEF(28)
SHIFT_MATCH_64_DEF(29)
SHIFT_MATCH_64_DEF(30)
SHIFT_MATCH_64_DEF(31)
static
const UNUSED u8 * (*shift_match_funcs_32[])(const u8 *buf, u32 z) =
{
// skip the first
0,
&shiftMatch_32_1,
&shiftMatch_32_2,
&shiftMatch_32_3,
&shiftMatch_32_4,
&shiftMatch_32_5,
&shiftMatch_32_6,
&shiftMatch_32_7,
&shiftMatch_32_8,
&shiftMatch_32_9,
&shiftMatch_32_10,
&shiftMatch_32_11,
&shiftMatch_32_12,
&shiftMatch_32_13,
&shiftMatch_32_14,
&shiftMatch_32_15,
};
static
const UNUSED u8 * (*shift_match_funcs_64[])(const u8 *buf, u64a z) =
{
// skip the first
0,
&shiftMatch_64_1,
&shiftMatch_64_2,
&shiftMatch_64_3,
&shiftMatch_64_4,
&shiftMatch_64_5,
&shiftMatch_64_6,
&shiftMatch_64_7,
&shiftMatch_64_8,
&shiftMatch_64_9,
&shiftMatch_64_10,
&shiftMatch_64_11,
&shiftMatch_64_12,
&shiftMatch_64_13,
&shiftMatch_64_14,
&shiftMatch_64_15,
&shiftMatch_64_16,
&shiftMatch_64_17,
&shiftMatch_64_18,
&shiftMatch_64_19,
&shiftMatch_64_20,
&shiftMatch_64_21,
&shiftMatch_64_22,
&shiftMatch_64_23,
&shiftMatch_64_24,
&shiftMatch_64_25,
&shiftMatch_64_26,
&shiftMatch_64_27,
&shiftMatch_64_28,
&shiftMatch_64_29,
&shiftMatch_64_30,
&shiftMatch_64_31,
};
#endif /* MULTIACCEL_SHIFT_H_ */
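For reference, a minimal standalone sketch (not Hyperscan code) of the idea behind these shiftMatch kernels: z is a bitmask with bit i set when buf[i] belongs to the accelerated character class, and a match is a run of run_len consecutive set bits. The helper name first_run_of is made up for illustration, and __builtin_ctz assumes GCC/Clang; the real kernels get the same effect branchlessly via VARISHIFT and the match() helpers from multiaccel_common.h.

#include <stddef.h>
#include <stdint.h>

/* Return a pointer to the first position where `len` consecutive bits of z are
 * set, or NULL if no such run exists. Bit i of z corresponds to buf[i]. */
static const uint8_t *first_run_of(const uint8_t *buf, uint32_t z, unsigned len) {
    for (unsigned k = 1; k < len; k++) {
        z &= z >> 1;                /* keep bit i only if bit i+1 was also set */
    }
    if (!z) {
        return NULL;                /* no run of the required length in this block */
    }
    return buf + __builtin_ctz(z);  /* offset of the first qualifying byte */
}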

148
src/nfa/multiaccel_shiftgrab.h Normal file

@ -0,0 +1,148 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_SHIFTGRAB_H_
#define MULTIACCEL_SHIFTGRAB_H_
#include "multiaccel_common.h"
#define SHIFTGRAB_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(shiftgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z) {\
if (unlikely(z)) { \
match_t tmp = ~z; \
z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
tmp |= ((match_t) (1 << len) - 1) << (match_sz / 2); \
VARISHIFT(z, z, len); \
VARISHIFT(tmp, z, 1); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define SHIFTGRAB_MATCH_32_DEF(n) \
SHIFTGRAB_MATCH(n, u32, 32)
#define SHIFTGRAB_MATCH_64_DEF(n) \
SHIFTGRAB_MATCH(n, u64a, 64)
#define SHIFTGRAB_MATCH_DEF(n) \
SHIFTGRAB_MATCH_32_DEF(n) \
SHIFTGRAB_MATCH_64_DEF(n)
SHIFTGRAB_MATCH_DEF(1)
SHIFTGRAB_MATCH_DEF(2)
SHIFTGRAB_MATCH_DEF(3)
SHIFTGRAB_MATCH_DEF(4)
SHIFTGRAB_MATCH_DEF(5)
SHIFTGRAB_MATCH_DEF(6)
SHIFTGRAB_MATCH_DEF(7)
SHIFTGRAB_MATCH_DEF(8)
SHIFTGRAB_MATCH_DEF(9)
SHIFTGRAB_MATCH_DEF(10)
SHIFTGRAB_MATCH_DEF(11)
SHIFTGRAB_MATCH_DEF(12)
SHIFTGRAB_MATCH_DEF(13)
SHIFTGRAB_MATCH_DEF(14)
SHIFTGRAB_MATCH_DEF(15)
SHIFTGRAB_MATCH_64_DEF(16)
SHIFTGRAB_MATCH_64_DEF(17)
SHIFTGRAB_MATCH_64_DEF(18)
SHIFTGRAB_MATCH_64_DEF(19)
SHIFTGRAB_MATCH_64_DEF(20)
SHIFTGRAB_MATCH_64_DEF(21)
SHIFTGRAB_MATCH_64_DEF(22)
SHIFTGRAB_MATCH_64_DEF(23)
SHIFTGRAB_MATCH_64_DEF(24)
SHIFTGRAB_MATCH_64_DEF(25)
SHIFTGRAB_MATCH_64_DEF(26)
SHIFTGRAB_MATCH_64_DEF(27)
SHIFTGRAB_MATCH_64_DEF(28)
SHIFTGRAB_MATCH_64_DEF(29)
SHIFTGRAB_MATCH_64_DEF(30)
SHIFTGRAB_MATCH_64_DEF(31)
static
const UNUSED u8 * (*shiftgrab_match_funcs_32[])(const u8 *buf, u32 z) =
{
// skip the first
0,
&shiftgrabMatch_32_1,
&shiftgrabMatch_32_2,
&shiftgrabMatch_32_3,
&shiftgrabMatch_32_4,
&shiftgrabMatch_32_5,
&shiftgrabMatch_32_6,
&shiftgrabMatch_32_7,
&shiftgrabMatch_32_8,
&shiftgrabMatch_32_9,
&shiftgrabMatch_32_10,
&shiftgrabMatch_32_11,
&shiftgrabMatch_32_12,
&shiftgrabMatch_32_13,
&shiftgrabMatch_32_14,
&shiftgrabMatch_32_15,
};
static
const UNUSED u8 * (*shiftgrab_match_funcs_64[])(const u8 *buf, u64a z) =
{
// skip the first
0,
&shiftgrabMatch_64_1,
&shiftgrabMatch_64_2,
&shiftgrabMatch_64_3,
&shiftgrabMatch_64_4,
&shiftgrabMatch_64_5,
&shiftgrabMatch_64_6,
&shiftgrabMatch_64_7,
&shiftgrabMatch_64_8,
&shiftgrabMatch_64_9,
&shiftgrabMatch_64_10,
&shiftgrabMatch_64_11,
&shiftgrabMatch_64_12,
&shiftgrabMatch_64_13,
&shiftgrabMatch_64_14,
&shiftgrabMatch_64_15,
&shiftgrabMatch_64_16,
&shiftgrabMatch_64_17,
&shiftgrabMatch_64_18,
&shiftgrabMatch_64_19,
&shiftgrabMatch_64_20,
&shiftgrabMatch_64_21,
&shiftgrabMatch_64_22,
&shiftgrabMatch_64_23,
&shiftgrabMatch_64_24,
&shiftgrabMatch_64_25,
&shiftgrabMatch_64_26,
&shiftgrabMatch_64_27,
&shiftgrabMatch_64_28,
&shiftgrabMatch_64_29,
&shiftgrabMatch_64_30,
&shiftgrabMatch_64_31,
};
#endif /* MULTIACCEL_SHIFTGRAB_H_ */
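The function-pointer tables above let a caller select the specialisation for a given run length with a single array index, with slot 0 deliberately unused. A minimal standalone sketch of that dispatch pattern follows; the names run_kernel, find_run_1, find_run_2, kernels and dispatch are illustrative, not Hyperscan's, and __builtin_ctz assumes GCC/Clang.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef const uint8_t *(*run_kernel)(const uint8_t *buf, uint32_t z);

/* One specialised kernel per run length (bodies kept trivial for brevity). */
static const uint8_t *find_run_1(const uint8_t *buf, uint32_t z) {
    return z ? buf + __builtin_ctz(z) : NULL;
}

static const uint8_t *find_run_2(const uint8_t *buf, uint32_t z) {
    z &= z >> 1;                        /* require two consecutive set bits */
    return z ? buf + __builtin_ctz(z) : NULL;
}

static run_kernel kernels[] = {
    NULL,                               /* slot 0 skipped, as in the tables above */
    find_run_1,
    find_run_2,
};

static const uint8_t *dispatch(const uint8_t *buf, uint32_t z, uint8_t run_len) {
    assert(run_len >= 1 && run_len < sizeof(kernels) / sizeof(kernels[0]));
    return kernels[run_len](buf, z);    /* same shape as (*match_funcs[run_len])(...) */
}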

114
src/nfa/multishufti.c Normal file

@ -0,0 +1,114 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Shufti: character class acceleration.
*
* Utilises the SSSE3 pshufb shuffle instruction
*/
#include "config.h"
#include "ue2common.h"
#include "multishufti.h"
#include "multiaccel_common.h"
#if !defined(__AVX2__)
#define MATCH_ALGO long_
#include "multiaccel_long.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO longgrab_
#include "multiaccel_longgrab.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO shift_
#include "multiaccel_shift.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO shiftgrab_
#include "multiaccel_shiftgrab.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#define MULTIACCEL_DOUBLE
#define MATCH_ALGO doubleshift_
#include "multiaccel_doubleshift.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO doubleshiftgrab_
#include "multiaccel_doubleshiftgrab.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#undef MULTIACCEL_DOUBLE
#else
#define MATCH_ALGO long_
#include "multiaccel_long.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO longgrab_
#include "multiaccel_longgrab.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO shift_
#include "multiaccel_shift.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO shiftgrab_
#include "multiaccel_shiftgrab.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#define MULTIACCEL_DOUBLE
#define MATCH_ALGO doubleshift_
#include "multiaccel_doubleshift.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO doubleshiftgrab_
#include "multiaccel_doubleshiftgrab.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#undef MULTIACCEL_DOUBLE
#endif
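multishufti.c generates one family of functions per acceleration mode by re-including the same implementation header with a different MATCH_ALGO prefix each time; the JOIN macros token-paste that prefix onto each function name. A minimal two-file sketch of the technique, shown inline here with illustrative names (sum_template.h, PREFIX, fast_/safe_), would look like this:

/* ---- sum_template.h (deliberately no include guard) ---- */
#define JOIN_(a, b) a ## b
#define JOIN(a, b) JOIN_(a, b)

static int JOIN(PREFIX, sum)(const int *v, int n) {
    int total = 0;
    for (int i = 0; i < n; i++) {
        total += v[i];
    }
    return total;
}

/* ---- sums.c: each include emits one named instantiation ---- */
#define PREFIX fast_
#include "sum_template.h"   /* defines fast_sum() */
#undef PREFIX

#define PREFIX safe_
#include "sum_template.h"   /* defines safe_sum() */
#undef PREFIX

In multishufti.c the "template" body is multishufti_sse.h or multishufti_avx2.h, and the prefixes are the acceleration modes (long_, longgrab_, shift_, and so on).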

src/nfa/sidecar.h → src/nfa/multishufti.h

@ -26,46 +26,42 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

-#ifndef SIDECAR_H
-#define SIDECAR_H
+/** \file
+ * \brief Multishufti: multibyte version of Shufti
+ *
+ * Utilises the SSSE3 pshufb shuffle instruction
+ */
+
+#ifndef MULTISHUFTI_H
+#define MULTISHUFTI_H

 #include "ue2common.h"
+#include "util/simd_utils.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

-struct sidecar;
-struct sidecar_enabled;
-struct sidecar_scratch;
-
-/*
- * Sidecar is guaranteed to return the first match of a given id. However, in
- * various cases later matches may also be returned, as may matches for disabled
- * ids
- */
-typedef void (*SidecarCallback)(u64a offset, u32 id, void *context);
-
-void sidecarExec(const struct sidecar *n, const u8 *buffer, size_t len,
-                 struct sidecar_enabled *enabled,
-                 struct sidecar_scratch *sidecar_scratch,
-                 u64a base_offset, SidecarCallback cb, void *context);
-
-u32 sidecarScratchSize(const struct sidecar *n);
-
-void sidecarEnabledInit(const struct sidecar *n,
-                        struct sidecar_enabled *enabled);
-
-/* Note: sidecar literals need to be reenabled after they match.
- * This is purely because this behaviour is handy for rose.
- * In rose, they always set their roles when fired (never have to postpone due
- * to history) and if cleared their preds are also cleared so a pred would also
- * have to match again before we need to care about them again
- */
-void sidecarEnabledUnion(const struct sidecar *n, struct sidecar_enabled *dest,
-                         const struct sidecar_enabled *src);
-
-#define ID_TERMINATOR (~0U)
+const u8 *long_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                          const u8 *buf_end, const u8 run_len);
+
+const u8 *longgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                              const u8 *buf_end, const u8 run_len);
+
+const u8 *shift_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                           const u8 *buf_end, const u8 run_len);
+
+const u8 *shiftgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                               const u8 *buf_end, const u8 run_len);
+
+const u8 *doubleshift_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                                 const u8 *buf_end, const u8 run_len,
+                                 const u8 run2_len);
+
+const u8 *doubleshiftgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                                     const u8 *buf_end, const u8 run_len,
+                                     const u8 run2_len);

 #ifdef __cplusplus
 }
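The mask_lo/mask_hi parameters of these shuftiExec variants are the usual shufti nibble tables: 16 bytes indexed by a character's low nibble and 16 bytes indexed by its high nibble, with a byte in the class when the two selected entries share a set bit. A standalone scalar sketch of that membership test follows; the table contents are made up for illustration, since Hyperscan derives them at pattern-compile time.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Scalar equivalent of one shufti probe: two table lookups (pshufb in SIMD)
 * and an AND. */
static bool shufti_in_class(uint8_t c, const uint8_t lo[16], const uint8_t hi[16]) {
    return (lo[c & 0xf] & hi[c >> 4]) != 0;
}

/* Example: build tables that accept exactly the byte 'a' (0x61) using bit 0. */
static void build_single_char(uint8_t lo[16], uint8_t hi[16]) {
    memset(lo, 0, 16);
    memset(hi, 0, 16);
    lo[0x1] |= 1;   /* low nibble of 0x61 */
    hi[0x6] |= 1;   /* high nibble of 0x61 */
}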

122
src/nfa/multishufti_avx2.h Normal file

@ -0,0 +1,122 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "shufti_common.h"
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h"
static really_inline
const u8 *JOIN(MATCH_ALGO, fwdBlock)(m256 mask_lo, m256 mask_hi, m256 chars,
const u8 *buf, const m256 low4bits,
const m256 zeroes, const u8 run_len
#ifdef MULTIACCEL_DOUBLE
, const u8 run_len2
#endif
) {
u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);
return (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])(buf, ~z
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
}
const u8 *JOIN(MATCH_ALGO, shuftiExec)(m128 mask_lo, m128 mask_hi,
const u8 *buf,
const u8 *buf_end, u8 run_len
#ifdef MULTIACCEL_DOUBLE
, u8 run_len2
#endif
) {
assert(buf && buf_end);
assert(buf < buf_end);
// Slow path for small cases.
if (buf_end - buf < 32) {
return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
buf, buf_end);
}
const m256 zeroes = zeroes256();
const m256 low4bits = set32x8(0xf);
const m256 wide_mask_lo = set2x128(mask_lo);
const m256 wide_mask_hi = set2x128(mask_hi);
const u8 *rv;
size_t min = (size_t)buf % 32;
assert(buf_end - buf >= 32);
// Preconditioning: most of the time our buffer won't be aligned.
m256 chars = loadu256(buf);
rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars, buf,
low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (rv) {
return rv;
}
buf += (32 - min);
// Unrolling was here, but it wasn't doing anything but taking up space.
// Reroll FTW.
const u8 *last_block = buf_end - 32;
while (buf < last_block) {
m256 lchars = load256(buf);
rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, lchars, buf,
low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (rv) {
return rv;
}
buf += 32;
}
// Use an unaligned load to mop up the last 32 bytes and get an accurate
// picture to buf_end.
assert(buf <= buf_end && buf >= buf_end - 32);
chars = loadu256(buf_end - 32);
rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars, buf_end - 32,
low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (rv) {
return rv;
}
return buf_end;
}
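The structure of shuftiExec above (a slow path for tiny buffers, one unaligned head block, aligned blocks through the middle, then one unaligned block ending exactly at buf_end) is easier to see with the SIMD stripped out. Below is a standalone scalar sketch of the same scanning shape; the 16-byte block size, the scan/scan_block names and the use of memchr as a stand-in for fwdBlock are all illustrative assumptions, not Hyperscan code.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define BLOCK 16   /* stand-in for the 32-byte AVX2 block */

static const uint8_t *scan_block(const uint8_t *p, uint8_t target) {
    return memchr(p, target, BLOCK);              /* stand-in for fwdBlock() */
}

static const uint8_t *scan(const uint8_t *buf, const uint8_t *buf_end,
                           uint8_t target) {
    if (buf_end - buf < BLOCK) {                  /* slow path for small cases */
        return memchr(buf, target, (size_t)(buf_end - buf));
    }
    const uint8_t *rv = scan_block(buf, target);  /* unaligned head block */
    if (rv) {
        return rv;
    }
    buf += BLOCK - ((uintptr_t)buf % BLOCK);      /* step up to the next boundary */
    const uint8_t *last_block = buf_end - BLOCK;
    while (buf < last_block) {                    /* aligned main loop */
        rv = scan_block(buf, target);
        if (rv) {
            return rv;
        }
        buf += BLOCK;
    }
    return scan_block(buf_end - BLOCK, target);   /* overlapping unaligned tail */
}

Unlike the real routine, this sketch returns NULL rather than buf_end when nothing matches; the overlap between the tail block and already-scanned bytes is harmless because those bytes were already known not to match.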

266
src/nfa/multishufti_sse.h Normal file

@ -0,0 +1,266 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "shufti_common.h"
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h"
/* Normal SSSE3 shufti */
static really_inline
const u8 *JOIN(MATCH_ALGO, fwdBlock)(m128 mask_lo, m128 mask_hi, m128 chars,
const u8 *buf, const m128 low4bits,
const m128 zeroes, const u8 run_len
#ifdef MULTIACCEL_DOUBLE
, const u8 run_len2
#endif
) {
// negate first 16 bits
u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes) ^ 0xFFFF;
return (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])(buf, z
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
}
/*
* 16-byte pipeline, for smaller scans
*/
static
const u8 *JOIN(MATCH_ALGO, shuftiPipeline16)(m128 mask_lo, m128 mask_hi,
const u8 *buf, const u8 *buf_end,
const m128 low4bits,
const m128 zeroes, const u8 run_len
#ifdef MULTIACCEL_DOUBLE
, const u8 run_len2
#endif
) {
const u8* ptr, *last_buf;
u32 last_res;
// pipeline prologue: scan first 16 bytes
m128 data = load128(buf);
u32 z = block(mask_lo, mask_hi, data, low4bits, zeroes) ^ 0xFFFF;
last_buf = buf;
last_res = z;
buf += 16;
// now, start the pipeline!
assert((size_t)buf % 16 == 0);
for (; buf + 15 < buf_end; buf += 16) {
// scan more data
data = load128(buf);
z = block(mask_lo, mask_hi, data, low4bits, zeroes) ^ 0xFFFF;
// do a comparison on previous result
ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
(last_buf, last_res
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(ptr)) {
return ptr;
}
last_buf = buf;
last_res = z;
}
assert(buf <= buf_end && buf >= buf_end - 16);
// epilogue: compare final results
ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
(last_buf, last_res
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(ptr)) {
return ptr;
}
return NULL;
}
/*
* 32-byte pipeline, for bigger scans
*/
static
const u8 *JOIN(MATCH_ALGO, shuftiPipeline32)(m128 mask_lo, m128 mask_hi,
const u8 *buf, const u8 *buf_end,
const m128 low4bits,
const m128 zeroes, const u8 run_len
#ifdef MULTIACCEL_DOUBLE
, const u8 run_len2
#endif
) {
const u8* ptr, *last_buf;
u32 res;
// pipeline prologue: scan first 32 bytes
m128 data1 = load128(buf);
u32 z1 = block(mask_lo, mask_hi, data1, low4bits, zeroes) ^ 0xFFFF;
m128 data2 = load128(buf + 16);
u32 z2 = block(mask_lo, mask_hi, data2, low4bits, zeroes) ^ 0xFFFF;
// store the results
u32 last_res = z1 | (z2 << 16);
last_buf = buf;
buf += 32;
// now, start the pipeline!
assert((size_t)buf % 16 == 0);
for (; buf + 31 < buf_end; buf += 32) {
// scan more data
data1 = load128(buf);
z1 = block(mask_lo, mask_hi, data1, low4bits, zeroes) ^ 0xFFFF;
data2 = load128(buf + 16);
z2 = block(mask_lo, mask_hi, data2, low4bits, zeroes) ^ 0xFFFF;
res = z1 | (z2 << 16);
// do a comparison on previous result
ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
(last_buf, last_res
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(ptr)) {
return ptr;
}
last_res = res;
last_buf = buf;
}
// epilogue: compare final results
ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
(last_buf, last_res
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(ptr)) {
return ptr;
}
// if we still have some data left, scan it too
for (; buf + 15 < buf_end; buf += 16) {
m128 chars = load128(buf);
ptr = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, buf,
low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(ptr)) {
return ptr;
}
}
assert(buf <= buf_end && buf >= buf_end - 16);
return NULL;
}
const u8 *JOIN(MATCH_ALGO, shuftiExec)(m128 mask_lo, m128 mask_hi,
const u8 *buf,
const u8 *buf_end, u8 run_len
#ifdef MULTIACCEL_DOUBLE
, u8 run_len2
#endif
) {
assert(buf && buf_end);
assert(buf < buf_end);
// Slow path for small cases.
if (buf_end - buf < 16) {
return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
buf, buf_end);
}
const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf);
const u8 *rv;
size_t min = (size_t)buf % 16;
assert(buf_end - buf >= 16);
// Preconditioning: most of the time our buffer won't be aligned.
m128 chars = loadu128(buf);
rv = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, buf,
low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (rv) {
return rv;
}
buf += (16 - min);
// if we have enough data, run bigger pipeline; otherwise run smaller one
if (buf_end - buf >= 128) {
rv = JOIN(MATCH_ALGO, shuftiPipeline32)(mask_lo, mask_hi,
buf, buf_end, low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(rv)) {
return rv;
}
} else if (buf_end - buf >= 16){
rv = JOIN(MATCH_ALGO, shuftiPipeline16)(mask_lo, mask_hi,
buf, buf_end, low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(rv)) {
return rv;
}
}
// Use an unaligned load to mop up the last 16 bytes and get an accurate
// picture to buf_end.
chars = loadu128(buf_end - 16);
rv = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars,
buf_end - 16, low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (rv) {
return rv;
}
return buf_end;
}
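shuftiPipeline16 and shuftiPipeline32 above overlap the SIMD classification of the current block with the branchier run check on the previous block's result, which is a small software pipeline. A standalone scalar sketch of that scheduling follows; classify() and check() are illustrative stand-ins for block() and the match_funcs call, the 16-byte block size mirrors the SSE path, __builtin_ctz assumes GCC/Clang, and the function assumes buf_end - buf >= 16 as the caller above guarantees.

#include <stddef.h>
#include <stdint.h>

static uint32_t classify(const uint8_t *p, uint8_t target) {
    uint32_t z = 0;                                  /* stand-in for block():       */
    for (int i = 0; i < 16; i++) {                   /* bit i set when p[i] matches */
        z |= (uint32_t)(p[i] == target) << i;
    }
    return z;
}

static const uint8_t *check(const uint8_t *p, uint32_t z) {
    return z ? p + __builtin_ctz(z) : NULL;          /* stand-in for the run check */
}

static const uint8_t *pipeline16(const uint8_t *buf, const uint8_t *buf_end,
                                 uint8_t target) {
    /* prologue: classify the first block but defer checking it */
    const uint8_t *last_buf = buf;
    uint32_t last_z = classify(buf, target);
    buf += 16;

    for (; buf + 15 < buf_end; buf += 16) {
        uint32_t z = classify(buf, target);          /* start work on this block...     */
        const uint8_t *rv = check(last_buf, last_z); /* ...while finishing the previous */
        if (rv) {
            return rv;
        }
        last_buf = buf;
        last_z = z;
    }

    /* epilogue: the final block's result is still pending */
    return check(last_buf, last_z);
}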

111
src/nfa/multitruffle.c Normal file

@ -0,0 +1,111 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "ue2common.h"
#include "multitruffle.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h"
#include "multiaccel_common.h"
#if !defined(__AVX2__)
#define MATCH_ALGO long_
#include "multiaccel_long.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO longgrab_
#include "multiaccel_longgrab.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO shift_
#include "multiaccel_shift.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO shiftgrab_
#include "multiaccel_shiftgrab.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#define MULTIACCEL_DOUBLE
#define MATCH_ALGO doubleshift_
#include "multiaccel_doubleshift.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO doubleshiftgrab_
#include "multiaccel_doubleshiftgrab.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#undef MULTIACCEL_DOUBLE
#else
#define MATCH_ALGO long_
#include "multiaccel_long.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO longgrab_
#include "multiaccel_longgrab.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO shift_
#include "multiaccel_shift.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO shiftgrab_
#include "multiaccel_shiftgrab.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#define MULTIACCEL_DOUBLE
#define MATCH_ALGO doubleshift_
#include "multiaccel_doubleshift.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO doubleshiftgrab_
#include "multiaccel_doubleshiftgrab.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#undef MULTIACCEL_DOUBLE
#endif

73
src/nfa/multitruffle.h Normal file

@ -0,0 +1,73 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTITRUFFLE_H
#define MULTITRUFFLE_H
/** \file
* \brief Multitruffle: multibyte version of Truffle.
*
* Utilises the SSSE3 pshufb shuffle instruction
*/
#include "util/simd_types.h"
#ifdef __cplusplus
extern "C"
{
#endif
const u8 *long_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end, const u8 run_len);
const u8 *longgrab_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end, const u8 run_len);
const u8 *shift_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end, const u8 run_len);
const u8 *shiftgrab_truffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset, const u8 *buf,
const u8 *buf_end, const u8 run_len);
const u8 *doubleshift_truffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset, const u8 *buf,
const u8 *buf_end, const u8 run_len,
const u8 run2_len);
const u8 *doubleshiftgrab_truffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset, const u8 *buf,
const u8 *buf_end, const u8 run_len,
const u8 run2_len);
#ifdef __cplusplus
}
#endif
#endif /* MULTITRUFFLE_H */
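The two truffle masks split the byte range in half: shuf_mask_lo_highclear describes bytes 0x00-0x7f and shuf_mask_lo_highset describes 0x80-0xff, so the pair can represent an arbitrary set over all 256 byte values (2 tables x 16 low-nibble slots x 8 bits per slot). The scalar sketch below shows one plausible encoding of that membership test purely for illustration; it is not lifted from Hyperscan's truffle implementation, which performs the equivalent lookups with pshufb.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative scalar reading of the two-mask scheme: pick the table by the
 * byte's top bit, index it by the low nibble, and select a bit with the
 * remaining three high-nibble bits. */
static bool truffle_in_class(uint8_t c, const uint8_t lo_highclear[16],
                             const uint8_t lo_highset[16]) {
    const uint8_t *tbl = (c & 0x80) ? lo_highset : lo_highclear;
    return (tbl[c & 0xf] >> ((c >> 4) & 0x7)) & 1;
}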

Some files were not shown because too many files have changed in this diff.