Merge branch 'github_develop' into github_master

Hong, Yang A 2021-01-13 14:39:34 +00:00
commit 64a995bf44
84 changed files with 7256 additions and 533 deletions

View File

@@ -2,6 +2,30 @@
 This is a list of notable changes to Hyperscan, in reverse chronological order.

+## [5.4.0] 2020-12-31
+- Improvement on literal matcher "Fat Teddy" performance, including
+  support for Intel(R) AVX-512 Vector Byte Manipulation Instructions (Intel(R)
+  AVX-512 VBMI).
+- Introduce a new 32-state shuffle-based DFA engine ("Sheng32"). This improves
+  scanning performance by leveraging AVX-512 VBMI.
+- Introduce a new 64-state shuffle-based DFA engine ("Sheng64"). This improves
+  scanning performance by leveraging AVX-512 VBMI.
+- Introduce a new shuffle-based hybrid DFA engine ("McSheng64"). This improves
+  scanning performance by leveraging AVX-512 VBMI.
+- Improvement on exceptional state handling performance for LimEx NFA, including
+  support for AVX-512 VBMI.
+- Improvement on lookaround performance with new models, including support for
+  AVX-512.
+- Improvement on DFA state space efficiency.
+- Optimization on decision of NFA/DFA generation.
+- hsbench: add CSV dump support for hsbench.
+- Bugfix for cmake error on Icelake under release mode.
+- Bugfix in find_vertices_in_cycles() to avoid self-loop checking in SCC.
+- Bugfix for issue #270: fix return value handling in chimera.
+- Bugfix for issue #284: use correct free function in logical combination.
+- Add BUILD_EXAMPLES cmake option to enable example code compilation. (#260)
+- Some typo fixing. (#242, #259)
+
 ## [5.3.0] 2020-05-15
 - Improvement on literal matcher "Teddy" performance, including support for
   Intel(R) AVX-512 Vector Byte Manipulation Instructions (Intel(R) AVX-512

View File

@@ -2,7 +2,7 @@ cmake_minimum_required (VERSION 2.8.11)
 project (hyperscan C CXX)

 set (HS_MAJOR_VERSION 5)
-set (HS_MINOR_VERSION 3)
+set (HS_MINOR_VERSION 4)
 set (HS_PATCH_VERSION 0)
 set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})

@@ -133,6 +133,13 @@ CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in
 option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime"
     OFF)

+option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime"
+    OFF)
+
+if (BUILD_AVX512VBMI)
+    set(BUILD_AVX512 ON)
+endif ()
+
 option(WINDOWS_ICC "Use Intel C++ Compiler on Windows, default off, requires ICC to be set in project" OFF)

 # TODO: per platform config files?

@@ -277,6 +284,7 @@ else()
         set(SKYLAKE_FLAG "-xCORE-AVX512")
     else ()
         set(SKYLAKE_FLAG "-march=skylake-avx512")
+        set(ICELAKE_FLAG "-march=icelake-server")
     endif ()
 endif()

@@ -1197,6 +1205,9 @@ else (FAT_RUNTIME)
     if (NOT BUILD_AVX512)
         set (DISPATCHER_DEFINE "-DDISABLE_AVX512_DISPATCH")
     endif (NOT BUILD_AVX512)
+    if (NOT BUILD_AVX512VBMI)
+        set (DISPATCHER_DEFINE "${DISPATCHER_DEFINE} -DDISABLE_AVX512VBMI_DISPATCH")
+    endif (NOT BUILD_AVX512VBMI)
     set_source_files_properties(src/dispatcher.c PROPERTIES
         COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function ${DISPATCHER_DEFINE}")

@@ -1229,6 +1240,14 @@ else (FAT_RUNTIME)
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
     endif (BUILD_AVX512)
+    if (BUILD_AVX512VBMI)
+        add_library(hs_exec_avx512vbmi OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS})
+        list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_avx512vbmi>)
+        set_target_properties(hs_exec_avx512vbmi PROPERTIES
+            COMPILE_FLAGS "${ICELAKE_FLAG}"
+            RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512vbmi ${CMAKE_MODULE_PATH}/keep.syms.in"
+            )
+    endif (BUILD_AVX512VBMI)

     add_library(hs_exec_common OBJECT
         ${hs_exec_common_SRCS}

@@ -1287,6 +1306,15 @@ else (FAT_RUNTIME)
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
     endif (BUILD_AVX512)
+    if (BUILD_AVX512VBMI)
+        add_library(hs_exec_shared_avx512vbmi OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS})
+        list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_avx512vbmi>)
+        set_target_properties(hs_exec_shared_avx512vbmi PROPERTIES
+            COMPILE_FLAGS "${ICELAKE_FLAG}"
+            POSITION_INDEPENDENT_CODE TRUE
+            RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512vbmi ${CMAKE_MODULE_PATH}/keep.syms.in"
+            )
+    endif (BUILD_AVX512VBMI)

     add_library(hs_exec_common_shared OBJECT
         ${hs_exec_common_SRCS}
         src/dispatcher.c

@@ -1380,7 +1408,7 @@ if (NOT BUILD_STATIC_LIBS)
     add_library(hs ALIAS hs_shared)
 endif ()

+option(BUILD_EXAMPLES "Build Hyperscan example code (default TRUE)" TRUE)
+
-if(NOT WIN32)
+if(NOT WIN32 AND BUILD_EXAMPLES)
     add_subdirectory(examples)
 endif()

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2018, Intel Corporation * Copyright (c) 2018-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -345,6 +345,16 @@ ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t alloc_func,
*/ */
#define CH_SCRATCH_IN_USE (-10) #define CH_SCRATCH_IN_USE (-10)
/**
* Unexpected internal error from Hyperscan.
*
* This error indicates that there was unexpected matching behaviors from
* Hyperscan. This could be related to invalid usage of scratch space or
* invalid memory operations by users.
*
*/
#define CH_UNKNOWN_HS_ERROR (-13)
/** /**
* Returned when pcre_exec (called for some expressions internally from @ref * Returned when pcre_exec (called for some expressions internally from @ref
* ch_scan) failed due to a fatal error. * ch_scan) failed due to a fatal error.
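A minimal sketch of how a caller might act on these return codes, assuming a compiled Chimera database ``db``, allocated scratch space ``scratch``, a match callback ``on_match``, and input ``data``/``len`` (all illustrative); only the ``CH_*`` identifiers and ``ch_scan()`` come from this header:

    /* Sketch: CH_UNKNOWN_HS_ERROR is the code added in this change. */
    ch_error_t rc = ch_scan(db, data, len, 0, scratch, on_match, NULL, NULL);
    if (rc == CH_SCAN_TERMINATED) {
        /* a callback returned CH_CALLBACK_TERMINATE; not a failure */
    } else if (rc == CH_UNKNOWN_HS_ERROR) {
        /* unexpected internal error; check scratch usage and memory handling */
    } else if (rc != CH_SUCCESS) {
        /* other CH_* errors (invalid database, scratch in use, ...) */
    }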

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2018, Intel Corporation * Copyright (c) 2018-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -419,6 +419,7 @@ int HS_CDECL multiCallback(unsigned int id, unsigned long long from,
DEBUG_PRINTF("user callback told us to skip this pattern\n"); DEBUG_PRINTF("user callback told us to skip this pattern\n");
pd->scanStart = hyctx->length; pd->scanStart = hyctx->length;
ret = HS_SUCCESS; ret = HS_SUCCESS;
hyctx->scratch->ret = ret;
} else if (ret == CH_FAIL_INTERNAL) { } else if (ret == CH_FAIL_INTERNAL) {
return ret; return ret;
} }
@ -590,11 +591,24 @@ ch_error_t ch_scan_i(const ch_database_t *hydb,
if (!(db->flags & CHIMERA_FLAG_NO_MULTIMATCH)) { if (!(db->flags & CHIMERA_FLAG_NO_MULTIMATCH)) {
ret = scanHyperscan(&hyctx, data, length); ret = scanHyperscan(&hyctx, data, length);
if (ret != HS_SUCCESS && scratch->ret != CH_SUCCESS) { // Errors from pcre scan.
DEBUG_PRINTF("Hyperscan returned error %d\n", scratch->ret); if (scratch->ret == CH_CALLBACK_TERMINATE) {
DEBUG_PRINTF("Pcre terminates scan\n");
unmarkScratchInUse(scratch);
return CH_SCAN_TERMINATED;
} else if (scratch->ret != CH_SUCCESS) {
DEBUG_PRINTF("Pcre internal error\n");
unmarkScratchInUse(scratch); unmarkScratchInUse(scratch);
return scratch->ret; return scratch->ret;
} }
// Errors from Hyperscan scan. Note Chimera could terminate
// Hyperscan callback on purpose so this is not counted as an error.
if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
assert(scratch->ret == CH_SUCCESS);
DEBUG_PRINTF("Hyperscan returned error %d\n", ret);
unmarkScratchInUse(scratch);
return ret;
}
} }
DEBUG_PRINTF("Flush priority queue\n"); DEBUG_PRINTF("Flush priority queue\n");

View File

@@ -17,10 +17,21 @@ if (BUILD_AVX512)
     endif ()
 endif ()

+if (BUILD_AVX512VBMI)
+    CHECK_C_COMPILER_FLAG(${ICELAKE_FLAG} HAS_ARCH_ICELAKE)
+    if (NOT HAS_ARCH_ICELAKE)
+        message (FATAL_ERROR "AVX512VBMI not supported by compiler")
+    endif ()
+endif ()
+
 if (FAT_RUNTIME)
     # test the highest level microarch to make sure everything works
     if (BUILD_AVX512)
+        if (BUILD_AVX512VBMI)
+            set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ICELAKE_FLAG}")
+        else ()
         set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}")
+        endif (BUILD_AVX512VBMI)
     else ()
         set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2")
     endif ()

@@ -80,6 +91,9 @@ if (FAT_RUNTIME)
     if (BUILD_AVX512 AND NOT HAVE_AVX512)
         message(FATAL_ERROR "AVX512 support requested but not supported")
     endif ()
+    if (BUILD_AVX512VBMI AND NOT HAVE_AVX512VBMI)
+        message(FATAL_ERROR "AVX512VBMI support requested but not supported")
+    endif ()
 else (NOT FAT_RUNTIME)
     if (NOT HAVE_AVX2)
         message(STATUS "Building without AVX2 support")

@@ -87,6 +101,9 @@ else (NOT FAT_RUNTIME)
     if (NOT HAVE_AVX512)
         message(STATUS "Building without AVX512 support")
     endif ()
+    if (NOT HAVE_AVX512VBMI)
+        message(STATUS "Building without AVX512VBMI support")
+    endif ()
     if (NOT HAVE_SSSE3)
         message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
     endif ()

View File

@@ -24,6 +24,9 @@
 /* Define if building AVX-512 in the fat runtime. */
 #cmakedefine BUILD_AVX512

+/* Define if building AVX512VBMI in the fat runtime. */
+#cmakedefine BUILD_AVX512VBMI
+
 /* Define to 1 if `backtrace' works. */
 #cmakedefine HAVE_BACKTRACE

View File

@@ -212,7 +212,7 @@ space is required for that context.
 In the absence of recursive scanning, only one such space is required per thread
 and can (and indeed should) be allocated before data scanning is to commence.

-In a scenario where a set of expressions are compiled by a single "master"
+In a scenario where a set of expressions are compiled by a single "main"
 thread and data will be scanned by multiple "worker" threads, the convenience
 function :c:func:`ch_clone_scratch` allows multiple copies of an existing
 scratch space to be made for each thread (rather than forcing the caller to pass
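A minimal sketch of that pattern, assuming ``db`` is an already-compiled ``ch_database_t`` and error checking is elided; only the ``ch_*`` calls come from the Chimera API:

    #include "ch.h"

    ch_scratch_t *proto = NULL;
    ch_alloc_scratch(db, &proto);        /* allocated once by the main thread */

    ch_scratch_t *clone = NULL;
    ch_clone_scratch(proto, &clone);     /* one clone per worker thread */
    /* ... each worker passes its own clone to ch_scan() ... */

    ch_free_scratch(clone);
    ch_free_scratch(proto);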

View File

@@ -64,21 +64,21 @@ interpreted independently. No syntax association happens between any adjacent
 characters.

 For example, given an expression written as :regexp:`/bc?/`. We could say it is
-a regluar expression, with the meaning that character ``b`` followed by nothing
+a regular expression, with the meaning that character ``b`` followed by nothing
 or by one character ``c``. On the other view, we could also say it is a pure
 literal expression, with the meaning that this is a character sequence of 3-byte
 length, containing characters ``b``, ``c`` and ``?``. In regular case, the
 question mark character ``?`` has a particular syntax role called 0-1 quantifier,
-which has an syntax association with the character ahead of it. Similar
-characters exist in regular grammer like ``[``, ``]``, ``(``, ``)``, ``{``,
+which has a syntax association with the character ahead of it. Similar
+characters exist in regular grammar like ``[``, ``]``, ``(``, ``)``, ``{``,
 ``}``, ``-``, ``*``, ``+``, ``\``, ``|``, ``/``, ``:``, ``^``, ``.``, ``$``.
 While in pure literal case, all these meta characters lost extra meanings
 expect for that they are just common ASCII codes.

 Hyperscan is initially designed to process common regular expressions. It is
-hence embedded with a complex parser to do comprehensive regular grammer
-interpretion. Particularly, the identification of above meta characters is the
-basic step for the interpretion of far more complex regular grammers.
+hence embedded with a complex parser to do comprehensive regular grammar
+interpretation. Particularly, the identification of above meta characters is the
+basic step for the interpretation of far more complex regular grammars.

 However in real cases, patterns may not always be regular expressions. They
 could just be pure literals. Problem will come if the pure literals contain

@@ -165,7 +165,7 @@ The following regex constructs are supported by Hyperscan:
   :regexp:`{n,}` are supported with limitations.

   * For arbitrary repeated sub-patterns: *n* and *m* should be either small
-    or infinite, e.g. :regexp:`(a|b}{4}`, :regexp:`(ab?c?d){4,10}` or
+    or infinite, e.g. :regexp:`(a|b){4}`, :regexp:`(ab?c?d){4,10}` or
     :regexp:`(ab(cd)*){6,}`.

   * For single-character width sub-patterns such as :regexp:`[^\\a]` or
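A hedged sketch of the pure-literal route described above, where ``hs_compile_lit()`` takes an explicit length and performs no regular grammar interpretation, so the ``?`` below is matched as a plain byte; the pattern is illustrative:

    #include <hs.h>

    hs_database_t *db = NULL;
    hs_compile_error_t *compile_err = NULL;
    const char lit[] = "bc?";                 /* 3 bytes: 'b', 'c', '?' */
    hs_error_t rc = hs_compile_lit(lit, 0, sizeof(lit) - 1, HS_MODE_BLOCK,
                                   NULL, &db, &compile_err);
    if (rc != HS_SUCCESS) {
        hs_free_compile_error(compile_err);   /* report/handle as needed */
    }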

View File

@@ -263,17 +263,19 @@ the current platform is supported by Hyperscan.
 As of this release, the variants of the runtime that are built, and the CPU
 capability that is required, are the following:

-+----------+-------------------------------+---------------------------+
-| Variant  | CPU Feature Flag(s) Required  | gcc arch flag             |
-+==========+===============================+===========================+
-| Core 2   | ``SSSE3``                     | ``-march=core2``          |
-+----------+-------------------------------+---------------------------+
-| Core i7  | ``SSE4_2`` and ``POPCNT``     | ``-march=corei7``         |
-+----------+-------------------------------+---------------------------+
-| AVX 2    | ``AVX2``                      | ``-march=core-avx2``      |
-+----------+-------------------------------+---------------------------+
-| AVX 512  | ``AVX512BW`` (see note below) | ``-march=skylake-avx512`` |
-+----------+-------------------------------+---------------------------+
++--------------+---------------------------------+---------------------------+
+| Variant      | CPU Feature Flag(s) Required    | gcc arch flag             |
++==============+=================================+===========================+
+| Core 2       | ``SSSE3``                       | ``-march=core2``          |
++--------------+---------------------------------+---------------------------+
+| Core i7      | ``SSE4_2`` and ``POPCNT``       | ``-march=corei7``         |
++--------------+---------------------------------+---------------------------+
+| AVX 2        | ``AVX2``                        | ``-march=core-avx2``      |
++--------------+---------------------------------+---------------------------+
+| AVX 512      | ``AVX512BW`` (see note below)   | ``-march=skylake-avx512`` |
++--------------+---------------------------------+---------------------------+
+| AVX 512 VBMI | ``AVX512VBMI`` (see note below) | ``-march=icelake-server`` |
++--------------+---------------------------------+---------------------------+

 .. note::

@@ -287,6 +289,16 @@ capability that is required, are the following:

       cmake -DBUILD_AVX512=on <...>

+   Hyperscan v5.3 adds support for AVX512VBMI instructions - in particular the
+   ``AVX512VBMI`` instruction set that was introduced on Intel "Icelake" Xeon
+   processors - however the AVX512VBMI runtime variant is **not** enabled by
+   default in fat runtime builds as not all toolchains support AVX512VBMI
+   instruction sets. To build an AVX512VBMI runtime, the CMake variable
+   ``BUILD_AVX512VBMI`` must be enabled manually during configuration. For
+   example: ::
+
+       cmake -DBUILD_AVX512VBMI=on <...>
+
 As the fat runtime requires compiler, libc, and binutils support, at this time
 it will only be enabled for Linux builds where the compiler supports the
 `indirect function "ifunc" function attribute
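At run time the fat runtime's dispatcher picks the widest variant the host supports on its own; a minimal guard an application might add before scanning, a sketch assuming only the public API:

    #include <stdio.h>
    #include <stdlib.h>
    #include <hs.h>

    /* hs_valid_platform() reports whether the host CPU meets Hyperscan's
     * minimum requirement (SSSE3); variant selection up to AVX512VBMI is
     * then handled automatically by the fat runtime. */
    if (hs_valid_platform() != HS_SUCCESS) {
        fprintf(stderr, "Hyperscan is not supported on this CPU\n");
        exit(1);
    }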

View File

@@ -178,7 +178,7 @@ space is required for that context.
 In the absence of recursive scanning, only one such space is required per thread
 and can (and indeed should) be allocated before data scanning is to commence.

-In a scenario where a set of expressions are compiled by a single "master"
+In a scenario where a set of expressions are compiled by a single "main"
 thread and data will be scanned by multiple "worker" threads, the convenience
 function :c:func:`hs_clone_scratch` allows multiple copies of an existing
 scratch space to be made for each thread (rather than forcing the caller to pass
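A hedged sketch of that pattern with POSIX threads; the worker body, thread count, and error handling are illustrative, only the ``hs_*`` calls come from the API:

    #include <pthread.h>
    #include <hs.h>

    #define NUM_WORKERS 4

    static void *worker(void *arg) {
        hs_scratch_t *scratch = arg;          /* this thread's private clone */
        /* ... hs_scan(db, data, len, 0, scratch, on_match, NULL) ... */
        hs_free_scratch(scratch);
        return NULL;
    }

    /* main thread: allocate one prototype scratch, clone it per worker */
    hs_scratch_t *proto = NULL;
    if (hs_alloc_scratch(db, &proto) == HS_SUCCESS) {
        pthread_t tid[NUM_WORKERS];
        for (int i = 0; i < NUM_WORKERS; i++) {
            hs_scratch_t *clone = NULL;
            hs_clone_scratch(proto, &clone);  /* error handling elided */
            pthread_create(&tid[i], NULL, worker, clone);
        }
        for (int i = 0; i < NUM_WORKERS; i++) {
            pthread_join(tid[i], NULL);
        }
        hs_free_scratch(proto);
    }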

View File

@@ -458,6 +458,9 @@ platform_t target_to_platform(const target_t &target_info) {
     if (!target_info.has_avx512()) {
         p |= HS_PLATFORM_NOAVX512;
     }
+    if (!target_info.has_avx512vbmi()) {
+        p |= HS_PLATFORM_NOAVX512VBMI;
+    }
     return p;
 }

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -115,7 +115,8 @@ static
hs_error_t db_check_platform(const u64a p) { hs_error_t db_check_platform(const u64a p) {
if (p != hs_current_platform if (p != hs_current_platform
&& p != (hs_current_platform | hs_current_platform_no_avx2) && p != (hs_current_platform | hs_current_platform_no_avx2)
&& p != (hs_current_platform | hs_current_platform_no_avx512)) { && p != (hs_current_platform | hs_current_platform_no_avx512)
&& p != (hs_current_platform | hs_current_platform_no_avx512vbmi)) {
return HS_DB_PLATFORM_ERROR; return HS_DB_PLATFORM_ERROR;
} }
// passed all checks // passed all checks
@ -370,9 +371,11 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat,
u8 minor = (version >> 16) & 0xff; u8 minor = (version >> 16) & 0xff;
u8 major = (version >> 24) & 0xff; u8 major = (version >> 24) & 0xff;
const char *features = (plat & HS_PLATFORM_NOAVX512) const char *features = (plat & HS_PLATFORM_NOAVX512VBMI)
? (plat & HS_PLATFORM_NOAVX512)
? (plat & HS_PLATFORM_NOAVX2) ? "" : "AVX2" ? (plat & HS_PLATFORM_NOAVX2) ? "" : "AVX2"
: "AVX512"; : "AVX512"
: "AVX512VBMI";
const char *mode = NULL; const char *mode = NULL;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -55,6 +55,7 @@ extern "C"
#define HS_PLATFORM_NOAVX2 (4<<13) #define HS_PLATFORM_NOAVX2 (4<<13)
#define HS_PLATFORM_NOAVX512 (8<<13) #define HS_PLATFORM_NOAVX512 (8<<13)
#define HS_PLATFORM_NOAVX512VBMI (0x10<<13)
/** \brief Platform features bitmask. */ /** \brief Platform features bitmask. */
typedef u64a platform_t; typedef u64a platform_t;
@ -66,6 +67,9 @@ const platform_t hs_current_platform = {
#endif #endif
#if !defined(HAVE_AVX512) #if !defined(HAVE_AVX512)
HS_PLATFORM_NOAVX512 | HS_PLATFORM_NOAVX512 |
#endif
#if !defined(HAVE_AVX512VBMI)
HS_PLATFORM_NOAVX512VBMI |
#endif #endif
0, 0,
}; };
@ -74,12 +78,20 @@ static UNUSED
const platform_t hs_current_platform_no_avx2 = { const platform_t hs_current_platform_no_avx2 = {
HS_PLATFORM_NOAVX2 | HS_PLATFORM_NOAVX2 |
HS_PLATFORM_NOAVX512 | HS_PLATFORM_NOAVX512 |
HS_PLATFORM_NOAVX512VBMI |
0, 0,
}; };
static UNUSED static UNUSED
const platform_t hs_current_platform_no_avx512 = { const platform_t hs_current_platform_no_avx512 = {
HS_PLATFORM_NOAVX512 | HS_PLATFORM_NOAVX512 |
HS_PLATFORM_NOAVX512VBMI |
0,
};
static UNUSED
const platform_t hs_current_platform_no_avx512vbmi = {
HS_PLATFORM_NOAVX512VBMI |
0, 0,
}; };

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -38,8 +38,14 @@
#define check_avx512() (0) #define check_avx512() (0)
#endif #endif
#if defined(DISABLE_AVX512VBMI_DISPATCH)
#define avx512vbmi_ disabled_
#define check_avx512vbmi() (0)
#endif
#define CREATE_DISPATCH(RTYPE, NAME, ...) \ #define CREATE_DISPATCH(RTYPE, NAME, ...) \
/* create defns */ \ /* create defns */ \
RTYPE JOIN(avx512vbmi_, NAME)(__VA_ARGS__); \
RTYPE JOIN(avx512_, NAME)(__VA_ARGS__); \ RTYPE JOIN(avx512_, NAME)(__VA_ARGS__); \
RTYPE JOIN(avx2_, NAME)(__VA_ARGS__); \ RTYPE JOIN(avx2_, NAME)(__VA_ARGS__); \
RTYPE JOIN(corei7_, NAME)(__VA_ARGS__); \ RTYPE JOIN(corei7_, NAME)(__VA_ARGS__); \
@ -52,6 +58,9 @@
\ \
/* resolver */ \ /* resolver */ \
static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \ static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \
if (check_avx512vbmi()) { \
return JOIN(avx512vbmi_, NAME); \
} \
if (check_avx512()) { \ if (check_avx512()) { \
return JOIN(avx512_, NAME); \ return JOIN(avx512_, NAME); \
} \ } \
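The resolver generated by this macro probes the strongest ISA first and falls through to the baseline. A toy standalone rendering of the same pattern; the ``*_scan`` function names are hypothetical, and the ``check_*()`` CPUID probes are assumed to behave like the ones this dispatcher uses:

    #include <stddef.h>

    typedef int (*scan_fn_t)(const char *buf, size_t len);

    /* hypothetical per-ISA implementations, compiled separately */
    int avx512vbmi_scan(const char *buf, size_t len);
    int avx512_scan(const char *buf, size_t len);
    int avx2_scan(const char *buf, size_t len);
    int core2_scan(const char *buf, size_t len);

    static scan_fn_t resolve_scan(void) {
        if (check_avx512vbmi()) {      /* newest ISA first */
            return avx512vbmi_scan;
        }
        if (check_avx512()) {
            return avx512_scan;
        }
        if (check_avx2()) {
            return avx2_scan;
        }
        return core2_scan;             /* SSSE3 baseline */
    }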

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -107,6 +107,25 @@ void dumpTeddyReinforced(const u8 *rmsk, const u32 num_tables, FILE *f) {
} }
} }
static
void dumpTeddyDupMasks(const u8 *dmsk, u32 numMasks, FILE *f) {
// dump nibble masks
u32 maskWidth = 2;
fprintf(f, " dup nibble masks:\n");
for (u32 i = 0; i < numMasks * 2; i++) {
fprintf(f, " -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
for (u32 j = 0; j < 16 * maskWidth * 2; j++) {
u8 val = dmsk[i * 16 * maskWidth * 2 + j];
for (u32 k = 0; k < 8; k++) {
fprintf(f, "%s", ((val >> k) & 0x1) ? "1" : "0");
}
fprintf(f, " ");
}
fprintf(f, "\n");
}
fprintf(f, "\n");
}
static static
void dumpTeddyMasks(const u8 *baseMsk, u32 numMasks, u32 maskWidth, FILE *f) { void dumpTeddyMasks(const u8 *baseMsk, u32 numMasks, u32 maskWidth, FILE *f) {
// dump nibble masks // dump nibble masks
@ -146,12 +165,17 @@ void dumpTeddy(const Teddy *teddy, FILE *f) {
u32 maskWidth = des->getNumBuckets() / 8; u32 maskWidth = des->getNumBuckets() / 8;
size_t headerSize = sizeof(Teddy); size_t headerSize = sizeof(Teddy);
size_t maskLen = des->numMasks * 16 * 2 * maskWidth;
const u8 *teddy_base = (const u8 *)teddy; const u8 *teddy_base = (const u8 *)teddy;
const u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize); const u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
const u8 *rmsk = baseMsk + ROUNDUP_CL(maskLen);
dumpTeddyMasks(baseMsk, des->numMasks, maskWidth, f); dumpTeddyMasks(baseMsk, des->numMasks, maskWidth, f);
dumpTeddyReinforced(rmsk, maskWidth, f); size_t maskLen = des->numMasks * 16 * 2 * maskWidth;
const u8 *rdmsk = baseMsk + ROUNDUP_CL(maskLen);
if (maskWidth == 1) { // reinforcement table in Teddy
dumpTeddyReinforced(rdmsk, maskWidth, f);
} else { // dup nibble mask table in Fat Teddy
assert(maskWidth == 2);
dumpTeddyDupMasks(rdmsk, des->numMasks, f);
}
dumpConfirms(teddy, teddy->confOffset, des->getNumBuckets(), f); dumpConfirms(teddy, teddy->confOffset, des->getNumBuckets(), f);
} }

View File

@@ -284,14 +284,6 @@ m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
 #define PREP_CONF_FN(val, n) \
     prep_conf_teddy_m##n(&lo_mask, dup_mask, sl_msk, val)

-const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
-    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
-    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f
-};
-
 #define TEDDY_VBMI_SL1_POS 15
 #define TEDDY_VBMI_SL2_POS 14
 #define TEDDY_VBMI_SL3_POS 13

View File

@@ -109,6 +109,36 @@ const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
 };

+#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy
+
+#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn) \
+do { \
+    if (unlikely(chunk != ones_u64a)) { \
+        chunk = ~chunk; \
+        conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \
+                &control, &last_match); \
+        CHECK_HWLM_TERMINATE_MATCHING; \
+    } \
+} while(0)
+
+#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn) \
+do { \
+    if (unlikely(chunk != ones_u32)) { \
+        chunk = ~chunk; \
+        conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \
+                &control, &last_match); \
+        CHECK_HWLM_TERMINATE_MATCHING; \
+    } \
+} while(0)
+
+static really_inline
+const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
+    return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
+                          + ROUNDUP_CL(2 * numMask * sizeof(m256)));
+}
+
+#else
+
 #define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn) \
 do { \
     if (unlikely(chunk != ones_u64a)) { \
@@ -134,203 +164,200 @@ const m256 *getMaskBase_fat(const struct Teddy *teddy) {
     return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
 }

-#if defined(HAVE_AVX512_REVERT) // revert to AVX2 Fat Teddy
-static really_inline
-const u64a *getReinforcedMaskBase_fat(const struct Teddy *teddy, u8 numMask) {
-    return (const u64a *)((const u8 *)getMaskBase_fat(teddy)
-                          + ROUNDUP_CL(2 * numMask * sizeof(m256)));
-}
+#endif
+
+#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy
+
+const u8 ALIGN_AVX_DIRECTIVE p_mask_interleave[64] = {
+    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+    8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+    24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+};

 #ifdef ARCH_64_BIT
-#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
+#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \
 do { \
     if (unlikely(diff512(var, ones512()))) { \
-        m512 swap = swap256in512(var); \
-        m512 r = interleave512lo(var, swap); \
+        m512 msk_interleave = load512(p_mask_interleave); \
+        m512 r = vpermb512(msk_interleave, var); \
         m128 r0 = extract128from512(r, 0); \
         m128 r1 = extract128from512(r, 1); \
+        m128 r2 = extract128from512(r, 2); \
+        m128 r3 = extract128from512(r, 3); \
         u64a part1 = movq(r0); \
         u64a part2 = extract64from128(r0, 1); \
-        u64a part5 = movq(r1); \
-        u64a part6 = extract64from128(r1, 1); \
-        r = interleave512hi(var, swap); \
-        r0 = extract128from512(r, 0); \
-        r1 = extract128from512(r, 1); \
-        u64a part3 = movq(r0); \
-        u64a part4 = extract64from128(r0, 1); \
-        u64a part7 = movq(r1); \
-        u64a part8 = extract64from128(r1, 1); \
-        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
-        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \
-        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \
-        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \
-        CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, conf_fn); \
-        CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, conf_fn); \
-        CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, conf_fn); \
-        CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, conf_fn); \
+        u64a part3 = movq(r1); \
+        u64a part4 = extract64from128(r1, 1); \
+        u64a part5 = movq(r2); \
+        u64a part6 = extract64from128(r2, 1); \
+        u64a part7 = movq(r3); \
+        u64a part8 = extract64from128(r3, 1); \
+        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, pt, conf_fn); \
     } \
 } while(0)
 #else
-#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
+#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \
 do { \
     if (unlikely(diff512(var, ones512()))) { \
-        m512 swap = swap256in512(var); \
-        m512 r = interleave512lo(var, swap); \
+        m512 msk_interleave = load512(p_mask_interleave); \
+        m512 r = vpermb512(msk_interleave, var); \
         m128 r0 = extract128from512(r, 0); \
         m128 r1 = extract128from512(r, 1); \
+        m128 r2 = extract128from512(r, 2); \
+        m128 r3 = extract128from512(r, 3); \
         u32 part1 = movd(r0); \
         u32 part2 = extract32from128(r0, 1); \
         u32 part3 = extract32from128(r0, 2); \
         u32 part4 = extract32from128(r0, 3); \
-        u32 part9 = movd(r1); \
-        u32 part10 = extract32from128(r1, 1); \
-        u32 part11 = extract32from128(r1, 2); \
-        u32 part12 = extract32from128(r1, 3); \
-        r = interleave512hi(var, swap); \
-        r0 = extract128from512(r, 0); \
-        r1 = extract128from512(r, 1); \
-        u32 part5 = movd(r0); \
-        u32 part6 = extract32from128(r0, 1); \
-        u32 part7 = extract32from128(r0, 2); \
-        u32 part8 = extract32from128(r0, 3); \
-        u32 part13 = movd(r1); \
-        u32 part14 = extract32from128(r1, 1); \
-        u32 part15 = extract32from128(r1, 2); \
-        u32 part16 = extract32from128(r1, 3); \
-        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, conf_fn); \
-        CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, conf_fn); \
+        u32 part5 = movd(r1); \
+        u32 part6 = extract32from128(r1, 1); \
+        u32 part7 = extract32from128(r1, 2); \
+        u32 part8 = extract32from128(r1, 3); \
+        u32 part9 = movd(r2); \
+        u32 part10 = extract32from128(r2, 1); \
+        u32 part11 = extract32from128(r2, 2); \
+        u32 part12 = extract32from128(r2, 3); \
+        u32 part13 = movd(r3); \
+        u32 part14 = extract32from128(r3, 1); \
+        u32 part15 = extract32from128(r3, 2); \
+        u32 part16 = extract32from128(r3, 3); \
+        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, pt, conf_fn); \
+        CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, pt, conf_fn);\
+        CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, pt, conf_fn);\
+        CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, pt, conf_fn);\
+        CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, pt, conf_fn);\
+        CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, pt, conf_fn);\
+        CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, pt, conf_fn);\
+        CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, pt, conf_fn);\
     } \
 } while(0)
 #endif

-static really_inline
-m512 vectoredLoad2x256(m512 *p_mask, const u8 *ptr, const size_t start_offset,
-                       const u8 *lo, const u8 *hi,
-                       const u8 *buf_history, size_t len_history,
-                       const u32 nMasks) {
-    m256 p_mask256;
-    m512 ret = set2x256(vectoredLoad256(&p_mask256, ptr, start_offset, lo, hi,
-                                        buf_history, len_history, nMasks));
-    *p_mask = set2x256(p_mask256);
-    return ret;
-}
-
-#define PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val) \
+#define PREP_FAT_SHUF_MASK \
     m512 lo = and512(val, *lo_mask); \
     m512 hi = and512(rshift64_m512(val, 4), *lo_mask)

-#define PREP_FAT_SHUF_MASK \
-    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(set2x256(load256(ptr))); \
-    *c_16 = *(ptr + 15); \
-    m512 r_msk = set512_64(0ULL, r_msk_base_hi[*c_16], \
-                           0ULL, r_msk_base_hi[*c_0], \
-                           0ULL, r_msk_base_lo[*c_16], \
-                           0ULL, r_msk_base_lo[*c_0]); \
-    *c_0 = *(ptr + 31)
+#define FAT_TEDDY_VBMI_PSHUFB_OR_M1 \
+    m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo), \
+                            pshufb_m512(dup_mask[1], hi));
+
+#define FAT_TEDDY_VBMI_PSHUFB_OR_M2 \
+    FAT_TEDDY_VBMI_PSHUFB_OR_M1 \
+    m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo), \
+                            pshufb_m512(dup_mask[3], hi));
+
+#define FAT_TEDDY_VBMI_PSHUFB_OR_M3 \
+    FAT_TEDDY_VBMI_PSHUFB_OR_M2 \
+    m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo), \
+                            pshufb_m512(dup_mask[5], hi));
+
+#define FAT_TEDDY_VBMI_PSHUFB_OR_M4 \
+    FAT_TEDDY_VBMI_PSHUFB_OR_M3 \
+    m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo), \
+                            pshufb_m512(dup_mask[7], hi));
+
+#define FAT_TEDDY_VBMI_SL1_MASK 0xfffffffefffffffeULL
+#define FAT_TEDDY_VBMI_SL2_MASK 0xfffffffcfffffffcULL
+#define FAT_TEDDY_VBMI_SL3_MASK 0xfffffff8fffffff8ULL
+
+#define FAT_TEDDY_VBMI_SHIFT_M1
+
+#define FAT_TEDDY_VBMI_SHIFT_M2 \
+    FAT_TEDDY_VBMI_SHIFT_M1 \
+    m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
+
+#define FAT_TEDDY_VBMI_SHIFT_M3 \
+    FAT_TEDDY_VBMI_SHIFT_M2 \
+    m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
+
+#define FAT_TEDDY_VBMI_SHIFT_M4 \
+    FAT_TEDDY_VBMI_SHIFT_M3 \
+    m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);

 #define FAT_SHIFT_OR_M1 \
-    or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi))
+    shuf_or_b0

 #define FAT_SHIFT_OR_M2 \
-    or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo), \
-                               pshufb_m512(dup_mask[3], hi)), \
-                         1), FAT_SHIFT_OR_M1)
+    or512(sl1, FAT_SHIFT_OR_M1)

 #define FAT_SHIFT_OR_M3 \
-    or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo), \
-                               pshufb_m512(dup_mask[5], hi)), \
-                         2), FAT_SHIFT_OR_M2)
+    or512(sl2, FAT_SHIFT_OR_M2)

 #define FAT_SHIFT_OR_M4 \
-    or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo), \
-                               pshufb_m512(dup_mask[7], hi)), \
-                         3), FAT_SHIFT_OR_M3)
+    or512(sl3, FAT_SHIFT_OR_M3)

 static really_inline
-m512 prep_conf_fat_teddy_no_reinforcement_m1(const m512 *lo_mask,
-                                             const m512 *dup_mask,
-                                             const m512 val) {
-    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
+m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
+                            UNUSED const m512 *sl_msk, const m512 val) {
+    PREP_FAT_SHUF_MASK;
+    FAT_TEDDY_VBMI_PSHUFB_OR_M1;
+    FAT_TEDDY_VBMI_SHIFT_M1;
     return FAT_SHIFT_OR_M1;
 }

 static really_inline
-m512 prep_conf_fat_teddy_no_reinforcement_m2(const m512 *lo_mask,
-                                             const m512 *dup_mask,
-                                             const m512 val) {
-    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
+m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
+                            const m512 *sl_msk, const m512 val) {
+    PREP_FAT_SHUF_MASK;
+    FAT_TEDDY_VBMI_PSHUFB_OR_M2;
+    FAT_TEDDY_VBMI_SHIFT_M2;
     return FAT_SHIFT_OR_M2;
 }

 static really_inline
-m512 prep_conf_fat_teddy_no_reinforcement_m3(const m512 *lo_mask,
-                                             const m512 *dup_mask,
-                                             const m512 val) {
-    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
+m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
+                            const m512 *sl_msk, const m512 val) {
+    PREP_FAT_SHUF_MASK;
+    FAT_TEDDY_VBMI_PSHUFB_OR_M3;
+    FAT_TEDDY_VBMI_SHIFT_M3;
     return FAT_SHIFT_OR_M3;
 }

 static really_inline
-m512 prep_conf_fat_teddy_no_reinforcement_m4(const m512 *lo_mask,
-                                             const m512 *dup_mask,
-                                             const m512 val) {
-    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
+m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
+                            const m512 *sl_msk, const m512 val) {
+    PREP_FAT_SHUF_MASK;
+    FAT_TEDDY_VBMI_PSHUFB_OR_M4;
+    FAT_TEDDY_VBMI_SHIFT_M4;
     return FAT_SHIFT_OR_M4;
 }

-static really_inline
-m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
-                            const u8 *ptr, const u64a *r_msk_base_lo,
-                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
-    PREP_FAT_SHUF_MASK;
-    return or512(FAT_SHIFT_OR_M1, r_msk);
-}
-
-static really_inline
-m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
-                            const u8 *ptr, const u64a *r_msk_base_lo,
-                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
-    PREP_FAT_SHUF_MASK;
-    return or512(FAT_SHIFT_OR_M2, r_msk);
-}
-
-static really_inline
-m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
-                            const u8 *ptr, const u64a *r_msk_base_lo,
-                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
-    PREP_FAT_SHUF_MASK;
-    return or512(FAT_SHIFT_OR_M3, r_msk);
-}
-
-static really_inline
-m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
-                            const u8 *ptr, const u64a *r_msk_base_lo,
-                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
-    PREP_FAT_SHUF_MASK;
-    return or512(FAT_SHIFT_OR_M4, r_msk);
-}
-
-#define PREP_CONF_FAT_FN_NO_REINFORCEMENT(val, n) \
-    prep_conf_fat_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
-
-#define PREP_CONF_FAT_FN(ptr, n) \
-    prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, ptr, \
-                             r_msk_base_lo, r_msk_base_hi, &c_0, &c_16)
+#define PREP_CONF_FAT_FN(val, n) \
+    prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, sl_msk, val)
+
+#define FAT_TEDDY_VBMI_SL1_POS 15
+#define FAT_TEDDY_VBMI_SL2_POS 14
+#define FAT_TEDDY_VBMI_SL3_POS 13
+
+#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1
+
+#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \
+    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1 \
+    sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS);
+
+#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \
+    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \
+    sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS);
+
+#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M4 \
+    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \
+    sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS);

 /*
  * In FAT teddy, it needs 2 bytes to represent result of each position,
@@ -355,31 +382,15 @@ m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
  * then do pshufb_m512(AABB, XYXY).
  */

-#define DUP_FAT_MASK(a) mask_set2x256(set2x256(swap128in256(a)), 0xC3, a)
-
-#define PREPARE_FAT_MASKS_1 \
-    dup_mask[0] = DUP_FAT_MASK(maskBase[0]); \
-    dup_mask[1] = DUP_FAT_MASK(maskBase[1]);
-
-#define PREPARE_FAT_MASKS_2 \
-    PREPARE_FAT_MASKS_1 \
-    dup_mask[2] = DUP_FAT_MASK(maskBase[2]); \
-    dup_mask[3] = DUP_FAT_MASK(maskBase[3]);
-
-#define PREPARE_FAT_MASKS_3 \
-    PREPARE_FAT_MASKS_2 \
-    dup_mask[4] = DUP_FAT_MASK(maskBase[4]); \
-    dup_mask[5] = DUP_FAT_MASK(maskBase[5]);
-
-#define PREPARE_FAT_MASKS_4 \
-    PREPARE_FAT_MASKS_3 \
-    dup_mask[6] = DUP_FAT_MASK(maskBase[6]); \
-    dup_mask[7] = DUP_FAT_MASK(maskBase[7]);
-
 #define PREPARE_FAT_MASKS(n) \
     m512 lo_mask = set64x8(0xf); \
-    m512 dup_mask[n * 2]; \
-    PREPARE_FAT_MASKS_##n
+    m512 sl_msk[n - 1]; \
+    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M##n
+
+#define FAT_TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffULL >> n_sh)
+#define FAT_TEDDY_VBMI_CONF_MASK_FULL ((0xffffffffULL << n_sh) & 0xffffffffULL)
+#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap)
+#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffULL >> (32 - n_sh))

 #define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \
 do { \
@@ -389,67 +400,53 @@ do { \
     const u8 *tryFloodDetect = a->firstFloodDetect; \
     u32 last_match = ones_u32; \
     const struct Teddy *teddy = (const struct Teddy *)fdr; \
-    const size_t iterBytes = 64; \
+    const size_t iterBytes = 32; \
+    u32 n_sh = n_msk - 1; \
+    const size_t loopBytes = 32 - n_sh; \
     DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
                  a->buf, a->len, a->start_offset); \
 \
-    const m256 *maskBase = getMaskBase_fat(teddy); \
+    const m512 *dup_mask = getDupMaskBase(teddy, n_msk); \
     PREPARE_FAT_MASKS(n_msk); \
     const u32 *confBase = getConfBase(teddy); \
 \
-    const u64a *r_msk_base_lo = getReinforcedMaskBase_fat(teddy, n_msk); \
-    const u64a *r_msk_base_hi = r_msk_base_lo + (N_CHARS + 1); \
-    u32 c_0 = 0x100; \
-    u32 c_16 = 0x100; \
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
-    if (ptr < mainStart) { \
-        ptr = mainStart - 32; \
-        m512 p_mask; \
-        m512 val_0 = vectoredLoad2x256(&p_mask, ptr, a->start_offset, \
-                                       a->buf, buf_end, \
-                                       a->buf_history, a->len_history, n_msk); \
-        m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \
-        r_0 = or512(r_0, p_mask); \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
-        ptr += 32; \
-    } \
-\
-    if (ptr + 32 <= buf_end) { \
-        m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
-        ptr += 32; \
-    } \
-\
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
-        __builtin_prefetch(ptr + (iterBytes * 4)); \
-        CHECK_FLOOD; \
-        m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
-        m512 r_1 = PREP_CONF_FAT_FN(ptr + 32, n_msk); \
-        CONFIRM_FAT_TEDDY(r_1, 16, 32, NOT_CAUTIOUS, conf_fn); \
-    } \
-\
-    if (ptr + 32 <= buf_end) { \
-        m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
-        ptr += 32; \
-    } \
-\
-    assert(ptr + 32 > buf_end); \
-    if (ptr < buf_end) { \
-        m512 p_mask; \
-        m512 val_0 = vectoredLoad2x256(&p_mask, ptr, 0, ptr, buf_end, \
-                                       a->buf_history, a->len_history, n_msk); \
-        m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \
-        r_0 = or512(r_0, p_mask); \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
+    u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL; \
+    m512 p_mask = set_mask_m512(~((k << 32) | k)); \
+    u32 overlap = 0; \
+    u64a patch = 0; \
+    if (likely(ptr + loopBytes <= buf_end)) { \
+        u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD; \
+        m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0)); \
+        m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr)), n_msk); \
+        r_0 = or512(r_0, p_mask0); \
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr, conf_fn); \
+        ptr += loopBytes; \
+        overlap = n_sh; \
+        patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH; \
+    } \
+\
+    for (; ptr + loopBytes <= buf_end; ptr += loopBytes) { \
+        CHECK_FLOOD; \
+        m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr - n_sh)), n_msk); \
+        r_0 = or512(r_0, p_mask); \
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn); \
+    } \
+\
+    assert(ptr + loopBytes > buf_end); \
+    if (ptr < buf_end) { \
+        u32 left = (u32)(buf_end - ptr); \
+        u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left); \
+        m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1)); \
+        m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap)); \
+        m512 r_0 = PREP_CONF_FAT_FN(val_0, n_msk); \
+        r_0 = or512(r_0, p_mask1); \
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr - overlap, conf_fn); \
     } \
 \
     return HWLM_SUCCESS; \
 } while(0)

-#else // HAVE_AVX512
+#else // !HAVE_AVX512VBMI, AVX2 normal fat teddy

 #ifdef ARCH_64_BIT
 #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \

@@ -659,7 +656,7 @@ do { \
     return HWLM_SUCCESS; \
 } while(0)

-#endif // HAVE_AVX512
+#endif // HAVE_AVX512VBMI

 hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
                                       const struct FDR_Runtime_Args *a,

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@@ -353,6 +353,89 @@ void fillReinforcedMsk(u8 *rmsk, u16 c, u32 j, u8 bmsk) {
     }
 }

+static
+void fillDupNibbleMasks(const map<BucketIndex,
+                                  vector<LiteralIndex>> &bucketToLits,
+                        const vector<hwlmLiteral> &lits,
+                        u32 numMasks, size_t maskLen,
+                        u8 *baseMsk) {
+    u32 maskWidth = 2;
+    memset(baseMsk, 0xff, maskLen);
+
+    for (const auto &b2l : bucketToLits) {
+        const u32 &bucket_id = b2l.first;
+        const vector<LiteralIndex> &ids = b2l.second;
+        const u8 bmsk = 1U << (bucket_id % 8);
+
+        for (const LiteralIndex &lit_id : ids) {
+            const hwlmLiteral &l = lits[lit_id];
+            DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
+            const u32 sz = verify_u32(l.s.size());
+
+            // fill in masks
+            for (u32 j = 0; j < numMasks; j++) {
+                const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8);
+                const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8);
+                const u32 lo_base0 = msk_id_lo * 32;
+                const u32 lo_base1 = msk_id_lo * 32 + 16;
+                const u32 hi_base0 = msk_id_hi * 32;
+                const u32 hi_base1 = msk_id_hi * 32 + 16;
+
+                // if we don't have a char at this position, fill in i
+                // locations in these masks with '1'
+                if (j >= sz) {
+                    for (u32 n = 0; n < 16; n++) {
+                        baseMsk[lo_base0 + n] &= ~bmsk;
+                        baseMsk[lo_base1 + n] &= ~bmsk;
+                        baseMsk[hi_base0 + n] &= ~bmsk;
+                        baseMsk[hi_base1 + n] &= ~bmsk;
+                    }
+                } else {
+                    u8 c = l.s[sz - 1 - j];
+                    // if we do have a char at this position
+                    const u32 hiShift = 4;
+                    u32 n_hi = (c >> hiShift) & 0xf;
+                    u32 n_lo = c & 0xf;
+
+                    if (j < l.msk.size() && l.msk[l.msk.size() - 1 - j]) {
+                        u8 m = l.msk[l.msk.size() - 1 - j];
+                        u8 m_hi = (m >> hiShift) & 0xf;
+                        u8 m_lo = m & 0xf;
+                        u8 cmp = l.cmp[l.msk.size() - 1 - j];
+                        u8 cmp_lo = cmp & 0xf;
+                        u8 cmp_hi = (cmp >> hiShift) & 0xf;
+
+                        for (u8 cm = 0; cm < 0x10; cm++) {
+                            if ((cm & m_lo) == (cmp_lo & m_lo)) {
+                                baseMsk[lo_base0 + cm] &= ~bmsk;
+                                baseMsk[lo_base1 + cm] &= ~bmsk;
+                            }
+                            if ((cm & m_hi) == (cmp_hi & m_hi)) {
+                                baseMsk[hi_base0 + cm] &= ~bmsk;
+                                baseMsk[hi_base1 + cm] &= ~bmsk;
+                            }
+                        }
+                    } else {
+                        if (l.nocase && ourisalpha(c)) {
+                            u32 cmHalfClear = (0xdf >> hiShift) & 0xf;
+                            u32 cmHalfSet = (0x20 >> hiShift) & 0xf;
+                            baseMsk[hi_base0 + (n_hi & cmHalfClear)] &= ~bmsk;
+                            baseMsk[hi_base1 + (n_hi & cmHalfClear)] &= ~bmsk;
+                            baseMsk[hi_base0 + (n_hi | cmHalfSet)] &= ~bmsk;
+                            baseMsk[hi_base1 + (n_hi | cmHalfSet)] &= ~bmsk;
+                        } else {
+                            baseMsk[hi_base0 + n_hi] &= ~bmsk;
+                            baseMsk[hi_base1 + n_hi] &= ~bmsk;
+                        }
+                        baseMsk[lo_base0 + n_lo] &= ~bmsk;
+                        baseMsk[lo_base1 + n_lo] &= ~bmsk;
+                    }
+                }
+            }
+        }
+    }
+}
+
 static
 void fillNibbleMasks(const map<BucketIndex,
                      vector<LiteralIndex>> &bucketToLits,
@@ -479,14 +562,17 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
     size_t headerSize = sizeof(Teddy);
     size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
-    size_t reinforcedMaskLen = RTABLE_SIZE * maskWidth;
+    size_t reinforcedDupMaskLen = RTABLE_SIZE * maskWidth;
+    if (maskWidth == 2) { // dup nibble mask table in Fat Teddy
+        reinforcedDupMaskLen = maskLen * 2;
+    }

     auto floodTable = setupFDRFloodControl(lits, eng, grey);
     auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small);

     // Note: we place each major structure here on a cacheline boundary.
     size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
-                  ROUNDUP_CL(reinforcedMaskLen) +
+                  ROUNDUP_CL(reinforcedDupMaskLen) +
                   ROUNDUP_CL(confirmTable.size()) + floodTable.size();

     auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);

@@ -502,7 +588,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
     // Write confirm structures.
     u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
-              ROUNDUP_CL(reinforcedMaskLen);
+              ROUNDUP_CL(reinforcedDupMaskLen);
     assert(ISALIGNED_CL(ptr));
     teddy->confOffset = verify_u32(ptr - teddy_base);
     memcpy(ptr, confirmTable.get(), confirmTable.size());

@@ -519,9 +605,16 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
     fillNibbleMasks(bucketToLits, lits, eng.numMasks, maskWidth, maskLen,
                     baseMsk);

+    if (maskWidth == 1) { // reinforcement table in Teddy
     // Write reinforcement masks.
     u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen);
     fillReinforcedTable(bucketToLits, lits, reinforcedMsk, maskWidth);
+    } else { // dup nibble mask table in Fat Teddy
+        assert(maskWidth == 2);
+        u8 *dupMsk = baseMsk + ROUNDUP_CL(maskLen);
+        fillDupNibbleMasks(bucketToLits, lits, eng.numMasks,
+                           reinforcedDupMaskLen, dupMsk);
+    }

     return fdr;
 }

View File

@@ -45,6 +45,16 @@ extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
 extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
 #endif

+#if defined(HAVE_AVX512VBMI)
+static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f
+};
+#endif
+
 #ifdef ARCH_64_BIT
 #define TEDDY_CONF_TYPE u64a
 #define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2019, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -120,9 +120,10 @@ bool checkMode(unsigned int mode, hs_compile_error **comp_error) {
static static
bool checkPlatform(const hs_platform_info *p, hs_compile_error **comp_error) { bool checkPlatform(const hs_platform_info *p, hs_compile_error **comp_error) {
static constexpr u32 HS_TUNE_LAST = HS_TUNE_FAMILY_GLM; static constexpr u32 HS_TUNE_LAST = HS_TUNE_FAMILY_ICX;
static constexpr u32 HS_CPU_FEATURES_ALL = static constexpr u32 HS_CPU_FEATURES_ALL =
HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512; HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512 |
HS_CPU_FEATURES_AVX512VBMI;
if (!p) { if (!p) {
return true; return true;

View File

@@ -42,7 +42,7 @@
 /* The current Hyperscan version information. */

 #define HS_MAJOR 5
-#define HS_MINOR 3
+#define HS_MINOR 4
 #define HS_PATCH 0

 #include "hs_compile.h"

View File

@@ -1034,6 +1034,15 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform);
 */
#define HS_CPU_FEATURES_AVX512 (1ULL << 3)
/**
* CPU features flag - Intel(R) Advanced Vector Extensions 512
* Vector Byte Manipulation Instructions (Intel(R) AVX512VBMI)
*
* Setting this flag indicates that the target platform supports AVX512VBMI
* instructions. Using AVX512VBMI implies the use of AVX512.
*/
#define HS_CPU_FEATURES_AVX512VBMI (1ULL << 4)
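/*
 * Hedged usage sketch (not part of this diff): checking the running host
 * for this feature before selecting a VBMI-compiled database.
 *
 *     hs_platform_info_t host;
 *     if (hs_populate_platform(&host) == HS_SUCCESS &&
 *         (host.cpu_features & HS_CPU_FEATURES_AVX512VBMI)) {
 *         // the host can run databases compiled with this feature bit
 *     }
 */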
/** @} */
/**
@@ -1114,6 +1123,22 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform);
 */
#define HS_TUNE_FAMILY_GLM 8
/**
* Tuning Parameter - Intel(R) microarchitecture code name Icelake
*
* This indicates that the compiled database should be tuned for the
* Icelake microarchitecture.
*/
#define HS_TUNE_FAMILY_ICL 9
/**
* Tuning Parameter - Intel(R) microarchitecture code name Icelake Server
*
* This indicates that the compiled database should be tuned for the
* Icelake Server microarchitecture.
*/
#define HS_TUNE_FAMILY_ICX 10
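/*
 * Hedged usage sketch (not part of this diff): cross-compiling a database
 * for an Icelake Server target from another host; the pattern is
 * illustrative.
 *
 *     hs_platform_info_t plat;
 *     memset(&plat, 0, sizeof(plat));
 *     plat.tune = HS_TUNE_FAMILY_ICX;
 *     plat.cpu_features = HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512 |
 *                         HS_CPU_FEATURES_AVX512VBMI;
 *     hs_database_t *db = NULL;
 *     hs_compile_error_t *err = NULL;
 *     if (hs_compile("foo[0-9]+", 0, HS_MODE_BLOCK, &plat, &db,
 *                    &err) != HS_SUCCESS) {
 *         hs_free_compile_error(err);
 *     }
 */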
/** @} */
/**


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -85,6 +85,18 @@ namespace ue2 {
 */
static constexpr u32 NO_STATE = ~0;
/* Maximum number of states taken as a small NFA */
static constexpr u32 MAX_SMALL_NFA_STATES = 64;
/* Maximum bounded repeat upper bound to consider as a fast NFA */
static constexpr u64a MAX_REPEAT_SIZE = 200;
/* Maximum bounded repeat char reach size to consider as a fast NFA */
static constexpr u32 MAX_REPEAT_CHAR_REACH = 26;
/* Minimum bounded repeat trigger distance to consider as a fast NFA */
static constexpr u8 MIN_REPEAT_TRIGGER_DISTANCE = 6;
namespace {
struct precalcAccel {
@@ -1910,7 +1922,8 @@ struct Factory {
}
static
-void writeExceptions(const map<ExceptionProto, vector<u32>> &exceptionMap,
+void writeExceptions(const build_info &args,
+                     const map<ExceptionProto, vector<u32>> &exceptionMap,
                     const vector<u32> &repeatOffsets, implNFA_t *limex,
                     const u32 exceptionsOffset,
                     const u32 reportListOffset) {
@@ -1962,6 +1975,59 @@ struct Factory {
    limex->exceptionOffset = exceptionsOffset;
    limex->exceptionCount = ecount;
if (args.num_states > 64 && args.cc.target_info.has_avx512vbmi()) {
const u8 *exceptionMask = (const u8 *)(&limex->exceptionMask);
u8 *shufMask = (u8 *)&limex->exceptionShufMask;
u8 *bitMask = (u8 *)&limex->exceptionBitMask;
u8 *andMask = (u8 *)&limex->exceptionAndMask;
u32 tot_cnt = 0;
u32 pos = 0;
bool valid = true;
size_t tot = sizeof(limex->exceptionMask);
size_t base = 0;
// We normally have up to 64 exceptions to handle,
// but treat 384 state Limex differently to simplify operations
size_t limit = 64;
if (args.num_states > 256 && args.num_states <= 384) {
limit = 48;
}
for (size_t i = 0; i < tot; i++) {
if (!exceptionMask[i]) {
continue;
}
u32 bit_cnt = popcount32(exceptionMask[i]);
tot_cnt += bit_cnt;
if (tot_cnt > limit) {
valid = false;
break;
}
u32 emsk = exceptionMask[i];
while (emsk) {
u32 t = findAndClearLSB_32(&emsk);
bitMask[pos] = 1U << t;
andMask[pos] = 1U << t;
shufMask[pos++] = i + base;
if (pos == 32 &&
(args.num_states > 128 && args.num_states <= 256)) {
base += 32;
}
}
}
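        /* Worked example: if exceptionMask byte i is 0x06 (bits 1 and 2
         * set), the loop above emits two entries: shufMask gets
         * {i + base, i + base} and bitMask/andMask get {0x02, 0x04}. */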
// Avoid matching unused bytes
for (u32 i = pos; i < 64; i++) {
bitMask[i] = 0xff;
}
if (valid) {
setLimexFlag(limex, LIMEX_FLAG_EXTRACT_EXP);
}
}
}
static
@@ -2287,7 +2353,7 @@ struct Factory {
        writeRepeats(repeats, repeatOffsets, limex, repeatOffsetsOffset,
                     repeatsOffset);
-        writeExceptions(exceptionMap, repeatOffsets, limex, exceptionsOffset,
+        writeExceptions(args, exceptionMap, repeatOffsets, limex, exceptionsOffset,
                        reportListOffset);
        writeLimexMasks(args, limex);
@@ -2422,6 +2488,68 @@ bool isSane(const NGHolder &h, const map<u32, set<NFAVertex>> &tops,
}
#endif // NDEBUG
static
bool isFast(const build_info &args) {
const NGHolder &h = args.h;
const u32 num_states = args.num_states;
if (num_states > MAX_SMALL_NFA_STATES) {
return false;
}
unordered_map<NFAVertex, bool> pos_trigger;
for (u32 i = 0; i < args.repeats.size(); i++) {
const BoundedRepeatData &br = args.repeats[i];
assert(!contains(pos_trigger, br.pos_trigger));
pos_trigger[br.pos_trigger] = br.repeatMax <= MAX_REPEAT_SIZE;
}
// Small NFA without bounded repeat should be fast.
if (pos_trigger.empty()) {
return true;
}
vector<NFAVertex> cur;
unordered_set<NFAVertex> visited;
for (const auto &m : args.tops) {
for (NFAVertex v : m.second) {
cur.push_back(v);
visited.insert(v);
}
}
u8 pos_dist = 0;
while (!cur.empty()) {
vector<NFAVertex> next;
for (const auto &v : cur) {
if (contains(pos_trigger, v)) {
const CharReach &cr = h[v].char_reach;
if (!pos_trigger[v] && cr.count() > MAX_REPEAT_CHAR_REACH) {
return false;
}
}
for (const auto &w : adjacent_vertices_range(v, h)) {
if (w == v) {
continue;
}
u32 j = args.state_ids.at(w);
if (j == NO_STATE) {
continue;
}
if (!contains(visited, w)) {
next.push_back(w);
visited.insert(w);
}
}
}
if (++pos_dist >= MIN_REPEAT_TRIGGER_DISTANCE) {
break;
}
swap(cur, next);
}
return true;
}
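/* Hedged note: the resulting flag is surfaced through generate()'s new
 * bool &fast out-parameter (see the header change below); per the changelog
 * entry "Optimization on decision of NFA/DFA generation", this appears to
 * feed the engine-selection heuristics. */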
static
u32 max_state(const unordered_map<NFAVertex, u32> &state_ids) {
    u32 rv = 0;
@@ -2442,7 +2570,7 @@ bytecode_ptr<NFA> generate(NGHolder &h,
                           const unordered_map<NFAVertex, NFAStateSet> &squashMap,
                           const map<u32, set<NFAVertex>> &tops,
                           const set<NFAVertex> &zombies, bool do_accel,
-                           bool stateCompression, u32 hint,
+                           bool stateCompression, bool &fast, u32 hint,
                           const CompileContext &cc) {
    const u32 num_states = max_state(states) + 1;
    DEBUG_PRINTF("total states: %u\n", num_states);
@@ -2497,6 +2625,7 @@ bytecode_ptr<NFA> generate(NGHolder &h,
    if (nfa) {
        DEBUG_PRINTF("successful build with NFA engine: %s\n",
                     nfa_type_name(limex_model));
        fast = isFast(arg);
        return nfa;
    }
}


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -78,6 +78,7 @@ bytecode_ptr<NFA> generate(NGHolder &g,
                           const std::set<NFAVertex> &zombies,
                           bool do_accel,
                           bool stateCompression,
                           bool &fast,
                           u32 hint,
                           const CompileContext &cc);


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -47,6 +47,8 @@
#define AND_STATE JOIN(and_, STATE_T)
#define EQ_STATE(a, b) (!JOIN(noteq_, STATE_T)((a), (b)))
#define OR_STATE JOIN(or_, STATE_T)
#define EXPAND_STATE JOIN(expand_, STATE_T)
#define SHUFFLE_BYTE_STATE JOIN(shuffle_byte_, STATE_T)
#define TESTBIT_STATE JOIN(testbit_, STATE_T)
#define EXCEPTION_T JOIN(struct NFAException, SIZE)
#define CONTEXT_T JOIN(NFAContext, SIZE)
@@ -208,7 +210,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG,
/** \brief Process all of the exceptions associated with the states in the \a
 * estate. */
static really_inline
-int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
+int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ,
          const struct IMPL_NFA_T *limex, const EXCEPTION_T *exceptions,
          u64a offset, struct CONTEXT_T *ctx, char in_rev, char flags) {
    assert(diffmask > 0); // guaranteed by caller macro
@@ -233,6 +235,30 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
    ctx->local_succ = ZERO_STATE;
#endif
struct proto_cache new_cache = {0, NULL};
enum CacheResult cacheable = CACHE_RESULT;
#if defined(HAVE_AVX512VBMI) && SIZE > 64
if (likely(limex->flags & LIMEX_FLAG_EXTRACT_EXP)) {
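        /* Single-pass exception enumeration: EXPAND_STATE widens each state
         * bit into a byte, a VBMI byte shuffle gathers the bytes that can
         * hold exception bits, the and-mask isolates the bit of interest,
         * and a 64-wide byte compare against the expected bit pattern yields
         * one word enumerating every pending exception, replacing the
         * chunk-by-chunk scan in the fallback path below. */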
m512 emask = EXPAND_STATE(*STATE_ARG_P);
emask = SHUFFLE_BYTE_STATE(load_m512(&limex->exceptionShufMask), emask);
emask = and512(emask, load_m512(&limex->exceptionAndMask));
u64a word = eq512mask(emask, load_m512(&limex->exceptionBitMask));
do {
u32 bit = FIND_AND_CLEAR_FN(&word);
const EXCEPTION_T *e = &exceptions[bit];
if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ,
#ifndef BIG_MODEL
&local_succ,
#endif
limex, offset, ctx, &new_cache, &cacheable,
in_rev, flags)) {
return PE_RV_HALT;
}
} while (word);
} else {
        // A copy of the estate as an array of GPR-sized chunks.
        CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
        CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
@@ -243,9 +269,6 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
#endif
    memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T));
-    struct proto_cache new_cache = {0, NULL};
-    enum CacheResult cacheable = CACHE_RESULT;
    u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)];
    base_index[0] = 0;
    for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) {
@@ -276,6 +299,49 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
        }
    } while (word);
    } while (diffmask);
}
#else
// A copy of the estate as an array of GPR-sized chunks.
CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
#ifdef ESTATE_ON_STACK
memcpy(chunks, &estate, sizeof(STATE_T));
#else
memcpy(chunks, estatep, sizeof(STATE_T));
#endif
memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T));
u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)];
base_index[0] = 0;
for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) {
base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]);
}
do {
u32 t = findAndClearLSB_32(&diffmask);
#ifdef ARCH_64_BIT
t >>= 1; // Due to diffmask64, which leaves holes in the bitmask.
#endif
assert(t < ARRAY_LENGTH(chunks));
CHUNK_T word = chunks[t];
assert(word != 0);
do {
u32 bit = FIND_AND_CLEAR_FN(&word);
u32 local_index = RANK_IN_MASK_FN(emask_chunks[t], bit);
u32 idx = local_index + base_index[t];
const EXCEPTION_T *e = &exceptions[idx];
if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ,
#ifndef BIG_MODEL
&local_succ,
#endif
limex, offset, ctx, &new_cache, &cacheable,
in_rev, flags)) {
return PE_RV_HALT;
}
} while (word);
} while (diffmask);
#endif
#ifndef BIG_MODEL
    *succ = OR_STATE(*succ, local_succ);
@@ -307,6 +373,8 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
#undef AND_STATE
#undef EQ_STATE
#undef OR_STATE
#undef EXPAND_STATE
#undef SHUFFLE_BYTE_STATE
#undef TESTBIT_STATE
#undef PE_FN
#undef RUN_EXCEPTION_FN


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -86,6 +86,7 @@
#define LIMEX_FLAG_COMPRESS_STATE 1 /**< pack state into stream state */
#define LIMEX_FLAG_COMPRESS_MASKED 2 /**< use reach mask-based compression */
#define LIMEX_FLAG_CANNOT_DIE 4 /**< limex cannot have no states on */
#define LIMEX_FLAG_EXTRACT_EXP 8 /**< use limex exception bit extraction */
enum LimExTrigger {
    LIMEX_TRIGGER_NONE = 0,
@@ -157,6 +158,9 @@ struct LimExNFA##size { \
    u_##size shift[MAX_SHIFT_COUNT]; \
    u32 shiftCount; /**< number of shift masks used */ \
    u8 shiftAmount[MAX_SHIFT_COUNT]; /**< shift amount for each mask */ \
m512 exceptionShufMask; /**< exception byte shuffle mask */ \
m512 exceptionBitMask; /**< exception bit mask */ \
m512 exceptionAndMask; /**< exception and mask */ \
};
CREATE_NFA_LIMEX(32)


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2018, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -1477,6 +1477,7 @@ bytecode_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat,
    bytecode_ptr<NFA> nfa;
    if (!using8bit) {
        // Wide state optimization
        if (cc.grey.allowWideStates && strat.getType() == McClellan
            && !is_triggered(raw.kind)) {
            find_wide_state(info);
@@ -1486,6 +1487,8 @@ bytecode_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat,
        bool any_cyclic_near_anchored_state
            = is_cyclic_near(raw, raw.start_anchored);
// Sherman optimization
if (info.impl_alpha_size > 16) {
        for (u32 i = 0; i < info.size(); i++) {
            if (info.is_widestate(i)) {
                continue;
@@ -1499,6 +1502,7 @@ bytecode_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat,
            DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy,
                         info.size() * info.impl_alpha_size, info.size(),
                         info.impl_alpha_size);
        }
        nfa = mcclellanCompile16(info, cc, accel_states);
    } else {

File diff suppressed because it is too large


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -80,5 +80,78 @@ char nfaExecMcSheng16_expandState(const struct NFA *nfa, void *dest,
#define nfaExecMcSheng16_B_Reverse NFA_API_NO_IMPL
#define nfaExecMcSheng16_zombie_status NFA_API_ZOMBIE_NO_IMPL
#if defined(HAVE_AVX512VBMI)
/* 64-8 bit Sheng-McClellan hybrid */
char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state,
const char *streamState, u64a offset,
NfaCallback callback, void *context);
char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end);
char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end);
char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report);
char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report,
struct mq *q);
char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_8_queueInitState(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_8_initCompressedState(const struct NFA *n, u64a offset,
void *state, u8 key);
char nfaExecMcSheng64_8_queueCompressState(const struct NFA *nfa,
const struct mq *q, s64a loc);
char nfaExecMcSheng64_8_expandState(const struct NFA *nfa, void *dest,
const void *src, u64a offset, u8 key);
#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL
/* 64-16 bit Sheng-McClellan hybrid */
char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state,
const char *streamState, u64a offset,
NfaCallback callback, void *context);
char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end);
char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end);
char nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report);
char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report,
struct mq *q);
char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_16_queueInitState(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_16_initCompressedState(const struct NFA *n, u64a offset,
void *state, u8 key);
char nfaExecMcSheng64_16_queueCompressState(const struct NFA *nfa,
const struct mq *q, s64a loc);
char nfaExecMcSheng64_16_expandState(const struct NFA *nfa, void *dest,
const void *src, u64a offset, u8 key);
#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL
#else // !HAVE_AVX512VBMI
#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL
#define nfaExecMcSheng64_8_Q NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_Q2 NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_QR NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_inAccept NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_inAnyAccept NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_queueInitState NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_queueCompressState NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_expandState NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_initCompressedState NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_testEOD NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_reportCurrent NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL
#define nfaExecMcSheng64_16_Q NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_Q2 NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_QR NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_inAccept NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_inAnyAccept NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_queueInitState NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_queueCompressState NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_expandState NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_initCompressedState NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_testEOD NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_reportCurrent NFA_API_NO_IMPL
#endif //end of HAVE_AVX512VBMI
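/* Hedged note: the NFA_API_NO_IMPL stubs above keep the engine dispatch
 * tables (see the DISPATCH_CASE additions further below) linkable on builds
 * without AVX512VBMI support. */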
#endif


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2017, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -64,7 +64,6 @@
#include <set>
#include <deque>
#include <vector>
#include <boost/range/adaptor/map.hpp>
using namespace std;
@@ -244,6 +243,106 @@ void populateBasicInfo(size_t state_size, const dfa_info &info,
    }
}
static
mstate_aux *getAux64(NFA *n, dstate_id_t i) {
mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(n);
mstate_aux *aux_base = (mstate_aux *)((char *)n + m->aux_offset);
mstate_aux *aux = aux_base + i;
assert((const char *)aux < (const char *)n + m->length);
return aux;
}
static
void createShuffleMasks64(mcsheng64 *m, const dfa_info &info,
dstate_id_t sheng_end,
const map<dstate_id_t, AccelScheme> &accel_escape_info) {
DEBUG_PRINTF("using first %hu states for a sheng\n", sheng_end);
assert(sheng_end > DEAD_STATE + 1);
assert(sheng_end <= sizeof(m512) + 1);
vector<array<u8, sizeof(m512)>> masks;
masks.resize(info.alpha_size);
/* -1 to avoid wasting a slot as we do not include dead state */
vector<dstate_id_t> raw_ids;
raw_ids.resize(sheng_end - 1);
for (dstate_id_t s = DEAD_STATE + 1; s < info.states.size(); s++) {
assert(info.implId(s)); /* should not map to DEAD_STATE */
if (info.is_sheng(s)) {
raw_ids[info.extra[s].sheng_id] = s;
}
}
for (u32 i = 0; i < info.alpha_size; i++) {
if (i == info.alpha_remap[TOP]) {
continue;
}
auto &mask = masks[i];
assert(sizeof(mask) == sizeof(m512));
mask.fill(0);
for (dstate_id_t sheng_id = 0; sheng_id < sheng_end - 1; sheng_id++) {
dstate_id_t raw_id = raw_ids[sheng_id];
dstate_id_t next_id = info.implId(info.states[raw_id].next[i]);
if (next_id == DEAD_STATE) {
next_id = sheng_end - 1;
} else if (next_id < sheng_end) {
next_id--;
}
DEBUG_PRINTF("%hu: %u->next %hu\n", sheng_id, i, next_id);
mask[sheng_id] = verify_u8(next_id);
}
}
for (u32 i = 0; i < N_CHARS; i++) {
assert(info.alpha_remap[i] != info.alpha_remap[TOP]);
memcpy((u8 *)&m->sheng_succ_masks[i],
(u8 *)masks[info.alpha_remap[i]].data(), sizeof(m512));
}
m->sheng_end = sheng_end;
m->sheng_accel_limit = sheng_end - 1;
for (dstate_id_t s : raw_ids) {
if (contains(accel_escape_info, s)) {
LIMIT_TO_AT_MOST(&m->sheng_accel_limit, info.extra[s].sheng_id);
}
}
}
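/* Hedged sketch of how these masks are consumed: at scan time the sheng
 * portion advances with one byte shuffle per input character, along the
 * lines of next = vpermb(sheng_succ_masks[c], current). The pseudo-intrinsic
 * is illustrative only; the mcsheng.c runtime diff is suppressed above, so
 * the exact call is not shown here. */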
static
void populateBasicInfo64(size_t state_size, const dfa_info &info,
u32 total_size, u32 aux_offset, u32 accel_offset,
u32 accel_count, ReportID arb, bool single, NFA *nfa) {
assert(state_size == sizeof(u16) || state_size == sizeof(u8));
nfa->length = total_size;
nfa->nPositions = info.states.size();
nfa->scratchStateSize = verify_u32(state_size);
nfa->streamStateSize = verify_u32(state_size);
if (state_size == sizeof(u8)) {
nfa->type = MCSHENG_64_NFA_8;
} else {
nfa->type = MCSHENG_64_NFA_16;
}
mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa);
for (u32 i = 0; i < 256; i++) {
m->remap[i] = verify_u8(info.alpha_remap[i]);
}
m->alphaShift = info.getAlphaShift();
m->length = total_size;
m->aux_offset = aux_offset;
m->accel_offset = accel_offset;
m->arb_report = arb;
m->state_count = verify_u16(info.size());
m->start_anchored = info.implId(info.raw.start_anchored);
m->start_floating = info.implId(info.raw.start_floating);
m->has_accel = accel_count ? 1 : 0;
if (single) {
m->flags |= MCSHENG_FLAG_SINGLE;
}
}
static
size_t calcShermanRegionSize(const dfa_info &info) {
    size_t rv = 0;
@@ -382,6 +481,7 @@ CharReach get_edge_reach(dstate_id_t u, dstate_id_t v, const dfa_info &info) {
}
#define MAX_SHENG_STATES 16
#define MAX_SHENG64_STATES 64
#define MAX_SHENG_LEAKINESS 0.05
using LeakinessCache = ue2_unordered_map<pair<RdfaVertex, u32>, double>;
@@ -435,7 +535,8 @@ double leakiness(const RdfaGraph &g, dfa_info &info,
static
dstate_id_t find_sheng_states(dfa_info &info,
-                              map<dstate_id_t, AccelScheme> &accel_escape_info) {
+                              map<dstate_id_t, AccelScheme> &accel_escape_info,
+                              size_t max_sheng_states) {
    RdfaGraph g(info.raw);
    auto cyclics = find_vertices_in_cycles(g);
@@ -470,7 +571,7 @@ dstate_id_t find_sheng_states(dfa_info &info,
    flat_set<dstate_id_t> considered = { DEAD_STATE };
    bool seen_back_edge = false;
    while (!to_consider.empty()
-           && sheng_states.size() < MAX_SHENG_STATES) {
+           && sheng_states.size() < max_sheng_states) {
        auto v = to_consider.front();
        to_consider.pop_front();
        if (!considered.insert(g[v].index).second) {
@@ -616,6 +717,80 @@ void fill_in_succ_table_16(NFA *nfa, const dfa_info &info,
    }
}
static
void fill_in_aux_info64(NFA *nfa, const dfa_info &info,
const map<dstate_id_t, AccelScheme> &accel_escape_info,
u32 accel_offset, UNUSED u32 accel_end_offset,
const vector<u32> &reports,
const vector<u32> &reports_eod,
u32 report_base_offset,
const raw_report_info &ri) {
mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa);
vector<u32> reportOffsets;
ri.fillReportLists(nfa, report_base_offset, reportOffsets);
for (u32 i = 0; i < info.size(); i++) {
u16 impl_id = info.implId(i);
mstate_aux *this_aux = getAux64(nfa, impl_id);
fillInAux(this_aux, i, info, reports, reports_eod, reportOffsets);
if (contains(accel_escape_info, i)) {
this_aux->accel_offset = accel_offset;
accel_offset += info.strat.accelSize();
assert(accel_offset <= accel_end_offset);
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
info.strat.buildAccel(i, accel_escape_info.at(i),
(void *)((char *)m + this_aux->accel_offset));
}
}
}
static
u16 get_edge_flags64(NFA *nfa, dstate_id_t target_impl_id) {
mstate_aux *aux = getAux64(nfa, target_impl_id);
u16 flags = 0;
if (aux->accept) {
flags |= ACCEPT_FLAG;
}
if (aux->accel_offset) {
flags |= ACCEL_FLAG;
}
return flags;
}
static
void fill_in_succ_table_64_16(NFA *nfa, const dfa_info &info,
dstate_id_t sheng_end,
UNUSED dstate_id_t sherman_base) {
u16 *succ_table = (u16 *)((char *)nfa + sizeof(NFA) + sizeof(mcsheng64));
u8 alphaShift = info.getAlphaShift();
assert(alphaShift <= 8);
for (size_t i = 0; i < info.size(); i++) {
if (!info.is_normal(i)) {
assert(info.implId(i) < sheng_end || info.is_sherman(i));
continue;
}
assert(info.implId(i) < sherman_base);
u16 normal_id = verify_u16(info.implId(i) - sheng_end);
for (size_t s = 0; s < info.impl_alpha_size; s++) {
dstate_id_t raw_succ = info.states[i].next[s];
u16 &entry = succ_table[((size_t)normal_id << alphaShift) + s];
entry = info.implId(raw_succ);
entry |= get_edge_flags64(nfa, entry);
}
}
}
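/* Note: each entry above carries the successor's impl state id with
 * ACCEPT_FLAG / ACCEL_FLAG OR'd into its high bits via get_edge_flags64();
 * readers mask them off again with STATE_MASK, as next_states64() in the
 * dump code below does. */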
#define MAX_SHERMAN_LIST_LEN 8
static
@@ -842,6 +1017,8 @@ bytecode_ptr<NFA> mcshengCompile16(dfa_info &info, dstate_id_t sheng_end,
    assert(info.getAlphaShift() <= 8);
// Sherman optimization
if (info.impl_alpha_size > 16) {
        u16 total_daddy = 0;
        for (u32 i = 0; i < info.size(); i++) {
            find_better_daddy(info, i,
@@ -853,6 +1030,7 @@ bytecode_ptr<NFA> mcshengCompile16(dfa_info &info, dstate_id_t sheng_end,
        DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy,
                     info.size() * info.impl_alpha_size, info.size(),
                     info.impl_alpha_size);
    }
    u16 sherman_limit;
    if (!allocateImplId16(info, sheng_end, &sherman_limit)) {
@@ -931,6 +1109,160 @@ void fill_in_succ_table_8(NFA *nfa, const dfa_info &info,
    }
}
static
void fill_in_sherman64(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) {
char *nfa_base = (char *)nfa;
mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa);
char *sherman_table = nfa_base + m->sherman_offset;
assert(ISALIGNED_16(sherman_table));
for (size_t i = 0; i < info.size(); i++) {
if (!info.is_sherman(i)) {
continue;
}
u16 fs = verify_u16(info.implId(i));
DEBUG_PRINTF("building sherman %zu impl %hu\n", i, fs);
assert(fs >= sherman_limit);
char *curr_sherman_entry
= sherman_table + (fs - m->sherman_limit) * SHERMAN_FIXED_SIZE;
assert(curr_sherman_entry <= nfa_base + m->length);
u8 len = verify_u8(info.impl_alpha_size - info.extra[i].daddytaken);
assert(len <= 9);
dstate_id_t d = info.states[i].daddy;
*(u8 *)(curr_sherman_entry + SHERMAN_TYPE_OFFSET) = SHERMAN_STATE;
*(u8 *)(curr_sherman_entry + SHERMAN_LEN_OFFSET) = len;
*(u16 *)(curr_sherman_entry + SHERMAN_DADDY_OFFSET) = info.implId(d);
u8 *chars = (u8 *)(curr_sherman_entry + SHERMAN_CHARS_OFFSET);
for (u16 s = 0; s < info.impl_alpha_size; s++) {
if (info.states[i].next[s] != info.states[d].next[s]) {
*(chars++) = (u8)s;
}
}
u16 *states = (u16 *)(curr_sherman_entry + SHERMAN_STATES_OFFSET(len));
for (u16 s = 0; s < info.impl_alpha_size; s++) {
if (info.states[i].next[s] != info.states[d].next[s]) {
DEBUG_PRINTF("s overrider %hu dad %hu char next %hu\n", fs,
info.implId(d),
info.implId(info.states[i].next[s]));
u16 entry_val = info.implId(info.states[i].next[s]);
entry_val |= get_edge_flags64(nfa, entry_val);
unaligned_store_u16((u8 *)states++, entry_val);
}
}
}
}
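/* Each sherman entry written above is laid out as [type u8][len u8]
 * [daddy u16][len chars][len u16 states]: only the transitions that differ
 * from the daddy state are stored, which is what keeps the region compact. */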
static
bytecode_ptr<NFA> mcsheng64Compile16(dfa_info &info, dstate_id_t sheng_end,
                      const map<dstate_id_t, AccelScheme> &accel_escape_info,
const Grey &grey) {
DEBUG_PRINTF("building mcsheng 64-16\n");
vector<u32> reports; /* index in ri for the appropriate report list */
vector<u32> reports_eod; /* as above */
ReportID arb;
u8 single;
assert(info.getAlphaShift() <= 8);
// Sherman optimization
if (info.impl_alpha_size > 16) {
u16 total_daddy = 0;
for (u32 i = 0; i < info.size(); i++) {
find_better_daddy(info, i,
is_cyclic_near(info.raw, info.raw.start_anchored),
grey);
total_daddy += info.extra[i].daddytaken;
}
DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy,
info.size() * info.impl_alpha_size, info.size(),
info.impl_alpha_size);
}
u16 sherman_limit;
if (!allocateImplId16(info, sheng_end, &sherman_limit)) {
DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n",
info.size());
return nullptr;
}
u16 count_real_states = sherman_limit - sheng_end;
auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
size_t tran_size = (1 << info.getAlphaShift()) * sizeof(u16)
* count_real_states;
size_t aux_size = sizeof(mstate_aux) * info.size();
size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size);
size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+ ri->getReportListSize(), 32);
size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size);
size_t sherman_size = calcShermanRegionSize(info);
size_t total_size = sherman_offset + sherman_size;
accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get());
populateBasicInfo64(sizeof(u16), info, total_size, aux_offset, accel_offset,
accel_escape_info.size(), arb, single, nfa.get());
createShuffleMasks64(m, info, sheng_end, accel_escape_info);
/* copy in the mc header information */
m->sherman_offset = sherman_offset;
m->sherman_end = total_size;
m->sherman_limit = sherman_limit;
DEBUG_PRINTF("%hu sheng, %hu norm, %zu total\n", sheng_end,
count_real_states, info.size());
fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset,
sherman_offset - sizeof(NFA), reports, reports_eod,
aux_offset + aux_size, *ri);
fill_in_succ_table_64_16(nfa.get(), info, sheng_end, sherman_limit);
fill_in_sherman64(nfa.get(), info, sherman_limit);
return nfa;
}
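/* Resulting bytecode layout for mcsheng 64-16, per the offsets computed
 * above: NFA header | mcsheng64 | u16 succ table | mstate_aux array |
 * report lists | accel structures (32-byte aligned) | sherman region
 * (16-byte aligned at sherman_offset). */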
static
void fill_in_succ_table_64_8(NFA *nfa, const dfa_info &info,
dstate_id_t sheng_end) {
u8 *succ_table = (u8 *)nfa + sizeof(NFA) + sizeof(mcsheng64);
u8 alphaShift = info.getAlphaShift();
assert(alphaShift <= 8);
for (size_t i = 0; i < info.size(); i++) {
assert(!info.is_sherman(i));
if (!info.is_normal(i)) {
assert(info.implId(i) < sheng_end);
continue;
}
u8 normal_id = verify_u8(info.implId(i) - sheng_end);
for (size_t s = 0; s < info.impl_alpha_size; s++) {
dstate_id_t raw_succ = info.states[i].next[s];
succ_table[((size_t)normal_id << alphaShift) + s]
= info.implId(raw_succ);
}
}
}
static
void allocateImplId8(dfa_info &info, dstate_id_t sheng_end,
                     const map<dstate_id_t, AccelScheme> &accel_escape_info,
@@ -1028,6 +1360,58 @@ bytecode_ptr<NFA> mcshengCompile8(dfa_info &info, dstate_id_t sheng_end,
    return nfa;
}
static
bytecode_ptr<NFA> mcsheng64Compile8(dfa_info &info, dstate_id_t sheng_end,
const map<dstate_id_t, AccelScheme> &accel_escape_info) {
DEBUG_PRINTF("building mcsheng 64-8\n");
vector<u32> reports;
vector<u32> reports_eod;
ReportID arb;
u8 single;
auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
size_t normal_count = info.size() - sheng_end;
size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * normal_count;
size_t aux_size = sizeof(mstate_aux) * info.size();
size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size);
size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+ ri->getReportListSize(), 32);
size_t total_size = accel_offset + accel_size;
DEBUG_PRINTF("aux_size %zu\n", aux_size);
DEBUG_PRINTF("aux_offset %zu\n", aux_offset);
DEBUG_PRINTF("rl size %u\n", ri->getReportListSize());
DEBUG_PRINTF("accel_size %zu\n", accel_size);
DEBUG_PRINTF("accel_offset %zu\n", accel_offset);
DEBUG_PRINTF("total_size %zu\n", total_size);
accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get());
allocateImplId8(info, sheng_end, accel_escape_info, &m->accel_limit_8,
&m->accept_limit_8);
populateBasicInfo64(sizeof(u8), info, total_size, aux_offset, accel_offset,
accel_escape_info.size(), arb, single, nfa.get());
createShuffleMasks64(m, info, sheng_end, accel_escape_info);
fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset,
total_size - sizeof(NFA), reports, reports_eod,
aux_offset + aux_size, *ri);
fill_in_succ_table_64_8(nfa.get(), info, sheng_end);
DEBUG_PRINTF("rl size %zu\n", ri->size());
return nfa;
}
bytecode_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
                                 const ReportManager &rm) {
    if (!cc.grey.allowMcSheng) {
@@ -1047,19 +1431,83 @@ bytecode_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
    map<dstate_id_t, AccelScheme> accel_escape_info
        = info.strat.getAccelInfo(cc.grey);
    auto old_states = info.states;
-    dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info);
+    dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info, MAX_SHENG_STATES);
    if (sheng_end <= DEAD_STATE + 1) {
        info.states = old_states;
        return nullptr;
    }
    bytecode_ptr<NFA> nfa;
    if (!using8bit) {
        nfa = mcshengCompile16(info, sheng_end, accel_escape_info, cc.grey);
    } else {
        nfa = mcshengCompile8(info, sheng_end, accel_escape_info);
    }
if (!nfa) {
info.states = old_states;
return nfa;
}
if (has_eod_reports) {
nfa->flags |= NFA_ACCEPTS_EOD;
}
DEBUG_PRINTF("compile done\n");
return nfa;
}
bytecode_ptr<NFA> mcshengCompile64(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm) {
if (!cc.grey.allowMcSheng) {
return nullptr;
}
if (!cc.target_info.has_avx512vbmi()) {
DEBUG_PRINTF("McSheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n");
return nullptr;
}
mcclellan_build_strat mbs(raw, rm, false);
dfa_info info(mbs);
bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256;
if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming
* mode with our semantics */
raw.stripExtraEodReports();
}
bool has_eod_reports = raw.hasEodReports();
map<dstate_id_t, AccelScheme> accel_escape_info
= info.strat.getAccelInfo(cc.grey);
bool using64state = false; /*default flag*/
dstate_id_t sheng_end64;
sheng_end64 = find_sheng_states(info, accel_escape_info, MAX_SHENG64_STATES);
if (sheng_end64 <= DEAD_STATE + 1) {
return nullptr;
} else {
using64state = true;
}
bytecode_ptr<NFA> nfa;
if (using64state) {
assert((sheng_end64 > 17) && (sheng_end64 <= 65));
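        /* Hedged reading of the assert: sheng_end64 counts the dead state
         * plus the sheng states, so (17, 65] means 17..64 sheng states --
         * more than the 16-state shuffle of classic McSheng can hold, and
         * within the 64-entry AVX512VBMI shuffle's capacity. */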
if (!using8bit) {
nfa = mcsheng64Compile16(info, sheng_end64, accel_escape_info, cc.grey);
} else {
assert(using8bit);
nfa = mcsheng64Compile8(info, sheng_end64, accel_escape_info);
assert(nfa);
assert(nfa->type == MCSHENG_64_NFA_8);
}
}
    if (!nfa) {
        return nfa;
    }


@@ -42,7 +42,8 @@ struct raw_dfa;
bytecode_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
                                 const ReportManager &rm);
bytecode_ptr<NFA> mcshengCompile64(raw_dfa &raw, const CompileContext &cc,
                                   const ReportManager &rm);
bool has_accel_mcsheng(const NFA *nfa);
} // namespace ue2


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -41,3 +41,15 @@ const u64a mcsheng_pext_mask[8] = {
    0x00ff00000000000f,
    0xff0000000000000f,
};
#if defined(HAVE_AVX512VBMI)
const u64a mcsheng64_pext_mask[8] = {
0, /* dummy */
0x000000000000ff3f,
0x0000000000ff003f,
0x00000000ff00003f,
0x000000ff0000003f,
0x0000ff000000003f,
0x00ff00000000003f,
0xff0000000000003f,
};
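/* Hedged note: these masks feed a BMI2 PEXT; the low 0x3f now gathers a
 * 6-bit sheng64 state id (up to 64 states) where mcsheng_pext_mask above
 * uses 0x0f for 4-bit ids. As a concrete PEXT example,
 * pext64(x, 0x000000000000ff3f) packs bits x[0..5] and x[8..15] into the
 * low 14 bits of the result. */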
#endif


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2017, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -174,6 +174,124 @@ void describeEdge(FILE *f, const mcsheng *m, const u16 *t, u16 i) {
    }
}
static
const mstate_aux *getAux64(const NFA *n, dstate_id_t i) {
auto *m = (const mcsheng64 *)getImplNfa(n);
auto *aux_base = (const mstate_aux *)((const char *)n + m->aux_offset);
const mstate_aux *aux = aux_base + i;
assert((const char *)aux < (const char *)n + m->length);
return aux;
}
static
void next_states64(const NFA *n, u16 s, u16 *t) {
const mcsheng64 *m = (const mcsheng64 *)getImplNfa(n);
const mstate_aux *aux = getAux64(n, s);
const u32 as = m->alphaShift;
assert(s != DEAD_STATE);
if (s < m->sheng_end) {
for (u16 c = 0; c < N_CHARS; c++) {
u8 sheng_s = s - 1;
auto trans_for_c = (const char *)&m->sheng_succ_masks[c];
assert(sheng_s < sizeof(m512));
u8 raw_succ = trans_for_c[sheng_s];
if (raw_succ == m->sheng_end - 1) {
t[c] = DEAD_STATE;
} else if (raw_succ < m->sheng_end) {
t[c] = raw_succ + 1;
} else {
t[c] = raw_succ;
}
}
} else if (n->type == MCSHENG_64_NFA_8) {
const u8 *succ_table = (const u8 *)((const char *)m + sizeof(mcsheng64));
for (u16 c = 0; c < N_CHARS; c++) {
u32 normal_id = s - m->sheng_end;
t[c] = succ_table[(normal_id << as) + m->remap[c]];
}
} else {
u16 base_s = s;
const char *winfo_base = (const char *)n + m->sherman_offset;
const char *state_base
= winfo_base + SHERMAN_FIXED_SIZE * (s - m->sherman_limit);
if (s >= m->sherman_limit) {
base_s = unaligned_load_u16(state_base + SHERMAN_DADDY_OFFSET);
assert(base_s >= m->sheng_end);
}
const u16 *succ_table = (const u16 *)((const char *)m
+ sizeof(mcsheng64));
for (u16 c = 0; c < N_CHARS; c++) {
u32 normal_id = base_s - m->sheng_end;
t[c] = succ_table[(normal_id << as) + m->remap[c]];
}
if (s >= m->sherman_limit) {
UNUSED char type = *(state_base + SHERMAN_TYPE_OFFSET);
assert(type == SHERMAN_STATE);
u8 len = *(const u8 *)(SHERMAN_LEN_OFFSET + state_base);
const char *chars = state_base + SHERMAN_CHARS_OFFSET;
const u16 *states = (const u16 *)(state_base
+ SHERMAN_STATES_OFFSET(len));
for (u8 i = 0; i < len; i++) {
for (u16 c = 0; c < N_CHARS; c++) {
if (m->remap[c] == chars[i]) {
t[c] = unaligned_load_u16((const u8*)&states[i]);
}
}
}
}
for (u16 c = 0; c < N_CHARS; c++) {
t[c] &= STATE_MASK;
}
}
t[TOP] = aux->top & STATE_MASK;
}
static
void describeEdge64(FILE *f, const mcsheng64 *m, const u16 *t, u16 i) {
for (u16 s = 0; s < N_CHARS; s++) {
if (!t[s]) {
continue;
}
u16 ss;
for (ss = 0; ss < s; ss++) {
if (t[s] == t[ss]) {
break;
}
}
if (ss != s) {
continue;
}
CharReach reach;
for (ss = s; ss < 256; ss++) {
if (t[s] == t[ss]) {
reach.set(ss);
}
}
fprintf(f, "%u -> %u [ ", i, t[s]);
if (i < m->sheng_end && t[s] < m->sheng_end) {
fprintf(f, "color = red, fontcolor = red ");
}
fprintf(f, "label = \"");
describeClass(f, reach, 5, CC_OUT_DOT);
fprintf(f, "\" ];\n");
}
}
static
void dumpAccelDot(FILE *f, u16 i, const union AccelAux *accel) {
    switch(accel->accel_type) {
@@ -256,6 +374,66 @@ void describeNode(const NFA *n, const mcsheng *m, u16 i, FILE *f) {
}
static
void describeNode64(const NFA *n, const mcsheng64 *m, u16 i, FILE *f) {
const mstate_aux *aux = getAux64(n, i);
bool isSherman = m->sherman_limit && i >= m->sherman_limit;
fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
"label = \"%u%s\" ]; \n", i, i, isSherman ? "w":"");
if (aux->accel_offset) {
dumpAccelDot(f, i, (const union AccelAux *)
((const char *)m + aux->accel_offset));
}
if (i && i < m->sheng_end) {
fprintf(f, "%u [color = red, fontcolor = red]; \n", i);
}
if (aux->accept_eod) {
fprintf(f, "%u [ color = darkorchid ];\n", i);
}
if (aux->accept) {
fprintf(f, "%u [ shape = doublecircle ];\n", i);
}
if (aux->top && aux->top != i) {
fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i,
aux->top);
}
if (i == m->start_anchored) {
fprintf(f, "STARTA -> %u [color = blue ]\n", i);
}
if (i == m->start_floating) {
fprintf(f, "STARTF -> %u [color = red ]\n", i);
}
if (isSherman) {
const char *winfo_base = (const char *)n + m->sherman_offset;
const char *state_base
= winfo_base + SHERMAN_FIXED_SIZE * (i - m->sherman_limit);
assert(state_base < (const char *)m + m->length - sizeof(NFA));
UNUSED u8 type = *(const u8 *)(state_base + SHERMAN_TYPE_OFFSET);
assert(type == SHERMAN_STATE);
fprintf(f, "%u [ fillcolor = lightblue style=filled ];\n", i);
u16 daddy = *(const u16 *)(state_base + SHERMAN_DADDY_OFFSET);
if (daddy) {
fprintf(f, "%u -> %u [ color=royalblue style=dashed weight=0.1]\n",
i, daddy);
}
}
if (i && i < m->sheng_end) {
fprintf(f, "subgraph cluster_sheng { %u } \n", i);
}
}
static
void dumpDotPreambleDfa(FILE *f) {
    dumpDotPreamble(f);
@@ -392,6 +570,131 @@ void dump_text_8(const NFA *nfa, FILE *f) {
    dumpTextReverse(nfa, f);
}
static
void dump64_dot_16(const NFA *nfa, FILE *f) {
auto *m = (const mcsheng64 *)getImplNfa(nfa);
dumpDotPreambleDfa(f);
for (u16 i = 1; i < m->state_count; i++) {
describeNode64(nfa, m, i, f);
u16 t[ALPHABET_SIZE];
next_states64(nfa, i, t);
describeEdge64(f, m, t, i);
}
fprintf(f, "}\n");
}
static
void dump64_dot_8(const NFA *nfa, FILE *f) {
auto m = (const mcsheng64 *)getImplNfa(nfa);
dumpDotPreambleDfa(f);
for (u16 i = 1; i < m->state_count; i++) {
describeNode64(nfa, m, i, f);
u16 t[ALPHABET_SIZE];
next_states64(nfa, i, t);
describeEdge64(f, m, t, i);
}
fprintf(f, "}\n");
}
static
void dumpAccelMasks64(FILE *f, const mcsheng64 *m, const mstate_aux *aux) {
fprintf(f, "\n");
fprintf(f, "Acceleration\n");
fprintf(f, "------------\n");
for (u16 i = 0; i < m->state_count; i++) {
if (!aux[i].accel_offset) {
continue;
}
auto accel = (const AccelAux *)((const char *)m + aux[i].accel_offset);
fprintf(f, "%05hu ", i);
dumpAccelInfo(f, *accel);
}
}
static
void describeAlphabet64(FILE *f, const mcsheng64 *m) {
map<u8, CharReach> rev;
for (u16 i = 0; i < N_CHARS; i++) {
rev[m->remap[i]].clear();
}
for (u16 i = 0; i < N_CHARS; i++) {
rev[m->remap[i]].set(i);
}
map<u8, CharReach>::const_iterator it;
fprintf(f, "\nAlphabet\n");
for (it = rev.begin(); it != rev.end(); ++it) {
fprintf(f, "%3hhu: ", it->first);
describeClass(f, it->second, 10240, CC_OUT_TEXT);
fprintf(f, "\n");
}
fprintf(f, "\n");
}
static
void dumpCommonHeader64(FILE *f, const mcsheng64 *m) {
fprintf(f, "report: %u, states: %u, length: %u\n", m->arb_report,
m->state_count, m->length);
fprintf(f, "astart: %hu, fstart: %hu\n", m->start_anchored,
m->start_floating);
fprintf(f, "single accept: %d, has_accel: %d\n",
!!(int)m->flags & MCSHENG_FLAG_SINGLE, m->has_accel);
fprintf(f, "sheng_end: %hu\n", m->sheng_end);
fprintf(f, "sheng_accel_limit: %hu\n", m->sheng_accel_limit);
}
static
void dump64_text_8(const NFA *nfa, FILE *f) {
auto m = (const mcsheng64 *)getImplNfa(nfa);
auto aux = (const mstate_aux *)((const char *)nfa + m->aux_offset);
fprintf(f, "mcsheng 64-8\n");
dumpCommonHeader64(f, m);
fprintf(f, "accel_limit: %hu, accept_limit %hu\n", m->accel_limit_8,
m->accept_limit_8);
fprintf(f, "\n");
describeAlphabet64(f, m);
dumpAccelMasks64(f, m, aux);
fprintf(f, "\n");
dumpTextReverse(nfa, f);
}
static
void dump64_text_16(const NFA *nfa, FILE *f) {
auto *m = (const mcsheng64 *)getImplNfa(nfa);
auto *aux = (const mstate_aux *)((const char *)nfa + m->aux_offset);
fprintf(f, "mcsheng 64-16\n");
dumpCommonHeader64(f, m);
fprintf(f, "sherman_limit: %d, sherman_end: %d\n", (int)m->sherman_limit,
(int)m->sherman_end);
fprintf(f, "\n");
describeAlphabet64(f, m);
dumpAccelMasks64(f, m, aux);
fprintf(f, "\n");
dumpTextReverse(nfa, f);
}
void nfaExecMcSheng16_dump(const NFA *nfa, const string &base) {
    assert(nfa->type == MCSHENG_NFA_16);
    dump_text_16(nfa, StdioFile(base + ".txt", "w"));
@@ -404,4 +707,16 @@ void nfaExecMcSheng8_dump(const NFA *nfa, const string &base) {
    dump_dot_8(nfa, StdioFile(base + ".dot", "w"));
}
void nfaExecMcSheng64_16_dump(const NFA *nfa, const string &base) {
assert(nfa->type == MCSHENG_64_NFA_16);
dump64_text_16(nfa, StdioFile(base + ".txt", "w"));
dump64_dot_16(nfa, StdioFile(base + ".dot", "w"));
}
void nfaExecMcSheng64_8_dump(const NFA *nfa, const string &base) {
assert(nfa->type == MCSHENG_64_NFA_8);
dump64_text_8(nfa, StdioFile(base + ".txt", "w"));
dump64_dot_8(nfa, StdioFile(base + ".dot", "w"));
}
} // namespace ue2


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -42,7 +42,8 @@ namespace ue2 {
void nfaExecMcSheng8_dump(const struct NFA *nfa, const std::string &base);
void nfaExecMcSheng16_dump(const struct NFA *nfa, const std::string &base);
void nfaExecMcSheng64_8_dump(const struct NFA *nfa, const std::string &base);
void nfaExecMcSheng64_16_dump(const struct NFA *nfa, const std::string &base);
} // namespace ue2
#endif // DUMP_SUPPORT


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -92,4 +92,33 @@ struct mcsheng {
 * representing the data from a u64a. */
extern const u64a mcsheng_pext_mask[8];
struct mcsheng64 {
u16 state_count; /**< total number of states */
u32 length; /**< length of dfa in bytes */
u16 start_anchored; /**< anchored start state */
u16 start_floating; /**< floating start state */
u32 aux_offset; /**< offset of the aux structures relative to the start of
* the nfa structure */
u32 sherman_offset; /**< offset of array of sherman state offsets the
* state_info structures relative to the start of the
* nfa structure */
u32 sherman_end; /**< offset of the end of the state_info structures
* relative to the start of the nfa structure */
u16 sheng_end; /**< first non-sheng state */
u16 sheng_accel_limit; /**< first sheng accel state. state given in terms of
* internal sheng ids */
u16 accel_limit_8; /**< 8 bit, lowest accelerable state */
u16 accept_limit_8; /**< 8 bit, lowest accept state */
u16 sherman_limit; /**< lowest sherman state */
u8 alphaShift;
u8 flags;
u8 has_accel; /**< 1 iff there are any accel plans */
u8 remap[256]; /**< remaps characters to a smaller alphabet */
ReportID arb_report; /**< one of the accepts that this dfa may raise */
u32 accel_offset; /**< offset of accel structures from start of McClellan */
m512 sheng_succ_masks[N_CHARS];
};
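/* Size note: sheng_succ_masks alone is N_CHARS * sizeof(m512) =
 * 256 * 64 bytes = 16 KB, so this header is far larger than plain mcsheng's;
 * the intent (hedged) is that the hot sheng loop touches only these masks. */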
extern const u64a mcsheng64_pext_mask[8];
#endif


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -76,6 +76,10 @@
    DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func); \
    DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func); \
    DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \
DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \
DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func); \
DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func); \
DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func); \
        default: \
            assert(0); \
    }


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -181,7 +181,6 @@ enum NFACategory {NFA_LIMEX, NFA_OTHER};
    static const nfa_dispatch_fn has_repeats_other_than_firsts; \
    static const u32 stateAlign = \
        MAX(mlt_align, alignof(RepeatControl)); \
-    static const bool fast = mlt_size <= 64; \
}; \
const nfa_dispatch_fn NFATraits<LIMEX_NFA_##mlt_size>::has_accel \
    = has_accel_limex<LimExNFA##mlt_size>; \
@@ -210,7 +209,6 @@ template<> struct NFATraits<MCCLELLAN_NFA_8> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
    static const u32 stateAlign = 1;
-    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;
    static const nfa_dispatch_fn has_repeats_other_than_firsts;
@@ -226,7 +224,6 @@ template<> struct NFATraits<MCCLELLAN_NFA_16> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
    static const u32 stateAlign = 2;
-    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;
    static const nfa_dispatch_fn has_repeats_other_than_firsts;
@@ -242,7 +239,6 @@ template<> struct NFATraits<GOUGH_NFA_8> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
    static const u32 stateAlign = 8;
-    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;
    static const nfa_dispatch_fn has_repeats_other_than_firsts;
@@ -258,7 +254,6 @@ template<> struct NFATraits<GOUGH_NFA_16> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
    static const u32 stateAlign = 8;
-    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;
    static const nfa_dispatch_fn has_repeats_other_than_firsts;
@@ -274,7 +269,6 @@ template<> struct NFATraits<MPV_NFA> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
    static const u32 stateAlign = 8;
-    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;
    static const nfa_dispatch_fn has_repeats_other_than_firsts;
@@ -290,7 +284,6 @@ template<> struct NFATraits<CASTLE_NFA> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
    static const u32 stateAlign = 8;
-    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;
    static const nfa_dispatch_fn has_repeats_other_than_firsts;
@@ -306,7 +299,6 @@ template<> struct NFATraits<LBR_NFA_DOT> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
    static const u32 stateAlign = 8;
-    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;
    static const nfa_dispatch_fn has_repeats_other_than_firsts;
@@ -322,7 +314,6 @@ template<> struct NFATraits<LBR_NFA_VERM> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
    static const u32 stateAlign = 8;
-    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;
    static const nfa_dispatch_fn has_repeats_other_than_firsts;
@@ -338,7 +329,6 @@ template<> struct NFATraits<LBR_NFA_NVERM> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
    static const u32 stateAlign = 8;
-    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;
    static const nfa_dispatch_fn has_repeats_other_than_firsts;
@@ -354,7 +344,6 @@ template<> struct NFATraits<LBR_NFA_SHUF> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
    static const u32 stateAlign = 8;
-    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;
    static const nfa_dispatch_fn has_repeats_other_than_firsts;
@@ -370,7 +359,6 @@ template<> struct NFATraits<LBR_NFA_TRUF> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
    static const u32 stateAlign = 8;
-    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;
    static const nfa_dispatch_fn has_repeats_other_than_firsts;
@ -386,7 +374,6 @@ template<> struct NFATraits<SHENG_NFA> {
UNUSED static const char *name; UNUSED static const char *name;
static const NFACategory category = NFA_OTHER; static const NFACategory category = NFA_OTHER;
static const u32 stateAlign = 1; static const u32 stateAlign = 1;
static const bool fast = true;
static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_accel;
static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats;
static const nfa_dispatch_fn has_repeats_other_than_firsts; static const nfa_dispatch_fn has_repeats_other_than_firsts;
@ -402,7 +389,6 @@ template<> struct NFATraits<TAMARAMA_NFA> {
UNUSED static const char *name; UNUSED static const char *name;
static const NFACategory category = NFA_OTHER; static const NFACategory category = NFA_OTHER;
static const u32 stateAlign = 64; static const u32 stateAlign = 64;
static const bool fast = true;
static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_accel;
static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats;
static const nfa_dispatch_fn has_repeats_other_than_firsts; static const nfa_dispatch_fn has_repeats_other_than_firsts;
@ -418,7 +404,6 @@ template<> struct NFATraits<MCSHENG_NFA_8> {
UNUSED static const char *name; UNUSED static const char *name;
static const NFACategory category = NFA_OTHER; static const NFACategory category = NFA_OTHER;
static const u32 stateAlign = 1; static const u32 stateAlign = 1;
static const bool fast = true;
static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_accel;
static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats;
static const nfa_dispatch_fn has_repeats_other_than_firsts; static const nfa_dispatch_fn has_repeats_other_than_firsts;
@ -434,7 +419,6 @@ template<> struct NFATraits<MCSHENG_NFA_16> {
UNUSED static const char *name; UNUSED static const char *name;
static const NFACategory category = NFA_OTHER; static const NFACategory category = NFA_OTHER;
static const u32 stateAlign = 2; static const u32 stateAlign = 2;
static const bool fast = true;
static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_accel;
static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats;
static const nfa_dispatch_fn has_repeats_other_than_firsts; static const nfa_dispatch_fn has_repeats_other_than_firsts;
@ -446,6 +430,65 @@ const nfa_dispatch_fn NFATraits<MCSHENG_NFA_16>::has_repeats_other_than_firsts =
const char *NFATraits<MCSHENG_NFA_16>::name = "Shengy McShengFace 16"; const char *NFATraits<MCSHENG_NFA_16>::name = "Shengy McShengFace 16";
#endif #endif
template<> struct NFATraits<SHENG_NFA_32> {
UNUSED static const char *name;
static const NFACategory category = NFA_OTHER;
static const u32 stateAlign = 1;
static const nfa_dispatch_fn has_accel;
static const nfa_dispatch_fn has_repeats;
static const nfa_dispatch_fn has_repeats_other_than_firsts;
};
const nfa_dispatch_fn NFATraits<SHENG_NFA_32>::has_accel = has_accel_sheng;
const nfa_dispatch_fn NFATraits<SHENG_NFA_32>::has_repeats = dispatch_false;
const nfa_dispatch_fn NFATraits<SHENG_NFA_32>::has_repeats_other_than_firsts = dispatch_false;
#if defined(DUMP_SUPPORT)
const char *NFATraits<SHENG_NFA_32>::name = "Sheng 32";
#endif
template<> struct NFATraits<SHENG_NFA_64> {
UNUSED static const char *name;
static const NFACategory category = NFA_OTHER;
static const u32 stateAlign = 1;
static const nfa_dispatch_fn has_accel;
static const nfa_dispatch_fn has_repeats;
static const nfa_dispatch_fn has_repeats_other_than_firsts;
};
const nfa_dispatch_fn NFATraits<SHENG_NFA_64>::has_accel = has_accel_sheng;
const nfa_dispatch_fn NFATraits<SHENG_NFA_64>::has_repeats = dispatch_false;
const nfa_dispatch_fn NFATraits<SHENG_NFA_64>::has_repeats_other_than_firsts = dispatch_false;
#if defined(DUMP_SUPPORT)
const char *NFATraits<SHENG_NFA_64>::name = "Sheng 64";
#endif
template<> struct NFATraits<MCSHENG_64_NFA_8> {
UNUSED static const char *name;
static const NFACategory category = NFA_OTHER;
static const u32 stateAlign = 1;
static const nfa_dispatch_fn has_accel;
static const nfa_dispatch_fn has_repeats;
static const nfa_dispatch_fn has_repeats_other_than_firsts;
};
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_8>::has_accel = has_accel_mcsheng;
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_8>::has_repeats = dispatch_false;
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_8>::has_repeats_other_than_firsts = dispatch_false;
#if defined(DUMP_SUPPORT)
const char *NFATraits<MCSHENG_64_NFA_8>::name = "Shengy64 McShengFace 8";
#endif
template<> struct NFATraits<MCSHENG_64_NFA_16> {
UNUSED static const char *name;
static const NFACategory category = NFA_OTHER;
static const u32 stateAlign = 2;
static const nfa_dispatch_fn has_accel;
static const nfa_dispatch_fn has_repeats;
static const nfa_dispatch_fn has_repeats_other_than_firsts;
};
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_16>::has_accel = has_accel_mcsheng;
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_16>::has_repeats = dispatch_false;
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_16>::has_repeats_other_than_firsts = dispatch_false;
#if defined(DUMP_SUPPORT)
const char *NFATraits<MCSHENG_64_NFA_16>::name = "Shengy64 McShengFace 16";
#endif
} // namespace
#if defined(DUMP_SUPPORT)
@ -473,20 +516,6 @@ u32 state_alignment(const NFA &nfa) {
return DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, getStateAlign, nullptr);
}
namespace {
template<NFAEngineType t>
struct getFastness {
static u32 call(void *) {
return NFATraits<t>::fast;
}
};
}
bool is_fast(const NFA &nfa) {
NFAEngineType t = (NFAEngineType)nfa.type;
return DISPATCH_BY_NFA_TYPE(t, getFastness, nullptr);
}
namespace {
template<NFAEngineType t>
struct is_limex {
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -47,10 +47,6 @@ std::string describe(const NFA &nfa);
// For a given NFA, retrieve the alignment required by its uncompressed state.
u32 state_alignment(const NFA &nfa);
/* returns true if the nfa is considered 'fast'. TODO: work out what we mean by
* fast. */
bool is_fast(const NFA &n);
bool has_bounded_repeats_other_than_firsts(const NFA &n);
bool has_bounded_repeats(const NFA &n);
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -81,6 +81,10 @@ namespace ue2 {
DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func); \
DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func); \
DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \
DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \
DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func); \
DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func); \
DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func); \
default: \
assert(0); \
}
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -72,6 +72,10 @@ enum NFAEngineType {
TAMARAMA_NFA, /**< magic nfa container */
MCSHENG_NFA_8, /**< magic pseudo nfa */
MCSHENG_NFA_16, /**< magic pseudo nfa */
SHENG_NFA_32, /**< magic pseudo nfa */
SHENG_NFA_64, /**< magic pseudo nfa */
MCSHENG_64_NFA_8, /**< magic pseudo nfa */
MCSHENG_64_NFA_16, /**< magic pseudo nfa */
/** \brief bogus NFA - not used */
INVALID_NFA
};
@ -148,7 +152,8 @@ static really_inline int isMcClellanType(u8 t) {
/** \brief True if the given type (from NFA::type) is a Sheng-McClellan hybrid
* DFA. */
static really_inline int isShengMcClellanType(u8 t) {
return t == MCSHENG_NFA_8 || t == MCSHENG_NFA_16; return t == MCSHENG_NFA_8 || t == MCSHENG_NFA_16 ||
t == MCSHENG_64_NFA_8 || t == MCSHENG_64_NFA_16;
}
/** \brief True if the given type (from NFA::type) is a Gough DFA. */ /** \brief True if the given type (from NFA::type) is a Gough DFA. */
@ -157,10 +162,25 @@ static really_inline int isGoughType(u8 t) {
}
/** \brief True if the given type (from NFA::type) is a Sheng DFA. */
static really_inline int isShengType(u8 t) { static really_inline int isSheng16Type(u8 t) {
return t == SHENG_NFA;
}
/** \brief True if the given type (from NFA::type) is a Sheng32 DFA. */
static really_inline int isSheng32Type(u8 t) {
return t == SHENG_NFA_32;
}
/** \brief True if the given type (from NFA::type) is a Sheng64 DFA. */
static really_inline int isSheng64Type(u8 t) {
return t == SHENG_NFA_64;
}
/** \brief True if the given type (from NFA::type) is a Sheng16/32/64 DFA. */
static really_inline int isShengType(u8 t) {
return t == SHENG_NFA || t == SHENG_NFA_32 || t == SHENG_NFA_64;
}
/**
* \brief True if the given type (from NFA::type) is a McClellan, Gough or
* Sheng DFA.
File diff suppressed because it is too large
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation * Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -58,4 +58,86 @@ char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q);
char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer,
size_t length, NfaCallback cb, void *context);
#if defined(HAVE_AVX512VBMI)
#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL
#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL
char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end);
char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end);
char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report);
char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report,
struct mq *q);
char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q);
char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q);
char nfaExecSheng32_queueCompressState(const struct NFA *nfa,
const struct mq *q, s64a loc);
char nfaExecSheng32_expandState(const struct NFA *nfa, void *dest,
const void *src, u64a offset, u8 key);
char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset,
void *state, u8 key);
char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state,
const char *streamState, u64a offset,
NfaCallback callback, void *context);
char nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q);
char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer,
size_t length, NfaCallback cb, void *context);
#define nfaExecSheng64_B_Reverse NFA_API_NO_IMPL
#define nfaExecSheng64_zombie_status NFA_API_ZOMBIE_NO_IMPL
char nfaExecSheng64_Q(const struct NFA *n, struct mq *q, s64a end);
char nfaExecSheng64_Q2(const struct NFA *n, struct mq *q, s64a end);
char nfaExecSheng64_QR(const struct NFA *n, struct mq *q, ReportID report);
char nfaExecSheng64_inAccept(const struct NFA *n, ReportID report,
struct mq *q);
char nfaExecSheng64_inAnyAccept(const struct NFA *n, struct mq *q);
char nfaExecSheng64_queueInitState(const struct NFA *nfa, struct mq *q);
char nfaExecSheng64_queueCompressState(const struct NFA *nfa,
const struct mq *q, s64a loc);
char nfaExecSheng64_expandState(const struct NFA *nfa, void *dest,
const void *src, u64a offset, u8 key);
char nfaExecSheng64_initCompressedState(const struct NFA *nfa, u64a offset,
void *state, u8 key);
char nfaExecSheng64_testEOD(const struct NFA *nfa, const char *state,
const char *streamState, u64a offset,
NfaCallback callback, void *context);
char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q);
char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer,
size_t length, NfaCallback cb, void *context);
#else // !HAVE_AVX512VBMI
#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL
#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL
#define nfaExecSheng32_Q NFA_API_NO_IMPL
#define nfaExecSheng32_Q2 NFA_API_NO_IMPL
#define nfaExecSheng32_QR NFA_API_NO_IMPL
#define nfaExecSheng32_inAccept NFA_API_NO_IMPL
#define nfaExecSheng32_inAnyAccept NFA_API_NO_IMPL
#define nfaExecSheng32_queueInitState NFA_API_NO_IMPL
#define nfaExecSheng32_queueCompressState NFA_API_NO_IMPL
#define nfaExecSheng32_expandState NFA_API_NO_IMPL
#define nfaExecSheng32_initCompressedState NFA_API_NO_IMPL
#define nfaExecSheng32_testEOD NFA_API_NO_IMPL
#define nfaExecSheng32_reportCurrent NFA_API_NO_IMPL
#define nfaExecSheng32_B NFA_API_NO_IMPL
#define nfaExecSheng64_B_Reverse NFA_API_NO_IMPL
#define nfaExecSheng64_zombie_status NFA_API_ZOMBIE_NO_IMPL
#define nfaExecSheng64_Q NFA_API_NO_IMPL
#define nfaExecSheng64_Q2 NFA_API_NO_IMPL
#define nfaExecSheng64_QR NFA_API_NO_IMPL
#define nfaExecSheng64_inAccept NFA_API_NO_IMPL
#define nfaExecSheng64_inAnyAccept NFA_API_NO_IMPL
#define nfaExecSheng64_queueInitState NFA_API_NO_IMPL
#define nfaExecSheng64_queueCompressState NFA_API_NO_IMPL
#define nfaExecSheng64_expandState NFA_API_NO_IMPL
#define nfaExecSheng64_initCompressedState NFA_API_NO_IMPL
#define nfaExecSheng64_testEOD NFA_API_NO_IMPL
#define nfaExecSheng64_reportCurrent NFA_API_NO_IMPL
#define nfaExecSheng64_B NFA_API_NO_IMPL
#endif // end of HAVE_AVX512VBMI
#endif /* SHENG_H_ */
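When the build lacks AVX-512 VBMI, every Sheng32/64 entry point above is aliased to NFA_API_NO_IMPL, so the engine function tables keep a uniform shape and no feature checks leak into the dispatcher. A self-contained sketch of that fallback wiring, with illustrative names rather than Hyperscan's:

#include <cassert>

typedef char (*nfa_exec_fn)(const void *nfa, void *q);

// Stand-in for NFA_API_NO_IMPL: a slot that must never be reached at runtime.
static char exec_no_impl(const void *, void *) {
    assert(!"engine not compiled into this build");
    return 0;
}

#if defined(HAVE_AVX512VBMI)
char exec_sheng32(const void *nfa, void *q);  // real implementation elsewhere
#define EXEC_SHENG32 exec_sheng32
#else
#define EXEC_SHENG32 exec_no_impl             // keeps the table total
#endif

static const nfa_exec_fn exec_table[] = { EXEC_SHENG32 /* , ... */ };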
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation * Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -52,6 +52,43 @@ u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) {
return (a | b | c | d) & (SHENG_STATE_FLAG_MASK);
}
#if defined(HAVE_AVX512VBMI)
static really_inline
u8 isDeadState32(const u8 a) {
return a & SHENG32_STATE_DEAD;
}
static really_inline
u8 isAcceptState32(const u8 a) {
return a & SHENG32_STATE_ACCEPT;
}
static really_inline
u8 isAccelState32(const u8 a) {
return a & SHENG32_STATE_ACCEL;
}
static really_inline
u8 hasInterestingStates32(const u8 a, const u8 b, const u8 c, const u8 d) {
return (a | b | c | d) & (SHENG32_STATE_FLAG_MASK);
}
static really_inline
u8 isDeadState64(const u8 a) {
return a & SHENG64_STATE_DEAD;
}
static really_inline
u8 isAcceptState64(const u8 a) {
return a & SHENG64_STATE_ACCEPT;
}
static really_inline
u8 hasInterestingStates64(const u8 a, const u8 b, const u8 c, const u8 d) {
return (a | b | c | d) & (SHENG64_STATE_FLAG_MASK);
}
#endif
/* these functions should be optimized out, used by NO_MATCHES mode */
static really_inline
u8 dummyFunc4(UNUSED const u8 a, UNUSED const u8 b, UNUSED const u8 c,
@ -71,66 +108,162 @@ u8 dummyFunc(UNUSED const u8 a) {
#define SHENG_IMPL sheng_cod
#define DEAD_FUNC isDeadState
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_cod
#define DEAD_FUNC32 isDeadState32
#define ACCEPT_FUNC32 isAcceptState32
#define SHENG64_IMPL sheng64_cod
#define DEAD_FUNC64 isDeadState64
#define ACCEPT_FUNC64 isAcceptState64
#endif
#define STOP_AT_MATCH 0
#include "sheng_impl.h"
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
/* callback output, can't die */
#define SHENG_IMPL sheng_co
#define DEAD_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_co
#define DEAD_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#define SHENG64_IMPL sheng64_co
#define DEAD_FUNC64 dummyFunc
#define ACCEPT_FUNC64 isAcceptState64
#endif
#define STOP_AT_MATCH 0
#include "sheng_impl.h"
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
/* stop at match, can die */
#define SHENG_IMPL sheng_samd
#define DEAD_FUNC isDeadState
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_samd
#define DEAD_FUNC32 isDeadState32
#define ACCEPT_FUNC32 isAcceptState32
#define SHENG64_IMPL sheng64_samd
#define DEAD_FUNC64 isDeadState64
#define ACCEPT_FUNC64 isAcceptState64
#endif
#define STOP_AT_MATCH 1
#include "sheng_impl.h"
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
/* stop at match, can't die */
#define SHENG_IMPL sheng_sam
#define DEAD_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_sam
#define DEAD_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#define SHENG64_IMPL sheng64_sam
#define DEAD_FUNC64 dummyFunc
#define ACCEPT_FUNC64 isAcceptState64
#endif
#define STOP_AT_MATCH 1
#include "sheng_impl.h"
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
/* no match, can die */
#define SHENG_IMPL sheng_nmd
#define DEAD_FUNC isDeadState
#define ACCEPT_FUNC dummyFunc
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_nmd
#define DEAD_FUNC32 isDeadState32
#define ACCEPT_FUNC32 dummyFunc
#define SHENG64_IMPL sheng64_nmd
#define DEAD_FUNC64 isDeadState64
#define ACCEPT_FUNC64 dummyFunc
#endif
#define STOP_AT_MATCH 0
#include "sheng_impl.h"
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
/* no match, can't die */
#define SHENG_IMPL sheng_nm
#define DEAD_FUNC dummyFunc
#define ACCEPT_FUNC dummyFunc
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_nm
#define DEAD_FUNC32 dummyFunc
#define ACCEPT_FUNC32 dummyFunc
#define SHENG64_IMPL sheng64_nm
#define DEAD_FUNC64 dummyFunc
#define ACCEPT_FUNC64 dummyFunc
#endif
#define STOP_AT_MATCH 0
#include "sheng_impl.h"
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
/*
@ -144,6 +277,16 @@ u8 dummyFunc(UNUSED const u8 a) {
#define INNER_ACCEL_FUNC isAccelState
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_4_coda
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 isDeadState32
#define OUTER_DEAD_FUNC32 dummyFunc
#define INNER_ACCEL_FUNC32 isAccelState32
#define OUTER_ACCEL_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#define NO_SHENG64_IMPL
#endif
#define STOP_AT_MATCH 0
#include "sheng_impl4.h"
#undef SHENG_IMPL
@ -153,6 +296,16 @@ u8 dummyFunc(UNUSED const u8 a) {
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#undef OUTER_DEAD_FUNC32
#undef INNER_ACCEL_FUNC32
#undef OUTER_ACCEL_FUNC32
#undef ACCEPT_FUNC32
#undef NO_SHENG64_IMPL
#endif
#undef STOP_AT_MATCH
/* callback output, can die, not accelerated */
@ -163,6 +316,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_4_cod
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 isDeadState32
#define OUTER_DEAD_FUNC32 dummyFunc
#define INNER_ACCEL_FUNC32 dummyFunc
#define OUTER_ACCEL_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#define SHENG64_IMPL sheng64_4_cod
#define INTERESTING_FUNC64 hasInterestingStates64
#define INNER_DEAD_FUNC64 isDeadState64
#define OUTER_DEAD_FUNC64 dummyFunc
#define ACCEPT_FUNC64 isAcceptState64
#endif
#define STOP_AT_MATCH 0
#include "sheng_impl4.h"
#undef SHENG_IMPL
@ -172,6 +339,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#undef OUTER_DEAD_FUNC32
#undef INNER_ACCEL_FUNC32
#undef OUTER_ACCEL_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef INTERESTING_FUNC64
#undef INNER_DEAD_FUNC64
#undef OUTER_DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
/* callback output, can't die, accelerated */
@ -182,6 +363,16 @@ u8 dummyFunc(UNUSED const u8 a) {
#define INNER_ACCEL_FUNC isAccelState
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_4_coa
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 dummyFunc
#define OUTER_DEAD_FUNC32 dummyFunc
#define INNER_ACCEL_FUNC32 isAccelState32
#define OUTER_ACCEL_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#define NO_SHENG64_IMPL
#endif
#define STOP_AT_MATCH 0
#include "sheng_impl4.h"
#undef SHENG_IMPL
@ -191,6 +382,16 @@ u8 dummyFunc(UNUSED const u8 a) {
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#undef OUTER_DEAD_FUNC32
#undef INNER_ACCEL_FUNC32
#undef OUTER_ACCEL_FUNC32
#undef ACCEPT_FUNC32
#undef NO_SHENG64_IMPL
#endif
#undef STOP_AT_MATCH
/* callback output, can't die, not accelerated */
@ -201,6 +402,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_4_co
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 dummyFunc
#define OUTER_DEAD_FUNC32 dummyFunc
#define INNER_ACCEL_FUNC32 dummyFunc
#define OUTER_ACCEL_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#define SHENG64_IMPL sheng64_4_co
#define INTERESTING_FUNC64 hasInterestingStates64
#define INNER_DEAD_FUNC64 dummyFunc
#define OUTER_DEAD_FUNC64 dummyFunc
#define ACCEPT_FUNC64 isAcceptState64
#endif
#define STOP_AT_MATCH 0
#include "sheng_impl4.h"
#undef SHENG_IMPL
@ -210,6 +425,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#undef OUTER_DEAD_FUNC32
#undef INNER_ACCEL_FUNC32
#undef OUTER_ACCEL_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef INTERESTING_FUNC64
#undef INNER_DEAD_FUNC64
#undef OUTER_DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
/* stop at match, can die, accelerated */
@ -220,6 +449,16 @@ u8 dummyFunc(UNUSED const u8 a) {
#define INNER_ACCEL_FUNC isAccelState
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_4_samda
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 isDeadState32
#define OUTER_DEAD_FUNC32 dummyFunc
#define INNER_ACCEL_FUNC32 isAccelState32
#define OUTER_ACCEL_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#define NO_SHENG64_IMPL
#endif
#define STOP_AT_MATCH 1
#include "sheng_impl4.h"
#undef SHENG_IMPL
@ -229,6 +468,16 @@ u8 dummyFunc(UNUSED const u8 a) {
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#undef OUTER_DEAD_FUNC32
#undef INNER_ACCEL_FUNC32
#undef OUTER_ACCEL_FUNC32
#undef ACCEPT_FUNC32
#undef NO_SHENG64_IMPL
#endif
#undef STOP_AT_MATCH
/* stop at match, can die, not accelerated */
@ -239,6 +488,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_4_samd
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 isDeadState32
#define OUTER_DEAD_FUNC32 dummyFunc
#define INNER_ACCEL_FUNC32 dummyFunc
#define OUTER_ACCEL_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#define SHENG64_IMPL sheng64_4_samd
#define INTERESTING_FUNC64 hasInterestingStates64
#define INNER_DEAD_FUNC64 isDeadState64
#define OUTER_DEAD_FUNC64 dummyFunc
#define ACCEPT_FUNC64 isAcceptState64
#endif
#define STOP_AT_MATCH 1
#include "sheng_impl4.h"
#undef SHENG_IMPL
@ -248,6 +511,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#undef OUTER_DEAD_FUNC32
#undef INNER_ACCEL_FUNC32
#undef OUTER_ACCEL_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef INTERESTING_FUNC64
#undef INNER_DEAD_FUNC64
#undef OUTER_DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
/* stop at match, can't die, accelerated */
@ -258,6 +535,16 @@ u8 dummyFunc(UNUSED const u8 a) {
#define INNER_ACCEL_FUNC isAccelState
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_4_sama
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 dummyFunc
#define OUTER_DEAD_FUNC32 dummyFunc
#define INNER_ACCEL_FUNC32 isAccelState32
#define OUTER_ACCEL_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#define NO_SHENG64_IMPL
#endif
#define STOP_AT_MATCH 1
#include "sheng_impl4.h"
#undef SHENG_IMPL
@ -267,6 +554,16 @@ u8 dummyFunc(UNUSED const u8 a) {
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#undef OUTER_DEAD_FUNC32
#undef INNER_ACCEL_FUNC32
#undef OUTER_ACCEL_FUNC32
#undef ACCEPT_FUNC32
#undef NO_SHENG64_IMPL
#endif
#undef STOP_AT_MATCH
/* stop at match, can't die, not accelerated */
@ -277,6 +574,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_4_sam
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 dummyFunc
#define OUTER_DEAD_FUNC32 dummyFunc
#define INNER_ACCEL_FUNC32 dummyFunc
#define OUTER_ACCEL_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#define SHENG64_IMPL sheng64_4_sam
#define INTERESTING_FUNC64 hasInterestingStates64
#define INNER_DEAD_FUNC64 dummyFunc
#define OUTER_DEAD_FUNC64 dummyFunc
#define ACCEPT_FUNC64 isAcceptState64
#endif
#define STOP_AT_MATCH 1
#include "sheng_impl4.h"
#undef SHENG_IMPL
@ -286,6 +597,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#undef OUTER_DEAD_FUNC32
#undef INNER_ACCEL_FUNC32
#undef OUTER_ACCEL_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef INTERESTING_FUNC64
#undef INNER_DEAD_FUNC64
#undef OUTER_DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
/* no-match have interesting func as dummy, and die/accel checks are outer */
@ -298,6 +623,16 @@ u8 dummyFunc(UNUSED const u8 a) {
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC isAccelState
#define ACCEPT_FUNC dummyFunc
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_4_nmda
#define INTERESTING_FUNC32 dummyFunc4
#define INNER_DEAD_FUNC32 dummyFunc
#define OUTER_DEAD_FUNC32 isDeadState32
#define INNER_ACCEL_FUNC32 dummyFunc
#define OUTER_ACCEL_FUNC32 isAccelState32
#define ACCEPT_FUNC32 dummyFunc
#define NO_SHENG64_IMPL
#endif
#define STOP_AT_MATCH 0
#include "sheng_impl4.h"
#undef SHENG_IMPL
@ -307,6 +642,16 @@ u8 dummyFunc(UNUSED const u8 a) {
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#undef OUTER_DEAD_FUNC32
#undef INNER_ACCEL_FUNC32
#undef OUTER_ACCEL_FUNC32
#undef ACCEPT_FUNC32
#undef NO_SHENG64_IMPL
#endif
#undef STOP_AT_MATCH
/* no match, can die, not accelerated */
@ -317,6 +662,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC dummyFunc
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_4_nmd
#define INTERESTING_FUNC32 dummyFunc4
#define INNER_DEAD_FUNC32 dummyFunc
#define OUTER_DEAD_FUNC32 isDeadState32
#define INNER_ACCEL_FUNC32 dummyFunc
#define OUTER_ACCEL_FUNC32 dummyFunc
#define ACCEPT_FUNC32 dummyFunc
#define SHENG64_IMPL sheng64_4_nmd
#define INTERESTING_FUNC64 dummyFunc4
#define INNER_DEAD_FUNC64 dummyFunc
#define OUTER_DEAD_FUNC64 isDeadState64
#define ACCEPT_FUNC64 dummyFunc
#endif
#define STOP_AT_MATCH 0
#include "sheng_impl4.h"
#undef SHENG_IMPL
@ -326,6 +685,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#undef OUTER_DEAD_FUNC32
#undef INNER_ACCEL_FUNC32
#undef OUTER_ACCEL_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef INTERESTING_FUNC64
#undef INNER_DEAD_FUNC64
#undef OUTER_DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
/* there is no performance benefit in accelerating a no-match case that can't
@ -339,6 +712,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC dummyFunc
#if defined(HAVE_AVX512VBMI)
#define SHENG32_IMPL sheng32_4_nm
#define INTERESTING_FUNC32 dummyFunc4
#define INNER_DEAD_FUNC32 dummyFunc
#define OUTER_DEAD_FUNC32 dummyFunc
#define INNER_ACCEL_FUNC32 dummyFunc
#define OUTER_ACCEL_FUNC32 dummyFunc
#define ACCEPT_FUNC32 dummyFunc
#define SHENG64_IMPL sheng64_4_nm
#define INTERESTING_FUNC64 dummyFunc4
#define INNER_DEAD_FUNC64 dummyFunc
#define OUTER_DEAD_FUNC64 dummyFunc
#define ACCEPT_FUNC64 dummyFunc
#endif
#define STOP_AT_MATCH 0
#include "sheng_impl4.h"
#undef SHENG_IMPL
@ -348,6 +735,20 @@ u8 dummyFunc(UNUSED const u8 a) {
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
#if defined(HAVE_AVX512VBMI)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#undef OUTER_DEAD_FUNC32
#undef INNER_ACCEL_FUNC32
#undef OUTER_ACCEL_FUNC32
#undef ACCEPT_FUNC32
#undef SHENG64_IMPL
#undef INTERESTING_FUNC64
#undef INNER_DEAD_FUNC64
#undef OUTER_DEAD_FUNC64
#undef ACCEPT_FUNC64
#endif
#undef STOP_AT_MATCH
#endif // SHENG_DEFS_H
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -95,3 +95,127 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
*scan_end = cur_buf;
return MO_CONTINUE_MATCHING;
}
#if defined(HAVE_AVX512VBMI)
static really_inline
char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
const struct sheng32 *s,
u8 *const cached_accept_state,
ReportID *const cached_accept_id,
u8 single, u64a base_offset, const u8 *buf, const u8 *start,
const u8 *end, const u8 **scan_end) {
DEBUG_PRINTF("Starting DFA execution in state %u\n",
*state & SHENG32_STATE_MASK);
const u8 *cur_buf = start;
if (DEAD_FUNC32(*state)) {
DEBUG_PRINTF("Dead on arrival\n");
*scan_end = end;
return MO_CONTINUE_MATCHING;
}
DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
m512 cur_state = set64x8(*state);
const m512 *masks = s->succ_masks;
while (likely(cur_buf != end)) {
const u8 c = *cur_buf;
const m512 succ_mask = masks[c];
cur_state = vpermb512(cur_state, succ_mask);
const u8 tmp = movd512(cur_state);
DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?');
DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK,
tmp & SHENG32_STATE_FLAG_MASK);
if (unlikely(ACCEPT_FUNC32(tmp))) {
DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG32_STATE_MASK);
u64a match_offset = base_offset + (cur_buf - buf) + 1;
DEBUG_PRINTF("Match @ %llu\n", match_offset);
if (STOP_AT_MATCH) {
DEBUG_PRINTF("Stopping at match @ %lli\n",
(u64a)(cur_buf - start));
*state = tmp;
*scan_end = cur_buf;
return MO_MATCHES_PENDING;
}
if (single) {
if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} else {
if (fireReports32(s, cb, ctxt, tmp, match_offset,
cached_accept_state, cached_accept_id,
0) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
}
cur_buf++;
}
*state = movd512(cur_state);
*scan_end = cur_buf;
return MO_CONTINUE_MATCHING;
}
static really_inline
char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
const struct sheng64 *s,
u8 *const cached_accept_state,
ReportID *const cached_accept_id,
u8 single, u64a base_offset, const u8 *buf, const u8 *start,
const u8 *end, const u8 **scan_end) {
DEBUG_PRINTF("Starting DFA execution in state %u\n",
*state & SHENG64_STATE_MASK);
const u8 *cur_buf = start;
if (DEAD_FUNC64(*state)) {
DEBUG_PRINTF("Dead on arrival\n");
*scan_end = end;
return MO_CONTINUE_MATCHING;
}
DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
m512 cur_state = set64x8(*state);
const m512 *masks = s->succ_masks;
while (likely(cur_buf != end)) {
const u8 c = *cur_buf;
const m512 succ_mask = masks[c];
cur_state = vpermb512(cur_state, succ_mask);
const u8 tmp = movd512(cur_state);
DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?');
DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK,
tmp & SHENG64_STATE_FLAG_MASK);
if (unlikely(ACCEPT_FUNC64(tmp))) {
DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG64_STATE_MASK);
u64a match_offset = base_offset + (cur_buf - buf) + 1;
DEBUG_PRINTF("Match @ %llu\n", match_offset);
if (STOP_AT_MATCH) {
DEBUG_PRINTF("Stopping at match @ %lli\n",
(u64a)(cur_buf - start));
*state = tmp;
*scan_end = cur_buf;
return MO_MATCHES_PENDING;
}
if (single) {
if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} else {
if (fireReports64(s, cb, ctxt, tmp, match_offset,
cached_accept_state, cached_accept_id,
0) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
}
cur_buf++;
}
*state = movd512(cur_state);
*scan_end = cur_buf;
return MO_CONTINUE_MATCHING;
}
#endif
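The heart of both new loops is a single shuffle: the current state, broadcast across a 512-bit register, is used by VPERMB (vpermb512 above) to index the per-character successor table, so one instruction performs the whole DFA transition. A minimal sketch of one step, assuming AVX-512 VBMI and the succ_masks layout above (the helper name is illustrative):

#include <immintrin.h>
#include <cstdint>

// One Sheng32/64-style transition; build with -mavx512vbmi.
// succ_masks[c] holds, in byte lane i, the successor of state i on byte c.
static inline uint8_t dfa_step(__m512i &cur_state,
                               const __m512i *succ_masks, uint8_t c) {
    // Every lane of cur_state holds the same state id, so after the permute
    // every lane holds the same successor; reading lane 0 recovers it.
    cur_state = _mm512_permutexvar_epi8(cur_state, succ_masks[c]);
    return (uint8_t)_mm_cvtsi128_si32(_mm512_castsi512_si128(cur_state));
}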
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -282,3 +282,430 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
*scan_end = cur_buf;
return MO_CONTINUE_MATCHING;
}
#if defined(HAVE_AVX512VBMI)
static really_inline
char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
const struct sheng32 *s,
u8 *const cached_accept_state,
ReportID *const cached_accept_id,
u8 single, u64a base_offset, const u8 *buf, const u8 *start,
const u8 *end, const u8 **scan_end) {
DEBUG_PRINTF("Starting DFAx4 execution in state %u\n",
*state & SHENG32_STATE_MASK);
const u8 *cur_buf = start;
const u8 *min_accel_dist = start;
base_offset++;
DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start));
if (INNER_ACCEL_FUNC32(*state) || OUTER_ACCEL_FUNC32(*state)) {
DEBUG_PRINTF("Accel state reached @ 0\n");
const union AccelAux *aaux =
get_accel32(s, *state & SHENG32_STATE_MASK);
const u8 *new_offset = run_accel(aaux, cur_buf, end);
if (new_offset < cur_buf + BAD_ACCEL_DIST) {
min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
} else {
min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
}
DEBUG_PRINTF("Next accel chance: %llu\n",
(u64a)(min_accel_dist - start));
DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf);
cur_buf = new_offset;
DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start));
}
if (INNER_DEAD_FUNC32(*state) || OUTER_DEAD_FUNC32(*state)) {
DEBUG_PRINTF("Dead on arrival\n");
*scan_end = end;
return MO_CONTINUE_MATCHING;
}
m512 cur_state = set64x8(*state);
const m512 *masks = s->succ_masks;
while (likely(end - cur_buf >= 4)) {
const u8 *b1 = cur_buf;
const u8 *b2 = cur_buf + 1;
const u8 *b3 = cur_buf + 2;
const u8 *b4 = cur_buf + 3;
const u8 c1 = *b1;
const u8 c2 = *b2;
const u8 c3 = *b3;
const u8 c4 = *b4;
const m512 succ_mask1 = masks[c1];
cur_state = vpermb512(cur_state, succ_mask1);
const u8 a1 = movd512(cur_state);
const m512 succ_mask2 = masks[c2];
cur_state = vpermb512(cur_state, succ_mask2);
const u8 a2 = movd512(cur_state);
const m512 succ_mask3 = masks[c3];
cur_state = vpermb512(cur_state, succ_mask3);
const u8 a3 = movd512(cur_state);
const m512 succ_mask4 = masks[c4];
cur_state = vpermb512(cur_state, succ_mask4);
const u8 a4 = movd512(cur_state);
DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?');
DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK,
a1 & SHENG32_STATE_FLAG_MASK);
DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?');
DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG32_STATE_MASK,
a2 & SHENG32_STATE_FLAG_MASK);
DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?');
DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG32_STATE_MASK,
a3 & SHENG32_STATE_FLAG_MASK);
DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?');
DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG32_STATE_MASK,
a4 & SHENG32_STATE_FLAG_MASK);
if (unlikely(INTERESTING_FUNC32(a1, a2, a3, a4))) {
if (ACCEPT_FUNC32(a1)) {
u64a match_offset = base_offset + b1 - buf;
DEBUG_PRINTF("Accept state %u reached\n",
a1 & SHENG32_STATE_MASK);
DEBUG_PRINTF("Match @ %llu\n", match_offset);
if (STOP_AT_MATCH) {
DEBUG_PRINTF("Stopping at match @ %lli\n",
(s64a)(b1 - start));
*scan_end = b1;
*state = a1;
return MO_MATCHES_PENDING;
}
if (single) {
if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} else {
if (fireReports32(s, cb, ctxt, a1, match_offset,
cached_accept_state, cached_accept_id,
0) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
}
if (ACCEPT_FUNC32(a2)) {
u64a match_offset = base_offset + b2 - buf;
DEBUG_PRINTF("Accept state %u reached\n",
a2 & SHENG32_STATE_MASK);
DEBUG_PRINTF("Match @ %llu\n", match_offset);
if (STOP_AT_MATCH) {
DEBUG_PRINTF("Stopping at match @ %lli\n",
(s64a)(b2 - start));
*scan_end = b2;
*state = a2;
return MO_MATCHES_PENDING;
}
if (single) {
if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} else {
if (fireReports32(s, cb, ctxt, a2, match_offset,
cached_accept_state, cached_accept_id,
0) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
}
if (ACCEPT_FUNC32(a3)) {
u64a match_offset = base_offset + b3 - buf;
DEBUG_PRINTF("Accept state %u reached\n",
a3 & SHENG32_STATE_MASK);
DEBUG_PRINTF("Match @ %llu\n", match_offset);
if (STOP_AT_MATCH) {
DEBUG_PRINTF("Stopping at match @ %lli\n",
(s64a)(b3 - start));
*scan_end = b3;
*state = a3;
return MO_MATCHES_PENDING;
}
if (single) {
if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} else {
if (fireReports32(s, cb, ctxt, a3, match_offset,
cached_accept_state, cached_accept_id,
0) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
}
if (ACCEPT_FUNC32(a4)) {
u64a match_offset = base_offset + b4 - buf;
DEBUG_PRINTF("Accept state %u reached\n",
a4 & SHENG32_STATE_MASK);
DEBUG_PRINTF("Match @ %llu\n", match_offset);
if (STOP_AT_MATCH) {
DEBUG_PRINTF("Stopping at match @ %lli\n",
(s64a)(b4 - start));
*scan_end = b4;
*state = a4;
return MO_MATCHES_PENDING;
}
if (single) {
if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} else {
if (fireReports32(s, cb, ctxt, a4, match_offset,
cached_accept_state, cached_accept_id,
0) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
}
if (INNER_DEAD_FUNC32(a4)) {
DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf));
*scan_end = end;
*state = a4;
return MO_CONTINUE_MATCHING;
}
if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC32(a4)) {
DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf));
const union AccelAux *aaux =
get_accel32(s, a4 & SHENG32_STATE_MASK);
const u8 *new_offset = run_accel(aaux, cur_buf + 4, end);
if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) {
min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
} else {
min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
}
DEBUG_PRINTF("Next accel chance: %llu\n",
(u64a)(min_accel_dist - start));
DEBUG_PRINTF("Accel scanned %llu bytes\n",
(u64a)(new_offset - cur_buf - 4));
cur_buf = new_offset;
DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf));
continue;
}
}
if (OUTER_DEAD_FUNC32(a4)) {
DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf));
*scan_end = end;
*state = a4;
return MO_CONTINUE_MATCHING;
};
if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC32(a4)) {
DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf));
const union AccelAux *aaux =
get_accel32(s, a4 & SHENG32_STATE_MASK);
const u8 *new_offset = run_accel(aaux, cur_buf + 4, end);
if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) {
min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
} else {
min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
}
DEBUG_PRINTF("Next accel chance: %llu\n",
(u64a)(min_accel_dist - start));
DEBUG_PRINTF("Accel scanned %llu bytes\n",
(u64a)(new_offset - cur_buf - 4));
cur_buf = new_offset;
DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf));
continue;
};
cur_buf += 4;
}
*state = movd512(cur_state);
*scan_end = cur_buf;
return MO_CONTINUE_MATCHING;
}
#ifndef NO_SHENG64_IMPL
static really_inline
char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
const struct sheng64 *s,
u8 *const cached_accept_state,
ReportID *const cached_accept_id,
u8 single, u64a base_offset, const u8 *buf, const u8 *start,
const u8 *end, const u8 **scan_end) {
DEBUG_PRINTF("Starting DFAx4 execution in state %u\n",
*state & SHENG64_STATE_MASK);
const u8 *cur_buf = start;
base_offset++;
DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start));
if (INNER_DEAD_FUNC64(*state) || OUTER_DEAD_FUNC64(*state)) {
DEBUG_PRINTF("Dead on arrival\n");
*scan_end = end;
return MO_CONTINUE_MATCHING;
}
m512 cur_state = set64x8(*state);
const m512 *masks = s->succ_masks;
while (likely(end - cur_buf >= 4)) {
const u8 *b1 = cur_buf;
const u8 *b2 = cur_buf + 1;
const u8 *b3 = cur_buf + 2;
const u8 *b4 = cur_buf + 3;
const u8 c1 = *b1;
const u8 c2 = *b2;
const u8 c3 = *b3;
const u8 c4 = *b4;
const m512 succ_mask1 = masks[c1];
cur_state = vpermb512(cur_state, succ_mask1);
const u8 a1 = movd512(cur_state);
const m512 succ_mask2 = masks[c2];
cur_state = vpermb512(cur_state, succ_mask2);
const u8 a2 = movd512(cur_state);
const m512 succ_mask3 = masks[c3];
cur_state = vpermb512(cur_state, succ_mask3);
const u8 a3 = movd512(cur_state);
const m512 succ_mask4 = masks[c4];
cur_state = vpermb512(cur_state, succ_mask4);
const u8 a4 = movd512(cur_state);
DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?');
DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK,
a1 & SHENG64_STATE_FLAG_MASK);
DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?');
DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG64_STATE_MASK,
a2 & SHENG64_STATE_FLAG_MASK);
DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?');
DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG64_STATE_MASK,
a3 & SHENG64_STATE_FLAG_MASK);
DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?');
DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG64_STATE_MASK,
a4 & SHENG64_STATE_FLAG_MASK);
if (unlikely(INTERESTING_FUNC64(a1, a2, a3, a4))) {
if (ACCEPT_FUNC64(a1)) {
u64a match_offset = base_offset + b1 - buf;
DEBUG_PRINTF("Accept state %u reached\n",
a1 & SHENG64_STATE_MASK);
DEBUG_PRINTF("Match @ %llu\n", match_offset);
if (STOP_AT_MATCH) {
DEBUG_PRINTF("Stopping at match @ %lli\n",
(s64a)(b1 - start));
*scan_end = b1;
*state = a1;
return MO_MATCHES_PENDING;
}
if (single) {
if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} else {
if (fireReports64(s, cb, ctxt, a1, match_offset,
cached_accept_state, cached_accept_id,
0) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
}
if (ACCEPT_FUNC64(a2)) {
u64a match_offset = base_offset + b2 - buf;
DEBUG_PRINTF("Accept state %u reached\n",
a2 & SHENG64_STATE_MASK);
DEBUG_PRINTF("Match @ %llu\n", match_offset);
if (STOP_AT_MATCH) {
DEBUG_PRINTF("Stopping at match @ %lli\n",
(s64a)(b2 - start));
*scan_end = b2;
*state = a2;
return MO_MATCHES_PENDING;
}
if (single) {
if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} else {
if (fireReports64(s, cb, ctxt, a2, match_offset,
cached_accept_state, cached_accept_id,
0) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
}
if (ACCEPT_FUNC64(a3)) {
u64a match_offset = base_offset + b3 - buf;
DEBUG_PRINTF("Accept state %u reached\n",
a3 & SHENG64_STATE_MASK);
DEBUG_PRINTF("Match @ %llu\n", match_offset);
if (STOP_AT_MATCH) {
DEBUG_PRINTF("Stopping at match @ %lli\n",
(s64a)(b3 - start));
*scan_end = b3;
*state = a3;
return MO_MATCHES_PENDING;
}
if (single) {
if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} else {
if (fireReports64(s, cb, ctxt, a3, match_offset,
cached_accept_state, cached_accept_id,
0) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
}
if (ACCEPT_FUNC64(a4)) {
u64a match_offset = base_offset + b4 - buf;
DEBUG_PRINTF("Accept state %u reached\n",
a4 & SHENG64_STATE_MASK);
DEBUG_PRINTF("Match @ %llu\n", match_offset);
if (STOP_AT_MATCH) {
DEBUG_PRINTF("Stopping at match @ %lli\n",
(s64a)(b4 - start));
*scan_end = b4;
*state = a4;
return MO_MATCHES_PENDING;
}
if (single) {
if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} else {
if (fireReports64(s, cb, ctxt, a4, match_offset,
cached_accept_state, cached_accept_id,
0) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
}
if (INNER_DEAD_FUNC64(a4)) {
DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf));
*scan_end = end;
*state = a4;
return MO_CONTINUE_MATCHING;
}
}
if (OUTER_DEAD_FUNC64(a4)) {
DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf));
*scan_end = end;
*state = a4;
return MO_CONTINUE_MATCHING;
}
cur_buf += 4;
}
*state = movd512(cur_state);
*scan_end = cur_buf;
return MO_CONTINUE_MATCHING;
}
#endif // !NO_SHENG64_IMPL
#endif
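Note: the loop above is the heart of the Sheng64 engine. The current state is broadcast across all 64 lanes of a ZMM register, and each input byte costs one vpermb that looks the state up in that byte's 512-bit successor mask, plus one movd to extract the new state byte for the accept/dead tests. A minimal scalar model of one transition step, assuming succ_masks can be viewed as a plain 256x64 byte table (illustrative sketch, not the production path):

/* Scalar model of a single Sheng64 transition; the loop above performs this
 * lookup for all 64 lanes at once with one _mm512_permutexvar_epi8. */
static inline uint8_t sheng64_step_model(uint8_t state, uint8_t input,
                                         const uint8_t succ_masks[256][64]) {
    /* vpermb consumes only the low 6 bits of each index byte, so the two
     * flag bits (SHENG64_STATE_ACCEPT/DEAD, mask 0xC0) never disturb the
     * table lookup. */
    return succ_masks[input][state & 0x3F];
}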

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016, Intel Corporation * Copyright (c) 2016-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -38,6 +38,17 @@
#define SHENG_STATE_MASK 0xF #define SHENG_STATE_MASK 0xF
#define SHENG_STATE_FLAG_MASK 0x70 #define SHENG_STATE_FLAG_MASK 0x70
#define SHENG32_STATE_ACCEPT 0x20
#define SHENG32_STATE_DEAD 0x40
#define SHENG32_STATE_ACCEL 0x80
#define SHENG32_STATE_MASK 0x1F
#define SHENG32_STATE_FLAG_MASK 0xE0
#define SHENG64_STATE_ACCEPT 0x40
#define SHENG64_STATE_DEAD 0x80
#define SHENG64_STATE_MASK 0x3F
#define SHENG64_STATE_FLAG_MASK 0xC0
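Note on the packing above: a Sheng32 state byte keeps a 5-bit state id plus three flags (accept 0x20, dead 0x40, accel 0x80), while a Sheng64 state byte needs 6 bits for the id and so has room for only two flags. That is why there is no SHENG64_STATE_ACCEL, and why the sheng64 compile path later rejects DFAs whose successor states need acceleration. A small decoding sketch:

/* Sketch: decoding a packed sheng64 state byte under the masks above. */
static inline void decode_sheng64_state(uint8_t s, unsigned *id,
                                        int *accept, int *dead) {
    *id     = s & SHENG64_STATE_MASK;       /* low 6 bits: state 0..63 */
    *accept = !!(s & SHENG64_STATE_ACCEPT); /* 0x40 */
    *dead   = !!(s & SHENG64_STATE_DEAD);   /* 0x80 */
}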
#define SHENG_FLAG_SINGLE_REPORT 0x1 #define SHENG_FLAG_SINGLE_REPORT 0x1
#define SHENG_FLAG_CAN_DIE 0x2 #define SHENG_FLAG_CAN_DIE 0x2
#define SHENG_FLAG_HAS_ACCEL 0x4 #define SHENG_FLAG_HAS_ACCEL 0x4
@ -67,4 +78,30 @@ struct sheng {
ReportID report; ReportID report;
}; };
struct sheng32 {
m512 succ_masks[256];
u32 length;
u32 aux_offset;
u32 report_offset;
u32 accel_offset;
u8 n_states;
u8 anchored;
u8 floating;
u8 flags;
ReportID report;
};
struct sheng64 {
m512 succ_masks[256];
u32 length;
u32 aux_offset;
u32 report_offset;
u32 accel_offset;
u8 n_states;
u8 anchored;
u8 floating;
u8 flags;
ReportID report;
};
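The succ_masks arrays dominate the size of these structs; the arithmetic below is illustrative of the trade-off:

/* Table footprint implied by succ_masks[256]:
 *   sheng32/sheng64: 256 masks * 64 bytes (m512) = 16384 B = 16 KiB
 *   16-state sheng:  256 masks * 16 bytes (m128) =  4096 B =  4 KiB
 * so the wider engines spend 4x the table memory for 2-4x the states. */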
#endif /* SHENG_INTERNAL_H_ */ #endif /* SHENG_INTERNAL_H_ */

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -301,6 +301,28 @@ void dumpShuffleMask(const u8 chr, const u8 *buf, unsigned sz) {
} }
DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str()); DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str());
} }
static really_inline
void dumpShuffleMask32(const u8 chr, const u8 *buf, unsigned sz) {
stringstream o;
for (unsigned i = 0; i < sz; i++) {
o.width(2);
o << (buf[i] & SHENG32_STATE_MASK) << " ";
}
DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str());
}
static really_inline
void dumpShuffleMask64(const u8 chr, const u8 *buf, unsigned sz) {
stringstream o;
for (unsigned i = 0; i < sz; i++) {
o.width(2);
o << (buf[i] & SHENG64_STATE_MASK) << " ";
}
DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str());
}
#endif #endif
static static
@ -311,8 +333,15 @@ void fillAccelOut(const map<dstate_id_t, AccelScheme> &accel_escape_info,
} }
} }
template <typename T>
static static
u8 getShengState(dstate &state, dfa_info &info, u8 getShengState(UNUSED dstate &state, UNUSED dfa_info &info,
UNUSED map<dstate_id_t, AccelScheme> &accelInfo) {
return 0;
}
template <>
u8 getShengState<sheng>(dstate &state, dfa_info &info,
map<dstate_id_t, AccelScheme> &accelInfo) { map<dstate_id_t, AccelScheme> &accelInfo) {
u8 s = state.impl_id; u8 s = state.impl_id;
if (!state.reports.empty()) { if (!state.reports.empty()) {
@ -327,11 +356,41 @@ u8 getShengState(dstate &state, dfa_info &info,
return s; return s;
} }
template <>
u8 getShengState<sheng32>(dstate &state, dfa_info &info,
map<dstate_id_t, AccelScheme> &accelInfo) {
u8 s = state.impl_id;
if (!state.reports.empty()) {
s |= SHENG32_STATE_ACCEPT;
}
if (info.isDead(state)) {
s |= SHENG32_STATE_DEAD;
}
if (accelInfo.find(info.raw_id(state.impl_id)) != accelInfo.end()) {
s |= SHENG32_STATE_ACCEL;
}
return s;
}
template <>
u8 getShengState<sheng64>(dstate &state, dfa_info &info,
UNUSED map<dstate_id_t, AccelScheme> &accelInfo) {
u8 s = state.impl_id;
if (!state.reports.empty()) {
s |= SHENG64_STATE_ACCEPT;
}
if (info.isDead(state)) {
s |= SHENG64_STATE_DEAD;
}
return s;
}
template <typename T>
static static
void fillAccelAux(struct NFA *n, dfa_info &info, void fillAccelAux(struct NFA *n, dfa_info &info,
map<dstate_id_t, AccelScheme> &accelInfo) { map<dstate_id_t, AccelScheme> &accelInfo) {
DEBUG_PRINTF("Filling accel aux structures\n"); DEBUG_PRINTF("Filling accel aux structures\n");
sheng *s = (sheng *)getMutableImplNfa(n); T *s = (T *)getMutableImplNfa(n);
u32 offset = s->accel_offset; u32 offset = s->accel_offset;
for (dstate_id_t i = 0; i < info.size(); i++) { for (dstate_id_t i = 0; i < info.size(); i++) {
@ -349,10 +408,20 @@ void fillAccelAux(struct NFA *n, dfa_info &info,
} }
} }
template <typename T>
static static
void populateBasicInfo(struct NFA *n, dfa_info &info, void populateBasicInfo(UNUSED struct NFA *n, UNUSED dfa_info &info,
map<dstate_id_t, AccelScheme> &accelInfo, u32 aux_offset, UNUSED map<dstate_id_t, AccelScheme> &accelInfo,
u32 report_offset, u32 accel_offset, u32 total_size, UNUSED u32 aux_offset, UNUSED u32 report_offset,
UNUSED u32 accel_offset, UNUSED u32 total_size,
UNUSED u32 dfa_size) {
}
template <>
void populateBasicInfo<sheng>(struct NFA *n, dfa_info &info,
map<dstate_id_t, AccelScheme> &accelInfo,
u32 aux_offset, u32 report_offset,
u32 accel_offset, u32 total_size,
u32 dfa_size) { u32 dfa_size) {
n->length = total_size; n->length = total_size;
n->scratchStateSize = 1; n->scratchStateSize = 1;
@ -369,14 +438,65 @@ void populateBasicInfo(struct NFA *n, dfa_info &info,
s->length = dfa_size; s->length = dfa_size;
s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0; s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0;
s->anchored = getShengState(info.anchored, info, accelInfo); s->anchored = getShengState<sheng>(info.anchored, info, accelInfo);
s->floating = getShengState(info.floating, info, accelInfo); s->floating = getShengState<sheng>(info.floating, info, accelInfo);
} }
template <>
void populateBasicInfo<sheng32>(struct NFA *n, dfa_info &info,
map<dstate_id_t, AccelScheme> &accelInfo,
u32 aux_offset, u32 report_offset,
u32 accel_offset, u32 total_size,
u32 dfa_size) {
n->length = total_size;
n->scratchStateSize = 1;
n->streamStateSize = 1;
n->nPositions = info.size();
n->type = SHENG_NFA_32;
n->flags |= info.raw.hasEodReports() ? NFA_ACCEPTS_EOD : 0;
sheng32 *s = (sheng32 *)getMutableImplNfa(n);
s->aux_offset = aux_offset;
s->report_offset = report_offset;
s->accel_offset = accel_offset;
s->n_states = info.size();
s->length = dfa_size;
s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0;
s->anchored = getShengState<sheng32>(info.anchored, info, accelInfo);
s->floating = getShengState<sheng32>(info.floating, info, accelInfo);
}
template <>
void populateBasicInfo<sheng64>(struct NFA *n, dfa_info &info,
map<dstate_id_t, AccelScheme> &accelInfo,
u32 aux_offset, u32 report_offset,
u32 accel_offset, u32 total_size,
u32 dfa_size) {
n->length = total_size;
n->scratchStateSize = 1;
n->streamStateSize = 1;
n->nPositions = info.size();
n->type = SHENG_NFA_64;
n->flags |= info.raw.hasEodReports() ? NFA_ACCEPTS_EOD : 0;
sheng64 *s = (sheng64 *)getMutableImplNfa(n);
s->aux_offset = aux_offset;
s->report_offset = report_offset;
s->accel_offset = accel_offset;
s->n_states = info.size();
s->length = dfa_size;
s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0;
s->anchored = getShengState<sheng64>(info.anchored, info, accelInfo);
s->floating = getShengState<sheng64>(info.floating, info, accelInfo);
}
template <typename T>
static static
void fillTops(NFA *n, dfa_info &info, dstate_id_t id, void fillTops(NFA *n, dfa_info &info, dstate_id_t id,
map<dstate_id_t, AccelScheme> &accelInfo) { map<dstate_id_t, AccelScheme> &accelInfo) {
sheng *s = (sheng *)getMutableImplNfa(n); T *s = (T *)getMutableImplNfa(n);
u32 aux_base = s->aux_offset; u32 aux_base = s->aux_offset;
DEBUG_PRINTF("Filling tops for state %u\n", id); DEBUG_PRINTF("Filling tops for state %u\n", id);
@ -393,13 +513,14 @@ void fillTops(NFA *n, dfa_info &info, dstate_id_t id,
DEBUG_PRINTF("Top transition for state %u: %u\n", id, top_state.impl_id); DEBUG_PRINTF("Top transition for state %u: %u\n", id, top_state.impl_id);
aux->top = getShengState(top_state, info, accelInfo); aux->top = getShengState<T>(top_state, info, accelInfo);
} }
template <typename T>
static static
void fillAux(NFA *n, dfa_info &info, dstate_id_t id, vector<u32> &reports, void fillAux(NFA *n, dfa_info &info, dstate_id_t id, vector<u32> &reports,
vector<u32> &reports_eod, vector<u32> &report_offsets) { vector<u32> &reports_eod, vector<u32> &report_offsets) {
sheng *s = (sheng *)getMutableImplNfa(n); T *s = (T *)getMutableImplNfa(n);
u32 aux_base = s->aux_offset; u32 aux_base = s->aux_offset;
auto raw_id = info.raw_id(id); auto raw_id = info.raw_id(id);
@ -419,17 +540,25 @@ void fillAux(NFA *n, dfa_info &info, dstate_id_t id, vector<u32> &reports,
DEBUG_PRINTF("EOD report list offset: %u\n", aux->accept_eod); DEBUG_PRINTF("EOD report list offset: %u\n", aux->accept_eod);
} }
template <typename T>
static static
void fillSingleReport(NFA *n, ReportID r_id) { void fillSingleReport(NFA *n, ReportID r_id) {
sheng *s = (sheng *)getMutableImplNfa(n); T *s = (T *)getMutableImplNfa(n);
DEBUG_PRINTF("Single report ID: %u\n", r_id); DEBUG_PRINTF("Single report ID: %u\n", r_id);
s->report = r_id; s->report = r_id;
s->flags |= SHENG_FLAG_SINGLE_REPORT; s->flags |= SHENG_FLAG_SINGLE_REPORT;
} }
template <typename T>
static static
void createShuffleMasks(sheng *s, dfa_info &info, bool createShuffleMasks(UNUSED T *s, UNUSED dfa_info &info,
UNUSED map<dstate_id_t, AccelScheme> &accelInfo) {
return true;
}
template <>
bool createShuffleMasks<sheng>(sheng *s, dfa_info &info,
map<dstate_id_t, AccelScheme> &accelInfo) { map<dstate_id_t, AccelScheme> &accelInfo) {
for (u16 chr = 0; chr < 256; chr++) { for (u16 chr = 0; chr < 256; chr++) {
u8 buf[16] = {0}; u8 buf[16] = {0};
@ -437,19 +566,137 @@ void createShuffleMasks(sheng *s, dfa_info &info,
for (dstate_id_t idx = 0; idx < info.size(); idx++) { for (dstate_id_t idx = 0; idx < info.size(); idx++) {
auto &succ_state = info.next(idx, chr); auto &succ_state = info.next(idx, chr);
buf[idx] = getShengState(succ_state, info, accelInfo); buf[idx] = getShengState<sheng>(succ_state, info, accelInfo);
} }
#ifdef DEBUG #ifdef DEBUG
dumpShuffleMask(chr, buf, sizeof(buf)); dumpShuffleMask(chr, buf, sizeof(buf));
#endif #endif
memcpy(&s->shuffle_masks[chr], buf, sizeof(m128)); memcpy(&s->shuffle_masks[chr], buf, sizeof(m128));
} }
return true;
}
template <>
bool createShuffleMasks<sheng32>(sheng32 *s, dfa_info &info,
map<dstate_id_t, AccelScheme> &accelInfo) {
for (u16 chr = 0; chr < 256; chr++) {
u8 buf[64] = {0};
assert(info.size() <= 32);
for (dstate_id_t idx = 0; idx < info.size(); idx++) {
auto &succ_state = info.next(idx, chr);
buf[idx] = getShengState<sheng32>(succ_state, info, accelInfo);
buf[32 + idx] = buf[idx];
}
#ifdef DEBUG
dumpShuffleMask32(chr, buf, sizeof(buf));
#endif
memcpy(&s->succ_masks[chr], buf, sizeof(m512));
}
return true;
}
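A brief note on the buf[32 + idx] duplication above: a Sheng32 state byte can carry SHENG32_STATE_ACCEPT (0x20) inside the six bits that vpermb consumes, so index (id | 0x20) must select the same successor as plain id; the other flags (0x40, 0x80) sit above bit 5 and are discarded by vpermb anyway. A minimal illustration of the aliasing the duplication guarantees:

/* Sketch: both the plain and the accept-flagged form of a state id must
 * resolve to the same successor under vpermb's low-6-bit indexing. */
uint8_t id = 7;                    /* any state id, 0..31 */
uint8_t flagged = id | 0x20;       /* same state with the accept flag set */
assert(buf[id & 0x3F] == buf[flagged & 0x3F]);  /* holds by construction */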
template <>
bool createShuffleMasks<sheng64>(sheng64 *s, dfa_info &info,
map<dstate_id_t, AccelScheme> &accelInfo) {
for (u16 chr = 0; chr < 256; chr++) {
u8 buf[64] = {0};
assert(info.size() <= 64);
for (dstate_id_t idx = 0; idx < info.size(); idx++) {
auto &succ_state = info.next(idx, chr);
if (accelInfo.find(info.raw_id(succ_state.impl_id))
!= accelInfo.end()) {
return false;
}
buf[idx] = getShengState<sheng64>(succ_state, info, accelInfo);
}
#ifdef DEBUG
dumpShuffleMask64(chr, buf, sizeof(buf));
#endif
memcpy(&s->succ_masks[chr], buf, sizeof(m512));
}
return true;
} }
bool has_accel_sheng(const NFA *) { bool has_accel_sheng(const NFA *) {
return true; /* consider the sheng region as accelerated */ return true; /* consider the sheng region as accelerated */
} }
template <typename T>
static
bytecode_ptr<NFA> shengCompile_int(raw_dfa &raw, const CompileContext &cc,
set<dstate_id_t> *accel_states,
sheng_build_strat &strat,
dfa_info &info) {
if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming
* mode with our semantics */
raw.stripExtraEodReports();
}
auto accelInfo = strat.getAccelInfo(cc.grey);
// set impl_id of each dfa state
for (dstate_id_t i = 0; i < info.size(); i++) {
info[i].impl_id = i;
}
DEBUG_PRINTF("Anchored start state: %u, floating start state: %u\n",
info.anchored.impl_id, info.floating.impl_id);
u32 nfa_size = ROUNDUP_16(sizeof(NFA) + sizeof(T));
vector<u32> reports, eod_reports, report_offsets;
u8 isSingle = 0;
ReportID single_report = 0;
auto ri =
strat.gatherReports(reports, eod_reports, &isSingle, &single_report);
u32 total_aux = sizeof(sstate_aux) * info.size();
u32 total_accel = strat.accelSize() * accelInfo.size();
u32 total_reports = ri->getReportListSize();
u32 reports_offset = nfa_size + total_aux;
u32 accel_offset =
ROUNDUP_N(reports_offset + total_reports, alignof(AccelAux));
u32 total_size = ROUNDUP_N(accel_offset + total_accel, 64);
DEBUG_PRINTF("NFA: %u, aux: %u, reports: %u, accel: %u, total: %u\n",
nfa_size, total_aux, total_reports, total_accel, total_size);
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
populateBasicInfo<T>(nfa.get(), info, accelInfo, nfa_size,
reports_offset, accel_offset, total_size,
total_size - sizeof(NFA));
DEBUG_PRINTF("Setting up aux and report structures\n");
ri->fillReportLists(nfa.get(), reports_offset, report_offsets);
for (dstate_id_t idx = 0; idx < info.size(); idx++) {
fillTops<T>(nfa.get(), info, idx, accelInfo);
fillAux<T>(nfa.get(), info, idx, reports, eod_reports,
report_offsets);
}
if (isSingle) {
fillSingleReport<T>(nfa.get(), single_report);
}
fillAccelAux<T>(nfa.get(), info, accelInfo);
if (accel_states) {
fillAccelOut(accelInfo, accel_states);
}
if (!createShuffleMasks<T>((T *)getMutableImplNfa(nfa.get()), info, accelInfo)) {
return nullptr;
}
return nfa;
}
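The offset arithmetic in shengCompile_int lays the whole engine out as one contiguous allocation; a map of that layout (offsets illustrative, not normative):

/* Bytecode layout produced above:
 *   0              .. nfa_size-1       NFA header + struct T (16-byte rounded)
 *   nfa_size       .. reports_offset-1 sstate_aux[n_states]
 *   reports_offset .. accel_offset-1   report lists (ri->fillReportLists)
 *   accel_offset   .. total_size-1     AccelAux entries (AccelAux-aligned)
 * total_size is rounded up to 64 bytes so the engine stays cache-line sized. */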
bytecode_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc, bytecode_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm, bool only_accel_init, const ReportManager &rm, bool only_accel_init,
set<dstate_id_t> *accel_states) { set<dstate_id_t> *accel_states) {
@ -473,65 +720,75 @@ bytecode_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc,
return nullptr; return nullptr;
} }
if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming return shengCompile_int<sheng>(raw, cc, accel_states, strat, info);
* mode with our semantics */
raw.stripExtraEodReports();
}
auto accelInfo = strat.getAccelInfo(cc.grey);
// set impl_id of each dfa state
for (dstate_id_t i = 0; i < info.size(); i++) {
info[i].impl_id = i;
} }
DEBUG_PRINTF("Anchored start state: %u, floating start state: %u\n", bytecode_ptr<NFA> sheng32Compile(raw_dfa &raw, const CompileContext &cc,
info.anchored.impl_id, info.floating.impl_id); const ReportManager &rm, bool only_accel_init,
set<dstate_id_t> *accel_states) {
u32 nfa_size = ROUNDUP_16(sizeof(NFA) + sizeof(sheng)); if (!cc.grey.allowSheng) {
vector<u32> reports, eod_reports, report_offsets; DEBUG_PRINTF("Sheng is not allowed!\n");
u8 isSingle = 0; return nullptr;
ReportID single_report = 0;
auto ri =
strat.gatherReports(reports, eod_reports, &isSingle, &single_report);
u32 total_aux = sizeof(sstate_aux) * info.size();
u32 total_accel = strat.accelSize() * accelInfo.size();
u32 total_reports = ri->getReportListSize();
u32 reports_offset = nfa_size + total_aux;
u32 accel_offset =
ROUNDUP_N(reports_offset + total_reports, alignof(AccelAux));
u32 total_size = ROUNDUP_N(accel_offset + total_accel, 64);
DEBUG_PRINTF("NFA: %u, aux: %u, reports: %u, accel: %u, total: %u\n",
nfa_size, total_aux, total_reports, total_accel, total_size);
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
populateBasicInfo(nfa.get(), info, accelInfo, nfa_size, reports_offset,
accel_offset, total_size, total_size - sizeof(NFA));
DEBUG_PRINTF("Setting up aux and report structures\n");
ri->fillReportLists(nfa.get(), reports_offset, report_offsets);
for (dstate_id_t idx = 0; idx < info.size(); idx++) {
fillTops(nfa.get(), info, idx, accelInfo);
fillAux(nfa.get(), info, idx, reports, eod_reports, report_offsets);
}
if (isSingle) {
fillSingleReport(nfa.get(), single_report);
} }
fillAccelAux(nfa.get(), info, accelInfo); if (!cc.target_info.has_avx512vbmi()) {
DEBUG_PRINTF("Sheng32 failed, no HS_CPU_FEATURES_AVX512VBMI!\n");
if (accel_states) { return nullptr;
fillAccelOut(accelInfo, accel_states);
} }
createShuffleMasks((sheng *)getMutableImplNfa(nfa.get()), info, accelInfo); sheng_build_strat strat(raw, rm, only_accel_init);
dfa_info info(strat);
DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size());
DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n",
raw.start_anchored, raw.start_floating);
DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n",
info.can_die ? "can" : "cannot", info.size());
assert(info.size() > 16);
if (info.size() > 32) {
DEBUG_PRINTF("Too many states\n");
return nullptr;
}
return shengCompile_int<sheng32>(raw, cc, accel_states, strat, info);
}
bytecode_ptr<NFA> sheng64Compile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm, bool only_accel_init,
set<dstate_id_t> *accel_states) {
if (!cc.grey.allowSheng) {
DEBUG_PRINTF("Sheng is not allowed!\n");
return nullptr;
}
if (!cc.target_info.has_avx512vbmi()) {
DEBUG_PRINTF("Sheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n");
return nullptr;
}
sheng_build_strat strat(raw, rm, only_accel_init);
dfa_info info(strat);
DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size());
DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n",
raw.start_anchored, raw.start_floating);
DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n",
info.can_die ? "can" : "cannot", info.size());
assert(info.size() > 32);
if (info.size() > 64) {
DEBUG_PRINTF("Too many states\n");
return nullptr;
}
vector<dstate> old_states;
old_states = info.states;
auto nfa = shengCompile_int<sheng64>(raw, cc, accel_states, strat, info);
if (!nfa) {
info.states = old_states;
}
return nfa; return nfa;
} }
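Taken together, the asserts and size checks in the three entry points above imply a clean selection window per engine (sketch):

/* State-count windows for the sheng family:
 *   n <= 16       -> sheng    (pshufb on m128 masks)
 *   16 < n <= 32  -> sheng32  (requires AVX-512 VBMI)
 *   32 < n <= 64  -> sheng64  (requires AVX-512 VBMI; compilation bails out
 *                              if any successor state needs acceleration,
 *                              hence the old_states save/restore above)    */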

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016-2018, Intel Corporation * Copyright (c) 2016-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -71,6 +71,14 @@ bytecode_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm, bool only_accel_init, const ReportManager &rm, bool only_accel_init,
std::set<dstate_id_t> *accel_states = nullptr); std::set<dstate_id_t> *accel_states = nullptr);
bytecode_ptr<NFA> sheng32Compile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm, bool only_accel_init,
std::set<dstate_id_t> *accel_states = nullptr);
bytecode_ptr<NFA> sheng64Compile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm, bool only_accel_init,
std::set<dstate_id_t> *accel_states = nullptr);
struct sheng_escape_info { struct sheng_escape_info {
CharReach outs; CharReach outs;
CharReach outs2_single; CharReach outs2_single;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -51,7 +51,7 @@ namespace ue2 {
static static
const sstate_aux *get_aux(const NFA *n, dstate_id_t i) { const sstate_aux *get_aux(const NFA *n, dstate_id_t i) {
assert(n && isShengType(n->type)); assert(n && isSheng16Type(n->type));
const sheng *s = (const sheng *)getImplNfa(n); const sheng *s = (const sheng *)getImplNfa(n);
const sstate_aux *aux_base = const sstate_aux *aux_base =
@ -64,6 +64,36 @@ const sstate_aux *get_aux(const NFA *n, dstate_id_t i) {
return aux; return aux;
} }
static
const sstate_aux *get_aux32(const NFA *n, dstate_id_t i) {
assert(n && isSheng32Type(n->type));
const sheng32 *s = (const sheng32 *)getImplNfa(n);
const sstate_aux *aux_base =
(const sstate_aux *)((const char *)n + s->aux_offset);
const sstate_aux *aux = aux_base + i;
assert((const char *)aux < (const char *)s + s->length);
return aux;
}
static
const sstate_aux *get_aux64(const NFA *n, dstate_id_t i) {
assert(n && isSheng64Type(n->type));
const sheng64 *s = (const sheng64 *)getImplNfa(n);
const sstate_aux *aux_base =
(const sstate_aux *)((const char *)n + s->aux_offset);
const sstate_aux *aux = aux_base + i;
assert((const char *)aux < (const char *)s + s->length);
return aux;
}
static static
void dumpHeader(FILE *f, const sheng *s) { void dumpHeader(FILE *f, const sheng *s) {
fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states, fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states,
@ -79,6 +109,36 @@ void dumpHeader(FILE *f, const sheng *s) {
!!(s->flags & SHENG_FLAG_SINGLE_REPORT)); !!(s->flags & SHENG_FLAG_SINGLE_REPORT));
} }
static
void dumpHeader32(FILE *f, const sheng32 *s) {
fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states,
s->length);
fprintf(f, "aux base offset: %u, reports base offset: %u, "
"accel offset: %u\n",
s->aux_offset, s->report_offset, s->accel_offset);
fprintf(f, "anchored start state: %u, floating start state: %u\n",
s->anchored & SHENG32_STATE_MASK, s->floating & SHENG32_STATE_MASK);
fprintf(f, "has accel: %u can die: %u single report: %u\n",
!!(s->flags & SHENG_FLAG_HAS_ACCEL),
!!(s->flags & SHENG_FLAG_CAN_DIE),
!!(s->flags & SHENG_FLAG_SINGLE_REPORT));
}
static
void dumpHeader64(FILE *f, const sheng64 *s) {
fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states,
s->length);
fprintf(f, "aux base offset: %u, reports base offset: %u, "
"accel offset: %u\n",
s->aux_offset, s->report_offset, s->accel_offset);
fprintf(f, "anchored start state: %u, floating start state: %u\n",
s->anchored & SHENG64_STATE_MASK, s->floating & SHENG64_STATE_MASK);
fprintf(f, "has accel: %u can die: %u single report: %u\n",
!!(s->flags & SHENG_FLAG_HAS_ACCEL),
!!(s->flags & SHENG_FLAG_CAN_DIE),
!!(s->flags & SHENG_FLAG_SINGLE_REPORT));
}
static static
void dumpAux(FILE *f, u32 state, const sstate_aux *aux) { void dumpAux(FILE *f, u32 state, const sstate_aux *aux) {
fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, " fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, "
@ -87,6 +147,22 @@ void dumpAux(FILE *f, u32 state, const sstate_aux *aux) {
aux->top & SHENG_STATE_MASK); aux->top & SHENG_STATE_MASK);
} }
static
void dumpAux32(FILE *f, u32 state, const sstate_aux *aux) {
fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, "
"accel offset: %u, top: %u\n",
state, aux->accept, aux->accept_eod, aux->accel,
aux->top & SHENG32_STATE_MASK);
}
static
void dumpAux64(FILE *f, u32 state, const sstate_aux *aux) {
fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, "
"accel offset: %u, top: %u\n",
state, aux->accept, aux->accept_eod, aux->accel,
aux->top & SHENG64_STATE_MASK);
}
static static
void dumpReports(FILE *f, const report_list *rl) { void dumpReports(FILE *f, const report_list *rl) {
fprintf(f, "reports count: %u\n", rl->count); fprintf(f, "reports count: %u\n", rl->count);
@ -115,6 +191,46 @@ void dumpMasks(FILE *f, const sheng *s) {
} }
} }
static
void dumpMasks32(FILE *f, const sheng32 *s) {
for (u32 chr = 0; chr < 256; chr++) {
u8 buf[64];
m512 succ_mask = s->succ_masks[chr];
memcpy(buf, &succ_mask, sizeof(m512));
fprintf(f, "%3u: ", chr);
for (u32 pos = 0; pos < 64; pos++) {
u8 c = buf[pos];
if (c & SHENG32_STATE_FLAG_MASK) {
fprintf(f, "%2u* ", c & SHENG32_STATE_MASK);
} else {
fprintf(f, "%2u ", c & SHENG32_STATE_MASK);
}
}
fprintf(f, "\n");
}
}
static
void dumpMasks64(FILE *f, const sheng64 *s) {
for (u32 chr = 0; chr < 256; chr++) {
u8 buf[64];
m512 succ_mask = s->succ_masks[chr];
memcpy(buf, &succ_mask, sizeof(m512));
fprintf(f, "%3u: ", chr);
for (u32 pos = 0; pos < 64; pos++) {
u8 c = buf[pos];
if (c & SHENG64_STATE_FLAG_MASK) {
fprintf(f, "%2u* ", c & SHENG64_STATE_MASK);
} else {
fprintf(f, "%2u ", c & SHENG64_STATE_MASK);
}
}
fprintf(f, "\n");
}
}
static static
void nfaExecSheng_dumpText(const NFA *nfa, FILE *f) { void nfaExecSheng_dumpText(const NFA *nfa, FILE *f) {
assert(nfa->type == SHENG_NFA); assert(nfa->type == SHENG_NFA);
@ -153,6 +269,82 @@ void nfaExecSheng_dumpText(const NFA *nfa, FILE *f) {
fprintf(f, "\n"); fprintf(f, "\n");
} }
static
void nfaExecSheng32_dumpText(const NFA *nfa, FILE *f) {
assert(nfa->type == SHENG_NFA_32);
const sheng32 *s = (const sheng32 *)getImplNfa(nfa);
fprintf(f, "sheng32 DFA\n");
dumpHeader32(f, s);
for (u32 state = 0; state < s->n_states; state++) {
const sstate_aux *aux = get_aux32(nfa, state);
dumpAux32(f, state, aux);
if (aux->accept) {
fprintf(f, "report list:\n");
const report_list *rl =
(const report_list *)((const char *)nfa + aux->accept);
dumpReports(f, rl);
}
if (aux->accept_eod) {
fprintf(f, "EOD report list:\n");
const report_list *rl =
(const report_list *)((const char *)nfa + aux->accept_eod);
dumpReports(f, rl);
}
if (aux->accel) {
fprintf(f, "accel:\n");
const AccelAux *accel =
(const AccelAux *)((const char *)nfa + aux->accel);
dumpAccelInfo(f, *accel);
}
}
fprintf(f, "\n");
dumpMasks32(f, s);
fprintf(f, "\n");
}
static
void nfaExecSheng64_dumpText(const NFA *nfa, FILE *f) {
assert(nfa->type == SHENG_NFA_64);
const sheng64 *s = (const sheng64 *)getImplNfa(nfa);
fprintf(f, "sheng64 DFA\n");
dumpHeader64(f, s);
for (u32 state = 0; state < s->n_states; state++) {
const sstate_aux *aux = get_aux64(nfa, state);
dumpAux64(f, state, aux);
if (aux->accept) {
fprintf(f, "report list:\n");
const report_list *rl =
(const report_list *)((const char *)nfa + aux->accept);
dumpReports(f, rl);
}
if (aux->accept_eod) {
fprintf(f, "EOD report list:\n");
const report_list *rl =
(const report_list *)((const char *)nfa + aux->accept_eod);
dumpReports(f, rl);
}
if (aux->accel) {
fprintf(f, "accel:\n");
const AccelAux *accel =
(const AccelAux *)((const char *)nfa + aux->accel);
dumpAccelInfo(f, *accel);
}
}
fprintf(f, "\n");
dumpMasks64(f, s);
fprintf(f, "\n");
}
static static
void dumpDotPreambleDfa(FILE *f) { void dumpDotPreambleDfa(FILE *f) {
dumpDotPreamble(f); dumpDotPreamble(f);
@ -163,8 +355,14 @@ void dumpDotPreambleDfa(FILE *f) {
fprintf(f, "0 [style=invis];\n"); fprintf(f, "0 [style=invis];\n");
} }
template <typename T>
static static
void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { void describeNode(UNUSED const NFA *n, UNUSED const T *s, UNUSED u16 i,
UNUSED FILE *f) {
}
template <>
void describeNode<sheng>(const NFA *n, const sheng *s, u16 i, FILE *f) {
const sstate_aux *aux = get_aux(n, i); const sstate_aux *aux = get_aux(n, i);
fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
@ -193,6 +391,66 @@ void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) {
} }
} }
template <>
void describeNode<sheng32>(const NFA *n, const sheng32 *s, u16 i, FILE *f) {
const sstate_aux *aux = get_aux32(n, i);
fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
"label = \"%u\" ]; \n",
i, i);
if (aux->accept_eod) {
fprintf(f, "%u [ color = darkorchid ];\n", i);
}
if (aux->accept) {
fprintf(f, "%u [ shape = doublecircle ];\n", i);
}
if (aux->top && (aux->top & SHENG32_STATE_MASK) != i) {
fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i,
aux->top & SHENG32_STATE_MASK);
}
if (i == (s->anchored & SHENG32_STATE_MASK)) {
fprintf(f, "STARTA -> %u [color = blue ]\n", i);
}
if (i == (s->floating & SHENG32_STATE_MASK)) {
fprintf(f, "STARTF -> %u [color = red ]\n", i);
}
}
template <>
void describeNode<sheng64>(const NFA *n, const sheng64 *s, u16 i, FILE *f) {
const sstate_aux *aux = get_aux64(n, i);
fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
"label = \"%u\" ]; \n",
i, i);
if (aux->accept_eod) {
fprintf(f, "%u [ color = darkorchid ];\n", i);
}
if (aux->accept) {
fprintf(f, "%u [ shape = doublecircle ];\n", i);
}
if (aux->top && (aux->top & SHENG64_STATE_MASK) != i) {
fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i,
aux->top & SHENG64_STATE_MASK);
}
if (i == (s->anchored & SHENG64_STATE_MASK)) {
fprintf(f, "STARTA -> %u [color = blue ]\n", i);
}
if (i == (s->floating & SHENG64_STATE_MASK)) {
fprintf(f, "STARTF -> %u [color = red ]\n", i);
}
}
static static
void describeEdge(FILE *f, const u16 *t, u16 i) { void describeEdge(FILE *f, const u16 *t, u16 i) {
for (u16 s = 0; s < N_CHARS; s++) { for (u16 s = 0; s < N_CHARS; s++) {
@ -228,7 +486,7 @@ void describeEdge(FILE *f, const u16 *t, u16 i) {
static static
void shengGetTransitions(const NFA *n, u16 state, u16 *t) { void shengGetTransitions(const NFA *n, u16 state, u16 *t) {
assert(isShengType(n->type)); assert(isSheng16Type(n->type));
const sheng *s = (const sheng *)getImplNfa(n); const sheng *s = (const sheng *)getImplNfa(n);
const sstate_aux *aux = get_aux(n, state); const sstate_aux *aux = get_aux(n, state);
@ -244,6 +502,42 @@ void shengGetTransitions(const NFA *n, u16 state, u16 *t) {
t[TOP] = aux->top & SHENG_STATE_MASK; t[TOP] = aux->top & SHENG_STATE_MASK;
} }
static
void sheng32GetTransitions(const NFA *n, u16 state, u16 *t) {
assert(isSheng32Type(n->type));
const sheng32 *s = (const sheng32 *)getImplNfa(n);
const sstate_aux *aux = get_aux32(n, state);
for (unsigned i = 0; i < N_CHARS; i++) {
u8 buf[64];
m512 succ_mask = s->succ_masks[i];
memcpy(buf, &succ_mask, sizeof(m512));
t[i] = buf[state] & SHENG32_STATE_MASK;
}
t[TOP] = aux->top & SHENG32_STATE_MASK;
}
static
void sheng64GetTransitions(const NFA *n, u16 state, u16 *t) {
assert(isSheng64Type(n->type));
const sheng64 *s = (const sheng64 *)getImplNfa(n);
const sstate_aux *aux = get_aux64(n, state);
for (unsigned i = 0; i < N_CHARS; i++) {
u8 buf[64];
m512 succ_mask = s->succ_masks[i];
memcpy(buf, &succ_mask, sizeof(m512));
t[i] = buf[state] & SHENG64_STATE_MASK;
}
t[TOP] = aux->top & SHENG64_STATE_MASK;
}
static static
void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) {
assert(nfa->type == SHENG_NFA); assert(nfa->type == SHENG_NFA);
@ -252,7 +546,7 @@ void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) {
dumpDotPreambleDfa(f); dumpDotPreambleDfa(f);
for (u16 i = 1; i < s->n_states; i++) { for (u16 i = 1; i < s->n_states; i++) {
describeNode(nfa, s, i, f); describeNode<sheng>(nfa, s, i, f);
u16 t[ALPHABET_SIZE]; u16 t[ALPHABET_SIZE];
@ -264,10 +558,62 @@ void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) {
fprintf(f, "}\n"); fprintf(f, "}\n");
} }
static
void nfaExecSheng32_dumpDot(const NFA *nfa, FILE *f) {
assert(nfa->type == SHENG_NFA_32);
const sheng32 *s = (const sheng32 *)getImplNfa(nfa);
dumpDotPreambleDfa(f);
for (u16 i = 1; i < s->n_states; i++) {
describeNode<sheng32>(nfa, s, i, f);
u16 t[ALPHABET_SIZE];
sheng32GetTransitions(nfa, i, t);
describeEdge(f, t, i);
}
fprintf(f, "}\n");
}
static
void nfaExecSheng64_dumpDot(const NFA *nfa, FILE *f) {
assert(nfa->type == SHENG_NFA_64);
const sheng64 *s = (const sheng64 *)getImplNfa(nfa);
dumpDotPreambleDfa(f);
for (u16 i = 1; i < s->n_states; i++) {
describeNode<sheng64>(nfa, s, i, f);
u16 t[ALPHABET_SIZE];
sheng64GetTransitions(nfa, i, t);
describeEdge(f, t, i);
}
fprintf(f, "}\n");
}
void nfaExecSheng_dump(const NFA *nfa, const string &base) { void nfaExecSheng_dump(const NFA *nfa, const string &base) {
assert(nfa->type == SHENG_NFA); assert(nfa->type == SHENG_NFA);
nfaExecSheng_dumpText(nfa, StdioFile(base + ".txt", "w")); nfaExecSheng_dumpText(nfa, StdioFile(base + ".txt", "w"));
nfaExecSheng_dumpDot(nfa, StdioFile(base + ".dot", "w")); nfaExecSheng_dumpDot(nfa, StdioFile(base + ".dot", "w"));
} }
void nfaExecSheng32_dump(UNUSED const NFA *nfa, UNUSED const string &base) {
assert(nfa->type == SHENG_NFA_32);
nfaExecSheng32_dumpText(nfa, StdioFile(base + ".txt", "w"));
nfaExecSheng32_dumpDot(nfa, StdioFile(base + ".dot", "w"));
}
void nfaExecSheng64_dump(UNUSED const NFA *nfa, UNUSED const string &base) {
assert(nfa->type == SHENG_NFA_64);
nfaExecSheng64_dumpText(nfa, StdioFile(base + ".txt", "w"));
nfaExecSheng64_dumpDot(nfa, StdioFile(base + ".dot", "w"));
}
} // namespace ue2 } // namespace ue2

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016, Intel Corporation * Copyright (c) 2016-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -38,6 +38,8 @@ struct NFA;
namespace ue2 { namespace ue2 {
void nfaExecSheng_dump(const struct NFA *nfa, const std::string &base); void nfaExecSheng_dump(const struct NFA *nfa, const std::string &base);
void nfaExecSheng32_dump(const struct NFA *nfa, const std::string &base);
void nfaExecSheng64_dump(const struct NFA *nfa, const std::string &base);
} // namespace ue2 } // namespace ue2

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -632,8 +632,8 @@ bytecode_ptr<NFA>
constructNFA(const NGHolder &h_in, const ReportManager *rm, constructNFA(const NGHolder &h_in, const ReportManager *rm,
const map<u32, u32> &fixed_depth_tops, const map<u32, u32> &fixed_depth_tops,
const map<u32, vector<vector<CharReach>>> &triggers, const map<u32, vector<vector<CharReach>>> &triggers,
bool compress_state, bool do_accel, bool impl_test_only, u32 hint, bool compress_state, bool do_accel, bool impl_test_only,
const CompileContext &cc) { bool &fast, u32 hint, const CompileContext &cc) {
if (!has_managed_reports(h_in)) { if (!has_managed_reports(h_in)) {
rm = nullptr; rm = nullptr;
} else { } else {
@ -684,19 +684,19 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm,
} }
return generate(*h, state_ids, repeats, reportSquashMap, squashMap, tops, return generate(*h, state_ids, repeats, reportSquashMap, squashMap, tops,
zombies, do_accel, compress_state, hint, cc); zombies, do_accel, compress_state, fast, hint, cc);
} }
bytecode_ptr<NFA> bytecode_ptr<NFA>
constructNFA(const NGHolder &h_in, const ReportManager *rm, constructNFA(const NGHolder &h_in, const ReportManager *rm,
const map<u32, u32> &fixed_depth_tops, const map<u32, u32> &fixed_depth_tops,
const map<u32, vector<vector<CharReach>>> &triggers, const map<u32, vector<vector<CharReach>>> &triggers,
bool compress_state, const CompileContext &cc) { bool compress_state, bool &fast, const CompileContext &cc) {
const u32 hint = INVALID_NFA; const u32 hint = INVALID_NFA;
const bool do_accel = cc.grey.accelerateNFA; const bool do_accel = cc.grey.accelerateNFA;
const bool impl_test_only = false; const bool impl_test_only = false;
return constructNFA(h_in, rm, fixed_depth_tops, triggers, compress_state, return constructNFA(h_in, rm, fixed_depth_tops, triggers, compress_state,
do_accel, impl_test_only, hint, cc); do_accel, impl_test_only, fast, hint, cc);
} }
#ifndef RELEASE_BUILD #ifndef RELEASE_BUILD
@ -705,11 +705,11 @@ bytecode_ptr<NFA>
constructNFA(const NGHolder &h_in, const ReportManager *rm, constructNFA(const NGHolder &h_in, const ReportManager *rm,
const map<u32, u32> &fixed_depth_tops, const map<u32, u32> &fixed_depth_tops,
const map<u32, vector<vector<CharReach>>> &triggers, const map<u32, vector<vector<CharReach>>> &triggers,
bool compress_state, u32 hint, const CompileContext &cc) { bool compress_state, bool &fast, u32 hint, const CompileContext &cc) {
const bool do_accel = cc.grey.accelerateNFA; const bool do_accel = cc.grey.accelerateNFA;
const bool impl_test_only = false; const bool impl_test_only = false;
return constructNFA(h_in, rm, fixed_depth_tops, triggers, return constructNFA(h_in, rm, fixed_depth_tops, triggers, compress_state,
compress_state, do_accel, impl_test_only, hint, cc); do_accel, impl_test_only, fast, hint, cc);
} }
#endif // RELEASE_BUILD #endif // RELEASE_BUILD
@ -739,9 +739,10 @@ bytecode_ptr<NFA> constructReversedNFA_i(const NGHolder &h_in, u32 hint,
vector<BoundedRepeatData> repeats; vector<BoundedRepeatData> repeats;
unordered_map<NFAVertex, NFAStateSet> reportSquashMap; unordered_map<NFAVertex, NFAStateSet> reportSquashMap;
unordered_map<NFAVertex, NFAStateSet> squashMap; unordered_map<NFAVertex, NFAStateSet> squashMap;
UNUSED bool fast = false;
return generate(h, state_ids, repeats, reportSquashMap, squashMap, tops, return generate(h, state_ids, repeats, reportSquashMap, squashMap, tops,
zombies, false, false, hint, cc); zombies, false, false, fast, hint, cc);
} }
bytecode_ptr<NFA> constructReversedNFA(const NGHolder &h_in, bytecode_ptr<NFA> constructReversedNFA(const NGHolder &h_in,

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -100,7 +100,7 @@ bytecode_ptr<NFA>
constructNFA(const NGHolder &g, const ReportManager *rm, constructNFA(const NGHolder &g, const ReportManager *rm,
const std::map<u32, u32> &fixed_depth_tops, const std::map<u32, u32> &fixed_depth_tops,
const std::map<u32, std::vector<std::vector<CharReach>>> &triggers, const std::map<u32, std::vector<std::vector<CharReach>>> &triggers,
bool compress_state, const CompileContext &cc); bool compress_state, bool &fast, const CompileContext &cc);
/** /**
* \brief Build a reverse NFA from the graph given, which should have already * \brief Build a reverse NFA from the graph given, which should have already
@ -129,7 +129,7 @@ bytecode_ptr<NFA>
constructNFA(const NGHolder &g, const ReportManager *rm, constructNFA(const NGHolder &g, const ReportManager *rm,
const std::map<u32, u32> &fixed_depth_tops, const std::map<u32, u32> &fixed_depth_tops,
const std::map<u32, std::vector<std::vector<CharReach>>> &triggers, const std::map<u32, std::vector<std::vector<CharReach>>> &triggers,
bool compress_state, u32 hint, const CompileContext &cc); bool compress_state, bool &fast, u32 hint, const CompileContext &cc);
/** /**
* \brief Build a reverse NFA (with model type hint) from the graph given, * \brief Build a reverse NFA (with model type hint) from the graph given,

View File

@ -69,14 +69,14 @@ struct LitGraphVertexProps {
LitGraphVertexProps() = default; LitGraphVertexProps() = default;
explicit LitGraphVertexProps(ue2_literal::elem c_in) : c(move(c_in)) {} explicit LitGraphVertexProps(ue2_literal::elem c_in) : c(move(c_in)) {}
ue2_literal::elem c; // string element (char + bool) ue2_literal::elem c; // string element (char + bool)
size_t index; // managed by ue2_graph size_t index = 0; // managed by ue2_graph
}; };
struct LitGraphEdgeProps { struct LitGraphEdgeProps {
LitGraphEdgeProps() = default; LitGraphEdgeProps() = default;
explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {} explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {}
u64a score = NO_LITERAL_AT_EDGE_SCORE; u64a score = NO_LITERAL_AT_EDGE_SCORE;
size_t index; // managed by ue2_graph size_t index = 0; // managed by ue2_graph
}; };
struct LitGraph struct LitGraph

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2018-2019, Intel Corporation * Copyright (c) 2018-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -33,6 +33,7 @@
#include "parser/parse_error.h" #include "parser/parse_error.h"
#include "util/container.h" #include "util/container.h"
#include "hs_compile.h" #include "hs_compile.h"
#include "allocator.h"
#include <vector> #include <vector>
@ -151,7 +152,7 @@ void ParsedLogical::validateSubIDs(const unsigned *ids,
if (info->unordered_matches) { if (info->unordered_matches) {
throw CompileError("Have unordered match in sub-expressions."); throw CompileError("Have unordered match in sub-expressions.");
} }
free(info); hs_misc_free(info);
} }
} }
} }
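The one-line change above (the issue #284 fix noted in the changelog) matters because the expression info was allocated through Hyperscan's pluggable misc allocator; releasing it with plain free() breaks once an application installs its own allocator. A minimal library-internal sketch, assuming the hs_misc_alloc/hs_misc_free pair declared in allocator.h:

#include "allocator.h"  /* declares the pluggable hs_misc_alloc/hs_misc_free */

static void allocator_pairing_sketch(void) {
    void *p = hs_misc_alloc(64);  /* honours any user-installed allocator */
    if (p) {
        /* ... */
        hs_misc_free(p);          /* plain free(p) would be wrong once the
                                   * application has replaced the allocator */
    }
}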

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2019, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -767,10 +767,10 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask,
c_shift = c_len - ci->len; c_shift = c_len - ci->len;
c_len = ci->len; c_len = ci->len;
} }
copy_upto_32_bytes((u8 *)&data - offset, ci->buf, c_len); copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len);
} }
assert(h_shift + h_len + c_len + c_shift == 32); assert(h_shift + h_len + c_len + c_shift == 32);
copy_upto_32_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len);
} else { } else {
if (offset + 32 > (s64a)ci->len) { if (offset + 32 > (s64a)ci->len) {
if (offset >= (s64a)ci->len) { if (offset >= (s64a)ci->len) {
@ -779,7 +779,7 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask,
} }
c_len = ci->len - offset; c_len = ci->len - offset;
c_shift = 32 - c_len; c_shift = 32 - c_len;
copy_upto_32_bytes((u8 *)&data, ci->buf + offset, c_len); copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len);
} else { } else {
data = loadu256(ci->buf + offset); data = loadu256(ci->buf + offset);
} }
@ -800,12 +800,90 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask,
return 0; return 0;
} }
// get 128/256 bits data from history and current buffer. #ifdef HAVE_AVX512
static rose_inline
int roseCheckMask64(const struct core_info *ci, const u8 *and_mask,
const u8 *cmp_mask, const u64a neg_mask,
s32 checkOffset, u64a end) {
const s64a base_offset = (s64a)end - ci->buf_offset;
s64a offset = base_offset + checkOffset;
DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset);
DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset);
if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) {
DEBUG_PRINTF("too early, fail\n");
return 0;
}
m512 data = zeroes512(); // consists of the following four parts.
s32 c_shift = 0; // blank bytes after current.
s32 h_shift = 0; // blank bytes before history.
s32 h_len = 64; // number of bytes from history buffer.
s32 c_len = 0; // number of bytes from current buffer.
/* h_shift + h_len + c_len + c_shift == 64 must hold. */
if (offset < 0) {
s32 h_offset = 0; // the start offset in history buffer.
if (offset < -(s64a)ci->hlen) {
if (offset + 64 <= -(s64a)ci->hlen) {
DEBUG_PRINTF("all before history\n");
return 1;
}
h_shift = -(offset + (s64a)ci->hlen);
h_len = 64 - h_shift;
} else {
h_offset = ci->hlen + offset;
}
if (offset + 64 > 0) {
// part in current buffer.
c_len = offset + 64;
h_len = -(offset + h_shift);
if (c_len > (s64a)ci->len) {
// out of current buffer.
c_shift = c_len - ci->len;
c_len = ci->len;
}
copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len);
}
assert(h_shift + h_len + c_len + c_shift == 64);
copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len);
} else {
if (offset + 64 > (s64a)ci->len) {
if (offset >= (s64a)ci->len) {
DEBUG_PRINTF("all in the future.\n");
return 1;
}
c_len = ci->len - offset;
c_shift = 64 - c_len;
copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len);
} else {
data = loadu512(ci->buf + offset);
}
}
DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift);
DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len);
// we use valid_data_mask to mask off bytes that lie before history or past the end of the current buffer.
u64a valid_data_mask;
valid_data_mask = (~0ULL) << (h_shift + c_shift) >> (c_shift);
m512 and_mask_m512 = loadu512(and_mask);
m512 cmp_mask_m512 = loadu512(cmp_mask);
if (validateMask64(data, valid_data_mask, and_mask_m512,
cmp_mask_m512, neg_mask)) {
DEBUG_PRINTF("Mask64 passed\n");
return 1;
}
return 0;
}
#endif
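The h_shift/h_len/c_len/c_shift bookkeeping above stitches a 64-byte window out of up to four regions (padding, history bytes, current bytes, padding). A worked instance makes the invariant concrete (values illustrative):

/* Worked example for roseCheckMask64:
 *   history hlen = 16, current len = 20, offset = -10
 *   (the window starts 10 bytes back into history).
 *     h_offset = 16 + (-10) = 6   -> read history starting at byte 6
 *     h_shift  = 0,  h_len = 10   -> history fills data[0..9]
 *     c_len    = 20, c_shift = 34 -> current buffer fills data[10..29];
 *                                    data[30..63] lie past the scanned data
 *   check: 0 + 10 + 20 + 34 == 64
 *   valid_data_mask = ~0ULL << (0 + 34) >> 34 = 0x3FFFFFFF (low 30 bits),
 *   so the and/cmp/neg test judges only the 30 bytes that really exist. */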
// get 128/256/512 bits of data from the history and current buffers.
// return data and valid_data_mask. // return data and valid_data_mask.
static rose_inline static rose_inline
u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, u64a getBufferDataComplex(const struct core_info *ci, const s64a loc,
u8 *data, const u32 data_len) { u8 *data, const u32 data_len) {
assert(data_len == 16 || data_len == 32); assert(data_len == 16 || data_len == 32 || data_len == 64);
s32 c_shift = 0; // blank bytes after current. s32 c_shift = 0; // blank bytes after current.
s32 h_shift = 0; // blank bytes before history. s32 h_shift = 0; // blank bytes before history.
s32 h_len = data_len; // number of bytes from history buffer. s32 h_len = data_len; // number of bytes from history buffer.
@ -831,10 +909,10 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc,
c_shift = c_len - ci->len; c_shift = c_len - ci->len;
c_len = ci->len; c_len = ci->len;
} }
copy_upto_32_bytes(data - loc, ci->buf, c_len); copy_upto_64_bytes(data - loc, ci->buf, c_len);
} }
assert(h_shift + h_len + c_len + c_shift == (s32)data_len); assert(h_shift + h_len + c_len + c_shift == (s32)data_len);
copy_upto_32_bytes(data + h_shift, ci->hbuf + h_offset, h_len); copy_upto_64_bytes(data + h_shift, ci->hbuf + h_offset, h_len);
} else { } else {
if (loc + data_len > (s64a)ci->len) { if (loc + data_len > (s64a)ci->len) {
if (loc >= (s64a)ci->len) { if (loc >= (s64a)ci->len) {
@ -843,8 +921,14 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc,
} }
c_len = ci->len - loc; c_len = ci->len - loc;
c_shift = data_len - c_len; c_shift = data_len - c_len;
copy_upto_32_bytes(data, ci->buf + loc, c_len); copy_upto_64_bytes(data, ci->buf + loc, c_len);
} else { } else {
#ifdef HAVE_AVX512
if (data_len == 64) {
storeu512(data, loadu512(ci->buf + loc));
return ~0ULL;
}
#endif
if (data_len == 16) { if (data_len == 16) {
storeu128(data, loadu128(ci->buf + loc)); storeu128(data, loadu128(ci->buf + loc));
return 0xffff; return 0xffff;
@ -857,6 +941,11 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc,
DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift);
DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len);
#ifdef HAVE_AVX512
if (data_len == 64) {
return (~0ULL) << (h_shift + c_shift) >> c_shift;
}
#endif
if (data_len == 16) { if (data_len == 16) {
return (u16)(0xffff << (h_shift + c_shift)) >> c_shift; return (u16)(0xffff << (h_shift + c_shift)) >> c_shift;
} else { } else {
@ -886,6 +975,19 @@ m256 getData256(const struct core_info *ci, s64a offset, u32 *valid_data_mask) {
return *(m256 *)data; return *(m256 *)data;
} }
#ifdef HAVE_AVX512
static rose_inline
m512 getData512(const struct core_info *ci, s64a offset, u64a *valid_data_mask) {
if (offset > 0 && offset + sizeof(m512) <= ci->len) {
*valid_data_mask = ~0ULL;
return loadu512(ci->buf + offset);
}
ALIGN_CL_DIRECTIVE u8 data[sizeof(m512)];
*valid_data_mask = getBufferDataComplex(ci, offset, data, 64);
return *(m512 *)data;
}
#endif
static rose_inline static rose_inline
int roseCheckShufti16x8(const struct core_info *ci, const u8 *nib_mask, int roseCheckShufti16x8(const struct core_info *ci, const u8 *nib_mask,
const u8 *bucket_select_mask, u32 neg_mask, const u8 *bucket_select_mask, u32 neg_mask,
@ -1025,6 +1127,83 @@ int roseCheckShufti32x16(const struct core_info *ci, const u8 *hi_mask,
} }
} }
#ifdef HAVE_AVX512
static rose_inline
int roseCheckShufti64x8(const struct core_info *ci, const u8 *hi_mask,
const u8 *lo_mask, const u8 *bucket_select_mask,
u64a neg_mask, s32 checkOffset, u64a end) {
const s64a base_offset = (s64a)end - ci->buf_offset;
s64a offset = base_offset + checkOffset;
DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset);
DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset);
if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) {
DEBUG_PRINTF("too early, fail\n");
return 0;
}
u64a valid_data_mask = 0;
m512 data = getData512(ci, offset, &valid_data_mask);
if (unlikely(!valid_data_mask)) {
return 1;
}
m512 hi_mask_m512 = loadu512(hi_mask);
m512 lo_mask_m512 = loadu512(lo_mask);
m512 bucket_select_mask_m512 = loadu512(bucket_select_mask);
if (validateShuftiMask64x8(data, hi_mask_m512, lo_mask_m512,
bucket_select_mask_m512,
neg_mask, valid_data_mask)) {
DEBUG_PRINTF("check shufti 64x8 successfully\n");
return 1;
} else {
return 0;
}
}
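The shufti check above classifies each byte by splitting it into nibbles, looking each nibble up in an 8-bit bucket bitmap, and intersecting the two lookups; a position "hits" when that intersection overlaps its bucket_select entry. A scalar model of my reading of validateShuftiMask64x8, where hi/lo are the 16-entry nibble tables that the 512-bit masks replicate, and the pass condition (hit bitmap must agree with neg_mask on every valid lane) is an assumption:

/* Scalar sketch of the 64x8 shufti check; the AVX-512 path evaluates all
 * 64 positions with two in-lane shuffles and one mask compare. */
static int shufti64x8_model(const uint8_t data[64], const uint8_t hi[16],
                            const uint8_t lo[16], const uint8_t sel[64],
                            uint64_t neg_mask, uint64_t valid_mask) {
    uint64_t hits = 0;
    for (int i = 0; i < 64; i++) {
        uint8_t buckets = hi[data[i] >> 4] & lo[data[i] & 0xF];
        if (buckets & sel[i]) {
            hits |= 1ULL << i;   /* byte i belongs to a selected bucket */
        }
    }
    return ((hits ^ neg_mask) & valid_mask) == 0;  /* assumed semantics */
}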
static rose_inline
int roseCheckShufti64x16(const struct core_info *ci, const u8 *hi_mask_1,
const u8 *hi_mask_2, const u8 *lo_mask_1,
const u8 *lo_mask_2, const u8 *bucket_select_mask_hi,
const u8 *bucket_select_mask_lo, u64a neg_mask,
s32 checkOffset, u64a end) {
const s64a base_offset = (s64a)end - ci->buf_offset;
s64a offset = base_offset + checkOffset;
DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset);
DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset);
if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) {
DEBUG_PRINTF("too early, fail\n");
return 0;
}
u64a valid_data_mask = 0;
m512 data = getData512(ci, offset, &valid_data_mask);
if (unlikely(!valid_data_mask)) {
return 1;
}
m512 hi_mask_1_m512 = loadu512(hi_mask_1);
m512 hi_mask_2_m512 = loadu512(hi_mask_2);
m512 lo_mask_1_m512 = loadu512(lo_mask_1);
m512 lo_mask_2_m512 = loadu512(lo_mask_2);
m512 bucket_select_mask_hi_m512 = loadu512(bucket_select_mask_hi);
m512 bucket_select_mask_lo_m512 = loadu512(bucket_select_mask_lo);
if (validateShuftiMask64x16(data, hi_mask_1_m512, hi_mask_2_m512,
lo_mask_1_m512, lo_mask_2_m512,
bucket_select_mask_hi_m512,
bucket_select_mask_lo_m512,
neg_mask, valid_data_mask)) {
DEBUG_PRINTF("check shufti 64x16 successfully\n");
return 1;
} else {
return 0;
}
}
#endif
static rose_inline static rose_inline
int roseCheckSingleLookaround(const struct RoseEngine *t, int roseCheckSingleLookaround(const struct RoseEngine *t,
const struct hs_scratch *scratch, const struct hs_scratch *scratch,
@ -2068,6 +2247,12 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t,
&&LABEL_ROSE_INSTR_FLUSH_COMBINATION, &&LABEL_ROSE_INSTR_FLUSH_COMBINATION,
&&LABEL_ROSE_INSTR_SET_EXHAUST, &&LABEL_ROSE_INSTR_SET_EXHAUST,
&&LABEL_ROSE_INSTR_LAST_FLUSH_COMBINATION &&LABEL_ROSE_INSTR_LAST_FLUSH_COMBINATION
#ifdef HAVE_AVX512
,
&&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti.
&&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti.
&&LABEL_ROSE_INSTR_CHECK_MASK_64 //!< 64-bytes and/cmp/neg mask check.
#endif
}; };
#endif #endif
@ -2258,6 +2443,45 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t,
} }
PROGRAM_NEXT_INSTRUCTION PROGRAM_NEXT_INSTRUCTION
#ifdef HAVE_AVX512
PROGRAM_CASE(CHECK_MASK_64) {
struct core_info *ci = &scratch->core_info;
if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask,
ri->neg_mask, ri->offset, end)) {
assert(ri->fail_jump);
pc += ri->fail_jump;
PROGRAM_NEXT_INSTRUCTION_JUMP
}
}
PROGRAM_NEXT_INSTRUCTION
PROGRAM_CASE(CHECK_SHUFTI_64x8) {
const struct core_info *ci = &scratch->core_info;
if (!roseCheckShufti64x8(ci, ri->hi_mask, ri->lo_mask,
ri->bucket_select_mask,
ri->neg_mask, ri->offset, end)) {
assert(ri->fail_jump);
pc += ri->fail_jump;
PROGRAM_NEXT_INSTRUCTION_JUMP;
}
}
PROGRAM_NEXT_INSTRUCTION
PROGRAM_CASE(CHECK_SHUFTI_64x16) {
const struct core_info *ci = &scratch->core_info;
if (!roseCheckShufti64x16(ci, ri->hi_mask_1, ri->hi_mask_2,
ri->lo_mask_1, ri->lo_mask_2,
ri->bucket_select_mask_hi,
ri->bucket_select_mask_lo,
ri->neg_mask, ri->offset, end)) {
assert(ri->fail_jump);
pc += ri->fail_jump;
PROGRAM_NEXT_INSTRUCTION_JUMP;
}
}
PROGRAM_NEXT_INSTRUCTION
#endif
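The label array extended above drives roseRunProgram's threaded dispatch (GCC's labels-as-values extension), so new opcodes must be appended in exactly the enum order, and conditionally compiled entries like the three AVX-512 cases must be fenced identically in the table and in the case bodies. A freestanding sketch of the pattern:

/* Minimal sketch of threaded dispatch via GNU C labels-as-values: each
 * opcode indexes a table of label addresses, so dispatch is one indirect
 * jump with no central switch. */
static int run_program_sketch(const unsigned char *pc) {
    static const void *jump_table[] = { &&do_nop, &&do_halt };
    goto *jump_table[*pc];
do_nop:
    ++pc;                        /* advance and dispatch the next opcode */
    goto *jump_table[*pc];
do_halt:
    return 0;
}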
PROGRAM_CASE(CHECK_INFIX) { PROGRAM_CASE(CHECK_INFIX) {
if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report,
end)) { end)) {
@ -2945,6 +3169,19 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t,
} }
L_PROGRAM_NEXT_INSTRUCTION L_PROGRAM_NEXT_INSTRUCTION
#ifdef HAVE_AVX512
L_PROGRAM_CASE(CHECK_MASK_64) {
struct core_info *ci = &scratch->core_info;
if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask,
ri->neg_mask, ri->offset, end)) {
assert(ri->fail_jump);
pc += ri->fail_jump;
L_PROGRAM_NEXT_INSTRUCTION_JUMP
}
}
L_PROGRAM_NEXT_INSTRUCTION
#endif
L_PROGRAM_CASE(CHECK_BYTE) { L_PROGRAM_CASE(CHECK_BYTE) {
const struct core_info *ci = &scratch->core_info; const struct core_info *ci = &scratch->core_info;
if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask,

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2019, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -554,7 +554,8 @@ void findFixedDepthTops(const RoseGraph &g, const set<PredTopPair> &triggers,
*/ */
static static
bytecode_ptr<NFA> pickImpl(bytecode_ptr<NFA> dfa_impl, bytecode_ptr<NFA> pickImpl(bytecode_ptr<NFA> dfa_impl,
bytecode_ptr<NFA> nfa_impl) { bytecode_ptr<NFA> nfa_impl,
bool fast_nfa) {
assert(nfa_impl); assert(nfa_impl);
assert(dfa_impl); assert(dfa_impl);
assert(isDfaType(dfa_impl->type)); assert(isDfaType(dfa_impl->type));
@ -584,7 +585,7 @@ bytecode_ptr<NFA> pickImpl(bytecode_ptr<NFA> dfa_impl,
return nfa_impl; return nfa_impl;
} }
} else { } else {
if (n_accel) { if (n_accel && fast_nfa) {
return nfa_impl; return nfa_impl;
} else { } else {
return dfa_impl; return dfa_impl;
@ -632,6 +633,15 @@ bytecode_ptr<NFA> getDfa(raw_dfa &rdfa, bool is_transient,
* bytecode and that they are usually run on small blocks */ * bytecode and that they are usually run on small blocks */
dfa = mcshengCompile(rdfa, cc, rm); dfa = mcshengCompile(rdfa, cc, rm);
} }
if (!dfa) {
dfa = sheng32Compile(rdfa, cc, rm, false);
}
if (!dfa) {
dfa = sheng64Compile(rdfa, cc, rm, false);
}
if (!dfa && !is_transient) {
dfa = mcshengCompile64(rdfa, cc, rm);
}
if (!dfa) { if (!dfa) {
// Sheng wasn't successful, so unleash McClellan! // Sheng wasn't successful, so unleash McClellan!
dfa = mcclellanCompile(rdfa, cc, rm, false); dfa = mcclellanCompile(rdfa, cc, rm, false);
@@ -678,20 +688,21 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm,
}
}
bool fast_nfa = false;
auto n = constructNFA(holder, &rm, fixed_depth_tops, triggers,
-compress_state, cc);
+compress_state, fast_nfa, cc);
assert(n);
if (oneTop && cc.grey.roseMcClellanSuffix) {
if (cc.grey.roseMcClellanSuffix == 2 || n->nPositions > 128 ||
-!has_bounded_repeats_other_than_firsts(*n)) {
+!has_bounded_repeats_other_than_firsts(*n) || !fast_nfa) {
auto rdfa = buildMcClellan(holder, &rm, false, triggers.at(0),
cc.grey);
if (rdfa) {
auto d = getDfa(*rdfa, false, cc, rm);
assert(d);
if (cc.grey.roseMcClellanSuffix != 2) {
-n = pickImpl(move(d), move(n));
+n = pickImpl(move(d), move(n), fast_nfa);
} else {
n = move(d);
}
@@ -826,23 +837,24 @@ bytecode_ptr<NFA> makeLeftNfa(const RoseBuildImpl &tbi, left_id &left,
n = constructLBR(*left.graph(), triggers.begin()->second, cc, rm);
}
bool fast_nfa = false;
if (!n && left.graph()) {
map<u32, vector<vector<CharReach>>> triggers;
if (left.graph()->kind == NFA_INFIX) {
findTriggerSequences(tbi, infixTriggers.at(left), &triggers);
}
n = constructNFA(*left.graph(), nullptr, fixed_depth_tops, triggers,
-compress_state, cc);
+compress_state, fast_nfa, cc);
}
if (cc.grey.roseMcClellanPrefix == 1 && is_prefix && !left.dfa()
&& left.graph()
-&& (!n || !has_bounded_repeats_other_than_firsts(*n) || !is_fast(*n))) {
+&& (!n || !has_bounded_repeats_other_than_firsts(*n) || !fast_nfa)) {
auto rdfa = buildMcClellan(*left.graph(), nullptr, cc.grey);
if (rdfa) {
auto d = getDfa(*rdfa, is_transient, cc, rm);
assert(d);
-n = pickImpl(move(d), move(n));
+n = pickImpl(move(d), move(n), fast_nfa);
}
}
@@ -1627,17 +1639,18 @@ public:
const map<u32, u32> fixed_depth_tops; /* no tops */
const map<u32, vector<vector<CharReach>>> triggers; /* no tops */
bool compress_state = cc.streaming;
bool fast_nfa = false;
auto n = constructNFA(h, &rm, fixed_depth_tops, triggers,
-compress_state, cc);
+compress_state, fast_nfa, cc);
// Try for a DFA upgrade.
if (n && cc.grey.roseMcClellanOutfix &&
-!has_bounded_repeats_other_than_firsts(*n)) {
+(!has_bounded_repeats_other_than_firsts(*n) || !fast_nfa)) {
auto rdfa = buildMcClellan(h, &rm, cc.grey);
if (rdfa) {
auto d = getDfa(*rdfa, false, cc, rm);
if (d) {
-n = pickImpl(move(d), move(n));
+n = pickImpl(move(d), move(n), fast_nfa);
}
}
}
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2015-2019, Intel Corporation
+* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -757,13 +757,12 @@ CharReach shufti2cr(const u8 *lo, const u8 *hi, u8 bucket_mask) {
static
void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi,
-const u8 *bucket_mask, u32 neg_mask, s32 offset) {
+const u8 *bucket_mask, u64a neg_mask, s32 offset) {
-assert(len == 16 || len == 32);
+assert(len == 16 || len == 32 || len == 64);
os << " contents:" << endl;
for (u32 idx = 0; idx < len; idx++) {
CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]);
-if (neg_mask & (1U << idx)) {
+if (neg_mask & (1ULL << idx)) {
cr.flip();
}
@@ -779,14 +778,13 @@ void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi,
static
void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi,
const u8 *lo_2, const u8 *hi_2, const u8 *bucket_mask,
-const u8 *bucket_mask_2, u32 neg_mask, s32 offset) {
+const u8 *bucket_mask_2, u64a neg_mask, s32 offset) {
-assert(len == 16 || len == 32);
+assert(len == 16 || len == 32 || len == 64);
os << " contents:" << endl;
for (u32 idx = 0; idx < len; idx++) {
CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]);
cr |= shufti2cr(lo_2, hi_2, bucket_mask_2[idx]);
-if (neg_mask & (1U << idx)) {
+if (neg_mask & (1ULL << idx)) {
cr.flip();
}
@@ -970,6 +968,20 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) {
}
PROGRAM_NEXT_INSTRUCTION
PROGRAM_CASE(CHECK_MASK_64) {
os << " and_mask "
<< dumpStrMask(ri->and_mask, sizeof(ri->and_mask))
<< endl;
os << " cmp_mask "
<< dumpStrMask(ri->cmp_mask, sizeof(ri->cmp_mask))
<< endl;
os << " neg_mask 0x" << std::hex << std::setw(8)
<< std::setfill('0') << ri->neg_mask << std::dec << endl;
os << " offset " << ri->offset << endl;
os << " fail_jump " << offset + ri->fail_jump << endl;
}
PROGRAM_NEXT_INSTRUCTION
PROGRAM_CASE(CHECK_BYTE) {
os << " and_mask 0x" << std::hex << std::setw(2)
<< std::setfill('0') << u32{ri->and_mask} << std::dec
@@ -1072,6 +1084,60 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) {
}
PROGRAM_NEXT_INSTRUCTION
PROGRAM_CASE(CHECK_SHUFTI_64x8) {
os << " hi_mask "
<< dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask))
<< endl;
os << " lo_mask "
<< dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask))
<< endl;
os << " bucket_select_mask "
<< dumpStrMask(ri->bucket_select_mask,
sizeof(ri->bucket_select_mask))
<< endl;
os << " neg_mask 0x" << std::hex << std::setw(8)
<< std::setfill('0') << ri->neg_mask << std::dec << endl;
os << " offset " << ri->offset << endl;
os << " fail_jump " << offset + ri->fail_jump << endl;
dumpLookaroundShufti(os, 64, ri->lo_mask, ri->hi_mask,
ri->bucket_select_mask, ri->neg_mask,
ri->offset);
}
PROGRAM_NEXT_INSTRUCTION
PROGRAM_CASE(CHECK_SHUFTI_64x16) {
os << " hi_mask_1 "
<< dumpStrMask(ri->hi_mask_1, sizeof(ri->hi_mask_1))
<< endl;
os << " hi_mask_2 "
<< dumpStrMask(ri->hi_mask_2, sizeof(ri->hi_mask_2))
<< endl;
os << " lo_mask_1 "
<< dumpStrMask(ri->lo_mask_1, sizeof(ri->lo_mask_1))
<< endl;
os << " lo_mask_2 "
<< dumpStrMask(ri->lo_mask_2, sizeof(ri->lo_mask_2))
<< endl;
os << " bucket_select_mask_hi "
<< dumpStrMask(ri->bucket_select_mask_hi,
sizeof(ri->bucket_select_mask_hi))
<< endl;
os << " bucket_select_mask_lo "
<< dumpStrMask(ri->bucket_select_mask_lo,
sizeof(ri->bucket_select_mask_lo))
<< endl;
os << " neg_mask 0x" << std::hex << std::setw(8)
<< std::setfill('0') << ri->neg_mask << std::dec << endl;
os << " offset " << ri->offset << endl;
os << " fail_jump " << offset + ri->fail_jump << endl;
dumpLookaroundShufti(os, 64, ri->lo_mask_1, ri->hi_mask_1,
ri->lo_mask_2, ri->hi_mask_2,
ri->bucket_select_mask_lo,
ri->bucket_select_mask_hi,
ri->neg_mask, ri->offset);
}
PROGRAM_NEXT_INSTRUCTION
PROGRAM_CASE(CHECK_INFIX) {
os << " queue " << ri->queue << endl;
os << " lag " << ri->lag << endl;
@@ -96,7 +96,7 @@ bool eligibleForAlwaysOnGroup(const RoseBuildImpl &build, u32 id) {
static
bool requires_group_assignment(const rose_literal_id &lit,
const rose_literal_info &info) {
-if (lit.delay) { /* we will check the shadow's master */
+if (lit.delay) { /* we will check the shadow's leader */
return false;
}
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2017-2019, Intel Corporation
+* Copyright (c) 2017-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -162,6 +162,17 @@ void RoseInstrCheckMask32::write(void *dest, RoseEngineBlob &blob,
inst->fail_jump = calc_jump(offset_map, this, target);
}
void RoseInstrCheckMask64::write(void *dest, RoseEngineBlob &blob,
const OffsetMap &offset_map) const {
RoseInstrBase::write(dest, blob, offset_map);
auto *inst = static_cast<impl_type *>(dest);
copy(begin(and_mask), end(and_mask), inst->and_mask);
copy(begin(cmp_mask), end(cmp_mask), inst->cmp_mask);
inst->neg_mask = neg_mask;
inst->offset = offset;
inst->fail_jump = calc_jump(offset_map, this, target);
}
void RoseInstrCheckByte::write(void *dest, RoseEngineBlob &blob,
const OffsetMap &offset_map) const {
RoseInstrBase::write(dest, blob, offset_map);
@@ -227,6 +238,36 @@ void RoseInstrCheckShufti32x16::write(void *dest, RoseEngineBlob &blob,
inst->fail_jump = calc_jump(offset_map, this, target);
}
void RoseInstrCheckShufti64x8::write(void *dest, RoseEngineBlob &blob,
const OffsetMap &offset_map) const {
RoseInstrBase::write(dest, blob, offset_map);
auto *inst = static_cast<impl_type *>(dest);
copy(begin(hi_mask), end(hi_mask), inst->hi_mask);
copy(begin(lo_mask), end(lo_mask), inst->lo_mask);
copy(begin(bucket_select_mask), end(bucket_select_mask),
inst->bucket_select_mask);
inst->neg_mask = neg_mask;
inst->offset = offset;
inst->fail_jump = calc_jump(offset_map, this, target);
}
void RoseInstrCheckShufti64x16::write(void *dest, RoseEngineBlob &blob,
const OffsetMap &offset_map) const {
RoseInstrBase::write(dest, blob, offset_map);
auto *inst = static_cast<impl_type *>(dest);
copy(begin(hi_mask_1), end(hi_mask_1), inst->hi_mask_1);
copy(begin(hi_mask_2), end(hi_mask_2), inst->hi_mask_2);
copy(begin(lo_mask_1), end(lo_mask_1), inst->lo_mask_1);
copy(begin(lo_mask_2), end(lo_mask_2), inst->lo_mask_2);
copy(begin(bucket_select_mask_hi), end(bucket_select_mask_hi),
inst->bucket_select_mask_hi);
copy(begin(bucket_select_mask_lo), end(bucket_select_mask_lo),
inst->bucket_select_mask_lo);
inst->neg_mask = neg_mask;
inst->offset = offset;
inst->fail_jump = calc_jump(offset_map, this, target);
}
void RoseInstrCheckInfix::write(void *dest, RoseEngineBlob &blob,
const OffsetMap &offset_map) const {
RoseInstrBase::write(dest, blob, offset_map);
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2017-2019, Intel Corporation
+* Copyright (c) 2017-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -519,6 +519,43 @@ public:
}
};
class RoseInstrCheckMask64
: public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MASK_64,
ROSE_STRUCT_CHECK_MASK_64,
RoseInstrCheckMask64> {
public:
std::array<u8, 64> and_mask;
std::array<u8, 64> cmp_mask;
u64a neg_mask;
s32 offset;
const RoseInstruction *target;
RoseInstrCheckMask64(std::array<u8, 64> and_mask_in,
std::array<u8, 64> cmp_mask_in, u64a neg_mask_in,
s32 offset_in, const RoseInstruction *target_in)
: and_mask(std::move(and_mask_in)), cmp_mask(std::move(cmp_mask_in)),
neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
bool operator==(const RoseInstrCheckMask64 &ri) const {
return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask &&
neg_mask == ri.neg_mask && offset == ri.offset &&
target == ri.target;
}
size_t hash() const override {
return hash_all(opcode, and_mask, cmp_mask, neg_mask, offset);
}
void write(void *dest, RoseEngineBlob &blob,
const OffsetMap &offset_map) const override;
bool equiv_to(const RoseInstrCheckMask64 &ri, const OffsetMap &offsets,
const OffsetMap &other_offsets) const {
return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask &&
neg_mask == ri.neg_mask && offset == ri.offset &&
offsets.at(target) == other_offsets.at(ri.target);
}
};
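(The hash()/equiv_to() pair on these instruction classes lets structurally identical role programs be deduplicated and shared. The container and method names in this sketch are illustrative, not Hyperscan's actual dedupe code.)

#include <cstddef>
#include <unordered_map>
#include <vector>

// Illustrative only: bucket programs by hash, then confirm with an
// offset-aware equality check before reusing an existing copy.
template <class Program>
size_t dedupeProgram(std::unordered_multimap<size_t, size_t> &cache,
                     std::vector<Program> &stored, const Program &prog) {
    size_t h = prog.hash();
    auto range = cache.equal_range(h);
    for (auto it = range.first; it != range.second; ++it) {
        if (stored[it->second].equiv(prog)) { // like equiv_to(), modulo offsets
            return it->second;                // reuse the existing program
        }
    }
    stored.push_back(prog);
    cache.emplace(h, stored.size() - 1);
    return stored.size() - 1;
}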
class RoseInstrCheckByte
: public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_BYTE,
ROSE_STRUCT_CHECK_BYTE,
@@ -738,6 +775,109 @@ public:
}
};
class RoseInstrCheckShufti64x8
: public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_SHUFTI_64x8,
ROSE_STRUCT_CHECK_SHUFTI_64x8,
RoseInstrCheckShufti64x8> {
public:
std::array<u8, 64> hi_mask;
std::array<u8, 64> lo_mask;
std::array<u8, 64> bucket_select_mask;
u64a neg_mask;
s32 offset;
const RoseInstruction *target;
RoseInstrCheckShufti64x8(std::array<u8, 64> hi_mask_in,
std::array<u8, 64> lo_mask_in,
std::array<u8, 64> bucket_select_mask_in,
u64a neg_mask_in, s32 offset_in,
const RoseInstruction *target_in)
: hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
bucket_select_mask(std::move(bucket_select_mask_in)),
neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
bool operator==(const RoseInstrCheckShufti64x8 &ri) const {
return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
bucket_select_mask == ri.bucket_select_mask &&
neg_mask == ri.neg_mask && offset == ri.offset &&
target == ri.target;
}
size_t hash() const override {
return hash_all(opcode, hi_mask, lo_mask, bucket_select_mask, neg_mask,
offset);
}
void write(void *dest, RoseEngineBlob &blob,
const OffsetMap &offset_map) const override;
bool equiv_to(const RoseInstrCheckShufti64x8 &ri, const OffsetMap &offsets,
const OffsetMap &other_offsets) const {
return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
bucket_select_mask == ri.bucket_select_mask &&
neg_mask == ri.neg_mask && offset == ri.offset &&
offsets.at(target) == other_offsets.at(ri.target);
}
};
class RoseInstrCheckShufti64x16
: public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_SHUFTI_64x16,
ROSE_STRUCT_CHECK_SHUFTI_64x16,
RoseInstrCheckShufti64x16> {
public:
std::array<u8, 64> hi_mask_1;
std::array<u8, 64> hi_mask_2;
std::array<u8, 64> lo_mask_1;
std::array<u8, 64> lo_mask_2;
std::array<u8, 64> bucket_select_mask_hi;
std::array<u8, 64> bucket_select_mask_lo;
u64a neg_mask;
s32 offset;
const RoseInstruction *target;
RoseInstrCheckShufti64x16(std::array<u8, 64> hi_mask_1_in,
std::array<u8, 64> hi_mask_2_in,
std::array<u8, 64> lo_mask_1_in,
std::array<u8, 64> lo_mask_2_in,
std::array<u8, 64> bucket_select_mask_hi_in,
std::array<u8, 64> bucket_select_mask_lo_in,
u64a neg_mask_in, s32 offset_in,
const RoseInstruction *target_in)
: hi_mask_1(std::move(hi_mask_1_in)), hi_mask_2(std::move(hi_mask_2_in)),
lo_mask_1(std::move(lo_mask_1_in)), lo_mask_2(std::move(lo_mask_2_in)),
bucket_select_mask_hi(std::move(bucket_select_mask_hi_in)),
bucket_select_mask_lo(std::move(bucket_select_mask_lo_in)),
neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
bool operator==(const RoseInstrCheckShufti64x16 &ri) const {
return hi_mask_1 == ri.hi_mask_1 && hi_mask_2 == ri.hi_mask_2 &&
lo_mask_1 == ri.lo_mask_1 && lo_mask_2 == ri.lo_mask_2 &&
bucket_select_mask_hi == ri.bucket_select_mask_hi &&
bucket_select_mask_lo == ri.bucket_select_mask_lo &&
neg_mask == ri.neg_mask && offset == ri.offset &&
target == ri.target;
}
size_t hash() const override {
return hash_all(opcode, hi_mask_1, hi_mask_2, lo_mask_1, lo_mask_2,
bucket_select_mask_hi, bucket_select_mask_lo, neg_mask,
offset);
}
void write(void *dest, RoseEngineBlob &blob,
const OffsetMap &offset_map) const override;
bool equiv_to(const RoseInstrCheckShufti64x16 &ri, const OffsetMap &offsets,
const OffsetMap &other_offsets) const {
return hi_mask_1 == ri.hi_mask_1 && hi_mask_2 == ri.hi_mask_2 &&
lo_mask_1 == ri.lo_mask_1 && lo_mask_2 == ri.lo_mask_2 &&
bucket_select_mask_hi == ri.bucket_select_mask_hi &&
bucket_select_mask_lo == ri.bucket_select_mask_lo &&
neg_mask == ri.neg_mask && offset == ri.offset &&
offsets.at(target) == other_offsets.at(ri.target);
}
};
class RoseInstrCheckInfix
: public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_INFIX,
ROSE_STRUCT_CHECK_INFIX,
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2015-2017, Intel Corporation
+* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -58,7 +58,7 @@ static const u32 MAX_FWD_LEN = 64;
static const u32 MAX_BACK_LEN = 64;
/** \brief Max lookaround entries for a role. */
-static const u32 MAX_LOOKAROUND_ENTRIES = 16;
+static const u32 MAX_LOOKAROUND_ENTRIES = 32;
/** \brief We would rather have lookarounds with smaller reach than this. */
static const u32 LOOKAROUND_WIDE_REACH = 200;
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2016-2019, Intel Corporation
+* Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -1061,6 +1061,49 @@ bool makeRoleMask32(const vector<LookEntry> &look,
return true;
}
static
bool makeRoleMask64(const vector<LookEntry> &look,
RoseProgram &program, const target_t &target) {
if (!target.has_avx512()) {
return false;
}
if (look.back().offset >= look.front().offset + 64) {
return false;
}
s32 base_offset = verify_s32(look.front().offset);
array<u8, 64> and_mask, cmp_mask;
and_mask.fill(0);
cmp_mask.fill(0);
u64a neg_mask = 0;
for (const auto &entry : look) {
u8 andmask_u8, cmpmask_u8, flip;
if (!checkReachWithFlip(entry.reach, andmask_u8, cmpmask_u8, flip)) {
return false;
}
u32 shift = entry.offset - base_offset;
assert(shift < 64);
and_mask[shift] = andmask_u8;
cmp_mask[shift] = cmpmask_u8;
if (flip) {
neg_mask |= 1ULL << shift;
}
}
DEBUG_PRINTF("and_mask %s\n",
convertMaskstoString(and_mask.data(), 64).c_str());
DEBUG_PRINTF("cmp_mask %s\n",
convertMaskstoString(cmp_mask.data(), 64).c_str());
DEBUG_PRINTF("neg_mask %llx\n", neg_mask);
DEBUG_PRINTF("base_offset %d\n", base_offset);
const auto *end_inst = program.end_instruction();
auto ri = make_unique<RoseInstrCheckMask64>(and_mask, cmp_mask, neg_mask,
base_offset, end_inst);
program.add_before_end(move(ri));
return true;
}
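(A hypothetical result of the loop above for a two-entry lookaround, with invented values, showing how and_mask/cmp_mask/neg_mask fit together.)

#include <cstdint>

// Hypothetical output for two look entries: shift 3 requires
// (c & 0xf0) == 0x30; shift 7 is negated (c must NOT equal 0x0a).
// Untouched offsets keep and_mask 0, so they always pass.
static void exampleMasks(uint8_t and_mask[64], uint8_t cmp_mask[64],
                         uint64_t &neg_mask) {
    for (int i = 0; i < 64; i++) { and_mask[i] = cmp_mask[i] = 0; }
    neg_mask = 0;
    and_mask[3] = 0xf0; cmp_mask[3] = 0x30; // entry at base_offset + 3
    and_mask[7] = 0xff; cmp_mask[7] = 0x0a; // entry at base_offset + 7 ...
    neg_mask |= 1ULL << 7;                  // ... flipped via neg_mask
}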
// Sorting by the size of every bucket.
// Used in map<u32, vector<s8>, cmpNibble>.
struct cmpNibble {
@@ -1084,6 +1127,7 @@ void getAllBuckets(const vector<LookEntry> &look,
} else {
neg_mask ^= 1ULL << (entry.offset - base_offset);
}
map <u16, u16> lo2hi;
// We treat Ascii Table as a 16x16 grid.
// Push every row in cr into lo2hi and mark the row number.
@@ -1237,6 +1281,7 @@ makeCheckShufti16x16(u32 offset_range, u8 bucket_idx,
(hi_mask, lo_mask, bucket_select_mask_32,
neg_mask & 0xffff, base_offset, end_inst);
}
static
unique_ptr<RoseInstruction>
makeCheckShufti32x16(u32 offset_range, u8 bucket_idx,
@@ -1255,10 +1300,83 @@ makeCheckShufti32x16(u32 offset_range, u8 bucket_idx,
}
static
-bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program) {
+unique_ptr<RoseInstruction>
makeCheckShufti64x8(u32 offset_range, u8 bucket_idx,
const array<u8, 32> &hi_mask, const array<u8, 32> &lo_mask,
const array<u8, 64> &bucket_select_mask,
u64a neg_mask, s32 base_offset,
const RoseInstruction *end_inst) {
if (offset_range > 64 || bucket_idx > 8) {
return nullptr;
}
array<u8, 64> hi_mask_64;
array<u8, 64> lo_mask_64;
copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin());
copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 16);
copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 32);
copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 48);
copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin());
copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 16);
copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 32);
copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 48);
return make_unique<RoseInstrCheckShufti64x8>
(hi_mask_64, lo_mask_64, bucket_select_mask,
neg_mask, base_offset, end_inst);
}
static
unique_ptr<RoseInstruction>
makeCheckShufti64x16(u32 offset_range, u8 bucket_idx,
const array<u8, 32> &hi_mask, const array<u8, 32> &lo_mask,
const array<u8, 64> &bucket_select_mask_lo,
const array<u8, 64> &bucket_select_mask_hi,
u64a neg_mask, s32 base_offset,
const RoseInstruction *end_inst) {
if (offset_range > 64 || bucket_idx > 16) {
return nullptr;
}
array<u8, 64> hi_mask_1;
array<u8, 64> hi_mask_2;
array<u8, 64> lo_mask_1;
array<u8, 64> lo_mask_2;
copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin());
copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 16);
copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 32);
copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 48);
copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin());
copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 16);
copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 32);
copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 48);
copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin());
copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 16);
copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 32);
copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 48);
copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin());
copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 16);
copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 32);
copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 48);
return make_unique<RoseInstrCheckShufti64x16>
(hi_mask_1, hi_mask_2, lo_mask_1, lo_mask_2, bucket_select_mask_hi,
bucket_select_mask_lo, neg_mask, base_offset, end_inst);
}
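(The hi/lo nibble tables built above implement a byte classifier. A scalar model of the per-byte shufti test, illustrative and equivalent in spirit to the SIMD code in validate_shufti.h:)

#include <cstdint>

// Scalar model of an 8-bucket shufti test: each nibble table maps a
// nibble to an 8-bit bucket bitmap; a byte belongs to a bucket when that
// bucket's bit survives the AND of both lookups. bucket_select picks the
// buckets a given position cares about.
static bool shuftiByteMatches(uint8_t c, const uint8_t lo_tbl[16],
                              const uint8_t hi_tbl[16],
                              uint8_t bucket_select) {
    uint8_t buckets = lo_tbl[c & 0xf] & hi_tbl[c >> 4];
    return (buckets & bucket_select) != 0;
}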
static
bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program,
const target_t &target) {
s32 offset_limit;
if (target.has_avx512()) {
offset_limit = 64;
} else {
offset_limit = 32;
}
s32 base_offset = verify_s32(look.front().offset);
-if (look.back().offset >= base_offset + 32) {
+if (look.back().offset >= base_offset + offset_limit) {
return false;
}
@@ -1266,17 +1384,40 @@ bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program) {
u64a neg_mask_64;
array<u8, 32> hi_mask;
array<u8, 32> lo_mask;
array<u8, 64> bucket_select_hi_64; // for AVX512
array<u8, 64> bucket_select_lo_64; // for AVX512
array<u8, 32> bucket_select_hi;
array<u8, 32> bucket_select_lo;
hi_mask.fill(0);
lo_mask.fill(0);
bucket_select_hi_64.fill(0);
bucket_select_lo_64.fill(0);
bucket_select_hi.fill(0); // will not be used in 16x8 and 32x8.
bucket_select_lo.fill(0);
-if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi.data(),
-bucket_select_lo.data(), neg_mask_64, bucket_idx, 32)) {
+if (target.has_avx512()) {
+if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi_64.data(),
+bucket_select_lo_64.data(), neg_mask_64, bucket_idx,
+32)) {
return false;
}
copy(bucket_select_hi_64.begin(), bucket_select_hi_64.begin() + 32,
bucket_select_hi.begin());
copy(bucket_select_lo_64.begin(), bucket_select_lo_64.begin() + 32,
bucket_select_lo.begin());
DEBUG_PRINTF("bucket_select_hi_64 %s\n",
convertMaskstoString(bucket_select_hi_64.data(), 64).c_str());
DEBUG_PRINTF("bucket_select_lo_64 %s\n",
convertMaskstoString(bucket_select_lo_64.data(), 64).c_str());
} else {
if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi.data(),
bucket_select_lo.data(), neg_mask_64, bucket_idx,
32)) {
return false;
}
}
u32 neg_mask = (u32)neg_mask_64;
DEBUG_PRINTF("hi_mask %s\n",
@@ -1299,6 +1440,13 @@ bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program) {
bucket_select_lo, neg_mask, base_offset,
end_inst);
}
if (target.has_avx512()) {
if (!ri) {
ri = makeCheckShufti64x8(offset_range, bucket_idx, hi_mask, lo_mask,
bucket_select_lo_64, neg_mask_64,
base_offset, end_inst);
}
}
if (!ri) {
ri = makeCheckShufti16x16(offset_range, bucket_idx, hi_mask, lo_mask,
bucket_select_lo, bucket_select_hi,
@@ -1309,6 +1457,13 @@ bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program) {
bucket_select_lo, bucket_select_hi,
neg_mask, base_offset, end_inst);
}
if (target.has_avx512()) {
if (!ri) {
ri = makeCheckShufti64x16(offset_range, bucket_idx, hi_mask, lo_mask,
bucket_select_lo_64, bucket_select_hi_64,
neg_mask_64, base_offset, end_inst);
}
}
assert(ri);
program.add_before_end(move(ri));
@@ -1321,7 +1476,7 @@ bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program) {
*/
static
void makeLookaroundInstruction(const vector<LookEntry> &look,
-RoseProgram &program) {
+RoseProgram &program, const target_t &target) {
assert(!look.empty());
if (makeRoleByte(look, program)) {
@@ -1345,7 +1500,11 @@ void makeLookaroundInstruction(const vector<LookEntry> &look,
return;
}
-if (makeRoleShufti(look, program)) {
+if (makeRoleMask64(look, program, target)) {
+return;
+}
+if (makeRoleShufti(look, program, target)) {
return;
}
@@ -1386,7 +1545,7 @@ void makeCheckLitMaskInstruction(const RoseBuildImpl &build, u32 lit_id,
return; // all caseful chars handled by HWLM mask.
}
-makeLookaroundInstruction(look, program);
+makeLookaroundInstruction(look, program, build.cc.target_info);
}
static
@@ -1730,7 +1889,7 @@ void makeRoleLookaround(const RoseBuildImpl &build,
findLookaroundMasks(build, v, look_more);
mergeLookaround(look, look_more);
if (!look.empty()) {
-makeLookaroundInstruction(look, program);
+makeLookaroundInstruction(look, program, build.cc.target_info);
}
return;
}
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2015-2019, Intel Corporation
+* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -208,7 +208,11 @@ enum RoseInstructionCode {
*/
ROSE_INSTR_LAST_FLUSH_COMBINATION,
-LAST_ROSE_INSTRUCTION = ROSE_INSTR_LAST_FLUSH_COMBINATION //!< Sentinel.
+ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti.
+ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti.
+ROSE_INSTR_CHECK_MASK_64, //!< 64-bytes and/cmp/neg mask check.
+LAST_ROSE_INSTRUCTION = ROSE_INSTR_CHECK_MASK_64 //!< Sentinel.
};
struct ROSE_STRUCT_END {
@@ -285,6 +289,15 @@ struct ROSE_STRUCT_CHECK_MASK_32 {
u32 fail_jump; //!< Jump forward this many bytes on failure.
};
struct ROSE_STRUCT_CHECK_MASK_64 {
u8 code; //!< From enum RoseInstructionCode.
u8 and_mask[64]; //!< 64-byte and mask.
u8 cmp_mask[64]; //!< 64-byte cmp mask.
u64a neg_mask; //!< negation mask with 64 bits.
s32 offset; //!< Relative offset of the first byte.
u32 fail_jump; //!< Jump forward this many bytes on failure.
};
struct ROSE_STRUCT_CHECK_BYTE {
u8 code; //!< From enum RoseInstructionCode.
u8 and_mask; //!< 8-bits and mask.
@@ -336,6 +349,29 @@ struct ROSE_STRUCT_CHECK_SHUFTI_32x16 {
u32 fail_jump; //!< Jump forward this many bytes on failure.
};
struct ROSE_STRUCT_CHECK_SHUFTI_64x8 {
u8 code; //!< From enum RoseInstructionCode.
u8 hi_mask[64]; //!< High nibble mask in shufti.
u8 lo_mask[64]; //!< Low nibble mask in shufti.
u8 bucket_select_mask[64]; //!< Mask for bucket assigning.
u64a neg_mask; //!< 64 bits negation mask.
s32 offset; //!< Relative offset of the first byte.
u32 fail_jump; //!< Jump forward this many bytes on failure.
};
struct ROSE_STRUCT_CHECK_SHUFTI_64x16 {
u8 code; //!< From enum RoseInstructionCode.
u8 hi_mask_1[64]; //!< 4 copies of 0-15 High nibble mask.
u8 hi_mask_2[64]; //!< 4 copies of 16-31 High nibble mask.
u8 lo_mask_1[64]; //!< 4 copies of 0-15 Low nibble mask.
u8 lo_mask_2[64]; //!< 4 copies of 16-31 Low nibble mask.
u8 bucket_select_mask_hi[64]; //!< Bucket mask for high 8 buckets.
u8 bucket_select_mask_lo[64]; //!< Bucket mask for low 8 buckets.
u64a neg_mask; //!< 64 bits negation mask.
s32 offset; //!< Relative offset of the first byte.
u32 fail_jump; //!< Jump forward this many bytes on failure.
};
struct ROSE_STRUCT_CHECK_INFIX {
u8 code; //!< From enum RoseInstructionCode.
u32 queue; //!< Queue of leftfix to check.
@@ -201,12 +201,12 @@ const u8 *prepScanBuffer(const struct core_info *ci,
} else {
// Copy: first chunk from history buffer.
assert(overhang <= ci->hlen);
-copy_upto_32_bytes(tempbuf, ci->hbuf + ci->hlen - overhang,
+copy_upto_64_bytes(tempbuf, ci->hbuf + ci->hlen - overhang,
overhang);
// Copy: second chunk from current buffer.
size_t copy_buf_len = LONG_LIT_HASH_LEN - overhang;
assert(copy_buf_len <= ci->len);
-copy_upto_32_bytes(tempbuf + overhang, ci->buf, copy_buf_len);
+copy_upto_64_bytes(tempbuf + overhang, ci->buf, copy_buf_len);
// Read from our temporary buffer for the hash.
base = tempbuf;
}
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2016, Intel Corporation
+* Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -41,6 +41,17 @@ void validateMask32Print(const u8 *mask) {
}
printf("\n");
}
#ifdef HAVE_AVX512
static
void validateMask64Print(const u8 *mask) {
int i;
for (i = 0; i < 64; i++) {
printf("%02x ", mask[i]);
}
printf("\n");
}
#endif
#endif
// check positive bytes in cmp_result.
@@ -115,4 +126,29 @@ int validateMask32(const m256 data, const u32 valid_data_mask,
}
}
#ifdef HAVE_AVX512
static really_inline
int validateMask64(const m512 data, const u64a valid_data_mask,
const m512 and_mask, const m512 cmp_mask,
const u64a neg_mask) {
u64a cmp_result = ~eq512mask(and512(data, and_mask), cmp_mask);
#ifdef DEBUG
DEBUG_PRINTF("data\n");
validateMask64Print((const u8 *)&data);
DEBUG_PRINTF("cmp_result\n");
validateMask64Print((const u8 *)&cmp_result);
#endif
DEBUG_PRINTF("cmp_result %016llx neg_mask %016llx\n", cmp_result, neg_mask);
DEBUG_PRINTF("valid_data_mask %016llx\n", valid_data_mask);
if ((cmp_result & valid_data_mask) == (neg_mask & valid_data_mask)) {
DEBUG_PRINTF("checkCompareResult64 passed\n");
return 1;
} else {
DEBUG_PRINTF("checkCompareResult64 failed\n");
return 0;
}
}
#endif
#endif
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2016-2017, Intel Corporation
+* Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -175,6 +175,84 @@ int validateShuftiMask32x16(const m256 data,
return !cmp_result;
}
#ifdef HAVE_AVX512
static really_inline
int validateShuftiMask64x8(const m512 data, const m512 hi_mask,
const m512 lo_mask, const m512 and_mask,
const u64a neg_mask, const u64a valid_data_mask) {
m512 low4bits = set64x8(0xf);
m512 c_lo = pshufb_m512(lo_mask, and512(data, low4bits));
m512 c_hi = pshufb_m512(hi_mask,
rshift64_m512(andnot512(low4bits, data), 4));
m512 t = and512(c_lo, c_hi);
u64a nresult = eq512mask(and512(t, and_mask), zeroes512());
#ifdef DEBUG
DEBUG_PRINTF("data\n");
dumpMask(&data, 64);
DEBUG_PRINTF("hi_mask\n");
dumpMask(&hi_mask, 64);
DEBUG_PRINTF("lo_mask\n");
dumpMask(&lo_mask, 64);
DEBUG_PRINTF("c_lo\n");
dumpMask(&c_lo, 64);
DEBUG_PRINTF("c_hi\n");
dumpMask(&c_hi, 64);
DEBUG_PRINTF("nresult %llx\n", nresult);
DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask);
#endif
u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask;
return !cmp_result;
}
static really_inline
int validateShuftiMask64x16(const m512 data,
const m512 hi_mask_1, const m512 hi_mask_2,
const m512 lo_mask_1, const m512 lo_mask_2,
const m512 and_mask_hi, const m512 and_mask_lo,
const u64a neg_mask, const u64a valid_data_mask) {
m512 low4bits = set64x8(0xf);
m512 data_lo = and512(data, low4bits);
m512 data_hi = and512(rshift64_m512(data, 4), low4bits);
m512 c_lo_1 = pshufb_m512(lo_mask_1, data_lo);
m512 c_lo_2 = pshufb_m512(lo_mask_2, data_lo);
m512 c_hi_1 = pshufb_m512(hi_mask_1, data_hi);
m512 c_hi_2 = pshufb_m512(hi_mask_2, data_hi);
m512 t1 = and512(c_lo_1, c_hi_1);
m512 t2 = and512(c_lo_2, c_hi_2);
m512 result = or512(and512(t1, and_mask_lo), and512(t2, and_mask_hi));
u64a nresult = eq512mask(result, zeroes512());
#ifdef DEBUG
DEBUG_PRINTF("data\n");
dumpMask(&data, 64);
DEBUG_PRINTF("data_lo\n");
dumpMask(&data_lo, 64);
DEBUG_PRINTF("data_hi\n");
dumpMask(&data_hi, 64);
DEBUG_PRINTF("hi_mask_1\n");
dumpMask(&hi_mask_1, 64);
DEBUG_PRINTF("hi_mask_2\n");
dumpMask(&hi_mask_2, 64);
DEBUG_PRINTF("lo_mask_1\n");
dumpMask(&lo_mask_1, 64);
DEBUG_PRINTF("lo_mask_2\n");
dumpMask(&lo_mask_2, 64);
DEBUG_PRINTF("c_lo_1\n");
dumpMask(&c_lo_1, 64);
DEBUG_PRINTF("c_lo_2\n");
dumpMask(&c_lo_2, 64);
DEBUG_PRINTF("c_hi_1\n");
dumpMask(&c_hi_1, 64);
DEBUG_PRINTF("c_hi_2\n");
dumpMask(&c_hi_2, 64);
DEBUG_PRINTF("result\n");
dumpMask(&result, 64);
DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask);
#endif
u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask;
return !cmp_result;
}
#endif
static really_inline
int checkMultipath32(u32 data, u32 hi_bits, u32 lo_bits) {
u32 t = ~(data | hi_bits);
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2015-2017, Intel Corporation
+* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -793,6 +793,12 @@ bytecode_ptr<NFA> getDfa(raw_dfa &rdfa, const CompileContext &cc,
bytecode_ptr<NFA> dfa = nullptr;
if (cc.grey.allowSmallWriteSheng) {
dfa = shengCompile(rdfa, cc, rm, only_accel_init, &accel_states);
if (!dfa) {
dfa = sheng32Compile(rdfa, cc, rm, only_accel_init, &accel_states);
}
if (!dfa) {
dfa = sheng64Compile(rdfa, cc, rm, only_accel_init, &accel_states);
}
}
if (!dfa) {
dfa = mcclellanCompile(rdfa, cc, rm, only_accel_init,
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2016, Intel Corporation
+* Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -33,7 +33,7 @@
#include "simd_utils.h"
static really_inline
-void copy_upto_32_bytes(u8 *dst, const u8 *src, unsigned int len) {
+void copy_upto_64_bytes(u8 *dst, const u8 *src, unsigned int len) {
switch (len) {
case 0:
break;
@@ -72,14 +72,41 @@ void copy_upto_32_bytes(u8 *dst, const u8 *src, unsigned int len) {
case 16:
storeu128(dst, loadu128(src));
break;
-case 32:
-storeu256(dst, loadu256(src));
-break;
-default:
-assert(len < 32);
+case 17:
+case 18:
+case 19:
+case 20:
+case 21:
+case 22:
+case 23:
+case 24:
+case 25:
+case 26:
+case 27:
+case 28:
+case 29:
+case 30:
+case 31:
storeu128(dst + len - 16, loadu128(src + len - 16));
storeu128(dst, loadu128(src));
break;
case 32:
storeu256(dst, loadu256(src));
break;
#ifdef HAVE_AVX512
case 64:
storebytes512(dst, loadu512(src), 64);
break;
default:
assert(len < 64);
u64a k = (1ULL << len) - 1;
storeu_mask_m512(dst, k, loadu_maskz_m512(k, src));
break;
#else
default:
assert(0);
break;
#endif
}
}
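(The default branch above leans on AVX512BW byte-masked moves. A standalone illustration of the same trick, using the raw intrinsics rather than Hyperscan's wrappers:)

#ifdef HAVE_AVX512
#include <immintrin.h>

// Copy len (< 64) bytes in one masked load/store pair: bit i of k selects
// byte i, so (1ULL << len) - 1 covers exactly the first len bytes and
// never touches memory past src[len-1] or dst[len-1].
static inline void masked_copy(void *dst, const void *src, unsigned len) {
    __mmask64 k = (1ULL << len) - 1;
    _mm512_mask_storeu_epi8(dst, k, _mm512_maskz_loadu_epi8(k, src));
}
#endif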
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2015-2017, Intel Corporation
+* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -50,6 +50,11 @@ u64a cpuid_flags(void) {
cap |= HS_CPU_FEATURES_AVX512;
}
if (check_avx512vbmi()) {
DEBUG_PRINTF("AVX512VBMI enabled\n");
cap |= HS_CPU_FEATURES_AVX512VBMI;
}
#if !defined(FAT_RUNTIME) && !defined(HAVE_AVX2)
cap &= ~HS_CPU_FEATURES_AVX2;
#endif
@@ -59,6 +64,11 @@ u64a cpuid_flags(void) {
cap &= ~HS_CPU_FEATURES_AVX512;
#endif
#if (!defined(FAT_RUNTIME) && !defined(HAVE_AVX512VBMI)) || \
(defined(FAT_RUNTIME) && !defined(BUILD_AVX512VBMI))
cap &= ~HS_CPU_FEATURES_AVX512VBMI;
#endif
return cap;
}
@@ -105,6 +115,11 @@ static const struct family_id known_microarch[] = {
{ 0x6, 0x8E, HS_TUNE_FAMILY_SKL }, /* Kabylake Mobile */
{ 0x6, 0x9E, HS_TUNE_FAMILY_SKL }, /* Kabylake desktop */
{ 0x6, 0x7D, HS_TUNE_FAMILY_ICL }, /* Icelake */
{ 0x6, 0x7E, HS_TUNE_FAMILY_ICL }, /* Icelake */
{ 0x6, 0x6A, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon-D */
{ 0x6, 0x6C, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon */
};
#ifdef DUMP_SUPPORT
@@ -120,6 +135,8 @@ const char *dumpTune(u32 tune) {
T_CASE(HS_TUNE_FAMILY_BDW);
T_CASE(HS_TUNE_FAMILY_SKL);
T_CASE(HS_TUNE_FAMILY_SKX);
T_CASE(HS_TUNE_FAMILY_ICL);
T_CASE(HS_TUNE_FAMILY_ICX);
}
#undef T_CASE
return "unknown";
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2017, Intel Corporation
+* Copyright (c) 2017-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -74,11 +74,12 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax,
#define CPUID_HTT (1 << 28)
// Structured Extended Feature Flags Enumeration Leaf ECX values
#define CPUID_AVX512VBMI (1 << 1)
// Structured Extended Feature Flags Enumeration Leaf EBX values
#define CPUID_BMI (1 << 3)
#define CPUID_AVX2 (1 << 5)
#define CPUID_BMI2 (1 << 8)
-// Structured Extended Feature Flags Enumeration Leaf EBX values
#define CPUID_AVX512F (1 << 16)
#define CPUID_AVX512BW (1 << 30)
@@ -186,6 +187,51 @@ int check_avx512(void) {
#endif
}
static inline
int check_avx512vbmi(void) {
#if defined(__INTEL_COMPILER)
return _may_i_use_cpu_feature(_FEATURE_AVX512VBMI);
#else
unsigned int eax, ebx, ecx, edx;
cpuid(1, 0, &eax, &ebx, &ecx, &edx);
/* check XSAVE is enabled by OS */
if (!(ecx & CPUID_XSAVE)) {
DEBUG_PRINTF("AVX and XSAVE not supported\n");
return 0;
}
/* check that AVX 512 registers are enabled by OS */
u64a xcr0 = xgetbv(0);
if ((xcr0 & CPUID_XCR0_AVX512) != CPUID_XCR0_AVX512) {
DEBUG_PRINTF("AVX512 registers not enabled\n");
return 0;
}
/* ECX and EDX contain capability flags */
ecx = 0;
cpuid(7, 0, &eax, &ebx, &ecx, &edx);
if (!(ebx & CPUID_AVX512F)) {
DEBUG_PRINTF("AVX512F (AVX512 Foundation) instructions not enabled\n");
return 0;
}
if (!(ebx & CPUID_AVX512BW)) {
DEBUG_PRINTF("AVX512BW instructions not enabled\n");
return 0;
}
if (ecx & CPUID_AVX512VBMI) {
DEBUG_PRINTF("AVX512VBMI instructions enabled\n");
return 1;
}
return 0;
#endif
}
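(An illustrative sketch of how a caller might combine these probes; the real implementation selection lives in the fat-runtime dispatcher, not in this header.)

// Hypothetical feature-based dispatch: prefer the widest ISA the host
// CPU (and the OS, via the XCR0 checks above) actually enables.
static const char *pick_impl(void) {
    if (check_avx512vbmi()) {
        return "avx512vbmi";
    }
    if (check_avx512()) {
        return "avx512";
    }
    return "core2"; // baseline
}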
static inline
int check_ssse3(void) {
unsigned int eax, ebx, ecx, edx;
@@ -170,6 +170,7 @@ find_vertices_in_cycles(const Graph &g) {
assert(!comp.empty());
if (comp.size() > 1) {
insert(&rv, comp);
continue;
}
vertex_descriptor v = *comp.begin();
if (hasSelfLoop(v, g)) {
@@ -138,6 +138,12 @@ m128 lshift64_m128(m128 a, unsigned b) {
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
#if defined(HAVE_AVX512)
static really_inline m128 cast512to128(const m512 in) {
return _mm512_castsi512_si128(in);
}
#endif
static really_inline m128 set16x8(u8 c) {
return _mm_set1_epi8(c);
}
@@ -156,6 +162,12 @@ static really_inline u32 movd512(const m512 in) {
// so we use a 2-step conversion to work around it.
return _mm_cvtsi128_si32(_mm512_castsi512_si128(in));
}
static really_inline u64a movq512(const m512 in) {
// NOTE: it seems AVX512 doesn't support _mm512_cvtsi512_si64(in),
// so we use a 2-step conversion to work around it.
return _mm_cvtsi128_si64(_mm512_castsi512_si128(in));
}
#endif
static really_inline u64a movq(const m128 in) {
@@ -211,6 +223,24 @@ static really_inline m128 or128(m128 a, m128 b) {
return _mm_or_si128(a,b);
}
#if defined(HAVE_AVX512VBMI)
static really_inline m512 expand128(m128 a) {
return _mm512_broadcast_i32x4(a);
}
static really_inline m512 expand256(m256 a) {
return _mm512_broadcast_i64x4(a);
}
static really_inline m512 expand384(m384 a) {
u64a *lo = (u64a*)&a.lo;
u64a *mid = (u64a*)&a.mid;
u64a *hi = (u64a*)&a.hi;
return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0],
lo[1], lo[0]);
}
#endif
static really_inline m128 andnot128(m128 a, m128 b) {
return _mm_andnot_si128(a, b);
}
@@ -1000,6 +1030,11 @@ m512 set8x64(u64a a) {
return _mm512_set1_epi64(a);
}
static really_inline
m512 set16x32(u32 a) {
return _mm512_set1_epi32(a);
}
static really_inline
m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
@@ -1017,6 +1052,26 @@ static really_inline
m512 set4x128(m128 a) {
return _mm512_broadcast_i32x4(a);
}
static really_inline
m512 sadd_u8_m512(m512 a, m512 b) {
return _mm512_adds_epu8(a, b);
}
static really_inline
m512 max_u8_m512(m512 a, m512 b) {
return _mm512_max_epu8(a, b);
}
static really_inline
m512 min_u8_m512(m512 a, m512 b) {
return _mm512_min_epu8(a, b);
}
static really_inline
m512 sub_u8_m512(m512 a, m512 b) {
return _mm512_sub_epi8(a, b);
}
#endif
static really_inline
@@ -1204,6 +1259,22 @@ m512 loadu512(const void *ptr) {
#endif
}
// unaligned store
static really_inline
void storeu512(void *ptr, m512 a) {
#if defined(HAVE_AVX512)
_mm512_storeu_si512((m512 *)ptr, a);
#elif defined(HAVE_AVX2)
storeu256(ptr, a.lo);
storeu256((char *)ptr + 32, a.hi);
#else
storeu128(ptr, a.lo.lo);
storeu128((char *)ptr + 16, a.lo.hi);
storeu128((char *)ptr + 32, a.hi.lo);
storeu128((char *)ptr + 48, a.hi.hi);
#endif
}
#if defined(HAVE_AVX512)
static really_inline
m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
@@ -1215,10 +1286,20 @@ m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
return _mm512_mask_loadu_epi8(src, k, ptr);
}
static really_inline
void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) {
_mm512_mask_storeu_epi8(ptr, k, a);
}
static really_inline
m512 set_mask_m512(__mmask64 k) {
return _mm512_movm_epi8(k);
}
static really_inline
m256 loadu_maskz_m256(__mmask32 k, const void *ptr) {
return _mm256_maskz_loadu_epi8(k, ptr);
}
#endif
// packed unaligned store of first N bytes
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2015-2017, Intel Corporation
+* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -50,6 +50,10 @@ bool target_t::can_run_on_code_built_for(const target_t &code_target) const {
return false;
}
if (!has_avx512vbmi() && code_target.has_avx512vbmi()) {
return false;
}
return true;
}
@@ -64,6 +68,10 @@ bool target_t::has_avx512(void) const {
return cpu_features & HS_CPU_FEATURES_AVX512;
}
bool target_t::has_avx512vbmi(void) const {
return cpu_features & HS_CPU_FEATURES_AVX512VBMI;
}
bool target_t::is_atom_class(void) const {
return tune == HS_TUNE_FAMILY_SLM || tune == HS_TUNE_FAMILY_GLM;
}
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2015-2016, Intel Corporation
+* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -42,6 +42,8 @@ struct target_t {
bool has_avx512(void) const;
bool has_avx512vbmi(void) const;
bool is_atom_class(void) const;
// This asks: can this target (the object) run on code that was built for
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2015-2016, Intel Corporation
+* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -101,6 +101,18 @@
#define or_m384(a, b) (or384(a, b))
#define or_m512(a, b) (or512(a, b))
#if defined(HAVE_AVX512VBMI)
#define expand_m128(a) (expand128(a))
#define expand_m256(a) (expand256(a))
#define expand_m384(a) (expand384(a))
#define expand_m512(a) (a)
#define shuffle_byte_m128(a, b) (pshufb_m512(b, a))
#define shuffle_byte_m256(a, b) (vpermb512(a, b))
#define shuffle_byte_m384(a, b) (vpermb512(a, b))
#define shuffle_byte_m512(a, b) (vpermb512(a, b))
#endif
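(The key difference these macros paper over: pshufb is a per-16-byte-lane table lookup, while AVX512 VBMI's vpermb indexes the full 64-byte table. A scalar model of both, for illustration only:)

#include <cstdint>

// Conceptual models of the two shuffles, not the real intrinsics.
// pshufb: per-16-byte-lane lookup; an index with bit 7 set yields 0.
// vpermb: full 64-byte table lookup using the low 6 bits of the index.
static void shuffle_models(const uint8_t tbl[64], const uint8_t idx[64],
                           uint8_t out_pshufb[64], uint8_t out_vpermb[64]) {
    for (int i = 0; i < 64; i++) {
        int lane = (i / 16) * 16; // pshufb stays inside element i's lane
        out_pshufb[i] = (idx[i] & 0x80) ? 0 : tbl[lane + (idx[i] & 0x0f)];
        out_vpermb[i] = tbl[idx[i] & 0x3f];
    }
}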
#define and_u8(a, b) ((a) & (b))
#define and_u32(a, b) ((a) & (b))
#define and_u64a(a, b) ((a) & (b))
@@ -88,6 +88,8 @@ public:
virtual void printStats() const = 0;
virtual void printCsvStats() const = 0;
virtual void sqlStats(SqlDB &db) const = 0;
};
@@ -187,6 +187,16 @@ void EngineChimera::printStats() const {
#endif
}
void EngineChimera::printCsvStats() const {
printf(",\"%s\"", compile_stats.signatures.c_str());
printf(",\"%zu\"", compile_stats.expressionCount);
printf(",\"0x%x\"", compile_stats.crc32);
printf(",\"%zu\"", compile_stats.compiledSize);
printf(",\"%zu\"", compile_stats.scratchSize);
printf(",\"%0.3Lf\"", compile_stats.compileSecs);
printf(",\"%u\"", compile_stats.peakMemorySize);
}
void EngineChimera::sqlStats(SqlDB &sqldb) const {
ostringstream crc;
crc << "0x" << hex << compile_stats.crc32;
@@ -89,6 +89,8 @@ public:
void printStats() const;
void printCsvStats() const;
void sqlStats(SqlDB &db) const;
private:
@@ -276,6 +276,17 @@ void EngineHyperscan::printStats() const {
#endif
}
void EngineHyperscan::printCsvStats() const {
printf(",\"%s\"", compile_stats.signatures.c_str());
printf(",\"%zu\"", compile_stats.expressionCount);
printf(",\"0x%x\"", compile_stats.crc32);
printf(",\"%zu\"", compile_stats.compiledSize);
printf(",\"%zu\"", compile_stats.streamSize);
printf(",\"%zu\"", compile_stats.scratchSize);
printf(",\"%0.3Lf\"", compile_stats.compileSecs);
printf(",\"%u\"", compile_stats.peakMemorySize);
}
void EngineHyperscan::sqlStats(SqlDB &sqldb) const {
ostringstream crc;
crc << "0x" << hex << compile_stats.crc32;
@@ -65,8 +65,8 @@ public:
class EngineHSStream : public EngineStream {
public:
~EngineHSStream();
-hs_stream_t *id;
-EngineHSContext *ctx;
+hs_stream_t *id = nullptr;
+EngineHSContext *ctx = nullptr;
};
/** Hyperscan Engine for scanning data. */
@@ -98,6 +98,8 @@ public:
void printStats() const;
void printCsvStats() const;
void sqlStats(SqlDB &db) const;
private:
@@ -227,6 +227,15 @@ void EnginePCRE::printStats() const {
#endif
}
void EnginePCRE::printCsvStats() const {
printf(",\"%s\"", compile_stats.signatures.c_str());
printf(",\"%zu\"", compile_stats.expressionCount);
printf(",\"%zu\"", compile_stats.compiledSize);
printf(",\"%zu\"", compile_stats.scratchSize);
printf(",\"%0.3Lf\"", compile_stats.compileSecs);
printf(",\"%u\"", compile_stats.peakMemorySize);
}
void EnginePCRE::sqlStats(SqlDB &sqldb) const {
ostringstream crc;
@@ -62,7 +62,7 @@ public:
struct PcreDB {
bool highlander = false;
bool utf8 = false;
-u32 id;
+u32 id = 0;
pcre *db = nullptr;
pcre_extra *extra = nullptr;
};
@@ -97,6 +97,8 @@ public:
void printStats() const;
void printCsvStats() const;
void sqlStats(SqlDB &db) const;
private:
@@ -98,6 +98,7 @@ bool display_per_scan = false;
ScanMode scan_mode = ScanMode::STREAMING;
bool useHybrid = false;
bool usePcre = false;
bool dumpCsvOut = false;
unsigned repeats = 20;
string exprPath("");
string corpusFile("");
@@ -211,6 +212,7 @@ void usage(const char *error) {
printf(" Benchmark with threads on specified CPUs or CPU"
" range.\n");
#endif
printf(" -C Dump CSV output for tput matrix.\n");
printf(" -i DIR Don't compile, load from files in DIR" printf(" -i DIR Don't compile, load from files in DIR"
" instead.\n"); " instead.\n");
printf(" -w DIR After compiling, save to files in DIR.\n"); printf(" -w DIR After compiling, save to files in DIR.\n");
@@ -275,6 +277,9 @@ void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
case 'c':
corpusFile.assign(optarg);
break;
case 'C':
dumpCsvOut = true;
break;
case 'd': {
unsigned dist;
if (!fromString(optarg, dist)) {
@@ -849,6 +854,40 @@ void displayResults(const vector<unique_ptr<ThreadContext>> &threads,
}
}
/** Dump benchmark results to csv. */
static
void displayCsvResults(const vector<unique_ptr<ThreadContext>> &threads,
const vector<DataBlock> &corpus_blocks) {
u64a bytesPerRun = byte_size(corpus_blocks);
u64a matchesPerRun = threads[0]->results[0].matches;
// Sanity check: all of our results should have the same match count.
for (const auto &t : threads) {
if (!all_of(begin(t->results), end(t->results),
[&matchesPerRun](const ResultEntry &e) {
return e.matches == matchesPerRun;
})) {
printf("\nWARNING: PER-SCAN MATCH COUNTS ARE INCONSISTENT!\n\n");
break;
}
}
u64a totalBytes = bytesPerRun * repeats * threads.size();
u64a totalBlocks = corpus_blocks.size() * repeats * threads.size();
printf(",\"%0.3f\"", totalSecs);
printf(",\"%0.2Lf\"", calc_mbps(totalSecs, totalBytes));
assert(bytesPerRun);
double matchRate = ((double)matchesPerRun * 1024) / bytesPerRun;
printf(",\"%llu\"", matchesPerRun);
printf(",\"%0.3f\"", matchRate);
double blockRate = (double)totalBlocks / (double)totalSecs;
printf(",\"%0.2f\"", blockRate);
printf("\n");
}
/** Dump per-scan throughput data to sql. */ /** Dump per-scan throughput data to sql. */
static static
void sqlPerScanResults(const vector<unique_ptr<ThreadContext>> &threads, void sqlPerScanResults(const vector<unique_ptr<ThreadContext>> &threads,
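Since matchRate scales matchesPerRun by 1024 before dividing by bytesPerRun, it reads as matches per kilobyte of corpus: for example, 500 matches over a 2,097,152-byte run gives 500 * 1024 / 2097152 ≈ 0.244.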
@@ -982,7 +1021,9 @@ void runBenchmark(const Engine &db,
        t->join();
    }

-   if (sqloutFile.empty()) {
+   if (dumpCsvOut) {
+       displayCsvResults(threads, corpus_blocks);
+   } else if (sqloutFile.empty()) {
        // Display global results.
        displayResults(threads, corpus_blocks);
    } else {
@@ -1059,7 +1100,9 @@ int HS_CDECL main(int argc, char *argv[]) {
            exit(1);
        }

-       if (sqloutFile.empty()) {
+       if (dumpCsvOut) {
+           engine->printCsvStats();
+       } else if (sqloutFile.empty()) {
            // Display global results.
            engine->printStats();
            printf("\n");

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@@ -83,9 +83,10 @@ protected:
    const map<u32, u32> fixed_depth_tops;
    const map<u32, vector<vector<CharReach>>> triggers;
    bool compress_state = false;
    bool fast_nfa = false;
    nfa = constructNFA(*g, &rm, fixed_depth_tops, triggers, compress_state,
-                      type, cc);
+                      fast_nfa, type, cc);
    ASSERT_TRUE(nfa != nullptr);

    full_state = make_bytecode_ptr<char>(nfa->scratchStateSize, 64);
@@ -376,9 +377,10 @@ protected:
    const map<u32, u32> fixed_depth_tops;
    const map<u32, vector<vector<CharReach>>> triggers;
    bool compress_state = false;
    bool fast_nfa = false;
    nfa = constructNFA(*g, &rm, fixed_depth_tops, triggers, compress_state,
-                      type, cc);
+                      fast_nfa, type, cc);
    ASSERT_TRUE(nfa != nullptr);

    full_state = make_bytecode_ptr<char>(nfa->scratchStateSize, 64);
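For context, a minimal sketch of the updated builder call as these tests now make it, assuming fast_nfa is an out-parameter that constructNFA fills in when the generated LimEx NFA qualifies for the fast model (the NFA/DFA generation decision mentioned in the changelog):

    bool compress_state = false;
    bool fast_nfa = false;  // out-param; assumption: set by constructNFA
    nfa = constructNFA(*g, &rm, fixed_depth_tops, triggers, compress_state,
                       fast_nfa, type, cc);
    ASSERT_TRUE(nfa != nullptr);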