Compare commits


59 Commits

Author SHA1 Message Date
Chang, Harry
c37166d52b Merge branch 'github_develop' into github_master 2023-02-21 23:03:50 +00:00
Chang, Harry
f815639830 Bump version number for release 2023-02-21 22:57:45 +00:00
Chang, Harry
a775768988 changelog: updates for 5.4.1 release 2023-02-21 22:52:57 +00:00
Hong, Yang A
2fbef65905 fix nfa dump error 2023-02-15 05:51:00 +00:00
Hong, Yang A
277fc40089 scratch: add quick validity check
fix github issue #350
2023-02-15 05:51:00 +00:00
Hong, Yang A
5aa4bd565f stream close: free stream to avoid memory leak
fix github issue #303
2023-02-15 05:51:00 +00:00
Hong, Yang A
f47b69a01d Silence clang-14 warnings 2023-02-15 05:51:00 +00:00
Hong, Yang A
9e254af71f Fix cmake CMP0115 warning for CMake 3.20 and above 2023-02-15 05:51:00 +00:00
Hong, Yang A
c81293c696 update year 2022 2023-02-15 05:51:00 +00:00
Hong, Yang A
c1539d32df UTF-8 validation: fix one codec check corner issue
fix github issue #362
2023-02-15 05:51:00 +00:00
Hong, Yang A
44b5955ecd chimera: fix SKIP flag issue
fix github issue #360
2023-02-15 05:51:00 +00:00
Hong, Yang A
e1f4542e65 stringop-overflow compatible fix 2023-02-15 05:51:00 +00:00
Hong, Yang A
a3ba1ad369 gcc-10(and above): fix compile issue caused by stringop-overflow 2023-02-15 05:49:59 +00:00
Liu Zixian
676490427c Add comment for stack size
Linux kernel default stack size should be enough for hscollider.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/include/uapi/asm/signal.h
2023-02-15 05:49:59 +00:00
Liu Zixian
838a04e66f fix build with glibc-2.34
SIGSTKSZ is no longer a constant after glibc 2.34
https://sourceware.org/pipermail/libc-alpha/2021-August/129718.html
2023-02-15 05:49:59 +00:00
Hong, Yang A
f194a85d51 klocwork: fix risk issues 2023-02-15 05:49:59 +00:00
Chang, Harry
7bf5a9f5cd Corpus editor: fix random char value of UTF-8. 2023-02-15 05:49:04 +00:00
Chang, Harry
811f909d41 Corpus generator: fix random char value of UTF-8.
fixes github issue #184
2023-02-15 05:49:04 +00:00
Hong, Yang A
47bc68339f bugfix: fix overflow risk of strlen function 2023-02-15 05:48:20 +00:00
Hong, Yang A
1baf340d1c sanitiser bugfix 2023-02-15 05:47:58 +00:00
hongyang7
9b4ba34c68 Fix segfaults on allocation failure (#4)
Throw std::bad_alloc instead of returning nullptr from
ue2::AlignedAllocator. Allocators for STL containers are expected never
to return with an invalid pointer, and instead must throw on failure.
Violating this expectation can lead to invalid pointer dereferences.

Co-authored-by: johanngan <johanngan.us@gmail.com>
2023-02-15 05:47:36 +00:00
Hong, Yang A
85019432f4 bugfix: add vbmi case for test in database.cpp 2023-02-15 05:47:19 +00:00
Hong, Yang A
b386cbd20d bugfix: add vbmi platform parameter for tests in single.cpp 2023-02-15 05:47:19 +00:00
Chang, Harry
b254a88c43 Logical Combination: bypass combination flag in hs_expression_info.
Fixes github issue #291
2023-02-15 05:46:27 +00:00
Hong, Yang A
819da8df17 update year for bugfix #302-#305 2023-02-15 05:46:27 +00:00
Hong, Yang A
7f4a806118 mcclellan: improve wide-state checking in Sherman optimization
fixes github issue #305
2023-02-15 05:46:27 +00:00
Hong, Yang A
0b246c801a literal API: add instruction support
fixes github issue #303
2023-02-15 05:46:27 +00:00
Hong, Yang A
9e17e8520f literal API: add empty string check.
fixes github issue #302, #304
2023-02-15 05:46:27 +00:00
Wang Xiang W
1ecb3aef8b simd_utils: fix undefined instruction issue for 32-bit system
fixes github issue #292
2023-02-15 05:46:27 +00:00
Wang Xiang W
62e35c910b fat runtime: fix libc symbol parsing
fixes github issue #292
2023-02-15 05:46:27 +00:00
Hong, Yang A
95cd19c6f0 Example code: update header position 2023-02-15 05:46:27 +00:00
Hong, Yang A
98daf283b1 Example code: update year 2023-02-15 05:46:27 +00:00
Hong, Yang A
e0c489f98f Example code: bugfix of KW scan. 2023-02-15 05:46:27 +00:00
Hong, Yang A
64a995bf44 Merge branch 'github_develop' into github_master 2021-01-13 14:39:34 +00:00
Wang Xiang W
433d2f386a Bump version number for release 2021-01-13 12:26:47 +00:00
Wang Xiang W
76066b9ef2 changelog: updates for 5.4.0 release 2021-01-13 12:26:47 +00:00
Chang, Harry
66dc649197 Fix Klocwork scan issues. 2021-01-13 12:26:47 +00:00
Wang Xiang W
d1ea4c762a chimera: fix return value handling
Fixes github issue #270
2021-01-13 12:26:47 +00:00
Wang Xiang W
2945c9bd20 Limex: exception handling with AVX512 2021-01-13 12:26:47 +00:00
Chang, Harry
20e69f6ad8 Logical Combination: use hs_misc_free instead of free.
fixes github issue #284
2021-01-13 12:26:47 +00:00
Hong, Yang A
845ea5c9e3 examples: add cmake enabling option BUILD_EXAMPLES. 2021-01-13 12:26:47 +00:00
Piotr Skamruk
b16c6200ee [dev-reference] Fix minor typo in docs 2021-01-13 12:26:47 +00:00
Walt Stoneburner
1a43a63218 Fixed several typos
Fixed spellings of regular, interpretation, and grammar to improve readability.

Fixes github issue #242
2021-01-13 12:26:47 +00:00
Wang Xiang W
04d3be487d Adjust sensitive terms 2021-01-13 12:26:47 +00:00
Wang Xiang W
5eab583df5 limex: add fast NFA check 2021-01-13 12:26:47 +00:00
Chang, Harry
ddc247516c Discard HAVE_AVX512VBMI checks at Sheng/McSheng compile time. 2021-01-13 12:26:47 +00:00
Chang, Harry
5326b3e688 Add cpu feature / target info "AVX512VBMI". 2021-01-13 12:26:47 +00:00
Zhu,Wenjun
0102f03c9c MCSHENG64: extend to 64-state based on mcsheng 2021-01-13 12:26:47 +00:00
Hong, Yang A
f06e19e6cb lookaround:
add 64x8 and 64x16 shufti models
add mask64 model
expand entry quantity
2021-01-13 12:26:47 +00:00
Chang, Harry
00b697bb3b AVX512VBMI Fat Teddy. 2021-01-13 12:26:47 +00:00
Chang, Harry
007117146c Fix find_vertices_in_cycles(): don't check self-loop in SCC. 2021-01-13 12:26:47 +00:00
Chang, Harry
1bd99d9318 Fix cmake error on ICX under release mode. 2021-01-13 12:26:47 +00:00
Chang, Harry
0c4c149433 Fix sheng64 dump compile issue in clang. 2021-01-13 12:26:47 +00:00
Chang, Harry
d8dc1ad685 Fix sheng64 compile issue in clang and in DEBUG_OUTPUT mode on SKX. 2021-01-13 12:26:47 +00:00
Chang, Harry
27ab2e086d SHENG64: 64-state 1-byte shuffle based DFA. 2021-01-13 12:26:47 +00:00
Chang, Harry
cf06d552f8 SHENG32: Compile priority sheng > mcsheng > sheng32. 2021-01-13 12:26:47 +00:00
Chang, Harry
33cef12050 SHENG32: 32-state 1-byte shuffle based DFA. 2021-01-13 12:26:47 +00:00
Hong, Yang A
15f0ccd1b8 DFA: use sherman economically 2021-01-13 12:23:04 +00:00
Wang Xiang W
475ad00f53 hsbench: add CSV dump support 2021-01-13 12:20:49 +00:00
490 changed files with 13624 additions and 1415812 deletions


@@ -1,11 +0,0 @@
#unit/gtest/gtest-all.cc,build/src/parser/Parser.cpp,build/src/parser/control_verbs.cpp
#Dont change first comment ignores specific files from clang-tidy
Checks: 'clang-analyzer-*,-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,performance-*,-performance-unnecessary-value-param,-performance-avoid-endl'
WarningsAsErrors: ''
HeaderFilterRegex: '.*'
SystemHeaders: false
FormatStyle: none
InheritParentConfig: true
User: user
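The deleted `.clang-tidy` above selects which checks run; clang-tidy picks the file up automatically by searching upward from each source file. A minimal invocation sketch, assuming a CMake build tree that exports `compile_commands.json` (the source path is illustrative):
```
$ cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=On ../
$ clang-tidy -p build src/scratch.c
```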

.gitmodules

@@ -1,3 +0,0 @@
[submodule "simde"]
path = simde
url = https://github.com/simd-everywhere/simde.git
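Since the deleted `.gitmodules` pins SIMDe as a submodule at path `simde`, a checkout that still uses it must fetch the submodule explicitly; a sketch (clone URL taken from the Wiki links elsewhere in this compare):
```
$ git clone --recursive https://github.com/VectorCamp/vectorscan.git
$ # or, inside an existing clone:
$ git submodule update --init simde
```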


@@ -1,66 +0,0 @@
# Vectorscan Change Log
This is a list of notable changes to Vectorscan, in reverse chronological order. For Hyperscan Changelog, check CHANGELOG.md
## [5.4.11] 2023-11-19
- Refactor CMake build system to be much more modular.
- version in hs.h fell out of sync again #175
- Fix compile failures with recent compilers, namely clang-15 and gcc-13
- Fix clang 15,16 compilation errors on all platforms, refactor CMake build system #181
- Fix signed/unsigned char issue on Arm with Ragel generated code.
- Correct set_source_files_properties usage #189
- Fix build failure on Ubuntu 20.04
- Support building on Ubuntu 20.04 #180
- Require pkg-config during Cmake
- make pkgconfig a requirement #188
- Fix segfault on Fat runtimes with SVE2 code
- Move VERM16 enums to the end of the list #191
- Update README.md, add CHANGELOG-vectorscan.md and Contributors-vectorscan.md files
## [5.4.10] 2023-09-23
- Fix compilation with libcxx 16 by @rschu1ze in #144
- Fix use-of-uninitialized-value due to getData128() by @azat in #148
- Use std::vector instead of boost::container::small_vector under MSan by @azat in #149
- Feature/enable fat runtime arm by @markos in #165
- adding ifndef around HS_PUBLIC_API definition so that vectorscan can be statically linked into another shared library without exporting symbols by @jeffplaisance in #164
- Feature/backport hyperscan 2023 q3 by @markos in #169
- Prepare for 5.4.10 by @markos in #167
## [5.4.9] 2023-03-23
- Major change: Enable SVE & SVE2 builds and make it a supported architecture! (thanks to @abondarev84)
- Fix various clang-related bugs
- Fix Aarch64 bug in Parser.rl because of char signedness. Make unsigned char the default in the Parser for all architectures.
- Fix Power bug, multiple tests were failing.
- C++20 related change, use prefixed assume_aligned to avoid conflict with C++20 std::assume_aligned.
## [5.4.8] 2022-09-13
- CMake: Use non-deprecated method for finding python by @jth in #108
- Optimize vectorscan for aarch64 by using shrn instruction by @danlark1 in #113
- Fixed the PCRE download location by @pareenaverma in #116
- Bugfix/hyperscan backport 202208 by @markos in #118
- VSX optimizations by @markos in #119
- when compiling with mingw64, use __mingw_aligned_malloc() and __mingw_aligned_free() by @liquidaty in #121
- [NEON] simplify/optimize shift/align primitives by @markos in #123
- Merge develop to master by @markos in #124
## [5.4.7] 2022-05-05
- Fix word boundary assertions under C++20 by @BigRedEye in #90
- Fix all ASAN issues in vectorscan by @danlark1 in #93
- change FAT_RUNTIME to a normal option so it can be set to off by @a16bitsysop in #94
- Optimized and correct version of movemask128 for ARM by @danlark1 in #102
## [5.4.6] 2022-01-21
- Major refactoring of many engines to use internal SuperVector C++ templates library. Code size reduced to 1/3rd with no loss of performance in most cases.
- Microbenchmarking tool added for performance finetuning
- Arm Advanced SIMD/NEON fully ported. Initial work on SVE2 for a couple of engines.
- Power9 VSX ppc64le fully ported. Initial port needs some optimization.
- Clang compiler support added.
- Apple M1 support added.
- CI added, the following configurations are tested on every PR:
  - gcc-debug, gcc-release, clang-debug, clang-release:
    - Linux Intel: SSE4.2, AVX2, AVX512, FAT
    - Linux Arm
    - Linux Power9
  - clang-debug, clang-release:
    - MacOS Apple M1
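The 5.4.7 entry above made `FAT_RUNTIME` an ordinary CMake option; a hedged sketch of switching it off at configure time:
```
$ cmake -DFAT_RUNTIME=off ../
```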


@@ -2,14 +2,6 @@
 This is a list of notable changes to Hyperscan, in reverse chronological order.
 
-## [5.4.2] 2023-04-19
-- Roll back bugfix for github issue #350: Besides using scratch for
-  corresponding database, Hyperscan also allows user to use larger scratch
-  allocated for another database. Users can leverage this property to achieve
-  safe scratch usage in multi-database scenarios. Behaviors beyond these are
-  discouraged and results are undefined.
-- Fix hsdump issue due to invalid nfa type.
-
 ## [5.4.1] 2023-02-20
 - The Intel Hyperscan team is pleased to provide a bug fix release to our open source library.
   Intel also maintains an upgraded version available through your Intel sales representative.

File diff suppressed because it is too large.


@@ -1,5 +1,4 @@
 Copyright (c) 2015, Intel Corporation
-Copyright (c) 2019-20, VectorCamp PC
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:


@@ -1,25 +0,0 @@
394 Konstantinos Margaritis <konstantinos@vectorcamp.gr>
59 apostolos <apostolos.tapsas@vectorcamp.gr>
25 Hong, Yang A <yang.a.hong@intel.com>
19 George Wort <george.wort@arm.com>
16 Chang, Harry <harry.chang@intel.com>
7 Danila Kutenin <danilak@google.com>
7 Wang Xiang W <xiang.w.wang@intel.com>
6 Alex Bondarev <abondarev84@gmail.com>
5 Konstantinos Margaritis <konma@vectorcamp.gr>
3 Duncan Bellamy <dunk@denkimushi.com>
2 Azat Khuzhin <a3at.mail@gmail.com>
2 Jan Henning <jan.thilo.henning@sap.com>
1 BigRedEye <mail@bigredeye.me>
1 Daniel Kutenin <kutdanila@yandex.ru>
1 Danila Kutenin <kutdanila@yandex.ru>
1 Liu Zixian <hdu_sdlzx@163.com>
1 Mitchell Wasson <miwasson@cisco.com>
1 Piotr Skamruk <piotr.skamruk@gmail.com>
1 Robbie Williamson <robbie.williamson@arm.com>
1 Robert Schulze <robert@clickhouse.com>
1 Walt Stoneburner <wls@wwco.com>
1 Zhu,Wenjun <wenjun.zhu@intel.com>
1 hongyang7 <yang.a.hong@intel.com>
1 jplaisance <jeffplaisance@gmail.com>
1 liquidaty <info@liquidaty.com>


@@ -2,11 +2,6 @@ Hyperscan is licensed under the BSD License.
 Copyright (c) 2015, Intel Corporation
 
-Vectorscan is licensed under the BSD License.
-Copyright (c) 2020, VectorCamp PC
-Copyright (c) 2021, Arm Limited
-
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

README.md

@@ -1,252 +1,43 @@
-# About Vectorscan
-A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD
-and Power VSX are 100% functional. ARM SVE2 support is ongoing, with
-access to hardware now. More platforms will follow in the future.
-Furthermore, starting with 5.4.12 there is now a [SIMDe](https://github.com/simd-everywhere/simde)
-port, which can either be used for platforms without official SIMD support,
-as SIMDe can emulate SIMD instructions, or as an alternative backend for existing architectures,
-for reference and comparison purposes.
-Vectorscan will follow Intel's API and internal algorithms where possible, but will not
-hesitate to make code changes where it is thought of giving better performance or better
-portability. In addition, the code will be gradually simplified and made more uniform and
-all architecture specific -currently Intel- #ifdefs will be removed and abstracted away.
-# Why was there a need for a fork?
-Originally, the ARM porting was intended to be merged into Intel's own Hyperscan, and relevant
-Pull Requests were made to the project for this reason. Unfortunately, the
-PRs were rejected for now and the foreseeable future, thus we have created Vectorscan for
-our own multi-architectural and opensource collaborative needs.
-The recent license change of Hyperscan makes Vectorscan even more relevant for the FLOSS ecosystem.
-# What is Vectorscan/Hyperscan?
-Hyperscan and by extension Vectorscan is a high-performance multiple regex matching library. It follows the
+# Hyperscan
+Hyperscan is a high-performance multiple regex matching library. It follows the
 regular expression syntax of the commonly-used libpcre library, but is a
 standalone library with its own C API.
-Hyperscan/Vectorscan uses hybrid automata techniques to allow simultaneous matching of
+Hyperscan uses hybrid automata techniques to allow simultaneous matching of
 large numbers (up to tens of thousands) of regular expressions and for the
 matching of regular expressions across streams of data.
-Vectorscan is typically used in a DPI library stack, just like Hyperscan.
+Hyperscan is typically used in a DPI library stack.
+# Documentation
+Information on building the Hyperscan library and using its API is available in
+the [Developer Reference Guide](http://intel.github.io/hyperscan/dev-reference/).
 # License
-Vectorscan follows a BSD License like the original Hyperscan (up to 5.4).
-Vectorscan continues to be an open source project and we are committed to keep it that way.
-See the LICENSE file in the project repository.
-## Hyperscan License Change after 5.4
-According to
-[Accelerate Snort Performance with Hyperscan and Intel Xeon Processors on Public Clouds](https://networkbuilders.intel.com/docs/networkbuilders/accelerate-snort-performance-with-hyperscan-and-intel-xeon-processors-on-public-clouds-1680176363.pdf) versions of Hyperscan later than 5.4 are
-going to be closed-source:
-> The latest open-source version (BSD-3 license) of Hyperscan on Github is 5.4. Intel conducts continuous internal
-> development and delivers new Hyperscan releases under Intel Proprietary License (IPL) beginning from 5.5 for interested
-> customers. Please contact authors to learn more about getting new Hyperscan releases.
+Hyperscan is licensed under the BSD License. See the LICENSE file in the
+project repository.
 # Versioning
-The `master` branch on Github will always contain the most recent stable release of
+The `master` branch on Github will always contain the most recent release of
 Hyperscan. Each version released to `master` goes through QA and testing before
 it is released; if you're a user, rather than a developer, this is the version
 you should be using.
 Further development towards the next release takes place on the `develop`
-branch. All PRs are first made against the develop branch and if they pass the [Vectorscan CI](https://buildbot-ci.vectorcamp.gr/#/grid), then they get merged. Similarly with PRs from develop to master.
-# Compatibility with Hyperscan
-Vectorscan aims to be ABI and API compatible with the last open source version of Intel Hyperscan 5.4.
-After careful consideration we decided that we will **NOT** aim to achieve compatibility with later Hyperscan versions 5.5/5.6 that have extended Hyperscan's API.
-If you want to keep up to date with the latest API of Hyperscan, you should talk to Intel and get a license to use that.
-However, we intend to extend Vectorscan's API with user requested changes or API extensions and improvements that we think are best for the project.
-# Installation
-## Debian/Ubuntu
-On recent Debian/Ubuntu systems, vectorscan should be directly available for installation:
-```
-$ sudo apt install libvectorscan5
-```
-Or to install the devel package you can install the `libvectorscan-dev` package:
-```
-$ sudo apt install libvectorscan-dev
-```
-For other distributions/OSes please check the [Wiki](https://github.com/VectorCamp/vectorscan/wiki/Installation-from-package)
-# Build Instructions
-The build system has recently been refactored to be more modular and easier to extend. For that reason,
-some small but necessary changes were made that might break compatibility with how Hyperscan was built.
-## Install Common Dependencies
-### Debian/Ubuntu
-In order to build on Debian/Ubuntu make sure you install the following build-dependencies
-```
-$ sudo apt install build-essential cmake ragel pkg-config libsqlite3-dev libpcap-dev
-```
-### Other distributions
-TBD
-### MacOS X (M1/M2/M3 CPUs only)
-Assuming an existing HomeBrew installation:
-```
-% brew install boost cmake gcc libpcap pkg-config ragel sqlite
-```
-### *BSD
-In NetBSD you will almost certainly need to have a newer compiler installed.
-Also you will need to install cmake, sqlite, boost and ragel.
-Also, libpcap is necessary for some of the benchmarks, so let's install that
-as well.
-When using pkgsrc, you would typically do this using something
-similar to
-```
-pkg_add gcc12-12.3.0.tgz
-pkg_add boost-headers-1.83.0.tgz boost-jam-1.83.0.tgz boost-libs-1.83.0nb1.tgz
-pkg_add ragel-6.10.tgz
-pkg_add cmake-3.28.1.tgz
-pkg_add sqlite3-3.44.2.tgz
-pkg_add libpcap-1.10.4.tgz
-```
-Version numbers etc will of course vary. One would either download the
-binary packages or build them using pkgsrc. There exist some NetBSD pkg
-tools like ```pkgin``` which help download e.g. dependencies as binary packages,
-but overall NetBSD leaves a lot of detail exposed to the user.
-The main package system used in NetBSD is pkgsrc and one will probably
-want to read up more about it than is in the scope of this document.
-See https://www.netbsd.org/docs/software/packages.html for more information.
-This will not replace the compiler in the standard base distribution, and
-cmake will probably find the base dist's compiler when it checks automatically.
-Using the example of gcc12 from pkgsrc, one will need to set two
-environment variables before starting:
-```
-export CC="/usr/pkg/gcc12/bin/cc"
-export CXX="/usr/pkg/gcc12/bin/g++"
-```
-In FreeBSD similarly, you might want to install a different compiler.
-If you want to use gcc, it is recommended to use gcc12.
-You will also, as in NetBSD, need to install cmake, sqlite, boost and ragel packages.
-Using the example of gcc12 from pkg, install the desired compiler:
-```
-pkg install gcc12
-pkg install boost-all
-pkg install ragel
-pkg install cmake
-pkg install sqlite
-pkg install libpcap
-pkg install ccache
-```
-and then before beginning the cmake and build process, set
-the environment variables to point to this compiler:
-```
-export CC="/usr/local/bin/gcc"
-export CXX="/usr/local/bin/g++"
-```
-A further note for FreeBSD: on the PowerPC and ARM platforms
-the gcc12 package installs the compiler under a slightly different name; on FreeBSD/ppc,
-gcc12 will be found using:
-```
-export CC="/usr/local/bin/gcc12"
-export CXX="/usr/local/bin/g++12"
-```
-Then continue with the build as below.
-## Configure & build
-In order to configure with `cmake` first create and cd into a build directory:
-```
-$ mkdir build
-$ cd build
-```
-Then call `cmake` from inside the `build` directory:
-```
-$ cmake ../
-```
-Common options for CMake are:
-* `-DBUILD_STATIC_LIBS=[On|Off]` Build static libraries
-* `-DBUILD_SHARED_LIBS=[On|Off]` Build shared libraries (if none are set static libraries are built by default)
-* `-DCMAKE_BUILD_TYPE=[Release|Debug|RelWithDebInfo|MinSizeRel]` Configure build type and determine optimizations and certain features.
-* `-DUSE_CPU_NATIVE=[On|Off]` Native CPU detection is off by default, however it is possible to build a performance-oriented non-fat library tuned to your CPU
-* `-DFAT_RUNTIME=[On|Off]` Fat Runtime is only available for X86 32-bit/64-bit and AArch64 architectures and only on Linux. It is incompatible with `Debug` type and `USE_CPU_NATIVE`.
-### Specific options for X86 32-bit/64-bit (Intel/AMD) CPUs
-* `-DBUILD_AVX2=[On|Off]` Enable code for AVX2.
-* `-DBUILD_AVX512=[On|Off]` Enable code for AVX512. Implies `BUILD_AVX2`.
-* `-DBUILD_AVX512VBMI=[On|Off]` Enable code for AVX512 with VBMI extension. Implies `BUILD_AVX512`.
-### Specific options for Arm 64-bit CPUs
-* `-DBUILD_SVE=[On|Off]` Enable code for SVE, like on AWS Graviton3 CPUs. Not much code is ported just for SVE, but enabling SVE code production does improve code generation, see [Benchmarks](https://github.com/VectorCamp/vectorscan/wiki/Benchmarks).
-* `-DBUILD_SVE2=[On|Off]` Enable code for SVE2, implies `BUILD_SVE`. Most non-Neon code is written for SVE2.
-* `-DBUILD_SVE2_BITPERM=[On|Off]` Enable code for the SVE2_BITPERM hardware feature, implies `BUILD_SVE2`.
-## Other options
-* `SANITIZE=[address|memory|undefined]` (experimental) Use `libasan` sanitizer to detect possible bugs. For now only `address` is tested. This will eventually be integrated in the CI.
-## SIMDe options
-* `SIMDE_BACKEND=[On|Off]` Enable SIMDe backend. If this is chosen all native (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be disabled and a SIMDe SSE4.2 emulation backend will be enabled. This will enable Vectorscan to build and run on architectures without SIMD.
-* `SIMDE_NATIVE=[On|Off]` Enable SIMDe native emulation of x86 SSE4.2 intrinsics on the building platform. That is, SSE4.2 intrinsics will be emulated using Neon on an Arm platform, or VSX on a Power platform, etc.
-## Build
-If `cmake` has completed successfully you can run `make` in the same directory; if you have a multi-core system with `N` cores, running
-```
-$ make -j <N>
-```
-will speed up the process. If all goes well, you should have the vectorscan library compiled.
-# Contributions
-The official homepage for Vectorscan is at [www.github.com/VectorCamp/vectorscan](https://www.github.com/VectorCamp/vectorscan).
-# Vectorscan Development
-All development of Vectorscan is done in public.
-# Original Hyperscan links
-For reference, the official homepage for Hyperscan is at [www.hyperscan.io](https://www.hyperscan.io).
-# Hyperscan Documentation
-Information on building the Hyperscan library and using its API is available in
-the [Developer Reference Guide](http://intel.github.io/hyperscan/dev-reference/).
-And you can find the source code [on Github](https://github.com/intel/hyperscan).
-For Intel Hyperscan related issues and questions, please follow the relevant links there.
+branch.
+# Get Involved
+The official homepage for Hyperscan is at [www.hyperscan.io](https://www.hyperscan.io).
+If you have questions or comments, we encourage you to [join the mailing
+list](https://lists.01.org/mailman/listinfo/hyperscan). Bugs can be filed by
+sending email to the list, or by creating an issue on Github.
+If you wish to contact the Hyperscan team at Intel directly, without posting
+publicly to the mailing list, send email to
+[hyperscan@intel.com](mailto:hyperscan@intel.com).
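For a concrete pass through the configure-and-build steps the removed README describes, a typical x86 release build might look as follows (the chosen options are illustrative, not required):
```
$ mkdir build && cd build
$ cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_AVX2=On -DBUILD_STATIC_LIBS=On ../
$ make -j 4
```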


@@ -1,9 +0,0 @@
include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
if (NOT FAT_RUNTIME AND (BUILD_SHARED_LIBS OR BUILD_STATIC_LIBS))
add_executable(benchmarks benchmarks.cpp)
set_source_files_properties(benchmarks.cpp PROPERTIES COMPILE_FLAGS
"-Wall -Wno-unused-variable")
target_link_libraries(benchmarks hs)
endif()
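Per the guard above, the `benchmarks` target only exists for non-fat builds with at least one library type enabled; a hedged build-and-run sketch (the binary path depends on the generator):
```
$ cmake -DFAT_RUNTIME=Off -DBUILD_STATIC_LIBS=On ../
$ make benchmarks
$ ./benchmarks/benchmarks
```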


@@ -1,309 +0,0 @@
/*
* Copyright (c) 2020, 2021, VectorCamp PC
* Copyright (c) 2023, 2024, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <functional>
#include <iostream>
#include <memory>
#include "util/arch.h"
#include "benchmarks.hpp"
#define MAX_LOOPS 1000000000
#define MAX_MATCHES 5
#define N 8
struct hlmMatchEntry {
size_t to;
u32 id;
hlmMatchEntry(size_t end, u32 identifier) : to(end), id(identifier) {}
};
std::vector<hlmMatchEntry> ctxt;
static hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id,
UNUSED struct hs_scratch *scratch) { // cppcheck-suppress constParameterCallback
DEBUG_PRINTF("match @%zu = %u\n", to, id);
ctxt.push_back(hlmMatchEntry(to, id));
return HWLM_CONTINUE_MATCHING;
}
template <typename InitFunc, typename BenchFunc>
static void run_benchmarks(int size, int loops, int max_matches,
bool is_reverse, MicroBenchmark &bench,
InitFunc &&init, BenchFunc &&func) {
init(bench);
double total_sec = 0.0;
double max_bw = 0.0;
double avg_time = 0.0;
if (max_matches) {
double avg_bw = 0.0;
int pos = 0;
for (int j = 0; j < max_matches - 1; j++) {
bench.buf[pos] = 'b';
pos = (j + 1) * size / max_matches;
bench.buf[pos] = 'a';
u64a actual_size = 0;
auto start = std::chrono::steady_clock::now();
for (int i = 0; i < loops; i++) {
const u8 *res = func(bench);
if (is_reverse)
actual_size += bench.buf.data() + size - res;
else
actual_size += res - bench.buf.data();
}
auto end = std::chrono::steady_clock::now();
double dt = std::chrono::duration_cast<std::chrono::microseconds>(
end - start)
.count();
total_sec += dt; /* dt is in microseconds */
/* bandwidth: bytes per microsecond, scaled to bytes/s, then to MB/s */
double bw = (actual_size / dt) * 1000000.0 / 1048576.0;
avg_bw += bw;
max_bw = std::max(bw, max_bw);
/* accumulate the average time per loop (still in microseconds) */
avg_time += total_sec / loops;
}
avg_time /= max_matches;
avg_bw /= max_matches;
/* convert the accumulated total from microseconds to seconds for reporting */
total_sec /= 1000000.0;
printf("%-18s, %-12d, %-10d, %-6d, %-10.3f, %-9.3f, %-8.3f, %-7.3f\n",
bench.label, max_matches, size, loops, total_sec, avg_time, max_bw, avg_bw);
} else {
u64a total_size = 0;
auto start = std::chrono::steady_clock::now();
for (int i = 0; i < loops; i++) {
func(bench);
}
auto end = std::chrono::steady_clock::now();
total_sec +=
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count();
/*calculate transferred size*/
total_size = (u64a)size * (u64a)loops;
/*calculate average time*/
avg_time = total_sec / loops;
/*convert microseconds to seconds*/
total_sec /= 1000000.0;
/*calculate maximum bandwidth*/
max_bw = total_size / total_sec;
/*convert to MB/s*/
max_bw /= 1048576.0;
printf("%-18s, %-12s, %-10d, %-6d, %-10.3f, %-9.3f, %-8.3f, %-7s\n",
bench.label, "0", size, loops, total_sec, avg_time, max_bw, "0");
}
}
int main(){
const int matches[] = {0, MAX_MATCHES};
std::vector<size_t> sizes;
for (size_t i = 0; i < N; i++)
sizes.push_back(16000 << i * 2);
const char charset[] = "aAaAaAaAAAaaaaAAAAaaaaAAAAAAaaaAAaaa";
printf("%-18s, %-12s, %-10s, %-6s, %-10s, %-9s, %-8s, %-7s\n", "Matcher",
"max_matches", "size", "loops", "total_sec", "avg_time", "max_bw",
"avg_bw");
for (int m = 0; m < 2; m++) {
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Shufti", sizes[i]);
run_benchmarks(
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::shuftiBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark const &b) {
return shuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
b.buf.data() + b.size);
});
}
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Reverse Shufti", sizes[i]);
run_benchmarks(
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::shuftiBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark const &b) {
return rshuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
b.buf.data() + b.size);
});
}
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Truffle", sizes[i]);
run_benchmarks(
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark const &b) {
return truffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
b.buf.data() + b.size);
});
}
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Reverse Truffle", sizes[i]);
run_benchmarks(
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark const &b) {
return rtruffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
b.buf.data() + b.size);
});
}
#ifdef CAN_USE_WIDE_TRUFFLE
if(CAN_USE_WIDE_TRUFFLE) {
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Truffle Wide", sizes[i]);
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasksWide(b.chars, reinterpret_cast<u8 *>(&b.truffle_mask));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark const &b) {
return truffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size);
}
);
}
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Reverse Truffle Wide", sizes[i]);
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasksWide(b.chars, reinterpret_cast<u8 *>(&b.truffle_mask));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark const &b) {
return rtruffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size);
}
);
}
}
#endif
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Vermicelli", sizes[i]);
run_benchmarks(
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark const &b) {
return vermicelliExec('a', 'b', b.buf.data(),
b.buf.data() + b.size);
});
}
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Reverse Vermicelli", sizes[i]);
run_benchmarks(
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark const &b) {
return rvermicelliExec('a', 'b', b.buf.data(),
b.buf.data() + b.size);
});
}
for (size_t i = 0; i < std::size(sizes); i++) {
// we imitate the noodle unit tests
std::string str;
const size_t char_len = 5;
str.resize(char_len + 2);
srand(time(NULL)); /* seed the RNG once per literal, not per character */
for (size_t j = 0; j < char_len; j++) {
int key = rand() % 36;
str[j] = charset[key]; /* pick a random character for each position */
}
str[char_len + 1] = '\0';
MicroBenchmark bench("Noodle", sizes[i]);
run_benchmarks(
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
[&](MicroBenchmark &b) {
ctxt.clear();
memset(b.buf.data(), 'a', b.size);
u32 id = 1000;
ue2::hwlmLiteral lit(str, true, id);
b.nt = ue2::noodBuildTable(lit);
assert(b.nt.get() != nullptr);
},
[&](MicroBenchmark &b) { // cppcheck-suppress constParameterReference
noodExec(b.nt.get(), b.buf.data(), b.size, 0,
hlmSimpleCallback, &b.scratch);
return b.buf.data() + b.size;
});
}
}
return 0;
}


@@ -1,67 +0,0 @@
/*
* Copyright (c) 2020, 2021, VectorCamp PC
* Copyright (c) 2024, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "hwlm/hwlm_literal.h"
#include "hwlm/noodle_build.h"
#include "hwlm/noodle_engine.h"
#include "hwlm/noodle_internal.h"
#include "nfa/shufti.h"
#include "nfa/shufticompile.h"
#include "nfa/truffle.h"
#include "nfa/trufflecompile.h"
#include "nfa/vermicelli.hpp"
#include "scratch.h"
#include "util/bytecode_ptr.h"
class MicroBenchmark {
public:
struct hs_scratch scratch{};
char const *label;
size_t size;
std::vector<u8> buf;
ue2::bytecode_ptr<noodTable> nt;
ue2::CharReach chars;
// Shufti/Truffle
union {
m256 truffle_mask;
struct {
#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
m128 truffle_mask_lo;
m128 truffle_mask_hi;
#else
m128 truffle_mask_hi;
m128 truffle_mask_lo;
#endif
};
};
MicroBenchmark(char const *label_, size_t size_)
: label(label_), size(size_), buf(size_){};
};


@@ -33,15 +33,17 @@ target_link_libraries(chimera hs pcre)
 install(TARGETS chimera DESTINATION ${CMAKE_INSTALL_LIBDIR})
 
-# expand out library names for pkgconfig static link info
-foreach (LIB ${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES})
-    # this is fragile, but protects us from toolchain specific files
-    if (NOT EXISTS ${LIB})
-        set(PRIVATE_LIBS "${PRIVATE_LIBS} -l${LIB}")
-    endif()
-endforeach()
-set(PRIVATE_LIBS "${PRIVATE_LIBS} -L${LIBDIR} -lpcre")
-
-configure_file(libch.pc.in libch.pc @ONLY) # only replace @ quoted vars
-install(FILES ${CMAKE_BINARY_DIR}/chimera/libch.pc
-    DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+if (NOT WIN32)
+    # expand out library names for pkgconfig static link info
+    foreach (LIB ${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES})
+        # this is fragile, but protects us from toolchain specific files
+        if (NOT EXISTS ${LIB})
+            set(PRIVATE_LIBS "${PRIVATE_LIBS} -l${LIB}")
+        endif()
+    endforeach()
+    set(PRIVATE_LIBS "${PRIVATE_LIBS} -L${LIBDIR} -lpcre")
+
+    configure_file(libch.pc.in libch.pc @ONLY) # only replace @ quoted vars
+    install(FILES ${CMAKE_BINARY_DIR}/chimera/libch.pc
+        DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+endif()
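The `PRIVATE_LIBS` expansion above ends up in the `Libs.private` field of the generated `libch.pc`, which consumers only see when asking for static link flags; a usage sketch once the file is installed:
```
$ pkg-config --cflags --libs libch            # dynamic linking
$ pkg-config --static --cflags --libs libch   # adds Libs.private, e.g. -lpcre
```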


@@ -39,6 +39,7 @@
 #include "hs_internal.h"
 #include "ue2common.h"
 #include "util/compile_error.h"
+#include "util/make_unique.h"
 #include "util/multibit_build.h"
 #include "util/target_info.h"
@@ -494,7 +495,7 @@ void ch_compile_multi_int(const char *const *expressions, const unsigned *flags,
     // First, build with libpcre. A build failure from libpcre will throw
     // an exception up to the caller.
     auto patternData =
-        std::make_unique<PatternData>(myExpr, myFlags, i, myId, mode, match_limit,
+        ue2::make_unique<PatternData>(myExpr, myFlags, i, myId, mode, match_limit,
                                       match_limit_recursion, platform);
     pcres.push_back(move(patternData));
     PatternData &curr = *pcres.back();


@@ -1,56 +1,13 @@
-option(BUILD_AVX512 "Enabling support for AVX512" OFF)
-option(BUILD_AVX512VBMI "Enabling support for AVX512VBMI" OFF)
-set(SKYLAKE_ARCH "skylake-avx512")
-set(ICELAKE_ARCH "icelake-server")
-set(SKYLAKE_FLAG "-march=${SKYLAKE_ARCH}")
-set(ICELAKE_FLAG "-march=${ICELAKE_ARCH}")
-if (NOT FAT_RUNTIME)
-    if (BUILD_AVX512VBMI)
-        message (STATUS "AVX512VBMI implies AVX512, enabling BUILD_AVX512")
-        set(BUILD_AVX512 ON)
-        set(BUILD_AVX2 ON)
-        set(ARCH_C_FLAGS "${ICELAKE_FLAG}")
-        set(ARCH_CXX_FLAGS "${ICELAKE_FLAG}")
-        set(X86_ARCH "${ICELAKE_ARCH}")
-    elseif (BUILD_AVX512)
-        message (STATUS "AVX512 implies AVX2, enabling BUILD_AVX2")
-        set(BUILD_AVX2 ON)
-        set(ARCH_C_FLAGS "${SKYLAKE_FLAG}")
-        set(ARCH_CXX_FLAGS "${SKYLAKE_FLAG}")
-        set(X86_ARCH "${SKYLAKE_ARCH}")
-    elseif (BUILD_AVX2)
-        message (STATUS "Enabling BUILD_AVX2")
-        set(ARCH_C_FLAGS "-mavx2")
-        set(ARCH_CXX_FLAGS "-mavx2")
-        set(X86_ARCH "core-avx2")
-    else()
-        set(ARCH_C_FLAGS "-msse4.2")
-        set(ARCH_CXX_FLAGS "-msse4.2")
-        set(X86_ARCH "x86-64-v2")
-    endif()
-else()
-    set(BUILD_AVX512VBMI ON)
-    set(BUILD_AVX512 ON)
-    set(BUILD_AVX2 ON)
-    set(ARCH_C_FLAGS "-msse4.2")
-    set(ARCH_CXX_FLAGS "-msse4.2")
-    set(X86_ARCH "x86-64-v2")
-endif()
-set(CMAKE_REQUIRED_FLAGS "${ARCH_C_FLAGS}")
-CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
-CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
-CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
-CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)
+# detect architecture features
+#
+# must be called after determining where compiler intrinsics are defined
 if (HAVE_C_X86INTRIN_H)
     set (INTRIN_INC_H "x86intrin.h")
 elseif (HAVE_C_INTRIN_H)
     set (INTRIN_INC_H "intrin.h")
-else()
-    message (FATAL_ERROR "No intrinsics header found for SSE/AVX2/AVX512")
+else ()
+    message (FATAL_ERROR "No intrinsics header found")
 endif ()
 
 if (BUILD_AVX512)
@@ -67,15 +24,30 @@ if (BUILD_AVX512VBMI)
 endif ()
 endif ()
 
-# ensure we have the minimum of SSE4.2 - call a SSE4.2 intrinsic
+if (FAT_RUNTIME)
+    # test the highest level microarch to make sure everything works
+    if (BUILD_AVX512)
+        if (BUILD_AVX512VBMI)
+            set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ICELAKE_FLAG}")
+        else ()
+            set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}")
+        endif (BUILD_AVX512VBMI)
+    else ()
+        set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2")
+    endif ()
+else (NOT FAT_RUNTIME)
+    # if not fat runtime, then test given cflags
+    set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}")
+endif ()
+
+# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
 CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
 int main() {
     __m128i a = _mm_set1_epi8(1);
     (void)_mm_shuffle_epi8(a, a);
-}" HAVE_SSE42)
+}" HAVE_SSSE3)
 
 # now look for AVX2
-set(CMAKE_REQUIRED_FLAGS "-mavx2")
 CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
 #if !defined(__AVX2__)
 #error no avx2
@@ -87,7 +59,6 @@ int main(){
 }" HAVE_AVX2)
 
 # and now for AVX512
-set(CMAKE_REQUIRED_FLAGS "${SKYLAKE_FLAG}")
 CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
 #if !defined(__AVX512BW__)
 #error no avx512bw
@@ -99,7 +70,6 @@ int main(){
 }" HAVE_AVX512)
 
 # and now for AVX512VBMI
-set(CMAKE_REQUIRED_FLAGS "${ICELAKE_FLAG}")
 CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
 #if !defined(__AVX512VBMI__)
 #error no avx512vbmi
@@ -112,10 +82,10 @@ int main(){
 }" HAVE_AVX512VBMI)
 
 if (FAT_RUNTIME)
-    if (NOT HAVE_SSE42)
-        message(FATAL_ERROR "SSE4.2 support required to build fat runtime")
+    if (NOT HAVE_SSSE3)
+        message(FATAL_ERROR "SSSE3 support required to build fat runtime")
     endif ()
-    if (BUILD_AVX2 AND NOT HAVE_AVX2)
+    if (NOT HAVE_AVX2)
         message(FATAL_ERROR "AVX2 support required to build fat runtime")
     endif ()
     if (BUILD_AVX512 AND NOT HAVE_AVX512)
@@ -125,7 +95,7 @@ if (FAT_RUNTIME)
         message(FATAL_ERROR "AVX512VBMI support requested but not supported")
     endif ()
 else (NOT FAT_RUNTIME)
-    if (NOT BUILD_AVX2)
+    if (NOT HAVE_AVX2)
         message(STATUS "Building without AVX2 support")
     endif ()
     if (NOT HAVE_AVX512)
@@ -134,7 +104,10 @@ else (NOT FAT_RUNTIME)
     if (NOT HAVE_AVX512VBMI)
         message(STATUS "Building without AVX512VBMI support")
     endif ()
-    if (NOT HAVE_SSE42)
-        message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required")
+    if (NOT HAVE_SSSE3)
+        message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
     endif ()
 endif ()
+
+unset (CMAKE_REQUIRED_FLAGS)
+unset (INTRIN_INC_H)
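Each `CHECK_C_SOURCE_COMPILES` probe above amounts to compiling a tiny test program with the candidate flags; the SSSE3 probe can be reproduced by hand like this (compiler name illustrative):
```
$ printf '#include <x86intrin.h>\nint main(){__m128i a=_mm_set1_epi8(1);(void)_mm_shuffle_epi8(a,a);}' \
    | cc -mssse3 -xc - -o /dev/null && echo "SSSE3 ok"
```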


@@ -1,111 +0,0 @@
if (USE_CPU_NATIVE)
# Detect best GNUCC_ARCH to tune for
if (CMAKE_COMPILER_IS_GNUCC)
message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
# If gcc doesn't recognise the host cpu, then mtune=native becomes
# generic, which isn't very good in some cases. march=native looks at
# cpuid info and then chooses the best microarch it can (and replaces
# the flag), so use that for tune.
set(TUNE_FLAG "mtune")
set(GNUCC_TUNE "")
message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ")
# arg1 might exist if using ccache
string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=native -${TUNE_FLAG}=native)
execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
OUTPUT_VARIABLE _GCC_OUTPUT)
set(_GCC_OUTPUT_TUNE ${_GCC_OUTPUT})
string(FIND "${_GCC_OUTPUT}" "${ARCH_FLAG}=" POS)
string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)
string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}")
string(FIND "${_GCC_OUTPUT_TUNE}" "${TUNE_FLAG}=" POS_TUNE)
string(SUBSTRING "${_GCC_OUTPUT_TUNE}" ${POS_TUNE} -1 _GCC_OUTPUT_TUNE)
string(REGEX REPLACE "${TUNE_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_TUNE "${_GCC_OUTPUT_TUNE}")
message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ")
# test the parsed flag
set (EXEC_ARGS ${CC_ARG1} -E - -${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE})
execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
OUTPUT_QUIET ERROR_QUIET
INPUT_FILE /dev/null
RESULT_VARIABLE GNUCC_TUNE_TEST)
if (NOT GNUCC_TUNE_TEST EQUAL 0)
message(WARNING "Something went wrong determining gcc tune: -mtune=${GNUCC_TUNE} not valid, falling back to -mtune=native")
set(GNUCC_TUNE native)
else()
set(GNUCC_TUNE ${GNUCC_TUNE})
message(STATUS "gcc will tune for ${GNUCC_ARCH}, ${GNUCC_TUNE}")
endif()
elseif (CMAKE_COMPILER_IS_CLANG)
if (ARCH_IA32 OR ARCH_X86_64)
set(GNUCC_ARCH x86-64-v2)
set(TUNE_FLAG generic)
elseif(ARCH_AARCH64)
if (BUILD_SVE2_BITPERM)
set(GNUCC_ARCH ${SVE2_BITPERM_ARCH})
elseif (BUILD_SVE2)
set(GNUCC_ARCH ${SVE2_ARCH})
elseif (BUILD_SVE)
set(GNUCC_ARCH ${SVE_ARCH})
else ()
set(GNUCC_ARCH ${ARMV8_ARCH})
endif()
set(TUNE_FLAG generic)
elseif(ARCH_ARM32)
set(GNUCC_ARCH armv7a)
set(TUNE_FLAG generic)
else()
set(GNUCC_ARCH native)
set(TUNE_FLAG generic)
endif()
message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
endif()
else()
if (SIMDE_BACKEND)
if (ARCH_IA32 OR ARCH_X86_64)
set(GNUCC_ARCH x86-64-v2)
set(TUNE_FLAG generic)
elseif(ARCH_AARCH64)
set(GNUCC_ARCH armv8-a)
set(TUNE_FLAG generic)
elseif(ARCH_ARM32)
set(GNUCC_ARCH armv7a)
set(TUNE_FLAG generic)
elseif(ARCH_PPC64EL)
set(GNUCC_ARCH power8)
set(TUNE_FLAG power8)
else()
set(GNUCC_ARCH x86-64-v2)
set(TUNE_FLAG generic)
endif()
elseif (ARCH_IA32 OR ARCH_X86_64)
set(GNUCC_ARCH ${X86_ARCH})
set(TUNE_FLAG generic)
elseif(ARCH_AARCH64)
if (BUILD_SVE2_BITPERM)
set(GNUCC_ARCH ${SVE2_BITPERM_ARCH})
elseif (BUILD_SVE2)
set(GNUCC_ARCH ${SVE2_ARCH})
elseif (BUILD_SVE)
set(GNUCC_ARCH ${SVE_ARCH})
else ()
set(GNUCC_ARCH ${ARMV8_ARCH})
endif()
set(TUNE_FLAG generic)
elseif(ARCH_ARM32)
set(GNUCC_ARCH armv7a)
set(TUNE_FLAG generic)
elseif(ARCH_PPC64EL)
set(GNUCC_ARCH power8)
set(TUNE_FLAG power8)
else()
set(GNUCC_ARCH native)
set(TUNE_FLAG native)
endif()
endif()
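The gcc branch above recovers what `-march=native` and `-mtune=native` resolve to by parsing gcc's target help output; the same query can be run manually:
```
$ gcc -Q --help=target -march=native -mtune=native | grep -E 'm(arch|tune)='
```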


@@ -15,21 +15,13 @@ SYMSFILE=$(mktemp -p /tmp ${PREFIX}_rename.syms.XXXXX)
 KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX)
 # find the libc used by gcc
 LIBC_SO=$("$@" --print-file-name=libc.so.6)
-NM_FLAG="-f"
-if [ `uname` = "FreeBSD" ]; then
-    # for freebsd, we will specify the name,
-    # we will leave it work as is in linux
-    LIBC_SO=/lib/libc.so.7
-    # also, in BSD, the nm flag -F corresponds to the -f flag in linux.
-    NM_FLAG="-F"
-fi
 cp ${KEEPSYMS_IN} ${KEEPSYMS}
 # get all symbols from libc and turn them into patterns
-nm ${NM_FLAG} p -g -D ${LIBC_SO} | sed 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS}
+nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS}
 # build the object
 "$@"
 # rename the symbols in the object
-nm ${NM_FLAG} p -g ${OUT} | cut -f1 -d' ' | grep -v -f ${KEEPSYMS} | sed -e "s/\(.*\)/\1\ ${PREFIX}_\1/" >> ${SYMSFILE}
+nm -f p -g ${OUT} | cut -f1 -d' ' | grep -v -f ${KEEPSYMS} | sed -e "s/\(.*\)/\1\ ${PREFIX}_\1/" >> ${SYMSFILE}
 if test -s ${SYMSFILE}
 then
     objcopy --redefine-syms=${SYMSFILE} ${OUT}
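The core of the script is `objcopy --redefine-syms`, which reads `old new` symbol pairs, one per line; a toy sketch of the renaming step (object name and prefix are hypothetical):
```
$ echo 'hs_valid_platform core2_hs_valid_platform' > syms.txt
$ objcopy --redefine-syms=syms.txt hs_core2.o
```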


@@ -1,93 +0,0 @@
if (NOT FAT_RUNTIME)
if (BUILD_SVE2_BITPERM)
message (STATUS "SVE2_BITPERM implies SVE2, enabling BUILD_SVE2")
set(BUILD_SVE2 ON)
endif ()
if (BUILD_SVE2)
message (STATUS "SVE2 implies SVE, enabling BUILD_SVE")
set(BUILD_SVE ON)
endif ()
endif ()
if (CMAKE_COMPILER_IS_GNUCXX)
set(ARMV9BASE_MINVER "12")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ARMV9BASE_MINVER)
set(SVE2_ARCH "armv8-a+sve2")
else()
set(SVE2_ARCH "armv9-a")
endif()
else()
set(SVE2_ARCH "armv9-a")
endif()
set(ARMV8_ARCH "armv8-a")
set(SVE_ARCH "${ARMV8_ARCH}+sve")
set(SVE2_BITPERM_ARCH "${SVE2_ARCH}+sve2-bitperm")
CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_C_ARM_NEON_H)
if (BUILD_SVE OR BUILD_SVE2 OR BUILD_SVE2_BITPERM OR FAT_RUNTIME)
set(CMAKE_REQUIRED_FLAGS "-march=${SVE_ARCH}")
CHECK_INCLUDE_FILE_CXX(arm_sve.h HAVE_C_ARM_SVE_H)
if (NOT HAVE_C_ARM_SVE_H)
message(FATAL_ERROR "arm_sve.h is required to build for SVE.")
endif()
endif()
CHECK_C_SOURCE_COMPILES("#include <arm_neon.h>
int main() {
int32x4_t a = vdupq_n_s32(1);
(void)a;
}" HAVE_NEON)
if (BUILD_SVE2_BITPERM)
set(CMAKE_REQUIRED_FLAGS "-march=${SVE2_BITPERM_ARCH}")
CHECK_C_SOURCE_COMPILES("#include <arm_sve.h>
int main() {
svuint8_t a = svbext(svdup_u8(1), svdup_u8(2));
(void)a;
}" HAVE_SVE2_BITPERM)
endif()
if (BUILD_SVE2)
set(CMAKE_REQUIRED_FLAGS "-march=${SVE2_ARCH}")
CHECK_C_SOURCE_COMPILES("#include <arm_sve.h>
int main() {
svuint8_t a = svbsl(svdup_u8(1), svdup_u8(2), svdup_u8(3));
(void)a;
}" HAVE_SVE2)
endif()
if (BUILD_SVE)
set(CMAKE_REQUIRED_FLAGS "-march=${SVE_ARCH}")
CHECK_C_SOURCE_COMPILES("#include <arm_sve.h>
int main() {
svuint8_t a = svdup_u8(1);
(void)a;
}" HAVE_SVE)
endif ()
if (FAT_RUNTIME)
if (NOT HAVE_NEON)
message(FATAL_ERROR "NEON support required to build fat runtime")
endif ()
if (BUILD_SVE AND NOT HAVE_SVE)
message(FATAL_ERROR "SVE support required to build fat runtime")
endif ()
if (BUILD_SVE2 AND NOT HAVE_SVE2)
message(FATAL_ERROR "SVE2 support required to build fat runtime")
endif ()
if (BUILD_SVE2_BITPERM AND NOT HAVE_SVE2_BITPERM)
message(FATAL_ERROR "SVE2 support required to build fat runtime")
endif ()
else (NOT FAT_RUNTIME)
if (NOT BUILD_SVE)
message(STATUS "Building without SVE support")
endif ()
if (NOT BUILD_SVE2)
message(STATUS "Building without SVE2 support")
endif ()
if (NOT HAVE_NEON)
message(FATAL_ERROR "Neon/ASIMD support required for Arm support")
endif ()
endif ()
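The checks above gate the SVE code paths on both the requested options and what the toolchain can actually compile; a hedged configure sketch for an SVE2 target:
```
$ cmake -DBUILD_SVE2=on ../    # implies BUILD_SVE, per the messages above
```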


@@ -1,106 +0,0 @@
# set compiler flags - more are tested and added later
set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra ")
set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra ")
if (NOT CMAKE_COMPILER_IS_CLANG)
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-new-ttp-matching")
endif()
# Always use -Werror, *also* during release builds
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wall -Werror")
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wall -Werror")
if (DISABLE_ASSERTS)
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG")
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG")
endif()
if(CMAKE_COMPILER_IS_GNUCC)
# spurious warnings?
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds ") #-Wno-maybe-uninitialized")
endif()
if(CMAKE_COMPILER_IS_GNUCXX)
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized -Wno-uninitialized")
endif()
CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
if(FREEBSD OR NETBSD)
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -gdwarf-4")
endif()
if(NETBSD)
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DHAVE_BUILTIN_POPCOUNT")
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DHAVE_BUILTIN_POPCOUNT")
endif()
if(MACOSX)
# Boost headers cause such complains on MacOS
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-deprecated-declarations -Wno-unused-parameter")
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-deprecated-declarations -Wno-unused-parameter")
endif()
# these end up in the config file
CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN)
CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN)
# are we using libc++
CHECK_CXX_SYMBOL_EXISTS(_LIBCPP_VERSION ciso646 HAVE_LIBCPP)
if (RELEASE_BUILD)
if (HAS_C_HIDDEN)
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fvisibility=hidden")
endif()
if (HAS_CXX_HIDDEN)
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fvisibility=hidden")
endif()
endif()
# testing a builtin takes a little more work
CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
# Clang does not use __builtin_constant_p() the same way as gcc
if (NOT CMAKE_COMPILER_IS_CLANG)
CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
endif()
# clang-14 complains about unused-but-set variable.
CHECK_CXX_COMPILER_FLAG("-Wunused-but-set-variable" CXX_UNUSED_BUT_SET_VAR)
if (CXX_UNUSED_BUT_SET_VAR)
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-but-set-variable")
endif()
CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_IGNORED_ATTR)
if(CMAKE_COMPILER_IS_GNUCC)
if (CXX_IGNORED_ATTR)
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes")
endif()
endif()
CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_NON_NULL)
if(CMAKE_COMPILER_IS_GNUCC)
if (CXX_NON_NULL)
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-nonnull")
endif()
endif()
# note this for later, g++ doesn't have this flag but clang does
CHECK_CXX_COMPILER_FLAG("-Wweak-vtables" CXX_WEAK_VTABLES)
CHECK_CXX_COMPILER_FLAG("-Wmissing-declarations" CXX_MISSING_DECLARATIONS)
CHECK_CXX_COMPILER_FLAG("-Wunused-local-typedefs" CXX_UNUSED_LOCAL_TYPEDEFS)
CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE)
# gcc complains about this
if(CMAKE_COMPILER_IS_GNUCC)
CHECK_C_COMPILER_FLAG("-Wstringop-overflow" CC_STRINGOP_OVERFLOW)
CHECK_CXX_COMPILER_FLAG("-Wstringop-overflow" CXX_STRINGOP_OVERFLOW)
if(CC_STRINGOP_OVERFLOW OR CXX_STRINGOP_OVERFLOW)
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-stringop-overflow -Wno-stringop-overread")
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-stringop-overflow -Wno-stringop-overread")
endif()
endif()


@@ -1,27 +0,0 @@
CHECK_INCLUDE_FILE_CXX(altivec.h HAVE_C_PPC64EL_ALTIVEC_H)
if (HAVE_C_PPC64EL_ALTIVEC_H)
set (INTRIN_INC_H "altivec.h")
else()
message (FATAL_ERROR "No intrinsics header found for VSX")
endif ()
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
int main() {
vector int a = vec_splat_s32(1);
(void)a;
}" HAVE_VSX)
if (NOT HAVE_VSX)
message(FATAL_ERROR "VSX support required for Power support")
endif ()
# fix unit-internal seg fault for freebsd and gcc13
if (FREEBSD AND CMAKE_COMPILER_IS_GNUCXX)
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "13")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libgcc -static-libstdc++")
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
endif ()
endif ()


@ -1,20 +0,0 @@
# determine compiler
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CMAKE_COMPILER_IS_CLANG TRUE)
set(CLANGCXX_MINVER "5")
message(STATUS "clang++ version ${CMAKE_CXX_COMPILER_VERSION}")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS CLANGCXX_MINVER)
message(FATAL_ERROR "A minimum of clang++ ${CLANGCXX_MINVER} is required for C++17 support")
endif()
string (REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)$" "\\1" CLANG_MAJOR_VERSION "${CMAKE_CXX_COMPILER_VERSION}")
endif()
# compiler version checks TODO: test more compilers
if (CMAKE_COMPILER_IS_GNUCXX)
set(GNUCXX_MINVER "9")
message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER)
message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++17 support")
endif()
endif()


@ -15,36 +15,12 @@
/* "Define if building for EM64T" */ /* "Define if building for EM64T" */
#cmakedefine ARCH_X86_64 #cmakedefine ARCH_X86_64
/* "Define if building for ARM32" */
#cmakedefine ARCH_ARM32
/* "Define if building for AARCH64" */
#cmakedefine ARCH_AARCH64
/* "Define if building for PPC64EL" */
#cmakedefine ARCH_PPC64EL
/* "Define if cross compiling for AARCH64" */
#cmakedefine CROSS_COMPILE_AARCH64
/* Define if building SVE for AARCH64. */
#cmakedefine BUILD_SVE
/* Define if building SVE2 for AARCH64. */
#cmakedefine BUILD_SVE2
/* Define if building SVE2+BITPERM for AARCH64. */
#cmakedefine BUILD_SVE2_BITPERM
/* internal build, switch on dump support. */ /* internal build, switch on dump support. */
#cmakedefine DUMP_SUPPORT #cmakedefine DUMP_SUPPORT
/* Define if building "fat" runtime. */ /* Define if building "fat" runtime. */
#cmakedefine FAT_RUNTIME #cmakedefine FAT_RUNTIME
/* Define if building AVX2 in the fat runtime. */
#cmakedefine BUILD_AVX2
/* Define if building AVX-512 in the fat runtime. */ /* Define if building AVX-512 in the fat runtime. */
#cmakedefine BUILD_AVX512 #cmakedefine BUILD_AVX512
@ -72,15 +48,6 @@
/* C compiler has intrin.h */ /* C compiler has intrin.h */
#cmakedefine HAVE_C_INTRIN_H #cmakedefine HAVE_C_INTRIN_H
/* C compiler has arm_neon.h */
#cmakedefine HAVE_C_ARM_NEON_H
/* C compiler has arm_sve.h */
#cmakedefine HAVE_C_ARM_SVE_H
/* C compiler has arm_neon.h */
#cmakedefine HAVE_C_PPC64EL_ALTIVEC_H
/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
0 if you don't. */ 0 if you don't. */
#cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP


@ -1,5 +1,5 @@
#!/usr/bin/env python
-from __future__ import print_function
import os
import sys
import datetime


@ -1,54 +0,0 @@
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
set(LINUX TRUE)
endif(CMAKE_SYSTEM_NAME MATCHES "Linux")
if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
set(FREEBSD true)
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
#FIXME: find a nicer and more general way of doing this
if(CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc13")
set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc13")
elseif(ARCH_AARCH64 AND (CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc12"))
set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc12")
endif()
endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
if(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
set(NETBSD true)
endif(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(MACOSX TRUE)
endif()
if (ARCH_IA32 OR ARCH_X86_64)
option(FAT_RUNTIME "Build a library that supports multiple microarchitectures" ON)
else()
option(FAT_RUNTIME "Build a library that supports multiple microarchitectures" OFF)
endif()
if (FAT_RUNTIME)
message("Checking Fat Runtime Requirements...")
if (USE_CPU_NATIVE AND FAT_RUNTIME)
message(FATAL_ERROR "Fat runtime is not compatible with Native CPU detection")
endif()
if (NOT (ARCH_IA32 OR ARCH_X86_64 OR ARCH_AARCH64))
message(FATAL_ERROR "Fat runtime is only supported on Intel and Aarch64 architectures")
else()
message(STATUS "Building Fat runtime for multiple microarchitectures")
message(STATUS "generator is ${CMAKE_GENERATOR}")
if (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR
(CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja")))
message (FATAL_ERROR "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher")
else()
include (${CMAKE_MODULE_PATH}/attrib.cmake)
if (NOT HAS_C_ATTR_IFUNC)
message(FATAL_ERROR "Compiler does not support ifunc attribute, cannot build fat runtime")
endif()
endif()
endif()
if (NOT RELEASE_BUILD)
message(FATAL_ERROR "Fat runtime is only built on Release builds")
endif()
endif ()


@ -30,7 +30,7 @@ if (PCRE_BUILD_SOURCE)
#if PCRE_MAJOR != ${PCRE_REQUIRED_MAJOR_VERSION} || PCRE_MINOR < ${PCRE_REQUIRED_MINOR_VERSION}
#error Incorrect pcre version
#endif
-int main(void) {return 0;}" CORRECT_PCRE_VERSION)
+main() {}" CORRECT_PCRE_VERSION)
set (CMAKE_REQUIRED_INCLUDES "${saved_INCLUDES}")
if (NOT CORRECT_PCRE_VERSION)


@ -1,12 +1,9 @@
# determine the target arch
# really only interested in the preprocessor here
-CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64)
-CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32)
-CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64)
-CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32)
-CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !(defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL)
-if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL)
-    set(ARCH_64_BIT TRUE)
-else()
-    set(ARCH_32_BIT TRUE)
-endif()
+CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT)
+CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT)
+set(ARCH_X86_64 ${ARCH_64_BIT})
+set(ARCH_IA32 ${ARCH_32_BIT})


@ -7,7 +7,7 @@ function(ragelmaker src_rl)
add_custom_command(
    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${src_dir}/${src_file}.cpp
    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${src_dir}
-    COMMAND ${RAGEL} ${CMAKE_CURRENT_SOURCE_DIR}/${src_rl} -o ${rl_out} -G0
+    COMMAND ${RAGEL} ${CMAKE_CURRENT_SOURCE_DIR}/${src_rl} -o ${rl_out}
    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${src_rl}
)
add_custom_target(ragel_${src_file} DEPENDS ${rl_out})


@ -1,40 +0,0 @@
# Possible values:
# - `address` (ASan)
# - `memory` (MSan)
# - `undefined` (UBSan)
# - "" (no sanitizing)
option (SANITIZE "Enable one of the code sanitizers" "")
set (SAN_FLAGS "${SAN_FLAGS} -g -fno-omit-frame-pointer -DSANITIZER")
if (SANITIZE)
if (SANITIZE STREQUAL "address")
set (ASAN_FLAGS "-fsanitize=address -fsanitize-address-use-after-scope")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}")
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_FLAGS}")
endif()
elseif (SANITIZE STREQUAL "memory")
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
message (FATAL_ERROR "GCC does not have memory sanitizer")
endif()
# MemorySanitizer flags are set according to the official documentation:
# https://clang.llvm.org/docs/MemorySanitizer.html#usage
set (MSAN_FLAGS "-fsanitize=memory -fsanitize-memory-use-after-dtor -fsanitize-memory-track-origins -fno-optimize-sibling-calls")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}")
elseif (SANITIZE STREQUAL "undefined")
set (UBSAN_FLAGS "-fsanitize=undefined")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${UBSAN_FLAGS}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${UBSAN_FLAGS}")
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined")
endif()
else ()
message (FATAL_ERROR "Unknown sanitizer type: ${SANITIZE}")
endif ()
endif()


@ -1,40 +0,0 @@
LIST(APPEND CMAKE_REQUIRED_INCLUDES ${PROJECT_SOURCE_DIR}/simde)
CHECK_INCLUDE_FILES(simde/x86/sse4.2.h SIMDE_SSE42_H_FOUND)
if (SIMDE_SSE42_H_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_BACKEND")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_BACKEND")
include_directories(${PROJECT_SOURCE_DIR}/simde)
if (CMAKE_COMPILER_IS_CLANG)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
if (ARCH_PPC64EL)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecated-altivec-src-compat")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-altivec-src-compat")
if (CLANG_MAJOR_VERSION EQUAL 15)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecate-lax-vec-conv-all")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecate-lax-vec-conv-all")
endif ()
endif()
endif()
if (BUILD_SSE2_SIMDE)
message("using BUILD_SSE2_SIMDE..")
set(SIMDE_NATIVE true)
set(ARCH_C_FLAGS "-msse2")
set(ARCH_CXX_FLAGS "-msse2")
set(X86_ARCH "x86-64")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DVS_SIMDE_BACKEND")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DVS_SIMDE_BACKEND")
endif()
if (SIMDE_NATIVE AND NOT BUILD_SSE2_SIMDE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
endif()
else()
message(FATAL_ERROR "SIMDe backend requested but SIMDe is not available on the system")
endif()


@ -1,19 +1,53 @@
#
-# sqlite is only used in hsbench, no need to special case its build, depend only on OS installations using pkg-config
+# a lot of noise to find sqlite
#
+option(SQLITE_PREFER_STATIC "Build sqlite3 statically instead of using an installed lib" OFF)
+if(NOT WIN32 AND NOT SQLITE_PREFER_STATIC)
+    find_package(PkgConfig QUIET)
# first check for sqlite on the system
pkg_check_modules(SQLITE3 sqlite3)
+endif()
+if (NOT SQLITE3_FOUND)
+    message(STATUS "looking for sqlite3 in source tree")
+    # look in the source tree
+    if (EXISTS "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.h" AND
+        EXISTS "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c")
+        message(STATUS " found sqlite3 in source tree")
+        set(SQLITE3_FOUND TRUE)
+        set(SQLITE3_BUILD_SOURCE TRUE)
+        set(SQLITE3_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/sqlite3")
+        set(SQLITE3_LDFLAGS sqlite3_static)
+    else()
+        message(STATUS " no sqlite3 in source tree")
+    endif()
+endif()
# now do version checks
if (SQLITE3_FOUND)
list(INSERT CMAKE_REQUIRED_INCLUDES 0 "${SQLITE3_INCLUDE_DIRS}")
-if (SQLITE_VERSION LESS "3.8.10")
+CHECK_C_SOURCE_COMPILES("#include <sqlite3.h>\n#if SQLITE_VERSION_NUMBER >= 3008007 && SQLITE_VERSION_NUMBER < 3008010\n#error broken sqlite\n#endif\nint main() {return 0;}" SQLITE_VERSION_OK)
+if (NOT SQLITE_VERSION_OK)
message(FATAL_ERROR "sqlite3 is broken from 3.8.7 to 3.8.10 - please find a working version")
endif()
+if (NOT SQLITE3_BUILD_SOURCE)
+    set(_SAVED_FLAGS ${CMAKE_REQUIRED_FLAGS})
list(INSERT CMAKE_REQUIRED_LIBRARIES 0 ${SQLITE3_LDFLAGS})
CHECK_SYMBOL_EXISTS(sqlite3_open_v2 sqlite3.h HAVE_SQLITE3_OPEN_V2)
list(REMOVE_ITEM CMAKE_REQUIRED_INCLUDES "${SQLITE3_INCLUDE_DIRS}")
list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES ${SQLITE3_LDFLAGS})
+else()
+    if (NOT TARGET sqlite3_static)
+        # build sqlite as a static lib to compile into our test programs
+        add_library(sqlite3_static STATIC "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c")
+        if (NOT WIN32)
+            set_target_properties(sqlite3_static PROPERTIES COMPILE_FLAGS "-Wno-error -Wno-extra -Wno-unused -Wno-cast-qual -DSQLITE_OMIT_LOAD_EXTENSION")
+        endif()
+    endif()
+endif()
endif()
+endif()
-# that's enough about sqlite


@ -1,15 +0,0 @@
unknownMacro:*gtest-all.cc
knownConditionTrueFalse:*Parser.rl
knownConditionTrueFalse:*Parser.cpp
variableScope:*Parser.rl
duplicateBreak:*.rl
unreadVariable:*control_verbs.cpp
unreachableCode:*rose_build_dump.cpp
*:*simde/*
assertWithSideEffect
syntaxError
internalError
checkersReport
missingInclude
missingIncludeSystem
unmatchedSuppression


@ -19,7 +19,6 @@ else()
set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
-set(SPHINX_MAN_DIR "${CMAKE_CURRENT_BINARY_DIR}/man")
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
    "${CMAKE_CURRENT_BINARY_DIR}/conf.py" @ONLY)
@ -33,14 +32,4 @@ add_custom_target(dev-reference
    "${SPHINX_HTML_DIR}"
    DEPENDS dev-reference-doxygen
    COMMENT "Building HTML dev reference with Sphinx")
-add_custom_target(dev-reference-man
-    ${SPHINX_BUILD}
-    -b man
-    -c "${CMAKE_CURRENT_BINARY_DIR}"
-    -d "${SPHINX_CACHE_DIR}"
-    "${CMAKE_CURRENT_SOURCE_DIR}"
-    "${SPHINX_MAN_DIR}"
-    DEPENDS dev-reference-doxygen
-    COMMENT "Building man page reference with Sphinx")
endif()


@ -11,10 +11,10 @@ Introduction
************
Chimera is a software regular expression matching engine that is a hybrid of
-Vectorscan and PCRE. The design goals of Chimera are to fully support PCRE
-syntax as well as to take advantage of the high performance nature of Vectorscan.
-Chimera inherits the design guideline of Vectorscan with C APIs for compilation
+Hyperscan and PCRE. The design goals of Chimera are to fully support PCRE
+syntax as well as to take advantage of the high performance nature of Hyperscan.
+Chimera inherits the design guideline of Hyperscan with C APIs for compilation
and scanning.
The Chimera API itself is composed of two major components:
@ -65,13 +65,13 @@ For a given database, Chimera provides several guarantees:
.. note:: Chimera is designed to have the same matching behavior as PCRE,
   including greedy/ungreedy, capturing, etc. Chimera reports both
   **start offset** and **end offset** for each match like PCRE. Different
-   from the fashion of reporting all matches in Vectorscan, Chimera only reports
+   from the fashion of reporting all matches in Hyperscan, Chimera only reports
   non-overlapping matches. For example, the pattern :regexp:`/foofoo/` will
   match ``foofoofoofoo`` at offsets (0, 6) and (6, 12).
-.. note:: Since Chimera is a hybrid of Vectorscan and PCRE in order to support
+.. note:: Since Chimera is a hybrid of Hyperscan and PCRE in order to support
   full PCRE syntax, there will be extra performance overhead compared to
-   Vectorscan-only solution. Please always use Vectorscan for better performance
+   Hyperscan-only solution. Please always use Hyperscan for better performance
   unless you must need full PCRE syntax support.
See :ref:`chruntime` for more details
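To make the offset behaviour above concrete, here is a minimal sketch of
Chimera usage, assuming the standard ``ch.h`` entry points; error handling is
abbreviated, and the pattern and callback names are invented for illustration: ::

    #include <stdio.h>
    #include <ch.h>

    /* Chimera reports both start ('from') and end ('to') offsets, like PCRE,
     * and only non-overlapping matches. */
    static ch_callback_t on_match(unsigned int id, unsigned long long from,
                                  unsigned long long to, unsigned int flags,
                                  unsigned int size,
                                  const ch_capture_t *captured, void *ctx) {
        printf("pattern %u matched at offsets (%llu, %llu)\n", id, from, to);
        return CH_CALLBACK_CONTINUE;
    }

    int main(void) {
        ch_database_t *db = NULL;
        ch_compile_error_t *err = NULL;
        /* CH_MODE_NOGROUPS: capture groups are not needed for this example. */
        if (ch_compile("foofoo", 0, CH_MODE_NOGROUPS, NULL, &db,
                       &err) != CH_SUCCESS) {
            fprintf(stderr, "compile failed: %s\n", err->message);
            ch_free_compile_error(err);
            return 1;
        }
        ch_scratch_t *scratch = NULL;
        ch_alloc_scratch(db, &scratch);
        const char data[] = "foofoofoofoo";
        /* Expect matches at (0, 6) and (6, 12), as described above. */
        ch_scan(db, data, sizeof(data) - 1, 0, scratch, on_match, NULL, NULL);
        ch_free_scratch(scratch);
        ch_free_database(db);
        return 0;
    }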
@ -83,12 +83,12 @@ Requirements
The PCRE library (http://pcre.org/) version 8.41 is required for Chimera.
.. note:: Since Chimera needs to reference PCRE internal function, please place PCRE source
-   directory under Vectorscan root directory in order to build Chimera.
+   directory under Hyperscan root directory in order to build Chimera.
-Beside this, both hardware and software requirements of Chimera are the same to Vectorscan.
+Beside this, both hardware and software requirements of Chimera are the same to Hyperscan.
See :ref:`hardware` and :ref:`software` for more details.
-.. note:: Building Vectorscan will automatically generate Chimera library.
+.. note:: Building Hyperscan will automatically generate Chimera library.
   Currently only static library is supported for Chimera, so please
   use static build type when configuring CMake build options.
@ -119,7 +119,7 @@ databases:
Compilation allows the Chimera library to analyze the given pattern(s) and
pre-determine how to scan for these patterns in an optimized fashion using
-Vectorscan and PCRE.
+Hyperscan and PCRE.
===============
Pattern Support
@ -134,7 +134,7 @@ Semantics
=========
Chimera supports the exact same semantics of PCRE library. Moreover, it supports
-multiple simultaneous pattern matching like Vectorscan and the multiple matches
+multiple simultaneous pattern matching like Hyperscan and the multiple matches
will be reported in order by end offset.
.. _chruntime:


@ -9,7 +9,7 @@ Compiling Patterns
Building a Database
*******************
-The Vectorscan compiler API accepts regular expressions and converts them into a
+The Hyperscan compiler API accepts regular expressions and converts them into a
compiled pattern database that can then be used to scan data.
The API provides three functions that compile regular expressions into
@ -24,7 +24,7 @@ databases:
#. :c:func:`hs_compile_ext_multi`: compiles an array of expressions as above,
   but allows :ref:`extparam` to be specified for each expression.
-Compilation allows the Vectorscan library to analyze the given pattern(s) and
+Compilation allows the Hyperscan library to analyze the given pattern(s) and
pre-determine how to scan for these patterns in an optimized fashion that would
be far too expensive to compute at run-time.
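As a hedged illustration of the compile-then-scan flow described above
(standard ``hs.h`` API; error handling abbreviated, and the pattern chosen
arbitrarily), a minimal block-mode example might look like this: ::

    #include <stdio.h>
    #include <hs.h>

    static int on_match(unsigned int id, unsigned long long from,
                        unsigned long long to, unsigned int flags, void *ctx) {
        printf("pattern %u matched ending at offset %llu\n", id, to);
        return 0; /* returning non-zero would halt scanning */
    }

    int main(void) {
        hs_database_t *db = NULL;
        hs_compile_error_t *err = NULL;
        /* Compile a single expression into a block-mode database. */
        if (hs_compile("fo+bar", HS_FLAG_DOTALL, HS_MODE_BLOCK, NULL, &db,
                       &err) != HS_SUCCESS) {
            fprintf(stderr, "compile failed: %s\n", err->message);
            hs_free_compile_error(err);
            return 1;
        }
        hs_scratch_t *scratch = NULL;
        hs_alloc_scratch(db, &scratch);
        const char data[] = "xxfooobarxx";
        hs_scan(db, data, sizeof(data) - 1, 0, scratch, on_match, NULL);
        hs_free_scratch(scratch);
        hs_free_database(db);
        return 0;
    }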
@ -48,10 +48,10 @@ To compile patterns to be used in streaming mode, the ``mode`` parameter of
block mode requires the use of :c:member:`HS_MODE_BLOCK` and vectored mode
requires the use of :c:member:`HS_MODE_VECTORED`. A pattern database compiled
for one mode (streaming, block or vectored) can only be used in that mode. The
-version of Vectorscan used to produce a compiled pattern database must match the
-version of Vectorscan used to scan with it.
-Vectorscan provides support for targeting a database at a particular CPU
+version of Hyperscan used to produce a compiled pattern database must match the
+version of Hyperscan used to scan with it.
+Hyperscan provides support for targeting a database at a particular CPU
platform; see :ref:`instr_specialization` for details.
=====================
@ -75,14 +75,14 @@ characters exist in regular grammar like ``[``, ``]``, ``(``, ``)``, ``{``,
While in pure literal case, all these meta characters lose extra meanings
except that they are just common ASCII codes.
-Vectorscan is initially designed to process common regular expressions. It is
+Hyperscan is initially designed to process common regular expressions. It is
hence embedded with a complex parser to do comprehensive regular grammar
interpretation. Particularly, the identification of above meta characters is the
basic step for the interpretation of far more complex regular grammars.
However in real cases, patterns may not always be regular expressions. They
could just be pure literals. Problem will come if the pure literals contain
-regular meta characters. Supposing fed directly into traditional Vectorscan
+regular meta characters. Supposing fed directly into traditional Hyperscan
compile API, all these meta characters will be interpreted in predefined ways,
which is unnecessary and the result is totally out of expectation. To avoid
such misunderstanding by traditional API, users have to preprocess these
@ -90,7 +90,7 @@ literal patterns by converting the meta characters into some other formats:
either by adding a backslash ``\`` before certain meta characters, or by
converting all the characters into a hexadecimal representation.
-In ``v5.2.0``, Vectorscan introduces 2 new compile APIs for pure literal patterns:
+In ``v5.2.0``, Hyperscan introduces 2 new compile APIs for pure literal patterns:
#. :c:func:`hs_compile_lit`: compiles a single pure literal into a pattern
   database.
@ -106,7 +106,7 @@ content directly into these APIs without worrying about writing regular meta
characters in their patterns. No preprocessing work is needed any more.
For new APIs, the ``length`` of each literal pattern is a newly added parameter.
-Vectorscan needs to locate the end position of the input expression via clearly
+Hyperscan needs to locate the end position of the input expression via clearly
knowing each literal's length, not by simply identifying character ``\0`` of a
string.
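A short sketch of the role the ``length`` parameter plays (the helper name
and the literal bytes below are invented for illustration; the literal
contains both a regex metacharacter and an embedded NUL, and neither needs
escaping with the literal API): ::

    #include <hs.h>

    hs_error_t compile_literal(hs_database_t **db, hs_compile_error_t **err) {
        /* Five literal bytes: '+' is matched as a plain byte (no regex
         * meaning), and the explicit length lets the literal carry an
         * embedded '\0'. */
        static const char lit[] = { 'a', '+', 'b', '\0', 'c' };
        return hs_compile_lit(lit, 0, sizeof(lit), HS_MODE_BLOCK, NULL,
                              db, err);
    }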
@ -127,19 +127,19 @@ Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_SINGLEMATCH`,
Pattern Support
***************
-Vectorscan supports the pattern syntax used by the PCRE library ("libpcre"),
+Hyperscan supports the pattern syntax used by the PCRE library ("libpcre"),
described at <http://www.pcre.org/>. However, not all constructs available in
libpcre are supported. The use of unsupported constructs will result in
compilation errors.
-The version of PCRE used to validate Vectorscan's interpretation of this syntax
+The version of PCRE used to validate Hyperscan's interpretation of this syntax
is 8.41 or above.
====================
Supported Constructs
====================
-The following regex constructs are supported by Vectorscan:
+The following regex constructs are supported by Hyperscan:
* Literal characters and strings, with all libpcre quoting and character
  escapes.
@ -177,7 +177,7 @@ The following regex constructs are supported by Vectorscan:
  :c:member:`HS_FLAG_SINGLEMATCH` flag is on for that pattern.
* Lazy modifiers (:regexp:`?` appended to another quantifier, e.g.
-  :regexp:`\\w+?`) are supported but ignored (as Vectorscan reports all
+  :regexp:`\\w+?`) are supported but ignored (as Hyperscan reports all
  matches).
* Parenthesization, including the named and unnamed capturing and
@ -219,15 +219,15 @@
.. note:: At this time, not all patterns can be successfully compiled with the
   :c:member:`HS_FLAG_SOM_LEFTMOST` flag, which enables per-pattern support for
   :ref:`som`. The patterns that support this flag are a subset of patterns that
-   can be successfully compiled with Vectorscan; notably, many bounded repeat
-   forms that can be compiled with Vectorscan without the Start of Match flag
+   can be successfully compiled with Hyperscan; notably, many bounded repeat
+   forms that can be compiled with Hyperscan without the Start of Match flag
   enabled cannot be compiled with the flag enabled.
======================
Unsupported Constructs
======================
-The following regex constructs are not supported by Vectorscan:
+The following regex constructs are not supported by Hyperscan:
* Backreferences and capturing sub-expressions.
* Arbitrary zero-width assertions.
@ -246,32 +246,32 @@
Semantics
*********
-While Vectorscan follows libpcre syntax, it provides different semantics. The
+While Hyperscan follows libpcre syntax, it provides different semantics. The
major departures from libpcre semantics are motivated by the requirements of
streaming and multiple simultaneous pattern matching.
The major departures from libpcre semantics are:
-#. **Multiple pattern matching**: Vectorscan allows matches to be reported for
+#. **Multiple pattern matching**: Hyperscan allows matches to be reported for
   several patterns simultaneously. This is not equivalent to separating the
   patterns by :regexp:`|` in libpcre, which evaluates alternations
   left-to-right.
-#. **Lack of ordering**: the multiple matches that Vectorscan produces are not
+#. **Lack of ordering**: the multiple matches that Hyperscan produces are not
   guaranteed to be ordered, although they will always fall within the bounds of
   the current scan.
-#. **End offsets only**: Vectorscan's default behaviour is only to report the end
+#. **End offsets only**: Hyperscan's default behaviour is only to report the end
   offset of a match. Reporting of the start offset can be enabled with
   per-expression flags at pattern compile time. See :ref:`som` for details.
#. **"All matches" reported**: scanning :regexp:`/foo.*bar/` against
-   ``fooxyzbarbar`` will return two matches from Vectorscan -- at the points
+   ``fooxyzbarbar`` will return two matches from Hyperscan -- at the points
   corresponding to the ends of ``fooxyzbar`` and ``fooxyzbarbar``. In contrast,
   libpcre semantics by default would report only one match at ``fooxyzbarbar``
   (greedy semantics) or, if non-greedy semantics were switched on, one match at
   ``fooxyzbar``. This means that switching between greedy and non-greedy
-   semantics is a no-op in Vectorscan.
+   semantics is a no-op in Hyperscan.
To support libpcre quantifier semantics while accurately reporting streaming
matches at the time they occur is impossible. For example, consider the pattern
@ -299,7 +299,7 @@ as in block 3 -- which would constitute a better match for the pattern.
Start of Match
==============
-In standard operation, Vectorscan will only provide the end offset of a match
+In standard operation, Hyperscan will only provide the end offset of a match
when the match callback is called. If the :c:member:`HS_FLAG_SOM_LEFTMOST` flag
is specified for a particular pattern, then the same set of matches is
returned, but each match will also provide the leftmost possible start offset
@ -308,7 +308,7 @@ corresponding to its end offset.
Using the SOM flag entails a number of trade-offs and limitations:
* Reduced pattern support: For many patterns, tracking SOM is complex and can
-  result in Vectorscan failing to compile a pattern with a "Pattern too
+  result in Hyperscan failing to compile a pattern with a "Pattern too
  large" error, even if the pattern is supported in normal operation.
* Increased stream state: At scan time, state space is required to track
  potential SOM offsets, and this must be stored in persistent stream state in
@ -316,20 +316,20 @@ Using the SOM flag entails a number of trade-offs and limitations:
  required to match a pattern.
* Performance overhead: Similarly, there is generally a performance cost
  associated with tracking SOM.
-* Incompatible features: Some other Vectorscan pattern flags (such as
+* Incompatible features: Some other Hyperscan pattern flags (such as
  :c:member:`HS_FLAG_SINGLEMATCH` and :c:member:`HS_FLAG_PREFILTER`) can not be
  used in combination with SOM. Specifying them together with
  :c:member:`HS_FLAG_SOM_LEFTMOST` will result in a compilation error.
In streaming mode, the amount of precision delivered by SOM can be controlled
-with the SOM horizon flags. These instruct Vectorscan to deliver accurate SOM
+with the SOM horizon flags. These instruct Hyperscan to deliver accurate SOM
information within a certain distance of the end offset, and return a special
start offset of :c:member:`HS_OFFSET_PAST_HORIZON` otherwise. Specifying a
small or medium SOM horizon will usually reduce the stream state required for a
given database.
.. note:: In streaming mode, the start offset returned for a match may refer to
-   a point in the stream *before* the current block being scanned. Vectorscan
+   a point in the stream *before* the current block being scanned. Hyperscan
   provides no facility for accessing earlier blocks; if the calling application
   needs to inspect historical data, then it must store it itself.
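As a hedged sketch of the flag in use (helper names invented; standard
``hs.h`` API assumed): with :c:member:`HS_FLAG_SOM_LEFTMOST` set at compile
time, the ``from`` argument of the match callback carries the leftmost start
offset, whereas without the flag only ``to`` is meaningful: ::

    #include <stdio.h>
    #include <hs.h>

    /* With HS_FLAG_SOM_LEFTMOST compiled in, 'from' is the leftmost possible
     * start offset corresponding to the reported end offset 'to'. */
    static int on_match_som(unsigned int id, unsigned long long from,
                            unsigned long long to, unsigned int flags,
                            void *ctx) {
        printf("pattern %u matched at (%llu, %llu)\n", id, from, to);
        return 0;
    }

    hs_error_t compile_with_som(hs_database_t **db, hs_compile_error_t **err) {
        return hs_compile("foo.*bar", HS_FLAG_SOM_LEFTMOST, HS_MODE_BLOCK,
                          NULL, db, err);
    }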
@ -341,7 +341,7 @@ Extended Parameters
In some circumstances, more control over the matching behaviour of a pattern is
required than can be specified easily using regular expression syntax. For
-these scenarios, Vectorscan provides the :c:func:`hs_compile_ext_multi` function
+these scenarios, Hyperscan provides the :c:func:`hs_compile_ext_multi` function
that allows a set of "extended parameters" to be set on a per-pattern basis.
Extended parameters are specified using an :c:type:`hs_expr_ext_t` structure,
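A hedged sketch of filling in such a structure (field and flag names per the
public :c:type:`hs_expr_ext_t` definition; the helper name and the values are
arbitrary): ::

    #include <string.h>
    #include <hs.h>

    /* Only fields whose corresponding bits are set in ext.flags are used. */
    hs_error_t compile_with_ext(hs_database_t **db, hs_compile_error_t **err) {
        hs_expr_ext_t ext;
        memset(&ext, 0, sizeof(ext));
        ext.flags = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_EDIT_DISTANCE;
        ext.min_offset = 16;   /* no match may end before offset 16 */
        ext.edit_distance = 2; /* approximate matching within distance 2 */

        const char *expr = "foobar";
        const unsigned flags = 0;
        const unsigned ids = 1;
        const hs_expr_ext_t *const exts[] = { &ext };
        return hs_compile_ext_multi(&expr, &flags, &ids, exts, 1,
                                    HS_MODE_BLOCK, NULL, db, err);
    }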
@ -383,18 +383,18 @@ section.
Prefiltering Mode
=================
-Vectorscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
-be used to implement a prefilter for a pattern that Vectorscan would not
+Hyperscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
+be used to implement a prefilter for a pattern that Hyperscan would not
ordinarily support.
-This flag instructs Vectorscan to compile an "approximate" version of this
-pattern for use in a prefiltering application, even if Vectorscan does not
+This flag instructs Hyperscan to compile an "approximate" version of this
+pattern for use in a prefiltering application, even if Hyperscan does not
support the pattern in normal operation.
The set of matches returned when this flag is used is guaranteed to be a
superset of the matches specified by the non-prefiltering expression.
-If the pattern contains pattern constructs not supported by Vectorscan (such as
+If the pattern contains pattern constructs not supported by Hyperscan (such as
zero-width assertions, back-references or conditional references) these
constructs will be replaced internally with broader constructs that may match
more often.
@ -404,7 +404,7 @@ back-reference :regexp:`\\1`. In prefiltering mode, this pattern might be
approximated by having its back-reference replaced with its referent, forming
:regexp:`/\\w+ again \\w+/`.
-Furthermore, in prefiltering mode Vectorscan may simplify a pattern that would
+Furthermore, in prefiltering mode Hyperscan may simplify a pattern that would
otherwise return a "Pattern too large" error at compile time, or for performance
reasons (subject to the matching guarantee above).
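A hedged sketch of the usual workflow (helper name invented): compile the
otherwise-unsupported pattern with :c:member:`HS_FLAG_PREFILTER`, then confirm
each candidate match with a full engine such as libpcre; the confirmation step
is only indicated by a comment here: ::

    #include <hs.h>

    /* Back-references are unsupported in normal operation, but the pattern
     * can be compiled as a prefilter; every true match is guaranteed to be
     * within the prefilter's match set, so candidates only need confirming. */
    hs_error_t compile_prefilter(hs_database_t **db, hs_compile_error_t **err) {
        return hs_compile("(\\w+) again \\1", HS_FLAG_PREFILTER,
                          HS_MODE_BLOCK, NULL, db, err);
    }

    /* In the match callback, re-check the candidate region with libpcre (or
     * another full regex engine) before reporting the match onward. */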
@ -422,22 +422,22 @@ matches for the pattern.
Instruction Set Specialization
******************************
-Vectorscan is able to make use of several modern instruction set features found
+Hyperscan is able to make use of several modern instruction set features found
on x86 processors to provide improvements in scanning performance.
Some of these features are selected when the library is built; for example,
-Vectorscan will use the native ``POPCNT`` instruction on processors where it is
+Hyperscan will use the native ``POPCNT`` instruction on processors where it is
available and the library has been optimized for the host architecture.
-.. note:: By default, the Vectorscan runtime is built with the ``-march=native``
+.. note:: By default, the Hyperscan runtime is built with the ``-march=native``
   compiler flag and (where possible) will make use of all instructions known by
   the host's C compiler.
-To use some instruction set features, however, Vectorscan must build a
+To use some instruction set features, however, Hyperscan must build a
specialized database to support them. This means that the target platform must
be specified at pattern compile time.
-The Vectorscan compiler API functions all accept an optional
+The Hyperscan compiler API functions all accept an optional
:c:type:`hs_platform_info_t` argument, which describes the target platform
for the database to be built. If this argument is NULL, the database will be
targeted at the current host platform.
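For example (a hedged sketch using the public platform API; the helper name is
invented), the current host can be described explicitly, or the structure can
instead be filled in by hand for a different target: ::

    #include <hs.h>

    hs_error_t compile_for_host(hs_database_t **db, hs_compile_error_t **err) {
        hs_platform_info_t plat;
        /* Describe the compiling host; equivalent to passing NULL below, but
         * the structure could instead be filled in for another target CPU. */
        if (hs_populate_platform(&plat) != HS_SUCCESS) {
            return HS_INVALID;
        }
        return hs_compile("foo.*bar", 0, HS_MODE_BLOCK, &plat, db, err);
    }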
@ -467,7 +467,7 @@ See :ref:`api_constants` for the full list of CPU tuning and feature flags.
Approximate matching
********************
-Vectorscan provides an experimental approximate matching mode, which will match
+Hyperscan provides an experimental approximate matching mode, which will match
patterns within a given edit distance. The exact matching behavior is defined as
follows:
@ -492,7 +492,7 @@ follows:
Here are a few examples of approximate matching:
-* Pattern :regexp:`/foo/` can match ``foo`` when using regular Vectorscan
+* Pattern :regexp:`/foo/` can match ``foo`` when using regular Hyperscan
  matching behavior. With approximate matching within edit distance 2, the
  pattern will produce matches when scanned against ``foo``, ``foooo``, ``f00``,
  ``f``, and anything else that lies within edit distance 2 of matching corpora
@ -513,7 +513,7 @@ matching support. Here they are, in a nutshell:
* Reduced pattern support:
  * For many patterns, approximate matching is complex and can result in
-    Vectorscan failing to compile a pattern with a "Pattern too large" error,
+    Hyperscan failing to compile a pattern with a "Pattern too large" error,
    even if the pattern is supported in normal operation.
  * Additionally, some patterns cannot be approximately matched because they
    reduce to so-called "vacuous" patterns (patterns that match everything). For
@ -548,7 +548,7 @@ Logical Combinations
********************
For situations when a user requires behaviour that depends on the presence or
-absence of matches from groups of patterns, Vectorscan provides support for the
+absence of matches from groups of patterns, Hyperscan provides support for the
logical combination of patterns in a given pattern set, with three operators:
``NOT``, ``AND`` and ``OR``.
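A hedged sketch of what such a pattern set can look like in code (IDs,
sub-expressions, and the helper name are invented; the combination itself is
the third pattern, flagged with :c:member:`HS_FLAG_COMBINATION` and written
with the ``!`` and ``&`` operator characters): ::

    #include <hs.h>

    /* Patterns 101 and 102 are ordinary expressions; pattern 1001 is a
     * logical combination that is true where 101 has matched and 102 has
     * not matched at the same offset. */
    hs_error_t compile_combination(hs_database_t **db,
                                   hs_compile_error_t **err) {
        const char *exprs[] = { "foo(bar)+", "abc.*def", "101 & !102" };
        const unsigned flags[] = { 0, 0, HS_FLAG_COMBINATION };
        const unsigned ids[] = { 101, 102, 1001 };
        return hs_compile_multi(exprs, flags, ids, 3, HS_MODE_BLOCK,
                                NULL, db, err);
    }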
@ -561,7 +561,7 @@ offset is *true* if the expression it refers to is *false* at this offset.
For example, ``NOT 101`` means that expression 101 has not yet matched at this
offset.
-A logical combination is passed to Vectorscan at compile time as an expression.
+A logical combination is passed to Hyperscan at compile time as an expression.
This combination expression will raise matches at every offset where one of its
sub-expressions matches and the logical value of the whole expression is *true*.
@ -603,7 +603,7 @@ In a logical combination expression:
* Whitespace is ignored.
To use a logical combination expression, it must be passed to one of the
-Vectorscan compile functions (:c:func:`hs_compile_multi`,
+Hyperscan compile functions (:c:func:`hs_compile_multi`,
:c:func:`hs_compile_ext_multi`) along with the :c:member:`HS_FLAG_COMBINATION` flag,
which identifies the pattern as a logical combination expression. The patterns
referred to in the logical combination expression must be compiled together in
@ -613,7 +613,7 @@ When an expression has the :c:member:`HS_FLAG_COMBINATION` flag set, it ignores
all other flags except the :c:member:`HS_FLAG_SINGLEMATCH` flag and the
:c:member:`HS_FLAG_QUIET` flag.
-Vectorscan will accept logical combination expressions at compile time that
+Hyperscan will accept logical combination expressions at compile time that
evaluate to *true* when no patterns have matched, and report the match for
combination at end of data if no patterns have matched; for example: ::


@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Vectorscan documentation build configuration file, created by
+# Hyperscan documentation build configuration file, created by
# sphinx-quickstart on Tue Sep 29 15:59:19 2015.
#
# This file is execfile()d with the current directory set to its
@ -43,8 +43,8 @@ source_suffix = '.rst'
master_doc = 'index'
# General information about the project.
-project = u'Vectorscan'
-copyright = u'2015-2020, Intel Corporation; 2020-2024, VectorCamp; and other contributors'
+project = u'Hyperscan'
+copyright = u'2015-2018, Intel Corporation'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@ -202,7 +202,7 @@ latex_elements = {
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
-    ('index', 'Hyperscan.tex', u'Vectorscan Documentation',
+    ('index', 'Hyperscan.tex', u'Hyperscan Documentation',
     u'Intel Corporation', 'manual'),
]
@ -232,8 +232,8 @@ latex_documents = [
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
-    ('index', 'vectorscan', u'Vectorscan Documentation',
-     [u'Intel Corporation'], 7)
+    ('index', 'hyperscan', u'Hyperscan Documentation',
+     [u'Intel Corporation'], 1)
]
# If true, show URL addresses after external links.
@ -246,8 +246,8 @@ man_pages = [
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
-    ('index', 'Vectorscan', u'Vectorscan Documentation',
-     u'Intel Corporation; VectorCamp', 'Vectorscan', 'High-performance regular expression matcher.',
+    ('index', 'Hyperscan', u'Hyperscan Documentation',
+     u'Intel Corporation', 'Hyperscan', 'High-performance regular expression matcher.',
     'Miscellaneous'),
]


@ -7,41 +7,43 @@ Getting Started
Very Quick Start Very Quick Start
**************** ****************
#. Clone Vectorscan :: #. Clone Hyperscan ::
cd <where-you-want-vectorscan-source> cd <where-you-want-hyperscan-source>
git clone https://github.com/VectorCamp/vectorscan git clone git://github.com/intel/hyperscan
#. Configure Vectorscan #. Configure Hyperscan
Ensure that you have the correct :ref:`dependencies <software>` present, Ensure that you have the correct :ref:`dependencies <software>` present,
and then: and then:
:: ::
cd <where-you-want-to-build-vectorscan> cd <where-you-want-to-build-hyperscan>
mkdir <build-dir> mkdir <build-dir>
cd <build-dir> cd <build-dir>
cmake [-G <generator>] [options] <vectorscan-source-path> cmake [-G <generator>] [options] <hyperscan-source-path>
Known working generators: Known working generators:
* ``Unix Makefiles`` --- make-compatible makefiles (default on Linux/FreeBSD/Mac OS X) * ``Unix Makefiles`` --- make-compatible makefiles (default on Linux/FreeBSD/Mac OS X)
* ``Ninja`` --- `Ninja <http://martine.github.io/ninja/>`_ build files. * ``Ninja`` --- `Ninja <http://martine.github.io/ninja/>`_ build files.
* ``Visual Studio 15 2017`` --- Visual Studio projects
Unsupported generators that might work include: Generators that might work include:
* ``Xcode`` --- OS X Xcode projects. * ``Xcode`` --- OS X Xcode projects.
#. Build Vectorscan #. Build Hyperscan
Depending on the generator used: Depending on the generator used:
* ``cmake --build .`` --- will build everything * ``cmake --build .`` --- will build everything
* ``make -j<jobs>`` --- use makefiles in parallel * ``make -j<jobs>`` --- use makefiles in parallel
* ``ninja`` --- use Ninja build * ``ninja`` --- use Ninja build
* ``MsBuild.exe`` --- use Visual Studio MsBuild
* etc. * etc.
#. Check Vectorscan #. Check Hyperscan
Run the Vectorscan unit tests: :: Run the Hyperscan unit tests: ::
bin/unit-hyperscan bin/unit-hyperscan
@ -53,23 +55,20 @@ Requirements
Hardware Hardware
======== ========
Vectorscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and Hyperscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
32-bit (IA-32 Architecture) modes as well as Arm v8.0+ aarch64, and POWER 8+ ppc64le 32-bit (IA-32 Architecture) modes.
machines.
Hyperscan is a high performance software library that takes advantage of recent Hyperscan is a high performance software library that takes advantage of recent
architecture advances. Intel architecture advances. At a minimum, support for Supplemental Streaming
SIMD Extensions 3 (SSSE3) is required, which should be available on any modern
x86 processor.
Additionally, Vectorscan can make use of: Additionally, Hyperscan can make use of:
* Intel Streaming SIMD Extensions 4.2 (SSE4.2) * Intel Streaming SIMD Extensions 4.2 (SSE4.2)
* the POPCNT instruction * the POPCNT instruction
* Bit Manipulation Instructions (BMI, BMI2) * Bit Manipulation Instructions (BMI, BMI2)
* Intel Advanced Vector Extensions 2 (Intel AVX2) * Intel Advanced Vector Extensions 2 (Intel AVX2)
* Arm NEON
* Arm SVE and SVE2
* Arm SVE2 BITPERM
* IBM Power8/Power9 VSX
if present. if present.
@ -80,34 +79,40 @@ These can be determined at library compile time, see :ref:`target_arch`.
Software Software
======== ========
As a software library, Vectorscan doesn't impose any particular runtime As a software library, Hyperscan doesn't impose any particular runtime
software requirements, however to build the Vectorscan library we require a software requirements, however to build the Hyperscan library we require a
modern C and C++ compiler -- in particular, Vectorscan requires C99 and C++17 modern C and C++ compiler -- in particular, Hyperscan requires C99 and C++11
compiler support. The supported compilers are: compiler support. The supported compilers are:
* GCC, v9 or higher * GCC, v4.8.1 or higher
* Clang, v5 or higher (with libstdc++ or libc++) * Clang, v3.4 or higher (with libstdc++ or libc++)
* Intel C++ Compiler v15 or higher
* Visual C++ 2017 Build Tools
Examples of operating systems that Vectorscan is known to work on include: Examples of operating systems that Hyperscan is known to work on include:
Linux: Linux:
* Ubuntu 20.04 LTS or newer * Ubuntu 14.04 LTS or newer
* RedHat/CentOS 7 or newer * RedHat/CentOS 7 or newer
* Fedora 38 or newer
* Debian 10
FreeBSD: FreeBSD:
* 10.0 or newer * 10.0 or newer
Windows:
* 8 or newer
Mac OS X: Mac OS X:
* 10.8 or newer, using XCode/Clang * 10.8 or newer, using XCode/Clang
Vectorscan *may* compile and run on other platforms, but there is no guarantee. Hyperscan *may* compile and run on other platforms, but there is no guarantee.
We currently have experimental support for Windows using Intel C++ Compiler
or Visual Studio 2017.
In addition, the following software is required for compiling the Vectorscan library: In addition, the following software is required for compiling the Hyperscan library:
======================================================= =========== ====================================== ======================================================= =========== ======================================
Dependency Version Notes Dependency Version Notes
@ -127,20 +132,20 @@ Ragel, you may use Cygwin to build it from source.
Boost Headers Boost Headers
------------- -------------
Compiling Vectorscan depends on a recent version of the Boost C++ header Compiling Hyperscan depends on a recent version of the Boost C++ header
library. If the Boost libraries are installed on the build machine in the library. If the Boost libraries are installed on the build machine in the
usual paths, CMake will find them. If the Boost libraries are not installed, usual paths, CMake will find them. If the Boost libraries are not installed,
the location of the Boost source tree can be specified during the CMake the location of the Boost source tree can be specified during the CMake
configuration step using the ``BOOST_ROOT`` variable (described below). configuration step using the ``BOOST_ROOT`` variable (described below).
Another alternative is to put a copy of (or a symlink to) the boost Another alternative is to put a copy of (or a symlink to) the boost
subdirectory in ``<vectorscanscan-source-path>/include/boost``. subdirectory in ``<hyperscan-source-path>/include/boost``.
For example: for the Boost-1.59.0 release: :: For example: for the Boost-1.59.0 release: ::
ln -s boost_1_59_0/boost <vectorscan-source-path>/include/boost ln -s boost_1_59_0/boost <hyperscan-source-path>/include/boost
As Vectorscan uses the header-only parts of Boost, it is not necessary to As Hyperscan uses the header-only parts of Boost, it is not necessary to
compile the Boost libraries. compile the Boost libraries.
CMake Configuration CMake Configuration
@ -163,12 +168,11 @@ Common options for CMake include:
| | Valid options are Debug, Release, RelWithDebInfo, | | | Valid options are Debug, Release, RelWithDebInfo, |
| | and MinSizeRel. Default is RelWithDebInfo. | | | and MinSizeRel. Default is RelWithDebInfo. |
+------------------------+----------------------------------------------------+ +------------------------+----------------------------------------------------+
| BUILD_SHARED_LIBS | Build Vectorscan as a shared library instead of | | BUILD_SHARED_LIBS | Build Hyperscan as a shared library instead of |
| | the default static library. | | | the default static library. |
| | Default: Off |
+------------------------+----------------------------------------------------+ +------------------------+----------------------------------------------------+
| BUILD_STATIC_LIBS | Build Vectorscan as a static library. | | BUILD_STATIC_AND_SHARED| Build both static and shared Hyperscan libs. |
| | Default: On | | | Default off. |
+------------------------+----------------------------------------------------+ +------------------------+----------------------------------------------------+
| BOOST_ROOT | Location of Boost source tree. | | BOOST_ROOT | Location of Boost source tree. |
+------------------------+----------------------------------------------------+ +------------------------+----------------------------------------------------+
@ -176,64 +180,12 @@ Common options for CMake include:
+------------------------+----------------------------------------------------+ +------------------------+----------------------------------------------------+
| FAT_RUNTIME | Build the :ref:`fat runtime<fat_runtime>`. Default | | FAT_RUNTIME | Build the :ref:`fat runtime<fat_runtime>`. Default |
| | true on Linux, not available elsewhere. | | | true on Linux, not available elsewhere. |
| | Default: Off |
+------------------------+----------------------------------------------------+
| USE_CPU_NATIVE | Native CPU detection is off by default; however, it|
| | is possible to build a performance-oriented non-fat|
| | library tuned to your CPU. |
| | Default: Off |
+------------------------+----------------------------------------------------+
| SANITIZE | Use libasan sanitizer to detect possible bugs. |
| | Valid options are address, memory and undefined. |
+------------------------+----------------------------------------------------+
| SIMDE_BACKEND | Enable SIMDe backend. If this is chosen, all native|
| | (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be |
| | disabled and a SIMDe SSE4.2 emulation backend will |
| | be enabled. This will enable Vectorscan to build |
| | and run on architectures without SIMD. |
| | Default: Off |
+------------------------+----------------------------------------------------+
| SIMDE_NATIVE | Enable SIMDe native emulation of x86 SSE4.2 |
| | intrinsics on the building platform. That is, |
| | SSE4.2 intrinsics will be emulated using Neon on |
| | an Arm platform, or VSX on a Power platform, etc. |
| | Default: Off |
+------------------------+----------------------------------------------------+
X86 platform specific options include:
+------------------------+----------------------------------------------------+
| Variable | Description |
+========================+====================================================+
| BUILD_AVX2 | Enable code for AVX2. |
+------------------------+----------------------------------------------------+
| BUILD_AVX512 | Enable code for AVX512. Implies BUILD_AVX2. |
+------------------------+----------------------------------------------------+
| BUILD_AVX512VBMI | Enable code for AVX512 with VBMI extension. Implies|
| | BUILD_AVX512. |
+------------------------+----------------------------------------------------+
Arm platform specific options include:
+------------------------+----------------------------------------------------+
| Variable | Description |
+========================+====================================================+
| BUILD_SVE | Enable code for SVE, like on AWS Graviton3 CPUs. |
| | Not much code is ported just for SVE, but enabling |
| | SVE code generation does improve the emitted code; |
| | see Benchmarks. |
+------------------------+----------------------------------------------------+
| BUILD_SVE2 | Enable code for SVE2, implies BUILD_SVE. Most |
| | non-Neon code is written for SVE2. |
+------------------------+----------------------------------------------------+
| BUILD_SVE2_BITPERM | Enable code for the SVE2_BITPERM hardware feature, |
| | implies BUILD_SVE2. |
+------------------------+----------------------------------------------------+ +------------------------+----------------------------------------------------+
For example, to generate a ``Debug`` build: :: For example, to generate a ``Debug`` build: ::
cd <build-dir> cd <build-dir>
cmake -DCMAKE_BUILD_TYPE=Debug <vectorscan-source-path> cmake -DCMAKE_BUILD_TYPE=Debug <hyperscan-source-path>
@ -241,7 +193,7 @@ Build Type
---------- ----------
CMake determines a number of features for a build based on the Build Type. CMake determines a number of features for a build based on the Build Type.
Vectorscan defaults to ``RelWithDebInfo``, i.e. "release with debugging Hyperscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
information". This is a performance optimized build without runtime assertions information". This is a performance optimized build without runtime assertions
but with debug symbols enabled. but with debug symbols enabled.
@ -249,7 +201,7 @@ The other types of builds are:
* ``Release``: as above, but without debug symbols * ``Release``: as above, but without debug symbols
* ``MinSizeRel``: a stripped release build * ``MinSizeRel``: a stripped release build
* ``Debug``: used when developing Vectorscan. Includes runtime assertions * ``Debug``: used when developing Hyperscan. Includes runtime assertions
(which has a large impact on runtime performance), and will also enable (which has a large impact on runtime performance), and will also enable
some other build features like building internal unit some other build features like building internal unit
tests. tests.
@ -259,7 +211,7 @@ The other types of builds are:
Target Architecture Target Architecture
------------------- -------------------
Unless using the :ref:`fat runtime<fat_runtime>`, by default Vectorscan will be Unless using the :ref:`fat runtime<fat_runtime>`, by default Hyperscan will be
compiled to target the instruction set of the processor of the machine that is compiled to target the instruction set of the processor of the machine that is
being used for compilation. This is done via the use of ``-march=native``. As a being used for compilation. This is done via the use of ``-march=native``. As a
result, a library built on one machine may not work on a result, a library built on one machine may not work on a
@ -271,7 +223,7 @@ CMake, or ``CMAKE_C_FLAGS`` and ``CMAKE_CXX_FLAGS`` on the CMake command line. F
example, to set the instruction subsets up to ``SSE4.2`` using GCC 4.8: :: example, to set the instruction subsets up to ``SSE4.2`` using GCC 4.8: ::
cmake -DCMAKE_C_FLAGS="-march=corei7" \ cmake -DCMAKE_C_FLAGS="-march=corei7" \
-DCMAKE_CXX_FLAGS="-march=corei7" <vectorscan-source-path> -DCMAKE_CXX_FLAGS="-march=corei7" <hyperscan-source-path>
For more information, refer to :ref:`instr_specialization`. For more information, refer to :ref:`instr_specialization`.
@ -280,17 +232,17 @@ For more information, refer to :ref:`instr_specialization`.
Fat Runtime Fat Runtime
----------- -----------
A feature introduced in Hyperscan v4.4 is the ability for the Vectorscan A feature introduced in Hyperscan v4.4 is the ability for the Hyperscan
library to dispatch the most appropriate runtime code for the host processor. library to dispatch the most appropriate runtime code for the host processor.
This feature is called the "fat runtime", as a single Vectorscan library This feature is called the "fat runtime", as a single Hyperscan library
contains multiple copies of the runtime code for different instruction sets. contains multiple copies of the runtime code for different instruction sets.
.. note:: .. note::
The fat runtime feature is only available on Linux. Release builds of The fat runtime feature is only available on Linux. Release builds of
Vectorscan will default to having the fat runtime enabled where supported. Hyperscan will default to having the fat runtime enabled where supported.
When building the library with the fat runtime, the Vectorscan runtime code When building the library with the fat runtime, the Hyperscan runtime code
will be compiled multiple times for these different instruction sets, and will be compiled multiple times for these different instruction sets, and
these compiled objects are combined into one library. There are no changes to these compiled objects are combined into one library. There are no changes to
how user applications are built against this library. how user applications are built against this library.
@ -302,11 +254,11 @@ resolved so that the right version of each API function is used. There is no
impact on function call performance, as this check and resolution is performed impact on function call performance, as this check and resolution is performed
by the ELF loader once when the binary is loaded. by the ELF loader once when the binary is loaded.
If the Vectorscan library is used on x86 systems without ``SSE4.2``, the runtime If the Hyperscan library is used on x86 systems without ``SSSE3``, the runtime
API functions will resolve to functions that return :c:member:`HS_ARCH_ERROR` API functions will resolve to functions that return :c:member:`HS_ARCH_ERROR`
instead of potentially executing illegal instructions. The API function instead of potentially executing illegal instructions. The API function
:c:func:`hs_valid_platform` can be used by application writers to determine if :c:func:`hs_valid_platform` can be used by application writers to determine if
the current platform is supported by Vectorscan. the current platform is supported by Hyperscan.
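As a minimal sketch (error handling reduced to a single message), an
application might guard against unsupported hosts before doing any further
work: ::

    #include <stdio.h>
    #include <hs.h>

    int main(void) {
        if (hs_valid_platform() != HS_SUCCESS) {
            fprintf(stderr, "Host CPU lacks the features this build requires.\n");
            return 1;
        }
        /* Safe to compile patterns and scan from here on. */
        return 0;
    }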
As of this release, the variants of the runtime that are built, and the CPU As of this release, the variants of the runtime that are built, and the CPU
capability that is required, are the following: capability that is required, are the following:
@ -347,11 +299,6 @@ capability that is required, are the following:
cmake -DBUILD_AVX512VBMI=on <...> cmake -DBUILD_AVX512VBMI=on <...>
Vectorscan adds support for Arm processors with SVE, SVE2 and SVE2_BITPERM.
For example: ::
cmake -DBUILD_SVE=ON -DBUILD_SVE2=ON -DBUILD_SVE2_BITPERM=ON <...>
As the fat runtime requires compiler, libc, and binutils support, at this time As the fat runtime requires compiler, libc, and binutils support, at this time
it will only be enabled for Linux builds where the compiler supports the it will only be enabled for Linux builds where the compiler supports the
`indirect function "ifunc" function attribute `indirect function "ifunc" function attribute
View File
@ -1,5 +1,5 @@
############################################### ###############################################
Vectorscan |version| Developer's Reference Guide Hyperscan |version| Developer's Reference Guide
############################################### ###############################################
------- -------
View File
@ -5,11 +5,11 @@
Introduction Introduction
############ ############
Vectorscan is a software regular expression matching engine designed with Hyperscan is a software regular expression matching engine designed with
high performance and flexibility in mind. It is implemented as a library that high performance and flexibility in mind. It is implemented as a library that
exposes a straightforward C API. exposes a straightforward C API.
The Vectorscan API itself is composed of two major components: The Hyperscan API itself is composed of two major components:
*********** ***********
Compilation Compilation
@ -17,7 +17,7 @@ Compilation
These functions take a group of regular expressions, along with identifiers and These functions take a group of regular expressions, along with identifiers and
option flags, and compile them into an immutable database that can be used by option flags, and compile them into an immutable database that can be used by
the Vectorscan scanning API. This compilation process performs considerable the Hyperscan scanning API. This compilation process performs considerable
analysis and optimization work in order to build a database that will match the analysis and optimization work in order to build a database that will match the
given expressions efficiently. given expressions efficiently.
@ -36,8 +36,8 @@ See :ref:`compilation` for more detail.
Scanning Scanning
******** ********
Once a Vectorscan database has been created, it can be used to scan data in Once a Hyperscan database has been created, it can be used to scan data in
memory. Vectorscan provides several scanning modes, depending on whether the memory. Hyperscan provides several scanning modes, depending on whether the
data to be scanned is available as a single contiguous block, whether it is data to be scanned is available as a single contiguous block, whether it is
distributed amongst several blocks in memory at the same time, or whether it is distributed amongst several blocks in memory at the same time, or whether it is
to be scanned as a sequence of blocks in a stream. to be scanned as a sequence of blocks in a stream.
@ -45,7 +45,7 @@ to be scanned as a sequence of blocks in a stream.
Matches are delivered to the application via a user-supplied callback function Matches are delivered to the application via a user-supplied callback function
that is called synchronously for each match. that is called synchronously for each match.
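As a sketch, a minimal handler matching the :c:type:`match_event_handler`
signature might simply count matches (the context pointer is whatever the
application passed to the scan call): ::

    static int on_match(unsigned int id, unsigned long long from,
                        unsigned long long to, unsigned int flags, void *ctx) {
        unsigned long long *count = ctx;  /* application-supplied context */
        ++*count;
        return 0;  /* returning non-zero would terminate scanning */
    }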
For a given database, Vectorscan provides several guarantees: For a given database, Hyperscan provides several guarantees:
* No memory allocations occur at runtime with the exception of two * No memory allocations occur at runtime with the exception of two
fixed-size allocations, both of which should be done ahead of time for fixed-size allocations, both of which should be done ahead of time for
@ -56,7 +56,7 @@ For a given database, Vectorscan provides several guarantees:
call. call.
- **Stream state**: in streaming mode only, some state space is required to - **Stream state**: in streaming mode only, some state space is required to
store data that persists between scan calls for each stream. This allows store data that persists between scan calls for each stream. This allows
Vectorscan to track matches that span multiple blocks of data. Hyperscan to track matches that span multiple blocks of data.
* The sizes of the scratch space and stream state (in streaming mode) required * The sizes of the scratch space and stream state (in streaming mode) required
for a given database are fixed and determined at database compile time. This for a given database are fixed and determined at database compile time. This
@ -64,7 +64,7 @@ For a given database, Vectorscan provides several guarantees:
time, and these structures can be pre-allocated if required for performance time, and these structures can be pre-allocated if required for performance
reasons. reasons.
* Any pattern that has successfully been compiled by the Vectorscan compiler can * Any pattern that has successfully been compiled by the Hyperscan compiler can
be scanned against any input. There are no internal resource limits or other be scanned against any input. There are no internal resource limits or other
limitations at runtime that could cause a scan call to return an error. limitations at runtime that could cause a scan call to return an error.
@ -74,12 +74,12 @@ See :ref:`runtime` for more detail.
Tools Tools
***** *****
Some utilities for testing and benchmarking Vectorscan are included with the Some utilities for testing and benchmarking Hyperscan are included with the
library. See :ref:`tools` for more information. library. See :ref:`tools` for more information.
************ ************
Example Code Example Code
************ ************
Some simple example code demonstrating the use of the Vectorscan API is Some simple example code demonstrating the use of the Hyperscan API is
available in the ``examples/`` subdirectory of the Vectorscan distribution. available in the ``examples/`` subdirectory of the Hyperscan distribution.
View File
@ -4,7 +4,7 @@
Performance Considerations Performance Considerations
########################## ##########################
Vectorscan supports a wide range of patterns in all three scanning modes. It is Hyperscan supports a wide range of patterns in all three scanning modes. It is
capable of extremely high levels of performance, but certain patterns can capable of extremely high levels of performance, but certain patterns can
reduce performance markedly. reduce performance markedly.
@ -25,7 +25,7 @@ For example, caseless matching of :regexp:`/abc/` can be written as:
* :regexp:`/(?i)abc(?-i)/` * :regexp:`/(?i)abc(?-i)/`
* :regexp:`/abc/i` * :regexp:`/abc/i`
Vectorscan is capable of handling all these constructs. Unless there is a Hyperscan is capable of handling all these constructs. Unless there is a
specific reason otherwise, do not rewrite patterns from one form to another. specific reason otherwise, do not rewrite patterns from one form to another.
As another example, matching of :regexp:`/foo(bar|baz)(frotz)?/` can be As another example, matching of :regexp:`/foo(bar|baz)(frotz)?/` can be
@ -41,24 +41,24 @@ Library usage
.. tip:: Do not hand-optimize library usage. .. tip:: Do not hand-optimize library usage.
The Vectorscan library is capable of dealing with small writes, unusually large The Hyperscan library is capable of dealing with small writes, unusually large
and small pattern sets, etc. Unless there is a specific performance problem and small pattern sets, etc. Unless there is a specific performance problem
with some usage of the library, it is best to use Vectorscan in a simple and with some usage of the library, it is best to use Hyperscan in a simple and
direct fashion. For example, it is unlikely for there to be much benefit in direct fashion. For example, it is unlikely for there to be much benefit in
buffering input to the library into larger blocks unless streaming writes are buffering input to the library into larger blocks unless streaming writes are
tiny (say, 1-2 bytes at a time). tiny (say, 1-2 bytes at a time).
Unlike many other pattern matching products, Vectorscan will run faster with Unlike many other pattern matching products, Hyperscan will run faster with
small numbers of patterns and slower with large numbers of patterns in a smooth small numbers of patterns and slower with large numbers of patterns in a smooth
fashion (as opposed to, typically, running at a moderate speed up to some fixed fashion (as opposed to, typically, running at a moderate speed up to some fixed
limit then either breaking or running half as fast). limit then either breaking or running half as fast).
Vectorscan also provides high-throughput matching with a single thread of Hyperscan also provides high-throughput matching with a single thread of
control per core; if a database runs at 3.0 Gbps in Vectorscan it means that a control per core; if a database runs at 3.0 Gbps in Hyperscan it means that a
3000-bit block of data will be scanned in 1 microsecond in a single thread of 3000-bit block of data will be scanned in 1 microsecond in a single thread of
control, not that it is required to scan 22 3000-bit blocks of data in 22 control, not that it is required to scan 22 3000-bit blocks of data in 22
microseconds. Thus, it is not usually necessary to buffer data to supply microseconds. Thus, it is not usually necessary to buffer data to supply
Vectorscan with available parallelism. Hyperscan with available parallelism.
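Concretely: 3.0 Gbps is 3,000,000,000 bits per second, so a single 3,000-bit
block takes 3,000 / 3,000,000,000 seconds, i.e. 1 microsecond, in one thread.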
******************** ********************
Block-based matching Block-based matching
@ -72,7 +72,7 @@ accumulated before processing, it should be scanned in block rather than in
streaming mode. streaming mode.
Unnecessary use of streaming mode reduces the number of optimizations that can Unnecessary use of streaming mode reduces the number of optimizations that can
be applied in Vectorscan and may make some patterns run slower. be applied in Hyperscan and may make some patterns run slower.
If there is a mixture of 'block' and 'streaming' mode patterns, these should be If there is a mixture of 'block' and 'streaming' mode patterns, these should be
scanned in separate databases except in the case that the streaming patterns scanned in separate databases except in the case that the streaming patterns
@ -107,7 +107,7 @@ Allocate scratch ahead of time
Scratch allocation is not necessarily a cheap operation. Since it is the first Scratch allocation is not necessarily a cheap operation. Since it is the first
time (after compilation or deserialization) that a pattern database is used, time (after compilation or deserialization) that a pattern database is used,
Vectorscan performs some validation checks inside :c:func:`hs_alloc_scratch` and Hyperscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
must also allocate memory. must also allocate memory.
Therefore, it is important to ensure that :c:func:`hs_alloc_scratch` is not Therefore, it is important to ensure that :c:func:`hs_alloc_scratch` is not
@ -329,7 +329,7 @@ Consequently, :regexp:`/foo.*bar/L` with a check on start of match values after
the callback is considerably more expensive and general than the callback is considerably more expensive and general than
:regexp:`/foo.{300}bar/`. :regexp:`/foo.{300}bar/`.
Similarly, the :cpp:member:`hs_expr_ext::min_length` extended parameter can be Similarly, the :c:member:`hs_expr_ext::min_length` extended parameter can be
used to specify a lower bound on the length of the matches for a pattern. Using used to specify a lower bound on the length of the matches for a pattern. Using
this facility may be more lightweight in some circumstances than using the SOM this facility may be more lightweight in some circumstances than using the SOM
flag and post-confirming match length in the calling application. flag and post-confirming match length in the calling application.
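As a sketch (error handling omitted), a single pattern can be compiled with a
minimum match length through the extended-parameter structure: ::

    hs_expr_ext_t ext;
    memset(&ext, 0, sizeof(ext));
    ext.flags = HS_EXT_FLAG_MIN_LENGTH;
    ext.min_length = 8;  /* only report matches of 8 or more bytes */

    const char *expr = "foo.*bar";
    const unsigned int flags = 0;
    const unsigned int id = 1;
    const hs_expr_ext_t *ext_ptr = &ext;

    hs_database_t *db = NULL;
    hs_compile_error_t *err = NULL;
    hs_compile_ext_multi(&expr, &flags, &id, &ext_ptr, 1, HS_MODE_BLOCK,
                         NULL, &db, &err);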
View File
@ -6,35 +6,35 @@ Preface
Overview Overview
******** ********
Vectorscan is a regular expression engine designed to offer high performance, the Hyperscan is a regular expression engine designed to offer high performance, the
ability to match multiple expressions simultaneously and flexibility in ability to match multiple expressions simultaneously and flexibility in
scanning operation. scanning operation.
Patterns are provided to a compilation interface which generates an immutable Patterns are provided to a compilation interface which generates an immutable
pattern database. The scan interface then can be used to scan a target data pattern database. The scan interface then can be used to scan a target data
buffer for the given patterns, returning any matching results from that data buffer for the given patterns, returning any matching results from that data
buffer. Vectorscan also provides a streaming mode, in which matches that span buffer. Hyperscan also provides a streaming mode, in which matches that span
several blocks in a stream are detected. several blocks in a stream are detected.
This document is designed to facilitate code-level integration of the Vectorscan This document is designed to facilitate code-level integration of the Hyperscan
library with existing or new applications. library with existing or new applications.
:ref:`intro` is a short overview of the Vectorscan library, with more detail on :ref:`intro` is a short overview of the Hyperscan library, with more detail on
the Vectorscan API provided in the subsequent sections: :ref:`compilation` and the Hyperscan API provided in the subsequent sections: :ref:`compilation` and
:ref:`runtime`. :ref:`runtime`.
:ref:`perf` provides details on various factors which may impact the :ref:`perf` provides details on various factors which may impact the
performance of a Vectorscan integration. performance of a Hyperscan integration.
:ref:`api_constants` and :ref:`api_files` provides a detailed summary of the :ref:`api_constants` and :ref:`api_files` provides a detailed summary of the
Vectorscan Application Programming Interface (API). Hyperscan Application Programming Interface (API).
******** ********
Audience Audience
******** ********
This guide is aimed at developers interested in integrating Vectorscan into an This guide is aimed at developers interested in integrating Hyperscan into an
application. For information on building the Vectorscan library, see the Quick application. For information on building the Hyperscan library, see the Quick
Start Guide. Start Guide.
*********** ***********
View File
@ -4,7 +4,7 @@
Scanning for Patterns Scanning for Patterns
##################### #####################
Vectorscan provides three different scanning modes, each with its own scan Hyperscan provides three different scanning modes, each with its own scan
function beginning with ``hs_scan``. In addition, streaming mode has a number function beginning with ``hs_scan``. In addition, streaming mode has a number
of other API functions for managing stream state. of other API functions for managing stream state.
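For orientation, a bare-bones block-mode scan, with error handling omitted and
``on_match`` standing in for an application callback of type
:c:type:`match_event_handler`, might look like: ::

    hs_database_t *db = NULL;
    hs_compile_error_t *compile_err = NULL;
    hs_compile("fo+bar", HS_FLAG_DOTALL, HS_MODE_BLOCK, NULL, &db, &compile_err);

    hs_scratch_t *scratch = NULL;
    hs_alloc_scratch(db, &scratch);

    unsigned long long matches = 0;
    const char *data = "xxfooobarxx";
    hs_scan(db, data, (unsigned int)strlen(data), 0, scratch, on_match, &matches);

    hs_free_scratch(scratch);
    hs_free_database(db);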
@ -33,8 +33,8 @@ See :c:type:`match_event_handler` for more information.
Streaming Mode Streaming Mode
************** **************
The core of the Vectorscan streaming runtime API consists of functions to open, The core of the Hyperscan streaming runtime API consists of functions to open,
scan, and close Vectorscan data streams: scan, and close Hyperscan data streams:
* :c:func:`hs_open_stream`: allocates and initializes a new stream for scanning. * :c:func:`hs_open_stream`: allocates and initializes a new stream for scanning.
@ -57,14 +57,14 @@ will return immediately with :c:member:`HS_SCAN_TERMINATED`. The caller must
still call :c:func:`hs_close_stream` to complete the clean-up process for that still call :c:func:`hs_close_stream` to complete the clean-up process for that
stream. stream.
Streams exist in the Vectorscan library so that pattern matching state can be Streams exist in the Hyperscan library so that pattern matching state can be
maintained across multiple blocks of target data -- without maintaining this maintained across multiple blocks of target data -- without maintaining this
state, it would not be possible to detect patterns that span these blocks of state, it would not be possible to detect patterns that span these blocks of
data. This, however, does come at the cost of requiring an amount of storage data. This, however, does come at the cost of requiring an amount of storage
per-stream (the size of this storage is fixed at compile time), and a slight per-stream (the size of this storage is fixed at compile time), and a slight
performance penalty in some cases to manage the state. performance penalty in some cases to manage the state.
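A sketch of that lifecycle for a single stream follows; ``db``, ``scratch``,
``on_match`` and the ``chunk``/``ctx`` names are placeholders, and error
handling is omitted: ::

    hs_stream_t *stream = NULL;
    hs_open_stream(db, 0, &stream);

    /* Each write may report matches, including matches spanning writes. */
    hs_scan_stream(stream, chunk1, chunk1_len, 0, scratch, on_match, &ctx);
    hs_scan_stream(stream, chunk2, chunk2_len, 0, scratch, on_match, &ctx);

    /* Closing delivers any outstanding end-anchored matches. */
    hs_close_stream(stream, scratch, on_match, &ctx);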
While Vectorscan does always support a strict ordering of multiple matches, While Hyperscan does always support a strict ordering of multiple matches,
streaming matches will not be delivered at offsets before the current stream streaming matches will not be delivered at offsets before the current stream
write, with the exception of zero-width asserts, where constructs such as write, with the exception of zero-width asserts, where constructs such as
:regexp:`\\b` and :regexp:`$` can cause a match on the final character of a :regexp:`\\b` and :regexp:`$` can cause a match on the final character of a
@ -76,7 +76,7 @@ Stream Management
================= =================
In addition to :c:func:`hs_open_stream`, :c:func:`hs_scan_stream`, and In addition to :c:func:`hs_open_stream`, :c:func:`hs_scan_stream`, and
:c:func:`hs_close_stream`, the Vectorscan API provides a number of other :c:func:`hs_close_stream`, the Hyperscan API provides a number of other
functions for the management of streams: functions for the management of streams:
* :c:func:`hs_reset_stream`: resets a stream to its initial state; this is * :c:func:`hs_reset_stream`: resets a stream to its initial state; this is
@ -98,10 +98,10 @@ A stream object is allocated as a fixed size region of memory which has been
sized to ensure that no memory allocations are required during scan sized to ensure that no memory allocations are required during scan
operations. When the system is under memory pressure, it may be useful to reduce operations. When the system is under memory pressure, it may be useful to reduce
the memory consumed by streams that are not expected to be used soon. The the memory consumed by streams that are not expected to be used soon. The
Vectorscan API provides calls for translating a stream to and from a compressed Hyperscan API provides calls for translating a stream to and from a compressed
representation for this purpose. The compressed representation differs from the representation for this purpose. The compressed representation differs from the
full stream object as it does not reserve space for components which are not full stream object as it does not reserve space for components which are not
required given the current stream state. The Vectorscan API functions for this required given the current stream state. The Hyperscan API functions for this
functionality are: functionality are:
* :c:func:`hs_compress_stream`: fills the provided buffer with a compressed * :c:func:`hs_compress_stream`: fills the provided buffer with a compressed
@ -157,7 +157,7 @@ scanned in block mode.
Scratch Space Scratch Space
************* *************
While scanning data, Vectorscan needs a small amount of temporary memory to store While scanning data, Hyperscan needs a small amount of temporary memory to store
on-the-fly internal data. This amount is unfortunately too large to fit on the on-the-fly internal data. This amount is unfortunately too large to fit on the
stack, particularly for embedded applications, and allocating memory dynamically stack, particularly for embedded applications, and allocating memory dynamically
is too expensive, so a pre-allocated "scratch" space must be provided to the is too expensive, so a pre-allocated "scratch" space must be provided to the
@ -170,7 +170,7 @@ databases, only a single scratch region is necessary: in this case, calling
will ensure that the scratch space is large enough to support scanning against will ensure that the scratch space is large enough to support scanning against
any of the given databases. any of the given databases.
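For example, one region can be sized for several databases by calling the
allocator once per database (a sketch; ``db_block`` and ``db_stream`` are
assumed to be previously compiled databases): ::

    hs_scratch_t *scratch = NULL;
    hs_alloc_scratch(db_block, &scratch);   /* allocates the region */
    hs_alloc_scratch(db_stream, &scratch);  /* enlarges it if necessary */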
While the Vectorscan library is re-entrant, the use of scratch spaces is not. While the Hyperscan library is re-entrant, the use of scratch spaces is not.
For example, if by design it is deemed necessary to run recursive or nested For example, if by design it is deemed necessary to run recursive or nested
scanning (say, from the match callback function), then an additional scratch scanning (say, from the match callback function), then an additional scratch
space is required for that context. space is required for that context.
@ -219,11 +219,11 @@ For example:
Custom Allocators Custom Allocators
***************** *****************
By default, structures used by Vectorscan at runtime (scratch space, stream By default, structures used by Hyperscan at runtime (scratch space, stream
state, etc) are allocated with the default system allocators, usually state, etc) are allocated with the default system allocators, usually
``malloc()`` and ``free()``. ``malloc()`` and ``free()``.
The Vectorscan API provides a facility for changing this behaviour to support The Hyperscan API provides a facility for changing this behaviour to support
applications that use custom memory allocators. applications that use custom memory allocators.
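As a sketch, installing a process-wide allocator pair (here just thin
wrappers over ``malloc`` and ``free``) might look like: ::

    static void *my_alloc(size_t size) {
        return malloc(size);  /* e.g. redirect to a pool allocator */
    }

    static void my_free(void *ptr) {
        free(ptr);
    }

    /* Route the library's runtime allocations through the pair above. */
    hs_set_allocator(my_alloc, my_free);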
These functions are: These functions are:
View File
@ -4,7 +4,7 @@
Serialization Serialization
############# #############
For some applications, compiling Vectorscan pattern databases immediately prior For some applications, compiling Hyperscan pattern databases immediately prior
to use is not an appropriate design. Some users may wish to: to use is not an appropriate design. Some users may wish to:
* Compile pattern databases on a different host; * Compile pattern databases on a different host;
@ -14,9 +14,9 @@ to use is not an appropriate design. Some users may wish to:
* Control the region of memory in which the compiled database is located. * Control the region of memory in which the compiled database is located.
Vectorscan pattern databases are not completely flat in memory: they contain Hyperscan pattern databases are not completely flat in memory: they contain
pointers and have specific alignment requirements. Therefore, they cannot be pointers and have specific alignment requirements. Therefore, they cannot be
copied (or otherwise relocated) directly. To enable these use cases, Vectorscan copied (or otherwise relocated) directly. To enable these use cases, Hyperscan
provides functionality for serializing and deserializing compiled pattern provides functionality for serializing and deserializing compiled pattern
databases. databases.
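A sketch of a round trip through the serialized form (error handling omitted;
the flat byte buffer is allocated by the library and released by the caller): ::

    char *bytes = NULL;
    size_t length = 0;
    hs_serialize_database(db, &bytes, &length);

    /* ... store or transmit the bytes/length buffer ... */

    hs_database_t *db2 = NULL;
    hs_deserialize_database(bytes, length, &db2);
    free(bytes);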
@ -40,10 +40,10 @@ The API provides the following functions:
returns a string containing information about the database. This call is returns a string containing information about the database. This call is
analogous to :c:func:`hs_database_info`. analogous to :c:func:`hs_database_info`.
.. note:: Vectorscan performs both version and platform compatibility checks .. note:: Hyperscan performs both version and platform compatibility checks
upon deserialization. The :c:func:`hs_deserialize_database` and upon deserialization. The :c:func:`hs_deserialize_database` and
:c:func:`hs_deserialize_database_at` functions will only permit the :c:func:`hs_deserialize_database_at` functions will only permit the
deserialization of databases compiled with (a) the same version of Vectorscan deserialization of databases compiled with (a) the same version of Hyperscan
and (b) platform features supported by the current host platform. See and (b) platform features supported by the current host platform. See
:ref:`instr_specialization` for more information on platform specialization. :ref:`instr_specialization` for more information on platform specialization.
@ -51,17 +51,17 @@ The API provides the following functions:
The Runtime Library The Runtime Library
=================== ===================
The main Vectorscan library (``libhs``) contains both the compiler and runtime The main Hyperscan library (``libhs``) contains both the compiler and runtime
portions of the library. This means that in order to support the Vectorscan portions of the library. This means that in order to support the Hyperscan
compiler, which is written in C++, it requires C++ linkage and has a compiler, which is written in C++, it requires C++ linkage and has a
dependency on the C++ standard library. dependency on the C++ standard library.
Many embedded applications require only the scanning ("runtime") portion of the Many embedded applications require only the scanning ("runtime") portion of the
Vectorscan library. In these cases, pattern compilation generally takes place on Hyperscan library. In these cases, pattern compilation generally takes place on
another host, and serialized pattern databases are delivered to the application another host, and serialized pattern databases are delivered to the application
for use. for use.
To support these applications without requiring the C++ dependency, a To support these applications without requiring the C++ dependency, a
runtime-only version of the Vectorscan library, called ``libhs_runtime``, is also runtime-only version of the Hyperscan library, called ``libhs_runtime``, is also
distributed. This library does not depend on the C++ standard library and distributed. This library does not depend on the C++ standard library and
provides all Vectorscan functions other than those used to compile databases. provides all Hyperscan functions other than those used to compile databases.
View File
@ -4,14 +4,14 @@
Tools Tools
##### #####
This section describes the set of utilities included with the Vectorscan library. This section describes the set of utilities included with the Hyperscan library.
******************** ********************
Quick Check: hscheck Quick Check: hscheck
******************** ********************
The ``hscheck`` tool allows the user to quickly check whether Vectorscan supports The ``hscheck`` tool allows the user to quickly check whether Hyperscan supports
a group of patterns. If a pattern is rejected by Vectorscan's compiler, the a group of patterns. If a pattern is rejected by Hyperscan's compiler, the
compile error is provided on standard output. compile error is provided on standard output.
For example, given the following three patterns (the last of which contains a For example, given the following three patterns (the last of which contains a
@ -34,7 +34,7 @@ syntax error) in a file called ``/tmp/test``::
Benchmarker: hsbench Benchmarker: hsbench
******************** ********************
The ``hsbench`` tool provides an easy way to measure Vectorscan's performance The ``hsbench`` tool provides an easy way to measure Hyperscan's performance
for a particular set of patterns and corpus of data to be scanned. for a particular set of patterns and corpus of data to be scanned.
Patterns are supplied in the format described below in Patterns are supplied in the format described below in
@ -44,7 +44,7 @@ easy control of how a corpus is broken into blocks and streams.
.. note:: A group of Python scripts for constructing corpora databases from .. note:: A group of Python scripts for constructing corpora databases from
various input types, such as PCAP network traffic captures or text files, can various input types, such as PCAP network traffic captures or text files, can
be found in the Vectorscan source tree in ``tools/hsbench/scripts``. be found in the Hyperscan source tree in ``tools/hsbench/scripts``.
Running hsbench Running hsbench
=============== ===============
@ -56,7 +56,7 @@ produce output like this::
$ hsbench -e /tmp/patterns -c /tmp/corpus.db $ hsbench -e /tmp/patterns -c /tmp/corpus.db
Signatures: /tmp/patterns Signatures: /tmp/patterns
Vectorscan info: Version: 5.4.11 Features: AVX2 Mode: STREAM Hyperscan info: Version: 4.3.1 Features: AVX2 Mode: STREAM
Expression count: 200 Expression count: 200
Bytecode size: 342,540 bytes Bytecode size: 342,540 bytes
Database CRC: 0x6cd6b67c Database CRC: 0x6cd6b67c
@ -77,7 +77,7 @@ takes to perform all twenty scans. The number of repeats can be changed with the
``-n`` argument, and the results of each scan will be displayed if the ``-n`` argument, and the results of each scan will be displayed if the
``--per-scan`` argument is specified. ``--per-scan`` argument is specified.
To benchmark Vectorscan on more than one core, you can supply a list of cores To benchmark Hyperscan on more than one core, you can supply a list of cores
with the ``-T`` argument, which will instruct ``hsbench`` to start one with the ``-T`` argument, which will instruct ``hsbench`` to start one
benchmark thread per core given and compute the throughput from the time taken benchmark thread per core given and compute the throughput from the time taken
to complete all of them. to complete all of them.
@ -91,17 +91,17 @@ Correctness Testing: hscollider
******************************* *******************************
The ``hscollider`` tool, or Pattern Collider, provides a way to verify The ``hscollider`` tool, or Pattern Collider, provides a way to verify
Vectorscan's matching behaviour. It does this by compiling and scanning patterns Hyperscan's matching behaviour. It does this by compiling and scanning patterns
(either singly or in groups) against known corpora and comparing the results (either singly or in groups) against known corpora and comparing the results
against another engine (the "ground truth"). Two sources of ground truth for against another engine (the "ground truth"). Two sources of ground truth for
comparison are available: comparison are available:
* The PCRE library (http://pcre.org/). * The PCRE library (http://pcre.org/).
* An NFA simulation run on Vectorscan's compile-time graph representation. This * An NFA simulation run on Hyperscan's compile-time graph representation. This
is used if PCRE cannot support the pattern or if PCRE execution fails due to is used if PCRE cannot support the pattern or if PCRE execution fails due to
a resource limit. a resource limit.
Much of Vectorscan's testing infrastructure is built on ``hscollider``, and the Much of Hyperscan's testing infrastructure is built on ``hscollider``, and the
tool is designed to take advantage of multiple cores and provide considerable tool is designed to take advantage of multiple cores and provide considerable
flexibility in controlling the test. These options are described in the help flexibility in controlling the test. These options are described in the help
(``hscollider -h``) and include: (``hscollider -h``) and include:
@ -116,11 +116,11 @@ flexibility in controlling the test. These options are described in the help
Using hscollider to debug a pattern Using hscollider to debug a pattern
=================================== ===================================
One common use-case for ``hscollider`` is to determine whether Vectorscan will One common use-case for ``hscollider`` is to determine whether Hyperscan will
match a pattern in the expected location, and whether this accords with PCRE's match a pattern in the expected location, and whether this accords with PCRE's
behaviour for the same case. behaviour for the same case.
Here is an example. We put our pattern in a file in Vectorscan's pattern Here is an example. We put our pattern in a file in Hyperscan's pattern
format:: format::
$ cat /tmp/pat $ cat /tmp/pat
@ -172,7 +172,7 @@ individual matches are displayed in the output::
Total elapsed time: 0.00522815 secs. Total elapsed time: 0.00522815 secs.
We can see from this output that both PCRE and Vectorscan find matches ending at We can see from this output that both PCRE and Hyperscan find matches ending at
offset 33 and 45, and so ``hscollider`` considers this test case to have offset 33 and 45, and so ``hscollider`` considers this test case to have
passed. passed.
@ -180,13 +180,13 @@ passed.
corpus alignment 0, and ``-T 1`` instructs us to only use one thread.) corpus alignment 0, and ``-T 1`` instructs us to only use one thread.)
.. note:: In default operation, PCRE produces only one match for a scan, unlike .. note:: In default operation, PCRE produces only one match for a scan, unlike
Vectorscan's automata semantics. The ``hscollider`` tool uses libpcre's Hyperscan's automata semantics. The ``hscollider`` tool uses libpcre's
"callout" functionality to match Vectorscan's semantics. "callout" functionality to match Hyperscan's semantics.
Running a larger scan test Running a larger scan test
========================== ==========================
A set of patterns for testing purposes are distributed with Vectorscan, and these A set of patterns for testing purposes are distributed with Hyperscan, and these
can be tested via ``hscollider`` on an in-tree build. Two CMake targets are can be tested via ``hscollider`` on an in-tree build. Two CMake targets are
provided to do this easily: provided to do this easily:
@ -202,10 +202,10 @@ Debugging: hsdump
***************** *****************
When built in debug mode (using the CMake directive ``CMAKE_BUILD_TYPE`` set to When built in debug mode (using the CMake directive ``CMAKE_BUILD_TYPE`` set to
``Debug``), Vectorscan includes support for dumping information about its ``Debug``), Hyperscan includes support for dumping information about its
internals during pattern compilation with the ``hsdump`` tool. internals during pattern compilation with the ``hsdump`` tool.
This information is mostly of use to Vectorscan developers familiar with the This information is mostly of use to Hyperscan developers familiar with the
library's internal structure, but can be used to diagnose issues with patterns library's internal structure, but can be used to diagnose issues with patterns
and provide more information in bug reports. and provide more information in bug reports.
@ -215,7 +215,7 @@ and provide more information in bug reports.
Pattern Format Pattern Format
************** **************
All of the Vectorscan tools accept patterns in the same format, read from plain All of the Hyperscan tools accept patterns in the same format, read from plain
text files with one pattern per line. Each line looks like this: text files with one pattern per line. Each line looks like this:
* ``<integer id>:/<regex>/<flags>`` * ``<integer id>:/<regex>/<flags>``
@ -227,12 +227,12 @@ For example::
3:/^.{10,20}hatstand/m 3:/^.{10,20}hatstand/m
The integer ID is the value that will be reported when a match is found by The integer ID is the value that will be reported when a match is found by
Vectorscan and must be unique. Hyperscan and must be unique.
The pattern itself is a regular expression in PCRE syntax; see The pattern itself is a regular expression in PCRE syntax; see
:ref:`compilation` for more information on supported features. :ref:`compilation` for more information on supported features.
The flags are single characters that map to Vectorscan flags as follows: The flags are single characters that map to Hyperscan flags as follows:
========= ================================= =========== ========= ================================= ===========
Character API Flag Description Character API Flag Description
@ -256,7 +256,7 @@ between braces, separated by commas. For example::
1:/hatstand.*teakettle/s{min_offset=50,max_offset=100} 1:/hatstand.*teakettle/s{min_offset=50,max_offset=100}
All Vectorscan tools will accept a pattern file (or a directory containing All Hyperscan tools will accept a pattern file (or a directory containing
pattern files) with the ``-e`` argument. If no further arguments constraining pattern files) with the ``-e`` argument. If no further arguments constraining
the pattern set are given, all patterns in those files are used. the pattern set are given, all patterns in those files are used.
View File
@ -1,6 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2021, Intel Corporation
* Copyright (c) 2024, VectorCamp PC
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -113,10 +112,10 @@
* *
*/ */
#include <random>
#include <algorithm> #include <algorithm>
#include <cstring> #include <cstring>
#include <chrono> #include <chrono>
#include <climits>
#include <fstream> #include <fstream>
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
@ -135,12 +134,7 @@
#include <netinet/tcp.h> #include <netinet/tcp.h>
#include <netinet/udp.h> #include <netinet/udp.h>
#include <netinet/ip_icmp.h> #include <netinet/ip_icmp.h>
#ifdef __NetBSD__
#include <net/ethertypes.h>
#include <net/if_ether.h>
#else
#include <net/ethernet.h> #include <net/ethernet.h>
#endif /* __NetBSD__ */
#include <arpa/inet.h> #include <arpa/inet.h>
#include <pcap.h> #include <pcap.h>
@ -158,8 +152,6 @@ using std::set;
using std::min; using std::min;
using std::max; using std::max;
using std::copy; using std::copy;
using std::random_device;
using std::mt19937;
enum Criterion { enum Criterion {
CRITERION_THROUGHPUT, CRITERION_THROUGHPUT,
@ -202,15 +194,15 @@ struct FiveTuple {
unsigned int dstPort; unsigned int dstPort;
// Construct a FiveTuple from a TCP or UDP packet. // Construct a FiveTuple from a TCP or UDP packet.
explicit FiveTuple(const struct ip *iphdr) { FiveTuple(const struct ip *iphdr) {
// IP fields // IP fields
protocol = iphdr->ip_p; protocol = iphdr->ip_p;
srcAddr = iphdr->ip_src.s_addr; srcAddr = iphdr->ip_src.s_addr;
dstAddr = iphdr->ip_dst.s_addr; dstAddr = iphdr->ip_dst.s_addr;
// UDP/TCP ports // UDP/TCP ports
const struct udphdr *uh = reinterpret_cast<const struct udphdr *> const struct udphdr *uh = (const struct udphdr *)
((reinterpret_cast<const char *>(iphdr)) + (iphdr->ip_hl * 4)); (((const char *)iphdr) + (iphdr->ip_hl * 4));
srcPort = uh->uh_sport; srcPort = uh->uh_sport;
dstPort = uh->uh_dport; dstPort = uh->uh_dport;
} }
@ -239,7 +231,7 @@ static
int onMatch(unsigned int id, unsigned long long from, unsigned long long to, int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
unsigned int flags, void *ctx) { unsigned int flags, void *ctx) {
// Our context points to a size_t storing the match count // Our context points to a size_t storing the match count
size_t *matches = static_cast<size_t *>(ctx); size_t *matches = (size_t *)ctx;
(*matches)++; (*matches)++;
return 0; // continue matching return 0; // continue matching
} }
@ -301,7 +293,7 @@ public:
// database. // database.
hs_error_t err = hs_alloc_scratch(db, &scratch); hs_error_t err = hs_alloc_scratch(db, &scratch);
if (err != HS_SUCCESS) { if (err != HS_SUCCESS) {
cerr << "ERROR: could not allocate scratch space. Exiting.\n"; cerr << "ERROR: could not allocate scratch space. Exiting." << endl;
exit(-1); exit(-1);
} }
} }
@ -313,7 +305,8 @@ public:
size_t scratch_size; size_t scratch_size;
hs_error_t err = hs_scratch_size(scratch, &scratch_size); hs_error_t err = hs_scratch_size(scratch, &scratch_size);
if (err != HS_SUCCESS) { if (err != HS_SUCCESS) {
cerr << "ERROR: could not query scratch space size. Exiting.\n"; cerr << "ERROR: could not query scratch space size. Exiting."
<< endl;
exit(-1); exit(-1);
} }
return scratch_size; return scratch_size;
@ -339,9 +332,9 @@ public:
} }
// Valid TCP or UDP packet // Valid TCP or UDP packet
const struct ip *iphdr = reinterpret_cast<const struct ip *>(pktData const struct ip *iphdr = (const struct ip *)(pktData
+ sizeof(struct ether_header)); + sizeof(struct ether_header));
const char *payload = reinterpret_cast<const char *>(pktData) + offset; const char *payload = (const char *)pktData + offset;
size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr), size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
stream_map.size())).first->second; stream_map.size())).first->second;
@ -357,8 +350,9 @@ public:
// Return the number of bytes scanned // Return the number of bytes scanned
size_t bytes() const { size_t bytes() const {
size_t sum = 0; size_t sum = 0;
auto packs = [](size_t z, const string &packet) { return z + packet.size(); }; for (const auto &packet : packets) {
sum += std::accumulate(packets.begin(), packets.end(), 0, packs); sum += packet.size();
}
return sum; return sum;
} }
@ -378,7 +372,7 @@ public:
for (auto &stream : streams) { for (auto &stream : streams) {
hs_error_t err = hs_open_stream(db, 0, &stream); hs_error_t err = hs_open_stream(db, 0, &stream);
if (err != HS_SUCCESS) { if (err != HS_SUCCESS) {
cerr << "ERROR: Unable to open stream. Exiting.\n"; cerr << "ERROR: Unable to open stream. Exiting." << endl;
exit(-1); exit(-1);
} }
} }
@ -387,11 +381,11 @@ public:
// Close all open Hyperscan streams (potentially generating any // Close all open Hyperscan streams (potentially generating any
// end-anchored matches) // end-anchored matches)
void closeStreams() { void closeStreams() {
for (const auto &stream : streams) { for (auto &stream : streams) {
hs_error_t err = hs_error_t err =
hs_close_stream(stream, scratch, onMatch, &matchCount); hs_close_stream(stream, scratch, onMatch, &matchCount);
if (err != HS_SUCCESS) { if (err != HS_SUCCESS) {
cerr << "ERROR: Unable to close stream. Exiting.\n"; cerr << "ERROR: Unable to close stream. Exiting." << endl;
exit(-1); exit(-1);
} }
} }
@ -406,7 +400,7 @@ public:
pkt.c_str(), pkt.length(), 0, pkt.c_str(), pkt.length(), 0,
scratch, onMatch, &matchCount); scratch, onMatch, &matchCount);
if (err != HS_SUCCESS) { if (err != HS_SUCCESS) {
cerr << "ERROR: Unable to scan packet. Exiting.\n"; cerr << "ERROR: Unable to scan packet. Exiting." << endl;
exit(-1); exit(-1);
} }
} }
@ -420,7 +414,7 @@ public:
hs_error_t err = hs_scan(db, pkt.c_str(), pkt.length(), 0, hs_error_t err = hs_scan(db, pkt.c_str(), pkt.length(), 0,
scratch, onMatch, &matchCount); scratch, onMatch, &matchCount);
if (err != HS_SUCCESS) { if (err != HS_SUCCESS) {
cerr << "ERROR: Unable to scan packet. Exiting.\n"; cerr << "ERROR: Unable to scan packet. Exiting." << endl;
exit(-1); exit(-1);
} }
} }
@ -440,7 +434,7 @@ class Sigdata {
public: public:
Sigdata() {} Sigdata() {}
explicit Sigdata(const char *filename) { Sigdata(const char *filename) {
parseFile(filename, patterns, flags, ids, originals); parseFile(filename, patterns, flags, ids, originals);
} }
@ -458,8 +452,9 @@ public:
// dynamic storage.) // dynamic storage.)
vector<const char *> cstrPatterns; vector<const char *> cstrPatterns;
cstrPatterns.reserve(patterns.size()); cstrPatterns.reserve(patterns.size());
auto pstr = [](const string &pattern) { return pattern.c_str(); }; for (const auto &pattern : patterns) {
std::transform(patterns.begin(), patterns.end(), std::back_inserter(cstrPatterns), pstr); cstrPatterns.push_back(pattern.c_str());
}
Clock clock; Clock clock;
clock.start(); clock.start();
@ -508,29 +503,29 @@ public:
static static
void usage(const char *) { void usage(const char *) {
cerr << "Usage:\n\n"; cerr << "Usage:" << endl << endl;
cerr << " patbench [-n repeats] [ -G generations] [ -C criterion ]\n" cerr << " patbench [-n repeats] [ -G generations] [ -C criterion ]" << endl
<< " [ -F factor_group_size ] [ -N | -S ] " << " [ -F factor_group_size ] [ -N | -S ] "
<< "<pattern file> <pcap file>\n\n" << "<pattern file> <pcap file>" << endl << endl
<< " -n repeats sets the number of times the PCAP is repeatedly " << " -n repeats sets the number of times the PCAP is repeatedly "
"scanned\n" << " with the pattern.\n" "scanned" << endl << " with the pattern." << endl
<< " -G generations sets the number of generations that the " << " -G generations sets the number of generations that the "
"algorithm is\n" << " run for.\n" "algorithm is" << endl << " run for." << endl
<< " -N sets non-streaming mode, -S sets streaming mode (default)." << " -N sets non-streaming mode, -S sets streaming mode (default)."
<< endl << " -F sets the factor group size (must be >0); this " << endl << " -F sets the factor group size (must be >0); this "
"allows the detection\n" "allows the detection" << endl
<< " of multiple interacting factors.\n" << "\n" << " of multiple interacting factors." << endl << "" << endl
<< " -C sets the 'criterion', which can be either:\n" << " -C sets the 'criterion', which can be either:" << endl
<< " t throughput (the default) - this requires a pcap file" << " t throughput (the default) - this requires a pcap file"
<< endl << " r scratch size\n" << endl << " r scratch size" << endl
<< " s stream state size\n" << " s stream state size" << endl
<< " c compile time\n" << " b bytecode size" << " c compile time" << endl << " b bytecode size"
<< endl << endl << endl << endl
<< "We recommend the use of a utility like 'taskset' on " << "We recommend the use of a utility like 'taskset' on "
"multiprocessor hosts to\n" "multiprocessor hosts to" << endl
<< "lock execution to a single processor: this will remove processor " << "lock execution to a single processor: this will remove processor "
"migration\n" "migration" << endl
<< "by the scheduler as a source of noise in the results.\n"; << "by the scheduler as a source of noise in the results." << endl;
} }
static static
@ -562,7 +557,7 @@ double measure_block_time(Benchmark &bench, unsigned int repeatCount) {
} }
static static
double eval_set(Benchmark &bench, const Sigdata &sigs, unsigned int mode, double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
unsigned repeatCount, Criterion criterion, unsigned repeatCount, Criterion criterion,
bool diagnose = true) { bool diagnose = true) {
double compileTime = 0; double compileTime = 0;
@ -573,7 +568,7 @@ double eval_set(Benchmark &bench, const Sigdata &sigs, unsigned int mode,
size_t dbSize; size_t dbSize;
hs_error_t err = hs_database_size(bench.getDatabase(), &dbSize); hs_error_t err = hs_database_size(bench.getDatabase(), &dbSize);
if (err != HS_SUCCESS) { if (err != HS_SUCCESS) {
cerr << "ERROR: could not retrieve bytecode size\n"; cerr << "ERROR: could not retrieve bytecode size" << endl;
exit(1); exit(1);
} }
return dbSize; return dbSize;
@ -584,7 +579,7 @@ double eval_set(Benchmark &bench, const Sigdata &sigs, unsigned int mode,
size_t streamStateSize; size_t streamStateSize;
hs_error_t err = hs_stream_size(bench.getDatabase(), &streamStateSize); hs_error_t err = hs_stream_size(bench.getDatabase(), &streamStateSize);
if (err != HS_SUCCESS) { if (err != HS_SUCCESS) {
cerr << "ERROR: could not retrieve stream state size\n"; cerr << "ERROR: could not retrieve stream state size" << endl;
exit(1); exit(1);
} }
return streamStateSize; return streamStateSize;
@ -602,9 +597,8 @@ double eval_set(Benchmark &bench, const Sigdata &sigs, unsigned int mode,
scan_time = measure_stream_time(bench, repeatCount); scan_time = measure_stream_time(bench, repeatCount);
} }
size_t bytes = bench.bytes(); size_t bytes = bench.bytes();
size_t matches = bench.matches();
if (diagnose) { if (diagnose) {
size_t matches = bench.matches();
std::ios::fmtflags f(cout.flags()); std::ios::fmtflags f(cout.flags());
cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time
<< " sec, Scanned " << bytes * repeatCount << " bytes, Throughput " << " sec, Scanned " << bytes * repeatCount << " bytes, Throughput "
@ -664,6 +658,10 @@ int main(int argc, char **argv) {
break; break;
case 'n': case 'n':
repeatCount = atoi(optarg); repeatCount = atoi(optarg);
if (repeatCount <= 0 || repeatCount > UINT_MAX) {
cerr << "Invalid repeatCount." << endl;
exit(-1);
}
break; break;
default: default:
usage(argv[0]); usage(argv[0]);
@ -683,13 +681,14 @@ int main(int argc, char **argv) {
Benchmark bench; Benchmark bench;
if (criterion == CRITERION_THROUGHPUT) { if (criterion == CRITERION_THROUGHPUT) {
if (!bench.readStreams(pcapFile)) { if (!bench.readStreams(pcapFile)) {
cerr << "Unable to read packets from PCAP file. Exiting.\n"; cerr << "Unable to read packets from PCAP file. Exiting." << endl;
exit(-1); exit(-1);
} }
} }
if ((criterion == CRITERION_STREAM_STATE) && (mode != HS_MODE_STREAM)) { if ((criterion == CRITERION_STREAM_STATE) && (mode != HS_MODE_STREAM)) {
cerr << "Cannot evaluate stream state for block mode compile. Exiting.\n"; cerr << "Cannot evaluate stream state for block mode compile. Exiting."
<< endl;
exit(-1); exit(-1);
} }
@ -727,7 +726,7 @@ int main(int argc, char **argv) {
unsigned generations = min(gen_max, (sigs.size() - 1) / factor_max); unsigned generations = min(gen_max, (sigs.size() - 1) / factor_max);
cout << "Cutting signatures cumulatively for " << generations cout << "Cutting signatures cumulatively for " << generations
<< " generations\n"; << " generations" << endl;
for (unsigned gen = 0; gen < generations; ++gen) { for (unsigned gen = 0; gen < generations; ++gen) {
cout << "Generation " << gen << " "; cout << "Generation " << gen << " ";
set<unsigned> s(work_sigs.begin(), work_sigs.end()); set<unsigned> s(work_sigs.begin(), work_sigs.end());
@ -737,9 +736,7 @@ int main(int argc, char **argv) {
count++; count++;
cout << "." << std::flush; cout << "." << std::flush;
vector<unsigned> sv(s.begin(), s.end()); vector<unsigned> sv(s.begin(), s.end());
random_device rng; random_shuffle(sv.begin(), sv.end());
mt19937 urng(rng());
shuffle(sv.begin(), sv.end(), urng);
unsigned groups = factor_max + 1; unsigned groups = factor_max + 1;
for (unsigned current_group = 0; current_group < groups; for (unsigned current_group = 0; current_group < groups;
current_group++) { current_group++) {
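Both spellings of the shuffle appear in this hunk because std::random_shuffle was deprecated in C++14 and removed in C++17; the std::shuffle form with an explicitly seeded engine is the portable replacement. A self-contained sketch:

#include <algorithm>
#include <random>
#include <vector>

// std::shuffle requires a uniform random bit generator, unlike the
// removed std::random_shuffle, so the engine is seeded explicitly.
static void shuffle_ids(std::vector<unsigned> &sv) {
    std::random_device rng;    // nondeterministic seed source
    std::mt19937 urng(rng());  // Mersenne Twister engine
    std::shuffle(sv.begin(), sv.end(), urng);
}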
@ -771,7 +768,7 @@ int main(int argc, char **argv) {
cout << "Performance: "; cout << "Performance: ";
print_criterion(criterion, best); print_criterion(criterion, best);
cout << " (" << std::fixed << std::setprecision(3) << (best / score_base) cout << " (" << std::fixed << std::setprecision(3) << (best / score_base)
<< "x) after cutting:\n"; << "x) after cutting:" << endl;
cout.flags(out_f); cout.flags(out_f);
// s now has factor_max signatures // s now has factor_max signatures
@ -794,7 +791,7 @@ int main(int argc, char **argv) {
static static
bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset, bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
unsigned int *length) { unsigned int *length) {
const ip *iph = reinterpret_cast<const ip *>(pkt_data + sizeof(ether_header)); const ip *iph = (const ip *)(pkt_data + sizeof(ether_header));
const tcphdr *th = nullptr; const tcphdr *th = nullptr;
// Ignore packets that aren't IPv4 // Ignore packets that aren't IPv4
@ -813,7 +810,7 @@ bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
switch (iph->ip_p) { switch (iph->ip_p) {
case IPPROTO_TCP: case IPPROTO_TCP:
th = reinterpret_cast<const tcphdr *>(reinterpret_cast<const char *>(iph) + ihlen); th = (const tcphdr *)((const char *)iph + ihlen);
thlen = th->th_off * 4; thlen = th->th_off * 4;
break; break;
case IPPROTO_UDP: case IPPROTO_UDP:
@ -850,7 +847,7 @@ static unsigned parseFlags(const string &flagsStr) {
case '\r': // stray carriage-return case '\r': // stray carriage-return
break; break;
default: default:
cerr << "Unsupported flag \'" << c << "\'\n"; cerr << "Unsupported flag \'" << c << "\'" << endl;
exit(-1); exit(-1);
} }
} }
@ -862,7 +859,7 @@ static void parseFile(const char *filename, vector<string> &patterns,
vector<string> &originals) { vector<string> &originals) {
ifstream inFile(filename); ifstream inFile(filename);
if (!inFile.good()) { if (!inFile.good()) {
cerr << "ERROR: Can't open pattern file \"" << filename << "\"\n"; cerr << "ERROR: Can't open pattern file \"" << filename << "\"" << endl;
exit(-1); exit(-1);
} }
@ -892,7 +889,7 @@ static void parseFile(const char *filename, vector<string> &patterns,
size_t flagsStart = expr.find_last_of('/'); size_t flagsStart = expr.find_last_of('/');
if (flagsStart == string::npos) { if (flagsStart == string::npos) {
cerr << "ERROR: no trailing '/' char\n"; cerr << "ERROR: no trailing '/' char" << endl;
exit(-1); exit(-1);
} }
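The parser above relies on the '/pattern/flags' convention, splitting each expression at its last '/'. A minimal sketch of that split, using a hypothetical input line:

#include <iostream>
#include <string>

int main() {
    std::string expr = "/foo.*bar/is";  // hypothetical pattern line
    size_t flagsStart = expr.find_last_of('/');
    if (flagsStart == std::string::npos || flagsStart == 0) {
        std::cerr << "ERROR: no trailing '/' char" << std::endl;
        return -1;
    }
    std::string pattern = expr.substr(1, flagsStart - 1);  // "foo.*bar"
    std::string flags = expr.substr(flagsStart + 1);       // "is"
    std::cout << pattern << " with flags " << flags << std::endl;
    return 0;
}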

View File

@ -1,6 +1,5 @@
/* /*
* Copyright (c) 2015-2016, Intel Corporation * Copyright (c) 2015-2021, Intel Corporation
* Copyright (c) 2024, VectorCamp PC
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -52,10 +51,10 @@
#include <cstring> #include <cstring>
#include <chrono> #include <chrono>
#include <climits>
#include <fstream> #include <fstream>
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
#include <numeric>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
@ -70,12 +69,7 @@
#include <netinet/tcp.h> #include <netinet/tcp.h>
#include <netinet/udp.h> #include <netinet/udp.h>
#include <netinet/ip_icmp.h> #include <netinet/ip_icmp.h>
#ifdef __NetBSD__
#include <net/ethertypes.h>
#include <net/if_ether.h>
#else
#include <net/ethernet.h> #include <net/ethernet.h>
#endif /* __NetBSD__ */
#include <arpa/inet.h> #include <arpa/inet.h>
#include <pcap.h> #include <pcap.h>
@ -100,15 +94,15 @@ struct FiveTuple {
unsigned int dstPort; unsigned int dstPort;
// Construct a FiveTuple from a TCP or UDP packet. // Construct a FiveTuple from a TCP or UDP packet.
explicit FiveTuple(const struct ip *iphdr) { FiveTuple(const struct ip *iphdr) {
// IP fields // IP fields
protocol = iphdr->ip_p; protocol = iphdr->ip_p;
srcAddr = iphdr->ip_src.s_addr; srcAddr = iphdr->ip_src.s_addr;
dstAddr = iphdr->ip_dst.s_addr; dstAddr = iphdr->ip_dst.s_addr;
// UDP/TCP ports // UDP/TCP ports
const char * iphdr_base = reinterpret_cast<const char *>(iphdr); const struct udphdr *uh =
const struct udphdr *uh = reinterpret_cast<const struct udphdr *>(iphdr_base + (iphdr->ip_hl * 4)); (const struct udphdr *)(((const char *)iphdr) + (iphdr->ip_hl * 4));
srcPort = uh->uh_sport; srcPort = uh->uh_sport;
dstPort = uh->uh_dport; dstPort = uh->uh_dport;
} }
@ -137,7 +131,7 @@ static
int onMatch(unsigned int id, unsigned long long from, unsigned long long to, int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
unsigned int flags, void *ctx) { unsigned int flags, void *ctx) {
// Our context points to a size_t storing the match count // Our context points to a size_t storing the match count
size_t *matches = static_cast<size_t *>(ctx); size_t *matches = (size_t *)ctx;
(*matches)++; (*matches)++;
return 0; // continue matching return 0; // continue matching
} }
@ -233,8 +227,9 @@ public:
} }
// Valid TCP or UDP packet // Valid TCP or UDP packet
const struct ip *iphdr = reinterpret_cast<const struct ip *>(pktData + sizeof(struct ether_header)); const struct ip *iphdr = (const struct ip *)(pktData
const char *payload = reinterpret_cast<const char *>(pktData) + offset; + sizeof(struct ether_header));
const char *payload = (const char *)pktData + offset;
size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr), size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
stream_map.size())).first->second; stream_map.size())).first->second;
@ -250,8 +245,9 @@ public:
// Return the number of bytes scanned // Return the number of bytes scanned
size_t bytes() const { size_t bytes() const {
size_t sum = 0; size_t sum = 0;
auto packs = [](size_t z, const string &packet) { return z + packet.size(); }; for (const auto &packet : packets) {
sum += std::accumulate(packets.begin(), packets.end(), 0, packs); sum += packet.size();
}
return sum; return sum;
} }
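One subtlety in the std::accumulate variant of bytes() above: std::accumulate sums in the type of its initial value, so a literal 0 makes the accumulator an int, which can overflow (undefined behaviour) once the byte count exceeds INT_MAX. A sketch of the overflow-safe spelling, assuming the same vector-of-strings packet store:

#include <cstddef>
#include <numeric>
#include <string>
#include <vector>

// size_t{0} keeps the running sum in size_t; a bare 0 would sum in int.
static size_t total_bytes(const std::vector<std::string> &packets) {
    return std::accumulate(packets.begin(), packets.end(), size_t{0},
                           [](size_t z, const std::string &p) {
                               return z + p.size();
                           });
}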
@ -280,7 +276,7 @@ public:
// Close all open Hyperscan streams (potentially generating any // Close all open Hyperscan streams (potentially generating any
// end-anchored matches) // end-anchored matches)
void closeStreams() { void closeStreams() {
for (const auto &stream : streams) { for (auto &stream : streams) {
hs_error_t err = hs_close_stream(stream, scratch, onMatch, hs_error_t err = hs_close_stream(stream, scratch, onMatch,
&matchCount); &matchCount);
if (err != HS_SUCCESS) { if (err != HS_SUCCESS) {
@ -432,8 +428,7 @@ static void databasesFromFile(const char *filename,
// storage.) // storage.)
vector<const char*> cstrPatterns; vector<const char*> cstrPatterns;
for (const auto &pattern : patterns) { for (const auto &pattern : patterns) {
// cppcheck-suppress useStlAlgorithm cstrPatterns.push_back(pattern.c_str());
cstrPatterns.push_back(pattern.c_str()); //NOLINT (performance-inefficient-vector-operation)
} }
cout << "Compiling Hyperscan databases with " << patterns.size() cout << "Compiling Hyperscan databases with " << patterns.size()
@ -495,6 +490,10 @@ int main(int argc, char **argv) {
// Streaming mode scans. // Streaming mode scans.
double secsStreamingScan = 0.0, secsStreamingOpenClose = 0.0; double secsStreamingScan = 0.0, secsStreamingOpenClose = 0.0;
if (repeatCount <= 0 || repeatCount > UINT_MAX) {
cerr << "Invalid repeatCount." << endl;
exit(-1);
}
for (unsigned int i = 0; i < repeatCount; i++) { for (unsigned int i = 0; i < repeatCount; i++) {
// Open streams. // Open streams.
clock.start(); clock.start();
@ -574,8 +573,7 @@ int main(int argc, char **argv) {
*/ */
static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset, static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
unsigned int *length) { unsigned int *length) {
const ip *iph = reinterpret_cast<const ip *>(pkt_data + sizeof(ether_header)); const ip *iph = (const ip *)(pkt_data + sizeof(ether_header));
const char *iph_base = reinterpret_cast<const char *>(iph);
const tcphdr *th = nullptr; const tcphdr *th = nullptr;
// Ignore packets that aren't IPv4 // Ignore packets that aren't IPv4
@ -594,7 +592,7 @@ static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
switch (iph->ip_p) { switch (iph->ip_p) {
case IPPROTO_TCP: case IPPROTO_TCP:
th = reinterpret_cast<const tcphdr *>(iph_base + ihlen); th = (const tcphdr *)((const char *)iph + ihlen);
thlen = th->th_off * 4; thlen = th->th_off * 4;
break; break;
case IPPROTO_UDP: case IPPROTO_UDP:

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2021, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -57,6 +57,7 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <unistd.h>
#include <hs.h> #include <hs.h>
@ -67,7 +68,7 @@
* to pass in the pattern that was being searched for so we can print it out. * to pass in the pattern that was being searched for so we can print it out.
*/ */
static int eventHandler(unsigned int id, unsigned long long from, static int eventHandler(unsigned int id, unsigned long long from,
unsigned long long to, unsigned int flags, void *ctx) { // cppcheck-suppress constParameterCallback unsigned long long to, unsigned int flags, void *ctx) {
printf("Match for pattern \"%s\" at offset %llu\n", (char *)ctx, to); printf("Match for pattern \"%s\" at offset %llu\n", (char *)ctx, to);
return 0; return 0;
} }
@ -150,7 +151,16 @@ int main(int argc, char *argv[]) {
} }
char *pattern = argv[1]; char *pattern = argv[1];
const char *inputFN = argv[2]; char *inputFN = argv[2];
if (access(inputFN, F_OK) != 0) {
fprintf(stderr, "ERROR: file doesn't exist.\n");
return -1;
}
if (access(inputFN, R_OK) != 0) {
fprintf(stderr, "ERROR: can't be read.\n");
return -1;
}
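The access() checks above give a friendlier error message, but check-then-open is inherently racy: the file can appear, vanish, or change permissions between access() and the later open (the classic TOCTOU window). A sketch of the open-first alternative, using only standard C calls:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Open first, then report why it failed; avoids the check-then-open race. */
static FILE *open_input(const char *inputFN) {
    FILE *f = fopen(inputFN, "rb");
    if (!f) {
        fprintf(stderr, "ERROR: unable to open \"%s\": %s\n", inputFN,
                strerror(errno));
    }
    return f;
}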
/* First, we attempt to compile the pattern provided on the command line. /* First, we attempt to compile the pattern provided on the command line.
* We assume 'DOTALL' semantics, meaning that the '.' meta-character will * We assume 'DOTALL' semantics, meaning that the '.' meta-character will

View File

@ -4,7 +4,8 @@ libdir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@
includedir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@ includedir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@
Name: libhs Name: libhs
Description: A portable fork of the high-performance regular expression matching library Description: Intel(R) Hyperscan Library
Version: @HS_VERSION@ Version: @HS_VERSION@
Libs: -L${libdir} -lhs Libs: -L${libdir} -lhs
Libs.private: @PRIVATE_LIBS@
Cflags: -I${includedir}/hs Cflags: -I${includedir}/hs

View File

@ -1,53 +0,0 @@
#
# Copyright (c) 2020-2023, VectorCamp PC
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
import json
import sys
#reads from the clang-tidy config file the first comment to ignore specific files
# Get the paths from the command-line arguments
# python3 ../source/scripts/change_command.py ../source/.clang-tidy ./compile_commands.json
clang_tidy_config_path = sys.argv[1]
compile_commands_path = sys.argv[2]
# Load the data from the file
with open(compile_commands_path, 'r') as f:
data = json.load(f)
# Open the clang-tidy config file and read the first comment
with open(clang_tidy_config_path, 'r') as f:
for line in f:
if line.startswith('#'):
ignore_files = line[1:].strip().split(',')
break
# Filter out the entries for the ignored files
data = [entry for entry in data if not any(ignore_file in entry['file'] for ignore_file in ignore_files)]
# Write the result to the same file
with open(compile_commands_path, 'w') as f:
json.dump(data, f, indent=2)

simde

@ -1 +0,0 @@
Subproject commit 416091ebdb9e901b29d026633e73167d6353a0b0

View File

@ -176,8 +176,7 @@ void replaceAssertVertex(NGHolder &g, NFAVertex t, const ExpressionInfo &expr,
auto ecit = edge_cache.find(cache_key); auto ecit = edge_cache.find(cache_key);
if (ecit == edge_cache.end()) { if (ecit == edge_cache.end()) {
DEBUG_PRINTF("adding edge %zu %zu\n", g[u].index, g[v].index); DEBUG_PRINTF("adding edge %zu %zu\n", g[u].index, g[v].index);
NFAEdge e; NFAEdge e = add_edge(u, v, g);
std::tie(e, std::ignore) = add_edge(u, v, g);
edge_cache.emplace(cache_key, e); edge_cache.emplace(cache_key, e);
g[e].assert_flags = flags; g[e].assert_flags = flags;
if (++assert_edge_count > MAX_ASSERT_EDGES) { if (++assert_edge_count > MAX_ASSERT_EDGES) {
@ -230,12 +229,11 @@ void checkForMultilineStart(ReportManager &rm, NGHolder &g,
/* we need to interpose a dummy dot vertex between v and accept if /* we need to interpose a dummy dot vertex between v and accept if
* required so that ^ doesn't match trailing \n */ * required so that ^ doesn't match trailing \n */
auto deads = [&g=g](const NFAEdge &e) { for (const auto &e : out_edges_range(v, g)) {
return (target(e, g) == g.accept); if (target(e, g) == g.accept) {
}; dead.push_back(e);
const auto &er = out_edges_range(v, g); }
std::copy_if(begin(er), end(er), std::back_inserter(dead), deads); }
/* assert has been resolved; clear flag */ /* assert has been resolved; clear flag */
g[v].assert_flags &= ~POS_FLAG_MULTILINE_START; g[v].assert_flags &= ~POS_FLAG_MULTILINE_START;
} }
@ -253,7 +251,6 @@ void checkForMultilineStart(ReportManager &rm, NGHolder &g,
static static
bool hasAssertVertices(const NGHolder &g) { bool hasAssertVertices(const NGHolder &g) {
// cppcheck-suppress useStlAlgorithm
for (auto v : vertices_range(g)) { for (auto v : vertices_range(g)) {
int flags = g[v].assert_flags; int flags = g[v].assert_flags;
if (flags & WORDBOUNDARY_FLAGS) { if (flags & WORDBOUNDARY_FLAGS) {

View File

@ -417,7 +417,7 @@ void addLitExpression(NG &ng, unsigned index, const char *expression,
"HS_FLAG_SOM_LEFTMOST are supported in literal API."); "HS_FLAG_SOM_LEFTMOST are supported in literal API.");
} }
if (expLength == 0) { if (!strcmp(expression, "")) {
throw CompileError("Pure literal API doesn't support empty string."); throw CompileError("Pure literal API doesn't support empty string.");
} }
@ -443,7 +443,7 @@ bytecode_ptr<RoseEngine> generateRoseEngine(NG &ng) {
if (!rose) { if (!rose) {
DEBUG_PRINTF("error building rose\n"); DEBUG_PRINTF("error building rose\n");
assert(0); assert(0);
return bytecode_ptr<RoseEngine>(nullptr); return nullptr;
} }
dumpReportManager(ng.rm, ng.cc.grey); dumpReportManager(ng.rm, ng.cc.grey);
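One plausible reason for the explicit bytecode_ptr<RoseEngine>(nullptr) spelling in this hunk: if a smart pointer's converting constructor is explicit, a bare `return nullptr;` is ill-formed and the wrapper type must be named. A generic sketch with a hypothetical type (not ue2's actual bytecode_ptr):

#include <cstddef>

struct owner_ptr {
    explicit owner_ptr(void *ptr) : p(ptr) {}  // explicit: no implicit conversion
    void *p;
};

owner_ptr make_failed() {
    // return nullptr;          // ill-formed: would need an implicit conversion
    return owner_ptr(nullptr);  // OK: constructor invoked by name
}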
@ -478,7 +478,7 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
DEBUG_PRINTF("db size %zu\n", db_len); DEBUG_PRINTF("db size %zu\n", db_len);
DEBUG_PRINTF("db platform %llx\n", platform); DEBUG_PRINTF("db platform %llx\n", platform);
struct hs_database *db = static_cast<struct hs_database *>(hs_database_alloc(db_len)); struct hs_database *db = (struct hs_database *)hs_database_alloc(db_len);
if (hs_check_alloc(db) != HS_SUCCESS) { if (hs_check_alloc(db) != HS_SUCCESS) {
hs_database_free(db); hs_database_free(db);
return nullptr; return nullptr;
@ -492,7 +492,7 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
DEBUG_PRINTF("shift is %zu\n", shift); DEBUG_PRINTF("shift is %zu\n", shift);
db->bytecode = offsetof(struct hs_database, bytes) - shift; db->bytecode = offsetof(struct hs_database, bytes) - shift;
char *bytecode = reinterpret_cast<char *>(db) + db->bytecode; char *bytecode = (char *)db + db->bytecode;
assert(ISALIGNED_CL(bytecode)); assert(ISALIGNED_CL(bytecode));
db->magic = HS_DB_MAGIC; db->magic = HS_DB_MAGIC;
@ -525,7 +525,7 @@ struct hs_database *build(NG &ng, unsigned int *length, u8 pureFlag) {
throw CompileError("Internal error."); throw CompileError("Internal error.");
} }
const char *bytecode = reinterpret_cast<const char *>(rose.get()); const char *bytecode = (const char *)(rose.get());
const platform_t p = target_to_platform(ng.cc.target_info); const platform_t p = target_to_platform(ng.cc.target_info);
struct hs_database *db = dbCreate(bytecode, *length, p); struct hs_database *db = dbCreate(bytecode, *length, p);
if (!db) { if (!db) {

View File

@ -57,14 +57,15 @@ extern const hs_compile_error_t hs_badalloc = {
namespace ue2 { namespace ue2 {
hs_compile_error_t *generateCompileError(const string &err, int expression) { hs_compile_error_t *generateCompileError(const string &err, int expression) {
hs_compile_error_t *ret = static_cast<struct hs_compile_error *>(hs_misc_alloc(sizeof(hs_compile_error_t))); hs_compile_error_t *ret =
(struct hs_compile_error *)hs_misc_alloc(sizeof(hs_compile_error_t));
if (ret) { if (ret) {
hs_error_t e = hs_check_alloc(ret); hs_error_t e = hs_check_alloc(ret);
if (e != HS_SUCCESS) { if (e != HS_SUCCESS) {
hs_misc_free(ret); hs_misc_free(ret);
return const_cast<hs_compile_error_t *>(&hs_badalloc); return const_cast<hs_compile_error_t *>(&hs_badalloc);
} }
char *msg = static_cast<char *>(hs_misc_alloc(err.size() + 1)); char *msg = (char *)hs_misc_alloc(err.size() + 1);
if (msg) { if (msg) {
e = hs_check_alloc(msg); e = hs_check_alloc(msg);
if (e != HS_SUCCESS) { if (e != HS_SUCCESS) {

View File

@ -30,6 +30,7 @@
#include "config.h" #include "config.h"
#include "ue2common.h" #include "ue2common.h"
#include "util/arch.h" #include "util/arch.h"
#include "util/intrinsics.h"
#if !defined(HAVE_SSE42) #if !defined(HAVE_SSE42)
@ -542,13 +543,14 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
// Main aligned loop, processes eight bytes at a time. // Main aligned loop, processes eight bytes at a time.
u32 term1, term2;
for (size_t li = 0; li < running_length/8; li++) { for (size_t li = 0; li < running_length/8; li++) {
u32 block = *(const u32 *)p_buf; u32 block = *(const u32 *)p_buf;
crc ^= block; crc ^= block;
p_buf += 4; p_buf += 4;
u32 term1 = crc_tableil8_o88[crc & 0x000000FF] ^ term1 = crc_tableil8_o88[crc & 0x000000FF] ^
crc_tableil8_o80[(crc >> 8) & 0x000000FF]; crc_tableil8_o80[(crc >> 8) & 0x000000FF];
u32 term2 = crc >> 16; term2 = crc >> 16;
crc = term1 ^ crc = term1 ^
crc_tableil8_o72[term2 & 0x000000FF] ^ crc_tableil8_o72[term2 & 0x000000FF] ^
crc_tableil8_o64[(term2 >> 8) & 0x000000FF]; crc_tableil8_o64[(term2 >> 8) & 0x000000FF];
@ -577,7 +579,53 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
} }
#else // HAVE_SSE42 #else // HAVE_SSE42
#include "util/arch/x86/crc32.h"
#ifdef ARCH_64_BIT
#define CRC_WORD 8
#define CRC_TYPE u64a
#define CRC_FUNC _mm_crc32_u64
#else
#define CRC_WORD 4
#define CRC_TYPE u32
#define CRC_FUNC _mm_crc32_u32
#endif
/*
* Use the crc32 instruction from SSE4.2 to compute our checksum - same
* polynomial as the above function.
*/
static really_inline
u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf,
const size_t length) {
u32 crc = running_crc;
// Process byte-by-byte until p_buf is aligned
const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD);
size_t init_bytes = aligned_buf - p_buf;
size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD;
size_t end_bytes = length - init_bytes - running_length;
while (p_buf < aligned_buf) {
crc = _mm_crc32_u8(crc, *p_buf++);
}
// Main aligned loop, processes a word at a time.
for (size_t li = 0; li < running_length/CRC_WORD; li++) {
CRC_TYPE block = *(const CRC_TYPE *)p_buf;
crc = CRC_FUNC(crc, block);
p_buf += CRC_WORD;
}
// Remaining bytes
for(size_t li = 0; li < end_bytes; li++) {
crc = _mm_crc32_u8(crc, *p_buf++);
}
return crc;
}
#endif #endif
#ifdef VERIFY_ASSERTION #ifdef VERIFY_ASSERTION

View File

@ -353,6 +353,12 @@ hs_error_t dbIsValid(const hs_database_t *db) {
return HS_SUCCESS; return HS_SUCCESS;
} }
#if defined(_WIN32)
#define SNPRINTF_COMPAT _snprintf
#else
#define SNPRINTF_COMPAT snprintf
#endif
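Where a genuine C99 snprintf is available, its return value also gives a portable way to size a buffer before the final write; workalikes such as MSVC's _snprintf do not honour this contract (they return a negative value on truncation), which is one reason the two cannot always be used interchangeably. A sketch of the standard idiom:

#include <stdio.h>
#include <stdlib.h>

/* C99 contract: with a NULL buffer and size 0, snprintf returns the number
 * of characters that would have been written, excluding the terminator. */
static char *format_version(unsigned major, unsigned minor, unsigned release) {
    int need = snprintf(NULL, 0, "Version: %u.%u.%u", major, minor, release);
    if (need < 0) {
        return NULL;
    }
    char *buf = (char *)malloc((size_t)need + 1);
    if (buf) {
        snprintf(buf, (size_t)need + 1, "Version: %u.%u.%u",
                 major, minor, release);
    }
    return buf;
}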
/** Allocate a buffer and prints the database info into it. Returns an /** Allocate a buffer and prints the database info into it. Returns an
* appropriate error code on failure, or HS_SUCCESS on success. */ * appropriate error code on failure, or HS_SUCCESS on success. */
static static
@ -394,7 +400,9 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat,
return ret; return ret;
} }
int p_len = snprintf( // Note: SNPRINTF_COMPAT is a macro defined above, to cope with systems
// that don't have snprintf but have a workalike.
int p_len = SNPRINTF_COMPAT(
buf, len, "Version: %u.%u.%u Features: %s Mode: %s", buf, len, "Version: %u.%u.%u Features: %s Mode: %s",
major, minor, release, features, mode); major, minor, release, features, mode);
if (p_len < 0) { if (p_len < 0) {

View File

@ -51,7 +51,6 @@ extern "C"
// CPU type is the low 6 bits (we can't need more than 64, surely!) // CPU type is the low 6 bits (we can't need more than 64, surely!)
#define HS_PLATFORM_INTEL 1 #define HS_PLATFORM_INTEL 1
#define HS_PLATFORM_ARM 2
#define HS_PLATFORM_CPU_MASK 0x3F #define HS_PLATFORM_CPU_MASK 0x3F
#define HS_PLATFORM_NOAVX2 (4<<13) #define HS_PLATFORM_NOAVX2 (4<<13)
@ -79,18 +78,21 @@ static UNUSED
const platform_t hs_current_platform_no_avx2 = { const platform_t hs_current_platform_no_avx2 = {
HS_PLATFORM_NOAVX2 | HS_PLATFORM_NOAVX2 |
HS_PLATFORM_NOAVX512 | HS_PLATFORM_NOAVX512 |
HS_PLATFORM_NOAVX512VBMI HS_PLATFORM_NOAVX512VBMI |
0,
}; };
static UNUSED static UNUSED
const platform_t hs_current_platform_no_avx512 = { const platform_t hs_current_platform_no_avx512 = {
HS_PLATFORM_NOAVX512 | HS_PLATFORM_NOAVX512 |
HS_PLATFORM_NOAVX512VBMI HS_PLATFORM_NOAVX512VBMI |
0,
}; };
static UNUSED static UNUSED
const platform_t hs_current_platform_no_avx512vbmi = { const platform_t hs_current_platform_no_avx512vbmi = {
HS_PLATFORM_NOAVX512VBMI HS_PLATFORM_NOAVX512VBMI |
0,
}; };
/* /*
@ -112,7 +114,6 @@ struct hs_database {
static really_inline static really_inline
const void *hs_get_bytecode(const struct hs_database *db) { const void *hs_get_bytecode(const struct hs_database *db) {
// cppcheck-suppress cstyleCast
return ((const char *)db + db->bytecode); return ((const char *)db + db->bytecode);
} }

View File

@ -1,6 +1,5 @@
/* /*
* Copyright (c) 2016-2020, Intel Corporation * Copyright (c) 2016-2020, Intel Corporation
* Copyright (c) 2024, VectorCamp PC
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -31,41 +30,7 @@
#include "hs_common.h" #include "hs_common.h"
#include "hs_runtime.h" #include "hs_runtime.h"
#include "ue2common.h" #include "ue2common.h"
#include "util/cpuid_inline.h"
/* Streamlining the dispatch to eliminate runtime checking/branching:
* What we want to do is, first call to the function will run the resolve
* code and set the static resolved/dispatch pointer to point to the
* correct function. Subsequent calls to the function will go directly to
* the resolved ptr. The simplest way to accomplish this is, to
* initially set the pointer to the resolve function.
* To accomplish this in a manner invisible to the user,
* we do involve some rather ugly/confusing macros in here.
* There are four macros that assemble the code for each function
* we want to dispatch in this manner:
* CREATE_DISPATCH
* this generates the declarations for the candidate target functions,
* for the fat_dispatch function pointer, for the resolve_ function,
* points the function pointer to the resolve function, and contains
* most of the definition of the resolve function. The very end of the
* resolve function is completed by the next macro, because in the
* CREATE_DISPATCH macro we have the argument list with the arg declarations,
* which is needed to generate correct function signatures, but we
* can't generate from this, in a macro, a _call_ to one of those functions.
* CONNECT_ARGS_1
* this macro fills in the actual call at the end of the resolve function,
* with the correct arg list. hence the name connect args.
* CONNECT_DISPATCH_2
* this macro likewise gives up the beginning of the definition of the
* actual entry point function (the 'real name' that's called by the user)
* but again in the pass-through call, cannot invoke the target without
* getting the arg list , which is supplied by the final macro,
* CONNECT_ARGS_3
*
*/
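A minimal sketch of the resolve-once scheme that comment describes, with the macros unrolled (names are hypothetical; __builtin_cpu_supports is a GCC/Clang builtin on x86):

static int scan_avx2_impl(int x) { return x * 2; }
static int scan_sse_impl(int x)  { return x + x; }

static int resolve_scan(int x);                      // forward declaration
static int (*fat_dispatch_scan)(int) = resolve_scan; // starts at the resolver

static int resolve_scan(int x) {
    // The first call lands here: pick an implementation, rebind the
    // pointer, then complete this call through it. Later calls go
    // straight to the chosen implementation.
    if (__builtin_cpu_supports("avx2")) {
        fat_dispatch_scan = scan_avx2_impl;
    } else {
        fat_dispatch_scan = scan_sse_impl;
    }
    return fat_dispatch_scan(x);
}

int scan(int x) { return fat_dispatch_scan(x); }     // public entry point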
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/cpuid_inline.h"
#include "util/join.h" #include "util/join.h"
#if defined(DISABLE_AVX512_DISPATCH) #if defined(DISABLE_AVX512_DISPATCH)
@ -91,274 +56,96 @@
return (RTYPE)HS_ARCH_ERROR; \ return (RTYPE)HS_ARCH_ERROR; \
} \ } \
\ \
/* dispatch routing pointer for this function */ \
/* initially point it at the resolve function */ \
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__); \
static RTYPE (* JOIN(fat_dispatch_, NAME))(__VA_ARGS__) = \
&JOIN(resolve_, NAME); \
\
/* resolver */ \ /* resolver */ \
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__) { \ static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \
if (check_avx512vbmi()) { \ if (check_avx512vbmi()) { \
fat_dispatch_ ## NAME = &JOIN(avx512vbmi_, NAME); \ return JOIN(avx512vbmi_, NAME); \
} \ } \
else if (check_avx512()) { \ if (check_avx512()) { \
fat_dispatch_ ## NAME = &JOIN(avx512_, NAME); \ return JOIN(avx512_, NAME); \
} \ } \
else if (check_avx2()) { \ if (check_avx2()) { \
fat_dispatch_ ## NAME = &JOIN(avx2_, NAME); \ return JOIN(avx2_, NAME); \
} \ } \
else if (check_sse42() && check_popcnt()) { \ if (check_sse42() && check_popcnt()) { \
fat_dispatch_ ## NAME = &JOIN(corei7_, NAME); \ return JOIN(corei7_, NAME); \
} \ } \
else if (check_ssse3()) { \ if (check_ssse3()) { \
fat_dispatch_ ## NAME = &JOIN(core2_, NAME); \ return JOIN(core2_, NAME); \
} else { \
/* anything else is fail */ \
fat_dispatch_ ## NAME = &JOIN(error_, NAME); \
} \ } \
/* anything else is fail */ \
return JOIN(error_, NAME); \
/* the rest of the function is completed in the CONNECT_ARGS_1 macro. */
#elif defined(ARCH_AARCH64)
#include "util/arch/arm/cpuid_inline.h"
#include "util/join.h"
#define CREATE_DISPATCH(RTYPE, NAME, ...) \
/* create defns */ \
RTYPE JOIN(sve2_, NAME)(__VA_ARGS__); \
RTYPE JOIN(sve_, NAME)(__VA_ARGS__); \
RTYPE JOIN(neon_, NAME)(__VA_ARGS__); \
\
/* error func */ \
static inline RTYPE JOIN(error_, NAME)(__VA_ARGS__) { \
return (RTYPE)HS_ARCH_ERROR; \
} \ } \
\ \
/* dispatch routing pointer for this function */ \ /* function */ \
/* initially point it at the resolve function */ \
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__); \
static RTYPE (* JOIN(fat_dispatch_, NAME))(__VA_ARGS__) = \
&JOIN(resolve_, NAME); \
\
/* resolver */ \
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__) { \
if (check_sve2()) { \
fat_dispatch_ ## NAME = &JOIN(sve2_, NAME); \
} \
else if (check_sve()) { \
fat_dispatch_ ## NAME = &JOIN(sve_, NAME); \
} \
else if (check_neon()) { \
fat_dispatch_ ## NAME = &JOIN(neon_, NAME); \
} else { \
/* anything else is fail */ \
fat_dispatch_ ## NAME = &JOIN(error_, NAME); \
} \
/* the rest of the function is completed in the CONNECT_ARGS_1 macro. */
#endif
#define CONNECT_ARGS_1(RTYPE, NAME, ...) \
return (*fat_dispatch_ ## NAME)(__VA_ARGS__); \
} \
#define CONNECT_DISPATCH_2(RTYPE, NAME, ...) \
/* new function */ \
HS_PUBLIC_API \ HS_PUBLIC_API \
RTYPE NAME(__VA_ARGS__) { \ RTYPE NAME(__VA_ARGS__) __attribute__((ifunc("resolve_" #NAME)))
#define CONNECT_ARGS_3(RTYPE, NAME, ...) \
return (*fat_dispatch_ ## NAME)(__VA_ARGS__); \
} \
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
/* this gets a bit ugly to compose the static redirect functions,
* as we necessarily need first the typed arg list and then just the arg
* names, twice in a row, to define the redirect function and the
* dispatch function call */
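The ifunc-based variant of the dispatcher leans on the dynamic loader instead: the resolver runs once while the symbol is being bound, so no per-call dispatch pointer is needed. A minimal sketch, assuming GCC on an ELF target (names are hypothetical):

extern "C" {

static int twice_fast(int x) { return x << 1; }
static int twice_slow(int x) { return x + x; }

/* Invoked once by the dynamic loader while binding the `twice` symbol;
 * returns the implementation to use (real code would branch on cpuid). */
static int (*resolve_twice(void))(int) {
    return twice_fast;
}

int twice(int) __attribute__((ifunc("resolve_twice")));

} // extern "C"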
CREATE_DISPATCH(hs_error_t, hs_scan, const hs_database_t *db, const char *data, CREATE_DISPATCH(hs_error_t, hs_scan, const hs_database_t *db, const char *data,
unsigned length, unsigned flags, hs_scratch_t *scratch, unsigned length, unsigned flags, hs_scratch_t *scratch,
match_event_handler onEvent, void *userCtx); match_event_handler onEvent, void *userCtx);
CONNECT_ARGS_1(hs_error_t, hs_scan, db, data, length, flags, scratch, onEvent, userCtx);
CONNECT_DISPATCH_2(hs_error_t, hs_scan, const hs_database_t *db, const char *data,
unsigned length, unsigned flags, hs_scratch_t *scratch,
match_event_handler onEvent, void *userCtx);
CONNECT_ARGS_3(hs_error_t, hs_scan, db, data, length, flags, scratch, onEvent, userCtx);
CREATE_DISPATCH(hs_error_t, hs_stream_size, const hs_database_t *database, CREATE_DISPATCH(hs_error_t, hs_stream_size, const hs_database_t *database,
size_t *stream_size); size_t *stream_size);
CONNECT_ARGS_1(hs_error_t, hs_stream_size, database, stream_size);
CONNECT_DISPATCH_2(hs_error_t, hs_stream_size, const hs_database_t *database,
size_t *stream_size);
CONNECT_ARGS_3(hs_error_t, hs_stream_size, database, stream_size);
CREATE_DISPATCH(hs_error_t, hs_database_size, const hs_database_t *db, CREATE_DISPATCH(hs_error_t, hs_database_size, const hs_database_t *db,
size_t *size); size_t *size);
CONNECT_ARGS_1(hs_error_t, hs_database_size, db, size);
CONNECT_DISPATCH_2(hs_error_t, hs_database_size, const hs_database_t *db,
size_t *size);
CONNECT_ARGS_3(hs_error_t, hs_database_size, db, size);
CREATE_DISPATCH(hs_error_t, dbIsValid, const hs_database_t *db); CREATE_DISPATCH(hs_error_t, dbIsValid, const hs_database_t *db);
CONNECT_ARGS_1(hs_error_t, dbIsValid, db);
CONNECT_DISPATCH_2(hs_error_t, dbIsValid, const hs_database_t *db);
CONNECT_ARGS_3(hs_error_t, dbIsValid, db);
CREATE_DISPATCH(hs_error_t, hs_free_database, hs_database_t *db); CREATE_DISPATCH(hs_error_t, hs_free_database, hs_database_t *db);
CONNECT_ARGS_1(hs_error_t, hs_free_database, db);
CONNECT_DISPATCH_2(hs_error_t, hs_free_database, hs_database_t *db);
CONNECT_ARGS_3(hs_error_t, hs_free_database, db);
CREATE_DISPATCH(hs_error_t, hs_open_stream, const hs_database_t *db, CREATE_DISPATCH(hs_error_t, hs_open_stream, const hs_database_t *db,
unsigned int flags, hs_stream_t **stream); unsigned int flags, hs_stream_t **stream);
CONNECT_ARGS_1(hs_error_t, hs_open_stream, db, flags, stream);
CONNECT_DISPATCH_2(hs_error_t, hs_open_stream, const hs_database_t *db,
unsigned int flags, hs_stream_t **stream);
CONNECT_ARGS_3(hs_error_t, hs_open_stream, db, flags, stream);
CREATE_DISPATCH(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data, CREATE_DISPATCH(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data,
unsigned int length, unsigned int flags, hs_scratch_t *scratch, unsigned int length, unsigned int flags, hs_scratch_t *scratch,
match_event_handler onEvent, void *ctxt); match_event_handler onEvent, void *ctxt);
CONNECT_ARGS_1(hs_error_t, hs_scan_stream, id, data, length, flags, scratch, onEvent, ctxt);
CONNECT_DISPATCH_2(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data,
unsigned int length, unsigned int flags, hs_scratch_t *scratch,
match_event_handler onEvent, void *ctxt);
CONNECT_ARGS_3(hs_error_t, hs_scan_stream, id, data, length, flags, scratch, onEvent, ctxt);
CREATE_DISPATCH(hs_error_t, hs_close_stream, hs_stream_t *id, CREATE_DISPATCH(hs_error_t, hs_close_stream, hs_stream_t *id,
hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt); hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt);
CONNECT_ARGS_1(hs_error_t, hs_close_stream, id, scratch, onEvent, ctxt);
CONNECT_DISPATCH_2(hs_error_t, hs_close_stream, hs_stream_t *id,
hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt);
CONNECT_ARGS_3(hs_error_t, hs_close_stream, id, scratch, onEvent, ctxt);
CREATE_DISPATCH(hs_error_t, hs_scan_vector, const hs_database_t *db, CREATE_DISPATCH(hs_error_t, hs_scan_vector, const hs_database_t *db,
const char *const *data, const unsigned int *length, const char *const *data, const unsigned int *length,
unsigned int count, unsigned int flags, hs_scratch_t *scratch, unsigned int count, unsigned int flags, hs_scratch_t *scratch,
match_event_handler onevent, void *context); match_event_handler onevent, void *context);
CONNECT_ARGS_1(hs_error_t, hs_scan_vector, db, data, length, count, flags, scratch, onevent, context);
CONNECT_DISPATCH_2(hs_error_t, hs_scan_vector, const hs_database_t *db,
const char *const *data, const unsigned int *length,
unsigned int count, unsigned int flags, hs_scratch_t *scratch,
match_event_handler onevent, void *context);
CONNECT_ARGS_3(hs_error_t, hs_scan_vector, db, data, length, count, flags, scratch, onevent, context);
CREATE_DISPATCH(hs_error_t, hs_database_info, const hs_database_t *db, char **info); CREATE_DISPATCH(hs_error_t, hs_database_info, const hs_database_t *db, char **info);
CONNECT_ARGS_1(hs_error_t, hs_database_info, db, info);
CONNECT_DISPATCH_2(hs_error_t, hs_database_info, const hs_database_t *db, char **info);
CONNECT_ARGS_3(hs_error_t, hs_database_info, db, info);
CREATE_DISPATCH(hs_error_t, hs_copy_stream, hs_stream_t **to_id, CREATE_DISPATCH(hs_error_t, hs_copy_stream, hs_stream_t **to_id,
const hs_stream_t *from_id); const hs_stream_t *from_id);
CONNECT_ARGS_1(hs_error_t, hs_copy_stream, to_id, from_id);
CONNECT_DISPATCH_2(hs_error_t, hs_copy_stream, hs_stream_t **to_id,
const hs_stream_t *from_id);
CONNECT_ARGS_3(hs_error_t, hs_copy_stream, to_id, from_id);
CREATE_DISPATCH(hs_error_t, hs_reset_stream, hs_stream_t *id, CREATE_DISPATCH(hs_error_t, hs_reset_stream, hs_stream_t *id,
unsigned int flags, hs_scratch_t *scratch, unsigned int flags, hs_scratch_t *scratch,
match_event_handler onEvent, void *context); match_event_handler onEvent, void *context);
CONNECT_ARGS_1(hs_error_t, hs_reset_stream, id, flags, scratch, onEvent, context);
CONNECT_DISPATCH_2(hs_error_t, hs_reset_stream, hs_stream_t *id,
unsigned int flags, hs_scratch_t *scratch,
match_event_handler onEvent, void *context);
CONNECT_ARGS_3(hs_error_t, hs_reset_stream, id, flags, scratch, onEvent, context);
CREATE_DISPATCH(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id, CREATE_DISPATCH(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id,
const hs_stream_t *from_id, hs_scratch_t *scratch, const hs_stream_t *from_id, hs_scratch_t *scratch,
match_event_handler onEvent, void *context); match_event_handler onEvent, void *context);
CONNECT_ARGS_1(hs_error_t, hs_reset_and_copy_stream, to_id, from_id, scratch, onEvent, context);
CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id,
const hs_stream_t *from_id, hs_scratch_t *scratch,
match_event_handler onEvent, void *context);
CONNECT_ARGS_3(hs_error_t, hs_reset_and_copy_stream, to_id, from_id, scratch, onEvent, context);
CREATE_DISPATCH(hs_error_t, hs_serialize_database, const hs_database_t *db, CREATE_DISPATCH(hs_error_t, hs_serialize_database, const hs_database_t *db,
char **bytes, size_t *length); char **bytes, size_t *length);
CONNECT_ARGS_1(hs_error_t, hs_serialize_database, db, bytes, length);
CONNECT_DISPATCH_2(hs_error_t, hs_serialize_database, const hs_database_t *db,
char **bytes, size_t *length);
CONNECT_ARGS_3(hs_error_t, hs_serialize_database, db, bytes, length);
CREATE_DISPATCH(hs_error_t, hs_deserialize_database, const char *bytes, CREATE_DISPATCH(hs_error_t, hs_deserialize_database, const char *bytes,
const size_t length, hs_database_t **db); const size_t length, hs_database_t **db);
CONNECT_ARGS_1(hs_error_t, hs_deserialize_database, bytes, length, db);
CONNECT_DISPATCH_2(hs_error_t, hs_deserialize_database, const char *bytes,
const size_t length, hs_database_t **db);
CONNECT_ARGS_3(hs_error_t, hs_deserialize_database, bytes, length, db);
CREATE_DISPATCH(hs_error_t, hs_deserialize_database_at, const char *bytes, CREATE_DISPATCH(hs_error_t, hs_deserialize_database_at, const char *bytes,
const size_t length, hs_database_t *db); const size_t length, hs_database_t *db);
CONNECT_ARGS_1(hs_error_t, hs_deserialize_database_at, bytes, length, db);
CONNECT_DISPATCH_2(hs_error_t, hs_deserialize_database_at, const char *bytes,
const size_t length, hs_database_t *db);
CONNECT_ARGS_3(hs_error_t, hs_deserialize_database_at, bytes, length, db);
CREATE_DISPATCH(hs_error_t, hs_serialized_database_info, const char *bytes, CREATE_DISPATCH(hs_error_t, hs_serialized_database_info, const char *bytes,
size_t length, char **info); size_t length, char **info);
CONNECT_ARGS_1(hs_error_t, hs_serialized_database_info, bytes, length, info);
CONNECT_DISPATCH_2(hs_error_t, hs_serialized_database_info, const char *bytes,
size_t length, char **info);
CONNECT_ARGS_3(hs_error_t, hs_serialized_database_info, bytes, length, info);
CREATE_DISPATCH(hs_error_t, hs_serialized_database_size, const char *bytes, CREATE_DISPATCH(hs_error_t, hs_serialized_database_size, const char *bytes,
const size_t length, size_t *deserialized_size); const size_t length, size_t *deserialized_size);
CONNECT_ARGS_1(hs_error_t, hs_serialized_database_size, bytes, length, deserialized_size);
CONNECT_DISPATCH_2(hs_error_t, hs_serialized_database_size, const char *bytes,
const size_t length, size_t *deserialized_size);
CONNECT_ARGS_3(hs_error_t, hs_serialized_database_size, bytes, length, deserialized_size);
CREATE_DISPATCH(hs_error_t, hs_compress_stream, const hs_stream_t *stream, CREATE_DISPATCH(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
char *buf, size_t buf_space, size_t *used_space); char *buf, size_t buf_space, size_t *used_space);
CONNECT_ARGS_1(hs_error_t, hs_compress_stream, stream,
buf, buf_space, used_space);
CONNECT_DISPATCH_2(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
char *buf, size_t buf_space, size_t *used_space);
CONNECT_ARGS_3(hs_error_t, hs_compress_stream, stream,
buf, buf_space, used_space);
CREATE_DISPATCH(hs_error_t, hs_expand_stream, const hs_database_t *db, CREATE_DISPATCH(hs_error_t, hs_expand_stream, const hs_database_t *db,
hs_stream_t **stream, const char *buf,size_t buf_size); hs_stream_t **stream, const char *buf,size_t buf_size);
CONNECT_ARGS_1(hs_error_t, hs_expand_stream, db, stream, buf,buf_size);
CONNECT_DISPATCH_2(hs_error_t, hs_expand_stream, const hs_database_t *db,
hs_stream_t **stream, const char *buf,size_t buf_size);
CONNECT_ARGS_3(hs_error_t, hs_expand_stream, db, stream, buf,buf_size);
CREATE_DISPATCH(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream, CREATE_DISPATCH(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
const char *buf, size_t buf_size, hs_scratch_t *scratch, const char *buf, size_t buf_size, hs_scratch_t *scratch,
match_event_handler onEvent, void *context); match_event_handler onEvent, void *context);
CONNECT_ARGS_1(hs_error_t, hs_reset_and_expand_stream, to_stream,
buf, buf_size, scratch, onEvent, context);
CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
const char *buf, size_t buf_size, hs_scratch_t *scratch,
match_event_handler onEvent, void *context);
CONNECT_ARGS_3(hs_error_t, hs_reset_and_expand_stream, to_stream,
buf, buf_size, scratch, onEvent, context);
/** INTERNALS **/ /** INTERNALS **/
CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen); CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
CONNECT_ARGS_1(u32, Crc32c_ComputeBuf, inCrc32, buf, bufLen);
CONNECT_DISPATCH_2(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
CONNECT_ARGS_3(u32, Crc32c_ComputeBuf, inCrc32, buf, bufLen);
#pragma GCC diagnostic pop
#pragma GCC diagnostic pop

View File

@ -36,7 +36,6 @@
#include "teddy.h" #include "teddy.h"
#include "teddy_internal.h" #include "teddy_internal.h"
#include "util/arch.h" #include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h" #include "util/simd_utils.h"
#include "util/uniform_ops.h" #include "util/uniform_ops.h"
@ -120,6 +119,20 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
}; };
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
* so we force its generation.
*/
static really_inline
u64a andn(const u32 a, const u8 *b) {
u64a r;
#if defined(HAVE_BMI) && !defined(NO_ASM)
__asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
#else
r = unaligned_load_u32(b) & ~a;
#endif
return r;
}
/* generates an initial state mask based on the last byte-ish of history rather /* generates an initial state mask based on the last byte-ish of history rather
* than being all accepting. If there is no history to consider, the state is * than being all accepting. If there is no history to consider, the state is
* generated based on the minimum length of each bucket in order to prevent * generated based on the minimum length of each bucket in order to prevent
@ -147,43 +160,33 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
/* +1: the zones ensure that we can read the byte at z->end */ /* +1: the zones ensure that we can read the byte at z->end */
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
u64a domain_mask = ~domain_mask_flipped; u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach1 = andn(domain_mask_flipped, itPtr + 1);
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
u64a reach3 = andn(domain_mask_flipped, itPtr + 3);
u64a it_hi = *(const u64a *)itPtr; m128 st0 = load_m128_from_u64a(ft + reach0);
u64a it_lo = *(const u64a *)(itPtr + 8); m128 st1 = load_m128_from_u64a(ft + reach1);
u64a reach0 = domain_mask & it_hi; m128 st2 = load_m128_from_u64a(ft + reach2);
u64a reach1 = domain_mask & (it_hi >> 8); m128 st3 = load_m128_from_u64a(ft + reach3);
u64a reach2 = domain_mask & (it_hi >> 16);
u64a reach3 = domain_mask & (it_hi >> 24);
u64a reach4 = domain_mask & (it_hi >> 32);
u64a reach5 = domain_mask & (it_hi >> 40);
u64a reach6 = domain_mask & (it_hi >> 48);
u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8));
u64a reach8 = domain_mask & it_lo;
u64a reach9 = domain_mask & (it_lo >> 8);
u64a reach10 = domain_mask & (it_lo >> 16);
u64a reach11 = domain_mask & (it_lo >> 24);
u64a reach12 = domain_mask & (it_lo >> 32);
u64a reach13 = domain_mask & (it_lo >> 40);
u64a reach14 = domain_mask & (it_lo >> 48);
u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
m128 st0 = load_m128_from_u64a(ft + reach0); u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1); u64a reach5 = andn(domain_mask_flipped, itPtr + 5);
m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2); u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3); u64a reach7 = andn(domain_mask_flipped, itPtr + 7);
m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4);
m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5); m128 st4 = load_m128_from_u64a(ft + reach4);
m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6); m128 st5 = load_m128_from_u64a(ft + reach5);
m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7); m128 st6 = load_m128_from_u64a(ft + reach6);
m128 st8 = load_m128_from_u64a(ft + reach8); m128 st7 = load_m128_from_u64a(ft + reach7);
m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1);
m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2); st1 = lshiftbyte_m128(st1, 1);
m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3); st2 = lshiftbyte_m128(st2, 2);
m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4); st3 = lshiftbyte_m128(st3, 3);
m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5); st4 = lshiftbyte_m128(st4, 4);
m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6); st5 = lshiftbyte_m128(st5, 5);
m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7); st6 = lshiftbyte_m128(st6, 6);
st7 = lshiftbyte_m128(st7, 7);
st0 = or128(st0, st1); st0 = or128(st0, st1);
st2 = or128(st2, st3); st2 = or128(st2, st3);
@ -192,6 +195,39 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
st0 = or128(st0, st2); st0 = or128(st0, st2);
st4 = or128(st4, st6); st4 = or128(st4, st6);
st0 = or128(st0, st4); st0 = or128(st0, st4);
*s = or128(*s, st0);
*conf0 = movq(*s);
*s = rshiftbyte_m128(*s, 8);
*conf0 ^= ~0ULL;
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
u64a reach9 = andn(domain_mask_flipped, itPtr + 9);
u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
u64a reach11 = andn(domain_mask_flipped, itPtr + 11);
m128 st8 = load_m128_from_u64a(ft + reach8);
m128 st9 = load_m128_from_u64a(ft + reach9);
m128 st10 = load_m128_from_u64a(ft + reach10);
m128 st11 = load_m128_from_u64a(ft + reach11);
u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
u64a reach13 = andn(domain_mask_flipped, itPtr + 13);
u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
u64a reach15 = andn(domain_mask_flipped, itPtr + 15);
m128 st12 = load_m128_from_u64a(ft + reach12);
m128 st13 = load_m128_from_u64a(ft + reach13);
m128 st14 = load_m128_from_u64a(ft + reach14);
m128 st15 = load_m128_from_u64a(ft + reach15);
st9 = lshiftbyte_m128(st9, 1);
st10 = lshiftbyte_m128(st10, 2);
st11 = lshiftbyte_m128(st11, 3);
st12 = lshiftbyte_m128(st12, 4);
st13 = lshiftbyte_m128(st13, 5);
st14 = lshiftbyte_m128(st14, 6);
st15 = lshiftbyte_m128(st15, 7);
st8 = or128(st8, st9); st8 = or128(st8, st9);
st10 = or128(st10, st11); st10 = or128(st10, st11);
@ -200,14 +236,11 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
st8 = or128(st8, st10); st8 = or128(st8, st10);
st12 = or128(st12, st14); st12 = or128(st12, st14);
st8 = or128(st8, st12); st8 = or128(st8, st12);
*s = or128(*s, st8);
m128 st = or128(*s, st0); *conf8 = movq(*s);
*conf0 = movq(st) ^ ~0ULL; *s = rshiftbyte_m128(*s, 8);
st = rshiftbyte_m128(st, 8); *conf8 ^= ~0ULL;
st = or128(st, st8);
*conf8 = movq(st) ^ ~0ULL;
*s = rshiftbyte_m128(st, 8);
} }
static really_inline static really_inline
@ -215,7 +248,6 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
UNUSED const u8 *end_ptr, u32 domain_mask_flipped, UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach2 = andn(domain_mask_flipped, itPtr + 2); u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
u64a reach4 = andn(domain_mask_flipped, itPtr + 4); u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
@ -268,7 +300,6 @@ void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
UNUSED const u8 *end_ptr, u32 domain_mask_flipped, UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach4 = andn(domain_mask_flipped, itPtr + 4); u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
u64a reach8 = andn(domain_mask_flipped, itPtr + 8); u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
@ -298,7 +329,7 @@ void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
static really_inline static really_inline
void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
const u32 *confBase, const struct FDR_Runtime_Args *a, const u32 *confBase, const struct FDR_Runtime_Args *a,
const u8 *ptr, u32 *last_match_id, const struct zone *z) { const u8 *ptr, u32 *last_match_id, struct zone *z) {
const u8 bucket = 8; const u8 bucket = 8;
if (likely(!*conf)) { if (likely(!*conf)) {
@ -308,7 +339,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
/* ptr is currently referring to a location in the zone's buffer, we also /* ptr is currently referring to a location in the zone's buffer, we also
* need a pointer in the original, main buffer for the final string compare. * need a pointer in the original, main buffer for the final string compare.
*/ */
const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr) const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust);
const u8 *confLoc = ptr; const u8 *confLoc = ptr;
@ -333,7 +364,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
} }
static really_inline static really_inline
void dumpZoneInfo(UNUSED const struct zone *z, UNUSED size_t zone_id) { void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) {
#ifdef DEBUG #ifdef DEBUG
DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf); DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n", DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
@ -665,10 +696,6 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
const u8 *tryFloodDetect = zz->floodPtr; \ const u8 *tryFloodDetect = zz->floodPtr; \
const u8 *start_ptr = zz->start; \ const u8 *start_ptr = zz->start; \
const u8 *end_ptr = zz->end; \ const u8 *end_ptr = zz->end; \
for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \
itPtr += 4*ITER_BYTES) { \
__builtin_prefetch(itPtr); \
} \
\ \
for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
itPtr += ITER_BYTES) { \ itPtr += ITER_BYTES) { \
@ -712,7 +739,6 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
assert(ISALIGNED_CL(confBase)); assert(ISALIGNED_CL(confBase));
struct zone zones[ZONE_MAX]; struct zone zones[ZONE_MAX];
assert(fdr->domain > 8 && fdr->domain < 16); assert(fdr->domain > 8 && fdr->domain < 16);
memset(zones, 0, sizeof(zones));
size_t numZone = prepareZones(a->buf, a->len, size_t numZone = prepareZones(a->buf, a->len,
a->buf_history + a->len_history, a->buf_history + a->len_history,

View File

@ -44,6 +44,7 @@
#include "util/compare.h" #include "util/compare.h"
#include "util/container.h" #include "util/container.h"
#include "util/dump_mask.h" #include "util/dump_mask.h"
#include "util/make_unique.h"
#include "util/math.h" #include "util/math.h"
#include "util/noncopyable.h" #include "util/noncopyable.h"
#include "util/target_info.h" #include "util/target_info.h"
@ -98,7 +99,7 @@ public:
const FDREngineDescription &eng_in, const FDREngineDescription &eng_in,
bool make_small_in, const Grey &grey_in) bool make_small_in, const Grey &grey_in)
: eng(eng_in), grey(grey_in), tab(eng_in.getTabSizeBytes()), : eng(eng_in), grey(grey_in), tab(eng_in.getTabSizeBytes()),
lits(std::move(lits_in)), bucketToLits(std::move(bucketToLits_in)), lits(move(lits_in)), bucketToLits(move(bucketToLits_in)),
make_small(make_small_in) {} make_small(make_small_in) {}
bytecode_ptr<FDR> build(); bytecode_ptr<FDR> build();
@ -127,7 +128,7 @@ void andMask(u8 *dest, const u8 *a, const u8 *b, u32 num_bytes) {
} }
void FDRCompiler::createInitialState(FDR *fdr) { void FDRCompiler::createInitialState(FDR *fdr) {
u8 *start = reinterpret_cast<u8 *>(&fdr->start); u8 *start = (u8 *)&fdr->start;
/* initial state should to be 1 in each slot in the bucket up to bucket /* initial state should to be 1 in each slot in the bucket up to bucket
* minlen - 1, and 0 thereafter */ * minlen - 1, and 0 thereafter */
@ -136,7 +137,6 @@ void FDRCompiler::createInitialState(FDR *fdr) {
const vector<LiteralIndex> &bucket_lits = bucketToLits[b]; const vector<LiteralIndex> &bucket_lits = bucketToLits[b];
u32 min_len = ~0U; u32 min_len = ~0U;
for (const LiteralIndex &lit_idx : bucket_lits) { for (const LiteralIndex &lit_idx : bucket_lits) {
// cppcheck-suppress useStlAlgorithm
min_len = min(min_len, verify_u32(lits[lit_idx].s.length())); min_len = min(min_len, verify_u32(lits[lit_idx].s.length()));
} }
@@ -176,7 +176,7 @@ bytecode_ptr<FDR> FDRCompiler::setupFDR() {
    auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
    assert(fdr); // otherwise would have thrown std::bad_alloc
-    u8 *fdr_base = reinterpret_cast<u8 *>(fdr.get());
+    u8 *fdr_base = (u8 *)fdr.get();
    // Write header.
    fdr->size = size;
@@ -206,6 +206,7 @@ bytecode_ptr<FDR> FDRCompiler::setupFDR() {
    assert(ISALIGNED_CL(ptr));
    fdr->floodOffset = verify_u32(ptr - fdr_base);
    memcpy(ptr, floodTable.get(), floodTable.size());
+    ptr += floodTable.size(); // last write, no need to round up
    return fdr;
}
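
setupFDR lays the FDR header and its sub-tables out in one cache-line-aligned allocation, bumping a byte pointer and recording each section's offset in the header. A simplified sketch of that bump-pointer layout, with a hypothetical three-field header (the real FDR struct carries more fields; the caller frees with free()):

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>

struct Header {          // simplified stand-in for the FDR header
    uint32_t size;
    uint32_t tabOffset;
    uint32_t floodOffset;
};

static size_t roundUp(size_t x, size_t n) { return (x + n - 1) & ~(n - 1); }

// Lay out header + table + flood data in one allocation, rounding each
// section start up to a 64-byte cache line and storing its offset.
Header *layout(const void *tab, size_t tabSize,
               const void *flood, size_t floodSize) {
    size_t total = roundUp(sizeof(Header), 64) + roundUp(tabSize, 64) + floodSize;
    size_t alloc = roundUp(total, 64);          // aligned_alloc needs a multiple
    uint8_t *base = (uint8_t *)aligned_alloc(64, alloc); // C11/C++17
    memset(base, 0, alloc);
    Header *h = (Header *)base;
    uint8_t *ptr = base + roundUp(sizeof(Header), 64);
    h->tabOffset = (uint32_t)(ptr - base);
    memcpy(ptr, tab, tabSize);
    ptr += roundUp(tabSize, 64);
    h->floodOffset = (uint32_t)(ptr - base);
    memcpy(ptr, flood, floodSize);
    ptr += floodSize; // last write, no need to round up
    h->size = (uint32_t)total;
    return h;
}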
@@ -493,18 +494,18 @@ map<BucketIndex, vector<LiteralIndex>> assignStringsToBuckets(
        u32 cnt = last_id - first_id;
        // long literals first for included literals checking
        for (u32 k = 0; k < cnt; k++) {
-            litIds.emplace_back(last_id - k - 1);
+            litIds.push_back(last_id - k - 1);
        }
        i = j;
-        buckets.emplace_back(litIds);
+        buckets.push_back(litIds);
    }
    // reverse bucket id, longer literals come first
    map<BucketIndex, vector<LiteralIndex>> bucketToLits;
    size_t bucketCnt = buckets.size();
    for (size_t i = 0; i < bucketCnt; i++) {
-        bucketToLits.emplace(bucketCnt - i - 1, std::move(buckets[i]));
+        bucketToLits.emplace(bucketCnt - i - 1, move(buckets[i]));
    }
    return bucketToLits;
@@ -867,7 +868,7 @@ unique_ptr<HWLMProto> fdrBuildProtoInternal(u8 engType,
    auto bucketToLits = assignStringsToBuckets(lits, *des);
    addIncludedInfo(lits, des->getNumBuckets(), bucketToLits);
    auto proto =
-        std::make_unique<HWLMProto>(engType, std::move(des), lits, bucketToLits,
+        ue2::make_unique<HWLMProto>(engType, move(des), lits, bucketToLits,
                                    make_small);
    return proto;
}


@@ -39,7 +39,6 @@ namespace ue2 {
size_t maxLen(const vector<hwlmLiteral> &lits) {
    size_t rv = 0;
    for (const auto &lit : lits) {
-        // cppcheck-suppress useStlAlgorithm
        rv = max(rv, lit.s.size());
    }
    return rv;
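
The suppressed cppcheck useStlAlgorithm warning notes that this reduction could be phrased as an STL algorithm. An equivalent sketch, assuming hwlmLiteral::s is a std::string as elsewhere in this diff:

#include <algorithm>
#include <string>
#include <vector>

struct hwlmLiteral { std::string s; }; // minimal stand-in for the real struct

size_t maxLen(const std::vector<hwlmLiteral> &lits) {
    // Same reduction as the loop above, as max_element over string length.
    auto it = std::max_element(lits.begin(), lits.end(),
                               [](const hwlmLiteral &a, const hwlmLiteral &b) {
                                   return a.s.size() < b.s.size();
                               });
    return it == lits.end() ? 0 : it->s.size();
}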


@@ -84,10 +84,9 @@ struct FDRConfirm {
static really_inline
const u32 *getConfirmLitIndex(const struct FDRConfirm *fdrc) {
-    // cppcheck-suppress cstyleCast
    const u8 *base = (const u8 *)fdrc;
-    // cppcheck-suppress cstyleCast
-    const u32 *litIndex =(const u32 *)(base + ROUNDUP_N(sizeof(*fdrc), alignof(u32)));
+    const u32 *litIndex =
+        (const u32 *)(base + ROUNDUP_N(sizeof(*fdrc), alignof(u32)));
    assert(ISALIGNED(litIndex));
    return litIndex;
}
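
getConfirmLitIndex locates the literal-index table by rounding the FDRConfirm header size up to u32 alignment. Assuming ROUNDUP_N has the usual power-of-two round-up form, a small worked example:

#include <cassert>
#include <cstddef>

// Assumed shape of ROUNDUP_N for power-of-two n: round x up to a multiple of n.
static constexpr size_t roundup_n(size_t x, size_t n) {
    return (x + n - 1) & ~(n - 1);
}

int main() {
    // e.g. a 13-byte header followed by a u32 array: the array starts at 16.
    assert(roundup_n(13, 4) == 16);
    assert(roundup_n(16, 4) == 16); // already aligned: unchanged
    return 0;
}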


@@ -58,7 +58,7 @@ u64a make_u64a_mask(const vector<u8> &v) {
    u64a mask = 0;
    size_t vlen = v.size();
    size_t len = std::min(vlen, sizeof(mask));
-    u8 *m = reinterpret_cast<u8 *>(&mask);
+    unsigned char *m = (unsigned char *)&mask;
    memcpy(m + sizeof(mask) - len, &v[vlen - len], len);
    return mask;
}
@@ -159,10 +159,10 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
    map<u32, vector<LiteralIndex> > res2lits;
    hwlm_group_t gm = 0;
    for (LiteralIndex i = 0; i < lits.size(); i++) {
-        const LitInfo & li = tmpLitInfo[i];
+        LitInfo & li = tmpLitInfo[i];
        u32 hash = CONF_HASH_CALL(li.v, andmsk, mult, nBits);
        DEBUG_PRINTF("%016llx --> %u\n", li.v, hash);
-        res2lits[hash].emplace_back(i);
+        res2lits[hash].push_back(i);
        gm |= li.groups;
    }
@@ -245,10 +245,10 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
    fdrc->groups = gm;
    // After the FDRConfirm, we have the lit index array.
-    u8 *fdrc_base = reinterpret_cast<u8 *>(fdrc.get());
+    u8 *fdrc_base = (u8 *)fdrc.get();
    u8 *ptr = fdrc_base + sizeof(*fdrc);
    ptr = ROUNDUP_PTR(ptr, alignof(u32));
-    u32 *bitsToLitIndex = reinterpret_cast<u32 *>(ptr);
+    u32 *bitsToLitIndex = (u32 *)ptr;
    ptr += bitsToLitIndexSize;
    // After the lit index array, we have the LitInfo structures themselves,
@@ -265,7 +265,7 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
        LiteralIndex litIdx = *i;
        // Write LitInfo header.
-        LitInfo &finalLI = *(reinterpret_cast<LitInfo *>(ptr));
+        LitInfo &finalLI = *(LitInfo *)ptr;
        finalLI = tmpLitInfo[litIdx];
        ptr += sizeof(LitInfo); // String starts directly after LitInfo.
@@ -294,20 +294,22 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
                const EngineDescription &eng,
                const map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
                bool make_small) {
+    unique_ptr<TeddyEngineDescription> teddyDescr =
+        getTeddyDescription(eng.getID());
    BC2CONF bc2Conf;
    u32 totalConfirmSize = 0;
    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
        if (contains(bucketToLits, b)) {
            vector<hwlmLiteral> vl;
            for (const LiteralIndex &lit_idx : bucketToLits.at(b)) {
-                // cppcheck-suppress useStlAlgorithm
-                vl.emplace_back(lits[lit_idx]);
+                vl.push_back(lits[lit_idx]);
            }
            DEBUG_PRINTF("b %d sz %zu\n", b, vl.size());
            auto fc = getFDRConfirm(vl, make_small);
            totalConfirmSize += fc.size();
-            bc2Conf.emplace(b, std::move(fc));
+            bc2Conf.emplace(b, move(fc));
        }
    }
@@ -318,7 +320,7 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
    auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 64);
    assert(buf); // otherwise would have thrown std::bad_alloc
-    u32 *confBase = reinterpret_cast<u32 *>(buf.get());
+    u32 *confBase = (u32 *)buf.get();
    u8 *ptr = buf.get() + totalConfSwitchSize;
    assert(ISALIGNED_CL(ptr));
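
getFDRConfirm groups literals by CONF_HASH_CALL, which, judging from its arguments, masks the packed literal value, multiplies, and keeps the top nBits. A hedged sketch of that multiplicative-hash grouping (the constants below are placeholders, not the compiler's real choices):

#include <cstdint>
#include <map>
#include <vector>

using u64a = uint64_t;

// Assumed shape of CONF_HASH_CALL: mask, multiply, keep the top nBits.
static inline uint32_t conf_hash(u64a v, u64a andmsk, u64a mult, uint32_t nBits) {
    return (uint32_t)(((v & andmsk) * mult) >> (64 - nBits));
}

int main() {
    // Group literal indices by hash, as res2lits does above.
    std::map<uint32_t, std::vector<uint32_t>> res2lits;
    std::vector<u64a> packed = {0x6f6c6c6568ULL, 0x646c726f77ULL}; // "hello", "world"
    for (uint32_t i = 0; i < packed.size(); i++) {
        res2lits[conf_hash(packed[i], ~0ULL, 0x9E3779B97F4A7C15ULL, 10)].push_back(i);
    }
    return 0;
}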


@@ -1,6 +1,5 @@
/*
 * Copyright (c) 2015-2019, Intel Corporation
- * Copyright (c) 2024, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -55,14 +54,9 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
    if (likely(!start)) {
        return;
    }
-    // these cplusplus checks are needed because this is included in both fdr.c and teddy.cpp
-#ifdef __cplusplus
-    const struct LitInfo *li
-        = reinterpret_cast<const struct LitInfo *>(reinterpret_cast<const u8 *>(fdrc) + start);
-#else
    const struct LitInfo *li
        = (const struct LitInfo *)((const u8 *)fdrc + start);
-#endif
    struct hs_scratch *scratch = a->scratch;
    assert(!scratch->fdr_conf);
@@ -80,20 +74,18 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
        goto out;
    }
-    do{ // this do while is to block off the line below from the goto
-        const u8 *loc = buf + i - li->size + 1;
+    const u8 *loc = buf + i - li->size + 1;
    if (loc < buf) {
        u32 full_overhang = buf - loc;
        size_t len_history = a->len_history;
        // can't do a vectored confirm either if we don't have
        // the bytes
        if (full_overhang > len_history) {
            goto out;
        }
-        }
    }
-    }while(0);
    assert(li->size <= sizeof(CONF_TYPE));
    if (unlikely(!(li->groups & *control))) {
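
The do { ... } while(0) deleted here existed because this header is compiled as both C and C++ (the removed comment on the reinterpret_cast block says as much): in C++, a goto may not jump over the initialization of a variable that is still in scope at the label, so the declaration of loc was fenced into its own scope. A minimal illustration of the idiom:

// In C++, a goto may not bypass the initialization of a variable that is
// still in scope at the label. Scoping the declaration sidesteps the error.
int demo(bool bail) {
    if (bail) {
        goto out; // fine: 'loc' below lives in its own block
    }
    {
        int loc = 42; // initialized declaration, confined to this block
        (void)loc;
    }
out:
    return 0;
}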


@@ -74,9 +74,9 @@ void dumpLitIndex(const FDRConfirm *fdrc, FILE *f) {
static
void dumpConfirms(const void *fdr_base, u32 conf_offset, u32 num_confirms,
                  FILE *f) {
-    const u32 *conf = reinterpret_cast<const u32 *>(reinterpret_cast<const char *>(fdr_base) + conf_offset);
+    const u32 *conf = (const u32 *)((const char *)fdr_base + conf_offset);
    for (u32 i = 0; i < num_confirms; i++) {
-        const auto *fdrc = reinterpret_cast<const FDRConfirm *>(reinterpret_cast<const char *>(conf) + conf[i]);
+        const auto *fdrc = (const FDRConfirm *)((const char *)conf + conf[i]);
        fprintf(f, " confirm %u\n", i);
        fprintf(f, " andmsk 0x%016llx\n", fdrc->andmsk);
        fprintf(f, " mult 0x%016llx\n", fdrc->mult);
@@ -113,7 +113,7 @@ void dumpTeddyDupMasks(const u8 *dmsk, u32 numMasks, FILE *f) {
    u32 maskWidth = 2;
    fprintf(f, " dup nibble masks:\n");
    for (u32 i = 0; i < numMasks * 2; i++) {
-        fprintf(f, " -%u%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
+        fprintf(f, " -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
        for (u32 j = 0; j < 16 * maskWidth * 2; j++) {
            u8 val = dmsk[i * 16 * maskWidth * 2 + j];
            for (u32 k = 0; k < 8; k++) {
@@ -131,7 +131,7 @@ void dumpTeddyMasks(const u8 *baseMsk, u32 numMasks, u32 maskWidth, FILE *f) {
    // dump nibble masks
    fprintf(f, " nibble masks:\n");
    for (u32 i = 0; i < numMasks * 2; i++) {
-        fprintf(f, " -%u%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
+        fprintf(f, " -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
        for (u32 j = 0; j < 16 * maskWidth; j++) {
            u8 val = baseMsk[i * 16 * maskWidth + j];
            for (u32 k = 0; k < 8; k++) {
@@ -157,7 +157,7 @@ void dumpTeddy(const Teddy *teddy, FILE *f) {
    fprintf(f, " buckets %u\n", des->getNumBuckets());
    fprintf(f, " packed %s\n", des->packed ? "true" : "false");
    fprintf(f, " strings %u\n", teddy->numStrings);
-    fprintf(f, " size %zu bytes\n", fdrSize(reinterpret_cast<const FDR *>(teddy)));
+    fprintf(f, " size %zu bytes\n", fdrSize((const FDR *)teddy));
    fprintf(f, " max length %u\n", teddy->maxStringLen);
    fprintf(f, " floodoff %u (%x)\n", teddy->floodOffset,
            teddy->floodOffset);
@@ -165,7 +165,7 @@ void dumpTeddy(const Teddy *teddy, FILE *f) {
    u32 maskWidth = des->getNumBuckets() / 8;
    size_t headerSize = sizeof(Teddy);
-    const u8 *teddy_base = reinterpret_cast<const u8 *>(teddy);
+    const u8 *teddy_base = (const u8 *)teddy;
    const u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
    dumpTeddyMasks(baseMsk, des->numMasks, maskWidth, f);
    size_t maskLen = des->numMasks * 16 * 2 * maskWidth;
@@ -201,7 +201,7 @@ void dumpFDR(const FDR *fdr, FILE *f) {
void fdrPrintStats(const FDR *fdr, FILE *f) {
    if (fdrIsTeddy(fdr)) {
-        dumpTeddy(reinterpret_cast<const Teddy *>(fdr), f);
+        dumpTeddy((const Teddy *)fdr, f);
    } else {
        dumpFDR(fdr, f);
    }


@@ -31,6 +31,7 @@
#include "hs_compile.h"
#include "util/target_info.h"
#include "util/compare.h" // for ourisalpha()
+#include "util/make_unique.h"
#include <cassert>
#include <cstdlib>
@@ -71,7 +72,7 @@ u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) {
        } else if (num_lits < 5000) {
            // for larger but not huge sizes, go to stride 2 only if we have at
            // least minlen 3
-            desiredStride = std::min(min_len - 1, 2UL);
+            desiredStride = MIN(min_len - 1, 2);
        }
    }
@@ -195,7 +196,7 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
    }
    DEBUG_PRINTF("using engine %u\n", best->getID());
-    return std::make_unique<FDREngineDescription>(*best);
+    return ue2::make_unique<FDREngineDescription>(*best);
}
SchemeBitIndex FDREngineDescription::getSchemeBit(BucketIndex b,
@@ -221,7 +222,7 @@ unique_ptr<FDREngineDescription> getFdrDescription(u32 engineID) {
        return nullptr;
    }
-    return std::make_unique<FDREngineDescription>(allDescs[engineID]);
+    return ue2::make_unique<FDREngineDescription>(allDescs[engineID]);
}
} // namespace ue2
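
The stride hunk above trades MIN(min_len - 1, 2) for std::min(min_len - 1, 2UL): std::min deduces one template type for both arguments, so the literal must match size_t's width, while the MIN macro compares mixed types silently. A minimal illustration, assuming the LP64 model (size_t == unsigned long) used by the original build:

#include <algorithm>
#include <cstddef>

size_t desired_stride(size_t min_len) {
    // std::min<T> needs both arguments to be the same T; with min_len a
    // size_t, a plain int literal 2 would fail to compile, hence 2UL.
    return std::min(min_len - 1, 2UL);
}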


@@ -208,8 +208,8 @@ bytecode_ptr<u8> setupFDRFloodControl(const vector<hwlmLiteral> &lits,
    auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 16);
    assert(buf); // otherwise would have thrown std::bad_alloc
-    u32 *floodHeader = reinterpret_cast<u32 *>(buf.get());
-    FDRFlood *layoutFlood = reinterpret_cast<FDRFlood *>(buf.get() + floodHeaderSize);
+    u32 *floodHeader = (u32 *)buf.get();
+    FDRFlood *layoutFlood = (FDRFlood *)(buf.get() + floodHeaderSize);
    u32 currentFloodIndex = 0;
    for (const auto &m : flood2chars) {

@@ -1,6 +1,5 @@
/*
 * Copyright (c) 2015-2017, Intel Corporation
- * Copyright (c) 2024, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -38,13 +37,6 @@
#define FLOOD_MINIMUM_SIZE 256
#define FLOOD_BACKOFF_START 32
-// this is because this file is included in both fdr.c and teddy.cpp
-#if defined __cplusplus
-#define CU64A_P_CAST(X) reinterpret_cast<const u64a*>(X)
-#else
-#define CU64A_P_CAST(X) (const u64a *)(X)
-#endif
static really_inline
const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
    // if we don't have a flood at either the start or end,
@@ -55,18 +47,18 @@ const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
    /* entry points in runtime.c prefetch relevant data */
#ifndef FLOOD_32
-    u64a x11 = *CU64A_P_CAST(ROUNDUP_PTR(buf, 8));
-    u64a x12 = *CU64A_P_CAST(ROUNDUP_PTR(buf+8, 8));
+    u64a x11 = *(const u64a *)ROUNDUP_PTR(buf, 8);
+    u64a x12 = *(const u64a *)ROUNDUP_PTR(buf+8, 8);
    if (x11 == x12) {
        return buf + floodBackoff;
    }
-    u64a x21 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len/2, 8));
-    u64a x22 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len/2 + 8, 8));
+    u64a x21 = *(const u64a *)ROUNDUP_PTR(buf + len/2, 8);
+    u64a x22 = *(const u64a *)ROUNDUP_PTR(buf + len/2 + 8, 8);
    if (x21 == x22) {
        return buf + floodBackoff;
    }
-    u64a x31 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len - 24, 8));
-    u64a x32 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len - 16, 8));
+    u64a x31 = *(const u64a *)ROUNDUP_PTR(buf + len - 24, 8);
+    u64a x32 = *(const u64a *)ROUNDUP_PTR(buf + len - 16, 8);
    if (x31 == x32) {
        return buf + floodBackoff;
    }
@@ -114,15 +106,9 @@ const u8 * floodDetect(const struct FDR * fdr,
    // go from c to our FDRFlood structure
    u8 c = buf[i];
-#ifdef __cplusplus
-    const u8 * fBase = (reinterpret_cast<const u8 *>(fdr)) + fdr->floodOffset;
-    u32 fIdx = (reinterpret_cast<const u32 *>(fBase))[c];
-    const struct FDRFlood * fsb = reinterpret_cast<const struct FDRFlood *>(fBase + sizeof(u32) * 256);
-#else
    const u8 * fBase = ((const u8 *)fdr) + fdr->floodOffset;
    u32 fIdx = ((const u32 *)fBase)[c];
    const struct FDRFlood * fsb = (const struct FDRFlood *)(fBase + sizeof(u32) * 256);
-#endif
    const struct FDRFlood * fl = &fsb[fIdx];
#ifndef FLOOD_32
@@ -130,7 +116,7 @@ const u8 * floodDetect(const struct FDR * fdr,
        cmpVal |= cmpVal << 8;
        cmpVal |= cmpVal << 16;
        cmpVal |= cmpVal << 32;
-        u64a probe = *CU64A_P_CAST(ROUNDUP_PTR(buf+i, 8));
+        u64a probe = *(const u64a *)ROUNDUP_PTR(buf+i, 8);
#else
        u32 cmpVal = c;
        cmpVal |= cmpVal << 8;
@@ -153,16 +139,16 @@ const u8 * floodDetect(const struct FDR * fdr,
#ifndef FLOOD_32
    j -= (u32)((uintptr_t)buf + j) & 0x7; // push j back to yield 8-aligned addrs
    for (; j + 32 < mainLoopLen; j += 32) {
-        u64a v = *CU64A_P_CAST(buf + j);
-        u64a v2 = *CU64A_P_CAST(buf + j + 8);
-        u64a v3 = *CU64A_P_CAST(buf + j + 16);
-        u64a v4 = *CU64A_P_CAST(buf + j + 24);
+        u64a v = *(const u64a *)(buf + j);
+        u64a v2 = *(const u64a *)(buf + j + 8);
+        u64a v3 = *(const u64a *)(buf + j + 16);
+        u64a v4 = *(const u64a *)(buf + j + 24);
        if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) {
            break;
        }
    }
    for (; j + 8 < mainLoopLen; j += 8) {
-        u64a v = *CU64A_P_CAST(buf + j);
+        u64a v = *(const u64a *)(buf + j);
        if (v != cmpVal) {
            break;
        }
@@ -186,11 +172,7 @@ const u8 * floodDetect(const struct FDR * fdr,
    }
#endif
    for (; j < mainLoopLen; j++) {
-#ifdef __cplusplus
-        u8 v = *(reinterpret_cast<const u8 *>(buf + j));
-#else
        u8 v = *(const u8 *)(buf + j);
-#endif
        if (v != c) {
            break;
        }
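
floodDetect replicates the suspect byte across a 64-bit word (the cmpVal shift ladder above) so eight input bytes can be tested per comparison. The same splat-and-compare step in isolation, using memcpy to sidestep unaligned-access concerns:

#include <cstdint>
#include <cstring>

// Replicate one byte across a 64-bit word: 0x41 -> 0x4141414141414141.
static inline uint64_t splat8(uint8_t c) {
    uint64_t v = c;
    v |= v << 8;
    v |= v << 16;
    v |= v << 32;
    return v;
}

// True if all eight bytes starting at p equal c.
static inline bool eight_equal(const uint8_t *p, uint8_t c) {
    uint64_t w;
    memcpy(&w, p, sizeof(w)); // avoids unaligned-load undefined behaviour
    return w == splat8(c);
}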

src/fdr/teddy.c (Normal file, 1114 lines)

File diff suppressed because it is too large.


@@ -1,862 +0,0 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2024, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Teddy literal matcher: SSSE3 engine runtime.
*/
#include "fdr_internal.h"
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "teddy_runtime_common.h"
#include "util/arch.h"
#include "util/simd_utils.h"
#ifdef ARCH_64_BIT
static really_inline
hwlm_error_t conf_chunk_64(u64a chunk, u8 bucket, u8 offset,
CautionReason reason, const u8 *pt,
const u32* confBase,
const struct FDR_Runtime_Args *a,
hwlm_group_t *control,
u32 *last_match) {
if (unlikely(chunk != ones_u64a)) {
chunk = ~chunk;
do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
control, last_match);
// adapted from CHECK_HWLM_TERMINATE_MATCHING
if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
return HWLM_TERMINATED;
}
}
return HWLM_SUCCESS;
}
#define CONF_CHUNK_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
if(conf_chunk_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
#else // 32/64
static really_inline
hwlm_error_t conf_chunk_32(u32 chunk, u8 bucket, u8 offset,
CautionReason reason, const u8 *pt,
const u32* confBase,
const struct FDR_Runtime_Args *a,
hwlm_group_t *control,
u32 *last_match) {
if (unlikely(chunk != ones_u32)) {
chunk = ~chunk;
do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
control, last_match);
// adapted from CHECK_HWLM_TERMINATE_MATCHING
if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
return HWLM_TERMINATED;
}
}
return HWLM_SUCCESS;
}
#define CONF_CHUNK_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
if(conf_chunk_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
#endif
#if defined(HAVE_AVX512VBMI) || defined(HAVE_AVX512) // common to both 512b's
static really_inline
const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
+ ROUNDUP_CL(2 * numMask * sizeof(m256)));
}
#ifdef ARCH_64_BIT
static really_inline
hwlm_error_t confirm_teddy_64_512(m512 var, u8 bucket, u8 offset,
CautionReason reason, const u8 *ptr,
const struct FDR_Runtime_Args *a,
const u32* confBase, hwlm_group_t *control,
u32 *last_match) {
if (unlikely(diff512(var, ones512()))) {
m128 p128_0 = extract128from512(var, 0);
m128 p128_1 = extract128from512(var, 1);
m128 p128_2 = extract128from512(var, 2);
m128 p128_3 = extract128from512(var, 3);
u64a part1 = movq(p128_0);
u64a part2 = movq(rshiftbyte_m128(p128_0, 8));
u64a part3 = movq(p128_1);
u64a part4 = movq(rshiftbyte_m128(p128_1, 8));
u64a part5 = movq(p128_2);
u64a part6 = movq(rshiftbyte_m128(p128_2, 8));
u64a part7 = movq(p128_3);
u64a part8 = movq(rshiftbyte_m128(p128_3, 8));
CONF_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_64(part2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_64(part3, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_64(part4, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_64(part5, bucket, offset + 32, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_64(part6, bucket, offset + 40, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_64(part7, bucket, offset + 48, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_64(part8, bucket, offset + 56, reason, ptr, confBase, a, control, last_match);
}
return HWLM_SUCCESS;
}
#define confirm_teddy_512_f confirm_teddy_64_512
#else // 32/64
static really_inline
hwlm_error_t confirm_teddy_32_512(m512 var, u8 bucket, u8 offset,
CautionReason reason, const u8 *ptr,
const struct FDR_Runtime_Args *a,
const u32* confBase, hwlm_group_t *control,
u32 *last_match) {
if (unlikely(diff512(var, ones512()))) {
m128 p128_0 = extract128from512(var, 0);
m128 p128_1 = extract128from512(var, 1);
m128 p128_2 = extract128from512(var, 2);
m128 p128_3 = extract128from512(var, 3);
u32 part1 = movd(p128_0);
u32 part2 = movd(rshiftbyte_m128(p128_0, 4));
u32 part3 = movd(rshiftbyte_m128(p128_0, 8));
u32 part4 = movd(rshiftbyte_m128(p128_0, 12));
u32 part5 = movd(p128_1);
u32 part6 = movd(rshiftbyte_m128(p128_1, 4));
u32 part7 = movd(rshiftbyte_m128(p128_1, 8));
u32 part8 = movd(rshiftbyte_m128(p128_1, 12));
u32 part9 = movd(p128_2);
u32 part10 = movd(rshiftbyte_m128(p128_2, 4));
u32 part11 = movd(rshiftbyte_m128(p128_2, 8));
u32 part12 = movd(rshiftbyte_m128(p128_2, 12));
u32 part13 = movd(p128_3);
u32 part14 = movd(rshiftbyte_m128(p128_3, 4));
u32 part15 = movd(rshiftbyte_m128(p128_3, 8));
u32 part16 = movd(rshiftbyte_m128(p128_3, 12));
CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part9, bucket, offset + 32, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part10, bucket, offset + 36, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part11, bucket, offset + 40, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part12, bucket, offset + 44, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part13, bucket, offset + 48, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part14, bucket, offset + 52, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part15, bucket, offset + 56, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part16, bucket, offset + 60, reason, ptr, confBase, a, control, last_match);
}
return HWLM_SUCCESS;
}
#define confirm_teddy_512_f confirm_teddy_32_512
#endif // 32/64
#define CONFIRM_TEDDY_512(...) if(confirm_teddy_512_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
#endif // AVX512VBMI or AVX512
#if defined(HAVE_AVX512VBMI) // VBMI strong teddy
#define TEDDY_VBMI_SL1_MASK 0xfffffffffffffffeULL
#define TEDDY_VBMI_SL2_MASK 0xfffffffffffffffcULL
#define TEDDY_VBMI_SL3_MASK 0xfffffffffffffff8ULL
template<int NMSK>
static really_inline
m512 prep_conf_teddy_512vbmi_templ(const m512 *lo_mask, const m512 *dup_mask,
const m512 *sl_msk, const m512 val) {
m512 lo = and512(val, *lo_mask);
m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),
pshufb_m512(dup_mask[1], hi));
if constexpr (NMSK == 1) return shuf_or_b0;
m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),
pshufb_m512(dup_mask[3], hi));
m512 sl1 = maskz_vpermb512(TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
if constexpr (NMSK == 2) return (or512(sl1, shuf_or_b0));
m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),
pshufb_m512(dup_mask[5], hi));
m512 sl2 = maskz_vpermb512(TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
if constexpr (NMSK == 3) return (or512(sl2, or512(sl1, shuf_or_b0)));
m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),
pshufb_m512(dup_mask[7], hi));
m512 sl3 = maskz_vpermb512(TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
return (or512(sl3, or512(sl2, or512(sl1, shuf_or_b0))));
}
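// (Editorial sketch of the routine above.) This is the core Teddy step: the
// input is split into low and high nibbles, each nibble indexes a PSHUFB
// table per mask, and the per-mask results are byte-shifted so the bits for
// every byte position of a literal line up on its final byte before being
// OR'd together. The results are match-failure masks, so a bit stays clear
// only where all NMSK byte positions matched; the maskz_vpermb512 calls with
// the TEDDY_VBMI_SL*_MASK constants perform those byte shifts across the
// full 512-bit vector.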
#define TEDDY_VBMI_SL1_POS 15
#define TEDDY_VBMI_SL2_POS 14
#define TEDDY_VBMI_SL3_POS 13
#define TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffffffffffULL >> n_sh)
#define TEDDY_VBMI_CONF_MASK_FULL (0xffffffffffffffffULL << n_sh)
#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap)
#define TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffffffffffULL >> (64 - n_sh))
template<int NMSK>
hwlm_error_t fdr_exec_teddy_512vbmi_templ(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = ones_u32;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 64;
u32 n_sh = NMSK - 1;
const size_t loopBytes = 64 - n_sh;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
m512 lo_mask = set1_64x8(0xf);
m512 dup_mask[NMSK * 2];
m512 sl_msk[NMSK - 1];
dup_mask[0] = set1_4x128(maskBase[0]);
dup_mask[1] = set1_4x128(maskBase[1]);
if constexpr (NMSK > 1){
dup_mask[2] = set1_4x128(maskBase[2]);
dup_mask[3] = set1_4x128(maskBase[3]);
sl_msk[0] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL1_POS);
}
if constexpr (NMSK > 2){
dup_mask[4] = set1_4x128(maskBase[4]);
dup_mask[5] = set1_4x128(maskBase[5]);
sl_msk[1] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL2_POS);
}
if constexpr (NMSK > 3){
dup_mask[6] = set1_4x128(maskBase[6]);
dup_mask[7] = set1_4x128(maskBase[7]);
sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS);
}
const u32 *confBase = getConfBase(teddy);
u64a k = TEDDY_VBMI_CONF_MASK_FULL;
m512 p_mask = set_mask_m512(~k);
u32 overlap = 0;
u64a patch = 0;
if (likely(ptr + loopBytes <= buf_end)) {
m512 p_mask0 = set_mask_m512(~TEDDY_VBMI_CONF_MASK_HEAD);
m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, loadu512(ptr));
r_0 = or512(r_0, p_mask0);
CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
ptr += loopBytes;
overlap = n_sh;
patch = TEDDY_VBMI_LOAD_MASK_PATCH;
}
for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {
__builtin_prefetch(ptr - n_sh + (64 * 2));
CHECK_FLOOD;
m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, loadu512(ptr - n_sh));
r_0 = or512(r_0, p_mask);
CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr - n_sh);
}
assert(ptr + loopBytes > buf_end);
if (ptr < buf_end) {
u32 left = (u32)(buf_end - ptr);
u64a k1 = TEDDY_VBMI_CONF_MASK_VAR(left);
m512 p_mask1 = set_mask_m512(~k1);
m512 val_0 = loadu_maskz_m512(k1 | patch, ptr - overlap);
m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, val_0);
r_0 = or512(r_0, p_mask1);
CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr - overlap);
}
return HWLM_SUCCESS;
}
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_512vbmi_templ
#elif defined(HAVE_AVX512) // AVX512 reinforced teddy
/* both 512b versions use the same confirm teddy */
template <int NMSK>
static inline
m512 shift_or_512_templ(const m512 *dup_mask, m512 lo, m512 hi) {
return or512(lshift128_m512(or512(pshufb_m512(dup_mask[(NMSK - 1) * 2], lo),
pshufb_m512(dup_mask[(NMSK * 2) - 1], hi)),
NMSK - 1), shift_or_512_templ<NMSK - 1>(dup_mask, lo, hi));
}
template <>
m512 shift_or_512_templ<1>(const m512 *dup_mask, m512 lo, m512 hi){
return or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi));
}
template <int NMSK>
static really_inline
m512 prep_conf_teddy_no_reinforcement_512_templ(const m512 *lo_mask,
const m512 *dup_mask,
const m512 val) {
m512 lo = and512(val, *lo_mask);
m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
return shift_or_512_templ<NMSK>(dup_mask, lo, hi);
}
template <int NMSK>
static really_inline
m512 prep_conf_teddy_512_templ(const m512 *lo_mask, const m512 *dup_mask,
const u8 *ptr, const u64a *r_msk_base,
u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
m512 lo = and512(load512(ptr), *lo_mask);
m512 hi = and512(rshift64_m512(load512(ptr), 4), *lo_mask);
*c_16 = *(ptr + 15);
*c_32 = *(ptr + 31);
*c_48 = *(ptr + 47);
m512 r_msk = set8x64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],
0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);
*c_0 = *(ptr + 63);
return or512(shift_or_512_templ<NMSK>(dup_mask, lo, hi), r_msk);
}
#define PREP_CONF_FN_512(ptr, n) \
prep_conf_teddy_512_templ<n>(&lo_mask, dup_mask, ptr, r_msk_base, \
&c_0, &c_16, &c_32, &c_48)
template <int NMSK>
hwlm_error_t fdr_exec_teddy_512_templ(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = ones_u32;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 128;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
m512 lo_mask = set1_64x8(0xf);
m512 dup_mask[NMSK * 2];
dup_mask[0] = set1_4x128(maskBase[0]);
dup_mask[1] = set1_4x128(maskBase[1]);
if constexpr (NMSK > 1){
dup_mask[2] = set1_4x128(maskBase[2]);
dup_mask[3] = set1_4x128(maskBase[3]);
}
if constexpr (NMSK > 2){
dup_mask[4] = set1_4x128(maskBase[4]);
dup_mask[5] = set1_4x128(maskBase[5]);
}
if constexpr (NMSK > 3){
dup_mask[6] = set1_4x128(maskBase[6]);
dup_mask[7] = set1_4x128(maskBase[7]);
}
const u32 *confBase = getConfBase(teddy);
const u64a *r_msk_base = getReinforcedMaskBase(teddy, NMSK);
u32 c_0 = 0x100;
u32 c_16 = 0x100;
u32 c_32 = 0x100;
u32 c_48 = 0x100;
const u8 *mainStart = ROUNDUP_PTR(ptr, 64);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 64;
m512 p_mask;
m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset,
a->buf, buf_end,
a->buf_history, a->len_history, NMSK);
m512 r_0 = prep_conf_teddy_no_reinforcement_512_templ<NMSK>(&lo_mask, dup_mask, val_0);
r_0 = or512(r_0, p_mask);
CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
ptr += 64;
}
if (ptr + 64 <= buf_end) {
m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
ptr += 64;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes * 4));
CHECK_FLOOD;
m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr);
m512 r_1 = PREP_CONF_FN_512(ptr + 64, NMSK);
CONFIRM_TEDDY_512(r_1, 8, 64, NOT_CAUTIOUS, ptr);
}
if (ptr + 64 <= buf_end) {
m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr);
ptr += 64;
}
assert(ptr + 64 > buf_end);
if (ptr < buf_end) {
m512 p_mask;
m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end,
a->buf_history, a->len_history, NMSK);
m512 r_0 = prep_conf_teddy_no_reinforcement_512_templ<NMSK>(&lo_mask, dup_mask,val_0);
r_0 = or512(r_0, p_mask);
CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
}
return HWLM_SUCCESS;
}
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_512_templ
/* #endif // AVX512 vs AVX512VBMI * back to the original fully exclusive logic */
#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
#ifdef ARCH_64_BIT
hwlm_error_t confirm_teddy_64_256(m256 var, u8 bucket, u8 offset,
CautionReason reason, const u8 *ptr,
const struct FDR_Runtime_Args *a,
const u32* confBase, hwlm_group_t *control,
u32 *last_match) {
if (unlikely(diff256(var, ones256()))) {
m128 lo = movdq_lo(var);
m128 hi = movdq_hi(var);
u64a part1 = movq(lo);
u64a part2 = movq(rshiftbyte_m128(lo, 8));
u64a part3 = movq(hi);
u64a part4 = movq(rshiftbyte_m128(hi, 8));
CONF_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_64(part2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_64(part3, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_64(part4, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
}
return HWLM_SUCCESS;
}
#define confirm_teddy_256_f confirm_teddy_64_256
#else
hwlm_error_t confirm_teddy_32_256(m256 var, u8 bucket, u8 offset,
CautionReason reason, const u8 *ptr,
const struct FDR_Runtime_Args *a,
const u32* confBase, hwlm_group_t *control,
u32 *last_match) {
if (unlikely(diff256(var, ones256()))) {
m128 lo = movdq_lo(var);
m128 hi = movdq_hi(var);
u32 part1 = movd(lo);
u32 part2 = movd(rshiftbyte_m128(lo, 4));
u32 part3 = movd(rshiftbyte_m128(lo, 8));
u32 part4 = movd(rshiftbyte_m128(lo, 12));
u32 part5 = movd(hi);
u32 part6 = movd(rshiftbyte_m128(hi, 4));
u32 part7 = movd(rshiftbyte_m128(hi, 8));
u32 part8 = movd(rshiftbyte_m128(hi, 12));
CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
}
return HWLM_SUCCESS;
}
#define confirm_teddy_256_f confirm_teddy_32_256
#endif
#define CONFIRM_TEDDY_256(...) if(confirm_teddy_256_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
/*
static really_inline
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
m128 p_mask128;
m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
buf_history, len_history, nMasks));
*p_mask = set1_2x128(p_mask128);
return ret;
}
*/
template <int NMSK>
static inline
m256 shift_or_256_templ(const m256 *dup_mask, m256 lo, m256 hi){
return or256(lshift128_m256(or256(pshufb_m256(dup_mask[(NMSK-1)*2], lo),
pshufb_m256(dup_mask[(NMSK*2)-1], hi)),
(NMSK-1)), shift_or_256_templ<NMSK-1>(dup_mask, lo, hi));
}
template<>
m256 shift_or_256_templ<1>(const m256 *dup_mask, m256 lo, m256 hi){
return or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi));
}
template <int NMSK>
static really_inline
m256 prep_conf_teddy_no_reinforcement_256_templ(const m256 *lo_mask,
const m256 *dup_mask,
const m256 val) {
m256 lo = and256(val, *lo_mask);
m256 hi = and256(rshift64_m256(val, 4), *lo_mask);
return shift_or_256_templ<NMSK>(dup_mask, lo, hi);
}
template <int NMSK>
static really_inline
m256 prep_conf_teddy_256_templ(const m256 *lo_mask, const m256 *dup_mask,
const u8 *ptr, const u64a *r_msk_base,
u32 *c_0, u32 *c_128) {
m256 lo = and256(load256(ptr), *lo_mask);
m256 hi = and256(rshift64_m256(load256(ptr), 4), *lo_mask);
*c_128 = *(ptr + 15);
m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]);
*c_0 = *(ptr + 31);
return or256(shift_or_256_templ<NMSK>(dup_mask, lo, hi), r_msk);
}
#define PREP_CONF_FN_256_NO_REINFORCEMENT(val, n) \
prep_conf_teddy_no_reinforcement_256_templ<n>(&lo_mask, dup_mask, val)
#define PREP_CONF_FN_256(ptr, n) \
prep_conf_teddy_256_templ<n>(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
template <int NMSK>
hwlm_error_t fdr_exec_teddy_256_templ(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = ones_u32;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 64;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
//PREPARE_MASKS_256;
m256 lo_mask = set1_32x8(0xf);
m256 dup_mask[NMSK * 2];
dup_mask[0] = set1_2x128(maskBase[0]);
dup_mask[1] = set1_2x128(maskBase[1]);
if constexpr (NMSK > 1){
dup_mask[2] = set1_2x128(maskBase[2]);
dup_mask[3] = set1_2x128(maskBase[3]);
}
if constexpr (NMSK > 2){
dup_mask[4] = set1_2x128(maskBase[4]);
dup_mask[5] = set1_2x128(maskBase[5]);
}
if constexpr (NMSK > 3){
dup_mask[6] = set1_2x128(maskBase[6]);
dup_mask[7] = set1_2x128(maskBase[7]);
}
const u32 *confBase = getConfBase(teddy);
const u64a *r_msk_base = getReinforcedMaskBase(teddy, NMSK);
u32 c_0 = 0x100;
u32 c_128 = 0x100;
const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 32;
m256 p_mask;
m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset,
a->buf, buf_end,
a->buf_history, a->len_history, NMSK);
m256 r_0 = PREP_CONF_FN_256_NO_REINFORCEMENT(val_0, NMSK);
r_0 = or256(r_0, p_mask);
CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
ptr += 32;
}
if (ptr + 32 <= buf_end) {
m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
ptr += 32;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes * 4));
CHECK_FLOOD;
m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
CONFIRM_TEDDY_256(r_0, 8, 0, NOT_CAUTIOUS, ptr);
m256 r_1 = PREP_CONF_FN_256(ptr + 32, NMSK);
CONFIRM_TEDDY_256(r_1, 8, 32, NOT_CAUTIOUS, ptr);
}
if (ptr + 32 <= buf_end) {
m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
CONFIRM_TEDDY_256(r_0, 8, 0, NOT_CAUTIOUS, ptr);
ptr += 32;
}
assert(ptr + 32 > buf_end);
if (ptr < buf_end) {
m256 p_mask;
m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end,
a->buf_history, a->len_history, NMSK);
m256 r_0 = PREP_CONF_FN_256_NO_REINFORCEMENT(val_0, NMSK);
r_0 = or256(r_0, p_mask);
CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
}
return HWLM_SUCCESS;
}
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_256_templ
#else // not defined HAVE_AVX2
#ifdef ARCH_64_BIT
static really_inline
hwlm_error_t confirm_teddy_64_128(m128 var, u8 bucket, u8 offset,
CautionReason reason, const u8 *ptr,
const struct FDR_Runtime_Args *a,
const u32* confBase, hwlm_group_t *control,
u32 *last_match) {
if (unlikely(diff128(var, ones128()))) {
u64a lo = 0;
u64a hi = 0;
u64a __attribute__((aligned(16))) vec[2];
store128(vec, var);
lo = vec[0];
hi = vec[1];
CONF_CHUNK_64(lo, bucket, offset, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_64(hi, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
}
return HWLM_SUCCESS;
}
#define confirm_teddy_128_f confirm_teddy_64_128
#else // 32/64
static really_inline
hwlm_error_t confirm_teddy_32_128(m128 var, u8 bucket, u8 offset,
CautionReason reason, const u8 *ptr,
const struct FDR_Runtime_Args *a,
const u32* confBase, hwlm_group_t *control,
u32 *last_match) {
if (unlikely(diff128(var, ones128()))) {
u32 part1 = movd(var);
u32 part2 = movd(rshiftbyte_m128(var, 4));
u32 part3 = movd(rshiftbyte_m128(var, 8));
u32 part4 = movd(rshiftbyte_m128(var, 12));
CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
}
return HWLM_SUCCESS;
}
#define confirm_teddy_128_f confirm_teddy_32_128
#endif // 32/64
#define CONFIRM_TEDDY_128(...) if(confirm_teddy_128_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
template <int NMSK>
static really_inline
m128 prep_conf_teddy_128_templ(const m128 *maskBase, m128 val) {
m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r1 = or128(pshufb_m128(maskBase[0 * 2], lo),
pshufb_m128(maskBase[0 * 2 + 1], hi));
if constexpr (NMSK == 1) return r1;
m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo),
pshufb_m128(maskBase[1 * 2 + 1], hi));
m128 old_1 = zeroes128();
m128 res_shifted_1 = palignr(res_1, old_1, 16 - 1);
m128 r2 = or128(r1, res_shifted_1);
if constexpr (NMSK == 2) return r2;
m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo),
pshufb_m128(maskBase[2 * 2 + 1], hi));
m128 res_shifted_2 = palignr(res_2, old_1, 16 - 2);
m128 r3 = or128(r2, res_shifted_2);
if constexpr (NMSK == 3) return r3;
m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo),
pshufb_m128(maskBase[3 * 2 + 1], hi));
m128 res_shifted_3 = palignr(res_3, old_1, 16 - 3);
return or128(r3, res_shifted_3);
}
template <int NMSK>
hwlm_error_t fdr_exec_teddy_128_templ(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = ones_u32;
const struct Teddy *teddy = reinterpret_cast<const struct Teddy *>(fdr);
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy);
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset,
a->buf, buf_end,
a->buf_history, a->len_history, NMSK);
m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, val_0);
r_0 = or128(r_0, p_mask);
CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
ptr += 16;
}
if (ptr + 16 <= buf_end) {
m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes * 4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
CONFIRM_TEDDY_128(r_0, 8, 0, NOT_CAUTIOUS, ptr);
m128 r_1 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr + 16));
CONFIRM_TEDDY_128(r_1, 8, 16, NOT_CAUTIOUS, ptr);
}
if (ptr + 16 <= buf_end) {
m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
CONFIRM_TEDDY_128(r_0, 8, 0, NOT_CAUTIOUS, ptr);
ptr += 16;
}
assert(ptr + 16 > buf_end);
if (ptr < buf_end) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end,
a->buf_history, a->len_history, NMSK);
m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, val_0);
r_0 = or128(r_0, p_mask);
CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
}
return HWLM_SUCCESS;
}
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_128_templ
#endif // HAVE_AVX2 HAVE_AVX512
extern "C" {
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_TEDDY_FN<1>(fdr, a, control);
}
hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_TEDDY_FN<1>(fdr, a, control);
}
hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_TEDDY_FN<2>(fdr, a, control);
}
hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_TEDDY_FN<2>(fdr, a, control);
}
hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_TEDDY_FN<3>(fdr, a, control);
}
hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_TEDDY_FN<3>(fdr, a, control);
}
hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_TEDDY_FN<4>(fdr, a, control);
}
hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_TEDDY_FN<4>(fdr, a, control);
}
} // extern
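
The extern "C" block above pins stable, C-callable symbol names onto the NMSK-templated kernels, so each mask count gets a fully specialized instantiation (its if constexpr branches folded away) behind a plain C entry point. A minimal sketch of that dispatch pattern, with hypothetical names:

#include <cstdio>

// Compile-time specialized kernel: NMSK is a template parameter, so each
// instantiation discards the branches it does not need.
template <int NMSK>
int exec_templ(const char *buf) {
    if constexpr (NMSK > 1) {
        // extra per-mask work exists only in the NMSK > 1 instantiations
    }
    return buf ? NMSK : -1;
}

// Stable C-linkage entry points, one per mask count, as in the block above.
extern "C" int exec_msks1(const char *buf) { return exec_templ<1>(buf); }
extern "C" int exec_msks2(const char *buf) { return exec_templ<2>(buf); }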


@@ -1,6 +1,5 @@
/*
 * Copyright (c) 2016-2017, Intel Corporation
- * Copyright (c) 2024, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -40,10 +39,6 @@
struct FDR; // forward declaration from fdr_internal.h
struct FDR_Runtime_Args;
-#ifdef __cplusplus
-extern "C" {
-#endif
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control);
@@ -111,8 +106,5 @@ hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
                                          hwlm_group_t control);
#endif /* HAVE_AVX2 */
-#ifdef __cplusplus
-}
-#endif
#endif /* TEDDY_H_ */
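
The guard deleted from teddy.h above is the standard idiom for exposing C-linkage declarations to both C and C++ translation units. For reference, its minimal form with a hypothetical declaration:

/* Standard dual-language linkage guard, as removed from teddy.h above. */
#ifdef __cplusplus
extern "C" {
#endif

int my_c_api(int x); /* hypothetical C-linkage declaration */

#ifdef __cplusplus
} /* extern "C" */
#endif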

src/fdr/teddy_avx2.c (Normal file, 709 lines)

@@ -0,0 +1,709 @@
/*
* Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Teddy literal matcher: AVX2 engine runtime.
*/
#include "fdr_internal.h"
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "teddy_runtime_common.h"
#include "util/arch.h"
#include "util/simd_utils.h"
#if defined(HAVE_AVX2)
const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
};
#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy
#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn) \
do { \
if (unlikely(chunk != ones_u64a)) { \
chunk = ~chunk; \
conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} while(0)
#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn) \
do { \
if (unlikely(chunk != ones_u32)) { \
chunk = ~chunk; \
conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} while(0)
static really_inline
const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
+ ROUNDUP_CL(2 * numMask * sizeof(m256)));
}
#else
#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn) \
do { \
if (unlikely(chunk != ones_u64a)) { \
chunk = ~chunk; \
conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} while(0)
#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn) \
do { \
if (unlikely(chunk != ones_u32)) { \
chunk = ~chunk; \
conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} while(0)
static really_inline
const m256 *getMaskBase_fat(const struct Teddy *teddy) {
return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}
#endif
#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy
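/* vpermb index table: interleaves byte i of the low 256-bit half with byte i
 * of the high half (0, 32, 1, 33, ...), so that the two result bytes that
 * belong to the same input position become adjacent before the 64-bit/32-bit
 * confirm chunks are extracted below. */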
const u8 ALIGN_AVX_DIRECTIVE p_mask_interleave[64] = {
0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
};
#ifdef ARCH_64_BIT
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \
do { \
if (unlikely(diff512(var, ones512()))) { \
m512 msk_interleave = load512(p_mask_interleave); \
m512 r = vpermb512(msk_interleave, var); \
m128 r0 = extract128from512(r, 0); \
m128 r1 = extract128from512(r, 1); \
m128 r2 = extract128from512(r, 2); \
m128 r3 = extract128from512(r, 3); \
u64a part1 = movq(r0); \
u64a part2 = extract64from128(r0, 1); \
u64a part3 = movq(r1); \
u64a part4 = extract64from128(r1, 1); \
u64a part5 = movq(r2); \
u64a part6 = extract64from128(r2, 1); \
u64a part7 = movq(r3); \
u64a part8 = extract64from128(r3, 1); \
CONF_FAT_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn); \
CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, pt, conf_fn); \
CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, pt, conf_fn); \
CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, pt, conf_fn); \
CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, pt, conf_fn); \
CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, pt, conf_fn); \
CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, pt, conf_fn); \
CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, pt, conf_fn); \
} \
} while(0)
#else
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \
do { \
if (unlikely(diff512(var, ones512()))) { \
m512 msk_interleave = load512(p_mask_interleave); \
m512 r = vpermb512(msk_interleave, var); \
m128 r0 = extract128from512(r, 0); \
m128 r1 = extract128from512(r, 1); \
m128 r2 = extract128from512(r, 2); \
m128 r3 = extract128from512(r, 3); \
u32 part1 = movd(r0); \
u32 part2 = extract32from128(r0, 1); \
u32 part3 = extract32from128(r0, 2); \
u32 part4 = extract32from128(r0, 3); \
u32 part5 = movd(r1); \
u32 part6 = extract32from128(r1, 1); \
u32 part7 = extract32from128(r1, 2); \
u32 part8 = extract32from128(r1, 3); \
u32 part9 = movd(r2); \
u32 part10 = extract32from128(r2, 1); \
u32 part11 = extract32from128(r2, 2); \
u32 part12 = extract32from128(r2, 3); \
u32 part13 = movd(r3); \
u32 part14 = extract32from128(r3, 1); \
u32 part15 = extract32from128(r3, 2); \
u32 part16 = extract32from128(r3, 3); \
CONF_FAT_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn); \
CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, pt, conf_fn); \
CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, pt, conf_fn); \
CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, pt, conf_fn); \
CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, pt, conf_fn); \
CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, pt, conf_fn); \
CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, pt, conf_fn); \
CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, pt, conf_fn); \
CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, pt, conf_fn); \
CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, pt, conf_fn);\
CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, pt, conf_fn);\
CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, pt, conf_fn);\
CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, pt, conf_fn);\
CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, pt, conf_fn);\
CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, pt, conf_fn);\
CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, pt, conf_fn);\
} \
} while(0)
#endif
#define PREP_FAT_SHUF_MASK \
m512 lo = and512(val, *lo_mask); \
m512 hi = and512(rshift64_m512(val, 4), *lo_mask)
#define FAT_TEDDY_VBMI_PSHUFB_OR_M1 \
m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo), \
pshufb_m512(dup_mask[1], hi));
#define FAT_TEDDY_VBMI_PSHUFB_OR_M2 \
FAT_TEDDY_VBMI_PSHUFB_OR_M1 \
m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo), \
pshufb_m512(dup_mask[3], hi));
#define FAT_TEDDY_VBMI_PSHUFB_OR_M3 \
FAT_TEDDY_VBMI_PSHUFB_OR_M2 \
m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo), \
pshufb_m512(dup_mask[5], hi));
#define FAT_TEDDY_VBMI_PSHUFB_OR_M4 \
FAT_TEDDY_VBMI_PSHUFB_OR_M3 \
m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo), \
pshufb_m512(dup_mask[7], hi));
#define FAT_TEDDY_VBMI_SL1_MASK 0xfffffffefffffffeULL
#define FAT_TEDDY_VBMI_SL2_MASK 0xfffffffcfffffffcULL
#define FAT_TEDDY_VBMI_SL3_MASK 0xfffffff8fffffff8ULL
#define FAT_TEDDY_VBMI_SHIFT_M1
#define FAT_TEDDY_VBMI_SHIFT_M2 \
FAT_TEDDY_VBMI_SHIFT_M1 \
m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
#define FAT_TEDDY_VBMI_SHIFT_M3 \
FAT_TEDDY_VBMI_SHIFT_M2 \
m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
#define FAT_TEDDY_VBMI_SHIFT_M4 \
FAT_TEDDY_VBMI_SHIFT_M3 \
m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
#define FAT_SHIFT_OR_M1 \
shuf_or_b0
#define FAT_SHIFT_OR_M2 \
or512(sl1, FAT_SHIFT_OR_M1)
#define FAT_SHIFT_OR_M3 \
or512(sl2, FAT_SHIFT_OR_M2)
#define FAT_SHIFT_OR_M4 \
or512(sl3, FAT_SHIFT_OR_M3)
static really_inline
m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
UNUSED const m512 *sl_msk, const m512 val) {
PREP_FAT_SHUF_MASK;
FAT_TEDDY_VBMI_PSHUFB_OR_M1;
FAT_TEDDY_VBMI_SHIFT_M1;
return FAT_SHIFT_OR_M1;
}
static really_inline
m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
const m512 *sl_msk, const m512 val) {
PREP_FAT_SHUF_MASK;
FAT_TEDDY_VBMI_PSHUFB_OR_M2;
FAT_TEDDY_VBMI_SHIFT_M2;
return FAT_SHIFT_OR_M2;
}
static really_inline
m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
const m512 *sl_msk, const m512 val) {
PREP_FAT_SHUF_MASK;
FAT_TEDDY_VBMI_PSHUFB_OR_M3;
FAT_TEDDY_VBMI_SHIFT_M3;
return FAT_SHIFT_OR_M3;
}
static really_inline
m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
const m512 *sl_msk, const m512 val) {
PREP_FAT_SHUF_MASK;
FAT_TEDDY_VBMI_PSHUFB_OR_M4;
FAT_TEDDY_VBMI_SHIFT_M4;
return FAT_SHIFT_OR_M4;
}
#define PREP_CONF_FAT_FN(val, n) \
prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, sl_msk, val)
#define FAT_TEDDY_VBMI_SL1_POS 15
#define FAT_TEDDY_VBMI_SL2_POS 14
#define FAT_TEDDY_VBMI_SL3_POS 13
#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1
#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \
FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1 \
sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS);
#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \
FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \
sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS);
#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M4 \
FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \
sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS);
/*
 * Fat teddy needs 2 bytes to represent the result of each position, so each
 * nibble's fat teddy mask (for example, the one for the lo nibble of the
 * last byte) has 16x2 bytes:
 * |----------------------------------|----------------------------------|
 *  16 bytes (bucket 0..7 per byte)     16 bytes (bucket 8..15 per byte)
 *                 A                                  B
 * At runtime, fat teddy reads 16 bytes at a time and duplicates them to
 * 32 bytes:
 * |----------------------------------|----------------------------------|
 *  16 bytes input data (lo nibbles)    16 bytes duplicated (lo nibbles)
 *                 X                                  X
 * then does pshufb_m256(AB, XX).
 *
 * In AVX512-reinforced fat teddy, it reads 32 bytes at a time and duplicates
 * them to 64 bytes:
 * |----------------|----------------|----------------|----------------|
 *         X                Y                X                Y
 * In this case we need DUP_FAT_MASK to construct AABB:
 * |----------------|----------------|----------------|----------------|
 *         A                A                B                B
 * then does pshufb_m512(AABB, XYXY).
 */
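/*
 * A minimal scalar sketch of the lookup described above (illustrative only,
 * not part of the engine; the helper name and the separate A/B table
 * arguments are hypothetical). Each mask table maps a nibble value to a
 * "miss" bitmap: a set bit means that bucket cannot match at this position,
 * so OR-ing the lo- and hi-nibble lookups mirrors
 * or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi)).
 */
#if 0
static u16 fat_teddy_scalar_miss(const u8 lo_a[16], const u8 hi_a[16],
                                 const u8 lo_b[16], const u8 hi_b[16],
                                 u8 c) {
    u8 lo = c & 0xf;  /* low nibble indexes the lo-nibble tables */
    u8 hi = c >> 4;   /* high nibble indexes the hi-nibble tables */
    u16 miss_a = lo_a[lo] | hi_a[hi]; /* buckets 0..7  (half A) */
    u16 miss_b = lo_b[lo] | hi_b[hi]; /* buckets 8..15 (half B) */
    return (u16)(miss_a | (miss_b << 8)); /* 2 result bytes per position */
}
#endif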
#define PREPARE_FAT_MASKS(n) \
m512 lo_mask = set64x8(0xf); \
m512 sl_msk[n - 1]; \
FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M##n
#define FAT_TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffULL >> n_sh)
#define FAT_TEDDY_VBMI_CONF_MASK_FULL ((0xffffffffULL << n_sh) & 0xffffffffULL)
#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap)
#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffULL >> (32 - n_sh))
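/* Confirm masks for the three scan phases: HEAD covers the first (cautious)
 * block, FULL is reused for every full iteration (masking off the n_sh bytes
 * of overlap carried from the previous block), VAR covers the n valid bytes
 * of a short tail, and LOAD_MASK_PATCH re-enables the n_sh overlap bytes in
 * the tail's masked load. */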
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \
do { \
const u8 *buf_end = a->buf + a->len; \
const u8 *ptr = a->buf + a->start_offset; \
u32 floodBackoff = FLOOD_BACKOFF_START; \
const u8 *tryFloodDetect = a->firstFloodDetect; \
u32 last_match = ones_u32; \
const struct Teddy *teddy = (const struct Teddy *)fdr; \
const size_t iterBytes = 32; \
u32 n_sh = n_msk - 1; \
const size_t loopBytes = 32 - n_sh; \
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
a->buf, a->len, a->start_offset); \
\
const m512 *dup_mask = getDupMaskBase(teddy, n_msk); \
PREPARE_FAT_MASKS(n_msk); \
const u32 *confBase = getConfBase(teddy); \
\
u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL; \
m512 p_mask = set_mask_m512(~((k << 32) | k)); \
u32 overlap = 0; \
u64a patch = 0; \
if (likely(ptr + loopBytes <= buf_end)) { \
u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD; \
m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0)); \
m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr)), n_msk); \
r_0 = or512(r_0, p_mask0); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr, conf_fn); \
ptr += loopBytes; \
overlap = n_sh; \
patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH; \
} \
\
for (; ptr + loopBytes <= buf_end; ptr += loopBytes) { \
CHECK_FLOOD; \
m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr - n_sh)), n_msk); \
r_0 = or512(r_0, p_mask); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn); \
} \
\
assert(ptr + loopBytes > buf_end); \
if (ptr < buf_end) { \
u32 left = (u32)(buf_end - ptr); \
u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left); \
m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1)); \
m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap)); \
m512 r_0 = PREP_CONF_FAT_FN(val_0, n_msk); \
r_0 = or512(r_0, p_mask1); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr - overlap, conf_fn); \
} \
\
return HWLM_SUCCESS; \
} while(0)
#else // !HAVE_AVX512VBMI, AVX2 normal fat teddy
#ifdef ARCH_64_BIT
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
do { \
if (unlikely(diff256(var, ones256()))) { \
m256 swap = swap128in256(var); \
m256 r = interleave256lo(var, swap); \
u64a part1 = extractlow64from256(r); \
u64a part2 = extract64from256(r, 1); \
r = interleave256hi(var, swap); \
u64a part3 = extractlow64from256(r); \
u64a part4 = extract64from256(r, 1); \
CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \
CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \
CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \
} \
} while(0)
#else
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
do { \
if (unlikely(diff256(var, ones256()))) { \
m256 swap = swap128in256(var); \
m256 r = interleave256lo(var, swap); \
u32 part1 = extractlow32from256(r); \
u32 part2 = extract32from256(r, 1); \
u32 part3 = extract32from256(r, 2); \
u32 part4 = extract32from256(r, 3); \
r = interleave256hi(var, swap); \
u32 part5 = extractlow32from256(r); \
u32 part6 = extract32from256(r, 1); \
u32 part7 = extract32from256(r, 2); \
u32 part8 = extract32from256(r, 3); \
CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \
CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \
CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \
CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \
CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \
CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \
CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \
} \
} while(0)
#endif
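/* Cautious load for the fat (2x128) layout: one guarded 128-bit load, with
 * both the loaded data and its validity mask broadcast to the two lanes. */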
static really_inline
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
m128 p_mask128;
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
buf_history, len_history, nMasks));
*p_mask = set2x128(p_mask128);
return ret;
}
static really_inline
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
return or256(pshufb_m256(maskBase[0 * 2], lo),
pshufb_m256(maskBase[0 * 2 + 1], hi));
}
static really_inline
m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m1(maskBase, val);
m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
pshufb_m256(maskBase[1 * 2 + 1], hi));
m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
*old_1 = res_1;
return or256(r, res_shifted_1);
}
static really_inline
m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
pshufb_m256(maskBase[2 * 2 + 1], hi));
m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
*old_2 = res_2;
return or256(r, res_shifted_2);
}
static really_inline
m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 *old_3, m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
pshufb_m256(maskBase[3 * 2 + 1], hi));
m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
*old_3 = res_3;
return or256(r, res_shifted_3);
}
#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \
do { \
} while(0)
#define FDR_EXEC_FAT_TEDDY_RES_OLD_2 \
m256 res_old_1 = zeroes256();
#define FDR_EXEC_FAT_TEDDY_RES_OLD_3 \
m256 res_old_1 = zeroes256(); \
m256 res_old_2 = zeroes256();
#define FDR_EXEC_FAT_TEDDY_RES_OLD_4 \
m256 res_old_1 = zeroes256(); \
m256 res_old_2 = zeroes256(); \
m256 res_old_3 = zeroes256();
#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n
#define PREP_CONF_FAT_FN_1(mask_base, val) \
prep_conf_fat_teddy_m1(mask_base, val)
#define PREP_CONF_FAT_FN_2(mask_base, val) \
prep_conf_fat_teddy_m2(mask_base, &res_old_1, val)
#define PREP_CONF_FAT_FN_3(mask_base, val) \
prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
#define PREP_CONF_FAT_FN_4(mask_base, val) \
prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
#define PREP_CONF_FAT_FN(mask_base, val, n) \
PREP_CONF_FAT_FN_##n(mask_base, val)
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \
do { \
const u8 *buf_end = a->buf + a->len; \
const u8 *ptr = a->buf + a->start_offset; \
u32 floodBackoff = FLOOD_BACKOFF_START; \
const u8 *tryFloodDetect = a->firstFloodDetect; \
u32 last_match = ones_u32; \
const struct Teddy *teddy = (const struct Teddy *)fdr; \
const size_t iterBytes = 32; \
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
a->buf, a->len, a->start_offset); \
\
const m256 *maskBase = getMaskBase_fat(teddy); \
const u32 *confBase = getConfBase(teddy); \
\
FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk); \
const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
if (ptr < mainStart) { \
ptr = mainStart - 16; \
m256 p_mask; \
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset, \
a->buf, buf_end, \
a->buf_history, a->len_history, \
n_msk); \
m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
r_0 = or256(r_0, p_mask); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
ptr += 16; \
} \
\
if (ptr + 16 <= buf_end) { \
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
ptr += 16; \
} \
\
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
__builtin_prefetch(ptr + (iterBytes * 4)); \
CHECK_FLOOD; \
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk); \
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn); \
} \
\
if (ptr + 16 <= buf_end) { \
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
ptr += 16; \
} \
\
assert(ptr + 16 > buf_end); \
if (ptr < buf_end) { \
m256 p_mask; \
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end, \
a->buf_history, a->len_history, \
n_msk); \
m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
r_0 = or256(r_0, p_mask); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
} \
\
return HWLM_SUCCESS; \
} while(0)
#endif // HAVE_AVX512VBMI
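/* Entry points: one per supported mask count (1..4), each in a plain and a
 * packed-confirm (_pck) flavour; all expand the same scan loop above. */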
hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}
#endif // HAVE_AVX2

View File

@@ -46,6 +46,7 @@
 #include "util/alloc.h"
 #include "util/compare.h"
 #include "util/container.h"
+#include "util/make_unique.h"
 #include "util/noncopyable.h"
 #include "util/popcount.h"
 #include "util/small_vector.h"
@@ -88,7 +89,7 @@ public:
 const TeddyEngineDescription &eng_in, bool make_small_in,
 const Grey &grey_in)
 : eng(eng_in), grey(grey_in), lits(lits_in),
-bucketToLits(std::move(bucketToLits_in)), make_small(make_small_in) {}
+bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {}
 bytecode_ptr<FDR> build();
 };
@@ -165,7 +166,7 @@ public:
 nibbleSets[i * 2] = nibbleSets[i * 2 + 1] = 0xffff;
 }
 }
-litIds.emplace_back(lit_id);
+litIds.push_back(lit_id);
 sort_and_unique(litIds);
 }
@@ -328,7 +329,7 @@ bool pack(const vector<hwlmLiteral> &lits,
 static
 void initReinforcedTable(u8 *rmsk) {
-u64a *mask = reinterpret_cast<u64a *>(rmsk);
+u64a *mask = (u64a *)rmsk;
 fill_n(mask, N_CHARS, 0x00ffffffffffffffULL);
 }
@@ -514,7 +515,7 @@ void fillReinforcedTable(const map<BucketIndex,
 u8 *rtable_base, const u32 num_tables) {
 vector<u8 *> tables;
 for (u32 i = 0; i < num_tables; i++) {
-tables.emplace_back(rtable_base + i * RTABLE_SIZE);
+tables.push_back(rtable_base + i * RTABLE_SIZE);
 }
 for (auto t : tables) {
@@ -576,8 +577,8 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
 auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
 assert(fdr); // otherwise would have thrown std::bad_alloc
-Teddy *teddy = reinterpret_cast<Teddy *>(fdr.get()); // ugly
-u8 *teddy_base = reinterpret_cast<u8 *>(teddy);
+Teddy *teddy = (Teddy *)fdr.get(); // ugly
+u8 *teddy_base = (u8 *)teddy;
 // Write header.
 teddy->size = size;
@@ -597,7 +598,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
 assert(ISALIGNED_CL(ptr));
 teddy->floodOffset = verify_u32(ptr - teddy_base);
 memcpy(ptr, floodTable.get(), floodTable.size());
+ptr += floodTable.size();
 // Write teddy masks.
 u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
@@ -622,7 +623,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
 static
 bool assignStringsToBuckets(
 const vector<hwlmLiteral> &lits,
-const TeddyEngineDescription &eng,
+TeddyEngineDescription &eng,
 map<BucketIndex, vector<LiteralIndex>> &bucketToLits) {
 assert(eng.numMasks <= MAX_NUM_MASKS);
 if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
@@ -676,7 +677,7 @@ unique_ptr<HWLMProto> teddyBuildProtoHinted(
 return nullptr;
 }
-return std::make_unique<HWLMProto>(engType, std::move(des), lits,
+return ue2::make_unique<HWLMProto>(engType, move(des), lits,
 bucketToLits, make_small);
 }

View File

@@ -34,6 +34,7 @@
 #include "fdr_engine_description.h"
 #include "teddy_internal.h"
 #include "teddy_engine_description.h"
+#include "util/make_unique.h"
 #include <cmath>
@@ -52,14 +53,14 @@ u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {
 void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
 static const TeddyEngineDef defns[] = {
-{ 3, HS_CPU_FEATURES_AVX2, 1, 16, false },
-{ 4, HS_CPU_FEATURES_AVX2, 1, 16, true },
-{ 5, HS_CPU_FEATURES_AVX2, 2, 16, false },
-{ 6, HS_CPU_FEATURES_AVX2, 2, 16, true },
-{ 7, HS_CPU_FEATURES_AVX2, 3, 16, false },
-{ 8, HS_CPU_FEATURES_AVX2, 3, 16, true },
-{ 9, HS_CPU_FEATURES_AVX2, 4, 16, false },
-{ 10, HS_CPU_FEATURES_AVX2, 4, 16, true },
+{ 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false },
+{ 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true },
+{ 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false },
+{ 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true },
+{ 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false },
+{ 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true },
+{ 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false },
+{ 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true },
 { 11, 0, 1, 8, false },
 { 12, 0, 1, 8, true },
 { 13, 0, 2, 8, false },
@@ -71,7 +72,6 @@ void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
 };
 out->clear();
 for (const auto &def : defns) {
-// cppcheck-suppress useStlAlgorithm
 out->emplace_back(def);
 }
 }
@@ -124,7 +124,6 @@ bool isAllowed(const vector<hwlmLiteral> &vl, const TeddyEngineDescription &eng,
 u32 n_small_lits = 0;
 for (const auto &lit : vl) {
 if (lit.s.length() < eng.numMasks) {
-// cppcheck-suppress useStlAlgorithm
 n_small_lits++;
 }
 }
@@ -198,7 +197,7 @@ chooseTeddyEngine(const target_t &target, const vector<hwlmLiteral> &vl) {
 }
 DEBUG_PRINTF("using engine %u\n", best->getID());
-return std::make_unique<TeddyEngineDescription>(*best);
+return ue2::make_unique<TeddyEngineDescription>(*best);
 }
 unique_ptr<TeddyEngineDescription> getTeddyDescription(u32 engineID) {
@@ -206,9 +205,8 @@ unique_ptr<TeddyEngineDescription> getTeddyDescription(u32 engineID) {
 getTeddyDescriptions(&descs);
 for (const auto &desc : descs) {
-// cppcheck-suppress useStlAlgorithm
 if (desc.getID() == engineID) {
-return std::make_unique<TeddyEngineDescription>(desc);
+return ue2::make_unique<TeddyEngineDescription>(desc);
 }
 }

View File

@@ -39,7 +39,7 @@ namespace ue2 {
 #define TEDDY_BUCKET_LOAD 6
-struct TeddyEngineDef { //NOLINT (clang-analyzer-optin.performance.Padding)
+struct TeddyEngineDef {
 u32 id;
 u64a cpu_features;
 u32 numMasks;

View File

@@ -1,570 +0,0 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2024, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/* fat teddy for AVX2 and AVX512VBMI */
#include "fdr_internal.h"
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "teddy_runtime_common.h"
#include "util/arch.h"
#include "util/simd_utils.h"
#if defined(HAVE_AVX2)
#ifdef ARCH_64_BIT
static really_inline
hwlm_error_t conf_chunk_64(u64a chunk, u8 bucket, u8 offset,
CautionReason reason, const u8 *pt,
const u32* confBase,
const struct FDR_Runtime_Args *a,
hwlm_group_t *control,
u32 *last_match) {
if (unlikely(chunk != ones_u64a)) {
chunk = ~chunk;
do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
control, last_match);
// adapted from CHECK_HWLM_TERMINATE_MATCHING
if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
return HWLM_TERMINATED;
}
}
return HWLM_SUCCESS;
}
#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
if(conf_chunk_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
#else
static really_inline
hwlm_error_t conf_chunk_32(u32 chunk, u8 bucket, u8 offset,
CautionReason reason, const u8 *pt,
const u32* confBase,
const struct FDR_Runtime_Args *a,
hwlm_group_t *control,
u32 *last_match) {
if (unlikely(chunk != ones_u32)) {
chunk = ~chunk;
do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
control, last_match);
// adapted from CHECK_HWLM_TERMINATE_MATCHING
if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
return HWLM_TERMINATED;
}
}
return HWLM_SUCCESS;
}
#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
if(conf_chunk_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
#endif
#if defined(HAVE_AVX512VBMI) // VBMI strong teddy
// fat 512 teddy is only with vbmi
static really_inline
const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
+ ROUNDUP_CL(2 * numMask * sizeof(m256)));
}
const u8 ALIGN_CL_DIRECTIVE p_mask_interleave[64] = {
0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
};
#ifdef ARCH_64_BIT
hwlm_error_t confirm_fat_teddy_64_512(m512 var, u8 bucket, u8 offset,
CautionReason reason, const u8 *ptr,
const struct FDR_Runtime_Args *a,
const u32* confBase, hwlm_group_t *control,
u32 *last_match) {
if (unlikely(diff512(var, ones512()))) {
m512 msk_interleave = load512(p_mask_interleave);
m512 r = vpermb512(msk_interleave, var);
m128 r0 = extract128from512(r, 0);
m128 r1 = extract128from512(r, 1);
m128 r2 = extract128from512(r, 2);
m128 r3 = extract128from512(r, 3);
u64a part1 = movq(r0);
u64a part2 = extract64from128(r0, 1);
u64a part3 = movq(r1);
u64a part4 = extract64from128(r1, 1);
u64a part5 = movq(r2);
u64a part6 = extract64from128(r2, 1);
u64a part7 = movq(r3);
u64a part8 = extract64from128(r3, 1);
CONF_FAT_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
}
return HWLM_SUCCESS;
}
#define confirm_fat_teddy_512_f confirm_fat_teddy_64_512
#else // 32-64
hwlm_error_t confirm_fat_teddy_32_512(m512 var, u8 bucket, u8 offset,
CautionReason reason, const u8 *ptr,
const struct FDR_Runtime_Args *a,
const u32* confBase, hwlm_group_t *control,
u32 *last_match) {
if (unlikely(diff512(var, ones512()))) {
m512 msk_interleave = load512(p_mask_interleave);
m512 r = vpermb512(msk_interleave, var);
m128 r0 = extract128from512(r, 0);
m128 r1 = extract128from512(r, 1);
m128 r2 = extract128from512(r, 2);
m128 r3 = extract128from512(r, 3);
u32 part1 = movd(r0);
u32 part2 = extract32from128(r0, 1);
u32 part3 = extract32from128(r0, 2);
u32 part4 = extract32from128(r0, 3);
u32 part5 = movd(r1);
u32 part6 = extract32from128(r1, 1);
u32 part7 = extract32from128(r1, 2);
u32 part8 = extract32from128(r1, 3);
u32 part9 = movd(r2);
u32 part10 = extract32from128(r2, 1);
u32 part11 = extract32from128(r2, 2);
u32 part12 = extract32from128(r2, 3);
u32 part13 = movd(r3);
u32 part14 = extract32from128(r3, 1);
u32 part15 = extract32from128(r3, 2);
u32 part16 = extract32from128(r3, 3);
CONF_FAT_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, ptr, confBase, a, control, last_match);
}
return HWLM_SUCCESS;
}
#define confirm_fat_teddy_512_f confirm_fat_teddy_32_512
#endif // 32/64
#define CONFIRM_FAT_TEDDY_512(...) if(confirm_fat_teddy_512_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
#define TEDDY_VBMI_SL1_MASK 0xfffffffffffffffeULL
#define TEDDY_VBMI_SL2_MASK 0xfffffffffffffffcULL
#define TEDDY_VBMI_SL3_MASK 0xfffffffffffffff8ULL
#define FAT_TEDDY_VBMI_SL1_MASK 0xfffffffefffffffeULL
#define FAT_TEDDY_VBMI_SL2_MASK 0xfffffffcfffffffcULL
#define FAT_TEDDY_VBMI_SL3_MASK 0xfffffff8fffffff8ULL
#define FAT_TEDDY_VBMI_SL1_POS 15
#define FAT_TEDDY_VBMI_SL2_POS 14
#define FAT_TEDDY_VBMI_SL3_POS 13
#define FAT_TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffULL >> n_sh)
#define FAT_TEDDY_VBMI_CONF_MASK_FULL ((0xffffffffULL << n_sh) & 0xffffffffULL)
#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap)
#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffULL >> (32 - n_sh))
template<int NMSK>
static really_inline
m512 prep_conf_fat_teddy_512vbmi_templ(const m512 *lo_mask, const m512 *dup_mask,
const m512 *sl_msk, const m512 val) {
m512 lo = and512(val, *lo_mask);
m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),
pshufb_m512(dup_mask[1], hi));
if constexpr (NMSK == 1) return shuf_or_b0;
m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),
pshufb_m512(dup_mask[3], hi));
m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
if constexpr (NMSK == 2) return (or512(sl1, shuf_or_b0));
m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),
pshufb_m512(dup_mask[5], hi));
m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
if constexpr (NMSK == 3) return (or512(sl2, or512(sl1, shuf_or_b0)));
m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),
pshufb_m512(dup_mask[7], hi));
m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
return (or512(sl3, or512(sl2, or512(sl1, shuf_or_b0))));
}
#define TEDDY_VBMI_SL1_POS 15
#define TEDDY_VBMI_SL2_POS 14
#define TEDDY_VBMI_SL3_POS 13
#define TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffffffffffULL >> n_sh)
#define TEDDY_VBMI_CONF_MASK_FULL (0xffffffffffffffffULL << n_sh)
#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap)
#define TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffffffffffULL >> (64 - n_sh))
template<int NMSK>
hwlm_error_t fdr_exec_fat_teddy_512vbmi_templ(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = ones_u32;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
u32 n_sh = NMSK - 1;
const size_t loopBytes = 32 - n_sh;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m512 *dup_mask = getDupMaskBase(teddy, NMSK);
m512 lo_mask = set1_64x8(0xf);
m512 sl_msk[NMSK - 1];
if constexpr (NMSK > 1){
sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS);
}
if constexpr (NMSK > 2){
sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS);
}
if constexpr (NMSK > 3){
sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS);
}
const u32 *confBase = getConfBase(teddy);
u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL;
m512 p_mask = set_mask_m512(~((k << 32) | k));
u32 overlap = 0;
u64a patch = 0;
if (likely(ptr + loopBytes <= buf_end)) {
u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD;
m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0));
m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, set2x256(loadu_maskz_m256(k0, ptr)));
r_0 = or512(r_0, p_mask0);
CONFIRM_FAT_TEDDY_512(r_0, 16, 0, VECTORING, ptr);
ptr += loopBytes;
overlap = n_sh;
patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH;
}
for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {
CHECK_FLOOD;
m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, set2x256(loadu256(ptr - n_sh)));
r_0 = or512(r_0, p_mask);
CONFIRM_FAT_TEDDY_512(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh);
}
assert(ptr + loopBytes > buf_end);
if (ptr < buf_end) {
u32 left = (u32)(buf_end - ptr);
u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left);
m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1));
m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap));
m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, val_0);
r_0 = or512(r_0, p_mask1);
CONFIRM_FAT_TEDDY_512(r_0, 16, 0, VECTORING, ptr - overlap);
}
return HWLM_SUCCESS;
}
#define FDR_EXEC_FAT_TEDDY_FN fdr_exec_fat_teddy_512vbmi_templ
#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
#ifdef ARCH_64_BIT
extern "C" {
hwlm_error_t confirm_fat_teddy_64_256(m256 var, u8 bucket, u8 offset,
CautionReason reason, const u8 *ptr,
const struct FDR_Runtime_Args *a,
const u32* confBase, hwlm_group_t *control,
u32 *last_match) {
if (unlikely(diff256(var, ones256()))) {
m256 swap = swap128in256(var);
m256 r = interleave256lo(var, swap);
u64a part1 = extractlow64from256(r);
u64a part2 = extract64from256(r, 1);
r = interleave256hi(var, swap);
u64a part3 = extractlow64from256(r);
u64a part4 = extract64from256(r, 1);
CONF_FAT_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
}
return HWLM_SUCCESS;
}
} // extern C
#define confirm_fat_teddy_256_f confirm_fat_teddy_64_256
#else
extern "C" {
hwlm_error_t confirm_fat_teddy_32_256(m256 var, u8 bucket, u8 offset,
CautionReason reason, const u8 *ptr,
const struct FDR_Runtime_Args *a,
const u32* confBase, hwlm_group_t *control,
u32 *last_match) {
if (unlikely(diff256(var, ones256()))) {
m256 swap = swap128in256(var);
m256 r = interleave256lo(var, swap);
u32 part1 = extractlow32from256(r);
u32 part2 = extract32from256(r, 1);
u32 part3 = extract32from256(r, 2);
u32 part4 = extract32from256(r, 3);
r = interleave256hi(var, swap);
u32 part5 = extractlow32from256(r);
u32 part6 = extract32from256(r, 1);
u32 part7 = extract32from256(r, 2);
u32 part8 = extract32from256(r, 3);
CONF_FAT_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, ptr, confBase, a, control, last_match);
}
return HWLM_SUCCESS;
}
} // extern C
#define confirm_fat_teddy_256_f confirm_fat_teddy_32_256
#endif
#define CONFIRM_FAT_TEDDY_256(...) if(confirm_fat_teddy_256_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
static really_inline
const m256 *getMaskBase_fat(const struct Teddy *teddy) {
return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}
static really_inline
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
m128 p_mask128;
m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
buf_history, len_history, nMasks));
*p_mask = set1_2x128(p_mask128);
return ret;
}
template<int NMSK>
static really_inline
m256 prep_conf_fat_teddy_256_templ(const m256 *maskBase, m256 val,
m256* old_1, m256* old_2, m256* old_3){
m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = or256(pshufb_m256(maskBase[0 * 2], lo),
pshufb_m256(maskBase[0 * 2 + 1], hi));
if constexpr (NMSK == 1) return r;
m256 res_1 = or256(pshufb_m256(maskBase[(NMSK-1) * 2], lo),
pshufb_m256(maskBase[(NMSK-1) * 2 + 1], hi));
m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - (NMSK-1));
*old_1 = res_1;
r = or256(r, res_shifted_1);
if constexpr (NMSK == 2) return r;
m256 res_2 = or256(pshufb_m256(maskBase[(NMSK-1) * 2], lo),
pshufb_m256(maskBase[(NMSK-1) * 2 + 1], hi));
m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - (NMSK-1));
*old_2 = res_2;
r = or256(r, res_shifted_2);
if constexpr (NMSK == 3) return r;
m256 res_3 = or256(pshufb_m256(maskBase[(NMSK-1) * 2], lo),
pshufb_m256(maskBase[(NMSK-1) * 2 + 1], hi));
m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - (NMSK-1));
*old_3 = res_3;
return or256(r, res_shifted_3);
}
template<int NMSK>
hwlm_error_t fdr_exec_fat_teddy_256_templ(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = ones_u32;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m256 *maskBase = getMaskBase_fat(teddy);
const u32 *confBase = getConfBase(teddy);
m256 res_old_1 = zeroes256();
m256 res_old_2 = zeroes256();
m256 res_old_3 = zeroes256();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset,
a->buf, buf_end,
a->buf_history, a->len_history,
NMSK);
m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, val_0, &res_old_1, &res_old_2, &res_old_3);
r_0 = or256(r_0, p_mask);
CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
ptr += 16;
}
if (ptr + 16 <= buf_end) {
m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
ptr += 16;
}
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes * 4));
CHECK_FLOOD;
m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
CONFIRM_FAT_TEDDY_256(r_0, 16, 0, NOT_CAUTIOUS, ptr);
m256 r_1 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr + 16), &res_old_1, &res_old_2, &res_old_3);
CONFIRM_FAT_TEDDY_256(r_1, 16, 16, NOT_CAUTIOUS, ptr);
}
if (ptr + 16 <= buf_end) {
m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
CONFIRM_FAT_TEDDY_256(r_0, 16, 0, NOT_CAUTIOUS, ptr);
ptr += 16;
}
assert(ptr + 16 > buf_end);
if (ptr < buf_end) {
m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end,
a->buf_history, a->len_history,
NMSK);
m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, val_0, &res_old_1, &res_old_2, &res_old_3);
r_0 = or256(r_0, p_mask);
CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
}
return HWLM_SUCCESS;
}
// this check is because it is possible to build with both AVX512VBMI and AVX2 defined,
// to replicate the behaviour of the original flow of control we give preference
// to the former. If we're building for both then this will be compiled multiple times
// with the desired variant defined by itself.
#ifndef FDR_EXEC_FAT_TEDDY_FN
#define FDR_EXEC_FAT_TEDDY_FN fdr_exec_fat_teddy_256_templ
#endif
#endif // HAVE_AVX2 for fat teddy
/* we only have fat teddy in these two modes */
// #if (defined(HAVE_AVX2) || defined(HAVE_AVX512VBMI)) && defined(FDR_EXEC_FAT_TEDDY_FN)
// #if defined(FDR_EXEC_FAT_TEDDY_FN)
extern "C" {
hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_FAT_TEDDY_FN<1>(fdr, a, control);
}
hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_FAT_TEDDY_FN<1>(fdr, a, control);
}
hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_FAT_TEDDY_FN<2>(fdr, a, control);
}
hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_FAT_TEDDY_FN<2>(fdr, a, control);
}
hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_FAT_TEDDY_FN<3>(fdr, a, control);
}
hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_FAT_TEDDY_FN<3>(fdr, a, control);
}
hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_FAT_TEDDY_FN<4>(fdr, a, control);
}
hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
return FDR_EXEC_FAT_TEDDY_FN<4>(fdr, a, control);
}
} // extern c
#endif // HAVE_AVX2 from the beginning

View File

@@ -1,6 +1,5 @@
 /*
 * Copyright (c) 2016-2020, Intel Corporation
-* Copyright (c) 2024, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -41,6 +40,10 @@
 #include "util/simd_utils.h"
 #include "util/uniform_ops.h"
+extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
+#if defined(HAVE_AVX2)
+extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
+#endif
 #if defined(HAVE_AVX512VBMI)
 static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
@ -139,37 +142,6 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
// |----------|-------|----------------|............| // |----------|-------|----------------|............|
// 0 start start+offset end(<=16) // 0 start start+offset end(<=16)
// p_mask ffff.....ffffff..ff0000...........00ffff.......... // p_mask ffff.....ffffff..ff0000...........00ffff..........
// replaces the p_mask_arr table.
// m is the length of the zone of bytes == 0; n is the offset where that zone
// begins. More specifically, there are 16-n bytes of 1s before the zone
// begins.
// m,n = 4,7: 4 bytes of 0s, and 16-7 = 9 bytes of 1s before that:
// 00 00 00 00 ff..ff
// ff ff ff ff ff ff ff ff ff 00 00 00 00 ff..ff
// m,n = 15,15: 15 bytes of 0s, fs high, but also with 16-15 = 1 byte of 1s
// at the beginning - which pushes the ff at the end off the high end, leaving
// ff 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
// m,n = 15,16: 15 bytes of 0s, ff high, with 16-16 = 0 ones on the low end
// before that, so:
// 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ff
// So, to get the ones part with the fs high, we start out with 1s and
// shift them up (right) by m+n.
// Now, to fill in any ones that belong on the low end, we take
// some 1s and shift them down. The ones zone there needs to be 16-n long,
// meaning shifted down by 16-(16-n), or of course just n.
// Then we OR these together.
static really_inline
m128 p_mask_gen(u8 m, u8 n){
m128 a = ones128();
m128 b = ones128();
m%=17; n%=17;
m+=(16-n); m%=17;
a = rshiftbyte_m128(a, n);
b = lshiftbyte_m128(b, m);
return or128(a, b);
}
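Both the shift-based generator above and the p_mask_arr lookup that replaces it aim at the same shape: a zone of 0x00 bytes (the valid region) surrounded by 0xff. Note in passing that this 128-bit generator and the 256-bit one further down pass their shift amounts in opposite orders, which a scalar model makes easy to cross-check. A hedged model (hypothetical name p_mask_model) that reproduces the m,n examples in the comment:

#include <cassert>

static void p_mask_model(unsigned char out[16], unsigned m, unsigned n) {
    const unsigned zone_start = 16u - n; // 16-n bytes of 1s precede the zone
    for (unsigned i = 0; i < 16; i++) {
        out[i] = (i >= zone_start && i < zone_start + m) ? 0x00 : 0xff;
    }
}

int main() {
    unsigned char buf[16];
    p_mask_model(buf, 15, 15);          // ff 00 00 ... 00
    assert(buf[0] == 0xff && buf[1] == 0x00 && buf[15] == 0x00);
    p_mask_model(buf, 15, 16);          // 00 ... 00 ff
    assert(buf[0] == 0x00 && buf[15] == 0xff);
    return 0;
}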
static really_inline static really_inline
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset, m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *lo, const u8 *hi, const u8 *lo, const u8 *hi,
@ -189,11 +161,13 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
uintptr_t avail = (uintptr_t)(hi - ptr); uintptr_t avail = (uintptr_t)(hi - ptr);
if (avail >= 16) { if (avail >= 16) {
assert(start_offset - start <= 16); assert(start_offset - start <= 16);
*p_mask = p_mask_gen(16 - start_offset + start, 16 - start_offset + start); *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
+ 16 - start_offset + start);
return loadu128(ptr); return loadu128(ptr);
} }
assert(start_offset - start <= avail); assert(start_offset - start <= avail);
*p_mask = p_mask_gen(avail - start_offset + start, 16 - start_offset + start); *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
+ 16 - start_offset + start);
copy_start = 0; copy_start = 0;
copy_len = avail; copy_len = avail;
} else { // start zone } else { // start zone
@ -206,7 +180,8 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
} }
uintptr_t end = MIN(16, (uintptr_t)(hi - ptr)); uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
assert(start + start_offset <= end); assert(start + start_offset <= end);
*p_mask = p_mask_gen(end - start - start_offset, 16 - start - start_offset); *p_mask = loadu128(p_mask_arr[end - start - start_offset]
+ 16 - start - start_offset);
copy_start = start; copy_start = start;
copy_len = end - start; copy_len = end - start;
} }
@ -295,20 +270,6 @@ void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
// |----------|-------|----------------|............| // |----------|-------|----------------|............|
// 0 start start+offset end(<=32) // 0 start start+offset end(<=32)
// p_mask ffff.....ffffff..ff0000...........00ffff.......... // p_mask ffff.....ffffff..ff0000...........00ffff..........
// Like the p_mask generator above, this replaces the large array.
static really_inline
m256 fat_pmask_gen(u8 m, u8 n){
m256 a=ones256();
m256 b=ones256();
m%=33; n%=33;
m+=(32-n); m%=33;
a = rshift_byte_m256(a, m);
b = lshift_byte_m256(b, n);
return or256(a, b);
}
static really_inline static really_inline
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset, m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *lo, const u8 *hi, const u8 *lo, const u8 *hi,
@ -328,11 +289,13 @@ m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
uintptr_t avail = (uintptr_t)(hi - ptr); uintptr_t avail = (uintptr_t)(hi - ptr);
if (avail >= 32) { if (avail >= 32) {
assert(start_offset - start <= 32); assert(start_offset - start <= 32);
*p_mask = fat_pmask_gen(32 - start_offset + start, 32 - start_offset + start); *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
+ 32 - start_offset + start);
return loadu256(ptr); return loadu256(ptr);
} }
assert(start_offset - start <= avail); assert(start_offset - start <= avail);
*p_mask = fat_pmask_gen(avail - start_offset + start, 32 - start_offset + start); *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
+ 32 - start_offset + start);
copy_start = 0; copy_start = 0;
copy_len = avail; copy_len = avail;
} else { //start zone } else { //start zone
@ -345,7 +308,8 @@ m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
} }
uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
assert(start + start_offset <= end); assert(start + start_offset <= end);
*p_mask = fat_pmask_gen(end - start - start_offset, 32 - start - start_offset); *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
+ 32 - start - start_offset);
copy_start = start; copy_start = start;
copy_len = end - start; copy_len = end - start;
} }
@ -384,7 +348,7 @@ static really_inline
m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset, m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen, const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen,
const u32 nMasks) { const u32 nMasks) {
m512 val = zeroes512(); m512 val;
uintptr_t copy_start; uintptr_t copy_start;
uintptr_t copy_len; uintptr_t copy_len;
@ -464,13 +428,8 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
if (!cf) { if (!cf) {
continue; continue;
} }
#ifdef __cplusplus
const struct FDRConfirm *fdrc = reinterpret_cast<const struct FDRConfirm *>
(reinterpret_cast<const u8 *>(confBase) + cf);
#else
const struct FDRConfirm *fdrc = (const struct FDRConfirm *) const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf); ((const u8 *)confBase + cf);
#endif
if (!(fdrc->groups & *control)) { if (!(fdrc->groups & *control)) {
continue; continue;
} }
@ -483,31 +442,18 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
static really_inline static really_inline
const m128 *getMaskBase(const struct Teddy *teddy) { const m128 *getMaskBase(const struct Teddy *teddy) {
#ifdef __cplusplus
return reinterpret_cast<const m128 *>(reinterpret_cast<const u8 *>(teddy) + ROUNDUP_CL(sizeof(struct Teddy)));
#else
return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
#endif
} }
static really_inline static really_inline
const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) { const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
#ifdef __cplusplus
return reinterpret_cast<const u64a *>(reinterpret_cast<const u8 *>(getMaskBase(teddy))
+ ROUNDUP_CL(2 * numMask * sizeof(m128)));
#else
return (const u64a *)((const u8 *)getMaskBase(teddy) return (const u64a *)((const u8 *)getMaskBase(teddy)
+ ROUNDUP_CL(2 * numMask * sizeof(m128))); + ROUNDUP_CL(2 * numMask * sizeof(m128)));
#endif
} }
static really_inline static really_inline
const u32 *getConfBase(const struct Teddy *teddy) { const u32 *getConfBase(const struct Teddy *teddy) {
#ifdef __cplusplus
return reinterpret_cast<const u32 *>(reinterpret_cast<const u8 *>(teddy) + teddy->confOffset);
#else
return (const u32 *)((const u8 *)teddy + teddy->confOffset); return (const u32 *)((const u8 *)teddy + teddy->confOffset);
#endif
} }
#endif /* TEDDY_RUNTIME_COMMON_H_ */ #endif /* TEDDY_RUNTIME_COMMON_H_ */
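The hunks above drop the #ifdef __cplusplus cast branches in favour of plain C casts. When a header genuinely has to compile both ways, one common alternative is to centralise the cast in a macro; this is a sketch, not the macro this codebase uses:

// One way to keep a single cast expression in a dual C/C++ header.
#ifdef __cplusplus
#define PTR_AT(type, base, off) \
    (reinterpret_cast<const type *>(reinterpret_cast<const char *>(base) + (off)))
#else
#define PTR_AT(type, base, off) ((const type *)((const char *)(base) + (off)))
#endif
// e.g. the conf-base lookup above could then read:
//   return PTR_AT(u32, teddy, teddy->confOffset);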

View File

@ -44,11 +44,8 @@
#include "parser/prefilter.h" #include "parser/prefilter.h"
#include "parser/unsupported.h" #include "parser/unsupported.h"
#include "util/compile_error.h" #include "util/compile_error.h"
#include "util/arch/common/cpuid_flags.h" #include "util/cpuid_flags.h"
#if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/cpuid_inline.h"
#include "util/arch/x86/cpuid_inline.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#endif
#include "util/depth.h" #include "util/depth.h"
#include "util/popcount.h" #include "util/popcount.h"
#include "util/target_info.h" #include "util/target_info.h"
@ -199,13 +196,11 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags,
} }
#if defined(FAT_RUNTIME) #if defined(FAT_RUNTIME)
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
if (!check_ssse3()) { if (!check_ssse3()) {
*db = nullptr; *db = nullptr;
*comp_error = generateCompileError("Unsupported architecture", -1); *comp_error = generateCompileError("Unsupported architecture", -1);
return HS_ARCH_ERROR; return HS_ARCH_ERROR;
} }
#endif
#endif #endif
if (!checkMode(mode, comp_error)) { if (!checkMode(mode, comp_error)) {
@ -322,14 +317,13 @@ hs_compile_lit_multi_int(const char *const *expressions, const unsigned *flags,
*comp_error = generateCompileError("Invalid parameter: elements is zero", -1); *comp_error = generateCompileError("Invalid parameter: elements is zero", -1);
return HS_COMPILER_ERROR; return HS_COMPILER_ERROR;
} }
#if defined(FAT_RUNTIME) #if defined(FAT_RUNTIME)
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
if (!check_ssse3()) { if (!check_ssse3()) {
*db = nullptr; *db = nullptr;
*comp_error = generateCompileError("Unsupported architecture", -1); *comp_error = generateCompileError("Unsupported architecture", -1);
return HS_ARCH_ERROR; return HS_ARCH_ERROR;
} }
#endif
#endif #endif
if (!checkMode(mode, comp_error)) { if (!checkMode(mode, comp_error)) {
@ -503,12 +497,10 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
} }
#if defined(FAT_RUNTIME) #if defined(FAT_RUNTIME)
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
if (!check_ssse3()) { if (!check_ssse3()) {
*error = generateCompileError("Unsupported architecture", -1); *error = generateCompileError("Unsupported architecture", -1);
return HS_ARCH_ERROR; return HS_ARCH_ERROR;
} }
#endif
#endif #endif
if (!info) { if (!info) {
@ -589,7 +581,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
return HS_COMPILER_ERROR; return HS_COMPILER_ERROR;
} }
hs_expr_info *rv = static_cast<hs_expr_info *>(hs_misc_alloc(sizeof(*rv))); hs_expr_info *rv = (hs_expr_info *)hs_misc_alloc(sizeof(*rv));
if (!rv) { if (!rv) {
*error = const_cast<hs_compile_error_t *>(&hs_enomem); *error = const_cast<hs_compile_error_t *>(&hs_enomem);
return HS_COMPILER_ERROR; return HS_COMPILER_ERROR;
@ -636,11 +628,9 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform) {
extern "C" HS_PUBLIC_API extern "C" HS_PUBLIC_API
hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error) { hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error) {
#if defined(FAT_RUNTIME) #if defined(FAT_RUNTIME)
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
if (!check_ssse3()) { if (!check_ssse3()) {
return HS_ARCH_ERROR; return HS_ARCH_ERROR;
} }
#endif
#endif #endif
freeCompileError(error); freeCompileError(error);
return HS_SUCCESS; return HS_SUCCESS;
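Each FAT_RUNTIME hunk above re-checks SSSE3 at the compile-side entry points so a fat binary fails cleanly rather than executing unsupported instructions. A sketch of the guard pattern with a stand-in probe (cpu_has_ssse3 is hypothetical; the real code uses check_ssse3()):

int cpu_has_ssse3(void) {
    return 1; // stand-in for the real cpuid-based check_ssse3()
}

int guarded_entry(void) {
#if defined(FAT_RUNTIME)
    if (!cpu_has_ssse3()) {
        return -1; // analogous to returning HS_ARCH_ERROR above
    }
#endif
    return 0; // safe to proceed with the real work
}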

View File

@ -39,7 +39,12 @@
* the individual component headers for documentation. * the individual component headers for documentation.
*/ */
#include "hs_version.h" /* The current Hyperscan version information. */
#define HS_MAJOR 5
#define HS_MINOR 4
#define HS_PATCH 1
#include "hs_compile.h" #include "hs_compile.h"
#include "hs_runtime.h" #include "hs_runtime.h"

View File

@ -29,7 +29,11 @@
#ifndef HS_COMMON_H_ #ifndef HS_COMMON_H_
#define HS_COMMON_H_ #define HS_COMMON_H_
#if defined(_WIN32)
#define HS_CDECL __cdecl
#else
#define HS_CDECL #define HS_CDECL
#endif
#include <stdlib.h> #include <stdlib.h>
/** /**

View File

@ -1,6 +1,5 @@
/* /*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2017, Intel Corporation
* Copyright (c) 2020-2023, VectorCamp PC
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -27,36 +26,16 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "config.h"
#include "hs_common.h" #include "hs_common.h"
#include "ue2common.h" #include "util/cpuid_flags.h"
#if !defined(VS_SIMDE_BACKEND) #include "util/cpuid_inline.h"
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/cpuid_inline.h"
#elif defined(ARCH_AARCH64)
#include "util/arch/arm/cpuid_inline.h"
#endif
#endif
HS_PUBLIC_API HS_PUBLIC_API
hs_error_t HS_CDECL hs_valid_platform(void) { hs_error_t HS_CDECL hs_valid_platform(void) {
/* Vectorscan requires SSE4.2, anything else is a bonus */ /* Hyperscan requires SSSE3, anything else is a bonus */
#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_IA32) || defined(ARCH_X86_64)) if (check_ssse3()) {
// cppcheck-suppress knownConditionTrueFalse
if (check_sse42()) {
return HS_SUCCESS; return HS_SUCCESS;
} else { } else {
return HS_ARCH_ERROR; return HS_ARCH_ERROR;
} }
#elif !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
//check_neon returns true for now
// cppcheck-suppress knownConditionTrueFalse
if (check_neon()) {
return HS_SUCCESS;
} else {
return HS_ARCH_ERROR;
}
#elif defined(ARCH_PPC64EL) || defined(VS_SIMDE_BACKEND)
return HS_SUCCESS;
#endif
} }
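hs_valid_platform() is the public probe applications can call before compiling patterns; whichever instruction-set floor a build enforces, the calling pattern is the same. A hedged usage sketch (only main() is illustrative):

#include <hs.h>
#include <stdio.h>

int main(void) {
    if (hs_valid_platform() != HS_SUCCESS) {
        fprintf(stderr, "CPU lacks the SIMD support this build requires\n");
        return 1;
    }
    /* safe to call hs_compile() / hs_scan() from here on */
    return 0;
}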

View File

@ -36,9 +36,5 @@
#define HS_VERSION_32BIT ((@HS_MAJOR_VERSION@ << 24) | (@HS_MINOR_VERSION@ << 16) | (@HS_PATCH_VERSION@ << 8) | 0) #define HS_VERSION_32BIT ((@HS_MAJOR_VERSION@ << 24) | (@HS_MINOR_VERSION@ << 16) | (@HS_PATCH_VERSION@ << 8) | 0)
#define HS_MAJOR @HS_MAJOR_VERSION@
#define HS_MINOR @HS_MINOR_VERSION@
#define HS_PATCH @HS_PATCH_VERSION@
#endif /* HS_VERSION_H_C6428FAF8E3713 */ #endif /* HS_VERSION_H_C6428FAF8E3713 */
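HS_VERSION_32BIT packs the three components one byte apart, so 5.4.1 (the release this compare covers) encodes as 0x05040100. A quick check:

#include <cstdio>

int main() {
    unsigned v = (5u << 24) | (4u << 16) | (1u << 8) | 0u;
    std::printf("0x%08X\n", v); // prints 0x05040100
    return 0;
}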

View File

@ -1,6 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2021, Arm Limited
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -39,7 +38,7 @@
#include "nfa/accel.h" #include "nfa/accel.h"
#include "nfa/shufti.h" #include "nfa/shufti.h"
#include "nfa/truffle.h" #include "nfa/truffle.h"
#include "nfa/vermicelli.hpp" #include "nfa/vermicelli.h"
#include <string.h> #include <string.h>
#define MIN_ACCEL_LEN_BLOCK 16 #define MIN_ACCEL_LEN_BLOCK 16
@ -63,22 +62,12 @@ const u8 *run_hwlm_accel(const union AccelAux *aux, const u8 *ptr,
DEBUG_PRINTF("double vermicelli-nocase for 0x%02hhx%02hhx\n", DEBUG_PRINTF("double vermicelli-nocase for 0x%02hhx%02hhx\n",
aux->dverm.c1, aux->dverm.c2); aux->dverm.c1, aux->dverm.c2);
return vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 1, ptr, end); return vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 1, ptr, end);
#ifdef HAVE_SVE2
case ACCEL_VERM16:
DEBUG_PRINTF("single vermicelli16\n");
return vermicelli16Exec(aux->verm16.mask, ptr, end);
#endif // HAVE_SVE2
case ACCEL_SHUFTI: case ACCEL_SHUFTI:
DEBUG_PRINTF("single shufti\n"); DEBUG_PRINTF("single shufti\n");
return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end); return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end);
case ACCEL_TRUFFLE: case ACCEL_TRUFFLE:
DEBUG_PRINTF("truffle\n"); DEBUG_PRINTF("truffle\n");
return truffleExec(aux->truffle.mask_lo, aux->truffle.mask_hi, ptr, end); return truffleExec(aux->truffle.mask1, aux->truffle.mask2, ptr, end);
#ifdef CAN_USE_WIDE_TRUFFLE
case ACCEL_TRUFFLE_WIDE:
DEBUG_PRINTF("truffle wide\n");
return truffleExecWide(aux->truffle.mask, ptr, end);
#endif // CAN_USE_WIDE_TRUFFLE
default: default:
/* no acceleration, fall through and return current ptr */ /* no acceleration, fall through and return current ptr */
DEBUG_PRINTF("no accel; %u\n", (int)aux->accel_type); DEBUG_PRINTF("no accel; %u\n", (int)aux->accel_type);
@ -175,7 +164,8 @@ void do_accel_streaming(const union AccelAux *aux, const u8 *hbuf, size_t hlen,
DEBUG_PRINTF("got %zu/%zu in 2nd buffer\n", delta, len); DEBUG_PRINTF("got %zu/%zu in 2nd buffer\n", delta, len);
*start += delta; *start += delta;
} else if (hlen) { } else if (hlen) {
DEBUG_PRINTF("got %zu/%zu remaining in 1st buffer\n", offset + ptr2 - found, hlen); UNUSED size_t remaining = offset + ptr2 - found;
DEBUG_PRINTF("got %zu/%zu remaining in 1st buffer\n", remaining, hlen);
} }
} }

View File

@ -46,6 +46,7 @@
#include "fdr/teddy_engine_description.h" #include "fdr/teddy_engine_description.h"
#include "util/compile_context.h" #include "util/compile_context.h"
#include "util/compile_error.h" #include "util/compile_error.h"
#include "util/make_unique.h"
#include "util/ue2string.h" #include "util/ue2string.h"
#include <cassert> #include <cassert>
@ -57,24 +58,24 @@ using namespace std;
namespace ue2 { namespace ue2 {
HWLMProto::HWLMProto(u8 engType_in, vector<hwlmLiteral> lits_in) HWLMProto::HWLMProto(u8 engType_in, vector<hwlmLiteral> lits_in)
: engType(engType_in), lits(std::move(lits_in)) {} : engType(engType_in), lits(move(lits_in)) {}
HWLMProto::HWLMProto(u8 engType_in, HWLMProto::HWLMProto(u8 engType_in,
unique_ptr<FDREngineDescription> eng_in, unique_ptr<FDREngineDescription> eng_in,
vector<hwlmLiteral> lits_in, vector<hwlmLiteral> lits_in,
map<u32, vector<u32>> bucketToLits_in, map<u32, vector<u32>> bucketToLits_in,
bool make_small_in) bool make_small_in)
: engType(engType_in), fdrEng(std::move(eng_in)), lits(std::move(lits_in)), : engType(engType_in), fdrEng(move(eng_in)), lits(move(lits_in)),
bucketToLits(std::move(bucketToLits_in)), make_small(make_small_in) {} bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {}
HWLMProto::HWLMProto(u8 engType_in, HWLMProto::HWLMProto(u8 engType_in,
unique_ptr<TeddyEngineDescription> eng_in, unique_ptr<TeddyEngineDescription> eng_in,
vector<hwlmLiteral> lits_in, vector<hwlmLiteral> lits_in,
map<u32, vector<u32>> bucketToLits_in, map<u32, vector<u32>> bucketToLits_in,
bool make_small_in) bool make_small_in)
: engType(engType_in), teddyEng(std::move(eng_in)), : engType(engType_in), teddyEng(move(eng_in)),
lits(std::move(lits_in)), lits(move(lits_in)),
bucketToLits(std::move(bucketToLits_in)), make_small(make_small_in) {} bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {}
HWLMProto::~HWLMProto() {} HWLMProto::~HWLMProto() {}
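This hunk only toggles between qualified std::move and unqualified move. With using namespace std; in scope both resolve to the same function, but the qualified spelling is unambiguous and avoids the unqualified-call warnings some newer compilers emit. A small sketch:

#include <utility>
#include <vector>
using namespace std;

struct Proto {
    vector<int> lits;
    explicit Proto(vector<int> lits_in)
        : lits(std::move(lits_in)) {} // `move(lits_in)` would also compile here
};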
@ -93,7 +94,6 @@ void dumpLits(UNUSED const vector<hwlmLiteral> &lits) {
// Called by an assertion. // Called by an assertion.
static static
bool everyoneHasGroups(const vector<hwlmLiteral> &lits) { bool everyoneHasGroups(const vector<hwlmLiteral> &lits) {
// cppcheck-suppress useStlAlgorithm
for (const auto &lit : lits) { for (const auto &lit : lits) {
if (!lit.groups) { if (!lit.groups) {
return false; return false;
@ -133,18 +133,18 @@ bytecode_ptr<HWLM> hwlmBuild(const HWLMProto &proto, const CompileContext &cc,
if (noodle) { if (noodle) {
engSize = noodle.size(); engSize = noodle.size();
} }
eng = std::move(noodle); eng = move(noodle);
} else { } else {
DEBUG_PRINTF("building a new deal\n"); DEBUG_PRINTF("building a new deal\n");
auto fdr = fdrBuildTable(proto, cc.grey); auto fdr = fdrBuildTable(proto, cc.grey);
if (fdr) { if (fdr) {
engSize = fdr.size(); engSize = fdr.size();
} }
eng = std::move(fdr); eng = move(fdr);
} }
if (!eng) { if (!eng) {
return bytecode_ptr<HWLM>(nullptr); return nullptr;
} }
assert(engSize); assert(engSize);
@ -156,7 +156,6 @@ bytecode_ptr<HWLM> hwlmBuild(const HWLMProto &proto, const CompileContext &cc,
auto h = make_zeroed_bytecode_ptr<HWLM>(hwlm_len, 64); auto h = make_zeroed_bytecode_ptr<HWLM>(hwlm_len, 64);
h->type = proto.engType; h->type = proto.engType;
// cppcheck-suppress cstyleCast
memcpy(HWLM_DATA(h.get()), eng.get(), engSize); memcpy(HWLM_DATA(h.get()), eng.get(), engSize);
return h; return h;
@ -202,7 +201,7 @@ hwlmBuildProto(vector<hwlmLiteral> &lits, bool make_small,
if (isNoodleable(lits, cc)) { if (isNoodleable(lits, cc)) {
DEBUG_PRINTF("build noodle table\n"); DEBUG_PRINTF("build noodle table\n");
proto = std::make_unique<HWLMProto>(HWLM_ENGINE_NOOD, lits); proto = ue2::make_unique<HWLMProto>(HWLM_ENGINE_NOOD, lits);
} else { } else {
DEBUG_PRINTF("building a new deal\n"); DEBUG_PRINTF("building a new deal\n");
proto = fdrBuildProto(HWLM_ENGINE_FDR, lits, make_small, proto = fdrBuildProto(HWLM_ENGINE_FDR, lits, make_small,
@ -220,12 +219,10 @@ size_t hwlmSize(const HWLM *h) {
switch (h->type) { switch (h->type) {
case HWLM_ENGINE_NOOD: case HWLM_ENGINE_NOOD:
// cppcheck-suppress cstyleCast engSize = noodSize((const noodTable *)HWLM_C_DATA(h));
engSize = noodSize(reinterpret_cast<const noodTable *>(HWLM_C_DATA(h)));
break; break;
case HWLM_ENGINE_FDR: case HWLM_ENGINE_FDR:
// cppcheck-suppress cstyleCast engSize = fdrSize((const FDR *)HWLM_C_DATA(h));
engSize = fdrSize(reinterpret_cast<const FDR *>(HWLM_C_DATA(h)));
break; break;
} }

View File

@ -53,12 +53,10 @@ void hwlmGenerateDumpFiles(const HWLM *h, const string &base) {
switch (h->type) { switch (h->type) {
case HWLM_ENGINE_NOOD: case HWLM_ENGINE_NOOD:
// cppcheck-suppress cstyleCast noodPrintStats((const noodTable *)HWLM_C_DATA(h), f);
noodPrintStats(reinterpret_cast<const noodTable *>(HWLM_C_DATA(h)), f);
break; break;
case HWLM_ENGINE_FDR: case HWLM_ENGINE_FDR:
// cppcheck-suppress cstyleCast fdrPrintStats((const FDR *)HWLM_C_DATA(h), f);
fdrPrintStats(reinterpret_cast<const FDR *>(HWLM_C_DATA(h)), f);
break; break;
default: default:
fprintf(f, "<unknown hwlm subengine>\n"); fprintf(f, "<unknown hwlm subengine>\n");

View File

@ -56,7 +56,7 @@ u64a make_u64a_mask(const vector<u8> &v) {
u64a mask = 0; u64a mask = 0;
size_t len = v.size(); size_t len = v.size();
u8 *m = reinterpret_cast<u8 *>(&mask); unsigned char *m = (unsigned char *)&mask;
DEBUG_PRINTF("making mask len %zu\n", len); DEBUG_PRINTF("making mask len %zu\n", len);
memcpy(m, &v[0], len); memcpy(m, &v[0], len);
return mask; return mask;
@ -156,7 +156,7 @@ void noodPrintStats(const noodTable *n, FILE *f) {
n->msk_len); n->msk_len);
fprintf(f, "String: "); fprintf(f, "String: ");
for (u32 i = 0; i < n->msk_len; i++) { for (u32 i = 0; i < n->msk_len; i++) {
const u8 *m = reinterpret_cast<const u8 *>(&n->cmp); const u8 *m = (const u8 *)&n->cmp;
if (isgraph(m[i]) && m[i] != '\\') { if (isgraph(m[i]) && m[i] != '\\') {
fprintf(f, "%c", m[i]); fprintf(f, "%c", m[i]);
} else { } else {

src/hwlm/noodle_engine.c (new file, 442 lines)
View File

@ -0,0 +1,442 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Noodle literal matcher: runtime.
*/
#include "hwlm.h"
#include "noodle_engine.h"
#include "noodle_internal.h"
#include "scratch.h"
#include "ue2common.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/compare.h"
#include "util/intrinsics.h"
#include "util/join.h"
#include "util/masked_move.h"
#include "util/partial_store.h"
#include "util/simd_utils.h"
#include <ctype.h>
#include <stdbool.h>
#include <string.h>
/** \brief Noodle runtime context. */
struct cb_info {
HWLMCallback cb; //!< callback function called on match
u32 id; //!< ID to pass to callback on match
struct hs_scratch *scratch; //!< scratch to pass to callback
size_t offsetAdj; //!< used in streaming mode
};
#if defined(HAVE_AVX512)
#define CHUNKSIZE 64
#define MASK_TYPE m512
#define Z_BITS 64
#define Z_TYPE u64a
#elif defined(HAVE_AVX2)
#define CHUNKSIZE 32
#define MASK_TYPE m256
#define Z_BITS 32
#define Z_TYPE u32
#else
#define CHUNKSIZE 16
#define MASK_TYPE m128
#define Z_BITS 32
#define Z_TYPE u32
#endif
#define RETURN_IF_TERMINATED(x) \
{ \
if ((x) == HWLM_TERMINATED) { \
return HWLM_TERMINATED; \
} \
}
#define SINGLE_ZSCAN() \
do { \
while (unlikely(z)) { \
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \
size_t matchPos = d - buf + pos; \
DEBUG_PRINTF("match pos %zu\n", matchPos); \
hwlmcb_rv_t rv = final(n, buf, len, 1, cbi, matchPos); \
RETURN_IF_TERMINATED(rv); \
} \
} while (0)
#define DOUBLE_ZSCAN() \
do { \
while (unlikely(z)) { \
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \
size_t matchPos = d - buf + pos - 1; \
DEBUG_PRINTF("match pos %zu\n", matchPos); \
hwlmcb_rv_t rv = final(n, buf, len, 0, cbi, matchPos); \
RETURN_IF_TERMINATED(rv); \
} \
} while (0)
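Both ZSCAN macros run the same bit-peeling loop: z is a bitmask of candidate positions and each iteration extracts and clears the lowest set bit. A scalar model, with a GCC/Clang builtin standing in for findAndClearLSB_32:

#include <cstdio>

static unsigned findAndClearLSB32(unsigned *z) {
    unsigned pos = __builtin_ctz(*z); // index of the lowest set bit
    *z &= *z - 1;                     // clear that bit
    return pos;
}

int main() {
    unsigned z = 0x24; // candidates at offsets 2 and 5
    while (z) {
        std::printf("match offset %u\n", findAndClearLSB32(&z));
    }
    return 0;
}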
static really_inline
u8 caseClear8(u8 x, bool noCase) {
return (u8)(noCase ? (x & (u8)0xdf) : x);
}
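caseClear8 relies on ASCII upper and lower case differing only in bit 0x20, so masking with 0xdf folds both cases to one byte; the trick is only meaningful for alphabetic bytes, which is why scanSingle below forces noCase off for non-alpha keys. A quick check:

#include <cassert>

int main() {
    assert(('a' & 0xdf) == 'A');
    assert(('z' & 0xdf) == 'Z');
    assert(('A' & 0xdf) == 'A'); // bit already clear for uppercase
    return 0;
}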
// Make sure the rest of the string is there. The single-character scanner
// is only used for one-byte keys whose case-insensitivity has already been
// handled correctly, so it can go straight to the callback if we get this far.
static really_inline
hwlm_error_t final(const struct noodTable *n, const u8 *buf, UNUSED size_t len,
char single, const struct cb_info *cbi, size_t pos) {
if (single) {
if (n->msk_len == 1) {
goto match;
}
}
assert(len >= n->msk_len);
u64a v =
partial_load_u64a(buf + pos + n->key_offset - n->msk_len, n->msk_len);
DEBUG_PRINTF("v %016llx msk %016llx cmp %016llx\n", v, n->msk, n->cmp);
if ((v & n->msk) != n->cmp) {
/* mask didn't match */
return HWLM_SUCCESS;
}
match:
pos -= cbi->offsetAdj;
DEBUG_PRINTF("match @ %zu\n", pos + n->key_offset);
hwlmcb_rv_t rv = cbi->cb(pos + n->key_offset - 1, cbi->id, cbi->scratch);
if (rv == HWLM_TERMINATE_MATCHING) {
return HWLM_TERMINATED;
}
return HWLM_SUCCESS;
}
#if defined(HAVE_AVX512)
#define CHUNKSIZE 64
#define MASK_TYPE m512
#include "noodle_engine_avx512.c"
#elif defined(HAVE_AVX2)
#define CHUNKSIZE 32
#define MASK_TYPE m256
#include "noodle_engine_avx2.c"
#else
#define CHUNKSIZE 16
#define MASK_TYPE m128
#include "noodle_engine_sse.c"
#endif
static really_inline
hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
size_t len, size_t start, bool noCase,
const struct cb_info *cbi) {
const MASK_TYPE mask1 = getMask(n->key0, noCase);
const MASK_TYPE caseMask = getCaseMask();
size_t offset = start + n->msk_len - 1;
size_t end = len;
assert(offset < end);
#if !defined(HAVE_AVX512)
hwlm_error_t rv;
if (end - offset < CHUNKSIZE) {
rv = scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, offset,
end);
return rv;
}
if (end - offset == CHUNKSIZE) {
rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
cbi, offset, end);
return rv;
}
uintptr_t data = (uintptr_t)buf;
uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data;
uintptr_t last = data + end;
uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data;
uintptr_t s3Start = end - CHUNKSIZE;
if (offset != s2Start) {
// first scan out to the fast scan starting point
DEBUG_PRINTF("stage 1: -> %zu\n", s2Start);
rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
cbi, offset, s2Start);
RETURN_IF_TERMINATED(rv);
}
if (likely(s2Start != s2End)) {
// scan as far as we can, bounded by the last point this key can
// possibly match
DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End);
rv = scanSingleFast(n, buf, len, noCase, caseMask, mask1, cbi, s2Start,
s2End);
RETURN_IF_TERMINATED(rv);
}
// if we are done bail out
if (s2End == len) {
return HWLM_SUCCESS;
}
DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len);
rv = scanSingleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, cbi,
s2End, len);
return rv;
#else // HAVE_AVX512
return scanSingle512(n, buf, len, noCase, caseMask, mask1, cbi, offset,
end);
#endif
}
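The staged layout above is plain power-of-two rounding: stage 1 scans unaligned up to the first CHUNKSIZE boundary, the fast loop covers whole aligned blocks, and stage 3 mops up the tail. A worked model, assuming ROUNDUP_N/ROUNDDOWN_N are the usual power-of-two rounding helpers:

#include <cstdint>
#include <cstdio>

static uintptr_t roundup(uintptr_t x, uintptr_t n)   { return (x + n - 1) & ~(n - 1); }
static uintptr_t rounddown(uintptr_t x, uintptr_t n) { return x & ~(n - 1); }

int main() {
    uintptr_t data = 0x1005, offset = 3, end = 100; // assumed example values
    uintptr_t s2Start = roundup(data + offset, 32) - data;   // first aligned block
    uintptr_t s2End   = rounddown(data + end, 32) - data;    // end of last full block
    // prints: stage1 3..27, fast 27..91, stage3 91..100
    std::printf("stage1 %zu..%zu, fast %zu..%zu, stage3 %zu..%zu\n",
                (size_t)offset, (size_t)s2Start, (size_t)s2Start,
                (size_t)s2End, (size_t)s2End, (size_t)end);
    return 0;
}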
static really_inline
hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,
size_t len, size_t start, bool noCase,
const struct cb_info *cbi) {
// we stop scanning for the key-fragment when the rest of the key can't
// possibly fit in the remaining buffer
size_t end = len - n->key_offset + 2;
// the first place the key can match
size_t offset = start + n->msk_len - n->key_offset;
const MASK_TYPE caseMask = getCaseMask();
const MASK_TYPE mask1 = getMask(n->key0, noCase);
const MASK_TYPE mask2 = getMask(n->key1, noCase);
#if !defined(HAVE_AVX512)
hwlm_error_t rv;
if (end - offset < CHUNKSIZE) {
rv = scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
offset, end);
return rv;
}
if (end - offset == CHUNKSIZE) {
rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
mask2, cbi, offset, end);
return rv;
}
uintptr_t data = (uintptr_t)buf;
uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data;
uintptr_t s1End = s2Start + 1;
uintptr_t last = data + end;
uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data;
uintptr_t s3Start = end - CHUNKSIZE;
uintptr_t off = offset;
if (s2Start != off) {
// first scan out to the fast scan starting point plus one char past to
// catch the key on the overlap
DEBUG_PRINTF("stage 1: %zu -> %zu\n", off, s2Start);
rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
mask2, cbi, off, s1End);
RETURN_IF_TERMINATED(rv);
}
off = s1End;
if (s2Start >= end) {
DEBUG_PRINTF("s2 == mL %zu\n", end);
return HWLM_SUCCESS;
}
if (likely(s2Start != s2End)) {
// scan as far as we can, bounded by the last point this key can
// possibly match
DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start);
rv = scanDoubleFast(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
s2Start, s2End);
RETURN_IF_TERMINATED(rv);
off = s2End;
}
// if there isn't enough data left to match the key, bail out
if (s2End == end) {
return HWLM_SUCCESS;
}
DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end);
rv = scanDoubleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1,
mask2, cbi, off, end);
return rv;
#else // AVX512
return scanDouble512(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
offset, end);
#endif // AVX512
}
static really_inline
hwlm_error_t scanSingleNoCase(const struct noodTable *n, const u8 *buf,
size_t len, size_t start,
const struct cb_info *cbi) {
return scanSingleMain(n, buf, len, start, 1, cbi);
}
static really_inline
hwlm_error_t scanSingleCase(const struct noodTable *n, const u8 *buf,
size_t len, size_t start,
const struct cb_info *cbi) {
return scanSingleMain(n, buf, len, start, 0, cbi);
}
// Single-character specialisation, used when keyLen = 1
static really_inline
hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, bool noCase, const struct cb_info *cbi) {
if (!ourisalpha(n->key0)) {
noCase = 0; // force noCase off if we don't have an alphabetic char
}
// kinda ugly, but this forces constant propagation
if (noCase) {
return scanSingleNoCase(n, buf, len, start, cbi);
} else {
return scanSingleCase(n, buf, len, start, cbi);
}
}
static really_inline
hwlm_error_t scanDoubleNoCase(const struct noodTable *n, const u8 *buf,
size_t len, size_t start,
const struct cb_info *cbi) {
return scanDoubleMain(n, buf, len, start, 1, cbi);
}
static really_inline
hwlm_error_t scanDoubleCase(const struct noodTable *n, const u8 *buf,
size_t len, size_t start,
const struct cb_info *cbi) {
return scanDoubleMain(n, buf, len, start, 0, cbi);
}
static really_inline
hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, bool noCase, const struct cb_info *cbi) {
// kinda ugly, but this forces constant propagation
if (noCase) {
return scanDoubleNoCase(n, buf, len, start, cbi);
} else {
return scanDoubleCase(n, buf, len, start, cbi);
}
}
// main entry point for the scan code
static really_inline
hwlm_error_t scan(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, char single, bool noCase,
const struct cb_info *cbi) {
if (len - start < n->msk_len) {
// can't find string of length keyLen in a shorter buffer
return HWLM_SUCCESS;
}
if (single) {
return scanSingle(n, buf, len, start, noCase, cbi);
} else {
return scanDouble(n, buf, len, start, noCase, cbi);
}
}
/** \brief Block-mode scanner. */
hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, HWLMCallback cb,
struct hs_scratch *scratch) {
assert(n && buf);
struct cb_info cbi = {cb, n->id, scratch, 0};
DEBUG_PRINTF("nood scan of %zu bytes for %*s @ %p\n", len, n->msk_len,
(const char *)&n->cmp, buf);
return scan(n, buf, len, start, n->single, n->nocase, &cbi);
}
/** \brief Streaming-mode scanner. */
hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf,
size_t hlen, const u8 *buf, size_t len,
HWLMCallback cb, struct hs_scratch *scratch) {
assert(n);
if (len + hlen < n->msk_len) {
DEBUG_PRINTF("not enough bytes for a match\n");
return HWLM_SUCCESS;
}
struct cb_info cbi = {cb, n->id, scratch, 0};
DEBUG_PRINTF("nood scan of %zu bytes (%zu hlen) for %*s @ %p\n", len, hlen,
n->msk_len, (const char *)&n->cmp, buf);
if (hlen && n->msk_len > 1) {
/*
* we have history, so build up a buffer from enough of the history
* buffer plus what we've been given to scan. Since this is relatively
* short, just check against msk+cmp per byte offset for matches.
*/
assert(hbuf);
u8 ALIGN_DIRECTIVE temp_buf[HWLM_LITERAL_MAX_LEN * 2];
memset(temp_buf, 0, sizeof(temp_buf));
assert(n->msk_len);
size_t tl1 = MIN((size_t)n->msk_len - 1, hlen);
size_t tl2 = MIN((size_t)n->msk_len - 1, len);
assert(tl1 + tl2 <= sizeof(temp_buf));
assert(tl1 + tl2 >= n->msk_len);
assert(tl1 <= sizeof(u64a));
assert(tl2 <= sizeof(u64a));
DEBUG_PRINTF("using %zu bytes of hist and %zu bytes of buf\n", tl1, tl2);
unaligned_store_u64a(temp_buf,
partial_load_u64a(hbuf + hlen - tl1, tl1));
unaligned_store_u64a(temp_buf + tl1, partial_load_u64a(buf, tl2));
for (size_t i = 0; i <= tl1 + tl2 - n->msk_len; i++) {
u64a v = unaligned_load_u64a(temp_buf + i);
if ((v & n->msk) == n->cmp) {
size_t m_end = -tl1 + i + n->msk_len - 1;
DEBUG_PRINTF("match @ %zu (i %zu)\n", m_end, i);
hwlmcb_rv_t rv = cb(m_end, n->id, scratch);
if (rv == HWLM_TERMINATE_MATCHING) {
return HWLM_TERMINATED;
}
}
}
}
assert(buf);
cbi.offsetAdj = 0;
return scan(n, buf, len, 0, n->single, n->nocase, &cbi);
}
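The stitch sizes bound temp_buf tightly: at most msk_len-1 bytes from each side of the stream boundary are needed, since any match wholly inside the new block is found by the normal scan that follows. A worked model with assumed sizes (msk_len = 4, hlen = 10, len = 100):

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    std::size_t msk_len = 4, hlen = 10, len = 100;
    std::size_t tl1 = std::min(msk_len - 1, hlen); // history bytes copied = 3
    std::size_t tl2 = std::min(msk_len - 1, len);  // new bytes copied = 3
    // i sweeps exactly the offsets where a 4-byte key can straddle the boundary
    for (std::size_t i = 0; i <= tl1 + tl2 - msk_len; i++) {
        long m_end = (long)i + (long)msk_len - 1 - (long)tl1;
        std::printf("i=%zu: match would end at new-buffer offset %ld\n", i, m_end);
    }
    return 0;
}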

View File

@ -1,191 +0,0 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020, 2021, VectorCamp PC
* Copyright (c) 2021, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Noodle literal matcher: runtime.
*/
#include "hwlm.h"
#include "noodle_engine.h"
#include "noodle_internal.h"
#include "scratch.h"
#include "ue2common.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/compare.h"
#include "util/intrinsics.h"
#include "util/join.h"
#include "util/partial_store.h"
#include "util/simd_utils.h"
#if defined(HAVE_AVX2)
#include "util/arch/x86/masked_move.h"
#endif
#include <ctype.h>
#include <stdbool.h>
#include <string.h>
/** \brief Noodle runtime context. */
struct cb_info {
HWLMCallback cb; //!< callback function called on match
u32 id; //!< ID to pass to callback on match
struct hs_scratch *scratch; //!< scratch to pass to callback
size_t offsetAdj; //!< used in streaming mode
};
#define RETURN_IF_TERMINATED(x) \
{ \
if ((x) == HWLM_TERMINATED) { \
return HWLM_TERMINATED; \
} \
}
// Make sure the rest of the string is there. The single-character scanner
// is only used for one-byte keys whose case-insensitivity has already been
// handled correctly, so it can go straight to the callback if we get this far.
static really_inline
hwlm_error_t final(const struct noodTable *n, const u8 *buf, UNUSED size_t len,
bool needsConfirm, const struct cb_info *cbi, size_t pos) {
u64a v{0};
if (!needsConfirm) {
goto match;
}
assert(len >= n->msk_len);
v = partial_load_u64a(buf + pos + n->key_offset - n->msk_len, n->msk_len);
DEBUG_PRINTF("v %016llx msk %016llx cmp %016llx\n", v, n->msk, n->cmp);
if ((v & n->msk) != n->cmp) {
/* mask didn't match */
return HWLM_SUCCESS;
}
match:
pos -= cbi->offsetAdj;
DEBUG_PRINTF("match @ %zu\n", pos + n->key_offset);
hwlmcb_rv_t rv = cbi->cb(pos + n->key_offset - 1, cbi->id, cbi->scratch);
if (rv == HWLM_TERMINATE_MATCHING) {
return HWLM_TERMINATED;
}
return HWLM_SUCCESS;
}
#ifdef HAVE_SVE2
#include "noodle_engine_sve.hpp"
#else
#include "noodle_engine_simd.hpp"
#endif
// main entry point for the scan code
static really_inline
hwlm_error_t scan(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, char single, bool noCase,
const struct cb_info *cbi) {
if (len - start < n->msk_len) {
// can't find string of length keyLen in a shorter buffer
return HWLM_SUCCESS;
}
if (single) {
return scanSingle(n, buf, len, start, noCase, cbi);
} else {
return scanDouble(n, buf, len, start, noCase, cbi);
}
}
/** \brief Block-mode scanner. */
hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, HWLMCallback cb,
struct hs_scratch *scratch) {
assert(n && buf);
struct cb_info cbi = {cb, n->id, scratch, 0};
DEBUG_PRINTF("nood scan of %zu bytes for %*s @ %p\n", len, n->msk_len,
(const char *)&n->cmp, buf);
return scan(n, buf, len, start, n->single, n->nocase, &cbi);
}
/** \brief Streaming-mode scanner. */
hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf,
size_t hlen, const u8 *buf, size_t len,
HWLMCallback cb, struct hs_scratch *scratch) {
assert(n);
if (len + hlen < n->msk_len) {
DEBUG_PRINTF("not enough bytes for a match\n");
return HWLM_SUCCESS;
}
struct cb_info cbi = {cb, n->id, scratch, 0};
DEBUG_PRINTF("nood scan of %zu bytes (%zu hlen) for %*s @ %p\n", len, hlen,
n->msk_len, (const char *)&n->cmp, buf);
if (hlen && n->msk_len > 1) {
/*
* we have history, so build up a buffer from enough of the history
* buffer plus what we've been given to scan. Since this is relatively
* short, just check against msk+cmp per byte offset for matches.
*/
assert(hbuf);
u8 ALIGN_DIRECTIVE temp_buf[HWLM_LITERAL_MAX_LEN * 2];
memset(temp_buf, 0, sizeof(temp_buf));
assert(n->msk_len);
size_t tl1 = MIN((size_t)n->msk_len - 1, hlen);
size_t tl2 = MIN((size_t)n->msk_len - 1, len);
assert(tl1 + tl2 <= sizeof(temp_buf));
assert(tl1 + tl2 >= n->msk_len);
assert(tl1 <= sizeof(u64a));
assert(tl2 <= sizeof(u64a));
DEBUG_PRINTF("using %zu bytes of hist and %zu bytes of buf\n", tl1, tl2);
unaligned_store_u64a(temp_buf,
partial_load_u64a(hbuf + hlen - tl1, tl1));
unaligned_store_u64a(temp_buf + tl1, partial_load_u64a(buf, tl2));
for (size_t i = 0; i <= tl1 + tl2 - n->msk_len; i++) {
u64a v = unaligned_load_u64a(temp_buf + i);
if ((v & n->msk) == n->cmp) {
size_t m_end = -tl1 + i + n->msk_len - 1;
DEBUG_PRINTF("match @ %zu (i %zu)\n", m_end, i);
hwlmcb_rv_t rv = cb(m_end, n->id, scratch);
if (rv == HWLM_TERMINATE_MATCHING) {
return HWLM_TERMINATED;
}
}
}
}
assert(buf);
cbi.offsetAdj = 0;
return scan(n, buf, len, 0, n->single, n->nocase, &cbi);
}

View File

@ -0,0 +1,233 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/* noodle scan parts for AVX */
static really_inline m256 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase);
return set32x8(k);
}
static really_inline m256 getCaseMask(void) {
return set32x8(0xdf);
}
static really_inline
hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset, bool noCase,
m256 caseMask, m256 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + offset;
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
const size_t l = end - start;
m256 v = loadu256(d);
if (noCase) {
v = and256(v, caseMask);
}
u32 z = movemask256(eq256(mask1, v));
u32 buf_off = start - offset;
u32 mask = (u32)((u64a)(1ULL << l) - 1) << buf_off;
DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z);
z &= mask;
SINGLE_ZSCAN();
return HWLM_SUCCESS;
}
static really_inline
hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset, bool noCase,
m256 caseMask, m256 mask1, m256 mask2,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + offset;
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
size_t l = end - start;
m256 v = loadu256(d);
if (noCase) {
v = and256(v, caseMask);
}
u32 z0 = movemask256(eq256(mask1, v));
u32 z1 = movemask256(eq256(mask2, v));
u32 z = (z0 << 1) & z1;
// mask out where we can't match
u32 buf_off = start - offset;
u32 mask = (u32)((u64a)(1ULL << l) - 1) << buf_off;
DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z);
z &= mask;
DOUBLE_ZSCAN();
return HWLM_SUCCESS;
}
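The two-byte match logic above is a bit trick: z0 marks first-byte hits, z1 marks second-byte hits, and (z0 << 1) & z1 is set exactly where a second-byte hit is immediately preceded by a first-byte hit. A scalar check:

#include <cassert>

int main() {
    // buffer "xabx": 'a' at offset 1, 'b' at offset 2
    unsigned z0 = 1u << 1;        // first-char hits
    unsigned z1 = 1u << 2;        // second-char hits
    unsigned z  = (z0 << 1) & z1; // bit 2 set -> pair ends at offset 2
    assert(z == (1u << 2));
    return 0;
}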
// The short scan routine. It is used both to scan data up to an
// alignment boundary if needed and to finish off data that the aligned scan
// function can't handle (due to small/unaligned chunk at end)
static really_inline
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m256 caseMask, m256 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
size_t l = end - start;
DEBUG_PRINTF("l %zu\n", l);
assert(l <= 32);
if (!l) {
return HWLM_SUCCESS;
}
m256 v;
if (l < 4) {
u8 *vp = (u8*)&v;
switch (l) {
case 3: vp[2] = d[2]; // fallthrough
case 2: vp[1] = d[1]; // fallthrough
case 1: vp[0] = d[0]; // fallthrough
}
} else {
v = masked_move256_len(d, l);
}
if (noCase) {
v = and256(v, caseMask);
}
// mask out where we can't match
u32 mask = (0xFFFFFFFF >> (32 - l));
u32 z = mask & movemask256(eq256(mask1, v));
SINGLE_ZSCAN();
return HWLM_SUCCESS;
}
static really_inline
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m256 caseMask, m256 mask1,
m256 mask2, const struct cb_info *cbi,
size_t start, size_t end) {
const u8 *d = buf + start;
size_t l = end - start;
if (!l) {
return HWLM_SUCCESS;
}
assert(l <= 32);
m256 v;
DEBUG_PRINTF("d %zu\n", d - buf);
if (l < 4) {
u8 *vp = (u8*)&v;
switch (l) {
case 3: vp[2] = d[2]; // fallthrough
case 2: vp[1] = d[1]; // fallthrough
case 1: vp[0] = d[0]; // fallthrough
}
} else {
v = masked_move256_len(d, l);
}
if (noCase) {
v = and256(v, caseMask);
}
u32 z0 = movemask256(eq256(mask1, v));
u32 z1 = movemask256(eq256(mask2, v));
u32 z = (z0 << 1) & z1;
// mask out where we can't match
u32 mask = (0xFFFFFFFF >> (32 - l));
z &= mask;
DOUBLE_ZSCAN();
return HWLM_SUCCESS;
}
static really_inline
hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m256 caseMask, m256 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start, *e = buf + end;
assert(d < e);
for (; d < e; d += 32) {
m256 v = noCase ? and256(load256(d), caseMask) : load256(d);
u32 z = movemask256(eq256(mask1, v));
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(d + 128);
SINGLE_ZSCAN();
}
return HWLM_SUCCESS;
}
static really_inline
hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m256 caseMask, m256 mask1,
m256 mask2, const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start, *e = buf + end;
DEBUG_PRINTF("start %zu end %zu \n", start, end);
assert(d < e);
u32 lastz0 = 0;
for (; d < e; d += 32) {
m256 v = noCase ? and256(load256(d), caseMask) : load256(d);
// we have to pull the masks out of the AVX registers because we can't
// byte shift between the lanes
u32 z0 = movemask256(eq256(mask1, v));
u32 z1 = movemask256(eq256(mask2, v));
u32 z = (lastz0 | (z0 << 1)) & z1;
lastz0 = z0 >> 31;
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(d + 128);
DOUBLE_ZSCAN();
}
return HWLM_SUCCESS;
}
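scanDoubleFast adds the one wrinkle the unaligned variant doesn't have: a pair can straddle two 32-byte blocks, so bit 31 of the previous block's z0 is carried in as bit 0 via lastz0. A scalar check of that carry:

#include <cassert>

int main() {
    unsigned prev_z0 = 1u << 31;       // first char at the last byte of block k
    unsigned lastz0  = prev_z0 >> 31;  // carried into block k+1
    unsigned z0 = 0, z1 = 1u << 0;     // second char at the first byte of block k+1
    unsigned z  = (lastz0 | (z0 << 1)) & z1;
    assert(z == 1u);                   // match detected across the boundary
    return 0;
}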

View File

@ -0,0 +1,191 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/* noodle scan parts for AVX512 */
static really_inline
m512 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase);
return set64x8(k);
}
static really_inline
m512 getCaseMask(void) {
return set64x8(CASE_CLEAR);
}
// The short scan routine. It is used both to scan data up to an
// alignment boundary if needed and to finish off data that the aligned scan
// function can't handle (due to small/unaligned chunk at end)
static really_inline
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m512 caseMask, m512 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
ptrdiff_t scan_len = end - start;
DEBUG_PRINTF("scan_len %zu\n", scan_len);
assert(scan_len <= 64);
if (!scan_len) {
return HWLM_SUCCESS;
}
__mmask64 k = (~0ULL) >> (64 - scan_len);
DEBUG_PRINTF("load mask 0x%016llx\n", k);
m512 v = loadu_maskz_m512(k, d);
if (noCase) {
v = and512(v, caseMask);
}
// reuse the load mask to indicate valid bytes
u64a z = masked_eq512mask(k, mask1, v);
SINGLE_ZSCAN();
return HWLM_SUCCESS;
}
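The tail mask (~0ULL) >> (64 - scan_len) sets exactly the scan_len low bits, and loadu_maskz_m512 zero-fills the unselected lanes, so the tail never reads past the buffer. The early return on !scan_len also matters here, since a shift by 64 would be undefined. A scalar check:

#include <cassert>

int main() {
    unsigned long long scan_len = 5;
    unsigned long long k = (~0ULL) >> (64 - scan_len);
    assert(k == 0x1f); // five low bits selected
    return 0;
}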
static really_inline
hwlm_error_t scanSingle512(const struct noodTable *n, const u8 *buf, size_t len,
bool noCase, m512 caseMask, m512 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
const u8 *e = buf + end;
DEBUG_PRINTF("start %p end %p \n", d, e);
assert(d < e);
if (d + 64 >= e) {
goto tail;
}
// peel off first part to cacheline boundary
const u8 *d1 = ROUNDUP_PTR(d, 64);
if (scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, start,
d1 - buf) == HWLM_TERMINATED) {
return HWLM_TERMINATED;
}
d = d1;
for (; d + 64 < e; d += 64) {
DEBUG_PRINTF("d %p e %p \n", d, e);
m512 v = noCase ? and512(load512(d), caseMask) : load512(d);
u64a z = eq512mask(mask1, v);
__builtin_prefetch(d + 128);
SINGLE_ZSCAN();
}
tail:
DEBUG_PRINTF("d %p e %p \n", d, e);
// finish off tail
return scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, d - buf,
e - buf);
}
static really_inline
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m512 caseMask, m512 mask1,
m512 mask2, const struct cb_info *cbi,
u64a *lastz0, size_t start, size_t end) {
DEBUG_PRINTF("start %zu end %zu last 0x%016llx\n", start, end, *lastz0);
const u8 *d = buf + start;
ptrdiff_t scan_len = end - start;
if (!scan_len) {
return HWLM_SUCCESS;
}
assert(scan_len <= 64);
__mmask64 k = (~0ULL) >> (64 - scan_len);
DEBUG_PRINTF("load mask 0x%016llx scan_len %zu\n", k, scan_len);
m512 v = loadu_maskz_m512(k, d);
if (noCase) {
v = and512(v, caseMask);
}
u64a z0 = masked_eq512mask(k, mask1, v);
u64a z1 = masked_eq512mask(k, mask2, v);
u64a z = (*lastz0 | (z0 << 1)) & z1;
DEBUG_PRINTF("z 0x%016llx\n", z);
DOUBLE_ZSCAN();
*lastz0 = z0 >> (scan_len - 1);
return HWLM_SUCCESS;
}
static really_inline
hwlm_error_t scanDouble512(const struct noodTable *n, const u8 *buf, size_t len,
bool noCase, m512 caseMask, m512 mask1, m512 mask2,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
const u8 *e = buf + end;
u64a lastz0 = 0;
DEBUG_PRINTF("start %zu end %zu \n", start, end);
assert(d < e);
if (d + 64 >= e) {
goto tail;
}
// peel off first part to cacheline boundary
const u8 *d1 = ROUNDUP_PTR(d, 64);
if (scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
&lastz0, start, d1 - buf) == HWLM_TERMINATED) {
return HWLM_TERMINATED;
}
d = d1;
for (; d + 64 < e; d += 64) {
DEBUG_PRINTF("d %p e %p 0x%016llx\n", d, e, lastz0);
m512 v = noCase ? and512(load512(d), caseMask) : load512(d);
/* we have to pull the masks out of the AVX registers because we can't
byte shift between the lanes */
u64a z0 = eq512mask(mask1, v);
u64a z1 = eq512mask(mask2, v);
u64a z = (lastz0 | (z0 << 1)) & z1;
lastz0 = z0 >> 63;
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(d + 256);
DEBUG_PRINTF("z 0x%016llx\n", z);
DOUBLE_ZSCAN();
}
tail:
DEBUG_PRINTF("d %p e %p off %zu \n", d, e, d - buf);
// finish off tail
return scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
&lastz0, d - buf, end);
}

View File

@ -1,310 +0,0 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/* SIMD engine agnostic noodle scan parts */
#include "util/supervector/supervector.hpp"
#include "util/supervector/casemask.hpp"
static really_really_inline
hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
Z_TYPE z, size_t len, const struct cb_info *cbi) {
while (unlikely(z)) {
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;
size_t matchPos = d - buf + pos;
DEBUG_PRINTF("match pos %zu\n", matchPos);
hwlmcb_rv_t rv = final(n, buf, len, n->msk_len != 1, cbi, matchPos);
RETURN_IF_TERMINATED(rv);
}
return HWLM_SUCCESS;
}
static really_really_inline
hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
Z_TYPE z, size_t len, const struct cb_info *cbi) {
while (unlikely(z)) {
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;
size_t matchPos = d - buf + pos - 1;
DEBUG_PRINTF("match pos %zu\n", matchPos);
hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos);
RETURN_IF_TERMINATED(rv);
}
return HWLM_SUCCESS;
}
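The >> Z_POSSHIFT here is worth a note (this reading is an assumption from the surrounding code, not stated in this file): on targets whose compare masks carry more than one bit per input byte, the raw bit index must be scaled down to a byte offset. A scalar model with 4 mask bits per byte:

#include <cassert>

int main() {
    const unsigned Z_POSSHIFT = 2; // assumed: 4 mask bits per input byte
    unsigned raw_bit = 20;         // lowest set bit in the compare mask
    unsigned byte_pos = raw_bit >> Z_POSSHIFT;
    assert(byte_pos == 5);         // candidate at byte offset 5
    return 0;
}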
template<uint16_t S>
static really_inline
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
SuperVector<S> caseMask, SuperVector<S> mask1,
const struct cb_info *cbi, size_t len, size_t start,
size_t end) {
const u8 *d = buf + start;
DEBUG_PRINTF("start %zu end %zu\n", start, end);
const size_t l = end - start;
DEBUG_PRINTF("l = %ld\n", l);
//assert(l <= 64);
if (!l) {
return HWLM_SUCCESS;
}
SuperVector<S> v = SuperVector<S>::Zeroes();
memcpy(&v.u, d, l);
typename SuperVector<S>::comparemask_type mask =
SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width());
v = v & caseMask;
typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
z = SuperVector<S>::iteration_mask(z);
return single_zscan(n, d, buf, z, len, cbi);
}
// The short scan routine. It is used both to scan data up to an
// alignment boundary if needed and to finish off data that the aligned scan
// function can't handle (due to a small/unaligned chunk at the end)
template<uint16_t S>
static really_inline
hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
SuperVector<S> caseMask, SuperVector<S> mask1,
const struct cb_info *cbi, size_t len, size_t offset,
size_t start,
size_t end) {
const u8 *d = buf + offset;
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
const size_t l = end - start;
DEBUG_PRINTF("l = %ld\n", l);
assert(l <= 64);
if (!l) {
return HWLM_SUCCESS;
}
size_t buf_off = start - offset;
typename SuperVector<S>::comparemask_type mask =
SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width())
<< (buf_off * SuperVector<S>::mask_width());
SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
z = SuperVector<S>::iteration_mask(z);
return single_zscan(n, d, buf, z, len, cbi);
}
template<uint16_t S>
static really_inline
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
const struct cb_info *cbi, size_t len, size_t start, size_t end) {
const u8 *d = buf + start;
DEBUG_PRINTF("start %zu end %zu\n", start, end);
const size_t l = end - start;
assert(l <= S);
if (!l) {
return HWLM_SUCCESS;
}
SuperVector<S> v = SuperVector<S>::Zeroes();
memcpy(&v.u, d, l);
v = v & caseMask;
typename SuperVector<S>::comparemask_type mask =
DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width());
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
typename SuperVector<S>::comparemask_type z =
mask & (z1 << (SuperVector<S>::mask_width())) & z2;
z = SuperVector<S>::iteration_mask(z);
return double_zscan(n, d, buf, z, len, cbi);
}
template<uint16_t S>
static really_inline
hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
const struct cb_info *cbi, size_t len, size_t offset, size_t start, size_t end) {
const u8 *d = buf + offset;
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
const size_t l = end - start;
assert(l <= S);
if (!l) {
return HWLM_SUCCESS;
}
SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
size_t buf_off = start - offset;
typename SuperVector<S>::comparemask_type mask =
DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width())
<< (buf_off * SuperVector<S>::mask_width());
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
typename SuperVector<S>::comparemask_type z =
mask & (z1 << SuperVector<S>::mask_width()) & z2;
z = SuperVector<S>::iteration_mask(z);
return double_zscan(n, d, buf, z, len, cbi);
}
template <uint16_t S>
static really_inline
hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset,
SuperVector<S> caseMask, SuperVector<S> mask1,
const struct cb_info *cbi) {
size_t start = offset + n->msk_len - 1;
size_t end = len;
const u8 *d = buf + start;
const u8 *e = buf + end;
DEBUG_PRINTF("start %p end %p \n", d, e);
assert(d < e);
if (e - d < S) {
return scanSingleShort(n, buf, caseMask, mask1, cbi, len, start, end);
}
if (d + S <= e) {
// peel off first part to cacheline boundary
const u8 *d1 = ROUNDUP_PTR(d, S);
DEBUG_PRINTF("until aligned %p \n", d1);
if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) {
return HWLM_TERMINATED;
}
d = d1;
size_t loops = (end - (d - buf)) / S;
DEBUG_PRINTF("loops %ld \n", loops);
for (size_t i = 0; i < loops; i++, d+= S) {
DEBUG_PRINTF("d %p \n", d);
const u8 *base = ROUNDUP_PTR(d, 64);
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(base + 256);
SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
typename SuperVector<S>::comparemask_type z = mask1.eqmask(v);
z = SuperVector<S>::iteration_mask(z);
hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
RETURN_IF_TERMINATED(rv);
}
}
DEBUG_PRINTF("d %p e %p \n", d, e);
// finish off tail
size_t s2End = ROUNDDOWN_PTR(e, S) - buf;
if (s2End == end) {
return HWLM_SUCCESS;
}
return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, end - S, s2End, len);
}
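A condensed sketch (assumed simplification; block() stands in for the real vector compare and zscan, and S must be a power of two) of the head/body/tail structure used by scanSingleMain above; note the tail re-reads the last S bytes rather than doing a short load.
#include <stddef.h>
#include <stdint.h>

static void scan_sketch(const unsigned char *buf, size_t len, size_t S,
                        void (*block)(const unsigned char *d, size_t n)) {
    const unsigned char *d = buf;
    const unsigned char *e = buf + len;
    if ((size_t)(e - d) < S) {
        block(d, (size_t)(e - d));       /* short input: one masked block */
        return;
    }
    const unsigned char *d1 = (const unsigned char *)
        (((uintptr_t)d + S - 1) & ~((uintptr_t)S - 1));
    if (d != d1) {
        block(d, (size_t)(d1 - d));      /* unaligned head up to boundary */
    }
    for (d = d1; d + S <= e; d += S) {
        block(d, S);                     /* aligned full blocks */
    }
    if (d != e) {
        block(e - S, S);                 /* overlapping unaligned tail */
    }
}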
template <uint16_t S>
static really_inline
hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset,
SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
const struct cb_info *cbi) {
// we stop scanning for the key-fragment when the rest of the key can't
// possibly fit in the remaining buffer
size_t end = len - n->key_offset + 2;
size_t start = offset + n->msk_len - n->key_offset;
typename SuperVector<S>::comparemask_type lastz1{0};
const u8 *d = buf + start;
const u8 *e = buf + end;
DEBUG_PRINTF("start %p end %p \n", d, e);
assert(d < e);
if (e - d < S) {
return scanDoubleShort(n, buf, caseMask, mask1, mask2, cbi, len, d - buf, end);
}
if (d + S <= e) {
// peel off first part to cacheline boundary
const u8 *d1 = ROUNDUP_PTR(d, S) + 1;
DEBUG_PRINTF("until aligned %p \n", d1);
if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) {
return HWLM_TERMINATED;
}
d = d1 - 1;
size_t loops = (end - (d - buf)) / S;
DEBUG_PRINTF("loops %ld \n", loops);
for (size_t i = 0; i < loops; i++, d+= S) {
DEBUG_PRINTF("d %p \n", d);
const u8 *base = ROUNDUP_PTR(d, 64);
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(base + 256);
SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
typename SuperVector<S>::comparemask_type z =
(z1 << SuperVector<S>::mask_width() | lastz1) & z2;
lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width());
z = SuperVector<S>::iteration_mask(z);
hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
RETURN_IF_TERMINATED(rv);
}
if (loops == 0) {
d = d1;
}
}
// finish off tail
size_t s2End = ROUNDDOWN_PTR(e, S) - buf;
if (s2End == end) {
return HWLM_SUCCESS;
}
return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, end - S, d - buf, end);
}
// Single-character specialisation, used when keyLen = 1
static really_inline
hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, bool noCase, const struct cb_info *cbi) {
if (!ourisalpha(n->key0)) {
noCase = 0; // force noCase off if we don't have an alphabetic char
}
const SuperVector<VECTORSIZE> caseMask{noCase ? getCaseMask<VECTORSIZE>() : SuperVector<VECTORSIZE>::Ones()};
const SuperVector<VECTORSIZE> mask1{getMask<VECTORSIZE>(n->key0, noCase)};
return scanSingleMain(n, buf, len, start, caseMask, mask1, cbi);
}
static really_inline
hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, bool noCase, const struct cb_info *cbi) {
const SuperVector<VECTORSIZE> caseMask{noCase ? getCaseMask<VECTORSIZE>() : SuperVector<VECTORSIZE>::Ones()};
const SuperVector<VECTORSIZE> mask1{getMask<VECTORSIZE>(n->key0, noCase)};
const SuperVector<VECTORSIZE> mask2{getMask<VECTORSIZE>(n->key1, noCase)};
return scanDoubleMain(n, buf, len, start, caseMask, mask1, mask2, cbi);
}


@ -0,0 +1,203 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/* noodle scan parts for SSE */
static really_inline m128 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase);
return set16x8(k);
}
static really_inline m128 getCaseMask(void) {
return set16x8(0xdf);
}
static really_inline
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m128 caseMask, m128 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
size_t l = end - start;
DEBUG_PRINTF("l %zu\n", l);
assert(l <= 16);
if (!l) {
return HWLM_SUCCESS;
}
m128 v = zeroes128();
// we don't have a clever way of doing this move yet
memcpy(&v, d, l);
if (noCase) {
v = and128(v, caseMask);
}
// mask out where we can't match
u32 mask = (0xFFFF >> (16 - l));
u32 z = mask & movemask128(eq128(mask1, v));
SINGLE_ZSCAN();
return HWLM_SUCCESS;
}
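A quick standalone check (illustrative) of the partial-block mask arithmetic used above: 0xFFFF >> (16 - l) keeps exactly one bit per valid byte lane.
#include <stdio.h>

int main(void) {
    for (unsigned l = 1; l <= 16; l++) {
        unsigned mask = 0xFFFFu >> (16 - l);
        printf("l=%2u mask=0x%04x\n", l, mask);   /* l=5 gives 0x001f */
    }
    return 0;
}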
static really_inline
hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset, bool noCase,
m128 caseMask, m128 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + offset;
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
const size_t l = end - start;
m128 v = loadu128(d);
if (noCase) {
v = and128(v, caseMask);
}
u32 buf_off = start - offset;
u32 mask = ((1 << l) - 1) << buf_off;
u32 z = mask & movemask128(eq128(mask1, v));
DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z);
z &= mask;
SINGLE_ZSCAN();
return HWLM_SUCCESS;
}
static really_inline
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m128 caseMask, m128 mask1,
m128 mask2, const struct cb_info *cbi,
size_t start, size_t end) {
const u8 *d = buf + start;
size_t l = end - start;
if (!l) {
return HWLM_SUCCESS;
}
assert(l <= 16);
DEBUG_PRINTF("d %zu\n", d - buf);
m128 v = zeroes128();
memcpy(&v, d, l);
if (noCase) {
v = and128(v, caseMask);
}
u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
eq128(mask2, v)));
// mask out where we can't match
u32 mask = (0xFFFF >> (16 - l));
z &= mask;
DOUBLE_ZSCAN();
return HWLM_SUCCESS;
}
static really_inline
hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset, bool noCase,
m128 caseMask, m128 mask1, m128 mask2,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + offset;
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
size_t l = end - start;
m128 v = loadu128(d);
if (noCase) {
v = and128(v, caseMask);
}
u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
eq128(mask2, v)));
// mask out where we can't match
u32 buf_off = start - offset;
u32 mask = ((1 << l) - 1) << buf_off;
DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z);
z &= mask;
DOUBLE_ZSCAN();
return HWLM_SUCCESS;
}
static really_inline
hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m128 caseMask, m128 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start, *e = buf + end;
assert(d < e);
for (; d < e; d += 16) {
m128 v = noCase ? and128(load128(d), caseMask) : load128(d);
u32 z = movemask128(eq128(mask1, v));
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(d + 128);
SINGLE_ZSCAN();
}
return HWLM_SUCCESS;
}
static really_inline
hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m128 caseMask, m128 mask1,
m128 mask2, const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start, *e = buf + end;
assert(d < e);
m128 lastz1 = zeroes128();
for (; d < e; d += 16) {
m128 v = noCase ? and128(load128(d), caseMask) : load128(d);
m128 z1 = eq128(mask1, v);
m128 z2 = eq128(mask2, v);
u32 z = movemask128(and128(palignr(z1, lastz1, 15), z2));
lastz1 = z1;
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(d + 128);
DEBUG_PRINTF("z 0x%08x\n", z);
DOUBLE_ZSCAN();
}
return HWLM_SUCCESS;
}


@ -1,259 +0,0 @@
/*
* Copyright (c) 2021, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
static really_inline
hwlm_error_t checkMatched(const struct noodTable *n, const u8 *buf, size_t len,
const struct cb_info *cbi, const u8 *d,
svbool_t matched, bool needsConfirm) {
assert(d >= buf);
size_t basePos = d - buf;
svbool_t next_match = svpnext_b8(matched, svpfalse());
do {
svbool_t brk = svbrkb_z(svptrue_b8(), next_match);
size_t matchPos = basePos + svcntp_b8(svptrue_b8(), brk);
DEBUG_PRINTF("match pos %zu\n", matchPos);
assert(matchPos < len);
hwlmcb_rv_t rv = final(n, buf, len, needsConfirm, cbi, matchPos);
RETURN_IF_TERMINATED(rv);
next_match = svpnext_b8(matched, next_match);
} while (unlikely(svptest_any(svptrue_b8(), next_match)));
return HWLM_SUCCESS;
}
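A scalar model (illustrative; __builtin_ctzll stands in for the svbrkb_z/svcntp_b8 lane count) of the match-position walk above:
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint64_t matched = (1ULL << 3) | (1ULL << 9);  /* lanes 3 and 9 match */
    size_t basePos = 128;                          /* d - buf */
    while (matched) {
        unsigned lane = (unsigned)__builtin_ctzll(matched);
        printf("match pos %zu\n", basePos + lane); /* 131, then 137 */
        matched &= matched - 1;                    /* clear lowest lane */
    }
    return 0;
}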
static really_inline
hwlm_error_t singleCheckMatched(const struct noodTable *n, const u8 *buf,
size_t len, const struct cb_info *cbi,
const u8 *d, svbool_t matched) {
if (unlikely(svptest_any(svptrue_b8(), matched))) {
hwlmcb_rv_t rv = checkMatched(n, buf, len, cbi, d, matched,
n->msk_len != 1);
RETURN_IF_TERMINATED(rv);
}
return HWLM_SUCCESS;
}
static really_inline
svbool_t singleMatched(svuint8_t chars, const u8 *d, svbool_t pg) {
return svmatch(pg, svld1_u8(pg, d), chars);
}
static really_inline
hwlm_error_t scanSingleOnce(const struct noodTable *n, const u8 *buf,
size_t len, const struct cb_info *cbi,
svuint8_t chars, const u8 *d, const u8 *e) {
DEBUG_PRINTF("start %p end %p\n", d, e);
assert(d < e);
assert(d >= buf);
DEBUG_PRINTF("l = %td\n", e - d);
svbool_t pg = svwhilelt_b8_s64(0, e - d);
svbool_t matched = singleMatched(chars, d, pg);
return singleCheckMatched(n, buf, len, cbi, d, matched);
}
static really_inline
hwlm_error_t scanSingleLoop(const struct noodTable *n, const u8 *buf,
size_t len, const struct cb_info *cbi,
svuint8_t chars, const u8 *d, const u8 *e) {
assert(d < e);
assert(d >= buf);
size_t loops = (e - d) / svcntb();
DEBUG_PRINTF("loops %zu \n", loops);
assert(d + (loops * svcntb()) <= e);
for (size_t i = 0; i < loops; i++, d += svcntb()) {
DEBUG_PRINTF("d %p \n", d);
svbool_t matched = singleMatched(chars, d, svptrue_b8());
hwlmcb_rv_t rv = singleCheckMatched(n, buf, len, cbi, d, matched);
RETURN_IF_TERMINATED(rv);
}
DEBUG_PRINTF("d %p e %p \n", d, e);
return d == e ? HWLM_SUCCESS
: scanSingleOnce(n, buf, len, cbi, chars, d, e);
}
static really_inline
hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len,
size_t offset, bool noCase, const struct cb_info *cbi) {
if (!ourisalpha(n->key0)) {
noCase = false; // force noCase off if we don't have an alphabetic char
}
size_t start = offset + n->msk_len - 1;
const u8 *d = buf + start;
const u8 *e = buf + len;
DEBUG_PRINTF("start %p end %p \n", d, e);
assert(d < e);
assert(d >= buf);
svuint8_t chars = getCharMaskSingle(n->key0, noCase);
size_t scan_len = e - d;
if (scan_len <= svcntb()) {
return scanSingleOnce(n, buf, len, cbi, chars, d, e);
}
// peel off first part to align to the vector size
const u8 *d1 = ROUNDUP_PTR(d, svcntb_pat(SV_POW2));
if (d != d1) {
DEBUG_PRINTF("until aligned %p \n", d1);
hwlmcb_rv_t rv = scanSingleOnce(n, buf, len, cbi, chars, d, d1);
RETURN_IF_TERMINATED(rv);
}
return scanSingleLoop(n, buf, len, cbi, chars, d1, e);
}
static really_inline
hwlm_error_t doubleCheckMatched(const struct noodTable *n, const u8 *buf,
size_t len, const struct cb_info *cbi,
const u8 *d, svbool_t matched,
svbool_t matched_rot, svbool_t any) {
if (unlikely(svptest_any(svptrue_b8(), any))) {
// Project predicate onto vector.
svuint8_t matched_vec = svdup_u8_z(matched, 1);
// Shift vector to right by one and project back to the predicate.
matched = svcmpeq_n_u8(svptrue_b8(), svinsr_n_u8(matched_vec, 0), 1);
matched = svorr_z(svptrue_b8(), matched, matched_rot);
// d - 1 won't underflow as the first position in buf has been dealt
// with, meaning that d > buf
assert(d > buf);
hwlmcb_rv_t rv = checkMatched(n, buf, len, cbi, d - 1, matched,
n->msk_len != 2);
RETURN_IF_TERMINATED(rv);
}
return HWLM_SUCCESS;
}
static really_inline
svbool_t doubleMatchedLoop(svuint16_t chars, const u8 *d,
svbool_t * const matched, svbool_t * const matched_rot) {
svuint16_t vec = svreinterpret_u16(svld1_u8(svptrue_b8(), d));
// d - 1 won't underflow as the first position in buf has been dealt
// with, meaning that d > buf
svuint16_t vec_rot = svreinterpret_u16(svld1_u8(svptrue_b8(), d - 1));
*matched = svmatch(svptrue_b8(), vec, chars);
*matched_rot = svmatch(svptrue_b8(), vec_rot, chars);
return svorr_z(svptrue_b8(), *matched, *matched_rot);
}
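The two loads split one overlapping-pair search into non-overlapping u16-lane searches: pairs starting at even offsets come from the load at d, pairs at odd offsets from the load at d - 1. A scalar equivalent of their union (illustrative) is:
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    const char buf[] = "xxabxx";
    uint16_t pat, v;
    memcpy(&pat, "ab", 2);               /* raw bytes, so endian-agnostic */
    for (size_t i = 0; i + 2 <= strlen(buf); i++) {
        memcpy(&v, buf + i, 2);
        if (v == pat) {
            printf("pair match at offset %zu\n", i);   /* offset 2 */
        }
    }
    return 0;
}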
static really_inline
hwlm_error_t scanDoubleOnce(const struct noodTable *n, const u8 *buf,
size_t len, const struct cb_info *cbi,
svuint8_t chars, const u8 *d, const u8 *e) {
DEBUG_PRINTF("start %p end %p\n", d, e);
assert(d < e);
assert(d > buf);
const ptrdiff_t size = e - d;
svbool_t pg = svwhilelt_b8_s64(0, size);
svbool_t pg_rot = svwhilelt_b8_s64(0, size + 1);
svuint16_t vec = svreinterpret_u16(svld1_u8(pg, d));
// d - 1 won't underflow as the first position in buf has been dealt
// with, meaning that d > buf
svuint16_t vec_rot = svreinterpret_u16(svld1_u8(pg_rot, d - 1));
// we reuse u8 predicates for u16 lanes. This means that we will check against one
// extra \0 character at the end of the vector.
if(unlikely(n->key1 == '\0')) {
if (size % 2) {
// if odd, vec has an odd number of lanes and has the spurious \0
svbool_t lane_to_disable = svrev_b8(svpfirst(svrev_b8(pg), svpfalse()));
pg = sveor_z(svptrue_b8(), pg, lane_to_disable);
} else {
// if even, vec_rot has an odd number of lanes and has the spurious \0
// we need to disable the last active lane as well, but we know pg is
// the same as pg_rot without the last lane
pg_rot = pg;
}
}
svbool_t matched = svmatch(pg, vec, svreinterpret_u16(chars));
svbool_t matched_rot = svmatch(pg_rot, vec_rot, svreinterpret_u16(chars));
svbool_t any = svorr_z(svptrue_b8(), matched, matched_rot);
return doubleCheckMatched(n, buf, len, cbi, d, matched, matched_rot, any);
}
static really_inline
hwlm_error_t scanDoubleLoop(const struct noodTable *n, const u8 *buf,
size_t len, const struct cb_info *cbi,
svuint8_t chars, const u8 *d, const u8 *e) {
assert(d < e);
assert(d > buf);
size_t loops = (e - d) / svcntb();
DEBUG_PRINTF("loops %zu \n", loops);
assert(d + (loops * svcntb()) <= e);
for (size_t i = 0; i < loops; i++, d += svcntb()) {
DEBUG_PRINTF("d %p \n", d);
svbool_t matched, matched_rot;
svbool_t any = doubleMatchedLoop(svreinterpret_u16(chars), d, &matched, &matched_rot);
hwlm_error_t rv = doubleCheckMatched(n, buf, len, cbi, d,
matched, matched_rot, any);
RETURN_IF_TERMINATED(rv);
}
DEBUG_PRINTF("d %p e %p \n", d, e);
return d == e ? HWLM_SUCCESS
: scanDoubleOnce(n, buf, len, cbi, chars, d, e);
}
static really_inline
hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len,
size_t offset, bool noCase, const struct cb_info *cbi) {
// we stop scanning for the key-fragment when the rest of the key can't
// possibly fit in the remaining buffer
size_t end = len - n->key_offset + 2;
size_t start = offset + n->msk_len - n->key_offset;
const u8 *d = buf + start;
const u8 *e = buf + end;
DEBUG_PRINTF("start %p end %p \n", d, e);
assert(d < e);
assert(d >= buf);
size_t scan_len = e - d;
if (scan_len < 2) {
return HWLM_SUCCESS;
}
++d;
svuint8_t chars = svreinterpret_u8(getCharMaskDouble(n->key0, n->key1, noCase));
if (scan_len <= svcntb()) {
return scanDoubleOnce(n, buf, len, cbi, chars, d, e);
}
// peel off first part to align to the vector size
const u8 *d1 = ROUNDUP_PTR(d, svcntb_pat(SV_POW2));
if (d != d1) {
DEBUG_PRINTF("until aligned %p \n", d1);
hwlmcb_rv_t rv = scanDoubleOnce(n, buf, len, cbi, chars,
d, d1);
RETURN_IF_TERMINATED(rv);
}
return scanDoubleLoop(n, buf, len, cbi, chars, d1, e);
}


@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
- * Copyright (c) 2021, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@ -30,7 +29,7 @@
 #include "accel.h"
 #include "shufti.h"
 #include "truffle.h"
-#include "vermicelli.hpp"
+#include "vermicelli.h"
 #include "ue2common.h"

 const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
@ -82,39 +81,6 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
                        c_end - 1);
         break;
-#ifdef HAVE_SVE2
-    case ACCEL_VERM16:
-        DEBUG_PRINTF("accel verm16 %p %p\n", c, c_end);
-        if (c_end - c < 16) {
-            return c;
-        }
-        rv = vermicelli16Exec(accel->verm16.mask, c, c_end);
-        break;
-    case ACCEL_DVERM16:
-        DEBUG_PRINTF("accel dverm16 %p %p\n", c, c_end);
-        if (c_end - c < 18) {
-            return c;
-        }
-        /* need to stop one early to get an accurate end state */
-        rv = vermicelliDouble16Exec(accel->dverm16.mask, accel->dverm16.firsts,
-                                    c, c_end - 1);
-        break;
-    case ACCEL_DVERM16_MASKED:
-        DEBUG_PRINTF("accel dverm16 masked %p %p\n", c, c_end);
-        if (c_end - c < 18) {
-            return c;
-        }
-        /* need to stop one early to get an accurate end state */
-        rv = vermicelliDoubleMasked16Exec(accel->mdverm16.mask, accel->mdverm16.c1,
-                                          accel->mdverm16.m1, c, c_end - 1);
-        break;
-#endif // HAVE_SVE2
     case ACCEL_DVERM_MASKED:
         DEBUG_PRINTF("accel dverm masked %p %p\n", c, c_end);
         if (c + 16 + 1 >= c_end) {
@ -142,18 +108,9 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
             return c;
         }
-        rv = truffleExec(accel->truffle.mask_lo, accel->truffle.mask_hi, c, c_end);
+        rv = truffleExec(accel->truffle.mask1, accel->truffle.mask2, c, c_end);
         break;
-#ifdef CAN_USE_WIDE_TRUFFLE
-    case ACCEL_TRUFFLE_WIDE:
-        DEBUG_PRINTF("accel Truffle Wide %p %p\n", c, c_end);
-        if (c + 15 >= c_end) {
-            return c;
-        }
-        rv = truffleExecWide(accel->truffle.mask, c, c_end);
-        break;
-#endif
     case ACCEL_DSHUFTI:
         DEBUG_PRINTF("accel dshufti %p %p\n", c, c_end);
         if (c + 15 + 1 >= c_end) {


@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
- * Copyright (c) 2021, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@ -63,10 +62,6 @@ enum AccelType {
     ACCEL_TRUFFLE,
     ACCEL_RED_TAPE,
     ACCEL_DVERM_MASKED,
-    ACCEL_VERM16,
-    ACCEL_DVERM16,
-    ACCEL_DVERM16_MASKED,
-    ACCEL_TRUFFLE_WIDE,
 };

 /** \brief Structure for accel framework. */
@ -102,24 +97,6 @@ union AccelAux {
         u8 len1;
         u8 len2;
     } mdverm;
-    struct {
-        u8 accel_type;
-        u8 offset;
-        m128 mask;
-    } verm16;
-    struct {
-        u8 accel_type;
-        u8 offset;
-        u64a firsts;
-        m128 mask;
-    } dverm16;
-    struct {
-        u8 accel_type;
-        u8 offset;
-        u8 c1; // used for partial match
-        u8 m1; // used for partial match
-        m128 mask;
-    } mdverm16;
     struct {
         u8 accel_type;
         u8 offset;
@ -137,18 +114,8 @@ union AccelAux {
     struct {
         u8 accel_type;
         u8 offset;
-        union {
-            m256 mask;
-            struct {
-#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
-                m128 mask_lo;
-                m128 mask_hi;
-#else
-                m128 mask_hi;
-                m128 mask_lo;
-#endif
-            };
-        };
+        m128 mask1;
+        m128 mask2;
     } truffle;
 };


@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
- * Copyright (c) 2021, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@ -34,7 +33,6 @@
 #include "nfagraph/ng_limex_accel.h"
 #include "shufticompile.h"
 #include "trufflecompile.h"
-#include "vermicellicompile.h"
 #include "util/accel_scheme.h"
 #include "util/charreach.h"
 #include "util/container.h"
@ -107,7 +105,7 @@ static
 path append(const path &orig, const CharReach &cr, u32 new_dest) {
     path p(new_dest);
     p.reach = orig.reach;
-    p.reach.emplace_back(cr);
+    p.reach.push_back(cr);
     return p;
 }
@ -119,25 +117,25 @@ void extend(const raw_dfa &rdfa, const vector<CharReach> &rev_map,
     const dstate &s = rdfa.states[p.dest];
     if (!p.reach.empty() && p.reach.back().none()) {
-        out.emplace_back(p);
+        out.push_back(p);
         return;
     }
     if (!s.reports.empty()) {
         if (generates_callbacks(rdfa.kind)) {
-            out.emplace_back(p);
+            out.push_back(p);
             return;
         } else {
             path pp = append(p, CharReach(), p.dest);
-            all[p.dest].emplace_back(pp);
-            out.emplace_back(std::move(pp));
+            all[p.dest].push_back(pp);
+            out.push_back(move(pp));
         }
     }
     if (!s.reports_eod.empty()) {
         path pp = append(p, CharReach(), p.dest);
-        all[p.dest].emplace_back(pp);
-        out.emplace_back(std::move(pp));
+        all[p.dest].push_back(pp);
+        out.push_back(move(pp));
     }
     flat_map<u32, CharReach> dest;
@ -156,8 +154,8 @@ void extend(const raw_dfa &rdfa, const vector<CharReach> &rev_map,
         DEBUG_PRINTF("----good: [%s] -> %u\n",
                      describeClasses(pp.reach).c_str(), pp.dest);
-        all[e.first].emplace_back(pp);
-        out.emplace_back(std::move(pp));
+        all[e.first].push_back(pp);
+        out.push_back(move(pp));
     }
 }
@ -167,14 +165,14 @@ vector<vector<CharReach>> generate_paths(const raw_dfa &rdfa,
     const vector<CharReach> rev_map = reverse_alpha_remapping(rdfa);
     vector<path> paths{path(base)};
     unordered_map<u32, vector<path>> all;
-    all[base].emplace_back(path(base));
+    all[base].push_back(path(base));
     for (u32 i = 0; i < len && paths.size() < PATHS_LIMIT; i++) {
         vector<path> next_gen;
         for (const auto &p : paths) {
             extend(rdfa, rev_map, p, all, next_gen);
         }
-        paths = std::move(next_gen);
+        paths = move(next_gen);
     }
     dump_paths(paths);
@ -182,8 +180,7 @@ vector<vector<CharReach>> generate_paths(const raw_dfa &rdfa,
     vector<vector<CharReach>> rv;
     rv.reserve(paths.size());
     for (auto &p : paths) {
-        // cppcheck-suppress useStlAlgorithm
-        rv.emplace_back(vector<CharReach>(std::make_move_iterator(p.reach.begin()),
+        rv.push_back(vector<CharReach>(std::make_move_iterator(p.reach.begin()),
                        std::make_move_iterator(p.reach.end())));
     }
     return rv;
@ -321,7 +318,7 @@ set<dstate_id_t> find_region(const raw_dfa &rdfa, dstate_id_t base,
             DEBUG_PRINTF("  %hu is in region\n", t);
             region.insert(t);
-            pending.emplace_back(t);
+            pending.push_back(t);
         }
     }
@ -427,11 +424,10 @@ void
 accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
                                   const AccelScheme &info,
                                   void *accel_out) {
-    AccelAux *accel = reinterpret_cast<AccelAux *>(accel_out);
+    AccelAux *accel = (AccelAux *)accel_out;
     DEBUG_PRINTF("accelerations scheme has offset s%u/d%u\n", info.offset,
                  info.double_offset);
-    // cppcheck-suppress redundantInitialization
     accel->generic.offset = verify_u8(info.offset);
     if (double_byte_ok(info) && info.double_cr.none() &&
@ -444,87 +440,52 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
         return;
     }
-    if (double_byte_ok(info) && info.double_cr.none()) {
-        if ((info.double_byte.size() == 2 || info.double_byte.size() == 4)) {
-            bool ok = true;
+    if (double_byte_ok(info) && info.double_cr.none() &&
+        (info.double_byte.size() == 2 || info.double_byte.size() == 4)) {
+        bool ok = true;
         assert(!info.double_byte.empty());
         u8 firstC = info.double_byte.begin()->first & CASE_CLEAR;
         u8 secondC = info.double_byte.begin()->second & CASE_CLEAR;
         for (const pair<u8, u8> &p : info.double_byte) {
             if ((p.first & CASE_CLEAR) != firstC ||
                 (p.second & CASE_CLEAR) != secondC) {
                 ok = false;
                 break;
             }
         }
         if (ok) {
             accel->accel_type = ACCEL_DVERM_NOCASE;
             accel->dverm.c1 = firstC;
             accel->dverm.c2 = secondC;
             accel->dverm.offset = verify_u8(info.double_offset);
             DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx);
             return;
         }
         u8 m1;
         u8 m2;
         if (buildDvermMask(info.double_byte, &m1, &m2)) {
-            u8 c1 = info.double_byte.begin()->first & m1;
-            u8 c2 = info.double_byte.begin()->second & m2;
-#ifdef HAVE_SVE2
-            if (vermicelliDoubleMasked16Build(c1, c2, m1, m2,
-                    reinterpret_cast<u8 *>(&accel->mdverm16.mask))) {
-                accel->accel_type = ACCEL_DVERM16_MASKED;
-                accel->mdverm16.offset = verify_u8(info.double_offset);
-                accel->mdverm16.c1 = c1;
-                accel->mdverm16.m1 = m1;
-                DEBUG_PRINTF("building maskeddouble16-vermicelli for 0x%02hhx%02hhx\n",
-                             c1, c2);
-                return;
-            } else if (info.double_byte.size() <= 8 &&
-                       vermicelliDouble16Build(info.double_byte,
-                           reinterpret_cast<u8 *>(&accel->dverm16.mask),
-                           reinterpret_cast<u8 *>(&accel->dverm16.firsts))) {
-                accel->accel_type = ACCEL_DVERM16;
-                accel->dverm16.offset = verify_u8(info.double_offset);
-                DEBUG_PRINTF("building double16-vermicelli\n");
-                return;
-            }
-#endif // HAVE_SVE2
             accel->accel_type = ACCEL_DVERM_MASKED;
             accel->dverm.offset = verify_u8(info.double_offset);
-            accel->dverm.c1 = c1;
-            accel->dverm.c2 = c2;
+            accel->dverm.c1 = info.double_byte.begin()->first & m1;
+            accel->dverm.c2 = info.double_byte.begin()->second & m2;
             accel->dverm.m1 = m1;
             accel->dverm.m2 = m2;
             DEBUG_PRINTF(
-                "building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", c1, c2);
+                "building maskeddouble-vermicelli for 0x%02hhx%02hhx\n",
+                accel->dverm.c1, accel->dverm.c2);
             return;
         }
-        }
-#ifdef HAVE_SVE2
-        if (info.double_byte.size() <= 8 &&
-            vermicelliDouble16Build(info.double_byte,
-                reinterpret_cast<u8 *>(&accel->dverm16.mask),
-                reinterpret_cast<u8 *>(&accel->dverm16.firsts))) {
-            accel->accel_type = ACCEL_DVERM16;
-            accel->dverm16.offset = verify_u8(info.double_offset);
-            DEBUG_PRINTF("building double16-vermicelli\n");
-            return;
-        }
-#endif // HAVE_SVE2
     }
     if (double_byte_ok(info) &&
         shuftiBuildDoubleMasks(
-            info.double_cr, info.double_byte,
-            reinterpret_cast<u8 *>(&accel->dshufti.lo1),
-            reinterpret_cast<u8 *>(&accel->dshufti.hi1),
-            reinterpret_cast<u8 *>(&accel->dshufti.lo2),
-            reinterpret_cast<u8 *>(&accel->dshufti.hi2))) {
+            info.double_cr, info.double_byte, (u8 *)&accel->dshufti.lo1,
+            (u8 *)&accel->dshufti.hi1, (u8 *)&accel->dshufti.lo2,
+            (u8 *)&accel->dshufti.hi2)) {
         accel->accel_type = ACCEL_DSHUFTI;
         accel->dshufti.offset = verify_u8(info.double_offset);
         DEBUG_PRINTF("state %hu is double shufti\n", this_idx);
@ -553,15 +514,6 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
         return;
     }
-#ifdef HAVE_SVE2
-    if (info.cr.count() <= 16) {
-        accel->accel_type = ACCEL_VERM16;
-        vermicelli16Build(info.cr, reinterpret_cast<u8 *>(&accel->verm16.mask));
-        DEBUG_PRINTF("state %hu is vermicelli16\n", this_idx);
-        return;
-    }
-#endif // HAVE_SVE2
-
     if (info.cr.count() > max_floating_stop_char()) {
         accel->accel_type = ACCEL_NONE;
         DEBUG_PRINTF("state %hu is too broad\n", this_idx);
@ -569,27 +521,16 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
     }
     accel->accel_type = ACCEL_SHUFTI;
-    if (-1 != shuftiBuildMasks(info.cr,
-                               reinterpret_cast<u8 *>(&accel->shufti.lo),
-                               reinterpret_cast<u8 *>(&accel->shufti.hi))) {
+    if (-1 != shuftiBuildMasks(info.cr, (u8 *)&accel->shufti.lo,
+                               (u8 *)&accel->shufti.hi)) {
         DEBUG_PRINTF("state %hu is shufti\n", this_idx);
         return;
     }
     assert(!info.cr.none());
-#if defined(CAN_USE_WIDE_TRUFFLE)
-    if(CAN_USE_WIDE_TRUFFLE) {
-        accel->accel_type = ACCEL_TRUFFLE_WIDE;
-        truffleBuildMasksWide(info.cr,
-                              reinterpret_cast<u8 *>(&accel->truffle.mask));
-    } else
-#endif
-    {
-        accel->accel_type = ACCEL_TRUFFLE;
-        truffleBuildMasks(info.cr,
-                          reinterpret_cast<u8 *>(&accel->truffle.mask_lo),
-                          reinterpret_cast<u8 *>(&accel->truffle.mask_hi));
-    }
+    accel->accel_type = ACCEL_TRUFFLE;
+    truffleBuildMasks(info.cr, (u8 *)&accel->truffle.mask1,
+                      (u8 *)&accel->truffle.mask2);
     DEBUG_PRINTF("state %hu is truffle\n", this_idx);
 }


@ -93,8 +93,6 @@ const char *accelName(u8 accel_type) {
return "double-shufti"; return "double-shufti";
case ACCEL_TRUFFLE: case ACCEL_TRUFFLE:
return "truffle"; return "truffle";
case ACCEL_TRUFFLE_WIDE:
return "truffle wide";
case ACCEL_RED_TAPE: case ACCEL_RED_TAPE:
return "red tape"; return "red tape";
default: default:
@ -180,13 +178,6 @@ void dumpTruffleCharReach(FILE *f, const u8 *hiset, const u8 *hiclear) {
describeClass(cr).c_str()); describeClass(cr).c_str());
} }
static
void dumpWideTruffleCharReach(FILE *f, const u8 *mask) {
CharReach cr = truffle2crWide(mask);
fprintf(f, "count %zu class %s\n", cr.count(),
describeClass(cr).c_str());
}
static static
void dumpTruffleMasks(FILE *f, const u8 *hiset, const u8 *hiclear) { void dumpTruffleMasks(FILE *f, const u8 *hiset, const u8 *hiclear) {
fprintf(f, "lo %s\n", dumpMask(hiset, 128).c_str()); fprintf(f, "lo %s\n", dumpMask(hiset, 128).c_str());
@ -219,38 +210,31 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) {
break; break;
case ACCEL_SHUFTI: { case ACCEL_SHUFTI: {
fprintf(f, "\n"); fprintf(f, "\n");
dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.shufti.lo), dumpShuftiMasks(f, (const u8 *)&accel.shufti.lo,
reinterpret_cast<const u8 *>(&accel.shufti.hi)); (const u8 *)&accel.shufti.hi);
dumpShuftiCharReach(f, reinterpret_cast<const u8 *>(&accel.shufti.lo), dumpShuftiCharReach(f, (const u8 *)&accel.shufti.lo,
reinterpret_cast<const u8 *>(&accel.shufti.hi)); (const u8 *)&accel.shufti.hi);
break; break;
} }
case ACCEL_DSHUFTI: case ACCEL_DSHUFTI:
fprintf(f, "\n"); fprintf(f, "\n");
fprintf(f, "mask 1\n"); fprintf(f, "mask 1\n");
dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo1), dumpShuftiMasks(f, (const u8 *)&accel.dshufti.lo1,
reinterpret_cast<const u8 *>(&accel.dshufti.hi1)); (const u8 *)&accel.dshufti.hi1);
fprintf(f, "mask 2\n"); fprintf(f, "mask 2\n");
dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo2), dumpShuftiMasks(f, (const u8 *)&accel.dshufti.lo2,
reinterpret_cast<const u8 *>(&accel.dshufti.hi2)); (const u8 *)&accel.dshufti.hi2);
dumpDShuftiCharReach(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo1), dumpDShuftiCharReach(f, (const u8 *)&accel.dshufti.lo1,
reinterpret_cast<const u8 *>(&accel.dshufti.hi1), (const u8 *)&accel.dshufti.hi1,
reinterpret_cast<const u8 *>(&accel.dshufti.lo2), (const u8 *)&accel.dshufti.lo2,
reinterpret_cast<const u8 *>(&accel.dshufti.hi2)); (const u8 *)&accel.dshufti.hi2);
break; break;
case ACCEL_TRUFFLE: { case ACCEL_TRUFFLE: {
fprintf(f, "\n"); fprintf(f, "\n");
dumpTruffleMasks(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo), dumpTruffleMasks(f, (const u8 *)&accel.truffle.mask1,
reinterpret_cast<const u8 *>(&accel.truffle.mask_hi)); (const u8 *)&accel.truffle.mask2);
dumpTruffleCharReach(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo), dumpTruffleCharReach(f, (const u8 *)&accel.truffle.mask1,
reinterpret_cast<const u8 *>(&accel.truffle.mask_hi)); (const u8 *)&accel.truffle.mask2);
break;
}
case ACCEL_TRUFFLE_WIDE: {
fprintf(f, "\n");
dumpTruffleMasks(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo),
reinterpret_cast<const u8 *>(&accel.truffle.mask_hi));
dumpWideTruffleCharReach(f, reinterpret_cast<const u8 *>(&accel.truffle.mask));
break; break;
} }
default: default:


@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
- * Copyright (c) 2021, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@ -30,7 +29,6 @@
 #include "accel.h"
 #include "accelcompile.h"
 #include "shufticompile.h"
-#include "vermicellicompile.h"
 #include "trufflecompile.h"
 #include "nfagraph/ng_limex_accel.h" /* for constants */
 #include "util/bitutils.h"
@ -73,20 +71,9 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) {
         return;
     }
-#ifdef HAVE_SVE2
-    if (outs <= 16) {
-        aux->accel_type = ACCEL_VERM16;
-        aux->verm16.offset = offset;
-        vermicelli16Build(info.single_stops, (u8 *)&aux->verm16.mask);
-        DEBUG_PRINTF("building vermicelli16\n");
-        return;
-    }
-#endif
-
     DEBUG_PRINTF("attempting shufti for %zu chars\n", outs);
-    if (-1 != shuftiBuildMasks(info.single_stops,
-                               reinterpret_cast<u8 *>(&aux->shufti.lo),
-                               reinterpret_cast<u8 *>(&aux->shufti.hi))) {
+    if (-1 != shuftiBuildMasks(info.single_stops, (u8 *)&aux->shufti.lo,
+                               (u8 *)&aux->shufti.hi)) {
         aux->accel_type = ACCEL_SHUFTI;
         aux->shufti.offset = offset;
         DEBUG_PRINTF("shufti built OK\n");
@ -97,20 +84,10 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) {
     if (outs <= ACCEL_MAX_STOP_CHAR) {
         DEBUG_PRINTF("building Truffle for %zu chars\n", outs);
+        aux->accel_type = ACCEL_TRUFFLE;
         aux->truffle.offset = offset;
-#if defined(CAN_USE_WIDE_TRUFFLE)
-        if(CAN_USE_WIDE_TRUFFLE) {
-            aux->accel_type = ACCEL_TRUFFLE_WIDE;
-            truffleBuildMasksWide(info.single_stops,
-                                  reinterpret_cast<u8 *>(&aux->truffle.mask));
-        } else
-#endif
-        {
-            aux->accel_type = ACCEL_TRUFFLE;
-            truffleBuildMasks(info.single_stops,
-                              reinterpret_cast<u8 *>(&aux->truffle.mask_lo),
-                              reinterpret_cast<u8 *>(&aux->truffle.mask_hi));
-        }
+        truffleBuildMasks(info.single_stops, (u8 *)&aux->truffle.mask1,
+                          (u8 *)&aux->truffle.mask2);
         return;
     }
@ -218,46 +195,16 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
         u8 m2;
         if (buildDvermMask(info.double_stop2, &m1, &m2)) {
-            u8 c1 = info.double_stop2.begin()->first & m1;
-            u8 c2 = info.double_stop2.begin()->second & m2;
-#ifdef HAVE_SVE2
-            if (vermicelliDoubleMasked16Build(c1, c2, m1, m2, (u8 *)&aux->mdverm16.mask)) {
-                aux->accel_type = ACCEL_DVERM16_MASKED;
-                aux->mdverm16.offset = offset;
-                aux->mdverm16.c1 = c1;
-                aux->mdverm16.m1 = m1;
-                DEBUG_PRINTF("building maskeddouble16-vermicelli for 0x%02hhx%02hhx\n",
-                             c1, c2);
-                return;
-            } else if (outs2 <= 8 &&
-                       vermicelliDouble16Build(info.double_stop2,
-                           reinterpret_cast<u8 *>(&aux->dverm16.mask),
-                           reinterpret_cast<u8 *>(&aux->dverm16.firsts))) {
-                aux->accel_type = ACCEL_DVERM16;
-                aux->dverm16.offset = offset;
-                DEBUG_PRINTF("building double16-vermicelli\n");
-                return;
-            }
-#endif // HAVE_SVE2
             aux->accel_type = ACCEL_DVERM_MASKED;
             aux->dverm.offset = offset;
-            aux->dverm.c1 = c1;
-            aux->dverm.c2 = c2;
+            aux->dverm.c1 = info.double_stop2.begin()->first & m1;
+            aux->dverm.c2 = info.double_stop2.begin()->second & m2;
             aux->dverm.m1 = m1;
             aux->dverm.m2 = m2;
-            DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", c1, c2);
+            DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n",
+                         aux->dverm.c1, aux->dverm.c2);
             return;
         }
-#ifdef HAVE_SVE2
-        if (outs2 <= 8 &&
-            vermicelliDouble16Build(info.double_stop2, (u8 *)&aux->dverm16.mask,
-                                    (u8 *)&aux->dverm16.firsts)) {
-            aux->accel_type = ACCEL_DVERM16;
-            aux->dverm16.offset = offset;
-            DEBUG_PRINTF("building double16-vermicelli\n");
-            return;
-        }
-#endif // HAVE_SVE2
     }
     if (outs1 < outs2 && outs1 <= 2) { // Heuristic from UE-438.
@ -266,11 +213,9 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
     aux->accel_type = ACCEL_DSHUFTI;
     aux->dshufti.offset = offset;
     if (shuftiBuildDoubleMasks(
-            info.double_stop1, info.double_stop2,
-            reinterpret_cast<u8 *>(&aux->dshufti.lo1),
-            reinterpret_cast<u8 *>(&aux->dshufti.hi1),
-            reinterpret_cast<u8 *>(&aux->dshufti.lo2),
-            reinterpret_cast<u8 *>(&aux->dshufti.hi2))) {
+            info.double_stop1, info.double_stop2, (u8 *)&aux->dshufti.lo1,
+            (u8 *)&aux->dshufti.hi1, (u8 *)&aux->dshufti.lo2,
+            (u8 *)&aux->dshufti.hi2)) {
         return;
     }
 }


@ -1,81 +0,0 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Shufti: character class acceleration.
*/
template <uint16_t S>
static really_inline
const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask_hi, SuperVector<S> chars) {
const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
SuperVector<S> c_lo = chars & low4bits;
SuperVector<S> c_hi = chars.template vshr_8_imm<4>();
c_lo = mask_lo.template pshufb<false>(c_lo);
c_hi = mask_hi.template pshufb<false>(c_hi);
return (c_lo & c_hi) > (SuperVector<S>::Zeroes());
}
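A scalar model of the nibble-split membership test above (illustrative; the bucket assignment is what shuftiBuildMasks computes at compile time): a byte is in the class when the entries selected by its low and high nibbles share a set bucket bit.
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint8_t mask_lo[16] = {0}, mask_hi[16] = {0};
    unsigned char c = 'a';                /* put 'a' (0x61) in the class */
    uint8_t bucket = 0x01;                /* assumed single-bucket layout */
    mask_lo[c & 0xf] |= bucket;
    mask_hi[c >> 4] |= bucket;
    for (unsigned char p = '^'; p <= 'c'; p++) {
        int hit = (mask_lo[p & 0xf] & mask_hi[p >> 4]) != 0;
        printf("0x%02x ('%c') in class: %d\n", p, p, hit); /* only 'a' hits */
    }
    return 0;
}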
template <uint16_t S>
static really_inline
SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> *inout_t1, SuperVector<S> chars) {
const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
SuperVector<S> chars_lo = chars & low4bits;
chars_lo.print8("chars_lo");
SuperVector<S> chars_hi = chars.template vshr_64_imm<4>() & low4bits;
chars_hi.print8("chars_hi");
SuperVector<S> c1_lo = mask1_lo.template pshufb<true>(chars_lo);
c1_lo.print8("c1_lo");
SuperVector<S> c1_hi = mask1_hi.template pshufb<true>(chars_hi);
c1_hi.print8("c1_hi");
SuperVector<S> new_t1 = c1_lo | c1_hi;
// t1 is the match mask for the first char of the patterns
new_t1.print8("t1");
SuperVector<S> c2_lo = mask2_lo.template pshufb<true>(chars_lo);
c2_lo.print8("c2_lo");
SuperVector<S> c2_hi = mask2_hi.template pshufb<true>(chars_hi);
c2_hi.print8("c2_hi");
SuperVector<S> t2 = c2_lo | c2_hi;
// t2 is the match mask for the second char of the patterns
t2.print8("t2");
// offset t1 so it aligns with t2. The hole created by the offset is filled
// with the last elements of the previous t1 so no info is lost.
// Bits set to 0 lining up indicate a match.
SuperVector<S> t = (new_t1.alignr(*inout_t1, S-1)) | t2;
t.print8("t");
*inout_t1 = new_t1;
return !t.eq(SuperVector<S>::Ones());
}


@ -1,261 +0,0 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Truffle: character class acceleration.
*
*/
#ifdef HAVE_SVE
#ifdef HAVE_SVE2
/*
* blockSingleMask takes in a character set (as masks) and a string and returns, for each
* character of the string, whether or not it is part of the set.
*
* 'shuf_mask_lo_highclear' and 'shuf_mask_lo_highset' are 128-bit masks where each bit
* represents whether or not a character is in the character set. The 'highclear' and
* 'highset' in the names refer to the MSb of the byte of the character (allowing two
* 128-bit masks to cover all 256 values).
*
* The mask is an array of 32 bytes and is encoded this way:
* Let C be a character in the set. The bit describing that character is at byte[C%32] and
* within that byte, it's at bit[C/32].
* As an example, 'a' = 0x61, so the resulting mask will be: 0x00 0x08 0x00 0x00 0x00 ...
*
* Assume the mask is in one of these configurations:
* - both masks are exactly 128b wide
* - the first mask is exactly 256b wide and the second is zeroed.
* - the first mask is more than 256b wide, with bits past the 256th being zero, and the second mask is zeroed.
*/
static really_inline
svuint8_t blockSingleMaskWideSVE2(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
const svuint8_t pshub_mask = svdup_u8(0x1f);
const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201));
svuint8x2_t shuf_mask_32 = svcreate2(shuf_mask_lo_highclear, shuf_mask_lo_highset);
/*
* svtbl2 does a table lookup. Each byte in the second argument indexes into the array of bytes
* in shuf_mask_32 and saves the result in the corresponding byte of byte_select.
* We mask the chars so that we are using the low nibble of char as the index.
*/
svuint8_t byte_select = svtbl2(shuf_mask_32, svand_x(svptrue_b8(), chars, pshub_mask));
/*
* We now have selected the byte that contains the bit corresponding to the char. We need to
* further filter it; otherwise we'd get a match for any character congruent (mod 32) to a searched character.
*
* The low nibble was used previously to select the byte out of the mask. The high nibble is
* used to select the bit out of the byte. So we shift everything right by 5.
*
* Using svtbl, we can make an array where each element is a different bit. Using the high
* nibble we can get a mask selecting only the bit out of a byte that may have the relevant
* charset char.
*/
svuint8_t char_high_nibble = svlsr_x(svptrue_b8(), chars, 5);
svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_nibble);
/*
* We apply the bit_select mask onto the selected byte. What is left is the bit in the charset
* encoding the character in char. A non zero value means the char was in the charset
*
* The _x suffix only works if we process a full char vector. If we were to use a partial
* vector, then _z and a mask would be required on this svand only. Otherwise, the disabled
* lanes may have arbitrary values
*/
return svand_x(svptrue_b8(), byte_select, bit_select);
}
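A host-side sketch (illustrative; not a function from this file) that builds and tests the 32-byte encoding described above:
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    uint8_t mask[32];
    memset(mask, 0, sizeof(mask));
    unsigned char c = 'a';                      /* 0x61 */
    mask[c % 32] |= (uint8_t)(1u << (c / 32));  /* byte 1, bit 3 */
    printf("mask[1] = 0x%02x\n", mask[1]);      /* 0x08, as documented */
    printf("'a' in set: %d\n", (mask[c % 32] >> (c / 32)) & 1);
    return 0;
}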
#endif //HAVE_SVE2
/*
* blockSingleMask takes in a character set (as masks) and a string and returns, for each
* character of the string, whether or not it is part of the set.
*
* 'shuf_mask_lo_highclear' and 'shuf_mask_lo_highset' are 128-bit masks where each bit
* represents whether or not a character is in the character set. The 'highclear' and
* 'highset' in the names refer to the MSb of the byte of the character (allowing two
* 128-bit masks to cover all 256 values).
*
* The masks are arrays of 16 bytes each and are encoded this way:
* Let C be a character in the set. The bit describing that character is at byte[C%16] and
* within that byte, it's at bit[C/16].
* As an example, 'a' = 0x61, so the resulting mask will be: 0x00 0x40 0x00 0x00 0x00 ...
*
* Assume both masks are 128b wide. If they are larger, the additional bits must be zero
*/
static really_inline
svuint8_t blockSingleMaskSVE(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
const svuint8_t highconst = svdup_u8(0x80);
const svuint8_t pshub_mask = svdup_u8(0x8f);
const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201));
/*
* svtbl does a table lookup. Each byte in the second argument indexes into the array of bytes
* in shuf_mask_lo_highclear and saves the result in the corresponding byte of byte_select_low.
* We mask the chars so that we are using the low nibble of char as the index but we keep the
* MSb so that high characters (not represented by the highclear mask) become an index out of
* bounds and result in a 0.
*/
svuint8_t byte_select_low = svtbl(shuf_mask_lo_highclear, svand_x(svptrue_b8(), chars, pshub_mask));
/*
* We flip the MSb of the chars and do the same table lookup with the highset mask.
* This way it's the characters with MSb cleared that will result in out-of-bounds indexes.
* This allows us to cover the full range (0-127 and 128-255)
*/
svuint8_t char_high_flipped = sveor_x(svptrue_b8(), chars, highconst);
svuint8_t byte_select_high = svtbl(shuf_mask_lo_highset, svand_x(svptrue_b8(), char_high_flipped, pshub_mask));
/*
* We now have selected the byte that contains the bit corresponding to the char. We need to
* further filter it; otherwise we'd get a match for any character congruent (mod 16) to a searched character.
*
* The low nibble was used previously to select the byte out of the mask. The high nibble is
* used to select the bit out of the byte. So we shift everything right by 4.
*
* Using svtbl, we can make an array where each element is a different bit. Using the high
* nibble we can get a mask selecting only the bit out of a byte that may have the relevant
* charset char.
*/
svuint8_t char_high_nibble = svlsr_x(svptrue_b8(), chars, 4);
svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_nibble);
/*
* For every lane, only one of the selected bytes may have a value, so we can OR them. We
* then apply the bit_select mask. What is left is the bit in the charset encoding the
* character in char. A non-zero value means the char was in the charset.
*
* The _x suffix only works if we process a full char vector. If we were to use a partial
* vector, then _z and a mask would be required on this svand only. Otherwise, the disabled
* lanes may have arbitrary values
*/
return svand_x(svptrue_b8(), svorr_x(svptrue_b8(), byte_select_low, byte_select_high), bit_select);
}
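/*
 * For reference, here is a minimal scalar sketch of the same membership test.
 * It is illustrative only: in_charset_scalar is a hypothetical helper
 * (assuming <stdint.h>), not part of the library.
 */
static inline int in_charset_scalar(const uint8_t mask_lo_highclear[16],
                                    const uint8_t mask_lo_highset[16],
                                    uint8_t c) {
    /* The MSb selects the covering mask, the low nibble selects the byte and
     * the remaining high-nibble bits select the bit within that byte. */
    const uint8_t *mask = (c & 0x80) ? mask_lo_highset : mask_lo_highclear;
    uint8_t byte = mask[c & 0x0f];        /* byte[C % 16] */
    uint8_t bit = 1u << ((c >> 4) & 0x7); /* bit[C / 16], MSb already consumed */
    return (byte & bit) != 0;
}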
/*
* blockSingleMaskWideSVE takes in a character set (as a mask) and a string and returns,
* for each character of the string, whether or not it is part of the set.
*
* 'shuf_mask_32' is a 256-bit mask where each bit represents whether or not a character
* is in the character set.
*
* The mask is an array of 32 bytes and is encoded this way:
* Let C be a character in the set. The bit describing that character is at byte[C % 32]
* and, within that byte, at bit[C / 32].
* For example, 'a' = 0x61, so the resulting mask will be: 0x00 0x08 0x00 0x00 0x00 ...
*
* The mask is assumed to be 256 bits wide. If the vector is larger, the additional bits
* must be zero.
*/
static really_inline
svuint8_t blockSingleMaskWideSVE(svuint8_t shuf_mask_32, svuint8_t chars) {
const svuint8_t pshub_mask = svdup_u8(0x1f);
const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201));
/*
* svtbl does a table lookup. Each byte in the second argument indexes into the array of bytes
* in shuf_mask_32 and saves the result in the corresponding byte of byte_select.
* We mask the chars so that we are using the low 5 bits of each char as the index.
*/
svuint8_t byte_select = svtbl(shuf_mask_32, svand_x(svptrue_b8(), chars, pshub_mask));
/*
* We have now selected the byte that contains the bit corresponding to each char. We need
* to filter it further, otherwise we'd match any character congruent modulo 32 to a
* character in the set.
*
* The low 5 bits were used previously to select the byte out of the mask. The top 3 bits
* are used to select the bit out of the byte, so we shift everything right by 5.
*
* Using svtbl, we can make an array where each element is a different bit. Using those top
* bits we can get a mask selecting only the bit out of a byte that may have the relevant
* charset char.
*/
svuint8_t char_high_bits = svlsr_x(svptrue_b8(), chars, 5);
svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_bits);
/*
* We apply the bit_select mask onto the selected byte. What is left is the bit in the charset
* encoding the character in char. A non-zero value means the char was in the charset.
*
* The _x suffix only works if we process a full char vector. If we were to use a partial
* vector, then _z and a mask would be required on this svand only. Otherwise, the disabled
* lanes may have arbitrary values
*/
return svand_x(svptrue_b8(), byte_select, bit_select);
}
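/*
 * For reference, a minimal scalar sketch of the wide (32-byte) encoding.
 * Illustrative only: in_charset_wide_scalar is a hypothetical helper
 * (assuming <stdint.h>), not part of the library.
 */
static inline int in_charset_wide_scalar(const uint8_t mask32[32], uint8_t c) {
    uint8_t byte = mask32[c & 0x1f]; /* low 5 bits select the byte: byte[C % 32] */
    uint8_t bit = 1u << (c >> 5);    /* top 3 bits select the bit: bit[C / 32] */
    return (byte & bit) != 0;
}
/* Check against the example above: 'a' = 0x61 gives byte 1 and bit 3, i.e.
 * mask32[1] & 0x08. */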
/* Requires normal truffle compilation. The 256-bit mask is split between the two parameters. */
static really_inline
svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
return blockSingleMaskSVE(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
}
/* Requires wide truffle compilation. The 256-bit mask is fully contained in the first parameter. */
static really_inline
svuint8_t blockSingleMaskWide32(svuint8_t shuf_mask_32, svuint8_t chars) {
return blockSingleMaskWideSVE(shuf_mask_32, chars);
}
#ifdef HAVE_SVE2
/* Requires wide truffle compilation. The 256-bit mask is split between the two parameters
 * if the vector is 128 bits, or fully contained in the first parameter if it is 256 bits
 * or more. */
static really_inline
svuint8_t blockSingleMaskWide(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
return blockSingleMaskWideSVE2(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
}
#endif //HAVE_SVE2
#endif //HAVE_SVE
/* Requires normal truffle compilation. The 256-bit mask is split between the two parameters. */
template <uint16_t S>
static really_inline
const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars) {
chars.print8("chars");
shuf_mask_lo_highclear.print8("shuf_mask_lo_highclear");
shuf_mask_lo_highset.print8("shuf_mask_lo_highset");
SuperVector<S> highconst = SuperVector<S>::dup_u8(0x80);
highconst.print8("highconst");
SuperVector<S> shuf_mask_hi = SuperVector<S>::dup_u64(0x8040201008040201);
shuf_mask_hi.print8("shuf_mask_hi");
// Look up the low nibble in the highclear mask; pshufb zeroes any lane whose
// MSb is set, so only chars 0x00-0x7f select a byte here.
SuperVector<S> shuf1 = shuf_mask_lo_highclear.pshufb(chars);
shuf1.print8("shuf1");
// Flip the MSb so that chars 0x80-0xff index the highset mask instead.
SuperVector<S> t1 = chars ^ highconst;
t1.print8("t1");
SuperVector<S> shuf2 = shuf_mask_lo_highset.pshufb(t1);
shuf2.print8("shuf2");
// Select the unique bit for each char's high nibble. The shift works on
// 64-bit lanes, so clear the stray top bit before the table lookup.
SuperVector<S> t2 = highconst.opandnot(chars.template vshr_64_imm<4>());
t2.print8("t2");
SuperVector<S> shuf3 = shuf_mask_hi.pshufb(t2);
shuf3.print8("shuf3");
// A char is in the set iff its selected byte has the selected bit set.
SuperVector<S> res = (shuf1 | shuf2) & shuf3;
res.print8("(shuf1 | shuf2) & shuf3");
return !res.eq(SuperVector<S>::Zeroes());
}
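/*
 * A hedged usage sketch: callers turn the lane mask returned by
 * blockSingleMask into a match offset. first_non_zero_match is the helper the
 * block scanners in this diff already use; the wrapper itself (and its name)
 * is illustrative, not library code.
 */
template <uint16_t S>
static really_inline
const u8 *truffleBlockSketch(SuperVector<S> shuf_mask_lo_highclear,
                             SuperVector<S> shuf_mask_lo_highset,
                             SuperVector<S> chars, const u8 *buf, u16 len) {
    SuperVector<S> res = blockSingleMask(shuf_mask_lo_highclear,
                                         shuf_mask_lo_highset, chars);
    // Position of the first lane flagged as being in the charset, if any.
    return first_non_zero_match<S>(buf, res, len);
}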


@@ -1,129 +0,0 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Vermicelli: single-byte and double-byte acceleration.
*/
template <uint16_t S>
static really_inline
const u8 *vermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *vermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = !chars.eq(casemask & data);
return first_zero_match_inverted<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return last_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, u16 const len) {
data.print8("data");
chars.print8("chars");
casemask.print8("casemask");
SuperVector<S> mask = !chars.eq(casemask & data);
mask.print8("mask");
return last_zero_match_inverted<S>(buf, mask, len);
}
template <uint16_t S, bool check_partial>
static really_inline
const u8 *vermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
SuperVector<S> mask2 = chars2.eq(v);
SuperVector<S> mask = mask1 & (mask2 >> 1);
DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) {
mask = mask | ((SuperVector<S>::Ones() >> (S-1)) << (S-1));
}
return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S, bool check_partial>
static really_inline
const u8 *rvermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
SuperVector<S> mask2 = chars2.eq(v);
SuperVector<S> mask = (mask1 << 1) & mask2;
DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) {
mask = mask | (SuperVector<S>::Ones() >> (S-1));
}
return last_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S, bool check_partial>
static really_inline
const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2,
SuperVector<S> const mask1, SuperVector<S> const mask2,
u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) {
SuperVector<S> v1 = chars1.eq(data & mask1);
SuperVector<S> v2 = chars2.eq(data & mask2);
SuperVector<S> mask = v1 & (v2 >> 1);
DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (check_partial && ((buf[0] & m2) == c2) && ((buf[-1] & m1) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) {
mask = mask | ((SuperVector<S>::Ones() >> (S-1)) << (S-1));
}
return first_non_zero_match<S>(buf, mask, len);
}
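/*
 * A hedged usage sketch for the block helpers above: scan one vector's worth
 * of input for 'x'/'X' case-insensitively. SuperVector<S>::loadu is assumed
 * from the surrounding codebase; the wrapper and its name are illustrative
 * only.
 */
template <uint16_t S>
static really_inline
const u8 *scanBlockCaselessSketch(const u8 *buf, u16 len) {
    SuperVector<S> data = SuperVector<S>::loadu(buf);
    SuperVector<S> casemask = SuperVector<S>::dup_u8(0xdf); // clears the ASCII case bit
    SuperVector<S> chars = SuperVector<S>::dup_u8('X');     // case-folded target
    return vermicelliBlock<S>(data, chars, casemask, buf, len);
}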


@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2015-2016, Intel Corporation
- * Copyright (c) 2021, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -40,16 +39,12 @@
 #include "repeat.h"
 #include "shufti.h"
 #include "truffle.h"
-#include "vermicelli.hpp"
+#include "vermicelli.h"
 #include "util/bitutils.h"
 #include "util/multibit.h"
 #include "util/partial_store.h"
 #include "ue2common.h"
-
-#ifdef HAVE_SVE2
-#include "castle_sve.h"
-#endif
 
 static really_inline
 const struct SubCastle *getSubCastle(const struct Castle *c, u32 num) {
     assert(num < c->numRepeats);
@@ -94,8 +89,8 @@ char subCastleReportCurrent(const struct Castle *c, struct mq *q,
     const struct SubCastle *sub = getSubCastle(c, subIdx);
     const struct RepeatInfo *info = getRepeatInfo(sub);
-    const union RepeatControl *rctrl = getControl(q->state, sub);
-    const char *rstate = (char *)q->streamState + sub->streamStateOffset +
+    union RepeatControl *rctrl = getControl(q->state, sub);
+    char *rstate = (char *)q->streamState + sub->streamStateOffset +
                          info->packedCtrlSize;
     enum RepeatMatch match =
         repeatHasMatch(info, rctrl, rstate, offset);
@@ -118,10 +113,10 @@ int castleReportCurrent(const struct Castle *c, struct mq *q) {
     if (c->exclusive) {
         u8 *active = (u8 *)q->streamState;
-        const u8 *groups = active + c->groupIterOffset;
+        u8 *groups = active + c->groupIterOffset;
         for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
              i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            const u8 *cur = active + i * c->activeIdxSize;
+            u8 *cur = active + i * c->activeIdxSize;
             const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
             DEBUG_PRINTF("subcastle %u\n", activeIdx);
             if (subCastleReportCurrent(c, q,
@@ -156,8 +151,8 @@ char subCastleInAccept(const struct Castle *c, struct mq *q,
     }
     const struct RepeatInfo *info = getRepeatInfo(sub);
-    const union RepeatControl *rctrl = getControl(q->state, sub);
-    const char *rstate = (char *)q->streamState + sub->streamStateOffset +
+    union RepeatControl *rctrl = getControl(q->state, sub);
+    char *rstate = (char *)q->streamState + sub->streamStateOffset +
                          info->packedCtrlSize;
     enum RepeatMatch match =
         repeatHasMatch(info, rctrl, rstate, offset);
@@ -180,10 +175,10 @@ char castleInAccept(const struct Castle *c, struct mq *q,
     if (c->exclusive) {
         u8 *active = (u8 *)q->streamState;
-        const u8 *groups = active + c->groupIterOffset;
+        u8 *groups = active + c->groupIterOffset;
         for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
              i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            const u8 *cur = active + i * c->activeIdxSize;
+            u8 *cur = active + i * c->activeIdxSize;
             const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
             DEBUG_PRINTF("subcastle %u\n", activeIdx);
             if (subCastleInAccept(c, q, report, offset, activeIdx)) {
@@ -213,8 +208,8 @@ void subCastleDeactivateStaleSubs(const struct Castle *c, const u64a offset,
     const struct SubCastle *sub = getSubCastle(c, subIdx);
     const struct RepeatInfo *info = getRepeatInfo(sub);
-    const union RepeatControl *rctrl = getControl(full_state, sub);
-    const char *rstate = (char *)stream_state + sub->streamStateOffset +
+    union RepeatControl *rctrl = getControl(full_state, sub);
+    char *rstate = (char *)stream_state + sub->streamStateOffset +
                          info->packedCtrlSize;
     if (repeatHasMatch(info, rctrl, rstate, offset) == REPEAT_STALE) {
@@ -242,10 +237,10 @@ void castleDeactivateStaleSubs(const struct Castle *c, const u64a offset,
     if (c->exclusive) {
         u8 *active = (u8 *)stream_state;
-        const u8 *groups = active + c->groupIterOffset;
+        u8 *groups = active + c->groupIterOffset;
         for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
              i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            const u8 *cur = active + i * c->activeIdxSize;
+            u8 *cur = active + i * c->activeIdxSize;
             const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
             DEBUG_PRINTF("subcastle %u\n", activeIdx);
             subCastleDeactivateStaleSubs(c, offset, full_state,
@@ -329,8 +324,8 @@ void subCastleFindMatch(const struct Castle *c, const u64a begin,
                         size_t *mloc, char *found, const u32 subIdx) {
     const struct SubCastle *sub = getSubCastle(c, subIdx);
     const struct RepeatInfo *info = getRepeatInfo(sub);
-    const union RepeatControl *rctrl = getControl(full_state, sub);
-    const char *rstate = (char *)stream_state + sub->streamStateOffset +
+    union RepeatControl *rctrl = getControl(full_state, sub);
+    char *rstate = (char *)stream_state + sub->streamStateOffset +
                          info->packedCtrlSize;
     u64a match = repeatNextMatch(info, rctrl, rstate, begin);
@@ -374,10 +369,10 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,
     if (c->exclusive) {
         u8 *active = (u8 *)stream_state;
-        const u8 *groups = active + c->groupIterOffset;
+        u8 *groups = active + c->groupIterOffset;
         for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
              i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            const u8 *cur = active + i * c->activeIdxSize;
+            u8 *cur = active + i * c->activeIdxSize;
             const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
             DEBUG_PRINTF("subcastle %u\n", activeIdx);
             subCastleFindMatch(c, begin, end, full_state, stream_state, mloc,
@@ -386,7 +381,7 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,
     }
     if (c->exclusive != PURE_EXCLUSIVE) {
-        const u8 *active = (u8 *)stream_state + c->activeOffset;
+        u8 *active = (u8 *)stream_state + c->activeOffset;
         for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
              i != MMB_INVALID;
             i = mmbit_iterate(active, c->numRepeats, i)) {
@@ -400,8 +395,8 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,
 }
 
 static really_inline
-u64a subCastleNextMatch(const struct Castle *c, const void *full_state,
-                        const void *stream_state, const u64a loc,
+u64a subCastleNextMatch(const struct Castle *c, void *full_state,
+                        void *stream_state, const u64a loc,
                         const u32 subIdx) {
     DEBUG_PRINTF("subcastle %u\n", subIdx);
     const struct SubCastle *sub = getSubCastle(c, subIdx);
@@ -489,14 +484,15 @@ char castleMatchLoop(const struct Castle *c, const u64a begin, const u64a end,
     // full_state (scratch).
     u64a offset = end; // min offset of next match
+    u32 activeIdx = 0;
     mmbit_clear(matching, c->numRepeats);
     if (c->exclusive) {
         u8 *active = (u8 *)stream_state;
         u8 *groups = active + c->groupIterOffset;
         for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
              i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            const u8 *cur = active + i * c->activeIdxSize;
-            u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
+            u8 *cur = active + i * c->activeIdxSize;
+            activeIdx = partial_load_u32(cur, c->activeIdxSize);
             u64a match = subCastleNextMatch(c, full_state, stream_state,
                                             loc, activeIdx);
             set_matching(c, match, groups, matching, c->numGroups, i,
@@ -608,12 +604,6 @@ char castleScan(const struct Castle *c, const u8 *buf, const size_t begin,
         return castleScanVerm(c, buf, begin, end, loc);
     case CASTLE_NVERM:
         return castleScanNVerm(c, buf, begin, end, loc);
-#ifdef HAVE_SVE2
-    case CASTLE_VERM16:
-        return castleScanVerm16(c, buf, begin, end, loc);
-    case CASTLE_NVERM16:
-        return castleScanNVerm16(c, buf, begin, end, loc);
-#endif // HAVE_SVE2
     case CASTLE_SHUFTI:
         return castleScanShufti(c, buf, begin, end, loc);
     case CASTLE_TRUFFLE:
@@ -709,12 +699,6 @@ char castleRevScan(const struct Castle *c, const u8 *buf, const size_t begin,
         return castleRevScanVerm(c, buf, begin, end, loc);
     case CASTLE_NVERM:
         return castleRevScanNVerm(c, buf, begin, end, loc);
-#ifdef HAVE_SVE2
-    case CASTLE_VERM16:
-        return castleRevScanVerm16(c, buf, begin, end, loc);
-    case CASTLE_NVERM16:
-        return castleRevScanNVerm16(c, buf, begin, end, loc);
-#endif // HAVE_SVE2
     case CASTLE_SHUFTI:
         return castleRevScanShufti(c, buf, begin, end, loc);
     case CASTLE_TRUFFLE:
@@ -796,7 +780,7 @@ char nfaExecCastle_Q_i(const struct NFA *n, struct mq *q, s64a end,
     char found = 0;
     if (c->exclusive) {
-        const u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
+        u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
         found = mmbit_any(groups, c->numGroups);
     }
@@ -863,7 +847,7 @@
     }
     if (c->exclusive) {
-        const u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
+        u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
         if (mmbit_any_precise(groups, c->numGroups)) {
             return 1;
         }
@@ -883,7 +867,7 @@ char nfaExecCastle_Q2(const struct NFA *n, struct mq *q, s64a end) {
 }
 
 static
-s64a castleLastKillLoc(const struct Castle *c, const struct mq *q) {
+s64a castleLastKillLoc(const struct Castle *c, struct mq *q) {
     assert(q_cur_type(q) == MQE_START);
     assert(q_last_type(q) == MQE_END);
     s64a sp = q_cur_loc(q);
@@ -906,6 +890,7 @@ s64a castleLastKillLoc(const struct Castle *c, const struct mq *q) {
         if (castleRevScan(c, q->history, sp + hlen, ep + hlen, &loc)) {
             return (s64a)loc - hlen;
         }
+        ep = 0;
     }
 
     return sp - 1; /* the repeats are never killed */
@@ -957,7 +942,7 @@ char nfaExecCastle_QR(const struct NFA *n, struct mq *q, ReportID report) {
     char found = 0;
     if (c->exclusive) {
-        const u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
+        u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
         found = mmbit_any_precise(groups, c->numGroups);
     }
@@ -1005,10 +990,10 @@ char nfaExecCastle_inAnyAccept(const struct NFA *n, struct mq *q) {
     if (c->exclusive) {
         u8 *active = (u8 *)q->streamState;
-        const u8 *groups = active + c->groupIterOffset;
+        u8 *groups = active + c->groupIterOffset;
         for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
              i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            const u8 *cur = active + i * c->activeIdxSize;
+            u8 *cur = active + i * c->activeIdxSize;
             const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
             DEBUG_PRINTF("subcastle %u\n", activeIdx);
             const struct SubCastle *sub = getSubCastle(c, activeIdx);
@@ -1077,7 +1062,7 @@ void subCastleQueueCompressState(const struct Castle *c, const u32 subIdx,
                                  const struct mq *q, const u64a offset) {
     const struct SubCastle *sub = getSubCastle(c, subIdx);
     const struct RepeatInfo *info = getRepeatInfo(sub);
-    const union RepeatControl *rctrl = getControl(q->state, sub);
+    union RepeatControl *rctrl = getControl(q->state, sub);
     char *packed = (char *)q->streamState + sub->streamStateOffset;
     DEBUG_PRINTF("sub %u next match %llu\n", subIdx,
                  repeatNextMatch(info, rctrl,
@@ -1098,10 +1083,10 @@ char nfaExecCastle_queueCompressState(const struct NFA *n, const struct mq *q,
     DEBUG_PRINTF("offset=%llu\n", offset);
     if (c->exclusive) {
         u8 *active = (u8 *)q->streamState;
-        const u8 *groups = active + c->groupIterOffset;
+        u8 *groups = active + c->groupIterOffset;
         for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
              i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            const u8 *cur = active + i * c->activeIdxSize;
+            u8 *cur = active + i * c->activeIdxSize;
             const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
             DEBUG_PRINTF("packing state for sub %u\n", activeIdx);
             subCastleQueueCompressState(c, activeIdx, q, offset);

Some files were not shown because too many files have changed in this diff.