mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-09-30 03:34:25 +03:00
Initial commit of Hyperscan
This commit is contained in:
24
examples/CMakeLists.txt
Normal file
24
examples/CMakeLists.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
# Example programs demonstrating the Hyperscan API.
#
# simplegrep only needs the Hyperscan library. pcapscan and patbench also link
# against libpcap, so they are built only when find_library() locates it.
find_library(PCAP_LIBRARY pcap)
if (NOT PCAP_LIBRARY)
    message(STATUS "Could not find libpcap - some examples will not be built")
endif()

add_executable(simplegrep simplegrep.c)
set_source_files_properties(simplegrep.c PROPERTIES COMPILE_FLAGS
    "-Wall -Wno-unused-parameter")
target_link_libraries(simplegrep hs)

if (PCAP_LIBRARY)
    # Link against the library file that find_library() actually found rather
    # than the bare name "pcap", so a libpcap outside the linker's default
    # search path is still linked correctly.
    foreach (example pcapscan patbench)
        add_executable(${example} ${example}.cc)
        set_source_files_properties(${example}.cc PROPERTIES COMPILE_FLAGS
            "-Wall -Wno-unused-parameter")
        target_link_libraries(${example} hs ${PCAP_LIBRARY})
    endforeach()
endif()
|
155
examples/README.md
Normal file
155
examples/README.md
Normal file
@@ -0,0 +1,155 @@
|
||||
Hyperscan Example Code
|
||||
======================
|
||||
|
||||
Copyright (C) 2015 Intel Corporation. All rights reserved.
|
||||
|
||||
The files in this directory contain example code demonstrating the use of the
|
||||
Hyperscan regular expression matching library. The examples have been
|
||||
constructed to be useful utility programs, but they have been simplified
|
||||
somewhat, so generally contain "shortcuts" that one would not take if building
|
||||
a "real" system.
|
||||
|
||||
The examples each contain a short description in a comment at the top of the
|
||||
file, including build instructions.
|
||||
|
||||
---
|
||||
|
||||
|
||||
Example 1: simplegrep
|
||||
---------------------
|
||||
|
||||
The first example program (`simplegrep.c`) is modelled on the ubiquitous grep
|
||||
tool to search a file for a single regular expression. 'simplegrep' does the
|
||||
same, but eschews a lot of grep's complexity: it is unable to read data from
|
||||
`stdin`, and doesn't support grep's plethora of command-line arguments.
|
||||
|
||||
This code is intended to be simple portable C99.
|
||||
|
||||
simplegrep demonstrates the following Hyperscan concepts:
|
||||
|
||||
- Single pattern compilation: As simplegrep can scan for one pattern only, it
|
||||
uses the `hs_compile` function instead of the multi-pattern variant:
|
||||
`hs_compile_multi`.
|
||||
|
||||
- Block mode pattern-matching: simplegrep will search a single data buffer
|
||||
for the given pattern, so it has no need to set up and tear down streams.
|
||||
(See the next section for a streaming mode example)
|
||||
|
||||
- Scratch space allocation and use: Hyperscan requires a small amount of
|
||||
temporary memory that is used in the `hs_scan` call. The caller needs to
|
||||
guarantee that only one instance of `hs_scan` is using the scratch space at a
|
||||
time, but there is no requirement that the same scratch area be used on
|
||||
consecutive calls to `hs_scan`. Given that it is expensive to allocate the
|
||||
scratch space, one would typically allocate all necessary scratch space at
|
||||
system startup and reuse it throughout execution of the program.
|
||||
|
||||
|
||||
Example 2: pcapscan
|
||||
-------------------
|
||||
|
||||
The second example program (`pcapscan.cc`) is a very simple packet scanning
|
||||
benchmark. It scans a given PCAP file full of network traffic against a group
|
||||
of regular expressions and returns some coarse performance measurements. This
|
||||
example provides a quick way to examine the performance achievable on a
|
||||
particular combination of platform, pattern set and input data.
|
||||
|
||||
In block mode, pcapscan scans each packet individually against a Hyperscan
|
||||
database. In streaming mode, pcapscan assigns packets to flows using a
|
||||
rudimentary connection tracker, then scans the packets in each flow with
|
||||
Hyperscan's streaming mode interface. This demonstrates the use of streaming
|
||||
mode operation to detect matches that straddle packet boundaries.
|
||||
|
||||
**Note**: the flow assignment implemented here is intended as a simple demo; it
|
||||
merely ensures that packets with the same 5-tuple are written to the same
|
||||
stream in the order in which they appear in the PCAP file. No packet
|
||||
re-ordering or connection state tracking (as you would expect to find in a real
|
||||
network scanning application) is done.
|
||||
|
||||
pcapscan introduces the following Hyperscan concepts:
|
||||
|
||||
- Multi-pattern compilation: Unlike simplegrep, pcapscan requires a file of
|
||||
expressions as input instead of a single pattern. pcapscan will read this
|
||||
file in, one pattern per line, and use it as input to the `hs_compile_multi`
|
||||
function. This function generates a pattern database that will match all the
|
||||
input patterns in parallel.
|
||||
|
||||
- Streamed pattern-matching: pcapscan uses the `hs_scan_stream` function
|
||||
(instead of the block-mode `hs_scan` call) to allow it to identify matches
|
||||
that occur in a stream of data, even if they straddle the boundaries between blocks.
|
||||
Streaming mode operation has a number of unique properties:
|
||||
|
||||
- Stream state that persists for the lifetime of the stream must be allocated
|
||||
with the `hs_open_stream` function before scanning can take place.
|
||||
Similarly, it must be freed with `hs_close_stream` after it is no longer
|
||||
needed. Each stream being scanned concurrently requires its own stream
|
||||
state.
|
||||
|
||||
- In streaming mode, a non-zero return from the user-specified event-handler
|
||||
function has consequences for the rest of that stream's lifetime: when a
|
||||
non-zero return occurs, it signals that no more of the stream should be
|
||||
scanned. Consequently if the user makes a subsequent call to
|
||||
`hs_scan_stream` on a stream whose processing was terminated in this way,
|
||||
hs_scan_stream will return `HS_SCAN_TERMINATED`. This case has not been
|
||||
demonstrated in pcapscan, as its callback always returns 0.
|
||||
|
||||
- Match handling during stream shutdown: As matches may occur when the
|
||||
`hs_close_stream` function is called, it too must be provided with scratch
|
||||
space in order to perform this match processing. Similarly, the user must
|
||||
be prepared to be issued match event callbacks during the `hs_close_stream`
|
||||
call. For this reason, we advise that stream shutdown be an integral part
|
||||
of the system design.
|
||||
|
||||
|
||||
Example 3: patbench
|
||||
-------------------
|
||||
|
||||
This program allows users to detect which signatures may be the most expensive
|
||||
in a set of patterns. It is designed for use with small to medium pattern set
|
||||
sizes (e.g. 5-500). If used with very large pattern sets it may take a very
|
||||
long time - the number of recompiles done is `g * O(lg2(n))` where `g` is the
|
||||
number of generations and `n` is the number of patterns (assuming that `n >>
|
||||
g`).
|
||||
|
||||
This utility will return a cumulative series of removed patterns. The first
|
||||
generation will find and remove a single pattern. The second generation will
|
||||
begin with the first pattern removed and find another pattern to remove, etc.
|
||||
So if we have 100 patterns and 15 generations, the final generation's score
|
||||
will be a run over 85 patterns.
|
||||
|
||||
This utility is probabilistic. It is possible that the pattern removed in a
|
||||
generation is not a particularly expensive pattern. To reduce noise in the
|
||||
results use 'taskset' and set the number of repeats to a level that still
|
||||
completes in reasonable time (this will reduce the effect of random measurement
|
||||
noise).
|
||||
|
||||
The criterion for performance can be altered by use of the `-C<x>` flag where
|
||||
`<x>` can be `t,r,s,c,b`, selecting pattern matching throughput, scratch size,
|
||||
stream state size (only available in streaming mode), compile time and bytecode
|
||||
size respectively.
|
||||
|
||||
This utility will also not produce good results if all the patterns are roughly
|
||||
equally expensive.
|
||||
|
||||
### Factor Group Size:
|
||||
|
||||
If there are multiple expensive patterns that are very similar on the
|
||||
left-hand-side or identical, this utility will typically not find these groups
|
||||
unless the `-F` flag is used to search for a group size that is equal to or
|
||||
larger than the size of the group of similar patterns.
|
||||
|
||||
Otherwise, removing a portion of the similar patterns will have no or almost no
|
||||
effect, and the search procedure used relies on the ability to remove all of
|
||||
the similar patterns in at least one search case, something which will only
|
||||
happen if the `factor_group_size` is large enough.
|
||||
|
||||
This alters the operation of the tool so that instead of trying to find the
|
||||
single pattern whose removal has the most effect by binary search (the default
|
||||
with `factor_group_size == 1`), we attempt to find the N patterns whose removal
|
||||
has the most effect by searching over `N + 1` evenly sized groups, removing
|
||||
only `1/(N + 1)` of the search signatures per iteration.
|
||||
|
||||
Note that the number of recompiles done greatly increases with increased factor
|
||||
group size. For example, with `factor_group_size = 1`, we do `g * 2 * lg2(n)`
|
||||
recompiles, while with `factor_group_size = 4`, we do `g * 4 * log(5/4)(n)`.
|
||||
Informally the number of generations we require goes up as we eliminate a
|
||||
smaller number of signatures and we have to do more work per generation.
|
892
examples/patbench.cc
Normal file
892
examples/patbench.cc
Normal file
@@ -0,0 +1,892 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Hyperscan pattern benchmarker.
|
||||
*
|
||||
* This program allows users to detect which signatures may be the most
|
||||
* expensive in a set of patterns. It is designed for use with small to medium
|
||||
* pattern set sizes (e.g. 5-500). If used with very large pattern sets it may
|
||||
* take a very long time - the number of recompiles done is g * O(lg2(n)) where
|
||||
* g is the number of generations and n is the number of patterns (assuming
|
||||
* that n >> g).
|
||||
*
|
||||
* This utility will return a cumulative series of removed patterns. The first
|
||||
* generation will find and remove a single pattern. The second generation will
|
||||
* begin with the first pattern removed and find another pattern to remove,
|
||||
* etc. So if we have 100 patterns and 15 generations, the final generation's
|
||||
* score will be a run over 85 patterns.
|
||||
*
|
||||
* This utility is probabilistic. It is possible that the pattern removed in a
|
||||
* generation is not a particularly expensive pattern. To reduce noise in the
|
||||
* results use 'taskset' and set the number of repeats to a level that still
|
||||
* completes in reasonable time (this will reduce the effect of random
|
||||
* measurement noise).
|
||||
*
|
||||
* The criterion for performance can be altered by use of the -C<x> flag where
|
||||
* <x> can be t,r,s,c,b, selecting pattern matching throughput, scratch size,
|
||||
* stream state size (only available in streaming mode), compile time and
|
||||
* bytecode size respectively.
|
||||
*
|
||||
* This utility will also not produce good results if all the patterns are
|
||||
* roughly equally expensive.
|
||||
*
|
||||
* Factor Group Size:
|
||||
*
|
||||
* If there are multiple expensive patterns that are very similar on the
|
||||
* left-hand-side or identical, this utility will typically not find these
|
||||
* groups unless the -F flag is used to search for a group size that is equal
|
||||
* to or larger than the size of the group of similar patterns.
|
||||
*
|
||||
* Otherwise, removing a portion of the similar patterns will have no or almost
|
||||
* no effect, and the search procedure used relies on the ability to remove all
|
||||
* of the similar patterns in at least one search case, something which will
|
||||
* only happen if the factor_group_size is large enough.
|
||||
*
|
||||
* This alters the operation of our tool so that instead of trying to find the
|
||||
* single pattern whose removal has the most effect by binary search (the
|
||||
* default with factor_group_size == 1), we attempt to find the N patterns
|
||||
* whose removal has the most effect by searching over N+1 evenly sized groups,
|
||||
* removing only 1/(N+1) of the search signatures per iteration.
|
||||
*
|
||||
* Note that the number of recompiles done greatly increases with increased
|
||||
* factor group size. For example, with factor_group_size = 1, we do g * 2 *
|
||||
* lg2(n) recompiles, while with factor_group_size = 4, we do g * 4 *
|
||||
* log(5/4)(n). Informally the number of generations we require goes up as we
|
||||
* eliminate a smaller number of signatures and we have to do more work per
|
||||
* generation.
|
||||
*
|
||||
*
|
||||
* Build instructions:
|
||||
*
|
||||
* g++ -o patbench patbench.cc $(pkg-config --cflags --libs libhs) -lpcap
|
||||
*
|
||||
* Usage:
|
||||
*
|
||||
* ./patbench [ -n repeats] [ -G generations] [ -C criterion ]
|
||||
* [ -F factor_group_size ] [ -N | -S ] <pattern file> <pcap file>
|
||||
*
|
||||
* -n repeats sets the number of times the PCAP is repeatedly scanned
|
||||
* with the pattern
|
||||
* -G generations sets the number of generations that the algorithm is
|
||||
* run for
|
||||
* -N sets non-streaming mode, -S sets streaming mode (default)
|
||||
* -F sets the factor group size (must be >0); this allows the detection
|
||||
* of multiple interacting factors
|
||||
*
|
||||
* -C sets the "criterion", which can be either:
|
||||
* t throughput (the default) - this requires a pcap file
|
||||
* r scratch size
|
||||
* s stream state size
|
||||
* c compile time
|
||||
* b bytecode size
|
||||
*
|
||||
* We recommend the use of a utility like 'taskset' on multiprocessor hosts to
|
||||
* lock execution to a single processor: this will remove processor migration
|
||||
* by the scheduler as a source of noise in the results.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <chrono>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
// We use the BSD primitives throughout as they exist on both BSD and Linux.
|
||||
#define __FAVOR_BSD
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/in_systm.h>
|
||||
#include <netinet/ip.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <netinet/udp.h>
|
||||
#include <netinet/ip_icmp.h>
|
||||
#include <net/ethernet.h>
|
||||
#include <arpa/inet.h>
|
||||
|
||||
#include <pcap.h>
|
||||
|
||||
#include <hs.h>
|
||||
|
||||
using std::cerr;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
using std::ifstream;
|
||||
using std::string;
|
||||
using std::unordered_map;
|
||||
using std::vector;
|
||||
using std::set;
|
||||
using std::min;
|
||||
using std::max;
|
||||
using std::copy;
|
||||
|
||||
// The quantity patbench measures for each candidate pattern set. Selected on
// the command line with the -C flag; throughput is the default.
enum Criterion {
    CRITERION_THROUGHPUT,    // -C t: scanning throughput
    CRITERION_BYTECODE_SIZE, // -C b: size of the compiled database
    CRITERION_COMPILE_TIME,  // -C c: time taken to compile the database
    CRITERION_STREAM_STATE,  // -C s: per-stream state size
    CRITERION_SCRATCH_SIZE   // -C r: scratch space size
};
|
||||
|
||||
// Reports whether a larger measured value is the better outcome for the given
// criterion. Only throughput answers true; every other criterion answers
// false.
static bool higher_is_better(Criterion c) {
    switch (c) {
    case CRITERION_THROUGHPUT:
        return true;
    default:
        return false;
    }
}
|
||||
|
||||
// Writes a measured value to stdout followed by the unit that belongs to the
// criterion: Megabits/s for throughput, seconds for compile time, and bytes
// (value truncated to an integer) for everything else. No newline is emitted.
static void print_criterion(Criterion c, double val) {
    if (c == CRITERION_THROUGHPUT || c == CRITERION_COMPILE_TIME) {
        const char *unit =
            (c == CRITERION_THROUGHPUT) ? " Megabits/s" : " seconds";
        std::cout << std::fixed << std::setprecision(3) << val << unit;
        return;
    }

    // All remaining criteria are sizes.
    std::cout << static_cast<size_t>(val) << " bytes";
}
|
||||
|
||||
// Identifies a flow in the pcap input by protocol, source address/port and
// destination address/port, all taken from the packet's IP and transport
// headers. Values are stored exactly as read from the headers; no byte-order
// conversion is applied.
struct FiveTuple {
    unsigned int protocol;
    unsigned int srcAddr;
    unsigned int srcPort;
    unsigned int dstAddr;
    unsigned int dstPort;

    // Build the key from the IP header of a TCP or UDP packet. The ports are
    // read through the UDP header layout for both protocols.
    FiveTuple(const struct ip *iphdr)
        : protocol(iphdr->ip_p), srcAddr(iphdr->ip_src.s_addr), srcPort(0),
          dstAddr(iphdr->ip_dst.s_addr), dstPort(0) {
        // ip_hl counts 32-bit words, so the transport header starts
        // ip_hl * 4 bytes into the IP header.
        const char *ipStart = (const char *)iphdr;
        const struct udphdr *transport =
            (const struct udphdr *)(ipStart + (iphdr->ip_hl * 4));
        srcPort = transport->uh_sport;
        dstPort = transport->uh_dport;
    }

    // Two keys are equal only when all five fields match.
    bool operator==(const FiveTuple &a) const {
        if (protocol != a.protocol) {
            return false;
        }
        if (srcAddr != a.srcAddr || srcPort != a.srcPort) {
            return false;
        }
        return dstAddr == a.dstAddr && dstPort == a.dstPort;
    }
};
|
||||
|
||||
// A *very* simple hash function, used when we create an unordered_map of
|
||||
// FiveTuple objects.
|
||||
struct FiveTupleHash {
|
||||
size_t operator()(const FiveTuple &x) const {
|
||||
return x.srcAddr ^ x.dstAddr ^ x.protocol ^ x.srcPort ^ x.dstPort;
|
||||
}
|
||||
};
|
||||
|
||||
// Helper function. See end of file.
|
||||
static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
||||
unsigned int *length);
|
||||
|
||||
// Match callback handed to the Hyperscan scan and close-stream calls. The
// context pointer must refer to a size_t match counter, which is incremented
// once per call. Always returns 0 so that scanning continues.
static
int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
            unsigned int flags, void *ctx) {
    size_t &counter = *static_cast<size_t *>(ctx);
    ++counter;
    return 0;
}
|
||||
|
||||
// Minimal stopwatch: call start(), do the work, call stop(), then read the
// elapsed time with seconds().
//
// Uses std::chrono::steady_clock because it is monotonic; the wall clock
// (system_clock) may be adjusted while a measurement is running, which would
// corrupt or even negate the measured interval.
class Clock {
public:
    // Record the beginning of the interval.
    void start() {
        time_start = std::chrono::steady_clock::now();
    }

    // Record the end of the interval.
    void stop() {
        time_end = std::chrono::steady_clock::now();
    }

    // Elapsed time between the last start() and the last stop(), in seconds.
    // Returns 0 if neither has been called yet.
    double seconds() const {
        const std::chrono::duration<double> delta = time_end - time_start;
        return delta.count();
    }

private:
    std::chrono::steady_clock::time_point time_start, time_end;
};
|
||||
|
||||
// Class wrapping all state associated with the benchmark
|
||||
class Benchmark {
|
||||
private:
|
||||
// Packet data to be scanned
|
||||
vector<string> packets;
|
||||
|
||||
// Stream ID for each packet
|
||||
vector<size_t> stream_ids;
|
||||
|
||||
// Map used to construct stream_ids
|
||||
unordered_map<FiveTuple, size_t, FiveTupleHash> stream_map;
|
||||
|
||||
// Hyperscan compiled database
|
||||
hs_database_t *db = nullptr;
|
||||
|
||||
// Hyperscan temporary scratch space
|
||||
hs_scratch_t *scratch = nullptr;
|
||||
|
||||
// Vector of Hyperscan stream state
|
||||
vector<hs_stream_t *> streams;
|
||||
|
||||
// Count of matches found while scanning
|
||||
size_t matchCount = 0;
|
||||
public:
|
||||
~Benchmark() {
|
||||
hs_free_scratch(scratch);
|
||||
hs_free_database(db);
|
||||
}
|
||||
|
||||
// Initialisation; after this call, Benchmark owns the database and will
|
||||
// ensure it is freed.
|
||||
void setDatabase(hs_database_t *hs_db) {
|
||||
hs_free_database(db); // Free previous database.
|
||||
db = hs_db;
|
||||
// (Re)allocate scratch to ensure that it is large enough to handle the
|
||||
// database.
|
||||
hs_error_t err = hs_alloc_scratch(db, &scratch);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: could not allocate scratch space. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
const hs_database_t *getDatabase() const {
|
||||
return db;
|
||||
}
|
||||
|
||||
size_t getScratchSize() const {
|
||||
size_t scratch_size;
|
||||
hs_error_t err = hs_scratch_size(scratch, &scratch_size);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: could not query scratch space size. Exiting."
|
||||
<< endl;
|
||||
exit(-1);
|
||||
}
|
||||
return scratch_size;
|
||||
}
|
||||
|
||||
// Read a set of streams from a pcap file
|
||||
bool readStreams(const char *pcapFile) {
|
||||
// Open PCAP file for input
|
||||
char errbuf[PCAP_ERRBUF_SIZE];
|
||||
pcap_t *pcapHandle = pcap_open_offline(pcapFile, errbuf);
|
||||
if (pcapHandle == nullptr) {
|
||||
cerr << "ERROR: Unable to open pcap file \"" << pcapFile
|
||||
<< "\": " << errbuf << endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
struct pcap_pkthdr pktHeader;
|
||||
const unsigned char *pktData;
|
||||
while ((pktData = pcap_next(pcapHandle, &pktHeader)) != nullptr) {
|
||||
unsigned int offset = 0, length = 0;
|
||||
if (!payloadOffset(pktData, &offset, &length)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Valid TCP or UDP packet
|
||||
const struct ip *iphdr = (const struct ip *)(pktData
|
||||
+ sizeof(struct ether_header));
|
||||
const char *payload = (const char *)pktData + offset;
|
||||
|
||||
size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
|
||||
stream_map.size())).first->second;
|
||||
|
||||
packets.push_back(string(payload, length));
|
||||
stream_ids.push_back(id);
|
||||
}
|
||||
pcap_close(pcapHandle);
|
||||
|
||||
return !packets.empty();
|
||||
}
|
||||
|
||||
// Return the number of bytes scanned
|
||||
size_t bytes() const {
|
||||
size_t sum = 0;
|
||||
for (const auto &packet : packets) {
|
||||
sum += packet.size();
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Return the number of matches found.
|
||||
size_t matches() const {
|
||||
return matchCount;
|
||||
}
|
||||
|
||||
// Clear the number of matches found.
|
||||
void clearMatches() {
|
||||
matchCount = 0;
|
||||
}
|
||||
|
||||
// Open a Hyperscan stream for each stream in stream_ids
|
||||
void openStreams() {
|
||||
streams.resize(stream_map.size());
|
||||
for (auto &stream : streams) {
|
||||
hs_error_t err = hs_open_stream(db, 0, &stream);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to open stream. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Close all open Hyperscan streams (potentially generating any
|
||||
// end-anchored matches)
|
||||
void closeStreams() {
|
||||
for (auto &stream : streams) {
|
||||
hs_error_t err =
|
||||
hs_close_stream(stream, scratch, onMatch, &matchCount);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to close stream. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Scan each packet (in the ordering given in the PCAP file) through
|
||||
// Hyperscan using the streaming interface.
|
||||
void scanStreams() {
|
||||
for (size_t i = 0; i != packets.size(); ++i) {
|
||||
const std::string &pkt = packets[i];
|
||||
hs_error_t err = hs_scan_stream(streams[stream_ids[i]],
|
||||
pkt.c_str(), pkt.length(), 0,
|
||||
scratch, onMatch, &matchCount);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to scan packet. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Scan each packet (in the ordering given in the PCAP file) through
|
||||
// Hyperscan using the block-mode interface.
|
||||
void scanBlock() {
|
||||
for (size_t i = 0; i != packets.size(); ++i) {
|
||||
const std::string &pkt = packets[i];
|
||||
hs_error_t err = hs_scan(db, pkt.c_str(), pkt.length(), 0,
|
||||
scratch, onMatch, &matchCount);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to scan packet. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// helper function - see end of file
|
||||
static void parseFile(const char *filename, vector<string> &patterns,
|
||||
vector<unsigned> &flags, vector<unsigned> &ids,
|
||||
vector<string> &originals);
|
||||
|
||||
class Sigdata {
|
||||
vector<unsigned> flags;
|
||||
vector<unsigned> ids;
|
||||
vector<string> patterns;
|
||||
vector<string> originals;
|
||||
|
||||
public:
|
||||
Sigdata() {}
|
||||
Sigdata(const char *filename) {
|
||||
parseFile(filename, patterns, flags, ids, originals);
|
||||
|
||||
}
|
||||
|
||||
const string &get_original(unsigned index) const {
|
||||
return originals[index];
|
||||
}
|
||||
|
||||
hs_database_t *compileDatabase(unsigned mode, double *compileTime) const {
|
||||
hs_database_t *db = nullptr;
|
||||
hs_compile_error_t *compileErr;
|
||||
|
||||
// Turn our vector of strings into a vector of char*'s to pass in to
|
||||
// hs_compile_multi. (This is just using the vector of strings as
|
||||
// dynamic storage.)
|
||||
vector<const char *> cstrPatterns;
|
||||
cstrPatterns.reserve(patterns.size());
|
||||
for (const auto &pattern : patterns) {
|
||||
cstrPatterns.push_back(pattern.c_str());
|
||||
}
|
||||
|
||||
Clock clock;
|
||||
clock.start();
|
||||
hs_error_t err = hs_compile_multi(cstrPatterns.data(), flags.data(),
|
||||
ids.data(), cstrPatterns.size(), mode,
|
||||
nullptr, &db, &compileErr);
|
||||
clock.stop();
|
||||
if (err != HS_SUCCESS) {
|
||||
if (compileErr->expression < 0) {
|
||||
// The error does not refer to a particular expression.
|
||||
cerr << "ERROR: " << compileErr->message << endl;
|
||||
} else {
|
||||
cerr << "ERROR: Pattern '"
|
||||
<< patterns[compileErr->expression]
|
||||
<< "' failed with error '" << compileErr->message << "'"
|
||||
<< endl;
|
||||
}
|
||||
// As the compileErr pointer points to dynamically allocated memory,
|
||||
// if we get an error, we must be sure to release it. This is not
|
||||
// necessary when no error is detected.
|
||||
hs_free_compile_error(compileErr);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
*compileTime = clock.seconds();
|
||||
return db;
|
||||
}
|
||||
|
||||
unsigned size() const {
|
||||
return patterns.size();
|
||||
}
|
||||
|
||||
Sigdata cloneExclude(const set<unsigned> &excludeIndexSet) const {
|
||||
Sigdata c;
|
||||
for (unsigned i = 0, e = size(); i != e; ++i) {
|
||||
if (excludeIndexSet.find(i) == excludeIndexSet.end()) {
|
||||
c.flags.push_back(flags[i]);
|
||||
c.ids.push_back(ids[i]);
|
||||
c.patterns.push_back(patterns[i]);
|
||||
c.originals.push_back(originals[i]);
|
||||
}
|
||||
}
|
||||
return c;
|
||||
}
|
||||
};
|
||||
|
||||
// Print the command-line help text to stderr, one line at a time. The
// argument (conventionally the program name) is not used.
static
void usage(const char *) {
    static const char *const helpLines[] = {
        "Usage:",
        "",
        " patbench [-n repeats] [ -G generations] [ -C criterion ]",
        " [ -F factor_group_size ] [ -N | -S ] <pattern file> <pcap file>",
        "",
        " -n repeats sets the number of times the PCAP is repeatedly scanned",
        " with the pattern.",
        " -G generations sets the number of generations that the algorithm is",
        " run for.",
        " -N sets non-streaming mode, -S sets streaming mode (default).",
        " -F sets the factor group size (must be >0); this allows the detection",
        " of multiple interacting factors.",
        "",
        " -C sets the 'criterion', which can be either:",
        " t throughput (the default) - this requires a pcap file",
        " r scratch size",
        " s stream state size",
        " c compile time",
        " b bytecode size",
        "",
        "We recommend the use of a utility like 'taskset' on multiprocessor hosts to",
        "lock execution to a single processor: this will remove processor migration",
        "by the scheduler as a source of noise in the results.",
    };

    for (const char *line : helpLines) {
        std::cerr << line << std::endl;
    }
}
|
||||
|
||||
static
|
||||
double measure_stream_time(Benchmark &bench, unsigned int repeatCount) {
|
||||
Clock clock;
|
||||
bench.clearMatches();
|
||||
clock.start();
|
||||
for (unsigned int i = 0; i < repeatCount; i++) {
|
||||
bench.openStreams();
|
||||
bench.scanStreams();
|
||||
bench.closeStreams();
|
||||
}
|
||||
clock.stop();
|
||||
double secsScan = clock.seconds();
|
||||
return secsScan;
|
||||
}
|
||||
|
||||
static
|
||||
double measure_block_time(Benchmark &bench, unsigned int repeatCount) {
|
||||
Clock clock;
|
||||
bench.clearMatches();
|
||||
clock.start();
|
||||
for (unsigned int i = 0; i < repeatCount; i++) {
|
||||
bench.scanBlock();
|
||||
}
|
||||
clock.stop();
|
||||
double secsScan = clock.seconds();
|
||||
return secsScan;
|
||||
}
|
||||
|
||||
static
|
||||
double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
|
||||
unsigned repeatCount, Criterion criterion,
|
||||
bool diagnose = true) {
|
||||
double compileTime = 0;
|
||||
bench.setDatabase(sigs.compileDatabase(mode, &compileTime));
|
||||
|
||||
switch (criterion) {
|
||||
case CRITERION_BYTECODE_SIZE: {
|
||||
size_t dbSize;
|
||||
hs_error_t err = hs_database_size(bench.getDatabase(), &dbSize);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: could not retrieve bytecode size" << endl;
|
||||
exit(1);
|
||||
}
|
||||
return dbSize;
|
||||
}
|
||||
case CRITERION_COMPILE_TIME:
|
||||
return compileTime;
|
||||
case CRITERION_STREAM_STATE: {
|
||||
size_t streamStateSize;
|
||||
hs_error_t err = hs_stream_size(bench.getDatabase(), &streamStateSize);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: could not retrieve stream state size" << endl;
|
||||
exit(1);
|
||||
}
|
||||
return streamStateSize;
|
||||
}
|
||||
case CRITERION_SCRATCH_SIZE:
|
||||
return bench.getScratchSize();
|
||||
case CRITERION_THROUGHPUT:
|
||||
default:
|
||||
break; // do nothing - we are THROUGHPUT
|
||||
}
|
||||
double scan_time;
|
||||
if (mode == HS_MODE_NOSTREAM) {
|
||||
scan_time = measure_block_time(bench, repeatCount);
|
||||
} else {
|
||||
scan_time = measure_stream_time(bench, repeatCount);
|
||||
}
|
||||
size_t bytes = bench.bytes();
|
||||
size_t matches = bench.matches();
|
||||
if (diagnose) {
|
||||
cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time
|
||||
<< " sec, Scanned " << bytes * repeatCount << " bytes, Throughput "
|
||||
<< std::fixed << std::setprecision(3)
|
||||
<< (bytes * 8 * repeatCount) / (scan_time * 1000000)
|
||||
<< " Mbps, Matches " << matches << endl;
|
||||
}
|
||||
return (bytes * 8 * repeatCount) / (scan_time * 1000000);
|
||||
}
|
||||
|
||||
// Main entry point.
|
||||
int main(int argc, char **argv) {
|
||||
unsigned int repeatCount = 1;
|
||||
unsigned int mode = HS_MODE_STREAM;
|
||||
Criterion criterion = CRITERION_THROUGHPUT;
|
||||
unsigned int gen_max = 10;
|
||||
unsigned int factor_max = 1;
|
||||
// Process command line arguments.
|
||||
int opt;
|
||||
while ((opt = getopt(argc, argv, "SNn:G:F:C:")) != -1) {
|
||||
switch (opt) {
|
||||
case 'F':
|
||||
factor_max = atoi(optarg);
|
||||
break;
|
||||
case 'G':
|
||||
gen_max = atoi(optarg);
|
||||
break;
|
||||
case 'S':
|
||||
mode = HS_MODE_STREAM;
|
||||
break;
|
||||
case 'N':
|
||||
mode = HS_MODE_NOSTREAM;
|
||||
break;
|
||||
case 'C':
|
||||
switch (optarg[0]) {
|
||||
case 't':
|
||||
criterion = CRITERION_THROUGHPUT;
|
||||
break;
|
||||
case 'b':
|
||||
criterion = CRITERION_BYTECODE_SIZE;
|
||||
break;
|
||||
case 'c':
|
||||
criterion = CRITERION_COMPILE_TIME;
|
||||
break;
|
||||
case 's':
|
||||
criterion = CRITERION_STREAM_STATE;
|
||||
break;
|
||||
case 'r':
|
||||
criterion = CRITERION_SCRATCH_SIZE;
|
||||
break;
|
||||
default:
|
||||
cerr << "Unrecognised criterion: " << optarg[0] << endl;
|
||||
usage(argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
break;
|
||||
case 'n':
|
||||
repeatCount = atoi(optarg);
|
||||
break;
|
||||
default:
|
||||
usage(argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
if (argc - optind != ((criterion == CRITERION_THROUGHPUT) ? 2 : 1)) {
|
||||
usage(argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const char *patternFile = argv[optind];
|
||||
const char *pcapFile = argv[optind + 1];
|
||||
|
||||
// Read our input PCAP file in
|
||||
Benchmark bench;
|
||||
if (criterion == CRITERION_THROUGHPUT) {
|
||||
if (!bench.readStreams(pcapFile)) {
|
||||
cerr << "Unable to read packets from PCAP file. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
if ((criterion == CRITERION_STREAM_STATE) && (mode != HS_MODE_STREAM)) {
|
||||
cerr << "Cannot evaluate stream state for block mode compile. Exiting."
|
||||
<< endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
cout << "Base signatures: " << patternFile;
|
||||
if (pcapFile) {
|
||||
cout << "\tPCAP input file: " << pcapFile
|
||||
<< "\tRepeat count: " << repeatCount;
|
||||
}
|
||||
if (mode == HS_MODE_STREAM) {
|
||||
cout << "\tMode: streaming";
|
||||
} else {
|
||||
cout << "\tMode: block";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
Sigdata sigs(patternFile);
|
||||
|
||||
// calculate and show a baseline
|
||||
eval_set(bench, sigs, mode, repeatCount, criterion);
|
||||
|
||||
set<unsigned> work_sigs, exclude;
|
||||
|
||||
for (unsigned i = 0; i < sigs.size(); ++i) {
|
||||
work_sigs.insert(i);
|
||||
}
|
||||
|
||||
double score_base =
|
||||
eval_set(bench, sigs, mode, repeatCount, criterion, false);
|
||||
bool maximize = higher_is_better(criterion);
|
||||
cout << "Number of signatures: " << sigs.size() << endl;
|
||||
cout << "Base performance: ";
|
||||
print_criterion(criterion, score_base);
|
||||
cout << endl;
|
||||
|
||||
unsigned generations = min(gen_max, (sigs.size() - 1) / factor_max);
|
||||
|
||||
cout << "Cutting signatures cumulatively for " << generations
|
||||
<< " generations" << endl;
|
||||
for (unsigned gen = 0; gen < generations; ++gen) {
|
||||
cout << "Generation " << gen << " ";
|
||||
set<unsigned> s(work_sigs.begin(), work_sigs.end());
|
||||
double best = maximize ? 0 : 1000000000000.0;
|
||||
unsigned count = 0;
|
||||
while (s.size() > factor_max) {
|
||||
count++;
|
||||
cout << "." << std::flush;
|
||||
vector<unsigned> sv(s.begin(), s.end());
|
||||
random_shuffle(sv.begin(), sv.end());
|
||||
unsigned groups = factor_max + 1;
|
||||
for (unsigned current_group = 0; current_group < groups;
|
||||
current_group++) {
|
||||
unsigned sz = sv.size();
|
||||
unsigned lo = (current_group * sz) / groups;
|
||||
unsigned hi = ((current_group + 1) * sz) / groups;
|
||||
|
||||
set<unsigned> s_part1(sv.begin(), sv.begin() + lo);
|
||||
set<unsigned> s_part2(sv.begin() + hi, sv.end());
|
||||
set<unsigned> s_tmp = s_part1;
|
||||
s_tmp.insert(s_part2.begin(), s_part2.end());
|
||||
set<unsigned> tmp = s_tmp;
|
||||
tmp.insert(exclude.begin(), exclude.end());
|
||||
Sigdata sigs_tmp = sigs.cloneExclude(tmp);
|
||||
double score = eval_set(bench, sigs_tmp, mode, repeatCount,
|
||||
criterion, false);
|
||||
|
||||
if ((current_group == 0) ||
|
||||
(!maximize ? (score < best) : (score > best))) {
|
||||
s = s_tmp;
|
||||
best = score;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (unsigned i = count; i < 16; i++) {
|
||||
cout << " ";
|
||||
}
|
||||
cout << "Performance: ";
|
||||
print_criterion(criterion, best);
|
||||
cout << " (" << std::fixed << std::setprecision(3) << (best / score_base)
|
||||
<< "x) after cutting:" << endl;
|
||||
|
||||
// s now has factor_max signatures
|
||||
for (const auto &found : s) {
|
||||
exclude.insert(found);
|
||||
work_sigs.erase(found);
|
||||
cout << sigs.get_original(found) << endl;
|
||||
}
|
||||
|
||||
cout << endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Helper function to locate the offset of the first byte of the payload in the
 * given ethernet frame. Offset into the packet, and the length of the payload
 * are returned in the arguments @a offset and @a length.
 *
 * Only unfragmented IPv4 packets carrying TCP or UDP are accepted. Returns
 * false for anything else, for packets with no payload, and for packets whose
 * header length fields are inconsistent (IP or TCP header shorter than its
 * fixed part, or a total length smaller than the headers it claims to hold).
 *
 * The caller must make sure the headers read here lie inside the buffer; this
 * function has no access to the captured length.
 */
static
bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
                   unsigned int *length) {
    const ip *iph =
        reinterpret_cast<const ip *>(pkt_data + sizeof(ether_header));
    const tcphdr *th = nullptr;

    // Ignore packets that aren't IPv4
    if (iph->ip_v != 4) {
        return false;
    }

    // Ignore fragmented packets.
    if (iph->ip_off & htons(IP_MF | IP_OFFMASK)) {
        return false;
    }

    // IP header length, and transport header length.
    const unsigned int ihlen = iph->ip_hl * 4;
    unsigned int thlen = 0;

    // A header length below the fixed IPv4 header size is malformed.
    if (ihlen < sizeof(ip)) {
        return false;
    }

    switch (iph->ip_p) {
    case IPPROTO_TCP:
        th = reinterpret_cast<const tcphdr *>(
            reinterpret_cast<const char *>(iph) + ihlen);
        thlen = th->th_off * 4;
        // Likewise for a TCP data offset below the fixed TCP header size.
        if (thlen < sizeof(tcphdr)) {
            return false;
        }
        break;
    case IPPROTO_UDP:
        thlen = sizeof(udphdr);
        break;
    default:
        return false;
    }

    const unsigned int headers = ihlen + thlen;
    const unsigned int total = ntohs(iph->ip_len);

    *offset = sizeof(ether_header) + headers;

    // Without this check the unsigned subtraction below would wrap around and
    // report a huge payload for a packet that is shorter than its headers.
    if (total < headers) {
        *length = 0;
        return false;
    }

    *length = total - headers;

    return *length != 0;
}
|
||||
|
||||
static unsigned parseFlags(const string &flagsStr) {
|
||||
unsigned flags = 0;
|
||||
for (const auto &c : flagsStr) {
|
||||
switch (c) {
|
||||
case 'i':
|
||||
flags |= HS_FLAG_CASELESS; break;
|
||||
case 'm':
|
||||
flags |= HS_FLAG_MULTILINE; break;
|
||||
case 's':
|
||||
flags |= HS_FLAG_DOTALL; break;
|
||||
case 'H':
|
||||
flags |= HS_FLAG_SINGLEMATCH; break;
|
||||
case 'V':
|
||||
flags |= HS_FLAG_ALLOWEMPTY; break;
|
||||
case '8':
|
||||
flags |= HS_FLAG_UTF8; break;
|
||||
case 'W':
|
||||
flags |= HS_FLAG_UCP; break;
|
||||
default:
|
||||
cerr << "Unsupported flag \'" << c << "\'" << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
|
||||
static void parseFile(const char *filename, vector<string> &patterns,
|
||||
vector<unsigned> &flags, vector<unsigned> &ids,
|
||||
vector<string> &originals) {
|
||||
ifstream inFile(filename);
|
||||
if (!inFile.good()) {
|
||||
cerr << "ERROR: Can't open pattern file \"" << filename << "\"" << endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
for (unsigned i = 1; !inFile.eof(); ++i) {
|
||||
string line;
|
||||
getline(inFile, line);
|
||||
|
||||
// if line is empty, or a comment, we can skip it
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// otherwise, it should be ID:PCRE, e.g.
|
||||
// 10001:/foobar/is
|
||||
|
||||
size_t colonIdx = line.find_first_of(':');
|
||||
if (colonIdx == string::npos) {
|
||||
cerr << "ERROR: Could not parse line " << i << endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// we should have an unsigned int as an ID, before the colon
|
||||
unsigned id = std::stoi(line.substr(0, colonIdx).c_str());
|
||||
|
||||
// rest of the expression is the PCRE
|
||||
const string expr(line.substr(colonIdx + 1));
|
||||
|
||||
size_t flagsStart = expr.find_last_of('/');
|
||||
if (flagsStart == string::npos) {
|
||||
cerr << "ERROR: no trailing '/' char" << endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
string pcre(expr.substr(1, flagsStart - 1));
|
||||
string flagsStr(expr.substr(flagsStart + 1, expr.size() - flagsStart));
|
||||
unsigned flag = parseFlags(flagsStr);
|
||||
|
||||
originals.push_back(line);
|
||||
patterns.push_back(pcre);
|
||||
flags.push_back(flag);
|
||||
ids.push_back(id);
|
||||
}
|
||||
}
|
679
examples/pcapscan.cc
Normal file
679
examples/pcapscan.cc
Normal file
@@ -0,0 +1,679 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Hyperscan example program 2: pcapscan
|
||||
*
|
||||
* This example is a very simple packet scanning benchmark. It scans a given
|
||||
* PCAP file full of network traffic against a group of regular expressions and
|
||||
* returns some coarse performance measurements. This example provides a quick
|
||||
* way to examine the performance achievable on a particular combination of
|
||||
* platform, pattern set and input data.
|
||||
*
|
||||
* Build instructions:
|
||||
*
|
||||
* g++ -std=c++11 -O2 -o pcapscan pcapscan.cc $(pkg-config --cflags --libs libhs) -lpcap
|
||||
*
|
||||
* Usage:
|
||||
*
|
||||
* ./pcapscan [-n repeats] <pattern file> <pcap file>
|
||||
*
|
||||
* We recommend the use of a utility like 'taskset' on multiprocessor hosts to
|
||||
* pin execution to a single processor: this will remove processor migration
|
||||
* by the scheduler as a source of noise in the results.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <cstring>
|
||||
#include <chrono>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
// We use the BSD primitives throughout as they exist on both BSD and Linux.
|
||||
#define __FAVOR_BSD
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/in_systm.h>
|
||||
#include <netinet/ip.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <netinet/udp.h>
|
||||
#include <netinet/ip_icmp.h>
|
||||
#include <net/ethernet.h>
|
||||
#include <arpa/inet.h>
|
||||
|
||||
#include <pcap.h>
|
||||
|
||||
#include <hs.h>
|
||||
|
||||
using std::cerr;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
using std::ifstream;
|
||||
using std::string;
|
||||
using std::unordered_map;
|
||||
using std::vector;
|
||||
|
||||
// Identifies one stream in the pcap input: protocol, source and destination
// address, source and destination port, all taken verbatim from the packet
// headers (no byte-order conversion is applied).
struct FiveTuple {
    unsigned int protocol;
    unsigned int srcAddr;
    unsigned int srcPort;
    unsigned int dstAddr;
    unsigned int dstPort;

    // Build the key from the IP header of a TCP or UDP packet. The transport
    // header is expected to follow the IP header directly in memory.
    FiveTuple(const struct ip *iphdr)
        : protocol(iphdr->ip_p), srcAddr(iphdr->ip_src.s_addr), srcPort(0),
          dstAddr(iphdr->ip_dst.s_addr), dstPort(0) {
        // UDP/TCP ports: both are read through the UDP header layout, which
        // starts ip_hl 32-bit words after the beginning of the IP header.
        const char *ipStart = (const char *)iphdr;
        const struct udphdr *ports =
            (const struct udphdr *)(ipStart + (iphdr->ip_hl * 4));
        srcPort = ports->uh_sport;
        dstPort = ports->uh_dport;
    }

    // Two keys are equal only when all five fields agree.
    bool operator==(const FiveTuple &a) const {
        if (protocol != a.protocol) {
            return false;
        }
        if (srcAddr != a.srcAddr || srcPort != a.srcPort) {
            return false;
        }
        if (dstAddr != a.dstAddr) {
            return false;
        }
        return dstPort == a.dstPort;
    }
};
|
||||
|
||||
// A *very* simple hash function, used when we create an unordered_map of
|
||||
// FiveTuple objects.
|
||||
struct FiveTupleHash {
|
||||
size_t operator()(const FiveTuple &x) const {
|
||||
return x.srcAddr ^ x.dstAddr ^ x.protocol ^ x.srcPort ^ x.dstPort;
|
||||
}
|
||||
};
|
||||
|
||||
// Helper function. See end of file.
|
||||
static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
||||
unsigned int *length);
|
||||
|
||||
// Match callback handed to the Hyperscan scan calls. The context pointer is a
// size_t match counter, which is incremented once per invocation; the match
// details themselves are ignored. Returning 0 asks for scanning to continue.
static
int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
            unsigned int flags, void *ctx) {
    size_t &counter = *static_cast<size_t *>(ctx);
    ++counter;
    return 0; // continue matching
}
|
||||
|
||||
// Simple timing class: call start(), do the work, call stop(), then read the
// elapsed time with seconds().
//
// A steady (monotonic) clock is used so that wall-clock adjustments made while
// a measurement is running cannot distort, or even turn negative, the
// reported interval.
class Clock {
public:
    // Record the beginning of the interval.
    void start() {
        time_start = std::chrono::steady_clock::now();
    }

    // Record the end of the interval.
    void stop() {
        time_end = std::chrono::steady_clock::now();
    }

    // Time between the last start() and the last stop(), in seconds. Returns
    // zero if neither has been called yet.
    double seconds() const {
        std::chrono::duration<double> delta = time_end - time_start;
        return delta.count();
    }
private:
    std::chrono::time_point<std::chrono::steady_clock> time_start, time_end;
};
|
||||
|
||||
// Class wrapping all state associated with the benchmark
|
||||
class Benchmark {
|
||||
private:
|
||||
// Packet data to be scanned.
|
||||
vector<string> packets;
|
||||
|
||||
// The stream ID to which each packet belongs
|
||||
vector<size_t> stream_ids;
|
||||
|
||||
// Map used to construct stream_ids
|
||||
unordered_map<FiveTuple, size_t, FiveTupleHash> stream_map;
|
||||
|
||||
// Hyperscan compiled database (streaming mode)
|
||||
const hs_database_t *db_streaming;
|
||||
|
||||
// Hyperscan compiled database (block mode)
|
||||
const hs_database_t *db_block;
|
||||
|
||||
// Hyperscan temporary scratch space (used in both modes)
|
||||
hs_scratch_t *scratch;
|
||||
|
||||
// Vector of Hyperscan stream state (used in streaming mode)
|
||||
vector<hs_stream_t *> streams;
|
||||
|
||||
// Count of matches found during scanning
|
||||
size_t matchCount;
|
||||
|
||||
public:
|
||||
Benchmark(const hs_database_t *streaming, const hs_database_t *block)
|
||||
: db_streaming(streaming), db_block(block), scratch(nullptr),
|
||||
matchCount(0) {
|
||||
// Allocate enough scratch space to handle either streaming or block
|
||||
// mode, so we only need the one scratch region.
|
||||
hs_error_t err = hs_alloc_scratch(db_streaming, &scratch);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: could not allocate scratch space. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
// This second call will increase the scratch size if more is required
|
||||
// for block mode.
|
||||
err = hs_alloc_scratch(db_block, &scratch);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: could not allocate scratch space. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
~Benchmark() {
|
||||
// Free scratch region
|
||||
hs_free_scratch(scratch);
|
||||
}
|
||||
|
||||
// Read a set of streams from a pcap file
|
||||
bool readStreams(const char *pcapFile) {
|
||||
// Open PCAP file for input
|
||||
char errbuf[PCAP_ERRBUF_SIZE];
|
||||
pcap_t *pcapHandle = pcap_open_offline(pcapFile, errbuf);
|
||||
if (pcapHandle == nullptr) {
|
||||
cerr << "ERROR: Unable to open pcap file \"" << pcapFile
|
||||
<< "\": " << errbuf << endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
struct pcap_pkthdr pktHeader;
|
||||
const unsigned char *pktData;
|
||||
while ((pktData = pcap_next(pcapHandle, &pktHeader)) != nullptr) {
|
||||
unsigned int offset = 0, length = 0;
|
||||
if (!payloadOffset(pktData, &offset, &length)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Valid TCP or UDP packet
|
||||
const struct ip *iphdr = (const struct ip *)(pktData
|
||||
+ sizeof(struct ether_header));
|
||||
const char *payload = (const char *)pktData + offset;
|
||||
|
||||
size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
|
||||
stream_map.size())).first->second;
|
||||
|
||||
packets.push_back(string(payload, length));
|
||||
stream_ids.push_back(id);
|
||||
}
|
||||
pcap_close(pcapHandle);
|
||||
|
||||
return !packets.empty();
|
||||
}
|
||||
|
||||
// Return the number of bytes scanned
|
||||
size_t bytes() const {
|
||||
size_t sum = 0;
|
||||
for (const auto &packet : packets) {
|
||||
sum += packet.size();
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Return the number of matches found.
|
||||
size_t matches() const {
|
||||
return matchCount;
|
||||
}
|
||||
|
||||
// Clear the number of matches found.
|
||||
void clearMatches() {
|
||||
matchCount = 0;
|
||||
}
|
||||
|
||||
// Open a Hyperscan stream for each stream in stream_ids
|
||||
void openStreams() {
|
||||
streams.resize(stream_map.size());
|
||||
for (auto &stream : streams) {
|
||||
hs_error_t err = hs_open_stream(db_streaming, 0, &stream);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to open stream. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Close all open Hyperscan streams (potentially generating any
|
||||
// end-anchored matches)
|
||||
void closeStreams() {
|
||||
for (auto &stream : streams) {
|
||||
hs_error_t err = hs_close_stream(stream, scratch, onMatch,
|
||||
&matchCount);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to close stream. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Scan each packet (in the ordering given in the PCAP file) through
|
||||
// Hyperscan using the streaming interface.
|
||||
void scanStreams() {
|
||||
for (size_t i = 0; i != packets.size(); ++i) {
|
||||
const std::string &pkt = packets[i];
|
||||
hs_error_t err = hs_scan_stream(streams[stream_ids[i]],
|
||||
pkt.c_str(), pkt.length(), 0,
|
||||
scratch, onMatch, &matchCount);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to scan packet. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Scan each packet (in the ordering given in the PCAP file) through
|
||||
// Hyperscan using the block-mode interface.
|
||||
void scanBlock() {
|
||||
for (size_t i = 0; i != packets.size(); ++i) {
|
||||
const std::string &pkt = packets[i];
|
||||
hs_error_t err = hs_scan(db_block, pkt.c_str(), pkt.length(), 0,
|
||||
scratch, onMatch, &matchCount);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to scan packet. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Display some information about the compiled database and scanned data.
|
||||
void displayStats() {
|
||||
size_t numPackets = packets.size();
|
||||
size_t numStreams = stream_map.size();
|
||||
size_t numBytes = bytes();
|
||||
hs_error_t err;
|
||||
|
||||
cout << numPackets << " packets in " << numStreams
|
||||
<< " streams, totalling " << numBytes << " bytes." << endl;
|
||||
cout << "Average packet length: " << numBytes / numPackets << " bytes."
|
||||
<< endl;
|
||||
cout << "Average stream length: " << numBytes / numStreams << " bytes."
|
||||
<< endl;
|
||||
cout << endl;
|
||||
|
||||
size_t dbStream_size = 0;
|
||||
err = hs_database_size(db_streaming, &dbStream_size);
|
||||
if (err == HS_SUCCESS) {
|
||||
cout << "Streaming mode Hyperscan database size : "
|
||||
<< dbStream_size << " bytes." << endl;
|
||||
} else {
|
||||
cout << "Error getting streaming mode Hyperscan database size"
|
||||
<< endl;
|
||||
}
|
||||
|
||||
size_t dbBlock_size = 0;
|
||||
err = hs_database_size(db_block, &dbBlock_size);
|
||||
if (err == HS_SUCCESS) {
|
||||
cout << "Block mode Hyperscan database size : "
|
||||
<< dbBlock_size << " bytes." << endl;
|
||||
} else {
|
||||
cout << "Error getting block mode Hyperscan database size"
|
||||
<< endl;
|
||||
}
|
||||
|
||||
size_t stream_size = 0;
|
||||
err = hs_stream_size(db_streaming, &stream_size);
|
||||
if (err == HS_SUCCESS) {
|
||||
cout << "Streaming mode Hyperscan stream state size: "
|
||||
<< stream_size << " bytes (per stream)." << endl;
|
||||
} else {
|
||||
cout << "Error getting stream state size" << endl;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// helper function - see end of file
|
||||
static void parseFile(const char *filename, vector<string> &patterns,
|
||||
vector<unsigned> &flags, vector<unsigned> &ids);
|
||||
|
||||
// Compile the given expressions, with their parallel flags and ids arrays,
// into a single Hyperscan database for the requested mode, and print how long
// the compile took. A compile failure is reported on stderr and terminates
// the program. The caller owns the returned database.
//
// flags and ids are taken by const reference: they are only read, so there is
// no reason to copy the vectors on every call.
static hs_database_t *buildDatabase(const vector<const char *> &expressions,
                                    const vector<unsigned> &flags,
                                    const vector<unsigned> &ids,
                                    unsigned int mode) {
    hs_database_t *db = nullptr;
    hs_compile_error_t *compileErr = nullptr;
    hs_error_t err;

    Clock clock;
    clock.start();

    err = hs_compile_multi(expressions.data(), flags.data(), ids.data(),
                           expressions.size(), mode, nullptr, &db, &compileErr);

    clock.stop();

    if (err != HS_SUCCESS) {
        if (compileErr->expression < 0) {
            // The error does not refer to a particular expression.
            cerr << "ERROR: " << compileErr->message << endl;
        } else {
            cerr << "ERROR: Pattern '" << expressions[compileErr->expression]
                 << "' failed compilation with error: " << compileErr->message
                 << endl;
        }
        // As the compileErr pointer points to dynamically allocated memory, if
        // we get an error, we must be sure to release it. This is not
        // necessary when no error is detected.
        hs_free_compile_error(compileErr);
        exit(-1);
    }

    cout << "Hyperscan " << (mode == HS_MODE_STREAM ? "streaming" : "block")
         << " mode database compiled in " << clock.seconds() << " seconds."
         << endl;

    return db;
}
|
||||
|
||||
/**
|
||||
* This function will read in the file with the specified name, with an
|
||||
* expression per line, ignoring lines starting with '#' and build a Hyperscan
|
||||
* database for it.
|
||||
*/
|
||||
static void databasesFromFile(const char *filename,
|
||||
hs_database_t **db_streaming,
|
||||
hs_database_t **db_block) {
|
||||
// hs_compile_multi requires three parallel arrays containing the patterns,
|
||||
// flags and ids that we want to work with. To achieve this we use
|
||||
// vectors and new entries onto each for each valid line of input from
|
||||
// the pattern file.
|
||||
vector<string> patterns;
|
||||
vector<unsigned> flags;
|
||||
vector<unsigned> ids;
|
||||
|
||||
// do the actual file reading and string handling
|
||||
parseFile(filename, patterns, flags, ids);
|
||||
|
||||
// Turn our vector of strings into a vector of char*'s to pass in to
|
||||
// hs_compile_multi. (This is just using the vector of strings as dynamic
|
||||
// storage.)
|
||||
vector<const char*> cstrPatterns;
|
||||
for (const auto &pattern : patterns) {
|
||||
cstrPatterns.push_back(pattern.c_str());
|
||||
}
|
||||
|
||||
cout << "Compiling Hyperscan databases with " << patterns.size()
|
||||
<< " patterns." << endl;
|
||||
|
||||
*db_streaming = buildDatabase(cstrPatterns, flags, ids, HS_MODE_STREAM);
|
||||
*db_block = buildDatabase(cstrPatterns, flags, ids, HS_MODE_BLOCK);
|
||||
}
|
||||
|
||||
// Write a one-line command synopsis for this program to stderr.
static void usage(const char *prog) {
    cerr << "Usage: " << prog;
    cerr << " [-n repeats] <pattern file> <pcap file>" << endl;
}
|
||||
|
||||
// Main entry point.
|
||||
int main(int argc, char **argv) {
|
||||
unsigned int repeatCount = 1;
|
||||
|
||||
// Process command line arguments.
|
||||
int opt;
|
||||
while ((opt = getopt(argc, argv, "n:")) != -1) {
|
||||
switch (opt) {
|
||||
case 'n':
|
||||
repeatCount = atoi(optarg);
|
||||
break;
|
||||
default:
|
||||
usage(argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
if (argc - optind != 2) {
|
||||
usage(argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const char *patternFile = argv[optind];
|
||||
const char *pcapFile = argv[optind + 1];
|
||||
|
||||
// Read our pattern set in and build Hyperscan databases from it.
|
||||
cout << "Pattern file: " << patternFile << endl;
|
||||
hs_database_t *db_streaming, *db_block;
|
||||
databasesFromFile(patternFile, &db_streaming, &db_block);
|
||||
|
||||
// Read our input PCAP file in
|
||||
Benchmark bench(db_streaming, db_block);
|
||||
cout << "PCAP input file: " << pcapFile << endl;
|
||||
if (!bench.readStreams(pcapFile)) {
|
||||
cerr << "Unable to read packets from PCAP file. Exiting." << endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
if (repeatCount != 1) {
|
||||
cout << "Repeating PCAP scan " << repeatCount << " times." << endl;
|
||||
}
|
||||
|
||||
bench.displayStats();
|
||||
|
||||
Clock clock;
|
||||
|
||||
// Streaming mode scans.
|
||||
double secsStreamingScan = 0.0, secsStreamingOpenClose = 0.0;
|
||||
for (unsigned int i = 0; i < repeatCount; i++) {
|
||||
// Open streams.
|
||||
clock.start();
|
||||
bench.openStreams();
|
||||
clock.stop();
|
||||
secsStreamingOpenClose += clock.seconds();
|
||||
|
||||
// Scan all our packets in streaming mode.
|
||||
clock.start();
|
||||
bench.scanStreams();
|
||||
clock.stop();
|
||||
secsStreamingScan += clock.seconds();
|
||||
|
||||
// Close streams.
|
||||
clock.start();
|
||||
bench.closeStreams();
|
||||
clock.stop();
|
||||
secsStreamingOpenClose += clock.seconds();
|
||||
}
|
||||
|
||||
// Collect data from streaming mode scans.
|
||||
size_t bytes = bench.bytes();
|
||||
double tputStreamScanning = (bytes * 8 * repeatCount) / secsStreamingScan;
|
||||
double tputStreamOverhead = (bytes * 8 * repeatCount) / (secsStreamingScan + secsStreamingOpenClose);
|
||||
size_t matchesStream = bench.matches();
|
||||
double matchRateStream = matchesStream / ((bytes * repeatCount) / 1024.0); // matches per kilobyte
|
||||
|
||||
// Scan all our packets in block mode.
|
||||
bench.clearMatches();
|
||||
clock.start();
|
||||
for (unsigned int i = 0; i < repeatCount; i++) {
|
||||
bench.scanBlock();
|
||||
}
|
||||
clock.stop();
|
||||
double secsScanBlock = clock.seconds();
|
||||
|
||||
// Collect data from block mode scans.
|
||||
double tputBlockScanning = (bytes * 8 * repeatCount) / secsScanBlock;
|
||||
size_t matchesBlock = bench.matches();
|
||||
double matchRateBlock = matchesBlock / ((bytes * repeatCount) / 1024.0); // matches per kilobyte
|
||||
|
||||
cout << endl << "Streaming mode:" << endl << endl;
|
||||
cout << " Total matches: " << matchesStream << endl;
|
||||
cout << std::fixed << std::setprecision(4);
|
||||
cout << " Match rate: " << matchRateStream << " matches/kilobyte" << endl;
|
||||
cout << std::fixed << std::setprecision(2);
|
||||
cout << " Throughput (with stream overhead): "
|
||||
<< tputStreamOverhead/1000000 << " megabits/sec" << endl;
|
||||
cout << " Throughput (no stream overhead): "
|
||||
<< tputStreamScanning/1000000 << " megabits/sec" << endl;
|
||||
|
||||
cout << endl << "Block mode:" << endl << endl;
|
||||
cout << " Total matches: " << matchesBlock << endl;
|
||||
cout << std::fixed << std::setprecision(4);
|
||||
cout << " Match rate: " << matchRateBlock << " matches/kilobyte" << endl;
|
||||
cout << std::fixed << std::setprecision(2);
|
||||
cout << " Throughput: "
|
||||
<< tputBlockScanning/1000000 << " megabits/sec" << endl;
|
||||
|
||||
cout << endl;
|
||||
if (bytes < (2*1024*1024)) {
|
||||
cout << endl << "WARNING: Input PCAP file is less than 2MB in size." << endl
|
||||
<< "This test may have been too short to calculate accurate results." << endl;
|
||||
}
|
||||
|
||||
// Close Hyperscan databases
|
||||
hs_free_database(db_streaming);
|
||||
hs_free_database(db_block);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Helper function to locate the offset of the first byte of the payload in the
 * given ethernet frame. Offset into the packet, and the length of the payload
 * are returned in the arguments @a offset and @a length.
 *
 * Only unfragmented IPv4 packets carrying TCP or UDP are accepted. Returns
 * false for anything else, for packets with no payload, and for packets whose
 * header length fields are inconsistent (IP or TCP header shorter than its
 * fixed part, or a total length smaller than the headers it claims to hold).
 *
 * The caller must make sure the headers read here lie inside the buffer; this
 * function has no access to the captured length.
 */
static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
                          unsigned int *length) {
    const ip *iph =
        reinterpret_cast<const ip *>(pkt_data + sizeof(ether_header));
    const tcphdr *th = nullptr;

    // Ignore packets that aren't IPv4
    if (iph->ip_v != 4) {
        return false;
    }

    // Ignore fragmented packets.
    if (iph->ip_off & htons(IP_MF|IP_OFFMASK)) {
        return false;
    }

    // IP header length, and transport header length.
    const unsigned int ihlen = iph->ip_hl * 4;
    unsigned int thlen = 0;

    // A header length below the fixed IPv4 header size is malformed.
    if (ihlen < sizeof(ip)) {
        return false;
    }

    switch (iph->ip_p) {
    case IPPROTO_TCP:
        th = reinterpret_cast<const tcphdr *>(
            reinterpret_cast<const char *>(iph) + ihlen);
        thlen = th->th_off * 4;
        // Likewise for a TCP data offset below the fixed TCP header size.
        if (thlen < sizeof(tcphdr)) {
            return false;
        }
        break;
    case IPPROTO_UDP:
        thlen = sizeof(udphdr);
        break;
    default:
        return false;
    }

    const unsigned int headers = ihlen + thlen;
    const unsigned int total = ntohs(iph->ip_len);

    *offset = sizeof(ether_header) + headers;

    // Without this check the unsigned subtraction below would wrap around and
    // report a huge payload for a packet that is shorter than its headers.
    if (total < headers) {
        *length = 0;
        return false;
    }

    *length = total - headers;

    return *length != 0;
}
|
||||
|
||||
static unsigned parseFlags(const string &flagsStr) {
|
||||
unsigned flags = 0;
|
||||
for (const auto &c : flagsStr) {
|
||||
switch (c) {
|
||||
case 'i':
|
||||
flags |= HS_FLAG_CASELESS; break;
|
||||
case 'm':
|
||||
flags |= HS_FLAG_MULTILINE; break;
|
||||
case 's':
|
||||
flags |= HS_FLAG_DOTALL; break;
|
||||
case 'H':
|
||||
flags |= HS_FLAG_SINGLEMATCH; break;
|
||||
case 'V':
|
||||
flags |= HS_FLAG_ALLOWEMPTY; break;
|
||||
case '8':
|
||||
flags |= HS_FLAG_UTF8; break;
|
||||
case 'W':
|
||||
flags |= HS_FLAG_UCP; break;
|
||||
default:
|
||||
cerr << "Unsupported flag \'" << c << "\'" << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
|
||||
static void parseFile(const char *filename, vector<string> &patterns,
|
||||
vector<unsigned> &flags, vector<unsigned> &ids) {
|
||||
ifstream inFile(filename);
|
||||
if (!inFile.good()) {
|
||||
cerr << "ERROR: Can't open pattern file \"" << filename << "\"" << endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
for (unsigned i = 1; !inFile.eof(); ++i) {
|
||||
string line;
|
||||
getline(inFile, line);
|
||||
|
||||
// if line is empty, or a comment, we can skip it
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// otherwise, it should be ID:PCRE, e.g.
|
||||
// 10001:/foobar/is
|
||||
|
||||
size_t colonIdx = line.find_first_of(':');
|
||||
if (colonIdx == string::npos) {
|
||||
cerr << "ERROR: Could not parse line " << i << endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// we should have an unsigned int as an ID, before the colon
|
||||
unsigned id = std::stoi(line.substr(0, colonIdx).c_str());
|
||||
|
||||
// rest of the expression is the PCRE
|
||||
const string expr(line.substr(colonIdx + 1));
|
||||
|
||||
size_t flagsStart = expr.find_last_of('/');
|
||||
if (flagsStart == string::npos) {
|
||||
cerr << "ERROR: no trailing '/' char" << endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
string pcre(expr.substr(1, flagsStart - 1));
|
||||
string flagsStr(expr.substr(flagsStart + 1, expr.size() - flagsStart));
|
||||
unsigned flag = parseFlags(flagsStr);
|
||||
|
||||
patterns.push_back(pcre);
|
||||
flags.push_back(flag);
|
||||
ids.push_back(id);
|
||||
}
|
||||
}
|
||||
|
221
examples/simplegrep.c
Normal file
221
examples/simplegrep.c
Normal file
@@ -0,0 +1,221 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Hyperscan example program 1: simplegrep
|
||||
*
|
||||
* This is a simple example of Hyperscan's most basic functionality: it will
|
||||
* search a given input file for a pattern supplied as a command-line argument.
|
||||
* It is intended to demonstrate correct usage of the hs_compile and hs_scan
|
||||
* functions of Hyperscan.
|
||||
*
|
||||
* Patterns are scanned in 'DOTALL' mode, which is equivalent to PCRE's '/s'
|
||||
* modifier. This behaviour can be changed by modifying the "flags" argument to
|
||||
* hs_compile.
|
||||
*
|
||||
* Build instructions:
|
||||
*
|
||||
* gcc -o simplegrep simplegrep.c $(pkg-config --cflags --libs libhs)
|
||||
*
|
||||
* Usage:
|
||||
*
|
||||
* ./simplegrep <pattern> <input file>
|
||||
*
|
||||
* Example:
|
||||
*
|
||||
* ./simplegrep int simplegrep.c
|
||||
*
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <hs.h>
|
||||
|
||||
/**
 * Match callback: invoked once for every match that is reported.
 *
 * @a ctx is an opaque pointer through which the application can reach its own
 * state on each match. This example treats it as a NUL-terminated string
 * holding the pattern that was searched for, and prints that string together
 * with the @a to offset of the match. The @a id, @a from and @a flags
 * arguments are not used here.
 *
 * Always returns 0.
 */
static int eventHandler(unsigned int id, unsigned long long from,
                        unsigned long long to, unsigned int flags, void *ctx) {
    const char *matchedPattern = (const char *)ctx;

    printf("Match for pattern \"%s\" at offset %llu\n", matchedPattern, to);

    return 0;
}
|
||||
|
||||
/**
 * Fill a data buffer from the given filename, returning it and filling @a
 * length with its length.
 *
 * The file is read in binary mode so that the size reported by ftell() is
 * exactly the number of bytes fread() will deliver. Files larger than UINT_MAX
 * bytes are clipped to UINT_MAX bytes (with a warning), because the length is
 * handed back as an unsigned int.
 *
 * Returns NULL on failure, after printing a message to stderr: the file cannot
 * be opened, sized or read, is empty, ends before the expected number of bytes
 * has been read, or the buffer cannot be allocated. @a length is only written
 * on success. The returned buffer comes from malloc() and must be released by
 * the caller with free().
 */
static char *readInputData(const char *inputFN, unsigned int *length) {
    FILE *f = fopen(inputFN, "rb");
    if (!f) {
        fprintf(stderr, "ERROR: unable to open file \"%s\": %s\n", inputFN,
                strerror(errno));
        return NULL;
    }

    /* We use fseek/ftell to get our data length, in order to keep this example
     * code as portable as possible. */
    if (fseek(f, 0, SEEK_END) != 0) {
        fprintf(stderr, "ERROR: unable to seek file \"%s\": %s\n", inputFN,
                strerror(errno));
        fclose(f);
        return NULL;
    }
    long dataLen = ftell(f);
    if (dataLen < 0) {
        fprintf(stderr, "ERROR: ftell() failed: %s\n", strerror(errno));
        fclose(f);
        return NULL;
    }
    if (fseek(f, 0, SEEK_SET) != 0) {
        fprintf(stderr, "ERROR: unable to seek file \"%s\": %s\n", inputFN,
                strerror(errno));
        fclose(f);
        return NULL;
    }

    /* Hyperscan's hs_scan function accepts length as an unsigned int, so we
     * limit the size of our buffer appropriately. */
    if ((unsigned long)dataLen > UINT_MAX) {
        dataLen = UINT_MAX;
        printf("WARNING: clipping data to %lu bytes\n",
               (unsigned long)dataLen);
    } else if (dataLen == 0) {
        fprintf(stderr, "ERROR: input file \"%s\" is empty\n", inputFN);
        fclose(f);
        return NULL;
    }

    /* The explicit cast keeps this file valid when built as C++ as well. */
    char *inputData = (char *)malloc((size_t)dataLen);
    if (!inputData) {
        fprintf(stderr, "ERROR: unable to malloc %lu bytes\n",
                (unsigned long)dataLen);
        fclose(f);
        return NULL;
    }

    char *p = inputData;
    size_t bytesLeft = (size_t)dataLen;
    while (bytesLeft) {
        size_t bytesRead = fread(p, 1, bytesLeft, f);
        bytesLeft -= bytesRead;
        p += bytesRead;
        if (ferror(f) != 0) {
            fprintf(stderr, "ERROR: fread() failed\n");
            free(inputData);
            fclose(f);
            return NULL;
        }
        /* A zero-byte read with no error means we hit end-of-file early (for
         * instance, the file shrank after we measured it). Without this check
         * the loop would never terminate. */
        if (bytesRead == 0 && bytesLeft != 0) {
            fprintf(stderr, "ERROR: unexpected end of file \"%s\"\n", inputFN);
            free(inputData);
            fclose(f);
            return NULL;
        }
    }

    fclose(f);

    *length = (unsigned int)dataLen;
    return inputData;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 3) {
|
||||
fprintf(stderr, "Usage: %s <pattern> <input file>\n", argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
char *pattern = argv[1];
|
||||
char *inputFN = argv[2];
|
||||
|
||||
/* First, we attempt to compile the pattern provided on the command line.
|
||||
* We assume 'DOTALL' semantics, meaning that the '.' meta-character will
|
||||
* match newline characters. The compiler will analyse the given pattern and
|
||||
* either return a compiled Hyperscan database, or an error message
|
||||
* explaining why the pattern didn't compile.
|
||||
*/
|
||||
hs_database_t *database;
|
||||
hs_compile_error_t *compile_err;
|
||||
if (hs_compile(pattern, HS_FLAG_DOTALL, HS_MODE_BLOCK, NULL, &database,
|
||||
&compile_err) != HS_SUCCESS) {
|
||||
fprintf(stderr, "ERROR: Unable to compile pattern \"%s\": %s\n",
|
||||
pattern, compile_err->message);
|
||||
hs_free_compile_error(compile_err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Next, we read the input data file into a buffer. */
|
||||
unsigned int length;
|
||||
char *inputData = readInputData(inputFN, &length);
|
||||
if (!inputData) {
|
||||
hs_free_database(database);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Finally, we issue a call to hs_scan, which will search the input buffer
|
||||
* for the pattern represented in the bytecode. Note that in order to do
|
||||
* this, scratch space needs to be allocated with the hs_alloc_scratch
|
||||
* function. In typical usage, you would reuse this scratch space for many
|
||||
* calls to hs_scan, but as we're only doing one, we'll be allocating it
|
||||
* and deallocating it as soon as our matching is done.
|
||||
*
|
||||
* When matches occur, the specified callback function (eventHandler in
|
||||
* this file) will be called. Note that although it is reminiscent of
|
||||
* asynchronous APIs, Hyperscan operates synchronously: all matches will be
|
||||
* found, and all callbacks issued, *before* hs_scan returns.
|
||||
*
|
||||
* In this example, we provide the input pattern as the context pointer so
|
||||
* that the callback is able to print out the pattern that matched on each
|
||||
* match event.
|
||||
*/
|
||||
hs_scratch_t *scratch = NULL;
|
||||
if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
|
||||
fprintf(stderr, "ERROR: Unable to allocate scratch space. Exiting.\n");
|
||||
free(inputData);
|
||||
hs_free_database(database);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("Scanning %u bytes with Hyperscan\n", length);
|
||||
|
||||
if (hs_scan(database, inputData, length, 0, scratch, eventHandler,
|
||||
pattern) != HS_SUCCESS) {
|
||||
fprintf(stderr, "ERROR: Unable to scan input buffer. Exiting.\n");
|
||||
hs_free_scratch(scratch);
|
||||
free(inputData);
|
||||
hs_free_database(database);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Scanning is complete, any matches have been handled, so now we just
|
||||
* clean up and exit.
|
||||
*/
|
||||
hs_free_scratch(scratch);
|
||||
free(inputData);
|
||||
hs_free_database(database);
|
||||
return 0;
|
||||
}
|
Reference in New Issue
Block a user