hscollider: tool for testing Hyperscan match behaviour against PCRE

This commit is contained in:
Alex Coyte 2017-12-12 09:29:20 +11:00 committed by Xiang Wang
parent fae8d21127
commit 1330265ced
32 changed files with 6960 additions and 0 deletions

62
cmake/pcre.cmake Normal file
View File

@ -0,0 +1,62 @@
# Locate the PCRE library at exactly version ${PCRE_REQUIRED_VERSION}.
#
# Inputs (expected to be set by the including CMakeLists.txt):
#   PCRE_REQUIRED_VERSION / PCRE_REQUIRED_MAJOR_VERSION /
#   PCRE_REQUIRED_MINOR_VERSION - the version that must be matched
#   PCRE_SOURCE (optional)      - path to a PCRE source tree to build
#
# Results:
#   PCRE_INCLUDE_DIRS, PCRE_LDFLAGS - set explicitly when building from source,
#                                     otherwise filled in by pkg_check_modules
#   PCRE_CHECKED                    - set to TRUE in the parent scope

# first look in pcre-$version or pcre subdirs
if (PCRE_SOURCE)
    # either provided on cmdline or we've seen it already
    set (PCRE_BUILD_SOURCE TRUE)
elseif (EXISTS "${PROJECT_SOURCE_DIR}/pcre-${PCRE_REQUIRED_VERSION}")
    set (PCRE_SOURCE "${PROJECT_SOURCE_DIR}/pcre-${PCRE_REQUIRED_VERSION}")
    set (PCRE_BUILD_SOURCE TRUE)
elseif (EXISTS "${PROJECT_SOURCE_DIR}/pcre")
    set (PCRE_SOURCE "${PROJECT_SOURCE_DIR}/pcre")
    set (PCRE_BUILD_SOURCE TRUE)
endif()

if (PCRE_BUILD_SOURCE)
    # A relative PCRE_SOURCE is interpreted relative to the build directory.
    if (NOT IS_ABSOLUTE "${PCRE_SOURCE}")
        set(PCRE_SOURCE "${CMAKE_BINARY_DIR}/${PCRE_SOURCE}")
    endif ()

    if (PCRE_CHECKED)
        set(PCRE_INCLUDE_DIRS ${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre)
        set(PCRE_LDFLAGS -L"${LIBDIR}" -lpcre)

        # already processed this file and set up pcre building
        return()
    endif ()

    # first, check version number
    # CMAKE_REQUIRED_INCLUDES is a ;-separated list, so extend it with
    # list(APPEND). It is only modified around the check itself, so the early
    # return above never leaves it altered for later checks.
    set (saved_INCLUDES "${CMAKE_REQUIRED_INCLUDES}")
    list (APPEND CMAKE_REQUIRED_INCLUDES "${PCRE_SOURCE}")
    # The probe declares main() with an explicit return type: implicit int is
    # rejected by newer C compilers, which would make the version check fail
    # for the wrong reason.
    CHECK_C_SOURCE_COMPILES("#include <pcre.h.generic>
#if PCRE_MAJOR != ${PCRE_REQUIRED_MAJOR_VERSION} || PCRE_MINOR != ${PCRE_REQUIRED_MINOR_VERSION}
#error Incorrect pcre version
#endif
int main(void) { return 0; }" CORRECT_PCRE_VERSION)
    set (CMAKE_REQUIRED_INCLUDES "${saved_INCLUDES}")

    if (NOT CORRECT_PCRE_VERSION)
        # Drop the cached result so that the check is repeated on the next run.
        unset(CORRECT_PCRE_VERSION CACHE)
        message(FATAL_ERROR "Incorrect version of pcre - version ${PCRE_REQUIRED_VERSION} is required")
    else()
        message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION} - building from source.")
    endif()

    # PCRE compile options
    option(PCRE_BUILD_PCRECPP "Build the PCRE C++ library (pcrecpp)" OFF)
    option(PCRE_BUILD_PCREGREP "Build pcregrep" OFF)
    option(PCRE_SHOW_REPORT "Show the final PCRE configuration report" OFF)
    set(PCRE_SUPPORT_UNICODE_PROPERTIES ON CACHE BOOL "Build pcre with unicode")
    add_subdirectory(${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre EXCLUDE_FROM_ALL)
    set(PCRE_INCLUDE_DIRS ${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre)
    set(PCRE_LDFLAGS -L"${LIBDIR}" -lpcre)
else ()
    # pkgconf should save us
    find_package(PkgConfig)
    pkg_check_modules(PCRE libpcre=${PCRE_REQUIRED_VERSION})
    if (PCRE_FOUND)
        message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION}")
    else ()
        message(FATAL_ERROR "PCRE version ${PCRE_REQUIRED_VERSION} not found")
    endif ()
endif ()

set (PCRE_CHECKED TRUE PARENT_SCOPE)

View File

@ -0,0 +1,291 @@
/*
* Copyright (c) 2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BOUNDEDQUEUE_H
#define BOUNDEDQUEUE_H
#include <algorithm>
#include <cassert>
#include <condition_variable>
#include <memory>
#include <mutex>
#include <queue>
#include <type_traits>
#include <vector>
#include <boost/core/noncopyable.hpp>
//#define QUEUE_STATS 1
#ifdef QUEUE_STATS
#include <iostream>
/**
 * Operation counters kept for a single queue. Only compiled when QUEUE_STATS
 * is defined.
 */
class BoundedQueueStats {
public:
    size_t pop = 0; //!< Number of pop operations.
    size_t pop_block = 0; //!< Number of pop operations that had to block.
    size_t push = 0; //!< Number of push operations.
    size_t push_elements = 0; //!< Number of elements pushed.
    size_t push_block = 0; //!< Number of push operations that had to block.
    size_t refill = 0; //!< Number of refills done.
    size_t stolen_from = 0; //!< Number of times we were stolen from.

    /** Write every counter to std::cout, one "label : value" line each. */
    void dump() const {
        emit("pop : ", pop);
        emit("pop_block : ", pop_block);
        emit("push : ", push);
        emit("push_elements : ", push_elements);
        emit("push_block : ", push_block);
        emit("refill : ", refill);
        emit("stolen_from : ", stolen_from);
    }

private:
    // Print one counter line and flush, as std::endl does.
    static void emit(const char *label, size_t value) {
        std::cout << label << value << std::endl;
    }
};
#endif
/**
 * Bounded work queue feeding a fixed number of consumers.
 *
 * Producers push into a single global queue and block while it holds
 * max_elements or more entries. Each consumer owns a private queue which is
 * refilled from the global queue with a run of elements sharing the same
 * `id`; a consumer with nothing to do may steal from another consumer's
 * queue before blocking.
 *
 * A null element is a sentinel: a refill hands at most one sentinel to a
 * consumer and sentinels are never stolen.
 *
 * Requirements on T, as used by refill_and_pop(): a member `id` that can be
 * compared with `!=`.
 *
 * Lock ordering: whenever both are held, the global queue's lock is acquired
 * before a consumer queue's lock.
 */
template<typename T>
class BoundedQueue : boost::noncopyable {
private:
    // Encapsulates a queue and the mutex used to protect access to it.
    class MutexQueue {
    public:
        // Forwarded queue operations.
        void push(std::unique_ptr<T> elem) { q.push(std::move(elem)); }
        void pop() { q.pop(); }
        std::unique_ptr<T> &front() { return q.front(); }
        bool empty() const { return q.empty(); }
        size_t size() const { return q.size(); }

        // Acquire the mutex lock.
        std::unique_lock<std::mutex> lock() {
            return std::unique_lock<std::mutex>(mutex);
        }

#ifdef QUEUE_STATS
        BoundedQueueStats stats;
#endif

    private:
        std::mutex mutex;
        std::queue<std::unique_ptr<T>> q;
    };

public:
    /**
     * \param consumers Number of per-consumer queues; must be non-zero.
     * \param size Capacity of the global queue; must be non-zero.
     */
    BoundedQueue(size_t consumers, size_t size)
        : max_elements(size), consumer_q(consumers) {
        assert(consumers > 0);
        assert(size > 0);
    }

#ifdef QUEUE_STATS
    ~BoundedQueue() {
        std::cout << "Global queue stats:" << std::endl;
        global_q.stats.dump();
        std::cout << std::endl;
        for (size_t i = 0; i < consumer_q.size(); i++) {
            std::cout << "Consumer queue " << i << ":" << std::endl;
            consumer_q[i].stats.dump();
            std::cout << std::endl;
        }
    }
#endif // QUEUE_STATS

    /**
     * Push one element (or a null sentinel) onto the global queue, blocking
     * while the queue is full.
     */
    void push(std::unique_ptr<T> elem) {
        auto lock = global_q.lock();

#ifdef QUEUE_STATS
        global_q.stats.push++;
        global_q.stats.push_elements++;
        if (global_q.size() >= max_elements) {
            global_q.stats.push_block++;
        }
#endif // QUEUE_STATS

        // Block until queue is able to accept new elements.
        cond_can_accept.wait(lock,
                             [&] { return global_q.size() < max_elements; });
        assert(global_q.size() < max_elements);

        global_q.push(std::move(elem));
        cond_can_consume.notify_all();
    }

    /**
     * Push every element in [begin, end) onto the global queue under a single
     * lock acquisition. Blocks until there is room for at least one element
     * and then pushes the whole range, so the queue may temporarily exceed
     * max_elements. The pointed-to unique_ptrs are moved from.
     */
    template<class Iter>
    void push(Iter begin, Iter end) {
        using ElemType = typename std::remove_reference<decltype(*begin)>::type;
        static_assert(std::is_same<ElemType, std::unique_ptr<T>>::value,
                      "Iterator must be over unique_ptr<T>");

        if (begin == end) {
            return;
        }

        auto lock = global_q.lock();

#ifdef QUEUE_STATS
        global_q.stats.push++;
        global_q.stats.push_elements += std::distance(begin, end);
        if (global_q.size() >= max_elements) {
            global_q.stats.push_block++;
        }
#endif // QUEUE_STATS

        // Block until queue is able to accept new elements.
        cond_can_accept.wait(lock,
                             [&] { return global_q.size() < max_elements; });
        assert(global_q.size() < max_elements);

        for (auto it = begin; it != end; ++it) {
            global_q.push(std::move(*it));
        }
        cond_can_consume.notify_all();
    }

    /**
     * Fetch the next element for the given consumer, blocking until one is
     * available. Sources are tried in order: the consumer's own queue, a
     * refill from the global queue, then stealing from another consumer.
     *
     * \param consumer_id Index of the calling consumer; must be less than the
     *                    consumer count given at construction.
     * \return The element, or nullptr if a sentinel was received.
     */
    std::unique_ptr<T> pop(size_t consumer_id) {
        assert(consumer_id < consumer_q.size());
        auto &q = consumer_q[consumer_id];

        // Try and satisfy the request from our per-consumer queue.
        {
            auto consumer_lock = q.lock();
            if (!q.empty()) {
                return pop_from_queue(q);
            }
        }

        // Try and satisfy the request with a refill from the global queue.
        {
            auto lock = global_q.lock();
            if (!global_q.empty()) {
                auto consumer_lock = q.lock();
                return refill_and_pop(q);
            }
        }

        // Try and satisfy the request by stealing it from another queue.
        for (size_t i = 1; i < consumer_q.size(); i++) {
            size_t victim_id = (consumer_id + i) % consumer_q.size();
            auto &victim_q = consumer_q[victim_id];
            auto victim_lock = victim_q.lock();
            // Note: we don't steal sentinel elements.
            if (!victim_q.empty() && victim_q.front() != nullptr) {
#ifdef QUEUE_STATS
                victim_q.stats.stolen_from++;
#endif
                return pop_from_queue(victim_q);
            }
        }

        // All avenues exhausted, we must block until we've received a new
        // element.
        auto lock = global_q.lock();
#ifdef QUEUE_STATS
        global_q.stats.pop_block++;
#endif
        cond_can_consume.wait(lock, [&]{ return !global_q.empty(); });
        assert(!global_q.empty());
        auto consumer_lock = q.lock();
        return refill_and_pop(q);
    }

private:
    // Remove and return the front element of q, which must not be empty. The
    // callers in this class hold q's lock.
    std::unique_ptr<T> pop_from_queue(MutexQueue &q) {
        assert(!q.empty());
        auto elem = std::move(q.front());
        q.pop();
#ifdef QUEUE_STATS
        q.stats.pop++;
#endif
        return elem;
    }

    // Take the front element of the global queue for the caller and move the
    // following run of elements with the same id (plus at most one trailing
    // sentinel) into the consumer queue q. Both call sites hold the global
    // queue's lock and q's lock.
    std::unique_ptr<T> refill_and_pop(MutexQueue &q) {
        assert(!global_q.empty());
#ifdef QUEUE_STATS
        q.stats.refill++;
#endif

        auto elem = pop_from_queue(global_q);
        if (elem == nullptr) {
            // Sentinel. Removing it still frees a slot in the global queue,
            // so wake any producer blocked in push(); otherwise a producer
            // waiting on a queue that was full of sentinels would never be
            // notified.
            if (global_q.size() < max_elements) {
                cond_can_accept.notify_all();
            }
            return elem;
        }

        // Grab all subsequent elements that share the same ID.
        const auto &id = elem->id;
        while (!global_q.empty()) {
            auto &first = global_q.front();
            if (first == nullptr) {
#ifdef QUEUE_STATS
                q.stats.push++;
                q.stats.push_elements++;
#endif
                // Sentinel element. We can grab one, but no more.
                q.push(pop_from_queue(global_q));
                break;
            }
            if (first->id != id) {
                break;
            }
#ifdef QUEUE_STATS
            q.stats.push++;
            q.stats.push_elements++;
#endif
            q.push(pop_from_queue(global_q));
        }

        if (global_q.size() < max_elements) {
            cond_can_accept.notify_all();
        }

        return elem;
    }

    // Maximum number of elements in the global queue (subsequent push
    // operations will block). Note that we may overshoot this value when
    // handling bulk push operations.
    const size_t max_elements;

    // Global queue.
    MutexQueue global_q;

    // Per-consumer queues.
    std::vector<MutexQueue> consumer_q;

    // Condition variable for producers to wait on when the queue is full.
    std::condition_variable cond_can_accept;

    // Condition variable for consumers to wait on when the queue is empty.
    std::condition_variable cond_can_consume;
};
#ifdef QUEUE_STATS
#undef QUEUE_STATS
#endif
#endif // BOUNDEDQUEUE_H

View File

@ -0,0 +1,79 @@
# Build rules for hscollider.

# we have a fixed requirement for PCRE
set(PCRE_REQUIRED_MAJOR_VERSION 8)
set(PCRE_REQUIRED_MINOR_VERSION 41)
set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION})

include (${CMAKE_MODULE_PATH}/pcre.cmake)

include_directories(${PCRE_INCLUDE_DIRS})

include(${CMAKE_MODULE_PATH}/backtrace.cmake)

# we need static libs - too much deep magic for shared libs
if (NOT BUILD_STATIC_LIBS)
    return ()
endif ()

CHECK_FUNCTION_EXISTS(sigaltstack HAVE_SIGALTSTACK)
CHECK_FUNCTION_EXISTS(sigaction HAVE_SIGACTION)
CHECK_FUNCTION_EXISTS(setrlimit HAVE_SETRLIMIT)

# The parser source is generated into the binary directory, so it needs the
# source directory on its include path.
set_source_files_properties(
    ${CMAKE_CURRENT_BINARY_DIR}/ColliderCorporaParser.cpp
    PROPERTIES
        COMPILE_FLAGS "${RAGEL_C_FLAGS} -I${CMAKE_CURRENT_SOURCE_DIR}")

ragelmaker(ColliderCorporaParser.rl)

# only set these after all tests are done
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")

SET(hscollider_SOURCES
    common.h
    BoundedQueue.h
    Corpora.cpp
    FileCorpora.h
    FileCorpora.cpp
    ColliderCorporaParser.h
    ColliderCorporaParser.cpp
    NfaGeneratedCorpora.h
    NfaGeneratedCorpora.cpp
    GraphTruth.h
    GraphTruth.cpp
    GroundTruth.h
    GroundTruth.cpp
    UltimateTruth.h
    UltimateTruth.cpp
    ResultSet.h
    args.cpp
    args.h
    limit.cpp
    pcre_util.cpp
    sig.cpp
    sig.h
    DatabaseProxy.h
    Thread.h
    Thread.cpp
    main.cpp
)

set_source_files_properties(${hscollider_SOURCES} PROPERTIES
    INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR})
add_executable(hscollider ${hscollider_SOURCES})
add_dependencies(hscollider ragel_ColliderCorporaParser)
# Only depend on a "pcre" target when one exists. pcre.cmake adds the PCRE
# source tree as a subdirectory only when building from source (that tree
# presumably defines the target); the pkg-config path creates no target.
if(TARGET pcre)
    add_dependencies(hscollider pcre)
endif()

if(NOT WIN32)
    target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil
        expressionutil corpusomatic crosscompileutil pthread
        "${BACKTRACE_LDFLAGS}")

    if(HAVE_BACKTRACE)
        # Apply the backtrace flags to the actual source files (the variable
        # must be expanded). APPEND_STRING keeps any COMPILE_FLAGS already set
        # on a file, such as the generated parser's flags above.
        set_property(SOURCE ${hscollider_SOURCES} APPEND_STRING PROPERTY
            COMPILE_FLAGS " ${BACKTRACE_CFLAGS}")
    endif()
else() # WIN32
    target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil
        expressionutil corpusomatic crosscompileutil)
endif()

View File

@ -0,0 +1,39 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef FILECORPORAPARSER_H
#define FILECORPORAPARSER_H
#include <string>
struct Corpus;
// Parse one line of a corpora file: the escaped corpus text is decoded into a
// real data buffer stored in c, and the numeric ID that prefixes the line is
// written to id. Returns true if the line parsed successfully.
bool parseCorpus(const std::string &line, Corpus &c, unsigned int &id);
#endif

View File

@ -0,0 +1,150 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "ColliderCorporaParser.h"
#include "Corpora.h"
#include "ue2common.h"
#include <cassert>
#include <cstdlib>
#include <string>
#include <cstdio>
using namespace std;
namespace /* anonymous */ {
// Take a string like '\xFF' and convert it to the character it represents.
// [start, end) must span exactly the four characters of the escape: a
// backslash, an 'x' and two hexadecimal digits. This is only checked by
// assertions.
char unhex(const char *start, UNUSED const char *end) {
    assert(start + 4 == end);
    assert(start[0] == '\\');
    assert(start[1] == 'x');
    // Check both digits. The cast avoids passing a negative char value to
    // isxdigit, which is undefined behaviour.
    assert(isxdigit(static_cast<unsigned char>(start[2])));
    assert(isxdigit(static_cast<unsigned char>(start[3])));

    const char digits[3] = {start[2], start[3], 0};
    // Two hex digits yield 0..255; the narrowing to char is intentional.
    return static_cast<char>(strtol(digits, nullptr, 16));
}
%%{
machine FileCorporaParser;
# Actions used by the scanners defined in parseCorpus(). They refer to names
# declared in that function: num, sout, c, ts and te.

# Fold the current input character (a decimal digit) into the running
# integer num.
action accumulateNum {
num = (num * 10) + (fc - '0');
}
# Decode the \xHH token spanning [ts, te) and append the resulting byte to the
# output buffer.
action handleHexEscaped {
sout.push_back(unhex(ts, te));
}
# Append the control character named by a single-letter escape; ts+1 is the
# character following the backslash. For any other letter nothing is appended
# and fbreak is executed.
action handleSpecial {
switch (*(ts+1)) {
case '0': sout.push_back('\x00'); break;
case 'a': sout.push_back('\x07'); break;
case 'e': sout.push_back('\x1b'); break;
case 'f': sout.push_back('\x0c'); break;
case 'n': sout.push_back('\x0a'); break;
case 'v': sout.push_back('\x0b'); break;
case 'r': sout.push_back('\x0d'); break;
case 't': sout.push_back('\x09'); break;
default: fbreak;
}
}
# Record the number just scanned in the corpus's set of matches.
action handleMatch {
c.matches.insert(num);
}
write data;
}%%
} // namespace
// Parse one corpora line. Two layouts are accepted (see line_old and line_new
// below): an ID, a ':' and the corpus text; or an ID followed by '="', the
// corpus text up to an unescaped '"', then ':' and a list of match numbers.
// The decoded corpus text is appended to c.data, match numbers are added to
// c.matches, c.hasMatches is set for the second layout, and the ID is written
// to id. Returns true only if the machine did not end in its error state and
// the whole line was consumed.
bool parseCorpus(const string &line, Corpus &c, unsigned int &id) {
// Cursor, end-of-input and end-of-file pointers over the line being parsed.
const char *p = line.c_str();
const char *pe = p + line.size();
const char *eof = pe;
// Start and end of the token currently matched by a scanner; the actions
// read these.
const char *ts;
const char *te;
// Current machine state, checked against the error state on return.
int cs;
UNUSED int act;
// For storing integers as they're scanned
unsigned int num = 0;
// Decoded corpus bytes are appended directly to the corpus's data buffer.
string &sout = c.data;
%%{
# Leading decimal ID: num is reset on entry and id is updated from num as
# digits are consumed.
id = ( digit @accumulateNum)+ >{num = 0;} @{id = num;};
# A backslash followed by a non-alphanumeric character.
backslashed = '\\' ^alnum;
# A backslash followed by one of the letters handled by handleSpecial.
specials = '\\' [0aefnvrt];
# A backslash, 'x' and exactly two hexadecimal digits.
hexescaped = '\\x' xdigit{2};
# Scanner for the corpus text in the old format.
corpus_old := |*
hexescaped => handleHexEscaped;
specials => handleSpecial;
backslashed => { sout.push_back(*(ts + 1)); };
any => { sout.push_back(*ts); };
*|;
# Scanner for the corpus text in the new format: as above, except that an
# unescaped double quote ends the text and jumps to colon_sep.
corpus_new := |*
hexescaped => handleHexEscaped;
specials => handleSpecial;
backslashed => { sout.push_back(*(ts + 1)); };
any - '"' => { sout.push_back(*ts); };
'"' => { fgoto colon_sep; };
*|;
# The ':' between the quoted corpus text and the match list.
colon_sep := |*
':' => {fgoto match_list; };
*|;
# Match numbers, each optionally surrounded by spaces and followed by an
# optional comma.
match_list := |*
(' '* (digit @accumulateNum)+ ' '* ','?) >{num = 0;} => handleMatch;
*|;
# Old simple line format
line_old = id ':' @{ fgoto corpus_old; };
# New line format with matches
line_new = id "=\"" @{ c.hasMatches = true; fgoto corpus_new; };
main := ( line_new | line_old );
# Initialize and execute
write init;
write exec;
}%%
// Succeed only if the machine is not in its error state and every character
// of the line was consumed.
return (cs != FileCorporaParser_error) && (p == pe);
}

View File

@ -0,0 +1,31 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "Corpora.h"
// Out-of-line definition of the virtual destructor; it has no work to do.
CorporaSource::~CorporaSource() = default;

View File

@ -0,0 +1,68 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CORPORA_H
#define CORPORA_H
#include <set>
#include <string>
#include <vector>
#include <boost/core/noncopyable.hpp>
// A single corpus (block of scan data), optionally carrying precomputed
// match offsets.
struct Corpus {
    // An empty corpus with no precomputed matches.
    Corpus() = default;

    // A corpus holding a copy of s, with no precomputed matches.
    explicit Corpus(const std::string &s) : data(s) {}

    std::string data; // Corpus itself
    bool hasMatches = false; // Have the matches been pre-calculated?
    std::set<unsigned int> matches; // end-offsets of matches
};
// Error value carrying a human-readable description of why corpora could not
// be produced.
struct CorpusFailure {
    // Stores a copy of the description s.
    explicit CorpusFailure(const std::string &s)
        : message(s) {
    }

    std::string message; // Description of the failure
};
// Abstract class for a corpora source: new ways to load or generate corpora
// can be written by subclassing this class and providing its generate
// method. Instances cannot be copied directly (boost::noncopyable); use
// clone() to duplicate a source.
class CorporaSource : boost::noncopyable {
public:
// Virtual destructor, so that a source can be destroyed through a pointer to
// this base class. Declared here and defined out of line.
virtual ~CorporaSource();
// Make a copy of this corpora source.
// NOTE(review): returns a raw pointer; presumably the caller takes ownership
// - confirm against callers.
virtual CorporaSource *clone() const = 0;
// Generate corpora for the given signature ID, adding them to the
// vector of corpora provided.
virtual void generate(unsigned id, std::vector<Corpus> &data) = 0;
};
#endif // CORPORA_H

View File

@ -0,0 +1,88 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef UE2COLLIDER_DATABASEPROXY_H
#define UE2COLLIDER_DATABASEPROXY_H
#include "UltimateTruth.h"
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <boost/core/noncopyable.hpp>
/**
 * Thrown the first time a compile fails, so that the compilation error can be
 * reported to the user. Later failures do not throw this exception; they
 * simply yield nullptr instead.
 */
struct CompileFailed {
    /** Stores a copy of the compiler's error text. */
    explicit CompileFailed(const std::string &err)
        : error(err) {
    }

    std::string error; //!< Error text describing the failed compile.
};
/**
 * Lazily-compiled handle to a database. The database is either supplied
 * ready-built or compiled from a set of expression IDs on the first call to
 * get(); calls to get() are serialised by an internal mutex.
 */
class DatabaseProxy : boost::noncopyable {
public:
    /** Proxy for a database to be compiled on demand from \p expr_ids. */
    explicit DatabaseProxy(const std::set<unsigned> &expr_ids)
        : ids(expr_ids) {}

    /** Proxy wrapping an already-built database. */
    explicit DatabaseProxy(std::shared_ptr<HyperscanDB> built_db)
        : db(built_db) {}

    /**
     * Return the database, compiling it with \p ultimate if it does not exist
     * yet.
     *
     * \return The database, or nullptr if an earlier call already failed to
     *         compile it.
     * \throws CompileFailed on the first compile failure only.
     */
    std::shared_ptr<HyperscanDB> get(const UltimateTruth &ultimate) {
        std::lock_guard<std::mutex> guard(mutex);

        // We have previously failed to compile this database.
        if (failed) {
            return nullptr;
        }

        if (!db) {
            // Database hasn't been compiled yet.
            std::string compile_error;
            db = ultimate.compile(ids, compile_error);
            if (!db) {
                // Remember the failure so that later calls return nullptr.
                failed = true;
                throw CompileFailed(compile_error);
            }
        }

        return db;
    }

private:
    std::mutex mutex;
    std::shared_ptr<HyperscanDB> db;
    std::set<unsigned> ids;
    bool failed = false; // Database failed compilation.
};
#endif // UE2COLLIDER_DATABASEPROXY_H

View File

@ -0,0 +1,99 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "ColliderCorporaParser.h"
#include "FileCorpora.h"
#include "common.h"
#include "util/expression_path.h"
#include <iostream>
#include <fstream>
#include <boost/algorithm/string/trim.hpp>
using namespace std;
// Returns true if this line should be skipped: it is either empty or a
// comment (its first character is '#').
static
bool emptyLine(const string& line) {
    if (line.empty()) {
        return true;
    }
    return line.front() == '#';
}
// Returns a newly allocated FileCorpora holding a copy of every corpus loaded
// so far.
FileCorpora *FileCorpora::clone() const {
    auto *duplicate = new FileCorpora();
    duplicate->corpora_by_pat = corpora_by_pat;
    return duplicate;
}
bool FileCorpora::readLine(const string &line) {
unsigned id = 0;
Corpus c;
bool rv = parseCorpus(line, c, id);
if (rv) {
corpora_by_pat[id].push_back(c);
return true;
} else {
return false;
}
}
// Loads corpora from the named file, one per line; lines are trimmed, and
// empty lines and comment lines are skipped. Returns false if the file cannot
// be opened, if any line fails to parse (the line number is reported on
// stderr), or if no corpora are held once the file has been read.
bool FileCorpora::readFile(const string &filename) {
    ifstream in(filename.c_str());
    if (!in.good()) {
        return false;
    }

    string line;
    // Line numbers are 1-based, for the benefit of the error message.
    for (unsigned lineNum = 1; getline(in, line); ++lineNum) {
        boost::trim(line);

        if (emptyLine(line)) {
            continue;
        }

        if (readLine(line)) {
            continue;
        }

        cerr << "Error in corpora file parsing line " << lineNum << endl;
        return false;
    }

    return !corpora_by_pat.empty();
}
// Appends copies of every corpus stored for pattern `id` to `data`. Throws
// CorpusFailure if nothing is stored for that pattern.
void FileCorpora::generate(unsigned id,
                           vector<Corpus> &data) {
    const auto found = corpora_by_pat.find(id);
    const bool have_corpora =
        found != corpora_by_pat.end() && !found->second.empty();
    if (!have_corpora) {
        throw CorpusFailure("no corpora found for pattern.");
    }

    const auto &corpora = found->second;
    data.insert(data.end(), corpora.begin(), corpora.end());
}

View File

@ -0,0 +1,57 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef FILECORPORA_H
#define FILECORPORA_H
#include "Corpora.h"
#include <string>
#include <vector>
#include <list>
#include <map>
// Corpora source backed by corpora read from a file, stored per pattern ID.
class FileCorpora : public CorporaSource {
public:
// copy: returns a new FileCorpora
FileCorpora *clone() const override;
// read corpora in from a file; returns a success flag
bool readFile(const std::string &filename);
// generator: supplies the corpora for pattern `id` via `data`
void generate(unsigned id, std::vector<Corpus> &data) override;
private:
// read in a line from our file; returns a success flag
bool readLine(const std::string &line);
// corpora keyed by pattern ID
std::map<unsigned, std::list<Corpus>> corpora_by_pat;
};
#endif

View File

@ -0,0 +1,308 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "GraphTruth.h"
#include "common.h"
#include "expressions.h"
#include "ExpressionParser.h"
#include "ng_find_matches.h"
#include "pcre_util.h"
#include "grey.h"
#include "hs_compile.h"
#include "ue2common.h"
#include "compiler/compiler.h"
#include "nfagraph/ng.h"
#include "nfagraph/ng_depth.h"
#include "nfagraph/ng_dump.h"
#include "nfagraph/ng_fuzzy.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_util.h"
#include "parser/Parser.h"
#include "parser/unsupported.h"
#include "util/compile_context.h"
#include "util/make_unique.h"
#include "util/report_manager.h"
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <ostream>
#include <string>
#include <vector>
using namespace std;
using namespace ue2;
// Struct to store the actual compiled NFA graph.
class CompiledNG : boost::noncopyable {
public:
CompiledNG(unique_ptr<NGHolder> g_in,
unique_ptr<ReportManager> rm_in)
: g(std::move(g_in)), rm(std::move(rm_in)) {}
unique_ptr<ue2::NGHolder> g;
unique_ptr<ue2::ReportManager> rm;
};
// Copy the raw (start, end) pairs in `matches` into `rs`, applying the
// per-expression constraints held in `cngi`.
//
// When a stream offset is in effect, the offset (capped at 100) is removed
// from both ends of every match first; a start that lies before the offset is
// clamped to zero. Matches whose end falls outside [min_offset, max_offset] or
// whose length is below min_length are dropped. Unless SOM was requested, the
// start offset is reported as zero.
static
void populateMatchSet(ResultSet &rs, const set<pair<size_t, size_t>> &matches,
                      const CNGInfo &cngi) {
    for (const auto &match : matches) {
        u64a start = match.first;
        u64a end = match.second;

        if (g_streamOffset) {
            // Subtract stream offset imposed by offset test.
            const u64a shift = min(100ull, g_streamOffset);
            assert(end >= shift);
            start = start > shift ? start - shift : 0;
            end -= shift;
        }

        const bool offset_ok =
            end >= cngi.min_offset && end <= cngi.max_offset;
        const bool length_ok = end - start >= cngi.min_length;
        if (!offset_ok || !length_ok) {
            // this match does not satisfy extparams constraints
            DEBUG_PRINTF("skipping NFA Match @ (%llu,%llu)\n", start, end);
            continue;
        }

        rs.addMatch(cngi.som ? start : 0, end);
    }
}
// Remember which expression this object describes and where to look it up.
// Nothing is compiled here.
CNGInfo::CNGInfo(unsigned id_in, const ExpressionMap &m_expr_in)
    : id(id_in),
      m_expr(m_expr_in) {
}
// Defined out of line, in the translation unit where CompiledNG is a complete
// type: the class holds a unique_ptr<CompiledNG>, and destroying that member
// requires the full definition.
CNGInfo::~CNGInfo() = default;
void CNGInfo::compile() {
auto i = m_expr.find(id);
if (i == m_expr.end()) {
throw NGCompileFailure("ID not found in expression map.");
}
string re;
unsigned hs_flags;
hs_expr_ext ext;
// read the flags for NFA compiler
if (!readExpression(i->second, re, &hs_flags, &ext)) {
throw NGCompileFailure("Cannot parse expression flags.");
}
// make sure we respect collider's UTF-8 setting
if (force_utf8) {
hs_flags |= HS_FLAG_UTF8;
}
try {
bool isStreaming = colliderMode == MODE_STREAMING;
bool isVectored = colliderMode == MODE_VECTORED;
CompileContext cc(isStreaming, isVectored, get_current_target(),
Grey());
ParsedExpression pe(0, re.c_str(), hs_flags, 0, &ext);
// UE-2850: ParsedExpression may have updated the utf8 flag if the
// original expression starts with (*UTF8)
utf8 |= pe.expr.utf8;
auto rm = ue2::make_unique<ReportManager>(cc.grey);
// Expressions containing zero-width assertions and other extended pcre
// types aren't supported yet. This call will throw a ParseError
// exception if the component tree contains such a construct.
checkUnsupported(*pe.component);
pe.component->checkEmbeddedStartAnchor(true);
pe.component->checkEmbeddedEndAnchor(true);
// edit distance may be set globally
if (force_edit_distance) {
pe.expr.edit_distance = edit_distance;
}
// validate_fuzzy_compile checks this, but we don't need to build the
// graph to know it will fail
if (pe.expr.edit_distance && utf8) {
throw NGCompileFailure("UTF-8 patterns cannot be "
"approximately matched");
}
auto built_expr = buildGraph(*rm, cc, pe);
auto &expr = built_expr.expr;
auto &g = built_expr.g;
if (expr.edit_distance || expr.hamm_distance) {
// check if this pattern can be approximately matched, throws
// CompileError on failure
bool hamming = expr.hamm_distance > 0;
u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance;
validate_fuzzy_compile(*g, e_dist, hamming, utf8, cc.grey);
}
if (isVacuous(*g)) {
if (som) {
throw NGUnsupportedFailure("Vacuous patterns are not supported "
"in SOM mode");
}
if (expr.min_length > 0) {
throw NGUnsupportedFailure("Vacuous patterns are not supported "
"in combination with min_length");
}
}
cng = make_unique<CompiledNG>(move(g), move(rm));
} catch (CompileError &e) {
throw NGCompileFailure(e.reason);
} catch (NGUnsupportedFailure &e) {
throw NGCompileFailure(e.msg);
} catch (...) {
throw NGCompileFailure("NFA graph construction failed");
}
}
// Bind the output stream used for echoing matches and the expression map used
// by preprocess(). Both are held by reference.
GraphTruth::GraphTruth(ostream &os, const ExpressionMap &expr)
    : out(os),
      m_expr(expr) {
}
// Gather everything that can be learned about expression `id` without building
// its graph, and return it as a fresh CNGInfo.
//
// The expression's flags and extended parameters are decoded, then the global
// overrides (forced UTF-8, edit distance, SOM, prefilter) are applied.
//
// Throws NGCompileFailure if the ID is unknown or the expression or its flags
// cannot be decoded. Throws NGUnsupportedFailure if extended flags outside the
// supported set are present and `ignoreUnsupported` is false.
unique_ptr<CNGInfo> GraphTruth::preprocess(unsigned id,
                                           bool ignoreUnsupported) {
    const auto entry = m_expr.find(id);
    if (entry == m_expr.end()) {
        throw NGCompileFailure("ID not found in expression map.");
    }

    // read the flags for NFA compiler
    string re;
    unsigned hs_flags;
    hs_expr_ext ext;
    if (!readExpression(entry->second, re, &hs_flags, &ext)) {
        throw NGCompileFailure("Cannot parse expression flags.");
    }

    // read PCRE flags; only the boolean outputs are used below
    unsigned pcre_flags;
    bool highlander = false;
    bool prefilter = false;
    bool som = false;
    if (!getPcreFlags(hs_flags, &pcre_flags, &highlander, &prefilter, &som)) {
        throw NGCompileFailure("Cannot get PCRE flags.");
    }

    // Settings that may be forced globally.
    if (force_utf8) {
        hs_flags |= HS_FLAG_UTF8;
    }
    if (force_edit_distance) {
        ext.edit_distance = edit_distance;
    }
    if (somFlags) {
        som = true;
    }
    if (force_prefilter) {
        prefilter = true;
    }

    const u64a supported_flags =
        HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_EDIT_DISTANCE |
        HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
        HS_EXT_FLAG_MIN_LENGTH;
    const bool has_unsupported = (ext.flags & ~supported_flags) != 0;
    if (has_unsupported && !ignoreUnsupported) {
        throw NGUnsupportedFailure("Unsupported extended flags specified.");
    }

    auto info = make_unique<CNGInfo>(id, m_expr);
    info->utf8 = hs_flags & HS_FLAG_UTF8;
    info->highlander = highlander;
    info->prefilter = prefilter;
    info->som = som;
    info->min_offset = ext.min_offset;
    info->max_offset = ext.max_offset;
    info->min_length = ext.min_length;
    info->max_edit_distance = ext.edit_distance;
    info->max_hamm_distance = ext.hamming_distance;
    return info;
}
bool GraphTruth::run(unsigned, const CompiledNG &cng, const CNGInfo &cngi,
const string &buffer, ResultSet &rs, string &) {
set<pair<size_t, size_t>> matches;
if (g_streamOffset) {
size_t offset = MIN(100, g_streamOffset);
assert(offset > 0);
const string preamble(string(offset, '\0'));
set<pair<size_t, size_t>> pre_matches;
// First, scan an empty buffer size of the preamble so that we can
// discard any matches therein after the real scan, later. We use
// notEod so that end-anchors in our expression don't match at the
// end of the buffer.
if (!findMatches(*cng.g, *cng.rm, preamble, pre_matches,
cngi.max_edit_distance, cngi.max_hamm_distance, true,
cngi.utf8)) {
return false;
}
// Real scan.
if (!findMatches(*cng.g, *cng.rm, preamble + buffer, matches,
cngi.max_edit_distance, cngi.max_hamm_distance, false,
cngi.utf8)) {
return false;
}
// Erase any matches due entirely to the preamble.
for (const auto &m : pre_matches) {
matches.erase(m);
}
} else {
if (!findMatches(*cng.g, *cng.rm, buffer, matches,
cngi.max_edit_distance, cngi.max_hamm_distance, false,
cngi.utf8)) {
return false;
}
}
populateMatchSet(rs, matches, cngi);
if (echo_matches) {
for (const auto &m : rs.matches) {
out << "NFA Match @ (" << m.from << "," << m.to << ")" << endl;
}
}
return true;
}

View File

@ -0,0 +1,144 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef GRAPHTRUTH_H
#define GRAPHTRUTH_H
#include "expressions.h"
#include "ResultSet.h"
#include "hs_compile.h" // for hs_expr_ext
#include "ue2common.h"
#include <memory>
#include <mutex>
#include <string>
#include <boost/core/noncopyable.hpp>
// Forward declarations of types from the ue2 namespace, so that this header
// does not need to include their definitions.
namespace ue2 {
class ReportManager;
struct BoundaryReports;
} // namespace ue2
// Exception carrying a human-readable reason for a failed NFA graph
// compilation. It is a plain struct, not derived from std::exception.
struct NGCompileFailure {
    std::string msg; // description of what went wrong

    explicit NGCompileFailure(const std::string &msg_s)
        : msg(msg_s) {
    }
};
// Exception carrying a human-readable reason why an expression cannot be
// handled by the NFA graph ground truth. It is a plain struct, not derived
// from std::exception.
struct NGUnsupportedFailure {
    std::string msg; // description of the unsupported feature

    explicit NGUnsupportedFailure(const std::string &msg_s)
        : msg(msg_s) {
    }
};
// Class that stores the actual compiled NFA graph. Only forward-declared
// here; this header uses it solely through pointers and references.
class CompiledNG;
// Per-expression record for the NFA-graph ground truth. It holds the
// information known before compilation (extended parameters and mode flags),
// compiles the graph lazily on the first call to get(), and carries a "bad"
// flag. The bad flag and the compiled graph are each guarded by their own
// mutex.
class CNGInfo : boost::noncopyable {
public:
    CNGInfo(unsigned id_in, const ExpressionMap &m_expr_in);
    ~CNGInfo();

    // Read the bad flag under its mutex.
    bool is_bad() {
        std::lock_guard<std::mutex> guard(bad_mutex);
        return bad;
    }

    // Set the bad flag under its mutex; it is never cleared again here.
    void mark_bad() {
        std::lock_guard<std::mutex> guard(bad_mutex);
        bad = true;
    }

    // Return the compiled graph, compiling it first if that has not happened
    // yet. Holds cng_mutex for the whole call. Any failure surfaces as an
    // NGCompileFailure; an NGUnsupportedFailure is converted, keeping its
    // message.
    const CompiledNG *get() {
        std::lock_guard<std::mutex> guard(cng_mutex);
        if (!cng) {
            // NFA graph hasn't been compiled yet.
            try {
                compile();
            } catch (NGCompileFailure &failure) {
                throw NGCompileFailure(failure.msg);
            } catch (NGUnsupportedFailure &unsupported) {
                throw NGCompileFailure(unsupported.msg);
            }
        }
        return cng.get();
    }

    // Extended parameters and distances of the expression.
    u64a min_offset = 0;
    u64a max_offset = 0;
    u64a min_length = 0;
    u32 max_edit_distance = 0;
    u32 max_hamm_distance = 0;

    // Mode flags of the expression.
    bool utf8 = false;
    bool highlander = false;
    bool prefilter = false;
    bool som = false;

private:
    // Builds the graph and stores it in cng.
    void compile();

    // If NFA graph scan failed for some reason, we mark it as bad and skip
    // the remaining tests for it for performance reasons.
    bool bad = false;
    std::mutex bad_mutex; // serialised accesses to bad flag.

    std::unique_ptr<CompiledNG> cng; // compiled NFA graph
    std::mutex cng_mutex; // serialised accesses to NFA graph

    // ID of our expression within the expression map.
    unsigned id;

    // Our expression map
    const ExpressionMap &m_expr;
};
// Produces match results for an expression and a corpus from the NFA graph.
class GraphTruth : boost::noncopyable {
public:
    GraphTruth(std::ostream &os, const ExpressionMap &expr);

    // Collect the pre-compile information for expression `id`.
    std::unique_ptr<CNGInfo> preprocess(unsigned id,
                                        bool ignoreUnsupported = false);

    // Scan `buffer` with the compiled graph and fill `rs` with the matches.
    bool run(unsigned id, const CompiledNG &cng, const CNGInfo &cngi,
             const std::string &buffer, ResultSet &rs, std::string &error);

private:
    // Output stream.
    std::ostream &out;

    // Our expression map
    const ExpressionMap &m_expr;
};
#endif

View File

@ -0,0 +1,513 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "common.h"
#include "ExpressionParser.h"
#include "expressions.h"
#include "GroundTruth.h"
#include "pcre_util.h"
#include "hs_compile.h" // for hs_expr_ext
#include "ue2common.h"
#include "parser/control_verbs.h"
#include "parser/Parser.h"
#include "parser/parse_error.h"
#include "util/make_unique.h"
#include "util/unicode_def.h"
#include "util/unordered.h"
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>
#include <pcre.h>
/* -X, -Y support
* As PCRE performance is non-linear and these options add a large amount of
* scanning, the following shortcuts are used:
* 1: the suffix is not scanned - we are more interested in the matches from
* the original corpora.
* 2: only the last 50 bytes of the prefix are scanned. This may lead to some
* minor correctness issues for a few patterns.
*/
using namespace std;
using namespace ue2;
// We store matches in a hash table as we're likely to see lots of them. These
// are moved into a ResultSet at the end. Each entry is a (start, end) pair of
// offsets as recorded by the callout.
using PcreMatchSet = ue2::ue2_unordered_set<pair<unsigned, unsigned>>;
namespace {

// State shared between a scan and the PCRE callout: the stream used for
// echoing matches and the set of matches collected so far. A pointer to it
// travels through pcre_extra::callout_data.
struct CalloutContext {
    explicit CalloutContext(ostream &os)
        : out(os) {
    }

    ostream &out;
    PcreMatchSet matches;
};

} // namespace
// PCRE callout handler. Records the (start, current position) pair of the
// match attempt in the CalloutContext attached to the callout block, echoing
// it first if match echoing is enabled.
//
// Always returns 1. NOTE(review): per the PCRE callout documentation a
// positive return makes matching fail at this point and backtrack, which is
// presumably how every possible match is enumerated; confirm.
static
int pcreCallOut(pcre_callout_block *block) {
    assert(block);
    assert(block->callout_data);

    auto *context = static_cast<CalloutContext *>(block->callout_data);

    if (echo_matches) {
        context->out << "PCRE Match @ (" << block->start_match << ","
                     << block->current_position << ")" << endl;
    }

    const unsigned int start = block->start_match;
    const unsigned int end = block->current_position;
    assert(start <= end);

    context->matches.insert(pair<unsigned, unsigned>(start, end));
    return 1;
}
// Split a stored expression into its regex and PCRE settings.
//
// On entry `expr` is the full stored expression; once it has been read
// successfully it is replaced by the bare regex. `flags`, `highlander`,
// `prefilter` and `som` receive the values derived by getPcreFlags(), after
// which the global UTF-8 and prefilter overrides are applied. `ext` receives
// the extended parameters.
//
// Returns false if the expression cannot be read or its flags cannot be
// converted; in the first case `expr` is left untouched.
static
bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
                    bool *prefilter, bool *som, hs_expr_ext *ext) {
    unsigned int hs_flags = 0;
    string pattern;
    if (!readExpression(expr, pattern, &hs_flags, ext)) {
        return false;
    }
    expr.swap(pattern);

    if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som)) {
        return false;
    }

    // Globally forced settings win over the per-expression ones.
    if (force_utf8) {
        *flags |= PCRE_UTF8;
    }
    if (force_prefilter) {
        *prefilter = true;
    }
    return true;
}
// Translate a PCRE error code into the name of its constant. Codes that are
// not listed produce "Unknown PCRE error (value: N)".
static
string pcreErrStr(int err) {
// Expands to a case label that returns the spelling of the constant itself.
#define PCRE_ERR_NAME_CASE(code)                                               \
    case code:                                                                 \
        return #code

    switch (err) {
        PCRE_ERR_NAME_CASE(PCRE_ERROR_NOMATCH);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_NULL);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_BADOPTION);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_BADMAGIC);
#if defined(PCRE_ERROR_UNKNOWN_OPCODE)
        PCRE_ERR_NAME_CASE(PCRE_ERROR_UNKNOWN_OPCODE);
#else
        PCRE_ERR_NAME_CASE(PCRE_ERROR_UNKNOWN_NODE);
#endif
        PCRE_ERR_NAME_CASE(PCRE_ERROR_NOMEMORY);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_NOSUBSTRING);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_MATCHLIMIT);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_CALLOUT);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_BADUTF8);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_BADUTF8_OFFSET);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_PARTIAL);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_BADPARTIAL);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_INTERNAL);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_BADCOUNT);
#if defined(PCRE_ERROR_RECURSIONLIMIT)
        PCRE_ERR_NAME_CASE(PCRE_ERROR_RECURSIONLIMIT);
#endif
        PCRE_ERR_NAME_CASE(PCRE_ERROR_DFA_UITEM);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_DFA_UCOND);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_DFA_UMLIMIT);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_DFA_WSSIZE);
        PCRE_ERR_NAME_CASE(PCRE_ERROR_DFA_RECURSE);
    default:
        break;
    }

#undef PCRE_ERR_NAME_CASE

    ostringstream unknown;
    unknown << "Unknown PCRE error (value: " << err << ")";
    return unknown.str();
}
// Bind the output stream and expression map (both by reference) and record
// the PCRE match limits that run() passes on to every scan.
GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
                         unsigned long int limit,
                         unsigned long int limit_recursion)
    : out(os),
      m_expr(expr),
      matchLimit(limit),
      matchLimitRecursion(limit_recursion) {
}
// Install pcreCallOut as PCRE's callout function. pcre_callout is a single
// global hook, so this affects every PCRE scan in the process.
void GroundTruth::global_prep() {
    pcre_callout = pcreCallOut;
}
// Rewrite `re` in place so that the whole pattern is wrapped in a
// non-capturing group followed by a callout: "<verbs>(?:<pattern>\E)(?C)".
//
// Leading control verbs such as "(*UTF8)" or "(*UTF8)(*UCP)" have to stay at
// the very front, so the control-verb mini-parser is used to find where the
// pattern proper begins. If that parser throws a ParseError, the group is
// opened at offset 0.
static
void addCallout(string &re) {
    size_t group_start = 0;
    try {
        const char *begin = re.c_str();
        ue2::ParseMode mode;
        const char *after_verbs =
            ue2::read_control_verbs(begin, begin + re.size(), 0, mode);
        group_start = after_verbs - begin;
    } catch (const ue2::ParseError &) {
        // fall through: keep group_start at zero
    }
    assert(group_start <= re.length());

    re.insert(group_start, "(?:");

    // We include a \E to close any open \Q quoted block. If there isn't
    // one, pcre will ignore the \E.
    re += "\\E)(?C)";
}
// Compile expression `id` with libpcre and return it together with the
// settings decoded from its flags and extended parameters.
//
// Unless `no_callouts` is set, the pattern is first wrapped by addCallout() so
// that matches are reported through the callout.
//
// Throws SoftPcreCompileFailure for cases that are known limitations of PCRE
// rather than errors: a globally forced non-zero edit distance, edit or
// Hamming distance flags, and PCRE compile error codes 20 and 25. Throws
// PcreCompileFailure for every other failure (unknown ID, undecodable flags,
// other unsupported extended flags, compile, study or fullinfo errors).
unique_ptr<CompiledPcre>
GroundTruth::compile(unsigned id, bool no_callouts) {
    bool highlander = false;
    bool prefilter = false;
    bool som = false;

    // we can still match approximate matching patterns with PCRE if edit
    // distance 0 is requested
    if (force_edit_distance && edit_distance) {
        throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
    }

    ExpressionMap::const_iterator i = m_expr.find(id);
    if (i == m_expr.end()) {
        throw PcreCompileFailure("ID not found in expression map.");
    }

    string re(i->second);
    unsigned flags;
    hs_expr_ext ext;

    // Decode the flags
    if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som, &ext)) {
        throw PcreCompileFailure("Unable to decode flags.");
    }

    // filter out flags not supported by PCRE
    u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
                     HS_EXT_FLAG_MIN_LENGTH;
    if (ext.flags & ~supported) {
        // edit distance is a known unsupported flag, so just throw a soft error
        if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) {
            throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
        }
        if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
            throw SoftPcreCompileFailure(
                "Hamming distance not supported by PCRE.");
        }
        throw PcreCompileFailure("Unsupported extended flags.");
    }

    // SOM flags might be set globally.
    som |= !!somFlags;

    // For traditional Hyperscan, add global callout to pattern.
    if (!no_callouts) {
        addCallout(re);
    }

    // Compile the pattern
    const char *errptr = nullptr;
    int errloc = 0;
    int errcode = 0;

    unique_ptr<CompiledPcre> compiled = make_unique<CompiledPcre>();
    compiled->utf8 = flags & PCRE_UTF8;
    compiled->highlander = highlander;
    compiled->prefilter = prefilter;
    compiled->som = som;
    compiled->min_offset = ext.min_offset;
    compiled->max_offset = ext.max_offset;
    compiled->min_length = ext.min_length;
    compiled->expression = i->second; // original PCRE

    flags |= PCRE_NO_AUTO_POSSESS;

    compiled->bytecode =
        pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);

    if (!compiled->bytecode || errptr) {
        assert(errcode);
        ostringstream oss;
        oss << "Failed to compile expression '" << re << '\'';
        oss << " (" << errptr << " at " << errloc << ").";
        // 20: "regular expression is too large"
        // 25: "lookbehind assertion is not fixed length"
        if (errcode == 20 || errcode == 25) {
            throw SoftPcreCompileFailure(oss.str());
        }
        throw PcreCompileFailure(oss.str());
    }

    // Study the pattern. The study block is only needed for the fullinfo call
    // below; it is released with pcre_free_study(), the function PCRE
    // documents for freeing pcre_study() results, rather than plain free().
    shared_ptr<pcre_extra> extra(pcre_study(compiled->bytecode, 0, &errptr),
                                 pcre_free_study);
    if (errptr) {
        ostringstream oss;
        oss << "Error studying pattern (" << errptr << ").";
        throw PcreCompileFailure(oss.str());
    }

    int infoRes =
        pcre_fullinfo(compiled->bytecode, extra.get(), PCRE_INFO_CAPTURECOUNT,
                      &compiled->captureCount);
    if (infoRes < PCRE_ERROR_NOMATCH) {
        ostringstream oss;
        oss << "Error determining number of capturing subpatterns ("
            << pcreErrStr(infoRes) << ").";
        throw PcreCompileFailure(oss.str());
    }

    return compiled;
}
// For every end offset keep only the first match encountered when walking
// rs.matches in order, erasing later matches that end at the same offset.
// NOTE(review): this yields the leftmost start only if MatchResult orders by
// start offset first; confirm against MatchResult's comparison.
static
void filterLeftmostSom(ResultSet &rs) {
    if (rs.matches.size() > 1) {
        set<u64a> ends_seen; // End offsets.
        for (auto it = rs.matches.begin(); it != rs.matches.end();) {
            const bool first_time = ends_seen.insert(it->to).second;
            if (first_time) {
                ++it; // First time we've seen this end-offset.
            } else {
                it = rs.matches.erase(it); // Dupe with a "righter" SOM.
            }
        }
    }
}
// Erase from `rs` every match that violates the extended parameters of
// `compiled`: its end must lie within [min_offset, max_offset] and its length
// must be at least min_length.
static
void filterExtParams(ResultSet &rs, const CompiledPcre &compiled) {
    for (auto it = rs.matches.begin(); it != rs.matches.end();) {
        const unsigned int start = it->from;
        const unsigned int end = it->to;
        const unsigned int length = end - start;

        const bool keep = end >= compiled.min_offset &&
                          end <= compiled.max_offset &&
                          length >= compiled.min_length;
        if (keep) {
            ++it;
        } else {
            it = rs.matches.erase(it);
        }
    }
}
static
int scanBasic(const CompiledPcre &compiled, const string &buffer,
const pcre_extra &extra, vector<int> &ovector,
CalloutContext &ctx) {
const size_t prefix_len = g_corpora_prefix.size();
const size_t suffix_len = g_corpora_suffix.size();
size_t begin_offset = prefix_len - MIN(50, prefix_len);
size_t real_len = buffer.size();
if (suffix_len > 2) {
real_len -= suffix_len - 2;
}
int flags = suffix_len ? PCRE_NOTEOL : 0;
int ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), real_len,
begin_offset, flags, &ovector[0], ovector.size());
if (!g_corpora_prefix.empty()) {
PcreMatchSet tmp;
tmp.swap(ctx.matches);
for (const auto &m : tmp) {
unsigned from = m.first;
unsigned to = m.second;
if (to >= prefix_len && to <= buffer.size() - suffix_len) {
from = from < prefix_len ? 0 : from - prefix_len;
to -= prefix_len;
ctx.matches.insert(make_pair(from, to));
}
}
}
return ret;
}
static
int scanOffset(const CompiledPcre &compiled, const string &buffer,
const pcre_extra &extra, vector<int> &ovector,
CalloutContext &ctx) {
size_t offset = MIN(100, g_streamOffset);
assert(offset > 0);
const string buf(string(offset, '\0') + buffer);
// First, scan our preamble so that we can discard any matches therein
// after the real scan, later. We use PCRE_NOTEOL so that end-anchors in
// our expression don't match at the end of the preamble.
int ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), offset, 0,
PCRE_NOTEOL, &ovector[0], ovector.size());
if (ret < PCRE_ERROR_NOMATCH) {
return ret;
}
PcreMatchSet pre_matches;
pre_matches.swap(ctx.matches);
// Real scan.
ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), buf.size(), 0, 0,
&ovector[0], ovector.size());
if (ret < PCRE_ERROR_NOMATCH) {
return ret;
}
// Erase any matches due entirely to the preamble.
for (const auto &m : pre_matches) {
ctx.matches.erase(m);
}
return ret;
}
bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
const string &buffer, ResultSet &rs, string &error) {
CalloutContext ctx(out);
pcre_extra extra;
extra.flags = 0;
// Switch on callouts.
extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
extra.callout_data = &ctx;
// Set the match_limit (in order to bound execution time on very complex
// patterns)
extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
extra.match_limit = matchLimit;
extra.match_limit_recursion = matchLimitRecursion;
#ifdef PCRE_NO_START_OPTIMIZE
// Switch off optimizations that may result in callouts not occurring.
extra.flags |= PCRE_NO_START_OPTIMIZE;
#endif
// Ensure there's enough room in the ovector for the capture groups in this
// pattern.
int ovecsize = (compiled.captureCount + 1) * 3;
ovector.resize(ovecsize);
int ret;
switch (colliderMode) {
case MODE_BLOCK:
case MODE_STREAMING:
case MODE_VECTORED:
if (g_streamOffset) {
ret = scanOffset(compiled, buffer, extra, ovector, ctx);
} else {
ret = scanBasic(compiled, buffer, extra, ovector, ctx);
}
break;
default:
assert(0);
ret = PCRE_ERROR_NULL;
break;
}
if (ret < PCRE_ERROR_NOMATCH) {
error = pcreErrStr(ret);
return false;
}
// Move matches into a ResultSet.
for (const auto &m : ctx.matches) {
unsigned long long from = m.first;
unsigned long long to = m.second;
if (g_streamOffset) {
// Subtract stream offset imposed by offset test.
unsigned long long offset = min(100ull, g_streamOffset);
assert(to >= offset);
from -= min(offset, from);
to -= offset;
}
rs.addMatch(from, to);
}
// If we have no matches, there's no further work to do.
if (rs.matches.empty()) {
return true;
}
if (compiled.som) {
filterLeftmostSom(rs);
}
filterExtParams(rs, compiled);
// If we haven't been asked for SOM, strip the from offsets.
if (!compiled.som) {
set<MatchResult> endonly;
for (const auto &m : rs.matches) {
endonly.insert(MatchResult(0, m.to));
}
rs.matches.swap(endonly);
}
return true;
}

View File

@ -0,0 +1,126 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef GROUNDTRUTH_H
#define GROUNDTRUTH_H
#include "expressions.h"
#include "ResultSet.h"
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include <pcre.h>
#include <boost/core/noncopyable.hpp>
// Exception thrown by GroundTruth::compile when a pattern cannot be compiled
// with PCRE. Carries a human-readable message; it is a plain struct, not
// derived from std::exception.
struct PcreCompileFailure {
    std::string msg; // description of the failure

    PcreCompileFailure(const std::string &msg_s)
        : msg(msg_s) {
    }
};
// A "soft" PCRE compile failure: one that should not be counted as a
// ue2collider failure (e.g. "regular expression too large"). Because it
// derives from PcreCompileFailure, handlers that want to tell the two apart
// have to catch this type first.
struct SoftPcreCompileFailure : PcreCompileFailure {
    SoftPcreCompileFailure(const std::string &msg_s)
        : PcreCompileFailure(msg_s) {
    }
};
// Class to store everything about a PCRE. Note that the code assumes that
// once populated, the data in this structure will remain constant while tests
// are running, except for the bad flag (which is protected by a mutex).
//
// The object owns `bytecode` and releases it on destruction.
class CompiledPcre : boost::noncopyable {
public:
    CompiledPcre() {}

    ~CompiledPcre() {
        // The bytecode is allocated by PCRE, so hand it back through
        // pcre_free, PCRE's deallocation hook, rather than calling free()
        // directly; the two differ if the hooks have been replaced.
        if (bytecode) {
            pcre_free(bytecode);
        }
    }

    // Read the bad flag under its mutex.
    bool is_bad() {
        std::lock_guard<std::mutex> lock(bad_mutex);
        bool val = bad;
        return val;
    }

    // Set the bad flag under its mutex.
    void mark_bad() {
        std::lock_guard<std::mutex> lock(bad_mutex);
        bad = true;
    }

    // Expression text this object was built from.
    std::string expression;

    // Compiled pattern; null until a compile has succeeded.
    pcre *bytecode = nullptr;

    // Extended parameters; the defaults accept every match.
    unsigned long long min_offset = 0;
    unsigned long long max_offset = ~0ULL;
    unsigned long long min_length = 0;

    // Number of capturing subpatterns, used to size the ovector.
    int captureCount = 0;

    // Mode flags of the expression.
    bool utf8 = false;
    bool highlander = false;
    bool prefilter = false;
    bool som = false;

private:
    // If a PCRE has hit its match recursion limit when scanning a corpus, we
    // mark it as bad and skip the remaining tests for it for performance
    // reasons.
    bool bad = false;
    std::mutex bad_mutex; // serialised accesses to bad flag.
};
// Generates the expected results for an expression and a corpus by running
// them through libpcre.
class GroundTruth : boost::noncopyable {
public:
    GroundTruth(std::ostream &os, const ExpressionMap &expr,
                unsigned long limit, unsigned long limit_recursion);

    // Installs the PCRE callout hook.
    static void global_prep();

    // Compile expression `id` with PCRE.
    std::unique_ptr<CompiledPcre> compile(unsigned id,
                                          bool no_callouts = false);

    // Scan `buffer` with a compiled expression, filling `rs` with the matches
    // or `error` with a description of the failure.
    bool run(unsigned id, const CompiledPcre &compiled,
             const std::string &buffer, ResultSet &rs, std::string &error);

private:
    // Output stream.
    std::ostream &out;

    // Our expression map
    const ExpressionMap &m_expr;

    // PCRE match limit
    const unsigned long int matchLimit;
    const unsigned long int matchLimitRecursion;

    // Persistent ovector used to run tests.
    std::vector<int> ovector;
};
#endif

View File

@ -0,0 +1,146 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "ng_corpus_properties.h"
#include "ng_corpus_generator.h"
#include "NfaGeneratedCorpora.h"
#include "ExpressionParser.h"
#include "grey.h"
#include "hs_compile.h"
#include "compiler/compiler.h"
#include "nfagraph/ng.h"
#include "parser/parse_error.h"
#include "parser/Parser.h"
#include "parser/prefilter.h"
#include "parser/unsupported.h"
#include "util/compile_context.h"
#include "util/compile_error.h"
#include "util/report_manager.h"
#include "util/target_info.h"
#include <string>
#include <sstream>
#include <vector>
using namespace std;
using namespace ue2;
// Store the expression map, the corpus properties and the two forced-mode
// switches; no corpora are generated until generate() is called.
NfaGeneratedCorpora::NfaGeneratedCorpora(const ExpressionMap &expr,
                                         const CorpusProperties &props,
                                         bool force_utf8_mode_in,
                                         bool force_prefilter_mode_in)
    : m_expr(expr),
      m_props(props),
      force_utf8_mode(force_utf8_mode_in),
      force_prefilter_mode(force_prefilter_mode_in) {
}
// Create a new generator configured with the same expression map, properties
// and forced modes. The object is allocated with new and the raw pointer is
// returned.
NfaGeneratedCorpora *NfaGeneratedCorpora::clone() const {
    NfaGeneratedCorpora *copy = new NfaGeneratedCorpora(
        m_expr, m_props, force_utf8_mode, force_prefilter_mode);
    return copy;
}
// Generate corpora for expression `id` and append them to `data`.
//
// The expression is parsed with the forced UTF-8 and prefilter modes applied,
// optionally run through the prefilter transformation, checked for
// unsupported constructs, built into a graph and handed to the corpus
// generator.
//
// Throws CorpusFailure on every failure: unknown ID, unreadable expression,
// parse or compile errors, out of memory, corpus generation errors, or an
// empty result. A CorpusFailure raised inside the try block is passed through
// unchanged so that its message is not replaced by "unknown error.".
void NfaGeneratedCorpora::generate(unsigned id, vector<Corpus> &data) {
    ExpressionMap::const_iterator i = m_expr.find(id);
    if (i == m_expr.end()) {
        throw CorpusFailure("Expression not found.");
    }

    string re;
    u32 hs_flags;
    hs_expr_ext ext;
    if (!readExpression(i->second, re, &hs_flags, &ext)) {
        throw CorpusFailure("Expression could not be read: " + i->second);
    }

    if (force_utf8_mode) {
        hs_flags |= HS_FLAG_UTF8;
    }

    if (force_prefilter_mode) {
        hs_flags |= HS_FLAG_PREFILTER;
    }

    // Wrap the UE2 parser and compiler functionality and use it to generate
    // corpora for us.
    vector<string> c;

    try {
        ParsedExpression pe(0, re.c_str(), hs_flags, 0, &ext);

        // Apply prefiltering transformations if desired.
        if (pe.expr.prefilter) {
            prefilterTree(pe.component, ParseMode(hs_flags));
        }

        // Bail on patterns with unsupported constructs.
        checkUnsupported(*pe.component);
        pe.component->checkEmbeddedStartAnchor(true);
        pe.component->checkEmbeddedEndAnchor(true);

        CompileContext cc(false, false, get_current_target(), Grey());
        ReportManager rm(cc.grey);
        auto built_expr = buildGraph(rm, cc, pe);
        if (!built_expr.g) {
            // A more specific error should probably have been thrown by
            // buildGraph.
            throw CorpusFailure("could not build graph.");
        }

        const auto cg =
            makeCorpusGenerator(*built_expr.g, built_expr.expr, m_props);
        cg->generateCorpus(c);
    } catch (const ParseError &e) {
        throw CorpusFailure("compilation failed, " + e.reason);
    } catch (const CompileError &e) {
        throw CorpusFailure("compilation failed, " + e.reason);
    } catch (const std::bad_alloc &) {
        throw CorpusFailure("out of memory.");
    } catch (const CorpusGenerationFailure &e) {
        // if corpus generation failed, just pass up the error message
        throw CorpusFailure("corpus generation failed: " + e.message);
    } catch (const CorpusFailure &) {
        // Already the exception type callers expect: keep its message rather
        // than letting the catch-all below overwrite it.
        throw;
    } catch (...) {
        throw CorpusFailure("unknown error.");
    }

    if (c.empty()) {
        throw CorpusFailure("no corpora generated.");
    }

    data.reserve(data.size() + c.size());
    for (const auto &e : c) {
        data.push_back(Corpus(e));
    }
}

View File

@ -0,0 +1,61 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef NFAGENERATEDCORPORA_H
#define NFAGENERATEDCORPORA_H
#include "Corpora.h"
#include "ng_corpus_properties.h"
#include "expressions.h"
#include <string>
#include <vector>
// Corpora associated with a pattern set
//
// A CorporaSource whose generate() produces corpora for one expression of
// the map it was constructed with.
class NfaGeneratedCorpora : public CorporaSource {
public:
// Binds to `expr` by reference (it must outlive this object) and copies
// `props`; the two flags are stored for later use by generate().
NfaGeneratedCorpora(const ExpressionMap &expr,
const CorpusProperties &props, bool force_utf8_mode_in,
bool force_prefilter_mode_in);
// Returns a new object constructed from this object's members.
NfaGeneratedCorpora *clone() const override;
// Appends corpora generated for expression `id` to `data`.
void generate(unsigned id, std::vector<Corpus> &data) override;
private:
// Expressions handled by this corpora object
const ExpressionMap &m_expr;
// CorpusProperties policy object
CorpusProperties m_props;
// When set, the matching HS_FLAG_* bit is forced on for every expression.
bool force_utf8_mode;
bool force_prefilter_mode;
};
#endif

View File

@ -0,0 +1,139 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef RESULTSET_H
#define RESULTSET_H
#include <iostream>
#include <map>
#include <set>
#include <utility>
#include <vector>
// A single match, described by its start ("from") and end ("to") offsets.
// Instances order lexicographically: first by `from`, then by `to`.
class MatchResult {
public:
    MatchResult(unsigned long long start, unsigned long long end)
        : from(start), to(end) {}

    // Strict weak ordering on the (from, to) pair.
    bool operator<(const MatchResult &a) const {
        return from < a.from || (from == a.from && to < a.to);
    }

    // Two matches are equal when both offsets agree.
    bool operator==(const MatchResult &a) const {
        return to == a.to && from == a.from;
    }

    unsigned long long from;
    unsigned long long to;
};
// Identifies which engine produced a set of results.
enum ResultSource {
    RESULT_FROM_UE2,
    RESULT_FROM_PCRE,
    RESULT_FROM_GRAPH,
};

// Streams a short name for `src` ("UE2", "PCRE" or "Graph"). A value outside
// the enumerators writes nothing.
inline
std::ostream &operator<<(std::ostream &out, ResultSource src) {
    switch (src) {
    case RESULT_FROM_UE2:
        return out << "UE2";
    case RESULT_FROM_PCRE:
        return out << "PCRE";
    case RESULT_FROM_GRAPH:
        return out << "Graph";
    }
    return out;
}
class ResultSet {
public:
// Constructor.
explicit ResultSet(ResultSource s) : src(s) {}
// Can be constructed with a set of end-offsets.
ResultSet(const std::set<unsigned int> &m, ResultSource s) : src(s) {
for (const auto &offset : m) {
matches.emplace(0, offset);
}
}
// Equality.
bool operator==(const ResultSet &other) const {
return uoom == other.uoom &&
match_after_halt == other.match_after_halt &&
invalid_id == other.invalid_id &&
matches == other.matches;
}
// Inequality.
bool operator!=(const ResultSet &other) const { return !(*this == other); }
// Add a match.
void addMatch(unsigned long long from, unsigned long long to,
int block = 0) {
MatchResult m(from, to);
matches.insert(m);
if (matches_by_block[block].find(m) != matches_by_block[block].end()) {
dupe_matches.insert(m);
} else {
matches_by_block[block].insert(m);
}
}
// Unexpected out of order match seen.
bool uoom = false;
// A match was received after termination was requested.
bool match_after_halt = false;
// A match from an invalid ID was seen.
bool invalid_id = false;
// Ordered set of matches.
std::set<MatchResult> matches;
// Matches grouped by stream write/block that we see them in.
std::map<int, std::set<MatchResult>> matches_by_block;
// Dupe matches that we have seen.
std::set<MatchResult> dupe_matches;
/* Where these results came from (does not take part in comparisions) */
ResultSource src;
};
#endif

View File

@ -0,0 +1,95 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "Thread.h"
#include "common.h"
#include "sig.h"
#include <cstdlib>
#include <iostream>
#include <pthread.h>
// Minimum stack size (8MB) requested for collider worker threads.
static const size_t COLLIDER_THREAD_STACK_SIZE = 8192 * 1024;

// Creates the underlying pthread, which begins executing runThread(this).
//
// Some systems, notably Mac OS X, use a default stack size that is smaller
// than what we want (particularly given that we're planning on running PCRE,
// which recurses inside pcre_exec), so the stack size is raised to
// COLLIDER_THREAD_STACK_SIZE when the default is smaller. Failure to query or
// raise the stack size only produces a warning; failure to initialise the
// attribute object or to create the thread terminates the process.
void Thread::start() {
    pthread_attr_t attr;
    int ret = pthread_attr_init(&attr);
    if (ret) {
        std::cerr << "pthread_attr_init failed" << std::endl;
        exit(1);
    }

    size_t stacksize = 0;
    ret = pthread_attr_getstacksize(&attr, &stacksize);
    if (ret) {
        // Not fatal: carry on and create the thread with default attributes.
        std::cerr << "Warning: can't query stack size with "
                     "pthread_attr_getstacksize" << std::endl;
    } else if (stacksize < COLLIDER_THREAD_STACK_SIZE) {
        ret = pthread_attr_setstacksize(&attr, COLLIDER_THREAD_STACK_SIZE);
        if (ret) {
            std::cerr << "Warning: pthread_attr_setstacksize failed, "
                         "unable to set stack size to "
                      << COLLIDER_THREAD_STACK_SIZE << " bytes." << std::endl;
            // Fall through: this isn't necessarily fatal (yet!)
        }
    }

    ret = pthread_create(&thread, &attr, &runThread, this);

    // The attribute object is only consulted during pthread_create(); release
    // it now so that it is not leaked for every thread started.
    (void)pthread_attr_destroy(&attr);

    if (ret) {
        std::cerr << "pthread_create failed for thread id " << thread_id
                  << std::endl;
        exit(1);
    }
}
// Dispatch
void *Thread::runThread(void *thr) {
if (!no_signal_handler) {
setSignalStack();
}
((Thread *)thr)->run();
return nullptr;
}
// Waits for the underlying pthread to finish; its return value is discarded.
void Thread::join() {
    pthread_join(thread, nullptr);
}

// Only records the identifier; the pthread itself is created by start().
Thread::Thread(size_t num)
    : thread_id(num) {
}

Thread::~Thread() {
}

60
tools/hscollider/Thread.h Normal file
View File

@ -0,0 +1,60 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef UE2COLLIDER_THREAD_H
#define UE2COLLIDER_THREAD_H
#include <cstdlib>
#include <pthread.h>
#include <boost/core/noncopyable.hpp>
// Minimal non-copyable wrapper around a pthread. Subclasses supply the work
// to perform by implementing run().
class Thread : boost::noncopyable {
public:
// Stores `num` as this thread's identifier; no thread is created yet.
explicit Thread(size_t num);
virtual ~Thread();
// Creates the pthread, which executes runThread() with this object.
virtual void start();
// Dispatch
// pthread entry point: `thr` is the Thread object whose run() is invoked.
static void *runThread(void *thr);
// Blocks until the pthread has finished.
virtual void join();
// Implemented by subclasses.
virtual void run() = 0;
protected:
// Identifier passed to the constructor.
const size_t thread_id;
private:
// Handle written by pthread_create() in start().
pthread_t thread;
};
#endif // UE2COLLIDER_THREAD_H

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,142 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef ULTIMATETRUTH_H
#define ULTIMATETRUTH_H
#include "expressions.h"
#include "hs.h"
#include <memory>
#include <ostream>
#include <set>
#include <string>
#include <vector>
#include <boost/core/noncopyable.hpp>
namespace ue2 {
struct Grey;
} // namespace ue2
class HyperscanDB;
class ResultSet;
// Wrapper around ue2 to generate results for an expression and corpus.
//
// NOTE(review): only the declaration is visible here; the per-member notes
// below that go beyond the original comments are inferred from signatures and
// should be confirmed against the implementation.
class UltimateTruth : boost::noncopyable {
public:
UltimateTruth(std::ostream &os, const ExpressionMap &expr,
const hs_platform_info *plat, const ue2::Grey &grey,
unsigned streamBlocks = 0);
~UltimateTruth();
// Compiles the expressions named by `ids`; `error` is an output parameter
// (presumably filled with a reason when compilation fails -- confirm).
std::shared_ptr<HyperscanDB> compile(const std::set<unsigned> &ids,
std::string &error) const;
bool saveDatabase(const HyperscanDB &db,
const std::string &filename) const;
std::shared_ptr<HyperscanDB>
loadDatabase(const std::string &filename,
const std::set<unsigned> &ids) const;
// Are we runnable? (i.e. not xcompiling)
bool runnable() const {
return !m_xcompile;
}
// Scans `buffer` with `db`, reporting into `rs`.
bool run(unsigned id, std::shared_ptr<const HyperscanDB> db,
const std::string &buffer, bool single_pattern, unsigned align,
ResultSet &rs);
// Returns a value completely representing this object's compile options.
unsigned int describe() const;
std::string dbFilename(const std::set<unsigned int> &ids) const;
private:
// One scan routine per mode (block, streaming, vectored); all three share
// the same parameter list.
bool blockScan(const HyperscanDB &db, const std::string &buffer,
size_t align, match_event_handler callback, void *ctx,
ResultSet *rs);
bool streamingScan(const HyperscanDB &db, const std::string &buffer,
size_t align, match_event_handler callback, void *ctx,
ResultSet *rs);
bool vectoredScan(const HyperscanDB &db, const std::string &buffer,
size_t align, match_event_handler callback, void *ctx,
ResultSet *rs);
char *setupScanBuffer(const char *buf, size_t len, size_t align);
char *setupVecScanBuffer(const char *buf, size_t len, size_t align,
unsigned int block_id);
bool allocScratch(std::shared_ptr<const HyperscanDB> db);
bool cloneScratch(void);
std::string dbSettingsHash(const std::set<unsigned int> &ids) const;
// Held by reference: the Grey object passed to the constructor must outlive
// this object.
const ue2::Grey &grey;
// Output stream.
std::ostream &out;
// Our expression map
const ExpressionMap &m_expr;
// Are we cross-compiling, and therefore unable to scan at all?
bool m_xcompile;
// Our mode flags to pass into the compiler: calculated from streaming,
// etc.
unsigned m_mode;
// In streaming mode, what is the number of blocks to chop data into?
unsigned m_streamBlocks;
// Scratch space for Hyperscan.
hs_scratch_t *scratch;
// Temporary scan buffer used for realigned scanning
std::vector<char> m_scanBuf;
std::vector<std::vector<char> > raw_blocks; /* temp scan buffers used by
* vectored mode */
// Last database we successfully allocated scratch for, so that we can
// avoid unnecessarily reallocating for it.
std::shared_ptr<const HyperscanDB> last_db;
const hs_platform_info *platform;
};
#endif

570
tools/hscollider/args.cpp Normal file
View File

@ -0,0 +1,570 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "ng_corpus_properties.h"
#include "args.h"
#include "common.h"
#include "cross_compile.h"
#include "util/expression_path.h"
#include "util/string_util.h"
#include "grey.h"
#include "ue2common.h"
#include "hs_compile.h" // for HS_MODE_*
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>
#include <getopt.h>
#define xstr(s) str(s)
#define str(s) #s
using namespace ue2;
using namespace std;
// Prints the command-line help text to stdout.
//
// name:  program name substituted into the "Usage:" line (callers pass
//        argv[0]).
// error: optional message; when non-null it is printed after the help text
//        on a line prefixed with "Error: ".
//
// This function only prints; callers decide whether to exit afterwards.
static
void usage(const char *name, const char *error) {
    printf("Usage: %s [OPTIONS...]\n\n", name);
    printf("General Options:\n\n");
    printf(" -h Display help and exit.\n");
    printf(" -G OVERRIDES Overrides for the grey box.\n");
    printf(" -e PATH Path to expression directory or file.\n");
    printf(" -s FILE Signature file to use.\n");
    printf(" -z NUM Signature ID to use.\n");
    printf(" -c FILE Load corpora from FILE rather than using "
           "generator.\n");
    printf(" -w FILE After running, save corpora (with matches) to "
           "FILE.\n");
    printf(" -a [BAND] Compile all expressions in UE2 (but still match "
           "singly).\n");
    printf(" If BAND, compile patterns in groups of size "
           "BAND.\n");
    printf(" -t NUM Use streaming mode, split data into ~NUM "
           "blocks.\n");
    printf(" -V NUM Use vectored mode, split data into ~NUM "
           "blocks.\n");
    printf(" -Z {R or 0-%d} Only test one alignment, either as given or "
           "'R' for random.\n", MAX_MAX_UE2_ALIGN - 1);
    printf(" -q Quiet; display only match differences, no other "
           "failures.\n");
    printf(" -v Verbose; display successes as well as "
           "failures.\n");
    printf("\n");
    printf("Pattern flags:\n");
    printf("\n");
    printf(" -8 Force UTF8 mode on all patterns.\n");
    printf(" -L Apply HS_FLAG_SOM_LEFTMOST to all patterns.\n");
    printf(" -E DISTANCE Match all patterns within edit distance"
           " DISTANCE.\n");
    printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n");
    printf("\n");
    printf("Testing mode options:\n");
    printf("\n");
    printf(" -d NUM Set SOM precision mode (default: 8 (large)).\n");
    printf(" -O NUM In streaming mode, set initial offset to NUM.\n");
    printf(" -k NUM Terminate callback after NUM matches per "
           "pattern.\n");
    printf(" --copy-scratch Copy scratch after each scan call.\n");
    printf(" --copy-stream Copy stream state after each scan call.\n");
    printf(" --compress-expand Compress and expand stream state after each "
           "scan call.\n");
    printf(" --compress-reset-expand Compress, reset and expand stream state "
           "after each scan call.\n");
    printf(" --mangle-scratch Mangle scratch space after each scan call.\n");
    printf(" --no-nfa Disable NFA graph execution engine.\n");
    printf(" --no-pcre Disable PCRE engine.\n");
    printf(" --test-nfa Disable UE2 engine (test NFA against PCRE).\n");
    printf(" --abort-on-fail Abort, rather than exit, on failure.\n");
    // Help-text fix: the original read "Do not handle handle signals".
    printf(" --no-signal-handler Do not handle signals (to generate "
           "backtraces).\n");
    printf("\n");
    printf("Memory and resource control options:\n");
    printf("\n");
    printf(" -T NUM Run with NUM threads.\n");
    printf(" -M NUM Set maximum memory allocated to NUM megabytes per"
           " thread.\n");
    printf(" (0 means no limit, default is 1000 MB).\n");
    printf(" -m NUM Set PCRE_MATCH_LIMIT (default: %lu).\n",
           DEFAULT_PCRE_MATCH_LIMIT);
    printf(" -r NUM Set PCRE_MATCH_LIMIT_RECURSION (default: %lu).\n",
           DEFAULT_PCRE_MATCH_RECURSION_LIMIT);
    printf("\n");
    printf("Cross-compiling:\n");
    printf("\n");
    printf(" -x NAME Cross-compile for arch NAME.\n");
    printf(" -i DIR Don't compile, load from files in DIR "
           "instead.\n");
    printf(" -o DIR After compiling, save to files in DIR.\n");
    printf("\n");
    printf("Corpus generation options:\n");
    printf("\n");
    printf(" -n NUM Max corpora to generate for a given signature "
           "(default: %u).\n", DEFAULT_CORPUS_GENERATOR_LIMIT);
    printf(" -R NUM Random seed to use (default: seeded from "
           "time()).\n");
    printf(" -p NUM,NUM,NUM Percentage probabilities of "
           "(match,unmatch,random) char.\n");
    printf(" -C NUM,NUM Follow cycles (min,max) times.\n");
    printf(" -P NUM,NUM Add a random prefix of length between "
           "(min,max).\n");
    printf(" -S NUM,NUM Add a random suffix of length between "
           "(min,max).\n");
    printf(" -D NUM Apply an edit distance (default: 0) to each "
           "corpus.\n");
    printf(" -b NUM Limit alphabet to NUM characters, starting at "
           "lower-case 'a'.\n");
    printf("\n");

    if (error) {
        printf("Error: %s\n", error);
    }
}
// Parses the hscollider command line and stores the result in the tool's
// global settings and in the output parameters.
//
// argc, argv:      the program's arguments, handed to getopt_long().
// corpus_gen_prop: receives the corpus generator settings (-b, -C, -D, -n,
//                  -p, -P, -R, -S).
// corpora:         receives the file names that follow -c.
// grey:            receives the -G overrides; the option is compiled out
//                  when RELEASE_BUILD is defined.
// plat_out:        set from -x (cross-compile target).
//
// Any invalid or inconsistent argument prints the usage text with an error
// message and terminates the process with exit status 1; -h prints the usage
// text and exits with status 0.
//
// The option string starts with '-', which makes getopt_long() hand back
// non-option arguments as option code 1. That is how the optional BAND after
// -a and the file names after -c are collected: `in_multi` / `in_corpora` are
// set to 2 by the option and decremented after every argument, so they are
// still non-zero only for the argument immediately following.
void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
                 vector<string> *corpora, UNUSED Grey *grey,
                 unique_ptr<hs_platform_info> *plat_out) {
    static const char options[]
        = "-ab:cC:d:D:e:E:G:hi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8";
    s32 in_multi = 0;
    s32 in_corpora = 0;
    int pcreFlag = 1;
    int nfaFlag = 1;
    int ue2Flag = 1;
    int copyScratch = 0;
    int copyStream = 0;
    int mangleScratch = 0;
    int compressFlag = 0;
    int compressResetFlag = 0;

    // Deliberately not static: the table stores the addresses of the
    // automatic variables above, which would dangle on a second call if the
    // table were initialised only once.
    const struct option longopts[] = {
        {"copy-scratch", 0, &copyScratch, 1},
        {"copy-stream", 0, &copyStream, 1},
        {"mangle-scratch", 0, &mangleScratch, 1},
        {"prefilter", 0, &force_prefilter, 1},
        {"no-pcre", 0, &pcreFlag, 0},
        {"no-nfa", 0, &nfaFlag, 0},
        {"test-nfa", 0, &ue2Flag, 0},
        {"abort-on-fail", 0, &abort_on_failure, 1},
        {"no-signal-handler", 0, &no_signal_handler, 1},
        {"compress-expand", 0, &compressFlag, 1},
        {"compress-reset-expand", 0, &compressResetFlag, 1},
        {nullptr, 0, nullptr, 0}};

    for (;;) {
        int c = getopt_long(argc, argv, options, longopts, nullptr);
        if (c < 0) {
            break;
        }

        switch (c) {
        case 'a':
            g_ue2CompileAll = true;
            in_multi = 2;
            break;
        case 'b': {
            unsigned sz;
            if (!fromString(optarg, sz) || sz > 256) {
                usage(argv[0], "Must provide an integer argument <= 256 "
                               "to '-b' flag");
                exit(1);
            }
            corpus_gen_prop.alphabetSize = sz;
            break;
        }
        case 'c':
            in_corpora = 2;
            break;
        case 'C': {
            vector<unsigned> nums;
            if (!strToList(optarg, nums) || nums.size() != 2
                || nums[0] > nums[1]) {
                usage(argv[0], "Cycle limit '-C' argument takes a list of two"
                               " integers: MIN,MAX");
                exit(1);
            }
            corpus_gen_prop.setCycleLimit(nums[0], nums[1]);
            break;
        }
        case 'd': {
            unsigned dist;
            if (!fromString(optarg, dist)) {
                usage(argv[0],
                      "Must provide an integer argument to '-d' flag");
                exit(1);
            }
            switch (dist) {
            case 2:
                somPrecisionMode = HS_MODE_SOM_HORIZON_SMALL;
                break;
            case 4:
                somPrecisionMode = HS_MODE_SOM_HORIZON_MEDIUM;
                break;
            case 8:
                somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
                break;
            default:
                usage(argv[0], "SOM precision must be 2, 4 or 8");
                exit(1);
            }
            break;
        }
        case 'D': {
            unsigned dist;
            if (!fromString(optarg, dist)) {
                usage(argv[0],
                      "Must provide an integer argument to '-D' flag");
                exit(1);
            }
            corpus_gen_prop.editDistance = dist;
            break;
        }
        case 'e':
            g_exprPath.assign(optarg);
            break;
        case 'E': {
            u32 dist;
            if (!fromString(optarg, dist)) {
                usage(argv[0], "Argument to '-E' flag must be an integer");
                exit(1);
            }
            force_edit_distance = true;
            edit_distance = dist;
            break;
        }
#ifndef RELEASE_BUILD
        case 'G':
            applyGreyOverrides(grey, string(optarg));
            break;
#endif
        case 'h':
            usage(argv[0], nullptr);
            exit(0);
        case 'i':
            loadDatabases = true;
            serializePath = optarg;
            break;
        case 'k':
            if (!fromString(optarg, limit_matches) || limit_matches < 1) {
                usage(argv[0],
                      "Must provide a positive integer argument to '-k' "
                      "flag");
                exit(1);
            }
            break;
        case 'L':
            somFlags = HS_FLAG_SOM_LEFTMOST;
            break;
        case 'm':
            if (!fromString(optarg, g_matchLimit) || g_matchLimit < 1) {
                usage(argv[0],
                      "Must provide a positive integer argument to '-m' "
                      "flag");
                exit(1);
            }
            break;
        case 'M':
            if (!fromString(optarg, g_memoryLimit)) {
                usage(argv[0],
                      "Must provide a positive (or zero) integer argument "
                      "to '-M' flag");
                exit(1);
            }
            break;
        case 'n': {
            unsigned int count;
            if (!fromString(optarg, count)) {
                usage(argv[0], "Argument to '-n' flag must be an integer");
                exit(1);
            }
            corpus_gen_prop.corpusLimit = count;
            break;
        }
        case 'o':
            saveDatabases = true;
            serializePath = optarg;
            break;
        case 'O':
            if (!fromString(optarg, g_streamOffset)) {
                usage(argv[0],
                      "Argument '-O' flag must be a positive integer");
                exit(1);
            }
            break;
        case 'p': {
            vector<unsigned> prob;
            if (!strToList(optarg, prob) || prob.size() != 3) {
                usage(argv[0], "Probabilities '-p' argument takes a list "
                               "of three integers: MATCH,UNMATCH,RANDOM");
                exit(1);
            }
            if (!corpus_gen_prop.setPercentages(prob[0], prob[1],
                                                prob[2])) {
                usage(argv[0],
                      "Unable to set corpus generator probabilities.");
                exit(1);
            }
            break;
        }
        case 'P': {
            vector<unsigned> nums;
            if (!strToList(optarg, nums) || nums.size() != 2
                || nums[0] > nums[1]) {
                usage(argv[0], "Prefix '-P' argument takes a list of two"
                               " integers: MIN,MAX");
                exit(1);
            }
            corpus_gen_prop.prefixRange = min_max(nums[0], nums[1]);
            break;
        }
        case 'q':
            g_quiet++;
            break;
        case 'r':
            if (!fromString(optarg, g_matchLimitRecursion)
                || g_matchLimitRecursion < 1) {
                usage(argv[0], "Must provide a positive integer argument "
                               "to '-r' flag");
                exit(1);
            }
            break;
        case 'R': {
            if (!fromString(optarg, randomSeed)) {
                usage(argv[0], "Argument to '-R' flag must be an integer");
                exit(1);
            }
            corpus_gen_prop.seed(randomSeed);
            break;
        }
        case 's':
            g_signatureFiles.push_back(optarg);
            break;
        case 'S': {
            vector<unsigned> nums;
            if (!strToList(optarg, nums) || nums.size() != 2 ||
                nums[0] > nums[1]) {
                usage(argv[0], "Suffix '-S' argument takes a list of two"
                               " integers: MIN,MAX");
                exit(1);
            }
            corpus_gen_prop.suffixRange = min_max(nums[0], nums[1]);
            break;
        }
        case 't':
            if (colliderMode != MODE_BLOCK) {
                usage(argv[0], "You can only use one mode at a time!");
                exit(1);
            }
            colliderMode = MODE_STREAMING;
            if (!fromString(optarg, g_streamBlocks) || g_streamBlocks < 1) {
                usage(argv[0], "Must provide a positive integer argument "
                               "to '-t' flag");
                exit(1);
            }
            break;
        case 'T':
            if (!fromString(optarg, numThreads) || numThreads < 1) {
                usage(argv[0], "Must provide a positive integer argument "
                               "to '-T' flag");
                exit(1);
            }
            break;
        case 'v':
            // A second -v additionally turns on echoing of matches.
            if (g_verbose) {
                echo_matches = true;
            }
            g_verbose = true;
            break;
        case 'V':
            if (colliderMode != MODE_BLOCK) {
                usage(argv[0], "You can only use one mode at a time!");
                exit(1);
            }
            colliderMode = MODE_VECTORED;
            if (!fromString(optarg, g_streamBlocks) || g_streamBlocks < 1) {
                usage(argv[0], "Must provide a positive integer argument "
                               "to '-V' flag");
                exit(1);
            }
            break;
        case 'w':
            saveCorpora = true;
            saveCorporaFile = optarg;
            break;
        case 'x':
            *plat_out = xcompileReadMode(optarg);
            if (!*plat_out) {
                usage(argv[0], xcompileUsage().c_str());
                exit(1);
            }
            break;
        case 'X': {
            u32 count;
            if (!fromString(optarg, count)) {
                usage(argv[0], "Argument to '-X' flag must be an integer");
                exit(1);
            }
            g_corpora_prefix.insert(g_corpora_prefix.end(), count, '~');
            break;
        }
        case 'Y': {
            u32 count;
            if (!fromString(optarg, count)) {
                usage(argv[0], "Argument to '-Y' flag must be an integer");
                exit(1);
            }
            g_corpora_suffix.insert(g_corpora_suffix.end(), count, '~');
            break;
        }
        case 'z':
            if (!strToList(optarg, g_signatures)) {
                usage(argv[0],
                      "Argument to '-z' flag must be a list of integers");
                exit(1);
            }
            break;
        case 'Z': {
            static constexpr unsigned ALIGN_LIMIT = MAX_MAX_UE2_ALIGN - 1;
            if (optarg == string("R")) {
                // Random min alignment selected.
                use_random_alignment = true;
                break;
            } else if (!fromString(optarg, min_ue2_align)
                       || min_ue2_align > ALIGN_LIMIT) {
                usage(argv[0], "Argument must be 'R' or numeric < "
                               xstr(MAX_MAX_UE2_ALIGN) " to '-Z'");
                exit(1);
            }
            // Restrict testing to exactly the requested alignment.
            max_ue2_align = min_ue2_align + 1;
            break;
        }
        case '8':
            force_utf8 = true;
            break;
        case 1:
            // Non-option argument: either the BAND following -a or a corpus
            // file following -c.
            if (in_multi) {
                if (!fromString(optarg, multicompile_bands)) {
                    usage(argv[0],
                          "Argument to '-a' flag must be an integer");
                    exit(1);
                }
                break;
            } else if (in_corpora) {
                corpora->push_back(optarg);
                // Keep accepting further corpus files.
                in_corpora = 2;
                break;
            }
            // Any other stray non-option argument is ignored.
            // fallthrough
        case 0:
            // A long option that only set its flag variable.
            break;
        default:
            usage(argv[0], "Unrecognised command line argument.");
            exit(1);
        }

        in_multi = MAX(0, in_multi - 1);
        in_corpora = MAX(0, in_corpora - 1);
    }

    if (g_streamOffset && !g_streamBlocks) {
        usage(argv[0], "stream offset requires streams");
        exit(1);
    }

    if (g_exprPath.empty() && !g_signatureFiles.empty()) {
        /* attempt to infer an expression directory */
        for (const auto &fname : g_signatureFiles) {
            string exprPath = inferExpressionPath(fname);
            if (!g_exprPath.empty() && exprPath != g_exprPath) {
                usage(argv[0], "Only one expression path is allowed.");
                // Like every other usage error this is fatal; previously
                // parsing carried on with the last inferred path.
                exit(1);
            }
            g_exprPath.assign(exprPath);
        }
    }

    // Must have a valid expression path
    if (g_exprPath.empty()) {
        usage(argv[0], "Must specify an expression path with the -e option.");
        exit(1);
    }

    // If we've been handed an expr file and no restrictions, use 'em all!
    if (!isDir(g_exprPath) && isFile(g_exprPath) && g_signatureFiles.empty()
        && g_signatures.empty()) {
        g_allSignatures = true;
    }

    // Must have a valid signature file
    if (g_signatureFiles.empty() && g_signatures.empty() && !g_allSignatures) {
        usage(argv[0], "Must specify a signature file with the -s option.");
        exit(1);
    }

    // Cannot ask for both loading and saving
    if (loadDatabases && saveDatabases) {
        usage(argv[0], "You cannot both load and save databases.");
        exit(1);
    }

    // Cannot ask for cross-compile and loading
    if (loadDatabases && *plat_out) {
        usage(argv[0], "You cannot both load and xcompile of databases.");
        exit(1);
    }

    // need at least two pattern engines active
    if (nfaFlag + pcreFlag + ue2Flag < 2) {
        usage(argv[0], "At least two pattern engines should be active.");
        exit(1);
    }

    if (copyStream && !g_streamBlocks) {
        usage(argv[0], "Copying streams only makes sense in streaming mode.");
        exit(1);
    }

    if (compressFlag && compressResetFlag) {
        usage(argv[0],
              "Only use one of --compress-expand and --compress-reset-expand.");
        exit(1);
    }

    // set booleans appropriately
    use_NFA = (bool) nfaFlag;
    use_PCRE = (bool) pcreFlag;
    use_UE2 = (bool) ue2Flag;
    use_copy_scratch = (bool) copyScratch;
    use_copy_stream = (bool) copyStream;
    use_mangle_scratch = (bool) mangleScratch;
    use_compress_expand = (bool)compressFlag;
    use_compress_reset_expand = (bool)compressResetFlag;
}

46
tools/hscollider/args.h Normal file
View File

@ -0,0 +1,46 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef ARGS_H
#define ARGS_H
#include <memory>
#include <string>
#include <vector>
// Forward declarations: this header only uses these types through pointers,
// references and std::unique_ptr, so their full definitions are not required
// here.
namespace ue2 {
struct Grey;
}
struct hs_platform_info;
class CorpusProperties;
/**
 * Parses the command line (argc/argv) for the collider tool.
 *
 * NOTE(review): the definition is not visible from this header; judging by
 * the parameter names, corpus_gen_prop, corpora, grey and plat_out are
 * outputs filled in from the parsed options -- confirm against args.cpp.
 */
void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
std::vector<std::string> *corpora, ue2::Grey *grey,
std::unique_ptr<hs_platform_info> *plat_out);
#endif

92
tools/hscollider/common.h Normal file
View File

@ -0,0 +1,92 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef COMMON_H
#define COMMON_H
#include <cstddef>
#include <string>
#include <vector>
// Scanning mode selector; the active value is stored in colliderMode below.
enum ColliderMode {
MODE_BLOCK,
MODE_STREAMING,
MODE_VECTORED
};
// Global tool options. These are declarations only; the variables are defined
// in another translation unit.
// NOTE(review): most of them look like they are populated by command-line
// processing -- confirm against args.cpp / main.cpp.
extern unsigned numThreads;
extern enum ColliderMode colliderMode;
extern unsigned int somFlags;
extern bool loadDatabases;
extern bool saveDatabases;
extern bool saveCorpora;
extern std::string saveCorporaFile;
extern std::string serializePath;
extern bool echo_matches;
extern int g_quiet;
extern bool g_verbose;
// Expression path and signature selection.
extern std::string g_exprPath;
extern std::vector<std::string> g_signatureFiles;
extern bool g_allSignatures;
extern bool g_ue2CompileAll;
extern unsigned g_streamBlocks;
extern unsigned long long g_streamOffset;
extern std::string g_corpora_prefix;
extern std::string g_corpora_suffix;
extern unsigned multicompile_bands;
extern std::string g_corporaFile;
extern std::vector<unsigned> g_signatures;
extern unsigned long int g_matchLimit;
extern unsigned long int g_matchLimitRecursion;
extern unsigned min_ue2_align;
extern unsigned max_ue2_align;
extern size_t g_memoryLimit;
extern bool force_utf8;
extern int force_prefilter;
extern unsigned somPrecisionMode;
extern unsigned limit_matches;
extern unsigned randomSeed;
extern bool use_random_alignment;
// Engine and scan-variation switches.
extern bool use_PCRE;
extern bool use_NFA;
extern bool use_UE2;
extern bool use_copy_scratch;
extern bool use_copy_stream;
extern bool use_mangle_scratch;
extern bool use_compress_expand;
extern bool use_compress_reset_expand;
extern int abort_on_failure;
extern int no_signal_handler;
extern bool force_edit_distance;
extern unsigned edit_distance;
// Constants
// Being `static const` in a header, each including translation unit gets its
// own copy of these two values.
static const unsigned long int DEFAULT_PCRE_MATCH_LIMIT = 10*1000*1000;
static const unsigned long int DEFAULT_PCRE_MATCH_RECURSION_LIMIT = 10000;
// NOTE(review): presumably the upper bound accepted for max_ue2_align --
// confirm against the option parser.
#define MAX_MAX_UE2_ALIGN 64
#endif

View File

@ -0,0 +1,63 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "limit.h"
#include <cstdlib>
#if defined(HAVE_SETRLIMIT)
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <sys/resource.h>
/**
 * Restricts this process to roughly `mbytes` mebibytes of memory by setting
 * both the soft and hard RLIMIT_DATA and RLIMIT_AS limits.
 *
 * A failing setrlimit() call is reported on stderr and otherwise ignored, so
 * the limit is applied on a best-effort basis. Because the hard limit is
 * set as well, an unprivileged process cannot raise the limit again later.
 */
void setMemoryLimit(size_t mbytes) {
    const size_t bytes = mbytes * 1024 * 1024;

    struct rlimit r;
    r.rlim_cur = bytes;
    r.rlim_max = bytes;

    if (setrlimit(RLIMIT_DATA, &r) != 0) {
        // Snapshot errno before any stream output gets a chance to change it.
        const int err = errno;
        std::cerr << "setrlimit(RLIMIT_DATA, ...) failed: " <<
            strerror(err) << std::endl;
    }

    if (setrlimit(RLIMIT_AS, &r) != 0) {
        const int err = errno;
        std::cerr << "setrlimit(RLIMIT_AS, ...) failed: " <<
            strerror(err) << std::endl;
    }
}
#else // no setrlimit
// Fallback used when HAVE_SETRLIMIT is not defined: the request is accepted
// and silently ignored, so no memory limit is applied.
void setMemoryLimit(size_t) {}
#endif

36
tools/hscollider/limit.h Normal file
View File

@ -0,0 +1,36 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef LIMIT_H
#define LIMIT_H
#include <cstddef>
/**
 * Limits the memory available to the current process to `mbytes` megabytes
 * (mbytes * 1024 * 1024 bytes).
 *
 * Where setrlimit() is available this sets RLIMIT_DATA and RLIMIT_AS, and
 * failures are reported on stderr; on other platforms the call does nothing.
 */
void setMemoryLimit(size_t mbytes);
#endif // LIMIT_H

2002
tools/hscollider/main.cpp Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,90 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "pcre_util.h"
#include "hs.h"
#include <assert.h>
#include <pcre.h> /* for pcre flags */
/**
 * Splits a Hyperscan flag mask into the equivalent PCRE option bits plus
 * three booleans for flags that PCRE has no option for.
 *
 * All four output pointers must be non-null. Returns true when every bit in
 * hs_flags was recognised; otherwise asserts (in debug builds) and returns
 * false. The outputs are always written.
 */
bool getPcreFlags(unsigned int hs_flags, unsigned int *flags,
                  bool *highlander, bool *prefilter, bool *som) {
    assert(flags);
    assert(highlander);
    assert(prefilter);
    assert(som);

    // Hyperscan flags that correspond one-to-one to a PCRE compile option.
    static const struct {
        unsigned int hs;
        unsigned int pcre;
    } direct_map[] = {
        { HS_FLAG_CASELESS, PCRE_CASELESS },
        { HS_FLAG_DOTALL, PCRE_DOTALL },
        { HS_FLAG_MULTILINE, PCRE_MULTILINE },
        { HS_FLAG_UCP, PCRE_UCP },
        { HS_FLAG_UTF8, PCRE_UTF8 },
    };

    unsigned int pcre_opts = 0;
    unsigned int unhandled = hs_flags;
    for (const auto &entry : direct_map) {
        if (hs_flags & entry.hs) {
            pcre_opts |= entry.pcre;
            unhandled &= ~entry.hs;
        }
    }
    *flags = pcre_opts;

    // Flags with no PCRE option are reported through the bool outputs.
    *highlander = (hs_flags & HS_FLAG_SINGLEMATCH) != 0;
    *prefilter = (hs_flags & HS_FLAG_PREFILTER) != 0;
    *som = (hs_flags & HS_FLAG_SOM_LEFTMOST) != 0;
    unhandled &= ~(HS_FLAG_SINGLEMATCH | HS_FLAG_PREFILTER |
                   HS_FLAG_SOM_LEFTMOST);

    // Flags that are irrelevant to PCRE.
    unhandled &= ~HS_FLAG_ALLOWEMPTY;

    if (unhandled != 0) {
        // A flag this function does not know about was passed in.
        assert(0);
        return false;
    }
    return true;
}

View File

@ -0,0 +1,41 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef PCRE_UTIL_H
#define PCRE_UTIL_H
/**
 * Translates the given Hyperscan flags into PCRE flags (where appropriate)
 * and other bools (for flags which are not directly translatable).
 *
 * hs_flags    bitmask of HS_FLAG_* values to translate.
 * pcre_flags  receives the PCRE_* option bits for the caseless, dotall,
 *             multiline, UCP and UTF-8 flags.
 * highlander  set to true if HS_FLAG_SINGLEMATCH was given.
 * prefilter   set to true if HS_FLAG_PREFILTER was given.
 * som         set to true if HS_FLAG_SOM_LEFTMOST was given.
 *
 * All output pointers must be non-null. HS_FLAG_ALLOWEMPTY is accepted and
 * ignored.
 *
 * Returns false if an unknown Hyperscan flag is encountered.
 */
bool getPcreFlags(unsigned int hs_flags, unsigned int *pcre_flags,
bool *highlander, bool *prefilter, bool *som);
#endif /* PCRE_UTIL_H */

185
tools/hscollider/sig.cpp Normal file
View File

@ -0,0 +1,185 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "sig.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctype.h>
#include <string>
#ifdef HAVE_SIGACTION
#include <signal.h>
#endif
#ifdef HAVE_BACKTRACE
#include <execinfo.h>
#include <unistd.h>
#endif
// Maximum number of stack frames captured for the crash backtrace.
#define BACKTRACE_BUFFER_SIZE 200
// Per-thread progress markers that the crash handler in this file prints:
// the current stage (a STAGE_* value), the expression id and text, and the
// corpus id, data pointer and length.
// NOTE(review): presumably updated by the scanning code as it works -- the
// writers are not visible in this file.
TLS_VARIABLE volatile int debug_stage = STAGE_UNDEFINED;
TLS_VARIABLE volatile int debug_expr = 0;
TLS_VARIABLE const char * volatile debug_expr_ptr = nullptr;
TLS_VARIABLE volatile int debug_corpus = 0;
TLS_VARIABLE const char * volatile debug_corpus_ptr = nullptr;
TLS_VARIABLE volatile size_t debug_corpus_len = 0;
// Defined in another translation unit; echoed by the crash handler as the
// "Failing cmdline".
extern std::string g_cmdline;
#ifdef HAVE_SIGACTION
static void sighandler(int signum) {
/* NOTE: This signal handler is designed solely to provide more information
* when a crash occurs in ue2collider -- it makes calls to signal-unsafe
* functions like printf() and backtrace() by design, since we're already
* in deep trouble and are going to exit anyway. */
fflush(stdout);
printf("signal %d\n", signum);
printf("\nFailing cmdline was:\n%s\n\n", g_cmdline.c_str());
printf("expression %d ", debug_expr);
switch(debug_stage) {
case STAGE_UE2_COMPILE:
printf("ue2 compile\n");
break;
case STAGE_UE2_RUN:
printf("corpus %d ue2 scan\n", debug_corpus);
break;
case STAGE_PCRE_COMPILE:
printf("pcre compile\n");
break;
case STAGE_PCRE_RUN:
printf("corpus %d pcre scan\n", debug_corpus);
break;
case STAGE_GRAPH_PREPROCESS:
printf("graph preprocess\n");
break;
case STAGE_GRAPH_COMPILE:
printf("graph compile\n");
break;
case STAGE_GRAPH_RUN:
printf("corpus %d graph scan\n", debug_corpus);
break;
default:
case STAGE_UNDEFINED:
printf("unknown stage\n");
break;
}
printf("\n");
if (debug_expr_ptr) {
printf("expression %p\n", debug_expr_ptr);
printf("%d:%s\n\n", debug_expr, debug_expr_ptr);
}
if (debug_stage == STAGE_PCRE_RUN || debug_stage == STAGE_UE2_RUN) {
printf("corpus %p len %zu\n", debug_corpus_ptr, debug_corpus_len);
printf("%d:", debug_expr);
for (size_t i = 0; i < debug_corpus_len && debug_corpus_ptr; i++) {
unsigned char c = debug_corpus_ptr[i];
if (c == '\n') {
printf("\\n");
} else if (c == '\t') {
printf("\\t");
} else if (c == '\r') {
printf("\\r");
} else if (0x20 <= c && c <= 0x7e && c != '\\') {
printf("%c", c);
} else {
printf("\\x%02hhx", c);
}
}
printf("\n\n");
}
fflush(stdout);
#ifdef HAVE_BACKTRACE
static void *bt[BACKTRACE_BUFFER_SIZE];
int count = backtrace(bt, BACKTRACE_BUFFER_SIZE);
if (count) {
backtrace_symbols_fd(bt, count, STDOUT_FILENO);
} else {
printf("(Call to backtrace() returns zero count.)\n");
}
#else
printf("(Backtrace unavailable on this platform.)\n");
#endif
_exit(signum);
}
#endif // HAVE_SIGACTION
/* Registers sighandler for SIGBUS, SIGFPE, SIGILL, SIGABRT and SIGSEGV, with
 * all five signals blocked while the handler runs, then calls
 * setSignalStack() for the calling thread. Does nothing when sigaction() is
 * unavailable. */
void installSignalHandler(void) {
#ifdef HAVE_SIGACTION
    // Fatal signals to report on, listed in the order they are registered.
    static const int fatal_signals[] = {
        SIGBUS, SIGFPE, SIGILL, SIGABRT, SIGSEGV
    };

    struct sigaction handler_cfg;
    memset(&handler_cfg, 0, sizeof(handler_cfg));
    handler_cfg.sa_flags = 0;
    handler_cfg.sa_handler = sighandler;

    // Block every handled signal for the duration of the handler.
    sigemptyset(&handler_cfg.sa_mask);
    for (int sig : fatal_signals) {
        sigaddset(&handler_cfg.sa_mask, sig);
    }

    for (int sig : fatal_signals) {
        sigaction(sig, &handler_cfg, nullptr);
    }

    setSignalStack();
#endif // HAVE_SIGACTION
}
#ifdef HAVE_SIGALTSTACK
// Size in bytes of each thread's alternate signal stack. A fixed value is
// used instead of SIGSTKSZ because SIGSTKSZ is not a compile-time constant on
// newer C libraries (glibc 2.34 and later) and so cannot size a static array.
#define ALT_STACK_SIZE 16384
static TLS_VARIABLE char alt_stack_loc[ALT_STACK_SIZE];
#endif

/* Gives the calling thread an alternate signal stack backed by a thread-local
 * buffer and (re)registers sighandler for SIGSEGV, requesting SA_ONSTACK only
 * if the alternate stack was installed successfully. Does nothing when
 * sigaltstack() is unavailable. */
void setSignalStack(void) {
#ifdef HAVE_SIGALTSTACK
    struct sigaction act;
    memset(&act, 0, sizeof(act));
    act.sa_handler = sighandler;
    act.sa_flags = 0;

    stack_t alt_stack;
    memset(&alt_stack, 0, sizeof(alt_stack));
    alt_stack.ss_flags = 0;
    alt_stack.ss_size = sizeof(alt_stack_loc);
    alt_stack.ss_sp = alt_stack_loc;

    // Only run the handler on the alternate stack if it could be set up.
    if (!sigaltstack(&alt_stack, nullptr)) {
        act.sa_flags |= SA_ONSTACK;
    }

    sigaction(SIGSEGV, &act, nullptr);
#endif
}

57
tools/hscollider/sig.h Normal file
View File

@ -0,0 +1,57 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SIG_H
#define SIG_H
#include <cstddef> // for size_t
// Values stored in debug_stage to identify which phase of work a thread is
// in; the crash handler in sig.cpp prints a description of the current one.
#define STAGE_UNDEFINED 0
#define STAGE_UE2_COMPILE 1
#define STAGE_UE2_RUN 2
#define STAGE_PCRE_COMPILE 3
#define STAGE_PCRE_RUN 4
#define STAGE_GRAPH_PREPROCESS 5
#define STAGE_GRAPH_COMPILE 6
#define STAGE_GRAPH_RUN 7
// Storage-class specifier used to give each thread its own copy of the
// variables below (__thread is a compiler-specific keyword).
#define TLS_VARIABLE __thread
// Per-thread state reported by the crash handler: current stage, expression
// id and text, and corpus id, data pointer and length.
extern TLS_VARIABLE volatile int debug_stage;
extern TLS_VARIABLE volatile int debug_expr;
extern TLS_VARIABLE const char * volatile debug_expr_ptr;
extern TLS_VARIABLE volatile int debug_corpus;
extern TLS_VARIABLE const char * volatile debug_corpus_ptr;
extern TLS_VARIABLE volatile size_t debug_corpus_len;
// Installs the crash-reporting handler for SIGBUS, SIGFPE, SIGILL, SIGABRT
// and SIGSEGV, and sets up the calling thread's signal stack.
void installSignalHandler(void);
// Must be called by every thread.
void setSignalStack(void);
#endif

View File

@ -0,0 +1,54 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SIMPLE_TIMER_H
#define SIMPLE_TIMER_H
#include <chrono>
/**
 * Minimal stopwatch: records the time of construction and reports how many
 * seconds have passed since then.
 *
 * The member functions are defined inside the class (and are therefore
 * implicitly inline) so that this header can be included from several
 * translation units without causing multiple-definition link errors.
 */
class SimpleTimer {
public:
    /** Starts timing at the moment of construction. */
    SimpleTimer() : start(clock_type::now()) {}

    /** Returns the seconds elapsed since construction. */
    double elapsed() const {
        const std::chrono::duration<double> delta = clock_type::now() - start;
        return delta.count();
    }

private:
    // A monotonic clock: unlike the wall clock it never jumps when the system
    // time is adjusted, so measured intervals cannot be negative or inflated.
    typedef std::chrono::steady_clock clock_type;

    clock_type::time_point start;
};
#endif // SIMPLE_TIMER_H