mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
hscollider: tool for testing Hyperscan match behaviour against PCRE
This commit is contained in:
parent
fae8d21127
commit
1330265ced
62
cmake/pcre.cmake
Normal file
62
cmake/pcre.cmake
Normal file
@ -0,0 +1,62 @@
|
||||
# Locate the PCRE library required by the tools that are checked against it.
#
# Inputs (expected to be set by the including CMakeLists.txt):
#   PCRE_REQUIRED_MAJOR_VERSION, PCRE_REQUIRED_MINOR_VERSION,
#   PCRE_REQUIRED_VERSION  - the exact PCRE version that is accepted
#   PCRE_SOURCE (optional) - path to a PCRE source tree to build from
#
# Results:
#   source build : PCRE_BUILD_SOURCE, PCRE_INCLUDE_DIRS, PCRE_LDFLAGS
#   system PCRE  : whatever pkg_check_modules(PCRE ...) defines
#   PCRE_CHECKED is set to TRUE in the parent scope once this file has run.

include(CheckCSourceCompiles)

# first look in pcre-$version or pcre subdirs
if (PCRE_SOURCE)
    # either provided on cmdline or we've seen it already
    set (PCRE_BUILD_SOURCE TRUE)
elseif (EXISTS "${PROJECT_SOURCE_DIR}/pcre-${PCRE_REQUIRED_VERSION}")
    set (PCRE_SOURCE ${PROJECT_SOURCE_DIR}/pcre-${PCRE_REQUIRED_VERSION})
    set (PCRE_BUILD_SOURCE TRUE)
elseif (EXISTS "${PROJECT_SOURCE_DIR}/pcre")
    set (PCRE_SOURCE ${PROJECT_SOURCE_DIR}/pcre)
    set (PCRE_BUILD_SOURCE TRUE)
endif()

if (PCRE_BUILD_SOURCE)
    # A relative PCRE_SOURCE is interpreted relative to the build directory.
    if (NOT IS_ABSOLUTE "${PCRE_SOURCE}")
        set(PCRE_SOURCE "${CMAKE_BINARY_DIR}/${PCRE_SOURCE}")
    endif ()

    if (PCRE_CHECKED)
        set(PCRE_INCLUDE_DIRS ${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre)
        set(PCRE_LDFLAGS -L"${LIBDIR}" -lpcre)

        # already processed this file and set up pcre building; bail out
        # before CMAKE_REQUIRED_INCLUDES is touched so nothing leaks into the
        # including scope
        return()
    endif ()

    # first, check version number
    set (saved_INCLUDES "${CMAKE_REQUIRED_INCLUDES}")
    # CMAKE_REQUIRED_INCLUDES is a ;-list, so append rather than space-join
    list (APPEND CMAKE_REQUIRED_INCLUDES "${PCRE_SOURCE}")

    # "int main(void)" rather than implicit-int "main()": the latter is
    # rejected by compilers that enforce C99 or later.
    CHECK_C_SOURCE_COMPILES("#include <pcre.h.generic>
#if PCRE_MAJOR != ${PCRE_REQUIRED_MAJOR_VERSION} || PCRE_MINOR != ${PCRE_REQUIRED_MINOR_VERSION}
#error Incorrect pcre version
#endif
int main(void) { return 0; }" CORRECT_PCRE_VERSION)
    set (CMAKE_REQUIRED_INCLUDES "${saved_INCLUDES}")

    if (NOT CORRECT_PCRE_VERSION)
        # drop the cached failure so the check is repeated on the next run
        unset(CORRECT_PCRE_VERSION CACHE)
        message(FATAL_ERROR "Incorrect version of pcre - version ${PCRE_REQUIRED_VERSION} is required")
    else()
        message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION} - building from source.")
    endif()

    # PCRE compile options (defined here, before the PCRE tree is added, so
    # these values are the ones already present when it is configured)
    option(PCRE_BUILD_PCRECPP "Build the PCRE C++ library" OFF)
    option(PCRE_BUILD_PCREGREP "Build pcregrep" OFF)
    option(PCRE_SHOW_REPORT "Show the final PCRE configuration report" OFF)
    set(PCRE_SUPPORT_UNICODE_PROPERTIES ON CACHE BOOL "Build pcre with unicode")
    add_subdirectory(${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre EXCLUDE_FROM_ALL)
    set(PCRE_INCLUDE_DIRS ${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre)
    set(PCRE_LDFLAGS -L"${LIBDIR}" -lpcre)
else ()
    # pkgconf should save us
    find_package(PkgConfig)
    pkg_check_modules(PCRE libpcre=${PCRE_REQUIRED_VERSION})
    if (PCRE_FOUND)
        message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION}")
    else ()
        message(FATAL_ERROR "PCRE version ${PCRE_REQUIRED_VERSION} not found")
    endif ()
endif ()

set (PCRE_CHECKED TRUE PARENT_SCOPE)
|
291
tools/hscollider/BoundedQueue.h
Normal file
291
tools/hscollider/BoundedQueue.h
Normal file
@ -0,0 +1,291 @@
|
||||
/*
|
||||
* Copyright (c) 2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef BOUNDEDQUEUE_H
|
||||
#define BOUNDEDQUEUE_H
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <condition_variable>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <queue>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/core/noncopyable.hpp>
|
||||
|
||||
//#define QUEUE_STATS 1
|
||||
|
||||
#ifdef QUEUE_STATS
|
||||
|
||||
#include <iostream>
|
||||
|
||||
// Counters describing the traffic seen by a single queue. Only compiled in
// when QUEUE_STATS is defined.
class BoundedQueueStats {
public:
    size_t pop = 0;           //!< Number of pop operations.
    size_t pop_block = 0;     //!< Number of pop operations that had to block.
    size_t push = 0;          //!< Number of push operations.
    size_t push_elements = 0; //!< Number of elements pushed.
    size_t push_block = 0;    //!< Number of push operations that had to block.
    size_t refill = 0;        //!< Number of refills done.
    size_t stolen_from = 0;   //!< Number of times we were stolen from.

    // Write every counter to std::cout, one "label : value" line each.
    void dump() const {
        const auto emit = [](const char *label, size_t value) {
            std::cout << label << value << std::endl;
        };
        emit("pop : ", pop);
        emit("pop_block : ", pop_block);
        emit("push : ", push);
        emit("push_elements : ", push_elements);
        emit("push_block : ", push_block);
        emit("refill : ", refill);
        emit("stolen_from : ", stolen_from);
    }
};
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
class BoundedQueue : boost::noncopyable {
|
||||
private:
|
||||
// Encapsulates a queue and the mutex used to protect access to it.
|
||||
class MutexQueue {
|
||||
public:
|
||||
// Forwarded queue operations.
|
||||
void push(std::unique_ptr<T> elem) { q.push(std::move(elem)); }
|
||||
void pop() { q.pop(); }
|
||||
std::unique_ptr<T> &front() { return q.front(); }
|
||||
bool empty() const { return q.empty(); }
|
||||
size_t size() const { return q.size(); }
|
||||
|
||||
// Acquire the mutex lock.
|
||||
std::unique_lock<std::mutex> lock() {
|
||||
return std::unique_lock<std::mutex>(mutex);
|
||||
}
|
||||
|
||||
#ifdef QUEUE_STATS
|
||||
BoundedQueueStats stats;
|
||||
#endif
|
||||
|
||||
private:
|
||||
std::mutex mutex;
|
||||
std::queue<std::unique_ptr<T>> q;
|
||||
};
|
||||
|
||||
public:
|
||||
BoundedQueue(size_t consumers, size_t size)
|
||||
: max_elements(size), consumer_q(consumers) {
|
||||
assert(consumers > 0);
|
||||
assert(size > 0);
|
||||
}
|
||||
|
||||
#ifdef QUEUE_STATS
|
||||
~BoundedQueue() {
|
||||
std::cout << "Global queue stats:" << std::endl;
|
||||
global_q.stats.dump();
|
||||
std::cout << std::endl;
|
||||
for (size_t i = 0; i < consumer_q.size(); i++) {
|
||||
std::cout << "Consumer queue " << i << ":" << std::endl;
|
||||
consumer_q[i].stats.dump();
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
#endif // QUEUE_STATS
|
||||
|
||||
void push(std::unique_ptr<T> elem) {
|
||||
auto lock = global_q.lock();
|
||||
|
||||
#ifdef QUEUE_STATS
|
||||
global_q.stats.push++;
|
||||
global_q.stats.push_elements++;
|
||||
if (global_q.size() >= max_elements) {
|
||||
global_q.stats.push_block++;
|
||||
}
|
||||
#endif // QUEUE_STATS
|
||||
|
||||
// Block until queue is able to accept new elements.
|
||||
cond_can_accept.wait(lock,
|
||||
[&] { return global_q.size() < max_elements; });
|
||||
assert(global_q.size() < max_elements);
|
||||
|
||||
global_q.push(std::move(elem));
|
||||
cond_can_consume.notify_all();
|
||||
}
|
||||
|
||||
template<class Iter>
|
||||
void push(Iter begin, Iter end) {
|
||||
using ElemType = typename std::remove_reference<decltype(*begin)>::type;
|
||||
static_assert(std::is_same<ElemType, std::unique_ptr<T>>::value,
|
||||
"Iterator must be over unique_ptr<T>");
|
||||
|
||||
if (begin == end) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto lock = global_q.lock();
|
||||
|
||||
#ifdef QUEUE_STATS
|
||||
global_q.stats.push++;
|
||||
global_q.stats.push_elements += std::distance(begin, end);
|
||||
if (global_q.size() >= max_elements) {
|
||||
global_q.stats.push_block++;
|
||||
}
|
||||
#endif // QUEUE_STATS
|
||||
|
||||
// Block until queue is able to accept new elements.
|
||||
cond_can_accept.wait(lock,
|
||||
[&] { return global_q.size() < max_elements; });
|
||||
assert(global_q.size() < max_elements);
|
||||
|
||||
for (auto it = begin; it != end; ++it) {
|
||||
global_q.push(std::move(*it));
|
||||
}
|
||||
cond_can_consume.notify_all();
|
||||
}
|
||||
|
||||
std::unique_ptr<T> pop(size_t consumer_id) {
|
||||
assert(consumer_id < consumer_q.size());
|
||||
auto &q = consumer_q[consumer_id];
|
||||
|
||||
// Try and satisfy the request from our per-consumer queue.
|
||||
{
|
||||
auto consumer_lock = q.lock();
|
||||
if (!q.empty()) {
|
||||
return pop_from_queue(q);
|
||||
}
|
||||
}
|
||||
|
||||
// Try and satisfy the request with a refill from the global queue.
|
||||
{
|
||||
auto lock = global_q.lock();
|
||||
if (!global_q.empty()) {
|
||||
auto consumer_lock = q.lock();
|
||||
return refill_and_pop(q);
|
||||
}
|
||||
}
|
||||
|
||||
// Try and satisfy the request by stealing it from another queue.
|
||||
for (size_t i = 1; i < consumer_q.size(); i++) {
|
||||
size_t victim_id = (consumer_id + i) % consumer_q.size();
|
||||
auto &victim_q = consumer_q[victim_id];
|
||||
auto victim_lock = victim_q.lock();
|
||||
// Note: we don't steal sentinel elements.
|
||||
if (!victim_q.empty() && victim_q.front() != nullptr) {
|
||||
#ifdef QUEUE_STATS
|
||||
victim_q.stats.stolen_from++;
|
||||
#endif
|
||||
return pop_from_queue(victim_q);
|
||||
}
|
||||
}
|
||||
|
||||
// All avenues exhausted, we must block until we've received a new
|
||||
// element.
|
||||
auto lock = global_q.lock();
|
||||
#ifdef QUEUE_STATS
|
||||
global_q.stats.pop_block++;
|
||||
#endif
|
||||
cond_can_consume.wait(lock, [&]{ return !global_q.empty(); });
|
||||
assert(!global_q.empty());
|
||||
auto consumer_lock = q.lock();
|
||||
return refill_and_pop(q);
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<T> pop_from_queue(MutexQueue &q) {
|
||||
assert(!q.empty());
|
||||
auto elem = std::move(q.front());
|
||||
q.pop();
|
||||
#ifdef QUEUE_STATS
|
||||
q.stats.pop++;
|
||||
#endif
|
||||
return elem;
|
||||
}
|
||||
|
||||
std::unique_ptr<T> refill_and_pop(MutexQueue &q) {
|
||||
assert(!global_q.empty());
|
||||
|
||||
#ifdef QUEUE_STATS
|
||||
q.stats.refill++;
|
||||
#endif
|
||||
|
||||
auto elem = pop_from_queue(global_q);
|
||||
if (elem == nullptr) {
|
||||
return elem; // Sentinel.
|
||||
}
|
||||
|
||||
// Grab all subsequent elements that share the same ID.
|
||||
const auto &id = elem->id;
|
||||
while (!global_q.empty()) {
|
||||
auto &first = global_q.front();
|
||||
if (first == nullptr) {
|
||||
#ifdef QUEUE_STATS
|
||||
q.stats.push++;
|
||||
q.stats.push_elements++;
|
||||
#endif
|
||||
// Sentinel element. We can grab one, but no more.
|
||||
q.push(pop_from_queue(global_q));
|
||||
break;
|
||||
}
|
||||
if (first->id != id) {
|
||||
break;
|
||||
}
|
||||
#ifdef QUEUE_STATS
|
||||
q.stats.push++;
|
||||
q.stats.push_elements++;
|
||||
#endif
|
||||
q.push(pop_from_queue(global_q));
|
||||
}
|
||||
|
||||
if (global_q.size() < max_elements) {
|
||||
cond_can_accept.notify_all();
|
||||
}
|
||||
|
||||
return elem;
|
||||
}
|
||||
|
||||
// Maximum number of elements in the global queue (subsequent push
|
||||
// operations will block). Note that we may overshoot this value when
|
||||
// handling bulk push operations.
|
||||
const size_t max_elements;
|
||||
|
||||
// Global queue.
|
||||
MutexQueue global_q;
|
||||
|
||||
// Per-consumer queues.
|
||||
std::vector<MutexQueue> consumer_q;
|
||||
|
||||
// Condition variable for producers to wait on when the queue is full.
|
||||
std::condition_variable cond_can_accept;
|
||||
|
||||
// Condition variable for consumers to wait on when the queue is empty.
|
||||
std::condition_variable cond_can_consume;
|
||||
};
|
||||
|
||||
#ifdef QUEUE_STATS
|
||||
#undef QUEUE_STATS
|
||||
#endif
|
||||
|
||||
#endif // BOUNDEDQUEUE_H
|
79
tools/hscollider/CMakeLists.txt
Normal file
79
tools/hscollider/CMakeLists.txt
Normal file
@ -0,0 +1,79 @@
|
||||
# Build script for hscollider, the tool that compares Hyperscan match
# behaviour against PCRE.

# we have a fixed requirement for PCRE
set(PCRE_REQUIRED_MAJOR_VERSION 8)
set(PCRE_REQUIRED_MINOR_VERSION 41)
set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION})

include (${CMAKE_MODULE_PATH}/pcre.cmake)

include_directories(${PCRE_INCLUDE_DIRS})

include(${CMAKE_MODULE_PATH}/backtrace.cmake)

# we need static libs - too much deep magic for shared libs
if (NOT BUILD_STATIC_LIBS)
    return ()
endif ()

CHECK_FUNCTION_EXISTS(sigaltstack HAVE_SIGALTSTACK)
CHECK_FUNCTION_EXISTS(sigaction HAVE_SIGACTION)
CHECK_FUNCTION_EXISTS(setrlimit HAVE_SETRLIMIT)

# The Ragel-generated parser lives in the build directory, so it needs the
# source directory on its include path to find the hand-written headers.
set_source_files_properties(
    ${CMAKE_CURRENT_BINARY_DIR}/ColliderCorporaParser.cpp
    PROPERTIES
        COMPILE_FLAGS "${RAGEL_C_FLAGS} -I${CMAKE_CURRENT_SOURCE_DIR}")

ragelmaker(ColliderCorporaParser.rl)

# only set these after all tests are done
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")

set(hscollider_SOURCES
    common.h
    BoundedQueue.h
    Corpora.cpp
    FileCorpora.h
    FileCorpora.cpp
    ColliderCorporaParser.h
    ColliderCorporaParser.cpp
    NfaGeneratedCorpora.h
    NfaGeneratedCorpora.cpp
    GraphTruth.h
    GraphTruth.cpp
    GroundTruth.h
    GroundTruth.cpp
    UltimateTruth.h
    UltimateTruth.cpp
    ResultSet.h
    args.cpp
    args.h
    limit.cpp
    pcre_util.cpp
    sig.cpp
    sig.h
    DatabaseProxy.h
    Thread.h
    Thread.cpp
    main.cpp
)

set_source_files_properties(${hscollider_SOURCES} PROPERTIES
    INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR})
add_executable(hscollider ${hscollider_SOURCES})
add_dependencies(hscollider ragel_ColliderCorporaParser)

# pcre.cmake only pulls in the PCRE source tree (add_subdirectory) when it
# builds PCRE from source; when PCRE comes from pkg-config there is no such
# target to depend on.
if (TARGET pcre)
    add_dependencies(hscollider pcre)
endif ()

# Libraries needed on every platform.
target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil
    expressionutil corpusomatic crosscompileutil)

if (NOT WIN32)
    target_link_libraries(hscollider pthread "${BACKTRACE_LDFLAGS}")

    if (HAVE_BACKTRACE)
        # Expand the source list (a bare "hscollider_SOURCES" would name a
        # single, non-existent file) and append so that COMPILE_FLAGS already
        # attached to individual sources are kept.
        set_property(SOURCE ${hscollider_SOURCES} APPEND_STRING PROPERTY
            COMPILE_FLAGS " ${BACKTRACE_CFLAGS}")
    endif ()
endif ()
|
39
tools/hscollider/ColliderCorporaParser.h
Normal file
39
tools/hscollider/ColliderCorporaParser.h
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef FILECORPORAPARSER_H
|
||||
#define FILECORPORAPARSER_H
|
||||
|
||||
#include <string>
|
||||
|
||||
struct Corpus;
|
||||
|
||||
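// Parse one corpus line: store the leading expression ID in id and decode the
// escaped corpus text into c (data and, for the new line format, matches).
// Returns true if the whole line was consumed without a parse error.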
|
||||
bool parseCorpus(const std::string &line, Corpus &c, unsigned int &id);
|
||||
|
||||
#endif
|
150
tools/hscollider/ColliderCorporaParser.rl
Normal file
150
tools/hscollider/ColliderCorporaParser.rl
Normal file
@ -0,0 +1,150 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "ColliderCorporaParser.h"
|
||||
#include "Corpora.h"
|
||||
|
||||
#include "ue2common.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <cstdio>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace /* anonymous */ {
|
||||
|
||||
// Take a string like '\xFF' and convert it to the character it represents.
//
// start points at the backslash of a four-character escape (backslash, 'x',
// two hex digits) and end points just past its last digit. The shape of the
// input is only checked by assertions; the caller is expected to pass a
// well-formed escape.
char unhex(const char *start, const char *end) {
    // end is only examined by the assertion below.
    (void)end;
    assert(start + 4 == end);
    assert(start[0] == '\\');
    assert(start[1] == 'x');
    // Cast first: isxdigit() is undefined for negative char values.
    assert(isxdigit(static_cast<unsigned char>(start[2])));
    assert(isxdigit(static_cast<unsigned char>(start[3])));

    // NUL-terminated copy of the two digits for strtol().
    const char digits[3] = {start[2], start[3], 0};

    return static_cast<char>(strtol(digits, nullptr, 16));
}
|
||||
|
||||
%%{
    # Shared actions for the corpus-line parser. They refer to the locals
    # that parseCorpus() declares (num, sout, c) and to Ragel's ts/te/fc.
    machine FileCorporaParser;

    # Fold the current decimal digit into the running number.
    action accumulateNum {
        num = (num * 10) + (fc - '0');
    }

    # Append the byte encoded by the \xHH escape spanning [ts, te).
    action handleHexEscaped {
        sout.push_back(unhex(ts, te));
    }

    # Append the control character named by a single-letter escape; the
    # letter is the character following the backslash.
    action handleSpecial {
        switch (*(ts+1)) {
        case '0': sout.push_back('\x00'); break;
        case 'a': sout.push_back('\x07'); break;
        case 'e': sout.push_back('\x1b'); break;
        case 'f': sout.push_back('\x0c'); break;
        case 'n': sout.push_back('\x0a'); break;
        case 'v': sout.push_back('\x0b'); break;
        case 'r': sout.push_back('\x0d'); break;
        case 't': sout.push_back('\x09'); break;
        default: fbreak;
        }
    }

    # Record the number just scanned as a match offset for this corpus.
    action handleMatch {
        c.matches.insert(num);
    }

    write data;
}%%
|
||||
|
||||
} // namespace
|
||||
|
||||
// Parse a single corpus line.
//
// Two line formats are accepted:
//   <id>:<escaped corpus>                     (old format)
//   <id>="<escaped corpus>":<match>,<match>   (new format, sets c.hasMatches)
//
// The decoded corpus bytes are appended to c.data, match offsets are inserted
// into c.matches and the expression ID is written to id. Returns true if the
// scanner consumed the whole line without entering its error state.
bool parseCorpus(const string &line, Corpus &c, unsigned int &id) {
    // Variables with the names the generated Ragel code expects.
    const char *p = line.c_str();
    const char *pe = p + line.size();
    const char *eof = pe;
    const char *ts;
    const char *te;
    int cs;
    UNUSED int act;

    // For storing integers as they're scanned
    unsigned int num = 0;

    // Decoded corpus bytes are appended directly to the output corpus.
    string &sout = c.data;

    %%{
        # Decimal expression ID; the output parameter is refreshed as each
        # digit is consumed.
        id = ( digit @accumulateNum)+ >{num = 0;} @{id = num;};

        # Backslash followed by a non-alphanumeric character.
        backslashed = '\\' ^alnum;
        # Single-letter escapes handled by handleSpecial.
        specials = '\\' [0aefnvrt];
        # \xHH hex escape.
        hexescaped = '\\x' xdigit{2};

        # Old format body: everything up to the end of the line is corpus
        # data.
        corpus_old := |*
            hexescaped => handleHexEscaped;
            specials => handleSpecial;
            backslashed => { sout.push_back(*(ts + 1)); };
            any => { sout.push_back(*ts); };
        *|;

        # New format body: corpus data runs up to the closing double quote.
        corpus_new := |*
            hexescaped => handleHexEscaped;
            specials => handleSpecial;
            backslashed => { sout.push_back(*(ts + 1)); };
            any - '"' => { sout.push_back(*ts); };
            '"' => { fgoto colon_sep; };
        *|;

        # Separator between the quoted corpus and its match list.
        colon_sep := |*
            ':' => {fgoto match_list; };
        *|;

        # Match offsets: numbers with optional surrounding spaces and an
        # optional trailing comma.
        match_list := |*
            (' '* (digit @accumulateNum)+ ' '* ','?) >{num = 0;} => handleMatch;
        *|;

        # Old simple line format
        line_old = id ':' @{ fgoto corpus_old; };

        # New line format with matches
        line_new = id "=\"" @{ c.hasMatches = true; fgoto corpus_new; };

        main := ( line_new | line_old );

        # Initialize and execute
        write init;
        write exec;
    }%%

    // Success requires both a non-error final state and that all input was
    // consumed.
    return (cs != FileCorporaParser_error) && (p == pe);
}
|
31
tools/hscollider/Corpora.cpp
Normal file
31
tools/hscollider/Corpora.cpp
Normal file
@ -0,0 +1,31 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "Corpora.h"
|
||||
|
||||
// Out-of-line definition of the virtual destructor declared in Corpora.h;
// there is nothing to release.
CorporaSource::~CorporaSource() = default;
|
68
tools/hscollider/Corpora.h
Normal file
68
tools/hscollider/Corpora.h
Normal file
@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef CORPORA_H
|
||||
#define CORPORA_H
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/core/noncopyable.hpp>
|
||||
|
||||
// A single corpus (block of data to scan) together with its optional,
// pre-computed match offsets.
struct Corpus {
    // Empty corpus with no pre-calculated matches.
    Corpus() = default;

    // Corpus holding a copy of s, with no pre-calculated matches.
    explicit Corpus(const std::string &s) : data(s) {}

    std::string data;               // Corpus itself
    bool hasMatches = false;        // Have the matches been pre-calculated?
    std::set<unsigned int> matches; // end-offsets of matches
};
|
||||
|
||||
// Carries a human-readable description of a failure related to corpora.
struct CorpusFailure {
    std::string message; // description of what went wrong

    // Store a copy of s as the failure message.
    explicit CorpusFailure(const std::string &s) : message(s) {}
};
|
||||
|
||||
// Abstract class for a corpora source: new ways to load or generate corpora
// can be written by subclassing this class and providing its clone and
// generate methods.
class CorporaSource : boost::noncopyable {
public:
    // Virtual destructor so that subclasses can be destroyed through a
    // CorporaSource pointer.
    virtual ~CorporaSource();

    // Make a copy of this corpora source. Returns a raw pointer; presumably
    // the caller takes ownership of it (confirm against callers).
    virtual CorporaSource *clone() const = 0;

    // Generate corpora for the given signature ID, adding them to the
    // vector of Corpus objects provided.
    virtual void generate(unsigned id, std::vector<Corpus> &data) = 0;
};
|
||||
|
||||
#endif // CORPORA_H
|
88
tools/hscollider/DatabaseProxy.h
Normal file
88
tools/hscollider/DatabaseProxy.h
Normal file
@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef UE2COLLIDER_DATABASEPROXY_H
|
||||
#define UE2COLLIDER_DATABASEPROXY_H
|
||||
|
||||
#include "UltimateTruth.h"
|
||||
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
||||
#include <boost/core/noncopyable.hpp>
|
||||
|
||||
/**
 * Exception thrown the first time a database fails to compile, so that the
 * compilation error can be reported to the user. Later failures of the same
 * database are signalled by returning nullptr instead of throwing.
 */
struct CompileFailed {
    // Store a copy of err as the error text.
    explicit CompileFailed(const std::string &err) : error(err) {}

    std::string error; // error text supplied when the exception was created
};
|
||||
|
||||
class DatabaseProxy : boost::noncopyable {
|
||||
public:
|
||||
explicit DatabaseProxy(const std::set<unsigned> &expr_ids)
|
||||
: ids(expr_ids) {}
|
||||
|
||||
explicit DatabaseProxy(std::shared_ptr<HyperscanDB> built_db)
|
||||
: db(built_db) {}
|
||||
|
||||
std::shared_ptr<HyperscanDB> get(const UltimateTruth &ultimate) {
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (failed) {
|
||||
// We have previously failed to compile this database.
|
||||
return nullptr;
|
||||
}
|
||||
if (db) {
|
||||
return db;
|
||||
}
|
||||
|
||||
// Database hasn't been compiled yet.
|
||||
std::string error;
|
||||
db = ultimate.compile(ids, error);
|
||||
if (!db) {
|
||||
failed = true;
|
||||
throw CompileFailed(error);
|
||||
}
|
||||
|
||||
return db;
|
||||
}
|
||||
|
||||
private:
|
||||
std::mutex mutex;
|
||||
std::shared_ptr<HyperscanDB> db;
|
||||
std::set<unsigned> ids;
|
||||
bool failed = false; // Database failed compilation.
|
||||
};
|
||||
|
||||
#endif // UE2COLLIDER_DATABASEPROXY_H
|
99
tools/hscollider/FileCorpora.cpp
Normal file
99
tools/hscollider/FileCorpora.cpp
Normal file
@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "ColliderCorporaParser.h"
|
||||
#include "FileCorpora.h"
|
||||
#include "common.h"
|
||||
#include "util/expression_path.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <boost/algorithm/string/trim.hpp>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// A line is skipped when it has no content at all or when its first
// character is '#' (a comment line).
static
bool emptyLine(const std::string &line) {
    if (line.empty()) {
        return true;
    }
    return line.front() == '#';
}
|
||||
|
||||
// Produce a heap-allocated duplicate holding the same per-pattern corpora.
// The caller receives a raw pointer to the new object.
FileCorpora *FileCorpora::clone() const {
    FileCorpora *duplicate = new FileCorpora();
    duplicate->corpora_by_pat = corpora_by_pat;
    return duplicate;
}
|
||||
|
||||
bool FileCorpora::readLine(const string &line) {
|
||||
unsigned id = 0;
|
||||
Corpus c;
|
||||
bool rv = parseCorpus(line, c, id);
|
||||
if (rv) {
|
||||
corpora_by_pat[id].push_back(c);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool FileCorpora::readFile(const string &filename) {
|
||||
ifstream f(filename.c_str());
|
||||
if (!f.good()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
unsigned lineNum = 0;
|
||||
string line;
|
||||
while (getline(f, line)) {
|
||||
lineNum++;
|
||||
|
||||
boost::trim(line);
|
||||
|
||||
if (emptyLine(line)) {
|
||||
continue;
|
||||
}
|
||||
if (!readLine(line)) {
|
||||
cerr << "Error in corpora file parsing line " << lineNum << endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return !corpora_by_pat.empty();
|
||||
}
|
||||
|
||||
void FileCorpora::generate(unsigned id,
|
||||
vector<Corpus> &data) {
|
||||
auto i = corpora_by_pat.find(id);
|
||||
if (i == corpora_by_pat.end() || i->second.empty()) {
|
||||
throw CorpusFailure("no corpora found for pattern.");
|
||||
}
|
||||
|
||||
data.insert(data.end(), i->second.begin(), i->second.end());
|
||||
}
|
57
tools/hscollider/FileCorpora.h
Normal file
57
tools/hscollider/FileCorpora.h
Normal file
@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef FILECORPORA_H
|
||||
#define FILECORPORA_H
|
||||
|
||||
#include "Corpora.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <map>
|
||||
|
||||
class FileCorpora : public CorporaSource {
|
||||
public:
|
||||
// copy
|
||||
FileCorpora *clone() const override;
|
||||
|
||||
// read corpora in from a file
|
||||
bool readFile(const std::string &filename);
|
||||
|
||||
// generator
|
||||
void generate(unsigned id, std::vector<Corpus> &data) override;
|
||||
|
||||
private:
|
||||
// read in a line from our file
|
||||
bool readLine(const std::string &line);
|
||||
|
||||
std::map<unsigned, std::list<Corpus>> corpora_by_pat;
|
||||
};
|
||||
|
||||
#endif
|
308
tools/hscollider/GraphTruth.cpp
Normal file
308
tools/hscollider/GraphTruth.cpp
Normal file
@ -0,0 +1,308 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "GraphTruth.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "expressions.h"
|
||||
#include "ExpressionParser.h"
|
||||
#include "ng_find_matches.h"
|
||||
#include "pcre_util.h"
|
||||
|
||||
#include "grey.h"
|
||||
#include "hs_compile.h"
|
||||
#include "ue2common.h"
|
||||
#include "compiler/compiler.h"
|
||||
#include "nfagraph/ng.h"
|
||||
#include "nfagraph/ng_depth.h"
|
||||
#include "nfagraph/ng_dump.h"
|
||||
#include "nfagraph/ng_fuzzy.h"
|
||||
#include "nfagraph/ng_holder.h"
|
||||
#include "nfagraph/ng_util.h"
|
||||
#include "parser/Parser.h"
|
||||
#include "parser/unsupported.h"
|
||||
#include "util/compile_context.h"
|
||||
#include "util/make_unique.h"
|
||||
#include "util/report_manager.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
using namespace ue2;
|
||||
|
||||
// Struct to store the actual compiled NFA graph.
|
||||
class CompiledNG : boost::noncopyable {
|
||||
public:
|
||||
CompiledNG(unique_ptr<NGHolder> g_in,
|
||||
unique_ptr<ReportManager> rm_in)
|
||||
: g(std::move(g_in)), rm(std::move(rm_in)) {}
|
||||
unique_ptr<ue2::NGHolder> g;
|
||||
unique_ptr<ue2::ReportManager> rm;
|
||||
};
|
||||
|
||||
static
|
||||
void populateMatchSet(ResultSet &rs, const set<pair<size_t, size_t>> &matches,
|
||||
const CNGInfo &cngi) {
|
||||
for (const auto &m : matches) {
|
||||
u64a from = m.first;
|
||||
u64a to = m.second;
|
||||
if (g_streamOffset) {
|
||||
// Subtract stream offset imposed by offset test.
|
||||
u64a offset = min(100ull, g_streamOffset);
|
||||
assert(to >= offset);
|
||||
from -= min(offset, from);
|
||||
to -= offset;
|
||||
}
|
||||
u64a len = to - from;
|
||||
|
||||
if (to < cngi.min_offset || to > cngi.max_offset ||
|
||||
len < cngi.min_length) {
|
||||
// this match does not satisfy extparams constraints
|
||||
DEBUG_PRINTF("skipping NFA Match @ (%llu,%llu)\n", from, to);
|
||||
continue;
|
||||
}
|
||||
if (!cngi.som) {
|
||||
from = 0;
|
||||
}
|
||||
rs.addMatch(from, to);
|
||||
}
|
||||
}
|
||||
|
||||
// Record the expression ID and keep a reference to the expression map; the
// graph itself is not built here.
CNGInfo::CNGInfo(unsigned id_in, const ExpressionMap &m_expr_in)
    : id(id_in), m_expr(m_expr_in) {}

// Defined out of line, in the translation unit where CompiledNG is a complete
// type.
CNGInfo::~CNGInfo() = default;
|
||||
|
||||
void CNGInfo::compile() {
|
||||
auto i = m_expr.find(id);
|
||||
if (i == m_expr.end()) {
|
||||
throw NGCompileFailure("ID not found in expression map.");
|
||||
}
|
||||
|
||||
string re;
|
||||
unsigned hs_flags;
|
||||
hs_expr_ext ext;
|
||||
|
||||
// read the flags for NFA compiler
|
||||
if (!readExpression(i->second, re, &hs_flags, &ext)) {
|
||||
throw NGCompileFailure("Cannot parse expression flags.");
|
||||
}
|
||||
// make sure we respect collider's UTF-8 setting
|
||||
if (force_utf8) {
|
||||
hs_flags |= HS_FLAG_UTF8;
|
||||
}
|
||||
|
||||
try {
|
||||
bool isStreaming = colliderMode == MODE_STREAMING;
|
||||
bool isVectored = colliderMode == MODE_VECTORED;
|
||||
CompileContext cc(isStreaming, isVectored, get_current_target(),
|
||||
Grey());
|
||||
ParsedExpression pe(0, re.c_str(), hs_flags, 0, &ext);
|
||||
|
||||
// UE-2850: ParsedExpression may have updated the utf8 flag if the
|
||||
// original expression starts with (*UTF8)
|
||||
utf8 |= pe.expr.utf8;
|
||||
|
||||
auto rm = ue2::make_unique<ReportManager>(cc.grey);
|
||||
|
||||
// Expressions containing zero-width assertions and other extended pcre
|
||||
// types aren't supported yet. This call will throw a ParseError
|
||||
// exception if the component tree contains such a construct.
|
||||
checkUnsupported(*pe.component);
|
||||
|
||||
pe.component->checkEmbeddedStartAnchor(true);
|
||||
pe.component->checkEmbeddedEndAnchor(true);
|
||||
|
||||
// edit distance may be set globally
|
||||
if (force_edit_distance) {
|
||||
pe.expr.edit_distance = edit_distance;
|
||||
}
|
||||
|
||||
// validate_fuzzy_compile checks this, but we don't need to build the
|
||||
// graph to know it will fail
|
||||
if (pe.expr.edit_distance && utf8) {
|
||||
throw NGCompileFailure("UTF-8 patterns cannot be "
|
||||
"approximately matched");
|
||||
}
|
||||
|
||||
auto built_expr = buildGraph(*rm, cc, pe);
|
||||
auto &expr = built_expr.expr;
|
||||
auto &g = built_expr.g;
|
||||
|
||||
if (expr.edit_distance || expr.hamm_distance) {
|
||||
// check if this pattern can be approximately matched, throws
|
||||
// CompileError on failure
|
||||
bool hamming = expr.hamm_distance > 0;
|
||||
u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance;
|
||||
validate_fuzzy_compile(*g, e_dist, hamming, utf8, cc.grey);
|
||||
}
|
||||
|
||||
if (isVacuous(*g)) {
|
||||
if (som) {
|
||||
throw NGUnsupportedFailure("Vacuous patterns are not supported "
|
||||
"in SOM mode");
|
||||
}
|
||||
if (expr.min_length > 0) {
|
||||
throw NGUnsupportedFailure("Vacuous patterns are not supported "
|
||||
"in combination with min_length");
|
||||
}
|
||||
}
|
||||
|
||||
cng = make_unique<CompiledNG>(move(g), move(rm));
|
||||
} catch (CompileError &e) {
|
||||
throw NGCompileFailure(e.reason);
|
||||
} catch (NGUnsupportedFailure &e) {
|
||||
throw NGCompileFailure(e.msg);
|
||||
} catch (...) {
|
||||
throw NGCompileFailure("NFA graph construction failed");
|
||||
}
|
||||
}
|
||||
|
||||
// Bind the output stream used for match echoing and the expression map used
// for pattern lookup; both are held by reference.
GraphTruth::GraphTruth(ostream &os, const ExpressionMap &expr)
    : out(os), m_expr(expr) {}
|
||||
|
||||
// Gather everything that is known about expression `id` before its graph is
// built: flags, extended parameters and the collider's global overrides. The
// result is a CNGInfo with those settings filled in.
//
// Throws NGCompileFailure if the ID is unknown or its flags cannot be decoded,
// and NGUnsupportedFailure if the expression uses extended flags outside the
// supported set (unless `ignoreUnsupported` is true).
unique_ptr<CNGInfo> GraphTruth::preprocess(unsigned id,
                                           bool ignoreUnsupported) {
    const auto entry = m_expr.find(id);
    if (entry == m_expr.end()) {
        throw NGCompileFailure("ID not found in expression map.");
    }

    // read the flags for NFA compiler
    string re;
    unsigned hs_flags;
    hs_expr_ext ext;
    if (!readExpression(entry->second, re, &hs_flags, &ext)) {
        throw NGCompileFailure("Cannot parse expression flags.");
    }

    // read PCRE flags
    unsigned flags;
    bool highlander = false;
    bool prefilter = false;
    bool som = false;
    if (!getPcreFlags(hs_flags, &flags, &highlander, &prefilter, &som)) {
        throw NGCompileFailure("Cannot get PCRE flags.");
    }

    // Globally forced settings take precedence over the per-expression ones.
    if (force_utf8) {
        hs_flags |= HS_FLAG_UTF8;
    }
    if (force_edit_distance) {
        ext.edit_distance = edit_distance;
    }
    if (somFlags) {
        som = true;
    }
    if (force_prefilter) {
        prefilter = true;
    }

    const u64a supported_flags =
        HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_EDIT_DISTANCE |
        HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
        HS_EXT_FLAG_MIN_LENGTH;
    const bool has_unsupported = (ext.flags & ~supported_flags) != 0;
    if (has_unsupported && !ignoreUnsupported) {
        throw NGUnsupportedFailure("Unsupported extended flags specified.");
    }

    auto info = ue2::make_unique<CNGInfo>(id, m_expr);
    info->utf8 = (hs_flags & HS_FLAG_UTF8) != 0;
    info->highlander = highlander;
    info->prefilter = prefilter;
    info->som = som;
    info->min_offset = ext.min_offset;
    info->max_offset = ext.max_offset;
    info->min_length = ext.min_length;
    info->max_edit_distance = ext.edit_distance;
    info->max_hamm_distance = ext.hamming_distance;

    return info;
}
|
||||
|
||||
bool GraphTruth::run(unsigned, const CompiledNG &cng, const CNGInfo &cngi,
|
||||
const string &buffer, ResultSet &rs, string &) {
|
||||
set<pair<size_t, size_t>> matches;
|
||||
|
||||
if (g_streamOffset) {
|
||||
size_t offset = MIN(100, g_streamOffset);
|
||||
assert(offset > 0);
|
||||
const string preamble(string(offset, '\0'));
|
||||
|
||||
set<pair<size_t, size_t>> pre_matches;
|
||||
|
||||
// First, scan an empty buffer size of the preamble so that we can
|
||||
// discard any matches therein after the real scan, later. We use
|
||||
// notEod so that end-anchors in our expression don't match at the
|
||||
// end of the buffer.
|
||||
if (!findMatches(*cng.g, *cng.rm, preamble, pre_matches,
|
||||
cngi.max_edit_distance, cngi.max_hamm_distance, true,
|
||||
cngi.utf8)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Real scan.
|
||||
if (!findMatches(*cng.g, *cng.rm, preamble + buffer, matches,
|
||||
cngi.max_edit_distance, cngi.max_hamm_distance, false,
|
||||
cngi.utf8)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Erase any matches due entirely to the preamble.
|
||||
for (const auto &m : pre_matches) {
|
||||
matches.erase(m);
|
||||
}
|
||||
} else {
|
||||
if (!findMatches(*cng.g, *cng.rm, buffer, matches,
|
||||
cngi.max_edit_distance, cngi.max_hamm_distance, false,
|
||||
cngi.utf8)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
populateMatchSet(rs, matches, cngi);
|
||||
|
||||
if (echo_matches) {
|
||||
for (const auto &m : rs.matches) {
|
||||
out << "NFA Match @ (" << m.from << "," << m.to << ")" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
144
tools/hscollider/GraphTruth.h
Normal file
144
tools/hscollider/GraphTruth.h
Normal file
@ -0,0 +1,144 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef GRAPHTRUTH_H
|
||||
#define GRAPHTRUTH_H
|
||||
|
||||
#include "expressions.h"
|
||||
#include "ResultSet.h"
|
||||
|
||||
#include "hs_compile.h" // for hs_expr_ext
|
||||
#include "ue2common.h"
|
||||
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
|
||||
#include <boost/core/noncopyable.hpp>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class ReportManager;
|
||||
struct BoundaryReports;
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
// Exception type: building the NFA graph for an expression failed. `msg`
// holds the reason.
struct NGCompileFailure {
    explicit NGCompileFailure(const std::string &msg_s) : msg(msg_s) {}

    std::string msg; // description of the failure
};
|
||||
|
||||
// Exception type: the expression uses something the NFA graph path does not
// support. `msg` names the unsupported construct.
struct NGUnsupportedFailure {
    explicit NGUnsupportedFailure(const std::string &msg_s) : msg(msg_s) {}

    std::string msg; // description of what is unsupported
};
|
||||
|
||||
// Struct to store the actual compiled NFA graph.
|
||||
class CompiledNG;
|
||||
|
||||
// Struct to store the precompile information about the graph.
|
||||
class CNGInfo : boost::noncopyable {
|
||||
public:
|
||||
CNGInfo(unsigned id_in, const ExpressionMap &m_expr_in);
|
||||
~CNGInfo();
|
||||
|
||||
bool is_bad() {
|
||||
std::lock_guard<std::mutex> lock(bad_mutex);
|
||||
bool val = bad;
|
||||
return val;
|
||||
}
|
||||
|
||||
void mark_bad() {
|
||||
std::lock_guard<std::mutex> lock(bad_mutex);
|
||||
bad = true;
|
||||
}
|
||||
|
||||
const CompiledNG *get() {
|
||||
std::lock_guard<std::mutex> lock(cng_mutex);
|
||||
|
||||
if (cng) {
|
||||
return cng.get();
|
||||
}
|
||||
|
||||
// NFA graph hasn't been compiled yet.
|
||||
try {
|
||||
compile();
|
||||
} catch (NGCompileFailure &e) {
|
||||
throw NGCompileFailure(e);
|
||||
} catch (NGUnsupportedFailure &e) {
|
||||
throw NGCompileFailure(e.msg);
|
||||
}
|
||||
|
||||
return cng.get();
|
||||
}
|
||||
|
||||
u64a min_offset = 0;
|
||||
u64a max_offset = 0;
|
||||
u64a min_length = 0;
|
||||
u32 max_edit_distance = 0;
|
||||
u32 max_hamm_distance = 0;
|
||||
bool utf8 = false;
|
||||
bool highlander = false;
|
||||
bool prefilter = false;
|
||||
bool som = false;
|
||||
private:
|
||||
void compile();
|
||||
// If NFA graph scan failed for some reason, we mark it as bad and skip
|
||||
// the remaining tests for it for performance reasons.
|
||||
bool bad = false;
|
||||
std::mutex bad_mutex; // serialised accesses to bad flag.
|
||||
|
||||
std::unique_ptr<CompiledNG> cng; // compiled NFA graph
|
||||
std::mutex cng_mutex; // serialised accesses to NFA graph
|
||||
|
||||
unsigned id;
|
||||
|
||||
// Our expression map
|
||||
const ExpressionMap &m_expr;
|
||||
};
|
||||
|
||||
|
||||
class GraphTruth : boost::noncopyable {
|
||||
public:
|
||||
GraphTruth(std::ostream &os, const ExpressionMap &expr);
|
||||
|
||||
bool run(unsigned id, const CompiledNG &cng, const CNGInfo &cngi,
|
||||
const std::string &buffer, ResultSet &rs, std::string &error);
|
||||
|
||||
std::unique_ptr<CNGInfo> preprocess(unsigned id,
|
||||
bool ignoreUnsupported = false);
|
||||
|
||||
private:
|
||||
// Output stream.
|
||||
std::ostream &out;
|
||||
|
||||
// Our expression map
|
||||
const ExpressionMap &m_expr;
|
||||
};
|
||||
|
||||
#endif
|
513
tools/hscollider/GroundTruth.cpp
Normal file
513
tools/hscollider/GroundTruth.cpp
Normal file
@ -0,0 +1,513 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "ExpressionParser.h"
|
||||
#include "expressions.h"
|
||||
#include "GroundTruth.h"
|
||||
#include "pcre_util.h"
|
||||
|
||||
#include "hs_compile.h" // for hs_expr_ext
|
||||
#include "ue2common.h"
|
||||
#include "parser/control_verbs.h"
|
||||
#include "parser/Parser.h"
|
||||
#include "parser/parse_error.h"
|
||||
#include "util/make_unique.h"
|
||||
#include "util/unicode_def.h"
|
||||
#include "util/unordered.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <pcre.h>
|
||||
|
||||
/* -X, -Y support
|
||||
* as PCRE performance is `non-linear' and these options add a large amount of
|
||||
* scanning, the following short cuts are used:
|
||||
* 1: the suffix is not scanned - we are more interested in the matches from
|
||||
* the original corpora.
|
||||
* 2: only the last 50 bytes of the prefix is scanned. This may lead to some
|
||||
* minor correctness issues for a few patterns.
|
||||
*/
|
||||
|
||||
using namespace std;
|
||||
using namespace ue2;
|
||||
|
||||
// We store matches in a hash table as we're likely to see lots of them. These
|
||||
// are moved into a ResultSet at the end.
|
||||
using PcreMatchSet = ue2::ue2_unordered_set<pair<unsigned, unsigned>>;
|
||||
|
||||
namespace {
|
||||
struct CalloutContext {
|
||||
explicit CalloutContext(ostream &os) : out(os) {}
|
||||
ostream &out;
|
||||
PcreMatchSet matches;
|
||||
};
|
||||
}
|
||||
|
||||
static
|
||||
int pcreCallOut(pcre_callout_block *block) {
|
||||
assert(block);
|
||||
assert(block->callout_data);
|
||||
CalloutContext *ctx = static_cast<CalloutContext *>(block->callout_data);
|
||||
|
||||
if (echo_matches) {
|
||||
ctx->out << "PCRE Match @ (" << block->start_match << ","
|
||||
<< block->current_position << ")" << endl;
|
||||
}
|
||||
|
||||
unsigned int from = block->start_match;
|
||||
unsigned int to = block->current_position;
|
||||
assert(from <= to);
|
||||
|
||||
ctx->matches.insert(make_pair(from, to));
|
||||
return 1;
|
||||
}
|
||||
|
||||
static
|
||||
bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
|
||||
bool *prefilter, bool *som, hs_expr_ext *ext) {
|
||||
string regex;
|
||||
unsigned int hs_flags = 0;
|
||||
if (!readExpression(expr, regex, &hs_flags, ext)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
expr.swap(regex);
|
||||
|
||||
if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (force_utf8) {
|
||||
*flags |= PCRE_UTF8;
|
||||
}
|
||||
|
||||
if (force_prefilter) {
|
||||
*prefilter = true;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static
|
||||
string pcreErrStr(int err) {
|
||||
switch (err) {
|
||||
case PCRE_ERROR_NOMATCH:
|
||||
return "PCRE_ERROR_NOMATCH";
|
||||
case PCRE_ERROR_NULL:
|
||||
return "PCRE_ERROR_NULL";
|
||||
case PCRE_ERROR_BADOPTION:
|
||||
return "PCRE_ERROR_BADOPTION";
|
||||
case PCRE_ERROR_BADMAGIC:
|
||||
return "PCRE_ERROR_BADMAGIC";
|
||||
#if defined(PCRE_ERROR_UNKNOWN_OPCODE)
|
||||
case PCRE_ERROR_UNKNOWN_OPCODE:
|
||||
return "PCRE_ERROR_UNKNOWN_OPCODE";
|
||||
#else
|
||||
case PCRE_ERROR_UNKNOWN_NODE:
|
||||
return "PCRE_ERROR_UNKNOWN_NODE";
|
||||
#endif
|
||||
case PCRE_ERROR_NOMEMORY:
|
||||
return "PCRE_ERROR_NOMEMORY";
|
||||
case PCRE_ERROR_NOSUBSTRING:
|
||||
return "PCRE_ERROR_NOSUBSTRING";
|
||||
case PCRE_ERROR_MATCHLIMIT:
|
||||
return "PCRE_ERROR_MATCHLIMIT";
|
||||
case PCRE_ERROR_CALLOUT:
|
||||
return "PCRE_ERROR_CALLOUT";
|
||||
case PCRE_ERROR_BADUTF8:
|
||||
return "PCRE_ERROR_BADUTF8";
|
||||
case PCRE_ERROR_BADUTF8_OFFSET:
|
||||
return "PCRE_ERROR_BADUTF8_OFFSET";
|
||||
case PCRE_ERROR_PARTIAL:
|
||||
return "PCRE_ERROR_PARTIAL";
|
||||
case PCRE_ERROR_BADPARTIAL:
|
||||
return "PCRE_ERROR_BADPARTIAL";
|
||||
case PCRE_ERROR_INTERNAL:
|
||||
return "PCRE_ERROR_INTERNAL";
|
||||
case PCRE_ERROR_BADCOUNT:
|
||||
return "PCRE_ERROR_BADCOUNT";
|
||||
#if defined(PCRE_ERROR_RECURSIONLIMIT)
|
||||
case PCRE_ERROR_RECURSIONLIMIT:
|
||||
return "PCRE_ERROR_RECURSIONLIMIT";
|
||||
#endif
|
||||
case PCRE_ERROR_DFA_UITEM:
|
||||
return "PCRE_ERROR_DFA_UITEM";
|
||||
case PCRE_ERROR_DFA_UCOND:
|
||||
return "PCRE_ERROR_DFA_UCOND";
|
||||
case PCRE_ERROR_DFA_UMLIMIT:
|
||||
return "PCRE_ERROR_DFA_UMLIMIT";
|
||||
case PCRE_ERROR_DFA_WSSIZE:
|
||||
return "PCRE_ERROR_DFA_WSSIZE";
|
||||
case PCRE_ERROR_DFA_RECURSE:
|
||||
return "PCRE_ERROR_DFA_RECURSE";
|
||||
default:
|
||||
{
|
||||
ostringstream oss;
|
||||
oss << "Unknown PCRE error (value: " << err << ")";
|
||||
return oss.str();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Store the output stream, the expression map and the two PCRE limits
// (match limit and recursion match limit). Nothing is compiled here.
GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
                         unsigned long int limit,
                         unsigned long int limit_recursion)
    : out(os),
      m_expr(expr),
      matchLimit(limit),
      matchLimitRecursion(limit_recursion) {}
|
||||
|
||||
// Process-wide setup: point PCRE's global callout hook at pcreCallOut, since
// callouts are how matches are collected.
void GroundTruth::global_prep() {
    pcre_callout = pcreCallOut;
}
|
||||
|
||||
static
|
||||
void addCallout(string &re) {
|
||||
// If the string begins with "(*UTF8)" or "(*UTF8)(*UCP)", we want to keep
|
||||
// it at the front. We reuse the control verbs mini-parser for this.
|
||||
size_t startpos = 0;
|
||||
try {
|
||||
ue2::ParseMode mode;
|
||||
const char *ptr = ue2::read_control_verbs(
|
||||
re.c_str(), re.c_str() + re.size(), 0, mode);
|
||||
startpos = ptr - re.c_str();
|
||||
} catch (const ue2::ParseError &err) {
|
||||
// fall through
|
||||
}
|
||||
assert(startpos <= re.length());
|
||||
re.insert(startpos, "(?:");
|
||||
// We include a \E to close any open \Q quoted block. If there isn't
|
||||
// one, pcre will ignore the \E.
|
||||
re.append("\\E)(?C)");
|
||||
}
|
||||
|
||||
/**
 * Compile expression `id` with PCRE and return it with its decoded settings.
 *
 * Unless `no_callouts` is true, the pattern is first wrapped so that a PCRE
 * callout fires at the end of every match (see addCallout()).
 *
 * Throws SoftPcreCompileFailure for constructs PCRE is known not to handle
 * (edit distance, Hamming distance, PCRE error codes 20 and 25) and
 * PcreCompileFailure for every other problem: unknown ID, undecodable flags,
 * other unsupported extended flags, or a PCRE compile/study/fullinfo error.
 */
unique_ptr<CompiledPcre>
GroundTruth::compile(unsigned id, bool no_callouts) {
    bool highlander = false;
    bool prefilter = false;
    bool som = false;

    // we can still match approximate matching patterns with PCRE if edit
    // distance 0 is requested
    if (force_edit_distance && edit_distance) {
        throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
    }

    ExpressionMap::const_iterator i = m_expr.find(id);
    if (i == m_expr.end()) {
        throw PcreCompileFailure("ID not found in expression map.");
    }

    string re(i->second);
    unsigned flags = 0;
    hs_expr_ext ext;

    // Decode the flags
    if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som, &ext)) {
        throw PcreCompileFailure("Unable to decode flags.");
    }

    // filter out flags not supported by PCRE
    u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
                     HS_EXT_FLAG_MIN_LENGTH;
    if (ext.flags & ~supported) {
        // edit distance is a known unsupported flag, so just throw a soft error
        if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) {
            throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
        }
        if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
            throw SoftPcreCompileFailure(
                "Hamming distance not supported by PCRE.");
        }
        throw PcreCompileFailure("Unsupported extended flags.");
    }

    // SOM flags might be set globally.
    som |= !!somFlags;

    // For traditional Hyperscan, add global callout to pattern.
    if (!no_callouts) {
        addCallout(re);
    }

    // Compile the pattern
    const char *errptr = nullptr;
    int errloc = 0;
    int errcode = 0;

    unique_ptr<CompiledPcre> compiled = ue2::make_unique<CompiledPcre>();
    compiled->utf8 = flags & PCRE_UTF8;
    compiled->highlander = highlander;
    compiled->prefilter = prefilter;
    compiled->som = som;
    compiled->min_offset = ext.min_offset;
    compiled->max_offset = ext.max_offset;
    compiled->min_length = ext.min_length;
    compiled->expression = i->second; // original PCRE
    flags |= PCRE_NO_AUTO_POSSESS;

    compiled->bytecode =
        pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);

    if (!compiled->bytecode || errptr) {
        assert(errcode);
        ostringstream oss;
        oss << "Failed to compile expression '" << re << '\'';
        // Never stream a null C string, even if PCRE gave us no message.
        oss << " (" << (errptr ? errptr : "unknown error") << " at " << errloc
            << ").";
        if (errcode == 20) { // "regular expression is too large"
            throw SoftPcreCompileFailure(oss.str());
        } else if (errcode == 25) { // "lookbehind assertion is not fixed length"
            throw SoftPcreCompileFailure(oss.str());
        } else {
            throw PcreCompileFailure(oss.str());
        }
    }

    // Study the pattern. The study data is only needed for the fullinfo query
    // below and is released with pcre_free_study(), the function PCRE provides
    // for freeing pcre_study() results, when `extra` goes out of scope.
    shared_ptr<pcre_extra> extra(pcre_study(compiled->bytecode, 0, &errptr),
                                 pcre_free_study);
    if (errptr) {
        ostringstream oss;
        oss << "Error studying pattern (" << errptr << ").";
        throw PcreCompileFailure(oss.str());
    }

    int infoRes =
        pcre_fullinfo(compiled->bytecode, extra.get(), PCRE_INFO_CAPTURECOUNT,
                      &compiled->captureCount);
    if (infoRes < PCRE_ERROR_NOMATCH) {
        ostringstream oss;
        oss << "Error determining number of capturing subpatterns ("
            << pcreErrStr(infoRes) << ").";
        throw PcreCompileFailure(oss.str());
    }

    return compiled;
}
|
||||
|
||||
static
|
||||
void filterLeftmostSom(ResultSet &rs) {
|
||||
if (rs.matches.size() <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
set<u64a> seen; // End offsets.
|
||||
set<MatchResult>::iterator it = rs.matches.begin();
|
||||
while (it != rs.matches.end()) {
|
||||
if (seen.insert(it->to).second) {
|
||||
++it; // First time we've seen this end-offset.
|
||||
} else {
|
||||
rs.matches.erase(it++); // Dupe with a "righter" SOM.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void filterExtParams(ResultSet &rs, const CompiledPcre &compiled) {
|
||||
set<MatchResult>::iterator it = rs.matches.begin();
|
||||
while (it != rs.matches.end()) {
|
||||
unsigned int from = it->from, to = it->to;
|
||||
unsigned int len = to - from;
|
||||
if (to < compiled.min_offset || to > compiled.max_offset ||
|
||||
len < compiled.min_length) {
|
||||
rs.matches.erase(it++);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
int scanBasic(const CompiledPcre &compiled, const string &buffer,
|
||||
const pcre_extra &extra, vector<int> &ovector,
|
||||
CalloutContext &ctx) {
|
||||
const size_t prefix_len = g_corpora_prefix.size();
|
||||
const size_t suffix_len = g_corpora_suffix.size();
|
||||
|
||||
size_t begin_offset = prefix_len - MIN(50, prefix_len);
|
||||
size_t real_len = buffer.size();
|
||||
|
||||
if (suffix_len > 2) {
|
||||
real_len -= suffix_len - 2;
|
||||
}
|
||||
|
||||
int flags = suffix_len ? PCRE_NOTEOL : 0;
|
||||
int ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), real_len,
|
||||
begin_offset, flags, &ovector[0], ovector.size());
|
||||
|
||||
if (!g_corpora_prefix.empty()) {
|
||||
PcreMatchSet tmp;
|
||||
tmp.swap(ctx.matches);
|
||||
|
||||
for (const auto &m : tmp) {
|
||||
unsigned from = m.first;
|
||||
unsigned to = m.second;
|
||||
if (to >= prefix_len && to <= buffer.size() - suffix_len) {
|
||||
from = from < prefix_len ? 0 : from - prefix_len;
|
||||
to -= prefix_len;
|
||||
ctx.matches.insert(make_pair(from, to));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static
|
||||
int scanOffset(const CompiledPcre &compiled, const string &buffer,
|
||||
const pcre_extra &extra, vector<int> &ovector,
|
||||
CalloutContext &ctx) {
|
||||
size_t offset = MIN(100, g_streamOffset);
|
||||
assert(offset > 0);
|
||||
|
||||
const string buf(string(offset, '\0') + buffer);
|
||||
|
||||
// First, scan our preamble so that we can discard any matches therein
|
||||
// after the real scan, later. We use PCRE_NOTEOL so that end-anchors in
|
||||
// our expression don't match at the end of the preamble.
|
||||
int ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), offset, 0,
|
||||
PCRE_NOTEOL, &ovector[0], ovector.size());
|
||||
if (ret < PCRE_ERROR_NOMATCH) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
PcreMatchSet pre_matches;
|
||||
pre_matches.swap(ctx.matches);
|
||||
|
||||
// Real scan.
|
||||
ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), buf.size(), 0, 0,
|
||||
&ovector[0], ovector.size());
|
||||
if (ret < PCRE_ERROR_NOMATCH) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Erase any matches due entirely to the preamble.
|
||||
for (const auto &m : pre_matches) {
|
||||
ctx.matches.erase(m);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
|
||||
const string &buffer, ResultSet &rs, string &error) {
|
||||
CalloutContext ctx(out);
|
||||
|
||||
pcre_extra extra;
|
||||
extra.flags = 0;
|
||||
|
||||
// Switch on callouts.
|
||||
extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
|
||||
extra.callout_data = &ctx;
|
||||
|
||||
// Set the match_limit (in order to bound execution time on very complex
|
||||
// patterns)
|
||||
extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
|
||||
extra.match_limit = matchLimit;
|
||||
extra.match_limit_recursion = matchLimitRecursion;
|
||||
|
||||
#ifdef PCRE_NO_START_OPTIMIZE
|
||||
// Switch off optimizations that may result in callouts not occurring.
|
||||
extra.flags |= PCRE_NO_START_OPTIMIZE;
|
||||
#endif
|
||||
|
||||
// Ensure there's enough room in the ovector for the capture groups in this
|
||||
// pattern.
|
||||
int ovecsize = (compiled.captureCount + 1) * 3;
|
||||
ovector.resize(ovecsize);
|
||||
|
||||
int ret;
|
||||
switch (colliderMode) {
|
||||
case MODE_BLOCK:
|
||||
case MODE_STREAMING:
|
||||
case MODE_VECTORED:
|
||||
if (g_streamOffset) {
|
||||
ret = scanOffset(compiled, buffer, extra, ovector, ctx);
|
||||
} else {
|
||||
ret = scanBasic(compiled, buffer, extra, ovector, ctx);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
ret = PCRE_ERROR_NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ret < PCRE_ERROR_NOMATCH) {
|
||||
error = pcreErrStr(ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Move matches into a ResultSet.
|
||||
for (const auto &m : ctx.matches) {
|
||||
unsigned long long from = m.first;
|
||||
unsigned long long to = m.second;
|
||||
|
||||
if (g_streamOffset) {
|
||||
// Subtract stream offset imposed by offset test.
|
||||
unsigned long long offset = min(100ull, g_streamOffset);
|
||||
assert(to >= offset);
|
||||
from -= min(offset, from);
|
||||
to -= offset;
|
||||
}
|
||||
|
||||
rs.addMatch(from, to);
|
||||
}
|
||||
|
||||
// If we have no matches, there's no further work to do.
|
||||
if (rs.matches.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (compiled.som) {
|
||||
filterLeftmostSom(rs);
|
||||
}
|
||||
|
||||
filterExtParams(rs, compiled);
|
||||
|
||||
// If we haven't been asked for SOM, strip the from offsets.
|
||||
if (!compiled.som) {
|
||||
set<MatchResult> endonly;
|
||||
for (const auto &m : rs.matches) {
|
||||
endonly.insert(MatchResult(0, m.to));
|
||||
}
|
||||
rs.matches.swap(endonly);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
126
tools/hscollider/GroundTruth.h
Normal file
126
tools/hscollider/GroundTruth.h
Normal file
@ -0,0 +1,126 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef GROUNDTRUTH_H
|
||||
#define GROUNDTRUTH_H
|
||||
|
||||
#include "expressions.h"
|
||||
#include "ResultSet.h"
|
||||
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <pcre.h>
|
||||
|
||||
#include <boost/core/noncopyable.hpp>
|
||||
|
||||
// Exception object thrown by GroundTruth::compile when a PCRE cannot be
// compiled. Carries a human-readable description of the failure.
struct PcreCompileFailure {
    PcreCompileFailure(const std::string &reason) : msg(reason) {}

    // Description of what went wrong.
    std::string msg;
};
|
||||
|
||||
// Thrown in the event of a "soft" PCRE compile failure, one that we don't want
|
||||
// to consider a ue2collider failure (e.g. "regular expression too large").
|
||||
struct SoftPcreCompileFailure : PcreCompileFailure {
|
||||
SoftPcreCompileFailure(const std::string &msg_s)
|
||||
: PcreCompileFailure(msg_s) {}
|
||||
};
|
||||
|
||||
// Struct to store everything about a PCRE. Note that the code assumes that
|
||||
// once populated, the data in this structure will remain constant while tests
|
||||
// are running, except for the bad flag (which is protected by a mutex).
|
||||
class CompiledPcre : boost::noncopyable {
|
||||
public:
|
||||
CompiledPcre() {}
|
||||
~CompiledPcre() {
|
||||
free(bytecode);
|
||||
}
|
||||
|
||||
bool is_bad() {
|
||||
std::lock_guard<std::mutex> lock(bad_mutex);
|
||||
bool val = bad;
|
||||
return val;
|
||||
}
|
||||
|
||||
void mark_bad() {
|
||||
std::lock_guard<std::mutex> lock(bad_mutex);
|
||||
bad = true;
|
||||
}
|
||||
|
||||
std::string expression;
|
||||
pcre *bytecode = nullptr;
|
||||
unsigned long long min_offset = 0;
|
||||
unsigned long long max_offset = ~0ULL;
|
||||
unsigned long long min_length = 0;
|
||||
int captureCount = 0;
|
||||
bool utf8 = false;
|
||||
bool highlander = false;
|
||||
bool prefilter = false;
|
||||
bool som = false;
|
||||
|
||||
private:
|
||||
// If a PCRE has hit its match recursion limit when scanning a corpus, we
|
||||
// mark it as bad and skip the remaining tests for it for performance
|
||||
// reasons.
|
||||
bool bad = false;
|
||||
std::mutex bad_mutex; // serialised accesses to bad flag.
|
||||
};
|
||||
|
||||
// Wrapper around libpcre to generate results for an expression and corpus.
|
||||
class GroundTruth : boost::noncopyable {
|
||||
public:
|
||||
GroundTruth(std::ostream &os, const ExpressionMap &expr,
|
||||
unsigned long limit, unsigned long limit_recursion);
|
||||
|
||||
static void global_prep();
|
||||
|
||||
std::unique_ptr<CompiledPcre> compile(unsigned id,
|
||||
bool no_callouts = false);
|
||||
|
||||
bool run(unsigned id, const CompiledPcre &compiled,
|
||||
const std::string &buffer, ResultSet &rs, std::string &error);
|
||||
|
||||
private:
|
||||
// Output stream.
|
||||
std::ostream &out;
|
||||
|
||||
// Our expression map
|
||||
const ExpressionMap &m_expr;
|
||||
|
||||
// PCRE match limit
|
||||
const unsigned long int matchLimit;
|
||||
const unsigned long int matchLimitRecursion;
|
||||
|
||||
// Persistent ovector used to run tests.
|
||||
std::vector<int> ovector;
|
||||
};
|
||||
|
||||
#endif
|
146
tools/hscollider/NfaGeneratedCorpora.cpp
Normal file
146
tools/hscollider/NfaGeneratedCorpora.cpp
Normal file
@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "ng_corpus_properties.h"
|
||||
#include "ng_corpus_generator.h"
|
||||
#include "NfaGeneratedCorpora.h"
|
||||
#include "ExpressionParser.h"
|
||||
|
||||
#include "grey.h"
|
||||
#include "hs_compile.h"
|
||||
#include "compiler/compiler.h"
|
||||
#include "nfagraph/ng.h"
|
||||
#include "parser/parse_error.h"
|
||||
#include "parser/Parser.h"
|
||||
#include "parser/prefilter.h"
|
||||
#include "parser/unsupported.h"
|
||||
#include "util/compile_context.h"
|
||||
#include "util/compile_error.h"
|
||||
#include "util/report_manager.h"
|
||||
#include "util/target_info.h"
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
using namespace ue2;
|
||||
|
||||
// Stores a reference to the expression map, a copy of the corpus properties
// and the two "force mode" switches; no other work is done here.
NfaGeneratedCorpora::NfaGeneratedCorpora(const ExpressionMap &expr,
                                         const CorpusProperties &props,
                                         bool force_utf8_mode_in,
                                         bool force_prefilter_mode_in)
    : m_expr(expr),
      m_props(props),
      force_utf8_mode(force_utf8_mode_in),
      force_prefilter_mode(force_prefilter_mode_in) {}
|
||||
|
||||
// Returns a new heap-allocated NfaGeneratedCorpora constructed from this
// object's expression map, properties and mode switches. The caller receives
// a raw pointer to it.
NfaGeneratedCorpora *NfaGeneratedCorpora::clone() const {
    NfaGeneratedCorpora *copy = new NfaGeneratedCorpora(
        m_expr, m_props, force_utf8_mode, force_prefilter_mode);
    return copy;
}
|
||||
|
||||
void NfaGeneratedCorpora::generate(unsigned id, vector<Corpus> &data) {
|
||||
ExpressionMap::const_iterator i = m_expr.find(id);
|
||||
if (i == m_expr.end()) {
|
||||
throw CorpusFailure("Expression not found.");
|
||||
}
|
||||
|
||||
string re;
|
||||
u32 hs_flags;
|
||||
hs_expr_ext ext;
|
||||
if (!readExpression(i->second, re, &hs_flags, &ext)) {
|
||||
throw CorpusFailure("Expression could not be read: " + i->second);
|
||||
}
|
||||
|
||||
if (force_utf8_mode) {
|
||||
hs_flags |= HS_FLAG_UTF8;
|
||||
}
|
||||
|
||||
if (force_prefilter_mode) {
|
||||
hs_flags |= HS_FLAG_PREFILTER;
|
||||
}
|
||||
|
||||
// Wrap the UE2 parser and compiler functionality and use it to generate
|
||||
// corpora for us.
|
||||
vector<string> c;
|
||||
|
||||
try {
|
||||
ParsedExpression pe(0, re.c_str(), hs_flags, 0, &ext);
|
||||
|
||||
// Apply prefiltering transformations if desired.
|
||||
if (pe.expr.prefilter) {
|
||||
prefilterTree(pe.component, ParseMode(hs_flags));
|
||||
}
|
||||
|
||||
// Bail on patterns with unsupported constructs.
|
||||
checkUnsupported(*pe.component);
|
||||
pe.component->checkEmbeddedStartAnchor(true);
|
||||
pe.component->checkEmbeddedEndAnchor(true);
|
||||
|
||||
CompileContext cc(false, false, get_current_target(), Grey());
|
||||
ReportManager rm(cc.grey);
|
||||
auto built_expr = buildGraph(rm, cc, pe);
|
||||
if (!built_expr.g) {
|
||||
// A more specific error should probably have been thrown by
|
||||
// buildGraph.
|
||||
throw CorpusFailure("could not build graph.");
|
||||
}
|
||||
|
||||
const auto cg =
|
||||
makeCorpusGenerator(*built_expr.g, built_expr.expr, m_props);
|
||||
cg->generateCorpus(c);
|
||||
}
|
||||
catch (const ParseError &e) {
|
||||
throw CorpusFailure("compilation failed, " + e.reason);
|
||||
}
|
||||
catch (const CompileError &e) {
|
||||
throw CorpusFailure("compilation failed, " + e.reason);
|
||||
}
|
||||
catch (const std::bad_alloc &) {
|
||||
throw CorpusFailure("out of memory.");
|
||||
}
|
||||
catch (const CorpusGenerationFailure &e) {
|
||||
// if corpus generation failed, just pass up the error message
|
||||
throw CorpusFailure("corpus generation failed: " + e.message);
|
||||
}
|
||||
catch (...) {
|
||||
throw CorpusFailure("unknown error.");
|
||||
}
|
||||
|
||||
if (c.empty()) {
|
||||
throw CorpusFailure("no corpora generated.");
|
||||
}
|
||||
|
||||
data.reserve(data.size() + c.size());
|
||||
for (const auto &e : c) {
|
||||
data.push_back(Corpus(e));
|
||||
}
|
||||
}
|
61
tools/hscollider/NfaGeneratedCorpora.h
Normal file
61
tools/hscollider/NfaGeneratedCorpora.h
Normal file
@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef NFAGENERATEDCORPORA_H
|
||||
#define NFAGENERATEDCORPORA_H
|
||||
|
||||
#include "Corpora.h"
|
||||
#include "ng_corpus_properties.h"
|
||||
#include "expressions.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Corpora associated with a pattern set
|
||||
class NfaGeneratedCorpora : public CorporaSource {
|
||||
public:
|
||||
NfaGeneratedCorpora(const ExpressionMap &expr,
|
||||
const CorpusProperties &props, bool force_utf8_mode_in,
|
||||
bool force_prefilter_mode_in);
|
||||
|
||||
NfaGeneratedCorpora *clone() const override;
|
||||
|
||||
void generate(unsigned id, std::vector<Corpus> &data) override;
|
||||
|
||||
private:
|
||||
// Expressions handled by this corpora object
|
||||
const ExpressionMap &m_expr;
|
||||
|
||||
// CorpusProperties policy object
|
||||
CorpusProperties m_props;
|
||||
|
||||
bool force_utf8_mode;
|
||||
bool force_prefilter_mode;
|
||||
};
|
||||
|
||||
#endif
|
139
tools/hscollider/ResultSet.h
Normal file
139
tools/hscollider/ResultSet.h
Normal file
@ -0,0 +1,139 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef RESULTSET_H
|
||||
#define RESULTSET_H
|
||||
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
// One match, described by its start ("from") and end ("to") offsets.
class MatchResult {
public:
    MatchResult(unsigned long long start, unsigned long long end)
        : from(start), to(end) {}

    // Orders matches by start offset first, then by end offset.
    bool operator<(const MatchResult &rhs) const {
        if (from == rhs.from) {
            return to < rhs.to;
        }
        return from < rhs.from;
    }

    // Two matches are equal when both offsets agree.
    bool operator==(const MatchResult &rhs) const {
        return !(from != rhs.from || to != rhs.to);
    }

    unsigned long long from;
    unsigned long long to;
};
|
||||
|
||||
// Identifies which engine a set of results was produced by.
enum ResultSource {
    RESULT_FROM_UE2,
    RESULT_FROM_PCRE,
    RESULT_FROM_GRAPH,
};

// Writes a short name for the source to the stream; a value that is not one
// of the enumerators writes nothing.
inline
std::ostream &operator<<(std::ostream &out, ResultSource src) {
    switch (src) {
    case RESULT_FROM_UE2:
        return out << "UE2";
    case RESULT_FROM_PCRE:
        return out << "PCRE";
    case RESULT_FROM_GRAPH:
        return out << "Graph";
    }
    return out;
}
|
||||
|
||||
class ResultSet {
|
||||
public:
|
||||
// Constructor.
|
||||
explicit ResultSet(ResultSource s) : src(s) {}
|
||||
|
||||
// Can be constructed with a set of end-offsets.
|
||||
ResultSet(const std::set<unsigned int> &m, ResultSource s) : src(s) {
|
||||
for (const auto &offset : m) {
|
||||
matches.emplace(0, offset);
|
||||
}
|
||||
}
|
||||
|
||||
// Equality.
|
||||
bool operator==(const ResultSet &other) const {
|
||||
return uoom == other.uoom &&
|
||||
match_after_halt == other.match_after_halt &&
|
||||
invalid_id == other.invalid_id &&
|
||||
matches == other.matches;
|
||||
}
|
||||
|
||||
// Inequality.
|
||||
bool operator!=(const ResultSet &other) const { return !(*this == other); }
|
||||
|
||||
// Add a match.
|
||||
void addMatch(unsigned long long from, unsigned long long to,
|
||||
int block = 0) {
|
||||
MatchResult m(from, to);
|
||||
matches.insert(m);
|
||||
|
||||
if (matches_by_block[block].find(m) != matches_by_block[block].end()) {
|
||||
dupe_matches.insert(m);
|
||||
} else {
|
||||
matches_by_block[block].insert(m);
|
||||
}
|
||||
}
|
||||
|
||||
// Unexpected out of order match seen.
|
||||
bool uoom = false;
|
||||
|
||||
// A match was received after termination was requested.
|
||||
bool match_after_halt = false;
|
||||
|
||||
// A match from an invalid ID was seen.
|
||||
bool invalid_id = false;
|
||||
|
||||
// Ordered set of matches.
|
||||
std::set<MatchResult> matches;
|
||||
|
||||
// Matches grouped by stream write/block that we see them in.
|
||||
std::map<int, std::set<MatchResult>> matches_by_block;
|
||||
|
||||
// Dupe matches that we have seen.
|
||||
std::set<MatchResult> dupe_matches;
|
||||
|
||||
/* Where these results came from (does not take part in comparisions) */
|
||||
ResultSource src;
|
||||
};
|
||||
|
||||
#endif
|
95
tools/hscollider/Thread.cpp
Normal file
95
tools/hscollider/Thread.cpp
Normal file
@ -0,0 +1,95 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "Thread.h"
|
||||
#include "common.h"
|
||||
#include "sig.h"
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
static const size_t COLLIDER_THREAD_STACK_SIZE = 8192 * 1024;
|
||||
|
||||
void Thread::start() {
|
||||
// Some systems, notably Mac OS X, use a default stack size that is
|
||||
// smaller than what we want (particularly given that we're planning on
|
||||
// running PCRE, which recurses inside pcre_exec). We attempt to
|
||||
// increase it to 8MB.
|
||||
int ret;
|
||||
pthread_attr_t attr;
|
||||
ret = pthread_attr_init(&attr);
|
||||
if (ret) {
|
||||
std::cerr << "pthread_attr_init failed" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
size_t stacksize = 0;
|
||||
ret = pthread_attr_getstacksize(&attr, &stacksize);
|
||||
if (ret) {
|
||||
std::cerr << "Warning: can't query stack size with "
|
||||
"pthread_attr_getstacksize" << std::endl;
|
||||
goto create_thread;
|
||||
}
|
||||
|
||||
if (stacksize < COLLIDER_THREAD_STACK_SIZE) {
|
||||
ret = pthread_attr_setstacksize(&attr, COLLIDER_THREAD_STACK_SIZE);
|
||||
if (ret) {
|
||||
std::cerr << "Warning: pthread_attr_setstacksize failed, "
|
||||
"unable to set stack size to "
|
||||
<< COLLIDER_THREAD_STACK_SIZE << " bytes." << std::endl;
|
||||
// Fall through: this isn't necessarily fatal (yet!)
|
||||
}
|
||||
}
|
||||
|
||||
create_thread:
|
||||
ret = pthread_create(&thread, &attr, &runThread, this);
|
||||
if (ret) {
|
||||
std::cerr << "pthread_create failed for thread id " << thread_id
|
||||
<< std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Dispatch
|
||||
void *Thread::runThread(void *thr) {
|
||||
if (!no_signal_handler) {
|
||||
setSignalStack();
|
||||
}
|
||||
((Thread *)thr)->run();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Waits for the underlying pthread to finish; its return value is discarded.
void Thread::join() {
    pthread_join(thread, nullptr);
}
|
||||
|
||||
// Records the numeric ID of this thread; the pthread itself is not created
// until start() is called.
Thread::Thread(size_t num)
    : thread_id(num) {
}
|
||||
|
||||
// Nothing to release here; the destructor does not join the thread.
Thread::~Thread() {
}
|
60
tools/hscollider/Thread.h
Normal file
60
tools/hscollider/Thread.h
Normal file
@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef UE2COLLIDER_THREAD_H
|
||||
#define UE2COLLIDER_THREAD_H
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
#include <boost/core/noncopyable.hpp>
|
||||
|
||||
// Abstract, non-copyable base class for a worker thread. It stores a numeric
// id and a pthread_t handle; subclasses supply the work by implementing run().
class Thread : boost::noncopyable {
|
||||
public:
|
||||
// num is the caller-chosen thread number (presumably stored in thread_id --
// confirm in the implementation file).
explicit Thread(size_t num);
|
||||
virtual ~Thread();
|
||||
|
||||
// NOTE(review): presumably creates the pthread with runThread() as its entry
// point -- confirm in the implementation file.
virtual void start();
|
||||
|
||||
// Dispatch
// Static function with the void *(*)(void *) shape that pthread_create()
// expects; thr is presumably the Thread instance to run -- TODO confirm.
|
||||
static void *runThread(void *thr);
|
||||
|
||||
// NOTE(review): presumably waits for the thread started by start() to finish.
virtual void join();
|
||||
|
||||
// Implemented by subclasses.
|
||||
virtual void run() = 0;
|
||||
|
||||
protected:
|
||||
// Numeric identifier for this thread; visible to subclasses, never modified.
const size_t thread_id;
|
||||
|
||||
private:
|
||||
// Underlying pthread handle.
pthread_t thread;
|
||||
};
|
||||
|
||||
#endif // UE2COLLIDER_THREAD_H
|
1026
tools/hscollider/UltimateTruth.cpp
Normal file
1026
tools/hscollider/UltimateTruth.cpp
Normal file
File diff suppressed because it is too large
Load Diff
142
tools/hscollider/UltimateTruth.h
Normal file
142
tools/hscollider/UltimateTruth.h
Normal file
@ -0,0 +1,142 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef ULTIMATETRUTH_H
|
||||
#define ULTIMATETRUTH_H
|
||||
|
||||
#include "expressions.h"
|
||||
|
||||
#include "hs.h"
|
||||
|
||||
#include <memory>
|
||||
#include <ostream>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/core/noncopyable.hpp>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
struct Grey;
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
class HyperscanDB;
|
||||
class ResultSet;
|
||||
|
||||
// Wrapper around ue2 to generate results for an expression and corpus.
// Non-copyable. Holds references to the expression map, grey box settings and
// output stream it is constructed with, plus a Hyperscan scratch pointer and
// reusable scan buffers.
|
||||
class UltimateTruth : boost::noncopyable {
|
||||
public:
|
||||
// os: output stream; expr: expression map; plat: target platform info;
// grey: grey box settings; streamBlocks: block count, defaulting to 0.
// NOTE(review): a non-null plat presumably means cross-compiling (see
// m_xcompile) -- confirm in UltimateTruth.cpp.
UltimateTruth(std::ostream &os, const ExpressionMap &expr,
|
||||
const hs_platform_info *plat, const ue2::Grey &grey,
|
||||
unsigned streamBlocks = 0);
|
||||
|
||||
~UltimateTruth();
|
||||
|
||||
// Builds a database for the given expression ids. error is an out-parameter;
// presumably it is filled in when compilation fails -- TODO confirm.
std::shared_ptr<HyperscanDB> compile(const std::set<unsigned> &ids,
|
||||
std::string &error) const;
|
||||
|
||||
// Writes db to filename; the bool result presumably reports success.
bool saveDatabase(const HyperscanDB &db,
|
||||
const std::string &filename) const;
|
||||
|
||||
// Counterpart of saveDatabase(): loads a database for ids from filename.
std::shared_ptr<HyperscanDB>
|
||||
loadDatabase(const std::string &filename,
|
||||
const std::set<unsigned> &ids) const;
|
||||
|
||||
// Are we runnable? (i.e. not xcompiling)
|
||||
bool runnable() const {
|
||||
return !m_xcompile;
|
||||
}
|
||||
|
||||
// Scans buffer against db for expression id at the given alignment and
// records results in rs. NOTE(review): meaning of the bool result and of
// single_pattern should be confirmed in UltimateTruth.cpp.
bool run(unsigned id, std::shared_ptr<const HyperscanDB> db,
|
||||
const std::string &buffer, bool single_pattern, unsigned align,
|
||||
ResultSet &rs);
|
||||
|
||||
// Returns a value completely representing this object's compile options.
|
||||
unsigned int describe() const;
|
||||
|
||||
// File name to use for a serialized database covering the given ids.
std::string dbFilename(const std::set<unsigned int> &ids) const;
|
||||
|
||||
private:
|
||||
// The three scan helpers below share one signature; presumably run()
// dispatches to one of them according to the scanning mode -- TODO confirm.
bool blockScan(const HyperscanDB &db, const std::string &buffer,
|
||||
size_t align, match_event_handler callback, void *ctx,
|
||||
ResultSet *rs);
|
||||
bool streamingScan(const HyperscanDB &db, const std::string &buffer,
|
||||
size_t align, match_event_handler callback, void *ctx,
|
||||
ResultSet *rs);
|
||||
bool vectoredScan(const HyperscanDB &db, const std::string &buffer,
|
||||
size_t align, match_event_handler callback, void *ctx,
|
||||
ResultSet *rs);
|
||||
|
||||
// Buffer preparation helpers; presumably they copy buf into m_scanBuf /
// raw_blocks at the requested alignment -- confirm in UltimateTruth.cpp.
char *setupScanBuffer(const char *buf, size_t len, size_t align);
|
||||
|
||||
char *setupVecScanBuffer(const char *buf, size_t len, size_t align,
|
||||
unsigned int block_id);
|
||||
|
||||
bool allocScratch(std::shared_ptr<const HyperscanDB> db);
|
||||
|
||||
bool cloneScratch(void);
|
||||
|
||||
std::string dbSettingsHash(const std::set<unsigned int> &ids) const;
|
||||
|
||||
// Grey box settings, held by reference.
const ue2::Grey &grey;
|
||||
|
||||
// Output stream.
|
||||
std::ostream &out;
|
||||
|
||||
// Our expression map
|
||||
const ExpressionMap &m_expr;
|
||||
|
||||
// Are we cross-compiling, and therefore unable to scan at all?
|
||||
bool m_xcompile;
|
||||
|
||||
// Our mode flags to pass into the compiler: calculated from streaming,
|
||||
// etc.
|
||||
unsigned m_mode;
|
||||
|
||||
// In streaming mode, what is the number of blocks to chop data into?
|
||||
unsigned m_streamBlocks;
|
||||
|
||||
// Scratch space for Hyperscan.
|
||||
hs_scratch_t *scratch;
|
||||
|
||||
// Temporary scan buffer used for realigned scanning
|
||||
std::vector<char> m_scanBuf;
|
||||
|
||||
std::vector<std::vector<char> > raw_blocks; /* temp scan buffers used by
|
||||
* vectored mode */
|
||||
|
||||
// Last database we successfully allocated scratch for, so that we can
|
||||
// avoid unnecessarily reallocating for it.
|
||||
std::shared_ptr<const HyperscanDB> last_db;
|
||||
|
||||
// Platform info pointer (type matches the constructor's plat argument).
const hs_platform_info *platform;
|
||||
};
|
||||
|
||||
#endif
|
570
tools/hscollider/args.cpp
Normal file
570
tools/hscollider/args.cpp
Normal file
@ -0,0 +1,570 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "ng_corpus_properties.h"
|
||||
#include "args.h"
|
||||
#include "common.h"
|
||||
#include "cross_compile.h"
|
||||
#include "util/expression_path.h"
|
||||
#include "util/string_util.h"
|
||||
|
||||
#include "grey.h"
|
||||
#include "ue2common.h"
|
||||
#include "hs_compile.h" // for HS_MODE_*
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <getopt.h>
|
||||
|
||||
#define xstr(s) str(s)
|
||||
#define str(s) #s
|
||||
|
||||
using namespace ue2;
|
||||
using namespace std;
|
||||
|
||||
// display usage information, with an optional error
|
||||
static
|
||||
void usage(const char *name, const char *error) {
|
||||
printf("Usage: %s [OPTIONS...]\n\n", name);
|
||||
printf("General Options:\n\n");
|
||||
printf(" -h Display help and exit.\n");
|
||||
printf(" -G OVERRIDES Overrides for the grey box.\n");
|
||||
printf(" -e PATH Path to expression directory or file.\n");
|
||||
printf(" -s FILE Signature file to use.\n");
|
||||
printf(" -z NUM Signature ID to use.\n");
|
||||
printf(" -c FILE Load corpora from FILE rather than using "
|
||||
"generator.\n");
|
||||
printf(" -w FILE After running, save corpora (with matches) to "
|
||||
"FILE.\n");
|
||||
printf(" -a [BAND] Compile all expressions in UE2 (but still match "
|
||||
"singly).\n");
|
||||
printf(" If BAND, compile patterns in groups of size "
|
||||
"BAND.\n");
|
||||
printf(" -t NUM Use streaming mode, split data into ~NUM "
|
||||
"blocks.\n");
|
||||
printf(" -V NUM Use vectored mode, split data into ~NUM "
|
||||
"blocks.\n");
|
||||
printf(" -Z {R or 0-%d} Only test one alignment, either as given or "
|
||||
"'R' for random.\n", MAX_MAX_UE2_ALIGN - 1);
|
||||
printf(" -q Quiet; display only match differences, no other "
|
||||
"failures.\n");
|
||||
printf(" -v Verbose; display successes as well as "
|
||||
"failures.\n");
|
||||
printf("\n");
|
||||
printf("Pattern flags:\n");
|
||||
printf("\n");
|
||||
printf(" -8 Force UTF8 mode on all patterns.\n");
|
||||
printf(" -L Apply HS_FLAG_SOM_LEFTMOST to all patterns.\n");
|
||||
printf(" -E DISTANCE Match all patterns within edit distance"
|
||||
" DISTANCE.\n");
|
||||
printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n");
|
||||
printf("\n");
|
||||
printf("Testing mode options:\n");
|
||||
printf("\n");
|
||||
printf(" -d NUM Set SOM precision mode (default: 8 (large)).\n");
|
||||
printf(" -O NUM In streaming mode, set initial offset to NUM.\n");
|
||||
printf(" -k NUM Terminate callback after NUM matches per "
|
||||
"pattern.\n");
|
||||
printf(" --copy-scratch Copy scratch after each scan call.\n");
|
||||
printf(" --copy-stream Copy stream state after each scan call.\n");
|
||||
printf(" --compress-expand Compress and expand stream state after each "
|
||||
"scan call.\n");
|
||||
printf(" --compress-reset-expand Compress, reset and expand stream state "
|
||||
"after each scan call.\n");
|
||||
printf(" --mangle-scratch Mangle scratch space after each scan call.\n");
|
||||
printf(" --no-nfa Disable NFA graph execution engine.\n");
|
||||
printf(" --no-pcre Disable PCRE engine.\n");
|
||||
printf(" --test-nfa Disable UE2 engine (test NFA against PCRE).\n");
|
||||
printf(" --abort-on-fail Abort, rather than exit, on failure.\n");
|
||||
printf(" --no-signal-handler Do not handle handle signals (to generate "
|
||||
"backtraces).\n");
|
||||
printf("\n");
|
||||
printf("Memory and resource control options:\n");
|
||||
printf("\n");
|
||||
printf(" -T NUM Run with NUM threads.\n");
|
||||
printf(" -M NUM Set maximum memory allocated to NUM megabytes per"
|
||||
" thread.\n");
|
||||
printf(" (0 means no limit, default is 1000 MB).\n");
|
||||
printf(" -m NUM Set PCRE_MATCH_LIMIT (default: %lu).\n",
|
||||
DEFAULT_PCRE_MATCH_LIMIT);
|
||||
printf(" -r NUM Set PCRE_MATCH_LIMIT_RECURSION (default: %lu).\n",
|
||||
DEFAULT_PCRE_MATCH_RECURSION_LIMIT);
|
||||
printf("\n");
|
||||
printf("Cross-compiling:\n");
|
||||
printf("\n");
|
||||
printf(" -x NAME Cross-compile for arch NAME.\n");
|
||||
printf(" -i DIR Don't compile, load from files in DIR "
|
||||
"instead.\n");
|
||||
printf(" -o DIR After compiling, save to files in DIR.\n");
|
||||
printf("\n");
|
||||
printf("Corpus generation options:\n");
|
||||
printf("\n");
|
||||
printf(" -n NUM Max corpora to generate for a given signature "
|
||||
"(default: %u).\n", DEFAULT_CORPUS_GENERATOR_LIMIT);
|
||||
printf(" -R NUM Random seed to use (default: seeded from "
|
||||
"time()).\n");
|
||||
printf(" -p NUM,NUM,NUM Percentage probabilities of "
|
||||
"(match,unmatch,random) char.\n");
|
||||
printf(" -C NUM,NUM Follow cycles (min,max) times.\n");
|
||||
printf(" -P NUM,NUM Add a random prefix of length between "
|
||||
"(min,max).\n");
|
||||
printf(" -S NUM,NUM Add a random suffix of length between "
|
||||
"(min,max).\n");
|
||||
printf(" -D NUM Apply an edit distance (default: 0) to each "
|
||||
"corpus.\n");
|
||||
printf(" -b NUM Limit alphabet to NUM characters, starting at "
|
||||
"lower-case 'a'.\n");
|
||||
printf("\n");
|
||||
|
||||
if (error) {
|
||||
printf("Error: %s\n", error);
|
||||
}
|
||||
}
|
||||
|
||||
void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
|
||||
vector<string> *corpora, UNUSED Grey *grey,
|
||||
unique_ptr<hs_platform_info> *plat_out) {
|
||||
static const char options[]
|
||||
= "-ab:cC:d:D:e:E:G:hi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8";
|
||||
s32 in_multi = 0;
|
||||
s32 in_corpora = 0;
|
||||
int pcreFlag = 1;
|
||||
int nfaFlag = 1;
|
||||
int ue2Flag = 1;
|
||||
int copyScratch = 0;
|
||||
int copyStream = 0;
|
||||
int mangleScratch = 0;
|
||||
int compressFlag = 0;
|
||||
int compressResetFlag = 0;
|
||||
static const struct option longopts[] = {
|
||||
{"copy-scratch", 0, ©Scratch, 1},
|
||||
{"copy-stream", 0, ©Stream, 1},
|
||||
{"mangle-scratch", 0, &mangleScratch, 1},
|
||||
{"prefilter", 0, &force_prefilter, 1},
|
||||
{"no-pcre", 0, &pcreFlag, 0},
|
||||
{"no-nfa", 0, &nfaFlag, 0},
|
||||
{"test-nfa", 0, &ue2Flag, 0},
|
||||
{"abort-on-fail", 0, &abort_on_failure, 1},
|
||||
{"no-signal-handler", 0, &no_signal_handler, 1},
|
||||
{"compress-expand", 0, &compressFlag, 1},
|
||||
{"compress-reset-expand", 0, &compressResetFlag, 1},
|
||||
{nullptr, 0, nullptr, 0}};
|
||||
|
||||
for (;;) {
|
||||
int c = getopt_long(argc, argv, options, longopts, nullptr);
|
||||
if (c < 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
switch (c) {
|
||||
case 'a':
|
||||
g_ue2CompileAll = true;
|
||||
in_multi = 2;
|
||||
break;
|
||||
case 'b': {
|
||||
unsigned sz;
|
||||
if (!fromString(optarg, sz) || sz > 256) {
|
||||
usage(argv[0], "Must provide an integer argument <= 256"
|
||||
"to '-b' flag");
|
||||
exit(1);
|
||||
}
|
||||
corpus_gen_prop.alphabetSize = sz;
|
||||
break;
|
||||
}
|
||||
case 'c':
|
||||
in_corpora = 2;
|
||||
break;
|
||||
case 'C': {
|
||||
vector<unsigned> nums;
|
||||
if (!strToList(optarg, nums) || nums.size() != 2
|
||||
|| nums[0] > nums[1]) {
|
||||
usage(argv[0], "Cycle limit '-C' argument takes a list of "
|
||||
" integers: MIN,MAX");
|
||||
exit(1);
|
||||
}
|
||||
corpus_gen_prop.setCycleLimit(nums[0], nums[1]);
|
||||
break;
|
||||
}
|
||||
case 'd': {
|
||||
unsigned dist;
|
||||
if (!fromString(optarg, dist)) {
|
||||
usage(argv[0],
|
||||
"Must provide an integer argument to '-d' flag");
|
||||
exit(1);
|
||||
}
|
||||
switch (dist) {
|
||||
case 2:
|
||||
somPrecisionMode = HS_MODE_SOM_HORIZON_SMALL;
|
||||
break;
|
||||
case 4:
|
||||
somPrecisionMode = HS_MODE_SOM_HORIZON_MEDIUM;
|
||||
break;
|
||||
case 8:
|
||||
somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
|
||||
break;
|
||||
default:
|
||||
usage(argv[0], "SOM precision must be 2, 4 or 8");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 'D': {
|
||||
unsigned dist;
|
||||
if (!fromString(optarg, dist)) {
|
||||
usage(argv[0],
|
||||
"Must provide an integer argument to '-D' flag");
|
||||
exit(1);
|
||||
}
|
||||
corpus_gen_prop.editDistance = dist;
|
||||
break;
|
||||
}
|
||||
case 'e':
|
||||
g_exprPath.assign(optarg);
|
||||
break;
|
||||
case 'E': {
|
||||
u32 dist;
|
||||
if (!fromString(optarg, dist)) {
|
||||
usage(argv[0], "Argument to '-E' flag must be an integer");
|
||||
exit(1);
|
||||
}
|
||||
force_edit_distance = true;
|
||||
edit_distance = dist;
|
||||
break;
|
||||
}
|
||||
#ifndef RELEASE_BUILD
|
||||
case 'G':
|
||||
applyGreyOverrides(grey, string(optarg));
|
||||
break;
|
||||
#endif
|
||||
case 'h':
|
||||
usage(argv[0], nullptr);
|
||||
exit(0);
|
||||
case 'i':
|
||||
loadDatabases = true;
|
||||
serializePath = optarg;
|
||||
break;
|
||||
case 'k':
|
||||
if (!fromString(optarg, limit_matches) || limit_matches < 1) {
|
||||
usage(argv[0],
|
||||
"Must provide a positive integer argument to '-k' "
|
||||
"flag");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
case 'L':
|
||||
somFlags = HS_FLAG_SOM_LEFTMOST;
|
||||
break;
|
||||
case 'm':
|
||||
if (!fromString(optarg, g_matchLimit) || g_matchLimit < 1) {
|
||||
usage(argv[0],
|
||||
"Must provide a positive integer argument to '-m' "
|
||||
"flag");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
case 'M':
|
||||
if (!fromString(optarg, g_memoryLimit)) {
|
||||
usage(argv[0],
|
||||
"Must provide a positive (or zero) integer argument "
|
||||
"to '-M' flag");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
case 'n': {
|
||||
unsigned int count;
|
||||
if (!fromString(optarg, count)) {
|
||||
usage(argv[0], "Argument to '-n' flag must be an integer");
|
||||
exit(1);
|
||||
}
|
||||
corpus_gen_prop.corpusLimit = count;
|
||||
break;
|
||||
}
|
||||
case 'o':
|
||||
saveDatabases = true;
|
||||
serializePath = optarg;
|
||||
break;
|
||||
case 'O':
|
||||
if (!fromString(optarg, g_streamOffset)) {
|
||||
usage(argv[0],
|
||||
"Argument '-O' flag must be a positive integer");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
case 'p': {
|
||||
vector<unsigned> prob;
|
||||
if (!strToList(optarg, prob) || prob.size() != 3) {
|
||||
usage(argv[0], "Probabilities '-p' argument takes a list "
|
||||
"of three integers: MATCH,UNMATCH,RANDOM");
|
||||
exit(1);
|
||||
}
|
||||
if (!corpus_gen_prop.setPercentages(prob[0], prob[1],
|
||||
prob[2])) {
|
||||
usage(argv[0],
|
||||
"Unable to set corpus generator probabilities.");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 'P': {
|
||||
vector<unsigned> nums;
|
||||
if (!strToList(optarg, nums) || nums.size() != 2
|
||||
|| nums[0] > nums[1]) {
|
||||
usage(argv[0], "Prefix '-P' argument takes a list of two"
|
||||
" integers: MIN,MAX");
|
||||
exit(1);
|
||||
}
|
||||
corpus_gen_prop.prefixRange = min_max(nums[0], nums[1]);
|
||||
break;
|
||||
}
|
||||
case 'q':
|
||||
g_quiet++;
|
||||
break;
|
||||
case 'r':
|
||||
if (!fromString(optarg, g_matchLimitRecursion)
|
||||
|| g_matchLimitRecursion < 1) {
|
||||
usage(argv[0], "Must provide a positive integer argument "
|
||||
"to '-r' flag");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
case 'R': {
|
||||
if (!fromString(optarg, randomSeed)) {
|
||||
usage(argv[0], "Argument to '-R' flag must be an integer");
|
||||
exit(1);
|
||||
}
|
||||
corpus_gen_prop.seed(randomSeed);
|
||||
break;
|
||||
}
|
||||
case 's':
|
||||
g_signatureFiles.push_back(optarg);
|
||||
break;
|
||||
case 'S': {
|
||||
vector<unsigned> nums;
|
||||
if (!strToList(optarg, nums) || nums.size() != 2 ||
|
||||
nums[0] > nums[1]) {
|
||||
usage(argv[0], "Suffix '-S' argument takes a list of two"
|
||||
" integers: MIN,MAX");
|
||||
exit(1);
|
||||
}
|
||||
corpus_gen_prop.suffixRange = min_max(nums[0], nums[1]);
|
||||
break;
|
||||
}
|
||||
case 't':
|
||||
if (colliderMode != MODE_BLOCK) {
|
||||
usage(argv[0], "You can only use one mode at a time!");
|
||||
exit(1);
|
||||
}
|
||||
colliderMode = MODE_STREAMING;
|
||||
if (!fromString(optarg, g_streamBlocks) || g_streamBlocks < 1) {
|
||||
usage(argv[0], "Must provide a positive integer argument "
|
||||
"to '-t' flag");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
case 'T':
|
||||
if (!fromString(optarg, numThreads) || numThreads < 1) {
|
||||
usage(argv[0], "Must provide a positive integer argument "
|
||||
"to '-T' flag");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
case 'v':
|
||||
if (g_verbose) {
|
||||
echo_matches = true;
|
||||
}
|
||||
g_verbose = true;
|
||||
break;
|
||||
case 'V':
|
||||
if (colliderMode != MODE_BLOCK) {
|
||||
usage(argv[0], "You can only use one mode at a time!");
|
||||
exit(1);
|
||||
}
|
||||
colliderMode = MODE_VECTORED;
|
||||
if (!fromString(optarg, g_streamBlocks) || g_streamBlocks < 1) {
|
||||
usage(argv[0], "Must provide a positive integer argument "
|
||||
"to '-t' flag");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
case 'w':
|
||||
saveCorpora = true;
|
||||
saveCorporaFile = optarg;
|
||||
break;
|
||||
case 'x':
|
||||
*plat_out = xcompileReadMode(optarg);
|
||||
if (!*plat_out) {
|
||||
usage(argv[0], xcompileUsage().c_str());
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
case 'X': {
|
||||
u32 count;
|
||||
if (!fromString(optarg, count)) {
|
||||
usage(argv[0], "Argument to '-X' flag must be an integer");
|
||||
exit(1);
|
||||
}
|
||||
g_corpora_prefix.insert(g_corpora_prefix.end(), count, '~');
|
||||
break;
|
||||
}
|
||||
case 'Y':
|
||||
{
|
||||
u32 count;
|
||||
if (!fromString(optarg, count)) {
|
||||
usage(argv[0], "Argument to '-Y' flag must be an integer");
|
||||
exit(1);
|
||||
}
|
||||
g_corpora_suffix.insert(g_corpora_suffix.end(), count, '~');
|
||||
break;
|
||||
}
|
||||
case 'z':
|
||||
if (!strToList(optarg, g_signatures)) {
|
||||
usage(argv[0],
|
||||
"Argument to '-z' flag must be a list of integers");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
case 'Z':
|
||||
static constexpr unsigned ALIGN_LIMIT = MAX_MAX_UE2_ALIGN - 1;
|
||||
if (optarg == string("R")) {
|
||||
// Random min alignment selected.
|
||||
use_random_alignment = true;
|
||||
break;
|
||||
} else if (!fromString(optarg, min_ue2_align)
|
||||
|| min_ue2_align > ALIGN_LIMIT) {
|
||||
usage(argv[0], "Argument must be 'R' or numeric < "
|
||||
xstr(MAX_MAX_UE2_ALIGN) " to '-Z'");
|
||||
exit(1);
|
||||
}
|
||||
max_ue2_align = min_ue2_align + 1;
|
||||
break;
|
||||
case '8':
|
||||
force_utf8 = true;
|
||||
break;
|
||||
case 1:
|
||||
if (in_multi) {
|
||||
if (!fromString(optarg, multicompile_bands)) {
|
||||
usage(argv[0],
|
||||
"Argument to '-a' flag must be an integer");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
} else if (in_corpora) {
|
||||
corpora->push_back(optarg);
|
||||
in_corpora = 2;
|
||||
break;
|
||||
}
|
||||
case 0:
|
||||
break;
|
||||
default:
|
||||
usage(argv[0], "Unrecognised command line argument.");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
in_multi = MAX(0, in_multi - 1);
|
||||
in_corpora = MAX(0, in_corpora - 1);
|
||||
}
|
||||
|
||||
if (g_streamOffset && !g_streamBlocks) {
|
||||
usage(argv[0], "stream offset requires streams");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (g_exprPath.empty() && !g_signatureFiles.empty()) {
|
||||
/* attempt to infer an expression directory */
|
||||
for (const auto &fname : g_signatureFiles) {
|
||||
string exprPath = inferExpressionPath(fname);
|
||||
if (!g_exprPath.empty() && exprPath != g_exprPath) {
|
||||
usage(argv[0], "Only one expression path is allowed.");
|
||||
}
|
||||
g_exprPath.assign(exprPath);
|
||||
}
|
||||
}
|
||||
|
||||
// Must have a valid expression path
|
||||
if (g_exprPath.empty()) {
|
||||
usage(argv[0], "Must specify an expression path with the -e option.");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// If we've been handed an expr file and no restrictions, use 'em all!
|
||||
if (!isDir(g_exprPath) && isFile(g_exprPath) && g_signatureFiles.empty()
|
||||
&& g_signatures.empty()) {
|
||||
g_allSignatures = true;
|
||||
}
|
||||
|
||||
// Must have a valid signature file
|
||||
if (g_signatureFiles.empty() && g_signatures.empty() && !g_allSignatures) {
|
||||
usage(argv[0], "Must specify a signature file with the -s option.");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Cannot ask for both loading and saving
|
||||
if (loadDatabases && saveDatabases) {
|
||||
usage(argv[0], "You cannot both load and save databases.");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Cannot ask for cross-compile and loading
|
||||
if (loadDatabases && *plat_out) {
|
||||
usage(argv[0], "You cannot both load and xcompile of databases.");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// need at least two pattern engines active
|
||||
if (nfaFlag + pcreFlag + ue2Flag < 2) {
|
||||
usage(argv[0], "At least two pattern engines should be active.");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (copyStream && !g_streamBlocks) {
|
||||
usage(argv[0], "Copying streams only makes sense in streaming mode.");
|
||||
exit(1);
|
||||
}
|
||||
if (compressFlag && compressResetFlag) {
|
||||
usage(argv[0],
|
||||
"Only use one of --compress-expand and --compress-reset-expand.");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// set booleans appropriately
|
||||
use_NFA = (bool) nfaFlag;
|
||||
use_PCRE = (bool) pcreFlag;
|
||||
use_UE2 = (bool) ue2Flag;
|
||||
use_copy_scratch = (bool) copyScratch;
|
||||
use_copy_stream = (bool) copyStream;
|
||||
use_mangle_scratch = (bool) mangleScratch;
|
||||
use_compress_expand = (bool)compressFlag;
|
||||
use_compress_reset_expand = (bool)compressResetFlag;
|
||||
}
|
46
tools/hscollider/args.h
Normal file
46
tools/hscollider/args.h
Normal file
@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef ARGS_H
|
||||
#define ARGS_H
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace ue2 {
|
||||
struct Grey;
|
||||
}
|
||||
struct hs_platform_info;
|
||||
class CorpusProperties;
|
||||
|
||||
// Parses the hscollider command line. Results are written to the globals
// declared in common.h and to the out-parameters: corpus_gen_prop (corpus
// generator settings), corpora (corpus file names given after -c), grey
// (grey box overrides from -G) and plat_out (platform selected by -x).
// Prints usage and exits the process on invalid arguments.
void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
|
||||
std::vector<std::string> *corpora, ue2::Grey *grey,
|
||||
std::unique_ptr<hs_platform_info> *plat_out);
|
||||
|
||||
#endif
|
92
tools/hscollider/common.h
Normal file
92
tools/hscollider/common.h
Normal file
@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef COMMON_H
|
||||
#define COMMON_H
|
||||
|
||||
#include <cstddef>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Scanning mode used by the collider. On the command line, -t selects
// MODE_STREAMING and -V selects MODE_VECTORED; each requires that the mode is
// still MODE_BLOCK, so only one may be given.
// NOTE(review): MODE_BLOCK is presumably the initial value of colliderMode --
// confirm where the variable is defined.
enum ColliderMode {
|
||||
MODE_BLOCK,
|
||||
MODE_STREAMING,
|
||||
MODE_VECTORED
|
||||
};
|
||||
|
||||
extern unsigned numThreads;
|
||||
extern enum ColliderMode colliderMode;
|
||||
extern unsigned int somFlags;
|
||||
extern bool loadDatabases;
|
||||
extern bool saveDatabases;
|
||||
extern bool saveCorpora;
|
||||
extern std::string saveCorporaFile;
|
||||
extern std::string serializePath;
|
||||
extern bool echo_matches;
|
||||
extern int g_quiet;
|
||||
extern bool g_verbose;
|
||||
extern std::string g_exprPath;
|
||||
extern std::vector<std::string> g_signatureFiles;
|
||||
extern bool g_allSignatures;
|
||||
extern bool g_ue2CompileAll;
|
||||
extern unsigned g_streamBlocks;
|
||||
extern unsigned long long g_streamOffset;
|
||||
extern std::string g_corpora_prefix;
|
||||
extern std::string g_corpora_suffix;
|
||||
extern unsigned multicompile_bands;
|
||||
extern std::string g_corporaFile;
|
||||
extern std::vector<unsigned> g_signatures;
|
||||
extern unsigned long int g_matchLimit;
|
||||
extern unsigned long int g_matchLimitRecursion;
|
||||
extern unsigned min_ue2_align;
|
||||
extern unsigned max_ue2_align;
|
||||
extern size_t g_memoryLimit;
|
||||
extern bool force_utf8;
|
||||
extern int force_prefilter;
|
||||
extern unsigned somPrecisionMode;
|
||||
extern unsigned limit_matches;
|
||||
extern unsigned randomSeed;
|
||||
extern bool use_random_alignment;
|
||||
extern bool use_PCRE;
|
||||
extern bool use_NFA;
|
||||
extern bool use_UE2;
|
||||
extern bool use_copy_scratch;
|
||||
extern bool use_copy_stream;
|
||||
extern bool use_mangle_scratch;
|
||||
extern bool use_compress_expand;
|
||||
extern bool use_compress_reset_expand;
|
||||
extern int abort_on_failure;
|
||||
extern int no_signal_handler;
|
||||
extern bool force_edit_distance;
|
||||
extern unsigned edit_distance;
|
||||
|
||||
// Constants
|
||||
// Shown in the help text as the default for -m (PCRE_MATCH_LIMIT).
static const unsigned long int DEFAULT_PCRE_MATCH_LIMIT = 10*1000*1000;
|
||||
// Shown in the help text as the default for -r (PCRE_MATCH_LIMIT_RECURSION).
static const unsigned long int DEFAULT_PCRE_MATCH_RECURSION_LIMIT = 10000;
|
||||
// Exclusive upper bound for the -Z alignment option, which accepts
// 0 .. MAX_MAX_UE2_ALIGN - 1. Kept as a macro because args.cpp stringifies it.
#define MAX_MAX_UE2_ALIGN 64
|
||||
#endif
|
63
tools/hscollider/limit.cpp
Normal file
63
tools/hscollider/limit.cpp
Normal file
@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "limit.h"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#if defined(HAVE_SETRLIMIT)
|
||||
#include <cerrno>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <sys/resource.h>
|
||||
|
||||
// Applies the limit `r` to `resource`. A failure is reported on stderr but is
// not fatal: the memory limit is a best-effort safety net.
static void applyLimit(int resource, const char *name,
                       const struct rlimit &r) {
    if (setrlimit(resource, &r) != 0) {
        // Capture errno before any stream operation has a chance to change it.
        const int err = errno;
        std::cerr << "setrlimit(" << name << ", ...) failed: "
                  << strerror(err) << std::endl;
    }
}

/**
 * Caps both the data segment (RLIMIT_DATA) and the address space (RLIMIT_AS)
 * of the current process at \p mbytes megabytes. Soft and hard limits are set
 * to the same value.
 *
 * A request too large to be expressed in bytes as an rlim_t saturates to
 * RLIM_INFINITY instead of silently wrapping around to a small limit.
 */
void setMemoryLimit(size_t mbytes) {
    const rlim_t bytes_per_mb = 1024 * 1024;

    // Do the multiplication in rlim_t and guard against overflow.
    rlim_t bytes = RLIM_INFINITY;
    if (mbytes <= RLIM_INFINITY / bytes_per_mb) {
        bytes = static_cast<rlim_t>(mbytes) * bytes_per_mb;
    }

    struct rlimit r;
    r.rlim_cur = bytes;
    r.rlim_max = bytes;

    applyLimit(RLIMIT_DATA, "RLIMIT_DATA", r);
    applyLimit(RLIMIT_AS, "RLIMIT_AS", r);
}
|
||||
#else // no setrlimit
|
||||
// Fallback when HAVE_SETRLIMIT is not defined: the requested limit is ignored.
void setMemoryLimit(size_t) {}
|
||||
#endif
|
36
tools/hscollider/limit.h
Normal file
36
tools/hscollider/limit.h
Normal file
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef LIMIT_H
|
||||
#define LIMIT_H
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
/**
 * Limits the memory available to the current process to \p mbytes megabytes.
 * Implemented with setrlimit() on RLIMIT_DATA and RLIMIT_AS when
 * HAVE_SETRLIMIT is defined; otherwise this is a no-op.
 */
void setMemoryLimit(size_t mbytes);
|
||||
|
||||
#endif // LIMIT_H
|
2002
tools/hscollider/main.cpp
Normal file
2002
tools/hscollider/main.cpp
Normal file
File diff suppressed because it is too large
Load Diff
90
tools/hscollider/pcre_util.cpp
Normal file
90
tools/hscollider/pcre_util.cpp
Normal file
@ -0,0 +1,90 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "pcre_util.h"
|
||||
|
||||
#include "hs.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <pcre.h> /* for pcre flags */
|
||||
|
||||
bool getPcreFlags(unsigned int hs_flags, unsigned int *flags,
|
||||
bool *highlander, bool *prefilter, bool *som) {
|
||||
assert(flags);
|
||||
assert(highlander);
|
||||
assert(prefilter);
|
||||
assert(som);
|
||||
*flags = 0;
|
||||
*highlander = false;
|
||||
*prefilter = false;
|
||||
*som = false;
|
||||
|
||||
if (hs_flags & HS_FLAG_CASELESS) {
|
||||
*flags |= PCRE_CASELESS;
|
||||
hs_flags &= ~HS_FLAG_CASELESS;
|
||||
}
|
||||
if (hs_flags & HS_FLAG_DOTALL) {
|
||||
*flags |= PCRE_DOTALL;
|
||||
hs_flags &= ~HS_FLAG_DOTALL;
|
||||
}
|
||||
if (hs_flags & HS_FLAG_MULTILINE) {
|
||||
*flags |= PCRE_MULTILINE;
|
||||
hs_flags &= ~HS_FLAG_MULTILINE;
|
||||
}
|
||||
if (hs_flags & HS_FLAG_UCP) {
|
||||
*flags |= PCRE_UCP;
|
||||
hs_flags &= ~HS_FLAG_UCP;
|
||||
}
|
||||
if (hs_flags & HS_FLAG_UTF8) {
|
||||
*flags |= PCRE_UTF8;
|
||||
hs_flags &= ~HS_FLAG_UTF8;
|
||||
}
|
||||
if (hs_flags & HS_FLAG_SINGLEMATCH) {
|
||||
*highlander = true;
|
||||
hs_flags &= ~HS_FLAG_SINGLEMATCH;
|
||||
}
|
||||
if (hs_flags & HS_FLAG_PREFILTER) {
|
||||
*prefilter = true;
|
||||
hs_flags &= ~HS_FLAG_PREFILTER;
|
||||
}
|
||||
if (hs_flags & HS_FLAG_SOM_LEFTMOST) {
|
||||
*som = true;
|
||||
hs_flags &= ~HS_FLAG_SOM_LEFTMOST;
|
||||
}
|
||||
|
||||
// Flags that are irrelevant to PCRE.
|
||||
hs_flags &= ~HS_FLAG_ALLOWEMPTY;
|
||||
|
||||
if (hs_flags) {
|
||||
// You've added new flags, haven't you?
|
||||
assert(0);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
41
tools/hscollider/pcre_util.h
Normal file
41
tools/hscollider/pcre_util.h
Normal file
@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef PCRE_UTIL_H
|
||||
#define PCRE_UTIL_H
|
||||
|
||||
/** Translates the given hyperscan flags into pcre flags (where appropriate)
|
||||
* and other bools (for flags which are not directly translatable).
|
||||
*
|
||||
* Returns false if an unknown hyperscan flag is encountered.
|
||||
*/
|
||||
bool getPcreFlags(unsigned int hs_flags, unsigned int *pcre_flags,
|
||||
bool *highlander, bool *prefilter, bool *som);
|
||||
|
||||
#endif /* PCRE_UTIL_H */
|
||||
|
185
tools/hscollider/sig.cpp
Normal file
185
tools/hscollider/sig.cpp
Normal file
@ -0,0 +1,185 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "sig.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <ctype.h>
|
||||
#include <string>
|
||||
|
||||
#ifdef HAVE_SIGACTION
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_BACKTRACE
|
||||
#include <execinfo.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#define BACKTRACE_BUFFER_SIZE 200
|
||||
|
||||
TLS_VARIABLE volatile int debug_stage = STAGE_UNDEFINED;
|
||||
TLS_VARIABLE volatile int debug_expr = 0;
|
||||
TLS_VARIABLE const char * volatile debug_expr_ptr = nullptr;
|
||||
TLS_VARIABLE volatile int debug_corpus = 0;
|
||||
TLS_VARIABLE const char * volatile debug_corpus_ptr = nullptr;
|
||||
TLS_VARIABLE volatile size_t debug_corpus_len = 0;
|
||||
|
||||
extern std::string g_cmdline;
|
||||
|
||||
#ifdef HAVE_SIGACTION
|
||||
static void sighandler(int signum) {
|
||||
/* NOTE: This signal handler is designed solely to provide more information
|
||||
* when a crash occurs in ue2collider -- it makes calls to signal-unsafe
|
||||
* functions like printf() and backtrace() by design, since we're already
|
||||
* in deep trouble and are going to exit anyway. */
|
||||
|
||||
fflush(stdout);
|
||||
printf("signal %d\n", signum);
|
||||
printf("\nFailing cmdline was:\n%s\n\n", g_cmdline.c_str());
|
||||
printf("expression %d ", debug_expr);
|
||||
switch(debug_stage) {
|
||||
case STAGE_UE2_COMPILE:
|
||||
printf("ue2 compile\n");
|
||||
break;
|
||||
case STAGE_UE2_RUN:
|
||||
printf("corpus %d ue2 scan\n", debug_corpus);
|
||||
break;
|
||||
case STAGE_PCRE_COMPILE:
|
||||
printf("pcre compile\n");
|
||||
break;
|
||||
case STAGE_PCRE_RUN:
|
||||
printf("corpus %d pcre scan\n", debug_corpus);
|
||||
break;
|
||||
case STAGE_GRAPH_PREPROCESS:
|
||||
printf("graph preprocess\n");
|
||||
break;
|
||||
case STAGE_GRAPH_COMPILE:
|
||||
printf("graph compile\n");
|
||||
break;
|
||||
case STAGE_GRAPH_RUN:
|
||||
printf("corpus %d graph scan\n", debug_corpus);
|
||||
break;
|
||||
default:
|
||||
case STAGE_UNDEFINED:
|
||||
printf("unknown stage\n");
|
||||
break;
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
if (debug_expr_ptr) {
|
||||
printf("expression %p\n", debug_expr_ptr);
|
||||
printf("%d:%s\n\n", debug_expr, debug_expr_ptr);
|
||||
}
|
||||
|
||||
if (debug_stage == STAGE_PCRE_RUN || debug_stage == STAGE_UE2_RUN) {
|
||||
printf("corpus %p len %zu\n", debug_corpus_ptr, debug_corpus_len);
|
||||
|
||||
printf("%d:", debug_expr);
|
||||
for (size_t i = 0; i < debug_corpus_len && debug_corpus_ptr; i++) {
|
||||
unsigned char c = debug_corpus_ptr[i];
|
||||
if (c == '\n') {
|
||||
printf("\\n");
|
||||
} else if (c == '\t') {
|
||||
printf("\\t");
|
||||
} else if (c == '\r') {
|
||||
printf("\\r");
|
||||
} else if (0x20 <= c && c <= 0x7e && c != '\\') {
|
||||
printf("%c", c);
|
||||
} else {
|
||||
printf("\\x%02hhx", c);
|
||||
}
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
|
||||
#ifdef HAVE_BACKTRACE
|
||||
static void *bt[BACKTRACE_BUFFER_SIZE];
|
||||
int count = backtrace(bt, BACKTRACE_BUFFER_SIZE);
|
||||
if (count) {
|
||||
backtrace_symbols_fd(bt, count, STDOUT_FILENO);
|
||||
} else {
|
||||
printf("(Call to backtrace() returns zero count.)\n");
|
||||
}
|
||||
#else
|
||||
printf("(Backtrace unavailable on this platform.)\n");
|
||||
#endif
|
||||
|
||||
_exit(signum);
|
||||
}
|
||||
#endif // HAVE_SIGACTION
|
||||
|
||||
/*
 * Registers sighandler for SIGBUS, SIGFPE, SIGILL, SIGABRT and SIGSEGV, with
 * all five signals blocked while the handler runs, then sets up the alternate
 * signal stack for the calling thread. Does nothing when HAVE_SIGACTION is
 * not defined.
 */
void installSignalHandler(void) {
#ifdef HAVE_SIGACTION
    // Signals we report on, in the order their handlers are installed.
    static const int handled[] = {SIGBUS, SIGFPE, SIGILL, SIGABRT, SIGSEGV};
    static const size_t num_handled = sizeof(handled) / sizeof(handled[0]);

    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = sighandler;
    sa.sa_flags = 0;

    // Block every handled signal for the duration of the handler.
    sigemptyset(&sa.sa_mask);
    for (size_t i = 0; i < num_handled; i++) {
        sigaddset(&sa.sa_mask, handled[i]);
    }

    for (size_t i = 0; i < num_handled; i++) {
        sigaction(handled[i], &sa, nullptr);
    }

    setSignalStack();
#endif // HAVE_SIGACTION
}
|
||||
|
||||
#if defined(HAVE_SIGALTSTACK) && defined(HAVE_SIGACTION)
// Size of the per-thread alternate signal stack. This is a fixed compile-time
// constant rather than SIGSTKSZ, because SIGSTKSZ is no longer a constant
// expression on recent glibc and cannot be used as a static array bound.
static const size_t ALT_STACK_SIZE = 64 * 1024;

// Backing storage for the calling thread's alternate signal stack.
static TLS_VARIABLE char alt_stack_loc[ALT_STACK_SIZE];
#endif

/*
 * Gives the calling thread an alternate signal stack and (re)installs
 * sighandler for SIGSEGV so that it runs on that stack. Must be called by
 * every thread that wants this.
 *
 * If sigaltstack() fails, the handler is still installed, just without
 * SA_ONSTACK. Does nothing unless both HAVE_SIGALTSTACK and HAVE_SIGACTION are
 * defined: the handler and <signal.h> are only available with the latter.
 */
void setSignalStack(void) {
#if defined(HAVE_SIGALTSTACK) && defined(HAVE_SIGACTION)
    struct sigaction act;
    memset(&act, 0, sizeof(act));
    act.sa_handler = sighandler;
    act.sa_flags = 0;
    sigemptyset(&act.sa_mask);

    stack_t alt_stack;
    memset(&alt_stack, 0, sizeof(alt_stack));
    alt_stack.ss_flags = 0;
    alt_stack.ss_size = sizeof(alt_stack_loc);
    alt_stack.ss_sp = alt_stack_loc;

    // Only request the alternate stack if the kernel accepted it.
    if (!sigaltstack(&alt_stack, nullptr)) {
        act.sa_flags |= SA_ONSTACK;
    }
    sigaction(SIGSEGV, &act, nullptr);
#endif
}
|
||||
|
57
tools/hscollider/sig.h
Normal file
57
tools/hscollider/sig.h
Normal file
@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef SIG_H
|
||||
#define SIG_H
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
|
||||
#define STAGE_UNDEFINED 0
|
||||
#define STAGE_UE2_COMPILE 1
|
||||
#define STAGE_UE2_RUN 2
|
||||
#define STAGE_PCRE_COMPILE 3
|
||||
#define STAGE_PCRE_RUN 4
|
||||
#define STAGE_GRAPH_PREPROCESS 5
|
||||
#define STAGE_GRAPH_COMPILE 6
|
||||
#define STAGE_GRAPH_RUN 7
|
||||
|
||||
#define TLS_VARIABLE __thread
|
||||
|
||||
extern TLS_VARIABLE volatile int debug_stage;
|
||||
extern TLS_VARIABLE volatile int debug_expr;
|
||||
extern TLS_VARIABLE const char * volatile debug_expr_ptr;
|
||||
extern TLS_VARIABLE volatile int debug_corpus;
|
||||
extern TLS_VARIABLE const char * volatile debug_corpus_ptr;
|
||||
extern TLS_VARIABLE volatile size_t debug_corpus_len;
|
||||
|
||||
void installSignalHandler(void);
|
||||
|
||||
// Must be called by every thread.
|
||||
void setSignalStack(void);
|
||||
|
||||
#endif
|
54
tools/hscollider/simple_timer.h
Normal file
54
tools/hscollider/simple_timer.h
Normal file
@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef SIMPLE_TIMER_H
|
||||
#define SIMPLE_TIMER_H
|
||||
|
||||
#include <chrono>
|
||||
|
||||
/**
 * Minimal stopwatch: records the time of construction and reports the number
 * of seconds elapsed since then.
 *
 * Both members are defined inside the class (and are therefore implicitly
 * inline), so this header can be included from several translation units
 * without multiple-definition link errors. A monotonic clock is used so that
 * wall-clock adjustments cannot produce negative or skewed readings.
 */
class SimpleTimer {
    typedef std::chrono::steady_clock clock_type;

public:
    /** Starts timing at the moment of construction. */
    SimpleTimer() : start(clock_type::now()) {}

    /** Returns the time since construction, in seconds. */
    double elapsed() const {
        const std::chrono::duration<double> delta = clock_type::now() - start;
        return delta.count();
    }

private:
    std::chrono::time_point<clock_type> start;
};
|
||||
|
||||
#endif // SIMPLE_TIMER_H
|
Loading…
x
Reference in New Issue
Block a user