chimera: hybrid of Hyperscan and PCRE

This commit is contained in:
Wang, Xiang W
2018-03-09 03:52:12 -05:00
parent 8a1c497f44
commit bf87f8c003
47 changed files with 6985 additions and 202 deletions

View File

@@ -1,9 +1,3 @@
# we have a fixed requirement for PCRE
set(PCRE_REQUIRED_MAJOR_VERSION 8)
set(PCRE_REQUIRED_MINOR_VERSION 41)
set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION})
include (${CMAKE_MODULE_PATH}/pcre.cmake)
if (NOT CORRECT_PCRE_VERSION)
message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} not found, not building hscollider")
return()
@@ -29,6 +23,8 @@ set_source_files_properties(
ragelmaker(ColliderCorporaParser.rl)
add_definitions(-DHS_HYBRID)
# only set these after all tests are done
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
@@ -69,7 +65,7 @@ add_dependencies(hscollider ragel_ColliderCorporaParser)
add_dependencies(hscollider pcre)
if(NOT WIN32)
target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil
target_link_libraries(hscollider hs chimera ${PCRE_LDFLAGS} databaseutil
expressionutil corpusomatic crosscompileutil pthread
"${BACKTRACE_LDFLAGS}")
@@ -78,7 +74,7 @@ if(HAVE_BACKTRACE)
"${BACKTRACE_CFLAGS}")
endif()
else() # WIN32
target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil
target_link_libraries(hscollider hs chimera ${PCRE_LDFLAGS} databaseutil
expressionutil corpusomatic crosscompileutil)
endif()

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -54,10 +54,10 @@ public:
explicit DatabaseProxy(const std::set<unsigned> &expr_ids)
: ids(expr_ids) {}
explicit DatabaseProxy(std::shared_ptr<HyperscanDB> built_db)
explicit DatabaseProxy(std::shared_ptr<BaseDB> built_db)
: db(built_db) {}
std::shared_ptr<HyperscanDB> get(const UltimateTruth &ultimate) {
std::shared_ptr<BaseDB> get(const UltimateTruth &ultimate) {
std::lock_guard<std::mutex> lock(mutex);
if (failed) {
// We have previously failed to compile this database.
@@ -80,7 +80,7 @@ public:
private:
std::mutex mutex;
std::shared_ptr<HyperscanDB> db;
std::shared_ptr<BaseDB> db;
std::set<unsigned> ids;
bool failed = false; // Database failed compilation.
};

View File

@@ -187,6 +187,14 @@ string pcreErrStr(int err) {
}
}
/* True for the scanning modes provided by native hyperscan (block, streaming
 * and vectored); false for every other mode value. */
static
bool isStandardMode(unsigned int mode) {
    switch (mode) {
    case MODE_BLOCK:
    case MODE_STREAMING:
    case MODE_VECTORED:
        return true;
    default:
        return false;
    }
}
GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
unsigned long int limit,
unsigned long int limit_recursion)
@@ -194,8 +202,10 @@ GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
matchLimitRecursion(limit_recursion) {}
void GroundTruth::global_prep() {
// We're using pcre callouts
pcre_callout = &pcreCallOut;
if (isStandardMode(colliderMode)) {
// We're using pcre callouts
pcre_callout = &pcreCallOut;
}
}
static
@@ -262,11 +272,17 @@ GroundTruth::compile(unsigned id, bool no_callouts) {
throw PcreCompileFailure("Unsupported extended flags.");
}
// Hybrid mode implies SOM.
if (colliderMode == MODE_HYBRID) {
assert(!use_NFA);
som = true;
}
// SOM flags might be set globally.
som |= !!somFlags;
// For traditional Hyperscan, add global callout to pattern.
if (!combination && !no_callouts) {
if (!combination && !no_callouts && isStandardMode(colliderMode)) {
addCallout(re);
}
@@ -403,6 +419,79 @@ int scanBasic(const CompiledPcre &compiled, const string &buffer,
return ret;
}
// Report whether PCRE_UTF8 is among the options PCRE records for the compiled
// pattern. The options word starts at zero, so if pcre_fullinfo() leaves it
// untouched the answer is false.
static
bool isUtf8(const CompiledPcre &compiled) {
    unsigned long int opts = 0;
    pcre_fullinfo(compiled.bytecode, nullptr, PCRE_INFO_OPTIONS, &opts);
    return (opts & PCRE_UTF8) != 0;
}
/**
 * Copy the leading (from, to) offset pairs of a PCRE ovector into a
 * CaptureVec.
 *
 * \param ovector  Offsets vector that pcre_exec() filled in.
 * \param ret      Positive return value of pcre_exec(); the first \a ret
 *                 pairs of \a ovector are copied.
 * \return One (from, to) pair per copied ovector pair, or an empty vector
 *         when group capture has been switched off via no_groups.
 */
static
CaptureVec makeCaptureVec(const vector<int> &ovector, int ret) {
    assert(ret > 0);

    CaptureVec cap;
    if (no_groups) {
        return cap; // No group info requested.
    }

    // Exactly ret pairs are produced, taken from ovector[0 .. 2 * ret), so
    // reserve ret entries (not 2 * ret, which would over-allocate).
    cap.reserve(ret);
    for (int i = 0; i < ret; i++) {
        cap.push_back(make_pair(ovector[2 * i], ovector[2 * i + 1]));
    }
    return cap;
}
// Ground-truth scan for hybrid mode: run pcre_exec() repeatedly over the
// buffer, recording every match (with its capture vector) in rs. Returns the
// last pcre_exec() result; any value <= PCRE_ERROR_NOMATCH ends the scan and
// is handed straight back to the caller.
static
int scanHybrid(const CompiledPcre &compiled, const string &buffer,
               const pcre_extra &extra, vector<int> &ovector,
               ResultSet &rs, ostream &out) {
    const int len = (int)buffer.length();
    const bool utf8 = isUtf8(compiled);
    const int flags = 0;

    int startoffset = 0;
    int ret;

    for (;;) {
        ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), len,
                        startoffset, flags, &ovector[0], ovector.size());
        if (ret <= PCRE_ERROR_NOMATCH) {
            return ret;
        }

        const int from = ovector.at(0);
        const int to = ovector.at(1);
        rs.addMatch(from, to, makeCaptureVec(ovector, ret));

        if (echo_matches) {
            out << "PCRE Match @ (" << from << "," << to << ")" << endl;
        }

        // A highlander pattern wants a single match only.
        if (compiled.highlander) {
            break;
        }

        // The next scan resumes at the end of this match. An empty match
        // would otherwise be found again at the same place, so step one
        // position past it; in UTF-8 mode keep stepping over continuation
        // bytes so the scan restarts on a codepoint boundary.
        startoffset = to;
        if (from == to) {
            ++startoffset;
            if (utf8) {
                while (startoffset < len
                       && ((buffer[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
                    ++startoffset;
                }
            }
        }

        if (startoffset > len) {
            break;
        }
    }

    return ret;
}
static
int scanOffset(const CompiledPcre &compiled, const string &buffer,
const pcre_extra &extra, vector<int> &ovector,
@@ -532,15 +621,24 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
pcre_extra extra;
extra.flags = 0;
// Switch on callouts.
extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
extra.callout_data = &ctx;
// If running in traditional HyperScan mode, switch on callouts.
bool usingCallouts = isStandardMode(colliderMode);
if (usingCallouts) {
// Switch on callouts.
extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
extra.callout_data = &ctx;
}
// Set the match_limit (in order to bound execution time on very complex
// patterns)
extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
extra.match_limit = matchLimit;
extra.match_limit_recursion = matchLimitRecursion;
if (colliderMode == MODE_HYBRID) {
extra.match_limit = 10000000;
extra.match_limit_recursion = 1500;
} else {
extra.match_limit = matchLimit;
extra.match_limit_recursion = matchLimitRecursion;
}
#ifdef PCRE_NO_START_OPTIMIZE
// Switch off optimizations that may result in callouts not occurring.
@@ -553,6 +651,7 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
ovector.resize(ovecsize);
int ret;
bool hybrid = false;
switch (colliderMode) {
case MODE_BLOCK:
case MODE_STREAMING:
@@ -563,6 +662,10 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
ret = scanBasic(compiled, buffer, extra, ovector, ctx);
}
break;
case MODE_HYBRID:
ret = scanHybrid(compiled, buffer, extra, ovector, rs, out);
hybrid = true;
break;
default:
assert(0);
ret = PCRE_ERROR_NULL;
@@ -595,7 +698,7 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
return true;
}
if (compiled.som) {
if (compiled.som && !hybrid) {
filterLeftmostSom(rs);
}

View File

@@ -35,25 +35,36 @@
#include <utility>
#include <vector>
// Type for capturing groups: a vector of (from, to) offsets, with both set to
// -1 for inactive groups (like pcre's ovector). Used by hybrid modes.
typedef std::vector<std::pair<int, int> > CaptureVec;
// Class representing a single match, encapsulating to/from offsets.
class MatchResult {
public:
MatchResult(unsigned long long start, unsigned long long end)
: from(start), to(end) {}
MatchResult(unsigned long long start, unsigned long long end,
const CaptureVec &cap)
: from(start), to(end), captured(cap) {}
bool operator<(const MatchResult &a) const {
if (from != a.from) {
return from < a.from;
}
return to < a.to;
if (to != a.to) {
return to < a.to;
}
return captured < a.captured;
}
bool operator==(const MatchResult &a) const {
return from == a.from && to == a.to;
return from == a.from && to == a.to && captured == a.captured;
}
unsigned long long from;
unsigned long long to;
CaptureVec captured;
};
enum ResultSource {
@@ -114,6 +125,19 @@ public:
}
}
// Add a match (with capturing vector)
void addMatch(unsigned long long from, unsigned long long to,
const CaptureVec &cap, int block = 0) {
MatchResult m(from, to, cap);
matches.insert(m);
if (matches_by_block[block].find(m) != matches_by_block[block].end()) {
dupe_matches.insert(m);
} else {
matches_by_block[block].insert(m);
}
}
// Clear all matches.
void clear() {
matches.clear();

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -90,19 +90,14 @@ hs_error_t open_magic_stream(const hs_database_t *db, unsigned flags,
#endif // RELEASE_BUILD
class HyperscanDB : boost::noncopyable {
class BaseDB : boost::noncopyable {
public:
// Constructor takes iterators over a container of pattern IDs.
template <class Iter>
HyperscanDB(hs_database_t *db_in, Iter ids_begin, Iter ids_end)
: db(db_in), ids(ids_begin, ids_end) {}
BaseDB(Iter ids_begin, Iter ids_end)
: ids(ids_begin, ids_end) {}
~HyperscanDB() {
hs_free_database(db);
}
// Underlying Hyperscan database pointer.
hs_database_t *db;
virtual ~BaseDB();
// The set of expression IDs that must return their matches in order.
unordered_set<unsigned> ordered;
@@ -111,15 +106,55 @@ public:
unordered_set<unsigned> ids;
};
// Out-of-line (empty) definition of the destructor that the class declares
// virtual.
BaseDB::~BaseDB() { }
// Database wrapper for the native Hyperscan modes: pairs an hs_database_t
// with the pattern ID bookkeeping kept in BaseDB.
class HyperscanDB : public BaseDB {
public:
// Constructor takes iterators over a container of pattern IDs.
// The ID range is forwarded to BaseDB; db_in is stored as given and is
// released with hs_free_database() by the destructor.
template <class Iter>
HyperscanDB(hs_database_t *db_in, Iter ids_begin, Iter ids_end)
: BaseDB(ids_begin, ids_end), db(db_in) {}
// Defined out of line, below the class.
~HyperscanDB();
// Underlying Hyperscan database pointer.
hs_database_t *db;
};
// Frees the wrapped Hyperscan database.
HyperscanDB::~HyperscanDB() {
hs_free_database(db);
}
#ifdef HS_HYBRID
// Database wrapper for hybrid mode: pairs a Chimera ch_database_t with the
// pattern ID bookkeeping kept in BaseDB. Only built when HS_HYBRID is defined.
class HybridDB : public BaseDB {
public:
// Constructor takes iterators over a container of pattern IDs.
// The ID range is forwarded to BaseDB; db_in is stored as given and is
// released with ch_free_database() by the destructor.
template <class Iter>
HybridDB(ch_database_t *db_in, Iter ids_begin, Iter ids_end)
: BaseDB(ids_begin, ids_end), db(db_in) {}
// Defined out of line, below the class.
~HybridDB();
// Underlying Chimera database pointer.
ch_database_t *db;
};
// Frees the wrapped Chimera database.
HybridDB::~HybridDB() {
ch_free_database(db);
}
#endif // HS_HYBRID
// Used to track the ID and result set.
namespace {
struct MultiContext {
MultiContext(unsigned int id_in, const HyperscanDB &db_in, ResultSet *rs_in,
MultiContext(unsigned int id_in, const BaseDB &db_in, ResultSet *rs_in,
bool single_in, ostream &os)
: id(id_in), db(db_in), rs(rs_in), single(single_in), out(os) {}
unsigned int id;
int block = 0;
const HyperscanDB &db;
const BaseDB &db;
ResultSet *rs;
u64a lastRawMatch = 0; /* store last known unadjusted match location */
u64a lastOrderMatch = 0;
@@ -230,6 +265,75 @@ int callbackMulti(unsigned int id, unsigned long long from,
return 0;
}
#ifdef HS_HYBRID
// Match callback for the hybrid matcher. ctx must point at the MultiContext
// of the scan in progress. Matches for the pattern under test (or every
// match in single-pattern mode) are stored in the context's ResultSet along
// with their capture groups. Returns CH_CALLBACK_TERMINATE once the
// configured match limit is hit, CH_CALLBACK_CONTINUE otherwise.
static
ch_callback_t callbackHybrid(unsigned id, unsigned long long from,
                             unsigned long long to, unsigned, unsigned size,
                             const ch_capture_t *captured, void *ctx) {
    MultiContext *mctx = static_cast<MultiContext *>(ctx);
    assert(mctx);
    assert(mctx->rs);
    assert(mctx->in_scan_call);

    ostream &out = mctx->out;

    // Report the end offset with the corpora prefix length taken off.
    to -= g_corpora_prefix.size();

    // A match arriving after we asked for termination is flagged on the
    // result set.
    if (mctx->terminated) {
        out << "UE2 Match @ (" << from << "," << to << ") for " << id
            << " after termination" << endl;
        mctx->rs->match_after_halt = true;
    }

    const bool wanted = mctx->single || id == mctx->id;
    if (wanted) {
        CaptureVec cap;
        for (unsigned int i = 0; i < size; i++) {
            const bool active = (captured[i].flags & CH_CAPTURE_FLAG_ACTIVE) != 0;
            if (active) {
                cap.push_back(make_pair(captured[i].from, captured[i].to));
            } else {
                // Inactive groups are stored as (-1, -1).
                cap.push_back(make_pair(-1, -1));
            }
        }
        mctx->rs->addMatch(from, to, cap);
    }

    if (echo_matches) {
        out << "Match @ [" << from << "," << to << "] for " << id << endl;
        out << "  Captured " << size << " groups: ";
        for (unsigned int i = 0; i < size; i++) {
            const bool active = (captured[i].flags & CH_CAPTURE_FLAG_ACTIVE) != 0;
            if (active) {
                out << "{" << captured[i].from << "," << captured[i].to << "} ";
            } else {
                out << "{} ";
            }
        }
        out << endl;
    }

    const bool limitReached =
        limit_matches && mctx->rs->matches.size() == limit_matches;
    if (!limitReached) {
        return CH_CALLBACK_CONTINUE;
    }

    mctx->terminated = true;
    return CH_CALLBACK_TERMINATE;
}
// Error callback for the hybrid matcher. The error type, pattern id and the
// third argument are ignored; after sanity-checking the scan context the
// callback always answers CH_CALLBACK_SKIP_PATTERN.
static
ch_callback_t errorCallback(ch_error_event_t, unsigned int, void *,
                            void *ctx) {
    UNUSED MultiContext *mctx = static_cast<MultiContext *>(ctx);
    assert(mctx);
    assert(mctx->rs);
    assert(mctx->in_scan_call);

    return CH_CALLBACK_SKIP_PATTERN;
}
#endif // HS_HYBRID
static
void filterLeftmostSom(ResultSet &rs) {
if (rs.matches.size() <= 1) {
@@ -252,6 +356,9 @@ UltimateTruth::UltimateTruth(ostream &os, const ExpressionMap &expr,
const Grey &grey_in, unsigned int streamBlocks)
: grey(grey_in), out(os), m_expr(expr), m_xcompile(false),
m_streamBlocks(streamBlocks), scratch(nullptr),
#ifdef HS_HYBRID
chimeraScratch(nullptr),
#endif
platform(plat) {
// Build our mode flags.
@@ -265,15 +372,27 @@ UltimateTruth::UltimateTruth(ostream &os, const ExpressionMap &expr,
case MODE_VECTORED:
m_mode = HS_MODE_VECTORED;
break;
case MODE_HYBRID:
m_mode = 0;
break;
}
// Set desired SOM precision, if we're in streaming mode.
if (colliderMode == MODE_STREAMING) {
m_mode |= somPrecisionMode;
}
#ifdef HS_HYBRID
if (colliderMode == MODE_HYBRID && !no_groups) {
m_mode |= CH_MODE_GROUPS;
}
#endif
}
// Release the scratch space owned by this object: the Chimera scratch first
// (only compiled in when HS_HYBRID is defined), then the Hyperscan scratch.
UltimateTruth::~UltimateTruth() {
#ifdef HS_HYBRID
ch_free_scratch(chimeraScratch);
#endif
hs_free_scratch(scratch);
}
@@ -327,13 +446,13 @@ void mangle_scratch(hs_scratch_t *scratch) {
scratch->fdr_conf_offset = 0xe4;
}
bool UltimateTruth::blockScan(const HyperscanDB &hdb, const string &buffer,
bool UltimateTruth::blockScan(const BaseDB &bdb, const string &buffer,
size_t align, match_event_handler callback,
void *ctx_in, ResultSet *) {
assert(colliderMode == MODE_BLOCK);
assert(!m_xcompile);
const hs_database_t *db = hdb.db;
const hs_database_t *db = reinterpret_cast<const HyperscanDB &>(bdb).db;
assert(db);
MultiContext *ctx = (MultiContext *)ctx_in;
@@ -438,13 +557,13 @@ hs_stream_t *compressAndResetExpandStream(const hs_database_t *db,
return out;
}
bool UltimateTruth::streamingScan(const HyperscanDB &hdb, const string &buffer,
bool UltimateTruth::streamingScan(const BaseDB &bdb, const string &buffer,
size_t align, match_event_handler callback,
void *ctx_in, ResultSet *rs) {
assert(colliderMode == MODE_STREAMING);
assert(!m_xcompile);
const hs_database_t *db = hdb.db;
const hs_database_t *db = reinterpret_cast<const HyperscanDB &>(bdb).db;
assert(db);
MultiContext *ctx = (MultiContext *)ctx_in;
@@ -594,13 +713,13 @@ bool UltimateTruth::streamingScan(const HyperscanDB &hdb, const string &buffer,
return ret == HS_SUCCESS;
}
bool UltimateTruth::vectoredScan(const HyperscanDB &hdb, const string &buffer,
bool UltimateTruth::vectoredScan(const BaseDB &bdb, const string &buffer,
size_t align, match_event_handler callback,
void *ctx_in, ResultSet *rs) {
assert(colliderMode == MODE_VECTORED);
assert(!m_xcompile);
const hs_database_t *db = hdb.db;
const hs_database_t *db = reinterpret_cast<const HyperscanDB &>(bdb).db;
assert(db);
MultiContext *ctx = (MultiContext *)ctx_in;
@@ -682,19 +801,67 @@ bool UltimateTruth::vectoredScan(const HyperscanDB &hdb, const string &buffer,
return true;
}
bool UltimateTruth::run(unsigned int id, shared_ptr<const HyperscanDB> hdb,
#ifdef HS_HYBRID
bool UltimateTruth::hybridScan(const BaseDB &bdb, const string &buffer,
size_t align, ch_match_event_handler callback,
ch_error_event_handler error_callback,
void *ctx_in, ResultSet *) {
assert(colliderMode == MODE_HYBRID);
assert(!m_xcompile);
const ch_database_t *db = reinterpret_cast<const HybridDB &>(bdb).db;
assert(db);
MultiContext *ctx = (MultiContext *)ctx_in;
char *realigned = setupScanBuffer(buffer.c_str(), buffer.size(), align);
if (!realigned) {
return false;
}
if (use_copy_scratch && !cloneScratch()) {
return false;
}
ctx->in_scan_call = true;
ch_error_t ret =
ch_scan(db, realigned, buffer.size(), 0, chimeraScratch, callback,
error_callback, ctx);
ctx->in_scan_call = false;
if (g_verbose) {
out << "Scan call returned " << ret << endl;
}
if (ctx->terminated) {
if (g_verbose && ret != CH_SCAN_TERMINATED) {
out << "Scan should have returned CH_SCAN_TERMINATED, returned "
<< ret << " instead." << endl;
}
return ret == CH_SCAN_TERMINATED;
}
if (g_verbose && ret != CH_SUCCESS) {
out << "Scan should have returned CH_SUCCESS, returned " << ret
<< " instead." << endl;
}
return ret == CH_SUCCESS;
}
#endif
bool UltimateTruth::run(unsigned int id, shared_ptr<const BaseDB> bdb,
const string &buffer, bool single_pattern,
unsigned int align, ResultSet &rs) {
assert(!m_xcompile);
assert(hdb);
assert(bdb);
// Ensure that scratch is appropriate for this database.
if (!allocScratch(hdb)) {
if (!allocScratch(bdb)) {
out << "Scratch alloc failed." << endl;
return false;
}
MultiContext ctx(id, *hdb, &rs, single_pattern, out);
MultiContext ctx(id, *bdb, &rs, single_pattern, out);
if (!g_corpora_suffix.empty()) {
ctx.use_max_offset = true;
ctx.max_offset = buffer.size() - g_corpora_suffix.size();
@@ -702,11 +869,20 @@ bool UltimateTruth::run(unsigned int id, shared_ptr<const HyperscanDB> hdb,
switch (colliderMode) {
case MODE_BLOCK:
return blockScan(*hdb, buffer, align, callbackMulti, &ctx, &rs);
return blockScan(*bdb, buffer, align, callbackMulti, &ctx, &rs);
case MODE_STREAMING:
return streamingScan(*hdb, buffer, align, callbackMulti, &ctx, &rs);
return streamingScan(*bdb, buffer, align, callbackMulti, &ctx, &rs);
case MODE_VECTORED:
return vectoredScan(*hdb, buffer, align, callbackMulti, &ctx, &rs);
return vectoredScan(*bdb, buffer, align, callbackMulti, &ctx, &rs);
case MODE_HYBRID:
#ifdef HS_HYBRID
return hybridScan(*bdb, buffer, align, callbackHybrid, errorCallback,
&ctx, &rs);
#else
cerr << "Hybrid mode not available in this build." << endl;
abort();
#endif
break;
}
assert(0);
@@ -739,7 +915,7 @@ bool isOrdered(const string &expr, unsigned int flags) {
return ordered;
}
static unique_ptr<HyperscanDB>
static unique_ptr<BaseDB>
compileHyperscan(vector<const char *> &patterns, vector<unsigned> &flags,
vector<unsigned> &idsvec, ptr_vector<hs_expr_ext> &ext,
unsigned mode, const hs_platform_info *platform, string &error,
@@ -762,7 +938,30 @@ compileHyperscan(vector<const char *> &patterns, vector<unsigned> &flags,
return ue2::make_unique<HyperscanDB>(db, idsvec.begin(), idsvec.end());
}
shared_ptr<HyperscanDB> UltimateTruth::compile(const set<unsigned> &ids,
#ifdef HS_HYBRID
/**
 * Compile a set of patterns into a Chimera (hybrid) database.
 *
 * \param patterns  Pattern strings.
 * \param flags     Per-pattern flags, parallel to \a patterns.
 * \param idsvec    Per-pattern IDs, parallel to \a patterns.
 * \param mode      Mode flags passed to ch_compile_multi().
 * \param platform  Target platform description passed to ch_compile_multi().
 * \param error     Receives a description of the failure when compilation
 *                  fails; left untouched on success.
 * \return The compiled database wrapped in a HybridDB, or nullptr on failure.
 */
static unique_ptr<BaseDB>
compileHybrid(vector<const char *> &patterns,
              vector<unsigned> &flags, vector<unsigned> &idsvec,
              unsigned mode, const hs_platform_info *platform, string &error) {
    const unsigned count = patterns.size();
    ch_database_t *db = nullptr;
    // Start from null so that a failure which does not fill in the error
    // structure cannot lead to dereferencing an indeterminate pointer.
    ch_compile_error_t *compile_err = nullptr;

    ch_error_t err = ch_compile_multi(&patterns[0], &flags[0],
                                      &idsvec[0], count, mode, platform, &db,
                                      &compile_err);

    // ch_compile_multi() reports ch_error_t codes, so compare with CH_SUCCESS.
    if (err != CH_SUCCESS) {
        if (compile_err) {
            error = compile_err->message;
            ch_free_compile_error(compile_err);
        } else {
            error = "Chimera compile failed without an error message.";
        }
        return nullptr;
    }

    return ue2::make_unique<HybridDB>(db, idsvec.begin(), idsvec.end());
}
#endif
shared_ptr<BaseDB> UltimateTruth::compile(const set<unsigned> &ids,
string &error) const {
// Build our vectors for compilation
const size_t count = ids.size();
@@ -811,6 +1010,17 @@ shared_ptr<HyperscanDB> UltimateTruth::compile(const set<unsigned> &ids,
ext[n].edit_distance = edit_distance;
}
if (colliderMode == MODE_HYBRID) {
if (ext[n].flags) {
error = "Hybrid does not support extended parameters.";
return nullptr;
}
// We can also strip some other flags in the hybrid matcher.
flags[n] &= ~HS_FLAG_PREFILTER; // prefilter always used
flags[n] &= ~HS_FLAG_ALLOWEMPTY; // empty always allowed
flags[n] &= ~HS_FLAG_SOM_LEFTMOST; // SOM always on
}
n++;
}
@@ -827,8 +1037,18 @@ shared_ptr<HyperscanDB> UltimateTruth::compile(const set<unsigned> &ids,
idsvec.push_back(0);
}
auto db = compileHyperscan(patterns, flags, idsvec, ext, m_mode, platform,
error, grey);
unique_ptr<BaseDB> db;
if (colliderMode == MODE_HYBRID) {
#ifdef HS_HYBRID
db = compileHybrid(patterns, flags, idsvec, m_mode, platform, error);
#else
error = "Hybrid mode not available in this build.";
#endif
} else {
db = compileHyperscan(patterns, flags, idsvec, ext, m_mode,
platform, error, grey);
}
if (!db) {
return nullptr;
}
@@ -850,18 +1070,29 @@ shared_ptr<HyperscanDB> UltimateTruth::compile(const set<unsigned> &ids,
return move(db);
}
bool UltimateTruth::allocScratch(shared_ptr<const HyperscanDB> db) {
bool UltimateTruth::allocScratch(shared_ptr<const BaseDB> db) {
assert(db);
// We explicitly avoid running scratch allocators for the same HyperscanDB
// We explicitly avoid running scratch allocators for the same BaseDB
// over and over again by retaining a shared_ptr to the last one we saw.
if (db == last_db) {
return true;
}
hs_error_t err = hs_alloc_scratch(db.get()->db, &scratch);
if (err != HS_SUCCESS) {
return false;
if (colliderMode == MODE_HYBRID) {
#ifdef HS_HYBRID
ch_error_t err = ch_alloc_scratch(
reinterpret_cast<const HybridDB *>(db.get())->db, &chimeraScratch);
if (err != HS_SUCCESS) {
return false;
}
#endif // HS_HYBRID
} else {
hs_error_t err = hs_alloc_scratch(
reinterpret_cast<const HyperscanDB *>(db.get())->db, &scratch);
if (err != HS_SUCCESS) {
return false;
}
}
last_db = db;
@@ -869,20 +1100,40 @@ bool UltimateTruth::allocScratch(shared_ptr<const HyperscanDB> db) {
}
bool UltimateTruth::cloneScratch(void) {
hs_scratch_t *old_scratch = scratch;
hs_scratch_t *new_scratch;
hs_error_t ret = hs_clone_scratch(scratch, &new_scratch);
if (ret != HS_SUCCESS) {
DEBUG_PRINTF("failure to clone %d\n", ret);
return false;
if (colliderMode == MODE_HYBRID) {
#ifdef HS_HYBRID
ch_scratch_t *old_scratch = chimeraScratch;
ch_scratch_t *new_scratch;
ch_error_t ret = ch_clone_scratch(chimeraScratch, &new_scratch);
if (ret != CH_SUCCESS) {
DEBUG_PRINTF("failure to clone %d\n", ret);
return false;
}
chimeraScratch = new_scratch;
ret = ch_free_scratch(old_scratch);
if (ret != CH_SUCCESS) {
DEBUG_PRINTF("failure to free %d\n", ret);
return false;
}
DEBUG_PRINTF("hybrid scratch cloned from %p to %p\n",
old_scratch, chimeraScratch);
#endif // HS_HYBRID
} else {
hs_scratch_t *old_scratch = scratch;
hs_scratch_t *new_scratch;
hs_error_t ret = hs_clone_scratch(scratch, &new_scratch);
if (ret != HS_SUCCESS) {
DEBUG_PRINTF("failure to clone %d\n", ret);
return false;
}
scratch = new_scratch;
ret = hs_free_scratch(old_scratch);
if (ret != HS_SUCCESS) {
DEBUG_PRINTF("failure to free %d\n", ret);
return false;
}
DEBUG_PRINTF("scratch cloned from %p to %p\n", old_scratch, scratch);
}
scratch = new_scratch;
ret = hs_free_scratch(old_scratch);
if (ret != HS_SUCCESS) {
DEBUG_PRINTF("failure to free %d\n", ret);
return false;
}
DEBUG_PRINTF("scratch cloned from %p to %p\n", old_scratch, scratch);
return true;
}
@@ -947,20 +1198,35 @@ char *UltimateTruth::setupVecScanBuffer(const char *begin, size_t len,
return ptr;
}
bool UltimateTruth::saveDatabase(const HyperscanDB &hdb,
bool UltimateTruth::saveDatabase(const BaseDB &bdb,
const string &filename) const {
return ::saveDatabase(hdb.db, filename.c_str(), g_verbose);
if (colliderMode == MODE_HYBRID) {
cerr << "Hybrid mode doesn't support serialization." << endl;
abort();
} else {
return ::saveDatabase(reinterpret_cast<const HyperscanDB *>(&bdb)->db,
filename.c_str(), g_verbose);
}
return false;
}
shared_ptr<HyperscanDB>
shared_ptr<BaseDB>
UltimateTruth::loadDatabase(const string &filename,
const std::set<unsigned> &ids) const {
hs_database_t *hs_db = ::loadDatabase(filename.c_str(), g_verbose);
if (!hs_db) {
return nullptr;
shared_ptr<BaseDB> db;
if (colliderMode == MODE_HYBRID) {
cerr << "Hybrid mode doesn't support deserialization." << endl;
abort();
} else {
hs_database_t *hs_db = ::loadDatabase(filename.c_str(), g_verbose);
if (!hs_db) {
return nullptr;
}
db = make_shared<HyperscanDB>(hs_db, ids.begin(), ids.end());
}
auto db = make_shared<HyperscanDB>(hs_db, ids.begin(), ids.end());
assert(db);
// Fill db::ordered with the expressions that require the ordered flag.

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -33,6 +33,10 @@
#include "hs.h"
#ifdef HS_HYBRID
#include "chimera/ch.h"
#endif
#include <memory>
#include <ostream>
#include <set>
@@ -47,7 +51,7 @@ struct Grey;
} // namespace ue2
class HyperscanDB;
class BaseDB;
class ResultSet;
// Wrapper around ue2 to generate results for an expression and corpus.
@@ -59,13 +63,13 @@ public:
~UltimateTruth();
std::shared_ptr<HyperscanDB> compile(const std::set<unsigned> &ids,
std::shared_ptr<BaseDB> compile(const std::set<unsigned> &ids,
std::string &error) const;
bool saveDatabase(const HyperscanDB &db,
bool saveDatabase(const BaseDB &db,
const std::string &filename) const;
std::shared_ptr<HyperscanDB>
std::shared_ptr<BaseDB>
loadDatabase(const std::string &filename,
const std::set<unsigned> &ids) const;
@@ -74,7 +78,7 @@ public:
return !m_xcompile;
}
bool run(unsigned id, std::shared_ptr<const HyperscanDB> db,
bool run(unsigned id, std::shared_ptr<const BaseDB> db,
const std::string &buffer, bool single_pattern, unsigned align,
ResultSet &rs);
@@ -84,22 +88,28 @@ public:
std::string dbFilename(const std::set<unsigned int> &ids) const;
private:
bool blockScan(const HyperscanDB &db, const std::string &buffer,
bool blockScan(const BaseDB &db, const std::string &buffer,
size_t align, match_event_handler callback, void *ctx,
ResultSet *rs);
bool streamingScan(const HyperscanDB &db, const std::string &buffer,
bool streamingScan(const BaseDB &db, const std::string &buffer,
size_t align, match_event_handler callback, void *ctx,
ResultSet *rs);
bool vectoredScan(const HyperscanDB &db, const std::string &buffer,
bool vectoredScan(const BaseDB &db, const std::string &buffer,
size_t align, match_event_handler callback, void *ctx,
ResultSet *rs);
#ifdef HS_HYBRID
bool hybridScan(const BaseDB &db, const std::string &buffer,
size_t align, ch_match_event_handler callback,
ch_error_event_handler error_callback,
void *ctx, ResultSet *rs);
#endif // HS_HYBRID
char *setupScanBuffer(const char *buf, size_t len, size_t align);
char *setupVecScanBuffer(const char *buf, size_t len, size_t align,
unsigned int block_id);
bool allocScratch(std::shared_ptr<const HyperscanDB> db);
bool allocScratch(std::shared_ptr<const BaseDB> db);
bool cloneScratch(void);
@@ -126,6 +136,11 @@ private:
// Scratch space for Hyperscan.
hs_scratch_t *scratch;
#ifdef HS_HYBRID
// Scratch space for Chimera.
ch_scratch_t *chimeraScratch;
#endif // HS_HYBRID
// Temporary scan buffer used for realigned scanning
std::vector<char> m_scanBuf;
@@ -134,7 +149,7 @@ private:
// Last database we successfully allocated scratch for, so that we can
// avoid unnecessarily reallocating for it.
std::shared_ptr<const HyperscanDB> last_db;
std::shared_ptr<const BaseDB> last_db;
const hs_platform_info *platform;
};

View File

@@ -76,6 +76,7 @@ void usage(const char *name, const char *error) {
"blocks.\n");
printf(" -V NUM Use vectored mode, split data into ~NUM "
"blocks.\n");
printf(" -H Use hybrid mode.\n");
printf(" -Z {R or 0-%d} Only test one alignment, either as given or "
"'R' for random.\n", MAX_MAX_UE2_ALIGN - 1);
printf(" -q Quiet; display only match differences, no other "
@@ -90,6 +91,7 @@ void usage(const char *name, const char *error) {
printf(" -E DISTANCE Match all patterns within edit distance"
" DISTANCE.\n");
printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n");
printf(" --no-groups Disable capturing in Hybrid mode.\n");
printf("\n");
printf("Testing mode options:\n");
printf("\n");
@@ -157,7 +159,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
vector<string> *corpora, UNUSED Grey *grey,
unique_ptr<hs_platform_info> *plat_out) {
static const char options[]
= "-ab:cC:d:D:e:E:G:hi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8";
= "-ab:cC:d:D:e:E:G:hHi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8";
s32 in_multi = 0;
s32 in_corpora = 0;
int pcreFlag = 1;
@@ -180,6 +182,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
{"no-signal-handler", 0, &no_signal_handler, 1},
{"compress-expand", 0, &compressFlag, 1},
{"compress-reset-expand", 0, &compressResetFlag, 1},
{"no-groups", 0, &no_groups, 1},
{nullptr, 0, nullptr, 0}};
for (;;) {
@@ -271,6 +274,15 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
case 'h':
usage(argv[0], nullptr);
exit(0);
case 'H':
if (colliderMode != MODE_BLOCK) {
usage(argv[0], "You can only use one mode at a time!");
exit(1);
}
colliderMode = MODE_HYBRID;
// Disable graph truth in hybrid mode
nfaFlag = 0;
break;
case 'i':
loadDatabases = true;
serializePath = optarg;
@@ -542,6 +554,11 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
exit(1);
}
if (colliderMode == MODE_HYBRID && !ue2Flag) {
usage(argv[0], "You cannot disable UE2 engine in Hybrid mode.");
exit(1);
}
// need at least two pattern engines active
if (nfaFlag + pcreFlag + ue2Flag < 2) {
usage(argv[0], "At least two pattern engines should be active.");

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -36,7 +36,8 @@
enum ColliderMode {
MODE_BLOCK,
MODE_STREAMING,
MODE_VECTORED
MODE_VECTORED,
MODE_HYBRID
};
extern unsigned numThreads;
@@ -68,6 +69,7 @@ extern unsigned max_ue2_align;
extern size_t g_memoryLimit;
extern bool force_utf8;
extern int force_prefilter;
extern int no_groups;
extern unsigned somPrecisionMode;
extern unsigned limit_matches;
extern unsigned randomSeed;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -448,6 +448,9 @@ void printMode(void) {
case MODE_VECTORED:
cout << "Vectored-" << g_streamBlocks;
break;
case MODE_HYBRID:
cout << "Hybrid";
break;
}
if (use_copy_scratch) {
@@ -690,7 +693,7 @@ shared_ptr<DatabaseProxy> constructDatabase(const set<unsigned int> &ids,
if (loadDatabases) {
string filename = ultimate.dbFilename(ids);
shared_ptr<HyperscanDB> db = ultimate.loadDatabase(filename, ids);
shared_ptr<BaseDB> db = ultimate.loadDatabase(filename, ids);
if (!db) {
if (!g_quiet) {
cout << "FAILED: could not load database " << filename << endl;
@@ -706,7 +709,7 @@ shared_ptr<DatabaseProxy> constructDatabase(const set<unsigned int> &ids,
// If we're not runnable (i.e. we're cross-compiling), let's at least
// try to build the database.
if (!ultimate.runnable()) {
shared_ptr<HyperscanDB> db = ue2->get(ultimate);
shared_ptr<BaseDB> db = ue2->get(ultimate);
assert(db); // throws otherwise
}
@@ -872,7 +875,7 @@ void runTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph,
assert(use_UE2);
Corpus &corpus = unit.corpus;
shared_ptr<const HyperscanDB> db;
shared_ptr<const BaseDB> db;
if (use_UE2) {
// Acquire UE2 database.
debug_stage = STAGE_UE2_COMPILE;
@@ -1648,6 +1651,7 @@ void printSettingsV(const vector<string> &corporaFiles,
case MODE_BLOCK: cout << "block mode"; break;
case MODE_STREAMING: cout << "streaming mode"; break;
case MODE_VECTORED: cout << "vectored mode"; break;
case MODE_HYBRID: cout << "hybrid mode"; break;
}
cout << endl;
@@ -1746,6 +1750,7 @@ void printSettingsQ(const vector<string> &corporaFiles,
case MODE_BLOCK: cout << "block mode"; break;
case MODE_STREAMING: cout << "streaming mode"; break;
case MODE_VECTORED: cout << "vectored mode"; break;
case MODE_HYBRID: cout << "hybrid mode"; break;
}
cout << endl;