Logical Combination of patterns.

This commit is contained in:
Chang, Harry
2018-06-22 18:15:21 +08:00
parent 5895b8da25
commit 8a1c497f44
50 changed files with 2693 additions and 85 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -48,6 +48,7 @@
#include "nfagraph/ng_util.h"
#include "parser/Parser.h"
#include "parser/unsupported.h"
#include "parser/logical_combination.h"
#include "util/compile_context.h"
#include "util/make_unique.h"
#include "util/report_manager.h"
@@ -69,8 +70,11 @@ public:
CompiledNG(unique_ptr<NGHolder> g_in,
unique_ptr<ReportManager> rm_in)
: g(std::move(g_in)), rm(std::move(rm_in)) {}
CompiledNG(unique_ptr<ParsedLogical> pl_in)
: pl(std::move(pl_in)) {}
unique_ptr<ue2::NGHolder> g;
unique_ptr<ue2::ReportManager> rm;
unique_ptr<ue2::ParsedLogical> pl;
};
static
@@ -126,6 +130,14 @@ void CNGInfo::compile() {
}
try {
if (combination) {
auto pl = ue2::make_unique<ParsedLogical>();
pl->parseLogicalCombination(id, re.c_str(), ~0U, 0, ~0ULL);
pl->logicalKeyRenumber();
cng = make_unique<CompiledNG>(move(pl));
return;
}
bool isStreaming = colliderMode == MODE_STREAMING;
bool isVectored = colliderMode == MODE_VECTORED;
CompileContext cc(isStreaming, isVectored, get_current_target(),
@@ -199,6 +211,8 @@ unique_ptr<CNGInfo> GraphTruth::preprocess(unsigned id,
bool highlander = false;
bool prefilter = false;
bool som = false;
bool combination = false;
bool quiet = false;
auto i = m_expr.find(id);
if (i == m_expr.end()) {
@@ -214,7 +228,8 @@ unique_ptr<CNGInfo> GraphTruth::preprocess(unsigned id,
throw NGCompileFailure("Cannot parse expression flags.");
}
// read PCRE flags
if (!getPcreFlags(hs_flags, &flags, &highlander, &prefilter, &som)) {
if (!getPcreFlags(hs_flags, &flags, &highlander, &prefilter, &som,
&combination, &quiet)) {
throw NGCompileFailure("Cannot get PCRE flags.");
}
if (force_utf8) {
@@ -247,6 +262,8 @@ unique_ptr<CNGInfo> GraphTruth::preprocess(unsigned id,
cngi->highlander = highlander;
cngi->prefilter = prefilter;
cngi->som = som;
cngi->combination = combination;
cngi->quiet = quiet;
cngi->min_offset = ext.min_offset;
cngi->max_offset = ext.max_offset;
cngi->min_length = ext.min_length;
@@ -256,8 +273,95 @@ unique_ptr<CNGInfo> GraphTruth::preprocess(unsigned id,
return cngi;
}
/**
 * \brief Evaluates a run of logical operations and returns the final value.
 *
 * Operations numbered \p start through \p result (inclusive) are evaluated in
 * ascending order. The operation numbered \c i is read from
 * \c comb[i - lkeyCount], and its outcome is written to \c lv[op.id] so that
 * later operations can use it as an operand.
 *
 * \param lv        Truth values indexed by logical key; updated in place.
 * \param comb      Logical operations to evaluate.
 * \param lkeyCount Offset subtracted from an operation number to index
 *                  \p comb.
 * \param start     Number of the first operation to evaluate.
 * \param result    Number of the last operation to evaluate.
 * \return The value stored in \c lv[result] after evaluation.
 */
static
char isLogicalCombination(vector<char> &lv, const vector<LogicalOp> &comb,
                          size_t lkeyCount, unsigned start, unsigned result) {
    assert(start <= result);
    unsigned key = start;
    while (key <= result) {
        const LogicalOp &node = comb[key - lkeyCount];
        assert(key == node.id);
        if (node.op == LOGICAL_OP_NOT) {
            // Unary: only the right operand is consulted.
            lv[node.id] = !lv[node.ro];
        } else if (node.op == LOGICAL_OP_AND) {
            lv[node.id] = lv[node.lo] & lv[node.ro]; // &&
        } else if (node.op == LOGICAL_OP_OR) {
            lv[node.id] = lv[node.lo] | lv[node.ro]; // ||
        } else {
            // Unknown operation: lv[node.id] is left untouched.
            assert(0);
        }
        ++key;
    }
    return lv[result];
}
bool GraphTruth::run(unsigned, const CompiledNG &cng, const CNGInfo &cngi,
const string &buffer, ResultSet &rs, string &) {
const string &buffer, ResultSet &rs, string &error) {
if (cngi.quiet) {
return true;
}
if (cngi.combination) {
// Compile and run sub-expressions, store match results.
map<unsigned long long, set<MatchResult>> offset_to_matches;
map<unsigned long long, set<unsigned>> offset_to_lkeys;
set<unsigned> sub_exps;
const auto &m_lkey = cng.pl->getLkeyMap();
for (const auto &it_lkey : m_lkey) {
if (sub_exps.find(it_lkey.first) == sub_exps.end()) {
sub_exps.emplace(it_lkey.first);
ResultSet sub_rs(RESULT_FROM_PCRE);
shared_ptr<CNGInfo> sub_cngi = preprocess(it_lkey.first);
const CompiledNG *sub_cng;
try {
sub_cng = sub_cngi->get();
}
catch (const NGCompileFailure &err) {
return false;
}
catch (const NGUnsupportedFailure &err) {
return false;
}
sub_cngi->quiet = false; // force not quiet in sub-exp.
if (!run(it_lkey.first, *sub_cng, *sub_cngi, buffer, sub_rs, error)) {
rs.clear();
return false;
}
for (const auto &it_mr : sub_rs.matches) {
offset_to_matches[it_mr.to].emplace(it_mr);
offset_to_lkeys[it_mr.to].emplace(it_lkey.second);
if (sub_cngi->highlander) {
break;
}
}
}
}
// Calculate rs for combination expression.
vector<char> lv;
const auto &comb = cng.pl->getLogicalTree();
lv.resize(m_lkey.size() + comb.size());
const auto &li = cng.pl->getCombInfoById(cngi.id);
for (const auto &it : offset_to_lkeys) {
for (auto report : it.second) {
lv[report] = 1;
}
if (isLogicalCombination(lv, comb, m_lkey.size(),
li.start, li.result)) {
for (const auto &mr : offset_to_matches.at(it.first)) {
if ((mr.to >= cngi.min_offset) &&
(mr.to <= cngi.max_offset)) {
rs.addMatch(mr.from, mr.to);
}
}
}
}
return true;
}
set<pair<size_t, size_t>> matches;
if (g_streamOffset) {

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -106,6 +106,10 @@ public:
bool highlander = false;
bool prefilter = false;
bool som = false;
bool combination = false;
bool quiet = false;
unsigned id;
private:
void compile();
// If NFA graph scan failed for some reason, we mark it as bad and skip
@@ -116,8 +120,6 @@ private:
std::unique_ptr<CompiledNG> cng; // compiled NFA graph
std::mutex cng_mutex; // serialised accesses to NFA graph
unsigned id;
// Our expression map
const ExpressionMap &m_expr;
};

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -100,7 +100,8 @@ int pcreCallOut(pcre_callout_block *block) {
static
bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
bool *prefilter, bool *som, hs_expr_ext *ext) {
bool *prefilter, bool *som, bool *combination,
bool *quiet, hs_expr_ext *ext) {
string regex;
unsigned int hs_flags = 0;
if (!readExpression(expr, regex, &hs_flags, ext)) {
@@ -109,7 +110,8 @@ bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
expr.swap(regex);
if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som)) {
if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som,
combination, quiet)) {
return false;
}
@@ -221,6 +223,8 @@ GroundTruth::compile(unsigned id, bool no_callouts) {
bool highlander = false;
bool prefilter = false;
bool som = false;
bool combination = false;
bool quiet = false;
// we can still match approximate matching patterns with PCRE if edit
// distance 0 is requested
@@ -238,7 +242,8 @@ GroundTruth::compile(unsigned id, bool no_callouts) {
hs_expr_ext ext;
// Decode the flags
if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som, &ext)) {
if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som,
&combination, &quiet, &ext)) {
throw PcreCompileFailure("Unable to decode flags.");
}
@@ -261,7 +266,7 @@ GroundTruth::compile(unsigned id, bool no_callouts) {
som |= !!somFlags;
// For traditional Hyperscan, add global callout to pattern.
if (!no_callouts) {
if (!combination && !no_callouts) {
addCallout(re);
}
@@ -275,12 +280,22 @@ GroundTruth::compile(unsigned id, bool no_callouts) {
compiled->highlander = highlander;
compiled->prefilter = prefilter;
compiled->som = som;
compiled->combination = combination;
compiled->quiet = quiet;
compiled->min_offset = ext.min_offset;
compiled->max_offset = ext.max_offset;
compiled->min_length = ext.min_length;
compiled->expression = i->second; // original PCRE
flags |= PCRE_NO_AUTO_POSSESS;
if (compiled->combination) {
compiled->pl.parseLogicalCombination(id, re.c_str(), ~0U, 0, ~0ULL);
compiled->pl.logicalKeyRenumber();
compiled->report = id;
return compiled;
}
compiled->bytecode =
pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);
@@ -424,8 +439,94 @@ int scanOffset(const CompiledPcre &compiled, const string &buffer,
return ret;
}
/**
 * \brief Evaluates a run of logical operations and returns the final value.
 *
 * Walks operation numbers from \p start up to and including \p result. Each
 * operation is fetched from \c comb at index (number - \p lkeyCount), and its
 * outcome is stored in \c lv[op.id] where subsequent operations may read it.
 *
 * \param lv        Truth values indexed by logical key; updated in place.
 * \param comb      Logical operations to evaluate.
 * \param lkeyCount Offset subtracted from an operation number to index
 *                  \p comb.
 * \param start     Number of the first operation to evaluate.
 * \param result    Number of the last operation to evaluate.
 * \return The value stored in \c lv[result] after evaluation.
 */
static
char isLogicalCombination(vector<char> &lv, const vector<LogicalOp> &comb,
                          size_t lkeyCount, unsigned start, unsigned result) {
    assert(start <= result);
    unsigned cur = start;
    while (cur <= result) {
        const LogicalOp &step = comb[cur - lkeyCount];
        assert(cur == step.id);
        if (step.op == LOGICAL_OP_NOT) {
            // Unary: only the right operand is consulted.
            lv[step.id] = !lv[step.ro];
        } else if (step.op == LOGICAL_OP_AND) {
            lv[step.id] = lv[step.lo] & lv[step.ro]; // &&
        } else if (step.op == LOGICAL_OP_OR) {
            lv[step.id] = lv[step.lo] | lv[step.ro]; // ||
        } else {
            // Unknown operation: lv[step.id] is left untouched.
            assert(0);
        }
        ++cur;
    }
    return lv[result];
}
bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
const string &buffer, ResultSet &rs, string &error) {
if (compiled.quiet) {
return true;
}
if (compiled.combination) {
// Compile and run sub-expressions, store match results.
map<unsigned long long, set<MatchResult>> offset_to_matches;
map<unsigned long long, set<unsigned>> offset_to_lkeys;
set<unsigned> sub_exps;
const auto &m_lkey = compiled.pl.getLkeyMap();
for (const auto &it_lkey : m_lkey) {
if (sub_exps.find(it_lkey.first) == sub_exps.end()) {
sub_exps.emplace(it_lkey.first);
ResultSet sub_rs(RESULT_FROM_PCRE);
shared_ptr<CompiledPcre> sub_pcre;
try {
sub_pcre = compile(it_lkey.first);
}
catch (const SoftPcreCompileFailure &err) {
return false;
}
catch (const PcreCompileFailure &err) {
return false;
}
sub_pcre->quiet = false; // force not quiet in sub-exp.
if (!run(it_lkey.first, *sub_pcre, buffer, sub_rs, error)) {
rs.clear();
return false;
}
for (const auto &it_mr : sub_rs.matches) {
offset_to_matches[it_mr.to].emplace(it_mr);
offset_to_lkeys[it_mr.to].emplace(it_lkey.second);
if (sub_pcre->highlander) {
break;
}
}
}
}
// Calculate rs for combination expression.
vector<char> lv;
const auto &comb = compiled.pl.getLogicalTree();
lv.resize(m_lkey.size() + comb.size());
const auto &li = compiled.pl.getCombInfoById(compiled.report);
for (const auto &it : offset_to_lkeys) {
for (auto report : it.second) {
lv[report] = 1;
}
if (isLogicalCombination(lv, comb, m_lkey.size(),
li.start, li.result)) {
for (const auto &mr : offset_to_matches.at(it.first)) {
if ((mr.to >= compiled.min_offset) &&
(mr.to <= compiled.max_offset)) {
rs.addMatch(mr.from, mr.to);
}
}
}
}
return true;
}
CalloutContext ctx(out);
pcre_extra extra;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -31,6 +31,7 @@
#include "expressions.h"
#include "ResultSet.h"
#include "parser/logical_combination.h"
#include <memory>
#include <mutex>
@@ -85,6 +86,14 @@ public:
bool highlander = false;
bool prefilter = false;
bool som = false;
bool combination = false;
bool quiet = false;
// Parsed logical combinations.
ue2::ParsedLogical pl;
// Combination expression report id.
unsigned report;
private:
// If a PCRE has hit its match recursion limit when scanning a corpus, we

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -80,6 +80,39 @@ void NfaGeneratedCorpora::generate(unsigned id, vector<Corpus> &data) {
throw CorpusFailure("Expression could not be read: " + i->second);
}
// A combination's corpus consists of its sub-expressions' corpora.
if (hs_flags & HS_FLAG_COMBINATION) {
ParsedLogical pl;
pl.parseLogicalCombination(id, re.c_str(), ~0U, 0, ~0ULL);
pl.logicalKeyRenumber();
const auto &m_lkey = pl.getLkeyMap();
assert(!m_lkey.empty());
u32 a_subid; // arbitrary sub id
unordered_map<u32, vector<Corpus>> m_data;
for (const auto &it : m_lkey) {
a_subid = it.first;
vector<Corpus> sub_data;
generate(a_subid, sub_data);
m_data.emplace(a_subid, move(sub_data));
}
assert(!m_data.empty());
size_t num_corpus = m_data[a_subid].size();
data.reserve(data.size() + num_corpus);
while (num_corpus) {
string cc; // 1 combination corpus
for (const auto &it : m_lkey) {
assert(!m_data[it.first].empty());
cc += m_data[it.first].back().data;
if (m_data[it.first].size() > 1) {
m_data[it.first].pop_back();
}
}
data.push_back(Corpus(cc));
num_corpus--;
}
return;
}
if (force_utf8_mode) {
hs_flags |= HS_FLAG_UTF8;
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -114,6 +114,13 @@ public:
}
}
// Reset this result set by emptying the matches, dupe_matches and
// matches_by_block containers.
void clear() {
    matches_by_block.clear();
    dupe_matches.clear();
    matches.clear();
}
// Unexpected out of order match seen.
bool uoom = false;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -34,7 +34,8 @@
#include <pcre.h> /* for pcre flags */
bool getPcreFlags(unsigned int hs_flags, unsigned int *flags,
bool *highlander, bool *prefilter, bool *som) {
bool *highlander, bool *prefilter, bool *som,
bool *combination, bool *quiet) {
assert(flags);
assert(highlander);
assert(prefilter);
@@ -76,6 +77,14 @@ bool getPcreFlags(unsigned int hs_flags, unsigned int *flags,
*som = true;
hs_flags &= ~HS_FLAG_SOM_LEFTMOST;
}
if (hs_flags & HS_FLAG_COMBINATION) {
*combination = true;
hs_flags &= ~HS_FLAG_COMBINATION;
}
if (hs_flags & HS_FLAG_QUIET) {
*quiet = true;
hs_flags &= ~HS_FLAG_QUIET;
}
// Flags that are irrelevant to PCRE.
hs_flags &= ~HS_FLAG_ALLOWEMPTY;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -35,7 +35,8 @@
* Returns false if an unknown hyperscan flag is encountered.
*/
bool getPcreFlags(unsigned int hs_flags, unsigned int *pcre_flags,
bool *highlander, bool *prefilter, bool *som);
bool *highlander, bool *prefilter, bool *som,
bool *combination = nullptr, bool *quiet = nullptr);
#endif /* PCRE_UTIL_H */