Initial commit of Hyperscan

This commit is contained in:
Matthew Barr
2015-10-20 09:13:35 +11:00
commit 904e436f11
610 changed files with 213627 additions and 0 deletions

33
util/CMakeLists.txt Normal file
View File

@@ -0,0 +1,33 @@
# utility libs
# Append the project-wide extra C++ flags for everything built in this directory.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
# Make headers in this source directory and its build directory includable.
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
# Apply the Ragel-specific compile flags to the ExpressionParser.cpp source.
# NOTE(review): the path below is under tools/ while this file lives in util/;
# confirm it matches where ragelmaker() places its output, otherwise these
# flags are attached to a file that is never compiled.
set_source_files_properties(
${CMAKE_BINARY_DIR}/tools/ExpressionParser.cpp
PROPERTIES
COMPILE_FLAGS "${RAGEL_C_FLAGS}")
# Process the Ragel source. Presumably this is what produces
# ExpressionParser.cpp and the ragel_ExpressionParser target used below
# -- TODO confirm against the ragelmaker() definition.
ragelmaker(ExpressionParser.rl)
# Library: expression-file loading and expression-line parsing.
set(expressionutil_SRCS
expressions.cpp
expressions.h
ExpressionParser.h
ExpressionParser.cpp
)
add_library(expressionutil ${expressionutil_SRCS})
# Build the ragel_ExpressionParser target before the library itself.
add_dependencies(expressionutil ragel_ExpressionParser)
# Library: corpus generation, corpus editing and match finding helpers.
SET(corpusomatic_SRCS
ng_corpus_editor.h
ng_corpus_editor.cpp
ng_corpus_generator.h
ng_corpus_generator.cpp
ng_corpus_properties.h
ng_corpus_properties.cpp
ng_find_matches.h
ng_find_matches.cpp
)
add_library(corpusomatic ${corpusomatic_SRCS})

40
util/ExpressionParser.h Normal file
View File

@@ -0,0 +1,40 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef EXPRESSIONPARSER_H
#define EXPRESSIONPARSER_H
#include <string>
struct hs_expr_ext;
/**
 * \brief Split a line of the form "/expr/flags{params}" into its parts.
 *
 * \param line            Text to parse; it must begin with '/'.
 * \param expr            Receives the text between the first and the last '/'.
 * \param flags           Must be non-null; reset to 0 and then filled with the
 *                        HS_FLAG_* bits named by the single-character flags.
 * \param ext             Must be non-null; reset and then filled with any
 *                        min_offset / max_offset / min_length parameters.
 * \param must_be_ordered Optional. When non-null it is set to true if the
 *                        'O' flag is present and to false otherwise.
 * \return true if the whole line was accepted, false on any syntax error.
 */
bool readExpression(const std::string &line, std::string &expr,
unsigned int *flags, hs_expr_ext *ext,
bool *must_be_ordered = nullptr);
#endif

173
util/ExpressionParser.rl Normal file
View File

@@ -0,0 +1,173 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "ExpressionParser.h"
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include "ue2common.h"
#include "hs_compile.h"
using std::string;
namespace { // anon
// Identifies which extended parameter a key=value pair refers to.
// PARAM_NONE means that no key has been recognised yet.
enum ParamKey {
PARAM_NONE,
PARAM_MIN_OFFSET,
PARAM_MAX_OFFSET,
PARAM_MIN_LENGTH
};
%%{
machine ExpressionParser;
# Shift the decimal digit under the cursor into the running value num.
# NOTE(review): there is no overflow guard on num here.
action accumulateNum {
num = (num * 10) + (fc - '0');
}
# Translate the flag character under the cursor into its HS_FLAG bit.
action handleFlag {
switch (fc) {
case 'i': *flags |= HS_FLAG_CASELESS; break;
case 's': *flags |= HS_FLAG_DOTALL; break;
case 'm': *flags |= HS_FLAG_MULTILINE; break;
case 'H': *flags |= HS_FLAG_SINGLEMATCH; break;
case 'O':
// This flag sets no bit in flags. It is only reported through
// must_be_ordered, and only when the caller supplied that pointer.
if (must_be_ordered) {
*must_be_ordered = true;
}
break;
case 'V': *flags |= HS_FLAG_ALLOWEMPTY; break;
case 'W': *flags |= HS_FLAG_UCP; break;
case '8': *flags |= HS_FLAG_UTF8; break;
case 'P': *flags |= HS_FLAG_PREFILTER; break;
case 'L': *flags |= HS_FLAG_SOM_LEFTMOST; break;
default: fbreak;
}
}
# Store the accumulated number into the ext field selected by key and
# record in ext->flags that the field was supplied.
action handleExtParam {
switch (key) {
case PARAM_MIN_OFFSET:
ext->flags |= HS_EXT_FLAG_MIN_OFFSET;
ext->min_offset = num;
break;
case PARAM_MAX_OFFSET:
ext->flags |= HS_EXT_FLAG_MAX_OFFSET;
ext->max_offset = num;
break;
case PARAM_MIN_LENGTH:
ext->flags |= HS_EXT_FLAG_MIN_LENGTH;
ext->min_length = num;
break;
case PARAM_NONE:
default:
// No key specified, syntax invalid.
// This returns from the function that hosts the generated scanner.
return false;
}
}
write data;
}%%
} // namespace
/** \brief Zero every field of \p ext, then set its max_offset to MAX_OFFSET. */
static
void initExt(hs_expr_ext *ext) {
    std::memset(ext, 0, sizeof(hs_expr_ext));
    ext->max_offset = MAX_OFFSET;
}
/**
 * Parse a line of the form "/expr/flags{params}".
 *
 * The expression text is cut out with plain string operations; the tail after
 * the last '/' is then run through the Ragel machine defined below.
 *
 * \param input           Text to parse; it must begin with '/'.
 * \param expr            Receives the text between the first and last '/'.
 * \param flags           Non-null; reset to 0, then filled by handleFlag.
 * \param ext             Non-null; reset by initExt, then filled by
 *                        handleExtParam.
 * \param must_be_ordered Optional; reset to false when non-null and set to
 *                        true by the 'O' flag.
 * \return true only if the scanner did not end in its error state and
 *         consumed the whole tail.
 */
bool readExpression(const std::string &input, std::string &expr,
unsigned int *flags, hs_expr_ext *ext,
bool *must_be_ordered) {
assert(flags);
assert(ext);
// Init flags and ext params.
*flags = 0;
initExt(ext);
if (must_be_ordered) {
*must_be_ordered = false;
}
// Extract expr, which is easier to do in straight C++ than with Ragel.
if (input.empty() || input[0] != '/') {
return false;
}
// The closing delimiter is the LAST '/', so the expression itself may
// contain '/' characters. A result of 0 means there is only the opening one.
size_t end = input.find_last_of('/');
if (end == string::npos || end == 0) {
return false;
}
expr = input.substr(1, end - 1);
// Use a Ragel scanner to handle flags and params.
// p and pe delimit the tail that follows the closing '/'.
const char *p = input.c_str() + end + 1;
const char *pe = input.c_str() + input.size();
UNUSED const char *eof = pe;
UNUSED const char *ts = p, *te = p;
int cs;
UNUSED int act;
assert(p);
assert(pe);
// For storing integers as they're scanned.
u64a num = 0;
// Which extended parameter the value currently being scanned belongs to.
enum ParamKey key = PARAM_NONE;
%%{
single_flag = [ismW8HPLVO];
param = ('min_offset' @{ key = PARAM_MIN_OFFSET; } |
'max_offset' @{ key = PARAM_MAX_OFFSET; } |
'min_length' @{ key = PARAM_MIN_LENGTH; } );
value = (digit @accumulateNum)+ >{num = 0;};
param_spec = (' '* param '=' value ' '*) >{ key = PARAM_NONE; }
%handleExtParam;
main := ( single_flag @handleFlag )* # single-char flags
( '{' param_spec (',' param_spec)* '}' )? # list of ext params
$^{ return false; };
# Initialize and execute.
write init;
write exec;
}%%
DEBUG_PRINTF("expr='%s', flags=%u\n", expr.c_str(), *flags);
// Success needs both a non-error final state and a fully consumed tail.
return (cs != ExpressionParser_error) && (p == pe);
}

297
util/expressions.cpp Normal file
View File

@@ -0,0 +1,297 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <algorithm>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <boost/algorithm/string/trim.hpp>
#include <sys/types.h>
#include <sys/stat.h>
#if !defined(_WIN32)
#include <dirent.h>
#include <unistd.h>
#else
// Windows support is probably very fragile
#include <windows.h>
#endif
#include "expressions.h"
#include "hs.h"
#include "string_util.h"
using namespace std;
/**
 * Report a parse failure on stderr, naming the file, the line number and the
 * offending line, then terminate the process with exit status 1.
 */
static
void failLine(unsigned lineNum, const std::string &file,
              const std::string &line, const std::string &error) {
    std::cerr << "Parse error in file " << file << " on line " << lineNum
              << ": " << error << std::endl;
    std::cerr << "Line is: '" << line << "'" << std::endl;
    exit(1);
}
static
void processLine(string &line, unsigned lineNum,
const string &file, ExpressionMap &exprMap) {
// if line is empty, or a comment, we can skip it
if (line.empty() || line[0] == '#') {
return;
}
// cull any whitespace
boost::trim(line);
// otherwise, it should be ID:PCRE, e.g.
// 10001:/foobar/is
size_t colonIdx = line.find_first_of(':');
if (colonIdx == string::npos) {
failLine(lineNum, file, line, "Could not parse line.");
}
// we should have an unsigned int as an ID, before the colon
unsigned id;
if (!fromString(line.substr(0, colonIdx), id)) {
failLine(lineNum, file, line, "Unable to parse ID.");
}
// rest of the expression is the PCRE
const string pcre_str(line.substr(colonIdx + 1));
//cout << "Inserting expr: id=" << id << ", pcre=" << pcre_str << endl;
bool ins = exprMap.insert(ExpressionMap::value_type(id, pcre_str)).second;
if (!ins) {
failLine(lineNum, file, line, "Duplicate ID found.");
}
}
#if defined(_WIN32)
// Let the code below use the names stat, S_ISDIR and S_ISREG on Windows by
// mapping them onto the underscore-prefixed _stat, _S_IFDIR and _S_IFREG.
// Note that the stat macro renames both the struct and the function.
#define stat _stat
#define S_ISDIR(st_m) (_S_IFDIR & (st_m))
#define S_ISREG(st_m) (_S_IFREG & (st_m))
#endif
/**
 * Read every line of \p fname through processLine() into \p exprMap.
 *
 * A path that cannot be stat'ed, or that is not a regular file, is skipped
 * without any diagnostic. A regular file that cannot be opened raises
 * std::runtime_error.
 */
void loadExpressionsFromFile(const std::string &fname, ExpressionMap &exprMap) {
    struct stat st;
    if (stat(fname.c_str(), &st) != 0 || !S_ISREG(st.st_mode)) {
        return;
    }

    std::ifstream f(fname.c_str());
    if (!f.good()) {
        throw std::runtime_error("Can't open file");
    }

    // Line numbers passed on for error reporting are one-based.
    std::string line;
    for (unsigned lineNum = 1; std::getline(f, line); ++lineNum) {
        processLine(line, lineNum, fname, exprMap);
    }
}
/**
 * True for file names that should not be loaded: the empty name, editor
 * backup files (trailing '~') and dotfiles (leading '.').
 */
static
bool isIgnorable(const std::string &f) {
    return f.empty() || f.back() == '~' || f.front() == '.';
}
#ifndef _WIN32
/**
 * Load expressions from \p inPath into \p exprMap.
 *
 * \p inPath may name a single regular file or a directory. For a directory,
 * every entry except ".", ".." and names rejected by isIgnorable() is passed
 * to loadExpressionsFromFile(). Any failure prints a message to stderr and
 * terminates the process with exit status 1.
 */
void loadExpressions(const std::string &inPath, ExpressionMap &exprMap) {
    // Is our input path a file or a directory?
    struct stat st;
    if (stat(inPath.c_str(), &st) != 0) {
        std::cerr << "Can't stat path: '" << inPath << "'" << std::endl;
        exit(1);
    }

    if (S_ISREG(st.st_mode)) {
        // Single file.
        try {
            loadExpressionsFromFile(inPath, exprMap);
        } catch (std::runtime_error &e) {
            std::cerr << e.what() << ": '" << inPath << "'" << std::endl;
            exit(1);
        }
        return;
    }

    if (!S_ISDIR(st.st_mode)) {
        // Neither a regular file nor a directory.
        std::cerr << "Can't stat path: '" << inPath << "'" << std::endl;
        exit(1);
    }

    DIR *d = opendir(inPath.c_str());
    if (d == nullptr) {
        std::cerr << "Can't open directory: '" << inPath << "'" << std::endl;
        exit(1);
    }

    while (struct dirent *ent = readdir(d)) {
        const std::string basename(ent->d_name);

        // Ignore '.' and '..'
        if (basename == "." || basename == "..") {
            continue;
        }

        const std::string fname = inPath + '/' + basename;

        // Skip emacs backup files, dotfiles (such as VIM swap).
        if (isIgnorable(basename)) {
            std::cerr << "Ignoring signature file " << fname << std::endl;
            continue;
        }

        try {
            loadExpressionsFromFile(fname, exprMap);
        } catch (std::runtime_error &e) {
            std::cerr << e.what() << ": '" << fname << "'" << std::endl;
            exit(1);
        }
    }
    closedir(d);
}
#else // windows TODO: improve
/**
 * Windows variant: load expressions from \p inPath into \p exprMap.
 *
 * \p inPath may name a single regular file or a directory. For a directory,
 * every entry found through FindFirstFile/FindNextFile, except ".", ".." and
 * names rejected by isIgnorable(), is passed to loadExpressionsFromFile().
 * Any failure prints a message to stderr and exits with status 1.
 */
void loadExpressions(const string &inPath, ExpressionMap &exprMap) {
// Is our input path a file or a directory?
struct stat st;
if (stat(inPath.c_str(), &st) != 0) {
cerr << "Can't stat path: '" << inPath << "'" << endl;
exit(1);
}
if (S_ISREG(st.st_mode)) {
// process file
try {
loadExpressionsFromFile(inPath, exprMap);
} catch (runtime_error &e) {
cerr << e.what() << ": '" << inPath << "'" << endl;
exit(1);
}
} else if (S_ISDIR(st.st_mode)) {
WIN32_FIND_DATA ffd;
HANDLE hFind = INVALID_HANDLE_VALUE;
// Enumerate every entry directly inside the directory.
string glob = inPath + "/*";
hFind = FindFirstFile(glob.c_str(), &ffd);
if (hFind == INVALID_HANDLE_VALUE) {
cerr << "Can't open directory: '" << inPath << "'" << endl;
exit(1);
}
// FindFirstFile already produced the first entry, hence the do/while.
// A continue below jumps to the FindNextFile call in the loop condition.
do {
string basename(ffd.cFileName);
string fname(inPath);
fname.push_back('/');
fname.append(basename);
// Ignore '.' and '..'
if (basename == "." || basename == "..") {
continue;
}
// Skip emacs backup files, dotfiles (such as VIM swap).
if (isIgnorable(basename)) {
cerr << "Ignoring signature file " << fname << endl;
continue;
}
try {
loadExpressionsFromFile(fname, exprMap);
} catch (runtime_error &e) {
cerr << e.what() << ": '" << fname << "'" << endl;
exit(1);
}
} while (FindNextFile(hFind, &ffd) != 0);
FindClose(hFind);
} else {
// Neither a regular file nor a directory.
cerr << "Can't stat path: '" << inPath << "'" << endl;
exit(1);
}
}
#endif
/**
 * Read a list of signature IDs, one per line, from \p inFile and append them
 * to \p signatures in file order.
 *
 * Empty lines and lines starting with '#' are skipped. An unopenable file
 * exits with status 1; an unparsable line is reported through failLine().
 */
void loadSignatureList(const std::string &inFile,
                       SignatureSet &signatures) {
    std::ifstream f(inFile.c_str());
    if (!f.good()) {
        std::cerr << "Can't open file: '" << inFile << "'" << std::endl;
        exit(1);
    }

    // Line numbers passed on for error reporting are one-based.
    std::string line;
    for (unsigned lineNum = 1; std::getline(f, line); ++lineNum) {
        // if line is empty, or a comment, we can skip it
        if (line.empty() || line[0] == '#') {
            continue;
        }

        unsigned id;
        if (!fromString(line, id)) {
            // Parse error occurred
            failLine(lineNum, inFile, line, "Unable to parse ID.");
        } else {
            signatures.push_back(id);
        }
    }
}
/**
 * Reduce \p exprMap, in place, to the entries whose IDs appear in
 * \p signatures. An ID with no entry in the map is fatal: a message is
 * printed to stderr and the process exits with status 1.
 */
void limitBySignature(ExpressionMap &exprMap,
                      const SignatureSet &signatures) {
    ExpressionMap keepers;

    for (const auto &id : signatures) {
        const auto found = exprMap.find(id);
        if (found == exprMap.end()) {
            std::cerr << "Unable to find signature " << id
                      << " in expression set!" << std::endl;
            exit(1);
        }
        keepers.insert(*found);
    }

    // Replace the caller's map with the filtered set.
    exprMap.swap(keepers);
}

55
util/expressions.h Normal file
View File

@@ -0,0 +1,55 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef EXPRESSIONS_H
#define EXPRESSIONS_H
#include <map>
#include <string>
#include <list>
// Maps an expression ID to the expression text that followed "ID:" on its line.
typedef std::map<unsigned, std::string> ExpressionMap;
// A list of expression IDs, kept in the order in which they were read.
typedef std::list<unsigned> SignatureSet;
// load all of the expressions from the given path into the given
// expression map. The path may be a single regular file or a directory, in
// which case each non-ignored entry in it is loaded. Exits on failure.
void loadExpressions(const std::string &inDir, ExpressionMap &exprMap);
// load the expressions from one file. A path that is not a regular file is
// skipped silently; a file that cannot be opened throws std::runtime_error;
// a malformed line or duplicate ID exits.
void loadExpressionsFromFile(const std::string &fname, ExpressionMap &exprMap);
// load a list of signature IDs
void loadSignatureList(const std::string &inFile, SignatureSet &signatures);
// produce a new expression map only containing those signatures in the
// expression list
// NOTE(review): no definition of this function appears in expressions.cpp as
// reviewed; confirm it is defined elsewhere or drop the declaration.
void generateExprMap(const SignatureSet &signatures,
const ExpressionMap &allExprs, ExpressionMap &out);
// trim expression map to only the given signatures (in-place). Exits if a
// requested signature is not present in the map.
void limitBySignature(ExpressionMap &exprMap, const SignatureSet &signatures);
#endif

290
util/ng_corpus_editor.cpp Normal file
View File

@@ -0,0 +1,290 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Corpus Editor: applies random transformation to a corpus.
*/
#include "config.h"
#include "ng_corpus_editor.h"
#include "ng_corpus_properties.h"
#include "ue2common.h"
#include "util/compare.h"
#include "util/unicode_def.h"
#include "parser/ucp_table.h"
#include <algorithm>
#include <cassert>
#include <string>
using namespace std;
using namespace ue2;
namespace {
// The kinds of random edit that can be applied to a corpus. chooseOperation()
// draws a value between EDIT_INSERT and EDIT_FLIP_CASE, so those two must stay
// the first and last enumerators and the values in between must be contiguous.
enum Operation {
EDIT_INSERT = 0, //!< insert a character
EDIT_REMOVE = 1, //!< remove a character
EDIT_SUBSTITUTE = 2, //!< substitute a character for another
EDIT_TRANSPOSE = 3, //!< swap two characters
EDIT_FLIP_CASE = 4, //!< invert the case of an alpha character
};
/** \brief Draw a random index into \p corpus, which must not be empty. */
template<typename SeqT>
static
size_t choosePosition(const SeqT &corpus, CorpusProperties &props) {
    assert(!corpus.empty());
    return static_cast<unsigned>(props.rand(0, corpus.size() - 1));
}
/**
 * \brief Applies random single-byte edits to a corpus string.
 *
 * The number of edits and the random source both come from the
 * CorpusProperties object the editor is bound to.
 */
class CorpusEditor {
public:
    // explicit: a CorpusProperties must never convert silently to an editor.
    explicit CorpusEditor(CorpusProperties &p) : props(p) {}

    // Apply props.editDistance randomly chosen edits to the corpus, in place.
    void applyEdits(std::string &corpus);

private:
    // operations
    void insert(std::string &corpus);     // add a random byte at a random position
    void remove(std::string &corpus);     // delete the byte at a random position
    void substitute(std::string &corpus); // overwrite a random position with a random byte
    void transpose(std::string &corpus);  // swap the bytes at two random positions
    void flip_case(std::string &corpus);  // invert the case of one alpha character

    // Pick one of the Operation values at random.
    Operation chooseOperation();
    // Pick a random byte value in [0, 255].
    u8 chooseByte();

    // Source of randomness and of the edit distance; not owned.
    CorpusProperties &props;
};
/** \brief Draw one edit operation from the range EDIT_INSERT..EDIT_FLIP_CASE. */
Operation CorpusEditor::chooseOperation() {
    return static_cast<Operation>(props.rand(EDIT_INSERT, EDIT_FLIP_CASE));
}
void CorpusEditor::applyEdits(string &corpus) {
for (size_t i = 0; i != props.editDistance; i++) {
Operation op = chooseOperation();
switch (op) {
case EDIT_INSERT:
insert(corpus);
break;
case EDIT_REMOVE:
remove(corpus);
break;
case EDIT_SUBSTITUTE:
substitute(corpus);
break;
case EDIT_TRANSPOSE:
transpose(corpus);
break;
case EDIT_FLIP_CASE:
flip_case(corpus);
break;
}
}
}
void CorpusEditor::insert(string &corpus) {
unsigned pos = props.rand(0, corpus.size());
u8 c = chooseByte();
corpus.insert(pos, 1, (char)c);
}
void CorpusEditor::remove(string &corpus) {
if (corpus.empty()) return;
size_t pos = choosePosition(corpus, props);
corpus.erase(pos, 1);
}
void CorpusEditor::substitute(string &corpus) {
if (corpus.empty()) return;
size_t pos = choosePosition(corpus, props);
corpus[pos] = chooseByte();
}
void CorpusEditor::transpose(string &corpus) {
if (corpus.empty()) return;
size_t a = choosePosition(corpus, props);
size_t b = choosePosition(corpus, props);
u8 tmp = corpus[a];
corpus[a] = corpus[b];
corpus[b] = tmp;
}
void CorpusEditor::flip_case(string &corpus) {
if (corpus.empty()) return;
// Pick a random starting position and walk forward (wrapping at the end)
// until we find an alpha character.
const size_t len = corpus.size();
const size_t pos = choosePosition(corpus, props);
size_t i = pos;
for (;;) {
char c = corpus[i];
if (ourisalpha(c)) {
char upper = mytoupper(c), lower = mytolower(c);
corpus[i] = c == upper ? lower : upper;
DEBUG_PRINTF("flipped c=%c to %c\n", c, corpus[i]);
return;
}
if (++i == len) {
i = 0;
}
if (i == pos) { // wrapped, no alpha characters
break;
}
}
}
/** \brief Draw a random byte value from the range 0 to 255. */
u8 CorpusEditor::chooseByte() {
    return static_cast<u8>(props.rand(0, 255));
}
class CorpusEditorUtf8 {
public:
CorpusEditorUtf8(CorpusProperties &p) : props(p) {}
// Apply edits to a corpus.
void applyEdits(vector<unichar> &corpus);
private:
// operations
void insert(vector<unichar> &corpus);
void remove(vector<unichar> &corpus);
void substitute(vector<unichar> &corpus);
void transpose(vector<unichar> &corpus);
void flip_case(vector<unichar> &corpus);
Operation chooseOperation();
unichar chooseCodePoint();
CorpusProperties &props;
};
/** \brief Draw one edit operation from the range EDIT_INSERT..EDIT_FLIP_CASE. */
Operation CorpusEditorUtf8::chooseOperation() {
    return static_cast<Operation>(props.rand(EDIT_INSERT, EDIT_FLIP_CASE));
}
void CorpusEditorUtf8::applyEdits(vector<unichar> &corpus) {
for (size_t i = 0; i != props.editDistance; i++) {
Operation op = chooseOperation();
switch (op) {
case EDIT_INSERT:
insert(corpus);
break;
case EDIT_REMOVE:
remove(corpus);
break;
case EDIT_SUBSTITUTE:
substitute(corpus);
break;
case EDIT_TRANSPOSE:
transpose(corpus);
break;
case EDIT_FLIP_CASE:
flip_case(corpus);
break;
}
}
}
void CorpusEditorUtf8::insert(vector<unichar> &corpus) {
unsigned pos = props.rand(0, corpus.size());
corpus.insert(corpus.begin() + pos, chooseCodePoint());
}
void CorpusEditorUtf8::remove(vector<unichar> &corpus) {
if (corpus.empty()) return;
size_t pos = choosePosition(corpus, props);
corpus.erase(corpus.begin() + pos);
}
void CorpusEditorUtf8::substitute(vector<unichar> &corpus) {
if (corpus.empty()) return;
size_t pos = choosePosition(corpus, props);
corpus[pos] = chooseCodePoint();
}
void CorpusEditorUtf8::transpose(vector<unichar> &corpus) {
if (corpus.empty()) return;
size_t a = choosePosition(corpus, props);
size_t b = choosePosition(corpus, props);
unichar tmp = corpus[a];
corpus[a] = corpus[b];
corpus[b] = tmp;
}
void CorpusEditorUtf8::flip_case(vector<unichar> &corpus) {
if (corpus.empty()) return;
// Pick a random starting position and walk forward (wrapping at the end)
// until we find an alpha character.
const size_t len = corpus.size();
const size_t pos = choosePosition(corpus, props);
size_t i = pos;
for (;;) {
if (::flip_case(&corpus[i])) {
return;
}
if (++i == len) {
i = 0;
}
if (i == pos) { // wrapped, no alpha characters
break;
}
}
}
/**
 * \brief Draw a random code point from [0, MAX_UNICODE], never a surrogate.
 *
 * A value is drawn from a contiguous range that has exactly as many elements
 * as there are non-surrogate code points. Values at or above the start of the
 * surrogate block are then shifted up by the size of that block, so every
 * non-surrogate code point is reachable and no surrogate is.
 */
unichar CorpusEditorUtf8::chooseCodePoint(void) {
    /* We need to ensure that we don't pick a surrogate cp */

    // Size of the surrogate block [UNICODE_SURROGATE_MIN, UNICODE_SURROGATE_MAX].
    const u32 surrogates = UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1;
    // Number of code points that remain once the surrogates are removed.
    const u32 range = MAX_UNICODE + 1 - surrogates;

    unichar raw = props.rand(0, range - 1);
    if (raw < UNICODE_SURROGATE_MIN) {
        return raw;
    }
    // Hop over the surrogate block: UNICODE_SURROGATE_MIN maps to the first
    // code point after UNICODE_SURROGATE_MAX, and range - 1 maps to
    // MAX_UNICODE.
    return raw + surrogates;
}
} // namespace
void editCorpus(string *corpus, CorpusProperties &props) {
CorpusEditor ed(props);
ed.applyEdits(*corpus);
}
void editCorpus(vector<unichar> *corpus, CorpusProperties &props) {
CorpusEditorUtf8 ed(props);
ed.applyEdits(*corpus);
}

47
util/ng_corpus_editor.h Normal file
View File

@@ -0,0 +1,47 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Corpus Editor: applies random transformation to a corpus.
*/
#ifndef CORPUS_EDITOR_H
#define CORPUS_EDITOR_H
#include <string>
#include <vector>
#include "ue2common.h"
#include "util/unicode_def.h"
class CorpusProperties;
/**
 * \brief Apply props.editDistance random edits to a byte corpus, in place.
 *
 * Each edit is one of: insert, remove, substitute, transpose, flip case.
 */
void editCorpus(std::string *corpus, CorpusProperties &props);
/**
 * \brief Apply props.editDistance random edits to a corpus held as a sequence
 * of code points, in place. The same five edit kinds are used.
 */
void editCorpus(std::vector<ue2::unichar> *corpus, CorpusProperties &props);
#endif

View File

@@ -0,0 +1,683 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Corpus Generation tool.
*/
#include "config.h"
#include "ng_corpus_generator.h"
#include "ng_corpus_editor.h"
#include "nfagraph/ng.h"
#include "nfagraph/ng_util.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/ue2_containers.h"
#include "util/ue2string.h"
#include "util/unicode_def.h"
#include "util/unicode_set.h"
#include <algorithm>
#include <deque>
#include <set>
#include <sstream>
#include <vector>
#include <boost/ptr_container/ptr_vector.hpp>
using namespace std;
using namespace ue2;
using boost::ptr_vector;
typedef vector<NFAVertex> VertexPath;
#if defined(DEBUG)
// For debugging output
static
string pathToString(const NGHolder &g, const VertexPath &p) {
ostringstream oss;
oss << '[';
for (auto i = p.begin(); i != p.end(); ++i) {
if (i != p.begin()) {
oss << ',';
}
oss << g[*i].index;
}
oss << ']';
return oss.str();
}
#endif
/** True if this graph has no non-special successors of start or startDs. */
static
bool graph_is_empty(const NGHolder &g) {
for (const auto &v : adjacent_vertices_range(g.start, g)) {
if (!is_special(v, g)) {
return false;
}
}
for (const auto &v : adjacent_vertices_range(g.start, g)) {
if (!is_special(v, g)) {
return false;
}
}
return true;
}
static
string encodeUtf8(const vector<unichar> &v) {
string rv;
for (const unichar &cp : v) {
if (cp < UTF_2CHAR_MIN) {
rv.push_back(cp);
} else if (cp < UTF_3CHAR_MIN) {
rv.push_back(UTF_TWO_BYTE_HEADER | (cp >> UTF_CONT_SHIFT));
rv.push_back(makeContByte(cp));
} else if (cp < UTF_4CHAR_MIN) {
rv.push_back(UTF_THREE_BYTE_HEADER | (cp >> (2 * UTF_CONT_SHIFT)));
rv.push_back(makeContByte(cp >> UTF_CONT_SHIFT));
rv.push_back(makeContByte(cp));
} else {
rv.push_back(UTF_FOUR_BYTE_HEADER | (cp >> (3 * UTF_CONT_SHIFT)));
rv.push_back(makeContByte(cp >> (2 * UTF_CONT_SHIFT)));
rv.push_back(makeContByte(cp >> UTF_CONT_SHIFT));
rv.push_back(makeContByte(cp));
}
}
return rv;
}
/**
 * True if the range [it, end) contains strictly more than \p limit elements
 * equal to \p v. Scanning stops as soon as the limit is exceeded.
 */
template<class Iter, class Val>
static
bool has_greater_than(Iter it, Iter end, const Val &v, size_t limit) {
    size_t seen = 0;
    for (; it != end; ++it) {
        if (*it == v && ++seen > limit) {
            return true;
        }
    }
    return false;
}
/**
 * Randomly explore the graph from g.start and collect complete paths.
 *
 * A pool of in-progress ("open") paths is kept. On each round one open path
 * is picked at random and removed from the pool, and each successor of its
 * last vertex is considered in turn:
 *  - the startDs self-loop is skipped;
 *  - reaching accept or acceptEod appends a copy of the path to \p allPaths;
 *  - a successor that already occurs in the path more than \p cycleLimit
 *    times is pruned;
 *  - otherwise the path extended by that successor joins the pool.
 *
 * \param g           Graph to walk.
 * \param cProps      Supplies the random numbers.
 * \param allPaths    Receives completed paths; the accept vertex itself is
 *                    not part of a stored path.
 * \param cycleLimit  Bound on how often a vertex may repeat within one path.
 * \param corpusLimit The search returns once allPaths holds this many paths.
 */
static
void findPaths(const NGHolder &g, CorpusProperties &cProps,
vector<VertexPath> &allPaths, size_t cycleLimit,
size_t corpusLimit) {
// The maximum number of open (in progress) paths. New paths beyond this
// limit will evict a random existing one.
const size_t MAX_OPEN = min((size_t)1000, corpusLimit * 10);
ptr_vector<VertexPath> open;
open.push_back(new VertexPath(1, g.start));
// Vertices for which hasGreaterInDegree(1, v, g) is false; presumably
// those with at most one in-edge -- TODO confirm against that helper.
ue2::unordered_set<NFAVertex> one_way_in;
for (const auto &v : vertices_range(g)) {
if (!hasGreaterInDegree(1, v, g)) {
one_way_in.insert(v);
}
}
while (!open.empty()) {
// Move a randomly chosen open path to the back and take ownership of it.
u32 slot = cProps.rand(0, open.size() - 1);
swap(open.at(slot), open.back());
ptr_vector<VertexPath>::auto_type p = open.pop_back();
NFAVertex u = p->back();
DEBUG_PRINTF("dequeuing path %s, back %u\n",
pathToString(g, *p).c_str(), g[u].index);
NFAGraph::adjacency_iterator ai, ae;
for (tie(ai, ae) = adjacent_vertices(u, g); ai != ae; ++ai) {
NFAVertex v = *ai;
if (u == g.startDs && v == g.startDs) {
// explicitly avoid following startDs self-loop, as we have
// other mechanisms for adding prefixes to our corpora.
continue;
}
// Accept vertices generate completed paths.
if (v == g.accept || v == g.acceptEod) {
DEBUG_PRINTF("path complete: %s\n",
pathToString(g, *p).c_str());
allPaths.push_back(*p);
if (allPaths.size() >= corpusLimit) {
DEBUG_PRINTF("full, going home\n");
return;
}
// No meaningful edges out of accept or acceptEod.
continue;
}
if (!contains(one_way_in, v) &&
has_greater_than(p->begin(), p->end(), v, cycleLimit)) {
// Note that vertices that only have one predecessor don't need
// their cycle limit checked, as their predecessors will have
// the same count.
DEBUG_PRINTF("exceeded cycle limit for v=%u, pruning path\n",
g[v].index);
continue;
}
// If we've got no further adjacent vertices, re-use p rather than
// copying it for the next path.
// p is only released on the final iteration, so it is never used
// after ownership has been given up.
VertexPath *new_path;
if (boost::next(ai) == ae) {
new_path = p.release();
} else {
new_path = new VertexPath(*p);
}
new_path->push_back(v);
if (open.size() < MAX_OPEN) {
open.push_back(new_path);
} else {
// Pool is full: the new path takes the slot of a random victim.
// NOTE(review): if corpusLimit is 0 then MAX_OPEN is 0 and this
// branch runs even while open is empty -- confirm callers never
// pass a zero corpusLimit.
u32 victim = cProps.rand(0, open.size() - 1);
open.replace(victim, new_path);
}
}
}
DEBUG_PRINTF("bored, going home\n");
}
namespace {
/** \brief Concrete implementation that produces corpora one byte per
 * non-special path vertex (the variant chosen when the graph is not UTF-8). */
class CorpusGeneratorImpl : public CorpusGenerator {
public:
CorpusGeneratorImpl(const NGHolder &graph_in, CorpusProperties &props);
~CorpusGeneratorImpl() {}
/** \brief Build corpora into \a data, applying edits if
 * cProps.editDistance is non-zero. */
void generateCorpus(vector<string> &data);
private:
/** \brief Random byte from the configured alphabet. */
unsigned char getRandomChar();
/** \brief Random byte that is a member of \a cr (0 if \a cr is empty). */
unsigned char getMatchChar(const CharReach &cr);
/** \brief Random byte that is a member of the complement of \a cr. */
unsigned char getUnmatchChar(const CharReach &cr);
/** \brief Byte for vertex \a v: match, unmatch or random depending on the
 * dice roll from cProps. */
unsigned char getChar(NFAVertex v);
/** \brief Find paths through the graph and turn them into unique corpora,
 * which are appended to \a data. */
void newGenerator(vector<string> &data);
/** \brief Convert one path into a corpus string, with optional random
 * prefix and suffix. */
string pathToCorpus(const VertexPath &path);
/** \brief Generate a string of random bytes between minLen and maxLen
* bytes in length. */
void addRandom(const min_max &mm, string *out);
/** \brief The NFA graph we operate over. */
const NGHolder &graph;
/** \brief Reference to our corpus generator properties object (stores some
* state) */
CorpusProperties &cProps;
};
/** \brief Store references to the graph and the properties object; both are
 * held by reference, so they must outlive this generator. */
CorpusGeneratorImpl::CorpusGeneratorImpl(const NGHolder &graph_in,
CorpusProperties &props)
: graph(graph_in), cProps(props) {
// empty
}
/** \brief Build the corpora for our graph into \a data, then edit each
 * corpus in place if the caller requested a non-zero edit distance. */
void CorpusGeneratorImpl::generateCorpus(vector<string> &data) {
    newGenerator(data);

    if (!cProps.editDistance) {
        return; // no edits requested
    }

    for (auto it = data.begin(); it != data.end(); ++it) {
        editCorpus(&*it, cProps);
    }
}
/** \brief Generate a random character, taking care to stick to the alphabet
 * that we've been asked for.
 *
 * The alphabet consists of cProps.alphabetSize consecutive byte values
 * starting at 'a', capped at CharReach::npos values (presumably 256, i.e.
 * every byte). The result wraps around on conversion to a byte.
 *
 * CorpusProperties::rand() returns a value in a closed range, so the upper
 * bound passed to it must be one less than the alphabet size; passing the
 * size itself would yield one character too many. An alphabet size of zero
 * is treated as one, so 'a' is always returned in that case.
 */
u8 CorpusGeneratorImpl::getRandomChar() {
    u32 range = min(cProps.alphabetSize, (u32)CharReach::npos);
    if (!range) {
        range = 1;
    }
    return 'a' + cProps.rand(0, range - 1);
}
/** \brief Pick a random member of the character class \a cr.
 *
 * Returns 0 if the class is empty. The PRNG is only consulted when the class
 * has more than one member. */
unsigned char CorpusGeneratorImpl::getMatchChar(const CharReach &cr) {
    const unsigned int population = cr.count();

    switch (population) {
    case 0:
        return 0;
    case 1:
        return (unsigned char)cr.find_first();
    case 256:
        // Dot class, any character is OK!
        return (unsigned char)cProps.rand(0, 255);
    default:
        break;
    }

    const unsigned nth = cProps.rand(0, population - 1);
    return (unsigned char)cr.find_nth(nth);
}
/** \brief Select a character that does not belong to the given bitset. This
* makes no guarantees on unmatchability if the bitset is full. */
unsigned char CorpusGeneratorImpl::getUnmatchChar(const CharReach &cr) {
return getMatchChar(~cr);
}
void CorpusGeneratorImpl::addRandom(const min_max &mm, string *out) {
assert(mm.min <= mm.max);
u32 range = mm.max - mm.min;
u32 len = mm.min + (range ? cProps.rand(0, range - 1) : 0);
for (u32 i = 0; i < len; ++i) {
out->push_back(getRandomChar());
}
}
/** \brief Produce one byte for vertex \a v.
 *
 * The properties object rolls the dice to decide whether the byte matches
 * the vertex's character class, deliberately misses it, or is random. */
unsigned char CorpusGeneratorImpl::getChar(NFAVertex v) {
    const CorpusProperties::RollResult roll = cProps.throwDice();
    const CharReach &reach = graph.g[v].char_reach;

    if (roll == CorpusProperties::ROLLED_MATCH) {
        return getMatchChar(reach);
    }
    if (roll == CorpusProperties::ROLLED_UNMATCH) {
        return getUnmatchChar(reach);
    }
    if (roll == CorpusProperties::ROLLED_RANDOM) {
        /* character pulled from hat */
        return getRandomChar();
    }

    // unknown roll result
    assert(0);
    return 0;
}
/** \brief Turn a path through the graph into a corpus string.
 *
 * One byte is emitted per non-special vertex on the path. A random prefix
 * and/or suffix is added when the corresponding range has a non-zero max. */
string CorpusGeneratorImpl::pathToCorpus(const VertexPath &path) {
    string corpus;

    // Random prefix, if configured.
    if (cProps.prefixRange.max != 0) {
        addRandom(cProps.prefixRange, &corpus);
    }

    // The body: special vertices contribute no bytes.
    for (auto it = path.begin(); it != path.end(); ++it) {
        if (is_special(*it, graph)) {
            continue;
        }
        corpus += getChar(*it);
    }

    // Random suffix, if configured.
    if (cProps.suffixRange.max != 0) {
        addRandom(cProps.suffixRange, &corpus);
    }

    return corpus;
}
/** \brief Find paths through the graph and convert them into a set of unique
 * corpora, which are appended to \a outdata in sorted order.
 *
 * The paths are converted repeatedly (each pass may produce different strings
 * because character choice is random) until either the corpus limit is
 * reached or a full pass over the paths yields nothing new. */
void CorpusGeneratorImpl::newGenerator(vector<string> &outdata) {
    const unsigned int maxCycles = cProps.getCycleLimit().second;
    DEBUG_PRINTF("generating up to %u corpora, cycle limit of %u\n",
                 cProps.corpusLimit, maxCycles);

    vector<VertexPath> allPaths;

    // Special case: a graph with ONLY special vertices is likely an odd
    // vacuous pattern or one that can never match; an empty corpus is useful
    // for those.
    if (graph_is_empty(graph)) {
        allPaths.push_back(VertexPath(1, graph.start));
    }

    // Build a set of unique paths.
    findPaths(graph, cProps, allPaths, maxCycles, cProps.corpusLimit);

    // Transform paths into corpora until we are full or stop making progress.
    set<string> corpora;
    bool full = corpora.size() >= cProps.corpusLimit;
    while (!full) {
        const size_t before = corpora.size();

        for (const auto &path : allPaths) {
            const string s = pathToCorpus(path);
            if (!corpora.insert(s).second) {
                continue; // duplicate
            }
            DEBUG_PRINTF("corpus %zu (%zu bytes): '%s'\n", corpora.size(),
                         s.size(), escapeString(s).c_str());
            if (corpora.size() == cProps.corpusLimit) {
                full = true;
                break;
            }
        }

        if (corpora.size() == before) {
            break; // we're finding it hard to generate more corpora
        }
    }

    DEBUG_PRINTF("%zu corpora built\n", corpora.size());

    // Hand the corpora over to the caller's vector.
    outdata.reserve(corpora.size());
    outdata.insert(outdata.end(), corpora.begin(), corpora.end());
}
/** \brief Concrete implementation for UTF-8: corpora are built as sequences
 * of code points and encoded to UTF-8 at the end. */
class CorpusGeneratorUtf8 : public CorpusGenerator {
public:
CorpusGeneratorUtf8(const NGHolder &graph_in, CorpusProperties &props);
~CorpusGeneratorUtf8() {}
/** \brief Build corpora, apply edits if cProps.editDistance is non-zero,
 * and append the UTF-8 encoded results to \a data. */
void generateCorpus(vector<string> &data);
private:
/** \brief Random code point from the configured alphabet. */
unichar getRandomChar();
/** \brief Random member of \a cps after surrogates are removed (0 if
 * nothing remains). Takes the set by value because it is modified. */
unichar getMatchChar(CodePointSet cps);
/** \brief Random member of the complement of \a cps. */
unichar getUnmatchChar(const CodePointSet &cps);
/** \brief Code point for \a cps: match, unmatch or random depending on
 * the dice roll from cProps. */
unichar getChar(const CodePointSet &cps);
/** \brief Find paths, translate them to code point sets and turn them
 * into unique corpora, which are appended to \a data. */
void newGenerator(vector<vector<unichar> > &data);
/** \brief Convert one translated path into a code point sequence, with
 * optional random prefix and suffix. */
vector<unichar> pathToCorpus(const vector<CodePointSet> &path);
/** \brief Generate a random string between min and max codepoints in
* length. */
void addRandom(const min_max &mm, vector<unichar> *out);
/** \brief The NFA graph we operate over. */
const NGHolder &graph;
/** \brief Reference to our corpus generator properties object (stores some
* state) */
CorpusProperties &cProps;
};
/** \brief Store references to the graph and the properties object; both are
 * held by reference, so they must outlive this generator. */
CorpusGeneratorUtf8::CorpusGeneratorUtf8(const NGHolder &graph_in,
CorpusProperties &props)
: graph(graph_in), cProps(props) {
// empty
}
/** \brief Build corpora as code point sequences, optionally edit them, and
 * append their UTF-8 encodings to \a data. */
void CorpusGeneratorUtf8::generateCorpus(vector<string> &data) {
    vector<vector<unichar>> codepoints;
    newGenerator(codepoints);

    // If the caller has asked us, apply edit distance to corpora. All edits
    // are done before any encoding takes place.
    if (cProps.editDistance) {
        for (auto it = codepoints.begin(); it != codepoints.end(); ++it) {
            editCorpus(&*it, cProps);
        }
    }

    for (auto it = codepoints.begin(); it != codepoints.end(); ++it) {
        data.push_back(encodeUtf8(*it));
    }
}
/** \brief Generate a random character, taking care to stick to the alphabet
 * that we've been asked for.
 *
 * The alphabet is cProps.alphabetSize consecutive non-surrogate code points
 * starting at 'a', capped at the number of non-surrogate code points. Values
 * that would land on or beyond the surrogate block are shifted past it, and
 * the result wraps around at MAX_UNICODE.
 *
 * Two defects are fixed relative to the earlier version:
 *  - the size of the surrogate block is MAX - MIN + 1 (it was computed as
 *    MAX + MIN + 1, shrinking the usable range);
 *  - values at or above UNICODE_SURROGATE_MIN are now offset with "+=" (the
 *    earlier "=+" assigned a constant, collapsing all such values onto a
 *    single code point).
 */
unichar CorpusGeneratorUtf8::getRandomChar() {
    // Number of code points occupied by the surrogate block.
    const u32 surrogates = UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1;

    u32 range = MAX_UNICODE + 1 - surrogates;
    range = min(cProps.alphabetSize, range);
    assert(range);

    unichar c = 'a' + cProps.rand(0, range - 1);

    // Skip over the surrogate block: it is not valid in UTF-8.
    if (c >= UNICODE_SURROGATE_MIN) {
        c += surrogates;
    }

    return c % (MAX_UNICODE + 1);
}
/** \brief Pick a random member of the code point set \a cps.
 *
 * Surrogates are removed from the (by-value) set first. Returns 0 if nothing
 * is left. The PRNG is only consulted when more than one candidate remains. */
unichar CorpusGeneratorUtf8::getMatchChar(CodePointSet cps) {
    cps.unsetRange(UNICODE_SURROGATE_MIN, UNICODE_SURROGATE_MAX);

    const u32 population = cps.count();
    if (population == 0) {
        return 0;
    }
    if (population == 1) {
        return lower(*cps.begin());
    }

    const unichar picked = cps.at(cProps.rand(0, population - 1));
    assert(picked != INVALID_UNICODE);
    return picked;
}
/** \brief Pick a random code point outside \a cps, by choosing a member of
 * its complement. If \a cps is full, no guarantee of unmatchability is
 * made. */
unichar CorpusGeneratorUtf8::getUnmatchChar(const CodePointSet &cps) {
    const CodePointSet outside = ~cps;
    return getMatchChar(outside);
}
void CorpusGeneratorUtf8::addRandom(const min_max &mm, vector<unichar> *out) {
assert(mm.min <= mm.max);
u32 range = mm.max - mm.min;
u32 len = mm.min + (range ? cProps.rand(0, range - 1) : 0);
for (u32 i = 0; i < len; ++i) {
out->push_back(getRandomChar());
}
}
/** \brief Produce one code point for the set \a cps.
 *
 * The properties object rolls the dice to decide whether the code point
 * belongs to the set, deliberately misses it, or is random. */
unichar CorpusGeneratorUtf8::getChar(const CodePointSet &cps) {
    const CorpusProperties::RollResult roll = cProps.throwDice();

    if (roll == CorpusProperties::ROLLED_MATCH) {
        return getMatchChar(cps);
    }
    if (roll == CorpusProperties::ROLLED_UNMATCH) {
        return getUnmatchChar(cps);
    }
    if (roll == CorpusProperties::ROLLED_RANDOM) {
        /* character pulled from hat */
        return getRandomChar();
    }

    // unknown roll result
    assert(0);
    return 0;
}
/** \brief Turn a translated path (one code point set per character) into a
 * sequence of code points.
 *
 * A random prefix and/or suffix is added when the corresponding range has a
 * non-zero max. */
vector<unichar>
CorpusGeneratorUtf8::pathToCorpus(const vector<CodePointSet> &path) {
    vector<unichar> corpus;

    // Random prefix, if configured.
    if (cProps.prefixRange.max != 0) {
        addRandom(cProps.prefixRange, &corpus);
    }

    // The body: one code point per set on the path.
    for (auto it = path.begin(); it != path.end(); ++it) {
        corpus.push_back(getChar(*it));
    }

    // Random suffix, if configured.
    if (cProps.suffixRange.max != 0) {
        addRandom(cProps.suffixRange, &corpus);
    }

    return corpus;
}
/** \brief Classify vertex \a v by which UTF-8 byte class its reach falls
 * into: 1 for ASCII, or 2, 3 or 4 for the start byte of a sequence of that
 * length. Anything else is treated as 1. */
static
u32 classify_vertex(const NFAGraph &g, NFAVertex v) {
    const CharReach &reach = g[v].char_reach;

    if (reach.isSubsetOf(UTF_ASCII_CR)) {
        return 1;
    }
    if (reach.isSubsetOf(UTF_TWO_START_CR)) {
        return 2;
    }
    if (reach.isSubsetOf(UTF_THREE_START_CR)) {
        return 3;
    }
    if (reach.isSubsetOf(UTF_FOUR_START_CR)) {
        return 4;
    }

    /* this can happen due to dummy vertices from zwa */
    return 1;
}
static
void fillCodePointSet(const CharReach &cr, CodePointSet *out, u8 mask = 0xff) {
for (u32 i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) {
out->set(i & mask);
}
}
static
void expandCodePointSet(const CharReach &cr, CodePointSet *out, u32 mask,
u32 n) {
CodePointSet base;
base.swap(*out);
for (u32 i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) {
u32 val = (i & mask) << (n * UTF_CONT_SHIFT);
for (const auto &cp : base) {
unichar ll = lower(cp);
unichar uu = upper(cp);
out->setRange(val + ll, MIN(val + uu, MAX_UNICODE));
}
}
}
/** \brief Translate a path of byte-level vertices into one CodePointSet per
 * character, appended to \a out.
 *
 * Special vertices are skipped. For every other position, classify_vertex()
 * on the current vertex gives the number of path vertices (1 to 4) that are
 * consumed to form one code point set. For multi-vertex cases the set is
 * seeded from the last vertex and then expanded with each preceding vertex in
 * turn, finishing with the current (leading) one.
 *
 * NOTE(review): the accesses *(it + k) are not checked against in.end(); this
 * assumes every multi-byte start vertex on the path is followed by the
 * expected number of vertices.
 */
static
void decodePath(const NFAGraph &g, const VertexPath &in,
vector<CodePointSet> &out) {
VertexPath::const_iterator it = in.begin();
while (it != in.end()) {
if (is_special(*it, g)) {
++it;
continue;
}
out.push_back(CodePointSet());
CodePointSet &cps = out.back();
switch (classify_vertex(g, *it)) {
case 1:
// Single vertex: its reach is used as-is.
fillCodePointSet(g[*it].char_reach, &cps);
++it;
break;
case 2:
// Two vertices: seed from the second, expand with the first.
fillCodePointSet(g[*(it + 1)].char_reach, &cps,
UTF_CONT_BYTE_VALUE_MASK);
expandCodePointSet(g[*it].char_reach, &cps,
~UTF_TWO_BYTE_HEADER, 1);
it += 2;
break;
case 3:
// Three vertices: seed from the third, expand with second then first.
fillCodePointSet(g[*(it + 2)].char_reach, &cps,
UTF_CONT_BYTE_VALUE_MASK);
expandCodePointSet(g[*(it + 1)].char_reach, &cps,
UTF_CONT_BYTE_VALUE_MASK, 1);
expandCodePointSet(g[*it].char_reach, &cps,
~UTF_THREE_BYTE_HEADER, 2);
it += 3;
break;
case 4:
// Four vertices: seed from the fourth, expand with third, second, first.
fillCodePointSet(g[*(it + 3)].char_reach, &cps,
UTF_CONT_BYTE_VALUE_MASK);
expandCodePointSet(g[*(it + 2)].char_reach, &cps,
UTF_CONT_BYTE_VALUE_MASK, 1);
expandCodePointSet(g[*(it + 1)].char_reach, &cps,
UTF_CONT_BYTE_VALUE_MASK, 2);
expandCodePointSet(g[*it].char_reach, &cps,
~UTF_FOUR_BYTE_HEADER, 3);
it += 4;
break;
default:;
// classify_vertex() only returns 1-4, so this is not expected.
assert(0);
++it;
}
}
}
static
void translatePaths(const NGHolder &graph,
const vector<VertexPath> &allPathsTemp,
vector<vector<CodePointSet>> *out) {
assert(out);
for (const auto &path : allPathsTemp) {
out->push_back(vector<CodePointSet>());
decodePath(graph.g, path, out->back());
}
}
/** \brief Find paths through the graph, translate them into code point sets
 * and convert them into a set of unique corpora, which are appended to
 * \a outdata in sorted order.
 *
 * The paths are converted repeatedly (each pass may produce different results
 * because character choice is random) until either the corpus limit is
 * reached or a full pass over the paths yields nothing new. */
void CorpusGeneratorUtf8::newGenerator(vector<vector<unichar>> &outdata) {
    const u32 maxCycles = cProps.getCycleLimit().second;
    DEBUG_PRINTF("generating up to %u corpora, cycle limit of %u\n",
                 cProps.corpusLimit, maxCycles);

    vector<vector<CodePointSet>> allPaths;

    if (!graph_is_empty(graph)) {
        // Build a set of unique paths and translate them to code points.
        vector<VertexPath> allPathsTemp;
        findPaths(graph, cProps, allPathsTemp, maxCycles, cProps.corpusLimit);
        translatePaths(graph, allPathsTemp, &allPaths);
    } else {
        // Special case: a graph with ONLY special vertices is likely an odd
        // vacuous pattern or one that can never match; an empty corpus is
        // useful for those.
        allPaths.push_back(vector<CodePointSet>());
    }

    // Transform paths into corpora until we are full or stop making progress.
    set<vector<unichar> > corpora;
    bool full = corpora.size() >= cProps.corpusLimit;
    while (!full) {
        const size_t before = corpora.size();

        for (const auto &path : allPaths) {
            const vector<unichar> vu = pathToCorpus(path);
            if (!corpora.insert(vu).second) {
                continue; // duplicate
            }
            if (corpora.size() == cProps.corpusLimit) {
                full = true;
                break;
            }
        }

        if (corpora.size() == before) {
            break; // we're finding it hard to generate more corpora
        }
    }

    DEBUG_PRINTF("%zu corpora built\n", corpora.size());

    // Hand the corpora over to the caller's vector.
    outdata.reserve(corpora.size());
    outdata.insert(outdata.end(), corpora.begin(), corpora.end());
}
} // namespace
// Out-of-line definition of the interface's virtual destructor; it has
// nothing to release.
CorpusGenerator::~CorpusGenerator() { }
// External entry point: builds the UTF-8 generator for graphs flagged as
// utf8, and the byte-oriented generator for everything else.
unique_ptr<CorpusGenerator> makeCorpusGenerator(const NGWrapper &graph,
                                                CorpusProperties &props) {
    if (!graph.utf8) {
        return ue2::make_unique<CorpusGeneratorImpl>(graph, props);
    }
    return ue2::make_unique<CorpusGeneratorUtf8>(graph, props);
}

View File

@@ -0,0 +1,67 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Corpus Generation tool.
*/
#ifndef NG_CORPUS_GENERATOR_H_
#define NG_CORPUS_GENERATOR_H_
#include "ng_corpus_properties.h"
#include <memory>
#include <string>
#include <vector>
namespace ue2 {
class NGWrapper;
} // namespace ue2
/** \brief Abstract interface to corpus generator tool. */
class CorpusGenerator {
public:
virtual ~CorpusGenerator();
/** \brief Build some corpora.
*
* Generate a set of corpora, placed in the \a data vector, for the current
* NFAGraph according to the parameters provided by the CorpusProperties
* object. Nothing is returned: the generated corpora are the entries added
* to \a data.
*/
virtual void generateCorpus(std::vector<std::string> &data) = 0;
};
/** \brief Build a concrete impl conforming to the \ref CorpusGenerator
* interface.
*
* \param graph the graph to generate corpora for
* \param props generation parameters; taken by non-const reference
* \return an owning pointer to the new generator
*/
std::unique_ptr<CorpusGenerator>
makeCorpusGenerator(const ue2::NGWrapper &graph, CorpusProperties &props);
#endif

View File

@@ -0,0 +1,99 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief State for corpus generator.
*/
#include "config.h"
#include "ng_corpus_properties.h"
#include "ue2common.h"
#include <boost/random/uniform_int_distribution.hpp>
// Default constructor: always generate matching characters, no random prefix
// or suffix, follow cycles once, no edits, and an unrestricted alphabet.
//
// rngSeed is initialised to the default seed of the generator type so that
// getSeed() reports the seed the default-constructed generator is using,
// rather than reading an uninitialised member when seed() was never called.
CorpusProperties::CorpusProperties()
    : matchness(100), unmatchness(0), randomness(0), prefixRange(0, 0),
      suffixRange(0, 0), cycleMin(1), cycleMax(1),
      corpusLimit(DEFAULT_CORPUS_GENERATOR_LIMIT), editDistance(0),
      alphabetSize(~0), randomGen(), rngSeed(randomGen.default_seed) {
    // empty
}
/** \brief Set the match / unmatch / random probabilities, as percentages.
 *
 * \return true if the three values sum to exactly 100 and were stored;
 *         false (leaving the current probabilities untouched) otherwise.
 *
 * Each value is checked against 100 individually before summing, so that a
 * sum that wraps around the unsigned range cannot be mistaken for 100. */
bool CorpusProperties::setPercentages(unsigned int match, unsigned int unmatch,
                                      unsigned int random) {
    if (match > 100 || unmatch > 100 || random > 100) {
        // Out of range; do not update probabilities
        return false;
    }
    if (match + unmatch + random != 100) {
        // Do not update probabilities
        return false;
    }
    matchness = match;
    unmatchness = unmatch;
    randomness = random;
    return true;
}
/** \brief Reseed the PRNG with \a val and remember the seed so that it can
 * be reported by getSeed(). */
void CorpusProperties::seed(unsigned val) {
    randomGen.seed(val);
    rngSeed = val;
}
/** \brief Return the seed value most recently recorded by seed(). */
unsigned CorpusProperties::getSeed() const {
return rngSeed;
}
unsigned CorpusProperties::rand(unsigned n, unsigned m) {
boost::random::uniform_int_distribution<> dist(n, m);
return dist(randomGen);
}
// Decide whether the next character should match, unmatch or be random.
// Not const, because drawing a number advances the random number generator.
CorpusProperties::RollResult CorpusProperties::throwDice() {
    // Certainties are answered without consuming a random number.
    if (matchness == 100) {
        return ROLLED_MATCH;
    }
    if (unmatchness == 100) {
        return ROLLED_UNMATCH;
    }
    if (randomness == 100) {
        return ROLLED_RANDOM;
    }

    // This assumes a uniform distribution. Perhaps factor some 'depth' param
    // and whether this 'depth' should increase or decrease the likelihood of
    // unmatch or random rolls.
    const unsigned int roll = rand(0, 99);

    // [0, matchness) -> match; [matchness, matchness + unmatchness) ->
    // unmatch; everything above -> random.
    return roll < matchness ? ROLLED_MATCH
         : roll < matchness + unmatchness ? ROLLED_UNMATCH
                                          : ROLLED_RANDOM;
}

131
util/ng_corpus_properties.h Normal file
View File

@@ -0,0 +1,131 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief State for corpus generator.
*/
#ifndef NG_CORPUS_PROPERTIES_H
#define NG_CORPUS_PROPERTIES_H
#include <utility> // for std::pair
#include <boost/random/mersenne_twister.hpp>
#include "ue2common.h"
#define DEFAULT_CORPUS_GENERATOR_LIMIT 500000
/** \brief A pair of bounds, with min <= max checked by assertion at
 * construction time. Used for the random prefix and suffix length ranges. */
struct min_max {
min_max(u32 min_in, u32 max_in) : min(min_in), max(max_in) {
assert(min <= max);
}
u32 min; //!< lower bound
u32 max; //!< upper bound
};
/** \brief Parameters and PRNG state shared by the corpus generators. */
class CorpusProperties {
public:
/**
* Default constructor with default properties:
* - generate match char with 100% probability
* - generate unmatch char with 0% probability
* - generate random char with 0% probability
* - follow cycles once
* - do not expand character classes (including case classes)
* - generate up to DEFAULT_CORPUS_GENERATOR_LIMIT corpora
* - no random prefix or suffix, and no edits
* - do not restrict the alphabet used for random characters
*/
CorpusProperties();
/**
* Set probabilities (as percentages). Returns true if sum == 100,
* else returns false and no changes are made to current probabilities.
*/
bool setPercentages(unsigned int match, unsigned int unmatch,
unsigned int random);
unsigned percentMatch() const { return matchness; }
unsigned percentUnmatch() const { return unmatchness; }
unsigned percentRandom() const { return randomness; }
// The number of times a cycle is followed
void setCycleLimit(unsigned int min, unsigned int max) {
cycleMin = min;
cycleMax = max;
}
// Returns the cycle limits as a (min, max) pair.
std::pair<unsigned int, unsigned int> getCycleLimit() const {
return std::make_pair(cycleMin, cycleMax);
}
// Roll for initiative: the possible outcomes of throwDice().
enum RollResult {
ROLLED_MATCH,
ROLLED_UNMATCH,
ROLLED_RANDOM,
};
/** \brief Pick an outcome according to the configured percentages. */
RollResult throwDice();
/** \brief Set the PRNG seed. */
void seed(unsigned val);
/** \brief Retrieve the seed recorded by the last call to seed(). */
unsigned int getSeed() const;
/** \brief Retrieve a value from the PRNG in the closed range [n, m]. */
unsigned rand(unsigned n, unsigned m);
private:
// Percentages
unsigned int matchness;
unsigned int unmatchness;
unsigned int randomness;
public:
// Extra data: length ranges for the random prefix and suffix.
min_max prefixRange;
min_max suffixRange;
private:
// Behaviours
unsigned int cycleMin;
unsigned int cycleMax;
public:
// FIXME: Limit the number of corpus files generated to the first 'limit'
// number of paths - note that this means the corpus will not be a complete
// representation of the pattern.
unsigned int corpusLimit;
// Edits are applied to generated corpora when this is non-zero.
unsigned int editDistance;
// Number of distinct values that random characters are drawn from.
unsigned int alphabetSize;
private:
// PRNG.
boost::random::mt19937 randomGen;
unsigned int rngSeed;
};
#endif

334
util/ng_find_matches.cpp Normal file
View File

@@ -0,0 +1,334 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Pattern matching based on direct NFA execution.
*/
#include "config.h"
#include "ng_find_matches.h"
#include "nfagraph/ng_graph.h"
#include "nfagraph/ng_util.h"
#include "parser/position.h"
#include "util/container.h"
#include "util/compare.h"
#include "util/report.h"
#include "util/report_manager.h"
#include <algorithm>
using namespace std;
using namespace ue2;
// convenience typedefs
// SOMMap: active vertex -> start-of-match (SOM) offset tracked for it.
typedef map<NFAVertex,size_t> SOMMap;
// MatchSet: (start offset, end offset) pairs, one per reported match.
typedef set<pair<size_t, size_t> > MatchSet;
/** \brief Working state for one findMatches() run. */
struct fmstate {
SOMMap states; // vertices currently active, with their SOM
SOMMap next; // vertices active after consuming the current byte
size_t offset; // offset of the current byte in the input
unsigned char cur; // current input byte
unsigned char prev; // previous input byte (0 before the first one)
const bool som; // track start-of-match offsets if true
const bool utf8; // input is treated as UTF-8 if true
const bool allowStartDs; // whether startDs may be made active
const ReportManager &rm; // used to look up reports for matches
fmstate(const bool som_in, const bool utf8_in, const bool aSD_in,
const ReportManager &rm_in)
: offset(0), cur(0), prev(0), som(som_in), utf8(utf8_in),
allowStartDs(aSD_in), rm(rm_in) {}
};
static
void initStates(const NGHolder &g, struct fmstate &state) {
state.states.insert(make_pair(g.start, 0));
if (state.allowStartDs) {
state.states.insert(make_pair(g.startDs, 0));
}
}
/** \brief True if \a c is a word character: alphabetic (per ourisalpha), a
 * decimal digit, or an underscore. */
static
bool isWordChar(const unsigned char c) {
    return ourisalpha(c)                // alpha character
        || (c >= '0' && c <= '9')       // digit
        || c == '_';                    // underscore
}
/** \brief True if byte \a c can begin a UTF-8 encoded character: either a
 * single-byte (ASCII) character or the leading byte of a 2-, 3- or 4-byte
 * sequence. Continuation bytes and the values 0xF8-0xFF yield false. */
static
bool isUtf8CodePoint(const char c) {
    const unsigned int byte = (unsigned char)c;

    const bool single = (byte & 0x80) == 0;     // 0xxxxxxx
    const bool two = (byte & 0xE0) == 0xC0;     // 110xxxxx
    const bool three = (byte & 0xF0) == 0xE0;   // 1110xxxx
    const bool four = (byte & 0xF8) == 0xF0;    // 11110xxx

    return four || three || two || single;
}
static
bool canReach(const NGHolder &g, const NFAVertex &src, const NFAVertex &dst,
struct fmstate &state) {
// find relevant edge and see whether it has asserts
NFAEdge e;
bool exists;
u32 flags;
tie(e, exists) = edge(src, dst, g);
assert(exists);
flags = g[e].assert_flags;
if (!flags) {
return true;
}
if (flags & POS_FLAG_ASSERT_WORD_TO_NONWORD) {
if (isWordChar(state.prev) && !isWordChar(state.cur)) {
return true;
}
}
if (flags & POS_FLAG_ASSERT_NONWORD_TO_WORD) {
if (!isWordChar(state.prev) && isWordChar(state.cur)) {
return true;
}
}
if (flags & POS_FLAG_ASSERT_WORD_TO_WORD) {
if (isWordChar(state.prev) && isWordChar(state.cur)) {
return true;
}
}
if (flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD) {
if (!isWordChar(state.prev) && !isWordChar(state.cur)) {
return true;
}
}
return false;
}
static
void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state,
bool allowEodMatches) {
SOMMap::const_iterator it, ite;
for (it = state.states.begin(), ite = state.states.end(); it != ite; ++it) {
NFAGraph::adjacency_iterator ai, ae;
// we can't accept anything from startDs inbetween UTF-8 codepoints
if (state.utf8 && it->first == g.startDs && !isUtf8CodePoint(state.cur)) {
continue;
}
for (tie(ai, ae) = adjacent_vertices(it->first, g); ai != ae; ++ai) {
if (*ai == g.accept || (*ai == g.acceptEod && allowEodMatches)) {
// check edge assertions if we are allowed to reach accept
if (!canReach(g, it->first, *ai, state)) {
continue;
}
DEBUG_PRINTF("match found at %zu\n", state.offset);
assert(!g[it->first].reports.empty());
for (const auto &report_id :
g[it->first].reports) {
const Report &ri = state.rm.getReport(report_id);
DEBUG_PRINTF("report %u has offset adjustment %d\n",
report_id, ri.offsetAdjust);
matches.insert(
make_pair(it->second, state.offset + ri.offsetAdjust));
}
}
}
}
}
static
void step(const NGHolder &g, struct fmstate &state) {
state.next.clear();
SOMMap::iterator it, ite;
for (it = state.states.begin(), ite = state.states.end(); it != ite; ++it) {
NFAGraph::adjacency_iterator ai, ae;
for (tie(ai, ae) = adjacent_vertices(it->first, g); ai != ae; ++ai) {
if (*ai == g.acceptEod) {
// can't know the future: we don't know if we're at EOD.
continue;
}
if (*ai == g.accept) {
continue;
}
if (!state.allowStartDs && *ai == g.startDs) {
continue;
}
const CharReach &cr = g[*ai].char_reach;
// check reachability and edge assertions
if (cr.test(state.cur) && canReach(g, it->first, *ai, state)) {
SOMMap::const_iterator ni;
size_t next_som;
// if we aren't in SOM mode, just set every SOM to 0
if (!state.som) {
state.next[*ai] = 0;
continue;
}
// if this is first vertex since start, use current offset as SOM
if (it->first == g.start || it->first == g.startDs ||
is_virtual_start(it->first, g)) {
next_som = state.offset;
} else {
// else, inherit SOM from predecessor
next_som = it->second;
}
// check if the vertex is already active
ni = state.next.find(*ai);
// if this vertex is not yet active, use current SOM
if (ni == state.next.end()) {
state.next[*ai] = next_som;
} else {
// else, work out leftmost SOM
state.next[*ai] = min(next_som, ni->second);
}
}
}
}
}
// Filter extraneous matches: for every end offset, keep only the match(es)
// with the smallest start offset and erase the rest.
//
// The minimum start offset per end offset is collected in a single pass and
// then applied in a second pass, rather than rescanning the whole set for
// each distinct end offset. This also avoids seeding the minimum with a
// 32-bit sentinel value.
static void filterMatches(MatchSet &matches) {
    // end offset -> smallest start offset seen for it
    std::map<size_t, size_t> min_som;
    for (MatchSet::const_iterator it = matches.begin(); it != matches.end();
         ++it) {
        std::pair<std::map<size_t, size_t>::iterator, bool> ins =
            min_som.insert(std::make_pair(it->second, it->first));
        if (!ins.second && it->first < ins.first->second) {
            ins.first->second = it->first;
        }
    }

    // erase every match whose start offset is not the minimum for its end
    MatchSet::iterator msit = matches.begin();
    while (msit != matches.end()) {
        if (msit->first <= min_som.find(msit->second)->second) {
            ++msit;
            continue;
        }
        DEBUG_PRINTF("erasing match %zu, %zu\n", msit->first, msit->second);
        matches.erase(msit++);
    }
}
/** \brief Find all matches for a given graph when executed against \a input.
*
* Fills \a matches with offsets into the data stream where a match is found.
*/
void findMatches(const NGHolder &g, const ReportManager &rm,
const string &input, MatchSet &matches, const bool notEod,
const bool som, const bool utf8) {
const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0);
struct fmstate state(som, utf8, allowStartDs, rm);
initStates(g, state);
string::const_iterator it, ite;
for (it = input.begin(), ite = input.end(); it != ite; ++it) {
state.offset = distance(input.begin(), it);
state.cur = *it;
step(g, state);
getMatches(g, matches, state, false);
DEBUG_PRINTF("index %zu, %zu states on\n", state.offset, state.next.size());
if (state.next.empty()) {
if (state.som) {
filterMatches(matches);
}
return;
}
state.states.swap(state.next);
state.prev = state.cur;
}
state.offset = input.size();
state.cur = 0;
// do additional step to get matches after stream end, this time count eod
// matches also (or not, if we're in notEod mode)
getMatches(g, matches, state, !notEod);
if (state.som) {
filterMatches(matches);
}
}

56
util/ng_find_matches.h Normal file
View File

@@ -0,0 +1,56 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Pattern matching based on direct NFA execution.
*/
#ifndef NG_FIND_MATCHES_H
#define NG_FIND_MATCHES_H
#include <string>
#include <set>
// Forward declarations of the ue2 types named below, so that this header only
// has to include <string> and <set> rather than the full type definitions.
namespace ue2 {
class NGHolder;
class ReportManager;
// NOTE(review): BoundaryReports is not referenced by any declaration in this
// header; confirm whether this forward declaration is still needed.
struct BoundaryReports;
} // namespace ue2
/** \brief Find all matches for a given graph when executed against \a input.
 *
 * Fills \a matches with offsets into the data stream where a match is found.
 * Each entry is a pair of offsets; NOTE(review): presumably (start, end) of
 * the match -- confirm against the implementation.
 *
 * \param g       Graph to execute.
 * \param rm      Report manager associated with the graph.
 * \param input   Data to scan.
 * \param matches Output set that receives the matches found.
 * \param notEod  NOTE(review): appears to suppress matches that are only
 *                valid at end of data -- confirm against the implementation.
 * \param som     NOTE(review): presumably enables start-of-match tracking
 *                -- confirm against the implementation.
 * \param utf8    NOTE(review): presumably treats \a input as UTF-8 --
 *                confirm against the implementation.
 */
void findMatches(const ue2::NGHolder &g, const ue2::ReportManager &rm,
const std::string &input,
std::set<std::pair<size_t, size_t>> &matches,
const bool notEod, const bool som, const bool utf8);
#endif // NG_FIND_MATCHES_H

130
util/string_util.h Normal file
View File

@@ -0,0 +1,130 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef STRING_UTIL_H
#define STRING_UTIL_H
#include "ue2common.h"
#include <iomanip>
#include <string>
#include <sstream>
#include <vector>
#include <cstring>
#include <iostream>
//
// Utility functions
//
// Parse the whole of s into val using stream extraction, so any type that
// std::istringstream can extract is supported.
//
// Returns true only if the extraction succeeds and no characters are left
// over afterwards. Leading whitespace is skipped by the extraction itself;
// anything trailing the value (including whitespace) causes failure.
template<typename T>
inline bool fromString(const std::string &s, T& val)
{
    std::istringstream stream(s);
    if (!(stream >> val)) {
        return false;
    }
    // Success requires that the stream has nothing more to give.
    char leftover;
    return !stream.get(leftover);
}
// Read a comma-separated list of values from s and append them to out.
// Deliberately simple and lenient; not intended for external consumption.
//
// Parsing stops at the first value that fails to extract or at the first
// separator that is not a comma; whatever was parsed before that point is
// kept. Returns true if out is non-empty on exit.
template<typename T>
inline bool strToList(const std::string &s, std::vector<T>& out)
{
    std::istringstream stream(s);
    for (;;) {
        T item;
        if (!(stream >> item)) {
            break;
        }
        out.push_back(item);

        // Carry on only while each value is followed by a comma.
        char sep;
        if (!stream.get(sep) || sep != ',') {
            break;
        }
    }
    return !out.empty();
}
// Build an escaped, printable rendering of a string. This should probably
// become an IO manipulator or something.
//
// Double quotes, newlines, tabs and carriage returns are written as the
// two-character escapes \" \n \t \r. Other bytes in the range 0x20..0x7e
// (except backslash) are copied through unchanged. Everything else,
// backslash included, is written as a two-digit \xNN hex escape.
UNUSED static
const std::string printable(const std::string &in) {
    std::ostringstream escaped;
    for (const char raw : in) {
        // Work on the unsigned value so the range checks behave for bytes
        // with the top bit set.
        const unsigned char byte = raw;
        switch (byte) {
        case '\"':
            escaped << "\\\"";
            break;
        case '\n':
            escaped << "\\n";
            break;
        case '\t':
            escaped << "\\t";
            break;
        case '\r':
            escaped << "\\r";
            break;
        default:
            if (byte >= 0x20 && byte <= 0x7e && byte != '\\') {
                escaped << byte;
            } else {
                escaped << "\\x"
                        << std::hex << std::setw(2) << std::setfill('0')
                        << static_cast<unsigned>(byte)
                        << std::dec;
            }
            break;
        }
    }
    return escaped.str();
}
// Write the sequence [begin, end) to out, collapsing runs of consecutive
// values (each element equal to its predecessor plus one) into "first-last".
// Runs and stand-alone values are separated by ", ". An empty sequence
// writes nothing.
template<typename it_t>
void prettyPrintRange(std::ostream &out, it_t begin, it_t end) {
    if (begin == end) {
        return;
    }

    it_t prev = begin;
    it_t cur = begin;
    out << *cur;

    bool run_open = false;
    for (++cur; cur != end; prev = cur, ++cur) {
        if (*cur == *prev + 1) {
            // Still inside a consecutive run; defer output until it ends.
            run_open = true;
        } else {
            if (run_open) {
                out << "-" << *prev;
                run_open = false;
            }
            out << ", ";
            out << *cur;
        }
    }

    // Close off a run that extends to the end of the sequence.
    if (run_open) {
        out << "-" << *prev;
    }
}
#endif // STRING_UTIL_H