mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-09-29 19:24:25 +03:00
Initial commit of Hyperscan
This commit is contained in:
33
util/CMakeLists.txt
Normal file
33
util/CMakeLists.txt
Normal file
@@ -0,0 +1,33 @@
|
||||
# utility libs

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})

# Compile the Ragel-generated parser with the Ragel-specific flags.
# The property must be set on the file that is actually compiled by this
# directory; the previous path pointed into the unrelated "tools" build
# directory, so the flags were never applied.
# NOTE(review): assumes ragelmaker() writes ExpressionParser.cpp into
# CMAKE_CURRENT_BINARY_DIR -- confirm against the macro definition.
set_source_files_properties(
    ${CMAKE_CURRENT_BINARY_DIR}/ExpressionParser.cpp
    PROPERTIES
        COMPILE_FLAGS "${RAGEL_C_FLAGS}")

ragelmaker(ExpressionParser.rl)

# Expression file loading and "/expr/flags{params}" parsing.
set(expressionutil_SRCS
    expressions.cpp
    expressions.h
    ExpressionParser.h
    ExpressionParser.cpp
    )
add_library(expressionutil ${expressionutil_SRCS})
# The parser source is generated, so the Ragel target has to run first.
add_dependencies(expressionutil ragel_ExpressionParser)

# Corpus generation and editing helpers.
set(corpusomatic_SRCS
    ng_corpus_editor.h
    ng_corpus_editor.cpp
    ng_corpus_generator.h
    ng_corpus_generator.cpp
    ng_corpus_properties.h
    ng_corpus_properties.cpp
    ng_find_matches.h
    ng_find_matches.cpp
    )
add_library(corpusomatic ${corpusomatic_SRCS})
|
||||
|
40
util/ExpressionParser.h
Normal file
40
util/ExpressionParser.h
Normal file
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef EXPRESSIONPARSER_H
|
||||
#define EXPRESSIONPARSER_H
|
||||
|
||||
#include <string>
|
||||
|
||||
struct hs_expr_ext;
|
||||
|
||||
/**
 * Parse a line of the form "/expr/flags{params}".
 *
 * \param line  Input text; must start with '/' and contain a second '/'.
 * \param expr  Receives the text between the first and the last '/'.
 * \param flags Must be non-null. Reset to 0, then OR-ed with the HS_FLAG_*
 *              value for every single-character flag after the last '/'.
 * \param ext   Must be non-null. Reset, then filled from the optional
 *              "{min_offset=N,max_offset=N,min_length=N}" list.
 * \param must_be_ordered Optional. Reset to false, then set to true when the
 *              'O' flag is present.
 * \return true when the whole flag/parameter suffix was parsed; false on any
 *         syntax error. On failure the output arguments may have been
 *         partially updated.
 */
bool readExpression(const std::string &line, std::string &expr,
                    unsigned int *flags, hs_expr_ext *ext,
                    bool *must_be_ordered = nullptr);
|
||||
|
||||
#endif
|
173
util/ExpressionParser.rl
Normal file
173
util/ExpressionParser.rl
Normal file
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "ExpressionParser.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "hs_compile.h"
|
||||
|
||||
|
||||
using std::string;
|
||||
|
||||
namespace { // anon

/** Keys accepted inside the trailing "{key=value,...}" parameter list. */
enum ParamKey {
    PARAM_NONE,         //!< no key has been recognised yet
    PARAM_MIN_OFFSET,
    PARAM_MAX_OFFSET,
    PARAM_MIN_LENGTH
};

%%{
    machine ExpressionParser;

    # Fold the current decimal digit into num. The actions in this machine
    # are expanded inside readExpression(), so they may use its locals and
    # return from it directly.
    action accumulateNum {
        const u64a digit = (u64a)(fc - '0');
        if (num > (~(u64a)0 - digit) / 10) {
            // The value does not fit in 64 bits. Reject it as a syntax
            // error instead of silently wrapping to a small number.
            return false;
        }
        num = (num * 10) + digit;
    }

    # Translate one single-character flag into its HS_FLAG_* bit.
    action handleFlag {
        switch (fc) {
            case 'i': *flags |= HS_FLAG_CASELESS; break;
            case 's': *flags |= HS_FLAG_DOTALL; break;
            case 'm': *flags |= HS_FLAG_MULTILINE; break;
            case 'H': *flags |= HS_FLAG_SINGLEMATCH; break;
            case 'O':
                // Not a compile flag: only reported back to the caller,
                // and only when the caller asked for it.
                if (must_be_ordered) {
                    *must_be_ordered = true;
                }
                break;
            case 'V': *flags |= HS_FLAG_ALLOWEMPTY; break;
            case 'W': *flags |= HS_FLAG_UCP; break;
            case '8': *flags |= HS_FLAG_UTF8; break;
            case 'P': *flags |= HS_FLAG_PREFILTER; break;
            case 'L': *flags |= HS_FLAG_SOM_LEFTMOST; break;
            default: fbreak;
        }
    }

    # Store the number just scanned into the field selected by key and
    # mark that field as present in ext->flags.
    action handleExtParam {
        switch (key) {
            case PARAM_MIN_OFFSET:
                ext->flags |= HS_EXT_FLAG_MIN_OFFSET;
                ext->min_offset = num;
                break;
            case PARAM_MAX_OFFSET:
                ext->flags |= HS_EXT_FLAG_MAX_OFFSET;
                ext->max_offset = num;
                break;
            case PARAM_MIN_LENGTH:
                ext->flags |= HS_EXT_FLAG_MIN_LENGTH;
                ext->min_length = num;
                break;
            case PARAM_NONE:
            default:
                // No key specified, syntax invalid.
                return false;
        }
    }

    write data;
}%%

} // namespace
|
||||
|
||||
static
|
||||
void initExt(hs_expr_ext *ext) {
|
||||
memset(ext, 0, sizeof(*ext));
|
||||
ext->max_offset = MAX_OFFSET;
|
||||
}
|
||||
|
||||
/**
 * Split "/expr/flags{params}" into its parts.
 *
 * The expression text is cut out with plain string operations; the suffix
 * after the last '/' is scanned by the ExpressionParser Ragel machine, whose
 * actions write into *flags, *ext and *must_be_ordered.
 *
 * Returns false when the input does not start with '/', has no second '/',
 * or the suffix is not a valid flag/parameter list. The outputs are reset
 * up front, and expr is assigned before the suffix is scanned, so they may
 * be partially updated when false is returned.
 */
bool readExpression(const std::string &input, std::string &expr,
                    unsigned int *flags, hs_expr_ext *ext,
                    bool *must_be_ordered) {
    assert(flags);
    assert(ext);

    // Init flags and ext params.
    *flags = 0;
    initExt(ext);
    if (must_be_ordered) {
        *must_be_ordered = false;
    }

    // Extract expr, which is easier to do in straight C++ than with Ragel.
    if (input.empty() || input[0] != '/') {
        return false;
    }
    // The LAST slash terminates the expression, so the expression itself may
    // contain '/' characters.
    size_t end = input.find_last_of('/');
    if (end == string::npos || end == 0) {
        return false;
    }
    expr = input.substr(1, end - 1);

    // Use a Ragel scanner to handle flags and params.
    // p/pe/eof/ts/te/cs/act are the variable names the generated code uses.
    const char *p = input.c_str() + end + 1;
    const char *pe = input.c_str() + input.size();
    UNUSED const char *eof = pe;
    UNUSED const char *ts = p, *te = p;
    int cs;
    UNUSED int act;

    assert(p);
    assert(pe);

    // For storing integers as they're scanned.
    u64a num = 0;
    enum ParamKey key = PARAM_NONE;

    %%{
        single_flag = [ismW8HPLVO];
        param = ('min_offset' @{ key = PARAM_MIN_OFFSET; } |
                 'max_offset' @{ key = PARAM_MAX_OFFSET; } |
                 'min_length' @{ key = PARAM_MIN_LENGTH; } );

        value = (digit @accumulateNum)+ >{num = 0;};
        param_spec = (' '* param '=' value ' '*) >{ key = PARAM_NONE; }
                     %handleExtParam;

        main := ( single_flag @handleFlag )* # single-char flags
                ( '{' param_spec (',' param_spec)* '}' )? # list of ext params
                $^{ return false; };

        # Initialize and execute.
        write init;
        write exec;
    }%%

    DEBUG_PRINTF("expr='%s', flags=%u\n", expr.c_str(), *flags);

    // Success requires a non-error state and that all input was consumed.
    return (cs != ExpressionParser_error) && (p == pe);
}
|
297
util/expressions.cpp
Normal file
297
util/expressions.cpp
Normal file
@@ -0,0 +1,297 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#include <boost/algorithm/string/trim.hpp>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#if !defined(_WIN32)
|
||||
#include <dirent.h>
|
||||
#include <unistd.h>
|
||||
#else
|
||||
// Windows support is probably very fragile
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#include "expressions.h"
|
||||
#include "hs.h"
|
||||
#include "string_util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/**
 * Report a parse error for line \p lineNum of \p file on stderr, echo the
 * offending \p line, and terminate the process with exit status 1.
 */
static
void failLine(unsigned lineNum, const string &file,
              const string &line, const string &error) {
    cerr << "Parse error in file " << file << " on line " << lineNum << ": "
         << error << endl;
    cerr << "Line is: '" << line << "'" << endl;
    exit(1);
}
|
||||
|
||||
static
|
||||
void processLine(string &line, unsigned lineNum,
|
||||
const string &file, ExpressionMap &exprMap) {
|
||||
// if line is empty, or a comment, we can skip it
|
||||
if (line.empty() || line[0] == '#') {
|
||||
return;
|
||||
}
|
||||
|
||||
// cull any whitespace
|
||||
boost::trim(line);
|
||||
|
||||
// otherwise, it should be ID:PCRE, e.g.
|
||||
// 10001:/foobar/is
|
||||
|
||||
size_t colonIdx = line.find_first_of(':');
|
||||
if (colonIdx == string::npos) {
|
||||
failLine(lineNum, file, line, "Could not parse line.");
|
||||
}
|
||||
|
||||
// we should have an unsigned int as an ID, before the colon
|
||||
unsigned id;
|
||||
if (!fromString(line.substr(0, colonIdx), id)) {
|
||||
failLine(lineNum, file, line, "Unable to parse ID.");
|
||||
}
|
||||
|
||||
// rest of the expression is the PCRE
|
||||
const string pcre_str(line.substr(colonIdx + 1));
|
||||
|
||||
//cout << "Inserting expr: id=" << id << ", pcre=" << pcre_str << endl;
|
||||
|
||||
bool ins = exprMap.insert(ExpressionMap::value_type(id, pcre_str)).second;
|
||||
if (!ins) {
|
||||
failLine(lineNum, file, line, "Duplicate ID found.");
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define stat _stat
|
||||
#define S_ISDIR(st_m) (_S_IFDIR & (st_m))
|
||||
#define S_ISREG(st_m) (_S_IFREG & (st_m))
|
||||
#endif
|
||||
void loadExpressionsFromFile(const string &fname, ExpressionMap &exprMap) {
|
||||
struct stat st;
|
||||
if (stat(fname.c_str(), &st) != 0) {
|
||||
return;
|
||||
}
|
||||
if (!S_ISREG(st.st_mode)) {
|
||||
return;
|
||||
}
|
||||
ifstream f(fname.c_str());
|
||||
if (!f.good()) {
|
||||
throw runtime_error("Can't open file");
|
||||
}
|
||||
|
||||
unsigned lineNum = 0;
|
||||
string line;
|
||||
while (getline(f, line)) {
|
||||
lineNum++;
|
||||
processLine(line, lineNum, fname, exprMap);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Decide whether a directory entry name should be skipped: true for an empty
 * name, an editor backup file (trailing '~') or a dotfile (leading '.').
 */
static
bool isIgnorable(const std::string &f) {
    return f.empty() || f.back() == '~' || f.front() == '.';
}
|
||||
|
||||
#ifndef _WIN32
|
||||
void loadExpressions(const string &inPath, ExpressionMap &exprMap) {
|
||||
// Is our input path a file or a directory?
|
||||
struct stat st;
|
||||
if (stat(inPath.c_str(), &st) != 0) {
|
||||
cerr << "Can't stat path: '" << inPath << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (S_ISREG(st.st_mode)) {
|
||||
// process file
|
||||
try {
|
||||
loadExpressionsFromFile(inPath, exprMap);
|
||||
} catch (runtime_error &e) {
|
||||
cerr << e.what() << ": '" << inPath << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
} else if (S_ISDIR(st.st_mode)) {
|
||||
DIR *d = opendir(inPath.c_str());
|
||||
if (d == nullptr) {
|
||||
cerr << "Can't open directory: '" << inPath << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
for (struct dirent *ent = readdir(d); ent; ent = readdir(d)) {
|
||||
string basename(ent->d_name);
|
||||
string fname(inPath);
|
||||
fname.push_back('/');
|
||||
fname.append(basename);
|
||||
|
||||
// Ignore '.' and '..'
|
||||
if (basename == "." || basename == "..") {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip emacs backup files, dotfiles (such as VIM swap).
|
||||
if (isIgnorable(basename)) {
|
||||
cerr << "Ignoring signature file " << fname << endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
loadExpressionsFromFile(fname, exprMap);
|
||||
} catch (runtime_error &e) {
|
||||
cerr << e.what() << ": '" << fname << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
closedir(d);
|
||||
} else {
|
||||
cerr << "Can't stat path: '" << inPath << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
#else // windows TODO: improve
|
||||
void loadExpressions(const string &inPath, ExpressionMap &exprMap) {
|
||||
// Is our input path a file or a directory?
|
||||
struct stat st;
|
||||
if (stat(inPath.c_str(), &st) != 0) {
|
||||
cerr << "Can't stat path: '" << inPath << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (S_ISREG(st.st_mode)) {
|
||||
// process file
|
||||
try {
|
||||
loadExpressionsFromFile(inPath, exprMap);
|
||||
} catch (runtime_error &e) {
|
||||
cerr << e.what() << ": '" << inPath << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
} else if (S_ISDIR(st.st_mode)) {
|
||||
WIN32_FIND_DATA ffd;
|
||||
HANDLE hFind = INVALID_HANDLE_VALUE;
|
||||
string glob = inPath + "/*";
|
||||
hFind = FindFirstFile(glob.c_str(), &ffd);
|
||||
if (hFind == INVALID_HANDLE_VALUE) {
|
||||
cerr << "Can't open directory: '" << inPath << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
do {
|
||||
string basename(ffd.cFileName);
|
||||
string fname(inPath);
|
||||
fname.push_back('/');
|
||||
fname.append(basename);
|
||||
|
||||
// Ignore '.' and '..'
|
||||
if (basename == "." || basename == "..") {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip emacs backup files, dotfiles (such as VIM swap).
|
||||
if (isIgnorable(basename)) {
|
||||
cerr << "Ignoring signature file " << fname << endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
loadExpressionsFromFile(fname, exprMap);
|
||||
} catch (runtime_error &e) {
|
||||
cerr << e.what() << ": '" << fname << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
} while (FindNextFile(hFind, &ffd) != 0);
|
||||
FindClose(hFind);
|
||||
} else {
|
||||
cerr << "Can't stat path: '" << inPath << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void loadSignatureList(const string &inFile,
|
||||
SignatureSet &signatures) {
|
||||
ifstream f(inFile.c_str());
|
||||
if (!f.good()) {
|
||||
cerr << "Can't open file: '" << inFile << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
unsigned lineNum = 0;
|
||||
string line;
|
||||
while (getline(f, line)) {
|
||||
lineNum++;
|
||||
|
||||
// if line is empty, or a comment, we can skip it
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned id;
|
||||
if (fromString(line, id)) {
|
||||
signatures.push_back(id);
|
||||
} else {
|
||||
// Parse error occurred
|
||||
failLine(lineNum, inFile, line, "Unable to parse ID.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reduce \p exprMap, in place, to the entries whose ID appears in
 * \p signatures. An ID missing from \p exprMap prints an error and exits the
 * process with status 1.
 */
void limitBySignature(ExpressionMap &exprMap,
                      const SignatureSet &signatures) {
    ExpressionMap selected;

    for (const auto &sigId : signatures) {
        const auto found = exprMap.find(sigId);
        if (found == exprMap.end()) {
            cerr << "Unable to find signature " << sigId
                 << " in expression set!" << endl;
            exit(1);
        }
        // Repeated IDs are harmless: insert() keeps the first copy.
        selected.insert(*found);
    }

    exprMap.swap(selected);
}
|
55
util/expressions.h
Normal file
55
util/expressions.h
Normal file
@@ -0,0 +1,55 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef EXPRESSIONS_H
|
||||
#define EXPRESSIONS_H
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <list>
|
||||
|
||||
// Signature ID -> expression text (everything after the first ':' of an
// "ID:PCRE" line).
typedef std::map<unsigned, std::string> ExpressionMap;
// Signature IDs, in the order they were read.
typedef std::list<unsigned> SignatureSet;

// load all of the expressions from the given path into the given expression
// map. The path may be a single file, or a directory whose files are each
// loaded. Exits on failure.
void loadExpressions(const std::string &inDir, ExpressionMap &exprMap);

// load the expressions from a single file. Does nothing if fname cannot be
// stat'ed or is not a regular file; throws std::runtime_error if it cannot be
// opened; exits on a malformed line or a duplicate ID.
void loadExpressionsFromFile(const std::string &fname, ExpressionMap &exprMap);

// load a list of signature IDs, one per line ('#' lines and empty lines are
// skipped). Exits on failure.
void loadSignatureList(const std::string &inFile, SignatureSet &signatures);

// produce a new expression map only containing those signatures in the
// expression list
void generateExprMap(const SignatureSet &signatures,
                     const ExpressionMap &allExprs, ExpressionMap &out);

// trim expression map to only the given signatures (in-place). Exits if a
// signature is not present in the map.
void limitBySignature(ExpressionMap &exprMap, const SignatureSet &signatures);
|
||||
#endif
|
290
util/ng_corpus_editor.cpp
Normal file
290
util/ng_corpus_editor.cpp
Normal file
@@ -0,0 +1,290 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Corpus Editor: applies random transformation to a corpus.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "ng_corpus_editor.h"
|
||||
#include "ng_corpus_properties.h"
|
||||
#include "ue2common.h"
|
||||
#include "util/compare.h"
|
||||
#include "util/unicode_def.h"
|
||||
#include "parser/ucp_table.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <string>
|
||||
|
||||
using namespace std;
|
||||
using namespace ue2;
|
||||
|
||||
namespace {
|
||||
|
||||
/**
 * Kinds of random edit. chooseOperation() draws uniformly from the range
 * [EDIT_INSERT, EDIT_FLIP_CASE], so those two must stay the first and last
 * enumerators and the values in between must stay contiguous.
 */
enum Operation {
    EDIT_INSERT = 0,     //!< insert a character
    EDIT_REMOVE = 1,     //!< remove a character
    EDIT_SUBSTITUTE = 2, //!< substitute a character for another
    EDIT_TRANSPOSE = 3,  //!< swap two characters
    EDIT_FLIP_CASE = 4,  //!< invert the case of an alpha character
};
|
||||
|
||||
template<typename SeqT>
|
||||
static
|
||||
size_t choosePosition(const SeqT &corpus, CorpusProperties &props) {
|
||||
assert(!corpus.empty());
|
||||
unsigned pos = props.rand(0, corpus.size() - 1);
|
||||
return pos;
|
||||
}
|
||||
|
||||
class CorpusEditor {
|
||||
public:
|
||||
CorpusEditor(CorpusProperties &p) : props(p) {}
|
||||
|
||||
// Apply edits to a corpus
|
||||
void applyEdits(string &corpus);
|
||||
|
||||
private:
|
||||
// operations
|
||||
void insert(string &corpus);
|
||||
void remove(string &corpus);
|
||||
void substitute(string &corpus);
|
||||
void transpose(string &corpus);
|
||||
void flip_case(string &corpus);
|
||||
|
||||
Operation chooseOperation();
|
||||
u8 chooseByte();
|
||||
|
||||
CorpusProperties &props;
|
||||
};
|
||||
|
||||
/** Draw a random edit kind between EDIT_INSERT and EDIT_FLIP_CASE. */
Operation CorpusEditor::chooseOperation() {
    return static_cast<Operation>(props.rand(EDIT_INSERT, EDIT_FLIP_CASE));
}
|
||||
|
||||
void CorpusEditor::applyEdits(string &corpus) {
|
||||
for (size_t i = 0; i != props.editDistance; i++) {
|
||||
Operation op = chooseOperation();
|
||||
switch (op) {
|
||||
case EDIT_INSERT:
|
||||
insert(corpus);
|
||||
break;
|
||||
case EDIT_REMOVE:
|
||||
remove(corpus);
|
||||
break;
|
||||
case EDIT_SUBSTITUTE:
|
||||
substitute(corpus);
|
||||
break;
|
||||
case EDIT_TRANSPOSE:
|
||||
transpose(corpus);
|
||||
break;
|
||||
case EDIT_FLIP_CASE:
|
||||
flip_case(corpus);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CorpusEditor::insert(string &corpus) {
|
||||
unsigned pos = props.rand(0, corpus.size());
|
||||
u8 c = chooseByte();
|
||||
corpus.insert(pos, 1, (char)c);
|
||||
}
|
||||
|
||||
void CorpusEditor::remove(string &corpus) {
|
||||
if (corpus.empty()) return;
|
||||
size_t pos = choosePosition(corpus, props);
|
||||
corpus.erase(pos, 1);
|
||||
}
|
||||
|
||||
void CorpusEditor::substitute(string &corpus) {
|
||||
if (corpus.empty()) return;
|
||||
size_t pos = choosePosition(corpus, props);
|
||||
corpus[pos] = chooseByte();
|
||||
}
|
||||
|
||||
void CorpusEditor::transpose(string &corpus) {
|
||||
if (corpus.empty()) return;
|
||||
size_t a = choosePosition(corpus, props);
|
||||
size_t b = choosePosition(corpus, props);
|
||||
u8 tmp = corpus[a];
|
||||
corpus[a] = corpus[b];
|
||||
corpus[b] = tmp;
|
||||
}
|
||||
|
||||
void CorpusEditor::flip_case(string &corpus) {
|
||||
if (corpus.empty()) return;
|
||||
|
||||
// Pick a random starting position and walk forward (wrapping at the end)
|
||||
// until we find an alpha character.
|
||||
const size_t len = corpus.size();
|
||||
const size_t pos = choosePosition(corpus, props);
|
||||
|
||||
size_t i = pos;
|
||||
for (;;) {
|
||||
char c = corpus[i];
|
||||
if (ourisalpha(c)) {
|
||||
char upper = mytoupper(c), lower = mytolower(c);
|
||||
corpus[i] = c == upper ? lower : upper;
|
||||
DEBUG_PRINTF("flipped c=%c to %c\n", c, corpus[i]);
|
||||
return;
|
||||
}
|
||||
if (++i == len) {
|
||||
i = 0;
|
||||
}
|
||||
if (i == pos) { // wrapped, no alpha characters
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Draw a random byte value in [0, 255]. */
u8 CorpusEditor::chooseByte() {
    return static_cast<u8>(props.rand(0, 255));
}
|
||||
|
||||
class CorpusEditorUtf8 {
|
||||
public:
|
||||
CorpusEditorUtf8(CorpusProperties &p) : props(p) {}
|
||||
|
||||
// Apply edits to a corpus.
|
||||
void applyEdits(vector<unichar> &corpus);
|
||||
|
||||
private:
|
||||
// operations
|
||||
void insert(vector<unichar> &corpus);
|
||||
void remove(vector<unichar> &corpus);
|
||||
void substitute(vector<unichar> &corpus);
|
||||
void transpose(vector<unichar> &corpus);
|
||||
void flip_case(vector<unichar> &corpus);
|
||||
|
||||
Operation chooseOperation();
|
||||
unichar chooseCodePoint();
|
||||
|
||||
CorpusProperties &props;
|
||||
};
|
||||
|
||||
/** Draw a random edit kind between EDIT_INSERT and EDIT_FLIP_CASE. */
Operation CorpusEditorUtf8::chooseOperation() {
    return static_cast<Operation>(props.rand(EDIT_INSERT, EDIT_FLIP_CASE));
}
|
||||
|
||||
void CorpusEditorUtf8::applyEdits(vector<unichar> &corpus) {
|
||||
for (size_t i = 0; i != props.editDistance; i++) {
|
||||
Operation op = chooseOperation();
|
||||
switch (op) {
|
||||
case EDIT_INSERT:
|
||||
insert(corpus);
|
||||
break;
|
||||
case EDIT_REMOVE:
|
||||
remove(corpus);
|
||||
break;
|
||||
case EDIT_SUBSTITUTE:
|
||||
substitute(corpus);
|
||||
break;
|
||||
case EDIT_TRANSPOSE:
|
||||
transpose(corpus);
|
||||
break;
|
||||
case EDIT_FLIP_CASE:
|
||||
flip_case(corpus);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CorpusEditorUtf8::insert(vector<unichar> &corpus) {
|
||||
unsigned pos = props.rand(0, corpus.size());
|
||||
corpus.insert(corpus.begin() + pos, chooseCodePoint());
|
||||
}
|
||||
|
||||
void CorpusEditorUtf8::remove(vector<unichar> &corpus) {
|
||||
if (corpus.empty()) return;
|
||||
size_t pos = choosePosition(corpus, props);
|
||||
corpus.erase(corpus.begin() + pos);
|
||||
}
|
||||
|
||||
void CorpusEditorUtf8::substitute(vector<unichar> &corpus) {
|
||||
if (corpus.empty()) return;
|
||||
size_t pos = choosePosition(corpus, props);
|
||||
corpus[pos] = chooseCodePoint();
|
||||
}
|
||||
|
||||
void CorpusEditorUtf8::transpose(vector<unichar> &corpus) {
|
||||
if (corpus.empty()) return;
|
||||
size_t a = choosePosition(corpus, props);
|
||||
size_t b = choosePosition(corpus, props);
|
||||
unichar tmp = corpus[a];
|
||||
corpus[a] = corpus[b];
|
||||
corpus[b] = tmp;
|
||||
}
|
||||
|
||||
void CorpusEditorUtf8::flip_case(vector<unichar> &corpus) {
|
||||
if (corpus.empty()) return;
|
||||
|
||||
// Pick a random starting position and walk forward (wrapping at the end)
|
||||
// until we find an alpha character.
|
||||
const size_t len = corpus.size();
|
||||
const size_t pos = choosePosition(corpus, props);
|
||||
|
||||
size_t i = pos;
|
||||
for (;;) {
|
||||
if (::flip_case(&corpus[i])) {
|
||||
return;
|
||||
}
|
||||
if (++i == len) {
|
||||
i = 0;
|
||||
}
|
||||
if (i == pos) { // wrapped, no alpha characters
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Draw a random code point in [0, MAX_UNICODE] that is not a surrogate.
 *
 * A value is drawn from a range shortened by the size of the surrogate
 * block; values at or above the start of that block are then shifted up by
 * the block size, so every non-surrogate code point is equally likely.
 *
 * The previous code computed the block size as MAX + MIN + 1 and shifted by
 * MAX + 1, which left large parts of the code space unreachable.
 */
unichar CorpusEditorUtf8::chooseCodePoint(void) {
    /* We need to ensure that we don't pick a surrogate cp */
    const u32 surrogates = UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1;
    const u32 range = MAX_UNICODE + 1 - surrogates;
    const unichar raw = props.rand(0, range - 1);
    if (raw < UNICODE_SURROGATE_MIN) {
        return raw;
    }
    // Skip over the surrogate block.
    return raw + surrogates;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/** Apply props.editDistance random byte-level edits to \p *corpus. */
void editCorpus(string *corpus, CorpusProperties &props) {
    CorpusEditor(props).applyEdits(*corpus);
}
|
||||
|
||||
/** Apply props.editDistance random edits to the unichar vector \p *corpus. */
void editCorpus(vector<unichar> *corpus, CorpusProperties &props) {
    CorpusEditorUtf8(props).applyEdits(*corpus);
}
|
47
util/ng_corpus_editor.h
Normal file
47
util/ng_corpus_editor.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Corpus Editor: applies random transformation to a corpus.
|
||||
*/
|
||||
|
||||
#ifndef CORPUS_EDITOR_H
|
||||
#define CORPUS_EDITOR_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "util/unicode_def.h"
|
||||
|
||||
class CorpusProperties;
|
||||
|
||||
void editCorpus(std::string *corpus, CorpusProperties &props);
|
||||
void editCorpus(std::vector<ue2::unichar> *corpus, CorpusProperties &props);
|
||||
|
||||
#endif
|
683
util/ng_corpus_generator.cpp
Normal file
683
util/ng_corpus_generator.cpp
Normal file
@@ -0,0 +1,683 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Corpus Generation tool.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "ng_corpus_generator.h"
|
||||
|
||||
#include "ng_corpus_editor.h"
|
||||
#include "nfagraph/ng.h"
|
||||
#include "nfagraph/ng_util.h"
|
||||
#include "ue2common.h"
|
||||
#include "util/container.h"
|
||||
#include "util/graph_range.h"
|
||||
#include "util/make_unique.h"
|
||||
#include "util/ue2_containers.h"
|
||||
#include "util/ue2string.h"
|
||||
#include "util/unicode_def.h"
|
||||
#include "util/unicode_set.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <deque>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
|
||||
using namespace std;
|
||||
using namespace ue2;
|
||||
using boost::ptr_vector;
|
||||
|
||||
typedef vector<NFAVertex> VertexPath;
|
||||
|
||||
#if defined(DEBUG)
|
||||
// For debugging output
|
||||
static
|
||||
string pathToString(const NGHolder &g, const VertexPath &p) {
|
||||
ostringstream oss;
|
||||
oss << '[';
|
||||
for (auto i = p.begin(); i != p.end(); ++i) {
|
||||
if (i != p.begin()) {
|
||||
oss << ',';
|
||||
}
|
||||
oss << g[*i].index;
|
||||
}
|
||||
oss << ']';
|
||||
return oss.str();
|
||||
}
|
||||
#endif
|
||||
|
||||
/** True if this graph has no non-special successors of start or startDs. */
|
||||
static
|
||||
bool graph_is_empty(const NGHolder &g) {
|
||||
for (const auto &v : adjacent_vertices_range(g.start, g)) {
|
||||
if (!is_special(v, g)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
for (const auto &v : adjacent_vertices_range(g.start, g)) {
|
||||
if (!is_special(v, g)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static
|
||||
string encodeUtf8(const vector<unichar> &v) {
|
||||
string rv;
|
||||
for (const unichar &cp : v) {
|
||||
if (cp < UTF_2CHAR_MIN) {
|
||||
rv.push_back(cp);
|
||||
} else if (cp < UTF_3CHAR_MIN) {
|
||||
rv.push_back(UTF_TWO_BYTE_HEADER | (cp >> UTF_CONT_SHIFT));
|
||||
rv.push_back(makeContByte(cp));
|
||||
} else if (cp < UTF_4CHAR_MIN) {
|
||||
rv.push_back(UTF_THREE_BYTE_HEADER | (cp >> (2 * UTF_CONT_SHIFT)));
|
||||
rv.push_back(makeContByte(cp >> UTF_CONT_SHIFT));
|
||||
rv.push_back(makeContByte(cp));
|
||||
} else {
|
||||
rv.push_back(UTF_FOUR_BYTE_HEADER | (cp >> (3 * UTF_CONT_SHIFT)));
|
||||
rv.push_back(makeContByte(cp >> (2 * UTF_CONT_SHIFT)));
|
||||
rv.push_back(makeContByte(cp >> UTF_CONT_SHIFT));
|
||||
rv.push_back(makeContByte(cp));
|
||||
}
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
/** \brief True if more than \a limit elements of [it, end) compare equal to
 * \a v. Stops scanning as soon as the answer is known. */
template<class Iter, class Val>
static
bool has_greater_than(Iter it, Iter end, const Val &v, size_t limit) {
    size_t seen = 0;
    while (it != end) {
        if (*it == v) {
            ++seen;
            if (seen > limit) {
                return true;
            }
        }
        ++it;
    }
    return false;
}
|
||||
|
||||
static
|
||||
void findPaths(const NGHolder &g, CorpusProperties &cProps,
|
||||
vector<VertexPath> &allPaths, size_t cycleLimit,
|
||||
size_t corpusLimit) {
|
||||
// The maximum number of open (in progress) paths. New paths beyond this
|
||||
// limit will evict a random existing one.
|
||||
const size_t MAX_OPEN = min((size_t)1000, corpusLimit * 10);
|
||||
|
||||
ptr_vector<VertexPath> open;
|
||||
open.push_back(new VertexPath(1, g.start));
|
||||
|
||||
ue2::unordered_set<NFAVertex> one_way_in;
|
||||
for (const auto &v : vertices_range(g)) {
|
||||
if (!hasGreaterInDegree(1, v, g)) {
|
||||
one_way_in.insert(v);
|
||||
}
|
||||
}
|
||||
|
||||
while (!open.empty()) {
|
||||
u32 slot = cProps.rand(0, open.size() - 1);
|
||||
swap(open.at(slot), open.back());
|
||||
ptr_vector<VertexPath>::auto_type p = open.pop_back();
|
||||
NFAVertex u = p->back();
|
||||
|
||||
DEBUG_PRINTF("dequeuing path %s, back %u\n",
|
||||
pathToString(g, *p).c_str(), g[u].index);
|
||||
|
||||
NFAGraph::adjacency_iterator ai, ae;
|
||||
for (tie(ai, ae) = adjacent_vertices(u, g); ai != ae; ++ai) {
|
||||
NFAVertex v = *ai;
|
||||
|
||||
if (u == g.startDs && v == g.startDs) {
|
||||
// explicitly avoid following startDs self-loop, as we have
|
||||
// other mechanisms for adding prefixes to our corpora.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Accept vertices generate completed paths.
|
||||
if (v == g.accept || v == g.acceptEod) {
|
||||
DEBUG_PRINTF("path complete: %s\n",
|
||||
pathToString(g, *p).c_str());
|
||||
allPaths.push_back(*p);
|
||||
if (allPaths.size() >= corpusLimit) {
|
||||
DEBUG_PRINTF("full, going home\n");
|
||||
return;
|
||||
}
|
||||
|
||||
// No meaningful edges out of accept or acceptEod.
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!contains(one_way_in, v) &&
|
||||
has_greater_than(p->begin(), p->end(), v, cycleLimit)) {
|
||||
// Note that vertices that only have one predecessor don't need
|
||||
// their cycle limit checked, as their predecessors will have
|
||||
// the same count.
|
||||
DEBUG_PRINTF("exceeded cycle limit for v=%u, pruning path\n",
|
||||
g[v].index);
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we've got no further adjacent vertices, re-use p rather than
|
||||
// copying it for the next path.
|
||||
VertexPath *new_path;
|
||||
if (boost::next(ai) == ae) {
|
||||
new_path = p.release();
|
||||
} else {
|
||||
new_path = new VertexPath(*p);
|
||||
}
|
||||
|
||||
new_path->push_back(v);
|
||||
if (open.size() < MAX_OPEN) {
|
||||
open.push_back(new_path);
|
||||
} else {
|
||||
u32 victim = cProps.rand(0, open.size() - 1);
|
||||
open.replace(victim, new_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
DEBUG_PRINTF("bored, going home\n");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
/** \brief Concrete implementation */
|
||||
class CorpusGeneratorImpl : public CorpusGenerator {
|
||||
public:
|
||||
CorpusGeneratorImpl(const NGHolder &graph_in, CorpusProperties &props);
|
||||
~CorpusGeneratorImpl() {}
|
||||
|
||||
void generateCorpus(vector<string> &data);
|
||||
|
||||
private:
|
||||
unsigned char getRandomChar();
|
||||
unsigned char getMatchChar(const CharReach &cr);
|
||||
unsigned char getUnmatchChar(const CharReach &cr);
|
||||
|
||||
unsigned char getChar(NFAVertex v);
|
||||
void newGenerator(vector<string> &data);
|
||||
string pathToCorpus(const VertexPath &path);
|
||||
|
||||
/** \brief Generate a string of random bytes between minLen and maxLen
|
||||
* bytes in length. */
|
||||
void addRandom(const min_max &mm, string *out);
|
||||
|
||||
/** \brief The NFA graph we operate over. */
|
||||
const NGHolder &graph;
|
||||
|
||||
/** \brief Reference to our corpus generator properties object (stores some
|
||||
* state) */
|
||||
CorpusProperties &cProps;
|
||||
};
|
||||
|
||||
/** \brief Bind the generator to its graph and properties; both are held by
 * reference. */
CorpusGeneratorImpl::CorpusGeneratorImpl(const NGHolder &graph_in,
                                         CorpusProperties &props)
    : graph(graph_in), cProps(props) {}
|
||||
|
||||
/** \brief Fill \a data with generated corpora and, when an edit distance is
 * configured, run the corpus editor over every string in \a data. */
void CorpusGeneratorImpl::generateCorpus(vector<string> &data) {
    newGenerator(data);

    // Editing is opt-in.
    if (!cProps.editDistance) {
        return;
    }

    for (auto &corpus : data) {
        editCorpus(&corpus, cProps);
    }
}
|
||||
|
||||
/** \brief Generate a random character, taking care to stick to the alphabet
|
||||
* that we've been asked for. */
|
||||
u8 CorpusGeneratorImpl::getRandomChar() {
|
||||
return 'a' + cProps.rand(0, min(cProps.alphabetSize, (u32)CharReach::npos));
|
||||
}
|
||||
|
||||
/** \brief Select a random character from the given string of valid match
|
||||
* characters. */
|
||||
unsigned char CorpusGeneratorImpl::getMatchChar(const CharReach &cr) {
|
||||
unsigned int num = cr.count();
|
||||
if (num == 0) {
|
||||
return 0;
|
||||
} else if (num == 1) {
|
||||
return (unsigned char)cr.find_first();
|
||||
} else if (num == 256) {
|
||||
// Dot class, any character is OK!
|
||||
return (unsigned char)cProps.rand(0, 255);
|
||||
}
|
||||
else {
|
||||
unsigned idx = cProps.rand(0, num - 1);
|
||||
return (unsigned char)cr.find_nth(idx);
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Select a character that does not belong to the given bitset. This
|
||||
* makes no guarantees on unmatchability if the bitset is full. */
|
||||
unsigned char CorpusGeneratorImpl::getUnmatchChar(const CharReach &cr) {
|
||||
return getMatchChar(~cr);
|
||||
}
|
||||
|
||||
void CorpusGeneratorImpl::addRandom(const min_max &mm, string *out) {
|
||||
assert(mm.min <= mm.max);
|
||||
u32 range = mm.max - mm.min;
|
||||
u32 len = mm.min + (range ? cProps.rand(0, range - 1) : 0);
|
||||
for (u32 i = 0; i < len; ++i) {
|
||||
out->push_back(getRandomChar());
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Produce one byte for vertex \a v: a matching, non-matching or
 * random byte, as decided by CorpusProperties::throwDice(). */
unsigned char CorpusGeneratorImpl::getChar(NFAVertex v) {
    const CharReach &reach = graph.g[v].char_reach;

    const CorpusProperties::RollResult roll = cProps.throwDice();
    if (roll == CorpusProperties::ROLLED_MATCH) {
        return getMatchChar(reach);
    }
    if (roll == CorpusProperties::ROLLED_UNMATCH) {
        return getUnmatchChar(reach);
    }
    if (roll == CorpusProperties::ROLLED_RANDOM) {
        /* character pulled from hat */
        return getRandomChar();
    }

    // Every enumerator is handled above.
    assert(0);
    return 0;
}
|
||||
|
||||
/** \brief Convert a path through the graph to a corpus string. */
|
||||
string CorpusGeneratorImpl::pathToCorpus(const VertexPath &path) {
|
||||
string s;
|
||||
|
||||
// Add random prefix
|
||||
if (cProps.prefixRange.max) {
|
||||
addRandom(cProps.prefixRange, &s);
|
||||
}
|
||||
|
||||
// Generate a corpus from our path
|
||||
for (const auto &e : path) {
|
||||
if (!is_special(e, graph)) {
|
||||
s += getChar(e);
|
||||
}
|
||||
}
|
||||
|
||||
// Add random suffix
|
||||
if (cProps.suffixRange.max) {
|
||||
addRandom(cProps.suffixRange, &s);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
/** \brief Build up to cProps.corpusLimit distinct corpora and append them to
 * \a outdata in sorted order. */
void CorpusGeneratorImpl::newGenerator(vector<string> &outdata) {
    const unsigned int maxCycles = cProps.getCycleLimit().second;
    DEBUG_PRINTF("generating up to %u corpora, cycle limit of %u\n",
                 cProps.corpusLimit, maxCycles);

    vector<VertexPath> allPaths;

    // Special case: if the graph has ONLY special vertices, then this is
    // likely to be an odd vacuous pattern or a pattern that can never match.
    // In these cases, an empty corpus is useful.
    if (graph_is_empty(graph)) {
        allPaths.push_back(VertexPath(1, graph.start));
    }

    // build a set of unique paths
    findPaths(graph, cProps, allPaths, maxCycles, cProps.corpusLimit);

    // transform paths into corpora: we do this repeatedly until we (a) hit our
    // limit, or (b) don't generate any new corpora for any of our paths.
    set<string> unique;
    bool progressed = true;
    while (progressed && unique.size() < cProps.corpusLimit) {
        const size_t before = unique.size();
        for (const auto &path : allPaths) {
            string s = pathToCorpus(path);
            if (!unique.insert(s).second) {
                continue; // duplicate
            }
            DEBUG_PRINTF("corpus %zu (%zu bytes): '%s'\n", unique.size(),
                         s.size(), escapeString(s).c_str());
            if (unique.size() == cProps.corpusLimit) {
                break; // limit reached; the outer condition ends the loop
            }
        }
        // A full pass without a new corpus means we are finding it hard to
        // generate more.
        progressed = unique.size() != before;
    }

    DEBUG_PRINTF("%zu corpora built\n", unique.size());

    // populate the output vector from the set we built.
    outdata.reserve(unique.size());
    outdata.insert(outdata.end(), unique.begin(), unique.end());
}
|
||||
|
||||
/** \brief Concrete implementation for UTF-8 */
|
||||
class CorpusGeneratorUtf8 : public CorpusGenerator {
|
||||
public:
|
||||
CorpusGeneratorUtf8(const NGHolder &graph_in, CorpusProperties &props);
|
||||
~CorpusGeneratorUtf8() {}
|
||||
|
||||
void generateCorpus(vector<string> &data);
|
||||
|
||||
private:
|
||||
unichar getRandomChar();
|
||||
unichar getMatchChar(CodePointSet cps);
|
||||
unichar getUnmatchChar(const CodePointSet &cps);
|
||||
|
||||
unichar getChar(const CodePointSet &cps);
|
||||
void newGenerator(vector<vector<unichar> > &data);
|
||||
vector<unichar> pathToCorpus(const vector<CodePointSet> &path);
|
||||
|
||||
/** \brief Generate a random string between min and max codepoints in
|
||||
* length. */
|
||||
void addRandom(const min_max &mm, vector<unichar> *out);
|
||||
|
||||
/** \brief The NFA graph we operate over. */
|
||||
const NGHolder &graph;
|
||||
|
||||
/** \brief Reference to our corpus generator properties object (stores some
|
||||
* state) */
|
||||
CorpusProperties &cProps;
|
||||
};
|
||||
|
||||
/** \brief Bind the generator to its graph and properties; both are held by
 * reference. */
CorpusGeneratorUtf8::CorpusGeneratorUtf8(const NGHolder &graph_in,
                                         CorpusProperties &props)
    : graph(graph_in), cProps(props) {}
|
||||
|
||||
/** \brief Generate code-point corpora, edit them if an edit distance is
 * configured, then append the UTF-8 encoding of each one to \a data. */
void CorpusGeneratorUtf8::generateCorpus(vector<string> &data) {
    vector<vector<unichar>> codepoints;
    newGenerator(codepoints);

    // Edits are applied to code points, before encoding.
    if (cProps.editDistance) {
        for (auto &corpus : codepoints) {
            editCorpus(&corpus, cProps);
        }
    }

    for (const auto &corpus : codepoints) {
        data.push_back(encodeUtf8(corpus));
    }
}
|
||||
|
||||
/** \brief Generate a random character, taking care to stick to the alphabet
|
||||
* that we've been asked for. */
|
||||
unichar CorpusGeneratorUtf8::getRandomChar() {
|
||||
u32 range = MAX_UNICODE + 1
|
||||
- (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1);
|
||||
range = min(cProps.alphabetSize, range);
|
||||
assert(range);
|
||||
|
||||
unichar c = 'a' + cProps.rand(0, range - 1);
|
||||
|
||||
if (c >= UNICODE_SURROGATE_MIN) {
|
||||
c =+ UNICODE_SURROGATE_MAX + 1;
|
||||
}
|
||||
|
||||
return c % (MAX_UNICODE + 1);
|
||||
}
|
||||
|
||||
/** \brief Select a random character from the given string of valid match
|
||||
* characters. */
|
||||
unichar CorpusGeneratorUtf8::getMatchChar(CodePointSet cps) {
|
||||
cps.unsetRange(UNICODE_SURROGATE_MIN, UNICODE_SURROGATE_MAX);
|
||||
u32 num = cps.count();
|
||||
if (num == 0) {
|
||||
return 0;
|
||||
} else if (num == 1) {
|
||||
return lower(*cps.begin());
|
||||
} else {
|
||||
unichar rv = cps.at(cProps.rand(0, num - 1));
|
||||
assert(rv != INVALID_UNICODE);
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Select a character that does not belong to the given bitset. This
|
||||
* makes no guarantees on unmatchability if the bitset is full. */
|
||||
unichar CorpusGeneratorUtf8::getUnmatchChar(const CodePointSet &cps) {
|
||||
return getMatchChar(~cps);
|
||||
}
|
||||
|
||||
void CorpusGeneratorUtf8::addRandom(const min_max &mm, vector<unichar> *out) {
|
||||
assert(mm.min <= mm.max);
|
||||
u32 range = mm.max - mm.min;
|
||||
u32 len = mm.min + (range ? cProps.rand(0, range - 1) : 0);
|
||||
for (u32 i = 0; i < len; ++i) {
|
||||
out->push_back(getRandomChar());
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Produce one code point for a path position described by \a cps: a
 * matching, non-matching or random one, as decided by
 * CorpusProperties::throwDice(). */
unichar CorpusGeneratorUtf8::getChar(const CodePointSet &cps) {
    const CorpusProperties::RollResult roll = cProps.throwDice();
    if (roll == CorpusProperties::ROLLED_MATCH) {
        return getMatchChar(cps);
    }
    if (roll == CorpusProperties::ROLLED_UNMATCH) {
        return getUnmatchChar(cps);
    }
    if (roll == CorpusProperties::ROLLED_RANDOM) {
        /* character pulled from hat */
        return getRandomChar();
    }

    // Every enumerator is handled above.
    assert(0);
    return 0;
}
|
||||
|
||||
/** \brief Convert a path through the graph to a corpus string. */
|
||||
vector<unichar>
|
||||
CorpusGeneratorUtf8::pathToCorpus(const vector<CodePointSet> &path) {
|
||||
vector<unichar> s;
|
||||
|
||||
// Add random prefix
|
||||
if (cProps.prefixRange.max) {
|
||||
addRandom(cProps.prefixRange, &s);
|
||||
}
|
||||
|
||||
// Generate a corpus from our path
|
||||
for (const auto &e : path) {
|
||||
s.push_back(getChar(e));
|
||||
}
|
||||
|
||||
// Add random suffix
|
||||
if (cProps.suffixRange.max) {
|
||||
addRandom(cProps.suffixRange, &s);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
static
|
||||
u32 classify_vertex(const NFAGraph &g, NFAVertex v) {
|
||||
const CharReach &cr = g[v].char_reach;
|
||||
if (cr.isSubsetOf(UTF_ASCII_CR)) {
|
||||
return 1;
|
||||
} else if (cr.isSubsetOf(UTF_TWO_START_CR)) {
|
||||
return 2;
|
||||
} else if (cr.isSubsetOf(UTF_THREE_START_CR)) {
|
||||
return 3;
|
||||
} else if (cr.isSubsetOf(UTF_FOUR_START_CR)) {
|
||||
return 4;
|
||||
}
|
||||
|
||||
/* this can happen due to dummy vertices from zwa */
|
||||
return 1;
|
||||
}
|
||||
|
||||
static
|
||||
void fillCodePointSet(const CharReach &cr, CodePointSet *out, u8 mask = 0xff) {
|
||||
for (u32 i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) {
|
||||
out->set(i & mask);
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void expandCodePointSet(const CharReach &cr, CodePointSet *out, u32 mask,
|
||||
u32 n) {
|
||||
CodePointSet base;
|
||||
base.swap(*out);
|
||||
for (u32 i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) {
|
||||
u32 val = (i & mask) << (n * UTF_CONT_SHIFT);
|
||||
for (const auto &cp : base) {
|
||||
unichar ll = lower(cp);
|
||||
unichar uu = upper(cp);
|
||||
out->setRange(val + ll, MIN(val + uu, MAX_UNICODE));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void decodePath(const NFAGraph &g, const VertexPath &in,
|
||||
vector<CodePointSet> &out) {
|
||||
VertexPath::const_iterator it = in.begin();
|
||||
while (it != in.end()) {
|
||||
if (is_special(*it, g)) {
|
||||
++it;
|
||||
continue;
|
||||
}
|
||||
|
||||
out.push_back(CodePointSet());
|
||||
CodePointSet &cps = out.back();
|
||||
|
||||
switch (classify_vertex(g, *it)) {
|
||||
case 1:
|
||||
fillCodePointSet(g[*it].char_reach, &cps);
|
||||
++it;
|
||||
break;
|
||||
case 2:
|
||||
fillCodePointSet(g[*(it + 1)].char_reach, &cps,
|
||||
UTF_CONT_BYTE_VALUE_MASK);
|
||||
expandCodePointSet(g[*it].char_reach, &cps,
|
||||
~UTF_TWO_BYTE_HEADER, 1);
|
||||
it += 2;
|
||||
break;
|
||||
case 3:
|
||||
fillCodePointSet(g[*(it + 2)].char_reach, &cps,
|
||||
UTF_CONT_BYTE_VALUE_MASK);
|
||||
expandCodePointSet(g[*(it + 1)].char_reach, &cps,
|
||||
UTF_CONT_BYTE_VALUE_MASK, 1);
|
||||
expandCodePointSet(g[*it].char_reach, &cps,
|
||||
~UTF_THREE_BYTE_HEADER, 2);
|
||||
it += 3;
|
||||
break;
|
||||
case 4:
|
||||
fillCodePointSet(g[*(it + 3)].char_reach, &cps,
|
||||
UTF_CONT_BYTE_VALUE_MASK);
|
||||
expandCodePointSet(g[*(it + 2)].char_reach, &cps,
|
||||
UTF_CONT_BYTE_VALUE_MASK, 1);
|
||||
expandCodePointSet(g[*(it + 1)].char_reach, &cps,
|
||||
UTF_CONT_BYTE_VALUE_MASK, 2);
|
||||
expandCodePointSet(g[*it].char_reach, &cps,
|
||||
~UTF_FOUR_BYTE_HEADER, 3);
|
||||
it += 4;
|
||||
break;
|
||||
default:;
|
||||
assert(0);
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void translatePaths(const NGHolder &graph,
|
||||
const vector<VertexPath> &allPathsTemp,
|
||||
vector<vector<CodePointSet>> *out) {
|
||||
assert(out);
|
||||
for (const auto &path : allPathsTemp) {
|
||||
out->push_back(vector<CodePointSet>());
|
||||
decodePath(graph.g, path, out->back());
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Build up to cProps.corpusLimit distinct code-point corpora and
 * append them to \a outdata in sorted order. */
void CorpusGeneratorUtf8::newGenerator(vector<vector<unichar>> &outdata) {
    const u32 maxCycles = cProps.getCycleLimit().second;
    DEBUG_PRINTF("generating up to %u corpora, cycle limit of %u\n",
                 cProps.corpusLimit, maxCycles);

    vector<vector<CodePointSet>> allPaths;

    if (!graph_is_empty(graph)) {
        // build a set of unique paths
        vector<VertexPath> allPathsTemp;
        findPaths(graph, cProps, allPathsTemp, maxCycles, cProps.corpusLimit);
        translatePaths(graph, allPathsTemp, &allPaths);
    } else {
        // Special case: if the graph has ONLY special vertices, then this is
        // likely to be an odd vacuous pattern or a pattern that can never
        // match. In these cases, an empty corpus is useful.
        allPaths.push_back(vector<CodePointSet>());
    }

    // transform paths into corpora: we do this repeatedly until we (a) hit our
    // limit, or (b) don't generate any new corpora for any of our paths.
    set<vector<unichar> > unique;
    bool progressed = true;
    while (progressed && unique.size() < cProps.corpusLimit) {
        const size_t before = unique.size();
        for (const auto &path : allPaths) {
            vector<unichar> vu = pathToCorpus(path);
            if (!unique.insert(vu).second) {
                continue; // duplicate
            }
            if (unique.size() == cProps.corpusLimit) {
                break; // limit reached; the outer condition ends the loop
            }
        }
        // A full pass without a new corpus means we are finding it hard to
        // generate more.
        progressed = unique.size() != before;
    }

    DEBUG_PRINTF("%zu corpora built\n", unique.size());

    // populate the output vector from the set we built.
    outdata.reserve(unique.size());
    outdata.insert(outdata.end(), unique.begin(), unique.end());
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Out-of-line definition of the interface's virtual destructor; it has
// nothing to release.
CorpusGenerator::~CorpusGenerator() = default;
|
||||
|
||||
// External entry point
|
||||
|
||||
unique_ptr<CorpusGenerator> makeCorpusGenerator(const NGWrapper &graph,
|
||||
CorpusProperties &props) {
|
||||
if (graph.utf8) {
|
||||
return ue2::make_unique<CorpusGeneratorUtf8>(graph, props);
|
||||
} else {
|
||||
return ue2::make_unique<CorpusGeneratorImpl>(graph, props);
|
||||
}
|
||||
}
|
67
util/ng_corpus_generator.h
Normal file
67
util/ng_corpus_generator.h
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Corpus Generation tool.
|
||||
*/
|
||||
|
||||
#ifndef NG_CORPUS_GENERATOR_H_
|
||||
#define NG_CORPUS_GENERATOR_H_
|
||||
|
||||
#include "ng_corpus_properties.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class NGWrapper;
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
/** \brief Abstract interface to corpus generator tool. */
|
||||
class CorpusGenerator {
|
||||
public:
|
||||
virtual ~CorpusGenerator();
|
||||
|
||||
/** \brief Build some corpora.
|
||||
*
|
||||
* Generate a set of corpora, placed in the \a data vector, for the current
|
||||
* NFAGraph according to the parameters provided by the CorpusProperties
|
||||
* object. Nothing is returned; the corpora are delivered through \a data.
|
||||
*/
|
||||
virtual void generateCorpus(std::vector<std::string> &data) = 0;
|
||||
};
|
||||
|
||||
/** \brief Build a concrete impl conforming to the \ref CorpusGenerator
|
||||
* interface. */
|
||||
std::unique_ptr<CorpusGenerator>
|
||||
makeCorpusGenerator(const ue2::NGWrapper &graph, CorpusProperties &props);
|
||||
|
||||
#endif
|
99
util/ng_corpus_properties.cpp
Normal file
99
util/ng_corpus_properties.cpp
Normal file
@@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief State for corpus generator.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "ng_corpus_properties.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
#include <boost/random/uniform_int_distribution.hpp>
|
||||
|
||||
// default constructor
|
||||
CorpusProperties::CorpusProperties()
|
||||
: matchness(100), unmatchness(0), randomness(0), prefixRange(0, 0),
|
||||
suffixRange(0, 0), cycleMin(1), cycleMax(1),
|
||||
corpusLimit(DEFAULT_CORPUS_GENERATOR_LIMIT), editDistance(0),
|
||||
alphabetSize(~0) {
|
||||
// empty
|
||||
}
|
||||
|
||||
bool CorpusProperties::setPercentages(unsigned int match, unsigned int unmatch,
|
||||
unsigned int random) {
|
||||
if (match + unmatch + random != 100) {
|
||||
// Do not update probabilities
|
||||
return false;
|
||||
}
|
||||
matchness = match;
|
||||
unmatchness = unmatch;
|
||||
randomness = random;
|
||||
return true;
|
||||
}
|
||||
|
||||
/** \brief Re-seed the random number generator with \a val and remember the
 * value so that getSeed() can report it. */
void CorpusProperties::seed(unsigned val) {
    randomGen.seed(val);
    rngSeed = val;
}
|
||||
|
||||
unsigned CorpusProperties::getSeed() const {
|
||||
return rngSeed;
|
||||
}
|
||||
|
||||
unsigned CorpusProperties::rand(unsigned n, unsigned m) {
|
||||
boost::random::uniform_int_distribution<> dist(n, m);
|
||||
return dist(randomGen);
|
||||
}
|
||||
|
||||
// not const because it stores state for the random number generator
|
||||
CorpusProperties::RollResult CorpusProperties::throwDice() {
|
||||
if (matchness == 100) {
|
||||
return ROLLED_MATCH;
|
||||
}
|
||||
if (unmatchness == 100) {
|
||||
return ROLLED_UNMATCH;
|
||||
}
|
||||
if (randomness == 100) {
|
||||
return ROLLED_RANDOM;
|
||||
}
|
||||
|
||||
// This assumes a uniform distribution. Perhaps factor some 'depth' param
|
||||
// and whether this 'depth' should increase or decrease the likelihood of
|
||||
// unmatch or random rolls.
|
||||
unsigned int outcome = rand(0, 99);
|
||||
if (outcome < matchness) {
|
||||
return ROLLED_MATCH;
|
||||
}
|
||||
if (outcome < matchness + unmatchness) {
|
||||
return ROLLED_UNMATCH;
|
||||
}
|
||||
|
||||
return ROLLED_RANDOM;
|
||||
}
|
131
util/ng_corpus_properties.h
Normal file
131
util/ng_corpus_properties.h
Normal file
@@ -0,0 +1,131 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief State for corpus generator.
|
||||
*/
|
||||
|
||||
#ifndef NG_CORPUS_PROPERTIES_H
|
||||
#define NG_CORPUS_PROPERTIES_H
|
||||
|
||||
#include <utility> // for std::pair
|
||||
#include <boost/random/mersenne_twister.hpp>
|
||||
|
||||
#include "ue2common.h"
|
||||
|
||||
#define DEFAULT_CORPUS_GENERATOR_LIMIT 500000
|
||||
|
||||
struct min_max {
|
||||
min_max(u32 min_in, u32 max_in) : min(min_in), max(max_in) {
|
||||
assert(min <= max);
|
||||
}
|
||||
u32 min;
|
||||
u32 max;
|
||||
};
|
||||
|
||||
class CorpusProperties {
|
||||
public:
|
||||
/**
|
||||
* Default constructor with default properties:
|
||||
* - generate match char with 100% probability
|
||||
* - generate unmatch char with 0% probability
|
||||
* - generate random char with 0% probability
|
||||
* - follow cycles once
|
||||
* - do not expand character classes (including case classes)
|
||||
* - generate data for all possible paths through graph
|
||||
* - pick random characters from the full ASCII alphabet
|
||||
*/
|
||||
CorpusProperties();
|
||||
|
||||
/**
|
||||
* Set probabilities (as percentages). Returns true if sum == 100,
|
||||
* else returns false and no changes are made to current probabilities.
|
||||
*/
|
||||
bool setPercentages(unsigned int match, unsigned int unmatch,
|
||||
unsigned int random);
|
||||
|
||||
unsigned percentMatch() const { return matchness; }
|
||||
unsigned percentUnmatch() const { return unmatchness; }
|
||||
unsigned percentRandom() const { return randomness; }
|
||||
|
||||
// The number of times a cycle is followed
|
||||
void setCycleLimit(unsigned int min, unsigned int max) {
|
||||
cycleMin = min;
|
||||
cycleMax = max;
|
||||
}
|
||||
std::pair<unsigned int, unsigned int> getCycleLimit() const {
|
||||
return std::make_pair(cycleMin, cycleMax);
|
||||
}
|
||||
|
||||
// Roll for initiative
|
||||
enum RollResult {
|
||||
ROLLED_MATCH,
|
||||
ROLLED_UNMATCH,
|
||||
ROLLED_RANDOM,
|
||||
};
|
||||
RollResult throwDice();
|
||||
|
||||
/** \brief Set the PRNG seed. */
|
||||
void seed(unsigned val);
|
||||
unsigned int getSeed() const;
|
||||
|
||||
/** \brief Retrieve a value from the PRNG in the closed range [n, m]. */
|
||||
unsigned rand(unsigned n, unsigned m);
|
||||
|
||||
private:
|
||||
// Percentages
|
||||
unsigned int matchness;
|
||||
unsigned int unmatchness;
|
||||
unsigned int randomness;
|
||||
|
||||
public:
|
||||
// Extra data
|
||||
min_max prefixRange;
|
||||
min_max suffixRange;
|
||||
|
||||
private:
|
||||
// Behaviours
|
||||
unsigned int cycleMin;
|
||||
unsigned int cycleMax;
|
||||
|
||||
public:
|
||||
// FIXME: Limit the number of corpus files generated to the first 'limit'
|
||||
// number of paths - note that this means the corpus will not be a complete
|
||||
// representation of the pattern.
|
||||
unsigned int corpusLimit;
|
||||
|
||||
unsigned int editDistance;
|
||||
unsigned int alphabetSize;
|
||||
|
||||
private:
|
||||
// PRNG.
|
||||
boost::random::mt19937 randomGen;
|
||||
unsigned int rngSeed;
|
||||
};
|
||||
|
||||
#endif
|
334
util/ng_find_matches.cpp
Normal file
334
util/ng_find_matches.cpp
Normal file
@@ -0,0 +1,334 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
 * \brief Pattern matching based on direct NFA execution.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "ng_find_matches.h"
|
||||
|
||||
#include "nfagraph/ng_graph.h"
|
||||
#include "nfagraph/ng_util.h"
|
||||
#include "parser/position.h"
|
||||
#include "util/container.h"
|
||||
#include "util/compare.h"
|
||||
#include "util/report.h"
|
||||
#include "util/report_manager.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
using namespace std;
|
||||
using namespace ue2;
|
||||
|
||||
// convenience typedefs
|
||||
typedef map<NFAVertex,size_t> SOMMap;
|
||||
typedef set<pair<size_t, size_t> > MatchSet;
|
||||
|
||||
struct fmstate {
|
||||
SOMMap states;
|
||||
SOMMap next;
|
||||
size_t offset;
|
||||
unsigned char cur;
|
||||
unsigned char prev;
|
||||
const bool som;
|
||||
const bool utf8;
|
||||
const bool allowStartDs;
|
||||
const ReportManager &rm;
|
||||
|
||||
fmstate(const bool som_in, const bool utf8_in, const bool aSD_in,
|
||||
const ReportManager &rm_in)
|
||||
: offset(0), cur(0), prev(0), som(som_in), utf8(utf8_in),
|
||||
allowStartDs(aSD_in), rm(rm_in) {}
|
||||
};
|
||||
|
||||
static
|
||||
void initStates(const NGHolder &g, struct fmstate &state) {
|
||||
state.states.insert(make_pair(g.start, 0));
|
||||
if (state.allowStartDs) {
|
||||
state.states.insert(make_pair(g.startDs, 0));
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
bool isWordChar(const unsigned char c) {
|
||||
// check if it's an alpha character
|
||||
if (ourisalpha(c)) {
|
||||
return true;
|
||||
}
|
||||
// check if it's a digit
|
||||
if (c >= '0' && c <= '9') {
|
||||
return true;
|
||||
}
|
||||
// check if it's an underscore
|
||||
if (c == '_') {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * \brief True if \a c can begin a UTF-8 code point.
 *
 * Accepts single-byte characters (0xxxxxxx) and the lead bytes of 2-, 3- and
 * 4-byte sequences (110xxxxx, 1110xxxx, 11110xxx). Continuation bytes
 * (10xxxxxx) and bytes 0xF8..0xFF are rejected.
 */
static
bool isUtf8CodePoint(const char c) {
    const unsigned char byte = static_cast<unsigned char>(c);

    // single-byte character
    if (byte < 0x80) {
        return true;
    }

    // 0xC0..0xDF start a 2-byte, 0xE0..0xEF a 3-byte and 0xF0..0xF7 a 4-byte
    // character; everything else at or above 0x80 is not a start byte
    return byte >= 0xC0 && byte <= 0xF7;
}
|
||||
|
||||
static
|
||||
bool canReach(const NGHolder &g, const NFAVertex &src, const NFAVertex &dst,
|
||||
struct fmstate &state) {
|
||||
// find relevant edge and see whether it has asserts
|
||||
NFAEdge e;
|
||||
bool exists;
|
||||
u32 flags;
|
||||
|
||||
tie(e, exists) = edge(src, dst, g);
|
||||
assert(exists);
|
||||
|
||||
flags = g[e].assert_flags;
|
||||
if (!flags) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (flags & POS_FLAG_ASSERT_WORD_TO_NONWORD) {
|
||||
if (isWordChar(state.prev) && !isWordChar(state.cur)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (flags & POS_FLAG_ASSERT_NONWORD_TO_WORD) {
|
||||
if (!isWordChar(state.prev) && isWordChar(state.cur)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (flags & POS_FLAG_ASSERT_WORD_TO_WORD) {
|
||||
if (isWordChar(state.prev) && isWordChar(state.cur)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD) {
|
||||
if (!isWordChar(state.prev) && !isWordChar(state.cur)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static
|
||||
void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state,
|
||||
bool allowEodMatches) {
|
||||
SOMMap::const_iterator it, ite;
|
||||
|
||||
for (it = state.states.begin(), ite = state.states.end(); it != ite; ++it) {
|
||||
NFAGraph::adjacency_iterator ai, ae;
|
||||
|
||||
// we can't accept anything from startDs inbetween UTF-8 codepoints
|
||||
if (state.utf8 && it->first == g.startDs && !isUtf8CodePoint(state.cur)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (tie(ai, ae) = adjacent_vertices(it->first, g); ai != ae; ++ai) {
|
||||
if (*ai == g.accept || (*ai == g.acceptEod && allowEodMatches)) {
|
||||
// check edge assertions if we are allowed to reach accept
|
||||
if (!canReach(g, it->first, *ai, state)) {
|
||||
continue;
|
||||
}
|
||||
DEBUG_PRINTF("match found at %zu\n", state.offset);
|
||||
|
||||
assert(!g[it->first].reports.empty());
|
||||
for (const auto &report_id :
|
||||
g[it->first].reports) {
|
||||
const Report &ri = state.rm.getReport(report_id);
|
||||
|
||||
DEBUG_PRINTF("report %u has offset adjustment %d\n",
|
||||
report_id, ri.offsetAdjust);
|
||||
matches.insert(
|
||||
make_pair(it->second, state.offset + ri.offsetAdjust));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void step(const NGHolder &g, struct fmstate &state) {
|
||||
state.next.clear();
|
||||
SOMMap::iterator it, ite;
|
||||
|
||||
for (it = state.states.begin(), ite = state.states.end(); it != ite; ++it) {
|
||||
NFAGraph::adjacency_iterator ai, ae;
|
||||
|
||||
for (tie(ai, ae) = adjacent_vertices(it->first, g); ai != ae; ++ai) {
|
||||
if (*ai == g.acceptEod) {
|
||||
// can't know the future: we don't know if we're at EOD.
|
||||
continue;
|
||||
}
|
||||
if (*ai == g.accept) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!state.allowStartDs && *ai == g.startDs) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const CharReach &cr = g[*ai].char_reach;
|
||||
// check reachability and edge assertions
|
||||
if (cr.test(state.cur) && canReach(g, it->first, *ai, state)) {
|
||||
SOMMap::const_iterator ni;
|
||||
size_t next_som;
|
||||
|
||||
// if we aren't in SOM mode, just set every SOM to 0
|
||||
if (!state.som) {
|
||||
state.next[*ai] = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
// if this is first vertex since start, use current offset as SOM
|
||||
if (it->first == g.start || it->first == g.startDs ||
|
||||
is_virtual_start(it->first, g)) {
|
||||
next_som = state.offset;
|
||||
} else {
|
||||
// else, inherit SOM from predecessor
|
||||
next_som = it->second;
|
||||
}
|
||||
|
||||
// check if the vertex is already active
|
||||
ni = state.next.find(*ai);
|
||||
|
||||
// if this vertex is not yet active, use current SOM
|
||||
if (ni == state.next.end()) {
|
||||
state.next[*ai] = next_som;
|
||||
} else {
|
||||
// else, work out leftmost SOM
|
||||
state.next[*ai] = min(next_som, ni->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// filter extraneous matches
|
||||
static void filterMatches(MatchSet &matches) {
|
||||
set<size_t> eom;
|
||||
MatchSet::iterator msit;
|
||||
|
||||
// first, collect all end-offset matches
|
||||
for (msit = matches.begin(); msit != matches.end(); ++msit) {
|
||||
eom.insert(msit->second);
|
||||
}
|
||||
|
||||
// now, go through all the end-offsets and filter extra matches
|
||||
set<size_t>::const_iterator eomit;
|
||||
for (eomit = eom.begin(); eomit != eom.end(); ++eomit) {
|
||||
|
||||
// find minimum SOM for this EOM
|
||||
size_t min_som = -1U;
|
||||
for (msit = matches.begin(); msit != matches.end(); ++msit) {
|
||||
// skip entries with wrong EOM
|
||||
if (msit->second != *eomit) {
|
||||
continue;
|
||||
}
|
||||
|
||||
min_som = min(min_som, msit->first);
|
||||
}
|
||||
|
||||
msit = matches.begin();
|
||||
while (msit != matches.end()) {
|
||||
// skip everything that doesn't match
|
||||
if (msit->second != *eomit || msit->first <= min_som) {
|
||||
++msit;
|
||||
continue;
|
||||
}
|
||||
DEBUG_PRINTF("erasing match %zu, %zu\n", msit->first, msit->second);
|
||||
matches.erase(msit++);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Find all matches for a given graph when executed against \a input.
|
||||
*
|
||||
* Fills \a matches with offsets into the data stream where a match is found.
|
||||
*/
|
||||
void findMatches(const NGHolder &g, const ReportManager &rm,
|
||||
const string &input, MatchSet &matches, const bool notEod,
|
||||
const bool som, const bool utf8) {
|
||||
const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0);
|
||||
|
||||
struct fmstate state(som, utf8, allowStartDs, rm);
|
||||
|
||||
initStates(g, state);
|
||||
|
||||
string::const_iterator it, ite;
|
||||
for (it = input.begin(), ite = input.end(); it != ite; ++it) {
|
||||
state.offset = distance(input.begin(), it);
|
||||
state.cur = *it;
|
||||
|
||||
step(g, state);
|
||||
|
||||
getMatches(g, matches, state, false);
|
||||
|
||||
DEBUG_PRINTF("index %zu, %zu states on\n", state.offset, state.next.size());
|
||||
if (state.next.empty()) {
|
||||
if (state.som) {
|
||||
filterMatches(matches);
|
||||
}
|
||||
return;
|
||||
}
|
||||
state.states.swap(state.next);
|
||||
state.prev = state.cur;
|
||||
}
|
||||
state.offset = input.size();
|
||||
state.cur = 0;
|
||||
|
||||
// do additional step to get matches after stream end, this time count eod
|
||||
// matches also (or not, if we're in notEod mode)
|
||||
|
||||
getMatches(g, matches, state, !notEod);
|
||||
|
||||
if (state.som) {
|
||||
filterMatches(matches);
|
||||
}
|
||||
}
|
56
util/ng_find_matches.h
Normal file
56
util/ng_find_matches.h
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Pattern matching based on direct NFA execution.
|
||||
*/
|
||||
|
||||
#ifndef NG_FIND_MATCHES_H
|
||||
#define NG_FIND_MATCHES_H
|
||||
|
||||
#include <string>
|
||||
#include <set>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class NGHolder;
|
||||
class ReportManager;
|
||||
struct BoundaryReports;
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
/** \brief Find all matches for a given graph when executed against \a input.
|
||||
*
|
||||
* Fills \a matches with offsets into the data stream where a match is found.
|
||||
*/
|
||||
void findMatches(const ue2::NGHolder &g, const ue2::ReportManager &rm,
|
||||
const std::string &input,
|
||||
std::set<std::pair<size_t, size_t>> &matches,
|
||||
const bool notEod, const bool som, const bool utf8);
|
||||
|
||||
#endif // NG_FIND_MATCHES_H
|
130
util/string_util.h
Normal file
130
util/string_util.h
Normal file
@@ -0,0 +1,130 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef STRING_UTIL_H
|
||||
#define STRING_UTIL_H
|
||||
|
||||
#include "ue2common.h"
|
||||
#include <iomanip>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
|
||||
//
|
||||
// Utility functions
|
||||
//
|
||||
|
||||
// Parse the whole of a string into another type, anything supported by
// stringstream extraction. Returns false if extraction fails or if any
// character (including whitespace) is left over after the value.
template<typename T>
inline bool fromString(const std::string &s, T& val)
{
    std::istringstream stream(s);
    if (!(stream >> val)) {
        return false;
    }

    // a successful read of one more character means trailing junk
    char leftover;
    return !stream.get(leftover);
}
|
||||
|
||||
// Read a comma-separated list of values and append them to 'out': very
// simple impl, not for external consumption. Parsing stops quietly at the
// first value that fails to extract or at the first separator that is not a
// comma. Returns true if 'out' is non-empty afterwards.
template<typename T>
inline bool strToList(const std::string &s, std::vector<T>& out)
{
    std::istringstream stream(s);

    for (;;) {
        T item;
        if (!(stream >> item)) {
            break;
        }
        out.push_back(item);

        // carry on only if the very next character is a comma
        char sep;
        if (!stream.get(sep) || sep != ',') {
            break;
        }
    }

    return !out.empty();
}
|
||||
|
||||
// return a nicely escaped version of a string: this should probably become
|
||||
// an IO manipulator or something
|
||||
UNUSED static
|
||||
const std::string printable(const std::string &in) {
|
||||
std::ostringstream oss;
|
||||
for (size_t i = 0; i < in.size(); ++i) {
|
||||
unsigned char c = in[i];
|
||||
if (c == '\"') {
|
||||
oss << "\\\"";
|
||||
} else if (c == '\n') {
|
||||
oss << "\\n";
|
||||
} else if (c == '\t') {
|
||||
oss << "\\t";
|
||||
} else if (c == '\r') {
|
||||
oss << "\\r";
|
||||
} else if (0x20 <= c && c <= 0x7e && c != '\\') {
|
||||
oss << c;
|
||||
} else {
|
||||
oss << "\\x"
|
||||
<< std::hex << std::setw(2) << std::setfill('0')
|
||||
<< (unsigned)(in[i] & 0xff)
|
||||
<< std::dec;
|
||||
}
|
||||
}
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
// Print the sequence [begin, end) to 'out', collapsing each run of
// consecutive values (each one greater than its predecessor) into
// "first-last" and separating the pieces with ", ". For example,
// 1 2 3 5 7 8 is printed as "1-3, 5, 7-8".
template<typename it_t>
void prettyPrintRange(std::ostream &out, it_t begin, it_t end) {
    if (begin == end) {
        return; // nothing to print
    }

    it_t prev = begin;
    out << *prev;

    bool run_open = false; // true while extending a run of consecutive values
    it_t cur = begin;
    for (++cur; cur != end; prev = cur, ++cur) {
        if (*cur == *prev + 1) {
            run_open = true;
            continue;
        }

        // the run (if any) ended at the previous element
        if (run_open) {
            out << "-" << *prev;
            run_open = false;
        }
        out << ", " << *cur;
    }

    // close off a run that extends to the end of the sequence
    if (run_open) {
        out << "-" << *prev;
    }
}
|
||||
|
||||
#endif // STRING_UTIL_H
|
Reference in New Issue
Block a user