Add support for approximate matching in ue2collider

This commit is contained in:
Anatoly Burakov 2017-02-10 15:42:36 +00:00 committed by Matthew Barr
parent 2de6706df2
commit 4c2b7cc04f
4 changed files with 860 additions and 110 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -218,7 +218,7 @@ namespace {
/** \brief Concrete implementation */
class CorpusGeneratorImpl : public CorpusGenerator {
public:
CorpusGeneratorImpl(const NGHolder &graph_in, CorpusProperties &props);
CorpusGeneratorImpl(const NGWrapper &graph_in, CorpusProperties &props);
~CorpusGeneratorImpl() {}
void generateCorpus(vector<string> &data);
@ -244,10 +244,13 @@ private:
CorpusProperties &cProps;
};
/**
 * \brief Construct the corpus generator implementation.
 *
 * Initialises the graph and cProps members from the arguments. Note that
 * this constructor may write to \a props: see the comment in the body.
 *
 * \param graph_in Graph to generate corpora for; its edit_distance field is
 *                 read here.
 * \param props    Corpus generator properties; props.editDistance may be
 *                 assigned by this constructor.
 */
CorpusGeneratorImpl::CorpusGeneratorImpl(const NGHolder &graph_in,
CorpusGeneratorImpl::CorpusGeneratorImpl(const NGWrapper &graph_in,
CorpusProperties &props)
: graph(graph_in), cProps(props) {
// empty
// If this pattern is to be matched approximately (the graph has a non-zero
// edit_distance) and the properties do not already carry an edit distance
// (props.editDistance is zero), pick one via props.rand().
// NOTE(review): the range actually produced depends on
// CorpusProperties::rand, which is not visible here -- confirm whether the
// "+ 1" upper bound is inclusive and whether that is intended.
if (graph_in.edit_distance && !props.editDistance) {
props.editDistance = props.rand(0, graph_in.edit_distance + 1);
}
}
void CorpusGeneratorImpl::generateCorpus(vector<string> &data) {
@ -388,7 +391,7 @@ hit_limit:
/** \brief Concrete implementation for UTF-8 */
class CorpusGeneratorUtf8 : public CorpusGenerator {
public:
CorpusGeneratorUtf8(const NGHolder &graph_in, CorpusProperties &props);
CorpusGeneratorUtf8(const NGWrapper &graph_in, CorpusProperties &props);
~CorpusGeneratorUtf8() {}
void generateCorpus(vector<string> &data);
@ -407,17 +410,21 @@ private:
void addRandom(const min_max &mm, vector<unichar> *out);
/** \brief The NFA graph we operate over. */
const NGHolder &graph;
const NGWrapper &graph;
/** \brief Reference to our corpus generator properties object (stores some
* state) */
CorpusProperties &cProps;
};
/**
 * \brief Construct the UTF-8 corpus generator.
 *
 * Binds the graph and cProps reference members to the arguments, then
 * rejects graphs that request approximate matching.
 *
 * \param graph_in Graph to generate corpora for.
 * \param props    Corpus generator properties object.
 * \throws CorpusGenerationFailure if the graph's edit_distance is non-zero.
 */
CorpusGeneratorUtf8::CorpusGeneratorUtf8(const NGHolder &graph_in,
CorpusGeneratorUtf8::CorpusGeneratorUtf8(const NGWrapper &graph_in,
CorpusProperties &props)
: graph(graph_in), cProps(props) {
// empty
// UTF-8 corpus generation is not supported for approximate matching: a
// non-zero edit_distance on the graph is reported as a generation failure
// rather than silently producing unedited corpora.
if (graph.edit_distance) {
throw CorpusGenerationFailure("UTF-8 for edited patterns is not "
"supported.");
}
}
void CorpusGeneratorUtf8::generateCorpus(vector<string> &data) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -45,6 +45,12 @@ class NGWrapper;
} // namespace ue2
/**
 * \brief Failure object thrown when a corpus cannot be generated.
 *
 * Carries a human-readable description of what went wrong in \ref message.
 */
struct CorpusGenerationFailure {
    /**
     * \brief Build a failure carrying the given description.
     *
     * The parameter is a sink argument taken by non-const value: a const
     * parameter would make std::move() degrade to a copy, whereas this form
     * lets rvalue arguments be moved all the way into \ref message.
     *
     * \param s Description of the failure.
     */
    explicit CorpusGenerationFailure(std::string s) :
        message(std::move(s)) {}

    /** \brief Description of the failure. */
    std::string message;
};
/** \brief Abstract interface to corpus generator tool. */
class CorpusGenerator {
public:

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -51,6 +51,7 @@ struct BoundaryReports;
void findMatches(const ue2::NGHolder &g, const ue2::ReportManager &rm,
const std::string &input,
std::set<std::pair<size_t, size_t>> &matches,
const bool notEod, const bool som, const bool utf8);
const unsigned int max_edit_distance, const bool notEod,
const bool utf8);
#endif // NG_FIND_MATCHES_H