mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
528 lines
18 KiB
C++
528 lines
18 KiB
C++
/*
|
|
* Copyright (c) 2015, Intel Corporation
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of Intel Corporation nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/** \file
|
|
* \brief Glushkov construction.
|
|
*/
|
|
#include "buildstate.h"
|
|
#include "position.h"
|
|
#include "position_dump.h"
|
|
#include "position_info.h"
|
|
#include "parse_error.h"
|
|
#include "hs_internal.h"
|
|
#include "ue2common.h"
|
|
#include "nfagraph/ng_builder.h"
|
|
#include "util/charreach.h"
|
|
#include "util/container.h"
|
|
#include "util/make_unique.h"
|
|
#include "util/ue2_containers.h"
|
|
|
|
#include <algorithm>
|
|
#include <iterator>
|
|
#include <limits>
|
|
#include <map>
|
|
#include <utility>
|
|
|
|
#if defined(DEBUG) || defined(DUMP_SUPPORT)
|
|
#include <ostream>
|
|
#include <sstream>
|
|
#endif
|
|
|
|
using namespace std;
|
|
|
|
namespace ue2 {
|
|
|
|
/** \brief Represents an uninitialized state. */
|
|
const Position GlushkovBuildState::POS_UNINITIALIZED =
|
|
numeric_limits<Position>::max();
|
|
|
|
/** \brief Represents an epsilon transition in the firsts of a component. */
|
|
const Position GlushkovBuildState::POS_EPSILON =
|
|
numeric_limits<Position>::max() - 1;
|
|
|
|
GlushkovBuildState::~GlushkovBuildState() { }
|
|
|
|
namespace /* anonymous */ {
|
|
|
|
class CheckPositionFlags {
|
|
public:
|
|
explicit CheckPositionFlags(int fl) : flags(fl) {}
|
|
bool operator()(const PositionInfo &p) const {
|
|
return (p.flags & flags) == flags;
|
|
}
|
|
private:
|
|
int flags;
|
|
};
|
|
|
|
class CheckUnflaggedEpsilon {
|
|
public:
|
|
bool operator()(const PositionInfo &p) const {
|
|
return p.pos == GlushkovBuildState::POS_EPSILON && p.flags == 0;
|
|
}
|
|
};
|
|
|
|
/** \brief Concrete impl of the GlushkovBuildState interface. */
|
|
class GlushkovBuildStateImpl : public GlushkovBuildState {
|
|
public:
|
|
GlushkovBuildStateImpl(NFABuilder &b, bool prefilter);
|
|
|
|
/** \brief Returns a reference to the NFABuilder being used. */
|
|
NFABuilder &getBuilder() override { return builder; }
|
|
|
|
/** \brief Returns a const reference to the NFABuilder being used. */
|
|
const NFABuilder &getBuilder() const override { return builder; }
|
|
|
|
/** \brief Wire up the lasts of one component to the firsts of another. */
|
|
void connectRegions(const vector<PositionInfo> &lasts,
|
|
const vector<PositionInfo> &firsts) override;
|
|
|
|
/** \brief Wire the lasts of the main sequence to accepts. */
|
|
void connectAccepts(const vector<PositionInfo> &lasts) override;
|
|
|
|
/** \brief Wire up a single last to a list of firsts. */
|
|
void connectSuccessors(const PositionInfo &last,
|
|
vector<PositionInfo> firsts);
|
|
|
|
/** Wire up a pair of positions. */
|
|
void addSuccessor(Position from, Position to) override;
|
|
|
|
/** \brief Clone the vertex properties and edges of all vertices between
|
|
* two positions. */
|
|
void cloneFollowSet(Position from, Position to, unsigned offset) override;
|
|
|
|
/** \brief Build the prioritised list of edges out of our successor map. */
|
|
void buildEdges() override;
|
|
|
|
/** Construct an edge, called internally by \ref buildEdges. */
|
|
void buildEdge(Position from, const PositionInfo &to);
|
|
|
|
Position startState;
|
|
Position startDotstarState;
|
|
Position acceptState;
|
|
Position acceptEodState;
|
|
Position acceptNlEodState;
|
|
Position acceptNlState;
|
|
|
|
NFABuilder &builder; //!< \brief builder for the NFAGraph
|
|
|
|
bool doPrefilter; //!< \brief we're building a prefiltering pattern
|
|
|
|
/** \brief Map storing successors for each position. */
|
|
map<Position, flat_set<PositionInfo>> successors;
|
|
};
|
|
|
|
} // namespace
|
|
|
|
GlushkovBuildStateImpl::GlushkovBuildStateImpl(NFABuilder &b,
|
|
bool prefilter) :
|
|
startState(b.getStart()),
|
|
startDotstarState(b.getStartDotStar()),
|
|
acceptState(b.getAccept()),
|
|
acceptEodState(b.getAcceptEOD()),
|
|
acceptNlEodState(POS_UNINITIALIZED),
|
|
acceptNlState(POS_UNINITIALIZED),
|
|
builder(b),
|
|
doPrefilter(prefilter)
|
|
{
|
|
// Our special nodes need special relationships.
|
|
vector<PositionInfo> lasts, firsts;
|
|
|
|
// start->startDs and startDs self-loop.
|
|
lasts.push_back(startState);
|
|
lasts.push_back(startDotstarState);
|
|
firsts.push_back(startDotstarState);
|
|
connectRegions(lasts, firsts);
|
|
|
|
// accept to acceptEod edges already wired
|
|
|
|
// XXX: a small hack to support vacuous NFAs: give start and startDs an
|
|
// initial report ID.
|
|
builder.setNodeReportID(startState, 0);
|
|
builder.setNodeReportID(startDotstarState, 0);
|
|
}
|
|
|
|
static
|
|
void checkEmbeddedEndAnchor(const PositionInfo &from,
|
|
const vector<PositionInfo> &firsts) {
|
|
if (!(from.flags & POS_FLAG_ONLY_ENDS)) {
|
|
return;
|
|
}
|
|
|
|
for (const auto &first : firsts) {
|
|
if (first.pos != GlushkovBuildStateImpl::POS_EPSILON) {
|
|
/* can make it through the parse tree */
|
|
throw ParseError("Embedded end anchors not supported.");
|
|
}
|
|
}
|
|
}
|
|
|
|
// Wire up the lasts of one component to the firsts of another
|
|
void
|
|
GlushkovBuildStateImpl::connectRegions(const vector<PositionInfo> &lasts,
|
|
const vector<PositionInfo> &firsts) {
|
|
for (const auto &last : lasts) {
|
|
checkEmbeddedEndAnchor(last, firsts);
|
|
connectSuccessors(last, firsts);
|
|
}
|
|
}
|
|
|
|
static
|
|
void filterEdges(const GlushkovBuildStateImpl &bs, const PositionInfo &from,
|
|
vector<PositionInfo> &tolist) {
|
|
if (from.pos == bs.startDotstarState) {
|
|
// If we're connecting from start-dotstar, remove all caret flavoured
|
|
// positions.
|
|
CheckPositionFlags check(POS_FLAG_NOFLOAT);
|
|
tolist.erase(remove_if(tolist.begin(), tolist.end(), check),
|
|
tolist.end());
|
|
if (from.flags & POS_FLAG_NOFLOAT) {
|
|
tolist.clear();
|
|
}
|
|
} else if (from.pos == bs.startState) {
|
|
// If we're connecting from start, we should remove any epsilons that
|
|
// aren't caret flavoured.
|
|
CheckUnflaggedEpsilon check;
|
|
tolist.erase(remove_if(tolist.begin(), tolist.end(), check),
|
|
tolist.end());
|
|
CheckPositionFlags check2(POS_FLAG_MUST_FLOAT | POS_FLAG_NOFLOAT);
|
|
tolist.erase(remove_if(tolist.begin(), tolist.end(), check2),
|
|
tolist.end());
|
|
}
|
|
|
|
if (bs.builder.getAssertFlag(from.pos) & POS_FLAG_MULTILINE_START) {
|
|
// If we have a (mildly boneheaded) pattern like /^$/m, we're right up
|
|
// against the edge of what we can do without true assertion support.
|
|
// Here we have an evil hack to prevent us plugging the \n generated by
|
|
// the caret right into acceptEod (which is in the firsts of the
|
|
// dollar).
|
|
/* This is due to the 'interesting quirk' that multiline ^ does not
|
|
* not match a newline at the end of buffer. */
|
|
DEBUG_PRINTF("multiline start - no eod\n");
|
|
tolist.erase(remove(tolist.begin(), tolist.end(), bs.acceptEodState),
|
|
tolist.end());
|
|
}
|
|
}
|
|
|
|
static
|
|
Position makeNewlineAssertPos(GlushkovBuildState &bs) {
|
|
NFABuilder &builder = bs.getBuilder();
|
|
Position newline = builder.makePositions(1);
|
|
builder.addCharReach(newline, CharReach('\n'));
|
|
builder.setAssertFlag(newline, POS_FLAG_FIDDLE_ACCEPT);
|
|
builder.setNodeReportID(newline, -1);
|
|
return newline;
|
|
}
|
|
|
|
static
|
|
void generateAccepts(GlushkovBuildStateImpl &bs, const PositionInfo &from,
|
|
vector<PositionInfo> *tolist) {
|
|
NFABuilder &builder = bs.getBuilder();
|
|
u32 flags = from.flags;
|
|
|
|
bool require_eod = flags & POS_FLAG_WIRE_EOD;
|
|
bool require_nl_eod = flags & POS_FLAG_WIRE_NL_EOD
|
|
&& !(flags & POS_FLAG_NO_NL_EOD);
|
|
bool require_nl_accept = (flags & POS_FLAG_WIRE_NL_ACCEPT)
|
|
&& !(flags & POS_FLAG_NO_NL_ACCEPT);
|
|
|
|
bool require_accept = !(flags & POS_FLAG_ONLY_ENDS);
|
|
|
|
if (require_eod) {
|
|
tolist->push_back(bs.acceptEodState);
|
|
}
|
|
|
|
if (require_nl_accept) {
|
|
if (bs.acceptNlState == GlushkovBuildState::POS_UNINITIALIZED) {
|
|
Position newline = makeNewlineAssertPos(bs);
|
|
bs.addSuccessor(newline, builder.getAccept());
|
|
bs.acceptNlState = newline;
|
|
}
|
|
tolist->push_back(bs.acceptNlState);
|
|
}
|
|
|
|
if (require_nl_eod) {
|
|
if (bs.acceptNlEodState == GlushkovBuildState::POS_UNINITIALIZED) {
|
|
Position newline = makeNewlineAssertPos(bs);
|
|
bs.addSuccessor(newline, builder.getAcceptEOD());
|
|
bs.acceptNlEodState = newline;
|
|
}
|
|
tolist->push_back(bs.acceptNlEodState);
|
|
}
|
|
|
|
if (require_accept) {
|
|
tolist->push_back(bs.acceptState);
|
|
}
|
|
}
|
|
|
|
void GlushkovBuildStateImpl::connectAccepts(const vector<PositionInfo> &lasts) {
|
|
for (const auto &last : lasts) {
|
|
vector<PositionInfo> accepts;
|
|
generateAccepts(*this, last, &accepts);
|
|
connectSuccessors(last, accepts);
|
|
}
|
|
}
|
|
|
|
#if defined(DEBUG) || defined(DUMP_SUPPORT)
|
|
|
|
static UNUSED
|
|
string dumpCaptures(const PositionInfo &p) {
|
|
ostringstream oss;
|
|
|
|
if (p.flags & POS_FLAG_NOFLOAT) {
|
|
oss << "<nofloat>";
|
|
}
|
|
if (p.flags & POS_FLAG_MUST_FLOAT) {
|
|
oss << "<must_float>";
|
|
}
|
|
if (p.flags & POS_FLAG_FIDDLE_ACCEPT) {
|
|
oss << "<fiddle_accept>";
|
|
}
|
|
if (p.flags & POS_FLAG_ONLY_ENDS) {
|
|
oss << "<only_ends>";
|
|
}
|
|
if (p.flags & POS_FLAG_NO_NL_EOD) {
|
|
oss << "<no_nl_eod>";
|
|
}
|
|
if (p.flags & POS_FLAG_NO_NL_ACCEPT) {
|
|
oss << "<no_nl_acc>";
|
|
}
|
|
|
|
return oss.str();
|
|
}
|
|
|
|
#endif // DEBUG || DUMP_SUPPORT
|
|
|
|
void GlushkovBuildStateImpl::connectSuccessors(const PositionInfo &from,
|
|
vector<PositionInfo> tolist) {
|
|
/* note: tolist maybe modified for our own internal use -> not a reference */
|
|
assert(from.pos != POS_EPSILON);
|
|
assert(from.pos != POS_UNINITIALIZED);
|
|
assert(find(tolist.begin(), tolist.end(), POS_UNINITIALIZED)
|
|
== tolist.end());
|
|
|
|
DEBUG_PRINTF("FROM = %u%s TO = %s\n", from.pos, dumpCaptures(from).c_str(),
|
|
dumpPositions(tolist.begin(), tolist.end()).c_str());
|
|
|
|
/* prevent creation of edges with invalid assertions */
|
|
filterEdges(*this, from, tolist);
|
|
|
|
if (from.flags & POS_FLAG_FIDDLE_ACCEPT) {
|
|
auto accept = find(tolist.begin(), tolist.end(), acceptState);
|
|
if (accept != tolist.end()) {
|
|
DEBUG_PRINTF("accept through -1 offset-adjusting dot\n");
|
|
Position fakedot = builder.makePositions(1);
|
|
builder.addCharReach(fakedot, CharReach(0x00, 0xff));
|
|
builder.setNodeReportID(fakedot, -1);
|
|
addSuccessor(fakedot, acceptState);
|
|
*accept = fakedot;
|
|
} else {
|
|
// We might lead to accept via an assertion vertex, so we add the
|
|
// offset adj to this vertex itself. Used for cases like /^\B/m,
|
|
// which should match only at 0 for '\n'.
|
|
builder.setNodeReportID(from.pos, -1);
|
|
}
|
|
|
|
assert(find(tolist.begin(), tolist.end(), acceptState) == tolist.end());
|
|
}
|
|
|
|
auto &succ = successors[from.pos];
|
|
|
|
DEBUG_PRINTF("connect %u -> %s\n", from.pos,
|
|
dumpPositions(tolist.begin(), tolist.end()).c_str());
|
|
DEBUG_PRINTF("%u curr succ: %s\n", from.pos,
|
|
dumpPositions(begin(succ), end(succ)).c_str());
|
|
|
|
for (const auto &to : tolist) {
|
|
if (to.pos != POS_EPSILON) {
|
|
succ.insert(to);
|
|
}
|
|
}
|
|
|
|
DEBUG_PRINTF("%u succ: %s\n", from.pos,
|
|
dumpPositions(begin(succ), end(succ)).c_str());
|
|
}
|
|
|
|
void GlushkovBuildStateImpl::addSuccessor(Position from, Position to) {
|
|
DEBUG_PRINTF("connect %u -> %u\n", from, to);
|
|
assert(from != POS_EPSILON && from != POS_UNINITIALIZED);
|
|
assert(to != POS_EPSILON && to != POS_UNINITIALIZED);
|
|
|
|
auto &succ = successors[from];
|
|
succ.insert(to);
|
|
|
|
DEBUG_PRINTF("%u succ: %s\n", from,
|
|
dumpPositions(begin(succ), end(succ)).c_str());
|
|
}
|
|
|
|
void GlushkovBuildStateImpl::cloneFollowSet(Position first, Position last,
|
|
unsigned offset) {
|
|
assert(first <= last);
|
|
|
|
// Clone vertex properties (reachability, etc)
|
|
builder.cloneRegion(first, last, offset);
|
|
|
|
/* Clone the successors of all the positions between first and last
|
|
* inclusive, producing a new set of positions starting at (first +
|
|
* offset). */
|
|
for (Position i = first; i <= last; i++) {
|
|
// This should be a new position.
|
|
assert(successors[i + offset].empty());
|
|
|
|
for (const PositionInfo &to : successors[i]) {
|
|
if (to.pos >= first && to.pos <= last) {
|
|
PositionInfo clone(to);
|
|
clone.pos += offset;
|
|
DEBUG_PRINTF("clone: %u -> %u\n", i + offset, clone.pos);
|
|
successors[i + offset].insert(clone);
|
|
} else {
|
|
// There shouldn't be any stray edges leading out of this
|
|
// region!
|
|
assert(0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void GlushkovBuildStateImpl::buildEdge(Position from, const PositionInfo &to) {
|
|
// Guard against embedded anchors
|
|
if (to == startState) {
|
|
/* can make it through the parse tree */
|
|
throw ParseError("Embedded start anchors not supported.");
|
|
}
|
|
|
|
assert(to.pos != POS_UNINITIALIZED);
|
|
assert(to.pos != POS_EPSILON);
|
|
|
|
if (builder.hasEdge(from, to.pos)) {
|
|
return;
|
|
}
|
|
|
|
builder.addEdge(from, to.pos);
|
|
}
|
|
|
|
void GlushkovBuildStateImpl::buildEdges() {
|
|
// Create all the edges and track which vertices are asserts which need to
|
|
// be removed later.
|
|
for (const auto &m : successors) {
|
|
const Position from = m.first;
|
|
for (const auto &to : m.second) {
|
|
buildEdge(from, to);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Construct a usable GlushkovBuildState for the outside world.
|
|
unique_ptr<GlushkovBuildState> makeGlushkovBuildState(NFABuilder &b,
|
|
bool prefilter) {
|
|
return ue2::make_unique<GlushkovBuildStateImpl>(b, prefilter);
|
|
}
|
|
|
|
// free functions for utility use
|
|
|
|
/** \brief Eliminate lower-priority duplicate PositionInfo entries.
|
|
*
|
|
* Scans through a list of positions and retains only the highest priority
|
|
* version of a given (position, flags) entry. */
|
|
void cleanupPositions(vector<PositionInfo> &a) {
|
|
ue2::unordered_set<pair<Position, int>> seen; // track dupes
|
|
|
|
vector<PositionInfo> out;
|
|
out.reserve(a.size()); // output should be close to input in size.
|
|
|
|
for (const auto &p : a) {
|
|
if (seen.emplace(p.pos, p.flags).second) {
|
|
out.push_back(p); // first encounter
|
|
}
|
|
}
|
|
|
|
DEBUG_PRINTF("in %zu; out %zu\n", a.size(), out.size());
|
|
a.swap(out);
|
|
}
|
|
|
|
static
|
|
vector<PositionInfo>::iterator
|
|
replaceElemWithSequence(vector<PositionInfo> &dest,
|
|
vector<PositionInfo>::iterator &victim,
|
|
const vector<PositionInfo> &replacement) {
|
|
auto past = dest.erase(victim);
|
|
size_t d = distance(dest.begin(), past) + replacement.size();
|
|
dest.insert(past, replacement.begin(), replacement.end());
|
|
/* recalc past as iterator may have been invalidated */
|
|
return dest.begin() + d;
|
|
}
|
|
|
|
/** \brief Replace all epsilons with the given positions.
|
|
*
|
|
* Replace epsilons in a firsts list with another given firsts list. Note: the
|
|
* firsts lists must come from disjoint sets of components. If no epsilons are
|
|
* in the first firsts list the source is appended to the end.
|
|
*/
|
|
void replaceEpsilons(vector<PositionInfo> &target,
|
|
const vector<PositionInfo> &source) {
|
|
auto found =
|
|
find(target.begin(), target.end(), GlushkovBuildState::POS_EPSILON);
|
|
|
|
if (found == target.end()) {
|
|
// no epsilons to replace, push on to the end
|
|
target.insert(target.end(), source.begin(), source.end());
|
|
return;
|
|
}
|
|
|
|
while (found != target.end()) {
|
|
checkEmbeddedEndAnchor(*found, source);
|
|
|
|
// replace this epsilon with a copy of source with the same flags
|
|
vector<PositionInfo> newsource(source);
|
|
for (auto &pos : newsource) {
|
|
pos.flags |= found->flags;
|
|
}
|
|
|
|
found = replaceElemWithSequence(target, found, newsource);
|
|
// find the next epsilon
|
|
found = find(found, target.end(), GlushkovBuildState::POS_EPSILON);
|
|
}
|
|
|
|
cleanupPositions(target);
|
|
}
|
|
|
|
#ifdef DUMP_SUPPORT
|
|
|
|
void dump(ostream &os, const PositionInfo &p) {
|
|
if (p.pos == GlushkovBuildState::POS_EPSILON) {
|
|
os << "epsilon";
|
|
} else {
|
|
os << p.pos;
|
|
}
|
|
|
|
os << dumpCaptures(p);
|
|
}
|
|
|
|
#endif // DUMP_SUPPORT
|
|
|
|
} // namespace ue2
|