mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-10-01 20:17:42 +03:00
Initial commit of Hyperscan
This commit is contained in:
159
src/parser/AsciiComponentClass.cpp
Normal file
159
src/parser/AsciiComponentClass.cpp
Normal file
@@ -0,0 +1,159 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Character classes and their mnemonics.
|
||||
*/
|
||||
#include "AsciiComponentClass.h"
|
||||
#include "Utf8ComponentClass.h"
|
||||
#include "buildstate.h"
|
||||
#include "parse_error.h"
|
||||
#include "position.h"
|
||||
#include "position_info.h"
|
||||
#include "nfagraph/ng_builder.h"
|
||||
#include "util/charreach_util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/**
 * \brief Build an empty character class for the given parse mode.
 *
 * The graph position stays at POS_UNINITIALIZED until notePositions() assigns
 * a real one.
 */
AsciiComponentClass::AsciiComponentClass(const ParseMode &mode_in)
    : ComponentClass(mode_in),
      position(GlushkovBuildState::POS_UNINITIALIZED) {
    // This class must not be instantiated for UTF-8 mode patterns.
    assert(!mode.utf8);
}
|
||||
|
||||
/** \brief Heap-allocate a copy of this class; the caller owns the result. */
AsciiComponentClass *AsciiComponentClass::clone() const {
    AsciiComponentClass *duplicate = new AsciiComponentClass(*this);
    return duplicate;
}
|
||||
|
||||
bool AsciiComponentClass::class_empty(void) const {
|
||||
return cr.none() && cr_ucp.none();
|
||||
}
|
||||
|
||||
/**
 * \brief Close the pending candidate range, adding [range_start, to] to the
 * class.
 *
 * \param to last character of the range (inclusive).
 * \throw LocatedParseError if the range start is greater than \a to.
 */
void AsciiComponentClass::createRange(unichar to) {
    assert(range_start <= 0xff);
    const unsigned char lower = static_cast<u8>(range_start);

    // Reject reversed ranges such as [z-a].
    if (lower > to) {
        throw LocatedParseError("Range out of order in character class");
    }

    in_cand_range = false;
    cr.setRange(lower, to);
    range_start = INVALID_UNICODE;
}
|
||||
|
||||
/**
 * \brief Allocate the single graph position used by this class and attach
 * its character reach to it.
 *
 * Must be called after finalize().
 */
void AsciiComponentClass::notePositions(GlushkovBuildState &bs) {
    // The reach is only complete once the class has been finalized.
    assert(finalized);

    NFABuilder &nfa = bs.getBuilder();

    // A character class occupies exactly one position.
    position = nfa.makePositions(1);
    nfa.addCharReach(position, cr);
    nfa.setNodeReportID(position, 0 /* offset adj */);

    // Record the half-open span [position, position + 1).
    recordPosBounds(position, position + 1);
}
|
||||
|
||||
/** \brief Intentionally a no-op for character classes. */
void AsciiComponentClass::buildFollowSet(GlushkovBuildState &,
                                         const vector<PositionInfo> &) {
    // Nothing to do: follow-set construction is driven entirely by the
    // first()/last() sets of this component.
}
|
||||
|
||||
/**
 * \brief Add a predefined class to this character class.
 *
 * \param c        the predefined class to add.
 * \param negative if true, the complement of the predefined class is added.
 * \throw LocatedParseError if a range is currently open, since a predefined
 *        class cannot be a range endpoint.
 */
void AsciiComponentClass::add(PredefinedClass c, bool negative) {
    if (in_cand_range) { // a predefined class cannot terminate a range
        throw LocatedParseError("Invalid range in character class");
    }
    DEBUG_PRINTF("getting %u %s\n", (u32)c, negative ? "^" : "");

    // In UCP mode the requested class is first translated.
    const PredefinedClass effective =
        mode.ucp ? translateForUcpMode(c, mode) : c;

    CharReach reach = getPredefinedCharReach(effective, mode);
    if (negative) {
        reach.flip();
    }

    // UCP classes are accumulated separately from the plain reach.
    CharReach &target = isUcp(effective) ? cr_ucp : cr;
    target |= reach;

    range_start = INVALID_UNICODE;
    in_cand_range = false;
}
|
||||
|
||||
/**
 * \brief Add a single character to the class, or use it to close a pending
 * range.
 *
 * \param c character value; must not exceed 0xff.
 * \throw LocatedParseError if \a c is greater than 0xff.
 */
void AsciiComponentClass::add(unichar c) {
    DEBUG_PRINTF("adding \\x%02x\n", c);
    if (c > 0xff) { // does not fit in a single byte
        throw LocatedParseError("Hexadecimal value is greater than \\xFF");
    }

    if (!in_cand_range) {
        // Plain literal: add it and remember it as a possible range start.
        cr.set(c);
        range_start = c;
    } else {
        // A range is open, so this character is its upper bound.
        createRange(c);
    }
}
|
||||
|
||||
void AsciiComponentClass::finalize() {
|
||||
if (finalized) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Handle unclosed ranges, like '[a-]' and '[a-\Q\E]' -- in these cases the
|
||||
// dash is a literal dash.
|
||||
if (in_cand_range) {
|
||||
cr.set('-');
|
||||
in_cand_range = false;
|
||||
}
|
||||
|
||||
if (mode.caseless) {
|
||||
make_caseless(&cr);
|
||||
}
|
||||
|
||||
cr |= cr_ucp; /* characters from ucp props don't participate in caseless */
|
||||
|
||||
if (m_negate) {
|
||||
cr.flip();
|
||||
}
|
||||
|
||||
finalized = true;
|
||||
}
|
||||
|
||||
/** \brief The first set is the single position owned by this class. */
vector<PositionInfo> AsciiComponentClass::first(void) const {
    vector<PositionInfo> firsts;
    firsts.push_back(PositionInfo(position));
    return firsts;
}
|
||||
|
||||
/** \brief The last set is the single position owned by this class. */
vector<PositionInfo> AsciiComponentClass::last(void) const {
    vector<PositionInfo> lasts;
    lasts.push_back(PositionInfo(position));
    return lasts;
}
|
||||
|
||||
} // namespace ue2
|
91
src/parser/AsciiComponentClass.h
Normal file
91
src/parser/AsciiComponentClass.h
Normal file
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Character classes and their mnemonics.
|
||||
*/
|
||||
|
||||
#ifndef ASCIICOMPONENTCLASS_H
|
||||
#define ASCIICOMPONENTCLASS_H
|
||||
|
||||
#include "ComponentClass.h"
|
||||
#include "util/charreach.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class AsciiComponentClass : public ComponentClass {
|
||||
friend class ConstructLiteralVisitor;
|
||||
friend class DumpVisitor;
|
||||
friend class PrintVisitor;
|
||||
friend class CaselessVisitor;
|
||||
friend class SimplifyVisitor;
|
||||
friend class SimplifyCandidatesVisitor;
|
||||
public:
|
||||
explicit AsciiComponentClass(const ParseMode &mode_in);
|
||||
~AsciiComponentClass() override {}
|
||||
AsciiComponentClass *clone() const override;
|
||||
|
||||
Component *accept(ComponentVisitor &v) override {
|
||||
Component *c = v.visit(this);
|
||||
v.post(this);
|
||||
return c;
|
||||
}
|
||||
|
||||
void accept(ConstComponentVisitor &v) const override {
|
||||
v.pre(*this);
|
||||
v.during(*this);
|
||||
v.post(*this);
|
||||
}
|
||||
|
||||
bool class_empty(void) const override;
|
||||
void add(PredefinedClass c, bool negative) override;
|
||||
void add(unichar c) override;
|
||||
void finalize(void) override;
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &) override;
|
||||
std::vector<PositionInfo> first(void) const override;
|
||||
std::vector<PositionInfo> last(void) const override;
|
||||
|
||||
protected:
|
||||
void createRange(unichar to) override;
|
||||
|
||||
private:
|
||||
Position position;
|
||||
CharReach cr;
|
||||
CharReach cr_ucp;
|
||||
|
||||
// Private copy ctor. Use clone instead.
|
||||
AsciiComponentClass(const AsciiComponentClass &other)
|
||||
: ComponentClass(other), position(other.position), cr(other.cr),
|
||||
cr_ucp(other.cr_ucp) {}
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // ASCIICOMPONENTCLASS_H
|
75
src/parser/Component.cpp
Normal file
75
src/parser/Component.cpp
Normal file
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Base class for all components.
|
||||
*/
|
||||
|
||||
|
||||
#include "Component.h"
|
||||
|
||||
#include "buildstate.h"
|
||||
#include "position.h"
|
||||
#include "position_info.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Start with both position bounds marked as uninitialized. */
Component::Component()
    : pos_begin(GlushkovBuildState::POS_UNINITIALIZED),
      pos_end(GlushkovBuildState::POS_UNINITIALIZED) {
    // Real bounds are supplied later through recordPosBounds().
}
|
||||
|
||||
/** \brief Out-of-line virtual destructor; the base class owns nothing. */
Component::~Component() = default;
|
||||
|
||||
/** \brief Base-class default: a component reports itself as repeatable. */
bool Component::repeatable() const {
    const bool can_repeat = true;
    return can_repeat;
}
|
||||
|
||||
/**
 * \brief Store the position bounds for this component.
 *
 * \param b value stored in pos_begin.
 * \param e value stored in pos_end.
 */
void Component::recordPosBounds(u32 b, u32 e) {
    pos_end = e;
    pos_begin = b;
}
|
||||
|
||||
/** \brief Base-class default: there is nothing to optimise. */
void Component::optimise(bool) {
    // Deliberately empty.
}
|
||||
|
||||
/** \brief Base-class default: a component is not vacuous everywhere. */
bool Component::vacuous_everywhere(void) const {
    const bool vacuous = false;
    return vacuous;
}
|
||||
|
||||
/** \brief Base-class default: always returns false, whatever the argument. */
bool Component::checkEmbeddedStartAnchor(bool) const {
    const bool still_at_start = false;
    return still_at_start;
}
|
||||
|
||||
/** \brief Base-class default: always returns false, whatever the argument. */
bool Component::checkEmbeddedEndAnchor(bool) const {
    const bool still_at_end = false;
    return still_at_end;
}
|
||||
|
||||
} // namespace ue2
|
145
src/parser/Component.h
Normal file
145
src/parser/Component.h
Normal file
@@ -0,0 +1,145 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Base class for all components.
|
||||
*/
|
||||
|
||||
#ifndef _RE_COMPONENT_H_
|
||||
#define _RE_COMPONENT_H_
|
||||
|
||||
#include "ComponentVisitor.h"
|
||||
#include "ConstComponentVisitor.h"
|
||||
|
||||
#include "position.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class GlushkovBuildState;
|
||||
class PositionInfo;
|
||||
|
||||
/** \brief Classification of the empty (epsilon) paths through a component. */
enum EmptyPathType {
    /** component must consume characters */
    NOT_EMPTY,
    /** eps path with no overhanging asserts */
    EPS_ONLY_PATHS,
    /** eps paths some with overhanging asserts */
    BOUNDARY_PATHS
};
|
||||
|
||||
/** \brief Base class for regular expression parse tree components. */
|
||||
class Component {
|
||||
friend class DumpVisitor;
|
||||
public:
|
||||
/** \brief Constructor. */
|
||||
Component();
|
||||
|
||||
/** \brief Destructor. */
|
||||
virtual ~Component();
|
||||
|
||||
/** \brief Returns a newly-allocated deep copy of this component. */
|
||||
virtual Component *clone() const = 0;
|
||||
|
||||
/** \brief Apply the given visitor functor. */
|
||||
virtual Component *accept(ComponentVisitor &v) = 0;
|
||||
|
||||
/** \brief Apply the given const visitor functor. */
|
||||
virtual void accept(ConstComponentVisitor &v) const = 0;
|
||||
|
||||
/** \brief Glushkov construction First() function.
|
||||
* \return set of initial positions in this component. */
|
||||
virtual std::vector<PositionInfo> first() const = 0;
|
||||
|
||||
/** \brief Glushkov construction Last() function.
|
||||
* \return set of final positions in this component. */
|
||||
virtual std::vector<PositionInfo> last() const = 0;
|
||||
|
||||
/** \brief Glushkov construction Empty() function.
|
||||
* \return true iff the component accepts epsilon.
|
||||
*
|
||||
* Note: ^, $, etc are considered empty. */
|
||||
virtual bool empty() const = 0;
|
||||
|
||||
/** \brief True iff epsilon can pass through the component.
|
||||
*
|
||||
* Note: ^, $, etc are not vacuous everywhere. */
|
||||
virtual bool vacuous_everywhere(void) const;
|
||||
|
||||
/** \brief True iff the component is repeatable on its own, without being
|
||||
* encapsulated in a sequence first.
|
||||
*
|
||||
* This is true for most components, but not for repeats, anchors and word
|
||||
* boundaries. */
|
||||
virtual bool repeatable() const;
|
||||
|
||||
/** \brief Optimisation pass on the component tree.
|
||||
*
|
||||
* Called before \ref notePositions. May modify to the component tree.
|
||||
* Assumes no start of match information is required.
|
||||
*/
|
||||
virtual void optimise(bool connected_to_sds);
|
||||
|
||||
/** \brief Informs the Glushkov build process of the positions used by this
|
||||
* component. */
|
||||
virtual void notePositions(GlushkovBuildState &bs) = 0;
|
||||
|
||||
/** \brief Glushkov construction Follow() function.
|
||||
*
|
||||
* Constructs (in \a bs) the set of positions in this component reachable
|
||||
* from the positions in \a lastPos.
|
||||
*
|
||||
* \throw ParseError on failure
|
||||
*/
|
||||
virtual void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos) = 0;
|
||||
|
||||
/** \brief Return value is used for chaining, throws if finds embedded
|
||||
* anchor. */
|
||||
virtual bool checkEmbeddedStartAnchor(bool at_start) const;
|
||||
|
||||
/* \brief Return value is used for chaining, throws if finds embedded
|
||||
* anchor. */
|
||||
virtual bool checkEmbeddedEndAnchor(bool at_end) const;
|
||||
|
||||
protected:
|
||||
/** \brief Called during \ref notePositions. */
|
||||
void recordPosBounds(u32 b, u32 e);
|
||||
|
||||
u32 pos_begin;
|
||||
u32 pos_end;
|
||||
|
||||
// Protected copy ctor. Use clone instead.
|
||||
Component(const Component &other)
|
||||
: pos_begin(other.pos_begin), pos_end(other.pos_end) {}
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
190
src/parser/ComponentAlternation.cpp
Normal file
190
src/parser/ComponentAlternation.cpp
Normal file
@@ -0,0 +1,190 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Alternations (foo|bar|baz).
|
||||
*/
|
||||
|
||||
|
||||
#include "ComponentAlternation.h"
|
||||
|
||||
#include "buildstate.h"
|
||||
#include "position.h"
|
||||
#include "position_info.h"
|
||||
#include "nfagraph/ng_builder.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Create an alternation with no branches. */
ComponentAlternation::ComponentAlternation() = default;
|
||||
|
||||
/** \brief Branches are held by unique_ptr, so no manual cleanup is needed. */
ComponentAlternation::~ComponentAlternation() = default;
|
||||
|
||||
/** \brief Copy constructor: clones every branch of \a other, in order. */
ComponentAlternation::ComponentAlternation(const ComponentAlternation &other)
    : Component(other) {
    for (auto it = other.children.begin(), end = other.children.end();
         it != end; ++it) {
        const unique_ptr<Component> &branch = *it;
        assert(branch);
        children.push_back(unique_ptr<Component>(branch->clone()));
    }
}
|
||||
|
||||
/** \brief Heap-allocate a copy of this alternation; the caller owns it. */
ComponentAlternation *ComponentAlternation::clone() const {
    ComponentAlternation *duplicate = new ComponentAlternation(*this);
    return duplicate;
}
|
||||
|
||||
/**
 * \brief Apply a mutating visitor to this alternation and then to each of
 * its branches.
 *
 * \return the visitor's replacement if it returned something other than this
 *         node (the branches are then not visited); otherwise this node.
 */
Component *ComponentAlternation::accept(ComponentVisitor &v) {
    Component *replacement = v.visit(this);
    if (replacement != this) {
        v.post(this);
        return replacement;
    }

    for (auto &slot : children) {
        Component *const before = slot.get();
        Component *const after = slot->accept(v);
        if (after != before) {
            // The branch was either replaced (new Component pointer) or
            // marked for deletion (null).
            slot.reset(after);
        }
    }

    // Drop the branches that were nulled out above.
    auto live_end = std::remove(children.begin(), children.end(), nullptr);
    children.erase(live_end, children.end());

    v.post(this);
    return this;
}
|
||||
|
||||
/**
 * \brief Apply a const visitor: pre() first, then each branch with during()
 * called between consecutive branches, then post().
 */
void ComponentAlternation::accept(ConstComponentVisitor &v) const {
    v.pre(*this);

    const size_t branch_count = children.size();
    for (size_t idx = 0; idx < branch_count; ++idx) {
        children[idx]->accept(v);
        // during() separates branches, so it is skipped after the last one.
        if (idx + 1 != branch_count) {
            v.during(*this);
        }
    }

    v.post(*this);
}
|
||||
|
||||
/**
 * \brief Add a branch to the end of the alternation.
 *
 * \param component the new branch; ownership passes to this alternation.
 */
void ComponentAlternation::append(unique_ptr<Component> component) {
    // Explicitly qualified: an unqualified move() only resolves through the
    // using-directive/ADL and is diagnosed by recent clang versions
    // (-Wunqualified-std-cast-call).
    children.push_back(std::move(component));
}
|
||||
|
||||
/**
 * \brief First set of the alternation: the first sets of all branches,
 * concatenated in branch order.
 */
vector<PositionInfo> ComponentAlternation::first() const {
    // Appending branch by branch, in position order, keeps the left-to-right
    // priority order.
    vector<PositionInfo> combined;

    for (const auto &branch : children) {
        const vector<PositionInfo> branch_firsts = branch->first();
        combined.insert(combined.end(), branch_firsts.begin(),
                        branch_firsts.end());
    }
    return combined;
}
|
||||
|
||||
/**
 * \brief Last set of the alternation: the last sets of all branches,
 * concatenated in branch order.
 */
vector<PositionInfo> ComponentAlternation::last() const {
    vector<PositionInfo> combined;

    for (const auto &branch : children) {
        const vector<PositionInfo> branch_lasts = branch->last();
        combined.insert(combined.end(), branch_lasts.begin(),
                        branch_lasts.end());
    }
    return combined;
}
|
||||
|
||||
bool ComponentAlternation::empty(void) const {
|
||||
// an alternation can be empty if any of its components are empty
|
||||
for (const auto &c : children) {
|
||||
if (c->empty()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * \brief Let every branch note its positions, then record the builder's
 * vertex counts from before and after as this component's bounds.
 */
void ComponentAlternation::notePositions(GlushkovBuildState &bs) {
    const u32 begin_pos = bs.getBuilder().numVertices();

    for (auto it = children.begin(), end = children.end(); it != end; ++it) {
        (*it)->notePositions(bs);
    }

    const u32 end_pos = bs.getBuilder().numVertices();
    recordPosBounds(begin_pos, end_pos);
}
|
||||
|
||||
/**
 * \brief Forward follow-set construction to every branch, each receiving the
 * same \a lastPos.
 */
void ComponentAlternation::buildFollowSet(GlushkovBuildState &bs,
                                          const vector<PositionInfo> &lastPos) {
    for (auto it = children.begin(), end = children.end(); it != end; ++it) {
        (*it)->buildFollowSet(bs, lastPos);
    }
}
|
||||
|
||||
bool ComponentAlternation::checkEmbeddedStartAnchor(bool at_start) const {
|
||||
bool rv = at_start;
|
||||
for (const auto &c : children) {
|
||||
rv &= c->checkEmbeddedStartAnchor(at_start);
|
||||
}
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
bool ComponentAlternation::checkEmbeddedEndAnchor(bool at_end) const {
|
||||
bool rv = at_end;
|
||||
for (const auto &c : children) {
|
||||
rv &= c->checkEmbeddedEndAnchor(at_end);
|
||||
}
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
bool ComponentAlternation::vacuous_everywhere(void) const {
|
||||
for (const auto &c : children) {
|
||||
if (c->vacuous_everywhere()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void ComponentAlternation::optimise(bool connected_to_sds) {
|
||||
for (auto &c : children) {
|
||||
c->optimise(connected_to_sds);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ue2
|
79
src/parser/ComponentAlternation.h
Normal file
79
src/parser/ComponentAlternation.h
Normal file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Alternations (foo|bar|baz).
|
||||
*/
|
||||
|
||||
#ifndef COMPONENT_ALTERNATION_H
|
||||
#define COMPONENT_ALTERNATION_H
|
||||
|
||||
#include "Component.h"
|
||||
#include "position.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class PositionInfo;
|
||||
|
||||
class ComponentAlternation : public Component {
|
||||
friend class DumpVisitor;
|
||||
friend class SimplifyVisitor;
|
||||
public:
|
||||
ComponentAlternation();
|
||||
~ComponentAlternation() override;
|
||||
ComponentAlternation *clone() const override;
|
||||
Component *accept(ComponentVisitor &v) override;
|
||||
void accept(ConstComponentVisitor &v) const override;
|
||||
|
||||
size_t numBranches() const { return children.size(); }
|
||||
|
||||
void append(std::unique_ptr<Component> component);
|
||||
|
||||
std::vector<PositionInfo> first() const override;
|
||||
std::vector<PositionInfo> last() const override;
|
||||
bool empty(void) const override;
|
||||
bool vacuous_everywhere() const override;
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos) override;
|
||||
bool checkEmbeddedStartAnchor(bool at_start) const override;
|
||||
bool checkEmbeddedEndAnchor(bool at_end) const override;
|
||||
|
||||
void optimise(bool connected_to_sds) override;
|
||||
|
||||
private:
|
||||
std::vector<std::unique_ptr<Component>> children;
|
||||
|
||||
ComponentAlternation(const ComponentAlternation &other);
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
121
src/parser/ComponentAssertion.cpp
Normal file
121
src/parser/ComponentAssertion.cpp
Normal file
@@ -0,0 +1,121 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Lookahead/lookbehind zero-width assertions.
|
||||
*/
|
||||
#include "ComponentAssertion.h"
|
||||
#include "buildstate.h"
|
||||
#include "position.h"
|
||||
#include "position_info.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/**
 * \brief Create an assertion with the given direction and sense.
 *
 * \param dir   stored in m_dir.
 * \param sense stored in m_sense.
 */
ComponentAssertion::ComponentAssertion(enum Direction dir, enum Sense sense)
    : m_dir(dir),
      m_sense(sense) {
}
|
||||
|
||||
/** \brief Nothing to release beyond what members and bases clean up. */
ComponentAssertion::~ComponentAssertion() = default;
|
||||
|
||||
/** \brief Heap-allocate a copy of this assertion; the caller owns it. */
ComponentAssertion *ComponentAssertion::clone() const {
    ComponentAssertion *duplicate = new ComponentAssertion(*this);
    return duplicate;
}
|
||||
|
||||
/**
 * \brief Apply a mutating visitor to this assertion and then to each of its
 * children.
 *
 * \return the visitor's replacement if it returned something other than this
 *         node (the children are then not visited); otherwise this node.
 */
Component *ComponentAssertion::accept(ComponentVisitor &v) {
    Component *replacement = v.visit(this);
    if (replacement != this) {
        v.post(this);
        return replacement;
    }

    for (auto &slot : children) {
        Component *const before = slot.get();
        Component *const after = slot->accept(v);
        if (after != before) {
            // The child was either replaced (new Component pointer) or
            // marked for deletion (null).
            slot.reset(after);
        }
    }

    // Drop the children that were nulled out above.
    auto live_end = std::remove(children.begin(), children.end(), nullptr);
    children.erase(live_end, children.end());

    v.post(this);
    return this;
}
|
||||
|
||||
/**
 * \brief Apply a const visitor: pre() first, then each child with during()
 * called between consecutive children, then post().
 */
void ComponentAssertion::accept(ConstComponentVisitor &v) const {
    v.pre(*this);

    const auto stop = children.end();
    for (auto it = children.begin(); it != stop; ++it) {
        (*it)->accept(v);
        // during() separates children, so it is skipped after the last one.
        const bool is_last = (it + 1 == stop);
        if (!is_last) {
            v.during(*this);
        }
    }

    v.post(*this);
}
|
||||
|
||||
/**
 * \brief Not expected to be called on an assertion: trips an assert, and
 * returns an empty set if assertions are compiled out.
 */
vector<PositionInfo> ComponentAssertion::first() const {
    assert(0);
    vector<PositionInfo> none;
    return none;
}
|
||||
|
||||
/**
 * \brief Not expected to be called on an assertion: trips an assert, and
 * returns an empty set if assertions are compiled out.
 */
vector<PositionInfo> ComponentAssertion::last() const {
    assert(0);
    vector<PositionInfo> none;
    return none;
}
|
||||
|
||||
/** \brief An assertion always reports itself as empty. */
bool ComponentAssertion::empty() const {
    const bool accepts_epsilon = true;
    return accepts_epsilon;
}
|
||||
|
||||
/**
 * \brief Not expected to be called on an assertion.
 *
 * The build state argument is ignored; the body only trips an assertion.
 */
void ComponentAssertion::notePositions(GlushkovBuildState & /* bs */) {
    assert(0);
}
|
||||
|
||||
/**
 * \brief Not expected to be called on an assertion.
 *
 * Both arguments are ignored; the body only trips an assertion.
 */
void ComponentAssertion::buildFollowSet(
    GlushkovBuildState & /* bs */,
    const vector<PositionInfo> & /* lastPos */) {
    assert(0);
}
|
||||
|
||||
bool ComponentAssertion::repeatable() const {
|
||||
// If this assertion has no children (it's an empty sequence, like that
|
||||
// produced by '(?!)') then PCRE would throw a "nothing to repeat" error.
|
||||
// So we do as well.
|
||||
return !children.empty();
|
||||
}
|
||||
|
||||
} // namespace ue2
|
76
src/parser/ComponentAssertion.h
Normal file
76
src/parser/ComponentAssertion.h
Normal file
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Lookahead/lookbehind zero-width assertions.
|
||||
*/
|
||||
|
||||
#ifndef _RE_COMPONENTASSERTION_H_
|
||||
#define _RE_COMPONENTASSERTION_H_
|
||||
|
||||
#include "ComponentSequence.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class ComponentAssertion : public ComponentSequence {
|
||||
friend class DumpVisitor;
|
||||
friend class PrintVisitor;
|
||||
public:
|
||||
enum Direction {
|
||||
LOOKAHEAD, //!< lookahead (forward) assertion
|
||||
LOOKBEHIND //!< lookbehind (backward) assertion
|
||||
};
|
||||
|
||||
enum Sense {
|
||||
POS, //!< positive assertion, (?=...) or (?<=...)
|
||||
NEG //!< negative assertion, (?!...) or (?<!...)
|
||||
};
|
||||
|
||||
ComponentAssertion(enum Direction dir, enum Sense sense);
|
||||
~ComponentAssertion() override;
|
||||
ComponentAssertion *clone() const override;
|
||||
Component *accept(ComponentVisitor &v) override;
|
||||
void accept(ConstComponentVisitor &v) const override;
|
||||
|
||||
std::vector<PositionInfo> first() const override;
|
||||
std::vector<PositionInfo> last() const override;
|
||||
|
||||
bool empty() const override;
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos) override;
|
||||
bool repeatable() const override;
|
||||
|
||||
private:
|
||||
enum Direction m_dir;
|
||||
enum Sense m_sense;
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
92
src/parser/ComponentAtomicGroup.cpp
Normal file
92
src/parser/ComponentAtomicGroup.cpp
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Atomic groups (?>...)
|
||||
*/
|
||||
#include "ComponentAtomicGroup.h"
|
||||
#include "buildstate.h"
|
||||
#include "position.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/**
 * \brief Copy this atomic group.
 *
 * \return a copy of this object, allocated with \c new.
 */
ComponentAtomicGroup *ComponentAtomicGroup::clone() const {
    ComponentAtomicGroup *const copy = new ComponentAtomicGroup(*this);
    return copy;
}
|
||||
|
||||
/**
 * \brief Mutating visitor entry point.
 *
 * The visitor is shown this node first. If it hands back anything other than
 * \c this, that pointer is returned (after the post hook has run) and the
 * children are not walked. Otherwise every child is visited in order: a child
 * whose accept() returns a different pointer is replaced by it, and a null
 * return causes the child to be removed from the list.
 *
 * \param v visitor to apply
 * \return \c this, or whatever the visitor returned in its place.
 */
Component *ComponentAtomicGroup::accept(ComponentVisitor &v) {
    Component *const replacement = v.visit(this);
    if (replacement != this) {
        v.post(this);
        return replacement;
    }

    for (auto &slot : children) {
        Component *const before = slot.get();
        Component *const after = slot->accept(v);
        if (after != before) {
            // Either a new Component takes this child's place, or null was
            // returned to ask for the child to be deleted.
            slot.reset(after);
        }
    }

    // Compact the list by dropping every slot that was nulled above.
    const auto new_end = remove(children.begin(), children.end(), nullptr);
    children.erase(new_end, children.end());

    v.post(this);
    return this;
}
|
||||
|
||||
/**
 * \brief Read-only visitor entry point.
 *
 * Calls pre() on this node, visits each child in order with a during() call
 * between consecutive children (none after the final child), then calls
 * post().
 *
 * \param v visitor to apply
 */
void ComponentAtomicGroup::accept(ConstComponentVisitor &v) const {
    v.pre(*this);

    auto it = children.begin();
    const auto last = children.end();
    while (it != last) {
        (*it)->accept(v);
        ++it;
        if (it != last) {
            // Another child follows.
            v.during(*this);
        }
    }

    v.post(*this);
}
|
||||
|
||||
/**
 * \brief Not expected to be called on an atomic group.
 *
 * The build state argument is ignored; the body only trips an assertion.
 */
void ComponentAtomicGroup::notePositions(GlushkovBuildState & /* bs */) {
    assert(0);
}
|
||||
|
||||
/**
 * \brief Not expected to be called on an atomic group.
 *
 * Both arguments are ignored; the body only trips an assertion.
 */
void ComponentAtomicGroup::buildFollowSet(
    GlushkovBuildState & /* bs */,
    const vector<PositionInfo> & /* lastPos */) {
    assert(0);
}
|
||||
|
||||
} // namespace
|
58
src/parser/ComponentAtomicGroup.h
Normal file
58
src/parser/ComponentAtomicGroup.h
Normal file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Atomic groups (?>...)
|
||||
*/
|
||||
|
||||
#ifndef _COMPONENTATOMICGROUP_H_
|
||||
#define _COMPONENTATOMICGROUP_H_
|
||||
|
||||
#include "ComponentSequence.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
// The atomic group component is a subclass of sequence that is only buildable
|
||||
// in prefilter mode, where we treat it as a standard sequence.
|
||||
class ComponentAtomicGroup : public ComponentSequence {
|
||||
friend class DumpVisitor;
|
||||
public:
|
||||
ComponentAtomicGroup() {}
|
||||
~ComponentAtomicGroup() override {}
|
||||
ComponentAtomicGroup *clone() const override;
|
||||
Component *accept(ComponentVisitor &v) override;
|
||||
void accept(ConstComponentVisitor &v) const override;
|
||||
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos) override;
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
79
src/parser/ComponentBackReference.cpp
Normal file
79
src/parser/ComponentBackReference.cpp
Normal file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Back-references (/([a-f]{3}).*\\1/)
|
||||
*/
|
||||
|
||||
|
||||
#include "ComponentBackReference.h"
|
||||
|
||||
#include "buildstate.h"
|
||||
#include "position.h"
|
||||
#include "position_info.h"
|
||||
#include "nfagraph/ng_builder.h"
|
||||
#include "util/charreach.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/**
 * \brief Construct a back-reference identified by number.
 *
 * The reference name is left empty.
 *
 * \param id numeric id of the reference
 */
ComponentBackReference::ComponentBackReference(unsigned int id)
    : name(),
      ref_id(id) {
}
|
||||
|
||||
/**
 * \brief Construct a back-reference identified by name.
 *
 * The numeric id is set to zero.
 *
 * \param s name of the reference
 */
ComponentBackReference::ComponentBackReference(const string &s)
    : name(s),
      ref_id(0) {
}
|
||||
|
||||
/**
 * \brief Copy this back-reference.
 *
 * \return a copy of this object, allocated with \c new.
 */
ComponentBackReference *ComponentBackReference::clone() const {
    ComponentBackReference *const copy = new ComponentBackReference(*this);
    return copy;
}
|
||||
|
||||
/**
 * \brief Not expected to be called on a back-reference.
 *
 * Trips an assertion when assertions are enabled; otherwise yields an empty
 * position list.
 */
vector<PositionInfo> ComponentBackReference::first() const {
    assert(0);
    vector<PositionInfo> none;
    return none;
}
|
||||
|
||||
/**
 * \brief Not expected to be called on a back-reference.
 *
 * Trips an assertion when assertions are enabled; otherwise yields an empty
 * position list.
 */
vector<PositionInfo> ComponentBackReference::last() const {
    assert(0);
    vector<PositionInfo> none;
    return none;
}
|
||||
|
||||
/** \brief Always reports \c true for a back-reference. */
bool ComponentBackReference::empty() const {
    return true;
}
|
||||
|
||||
/**
 * \brief Not expected to be called on a back-reference.
 *
 * The build state argument is ignored; the body only trips an assertion.
 */
void ComponentBackReference::notePositions(GlushkovBuildState & /* bs */) {
    assert(0);
}
|
||||
|
||||
/**
 * \brief Not expected to be called on a back-reference.
 *
 * Both arguments are ignored; the body only trips an assertion.
 */
void ComponentBackReference::buildFollowSet(
    GlushkovBuildState & /* bs */,
    const vector<PositionInfo> & /* lastPos */) {
    assert(0);
}
|
||||
|
||||
} // namespace
|
84
src/parser/ComponentBackReference.h
Normal file
84
src/parser/ComponentBackReference.h
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Back-references (/([a-f]{3}).*\\1/)
|
||||
*/
|
||||
|
||||
#ifndef _RE_COMPONENTBACKREFERENCE_H_
|
||||
#define _RE_COMPONENTBACKREFERENCE_H_
|
||||
|
||||
#include "Component.h"
|
||||
#include <string>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class ComponentBackReference : public Component {
|
||||
friend class DumpVisitor;
|
||||
friend class PrintVisitor;
|
||||
friend class ReferenceVisitor;
|
||||
public:
|
||||
explicit ComponentBackReference(unsigned int id);
|
||||
explicit ComponentBackReference(const std::string &s);
|
||||
~ComponentBackReference() override {}
|
||||
ComponentBackReference *clone() const override;
|
||||
|
||||
Component *accept(ComponentVisitor &v) override {
|
||||
Component *c = v.visit(this);
|
||||
v.post(this);
|
||||
return c;
|
||||
}
|
||||
|
||||
void accept(ConstComponentVisitor &v) const override {
|
||||
v.pre(*this);
|
||||
v.during(*this);
|
||||
v.post(*this);
|
||||
}
|
||||
|
||||
unsigned int getRefID() const { return ref_id; }
|
||||
const std::string &getRefName() const { return name; }
|
||||
|
||||
std::vector<PositionInfo> first() const override;
|
||||
std::vector<PositionInfo> last() const override;
|
||||
bool empty(void) const override;
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos) override;
|
||||
|
||||
private:
|
||||
// Private copy ctor. Use clone instead.
|
||||
ComponentBackReference(const ComponentBackReference &other)
|
||||
: Component(other), name(other.name), ref_id(other.ref_id) {}
|
||||
|
||||
std::string name;
|
||||
unsigned int ref_id;
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
186
src/parser/ComponentBoundary.cpp
Normal file
186
src/parser/ComponentBoundary.cpp
Normal file
@@ -0,0 +1,186 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Boundary assertions (^, $, \\A, \\Z, \\z)
|
||||
*/
|
||||
|
||||
|
||||
#include "ComponentBoundary.h"
|
||||
|
||||
#include "buildstate.h"
|
||||
#include "parse_error.h"
|
||||
#include "position.h"
|
||||
#include "position_info.h"
|
||||
#include "Parser.h"
|
||||
#include "util/charreach.h"
|
||||
#include "nfagraph/ng_builder.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/**
 * \brief Construct a boundary assertion of the given kind.
 *
 * The newline position starts out as POS_UNINITIALIZED.
 *
 * \param bound which boundary this component asserts
 */
ComponentBoundary::ComponentBoundary(Boundary bound)
    : m_bound(bound),
      m_newline(GlushkovBuildState::POS_UNINITIALIZED) {
}
|
||||
|
||||
/** \brief Destructor; performs no work of its own. */
ComponentBoundary::~ComponentBoundary() = default;
|
||||
|
||||
/**
 * \brief Copy constructor: copies the base Component, the boundary kind, the
 * newline position and the recorded first/last position lists.
 */
ComponentBoundary::ComponentBoundary(const ComponentBoundary &other)
    : Component(other),
      m_bound(other.m_bound),
      m_newline(other.m_newline),
      m_first(other.m_first),
      m_last(other.m_last) {
}
|
||||
|
||||
/**
 * \brief Copy this boundary.
 *
 * \return a copy of this object, allocated with \c new.
 */
ComponentBoundary *ComponentBoundary::clone() const {
    ComponentBoundary *const copy = new ComponentBoundary(*this);
    return copy;
}
|
||||
|
||||
/** \brief Return a copy of the first positions recorded by notePositions(). */
vector<PositionInfo> ComponentBoundary::first() const {
    vector<PositionInfo> firsts(m_first);
    return firsts;
}
|
||||
|
||||
/** \brief Return a copy of the last positions recorded by notePositions(). */
vector<PositionInfo> ComponentBoundary::last() const {
    vector<PositionInfo> lasts(m_last);
    return lasts;
}
|
||||
|
||||
/** \brief Always reports \c true for a boundary. */
bool ComponentBoundary::empty() const { return true; }
|
||||
|
||||
/** \brief A boundary is never reported as repeatable. */
bool ComponentBoundary::repeatable() const { return false; }
|
||||
|
||||
static
|
||||
Position makeNewline(GlushkovBuildState &bs) {
|
||||
NFABuilder &builder = bs.getBuilder();
|
||||
Position newline = builder.makePositions(1);
|
||||
builder.addCharReach(newline, CharReach('\n'));
|
||||
return newline;
|
||||
}
|
||||
|
||||
/**
 * \brief Record the first/last positions for this boundary.
 *
 * Every supported boundary contributes a flagged epsilon to the first set.
 * The two start anchors also add the builder's start position, and
 * BEGIN_LINE additionally creates a newline position that is added to both
 * the first and last sets. The end anchors differ only in the flags placed
 * on the epsilon. An unrecognised boundary kind trips an assertion.
 *
 * \param bs build state that supplies the NFA builder
 */
void ComponentBoundary::notePositions(GlushkovBuildState &bs) {
    NFABuilder &nfa = bs.getBuilder();
    const Position start = nfa.getStart();

    switch (m_bound) {
    case BEGIN_STRING: // beginning of data stream ('^')
    case BEGIN_LINE:   // multiline anchor: beginning of stream or a newline
    {
        PositionInfo eps(GlushkovBuildState::POS_EPSILON);
        eps.flags = POS_FLAG_NOFLOAT;
        m_first.push_back(eps);

        // The start vertex goes into the firsts so that we can discourage
        // the mid-pattern use of boundaries.
        m_first.push_back(start);

        if (m_bound == BEGIN_LINE) {
            // Newline
            m_newline = makeNewline(bs);
            nfa.setAssertFlag(m_newline, POS_FLAG_MULTILINE_START);
            nfa.setAssertFlag(m_newline, POS_FLAG_VIRTUAL_START);

            PositionInfo nl(m_newline);
            nl.flags = POS_FLAG_MUST_FLOAT | POS_FLAG_FIDDLE_ACCEPT;
            m_first.push_back(nl);
            m_last.push_back(nl);
            recordPosBounds(m_newline, m_newline + 1);
        }
        break;
    }
    case END_STRING:             // end of data stream ('\z')
    case END_STRING_OPTIONAL_LF: // end of data with optional LF ('$')
    case END_LINE:               // multiline anchor: end of data or a newline
    {
        PositionInfo eps(GlushkovBuildState::POS_EPSILON);
        if (m_bound == END_STRING) {
            eps.flags = POS_FLAG_WIRE_EOD | POS_FLAG_NO_NL_EOD |
                        POS_FLAG_NO_NL_ACCEPT | POS_FLAG_ONLY_ENDS;
        } else if (m_bound == END_STRING_OPTIONAL_LF) {
            eps.flags = POS_FLAG_WIRE_EOD | POS_FLAG_WIRE_NL_EOD |
                        POS_FLAG_NO_NL_ACCEPT | POS_FLAG_ONLY_ENDS;
        } else {
            eps.flags = POS_FLAG_WIRE_EOD | POS_FLAG_WIRE_NL_EOD |
                        POS_FLAG_WIRE_NL_ACCEPT | POS_FLAG_ONLY_ENDS;
        }
        m_first.push_back(eps);
        break;
    }
    default:
        // unsupported
        assert(0);
        break;
    }
}
|
||||
|
||||
/** \brief No-op: both arguments are ignored and nothing is done. */
void ComponentBoundary::buildFollowSet(
    GlushkovBuildState & /* bs */,
    const vector<PositionInfo> & /* lastPos */) {
    // Intentionally empty.
}
|
||||
|
||||
bool ComponentBoundary::checkEmbeddedStartAnchor(bool at_start) const {
|
||||
if (at_start) {
|
||||
return at_start;
|
||||
}
|
||||
|
||||
if (m_bound == BEGIN_STRING || m_bound == BEGIN_LINE) {
|
||||
throw ParseError("Embedded start anchors not supported.");
|
||||
}
|
||||
|
||||
return at_start;
|
||||
}
|
||||
|
||||
bool ComponentBoundary::checkEmbeddedEndAnchor(bool at_end) const {
|
||||
if (at_end) {
|
||||
return at_end;
|
||||
}
|
||||
|
||||
if (m_bound != BEGIN_STRING && m_bound != BEGIN_LINE) {
|
||||
throw ParseError("Embedded end anchors not supported.");
|
||||
}
|
||||
|
||||
return at_end;
|
||||
}
|
||||
|
||||
} // namespace
|
94
src/parser/ComponentBoundary.h
Normal file
94
src/parser/ComponentBoundary.h
Normal file
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Boundary assertions (^, $, \\A, \\Z, \\z)
|
||||
*/
|
||||
|
||||
#ifndef _RE_COMPONENTBOUNDARY_H_
|
||||
#define _RE_COMPONENTBOUNDARY_H_
|
||||
|
||||
#include "Component.h"
|
||||
#include "position.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Encapsulates a line/string boundary assertion. */
|
||||
class ComponentBoundary : public Component {
|
||||
friend class DumpVisitor;
|
||||
friend class PrintVisitor;
|
||||
friend class UnsafeBoundsVisitor;
|
||||
friend class MultilineVisitor;
|
||||
public:
|
||||
enum Boundary {
|
||||
BEGIN_STRING, //!< beginning of data stream
|
||||
END_STRING, //!< end of data stream
|
||||
END_STRING_OPTIONAL_LF, //!< end of data stream with an optional
|
||||
// linefeed
|
||||
BEGIN_LINE, //!< '(^|\\n)'
|
||||
END_LINE //!< '($|\\n)'
|
||||
};
|
||||
|
||||
explicit ComponentBoundary(enum Boundary bound);
|
||||
~ComponentBoundary() override;
|
||||
ComponentBoundary *clone() const override;
|
||||
|
||||
Component *accept(ComponentVisitor &v) override {
|
||||
Component *c = v.visit(this);
|
||||
v.post(this);
|
||||
return c;
|
||||
}
|
||||
|
||||
void accept(ConstComponentVisitor &v) const override {
|
||||
v.pre(*this);
|
||||
v.during(*this);
|
||||
v.post(*this);
|
||||
}
|
||||
|
||||
std::vector<PositionInfo> first() const override;
|
||||
std::vector<PositionInfo> last() const override;
|
||||
bool empty() const override;
|
||||
bool repeatable() const override;
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos) override;
|
||||
bool checkEmbeddedStartAnchor(bool at_start) const override;
|
||||
bool checkEmbeddedEndAnchor(bool at_end) const override;
|
||||
|
||||
private:
|
||||
enum Boundary m_bound; //!< \brief which assertion is that?
|
||||
Position m_newline; //!< \brief special newline state
|
||||
std::vector<PositionInfo> m_first; //!< \brief positions returned for first()
|
||||
std::vector<PositionInfo> m_last; //!< \brief positions returned for last()
|
||||
|
||||
ComponentBoundary(const ComponentBoundary &other);
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
70
src/parser/ComponentByte.cpp
Normal file
70
src/parser/ComponentByte.cpp
Normal file
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Single bytes (\\C metachar)
|
||||
*/
|
||||
|
||||
|
||||
#include "ComponentByte.h"
|
||||
|
||||
#include "buildstate.h"
|
||||
#include "position.h"
|
||||
#include "position_info.h"
|
||||
#include "nfagraph/ng_builder.h"
|
||||
#include "util/charreach.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Construct a byte component whose position is POS_UNINITIALIZED. */
ComponentByte::ComponentByte()
    : position(GlushkovBuildState::POS_UNINITIALIZED) {
}
|
||||
|
||||
/** \brief Destructor; performs no work of its own. */
ComponentByte::~ComponentByte() = default;
|
||||
|
||||
/**
 * \brief Copy this byte component.
 *
 * \return a copy of this object, allocated with \c new.
 */
ComponentByte *ComponentByte::clone() const {
    ComponentByte *const copy = new ComponentByte(*this);
    return copy;
}
|
||||
|
||||
/** \brief Return a one-element list holding this component's position. */
vector<PositionInfo> ComponentByte::first() const {
    vector<PositionInfo> firsts;
    firsts.push_back(PositionInfo(position));
    return firsts;
}
|
||||
|
||||
/** \brief Return a one-element list holding this component's position. */
vector<PositionInfo> ComponentByte::last() const {
    vector<PositionInfo> lasts;
    lasts.push_back(PositionInfo(position));
    return lasts;
}
|
||||
|
||||
/**
 * \brief Allocate this component's single position in the NFA builder.
 *
 * The new position is given the reach returned by CharReach::dot() and a
 * report ID is set on it with a zero offset adjustment.
 *
 * \param bs build state that supplies the NFA builder
 */
void ComponentByte::notePositions(GlushkovBuildState &bs) {
    NFABuilder &nfa = bs.getBuilder();

    position = nfa.makePositions(1);
    nfa.addCharReach(position, CharReach::dot());
    nfa.setNodeReportID(position, 0 /* offset adj */);
}
|
||||
|
||||
} // namespace ue2
|
80
src/parser/ComponentByte.h
Normal file
80
src/parser/ComponentByte.h
Normal file
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Single bytes (\\C metachar)
|
||||
*/
|
||||
|
||||
#ifndef _RE_COMPONENTBYTE_H_
|
||||
#define _RE_COMPONENTBYTE_H_
|
||||
|
||||
#include "Component.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class ComponentByte : public Component {
|
||||
friend class DumpVisitor;
|
||||
public:
|
||||
ComponentByte(void);
|
||||
~ComponentByte() override;
|
||||
ComponentByte *clone() const override;
|
||||
|
||||
Component *accept(ComponentVisitor &v) override {
|
||||
Component *c = v.visit(this);
|
||||
v.post(this);
|
||||
return c;
|
||||
}
|
||||
|
||||
void accept(ConstComponentVisitor &v) const override {
|
||||
v.pre(*this);
|
||||
v.during(*this);
|
||||
v.post(*this);
|
||||
}
|
||||
|
||||
std::vector<PositionInfo> first() const override;
|
||||
std::vector<PositionInfo> last() const override;
|
||||
|
||||
bool empty() const override { return false; }
|
||||
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &,
|
||||
const std::vector<PositionInfo> &) override {
|
||||
// all follow set construction is handled by firsts/lasts
|
||||
return;
|
||||
}
|
||||
|
||||
private:
|
||||
Position position;
|
||||
|
||||
ComponentByte(const ComponentByte &other)
|
||||
: Component(other), position(other.position) {}
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
448
src/parser/ComponentClass.cpp
Normal file
448
src/parser/ComponentClass.cpp
Normal file
@@ -0,0 +1,448 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Character classes and their mnemonics.
|
||||
*/
|
||||
#include "Parser.h"
|
||||
#include "ComponentClass.h"
|
||||
#include "AsciiComponentClass.h"
|
||||
#include "ucp_table.h"
|
||||
#include "Utf8ComponentClass.h"
|
||||
#include "util/charreach.h"
|
||||
#include "util/make_unique.h"
|
||||
|
||||
#include <boost/icl/interval_set.hpp>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
static
|
||||
CharReach to_cr(const CodePointSet &cps) {
|
||||
CharReach cr;
|
||||
for (const auto &cp : cps) {
|
||||
if (lower(cp) >= CharReach::npos) {
|
||||
break;
|
||||
}
|
||||
|
||||
cr.setRange(lower(cp), MIN(upper(cp), CharReach::npos - 1));
|
||||
}
|
||||
|
||||
return cr;
|
||||
}
|
||||
|
||||
/**
 * \brief Returns the CharReach for the predefined class \a c.
 *
 * \a mode is consulted in only three cases: CLASS_ANY excludes '\n' unless
 * mode.dotall is set, and CLASS_LOWER / CLASS_UPPER both widen to all ASCII
 * letters when mode.caseless is set. Unicode property and script classes are
 * built from the UCP tables via to_cr().
 *
 * An unrecognised value trips an assert and yields an empty CharReach.
 */
CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
    const CharReach lowercase('a', 'z');
    const CharReach uppercase('A', 'Z');
    const CharReach digits('0', '9');

    switch (c) {
    // --- ASCII / POSIX style classes -----------------------------------
    case CLASS_ALNUM:
        return lowercase | uppercase | digits;
    case CLASS_ALPHA:
        return lowercase | uppercase;
    case CLASS_ANY:
        if (mode.dotall) {
            return ~CharReach();
        }
        return ~CharReach('\n');
    case CLASS_ASCII:
        return CharReach(0, 127);
    case CLASS_BLANK:
        return CharReach(" \t");
    case CLASS_CNTRL:
        return CharReach(0, 31) | CharReach(127 /* del */);
    case CLASS_DIGIT:
        return digits;
    case CLASS_GRAPH:
    case CLASS_XGRAPH:
        return CharReach(0x21, 0x7e);
    case CLASS_HORZ:
        return CharReach("\x09\x20\xA0");
    case CLASS_LOWER:
        if (mode.caseless) {
            return lowercase | uppercase;
        }
        return lowercase;
    case CLASS_PRINT:
        return CharReach(0x20, 0x7e);
    case CLASS_PUNCT:
        // Everything in 0x21..126 that is not a digit or a letter.
        return CharReach(0x21, '0' - 1)
            | CharReach('9' + 1, 'A' - 1)
            | CharReach('Z' + 1, 'a' - 1)
            | CharReach('z' + 1, 126);
    case CLASS_SPACE:
        return CharReach("\x09\x0a\x0c\x0b\x0d\x20");
    case CLASS_UPPER:
        if (mode.caseless) {
            return lowercase | uppercase;
        }
        return uppercase;
    case CLASS_VERT:
        return CharReach("\x0a\x0b\x0c\x0d\x85");
    case CLASS_WORD:
        return lowercase | uppercase | digits | CharReach('_');
    case CLASS_XDIGIT:
        return CharReach("0123456789abcdefABCDEF");

    // --- Unicode general categories ------------------------------------
    case CLASS_UCP_C:     return to_cr(getUcpC());
    case CLASS_UCP_CC:    return to_cr(getUcpCc());
    case CLASS_UCP_CF:    return to_cr(getUcpCf());
    case CLASS_UCP_CN:    return to_cr(getUcpCn());
    case CLASS_UCP_CO:    return to_cr(getUcpCo());
    case CLASS_UCP_CS:    return to_cr(getUcpCs());
    case CLASS_UCP_L:     return to_cr(getUcpL());
    case CLASS_UCP_L_AND: return to_cr(getUcpL_and());
    case CLASS_UCP_LL:    return to_cr(getUcpLl());
    case CLASS_UCP_LM:    return to_cr(getUcpLm());
    case CLASS_UCP_LO:    return to_cr(getUcpLo());
    case CLASS_UCP_LT:    return to_cr(getUcpLt());
    case CLASS_UCP_LU:    return to_cr(getUcpLu());
    case CLASS_UCP_M:     return to_cr(getUcpM());
    case CLASS_UCP_MC:    return to_cr(getUcpMc());
    case CLASS_UCP_ME:    return to_cr(getUcpMe());
    case CLASS_UCP_MN:    return to_cr(getUcpMn());
    case CLASS_UCP_N:     return to_cr(getUcpN());
    case CLASS_UCP_ND:    return to_cr(getUcpNd());
    case CLASS_UCP_NL:    return to_cr(getUcpNl());
    case CLASS_UCP_NO:    return to_cr(getUcpNo());
    case CLASS_UCP_P:     return to_cr(getUcpP());
    case CLASS_UCP_PC:    return to_cr(getUcpPc());
    case CLASS_UCP_PD:    return to_cr(getUcpPd());
    case CLASS_UCP_PE:    return to_cr(getUcpPe());
    case CLASS_UCP_PF:    return to_cr(getUcpPf());
    case CLASS_UCP_PI:    return to_cr(getUcpPi());
    case CLASS_UCP_PO:    return to_cr(getUcpPo());
    case CLASS_UCP_PS:    return to_cr(getUcpPs());
    case CLASS_UCP_S:     return to_cr(getUcpS());
    case CLASS_UCP_SC:    return to_cr(getUcpSc());
    case CLASS_UCP_SK:    return to_cr(getUcpSk());
    case CLASS_UCP_SM:    return to_cr(getUcpSm());
    case CLASS_UCP_SO:    return to_cr(getUcpSo());
    case CLASS_UCP_XAN:   return to_cr(getUcpXan());
    case CLASS_UCP_XPS:
    case CLASS_UCP_XSP: {
        // Both are the union of the vertical and horizontal space classes.
        const CharReach vertical = getPredefinedCharReach(CLASS_VERT, mode);
        const CharReach horizontal = getPredefinedCharReach(CLASS_HORZ, mode);
        return vertical | horizontal;
    }
    case CLASS_UCP_XWD:   return to_cr(getUcpXwd());
    case CLASS_UCP_Z:     return to_cr(getUcpZ());
    case CLASS_UCP_ZL:    return to_cr(getUcpZl());
    case CLASS_UCP_ZP:    return to_cr(getUcpZp());
    case CLASS_UCP_ZS:    return to_cr(getUcpZs());

    // --- Unicode scripts -------------------------------------------------
    case CLASS_SCRIPT_ARABIC:      return to_cr(getUcpArabic());
    case CLASS_SCRIPT_ARMENIAN:    return to_cr(getUcpArmenian());
    case CLASS_SCRIPT_AVESTAN:     return to_cr(getUcpAvestan());
    case CLASS_SCRIPT_BALINESE:    return to_cr(getUcpBalinese());
    case CLASS_SCRIPT_BAMUM:       return to_cr(getUcpBamum());
    case CLASS_SCRIPT_BATAK:       return to_cr(getUcpBatak());
    case CLASS_SCRIPT_BENGALI:     return to_cr(getUcpBengali());
    case CLASS_SCRIPT_BOPOMOFO:    return to_cr(getUcpBopomofo());
    case CLASS_SCRIPT_BRAHMI:      return to_cr(getUcpBrahmi());
    case CLASS_SCRIPT_BRAILLE:     return to_cr(getUcpBraille());
    case CLASS_SCRIPT_BUGINESE:    return to_cr(getUcpBuginese());
    case CLASS_SCRIPT_BUHID:       return to_cr(getUcpBuhid());
    case CLASS_SCRIPT_CANADIAN_ABORIGINAL:
        return to_cr(getUcpCanadian_Aboriginal());
    case CLASS_SCRIPT_CARIAN:      return to_cr(getUcpCarian());
    case CLASS_SCRIPT_CHAM:        return to_cr(getUcpCham());
    case CLASS_SCRIPT_CHEROKEE:    return to_cr(getUcpCherokee());
    case CLASS_SCRIPT_COMMON:      return to_cr(getUcpCommon());
    case CLASS_SCRIPT_COPTIC:      return to_cr(getUcpCoptic());
    case CLASS_SCRIPT_CUNEIFORM:   return to_cr(getUcpCuneiform());
    case CLASS_SCRIPT_CYPRIOT:     return to_cr(getUcpCypriot());
    case CLASS_SCRIPT_CYRILLIC:    return to_cr(getUcpCyrillic());
    case CLASS_SCRIPT_DESERET:     return to_cr(getUcpDeseret());
    case CLASS_SCRIPT_DEVANAGARI:  return to_cr(getUcpDevanagari());
    case CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS:
        return to_cr(getUcpEgyptian_Hieroglyphs());
    case CLASS_SCRIPT_ETHIOPIC:    return to_cr(getUcpEthiopic());
    case CLASS_SCRIPT_GEORGIAN:    return to_cr(getUcpGeorgian());
    case CLASS_SCRIPT_GLAGOLITIC:  return to_cr(getUcpGlagolitic());
    case CLASS_SCRIPT_GOTHIC:      return to_cr(getUcpGothic());
    case CLASS_SCRIPT_GREEK:       return to_cr(getUcpGreek());
    case CLASS_SCRIPT_GUJARATI:    return to_cr(getUcpGujarati());
    case CLASS_SCRIPT_GURMUKHI:    return to_cr(getUcpGurmukhi());
    case CLASS_SCRIPT_HAN:         return to_cr(getUcpHan());
    case CLASS_SCRIPT_HANGUL:      return to_cr(getUcpHangul());
    case CLASS_SCRIPT_HANUNOO:     return to_cr(getUcpHanunoo());
    case CLASS_SCRIPT_HEBREW:      return to_cr(getUcpHebrew());
    case CLASS_SCRIPT_HIRAGANA:    return to_cr(getUcpHiragana());
    case CLASS_SCRIPT_IMPERIAL_ARAMAIC:
        return to_cr(getUcpImperial_Aramaic());
    case CLASS_SCRIPT_INHERITED:   return to_cr(getUcpInherited());
    case CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI:
        return to_cr(getUcpInscriptional_Pahlavi());
    case CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN:
        return to_cr(getUcpInscriptional_Parthian());
    case CLASS_SCRIPT_JAVANESE:    return to_cr(getUcpJavanese());
    case CLASS_SCRIPT_KAITHI:      return to_cr(getUcpKaithi());
    case CLASS_SCRIPT_KANNADA:     return to_cr(getUcpKannada());
    case CLASS_SCRIPT_KATAKANA:    return to_cr(getUcpKatakana());
    case CLASS_SCRIPT_KAYAH_LI:    return to_cr(getUcpKayah_Li());
    case CLASS_SCRIPT_KHAROSHTHI:  return to_cr(getUcpKharoshthi());
    case CLASS_SCRIPT_KHMER:       return to_cr(getUcpKhmer());
    case CLASS_SCRIPT_LAO:         return to_cr(getUcpLao());
    case CLASS_SCRIPT_LATIN:       return to_cr(getUcpLatin());
    case CLASS_SCRIPT_LEPCHA:      return to_cr(getUcpLepcha());
    case CLASS_SCRIPT_LIMBU:       return to_cr(getUcpLimbu());
    case CLASS_SCRIPT_LINEAR_B:    return to_cr(getUcpLinear_B());
    case CLASS_SCRIPT_LISU:        return to_cr(getUcpLisu());
    case CLASS_SCRIPT_LYCIAN:      return to_cr(getUcpLycian());
    case CLASS_SCRIPT_LYDIAN:      return to_cr(getUcpLydian());
    case CLASS_SCRIPT_MALAYALAM:   return to_cr(getUcpMalayalam());
    case CLASS_SCRIPT_MANDAIC:     return to_cr(getUcpMandaic());
    case CLASS_SCRIPT_MEETEI_MAYEK:
        return to_cr(getUcpMeetei_Mayek());
    case CLASS_SCRIPT_MONGOLIAN:   return to_cr(getUcpMongolian());
    case CLASS_SCRIPT_MYANMAR:     return to_cr(getUcpMyanmar());
    case CLASS_SCRIPT_NEW_TAI_LUE: return to_cr(getUcpNew_Tai_Lue());
    case CLASS_SCRIPT_NKO:         return to_cr(getUcpNko());
    case CLASS_SCRIPT_OGHAM:       return to_cr(getUcpOgham());
    case CLASS_SCRIPT_OL_CHIKI:    return to_cr(getUcpOl_Chiki());
    case CLASS_SCRIPT_OLD_ITALIC:  return to_cr(getUcpOld_Italic());
    case CLASS_SCRIPT_OLD_PERSIAN: return to_cr(getUcpOld_Persian());
    case CLASS_SCRIPT_OLD_SOUTH_ARABIAN:
        return to_cr(getUcpOld_South_Arabian());
    case CLASS_SCRIPT_OLD_TURKIC:  return to_cr(getUcpOld_Turkic());
    case CLASS_SCRIPT_ORIYA:       return to_cr(getUcpOriya());
    case CLASS_SCRIPT_OSMANYA:     return to_cr(getUcpOsmanya());
    case CLASS_SCRIPT_PHAGS_PA:    return to_cr(getUcpPhags_Pa());
    case CLASS_SCRIPT_PHOENICIAN:  return to_cr(getUcpPhoenician());
    case CLASS_SCRIPT_REJANG:      return to_cr(getUcpRejang());
    case CLASS_SCRIPT_RUNIC:       return to_cr(getUcpRunic());
    case CLASS_SCRIPT_SAMARITAN:   return to_cr(getUcpSamaritan());
    case CLASS_SCRIPT_SAURASHTRA:  return to_cr(getUcpSaurashtra());
    case CLASS_SCRIPT_SHAVIAN:     return to_cr(getUcpShavian());
    case CLASS_SCRIPT_SINHALA:     return to_cr(getUcpSinhala());
    case CLASS_SCRIPT_SUNDANESE:   return to_cr(getUcpSundanese());
    case CLASS_SCRIPT_SYLOTI_NAGRI:
        return to_cr(getUcpSyloti_Nagri());
    case CLASS_SCRIPT_SYRIAC:      return to_cr(getUcpSyriac());
    case CLASS_SCRIPT_TAGALOG:     return to_cr(getUcpTagalog());
    case CLASS_SCRIPT_TAGBANWA:    return to_cr(getUcpTagbanwa());
    case CLASS_SCRIPT_TAI_LE:      return to_cr(getUcpTai_Le());
    case CLASS_SCRIPT_TAI_THAM:    return to_cr(getUcpTai_Tham());
    case CLASS_SCRIPT_TAI_VIET:    return to_cr(getUcpTai_Viet());
    case CLASS_SCRIPT_TAMIL:       return to_cr(getUcpTamil());
    case CLASS_SCRIPT_TELUGU:      return to_cr(getUcpTelugu());
    case CLASS_SCRIPT_THAANA:      return to_cr(getUcpThaana());
    case CLASS_SCRIPT_THAI:        return to_cr(getUcpThai());
    case CLASS_SCRIPT_TIBETAN:     return to_cr(getUcpTibetan());
    case CLASS_SCRIPT_TIFINAGH:    return to_cr(getUcpTifinagh());
    case CLASS_SCRIPT_UGARITIC:    return to_cr(getUcpUgaritic());
    case CLASS_SCRIPT_VAI:         return to_cr(getUcpVai());
    case CLASS_SCRIPT_YI:          return to_cr(getUcpYi());

    case CLASS_UCP_ANY: /* always include newline */
        return ~CharReach();
    }

    // Every enumerator returns above; reaching this point means c held a
    // value outside the enum.
    assert(0);
    return CharReach();
}
|
||||
|
||||
/** Creates a new, not yet finalized character class: a UTF8ComponentClass
 * when mode.utf8 is set, an AsciiComponentClass otherwise. */
unique_ptr<ComponentClass> getComponentClass(const ParseMode &mode) {
    if (!mode.utf8) {
        return ue2::make_unique<AsciiComponentClass>(mode);
    }

    return ue2::make_unique<UTF8ComponentClass>(mode);
}
|
||||
|
||||
/** Builds a finalized character class holding just the predefined class
 * \a c, which is added with the given \a negate flag. */
unique_ptr<ComponentClass> generateComponent(PredefinedClass c, bool negate,
                                             const ParseMode &mode) {
    unique_ptr<ComponentClass> result = getComponentClass(mode);

    result->add(c, negate);
    result->finalize();

    return result;
}
|
||||
|
||||
/** Builds a finalized character class for the single literal \a c. The class
 * is created under a default-constructed ParseMode in which only the
 * caseless flag is overridden with \a nocase. */
unique_ptr<ComponentClass> getLiteralComponentClass(unsigned char c,
                                                    bool nocase) {
    ParseMode literal_mode;
    literal_mode.caseless = nocase;

    unique_ptr<ComponentClass> result = getComponentClass(literal_mode);
    result->add(c);
    result->finalize();

    return result;
}
|
||||
|
||||
/** Starts an empty, non-negated, unfinalized class: no range is in progress,
 * no range start has been recorded and the first character is NUL. */
ComponentClass::ComponentClass(const ParseMode &mode_in)
    : m_negate(false),
      mode(mode_in),
      in_cand_range(false),
      range_start(INVALID_UNICODE),
      finalized(false),
      firstChar('\0') {}
|
||||
|
||||
/** Out-of-line destructor; there is nothing to release explicitly. */
ComponentClass::~ComponentClass() = default;
|
||||
|
||||
void ComponentClass::addDash(void) {
|
||||
if (!in_cand_range) {
|
||||
// this could be the start of a range
|
||||
if (range_start != INVALID_UNICODE) {
|
||||
in_cand_range = true;
|
||||
} else {
|
||||
/* no possible start character for range, this is just a literal */
|
||||
add('-');
|
||||
}
|
||||
} else {
|
||||
// already creating a range, so this must be literal '-'
|
||||
in_cand_range = false;
|
||||
createRange('-');
|
||||
}
|
||||
}
|
||||
|
||||
/** Marks the class as negated. Callers are expected to do this while
 * class_empty() still holds; that precondition is checked by assertion. */
void ComponentClass::negate() {
    assert(class_empty());

    m_negate = true;
}
|
||||
|
||||
} // namespace ue2
|
283
src/parser/ComponentClass.h
Normal file
283
src/parser/ComponentClass.h
Normal file
@@ -0,0 +1,283 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Character classes and their mnemonics.
|
||||
*/
|
||||
|
||||
#ifndef COMPONENTCLASS_H
|
||||
#define COMPONENTCLASS_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
#include "Component.h"
|
||||
#include "Parser.h"
|
||||
#include "util/charreach.h"
|
||||
#include "util/unicode_def.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/**
 * \brief Identifiers for the predefined character classes.
 *
 * Enumerators carry their implicit values, so their order is significant and
 * must not be changed.
 */
enum PredefinedClass {
    /* ASCII / POSIX style classes */
    CLASS_ALNUM,
    CLASS_ALPHA,
    CLASS_ANY, /* dot, not quite any when not in dotall mode */
    CLASS_ASCII,
    CLASS_BLANK,
    CLASS_CNTRL,
    CLASS_DIGIT,
    CLASS_GRAPH,
    CLASS_HORZ,
    CLASS_LOWER,
    CLASS_PRINT,
    CLASS_PUNCT,
    CLASS_SPACE, /* has vertical tab */
    CLASS_UPPER,
    CLASS_VERT,
    CLASS_WORD,
    CLASS_XDIGIT,
    CLASS_XGRAPH,

    /* Unicode general categories */
    CLASS_UCP_C,
    CLASS_UCP_CC,
    CLASS_UCP_CF,
    CLASS_UCP_CN, /* unallocated code points */
    CLASS_UCP_CO,
    CLASS_UCP_CS, /* does not contain valid unicode codepoints */
    CLASS_UCP_L,
    CLASS_UCP_LL,
    CLASS_UCP_LM,
    CLASS_UCP_LO,
    CLASS_UCP_LT,
    CLASS_UCP_LU,
    CLASS_UCP_L_AND, /* L& = LL+LU+LT */
    CLASS_UCP_M,
    CLASS_UCP_MC,
    CLASS_UCP_ME,
    CLASS_UCP_MN,
    CLASS_UCP_N,
    CLASS_UCP_ND,
    CLASS_UCP_NL,
    CLASS_UCP_NO,
    CLASS_UCP_P,
    CLASS_UCP_PC,
    CLASS_UCP_PD,
    CLASS_UCP_PE,
    CLASS_UCP_PF,
    CLASS_UCP_PI,
    CLASS_UCP_PO,
    CLASS_UCP_PS,
    CLASS_UCP_S,
    CLASS_UCP_SC,
    CLASS_UCP_SK,
    CLASS_UCP_SM,
    CLASS_UCP_SO,
    CLASS_UCP_Z,
    CLASS_UCP_ZL,
    CLASS_UCP_ZP,
    CLASS_UCP_ZS,

    /* Extended properties */
    CLASS_UCP_XAN,
    CLASS_UCP_XPS, /* CLASS_SPACE */
    CLASS_UCP_XSP,
    CLASS_UCP_XWD,

    /* Unicode scripts */
    CLASS_SCRIPT_ARABIC,
    CLASS_SCRIPT_ARMENIAN,
    CLASS_SCRIPT_AVESTAN,
    CLASS_SCRIPT_BALINESE,
    CLASS_SCRIPT_BAMUM,
    CLASS_SCRIPT_BATAK,
    CLASS_SCRIPT_BENGALI,
    CLASS_SCRIPT_BOPOMOFO,
    CLASS_SCRIPT_BRAHMI,
    CLASS_SCRIPT_BRAILLE,
    CLASS_SCRIPT_BUGINESE,
    CLASS_SCRIPT_BUHID,
    CLASS_SCRIPT_CANADIAN_ABORIGINAL,
    CLASS_SCRIPT_CARIAN,
    CLASS_SCRIPT_CHAM,
    CLASS_SCRIPT_CHEROKEE,
    CLASS_SCRIPT_COMMON,
    CLASS_SCRIPT_COPTIC,
    CLASS_SCRIPT_CUNEIFORM,
    CLASS_SCRIPT_CYPRIOT,
    CLASS_SCRIPT_CYRILLIC,
    CLASS_SCRIPT_DESERET,
    CLASS_SCRIPT_DEVANAGARI,
    CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS,
    CLASS_SCRIPT_ETHIOPIC,
    CLASS_SCRIPT_GEORGIAN,
    CLASS_SCRIPT_GLAGOLITIC,
    CLASS_SCRIPT_GOTHIC,
    CLASS_SCRIPT_GREEK,
    CLASS_SCRIPT_GUJARATI,
    CLASS_SCRIPT_GURMUKHI,
    CLASS_SCRIPT_HAN,
    CLASS_SCRIPT_HANGUL,
    CLASS_SCRIPT_HANUNOO,
    CLASS_SCRIPT_HEBREW,
    CLASS_SCRIPT_HIRAGANA,
    CLASS_SCRIPT_IMPERIAL_ARAMAIC,
    CLASS_SCRIPT_INHERITED,
    CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI,
    CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN,
    CLASS_SCRIPT_JAVANESE,
    CLASS_SCRIPT_KAITHI,
    CLASS_SCRIPT_KANNADA,
    CLASS_SCRIPT_KATAKANA,
    CLASS_SCRIPT_KAYAH_LI,
    CLASS_SCRIPT_KHAROSHTHI,
    CLASS_SCRIPT_KHMER,
    CLASS_SCRIPT_LAO,
    CLASS_SCRIPT_LATIN,
    CLASS_SCRIPT_LEPCHA,
    CLASS_SCRIPT_LIMBU,
    CLASS_SCRIPT_LINEAR_B,
    CLASS_SCRIPT_LISU,
    CLASS_SCRIPT_LYCIAN,
    CLASS_SCRIPT_LYDIAN,
    CLASS_SCRIPT_MALAYALAM,
    CLASS_SCRIPT_MANDAIC,
    CLASS_SCRIPT_MEETEI_MAYEK,
    CLASS_SCRIPT_MONGOLIAN,
    CLASS_SCRIPT_MYANMAR,
    CLASS_SCRIPT_NEW_TAI_LUE,
    CLASS_SCRIPT_NKO,
    CLASS_SCRIPT_OGHAM,
    CLASS_SCRIPT_OL_CHIKI,
    CLASS_SCRIPT_OLD_ITALIC,
    CLASS_SCRIPT_OLD_PERSIAN,
    CLASS_SCRIPT_OLD_SOUTH_ARABIAN,
    CLASS_SCRIPT_OLD_TURKIC,
    CLASS_SCRIPT_ORIYA,
    CLASS_SCRIPT_OSMANYA,
    CLASS_SCRIPT_PHAGS_PA,
    CLASS_SCRIPT_PHOENICIAN,
    CLASS_SCRIPT_REJANG,
    CLASS_SCRIPT_RUNIC,
    CLASS_SCRIPT_SAMARITAN,
    CLASS_SCRIPT_SAURASHTRA,
    CLASS_SCRIPT_SHAVIAN,
    CLASS_SCRIPT_SINHALA,
    CLASS_SCRIPT_SUNDANESE,
    CLASS_SCRIPT_SYLOTI_NAGRI,
    CLASS_SCRIPT_SYRIAC,
    CLASS_SCRIPT_TAGALOG,
    CLASS_SCRIPT_TAGBANWA,
    CLASS_SCRIPT_TAI_LE,
    CLASS_SCRIPT_TAI_THAM,
    CLASS_SCRIPT_TAI_VIET,
    CLASS_SCRIPT_TAMIL,
    CLASS_SCRIPT_TELUGU,
    CLASS_SCRIPT_THAANA,
    CLASS_SCRIPT_THAI,
    CLASS_SCRIPT_TIBETAN,
    CLASS_SCRIPT_TIFINAGH,
    CLASS_SCRIPT_UGARITIC,
    CLASS_SCRIPT_VAI,
    CLASS_SCRIPT_YI,

    /* Any character */
    CLASS_UCP_ANY
};
|
||||
|
||||
CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode);
|
||||
|
||||
class ComponentClass;
|
||||
class NFABuilder;
|
||||
|
||||
/* Returns an owning pointer to a new class holding the predefined class c
 * (added with the given negated flag); the class is already finalized. */
|
||||
std::unique_ptr<ComponentClass>
|
||||
generateComponent(PredefinedClass c, bool negated, const ParseMode &mode);
|
||||
|
||||
/* Returns an owning pointer to a new class that is still open (not yet
 * finalized), so the caller can add members before finalizing it. */
|
||||
std::unique_ptr<ComponentClass> getComponentClass(const ParseMode &mode);
|
||||
|
||||
/** Common case: generate a component for a single literal character, possibly
 * in caseless mode. The returned class is already finalized and is owned by
 * the caller. */
|
||||
std::unique_ptr<ComponentClass> getLiteralComponentClass(unsigned char c,
|
||||
bool nocase);
|
||||
|
||||
class ComponentClass : public Component {
|
||||
friend class DumpVisitor;
|
||||
protected:
|
||||
explicit ComponentClass(const ParseMode &mode_in);
|
||||
public:
|
||||
~ComponentClass() override;
|
||||
ComponentClass *clone() const override = 0;
|
||||
|
||||
Component *accept(ComponentVisitor &v) override = 0;
|
||||
void accept(ConstComponentVisitor &v) const override = 0;
|
||||
|
||||
/** True iff we have already started adding members to the class. This is
|
||||
* a different concept to Component::empty */
|
||||
virtual bool class_empty(void) const = 0;
|
||||
|
||||
virtual void add(PredefinedClass c, bool negated) = 0;
|
||||
virtual void add(unichar c) = 0; /* may throw LocatedParseError */
|
||||
void addDash(void);
|
||||
|
||||
void negate(void);
|
||||
virtual void finalize(void) = 0;
|
||||
|
||||
bool isNegated() const { return m_negate; }
|
||||
|
||||
void setFirstChar(char c) { firstChar = c; }
|
||||
char getFirstChar() const { return firstChar; }
|
||||
|
||||
std::vector<PositionInfo> first() const override = 0;
|
||||
std::vector<PositionInfo> last() const override = 0;
|
||||
bool empty() const override { return false; } /* always 1 codepoint wide */
|
||||
|
||||
void notePositions(GlushkovBuildState &bs) override = 0;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &) override = 0;
|
||||
|
||||
protected:
|
||||
bool m_negate;
|
||||
const ParseMode mode;
|
||||
bool in_cand_range;
|
||||
unichar range_start;
|
||||
bool finalized;
|
||||
|
||||
/** Literal character at the start of this character class, e.g. '.' for
|
||||
* the class [.abc]. Used to identify (unsupported) POSIX collating
|
||||
* elements. */
|
||||
char firstChar;
|
||||
|
||||
virtual void createRange(unichar) = 0;
|
||||
|
||||
// Protected copy ctor. Use clone instead.
|
||||
ComponentClass(const ComponentClass &other)
|
||||
: Component(other), m_negate(other.m_negate), mode(other.mode),
|
||||
in_cand_range(other.in_cand_range), range_start(other.range_start),
|
||||
finalized(other.finalized),
|
||||
firstChar(other.firstChar) {}
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // COMPONENTCLASS_H
|
166
src/parser/ComponentCondReference.cpp
Normal file
166
src/parser/ComponentCondReference.cpp
Normal file
@@ -0,0 +1,166 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Conditional reference.
|
||||
*/
|
||||
#include "ComponentCondReference.h"
|
||||
#include "ComponentAlternation.h"
|
||||
#include "ComponentAssertion.h"
|
||||
#include "parse_error.h"
|
||||
#include "position_info.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <memory>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** Conditional keyed on a numbered reference; starts with a single branch. */
ComponentCondReference::ComponentCondReference(unsigned ref)
    : kind(CONDITION_NUMBER),
      ref_id(ref),
      hasBothBranches(false) {}
|
||||
|
||||
/** Conditional keyed on a named reference; the numeric id is left at zero
 * and the conditional starts with a single branch. */
ComponentCondReference::ComponentCondReference(const string &name)
    : kind(CONDITION_NAME),
      ref_id(0),
      ref_name(name),
      hasBothBranches(false) {}
|
||||
|
||||
/** Conditional whose condition is the component \a c, of which this object
 * takes ownership. The numeric id is left at zero and the conditional starts
 * with a single branch. */
ComponentCondReference::ComponentCondReference(unique_ptr<Component> c)
    // Qualify std::move explicitly: the unqualified call only resolved
    // through the using-directive/ADL and is diagnosed by newer clang
    // (-Wunqualified-std-cast-call).
    : kind(CONDITION_ASSERTION), ref_id(0), assertion(std::move(c)),
      hasBothBranches(false) {}
|
||||
|
||||
/** Out-of-line destructor; members clean up after themselves. */
ComponentCondReference::~ComponentCondReference() = default;
|
||||
|
||||
/** Copy constructor. Plain members are copied; the assertion component, which
 * exists only for CONDITION_ASSERTION conditionals, is deep-copied through
 * clone(). */
ComponentCondReference::ComponentCondReference(
    const ComponentCondReference &other)
    : ComponentSequence(other),
      kind(other.kind),
      ref_id(other.ref_id),
      ref_name(other.ref_name),
      hasBothBranches(other.hasBothBranches) {
    if (kind != CONDITION_ASSERTION) {
        // Only assertion conditionals carry an assertion component.
        assert(!other.assertion);
        return;
    }

    assert(other.assertion);
    assertion.reset(other.assertion->clone());
}
|
||||
|
||||
/** Returns a newly allocated copy of this conditional, made with the copy
 * constructor. */
ComponentCondReference *ComponentCondReference::clone() const {
    ComponentCondReference *const copy = new ComponentCondReference(*this);
    return copy;
}
|
||||
|
||||
/**
 * Walks this conditional with a mutating visitor.
 *
 * If the visitor replaces this node, post() is fired and the replacement is
 * returned without descending. Otherwise the assertion (for assertion
 * conditionals) and then each child are visited; any of them may be swapped
 * for whatever its accept() returns. Children that come back null are dropped
 * before post() is fired and this node is returned.
 */
Component *ComponentCondReference::accept(ComponentVisitor &v) {
    Component *const replacement = v.visit(this);
    if (replacement != this) {
        v.post(this);
        return replacement;
    }

    if (kind == CONDITION_ASSERTION) {
        Component *const before = assertion.get();
        Component *const after = assertion->accept(v);
        if (after != before) {
            assertion.reset(after);
        }
    }

    for (auto &slot : children) {
        Component *const before = slot.get();
        Component *const after = slot->accept(v);
        if (after != before) {
            // Child has been replaced (new Component pointer) or we've been
            // instructed to delete it (null).
            slot.reset(after);
        }
    }

    // Remove deleted children.
    const auto dead = std::remove(children.begin(), children.end(), nullptr);
    children.erase(dead, children.end());

    v.post(this);
    return this;
}
|
||||
|
||||
/**
 * Walks this conditional with a read-only visitor: pre(), then the assertion
 * followed by during() (assertion conditionals only), then every child with
 * during() fired between consecutive children, then post().
 */
void ComponentCondReference::accept(ConstComponentVisitor &v) const {
    v.pre(*this);

    if (kind == CONDITION_ASSERTION) {
        assertion->accept(v);
        v.during(*this); // FIXME: a good idea?
    }

    // during() separates children, so it is fired before every child except
    // the first rather than after every child except the last.
    bool is_first_child = true;
    for (const auto &child : children) {
        if (!is_first_child) {
            v.during(*this);
        }
        child->accept(v);
        is_first_child = false;
    }

    v.post(*this);
}
|
||||
|
||||
void ComponentCondReference::addAlternation() {
|
||||
if (alternation) {
|
||||
if (ref_name == "DEFINE") {
|
||||
throw LocatedParseError("DEFINE conditional group with more than "
|
||||
"one branch");
|
||||
}
|
||||
|
||||
if (alternation->numBranches() >= 2) {
|
||||
throw LocatedParseError("Conditional with more than two branches");
|
||||
}
|
||||
}
|
||||
hasBothBranches = true;
|
||||
ComponentSequence::addAlternation();
|
||||
}
|
||||
|
||||
/** Not expected to be called: asserts, and returns an empty vector when
 * assertions are compiled out. */
vector<PositionInfo> ComponentCondReference::first() const {
    assert(0);
    return {};
}
|
||||
|
||||
/** Not expected to be called: asserts, and returns an empty vector when
 * assertions are compiled out. */
vector<PositionInfo> ComponentCondReference::last() const {
    assert(0);
    return {};
}
|
||||
|
||||
/** A conditional reference always reports itself as empty. */
bool ComponentCondReference::empty() const {
    return true;
}
|
||||
|
||||
/** Not expected to be called: trips an assertion unless NDEBUG is defined. */
void ComponentCondReference::notePositions(GlushkovBuildState &) {
    assert(0);
}
|
||||
|
||||
/** Not expected to be called: trips an assertion unless NDEBUG is defined.
 * Both arguments are ignored. */
void ComponentCondReference::buildFollowSet(
    GlushkovBuildState &, const vector<PositionInfo> &) {
    assert(0);
}
|
||||
|
||||
bool ComponentCondReference::repeatable() const {
|
||||
// If this assertion has no children (it's an empty sequence, like that
|
||||
// produced by '(?!)') then PCRE would throw a "nothing to repeat" error.
|
||||
// So we do as well.
|
||||
return !children.empty();
|
||||
}
|
||||
|
||||
} // namespace ue2
|
91
src/parser/ComponentCondReference.h
Normal file
91
src/parser/ComponentCondReference.h
Normal file
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Conditional reference.
|
||||
*/
|
||||
|
||||
#ifndef PARSER_COMPONENTCONDREFERENCE_H_
|
||||
#define PARSER_COMPONENTCONDREFERENCE_H_
|
||||
|
||||
#include "ComponentSequence.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class ComponentCondReference : public ComponentSequence {
|
||||
friend class DumpVisitor;
|
||||
friend class PrefilterVisitor;
|
||||
friend class ReferenceVisitor;
|
||||
friend class PrintVisitor;
|
||||
public:
|
||||
ComponentCondReference(unsigned ref);
|
||||
ComponentCondReference(const std::string &name);
|
||||
ComponentCondReference(std::unique_ptr<Component> c);
|
||||
|
||||
~ComponentCondReference() override;
|
||||
ComponentCondReference *clone() const override;
|
||||
Component *accept(ComponentVisitor &v) override;
|
||||
void accept(ConstComponentVisitor &v) const override;
|
||||
|
||||
void addAlternation() override;
|
||||
|
||||
std::vector<PositionInfo> first() const override;
|
||||
std::vector<PositionInfo> last() const override;
|
||||
|
||||
bool empty() const override;
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos) override;
|
||||
bool repeatable() const override;
|
||||
|
||||
private:
|
||||
ComponentCondReference(const ComponentCondReference &other);
|
||||
|
||||
enum Condition {
|
||||
CONDITION_NUMBER,
|
||||
CONDITION_NAME,
|
||||
CONDITION_ASSERTION
|
||||
};
|
||||
|
||||
enum Condition kind;
|
||||
|
||||
unsigned ref_id;
|
||||
std::string ref_name;
|
||||
std::unique_ptr<Component> assertion;
|
||||
|
||||
/** True if an alternation has been added, which means we have both a YES
|
||||
* and a NO branch. */
|
||||
bool hasBothBranches;
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // PARSER_COMPONENTCONDREFERENCE_H_
|
75
src/parser/ComponentEUS.cpp
Normal file
75
src/parser/ComponentEUS.cpp
Normal file
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Extended Unicode sequences (\\X)
|
||||
*/
|
||||
|
||||
|
||||
#include "ComponentEUS.h"
|
||||
|
||||
#include "buildstate.h"
|
||||
#include "position.h"
|
||||
#include "position_info.h"
|
||||
#include "Parser.h"
|
||||
#include "nfagraph/ng_builder.h"
|
||||
#include "util/charreach.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/**
 * Records the location and the UTF-8 setting of the parse mode. The graph
 * position stays uninitialised until notePositions() runs.
 */
ComponentEUS::ComponentEUS(u32 loc_in, const ParseMode &mode)
    : loc(loc_in),
      utf8(mode.utf8),
      position(GlushkovBuildState::POS_UNINITIALIZED) {
}
|
||||
|
||||
// Nothing to release explicitly.
ComponentEUS::~ComponentEUS() = default;
|
||||
|
||||
/** Returns a heap-allocated copy of this component made with the copy
 * constructor; the caller receives the raw pointer. */
ComponentEUS *ComponentEUS::clone() const {
    ComponentEUS *const copy = new ComponentEUS(*this);
    return copy;
}
|
||||
|
||||
/** The first set is the single position owned by this component. */
vector<PositionInfo> ComponentEUS::first() const {
    vector<PositionInfo> firsts;
    firsts.push_back(PositionInfo(position));
    return firsts;
}
|
||||
|
||||
/** The last set is the single position owned by this component. */
vector<PositionInfo> ComponentEUS::last() const {
    vector<PositionInfo> lasts;
    lasts.push_back(PositionInfo(position));
    return lasts;
}
|
||||
|
||||
/**
 * Allocates one position in the builder, gives it "dot" reachability and a
 * report offset adjustment of zero. When utf8 is set, a self-loop is added
 * on the position.
 */
void ComponentEUS::notePositions(GlushkovBuildState &bs) {
    NFABuilder &nfa = bs.getBuilder();

    position = nfa.makePositions(1);
    nfa.addCharReach(position, CharReach::dot());
    nfa.setNodeReportID(position, 0 /* offset adj */);

    if (!utf8) {
        return;
    }

    /* we are prefiltering, turn to .+ */
    nfa.addEdge(position, position);
}
|
||||
|
||||
} // namespace ue2
|
86
src/parser/ComponentEUS.h
Normal file
86
src/parser/ComponentEUS.h
Normal file
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Extended Unicode sequences (\\X)
|
||||
*/
|
||||
|
||||
#ifndef _RE_COMPONENTEXTENDEDUNICODESEQUENCE_H_
|
||||
#define _RE_COMPONENTEXTENDEDUNICODESEQUENCE_H_
|
||||
|
||||
#include "Component.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
struct ParseMode;
|
||||
|
||||
class ComponentEUS : public Component {
|
||||
friend class DumpVisitor;
|
||||
friend class UnsupportedVisitor;
|
||||
public:
|
||||
ComponentEUS(u32 loc, const ParseMode &mode);
|
||||
~ComponentEUS() override;
|
||||
ComponentEUS *clone() const override;
|
||||
|
||||
Component *accept(ComponentVisitor &v) override {
|
||||
Component *c = v.visit(this);
|
||||
v.post(this);
|
||||
return c;
|
||||
}
|
||||
|
||||
void accept(ConstComponentVisitor &v) const override {
|
||||
v.pre(*this);
|
||||
v.during(*this);
|
||||
v.post(*this);
|
||||
}
|
||||
|
||||
std::vector<PositionInfo> first() const override;
|
||||
std::vector<PositionInfo> last() const override;
|
||||
|
||||
bool empty() const override { return false; }
|
||||
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &,
|
||||
const std::vector<PositionInfo> &) override {
|
||||
// all follow set construction is handled by firsts/lasts
|
||||
return;
|
||||
}
|
||||
|
||||
private:
|
||||
u32 loc;
|
||||
bool utf8;
|
||||
Position position;
|
||||
|
||||
ComponentEUS(const ComponentEUS &other)
|
||||
: Component(other), loc(other.loc), utf8(other.utf8),
|
||||
position(other.position) {}
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
93
src/parser/ComponentEmpty.cpp
Normal file
93
src/parser/ComponentEmpty.cpp
Normal file
@@ -0,0 +1,93 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Represents an empty regex element, like (?m)
|
||||
*/
|
||||
#include <cassert>
|
||||
|
||||
#include "ComponentEmpty.h"
|
||||
#include "position.h"
|
||||
#include "position_info.h"
|
||||
#include "buildstate.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
// An empty component has nothing to initialise.
ComponentEmpty::ComponentEmpty() = default;
|
||||
|
||||
// An empty component has nothing to tear down.
ComponentEmpty::~ComponentEmpty() = default;
|
||||
|
||||
/** Returns a freshly default-constructed ComponentEmpty (not a copy of
 * *this); the caller receives the raw pointer. */
ComponentEmpty *ComponentEmpty::clone() const {
    ComponentEmpty *const fresh = new ComponentEmpty();
    return fresh;
}
|
||||
|
||||
/** Always empty, by definition. */
bool ComponentEmpty::empty() const { return true; }
|
||||
|
||||
/** Always reports itself as vacuous. */
bool ComponentEmpty::vacuous_everywhere() const { return true; }
|
||||
|
||||
/**
 * Empty constructs such as '(?m)' cannot be repeated; expressing that is the
 * reason this class exists.
 */
bool ComponentEmpty::repeatable() const { return false; }
|
||||
|
||||
/** The first set consists of a single epsilon position. */
vector<PositionInfo> ComponentEmpty::first() const {
    const PositionInfo epsilon(GlushkovBuildState::POS_EPSILON);
    return vector<PositionInfo>(1, epsilon);
}
|
||||
|
||||
/** The last set is empty. */
vector<PositionInfo> ComponentEmpty::last() const {
    vector<PositionInfo> none;
    return none;
}
|
||||
|
||||
/** No positions are allocated for an empty component. */
void ComponentEmpty::notePositions(GlushkovBuildState &) {
}
|
||||
|
||||
/** No follow-set work is required for an empty component. */
void ComponentEmpty::buildFollowSet(GlushkovBuildState &,
                                    const vector<PositionInfo> &) {
}
|
||||
|
||||
/** Passes the incoming at_start state through unchanged. */
bool ComponentEmpty::checkEmbeddedStartAnchor(bool at_start) const {
    const bool unchanged = at_start;
    return unchanged;
}
|
||||
|
||||
/** Passes the incoming at_end state through unchanged. */
bool ComponentEmpty::checkEmbeddedEndAnchor(bool at_end) const {
    const bool unchanged = at_end;
    return unchanged;
}
|
||||
|
||||
} // namespace ue2
|
75
src/parser/ComponentEmpty.h
Normal file
75
src/parser/ComponentEmpty.h
Normal file
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Represents an empty regex element, like (?m)
|
||||
*/
|
||||
|
||||
#ifndef PARSER_COMPONENT_EMPTY_H_
|
||||
#define PARSER_COMPONENT_EMPTY_H_
|
||||
|
||||
#include "Component.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class ComponentEmpty : public Component {
|
||||
friend class DumpVisitor;
|
||||
public:
|
||||
ComponentEmpty();
|
||||
~ComponentEmpty() override;
|
||||
ComponentEmpty *clone() const override;
|
||||
|
||||
Component *accept(ComponentVisitor &v) override {
|
||||
Component *c = v.visit(this);
|
||||
v.post(this);
|
||||
return c;
|
||||
}
|
||||
|
||||
void accept(ConstComponentVisitor &v) const override {
|
||||
v.pre(*this);
|
||||
v.during(*this);
|
||||
v.post(*this);
|
||||
}
|
||||
|
||||
std::vector<PositionInfo> first() const override;
|
||||
std::vector<PositionInfo> last() const override;
|
||||
bool empty() const override;
|
||||
bool vacuous_everywhere() const override;
|
||||
bool repeatable() const override;
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos) override;
|
||||
|
||||
bool checkEmbeddedStartAnchor(bool at_start) const override;
|
||||
bool checkEmbeddedEndAnchor(bool at_end) const override;
|
||||
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // PARSER_COMPONENT_EMPTY_H_
|
393
src/parser/ComponentRepeat.cpp
Normal file
393
src/parser/ComponentRepeat.cpp
Normal file
@@ -0,0 +1,393 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Repeats ('*', '+', '?', '{M,N}', etc)
|
||||
*/
|
||||
|
||||
|
||||
#include "ComponentRepeat.h"
|
||||
|
||||
#include "buildstate.h"
|
||||
#include "nfagraph/ng_builder.h"
|
||||
#include "parse_error.h"
|
||||
#include "Parser.h"
|
||||
#include "position.h"
|
||||
#include "position_dump.h"
|
||||
#include "position_info.h"
|
||||
#include "ue2common.h"
|
||||
#include "util/make_unique.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Hard limit on the maximum repeat for bounded repeats. */
|
||||
static const u32 MAX_MAX_BOUND = 32767;
|
||||
|
||||
/** \brief If expanding a repeat would lead to this many positions being
|
||||
* generated, we fail the pattern. */
|
||||
static const u32 MAX_POSITIONS_EXPANDED = 500000; // arbitrarily huge
|
||||
|
||||
/* no edge priorities means that if our subcomponent can be empty, our min
|
||||
* extent is effectively zero. */
|
||||
/**
 * Takes ownership of the repeated sub-component and records the {min,max}
 * bounds and repeat type. Graph positions stay uninitialised until
 * notePositions() runs.
 *
 * \throws ParseError if a bounded maximum exceeds MAX_MAX_BOUND.
 */
ComponentRepeat::ComponentRepeat(unique_ptr<Component> sub_comp_in, u32 min,
                                 u32 max, RepeatType t)
    : type(t),
      sub_comp(std::move(sub_comp_in)),
      m_min(min),
      m_max(max),
      posFirst(GlushkovBuildState::POS_UNINITIALIZED),
      posLast(GlushkovBuildState::POS_UNINITIALIZED) {
    assert(sub_comp);
    assert(max > 0);
    assert(m_min <= m_max);

    const bool bounded = m_max < NoLimit;
    if (bounded && m_max > MAX_MAX_BOUND) {
        throw ParseError("Bounded repeat is too large.");
    }
}
|
||||
|
||||
// Members clean up after themselves.
ComponentRepeat::~ComponentRepeat() = default;
|
||||
|
||||
/** Returns a heap-allocated copy made with the copy constructor (which also
 * clones the sub-component); the caller receives the raw pointer. */
ComponentRepeat *ComponentRepeat::clone() const {
    ComponentRepeat *const copy = new ComponentRepeat(*this);
    return copy;
}
|
||||
|
||||
/**
 * Copy constructor. The sub-component is duplicated through its clone()
 * method; every other member is copied directly.
 */
ComponentRepeat::ComponentRepeat(const ComponentRepeat &other)
    : Component(other),
      type(other.type),
      sub_comp(other.sub_comp->clone()),
      m_min(other.m_min),
      m_max(other.m_max),
      m_firsts(other.m_firsts),
      m_lasts(other.m_lasts),
      posFirst(other.posFirst),
      posLast(other.posLast),
      firsts_cache(other.firsts_cache) {
}
|
||||
|
||||
bool ComponentRepeat::empty() const {
|
||||
return m_min == 0 || sub_comp->empty();
|
||||
}
|
||||
|
||||
/** A repeat always reports itself as not repeatable. */
bool ComponentRepeat::repeatable() const { return false; }
|
||||
|
||||
static
|
||||
void addBase(Position base, vector<PositionInfo> &firsts,
|
||||
vector<PositionInfo> &lasts) {
|
||||
for (auto &e : firsts) {
|
||||
if (e.pos != GlushkovBuildState::POS_EPSILON) {
|
||||
e.pos += base;
|
||||
}
|
||||
}
|
||||
for (auto &e : lasts) {
|
||||
e.pos += base;
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void checkPositions(vector<PositionInfo> &v, const GlushkovBuildState &bs) {
|
||||
const NFABuilder& builder = bs.getBuilder();
|
||||
for (const auto &e : v) {
|
||||
if (builder.isSpecialState(e.pos)) {
|
||||
throw ParseError("Embedded anchors not supported.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Allocates graph positions for the repeat.
 *
 * One "real" copy of the sub-component is built through a recursive call to
 * notePositions(), first() and last(). Positions for the remaining copies are
 * then reserved in bulk, and each copy's firsts/lasts are derived by
 * offsetting those of the real copy.
 *
 * \param bs build state that owns the NFA builder.
 * \throws ParseError if a multi-copy repeat has special states in its
 *         firsts/lasts (embedded anchors), or if the expansion would exceed
 *         MAX_POSITIONS_EXPANDED positions.
 */
void ComponentRepeat::notePositions(GlushkovBuildState &bs) {
    assert(m_max > 0);
    // The constructor only rejects bounded repeats strictly greater than
    // MAX_MAX_BOUND, so a bound equal to the limit is legal here.
    assert(m_max == NoLimit || m_max <= MAX_MAX_BOUND);

    /* Note: We can construct smaller subgraphs if we're not maintaining edge
     * priorities. */

    // Build the single real copy.
    posFirst = bs.getBuilder().numVertices();
    sub_comp->notePositions(bs);

    u32 copies = m_max < NoLimit ? m_max : MAX(m_min, 1);
    DEBUG_PRINTF("building %u copies of repeated region\n", copies);
    m_firsts.clear();
    m_lasts.clear();
    m_firsts.resize(copies);
    m_lasts.resize(copies);

    m_firsts[0] = sub_comp->first();
    m_lasts[0] = sub_comp->last();

    // May rewrite m_min; deliberately runs after `copies` has been fixed.
    postSubNotePositionHook();

    posLast = bs.getBuilder().numVertices() - 1;
    u32 vcount = posLast + 1 - posFirst;

    // If we're making more than one copy, then our firsts and lasts must only
    // contain vertices inside [posFirst, posLast]: anything else means we have
    // an embedded anchor or otherwise weird situation.
    if (copies > 1) {
        checkPositions(m_firsts[0], bs);
        checkPositions(m_lasts[0], bs);
    }

    // Avoid enormous expansions. The product is formed in 64 bits: a 32-bit
    // multiply could wrap around and let an oversized repeat slip through.
    const unsigned long long expanded =
        static_cast<unsigned long long>(vcount) * copies;
    if (expanded > MAX_POSITIONS_EXPANDED) {
        throw ParseError("Bounded repeat is too large.");
    }

    // Add positions for the rest of the copies. Safe in 32 bits now that the
    // total has been bounded above.
    size_t copyPositions = vcount * (copies - 1);
    bs.getBuilder().makePositions(copyPositions);

    // Calculate our firsts and lasts for the copies
    for (u32 i = 1; i < copies; ++i) {
        m_firsts[i] = m_firsts[0];
        m_lasts[i] = m_lasts[0];
        u32 base = i * vcount;
        addBase(base, m_firsts[i], m_lasts[i]);
    }

    recordPosBounds(posFirst, bs.getBuilder().numVertices());
    precalc_firsts(); /* ComponentRepeat requires firsts to be calculated ahead
                       * of time and cached due to expense */
}
|
||||
|
||||
/** Returns a copy of the first set cached by precalc_firsts(). */
vector<PositionInfo> ComponentRepeat::first() const {
    const vector<PositionInfo> &cached = firsts_cache;
    DEBUG_PRINTF("firsts = %s\n",
                 dumpPositions(cached.begin(), cached.end()).c_str());
    return cached;
}
|
||||
|
||||
/**
 * Builds the follow set for the repeat: the real copy of the sub-component
 * is wired up first, its follow set is then cloned once for each additional
 * copy, and finally wireRepeats() connects the copies to each other.
 * Does nothing when m_max is zero.
 */
void ComponentRepeat::buildFollowSet(GlushkovBuildState &bs,
                                     const vector<PositionInfo> &lastPos) {
    if (m_max == 0) {
        return;
    }
    DEBUG_PRINTF("enter\n");

    // The first entry is the "real" one.
    DEBUG_PRINTF("initial repeat\n");
    sub_comp->buildFollowSet(bs, lastPos);

    // Every further copy is a clone of the subgraph just added, displaced by
    // a whole number of sub-component widths.
    const u32 copies = m_firsts.size();
    DEBUG_PRINTF("cloning %u copies of repeat\n", copies - 1);
    for (u32 rep = 1; rep < copies; ++rep) {
        const u32 offset = (posLast + 1 - posFirst) * rep;
        if (offset == 0) {
            continue; // nothing to clone for a zero-width region
        }
        bs.cloneFollowSet(posFirst, posLast, offset);
    }

    wireRepeats(bs, lastPos);

    DEBUG_PRINTF("leave\n");
}
|
||||
|
||||
/** When \p connected_to_sds is true, clamps the maximum bound down to the
 * minimum bound; otherwise leaves the repeat untouched. */
void ComponentRepeat::optimise(bool connected_to_sds) {
    DEBUG_PRINTF("opt %d\n", (int)connected_to_sds);
    if (connected_to_sds) {
        DEBUG_PRINTF("setting m_max to %u\n", m_min);
        m_max = m_min;
    }
}
|
||||
|
||||
bool ComponentRepeat::vacuous_everywhere() const {
|
||||
return !m_min;
|
||||
}
|
||||
|
||||
bool ComponentRepeat::checkEmbeddedStartAnchor(bool at_start) const {
|
||||
at_start = sub_comp->checkEmbeddedStartAnchor(at_start);
|
||||
|
||||
if (m_max > 1) {
|
||||
at_start = sub_comp->checkEmbeddedStartAnchor(at_start);
|
||||
}
|
||||
|
||||
return at_start;
|
||||
}
|
||||
|
||||
bool ComponentRepeat::checkEmbeddedEndAnchor(bool at_end) const {
|
||||
at_end = sub_comp->checkEmbeddedEndAnchor(at_end);
|
||||
|
||||
if (m_max > 1) {
|
||||
at_end = sub_comp->checkEmbeddedEndAnchor(at_end);
|
||||
}
|
||||
|
||||
return at_end;
|
||||
}
|
||||
|
||||
/**
 * Mutating visit.
 *
 * If the visitor's visit() returns something other than this node, post() is
 * signalled and that result is returned straight away. Otherwise the
 * sub-component is visited and replaced by whatever its accept() returned
 * (possibly null).
 *
 * \return the visitor's replacement, or this node, or nullptr when the
 *         sub-component ended up null.
 */
Component *ComponentRepeat::accept(ComponentVisitor &v) {
    Component *const replacement = v.visit(this);
    if (replacement != this) {
        v.post(this);
        return replacement;
    }

    Component *const new_sub = sub_comp->accept(v);
    if (new_sub != sub_comp.get()) {
        sub_comp.reset(new_sub);
    }

    v.post(this);

    if (!sub_comp) {
        return nullptr;
    }
    return this;
}
|
||||
|
||||
/** Read-only visit: pre() on this node, then the sub-component, then post()
 * on this node. */
void ComponentRepeat::accept(ConstComponentVisitor &v) const {
    const ComponentRepeat &self = *this;

    v.pre(self);
    sub_comp->accept(v);
    v.post(self);
}
|
||||
|
||||
/**
 * Computes the last set of the repeat.
 *
 * Optional repeats are already taken care of by the FIRSTS when edge
 * priorities are not maintained, so only the last mandatory copy and (if it
 * is a different copy) the last optional copy contribute.
 */
vector<PositionInfo> ComponentRepeat::last() const {
    vector<PositionInfo> result;
    if (m_max == 0) {
        return result;
    }

    // notePositions should already have run
    assert(!m_firsts.empty());
    assert(!m_lasts.empty());

    if (m_min != 0) {
        const vector<PositionInfo> &mandatory = m_lasts[m_min - 1];
        result.insert(result.end(), mandatory.begin(), mandatory.end());
    }

    // Skip the final copy only when it is the last mandatory copy, which was
    // added just above.
    const bool final_is_mandatory = m_min && m_min == m_lasts.size();
    if (!final_is_mandatory) {
        const vector<PositionInfo> &final_copy = m_lasts.back();
        result.insert(result.end(), final_copy.begin(), final_copy.end());
    }

    return result;
}
|
||||
|
||||
/**
 * Connects the copies of the repeated region to one another.
 *
 * Mandatory copies are chained linearly. Each optional copy is fed from the
 * previous copy and, when there is a mandatory part and the copy is not the
 * one directly after it, also from the last mandatory copy. An unbounded
 * repeat gets a loop from the lasts of the final copy back to its firsts.
 *
 * \param bs      build state used to connect regions.
 * \param lastPos lasts of whatever precedes this repeat; bound as the
 *                "optional lasts" when there is no mandatory copy.
 */
void ComponentRepeat::wireRepeats(GlushkovBuildState &bs,
                                  const vector<PositionInfo> &lastPos) {
    /* note: m_lasts[0] already valid */
    const u32 copies = m_firsts.size();

    if (copies) {
        const bool isEmpty = sub_comp->empty();

        // Bound only once we know there are copies, so that m_lasts is never
        // indexed while it is empty. This is a reference: updates made to
        // m_lasts[m_min - 1] by the loop below are visible through it.
        const vector<PositionInfo> &optLasts =
            m_min ? m_lasts[m_min - 1] : lastPos;

        DEBUG_PRINTF("wiring up %u mand repeats\n", m_min);
        for (u32 rep = 1; rep < m_min; rep++) {
            bs.connectRegions(m_lasts[rep - 1], m_firsts[rep]);

            // An emptiable copy can be skipped, so the lasts of the previous
            // copy are lasts of this copy as well.
            if (isEmpty) {
                m_lasts[rep].insert(m_lasts[rep].end(),
                                    m_lasts[rep - 1].begin(),
                                    m_lasts[rep - 1].end());
            }
        }

        // copies - m_min is unsigned, hence %u.
        DEBUG_PRINTF("wiring up %u optional repeats\n", copies - m_min);
        for (u32 rep = MAX(m_min, 1); rep < copies; rep++) {
            vector<PositionInfo> lasts = m_lasts[rep - 1];
            if (m_min && rep != m_min) {
                lasts.insert(lasts.end(), optLasts.begin(), optLasts.end());
                sort(lasts.begin(), lasts.end());
                lasts.erase(unique(lasts.begin(), lasts.end()), lasts.end());
            }
            bs.connectRegions(lasts, m_firsts[rep]);
        }
    }

    // If we have no max bound, we need a self-loop as well.
    if (m_max == NoLimit) {
        DEBUG_PRINTF("final repeat self-loop\n");
        bs.connectRegions(m_lasts.back(), m_firsts.back());
    }
}
|
||||
|
||||
void ComponentRepeat::precalc_firsts() {
|
||||
DEBUG_PRINTF("building firsts for {%u,%u} repeat with %s sub\n", m_min,
|
||||
m_max, sub_comp->empty() ? "emptiable" : "non-emptiable");
|
||||
|
||||
/* For normal repeat, our optional repeats each have an epsilon at the end
|
||||
* of their firsts lists.
|
||||
*/
|
||||
for (u32 i = m_min; i < m_firsts.size();i++) {
|
||||
m_firsts[i].insert(m_firsts[i].end(), GlushkovBuildState::POS_EPSILON);
|
||||
}
|
||||
|
||||
firsts_cache.clear();
|
||||
if (!m_max) {
|
||||
return;
|
||||
}
|
||||
|
||||
assert(!m_firsts.empty()); // notePositions should already have run
|
||||
const vector<PositionInfo> &f = m_firsts.front();
|
||||
|
||||
// If we're running without edge priorities, then we want to generate the
|
||||
// repeat in such a way that the firsts do all the work. This will minimise
|
||||
// the number of exceptional states in a LimEx NFA implementation.
|
||||
|
||||
if (!m_min || sub_comp->empty()) {
|
||||
// Emptiable: all our repeats contribute to firsts.
|
||||
// Each repeat's firsts is spliced in at the location of the epsilon
|
||||
// (if any) in the previous repeat's firsts.
|
||||
for (const auto &e : m_firsts) {
|
||||
replaceEpsilons(firsts_cache, e);
|
||||
}
|
||||
} else {
|
||||
// Not emptiable: firsts come from our first repeat only.
|
||||
firsts_cache.insert(firsts_cache.end(), f.begin(), f.end());
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
bool hasPositionFlags(const Component &c) {
|
||||
for (const auto &e : c.first()) {
|
||||
if (e.flags) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void ComponentRepeat::postSubNotePositionHook() {
|
||||
// UE-444 optimization: we can REWRITE m_min under various circumstances,
|
||||
// so that we create smaller NFA graphs. Note that this is _not_ possible
|
||||
// if our subcomponent contains a flagged position, e.g. nofloat.
|
||||
if (!hasPositionFlags(*sub_comp) && sub_comp->empty()) {
|
||||
m_min = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/** Factory: builds a ComponentRepeat that takes ownership of \p sub_comp,
 * forwarding the bounds and repeat type to the constructor. */
unique_ptr<ComponentRepeat> makeComponentRepeat(unique_ptr<Component> sub_comp,
                                                u32 min, u32 max,
                                                ComponentRepeat::RepeatType t) {
    unique_ptr<ComponentRepeat> repeat =
        ue2::make_unique<ComponentRepeat>(std::move(sub_comp), min, max, t);
    return repeat;
}
|
||||
|
||||
} // namespace ue2
|
146
src/parser/ComponentRepeat.h
Normal file
146
src/parser/ComponentRepeat.h
Normal file
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Repeats ('*', '+', '?', '{M,N}', etc)
|
||||
*/
|
||||
|
||||
#ifndef RE_COMPONENTREPEAT_H
|
||||
#define RE_COMPONENTREPEAT_H
|
||||
|
||||
#include "Component.h"
|
||||
#include "position.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Encapsulates a repeat of a subexpression ('*', '+', '?', '{M,N}',
|
||||
* etc).
|
||||
*
|
||||
* Ascii Art Time:
|
||||
*
|
||||
* Our standard representation of standard repeats. Other constructions (fan-in
|
||||
* vs fan-out) would also be possible and equivalent for our purposes.
|
||||
*
|
||||
* {n,m}
|
||||
*
|
||||
* S->M->M->M->O->O->O->T
|
||||
* | ^ ^ ^
|
||||
* | | | |
|
||||
* \-----------/
|
||||
*
|
||||
* {0,m}
|
||||
*
|
||||
* S->O->O->O->T
|
||||
* | ^ ^ ^
|
||||
* | | | |
|
||||
* \-----------/
|
||||
*
|
||||
*/
|
||||
|
||||
class ComponentRepeat : public Component {
|
||||
friend class ConstructLiteralVisitor;
|
||||
friend class DumpVisitor;
|
||||
friend class PrintVisitor;
|
||||
friend class SimplifyVisitor;
|
||||
public:
|
||||
/** \brief Value representing no maximum bound. */
|
||||
static constexpr u32 NoLimit = 0xffffffff;
|
||||
|
||||
/** \brief Type of this repeat, characterising its
|
||||
* greediness/possessiveness. */
|
||||
enum RepeatType {
|
||||
/** Minimising repeat, like 'a*?'. */
|
||||
REPEAT_NONGREEDY,
|
||||
/** Maximising repeat, like 'a*'. This is the default in PCRE. */
|
||||
REPEAT_GREEDY,
|
||||
/** Possessive, maximising repeat, like 'a*+'. Possessive repeats are
|
||||
* only currently supported in prefiltering mode, where we treat them
|
||||
* the same way we treat normal greedy repeats. */
|
||||
REPEAT_POSSESSIVE,
|
||||
};
|
||||
|
||||
ComponentRepeat(std::unique_ptr<Component> sub_comp, u32 min, u32 max,
|
||||
RepeatType t);
|
||||
~ComponentRepeat() override;
|
||||
ComponentRepeat *clone() const override;
|
||||
Component *accept(ComponentVisitor &v) override;
|
||||
void accept(ConstComponentVisitor &v) const override;
|
||||
|
||||
std::vector<PositionInfo> first() const override;
|
||||
std::vector<PositionInfo> last() const override;
|
||||
bool empty() const override;
|
||||
bool repeatable() const override;
|
||||
bool vacuous_everywhere() const override;
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos) override;
|
||||
bool checkEmbeddedStartAnchor(bool at_start) const override;
|
||||
bool checkEmbeddedEndAnchor(bool at_end) const override;
|
||||
|
||||
void optimise(bool connected_to_sds) override;
|
||||
|
||||
virtual std::pair<u32, u32> getBounds() const {
|
||||
return std::make_pair(m_min, m_max);
|
||||
}
|
||||
|
||||
/** \brief From declared behaviour (not taking into account the
|
||||
* sub-component). */
|
||||
enum RepeatType type;
|
||||
|
||||
protected:
|
||||
/** Called by \ref buildFollowSet to connect up the various repeats. */
|
||||
void precalc_firsts();
|
||||
void postSubNotePositionHook();
|
||||
void wireRepeats(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos);
|
||||
|
||||
std::unique_ptr<Component> sub_comp;
|
||||
u32 m_min;
|
||||
u32 m_max;
|
||||
|
||||
std::vector<std::vector<PositionInfo> > m_firsts;
|
||||
std::vector<std::vector<PositionInfo> > m_lasts;
|
||||
Position posFirst;
|
||||
Position posLast;
|
||||
|
||||
std::vector<PositionInfo> firsts_cache;
|
||||
|
||||
ComponentRepeat(const ComponentRepeat &other);
|
||||
};
|
||||
|
||||
std::unique_ptr<ComponentRepeat>
|
||||
makeComponentRepeat(std::unique_ptr<Component> sub_comp, u32 min, u32 max,
|
||||
ComponentRepeat::RepeatType t);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // RE_COMPONENTREPEAT_H
|
376
src/parser/ComponentSequence.cpp
Normal file
376
src/parser/ComponentSequence.cpp
Normal file
@@ -0,0 +1,376 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Sequence of Component objects.
|
||||
*/
|
||||
|
||||
|
||||
#include "ComponentSequence.h"
|
||||
|
||||
#include "buildstate.h"
|
||||
#include "ComponentAlternation.h"
|
||||
#include "ComponentRepeat.h"
|
||||
#include "Parser.h"
|
||||
#include "ue2common.h"
|
||||
#include "parse_error.h"
|
||||
#include "position_dump.h"
|
||||
#include "position_info.h"
|
||||
#include "nfagraph/ng_builder.h"
|
||||
#include "util/container.h"
|
||||
#include "util/make_unique.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
// A freshly constructed sequence is marked as non-capturing.
ComponentSequence::ComponentSequence()
    : capture_index(NOT_CAPTURED) {
}

// Children and any pending alternation are released by their unique_ptr
// owners; there is nothing to do by hand.
ComponentSequence::~ComponentSequence() = default;
|
||||
|
||||
/** \brief Deep-copying constructor, used by clone().
 *
 * Copies the capture metadata (index and name) and clones every child, plus
 * the pending alternation if the source sequence has not been finalized yet.
 */
ComponentSequence::ComponentSequence(const ComponentSequence &other)
    : Component(other), capture_index(other.capture_index),
      capture_name(other.capture_name) {
    // Note: capture_name must be copied explicitly; a member left out of the
    // initialiser list is default-constructed, which would silently drop the
    // name of a named capture group on clone().

    // Deep copy children. The final size is known, so allocate once.
    children.reserve(other.children.size());
    for (const auto &c : other.children) {
        assert(c);
        children.push_back(unique_ptr<Component>(c->clone()));
    }

    // A sequence that is still being parsed may carry an alternation that has
    // not been folded into its children yet; preserve it as well.
    if (other.alternation) {
        const ComponentAlternation &alt = *other.alternation;
        alternation.reset(alt.clone());
    }
}
|
||||
|
||||
/** \brief Returns a newly allocated deep copy of this sequence; the caller
 * takes ownership of the returned pointer. */
ComponentSequence *ComponentSequence::clone() const {
    auto *copy = new ComponentSequence(*this);
    return copy;
}
|
||||
|
||||
/** \brief Mutating visitor entry point.
 *
 * Visits this sequence, then each child in order. A child whose accept()
 * returns a different pointer is replaced by it; a null result removes the
 * child. Returns whatever the visitor returned for this sequence, or \c this
 * if it was left in place.
 */
Component *ComponentSequence::accept(ComponentVisitor &v) {
    assert(!alternation); // Sequence must be finalized first.

    Component *replacement = v.visit(this);
    if (replacement != this) {
        // We are being replaced (or removed): skip the children entirely.
        v.post(this);
        return replacement;
    }

    for (auto &slot : children) {
        Component *before = slot.get();
        Component *after = slot->accept(v);
        if (after != before) {
            // Child has been replaced (new Component pointer) or we've been
            // instructed to delete it (null).
            slot.reset(after);
        }
    }

    // Compact away the slots of deleted children.
    auto firstDead = remove(children.begin(), children.end(), nullptr);
    children.erase(firstDead, children.end());

    v.post(this);
    return this;
}
|
||||
|
||||
/** \brief Read-only visitor entry point: pre(), then every child with a
 * during() call between consecutive children, then post(). */
void ComponentSequence::accept(ConstComponentVisitor &v) const {
    assert(!alternation); // Sequence must be finalized first.

    v.pre(*this);

    const size_t count = children.size();
    for (size_t idx = 0; idx < count; ++idx) {
        children[idx]->accept(v);

        // No during() after the final child.
        if (idx + 1 != count) {
            v.during(*this);
        }
    }

    v.post(*this);
}
|
||||
|
||||
/** \brief Appends \p comp to the end of the sequence, taking ownership. */
void ComponentSequence::addComponent(unique_ptr<Component> comp) {
    children.emplace_back(move(comp));
}
|
||||
|
||||
/** \brief Wraps the most recently added child in a repeat with the given
 * bounds and type.
 *
 * Returns false (leaving the sequence untouched) if there is no child to
 * repeat, if the bounds are invalid (min > max, or max == 0), or if the last
 * child reports that it is not repeatable.
 */
bool ComponentSequence::addRepeat(u32 min, u32 max,
                                  ComponentRepeat::RepeatType type) {
    if (children.empty()) {
        return false;
    }
    if (min > max || max == 0) {
        return false;
    }

    unique_ptr<Component> &tail = children.back();

    // We can't apply a repeat to some types of component.
    assert(tail);
    if (!tail->repeatable()) {
        return false;
    }

    // Replace the last child with a repeat that owns the old last child.
    tail = makeComponentRepeat(move(tail), min, max, type);
    assert(tail);
    return true;
}
|
||||
|
||||
void ComponentSequence::addAlternation() {
|
||||
if (!alternation) {
|
||||
alternation = ue2::make_unique<ComponentAlternation>();
|
||||
}
|
||||
|
||||
auto seq = ue2::make_unique<ComponentSequence>();
|
||||
seq->children.swap(children);
|
||||
alternation->append(move(seq));
|
||||
}
|
||||
|
||||
void ComponentSequence::finalize() {
|
||||
if (alternation) {
|
||||
addAlternation();
|
||||
assert(children.empty());
|
||||
children.push_back(move(alternation));
|
||||
alternation = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Computes the first set of the sequence.
 *
 * Children are folded in from the front via replaceEpsilons(), stopping after
 * the first child that cannot be empty. A sequence with no children yields a
 * single epsilon position.
 */
vector<PositionInfo> ComponentSequence::first() const {
    vector<PositionInfo> result;

    for (auto it = children.begin(); it != children.end(); ++it) {
        const Component &child = **it;
        const vector<PositionInfo> childFirsts = child.first();
        replaceEpsilons(result, childFirsts);
        if (!child.empty()) {
            // Nothing after a non-emptiable child can be a first position.
            break;
        }
    }

    if (result.empty()) {
        DEBUG_PRINTF("trivial empty sequence %zu\n", result.size());
        assert(children.empty());
        result.push_back(GlushkovBuildState::POS_EPSILON);
    }

    DEBUG_PRINTF("%zu firsts\n", result.size());
    return result;
}
|
||||
|
||||
namespace {
|
||||
struct eps_info {
|
||||
eps_info() : flags(0U) {}
|
||||
u32 flags;
|
||||
};
|
||||
}
|
||||
|
||||
static
|
||||
void epsilonVisit(vector<eps_info> *info, const vector<PositionInfo> &f) {
|
||||
vector<eps_info> out;
|
||||
out.reserve(info->size());
|
||||
|
||||
set<u32> seen_flags;
|
||||
|
||||
assert(!info->empty());
|
||||
for (auto eps = find(f.begin(), f.end(), GlushkovBuildState::POS_EPSILON);
|
||||
eps != f.end();
|
||||
eps = find(eps + 1, f.end(), GlushkovBuildState::POS_EPSILON)) {
|
||||
for (auto it = info->begin(); it != info->end(); ++it) {
|
||||
u32 flags = it->flags | eps->flags;
|
||||
if (contains(seen_flags, flags)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
out.push_back(*it);
|
||||
out.back().flags = flags;
|
||||
seen_flags.insert(flags);
|
||||
}
|
||||
}
|
||||
|
||||
info->swap(out);
|
||||
assert(!info->empty());
|
||||
}
|
||||
|
||||
static
|
||||
void applyEpsilonVisits(vector<PositionInfo> &lasts,
|
||||
const vector<eps_info> &eps_visits) {
|
||||
vector<PositionInfo> out;
|
||||
out.reserve(lasts.size() * eps_visits.size());
|
||||
|
||||
for (const auto &last : lasts) {
|
||||
for (const auto &e : eps_visits) {
|
||||
out.push_back(last);
|
||||
out.back().flags |= e.flags;
|
||||
}
|
||||
}
|
||||
|
||||
cleanupPositions(out);
|
||||
lasts.swap(out);
|
||||
}
|
||||
|
||||
/** \brief Computes the last set of the sequence.
 *
 * Children are walked from the back. Each child's lasts are collected (with
 * the flags of any epsilons already skipped over applied to them); the walk
 * stops after the first child that cannot be empty.
 */
vector<PositionInfo> ComponentSequence::last() const {
    vector<PositionInfo> result;
    vector<eps_info> visits(1);

    for (size_t idx = children.size(); idx > 0; --idx) {
        const Component &child = *children[idx - 1];

        vector<PositionInfo> childLasts = child.last();
        applyEpsilonVisits(childLasts, visits);
        result.insert(result.end(), childLasts.begin(), childLasts.end());

        if (!child.empty()) {
            break;
        }

        // this epsilon's flags should propagate to subsequent lasts'
        // enter/exit lists
        epsilonVisit(&visits, child.first());
    }

    DEBUG_PRINTF("lasts = %s\n",
                 dumpPositions(result.begin(), result.end()).c_str());
    return result;
}
|
||||
|
||||
bool ComponentSequence::empty(void) const {
|
||||
// a sequence can be empty if all its subcomponents can be empty
|
||||
for (const auto &c : children) {
|
||||
if (!c->empty()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/** \brief Lets every child note its positions, then passes the builder's
 * vertex count from before and after that pass to recordPosBounds(). */
void ComponentSequence::notePositions(GlushkovBuildState &bs) {
    const u32 before = bs.getBuilder().numVertices();

    for (auto it = children.begin(); it != children.end(); ++it) {
        (*it)->notePositions(bs);
    }

    const auto after = bs.getBuilder().numVertices();
    recordPosBounds(before, after);
}
|
||||
|
||||
/** \brief Builds the follow set for the sequence.
 *
 * Each child builds its own follow set, and the lasts of what precedes a
 * child are connected to that child's firsts. \p lastPos is handed to the
 * first child only; connecting this sequence's overall firsts and lasts to
 * its surroundings is left to the parent component.
 */
void ComponentSequence::buildFollowSet(GlushkovBuildState &bs,
                                       const vector<PositionInfo> &lastPos) {
    DEBUG_PRINTF("sequence of %zu components\n", children.size());

    // If no components, no work to do.
    if (children.empty()) {
        return;
    }

    // First element
    children.front()->buildFollowSet(bs, lastPos);

    // If our sequence contains precisely one component, then we've done all
    // our work. Hooking up its firsts and lasts will be done by our parent
    // component.
    if (children.size() == 1) {
        return;
    }

    // Remaining elements, wiring last to first in sequence.
    vector<PositionInfo> prevLasts = children.front()->last();

    auto it = children.begin() + 1;
    const auto ite = children.end();
    for (; it != ite; ++it) {
        assert(*it);
        Component &curr = **it;

        // Build subcomponent follow set
        curr.buildFollowSet(bs, prevLasts);

        // FIRST(curr)
        vector<PositionInfo> currFirsts(curr.first());

        // LAST(prev) => FIRST(curr)
        DEBUG_PRINTF("connecting lasts (|| %zu) to firsts of comp %zd\n",
                     prevLasts.size(), it - children.begin());
        bs.connectRegions(prevLasts, currFirsts);

        // Generate a new LAST(prev) for the next iteration: curr's lasts
        // unioned with the previous lasts if curr can be empty, otherwise
        // curr's lasts on their own.
        vector<PositionInfo> currLasts(curr.last());

        if (curr.empty()) {
            // Add current lasts to previous lasts
            DEBUG_PRINTF("doing stuff for empty comp\n");
            prevLasts.insert(prevLasts.end(), currLasts.begin(),
                             currLasts.end());
            DEBUG_PRINTF("done stuff for empty comp\n");
        } else {
            // Current component can't be empty, so use its lasts only
            currLasts.swap(prevLasts);
            DEBUG_PRINTF("swapped lasts\n");
        }
    }
}
|
||||
|
||||
bool ComponentSequence::checkEmbeddedStartAnchor(bool at_start) const {
|
||||
for (const auto &c : children) {
|
||||
at_start = c->checkEmbeddedStartAnchor(at_start);
|
||||
}
|
||||
|
||||
return at_start;
|
||||
}
|
||||
|
||||
bool ComponentSequence::checkEmbeddedEndAnchor(bool at_end) const {
|
||||
// Note reversed ordering.
|
||||
for (auto i = children.rbegin(), e = children.rend(); i != e; ++i) {
|
||||
at_end = (*i)->checkEmbeddedEndAnchor(at_end);
|
||||
}
|
||||
|
||||
return at_end;
|
||||
}
|
||||
|
||||
bool ComponentSequence::vacuous_everywhere() const {
|
||||
for (const auto &c : children) {
|
||||
if (!c->vacuous_everywhere()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void ComponentSequence::optimise(bool connected_to_sds) {
|
||||
DEBUG_PRINTF("opt %d\n", (int)connected_to_sds);
|
||||
for (u32 i = 0; i < children.size();) {
|
||||
DEBUG_PRINTF("opt %u: ctsds: %d\n", i, (int)connected_to_sds);
|
||||
Component &sub = *children[i];
|
||||
|
||||
sub.optimise(connected_to_sds);
|
||||
|
||||
bool vacuous = sub.vacuous_everywhere();
|
||||
|
||||
if (connected_to_sds && vacuous) {
|
||||
DEBUG_PRINTF("delete opt %u\n", i);
|
||||
auto it = children.begin() + i;
|
||||
children.erase(it);
|
||||
continue;
|
||||
}
|
||||
|
||||
connected_to_sds = connected_to_sds && vacuous;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ue2
|
108
src/parser/ComponentSequence.h
Normal file
108
src/parser/ComponentSequence.h
Normal file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Sequence of Component objects.
|
||||
*/
|
||||
|
||||
#ifndef COMPONENT_SEQUENCE_H
|
||||
#define COMPONENT_SEQUENCE_H
|
||||
|
||||
#include "Component.h"
|
||||
#include "ComponentRepeat.h" // for ComponentRepeat::RepeatType
|
||||
#include "ue2common.h"
|
||||
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class ComponentAlternation;
|
||||
class GlushkovBuildState;
|
||||
|
||||
// Encapsulates a number of sub expressions to be applied sequentially
|
||||
class ComponentSequence : public Component {
|
||||
friend class DumpVisitor;
|
||||
friend class PrintVisitor;
|
||||
friend class SimplifyVisitor;
|
||||
public:
|
||||
/** \brief capture index representing a sequence that ISN'T capturing */
|
||||
static constexpr unsigned int NOT_CAPTURED = 65536;
|
||||
|
||||
ComponentSequence();
|
||||
~ComponentSequence() override;
|
||||
ComponentSequence *clone() const override;
|
||||
Component *accept(ComponentVisitor &v) override;
|
||||
void accept(ConstComponentVisitor &v) const override;
|
||||
|
||||
bool addRepeat(u32 min, u32 max, ComponentRepeat::RepeatType type);
|
||||
|
||||
// overridden by ComponentCondReference, which can only have 1 or 2
|
||||
// branches.
|
||||
virtual void addAlternation();
|
||||
|
||||
virtual void finalize();
|
||||
|
||||
void addComponent(std::unique_ptr<Component> comp);
|
||||
|
||||
std::vector<PositionInfo> first() const override;
|
||||
std::vector<PositionInfo> last() const override;
|
||||
bool empty(void) const override;
|
||||
bool vacuous_everywhere() const override;
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos) override;
|
||||
bool checkEmbeddedStartAnchor(bool at_start) const override;
|
||||
bool checkEmbeddedEndAnchor(bool at_end) const override;
|
||||
|
||||
void optimise(bool connected_to_sds) override;
|
||||
|
||||
void setCaptureIndex(unsigned int idx) { capture_index = idx; }
|
||||
unsigned int getCaptureIndex() const { return capture_index; }
|
||||
void setCaptureName(const std::string &s) { capture_name = s; }
|
||||
const std::string &getCaptureName() const { return capture_name; }
|
||||
|
||||
virtual const std::vector<std::unique_ptr<Component>> &getChildren() const {
|
||||
return children;
|
||||
}
|
||||
|
||||
protected:
|
||||
ComponentSequence(const ComponentSequence &other);
|
||||
|
||||
std::vector<std::unique_ptr<Component>> children;
|
||||
std::unique_ptr<ComponentAlternation> alternation;
|
||||
|
||||
private:
|
||||
unsigned int capture_index;
|
||||
std::string capture_name; //!< empty means no name
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
76
src/parser/ComponentVisitor.cpp
Normal file
76
src/parser/ComponentVisitor.cpp
Normal file
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "AsciiComponentClass.h"
|
||||
#include "ComponentVisitor.h"
|
||||
#include "ComponentAlternation.h"
|
||||
#include "ComponentAssertion.h"
|
||||
#include "ComponentAtomicGroup.h"
|
||||
#include "ComponentBackReference.h"
|
||||
#include "ComponentBoundary.h"
|
||||
#include "ComponentByte.h"
|
||||
#include "ComponentCondReference.h"
|
||||
#include "ComponentClass.h"
|
||||
#include "ComponentEmpty.h"
|
||||
#include "ComponentEUS.h"
|
||||
#include "ComponentRepeat.h"
|
||||
#include "ComponentSequence.h"
|
||||
#include "ComponentWordBoundary.h"
|
||||
#include "Utf8ComponentClass.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
// Defined out of line; the destructor body is empty.
ComponentVisitor::~ComponentVisitor() = default;
|
||||
|
||||
// Default implementations.
|
||||
|
||||
// Construction and destruction of the default visitor require no work.
DefaultComponentVisitor::DefaultComponentVisitor() = default;

DefaultComponentVisitor::~DefaultComponentVisitor() = default;
|
||||
|
||||
#define DEFAULT_FUNCS(comp) \
|
||||
Component *DefaultComponentVisitor::visit(comp *c) { return c; } \
|
||||
void DefaultComponentVisitor::post(comp *) {}
|
||||
|
||||
DEFAULT_FUNCS(AsciiComponentClass)
|
||||
DEFAULT_FUNCS(ComponentAlternation)
|
||||
DEFAULT_FUNCS(ComponentAssertion)
|
||||
DEFAULT_FUNCS(ComponentAtomicGroup)
|
||||
DEFAULT_FUNCS(ComponentBackReference)
|
||||
DEFAULT_FUNCS(ComponentBoundary)
|
||||
DEFAULT_FUNCS(ComponentByte)
|
||||
DEFAULT_FUNCS(ComponentCondReference)
|
||||
DEFAULT_FUNCS(ComponentEmpty)
|
||||
DEFAULT_FUNCS(ComponentEUS)
|
||||
DEFAULT_FUNCS(ComponentRepeat)
|
||||
DEFAULT_FUNCS(ComponentSequence)
|
||||
DEFAULT_FUNCS(ComponentWordBoundary)
|
||||
DEFAULT_FUNCS(UTF8ComponentClass)
|
||||
|
||||
} // namespace ue2
|
150
src/parser/ComponentVisitor.h
Normal file
150
src/parser/ComponentVisitor.h
Normal file
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Visitor base class for working with the component tree.
|
||||
*/
|
||||
|
||||
#ifndef COMPONENTVISITOR_H
|
||||
#define COMPONENTVISITOR_H
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class AsciiComponentClass;
|
||||
class Component;
|
||||
class ComponentAlternation;
|
||||
class ComponentAssertion;
|
||||
class ComponentAtomicGroup;
|
||||
class ComponentBackReference;
|
||||
class ComponentBoundary;
|
||||
class ComponentByte;
|
||||
class ComponentClass;
|
||||
class ComponentCondReference;
|
||||
class ComponentEmpty;
|
||||
class ComponentEUS;
|
||||
class ComponentRepeat;
|
||||
class ComponentSequence;
|
||||
class ComponentWordBoundary;
|
||||
class UTF8ComponentClass;
|
||||
|
||||
/**
|
||||
* \brief Visitor base class for working with the component tree.
|
||||
*
|
||||
* Our approach to implementing the visitor pattern for traversing (and
|
||||
* optionally mutating) the Component tree for a pattern. Each _visit_ function
|
||||
* takes a Component subclass pointer in and returns a Component pointer. That
|
||||
* pointer can have several values, dictating what the containing Component
|
||||
* should do:
|
||||
*
|
||||
* 1. If ptr == c, then do nothing.
|
||||
* 2. If ptr == nullptr, then remove c from the tree.
|
||||
* 3. If ptr != c && ptr != nullptr, then replace c with ptr.
|
||||
*
|
||||
* Traversal order is pre-order.
|
||||
*
|
||||
* After a Component's subcomponents have been visited, the _post_ function for
|
||||
* that Component will be called.
|
||||
*/
|
||||
class ComponentVisitor {
|
||||
public:
|
||||
virtual ~ComponentVisitor();
|
||||
|
||||
virtual Component *visit(AsciiComponentClass *c) = 0;
|
||||
virtual Component *visit(ComponentAlternation *c) = 0;
|
||||
virtual Component *visit(ComponentAssertion *c) = 0;
|
||||
virtual Component *visit(ComponentAtomicGroup *c) = 0;
|
||||
virtual Component *visit(ComponentBackReference *c) = 0;
|
||||
virtual Component *visit(ComponentBoundary *c) = 0;
|
||||
virtual Component *visit(ComponentByte *c) = 0;
|
||||
virtual Component *visit(ComponentCondReference *c) = 0;
|
||||
virtual Component *visit(ComponentEmpty *c) = 0;
|
||||
virtual Component *visit(ComponentEUS *c) = 0;
|
||||
virtual Component *visit(ComponentRepeat *c) = 0;
|
||||
virtual Component *visit(ComponentSequence *c) = 0;
|
||||
virtual Component *visit(ComponentWordBoundary *c) = 0;
|
||||
virtual Component *visit(UTF8ComponentClass *c) = 0;
|
||||
|
||||
virtual void post(AsciiComponentClass *c) = 0;
|
||||
virtual void post(ComponentAlternation *c) = 0;
|
||||
virtual void post(ComponentAssertion *c) = 0;
|
||||
virtual void post(ComponentAtomicGroup *c) = 0;
|
||||
virtual void post(ComponentBackReference *c) = 0;
|
||||
virtual void post(ComponentBoundary *c) = 0;
|
||||
virtual void post(ComponentByte *c) = 0;
|
||||
virtual void post(ComponentCondReference *c) = 0;
|
||||
virtual void post(ComponentEmpty *c) = 0;
|
||||
virtual void post(ComponentEUS *c) = 0;
|
||||
virtual void post(ComponentRepeat *c) = 0;
|
||||
virtual void post(ComponentSequence *c) = 0;
|
||||
virtual void post(ComponentWordBoundary *c) = 0;
|
||||
virtual void post(UTF8ComponentClass *c) = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief Concrete subclass of ComponentVisitor with default behaviour,
|
||||
* allowing you to just implement the member functions you need.
|
||||
*/
|
||||
class DefaultComponentVisitor : public ComponentVisitor {
|
||||
public:
|
||||
DefaultComponentVisitor();
|
||||
~DefaultComponentVisitor() override;
|
||||
|
||||
Component *visit(AsciiComponentClass *c) override;
|
||||
Component *visit(ComponentAlternation *c) override;
|
||||
Component *visit(ComponentAssertion *c) override;
|
||||
Component *visit(ComponentAtomicGroup *c) override;
|
||||
Component *visit(ComponentBackReference *c) override;
|
||||
Component *visit(ComponentBoundary *c) override;
|
||||
Component *visit(ComponentByte *c) override;
|
||||
Component *visit(ComponentCondReference *c) override;
|
||||
Component *visit(ComponentEmpty *c) override;
|
||||
Component *visit(ComponentEUS *c) override;
|
||||
Component *visit(ComponentRepeat *c) override;
|
||||
Component *visit(ComponentSequence *c) override;
|
||||
Component *visit(ComponentWordBoundary *c) override;
|
||||
Component *visit(UTF8ComponentClass *c) override;
|
||||
|
||||
void post(AsciiComponentClass *c) override;
|
||||
void post(ComponentAlternation *c) override;
|
||||
void post(ComponentAssertion *c) override;
|
||||
void post(ComponentAtomicGroup *c) override;
|
||||
void post(ComponentBackReference *c) override;
|
||||
void post(ComponentBoundary *c) override;
|
||||
void post(ComponentByte *c) override;
|
||||
void post(ComponentCondReference *c) override;
|
||||
void post(ComponentEmpty *c) override;
|
||||
void post(ComponentEUS *c) override;
|
||||
void post(ComponentRepeat *c) override;
|
||||
void post(ComponentSequence *c) override;
|
||||
void post(ComponentWordBoundary *c) override;
|
||||
void post(UTF8ComponentClass *c) override;
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // COMPONENTVISITOR_H
|
105
src/parser/ComponentWordBoundary.cpp
Normal file
105
src/parser/ComponentWordBoundary.cpp
Normal file
@@ -0,0 +1,105 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Word Boundary Assertion (\\b or \\B)
|
||||
*/
|
||||
#include "ComponentWordBoundary.h"
|
||||
#include "buildstate.h"
|
||||
#include "parse_error.h"
|
||||
#include "Parser.h"
|
||||
#include "position_info.h"
|
||||
#include "nfagraph/ng_builder.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Construct a word boundary assertion.
 *
 * \param loc_in Location in the pattern, kept for error reporting.
 * \param neg    True for the negated form (\\B), false for \\b.
 * \param mode   Parse mode in force; only its \c ucp flag is read.
 *
 * The position starts out as POS_UNINITIALIZED and the prefilter flag starts
 * out cleared.
 */
ComponentWordBoundary::ComponentWordBoundary(u32 loc_in, bool neg,
                                             const ParseMode &mode)
    : loc(loc_in),
      position(GlushkovBuildState::POS_UNINITIALIZED),
      negated(neg),
      ucp(mode.ucp),
      prefilter(false) {
}
|
||||
|
||||
/** \brief Destructor; there is nothing to release beyond the members. */
ComponentWordBoundary::~ComponentWordBoundary() = default;
|
||||
|
||||
/** \brief Return a heap-allocated copy of this component, made with the
 * copy constructor. The returned raw pointer comes from \c new. */
ComponentWordBoundary *ComponentWordBoundary::clone() const {
    const ComponentWordBoundary &self = *this;
    return new ComponentWordBoundary(self);
}
|
||||
|
||||
/** \brief Return the first-set of this component: a one-element vector
 * holding the component's own position. */
vector<PositionInfo> ComponentWordBoundary::first() const {
    vector<PositionInfo> result;
    result.emplace_back(position);
    return result;
}
|
||||
|
||||
/** \brief Return the last-set of this component, which is the same
 * one-element vector that first() produces. */
vector<PositionInfo> ComponentWordBoundary::last() const {
    vector<PositionInfo> lasts = first();
    return lasts;
}
|
||||
|
||||
/** \brief Always reports false for a word boundary.
 *
 * NOTE(review): the exact meaning of empty() is defined by the Component
 * base class, which is not visible here -- confirm before relying on it.
 */
bool ComponentWordBoundary::empty() const {
    return false;
}
|
||||
|
||||
/** \brief Always reports false for a word boundary.
 *
 * NOTE(review): presumably this tells the parser that a repeat may not be
 * applied to this component -- confirm against Component::repeatable().
 */
bool ComponentWordBoundary::repeatable() const {
    return false;
}
|
||||
|
||||
/** \brief Allocate this assertion's position and tag it with assert flags.
 *
 * One position is requested from the builder owned by \a bs. It is then
 * flagged with a pair of word/non-word transition assertions chosen by the
 * \c negated and \c ucp members, and finally its bounds are recorded as
 * [position, position + 1).
 */
void ComponentWordBoundary::notePositions(GlushkovBuildState &bs) {
    NFABuilder &nfa = bs.getBuilder();
    position = nfa.makePositions(1);

    if (!ucp) {
        // Non-UCP flavour of the assertion flags.
        nfa.setAssertFlag(position,
                          negated ? (POS_FLAG_ASSERT_WORD_TO_WORD
                                     | POS_FLAG_ASSERT_NONWORD_TO_NONWORD)
                                  : (POS_FLAG_ASSERT_WORD_TO_NONWORD
                                     | POS_FLAG_ASSERT_NONWORD_TO_WORD));
    } else {
        // The UCP flavour is only expected in prefiltering mode.
        assert(prefilter);
        nfa.setAssertFlag(position,
                          negated ? (POS_FLAG_ASSERT_WORD_TO_WORD_UCP
                                     | POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP)
                                  : (POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP
                                     | POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP));
    }

    recordPosBounds(position, position + 1);
}
|
||||
|
||||
/** \brief Intentionally a no-op: this component has no internal
 * connections to wire up, so both arguments are ignored. */
void ComponentWordBoundary::buildFollowSet(GlushkovBuildState &,
                                           const vector<PositionInfo> &) {
}
|
||||
|
||||
} // namespace ue2
|
90
src/parser/ComponentWordBoundary.h
Normal file
90
src/parser/ComponentWordBoundary.h
Normal file
@@ -0,0 +1,90 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Word Boundary Assertion (\\b or \\B)
|
||||
*/
|
||||
|
||||
#ifndef _RE_COMPONENTWORDBOUNDARY_H_
|
||||
#define _RE_COMPONENTWORDBOUNDARY_H_
|
||||
|
||||
#include "Component.h"
|
||||
#include "position.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
struct ParseMode;
|
||||
|
||||
/** \brief Encapsulates a positive (\\b) or negative (\\B) word boundary
|
||||
* assertion. */
|
||||
class ComponentWordBoundary : public Component {
|
||||
friend class DumpVisitor;
|
||||
friend class PrintVisitor;
|
||||
friend class UnsupportedVisitor;
|
||||
public:
|
||||
ComponentWordBoundary(u32 loc, bool negated, const ParseMode &mode);
|
||||
~ComponentWordBoundary() override;
|
||||
ComponentWordBoundary *clone() const override;
|
||||
|
||||
Component *accept(ComponentVisitor &v) override {
|
||||
Component *c = v.visit(this);
|
||||
v.post(this);
|
||||
return c;
|
||||
}
|
||||
|
||||
void accept(ConstComponentVisitor &v) const override {
|
||||
v.pre(*this);
|
||||
v.during(*this);
|
||||
v.post(*this);
|
||||
}
|
||||
|
||||
std::vector<PositionInfo> first() const override;
|
||||
std::vector<PositionInfo> last() const override;
|
||||
bool empty() const override;
|
||||
bool repeatable() const override;
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &lastPos) override;
|
||||
|
||||
void setPrefilter(bool p) { prefilter = p; }
|
||||
|
||||
private:
|
||||
u32 loc; //!< location in pattern for error reporting.
|
||||
Position position;
|
||||
bool negated;
|
||||
bool ucp;
|
||||
bool prefilter; //!< set by PrefilterVisitor, this is ugly
|
||||
|
||||
ComponentWordBoundary(const ComponentWordBoundary &other)
|
||||
: Component(other), loc(other.loc), position(other.position),
|
||||
negated(other.negated), ucp(other.ucp), prefilter(other.prefilter) {}
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
78
src/parser/ConstComponentVisitor.cpp
Normal file
78
src/parser/ConstComponentVisitor.cpp
Normal file
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "ConstComponentVisitor.h"
|
||||
|
||||
#include "AsciiComponentClass.h"
|
||||
#include "ComponentAlternation.h"
|
||||
#include "ComponentAssertion.h"
|
||||
#include "ComponentAtomicGroup.h"
|
||||
#include "ComponentBackReference.h"
|
||||
#include "ComponentBoundary.h"
|
||||
#include "ComponentByte.h"
|
||||
#include "ComponentCondReference.h"
|
||||
#include "ComponentClass.h"
|
||||
#include "ComponentEmpty.h"
|
||||
#include "ComponentEUS.h"
|
||||
#include "ComponentRepeat.h"
|
||||
#include "ComponentSequence.h"
|
||||
#include "ComponentWordBoundary.h"
|
||||
#include "Utf8ComponentClass.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Out-of-line virtual destructor; it has nothing to do. */
ConstComponentVisitor::~ConstComponentVisitor() = default;
|
||||
|
||||
// Default implementations.
|
||||
|
||||
/** \brief Out-of-line default constructor; no state to set up. */
DefaultConstComponentVisitor::DefaultConstComponentVisitor() = default;
|
||||
/** \brief Out-of-line destructor; no state to tear down. */
DefaultConstComponentVisitor::~DefaultConstComponentVisitor() = default;
|
||||
|
||||
#define DEFAULT_FUNCS(comp) \
|
||||
void DefaultConstComponentVisitor::pre(const comp &) {} \
|
||||
void DefaultConstComponentVisitor::during(const comp &) {} \
|
||||
void DefaultConstComponentVisitor::post(const comp &) {}
|
||||
|
||||
DEFAULT_FUNCS(AsciiComponentClass)
|
||||
DEFAULT_FUNCS(ComponentAlternation)
|
||||
DEFAULT_FUNCS(ComponentAssertion)
|
||||
DEFAULT_FUNCS(ComponentAtomicGroup)
|
||||
DEFAULT_FUNCS(ComponentBackReference)
|
||||
DEFAULT_FUNCS(ComponentBoundary)
|
||||
DEFAULT_FUNCS(ComponentByte)
|
||||
DEFAULT_FUNCS(ComponentCondReference)
|
||||
DEFAULT_FUNCS(ComponentEmpty)
|
||||
DEFAULT_FUNCS(ComponentEUS)
|
||||
DEFAULT_FUNCS(ComponentRepeat)
|
||||
DEFAULT_FUNCS(ComponentSequence)
|
||||
DEFAULT_FUNCS(ComponentWordBoundary)
|
||||
DEFAULT_FUNCS(UTF8ComponentClass)
|
||||
|
||||
} // namespace ue2
|
170
src/parser/ConstComponentVisitor.h
Normal file
170
src/parser/ConstComponentVisitor.h
Normal file
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Visitor base class for working with the component tree.
|
||||
*/
|
||||
|
||||
#ifndef CONSTCOMPONENTVISITOR_H
|
||||
#define CONSTCOMPONENTVISITOR_H
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class AsciiComponentClass;
|
||||
class Component;
|
||||
class ComponentAlternation;
|
||||
class ComponentAssertion;
|
||||
class ComponentAtomicGroup;
|
||||
class ComponentBackReference;
|
||||
class ComponentBoundary;
|
||||
class ComponentByte;
|
||||
class ComponentCondReference;
|
||||
class ComponentClass;
|
||||
class ComponentEmpty;
|
||||
class ComponentEUS;
|
||||
class ComponentRepeat;
|
||||
class ComponentSequence;
|
||||
class ComponentWordBoundary;
|
||||
class UTF8ComponentClass;
|
||||
|
||||
/**
|
||||
* \brief Visitor base class for traversing an immutable component tree.
|
||||
*
|
||||
* Our approach to implementing the visitor pattern for traversing the
|
||||
* Component tree for a pattern. This version operates on an immutable tree;
|
||||
* use \ref ComponentVisitor if you need to make changes to components during
|
||||
* traversal.
|
||||
*/
|
||||
class ConstComponentVisitor {
|
||||
public:
|
||||
virtual ~ConstComponentVisitor();
|
||||
|
||||
virtual void pre(const AsciiComponentClass &c) = 0;
|
||||
virtual void pre(const ComponentAlternation &c) = 0;
|
||||
virtual void pre(const ComponentAssertion &c) = 0;
|
||||
virtual void pre(const ComponentAtomicGroup &c) = 0;
|
||||
virtual void pre(const ComponentBackReference &c) = 0;
|
||||
virtual void pre(const ComponentBoundary &c) = 0;
|
||||
virtual void pre(const ComponentByte &c) = 0;
|
||||
virtual void pre(const ComponentCondReference &c) = 0;
|
||||
virtual void pre(const ComponentEmpty &c) = 0;
|
||||
virtual void pre(const ComponentEUS &c) = 0;
|
||||
virtual void pre(const ComponentRepeat &c) = 0;
|
||||
virtual void pre(const ComponentSequence &c) = 0;
|
||||
virtual void pre(const ComponentWordBoundary &c) = 0;
|
||||
virtual void pre(const UTF8ComponentClass &c) = 0;
|
||||
|
||||
virtual void during(const AsciiComponentClass &c) = 0;
|
||||
virtual void during(const ComponentAlternation &c) = 0;
|
||||
virtual void during(const ComponentAssertion &c) = 0;
|
||||
virtual void during(const ComponentAtomicGroup &c) = 0;
|
||||
virtual void during(const ComponentBackReference &c) = 0;
|
||||
virtual void during(const ComponentBoundary &c) = 0;
|
||||
virtual void during(const ComponentByte &c) = 0;
|
||||
virtual void during(const ComponentCondReference &c) = 0;
|
||||
virtual void during(const ComponentEmpty &c) = 0;
|
||||
virtual void during(const ComponentEUS &c) = 0;
|
||||
virtual void during(const ComponentRepeat &c) = 0;
|
||||
virtual void during(const ComponentSequence &c) = 0;
|
||||
virtual void during(const ComponentWordBoundary &c) = 0;
|
||||
virtual void during(const UTF8ComponentClass &c) = 0;
|
||||
|
||||
virtual void post(const AsciiComponentClass &c) = 0;
|
||||
virtual void post(const ComponentAlternation &c) = 0;
|
||||
virtual void post(const ComponentAssertion &c) = 0;
|
||||
virtual void post(const ComponentAtomicGroup &c) = 0;
|
||||
virtual void post(const ComponentBackReference &c) = 0;
|
||||
virtual void post(const ComponentBoundary &c) = 0;
|
||||
virtual void post(const ComponentByte &c) = 0;
|
||||
virtual void post(const ComponentCondReference &c) = 0;
|
||||
virtual void post(const ComponentEmpty &c) = 0;
|
||||
virtual void post(const ComponentEUS &c) = 0;
|
||||
virtual void post(const ComponentRepeat &c) = 0;
|
||||
virtual void post(const ComponentSequence &c) = 0;
|
||||
virtual void post(const ComponentWordBoundary &c) = 0;
|
||||
virtual void post(const UTF8ComponentClass &c) = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief Concrete subclass of ConstComponentVisitor with default behaviour,
|
||||
* allowing you to just implement the member functions you need.
|
||||
*/
|
||||
class DefaultConstComponentVisitor : public ConstComponentVisitor {
|
||||
public:
|
||||
DefaultConstComponentVisitor();
|
||||
~DefaultConstComponentVisitor() override;
|
||||
|
||||
void pre(const AsciiComponentClass &c) override;
|
||||
void pre(const ComponentAlternation &c) override;
|
||||
void pre(const ComponentAssertion &c) override;
|
||||
void pre(const ComponentAtomicGroup &c) override;
|
||||
void pre(const ComponentBackReference &c) override;
|
||||
void pre(const ComponentBoundary &c) override;
|
||||
void pre(const ComponentByte &c) override;
|
||||
void pre(const ComponentCondReference &c) override;
|
||||
void pre(const ComponentEmpty &c) override;
|
||||
void pre(const ComponentEUS &c) override;
|
||||
void pre(const ComponentRepeat &c) override;
|
||||
void pre(const ComponentSequence &c) override;
|
||||
void pre(const ComponentWordBoundary &c) override;
|
||||
void pre(const UTF8ComponentClass &c) override;
|
||||
|
||||
void during(const AsciiComponentClass &c) override;
|
||||
void during(const ComponentAlternation &c) override;
|
||||
void during(const ComponentAssertion &c) override;
|
||||
void during(const ComponentAtomicGroup &c) override;
|
||||
void during(const ComponentBackReference &c) override;
|
||||
void during(const ComponentBoundary &c) override;
|
||||
void during(const ComponentByte &c) override;
|
||||
void during(const ComponentCondReference &c) override;
|
||||
void during(const ComponentEmpty &c) override;
|
||||
void during(const ComponentEUS &c) override;
|
||||
void during(const ComponentRepeat &c) override;
|
||||
void during(const ComponentSequence &c) override;
|
||||
void during(const ComponentWordBoundary &c) override;
|
||||
void during(const UTF8ComponentClass &c) override;
|
||||
|
||||
void post(const AsciiComponentClass &c) override;
|
||||
void post(const ComponentAlternation &c) override;
|
||||
void post(const ComponentAssertion &c) override;
|
||||
void post(const ComponentAtomicGroup &c) override;
|
||||
void post(const ComponentBackReference &c) override;
|
||||
void post(const ComponentBoundary &c) override;
|
||||
void post(const ComponentByte &c) override;
|
||||
void post(const ComponentCondReference &c) override;
|
||||
void post(const ComponentEmpty &c) override;
|
||||
void post(const ComponentEUS &c) override;
|
||||
void post(const ComponentRepeat &c) override;
|
||||
void post(const ComponentSequence &c) override;
|
||||
void post(const ComponentWordBoundary &c) override;
|
||||
void post(const UTF8ComponentClass &c) override;
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // CONSTCOMPONENTVISITOR_H
|
76
src/parser/Parser.h
Normal file
76
src/parser/Parser.h
Normal file
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Interface to Parser.
|
||||
*/
|
||||
|
||||
#ifndef _RE_PARSER_H_
|
||||
#define _RE_PARSER_H_
|
||||
|
||||
#include "ue2common.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class Component;
|
||||
|
||||
/** \brief Represents the current "mode flags" at any point in the parsing
|
||||
* process.
|
||||
*
|
||||
* This is necessary as some modes can be changed part-way through an
|
||||
* expression, such as in:
|
||||
*
|
||||
* /foo(?i)bar/
|
||||
*/
|
||||
struct ParseMode {
|
||||
ParseMode() {}
|
||||
explicit ParseMode(u32 hs_flags);
|
||||
|
||||
bool caseless = false;
|
||||
bool dotall = false;
|
||||
bool ignore_space = false;
|
||||
bool multiline = false;
|
||||
bool ucp = false;
|
||||
bool utf8 = false;
|
||||
};
|
||||
|
||||
/** \brief Parse the given regular expression into a \ref Component tree.
|
||||
*
|
||||
* The \a mode parameter should contain the initial mode flags, and will be
|
||||
* updated by the parser if additional global flags are introduced in the
|
||||
* expression (for example, via "(*UTF8)".)
|
||||
*
|
||||
* This call will throw a ParseError on failure.
|
||||
*/
|
||||
std::unique_ptr<Component> parse(const char *const ptr, ParseMode &mode);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // _RE_PARSER_H_
|
1964
src/parser/Parser.rl
Normal file
1964
src/parser/Parser.rl
Normal file
File diff suppressed because it is too large
Load Diff
1184
src/parser/Utf8ComponentClass.cpp
Normal file
1184
src/parser/Utf8ComponentClass.cpp
Normal file
File diff suppressed because it is too large
Load Diff
115
src/parser/Utf8ComponentClass.h
Normal file
115
src/parser/Utf8ComponentClass.h
Normal file
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Character class in UTF-8 mode.
|
||||
*/
|
||||
|
||||
#ifndef UTF8_COMPONENT_CLASS_H
|
||||
#define UTF8_COMPONENT_CLASS_H
|
||||
|
||||
#include "ComponentClass.h"
|
||||
#include "ue2common.h"
|
||||
#include "util/unicode_set.h"
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class UTF8ComponentClass : public ComponentClass {
|
||||
friend class DumpVisitor;
|
||||
friend class PrintVisitor;
|
||||
friend class CaselessVisitor;
|
||||
friend class SimplifyVisitor;
|
||||
friend class SimplifyCandidatesVisitor;
|
||||
public:
|
||||
explicit UTF8ComponentClass(const ParseMode &mode);
|
||||
~UTF8ComponentClass() override {}
|
||||
UTF8ComponentClass *clone() const override;
|
||||
|
||||
Component *accept(ComponentVisitor &v) override {
|
||||
Component *c = v.visit(this);
|
||||
v.post(this);
|
||||
return c;
|
||||
}
|
||||
|
||||
void accept(ConstComponentVisitor &v) const override {
|
||||
v.pre(*this);
|
||||
v.during(*this);
|
||||
v.post(*this);
|
||||
}
|
||||
|
||||
bool class_empty(void) const override;
|
||||
void add(PredefinedClass c, bool negative) override;
|
||||
void add(unichar c) override;
|
||||
void finalize(void) override;
|
||||
void notePositions(GlushkovBuildState &bs) override;
|
||||
void buildFollowSet(GlushkovBuildState &bs,
|
||||
const std::vector<PositionInfo> &) override;
|
||||
std::vector<PositionInfo> first(void) const override;
|
||||
std::vector<PositionInfo> last(void) const override;
|
||||
|
||||
protected:
|
||||
void createRange(unichar to) override;
|
||||
|
||||
private:
|
||||
Position getHead(NFABuilder &builder, u8 first_byte);
|
||||
void addToTail(GlushkovBuildState &bs, std::map<Position, Position> &finals,
|
||||
Position prev, unichar b, unichar e);
|
||||
void ensureDotTrailer(GlushkovBuildState &bs);
|
||||
void ensureTwoDotTrailer(GlushkovBuildState &bs);
|
||||
void ensureThreeDotTrailer(GlushkovBuildState &bs);
|
||||
void buildOneByte(GlushkovBuildState &bs);
|
||||
void buildTwoByte(GlushkovBuildState &bs);
|
||||
void buildThreeByte(GlushkovBuildState &bs);
|
||||
void buildFourByte(GlushkovBuildState &bs);
|
||||
|
||||
CodePointSet cps;
|
||||
CodePointSet cps_ucp;
|
||||
|
||||
std::map<u8, Position> heads;
|
||||
Position single_pos;
|
||||
Position one_dot_trailer;
|
||||
Position two_dot_trailer;
|
||||
Position three_dot_trailer;
|
||||
|
||||
Position two_char_dot_head;
|
||||
Position three_char_dot_head;
|
||||
Position four_char_dot_head;
|
||||
std::set<Position> tails;
|
||||
};
|
||||
|
||||
/** \brief Map the predefined class \a in to a predefined class, taking the
 * parse mode \a mode into account.
 *
 * NOTE(review): presumably this substitutes the UCP variant of the class
 * when \c mode.ucp is set -- confirm against the definition, which is not
 * in this header. */
PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
|
||||
/** \brief Predicate on the predefined class \a c.
 *
 * NOTE(review): presumably true when \a c is one of the UCP classes --
 * confirm against the definition, which is not in this header. */
bool isUcp(PredefinedClass c);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // UTF8_COMPONENT_CLASS_H
|
527
src/parser/buildstate.cpp
Normal file
527
src/parser/buildstate.cpp
Normal file
@@ -0,0 +1,527 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Glushkov construction.
|
||||
*/
|
||||
#include "buildstate.h"
|
||||
#include "position.h"
|
||||
#include "position_dump.h"
|
||||
#include "position_info.h"
|
||||
#include "parse_error.h"
|
||||
#include "hs_internal.h"
|
||||
#include "ue2common.h"
|
||||
#include "nfagraph/ng_builder.h"
|
||||
#include "util/charreach.h"
|
||||
#include "util/container.h"
|
||||
#include "util/make_unique.h"
|
||||
#include "util/ue2_containers.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <utility>
|
||||
|
||||
#if defined(DEBUG) || defined(DUMP_SUPPORT)
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Represents an uninitialized state. */
|
||||
const Position GlushkovBuildState::POS_UNINITIALIZED =
|
||||
numeric_limits<Position>::max();
|
||||
|
||||
/** \brief Represents an epsilon transition in the firsts of a component. */
|
||||
const Position GlushkovBuildState::POS_EPSILON =
|
||||
numeric_limits<Position>::max() - 1;
|
||||
|
||||
/** \brief Out-of-line destructor for the interface class; it has nothing
 * to do. */
GlushkovBuildState::~GlushkovBuildState() = default;
|
||||
|
||||
namespace /* anonymous */ {
|
||||
|
||||
class CheckPositionFlags {
|
||||
public:
|
||||
explicit CheckPositionFlags(int fl) : flags(fl) {}
|
||||
bool operator()(const PositionInfo &p) const {
|
||||
return (p.flags & flags) == flags;
|
||||
}
|
||||
private:
|
||||
int flags;
|
||||
};
|
||||
|
||||
class CheckUnflaggedEpsilon {
|
||||
public:
|
||||
bool operator()(const PositionInfo &p) const {
|
||||
return p.pos == GlushkovBuildState::POS_EPSILON && p.flags == 0;
|
||||
}
|
||||
};
|
||||
|
||||
/** \brief Concrete impl of the GlushkovBuildState interface. */
|
||||
class GlushkovBuildStateImpl : public GlushkovBuildState {
|
||||
public:
|
||||
GlushkovBuildStateImpl(NFABuilder &b, bool prefilter);
|
||||
|
||||
/** \brief Returns a reference to the NFABuilder being used. */
|
||||
NFABuilder &getBuilder() override { return builder; }
|
||||
|
||||
/** \brief Returns a const reference to the NFABuilder being used. */
|
||||
const NFABuilder &getBuilder() const override { return builder; }
|
||||
|
||||
/** \brief Wire up the lasts of one component to the firsts of another. */
|
||||
void connectRegions(const vector<PositionInfo> &lasts,
|
||||
const vector<PositionInfo> &firsts) override;
|
||||
|
||||
/** \brief Wire the lasts of the main sequence to accepts. */
|
||||
void connectAccepts(const vector<PositionInfo> &lasts) override;
|
||||
|
||||
/** \brief Wire up a single last to a list of firsts. */
|
||||
void connectSuccessors(const PositionInfo &last,
|
||||
vector<PositionInfo> firsts);
|
||||
|
||||
/** Wire up a pair of positions. */
|
||||
void addSuccessor(Position from, Position to) override;
|
||||
|
||||
/** \brief Clone the vertex properties and edges of all vertices between
|
||||
* two positions. */
|
||||
void cloneFollowSet(Position from, Position to, unsigned offset) override;
|
||||
|
||||
/** \brief Build the prioritised list of edges out of our successor map. */
|
||||
void buildEdges() override;
|
||||
|
||||
/** Construct an edge, called internally by \ref buildEdges. */
|
||||
void buildEdge(Position from, const PositionInfo &to);
|
||||
|
||||
Position startState;
|
||||
Position startDotstarState;
|
||||
Position acceptState;
|
||||
Position acceptEodState;
|
||||
Position acceptNlEodState;
|
||||
Position acceptNlState;
|
||||
|
||||
NFABuilder &builder; //!< \brief builder for the NFAGraph
|
||||
|
||||
bool doPrefilter; //!< \brief we're building a prefiltering pattern
|
||||
|
||||
/** \brief Map storing successors for each position. */
|
||||
map<Position, flat_set<PositionInfo>> successors;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
/** \brief Capture the builder's special positions and wire up the start
 * states.
 *
 * \param b          builder that owns the graph being constructed
 * \param prefilter  stored in doPrefilter
 */
GlushkovBuildStateImpl::GlushkovBuildStateImpl(NFABuilder &b, bool prefilter)
    : startState(b.getStart()),
      startDotstarState(b.getStartDotStar()),
      acceptState(b.getAccept()),
      acceptEodState(b.getAcceptEOD()),
      acceptNlEodState(POS_UNINITIALIZED),
      acceptNlState(POS_UNINITIALIZED),
      builder(b),
      doPrefilter(prefilter) {
    // The special nodes need edges of their own: start->startDs, plus the
    // startDs self-loop.
    const vector<PositionInfo> sources = {startState, startDotstarState};
    const vector<PositionInfo> targets = {startDotstarState};
    connectRegions(sources, targets);

    // The accept->acceptEod edge is already in place; nothing to add here.

    // XXX: small hack so that vacuous NFAs work: start and startDs both get an
    // initial report ID.
    builder.setNodeReportID(startState, 0);
    builder.setNodeReportID(startDotstarState, 0);
}
|
||||
|
||||
static
|
||||
void checkEmbeddedEndAnchor(const PositionInfo &from,
|
||||
const vector<PositionInfo> &firsts) {
|
||||
if (!(from.flags & POS_FLAG_ONLY_ENDS)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const auto &first : firsts) {
|
||||
if (first.pos != GlushkovBuildStateImpl::POS_EPSILON) {
|
||||
/* can make it through the parse tree */
|
||||
throw ParseError("Embedded end anchors not supported.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wire up the lasts of one component to the firsts of another
|
||||
void
|
||||
GlushkovBuildStateImpl::connectRegions(const vector<PositionInfo> &lasts,
|
||||
const vector<PositionInfo> &firsts) {
|
||||
for (const auto &last : lasts) {
|
||||
checkEmbeddedEndAnchor(last, firsts);
|
||||
connectSuccessors(last, firsts);
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void filterEdges(const GlushkovBuildStateImpl &bs, const PositionInfo &from,
|
||||
vector<PositionInfo> &tolist) {
|
||||
if (from.pos == bs.startDotstarState) {
|
||||
// If we're connecting from start-dotstar, remove all caret flavoured
|
||||
// positions.
|
||||
CheckPositionFlags check(POS_FLAG_NOFLOAT);
|
||||
tolist.erase(remove_if(tolist.begin(), tolist.end(), check),
|
||||
tolist.end());
|
||||
if (from.flags & POS_FLAG_NOFLOAT) {
|
||||
tolist.clear();
|
||||
}
|
||||
} else if (from.pos == bs.startState) {
|
||||
// If we're connecting from start, we should remove any epsilons that
|
||||
// aren't caret flavoured.
|
||||
CheckUnflaggedEpsilon check;
|
||||
tolist.erase(remove_if(tolist.begin(), tolist.end(), check),
|
||||
tolist.end());
|
||||
CheckPositionFlags check2(POS_FLAG_MUST_FLOAT | POS_FLAG_NOFLOAT);
|
||||
tolist.erase(remove_if(tolist.begin(), tolist.end(), check2),
|
||||
tolist.end());
|
||||
}
|
||||
|
||||
if (bs.builder.getAssertFlag(from.pos) & POS_FLAG_MULTILINE_START) {
|
||||
// If we have a (mildly boneheaded) pattern like /^$/m, we're right up
|
||||
// against the edge of what we can do without true assertion support.
|
||||
// Here we have an evil hack to prevent us plugging the \n generated by
|
||||
// the caret right into acceptEod (which is in the firsts of the
|
||||
// dollar).
|
||||
/* This is due to the 'interesting quirk' that multiline ^ does not
|
||||
* not match a newline at the end of buffer. */
|
||||
DEBUG_PRINTF("multiline start - no eod\n");
|
||||
tolist.erase(remove(tolist.begin(), tolist.end(), bs.acceptEodState),
|
||||
tolist.end());
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
Position makeNewlineAssertPos(GlushkovBuildState &bs) {
|
||||
NFABuilder &builder = bs.getBuilder();
|
||||
Position newline = builder.makePositions(1);
|
||||
builder.addCharReach(newline, CharReach('\n'));
|
||||
builder.setAssertFlag(newline, POS_FLAG_FIDDLE_ACCEPT);
|
||||
builder.setNodeReportID(newline, -1);
|
||||
return newline;
|
||||
}
|
||||
|
||||
static
|
||||
void generateAccepts(GlushkovBuildStateImpl &bs, const PositionInfo &from,
|
||||
vector<PositionInfo> *tolist) {
|
||||
NFABuilder &builder = bs.getBuilder();
|
||||
u32 flags = from.flags;
|
||||
|
||||
bool require_eod = flags & POS_FLAG_WIRE_EOD;
|
||||
bool require_nl_eod = flags & POS_FLAG_WIRE_NL_EOD
|
||||
&& !(flags & POS_FLAG_NO_NL_EOD);
|
||||
bool require_nl_accept = (flags & POS_FLAG_WIRE_NL_ACCEPT)
|
||||
&& !(flags & POS_FLAG_NO_NL_ACCEPT);
|
||||
|
||||
bool require_accept = !(flags & POS_FLAG_ONLY_ENDS);
|
||||
|
||||
if (require_eod) {
|
||||
tolist->push_back(bs.acceptEodState);
|
||||
}
|
||||
|
||||
if (require_nl_accept) {
|
||||
if (bs.acceptNlState == GlushkovBuildState::POS_UNINITIALIZED) {
|
||||
Position newline = makeNewlineAssertPos(bs);
|
||||
bs.addSuccessor(newline, builder.getAccept());
|
||||
bs.acceptNlState = newline;
|
||||
}
|
||||
tolist->push_back(bs.acceptNlState);
|
||||
}
|
||||
|
||||
if (require_nl_eod) {
|
||||
if (bs.acceptNlEodState == GlushkovBuildState::POS_UNINITIALIZED) {
|
||||
Position newline = makeNewlineAssertPos(bs);
|
||||
bs.addSuccessor(newline, builder.getAcceptEOD());
|
||||
bs.acceptNlEodState = newline;
|
||||
}
|
||||
tolist->push_back(bs.acceptNlEodState);
|
||||
}
|
||||
|
||||
if (require_accept) {
|
||||
tolist->push_back(bs.acceptState);
|
||||
}
|
||||
}
|
||||
|
||||
void GlushkovBuildStateImpl::connectAccepts(const vector<PositionInfo> &lasts) {
|
||||
for (const auto &last : lasts) {
|
||||
vector<PositionInfo> accepts;
|
||||
generateAccepts(*this, last, &accepts);
|
||||
connectSuccessors(last, accepts);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(DEBUG) || defined(DUMP_SUPPORT)
|
||||
|
||||
/** \brief Render a subset of a position's flags as a run of "<name>" tags,
 * for debug and dump output. Flags not listed in the table are not shown. */
static UNUSED
string dumpCaptures(const PositionInfo &p) {
    static const struct {
        u32 bit;
        const char *tag;
    } known[] = {
        {POS_FLAG_NOFLOAT, "<nofloat>"},
        {POS_FLAG_MUST_FLOAT, "<must_float>"},
        {POS_FLAG_FIDDLE_ACCEPT, "<fiddle_accept>"},
        {POS_FLAG_ONLY_ENDS, "<only_ends>"},
        {POS_FLAG_NO_NL_EOD, "<no_nl_eod>"},
        {POS_FLAG_NO_NL_ACCEPT, "<no_nl_acc>"},
    };

    string text;
    for (const auto &entry : known) {
        if (p.flags & entry.bit) {
            text += entry.tag;
        }
    }
    return text;
}
|
||||
|
||||
#endif // DEBUG || DUMP_SUPPORT
|
||||
|
||||
void GlushkovBuildStateImpl::connectSuccessors(const PositionInfo &from,
|
||||
vector<PositionInfo> tolist) {
|
||||
/* note: tolist maybe modified for our own internal use -> not a reference */
|
||||
assert(from.pos != POS_EPSILON);
|
||||
assert(from.pos != POS_UNINITIALIZED);
|
||||
assert(find(tolist.begin(), tolist.end(), POS_UNINITIALIZED)
|
||||
== tolist.end());
|
||||
|
||||
DEBUG_PRINTF("FROM = %u%s TO = %s\n", from.pos, dumpCaptures(from).c_str(),
|
||||
dumpPositions(tolist.begin(), tolist.end()).c_str());
|
||||
|
||||
/* prevent creation of edges with invalid assertions */
|
||||
filterEdges(*this, from, tolist);
|
||||
|
||||
if (from.flags & POS_FLAG_FIDDLE_ACCEPT) {
|
||||
auto accept = find(tolist.begin(), tolist.end(), acceptState);
|
||||
if (accept != tolist.end()) {
|
||||
DEBUG_PRINTF("accept through -1 offset-adjusting dot\n");
|
||||
Position fakedot = builder.makePositions(1);
|
||||
builder.addCharReach(fakedot, CharReach(0x00, 0xff));
|
||||
builder.setNodeReportID(fakedot, -1);
|
||||
addSuccessor(fakedot, acceptState);
|
||||
*accept = fakedot;
|
||||
} else {
|
||||
// We might lead to accept via an assertion vertex, so we add the
|
||||
// offset adj to this vertex itself. Used for cases like /^\B/m,
|
||||
// which should match only at 0 for '\n'.
|
||||
builder.setNodeReportID(from.pos, -1);
|
||||
}
|
||||
|
||||
assert(find(tolist.begin(), tolist.end(), acceptState) == tolist.end());
|
||||
}
|
||||
|
||||
auto &succ = successors[from.pos];
|
||||
|
||||
DEBUG_PRINTF("connect %u -> %s\n", from.pos,
|
||||
dumpPositions(tolist.begin(), tolist.end()).c_str());
|
||||
DEBUG_PRINTF("%u curr succ: %s\n", from.pos,
|
||||
dumpPositions(begin(succ), end(succ)).c_str());
|
||||
|
||||
for (const auto &to : tolist) {
|
||||
if (to.pos != POS_EPSILON) {
|
||||
succ.insert(to);
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("%u succ: %s\n", from.pos,
|
||||
dumpPositions(begin(succ), end(succ)).c_str());
|
||||
}
|
||||
|
||||
/** \brief Record a single edge \a from -> \a to in the successor map. Both
 * ends must be real positions (neither epsilon nor uninitialized). */
void GlushkovBuildStateImpl::addSuccessor(Position from, Position to) {
    DEBUG_PRINTF("connect %u -> %u\n", from, to);
    assert(from != POS_EPSILON && from != POS_UNINITIALIZED);
    assert(to != POS_EPSILON && to != POS_UNINITIALIZED);

    auto &follow = successors[from];
    follow.insert(to);

    DEBUG_PRINTF("%u succ: %s\n", from,
                 dumpPositions(begin(follow), end(follow)).c_str());
}
|
||||
|
||||
/** \brief Clone the vertex properties and the edges of every position in
 * [first, last] onto the positions [first + offset, last + offset].
 *
 * \param first   first position of the region to clone (inclusive)
 * \param last    last position of the region to clone (inclusive)
 * \param offset  distance between a position and its clone
 *
 * Every recorded successor of a position in the region is expected to lie
 * inside the region as well; a stray edge trips an assertion and is otherwise
 * not cloned.
 */
void GlushkovBuildStateImpl::cloneFollowSet(Position first, Position last,
                                            unsigned offset) {
    assert(first <= last);

    // Clone vertex properties (reachability, etc)
    builder.cloneRegion(first, last, offset);

    for (Position i = first; i <= last; i++) {
        // Look both sets up once per position. The clone's entry is created
        // unconditionally (not inside the assert), so debug and release builds
        // end up with the same map contents. std::map never moves its
        // elements, so creating one entry cannot invalidate the other
        // reference.
        auto &cloned = successors[i + offset];
        const auto &original = successors[i];

        // This should be a new position.
        assert(cloned.empty());

        for (const PositionInfo &to : original) {
            if (to.pos >= first && to.pos <= last) {
                PositionInfo clone(to);
                clone.pos += offset;
                DEBUG_PRINTF("clone: %u -> %u\n", i + offset, clone.pos);
                cloned.insert(clone);
            } else {
                // There shouldn't be any stray edges leading out of this
                // region!
                assert(0);
            }
        }
    }
}
|
||||
|
||||
/** \brief Add the edge \a from -> \a to to the builder unless it is already
 * there.
 *
 * Throws ParseError when \a to is the start state: that would be an embedded
 * start anchor.
 */
void GlushkovBuildStateImpl::buildEdge(Position from, const PositionInfo &to) {
    if (to == startState) {
        /* can make it through the parse tree */
        throw ParseError("Embedded start anchors not supported.");
    }

    assert(to.pos != POS_UNINITIALIZED);
    assert(to.pos != POS_EPSILON);

    const Position target = to.pos;
    if (!builder.hasEdge(from, target)) {
        builder.addEdge(from, target);
    }
}
|
||||
|
||||
void GlushkovBuildStateImpl::buildEdges() {
|
||||
// Create all the edges and track which vertices are asserts which need to
|
||||
// be removed later.
|
||||
for (const auto &m : successors) {
|
||||
const Position from = m.first;
|
||||
for (const auto &to : m.second) {
|
||||
buildEdge(from, to);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Construct a usable GlushkovBuildState for the outside world.
|
||||
unique_ptr<GlushkovBuildState> makeGlushkovBuildState(NFABuilder &b,
|
||||
bool prefilter) {
|
||||
return ue2::make_unique<GlushkovBuildStateImpl>(b, prefilter);
|
||||
}
|
||||
|
||||
// free functions for utility use
|
||||
|
||||
/** \brief Eliminate lower-priority duplicate PositionInfo entries.
|
||||
*
|
||||
* Scans through a list of positions and retains only the highest priority
|
||||
* version of a given (position, flags) entry. */
|
||||
void cleanupPositions(vector<PositionInfo> &a) {
|
||||
ue2::unordered_set<pair<Position, int>> seen; // track dupes
|
||||
|
||||
vector<PositionInfo> out;
|
||||
out.reserve(a.size()); // output should be close to input in size.
|
||||
|
||||
for (const auto &p : a) {
|
||||
if (seen.emplace(p.pos, p.flags).second) {
|
||||
out.push_back(p); // first encounter
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("in %zu; out %zu\n", a.size(), out.size());
|
||||
a.swap(out);
|
||||
}
|
||||
|
||||
static
|
||||
vector<PositionInfo>::iterator
|
||||
replaceElemWithSequence(vector<PositionInfo> &dest,
|
||||
vector<PositionInfo>::iterator &victim,
|
||||
const vector<PositionInfo> &replacement) {
|
||||
auto past = dest.erase(victim);
|
||||
size_t d = distance(dest.begin(), past) + replacement.size();
|
||||
dest.insert(past, replacement.begin(), replacement.end());
|
||||
/* recalc past as iterator may have been invalidated */
|
||||
return dest.begin() + d;
|
||||
}
|
||||
|
||||
/** \brief Replace all epsilons with the given positions.
|
||||
*
|
||||
* Replace epsilons in a firsts list with another given firsts list. Note: the
|
||||
* firsts lists must come from disjoint sets of components. If no epsilons are
|
||||
* in the first firsts list the source is appended to the end.
|
||||
*/
|
||||
void replaceEpsilons(vector<PositionInfo> &target,
|
||||
const vector<PositionInfo> &source) {
|
||||
auto found =
|
||||
find(target.begin(), target.end(), GlushkovBuildState::POS_EPSILON);
|
||||
|
||||
if (found == target.end()) {
|
||||
// no epsilons to replace, push on to the end
|
||||
target.insert(target.end(), source.begin(), source.end());
|
||||
return;
|
||||
}
|
||||
|
||||
while (found != target.end()) {
|
||||
checkEmbeddedEndAnchor(*found, source);
|
||||
|
||||
// replace this epsilon with a copy of source with the same flags
|
||||
vector<PositionInfo> newsource(source);
|
||||
for (auto &pos : newsource) {
|
||||
pos.flags |= found->flags;
|
||||
}
|
||||
|
||||
found = replaceElemWithSequence(target, found, newsource);
|
||||
// find the next epsilon
|
||||
found = find(found, target.end(), GlushkovBuildState::POS_EPSILON);
|
||||
}
|
||||
|
||||
cleanupPositions(target);
|
||||
}
|
||||
|
||||
#ifdef DUMP_SUPPORT
|
||||
|
||||
/** \brief Write a position to \a os: "epsilon" or the position number,
 * followed by the flag tags produced by dumpCaptures(). */
void dump(ostream &os, const PositionInfo &p) {
    if (p.pos != GlushkovBuildState::POS_EPSILON) {
        os << p.pos;
    } else {
        os << "epsilon";
    }

    os << dumpCaptures(p);
}
|
||||
|
||||
#endif // DUMP_SUPPORT
|
||||
|
||||
} // namespace ue2
|
103
src/parser/buildstate.h
Normal file
103
src/parser/buildstate.h
Normal file
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Glushkov construction.
|
||||
*/
|
||||
|
||||
#ifndef BUILDSTATE_H
|
||||
#define BUILDSTATE_H
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "position.h"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <boost/core/noncopyable.hpp>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class NFABuilder;
|
||||
class PositionInfo;
|
||||
|
||||
/** \brief Machinery for Glushkov construction.
|
||||
*
|
||||
* Abstract base class; use \ref makeGlushkovBuildState to get one of these you
|
||||
* can use. */
|
||||
class GlushkovBuildState : boost::noncopyable {
|
||||
public:
|
||||
/** \brief Represents an uninitialized state. */
|
||||
static const Position POS_UNINITIALIZED;
|
||||
|
||||
/** \brief Represents an epsilon transition in the firsts of a component. */
|
||||
static const Position POS_EPSILON;
|
||||
|
||||
virtual ~GlushkovBuildState();
|
||||
|
||||
/** \brief Returns a reference to the NFABuilder being used. */
|
||||
virtual NFABuilder &getBuilder() = 0;
|
||||
|
||||
/** \brief Returns a const reference to the NFABuilder being used. */
|
||||
virtual const NFABuilder &getBuilder() const = 0;
|
||||
|
||||
/** \brief Wire up edges from the lasts of one component to the firsts of
|
||||
* another. */
|
||||
virtual void connectRegions(const std::vector<PositionInfo> &lasts,
|
||||
const std::vector<PositionInfo> &firsts) = 0;
|
||||
|
||||
/** \brief Wire the lasts of the main sequence to accepts. */
|
||||
virtual void connectAccepts(const std::vector<PositionInfo> &lasts) = 0;
|
||||
|
||||
/** \brief Wire up a pair of positions. */
|
||||
virtual void addSuccessor(Position from, Position to) = 0;
|
||||
|
||||
/** \brief Clone the vertex properties and edges of all vertices between
|
||||
* two positions. */
|
||||
virtual void cloneFollowSet(Position from, Position to, u32 offset) = 0;
|
||||
|
||||
/** \brief Build the prioritised list of edges out of our successor map. */
|
||||
virtual void buildEdges() = 0;
|
||||
};
|
||||
|
||||
/** \brief Returns a new GlushkovBuildState object. */
|
||||
std::unique_ptr<GlushkovBuildState> makeGlushkovBuildState(NFABuilder &b,
|
||||
bool prefilter);
|
||||
|
||||
/** \brief Replace all epsilons with the given positions. */
|
||||
void replaceEpsilons(std::vector<PositionInfo> &target,
|
||||
const std::vector<PositionInfo> &source);
|
||||
|
||||
/** \brief Eliminate lower-priority duplicate PositionInfo entries.
|
||||
*
|
||||
* Scans through a list of positions and retains only the highest priority
|
||||
* version of a given (position, flags) entry. */
|
||||
void cleanupPositions(std::vector<PositionInfo> &a);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
120
src/parser/check_refs.cpp
Normal file
120
src/parser/check_refs.cpp
Normal file
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Component tree analysis that checks that references (such as
|
||||
* back-refs, conditionals) have valid referents.
|
||||
*/
|
||||
#include "check_refs.h"
|
||||
#include "ComponentBackReference.h"
|
||||
#include "ComponentCondReference.h"
|
||||
#include "ConstComponentVisitor.h"
|
||||
#include "parse_error.h"
|
||||
#include "util/container.h"
|
||||
#include "util/ue2_containers.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/**
|
||||
* \brief Visitor that checks the validity of references against a known list
|
||||
* of indices and labels.
|
||||
*/
|
||||
class ReferenceVisitor: public DefaultConstComponentVisitor {
|
||||
private:
|
||||
const size_t num_ids;
|
||||
const flat_set<string> &names;
|
||||
|
||||
public:
|
||||
ReferenceVisitor(size_t num_groups, const flat_set<string> &targets)
|
||||
: num_ids(num_groups), names(targets) {}
|
||||
|
||||
~ReferenceVisitor();
|
||||
|
||||
void invalid_index(const char *component, unsigned id) {
|
||||
assert(component);
|
||||
ostringstream str;
|
||||
str << "Invalid " << component << " to expression " << id << ".";
|
||||
throw ParseError(str.str());
|
||||
}
|
||||
|
||||
void invalid_label(const char *component, const std::string &label) {
|
||||
assert(component);
|
||||
ostringstream str;
|
||||
str << "Invalid " << component << " to label '" << label << "'.";
|
||||
throw ParseError(str.str());
|
||||
}
|
||||
|
||||
void pre(const ComponentBackReference &c) override {
|
||||
if (c.ref_id) {
|
||||
if (c.ref_id >= num_ids) {
|
||||
invalid_index("back reference", c.ref_id);
|
||||
}
|
||||
} else {
|
||||
if (!contains(names, c.name)) {
|
||||
invalid_label("back reference", c.name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void pre(const ComponentCondReference &c) override {
|
||||
switch (c.kind) {
|
||||
case ComponentCondReference::CONDITION_NUMBER:
|
||||
if (c.ref_id >= num_ids) {
|
||||
invalid_index("conditional reference", c.ref_id);
|
||||
}
|
||||
break;
|
||||
case ComponentCondReference::CONDITION_NAME:
|
||||
if (c.ref_name == "DEFINE") {
|
||||
// The string "DEFINE" is a special "always false" condition
|
||||
// used to define subroutines.
|
||||
break;
|
||||
}
|
||||
if (!contains(names, c.ref_name)) {
|
||||
invalid_label("conditional reference", c.ref_name);
|
||||
}
|
||||
break;
|
||||
case ComponentCondReference::CONDITION_ASSERTION:
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Out-of-line destructor to silence weak vtable warnings.
|
||||
ReferenceVisitor::~ReferenceVisitor() {}
|
||||
|
||||
void checkReferences(const Component &root, unsigned int groupIndices,
|
||||
const ue2::flat_set<std::string> &groupNames) {
|
||||
ReferenceVisitor vis(groupIndices, groupNames);
|
||||
root.accept(vis);
|
||||
}
|
||||
|
||||
} // namespace ue2
|
50
src/parser/check_refs.h
Normal file
50
src/parser/check_refs.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Component tree analysis that checks that references (such as
|
||||
* back-refs, conditionals) have valid referents.
|
||||
*/
|
||||
#ifndef PARSER_CHECK_REFS_H_
|
||||
#define PARSER_CHECK_REFS_H_
|
||||
|
||||
#include "util/ue2_containers.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class Component;
|
||||
class ComponentSequence;
|
||||
|
||||
void checkReferences(const Component &root, unsigned int groupIndices,
|
||||
const ue2::flat_set<std::string> &groupNames);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // PARSER_CHECK_REFS_H_
|
303
src/parser/dump.cpp
Normal file
303
src/parser/dump.cpp
Normal file
@@ -0,0 +1,303 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "dump.h"
|
||||
#include "position.h"
|
||||
#include "ConstComponentVisitor.h"
|
||||
#include "ComponentBackReference.h"
|
||||
#include "ComponentClass.h"
|
||||
#include "ComponentCondReference.h"
|
||||
#include "ComponentRepeat.h"
|
||||
#include "ComponentAlternation.h"
|
||||
#include "ComponentAssertion.h"
|
||||
#include "ComponentAtomicGroup.h"
|
||||
#include "ComponentBoundary.h"
|
||||
#include "ComponentByte.h"
|
||||
#include "ComponentEmpty.h"
|
||||
#include "ComponentEUS.h"
|
||||
#include "ComponentSequence.h"
|
||||
#include "ComponentWordBoundary.h"
|
||||
#include "Utf8ComponentClass.h"
|
||||
#include "AsciiComponentClass.h"
|
||||
#include "util/charreach.h"
|
||||
#include "util/dump_charclass.h"
|
||||
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
|
||||
#ifndef DUMP_SUPPORT
|
||||
#error No dump support!
|
||||
#endif
|
||||
|
||||
using std::ostream;
|
||||
using std::string;
|
||||
using std::endl;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class DumpVisitor : public ConstComponentVisitor {
|
||||
private:
|
||||
void indent() { level++; }
|
||||
void outdent() {
|
||||
assert(level > 0);
|
||||
level--;
|
||||
}
|
||||
std::string filler() const { return string(level * 2, ' '); }
|
||||
|
||||
public:
|
||||
explicit DumpVisitor(ostream &s) : os(s), level(0) {}
|
||||
~DumpVisitor() override;
|
||||
|
||||
void pre(const AsciiComponentClass &c) override {
|
||||
os << filler() << "ASCII CLASS" << endl << filler() << " ";
|
||||
describeClass(os, c.cr, 256, CC_OUT_TEXT);
|
||||
os << endl;
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const AsciiComponentClass &) override { outdent(); }
|
||||
|
||||
void pre(const ComponentAlternation &) override {
|
||||
os << filler() << "ALTERNATION" << endl;
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const ComponentAlternation &) override {
|
||||
outdent();
|
||||
}
|
||||
|
||||
void pre(const ComponentAssertion &c) override {
|
||||
os << filler() << "ASSERTION (";
|
||||
switch (c.m_sense) {
|
||||
case ComponentAssertion::POS:
|
||||
os << "POSITIVE ";
|
||||
break;
|
||||
case ComponentAssertion::NEG:
|
||||
os << "NEGATIVE ";
|
||||
break;
|
||||
}
|
||||
|
||||
switch (c.m_dir) {
|
||||
case ComponentAssertion::LOOKAHEAD:
|
||||
os << "LOOKAHEAD";
|
||||
break;
|
||||
case ComponentAssertion::LOOKBEHIND:
|
||||
os << "LOOKBEHIND";
|
||||
break;
|
||||
}
|
||||
|
||||
os << ")" << endl;
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const ComponentAssertion &) override { outdent(); }
|
||||
|
||||
void pre(const ComponentAtomicGroup &) override {
|
||||
os << filler() << "ATOMIC GROUP" << endl;
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const ComponentAtomicGroup &) override { outdent(); }
|
||||
|
||||
void pre(const ComponentBackReference &c) override {
|
||||
if (!c.name.empty()) {
|
||||
os << filler() << "BACKREF " << c.name << std::endl;
|
||||
} else {
|
||||
os << filler() << "BACKREF " << c.ref_id << std::endl;
|
||||
}
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const ComponentBackReference &) override { outdent(); }
|
||||
|
||||
void pre(const ComponentBoundary &c) override {
|
||||
os << filler() << "BOUNDARY" << endl << filler() << " ";
|
||||
switch (c.m_bound) {
|
||||
case ComponentBoundary::BEGIN_STRING:
|
||||
os << "ComponentBoundary::BEGIN_STRING";
|
||||
break;
|
||||
case ComponentBoundary::END_STRING:
|
||||
os << "ComponentBoundary::END_STRING";
|
||||
break;
|
||||
case ComponentBoundary::END_STRING_OPTIONAL_LF:
|
||||
os << "ComponentBoundary::END_STRING_OPTIONAL_LF";
|
||||
break;
|
||||
case ComponentBoundary::BEGIN_LINE:
|
||||
os << "ComponentBoundary::BEGIN_LINE";
|
||||
break;
|
||||
case ComponentBoundary::END_LINE:
|
||||
os << "ComponentBoundary::END_LINE";
|
||||
break;
|
||||
}
|
||||
os << endl;
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const ComponentBoundary &) override { outdent(); }
|
||||
|
||||
void pre(const ComponentByte &) override {
|
||||
os << filler() << "BYTE" << endl;
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const ComponentByte &) override { outdent(); }
|
||||
|
||||
void pre(const ComponentCondReference &c) override {
|
||||
os << filler() << "CONDITIONAL REFERENCE" << endl;
|
||||
switch (c.kind) {
|
||||
case ComponentCondReference::CONDITION_NUMBER:
|
||||
os << filler() << "REFERENCES GROUP WITH NUMBER " << c.ref_id
|
||||
<< endl;
|
||||
break;
|
||||
case ComponentCondReference::CONDITION_NAME:
|
||||
os << filler() << "REFERENCES GROUP WITH NAME " << c.ref_name
|
||||
<< endl;
|
||||
break;
|
||||
case ComponentCondReference::CONDITION_ASSERTION:
|
||||
os << filler() << "REFERENCES FOLLOWING ASSERTION" << endl;
|
||||
break;
|
||||
}
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const ComponentCondReference &) override { outdent(); }
|
||||
|
||||
void pre(const ComponentEmpty &) override {
|
||||
os << filler() << "EMPTY" << endl;
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const ComponentEmpty &) override { outdent(); }
|
||||
|
||||
void pre(const ComponentEUS &) override {
|
||||
os << filler() << "EUS" << endl;
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const ComponentEUS &) override { outdent(); }
|
||||
|
||||
void pre(const ComponentRepeat &c) override {
|
||||
os << filler() << "REPEAT (" << c.m_min << ", ";
|
||||
if (c.m_max == ComponentRepeat::NoLimit) {
|
||||
os << "NoLimit";
|
||||
} else {
|
||||
os << c.m_max;
|
||||
}
|
||||
os << ") ";
|
||||
switch (c.type) {
|
||||
case ComponentRepeat::REPEAT_NONGREEDY:
|
||||
os << "non-greedy";
|
||||
break;
|
||||
case ComponentRepeat::REPEAT_GREEDY:
|
||||
os << "greedy";
|
||||
break;
|
||||
case ComponentRepeat::REPEAT_POSSESSIVE:
|
||||
os << "possessive";
|
||||
break;
|
||||
}
|
||||
os << endl;
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const ComponentRepeat &) override { outdent(); }
|
||||
|
||||
void pre(const ComponentSequence &c) override {
|
||||
os << filler() << "SEQUENCE ";
|
||||
if (c.capture_index == ComponentSequence::NOT_CAPTURED) {
|
||||
os << "(not captured) ";
|
||||
} else {
|
||||
os << "(capture index " << c.capture_index << ") ";
|
||||
}
|
||||
if (!c.capture_name.empty()) {
|
||||
os << "(capture name '" << c.capture_name << "')";
|
||||
}
|
||||
os << endl;
|
||||
indent();
|
||||
if (c.children.empty()) {
|
||||
os << filler() << " <empty>" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
void post(const ComponentSequence &) override { outdent(); }
|
||||
|
||||
void pre(const ComponentWordBoundary &c) override {
|
||||
os << filler() << (c.negated ? "NON-WORD-BOUNDARY ('\\B')"
|
||||
: "WORD-BOUNDARY ('\\b')") << endl;
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const ComponentWordBoundary &) override { outdent(); }
|
||||
|
||||
void pre(const UTF8ComponentClass &c) override {
|
||||
os << filler() << "UTF8 CLASS" << endl << filler() << " ";
|
||||
if (c.cps.none()) {
|
||||
os << "<none>";
|
||||
} else {
|
||||
for (auto it = c.cps.begin(), ite = c.cps.end(); it != ite; ++it) {
|
||||
os << std::hex << *it << " ";
|
||||
}
|
||||
}
|
||||
os << endl;
|
||||
|
||||
indent();
|
||||
}
|
||||
|
||||
void post(const UTF8ComponentClass &) override { outdent(); }
|
||||
|
||||
// not used
|
||||
void during(const AsciiComponentClass &) override {}
|
||||
void during(const ComponentAlternation &) override {}
|
||||
void during(const ComponentAssertion &) override {}
|
||||
void during(const ComponentAtomicGroup &) override {}
|
||||
void during(const ComponentBackReference &) override {}
|
||||
void during(const ComponentBoundary &) override {}
|
||||
void during(const ComponentByte &) override {}
|
||||
void during(const ComponentCondReference &) override {}
|
||||
void during(const ComponentEmpty &) override {}
|
||||
void during(const ComponentEUS &) override {}
|
||||
void during(const ComponentRepeat &) override {}
|
||||
void during(const ComponentSequence &) override {}
|
||||
void during(const ComponentWordBoundary &) override {}
|
||||
void during(const UTF8ComponentClass &) override {}
|
||||
|
||||
private:
|
||||
ostream &os;
|
||||
unsigned level;
|
||||
};
|
||||
|
||||
// Defined out of line; the class body only declares the destructor.
DumpVisitor::~DumpVisitor() = default;
|
||||
|
||||
/**
 * \brief Write a text outline of the component tree rooted at \p root to
 * \p os by running a DumpVisitor over it.
 *
 * \p root must not be null (checked by assertion only).
 */
void dumpTree(ostream &os, const Component *const root) {
    assert(root);

    DumpVisitor visitor(os);
    root->accept(visitor);
}
|
||||
|
||||
} // namespace ue2
|
48
src/parser/dump.h
Normal file
48
src/parser/dump.h
Normal file
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef PARSER_DUMP_H_
|
||||
#define PARSER_DUMP_H_
|
||||
|
||||
#ifdef DUMP_SUPPORT
|
||||
|
||||
#include <ostream>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class Component;
|
||||
|
||||
/** \brief Dump a text representation of the given component tree. Only
|
||||
* available in DUMP_SUPPORT builds. */
|
||||
void dumpTree(std::ostream &os, const Component *const root);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // DUMP_SUPPORT
|
||||
|
||||
#endif // PARSER_DUMP_H_
|
52
src/parser/parse_error.cpp
Normal file
52
src/parser/parse_error.cpp
Normal file
@@ -0,0 +1,52 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Parse/Compile exceptions.
|
||||
*/
|
||||
|
||||
|
||||
#include "parse_error.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
// this is just to get these out of the .h to avoid weak vtables
|
||||
|
||||
ParseError::~ParseError() = default;
|
||||
|
||||
LocatedParseError::~LocatedParseError() = default;
|
||||
|
||||
/**
 * \brief Append " at index <offset>." to the stored reason text.
 *
 * Note that the suffix is appended on every call: invoking locate() more than
 * once on the same error yields more than one suffix.
 */
void LocatedParseError::locate(size_t offset) {
    std::ostringstream suffix;
    suffix << " at index " << offset << ".";
    reason += suffix.str();
}
|
||||
|
||||
}
|
65
src/parser/parse_error.h
Normal file
65
src/parser/parse_error.h
Normal file
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Parse/Compile exceptions.
|
||||
*/
|
||||
|
||||
#ifndef PARSE_ERROR_H_A02047D1AA16C9
|
||||
#define PARSE_ERROR_H_A02047D1AA16C9
|
||||
|
||||
#include "util/compile_error.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Error thrown internally by the Parser interface. */
|
||||
class ParseError : public CompileError {
|
||||
public:
|
||||
// Note: 'why' should describe why the error occurred and end with a
|
||||
// full stop, but no line break.
|
||||
explicit ParseError(const std::string &why) : CompileError(why) {}
|
||||
|
||||
~ParseError() override;
|
||||
};
|
||||
|
||||
class LocatedParseError : public ParseError {
|
||||
public:
|
||||
explicit LocatedParseError(const std::string &why) : ParseError(".") {
|
||||
reason = why; // don't use ParseError ctor
|
||||
}
|
||||
|
||||
~LocatedParseError() override;
|
||||
|
||||
void locate(size_t offset);
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif /* PARSE_ERROR_H_A02047D1AA16C9 */
|
48
src/parser/parser_util.cpp
Normal file
48
src/parser/parser_util.cpp
Normal file
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Utilities (currently just ParseMode constructor)
|
||||
*/
|
||||
|
||||
|
||||
#include "hs.h"
|
||||
#include "Parser.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/**
 * \brief Initialise the parse mode from a set of HS_FLAG_* bits.
 *
 * Each option is switched on when its corresponding flag bit is set in
 * \p hs_flags; ignore_space has no flag here and always starts off.
 */
ParseMode::ParseMode(u32 hs_flags)
    : caseless((hs_flags & HS_FLAG_CASELESS) != 0),
      dotall((hs_flags & HS_FLAG_DOTALL) != 0),
      ignore_space(false),
      multiline((hs_flags & HS_FLAG_MULTILINE) != 0),
      ucp((hs_flags & HS_FLAG_UCP) != 0),
      utf8((hs_flags & HS_FLAG_UTF8) != 0) {}
|
||||
|
||||
} // namespace ue2
|
107
src/parser/position.h
Normal file
107
src/parser/position.h
Normal file
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Per-position flags used during Glushkov construction, PositionInfo class.
|
||||
*/
|
||||
|
||||
#ifndef PARSER_POSITION_H
|
||||
#define PARSER_POSITION_H
|
||||
|
||||
#include "ue2common.h"
|
||||
|
||||
#include <set>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
#define POS_FLAG_NOFLOAT (1 << 0) //!< don't wire to start-dotstar
|
||||
#define POS_FLAG_MUST_FLOAT (1 << 1) //!< don't wire solely to start
|
||||
#define POS_FLAG_FIDDLE_ACCEPT (1 << 2) //!< add a dot with an offset adjustment when wiring to accept
|
||||
#define POS_FLAG_ASSERT_WORD_TO_NONWORD (1 << 3) //!< epsilon for word to nonword transition
|
||||
#define POS_FLAG_ASSERT_NONWORD_TO_WORD (1 << 4) //!< epsilon for nonword to word transition
|
||||
#define POS_FLAG_ASSERT_WORD_TO_WORD (1 << 5) //!< epsilon for word to word transition
|
||||
#define POS_FLAG_ASSERT_NONWORD_TO_NONWORD (1 << 6) //!< epsilon for nonword to nonword transition
|
||||
|
||||
/** vertex created by cloning startDs, not considered part of the match.
|
||||
* mirrors POS_FLAG_FIDDLE_ACCEPT */
|
||||
#define POS_FLAG_VIRTUAL_START (1 << 7)
|
||||
|
||||
/** multi-line ^ does not match \\n at end of buffer. As a result, we must never
|
||||
* wire the \\n from ^ to eod */
|
||||
#define POS_FLAG_MULTILINE_START (1 << 8)
|
||||
|
||||
#define POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP (1 << 9)
|
||||
#define POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP (1 << 10)
|
||||
#define POS_FLAG_ASSERT_WORD_TO_WORD_UCP (1 << 11)
|
||||
#define POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP (1 << 12)
|
||||
|
||||
#define POS_FLAG_ASSERT_NONWORD_TO_ANY (POS_FLAG_ASSERT_NONWORD_TO_NONWORD \
|
||||
| POS_FLAG_ASSERT_NONWORD_TO_WORD)
|
||||
#define POS_FLAG_ASSERT_WORD_TO_ANY (POS_FLAG_ASSERT_WORD_TO_NONWORD \
|
||||
| POS_FLAG_ASSERT_WORD_TO_WORD)
|
||||
|
||||
#define POS_FLAG_ASSERT_ANY_TO_NONWORD (POS_FLAG_ASSERT_NONWORD_TO_NONWORD \
|
||||
| POS_FLAG_ASSERT_WORD_TO_NONWORD)
|
||||
#define POS_FLAG_ASSERT_ANY_TO_WORD (POS_FLAG_ASSERT_NONWORD_TO_WORD \
|
||||
| POS_FLAG_ASSERT_WORD_TO_WORD)
|
||||
|
||||
#define POS_FLAG_ASSERT_NONWORD_TO_ANY_UCP \
|
||||
(POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP \
|
||||
| POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP)
|
||||
#define POS_FLAG_ASSERT_WORD_TO_ANY_UCP (POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP \
|
||||
| POS_FLAG_ASSERT_WORD_TO_WORD_UCP)
|
||||
|
||||
#define POS_FLAG_ASSERT_ANY_TO_NONWORD_UCP \
|
||||
(POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP \
|
||||
| POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP)
|
||||
#define POS_FLAG_ASSERT_ANY_TO_WORD_UCP (POS_FLAG_ASSERT_WORD_TO_WORD_UCP \
|
||||
| POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP)
|
||||
|
||||
#define UCP_ASSERT_FLAGS (POS_FLAG_ASSERT_WORD_TO_ANY_UCP \
|
||||
| POS_FLAG_ASSERT_NONWORD_TO_ANY_UCP)
|
||||
|
||||
#define NON_UCP_ASSERT_FLAGS (POS_FLAG_ASSERT_WORD_TO_ANY \
|
||||
| POS_FLAG_ASSERT_NONWORD_TO_ANY)
|
||||
|
||||
/** do not wire to accept or other pos; may still wire to eod, etc if
|
||||
* instructed */
|
||||
#define POS_FLAG_ONLY_ENDS (1 << 23)
|
||||
|
||||
#define POS_FLAG_WIRE_EOD (1 << 24) /**< wire to accept eod */
|
||||
#define POS_FLAG_WIRE_NL_EOD (1 << 25) /**< wire to nl before accept eod */
|
||||
#define POS_FLAG_WIRE_NL_ACCEPT (1 << 26) /**< wire to nl before accept */
|
||||
#define POS_FLAG_NO_NL_EOD (1 << 27) /**< disallow nl before accept eod */
|
||||
#define POS_FLAG_NO_NL_ACCEPT (1 << 28) /**< disallow nl before accept */
|
||||
|
||||
/** \brief Parse and Glushkov construction use only. State number within the
|
||||
* NFA as it is being constructed. */
|
||||
using Position = u32;
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // PARSER_POSITION_H
|
63
src/parser/position_dump.h
Normal file
63
src/parser/position_dump.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef POSITION_DUMP_H
|
||||
#define POSITION_DUMP_H
|
||||
|
||||
#include <sstream>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
#ifdef DUMP_SUPPORT
|
||||
// implemented in buildstate.cpp
|
||||
void dump(std::ostream &os, const PositionInfo &p);
|
||||
#endif
|
||||
|
||||
#if defined(DUMP_SUPPORT) || defined(DEBUG)
|
||||
|
||||
template<class Iterator>
|
||||
static UNUSED
|
||||
std::string dumpPositions(const Iterator &begin, const Iterator &end) {
|
||||
std::ostringstream oss;
|
||||
oss << '[';
|
||||
for (Iterator i = begin; i != end; ++i) {
|
||||
if (i != begin) {
|
||||
oss << ' ';
|
||||
}
|
||||
dump(oss, *i);
|
||||
}
|
||||
oss << ']';
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif /* POSITION_DUMP_H */
|
||||
|
57
src/parser/position_info.h
Normal file
57
src/parser/position_info.h
Normal file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef POSITION_INFO_H
|
||||
#define POSITION_INFO_H
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "position.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** Class representing a component state. */
|
||||
class PositionInfo {
|
||||
public:
|
||||
PositionInfo(unsigned int p) : pos(p), flags(0) {}
|
||||
|
||||
bool operator<(const PositionInfo &other) const {
|
||||
return pos < other.pos;
|
||||
}
|
||||
|
||||
bool operator==(const PositionInfo &other) const {
|
||||
return pos == other.pos;
|
||||
}
|
||||
|
||||
Position pos; //!< state number
|
||||
int flags; //!< from POS_FLAG_* above
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif /* POSITION_INFO_H */
|
||||
|
339
src/parser/prefilter.cpp
Normal file
339
src/parser/prefilter.cpp
Normal file
@@ -0,0 +1,339 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Prefiltering component tree transformation.
|
||||
*/
|
||||
#include "ComponentAssertion.h"
|
||||
#include "ComponentAtomicGroup.h"
|
||||
#include "ComponentBackReference.h"
|
||||
#include "ComponentBoundary.h"
|
||||
#include "ComponentClass.h"
|
||||
#include "ComponentCondReference.h"
|
||||
#include "ComponentRepeat.h"
|
||||
#include "ComponentSequence.h"
|
||||
#include "ComponentVisitor.h"
|
||||
#include "ComponentWordBoundary.h"
|
||||
#include "ConstComponentVisitor.h"
|
||||
#include "Parser.h"
|
||||
#include "prefilter.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <stack>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Max number of positions a referent can have to be considered safe to
|
||||
* replace a reference in prefiltering mode. */
|
||||
static const size_t MAX_REFERENT_POSITIONS = 1;
|
||||
|
||||
/** \brief Constructs a \ref ComponentClass that matches a dot (any
|
||||
* byte/codepoint, depending on whether UTF-8). */
|
||||
static
|
||||
unique_ptr<ComponentClass> makeDotClass(const ParseMode &mode_in) {
|
||||
ParseMode mode(mode_in);
|
||||
mode.dotall = true;
|
||||
return generateComponent(CLASS_ANY, false, mode);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
/**
|
||||
* \brief Visitor used to determine if a given referent component is safe to
|
||||
* replace its reference in prefiltering mode. Throws
|
||||
* SafeReferentVisitor::Unsafe to terminate early on unsafe cases. */
|
||||
class SafeReferentVisitor : public DefaultConstComponentVisitor {
|
||||
public:
|
||||
struct Unsafe {};
|
||||
|
||||
SafeReferentVisitor() : numPositions(0) {}
|
||||
|
||||
bool is_safe() const {
|
||||
DEBUG_PRINTF("numPositions = %zu\n", numPositions);
|
||||
return numPositions <= MAX_REFERENT_POSITIONS;
|
||||
}
|
||||
|
||||
void pre(const AsciiComponentClass &) override {
|
||||
numPositions++;
|
||||
}
|
||||
|
||||
void pre(const UTF8ComponentClass &) override {
|
||||
// FIXME: we should be able to tell precisely how many positions this
|
||||
// class will use. Right now, use the worst case.
|
||||
numPositions += 4;
|
||||
}
|
||||
|
||||
void pre(const ComponentBoundary &) override {
|
||||
numPositions++;
|
||||
}
|
||||
|
||||
void pre(const ComponentByte &) override {
|
||||
numPositions++;
|
||||
}
|
||||
|
||||
void pre(const ComponentEUS &) override {
|
||||
numPositions++;
|
||||
}
|
||||
|
||||
void pre(const ComponentRepeat &) override {
|
||||
// Record the number of positions used before we visit the contents of
|
||||
// the repeat.
|
||||
countStack.push(numPositions);
|
||||
}
|
||||
|
||||
void post(const ComponentRepeat &c) override {
|
||||
assert(!countStack.empty());
|
||||
size_t before = countStack.top();
|
||||
countStack.pop();
|
||||
assert(before <= numPositions);
|
||||
|
||||
std::pair<u32, u32> bounds = c.getBounds();
|
||||
size_t subPositions = numPositions - before;
|
||||
size_t copies = bounds.second < ComponentRepeat::NoLimit
|
||||
? bounds.second
|
||||
: max(bounds.first, 1U);
|
||||
numPositions = before + (subPositions * copies);
|
||||
}
|
||||
|
||||
void pre(const ComponentWordBoundary &) override {
|
||||
// not quite accurate, as these are expanded out in assert
|
||||
// resolution...
|
||||
numPositions++;
|
||||
}
|
||||
|
||||
void pre(const ComponentBackReference &) override {
|
||||
throw Unsafe();
|
||||
}
|
||||
|
||||
void pre(const ComponentCondReference &) override {
|
||||
throw Unsafe();
|
||||
}
|
||||
|
||||
private:
|
||||
size_t numPositions;
|
||||
|
||||
// For temporary use
|
||||
std::stack<size_t> countStack;
|
||||
};
|
||||
|
||||
static
|
||||
bool isSafeReferent(const Component &c) {
|
||||
try {
|
||||
SafeReferentVisitor vis;
|
||||
c.accept(vis);
|
||||
return vis.is_safe();
|
||||
}
|
||||
catch (const SafeReferentVisitor::Unsafe &) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Visitor to find the \ref ComponentSequence with a given reference ID
|
||||
* or name: if found, the visitor will throw a const ptr to it.
|
||||
*/
|
||||
class FindSequenceVisitor : public DefaultConstComponentVisitor {
|
||||
public:
|
||||
explicit FindSequenceVisitor(unsigned ref_id) : id(ref_id) {}
|
||||
explicit FindSequenceVisitor(const std::string &s) : name(s) {}
|
||||
|
||||
void pre(const ComponentSequence &c) override {
|
||||
if (!name.empty()) {
|
||||
if (c.getCaptureName() == name) {
|
||||
throw &c;
|
||||
}
|
||||
} else if (c.getCaptureIndex() == id) {
|
||||
throw &c;
|
||||
}
|
||||
}
|
||||
private:
|
||||
const std::string name;
|
||||
const unsigned id = 0;
|
||||
};
|
||||
|
||||
static
|
||||
const ComponentSequence *findCapturingGroup(const Component *root,
|
||||
FindSequenceVisitor &vis) {
|
||||
try {
|
||||
root->accept(vis);
|
||||
DEBUG_PRINTF("group not found\n");
|
||||
return nullptr;
|
||||
} catch (const ComponentSequence *seq) {
|
||||
return seq;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
|
||||
* \brief Visitor to apply prefilter reductions, swapping components for which
|
||||
* we don't have real implementations with implementable ones. Any such
|
||||
* replacement should produce a superset of the matches that would be produced
|
||||
* by the original.
|
||||
*/
|
||||
class PrefilterVisitor : public DefaultComponentVisitor {
|
||||
public:
|
||||
PrefilterVisitor(Component *c, const ParseMode &m) : root(c), mode(m) {}
|
||||
~PrefilterVisitor();
|
||||
|
||||
/** \brief Calls the visitor (recursively) on a new replacement component
|
||||
* we've just created. Takes care of freeing it if the sequence is itself
|
||||
* replaced. */
|
||||
template<class T>
|
||||
Component *visit_replacement(T *r) {
|
||||
Component *c = r->accept(*this);
|
||||
if (c != r) {
|
||||
delete r;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
Component *visit(ComponentBackReference *c) override {
|
||||
assert(c);
|
||||
|
||||
// If the referent is simple (represents a single position), then we
|
||||
// replace the back-reference with a copy of it.
|
||||
const ComponentSequence *ref = nullptr;
|
||||
const std::string &ref_name = c->getRefName();
|
||||
const unsigned ref_id = c->getRefID();
|
||||
if (!ref_name.empty()) {
|
||||
FindSequenceVisitor vis(ref_name);
|
||||
ref = findCapturingGroup(root, vis);
|
||||
} else if (ref_id > 0) {
|
||||
FindSequenceVisitor vis(ref_id);
|
||||
ref = findCapturingGroup(root, vis);
|
||||
}
|
||||
|
||||
if (ref && isSafeReferent(*ref)) {
|
||||
DEBUG_PRINTF("found safe ref %p\n", ref);
|
||||
ComponentSequence *seq = ref->clone();
|
||||
// Remove labels from cloned sequence.
|
||||
seq->setCaptureName("");
|
||||
seq->setCaptureIndex(ComponentSequence::NOT_CAPTURED);
|
||||
|
||||
return visit_replacement(seq);
|
||||
}
|
||||
|
||||
// Replace with ".*".
|
||||
auto rep = makeComponentRepeat(makeDotClass(mode), 0,
|
||||
ComponentRepeat::NoLimit,
|
||||
ComponentRepeat::REPEAT_GREEDY);
|
||||
return rep.release(); // FIXME: owning raw ptr
|
||||
}
|
||||
|
||||
Component *visit(UNUSED ComponentAssertion *c) override {
|
||||
assert(c);
|
||||
// Replace with an empty sequence.
|
||||
return new ComponentSequence();
|
||||
}
|
||||
|
||||
Component *visit(ComponentRepeat *c) override {
|
||||
assert(c);
|
||||
// Possessive repeats become greedy.
|
||||
if (c->type == ComponentRepeat::REPEAT_POSSESSIVE) {
|
||||
c->type = ComponentRepeat::REPEAT_GREEDY;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
Component *visit(ComponentAtomicGroup *c) override {
|
||||
assert(c);
|
||||
// Replace with a plain sequence containing the atomic group's
|
||||
// children.
|
||||
ComponentSequence *seq = new ComponentSequence();
|
||||
const auto &children = c->getChildren();
|
||||
for (const auto &child : children) {
|
||||
assert(child);
|
||||
seq->addComponent(unique_ptr<Component>(child->clone()));
|
||||
}
|
||||
|
||||
return visit_replacement(seq);
|
||||
}
|
||||
|
||||
Component *visit(UNUSED ComponentEUS *c) override {
|
||||
assert(c);
|
||||
// Replace with ".+".
|
||||
auto rep = makeComponentRepeat(makeDotClass(mode), 1,
|
||||
ComponentRepeat::NoLimit,
|
||||
ComponentRepeat::REPEAT_GREEDY);
|
||||
return rep.release(); // FIXME: owning raw ptr
|
||||
}
|
||||
|
||||
Component *visit(ComponentWordBoundary *c) override {
|
||||
assert(c);
|
||||
c->setPrefilter(true);
|
||||
return c;
|
||||
}
|
||||
|
||||
Component *visit(ComponentCondReference *c) override {
|
||||
assert(c);
|
||||
// Replace with a plain sequence containing the conditional reference's
|
||||
// children.
|
||||
ComponentSequence *seq = new ComponentSequence();
|
||||
const auto &children = c->getChildren();
|
||||
|
||||
// Empty children is accepted by PCRE as a "do nothing" case.
|
||||
if (children.empty()) {
|
||||
return seq;
|
||||
}
|
||||
|
||||
for (const auto &child : children) {
|
||||
assert(child);
|
||||
seq->addComponent(unique_ptr<Component>(child->clone()));
|
||||
}
|
||||
|
||||
// If the conditional reference had just a YES branch, we want this to
|
||||
// be an alternation with an empty sequence (the NO branch).
|
||||
if (!c->hasBothBranches) {
|
||||
seq->addAlternation();
|
||||
seq->finalize();
|
||||
}
|
||||
|
||||
return visit_replacement(seq);
|
||||
}
|
||||
|
||||
private:
|
||||
Component *root;
|
||||
const ParseMode &mode;
|
||||
};
|
||||
|
||||
// Out-of-line destructor definition; it has nothing to release.
PrefilterVisitor::~PrefilterVisitor() = default;
|
||||
|
||||
// Runs a PrefilterVisitor over the tree owned by root. If the visit returns a
// different component, root is reseated to own it.
void prefilterTree(unique_ptr<Component> &root, const ParseMode &mode) {
    assert(root);
    Component *const original = root.get();
    PrefilterVisitor prefilter(original, mode);

    Component *const replacement = original->accept(prefilter);
    if (replacement == original) {
        return;
    }
    root.reset(replacement);
}
|
||||
|
||||
} // namespace ue2
|
48
src/parser/prefilter.h
Normal file
48
src/parser/prefilter.h
Normal file
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef PARSER_PREFILTER_H
|
||||
#define PARSER_PREFILTER_H
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class Component;
|
||||
struct ParseMode;
|
||||
|
||||
/**
|
||||
* \brief Applies prefiltering transformations to the given component.
|
||||
*
|
||||
* May reseat the given Component pointer.
|
||||
*/
|
||||
void prefilterTree(std::unique_ptr<Component> &root, const ParseMode &mode);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // PARSER_PREFILTER_H
|
201
src/parser/shortcut_literal.cpp
Normal file
201
src/parser/shortcut_literal.cpp
Normal file
@@ -0,0 +1,201 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Shortcut literal pass: directly add literal components to Rose.
|
||||
*/
|
||||
#include "AsciiComponentClass.h"
|
||||
#include "Utf8ComponentClass.h"
|
||||
#include "ComponentAssertion.h"
|
||||
#include "ComponentAtomicGroup.h"
|
||||
#include "ComponentBackReference.h"
|
||||
#include "ComponentBoundary.h"
|
||||
#include "ComponentClass.h"
|
||||
#include "ComponentCondReference.h"
|
||||
#include "ComponentRepeat.h"
|
||||
#include "ComponentSequence.h"
|
||||
#include "ComponentVisitor.h"
|
||||
#include "ComponentWordBoundary.h"
|
||||
#include "ConstComponentVisitor.h"
|
||||
#include "parse_error.h"
|
||||
#include "shortcut_literal.h"
|
||||
#include "grey.h"
|
||||
#include "nfagraph/ng.h"
|
||||
#include "compiler/compiler.h"
|
||||
#include "util/ue2string.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
#include <stack>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/**
|
||||
* \brief Visitor that constructs a ue2_literal from a component tree.
|
||||
*
|
||||
* If a component that can't be part of a literal is encountered, this visitor
|
||||
* will throw ConstructLiteralVisitor::NotLiteral.
|
||||
*/
|
||||
class ConstructLiteralVisitor : public ConstComponentVisitor {
|
||||
public:
|
||||
~ConstructLiteralVisitor();
|
||||
|
||||
/** \brief Thrown if this component does not represent a literal. */
|
||||
struct NotLiteral {};
|
||||
|
||||
void pre(const AsciiComponentClass &c) override {
|
||||
const CharReach &cr = c.cr;
|
||||
const size_t width = cr.count();
|
||||
if (width == 1) {
|
||||
lit.push_back(cr.find_first(), false);
|
||||
} else if (width == 2 && cr.isCaselessChar()) {
|
||||
lit.push_back(cr.find_first(), true);
|
||||
} else {
|
||||
throw NotLiteral();
|
||||
}
|
||||
}
|
||||
|
||||
void pre(const ComponentRepeat &c) override {
|
||||
if (c.m_min == 0 || c.m_min != c.m_max) {
|
||||
throw NotLiteral();
|
||||
}
|
||||
|
||||
if (c.m_max < ComponentRepeat::NoLimit && c.m_max > 32767) {
|
||||
throw ParseError("Bounded repeat is too large.");
|
||||
}
|
||||
|
||||
// Store the current length of the literal; in this repeat's post()
|
||||
// call we will append N-1 more copies of [index..end].
|
||||
repeat_stack.push(lit.length());
|
||||
}
|
||||
|
||||
void post(const ComponentRepeat &c) override {
|
||||
// Add N-1 copies of the string between the entry to the repeat and the
|
||||
// current end of the literal.
|
||||
assert(!repeat_stack.empty());
|
||||
const ue2_literal suffix = lit.substr(repeat_stack.top());
|
||||
repeat_stack.pop();
|
||||
|
||||
for (unsigned i = 1; i < c.m_min; i++) {
|
||||
lit += suffix;
|
||||
}
|
||||
}
|
||||
|
||||
void pre(const ComponentSequence &) override {
|
||||
// Pass through.
|
||||
}
|
||||
|
||||
void pre(const ComponentAlternation &) override { throw NotLiteral(); }
|
||||
void pre(const ComponentAssertion &) override { throw NotLiteral(); }
|
||||
void pre(const ComponentAtomicGroup &) override { throw NotLiteral(); }
|
||||
void pre(const ComponentBackReference &) override { throw NotLiteral(); }
|
||||
void pre(const ComponentBoundary &) override { throw NotLiteral(); }
|
||||
void pre(const ComponentByte &) override { throw NotLiteral(); }
|
||||
void pre(const ComponentCondReference &) override { throw NotLiteral(); }
|
||||
void pre(const ComponentEmpty &) override { throw NotLiteral(); }
|
||||
void pre(const ComponentEUS &) override { throw NotLiteral(); }
|
||||
void pre(const ComponentWordBoundary &) override { throw NotLiteral(); }
|
||||
void pre(const UTF8ComponentClass &) override { throw NotLiteral(); }
|
||||
|
||||
void during(const AsciiComponentClass &) override {}
|
||||
void during(const ComponentAlternation &) override {}
|
||||
void during(const ComponentAssertion &) override {}
|
||||
void during(const ComponentAtomicGroup &) override {}
|
||||
void during(const ComponentBackReference &) override {}
|
||||
void during(const ComponentBoundary &) override {}
|
||||
void during(const ComponentByte &) override {}
|
||||
void during(const ComponentCondReference &) override {}
|
||||
void during(const ComponentEmpty &) override {}
|
||||
void during(const ComponentEUS &) override {}
|
||||
void during(const ComponentRepeat &) override {}
|
||||
void during(const ComponentSequence &) override {}
|
||||
void during(const ComponentWordBoundary &) override {}
|
||||
void during(const UTF8ComponentClass &) override {}
|
||||
|
||||
void post(const AsciiComponentClass &) override {}
|
||||
void post(const ComponentAlternation &) override {}
|
||||
void post(const ComponentAssertion &) override {}
|
||||
void post(const ComponentAtomicGroup &) override {}
|
||||
void post(const ComponentBackReference &) override {}
|
||||
void post(const ComponentBoundary &) override {}
|
||||
void post(const ComponentByte &) override {}
|
||||
void post(const ComponentCondReference &) override {}
|
||||
void post(const ComponentEmpty &) override {}
|
||||
void post(const ComponentEUS &) override {}
|
||||
void post(const ComponentSequence &) override {}
|
||||
void post(const ComponentWordBoundary &) override {}
|
||||
void post(const UTF8ComponentClass &) override {}
|
||||
|
||||
ue2_literal lit;
|
||||
stack<size_t> repeat_stack; //!< index of entry to repeat.
|
||||
};
|
||||
|
||||
// Out-of-line destructor definition; no custom cleanup is performed.
ConstructLiteralVisitor::~ConstructLiteralVisitor() = default;
|
||||
|
||||
/** \brief True if the literal expression \a expr could be added to Rose. */
|
||||
bool shortcutLiteral(NG &ng, const ParsedExpression &expr) {
|
||||
assert(expr.component);
|
||||
|
||||
if (!ng.cc.grey.allowRose) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// XXX: don't shortcut literals with extended params (yet)
|
||||
if (expr.min_offset || expr.max_offset != MAX_OFFSET || expr.min_length) {
|
||||
DEBUG_PRINTF("extended params not allowed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
ConstructLiteralVisitor vis;
|
||||
try {
|
||||
assert(expr.component);
|
||||
expr.component->accept(vis);
|
||||
assert(vis.repeat_stack.empty());
|
||||
} catch (const ConstructLiteralVisitor::NotLiteral&) {
|
||||
DEBUG_PRINTF("not a literal\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
const ue2_literal &lit = vis.lit;
|
||||
|
||||
if (lit.empty()) {
|
||||
DEBUG_PRINTF("empty literal\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (expr.highlander && lit.length() <= 1) {
|
||||
DEBUG_PRINTF("not shortcutting SEP literal\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("constructed literal %s\n", dumpString(lit).c_str());
|
||||
return ng.addLiteral(lit, expr.index, expr.id, expr.highlander, expr.som);
|
||||
}
|
||||
|
||||
} // namespace ue2
|
46
src/parser/shortcut_literal.h
Normal file
46
src/parser/shortcut_literal.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Shortcut literal pass: directly add literal components to Rose.
|
||||
*/
|
||||
|
||||
#ifndef SHORTCUT_LITERAL_H
|
||||
#define SHORTCUT_LITERAL_H
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class NG;
|
||||
class ParsedExpression;
|
||||
|
||||
/** \brief True if the literal expression \a expr could be added to Rose. */
|
||||
bool shortcutLiteral(NG &ng, const ParsedExpression &expr);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
134
src/parser/ucp_table.cpp
Normal file
134
src/parser/ucp_table.cpp
Normal file
@@ -0,0 +1,134 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "Utf8ComponentClass.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/* Defines getUcp<cat>(), which builds a CodePointSet from the ucp_<cat>_def
 * table. Table entries are consumed two at a time, each pair being passed to
 * setRange(). */
#define UCP_FN(cat)                                                            \
    CodePointSet getUcp##cat(void) {                                           \
        CodePointSet rv;                                                       \
        const size_t def_len = ARRAY_LENGTH(ucp_##cat##_def);                  \
        for (u32 lo = 0; lo < def_len; lo += 2) {                              \
            rv.setRange(ucp_##cat##_def[lo], ucp_##cat##_def[lo + 1]);         \
        }                                                                      \
        return rv;                                                             \
    }
|
||||
|
||||
struct unicase {
|
||||
unichar base;
|
||||
unichar caseless;
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#define UCP_TABLE_DEFINE_FN
|
||||
#include "ucp_table.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
static
|
||||
bool operator<(const unicase &a, const unicase &b) {
|
||||
if (a.base < b.base) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (a.base > b.base) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return a.caseless < b.caseless;
|
||||
}
|
||||
|
||||
/** \brief Adds to \a cps the caseless counterpart of every code point it
 * already contains, as listed in ucp_caseless_def.
 *
 * The set is modified in place. Empty sets and sets whose first range spans
 * 0..MAX_UNICODE are returned unchanged. */
void make_caseless(CodePointSet *cps) {
    assert(cps);
    DEBUG_PRINTF("hello\n");
    // Cheap optimisation: if we are empty or a dot, we're already caseless.
    if (cps->begin() == cps->end()) {
        DEBUG_PRINTF("empty\n");
        return;
    }
    if (lower(*cps->begin()) == 0 && upper(*cps->begin()) == MAX_UNICODE) {
        DEBUG_PRINTF("dot\n");
        return;
    }

    // Iterate over a snapshot, as cps itself is modified inside the loop.
    CodePointSet base = *cps;

    // uc_begin only ever moves forward: each search resumes from where the
    // previous one stopped.
    const unicase *uc_begin = ucp_caseless_def;
    const unicase *const uc_end = ucp_caseless_def
                                + ARRAY_LENGTH(ucp_caseless_def);
    DEBUG_PRINTF("uc len %zd\n", uc_end - uc_begin);

    for (auto it = base.begin(), ite = base.end(); it != ite; ++it) {
        unichar b = lower(*it);
        unichar e = upper(*it) + 1;

        for (; b < e; b++) {
            DEBUG_PRINTF("decasing %x\n", b);
            unicase test = {b, 0}; /* NUL is not a caseless version of anything,
                                    * so we are ok */
            uc_begin = lower_bound(uc_begin, uc_end, test);
            if (uc_begin == uc_end) {
                DEBUG_PRINTF("EOL\n");
                return;
            }
            // Consume every table entry for this code point. The end check
            // prevents reading past the table when its final entries match.
            while (uc_begin != uc_end && uc_begin->base == b) {
                DEBUG_PRINTF("at {%x,%x}\n", uc_begin->base, uc_begin->caseless);
                cps->set(uc_begin->caseless);
                ++uc_begin;
            }
        }
    }
}
|
||||
|
||||
/** \brief Flip the case of the codepoint in c, if possible.
|
||||
*
|
||||
* Note that this assumes a one-to-one case mapping, which (though not
|
||||
* realistic) is what PCRE does. */
|
||||
bool flip_case(unichar *c) {
|
||||
assert(c);
|
||||
|
||||
const unicase *const uc_begin = ucp_caseless_def;
|
||||
const unicase *const uc_end =
|
||||
ucp_caseless_def + ARRAY_LENGTH(ucp_caseless_def);
|
||||
|
||||
const unicase test = { *c, 0 };
|
||||
const unicase *f = lower_bound(uc_begin, uc_end, test);
|
||||
if (f->base == *c) {
|
||||
DEBUG_PRINTF("flipped c=%x to %x\n", *c, f->caseless);
|
||||
*c = f->caseless;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace ue2
|
11043
src/parser/ucp_table.h
Normal file
11043
src/parser/ucp_table.h
Normal file
File diff suppressed because it is too large
Load Diff
87
src/parser/unsupported.cpp
Normal file
87
src/parser/unsupported.cpp
Normal file
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Checks component trees for unsupported components.
|
||||
*/
|
||||
#include "ConstComponentVisitor.h"
|
||||
#include "ComponentEUS.h"
|
||||
#include "ComponentRepeat.h"
|
||||
#include "ComponentWordBoundary.h"
|
||||
#include "parse_error.h"
|
||||
#include "unsupported.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
/** \brief Visitor class that throws a ParseError exception when it encounters
|
||||
* an unsupported component. */
|
||||
class UnsupportedVisitor : public DefaultConstComponentVisitor {
|
||||
public:
|
||||
~UnsupportedVisitor();
|
||||
void pre(const ComponentAssertion &) override {
|
||||
throw ParseError("Zero-width assertions are not supported.");
|
||||
}
|
||||
void pre(const ComponentAtomicGroup &) override {
|
||||
throw ParseError("Atomic groups are unsupported.");
|
||||
}
|
||||
void pre(const ComponentBackReference &) override {
|
||||
throw ParseError("Back-references are unsupported.");
|
||||
}
|
||||
void pre(const ComponentCondReference &) override {
|
||||
throw ParseError("Conditional references are not supported.");
|
||||
}
|
||||
void pre(const ComponentEUS &c) override {
|
||||
std::ostringstream str;
|
||||
str << "\\X unsupported at index " << c.loc << ".";
|
||||
throw ParseError(str.str());
|
||||
}
|
||||
void pre(const ComponentRepeat &c) override {
|
||||
if (c.type == ComponentRepeat::REPEAT_POSSESSIVE) {
|
||||
throw ParseError("Possessive quantifiers are not supported.");
|
||||
}
|
||||
}
|
||||
void pre(const ComponentWordBoundary &c) override {
|
||||
if (c.ucp && !c.prefilter) {
|
||||
std::ostringstream str;
|
||||
str << (!c.negated ? "\\b" : "\\B")
|
||||
<< " unsupported in UCP mode at index " << c.loc << ".";
|
||||
throw ParseError(str.str());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Out-of-line destructor definition; no custom cleanup is performed.
UnsupportedVisitor::~UnsupportedVisitor() = default;
|
||||
|
||||
void checkUnsupported(const Component &root) {
|
||||
UnsupportedVisitor vis;
|
||||
root.accept(vis);
|
||||
}
|
||||
|
||||
} // namespace ue2
|
47
src/parser/unsupported.h
Normal file
47
src/parser/unsupported.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Checks component trees for unsupported components.
|
||||
*/
|
||||
#ifndef PARSER_UNSUPPORTED_H_
|
||||
#define PARSER_UNSUPPORTED_H_
|
||||
|
||||
#include "parse_error.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
class Component;
|
||||
|
||||
/** \brief Throws a ParseError if this component tree contains an unsupported
|
||||
* Component. */
|
||||
void checkUnsupported(const Component &root);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // PARSER_UNSUPPORTED_H_
|
163
src/parser/utf8_validate.cpp
Normal file
163
src/parser/utf8_validate.cpp
Normal file
@@ -0,0 +1,163 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "utf8_validate.h"
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "util/unicode_def.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
static
|
||||
bool hasValidContBytes(const u8 *s, size_t num) {
|
||||
/* continuer bytes must all be of the form 10xx xxxx */
|
||||
for (size_t i = 0; i < num; i++) {
|
||||
if ((s[i] & 0xc0) != UTF_CONT_BYTE_HEADER) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static
|
||||
bool isAllowedCodepoint(u32 val) {
|
||||
if (val >= 0xd800 && val <= 0xdfff) {
|
||||
return false; // High and low surrogate halves
|
||||
}
|
||||
if (val > 0x10ffff) {
|
||||
return false; // As per limit in RFC 3629
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool isValidUtf8(const char *expression) {
|
||||
if (!expression) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const size_t len = strlen(expression);
|
||||
const u8 *s = (const u8 *)expression;
|
||||
u32 val;
|
||||
|
||||
size_t i = 0;
|
||||
while (i < len) {
|
||||
DEBUG_PRINTF("byte %zu: 0x%02x\n", i, s[i]);
|
||||
// One octet.
|
||||
if (s[i] < 0x7f) {
|
||||
DEBUG_PRINTF("one octet\n");
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Two octets.
|
||||
if ((s[i] & 0xe0) == UTF_TWO_BYTE_HEADER) {
|
||||
DEBUG_PRINTF("two octets\n");
|
||||
if (i + 2 > len) {
|
||||
break;
|
||||
}
|
||||
if (!hasValidContBytes(&s[i] + 1, 1)) {
|
||||
break;
|
||||
}
|
||||
val = ((s[i] & 0x1f) << 6) | (s[i + 1] & UTF_CONT_BYTE_VALUE_MASK);
|
||||
DEBUG_PRINTF("val=0x%x\n", val);
|
||||
if (val < 1U << 7) {
|
||||
DEBUG_PRINTF("overlong encoding\n");
|
||||
break;
|
||||
}
|
||||
if (!isAllowedCodepoint(val)) {
|
||||
DEBUG_PRINTF("codepoint not allowed\n");
|
||||
break;
|
||||
}
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Three octets.
|
||||
if ((s[i] & 0xf0) == UTF_THREE_BYTE_HEADER) {
|
||||
DEBUG_PRINTF("three octets\n");
|
||||
if (i + 3 > len) {
|
||||
break;
|
||||
}
|
||||
if (!hasValidContBytes(&s[i] + 1, 2)) {
|
||||
break;
|
||||
}
|
||||
val = ((s[i] & 0xf) << 12) |
|
||||
((s[i + 1] & UTF_CONT_BYTE_VALUE_MASK) << 6) |
|
||||
(s[i + 2] & UTF_CONT_BYTE_VALUE_MASK);
|
||||
if (val < 1U << 11) {
|
||||
DEBUG_PRINTF("overlong encoding\n");
|
||||
break;
|
||||
}
|
||||
if (!isAllowedCodepoint(val)) {
|
||||
DEBUG_PRINTF("codepoint not allowed\n");
|
||||
break;
|
||||
}
|
||||
i += 3;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Four octets.
|
||||
if ((s[i] & 0xf8) == UTF_FOUR_BYTE_HEADER) {
|
||||
DEBUG_PRINTF("four octets\n");
|
||||
if (i + 4 > len) {
|
||||
break;
|
||||
}
|
||||
if (!hasValidContBytes(&s[i] + 1, 3)) {
|
||||
break;
|
||||
}
|
||||
val = ((s[i] & 0xf) << 18) |
|
||||
((s[i + 1] & UTF_CONT_BYTE_VALUE_MASK) << 12) |
|
||||
((s[i + 2] & UTF_CONT_BYTE_VALUE_MASK) << 6) |
|
||||
(s[i + 3] & UTF_CONT_BYTE_VALUE_MASK);
|
||||
if (val < 1U << 16) {
|
||||
DEBUG_PRINTF("overlong encoding\n");
|
||||
break;
|
||||
}
|
||||
if (!isAllowedCodepoint(val)) {
|
||||
DEBUG_PRINTF("codepoint not allowed\n");
|
||||
break;
|
||||
}
|
||||
i += 4;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Something else?
|
||||
DEBUG_PRINTF("bad byte 0x%02x\n", s[i]);
|
||||
break;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("i=%zu, len=%zu\n", i, len);
|
||||
return i == len;
|
||||
}
|
||||
|
||||
} // namespace ue2
|
39
src/parser/utf8_validate.h
Normal file
39
src/parser/utf8_validate.h
Normal file
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef PARSER_UTF8_VALIDATE_H
|
||||
#define PARSER_UTF8_VALIDATE_H
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
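/** \brief Validate that the given NUL-terminated expression is well-formed
 * UTF-8: no truncated or overlong sequences, no surrogate halves and no
 * codepoints above U+10FFFF. A null pointer is reported as valid. */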
|
||||
bool isValidUtf8(const char *expression);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif // PARSER_UTF8_VALIDATE_H
|
Reference in New Issue
Block a user