Initial commit of Hyperscan

This commit is contained in:
Matthew Barr
2015-10-20 09:13:35 +11:00
commit 904e436f11
610 changed files with 213627 additions and 0 deletions

View File

@@ -0,0 +1,159 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Character classes and their mnemonics.
*/
#include "AsciiComponentClass.h"
#include "Utf8ComponentClass.h"
#include "buildstate.h"
#include "parse_error.h"
#include "position.h"
#include "position_info.h"
#include "nfagraph/ng_builder.h"
#include "util/charreach_util.h"
using namespace std;
namespace ue2 {
/**
 * \brief Construct a character class for the given parse mode.
 *
 * The position is left uninitialised until notePositions() allocates one.
 * This class must not be used when the mode has UTF-8 enabled (asserted).
 */
AsciiComponentClass::AsciiComponentClass(const ParseMode &mode_in)
    : ComponentClass(mode_in),
      position(GlushkovBuildState::POS_UNINITIALIZED) {
    assert(!mode.utf8);
}
/** \brief Produce a newly allocated copy of this class via the copy ctor. */
AsciiComponentClass *AsciiComponentClass::clone() const {
    AsciiComponentClass *duplicate = new AsciiComponentClass(*this);
    return duplicate;
}
/**
 * \brief True iff neither the ordinary reach nor the UCP reach has anything
 * set.
 */
bool AsciiComponentClass::class_empty(void) const {
    if (!cr.none()) {
        return false;
    }
    return cr_ucp.none();
}
/**
 * \brief Close the pending range [range_start, to] and add it to the class.
 *
 * \param to last character of the range (inclusive).
 * \throw LocatedParseError if the range start is greater than \a to.
 */
void AsciiComponentClass::createRange(unichar to) {
    assert(range_start <= 0xff);
    const unsigned char lo = static_cast<u8>(range_start);

    // Reject reversed ranges before touching any state.
    if (lo > to) {
        throw LocatedParseError("Range out of order in character class");
    }

    in_cand_range = false;
    cr.setRange(lo, to);
    // The range is consumed; there is no longer a candidate start character.
    range_start = INVALID_UNICODE;
}
/**
 * \brief Allocate this class's single position in the NFA builder and
 * attach the finalized reach to it.
 */
void AsciiComponentClass::notePositions(GlushkovBuildState &bs) {
    // finalize() must have run before positions are handed out.
    assert(finalized);

    NFABuilder &nfa = bs.getBuilder();
    position = nfa.makePositions(1);
    nfa.addCharReach(position, cr);
    nfa.setNodeReportID(position, 0 /* offset adj */);

    // One position is used: bounds are [position, position + 1).
    recordPosBounds(position, position + 1);
}
/** \brief No-op for a character class; both arguments are ignored. */
void AsciiComponentClass::buildFollowSet(GlushkovBuildState &,
                                         const vector<PositionInfo> &) {
    // Intentionally empty: follow set construction is handled entirely
    // through the firsts/lasts of this component.
}
/**
 * \brief Add a predefined class (optionally negated) to this class.
 *
 * \param c predefined class to add; translated first when the mode has UCP
 *        enabled.
 * \param negative if true, the complement of the predefined class is added.
 * \throw LocatedParseError if a range is pending, since a predefined class
 *        cannot terminate a range.
 */
void AsciiComponentClass::add(PredefinedClass c, bool negative) {
    // A predefined class can't be the end point of a range.
    if (in_cand_range) {
        throw LocatedParseError("Invalid range in character class");
    }

    DEBUG_PRINTF("getting %u %s\n", (u32)c, negative ? "^" : "");

    if (mode.ucp) {
        c = translateForUcpMode(c, mode);
    }

    CharReach reach = getPredefinedCharReach(c, mode);
    if (negative) {
        reach.flip();
    }

    // UCP-derived reach is accumulated separately from the ordinary reach.
    CharReach &target = isUcp(c) ? cr_ucp : cr;
    target |= reach;

    // A predefined class can't start a range either.
    range_start = INVALID_UNICODE;
    in_cand_range = false;
}
/**
 * \brief Add a single character, or complete the pending range with it.
 *
 * \param c character to add; must not exceed 0xff.
 * \throw LocatedParseError if \a c is greater than 0xff (or, via
 *        createRange(), if it closes an out-of-order range).
 */
void AsciiComponentClass::add(unichar c) {
    DEBUG_PRINTF("adding \\x%02x\n", c);

    if (c > 0xff) { // does not fit in a byte
        throw LocatedParseError("Hexadecimal value is greater than \\xFF");
    }

    if (!in_cand_range) {
        // Plain literal: record it, and remember it as a possible range
        // start.
        cr.set(c);
        range_start = c;
    } else {
        // A range is pending, so this character is its upper end.
        createRange(c);
    }
}
/**
 * \brief Resolve the class into its final reach. Calling it again once
 * finalized has no effect.
 *
 * Order matters: a dangling range dash is added first, then caseless
 * expansion is applied to the ordinary reach, then the UCP reach is merged
 * in, and negation is applied last.
 */
void AsciiComponentClass::finalize() {
    if (finalized) {
        return; // nothing left to do
    }

    // An unclosed range such as '[a-]' or '[a-\Q\E]' means the dash was a
    // literal dash after all.
    if (in_cand_range) {
        cr.set('-');
        in_cand_range = false;
    }

    if (mode.caseless) {
        make_caseless(&cr);
    }

    // Merged after the caseless step: characters from ucp props don't
    // participate in caseless.
    cr |= cr_ucp;

    if (m_negate) {
        cr.flip();
    }

    finalized = true;
}
/** \brief The only first position is this class's own position. */
vector<PositionInfo> AsciiComponentClass::first(void) const {
    const PositionInfo only(position);
    return vector<PositionInfo>(1, only);
}
/** \brief The only last position is this class's own position. */
vector<PositionInfo> AsciiComponentClass::last(void) const {
    const PositionInfo only(position);
    return vector<PositionInfo>(1, only);
}
} // namespace ue2

View File

@@ -0,0 +1,91 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Character classes and their mnemonics.
*/
#ifndef ASCIICOMPONENTCLASS_H
#define ASCIICOMPONENTCLASS_H
#include "ComponentClass.h"
#include "util/charreach.h"
namespace ue2 {
/**
 * \brief Character class component used when the parse mode is not UTF-8
 * (the constructor asserts that mode.utf8 is off).
 *
 * Characters, ranges and predefined classes are accumulated through add()
 * and createRange(); finalize() then resolves them into a single reach which
 * notePositions() attaches to one NFA position.
 */
class AsciiComponentClass : public ComponentClass {
friend class ConstructLiteralVisitor;
friend class DumpVisitor;
friend class PrintVisitor;
friend class CaselessVisitor;
friend class SimplifyVisitor;
friend class SimplifyCandidatesVisitor;
public:
explicit AsciiComponentClass(const ParseMode &mode_in);
~AsciiComponentClass() override {}
/** \brief Returns a newly allocated copy of this class. */
AsciiComponentClass *clone() const override;
/** \brief Calls v.visit() and then v.post() on this node.
 * \return whatever v.visit() returned. */
Component *accept(ComponentVisitor &v) override {
Component *c = v.visit(this);
v.post(this);
return c;
}
/** \brief Calls v.pre(), v.during() and v.post() on this node, in that
 * order. */
void accept(ConstComponentVisitor &v) const override {
v.pre(*this);
v.during(*this);
v.post(*this);
}
/** \brief True iff both \a cr and \a cr_ucp have nothing set. */
bool class_empty(void) const override;
/** \brief Add a predefined class, complemented if \a negative is set. */
void add(PredefinedClass c, bool negative) override;
/** \brief Add a single character, or close a pending range with it. */
void add(unichar c) override;
/** \brief Resolve the accumulated state into the final reach. */
void finalize(void) override;
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &) override;
std::vector<PositionInfo> first(void) const override;
std::vector<PositionInfo> last(void) const override;
protected:
/** \brief Add the range from the remembered range start up to \a to. */
void createRange(unichar to) override;
private:
/** \brief Position assigned by notePositions(); starts out as
 * GlushkovBuildState::POS_UNINITIALIZED. */
Position position;
/** \brief Reach from literal characters, ranges and non-UCP predefined
 * classes; holds the final reach once finalize() has run. */
CharReach cr;
/** \brief Reach from predefined classes for which isUcp() holds. Kept
 * apart so that finalize() can merge it in after caseless handling. */
CharReach cr_ucp;
// Private copy ctor. Use clone instead.
AsciiComponentClass(const AsciiComponentClass &other)
: ComponentClass(other), position(other.position), cr(other.cr),
cr_ucp(other.cr_ucp) {}
};
} // namespace ue2
#endif // ASCIICOMPONENTCLASS_H

75
src/parser/Component.cpp Normal file
View File

@@ -0,0 +1,75 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Base class for all components.
*/
#include "Component.h"
#include "buildstate.h"
#include "position.h"
#include "position_info.h"
#include "ue2common.h"
using namespace std;
namespace ue2 {
/** \brief Start with both position bounds marked as uninitialised. */
Component::Component()
    : pos_begin(GlushkovBuildState::POS_UNINITIALIZED),
      pos_end(GlushkovBuildState::POS_UNINITIALIZED) {
}
/** \brief Nothing to release in the base class. */
Component::~Component() = default;
/** \brief Default: a component is repeatable. */
bool Component::repeatable() const {
    const bool can_repeat = true;
    return can_repeat;
}
/**
 * \brief Store the position bounds for this component.
 *
 * \param b value recorded as pos_begin.
 * \param e value recorded as pos_end.
 */
void Component::recordPosBounds(u32 b, u32 e) {
    pos_end = e;
    pos_begin = b;
}
/** \brief Default: no optimisation is performed; the argument is ignored. */
void Component::optimise(bool) {
    // Nothing to do in the base class.
}
/** \brief Default: a component is not vacuous everywhere. */
bool Component::vacuous_everywhere(void) const {
    const bool vacuous = false;
    return vacuous;
}
/** \brief Default: always returns false; the argument is ignored. */
bool Component::checkEmbeddedStartAnchor(bool) const {
    const bool still_at_start = false;
    return still_at_start;
}
/** \brief Default: always returns false; the argument is ignored. */
bool Component::checkEmbeddedEndAnchor(bool) const {
    const bool still_at_end = false;
    return still_at_end;
}
} // namespace ue2

145
src/parser/Component.h Normal file
View File

@@ -0,0 +1,145 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Base class for all components.
*/
#ifndef _RE_COMPONENT_H_
#define _RE_COMPONENT_H_
#include "ComponentVisitor.h"
#include "ConstComponentVisitor.h"
#include "position.h"
#include "ue2common.h"
#include <set>
#include <string>
#include <vector>
namespace ue2 {
class GlushkovBuildState;
class PositionInfo;
/**
 * \brief Describes whether a component can be passed through without
 * consuming input and, if so, what kind of empty paths it has.
 */
enum EmptyPathType {
NOT_EMPTY, /**< component must consume characters */
EPS_ONLY_PATHS, /**< eps path with no overhanging asserts */
BOUNDARY_PATHS /**< eps paths, some with overhanging asserts */
};
/**
 * \brief Base class for regular expression parse tree components.
 *
 * Declares the visitor entry points and the Glushkov construction interface
 * (first/last/empty, position noting, follow-set building) that concrete
 * components implement, plus overridable hooks with default behaviour.
 */
class Component {
friend class DumpVisitor;
public:
/** \brief Constructor. Both position bounds start out as
 * GlushkovBuildState::POS_UNINITIALIZED. */
Component();
/** \brief Destructor. */
virtual ~Component();
/** \brief Returns a newly-allocated deep copy of this component. */
virtual Component *clone() const = 0;
/** \brief Apply the given visitor functor.
 * \return the component that should occupy this one's place in its parent:
 * this component itself, a replacement, or null to have it removed. */
virtual Component *accept(ComponentVisitor &v) = 0;
/** \brief Apply the given const visitor functor. */
virtual void accept(ConstComponentVisitor &v) const = 0;
/** \brief Glushkov construction First() function.
* \return set of initial positions in this component. */
virtual std::vector<PositionInfo> first() const = 0;
/** \brief Glushkov construction Last() function.
* \return set of final positions in this component. */
virtual std::vector<PositionInfo> last() const = 0;
/** \brief Glushkov construction Empty() function.
* \return true iff the component accepts epsilon.
*
* Note: ^, $, etc are considered empty. */
virtual bool empty() const = 0;
/** \brief True iff epsilon can pass through the component.
*
* Note: ^, $, etc are not vacuous everywhere.
*
* The base implementation returns false. */
virtual bool vacuous_everywhere(void) const;
/** \brief True iff the component is repeatable on its own, without being
* encapsulated in a sequence first.
*
* This is true for most components, but not for repeats, anchors and word
* boundaries.
*
* The base implementation returns true. */
virtual bool repeatable() const;
/** \brief Optimisation pass on the component tree.
*
* Called before \ref notePositions. May modify the component tree.
* Assumes no start of match information is required.
*
* The base implementation does nothing.
*/
virtual void optimise(bool connected_to_sds);
/** \brief Informs the Glushkov build process of the positions used by this
* component. */
virtual void notePositions(GlushkovBuildState &bs) = 0;
/** \brief Glushkov construction Follow() function.
*
* Constructs (in \a bs) the set of positions in this component reachable
* from the positions in \a lastPos.
*
* \throw ParseError on failure
*/
virtual void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos) = 0;
/** \brief Return value is used for chaining, throws if finds embedded
* anchor.
*
* The base implementation returns false. */
virtual bool checkEmbeddedStartAnchor(bool at_start) const;
/** \brief Return value is used for chaining, throws if finds embedded
* anchor.
*
* The base implementation returns false. */
virtual bool checkEmbeddedEndAnchor(bool at_end) const;
protected:
/** \brief Called during \ref notePositions.
 *
 * Stores \a b in pos_begin and \a e in pos_end. */
void recordPosBounds(u32 b, u32 e);
/** \brief Lower position bound, as set by recordPosBounds(). */
u32 pos_begin;
/** \brief Upper position bound, as set by recordPosBounds(). */
u32 pos_end;
// Protected copy ctor. Use clone instead.
Component(const Component &other)
: pos_begin(other.pos_begin), pos_end(other.pos_end) {}
};
} // namespace ue2
#endif

View File

@@ -0,0 +1,190 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Alternations (foo|bar|baz).
*/
#include "ComponentAlternation.h"
#include "buildstate.h"
#include "position.h"
#include "position_info.h"
#include "nfagraph/ng_builder.h"
#include "ue2common.h"
#include <algorithm>
using namespace std;
namespace ue2 {
/** \brief Construct an alternation with no branches. */
ComponentAlternation::ComponentAlternation() = default;
/** \brief Children are released by their owning pointers. */
ComponentAlternation::~ComponentAlternation() = default;
/**
 * \brief Copy constructor: clones every branch of \a other, preserving
 * their order.
 */
ComponentAlternation::ComponentAlternation(const ComponentAlternation &other)
    : Component(other) {
    for (auto it = other.children.begin(); it != other.children.end(); ++it) {
        const Component *branch = it->get();
        assert(branch);
        // Take ownership of the clone before it goes into the vector.
        unique_ptr<Component> copy(branch->clone());
        children.push_back(std::move(copy));
    }
}
/** \brief Produce a newly allocated copy, cloning all branches. */
ComponentAlternation *ComponentAlternation::clone() const {
    ComponentAlternation *duplicate = new ComponentAlternation(*this);
    return duplicate;
}
/**
 * \brief Apply a mutating visitor to this node and then to each branch.
 *
 * If the visitor replaces this node, the replacement is returned at once
 * and the branches are not visited. Otherwise each branch is visited in
 * order; a branch whose accept() returns a different pointer is swapped for
 * it, and branches replaced by null are dropped afterwards.
 *
 * \return the replacement chosen by the visitor, or this node.
 */
Component *ComponentAlternation::accept(ComponentVisitor &v) {
    Component *replacement = v.visit(this);
    if (replacement != this) {
        v.post(this);
        return replacement;
    }

    for (auto &slot : children) {
        Component *const before = slot.get();
        Component *const after = slot->accept(v);
        if (after != before) {
            // Either a new component takes this slot, or null marks it for
            // removal below.
            slot.reset(after);
        }
    }

    // Compact away the slots that were nulled out.
    auto dead = std::remove(children.begin(), children.end(), nullptr);
    children.erase(dead, children.end());

    v.post(this);
    return this;
}
/**
 * \brief Apply a const visitor: pre() first, then each branch in order with
 * during() called between consecutive branches, then post().
 */
void ComponentAlternation::accept(ConstComponentVisitor &v) const {
    v.pre(*this);

    const size_t count = children.size();
    for (size_t idx = 0; idx < count; ++idx) {
        children[idx]->accept(v);
        // No during() after the final branch.
        if (idx + 1 != count) {
            v.during(*this);
        }
    }

    v.post(*this);
}
/**
 * \brief Add a branch at the end of the alternation.
 *
 * \param component branch to add; ownership passes to this alternation.
 */
void ComponentAlternation::append(unique_ptr<Component> component) {
    // Qualified on purpose: an unqualified move() only resolves through the
    // file-scope using-directive/ADL and is diagnosed by newer clang
    // (-Wunqualified-std-cast-call).
    children.push_back(std::move(component));
}
/**
 * \brief First positions of the alternation: the firsts of every branch,
 * concatenated in branch order so left-to-right priority is kept.
 */
vector<PositionInfo> ComponentAlternation::first() const {
    vector<PositionInfo> collected;
    for (const auto &branch : children) {
        const vector<PositionInfo> branch_firsts = branch->first();
        collected.insert(collected.end(), branch_firsts.begin(),
                         branch_firsts.end());
    }
    return collected;
}
/**
 * \brief Last positions of the alternation: the lasts of every branch,
 * concatenated in branch order.
 */
vector<PositionInfo> ComponentAlternation::last() const {
    vector<PositionInfo> collected;
    for (const auto &branch : children) {
        const vector<PositionInfo> branch_lasts = branch->last();
        collected.insert(collected.end(), branch_lasts.begin(),
                         branch_lasts.end());
    }
    return collected;
}
bool ComponentAlternation::empty(void) const {
// an alternation can be empty if any of its components are empty
for (const auto &c : children) {
if (c->empty()) {
return true;
}
}
return false;
}
/**
 * \brief Let every branch note its positions, then record the bounds as the
 * builder's vertex count before and after.
 */
void ComponentAlternation::notePositions(GlushkovBuildState &bs) {
    const u32 begin_pos = bs.getBuilder().numVertices();

    for (auto it = children.begin(); it != children.end(); ++it) {
        (*it)->notePositions(bs);
    }

    const u32 end_pos = bs.getBuilder().numVertices();
    recordPosBounds(begin_pos, end_pos);
}
/**
 * \brief Forward follow-set construction to every branch, each with the
 * same \a lastPos.
 */
void ComponentAlternation::buildFollowSet(GlushkovBuildState &bs,
                                          const vector<PositionInfo> &lastPos) {
    for (auto it = children.begin(); it != children.end(); ++it) {
        (*it)->buildFollowSet(bs, lastPos);
    }
}
/**
 * \brief Run the embedded start anchor check on every branch.
 *
 * Every branch is checked, even after the result is already false.
 *
 * \return \a at_start AND-ed with the result from each branch.
 */
bool ComponentAlternation::checkEmbeddedStartAnchor(bool at_start) const {
    bool result = at_start;
    for (const auto &branch : children) {
        // Evaluate first so that no branch is ever skipped.
        const bool branch_result = branch->checkEmbeddedStartAnchor(at_start);
        result = result && branch_result;
    }
    return result;
}
/**
 * \brief Run the embedded end anchor check on every branch.
 *
 * Every branch is checked, even after the result is already false.
 *
 * \return \a at_end AND-ed with the result from each branch.
 */
bool ComponentAlternation::checkEmbeddedEndAnchor(bool at_end) const {
    bool result = at_end;
    for (const auto &branch : children) {
        // Evaluate first so that no branch is ever skipped.
        const bool branch_result = branch->checkEmbeddedEndAnchor(at_end);
        result = result && branch_result;
    }
    return result;
}
bool ComponentAlternation::vacuous_everywhere(void) const {
for (const auto &c : children) {
if (c->vacuous_everywhere()) {
return true;
}
}
return false;
}
/**
 * \brief Run the optimisation pass on every branch, passing
 * \a connected_to_sds through unchanged.
 */
void ComponentAlternation::optimise(bool connected_to_sds) {
    for (auto it = children.begin(); it != children.end(); ++it) {
        (*it)->optimise(connected_to_sds);
    }
}
} // namespace ue2

View File

@@ -0,0 +1,79 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Alternations (foo|bar|baz).
*/
#ifndef COMPONENT_ALTERNATION_H
#define COMPONENT_ALTERNATION_H
#include "Component.h"
#include "position.h"
#include <memory>
namespace ue2 {
class PositionInfo;
/**
 * \brief Parse tree node for an alternation (foo|bar|baz).
 *
 * Owns its branches and keeps them in the order they were appended.
 */
class ComponentAlternation : public Component {
friend class DumpVisitor;
friend class SimplifyVisitor;
public:
ComponentAlternation();
~ComponentAlternation() override;
/** \brief Returns a newly allocated copy with every branch cloned. */
ComponentAlternation *clone() const override;
Component *accept(ComponentVisitor &v) override;
void accept(ConstComponentVisitor &v) const override;
/** \brief Number of branches currently held. */
size_t numBranches() const { return children.size(); }
/** \brief Takes ownership of \a component and adds it as the last
 * branch. */
void append(std::unique_ptr<Component> component);
/** \brief Firsts of all branches, concatenated in branch order. */
std::vector<PositionInfo> first() const override;
/** \brief Lasts of all branches, concatenated in branch order. */
std::vector<PositionInfo> last() const override;
/** \brief True iff at least one branch is empty. */
bool empty(void) const override;
/** \brief True iff at least one branch is vacuous everywhere. */
bool vacuous_everywhere() const override;
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos) override;
bool checkEmbeddedStartAnchor(bool at_start) const override;
bool checkEmbeddedEndAnchor(bool at_end) const override;
void optimise(bool connected_to_sds) override;
private:
/** \brief The branches, owned by this node. */
std::vector<std::unique_ptr<Component>> children;
// Private copy ctor (clones each branch). Use clone() instead.
ComponentAlternation(const ComponentAlternation &other);
};
} // namespace ue2
#endif

View File

@@ -0,0 +1,121 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Lookahead/lookbehind zero-width assertions.
*/
#include "ComponentAssertion.h"
#include "buildstate.h"
#include "position.h"
#include "position_info.h"
#include "ue2common.h"
#include <cassert>
#include <algorithm>
using namespace std;
namespace ue2 {
/**
 * \brief Construct an assertion with the given direction and sense.
 *
 * \param dir lookahead or lookbehind.
 * \param sense positive or negative.
 */
ComponentAssertion::ComponentAssertion(enum Direction dir, enum Sense sense)
    : m_dir(dir),
      m_sense(sense) {
}
/** \brief Nothing extra to release beyond the base class. */
ComponentAssertion::~ComponentAssertion() = default;
/** \brief Produce a newly allocated copy of this assertion. */
ComponentAssertion *ComponentAssertion::clone() const {
    ComponentAssertion *duplicate = new ComponentAssertion(*this);
    return duplicate;
}
/**
 * \brief Apply a mutating visitor to this assertion and then to each child.
 *
 * If the visitor replaces this node, the replacement is returned at once
 * and the children are not visited. Otherwise each child is visited in
 * order; a child whose accept() returns a different pointer is swapped for
 * it, and children replaced by null are dropped afterwards.
 *
 * \return the replacement chosen by the visitor, or this node.
 */
Component *ComponentAssertion::accept(ComponentVisitor &v) {
    Component *replacement = v.visit(this);
    if (replacement != this) {
        v.post(this);
        return replacement;
    }

    for (auto &slot : children) {
        Component *const before = slot.get();
        Component *const after = slot->accept(v);
        if (after != before) {
            // Either a new component takes this slot, or null marks it for
            // removal below.
            slot.reset(after);
        }
    }

    // Compact away the slots that were nulled out.
    auto dead = std::remove(children.begin(), children.end(), nullptr);
    children.erase(dead, children.end());

    v.post(this);
    return this;
}
/**
 * \brief Apply a const visitor: pre() first, then each child in order with
 * during() called between consecutive children, then post().
 */
void ComponentAssertion::accept(ConstComponentVisitor &v) const {
    v.pre(*this);

    const size_t count = children.size();
    for (size_t idx = 0; idx < count; ++idx) {
        children[idx]->accept(v);
        // No during() after the final child.
        if (idx + 1 != count) {
            v.during(*this);
        }
    }

    v.post(*this);
}
/**
 * \brief Not expected to be called on an assertion: asserts, and returns an
 * empty set if assertions are compiled out.
 */
vector<PositionInfo> ComponentAssertion::first() const {
    assert(0);
    vector<PositionInfo> none;
    return none;
}
/**
 * \brief Not expected to be called on an assertion: asserts, and returns an
 * empty set if assertions are compiled out.
 */
vector<PositionInfo> ComponentAssertion::last() const {
    assert(0);
    vector<PositionInfo> none;
    return none;
}
/** \brief An assertion always reports itself as empty. */
bool ComponentAssertion::empty() const {
    const bool is_empty = true;
    return is_empty;
}
/**
 * \brief Not expected to be called on an assertion: asserts, and otherwise
 * does nothing.
 */
void ComponentAssertion::notePositions(GlushkovBuildState &) {
    assert(0);
}
/**
 * \brief Not expected to be called on an assertion: asserts, and otherwise
 * does nothing.
 */
void ComponentAssertion::buildFollowSet(GlushkovBuildState &,
                                        const vector<PositionInfo> &) {
    assert(0);
}
bool ComponentAssertion::repeatable() const {
// If this assertion has no children (it's an empty sequence, like that
// produced by '(?!)') then PCRE would throw a "nothing to repeat" error.
// So we do as well.
return !children.empty();
}
} // namespace ue2

View File

@@ -0,0 +1,76 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Lookahead/lookbehind zero-width assertions.
*/
#ifndef _RE_COMPONENTASSERTION_H_
#define _RE_COMPONENTASSERTION_H_
#include "ComponentSequence.h"
namespace ue2 {
/**
 * \brief Lookahead/lookbehind zero-width assertion.
 *
 * A sequence subclass tagged with a direction and a sense. Its first(),
 * last(), notePositions() and buildFollowSet() implementations are not
 * expected to be called: each of them asserts.
 */
class ComponentAssertion : public ComponentSequence {
friend class DumpVisitor;
friend class PrintVisitor;
public:
/** \brief Which way the assertion looks. */
enum Direction {
LOOKAHEAD, //!< lookahead (forward) assertion
LOOKBEHIND //!< lookbehind (backward) assertion
};
/** \brief Whether the assertion is positive or negative. */
enum Sense {
POS, //!< positive assertion, (?=...) or (?<=...)
NEG //!< negative assertion, (?!...) or (?<!...)
};
ComponentAssertion(enum Direction dir, enum Sense sense);
~ComponentAssertion() override;
/** \brief Returns a newly allocated copy of this assertion. */
ComponentAssertion *clone() const override;
Component *accept(ComponentVisitor &v) override;
void accept(ConstComponentVisitor &v) const override;
std::vector<PositionInfo> first() const override;
std::vector<PositionInfo> last() const override;
/** \brief Always true for an assertion. */
bool empty() const override;
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos) override;
/** \brief True iff the assertion has at least one child. */
bool repeatable() const override;
private:
/** \brief Direction given at construction. */
enum Direction m_dir;
/** \brief Sense given at construction. */
enum Sense m_sense;
};
} // namespace ue2
#endif

View File

@@ -0,0 +1,92 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Atomic groups (?>...)
*/
#include "ComponentAtomicGroup.h"
#include "buildstate.h"
#include "position.h"
#include <algorithm>
using namespace std;
namespace ue2 {
/** \brief Produce a newly allocated copy of this atomic group. */
ComponentAtomicGroup *ComponentAtomicGroup::clone() const {
    ComponentAtomicGroup *duplicate = new ComponentAtomicGroup(*this);
    return duplicate;
}
/**
 * \brief Apply a mutating visitor to this group and then to each child.
 *
 * If the visitor replaces this node, the replacement is returned at once
 * and the children are not visited. Otherwise each child is visited in
 * order; a child whose accept() returns a different pointer is swapped for
 * it, and children replaced by null are dropped afterwards.
 *
 * \return the replacement chosen by the visitor, or this node.
 */
Component *ComponentAtomicGroup::accept(ComponentVisitor &v) {
    Component *replacement = v.visit(this);
    if (replacement != this) {
        v.post(this);
        return replacement;
    }

    for (auto &slot : children) {
        Component *const before = slot.get();
        Component *const after = slot->accept(v);
        if (after != before) {
            // Either a new component takes this slot, or null marks it for
            // removal below.
            slot.reset(after);
        }
    }

    // Compact away the slots that were nulled out.
    auto dead = std::remove(children.begin(), children.end(), nullptr);
    children.erase(dead, children.end());

    v.post(this);
    return this;
}
/**
 * \brief Apply a const visitor: pre() first, then each child in order with
 * during() called between consecutive children, then post().
 */
void ComponentAtomicGroup::accept(ConstComponentVisitor &v) const {
    v.pre(*this);

    const size_t count = children.size();
    for (size_t idx = 0; idx < count; ++idx) {
        children[idx]->accept(v);
        // No during() after the final child.
        if (idx + 1 != count) {
            v.during(*this);
        }
    }

    v.post(*this);
}
/**
 * \brief Not expected to be called on an atomic group: asserts, and
 * otherwise does nothing.
 */
void ComponentAtomicGroup::notePositions(GlushkovBuildState &) {
    assert(0);
}
/**
 * \brief Not expected to be called on an atomic group: asserts, and
 * otherwise does nothing.
 */
void ComponentAtomicGroup::buildFollowSet(GlushkovBuildState &,
                                          const vector<PositionInfo> &) {
    assert(0);
}
} // namespace

View File

@@ -0,0 +1,58 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Atomic groups (?>...)
*/
#ifndef _COMPONENTATOMICGROUP_H_
#define _COMPONENTATOMICGROUP_H_
#include "ComponentSequence.h"
namespace ue2 {
/** \brief Atomic group component, i.e. the (?>...) construct.
 *
 * The atomic group component is a subclass of sequence that is only buildable
 * in prefilter mode, where we treat it as a standard sequence.
 */
class ComponentAtomicGroup : public ComponentSequence {
friend class DumpVisitor;
public:
ComponentAtomicGroup() {}
~ComponentAtomicGroup() override {}
// Returns a new copy of this component created with the copy constructor.
ComponentAtomicGroup *clone() const override;
// Mutating visitor entry point; returns this node or the visitor's
// replacement for it.
Component *accept(ComponentVisitor &v) override;
// Read-only visitor entry point.
void accept(ConstComponentVisitor &v) const override;
// The two build hooks below are implemented as assert(0) for this class.
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos) override;
};
} // namespace ue2
#endif

View File

@@ -0,0 +1,79 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Back-references (/([a-f]{3}).*\\1/)
*/
#include "ComponentBackReference.h"
#include "buildstate.h"
#include "position.h"
#include "position_info.h"
#include "nfagraph/ng_builder.h"
#include "util/charreach.h"
#include <cassert>
using namespace std;
namespace ue2 {
/** \brief Build a back-reference identified by a numeric id.
 *
 * \param id value stored in ref_id; the name is left as an empty string.
 */
ComponentBackReference::ComponentBackReference(unsigned int id)
    : name(),
      ref_id(id) {
}
/** \brief Build a back-reference identified by name.
 *
 * \param s name to store; ref_id is set to zero.
 */
ComponentBackReference::ComponentBackReference(const string &s)
    : name(s),
      ref_id(0U) {
}
/** \brief Copy this back-reference.
 *
 * \return pointer to a new ComponentBackReference built with the copy
 *         constructor; nothing in this function releases it.
 */
ComponentBackReference * ComponentBackReference::clone() const {
    ComponentBackReference *duplicate = new ComponentBackReference(*this);
    return duplicate;
}
/** \brief First-position query; treated as unreachable for back-references.
 *
 * Trips assert(0) when assertions are enabled.
 *
 * \return an empty position list if execution continues past the assert.
 */
vector<PositionInfo> ComponentBackReference::first() const {
    assert(0);
    vector<PositionInfo> none;
    return none;
}
/** \brief Last-position query; treated as unreachable for back-references.
 *
 * Trips assert(0) when assertions are enabled.
 *
 * \return an empty position list if execution continues past the assert.
 */
vector<PositionInfo> ComponentBackReference::last() const {
    assert(0);
    vector<PositionInfo> none;
    return none;
}
/** \brief Always reports true for a back-reference. */
bool ComponentBackReference::empty() const {
    return true;
}
/** \brief Position-noting hook; treated as unreachable for back-references.
 *
 * The body is a bare assert(0): it trips when assertions are enabled and does
 * nothing otherwise. The build state argument is unnamed and unused.
 */
void ComponentBackReference::notePositions(GlushkovBuildState &) {
assert(0);
}
/** \brief Follow-set construction hook; treated as unreachable for
 * back-references.
 *
 * Both arguments are unnamed and unused. The body is a bare assert(0): it
 * trips when assertions are enabled and does nothing otherwise.
 */
void ComponentBackReference::buildFollowSet(GlushkovBuildState &,
const vector<PositionInfo> &) {
assert(0);
}
} // namespace ue2

View File

@@ -0,0 +1,84 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Back-references (/([a-f]{3}).*\\1/)
*/
#ifndef _RE_COMPONENTBACKREFERENCE_H_
#define _RE_COMPONENTBACKREFERENCE_H_
#include "Component.h"
#include <string>
namespace ue2 {
/** \brief Parse-tree component for a back-reference, carrying either a
 * numeric reference id or a reference name. */
class ComponentBackReference : public Component {
friend class DumpVisitor;
friend class PrintVisitor;
friend class ReferenceVisitor;
public:
// Reference by numeric id; the name member is left empty.
explicit ComponentBackReference(unsigned int id);
// Reference by name; ref_id is initialised to 0.
explicit ComponentBackReference(const std::string &s);
~ComponentBackReference() override {}
// Returns a new copy made with the private copy constructor.
ComponentBackReference *clone() const override;
// Mutating visitor: visit() then post() on this node, returning whatever
// visit() returned. No children are traversed here.
Component *accept(ComponentVisitor &v) override {
Component *c = v.visit(this);
v.post(this);
return c;
}
// Read-only visitor: pre(), during() and post() are each called once.
void accept(ConstComponentVisitor &v) const override {
v.pre(*this);
v.during(*this);
v.post(*this);
}
// Accessors for the stored id and name.
unsigned int getRefID() const { return ref_id; }
const std::string &getRefName() const { return name; }
// Build-time queries and hooks overridden from Component.
std::vector<PositionInfo> first() const override;
std::vector<PositionInfo> last() const override;
bool empty(void) const override;
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos) override;
private:
// Private copy ctor. Use clone instead.
ComponentBackReference(const ComponentBackReference &other)
: Component(other), name(other.name), ref_id(other.ref_id) {}
// Reference name; empty when constructed from an id.
std::string name;
// Numeric reference id; 0 when constructed from a name.
unsigned int ref_id;
};
} // namespace ue2
#endif

View File

@@ -0,0 +1,186 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Boundary assertions (^, $, \\A, \\Z, \\z)
*/
#include "ComponentBoundary.h"
#include "buildstate.h"
#include "parse_error.h"
#include "position.h"
#include "position_info.h"
#include "Parser.h"
#include "util/charreach.h"
#include "nfagraph/ng_builder.h"
#include <cassert>
using namespace std;
namespace ue2 {
/** \brief Build a boundary assertion of the given kind.
 *
 * The newline position starts as POS_UNINITIALIZED and the first/last
 * position lists start empty; notePositions() fills them in.
 *
 * \param bound which assertion this component represents.
 */
ComponentBoundary::ComponentBoundary(enum Boundary bound)
    : m_bound(bound),
      m_newline(GlushkovBuildState::POS_UNINITIALIZED),
      m_first(),
      m_last() {
}
/** \brief Out-of-line destructor; there is nothing to release beyond what the
 * members destroy themselves. */
ComponentBoundary::~ComponentBoundary() = default;
/** \brief Copy constructor, used by clone().
 *
 * Copies the Component base and every member: the boundary kind, the newline
 * position, and both position lists.
 */
ComponentBoundary::ComponentBoundary(const ComponentBoundary &other)
: Component(other), m_bound(other.m_bound), m_newline(other.m_newline),
m_first(other.m_first), m_last(other.m_last) {}
/** \brief Copy this boundary assertion.
 *
 * \return pointer to a new ComponentBoundary built with the copy
 *         constructor; nothing in this function releases it.
 */
ComponentBoundary * ComponentBoundary::clone() const {
    ComponentBoundary *duplicate = new ComponentBoundary(*this);
    return duplicate;
}
/** \brief Return a copy of the first-position list recorded by
 * notePositions(). */
vector<PositionInfo> ComponentBoundary::first() const {
    vector<PositionInfo> firsts(m_first);
    return firsts;
}
/** \brief Return a copy of the last-position list recorded by
 * notePositions(). */
vector<PositionInfo> ComponentBoundary::last() const {
    vector<PositionInfo> lasts(m_last);
    return lasts;
}
/** \brief Always reports true for a boundary assertion. */
bool ComponentBoundary::empty() const {
return true;
}
/** \brief Always reports false: a boundary assertion is never repeatable. */
bool ComponentBoundary::repeatable() const {
return false;
}
/** \brief Allocate one new position in the builder and give it the '\n'
 * character reach.
 *
 * \param bs build state whose NFABuilder receives the position.
 * \return the newly created position.
 */
static
Position makeNewline(GlushkovBuildState &bs) {
    NFABuilder &nfa = bs.getBuilder();

    Position nl_pos = nfa.makePositions(1);
    nfa.addCharReach(nl_pos, CharReach('\n'));

    return nl_pos;
}
/** \brief Record the positions this assertion contributes to the build.
 *
 * Every supported boundary kind appends one flagged epsilon entry to m_first.
 * The two start anchors also append the builder's start position, and
 * BEGIN_LINE additionally creates a dedicated newline position that is added
 * to both m_first and m_last. An unrecognised kind trips assert(0) and records
 * nothing.
 *
 * \param bs build state providing the NFABuilder.
 */
void ComponentBoundary::notePositions(GlushkovBuildState & bs) {
    NFABuilder &builder = bs.getBuilder();
    const Position startState = builder.getStart();

    // Shared by all supported kinds; only the flags differ.
    PositionInfo epsilon(GlushkovBuildState::POS_EPSILON);

    switch (m_bound) {
    case BEGIN_STRING: // beginning of data stream ('^')
    case BEGIN_LINE:   // multiline anchor: beginning of stream or a newline
        epsilon.flags = POS_FLAG_NOFLOAT;
        m_first.push_back(epsilon);
        // We have the start vertex in firsts so that we can discourage
        // the mid-pattern use of boundaries.
        m_first.push_back(startState);

        if (m_bound == BEGIN_LINE) {
            // Newline
            m_newline = makeNewline(bs);
            builder.setAssertFlag(m_newline, POS_FLAG_MULTILINE_START);
            builder.setAssertFlag(m_newline, POS_FLAG_VIRTUAL_START);

            PositionInfo nl(m_newline);
            nl.flags = POS_FLAG_MUST_FLOAT | POS_FLAG_FIDDLE_ACCEPT;
            m_first.push_back(nl);
            m_last.push_back(nl);
            recordPosBounds(m_newline, m_newline + 1);
        }
        return;

    case END_STRING: // end of data stream ('\z')
        epsilon.flags = POS_FLAG_WIRE_EOD | POS_FLAG_NO_NL_EOD |
                        POS_FLAG_NO_NL_ACCEPT | POS_FLAG_ONLY_ENDS;
        break;

    case END_STRING_OPTIONAL_LF: // end of data with optional LF ('$')
        epsilon.flags = POS_FLAG_WIRE_EOD | POS_FLAG_WIRE_NL_EOD |
                        POS_FLAG_NO_NL_ACCEPT | POS_FLAG_ONLY_ENDS;
        break;

    case END_LINE: // multiline anchor: end of data or a newline
        epsilon.flags = POS_FLAG_WIRE_EOD | POS_FLAG_WIRE_NL_EOD |
                        POS_FLAG_WIRE_NL_ACCEPT | POS_FLAG_ONLY_ENDS;
        break;

    default:
        // unsupported
        assert(0);
        return;
    }

    // End anchors contribute only the flagged epsilon.
    m_first.push_back(epsilon);
}
/** \brief Follow-set construction hook; intentionally does nothing for a
 * boundary assertion. Both arguments are unnamed and unused. */
void ComponentBoundary::buildFollowSet(GlushkovBuildState &,
const vector<PositionInfo> &) {
}
/** \brief Reject a start anchor that is not at the start of the pattern.
 *
 * \param at_start true if this component is still at the pattern start.
 * \return \p at_start unchanged.
 * \throws ParseError if \p at_start is false and this is BEGIN_STRING or
 *         BEGIN_LINE.
 */
bool ComponentBoundary::checkEmbeddedStartAnchor(bool at_start) const {
    const bool is_start_anchor =
        (m_bound == BEGIN_STRING) || (m_bound == BEGIN_LINE);

    if (!at_start && is_start_anchor) {
        throw ParseError("Embedded start anchors not supported.");
    }

    return at_start;
}
/** \brief Reject an end anchor that is not at the end of the pattern.
 *
 * Any boundary kind other than BEGIN_STRING and BEGIN_LINE counts as an end
 * anchor here.
 *
 * \param at_end true if this component is still at the pattern end.
 * \return \p at_end unchanged.
 * \throws ParseError if \p at_end is false and this is an end anchor.
 */
bool ComponentBoundary::checkEmbeddedEndAnchor(bool at_end) const {
    const bool is_start_anchor =
        (m_bound == BEGIN_STRING) || (m_bound == BEGIN_LINE);

    if (!at_end && !is_start_anchor) {
        throw ParseError("Embedded end anchors not supported.");
    }

    return at_end;
}
} // namespace ue2

View File

@@ -0,0 +1,94 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Boundary assertions (^, $, \\A, \\Z, \\z)
*/
#ifndef _RE_COMPONENTBOUNDARY_H_
#define _RE_COMPONENTBOUNDARY_H_
#include "Component.h"
#include "position.h"
namespace ue2 {
/** \brief Encapsulates a line/string boundary assertion.
 *
 * The position lists returned by first() and last() are empty until
 * notePositions() has run.
 */
class ComponentBoundary : public Component {
friend class DumpVisitor;
friend class PrintVisitor;
friend class UnsafeBoundsVisitor;
friend class MultilineVisitor;
public:
enum Boundary {
BEGIN_STRING, //!< beginning of data stream
END_STRING, //!< end of data stream
END_STRING_OPTIONAL_LF, //!< end of data stream with an optional
// linefeed
BEGIN_LINE, //!< '(^|\\n)'
END_LINE //!< '($|\\n)'
};
explicit ComponentBoundary(enum Boundary bound);
~ComponentBoundary() override;
// Returns a new copy made with the private copy constructor.
ComponentBoundary *clone() const override;
// Mutating visitor: visit() then post() on this node, returning whatever
// visit() returned. No children are traversed here.
Component *accept(ComponentVisitor &v) override {
Component *c = v.visit(this);
v.post(this);
return c;
}
// Read-only visitor: pre(), during() and post() are each called once.
void accept(ConstComponentVisitor &v) const override {
v.pre(*this);
v.during(*this);
v.post(*this);
}
// Copies of m_first / m_last respectively.
std::vector<PositionInfo> first() const override;
std::vector<PositionInfo> last() const override;
// empty() is always true and repeatable() is always false for a boundary.
bool empty() const override;
bool repeatable() const override;
// Fills m_first/m_last (and m_newline for BEGIN_LINE) for this boundary.
void notePositions(GlushkovBuildState &bs) override;
// No-op for boundaries.
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos) override;
// Both checks return their argument unchanged, or throw ParseError when the
// anchor is embedded mid-pattern.
bool checkEmbeddedStartAnchor(bool at_start) const override;
bool checkEmbeddedEndAnchor(bool at_end) const override;
private:
enum Boundary m_bound; //!< \brief which assertion is that?
Position m_newline; //!< \brief special newline state
std::vector<PositionInfo> m_first; //!< \brief positions returned for first()
std::vector<PositionInfo> m_last; //!< \brief positions returned for last()
// Private copy constructor; external callers copy through clone().
ComponentBoundary(const ComponentBoundary &other);
};
} // namespace ue2
#endif

View File

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Single bytes (\\C metachar)
*/
#include "ComponentByte.h"
#include "buildstate.h"
#include "position.h"
#include "position_info.h"
#include "nfagraph/ng_builder.h"
#include "util/charreach.h"
using namespace std;
namespace ue2 {
/** \brief Build a byte component whose position is not yet assigned.
 *
 * The position stays POS_UNINITIALIZED until notePositions() allocates one.
 */
ComponentByte::ComponentByte()
: position(GlushkovBuildState::POS_UNINITIALIZED) {}
/** \brief Out-of-line destructor; there is nothing to release. */
ComponentByte::~ComponentByte() = default;
/** \brief Copy this byte component.
 *
 * \return pointer to a new ComponentByte built with the copy constructor;
 *         nothing in this function releases it.
 */
ComponentByte *ComponentByte::clone() const {
    ComponentByte *duplicate = new ComponentByte(*this);
    return duplicate;
}
/** \brief Return a one-element list holding this component's position. */
vector<PositionInfo> ComponentByte::first() const {
    vector<PositionInfo> only(1, PositionInfo(position));
    return only;
}
/** \brief Return a one-element list holding this component's position (the
 * same single position that first() reports). */
vector<PositionInfo> ComponentByte::last() const {
    vector<PositionInfo> only(1, PositionInfo(position));
    return only;
}
/** \brief Allocate this component's single position in the builder.
 *
 * The new position is given the reach returned by CharReach::dot() and a
 * report ID with an offset adjustment of 0.
 *
 * \param bs build state providing the NFABuilder.
 */
void ComponentByte::notePositions(GlushkovBuildState &bs) {
    NFABuilder &nfa = bs.getBuilder();

    position = nfa.makePositions(1);
    nfa.addCharReach(position, CharReach::dot());
    nfa.setNodeReportID(position, 0 /* offset adj */);
}
} // namespace ue2

View File

@@ -0,0 +1,80 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Single bytes (\\C metachar)
*/
#ifndef _RE_COMPONENTBYTE_H_
#define _RE_COMPONENTBYTE_H_
#include "Component.h"
namespace ue2 {
/** \brief Parse-tree component for the \\C metachar: one component backed by
 * exactly one builder position. */
class ComponentByte : public Component {
friend class DumpVisitor;
public:
ComponentByte(void);
~ComponentByte() override;
// Returns a new copy made with the private copy constructor.
ComponentByte *clone() const override;
// Mutating visitor: visit() then post() on this node, returning whatever
// visit() returned. No children are traversed here.
Component *accept(ComponentVisitor &v) override {
Component *c = v.visit(this);
v.post(this);
return c;
}
// Read-only visitor: pre(), during() and post() are each called once.
void accept(ConstComponentVisitor &v) const override {
v.pre(*this);
v.during(*this);
v.post(*this);
}
// Both return a single-element list holding `position`.
std::vector<PositionInfo> first() const override;
std::vector<PositionInfo> last() const override;
// Never empty.
bool empty() const override { return false; }
// Allocates `position` in the builder.
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &,
const std::vector<PositionInfo> &) override {
// all follow set construction is handled by firsts/lasts
return;
}
private:
// Builder position for this byte; POS_UNINITIALIZED until notePositions().
Position position;
// Private copy constructor; external callers copy through clone().
ComponentByte(const ComponentByte &other)
: Component(other), position(other.position) {}
};
} // namespace ue2
#endif

View File

@@ -0,0 +1,448 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Character classes and their mnemonics.
*/
#include "Parser.h"
#include "ComponentClass.h"
#include "AsciiComponentClass.h"
#include "ucp_table.h"
#include "Utf8ComponentClass.h"
#include "util/charreach.h"
#include "util/make_unique.h"
#include <boost/icl/interval_set.hpp>
using namespace std;
namespace ue2 {
/** \brief Project a code point set onto a CharReach.
 *
 * Each interval is clipped so its upper bound does not exceed
 * CharReach::npos - 1. Iteration stops at the first interval whose lower bound
 * is at or beyond CharReach::npos (this presumably relies on the set being
 * iterated in ascending order -- confirm against CodePointSet).
 *
 * \param cps code point intervals to convert.
 * \return the reach covering the in-range part of \p cps.
 */
static
CharReach to_cr(const CodePointSet &cps) {
    CharReach reach;

    for (const auto &interval : cps) {
        const auto lo = lower(interval);
        if (lo >= CharReach::npos) {
            break;
        }

        const auto hi = upper(interval);
        reach.setRange(lo, MIN(hi, CharReach::npos - 1));
    }

    return reach;
}
/** \brief Map a predefined class mnemonic to its CharReach.
 *
 * The POSIX-style and escape classes are spelled out as literal ranges or
 * character lists. The Unicode property (CLASS_UCP_*) and script
 * (CLASS_SCRIPT_*) classes are taken from the corresponding table getter and
 * narrowed with to_cr().
 *
 * \param c class to look up.
 * \param mode parse mode; only `dotall` (for CLASS_ANY) and `caseless` (for
 *        CLASS_LOWER / CLASS_UPPER) are consulted directly here.
 * \return the reach for \p c. A value not handled by the switch trips
 *         assert(0) and yields a default-constructed CharReach.
 */
CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
// Building blocks shared by several of the ASCII classes below.
const CharReach lower('a', 'z');
const CharReach upper('A', 'Z');
const CharReach number('0', '9');
switch (c) {
case CLASS_ALNUM:
return lower | upper | number;
case CLASS_ALPHA:
return lower | upper;
case CLASS_ANY:
// Complement of nothing when dotall is set; otherwise complement of '\n'.
if (mode.dotall) {
return ~CharReach();
} else {
return ~CharReach('\n');
}
case CLASS_ASCII:
return CharReach(0, 127);
case CLASS_BLANK:
return CharReach(" \t");
case CLASS_CNTRL:
return CharReach(0, 31) | CharReach(127 /* del */);
case CLASS_DIGIT:
return number;
case CLASS_GRAPH:
case CLASS_XGRAPH:
return CharReach(0x21, 0x7e);
case CLASS_HORZ:
return CharReach("\x09\x20\xA0");
case CLASS_LOWER:
// In caseless mode the lower class covers both cases.
if (mode.caseless) {
return lower | upper;
} else {
return lower;
}
case CLASS_PRINT:
return CharReach(0x20, 0x7e);
case CLASS_PUNCT:
// 0x21..126 with the digit and letter ranges cut out.
return CharReach(0x21, '0' - 1)
| CharReach('9' + 1, 'A' - 1)
| CharReach('Z' + 1, 'a' - 1)
| CharReach('z' + 1, 126);
case CLASS_SPACE:
return CharReach("\x09\x0a\x0c\x0b\x0d\x20");
case CLASS_UPPER:
// In caseless mode the upper class covers both cases.
if (mode.caseless) {
return lower | upper;
} else {
return upper;
}
case CLASS_VERT:
return CharReach("\x0a\x0b\x0c\x0d\x85");
case CLASS_WORD:
return lower | upper | number | CharReach('_');
case CLASS_XDIGIT:
return CharReach("0123456789abcdefABCDEF");
// Unicode property classes: table lookup narrowed by to_cr().
case CLASS_UCP_C:
return to_cr(getUcpC());
case CLASS_UCP_CC:
return to_cr(getUcpCc());
case CLASS_UCP_CF:
return to_cr(getUcpCf());
case CLASS_UCP_CN:
return to_cr(getUcpCn());
case CLASS_UCP_CO:
return to_cr(getUcpCo());
case CLASS_UCP_CS:
return to_cr(getUcpCs());
case CLASS_UCP_L:
return to_cr(getUcpL());
case CLASS_UCP_L_AND:
return to_cr(getUcpL_and());
case CLASS_UCP_LL:
return to_cr(getUcpLl());
case CLASS_UCP_LM:
return to_cr(getUcpLm());
case CLASS_UCP_LO:
return to_cr(getUcpLo());
case CLASS_UCP_LT:
return to_cr(getUcpLt());
case CLASS_UCP_LU:
return to_cr(getUcpLu());
case CLASS_UCP_M:
return to_cr(getUcpM());
case CLASS_UCP_MC:
return to_cr(getUcpMc());
case CLASS_UCP_ME:
return to_cr(getUcpMe());
case CLASS_UCP_MN:
return to_cr(getUcpMn());
case CLASS_UCP_N:
return to_cr(getUcpN());
case CLASS_UCP_ND:
return to_cr(getUcpNd());
case CLASS_UCP_NL:
return to_cr(getUcpNl());
case CLASS_UCP_NO:
return to_cr(getUcpNo());
case CLASS_UCP_P:
return to_cr(getUcpP());
case CLASS_UCP_PC:
return to_cr(getUcpPc());
case CLASS_UCP_PD:
return to_cr(getUcpPd());
case CLASS_UCP_PE:
return to_cr(getUcpPe());
case CLASS_UCP_PF:
return to_cr(getUcpPf());
case CLASS_UCP_PI:
return to_cr(getUcpPi());
case CLASS_UCP_PO:
return to_cr(getUcpPo());
case CLASS_UCP_PS:
return to_cr(getUcpPs());
case CLASS_UCP_S:
return to_cr(getUcpS());
case CLASS_UCP_SC:
return to_cr(getUcpSc());
case CLASS_UCP_SK:
return to_cr(getUcpSk());
case CLASS_UCP_SM:
return to_cr(getUcpSm());
case CLASS_UCP_SO:
return to_cr(getUcpSo());
case CLASS_UCP_XAN:
return to_cr(getUcpXan());
case CLASS_UCP_XPS:
case CLASS_UCP_XSP:
// Both are defined here as the union of the VERT and HORZ classes above.
return getPredefinedCharReach(CLASS_VERT, mode) | getPredefinedCharReach(CLASS_HORZ, mode);
case CLASS_UCP_XWD:
return to_cr(getUcpXwd());
case CLASS_UCP_Z:
return to_cr(getUcpZ());
case CLASS_UCP_ZL:
return to_cr(getUcpZl());
case CLASS_UCP_ZP:
return to_cr(getUcpZp());
case CLASS_UCP_ZS:
return to_cr(getUcpZs());
// Unicode script classes: table lookup narrowed by to_cr().
case CLASS_SCRIPT_ARABIC:
return to_cr(getUcpArabic());
case CLASS_SCRIPT_ARMENIAN:
return to_cr(getUcpArmenian());
case CLASS_SCRIPT_AVESTAN:
return to_cr(getUcpAvestan());
case CLASS_SCRIPT_BALINESE:
return to_cr(getUcpBalinese());
case CLASS_SCRIPT_BAMUM:
return to_cr(getUcpBamum());
case CLASS_SCRIPT_BATAK:
return to_cr(getUcpBatak());
case CLASS_SCRIPT_BENGALI:
return to_cr(getUcpBengali());
case CLASS_SCRIPT_BOPOMOFO:
return to_cr(getUcpBopomofo());
case CLASS_SCRIPT_BRAHMI:
return to_cr(getUcpBrahmi());
case CLASS_SCRIPT_BRAILLE:
return to_cr(getUcpBraille());
case CLASS_SCRIPT_BUGINESE:
return to_cr(getUcpBuginese());
case CLASS_SCRIPT_BUHID:
return to_cr(getUcpBuhid());
case CLASS_SCRIPT_CANADIAN_ABORIGINAL:
return to_cr(getUcpCanadian_Aboriginal());
case CLASS_SCRIPT_CARIAN:
return to_cr(getUcpCarian());
case CLASS_SCRIPT_CHAM:
return to_cr(getUcpCham());
case CLASS_SCRIPT_CHEROKEE:
return to_cr(getUcpCherokee());
case CLASS_SCRIPT_COMMON:
return to_cr(getUcpCommon());
case CLASS_SCRIPT_COPTIC:
return to_cr(getUcpCoptic());
case CLASS_SCRIPT_CUNEIFORM:
return to_cr(getUcpCuneiform());
case CLASS_SCRIPT_CYPRIOT:
return to_cr(getUcpCypriot());
case CLASS_SCRIPT_CYRILLIC:
return to_cr(getUcpCyrillic());
case CLASS_SCRIPT_DESERET:
return to_cr(getUcpDeseret());
case CLASS_SCRIPT_DEVANAGARI:
return to_cr(getUcpDevanagari());
case CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS:
return to_cr(getUcpEgyptian_Hieroglyphs());
case CLASS_SCRIPT_ETHIOPIC:
return to_cr(getUcpEthiopic());
case CLASS_SCRIPT_GEORGIAN:
return to_cr(getUcpGeorgian());
case CLASS_SCRIPT_GLAGOLITIC:
return to_cr(getUcpGlagolitic());
case CLASS_SCRIPT_GOTHIC:
return to_cr(getUcpGothic());
case CLASS_SCRIPT_GREEK:
return to_cr(getUcpGreek());
case CLASS_SCRIPT_GUJARATI:
return to_cr(getUcpGujarati());
case CLASS_SCRIPT_GURMUKHI:
return to_cr(getUcpGurmukhi());
case CLASS_SCRIPT_HAN:
return to_cr(getUcpHan());
case CLASS_SCRIPT_HANGUL:
return to_cr(getUcpHangul());
case CLASS_SCRIPT_HANUNOO:
return to_cr(getUcpHanunoo());
case CLASS_SCRIPT_HEBREW:
return to_cr(getUcpHebrew());
case CLASS_SCRIPT_HIRAGANA:
return to_cr(getUcpHiragana());
case CLASS_SCRIPT_IMPERIAL_ARAMAIC:
return to_cr(getUcpImperial_Aramaic());
case CLASS_SCRIPT_INHERITED:
return to_cr(getUcpInherited());
case CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI:
return to_cr(getUcpInscriptional_Pahlavi());
case CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN:
return to_cr(getUcpInscriptional_Parthian());
case CLASS_SCRIPT_JAVANESE:
return to_cr(getUcpJavanese());
case CLASS_SCRIPT_KAITHI:
return to_cr(getUcpKaithi());
case CLASS_SCRIPT_KANNADA:
return to_cr(getUcpKannada());
case CLASS_SCRIPT_KATAKANA:
return to_cr(getUcpKatakana());
case CLASS_SCRIPT_KAYAH_LI:
return to_cr(getUcpKayah_Li());
case CLASS_SCRIPT_KHAROSHTHI:
return to_cr(getUcpKharoshthi());
case CLASS_SCRIPT_KHMER:
return to_cr(getUcpKhmer());
case CLASS_SCRIPT_LAO:
return to_cr(getUcpLao());
case CLASS_SCRIPT_LATIN:
return to_cr(getUcpLatin());
case CLASS_SCRIPT_LEPCHA:
return to_cr(getUcpLepcha());
case CLASS_SCRIPT_LIMBU:
return to_cr(getUcpLimbu());
case CLASS_SCRIPT_LINEAR_B:
return to_cr(getUcpLinear_B());
case CLASS_SCRIPT_LISU:
return to_cr(getUcpLisu());
case CLASS_SCRIPT_LYCIAN:
return to_cr(getUcpLycian());
case CLASS_SCRIPT_LYDIAN:
return to_cr(getUcpLydian());
case CLASS_SCRIPT_MALAYALAM:
return to_cr(getUcpMalayalam());
case CLASS_SCRIPT_MANDAIC:
return to_cr(getUcpMandaic());
case CLASS_SCRIPT_MEETEI_MAYEK:
return to_cr(getUcpMeetei_Mayek());
case CLASS_SCRIPT_MONGOLIAN:
return to_cr(getUcpMongolian());
case CLASS_SCRIPT_MYANMAR:
return to_cr(getUcpMyanmar());
case CLASS_SCRIPT_NEW_TAI_LUE:
return to_cr(getUcpNew_Tai_Lue());
case CLASS_SCRIPT_NKO:
return to_cr(getUcpNko());
case CLASS_SCRIPT_OGHAM:
return to_cr(getUcpOgham());
case CLASS_SCRIPT_OL_CHIKI:
return to_cr(getUcpOl_Chiki());
case CLASS_SCRIPT_OLD_ITALIC:
return to_cr(getUcpOld_Italic());
case CLASS_SCRIPT_OLD_PERSIAN:
return to_cr(getUcpOld_Persian());
case CLASS_SCRIPT_OLD_SOUTH_ARABIAN:
return to_cr(getUcpOld_South_Arabian());
case CLASS_SCRIPT_OLD_TURKIC:
return to_cr(getUcpOld_Turkic());
case CLASS_SCRIPT_ORIYA:
return to_cr(getUcpOriya());
case CLASS_SCRIPT_OSMANYA:
return to_cr(getUcpOsmanya());
case CLASS_SCRIPT_PHAGS_PA:
return to_cr(getUcpPhags_Pa());
case CLASS_SCRIPT_PHOENICIAN:
return to_cr(getUcpPhoenician());
case CLASS_SCRIPT_REJANG:
return to_cr(getUcpRejang());
case CLASS_SCRIPT_RUNIC:
return to_cr(getUcpRunic());
case CLASS_SCRIPT_SAMARITAN:
return to_cr(getUcpSamaritan());
case CLASS_SCRIPT_SAURASHTRA:
return to_cr(getUcpSaurashtra());
case CLASS_SCRIPT_SHAVIAN:
return to_cr(getUcpShavian());
case CLASS_SCRIPT_SINHALA:
return to_cr(getUcpSinhala());
case CLASS_SCRIPT_SUNDANESE:
return to_cr(getUcpSundanese());
case CLASS_SCRIPT_SYLOTI_NAGRI:
return to_cr(getUcpSyloti_Nagri());
case CLASS_SCRIPT_SYRIAC:
return to_cr(getUcpSyriac());
case CLASS_SCRIPT_TAGALOG:
return to_cr(getUcpTagalog());
case CLASS_SCRIPT_TAGBANWA:
return to_cr(getUcpTagbanwa());
case CLASS_SCRIPT_TAI_LE:
return to_cr(getUcpTai_Le());
case CLASS_SCRIPT_TAI_THAM:
return to_cr(getUcpTai_Tham());
case CLASS_SCRIPT_TAI_VIET:
return to_cr(getUcpTai_Viet());
case CLASS_SCRIPT_TAMIL:
return to_cr(getUcpTamil());
case CLASS_SCRIPT_TELUGU:
return to_cr(getUcpTelugu());
case CLASS_SCRIPT_THAANA:
return to_cr(getUcpThaana());
case CLASS_SCRIPT_THAI:
return to_cr(getUcpThai());
case CLASS_SCRIPT_TIBETAN:
return to_cr(getUcpTibetan());
case CLASS_SCRIPT_TIFINAGH:
return to_cr(getUcpTifinagh());
case CLASS_SCRIPT_UGARITIC:
return to_cr(getUcpUgaritic());
case CLASS_SCRIPT_VAI:
return to_cr(getUcpVai());
case CLASS_SCRIPT_YI:
return to_cr(getUcpYi());
case CLASS_UCP_ANY: /* always include newline */
return ~CharReach();
}
// Reached only when c matches none of the cases above.
assert(0);
return CharReach();
}
/** \brief Create the character class implementation that matches the mode.
 *
 * \param mode parse mode handed to the new object's constructor.
 * \return a UTF8ComponentClass when mode.utf8 is set, otherwise an
 *         AsciiComponentClass.
 */
unique_ptr<ComponentClass> getComponentClass(const ParseMode &mode) {
    if (!mode.utf8) {
        return ue2::make_unique<AsciiComponentClass>(mode);
    }
    return ue2::make_unique<UTF8ComponentClass>(mode);
}
/** \brief Build a finalized character class holding one predefined class.
 *
 * \param c predefined class to add.
 * \param negate forwarded to ComponentClass::add() together with \p c.
 * \param mode parse mode used to pick and construct the implementation.
 * \return the finalized class object.
 */
unique_ptr<ComponentClass> generateComponent(PredefinedClass c, bool negate,
                                             const ParseMode &mode) {
    unique_ptr<ComponentClass> built = getComponentClass(mode);
    built->add(c, negate);
    built->finalize();
    return built;
}
/** \brief Build a finalized character class holding the single literal \a c.
 *
 * A default-constructed ParseMode is used, with only its \c caseless flag
 * set from \a nocase. */
unique_ptr<ComponentClass> getLiteralComponentClass(unsigned char c,
                                                    bool nocase) {
    ParseMode literal_mode;
    literal_mode.caseless = nocase;

    unique_ptr<ComponentClass> result = getComponentClass(literal_mode);
    result->add(c);
    result->finalize();
    return result;
}
/** \brief Start an empty, non-negated, non-finalized class with no pending
 * range and no recorded first character. */
ComponentClass::ComponentClass(const ParseMode &mode_in)
    : m_negate(false),
      mode(mode_in),
      in_cand_range(false),
      range_start(INVALID_UNICODE),
      finalized(false),
      firstChar('\0') {
}
// Nothing to release explicitly; members clean up after themselves.
ComponentClass::~ComponentClass() = default;
/** \brief Handle a '-' seen inside a character class.
 *
 * Depending on state, the dash either opens a candidate range, closes one
 * (as the literal end point '-'), or is added as a plain literal. */
void ComponentClass::addDash(void) {
    if (in_cand_range) {
        // A range is already pending, so this dash is its literal end point.
        in_cand_range = false;
        createRange('-');
        return;
    }

    if (range_start == INVALID_UNICODE) {
        /* nothing before the dash can start a range: it is just a literal */
        add('-');
        return;
    }

    // The preceding character may turn out to be the start of a range.
    in_cand_range = true;
}
void ComponentClass::negate() {
assert(class_empty());
m_negate = true;
}
} // namespace ue2

283
src/parser/ComponentClass.h Normal file
View File

@@ -0,0 +1,283 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Character classes and their mnemonics.
*/
#ifndef COMPONENTCLASS_H
#define COMPONENTCLASS_H
#include <string>
#include <vector>
#include <utility>
#include "Component.h"
#include "Parser.h"
#include "util/charreach.h"
#include "util/unicode_def.h"
#include "ue2common.h"
namespace ue2 {
/** \brief Identifiers for the predefined character classes understood by the
 * parser.
 *
 * getPredefinedCharReach() maps a value of this enum (together with the parse
 * mode) to a CharReach. Judging by the names, the enumerators fall into four
 * groups: plain classes (CLASS_ALNUM .. CLASS_XGRAPH), Unicode property
 * classes (CLASS_UCP_*), Unicode scripts (CLASS_SCRIPT_*) and CLASS_UCP_ANY. */
enum PredefinedClass {
CLASS_ALNUM,
CLASS_ALPHA,
CLASS_ANY, /* dot, not quite any when not in dotall mode */
CLASS_ASCII,
CLASS_BLANK,
CLASS_CNTRL,
CLASS_DIGIT,
CLASS_GRAPH,
CLASS_HORZ,
CLASS_LOWER,
CLASS_PRINT,
CLASS_PUNCT,
CLASS_SPACE, /* has vertical tab */
CLASS_UPPER,
CLASS_VERT,
CLASS_WORD,
CLASS_XDIGIT,
CLASS_XGRAPH,
CLASS_UCP_C,
CLASS_UCP_CC,
CLASS_UCP_CF,
CLASS_UCP_CN, /* unallocated code points */
CLASS_UCP_CO,
CLASS_UCP_CS, /* does not contain valid unicode codepoints */
CLASS_UCP_L,
CLASS_UCP_LL,
CLASS_UCP_LM,
CLASS_UCP_LO,
CLASS_UCP_LT,
CLASS_UCP_LU,
CLASS_UCP_L_AND, /* L& = LL+LU+LT */
CLASS_UCP_M,
CLASS_UCP_MC,
CLASS_UCP_ME,
CLASS_UCP_MN,
CLASS_UCP_N,
CLASS_UCP_ND,
CLASS_UCP_NL,
CLASS_UCP_NO,
CLASS_UCP_P,
CLASS_UCP_PC,
CLASS_UCP_PD,
CLASS_UCP_PE,
CLASS_UCP_PF,
CLASS_UCP_PI,
CLASS_UCP_PO,
CLASS_UCP_PS,
CLASS_UCP_S,
CLASS_UCP_SC,
CLASS_UCP_SK,
CLASS_UCP_SM,
CLASS_UCP_SO,
CLASS_UCP_Z,
CLASS_UCP_ZL,
CLASS_UCP_ZP,
CLASS_UCP_ZS,
CLASS_UCP_XAN,
CLASS_UCP_XPS, /* CLASS_SPACE */
CLASS_UCP_XSP,
CLASS_UCP_XWD,
CLASS_SCRIPT_ARABIC,
CLASS_SCRIPT_ARMENIAN,
CLASS_SCRIPT_AVESTAN,
CLASS_SCRIPT_BALINESE,
CLASS_SCRIPT_BAMUM,
CLASS_SCRIPT_BATAK,
CLASS_SCRIPT_BENGALI,
CLASS_SCRIPT_BOPOMOFO,
CLASS_SCRIPT_BRAHMI,
CLASS_SCRIPT_BRAILLE,
CLASS_SCRIPT_BUGINESE,
CLASS_SCRIPT_BUHID,
CLASS_SCRIPT_CANADIAN_ABORIGINAL,
CLASS_SCRIPT_CARIAN,
CLASS_SCRIPT_CHAM,
CLASS_SCRIPT_CHEROKEE,
CLASS_SCRIPT_COMMON,
CLASS_SCRIPT_COPTIC,
CLASS_SCRIPT_CUNEIFORM,
CLASS_SCRIPT_CYPRIOT,
CLASS_SCRIPT_CYRILLIC,
CLASS_SCRIPT_DESERET,
CLASS_SCRIPT_DEVANAGARI,
CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS,
CLASS_SCRIPT_ETHIOPIC,
CLASS_SCRIPT_GEORGIAN,
CLASS_SCRIPT_GLAGOLITIC,
CLASS_SCRIPT_GOTHIC,
CLASS_SCRIPT_GREEK,
CLASS_SCRIPT_GUJARATI,
CLASS_SCRIPT_GURMUKHI,
CLASS_SCRIPT_HAN,
CLASS_SCRIPT_HANGUL,
CLASS_SCRIPT_HANUNOO,
CLASS_SCRIPT_HEBREW,
CLASS_SCRIPT_HIRAGANA,
CLASS_SCRIPT_IMPERIAL_ARAMAIC,
CLASS_SCRIPT_INHERITED,
CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI,
CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN,
CLASS_SCRIPT_JAVANESE,
CLASS_SCRIPT_KAITHI,
CLASS_SCRIPT_KANNADA,
CLASS_SCRIPT_KATAKANA,
CLASS_SCRIPT_KAYAH_LI,
CLASS_SCRIPT_KHAROSHTHI,
CLASS_SCRIPT_KHMER,
CLASS_SCRIPT_LAO,
CLASS_SCRIPT_LATIN,
CLASS_SCRIPT_LEPCHA,
CLASS_SCRIPT_LIMBU,
CLASS_SCRIPT_LINEAR_B,
CLASS_SCRIPT_LISU,
CLASS_SCRIPT_LYCIAN,
CLASS_SCRIPT_LYDIAN,
CLASS_SCRIPT_MALAYALAM,
CLASS_SCRIPT_MANDAIC,
CLASS_SCRIPT_MEETEI_MAYEK,
CLASS_SCRIPT_MONGOLIAN,
CLASS_SCRIPT_MYANMAR,
CLASS_SCRIPT_NEW_TAI_LUE,
CLASS_SCRIPT_NKO,
CLASS_SCRIPT_OGHAM,
CLASS_SCRIPT_OL_CHIKI,
CLASS_SCRIPT_OLD_ITALIC,
CLASS_SCRIPT_OLD_PERSIAN,
CLASS_SCRIPT_OLD_SOUTH_ARABIAN,
CLASS_SCRIPT_OLD_TURKIC,
CLASS_SCRIPT_ORIYA,
CLASS_SCRIPT_OSMANYA,
CLASS_SCRIPT_PHAGS_PA,
CLASS_SCRIPT_PHOENICIAN,
CLASS_SCRIPT_REJANG,
CLASS_SCRIPT_RUNIC,
CLASS_SCRIPT_SAMARITAN,
CLASS_SCRIPT_SAURASHTRA,
CLASS_SCRIPT_SHAVIAN,
CLASS_SCRIPT_SINHALA,
CLASS_SCRIPT_SUNDANESE,
CLASS_SCRIPT_SYLOTI_NAGRI,
CLASS_SCRIPT_SYRIAC,
CLASS_SCRIPT_TAGALOG,
CLASS_SCRIPT_TAGBANWA,
CLASS_SCRIPT_TAI_LE,
CLASS_SCRIPT_TAI_THAM,
CLASS_SCRIPT_TAI_VIET,
CLASS_SCRIPT_TAMIL,
CLASS_SCRIPT_TELUGU,
CLASS_SCRIPT_THAANA,
CLASS_SCRIPT_THAI,
CLASS_SCRIPT_TIBETAN,
CLASS_SCRIPT_TIFINAGH,
CLASS_SCRIPT_UGARITIC,
CLASS_SCRIPT_VAI,
CLASS_SCRIPT_YI,
CLASS_UCP_ANY
};
CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode);
class ComponentClass;
class NFABuilder;
/* Caller is responsible for lifecycle management, class finalized */
std::unique_ptr<ComponentClass>
generateComponent(PredefinedClass c, bool negated, const ParseMode &mode);
/* Caller is responsible for lifecycle management, class open */
std::unique_ptr<ComponentClass> getComponentClass(const ParseMode &mode);
/** Common case: generate a component for a single literal character, possibly
* in caseless mode. Caller is responsible for lifecycle management. */
std::unique_ptr<ComponentClass> getLiteralComponentClass(unsigned char c,
bool nocase);
/** \brief Abstract base for character-class components.
 *
 * Holds the state shared by concrete class implementations: the negation
 * flag, the parse mode, and the bookkeeping used while a range such as
 * "a-z" is being assembled. Constructors are protected; instances are
 * created through getComponentClass() and the related factory functions. */
class ComponentClass : public Component {
friend class DumpVisitor;
protected:
// Only subclasses may construct; see the factory functions above.
explicit ComponentClass(const ParseMode &mode_in);
public:
~ComponentClass() override;
ComponentClass *clone() const override = 0;
Component *accept(ComponentVisitor &v) override = 0;
void accept(ConstComponentVisitor &v) const override = 0;
/** True iff we have already started adding members to the class. This is
 * a different concept to Component::empty
 *
 * NOTE(review): the name, and the assert(class_empty()) in negate(),
 * suggest this is true while NO members have been added yet -- confirm
 * against the implementations and correct the sentence above if so. */
virtual bool class_empty(void) const = 0;
/** Add a predefined class, optionally negated. */
virtual void add(PredefinedClass c, bool negated) = 0;
/** Add a single character. */
virtual void add(unichar c) = 0; /* may throw LocatedParseError */
/** Handle a '-' inside the class: may open a range, close one, or be added
 * as a literal, depending on in_cand_range and range_start. */
void addDash(void);
/** Mark the class as negated; asserts class_empty(). */
void negate(void);
virtual void finalize(void) = 0;
bool isNegated() const { return m_negate; }
void setFirstChar(char c) { firstChar = c; }
char getFirstChar() const { return firstChar; }
std::vector<PositionInfo> first() const override = 0;
std::vector<PositionInfo> last() const override = 0;
bool empty() const override { return false; } /* always 1 codepoint wide */
void notePositions(GlushkovBuildState &bs) override = 0;
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &) override = 0;
protected:
/** Set by negate(). */
bool m_negate;
/** Parse mode this class was created with. */
const ParseMode mode;
/** True after addDash() has seen a dash that may form a range. */
bool in_cand_range;
/** Candidate start of a range; INVALID_UNICODE when there is none. */
unichar range_start;
/** Initialised to false by the constructor; presumably set by finalize()
 * implementations -- TODO confirm. */
bool finalized;
/** Literal character at the start of this character class, e.g. '.' for
 * the class [.abc]. Used to identify (unsupported) POSIX collating
 * elements. */
char firstChar;
/** Called by addDash() with '-' when a dash closes a pending range. */
virtual void createRange(unichar) = 0;
// Protected copy ctor. Use clone instead.
ComponentClass(const ComponentClass &other)
: Component(other), m_negate(other.m_negate), mode(other.mode),
in_cand_range(other.in_cand_range), range_start(other.range_start),
finalized(other.finalized),
firstChar(other.firstChar) {}
};
} // namespace ue2
#endif // COMPONENTCLASS_H

View File

@@ -0,0 +1,166 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Conditional reference.
*/
#include "ComponentCondReference.h"
#include "ComponentAlternation.h"
#include "ComponentAssertion.h"
#include "parse_error.h"
#include "position_info.h"
#include <algorithm>
#include <cassert>
#include <memory>
using namespace std;
namespace ue2 {
/** \brief Condition that refers to a group by number. */
ComponentCondReference::ComponentCondReference(unsigned ref)
    : kind(CONDITION_NUMBER),
      ref_id(ref),
      hasBothBranches(false) {
}
/** \brief Condition that refers to a group by name; \c ref_id is left at 0. */
ComponentCondReference::ComponentCondReference(const string &name)
    : kind(CONDITION_NAME),
      ref_id(0),
      ref_name(name),
      hasBothBranches(false) {
}
/** \brief Condition expressed as an assertion component; takes ownership of
 * \a c. */
ComponentCondReference::ComponentCondReference(unique_ptr<Component> c)
    : kind(CONDITION_ASSERTION),
      ref_id(0),
      assertion(move(c)),
      hasBothBranches(false) {
}
// Owned members (including the assertion) are released by their destructors.
ComponentCondReference::~ComponentCondReference() = default;
/** \brief Copy constructor used by clone(): copies the condition metadata
 * and deep-copies the assertion component when there is one. */
ComponentCondReference::ComponentCondReference(
    const ComponentCondReference &other)
    : ComponentSequence(other),
      kind(other.kind),
      ref_id(other.ref_id),
      ref_name(other.ref_name),
      hasBothBranches(other.hasBothBranches) {
    if (kind != CONDITION_ASSERTION) {
        // Only assertion conditions carry an assertion component.
        assert(!other.assertion);
        return;
    }

    assert(other.assertion);
    assertion.reset(other.assertion->clone());
}
/** \brief Return a heap-allocated copy of this component; the caller owns
 * the returned pointer. */
ComponentCondReference *ComponentCondReference::clone() const {
    ComponentCondReference *copy = new ComponentCondReference(*this);
    return copy;
}
/** \brief Mutating visitor entry point.
 *
 * Visits this node first; if the visitor returns a different component, that
 * replacement is returned immediately (after post()). Otherwise the assertion
 * (if any) and then each child are visited, any replaced pointers are
 * swapped in, children replaced with null are dropped, and \c this is
 * returned. */
Component *ComponentCondReference::accept(ComponentVisitor &v) {
    Component *replacement = v.visit(this);
    if (replacement != this) {
        v.post(this);
        return replacement;
    }

    if (kind == CONDITION_ASSERTION) {
        Component *const before = assertion.get();
        Component *const after = assertion->accept(v);
        if (after != before) {
            assertion.reset(after);
        }
    }

    for (auto &child : children) {
        Component *const before = child.get();
        Component *const after = child->accept(v);
        if (after != before) {
            // Either a replacement component, or null meaning "delete me".
            child.reset(after);
        }
    }

    // Compact away the children that were deleted above.
    auto new_end = remove(children.begin(), children.end(), nullptr);
    children.erase(new_end, children.end());

    v.post(this);
    return this;
}
/** \brief Read-only visitor entry point.
 *
 * Calls pre(), then visits the assertion (followed by during()) if there is
 * one, then each child with during() between consecutive children, and
 * finally post(). */
void ComponentCondReference::accept(ConstComponentVisitor &v) const {
    v.pre(*this);

    if (kind == CONDITION_ASSERTION) {
        assertion->accept(v);
        v.during(*this); // FIXME: a good idea?
    }

    bool is_first_child = true;
    for (const auto &child : children) {
        if (!is_first_child) {
            // Separator callback between two consecutive children.
            v.during(*this);
        }
        is_first_child = false;
        child->accept(v);
    }

    v.post(*this);
}
/** \brief Start the next branch of the conditional.
 *
 * When an alternation already exists, throws LocatedParseError for a DEFINE
 * group or if there are already two branches. Otherwise records that both
 * branches exist and defers to ComponentSequence::addAlternation(). */
void ComponentCondReference::addAlternation() {
    if (alternation && ref_name == "DEFINE") {
        throw LocatedParseError("DEFINE conditional group with more than "
                                "one branch");
    }
    if (alternation && alternation->numBranches() >= 2) {
        throw LocatedParseError("Conditional with more than two branches");
    }

    hasBothBranches = true;
    ComponentSequence::addAlternation();
}
/** \brief Not expected to be called: asserts, and returns an empty set when
 * assertions are compiled out. */
vector<PositionInfo> ComponentCondReference::first() const {
    assert(0);
    vector<PositionInfo> none;
    return none;
}
/** \brief Not expected to be called: asserts, and returns an empty set when
 * assertions are compiled out. */
vector<PositionInfo> ComponentCondReference::last() const {
    assert(0);
    vector<PositionInfo> none;
    return none;
}
/** \brief A conditional reference always reports itself as empty. */
bool ComponentCondReference::empty() const {
    return true;
}
/** \brief Not expected to be called: asserts unconditionally. */
void ComponentCondReference::notePositions(GlushkovBuildState &) {
    assert(0);
}
/** \brief Not expected to be called: asserts unconditionally. */
void ComponentCondReference::buildFollowSet(
    GlushkovBuildState &, const vector<PositionInfo> &) {
    assert(0);
}
bool ComponentCondReference::repeatable() const {
// If this assertion has no children (it's an empty sequence, like that
// produced by '(?!)') then PCRE would throw a "nothing to repeat" error.
// So we do as well.
return !children.empty();
}
} // namespace ue2

View File

@@ -0,0 +1,91 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Conditional reference.
*/
#ifndef PARSER_COMPONENTCONDREFERENCE_H_
#define PARSER_COMPONENTCONDREFERENCE_H_
#include "ComponentSequence.h"
#include <memory>
#include <string>
namespace ue2 {
/** \brief Conditional reference component.
 *
 * A ComponentSequence whose condition is a group number, a group name, or an
 * assertion component; see the three constructors. */
class ComponentCondReference : public ComponentSequence {
friend class DumpVisitor;
friend class PrefilterVisitor;
friend class ReferenceVisitor;
friend class PrintVisitor;
public:
// NOTE(review): these single-argument constructors are implicit; consider
// marking them explicit if no caller relies on the conversion.
/** Condition on a numbered group. */
ComponentCondReference(unsigned ref);
/** Condition on a named group. */
ComponentCondReference(const std::string &name);
/** Condition on an assertion; takes ownership of \a c. */
ComponentCondReference(std::unique_ptr<Component> c);
~ComponentCondReference() override;
ComponentCondReference *clone() const override;
Component *accept(ComponentVisitor &v) override;
void accept(ConstComponentVisitor &v) const override;
/** Throws LocatedParseError for a second branch of a DEFINE group or a
 * third branch of any conditional. */
void addAlternation() override;
// The Glushkov construction hooks below assert(0) in the implementation.
std::vector<PositionInfo> first() const override;
std::vector<PositionInfo> last() const override;
bool empty() const override;
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos) override;
bool repeatable() const override;
private:
// Copy ctor is private; use clone().
ComponentCondReference(const ComponentCondReference &other);
/** Which constructor created this object, i.e. what the condition is. */
enum Condition {
CONDITION_NUMBER,
CONDITION_NAME,
CONDITION_ASSERTION
};
enum Condition kind;
/** Group number for CONDITION_NUMBER; 0 otherwise. */
unsigned ref_id;
/** Group name for CONDITION_NAME; left default-constructed otherwise. */
std::string ref_name;
/** Assertion component for CONDITION_ASSERTION; null otherwise. */
std::unique_ptr<Component> assertion;
/** True if an alternation has been added, which means we have both a YES
 * and a NO branch. */
bool hasBothBranches;
};
} // namespace ue2
#endif // PARSER_COMPONENTCONDREFERENCE_H_

View File

@@ -0,0 +1,75 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Extended Unicode sequences (\\X)
*/
#include "ComponentEUS.h"
#include "buildstate.h"
#include "position.h"
#include "position_info.h"
#include "Parser.h"
#include "nfagraph/ng_builder.h"
#include "util/charreach.h"
using namespace std;
namespace ue2 {
/** \brief Record the pattern location and UTF-8 flag; the NFA position is
 * left uninitialised until notePositions() runs. */
ComponentEUS::ComponentEUS(u32 loc_in, const ParseMode &mode)
    : loc(loc_in),
      utf8(mode.utf8),
      position(GlushkovBuildState::POS_UNINITIALIZED) {
}
// No owned resources.
ComponentEUS::~ComponentEUS() = default;
/** \brief Return a heap-allocated copy; the caller owns the pointer. */
ComponentEUS *ComponentEUS::clone() const {
    ComponentEUS *copy = new ComponentEUS(*this);
    return copy;
}
/** \brief The first-set is the single position owned by this component. */
vector<PositionInfo> ComponentEUS::first() const {
    vector<PositionInfo> only_position(1, PositionInfo(position));
    return only_position;
}
/** \brief The last-set is the single position owned by this component. */
vector<PositionInfo> ComponentEUS::last() const {
    vector<PositionInfo> only_position(1, PositionInfo(position));
    return only_position;
}
/** \brief Allocate one position with "dot" reach for this component.
 *
 * In UTF-8 mode a self-loop is added as well, so the position behaves like
 * ".+". */
void ComponentEUS::notePositions(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();

    position = builder.makePositions(1);
    builder.addCharReach(position, CharReach::dot());
    builder.setNodeReportID(position, 0 /* offset adj */);

    if (!utf8) {
        return;
    }

    /* we are prefiltering, turn into .+ */
    builder.addEdge(position, position);
}
} // namespace ue2

86
src/parser/ComponentEUS.h Normal file
View File

@@ -0,0 +1,86 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Extended Unicode sequences (\\X)
*/
#ifndef _RE_COMPONENTEXTENDEDUNICODESEQUENCE_H_
#define _RE_COMPONENTEXTENDEDUNICODESEQUENCE_H_
#include "Component.h"
namespace ue2 {
struct ParseMode;
/** \brief Component for an extended Unicode sequence (\\X).
 *
 * Occupies a single NFA position, allocated in notePositions(). */
class ComponentEUS : public Component {
friend class DumpVisitor;
friend class UnsupportedVisitor;
public:
/** \a loc is stored as-is (presumably the offset of the construct in the
 * pattern -- TODO confirm); only the utf8 flag of \a mode is kept. */
ComponentEUS(u32 loc, const ParseMode &mode);
~ComponentEUS() override;
ComponentEUS *clone() const override;
/** Leaf node: visit and post are called back to back, and whatever the
 * visitor returned from visit() is passed on to the caller. */
Component *accept(ComponentVisitor &v) override {
Component *c = v.visit(this);
v.post(this);
return c;
}
/** Leaf node: pre, during and post are called in sequence. */
void accept(ConstComponentVisitor &v) const override {
v.pre(*this);
v.during(*this);
v.post(*this);
}
std::vector<PositionInfo> first() const override;
std::vector<PositionInfo> last() const override;
/** Never empty. */
bool empty() const override { return false; }
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &,
const std::vector<PositionInfo> &) override {
// all follow set construction is handled by firsts/lasts
return;
}
private:
/** Location value passed to the constructor. */
u32 loc;
/** Copy of ParseMode::utf8 taken at construction. */
bool utf8;
/** NFA position; POS_UNINITIALIZED until notePositions() is called. */
Position position;
// Private copy ctor. Use clone instead.
ComponentEUS(const ComponentEUS &other)
: Component(other), loc(other.loc), utf8(other.utf8),
position(other.position) {}
};
} // namespace ue2
#endif

View File

@@ -0,0 +1,93 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Represents an empty regex element, like (?m)
*/
#include <cassert>
#include "ComponentEmpty.h"
#include "position.h"
#include "position_info.h"
#include "buildstate.h"
#include "ue2common.h"
using namespace std;
namespace ue2 {
// An empty component carries no state, so there is nothing to initialise.
ComponentEmpty::ComponentEmpty() = default;
// ...and equally nothing to tear down.
ComponentEmpty::~ComponentEmpty() = default;
/** \brief Return a fresh heap-allocated ComponentEmpty; the caller owns it. */
ComponentEmpty *ComponentEmpty::clone() const {
    return new ComponentEmpty();
}
/** \brief Always empty. */
bool ComponentEmpty::empty() const { return true; }
/** \brief Always vacuous. */
bool ComponentEmpty::vacuous_everywhere(void) const { return true; }
/** \brief Never repeatable.
 *
 * Rejecting repeats of empty constructs such as '(?m)' is the reason this
 * class exists. */
bool ComponentEmpty::repeatable() const {
    const bool can_repeat = false;
    return can_repeat;
}
/** \brief The first-set consists of the epsilon position only. */
vector<PositionInfo> ComponentEmpty::first() const {
    vector<PositionInfo> epsilon_only(1, GlushkovBuildState::POS_EPSILON);
    return epsilon_only;
}
/** \brief The last-set is empty. */
vector<PositionInfo> ComponentEmpty::last() const {
    vector<PositionInfo> none;
    return none;
}
/** \brief No positions are allocated for an empty component. */
void ComponentEmpty::notePositions(GlushkovBuildState &) {}
/** \brief An empty component contributes nothing to the follow set. */
void ComponentEmpty::buildFollowSet(GlushkovBuildState &,
                                    const vector<PositionInfo> &) {}
/** \brief Pass \a at_start through unchanged. */
bool ComponentEmpty::checkEmbeddedStartAnchor(bool at_start) const {
    const bool still_at_start = at_start;
    return still_at_start;
}
/** \brief Pass \a at_end through unchanged. */
bool ComponentEmpty::checkEmbeddedEndAnchor(bool at_end) const {
    const bool still_at_end = at_end;
    return still_at_end;
}
} // namespace ue2

View File

@@ -0,0 +1,75 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Represents an empty regex element, like (?m)
*/
#ifndef PARSER_COMPONENT_EMPTY_H_
#define PARSER_COMPONENT_EMPTY_H_
#include "Component.h"
namespace ue2 {
/** \brief Component for an empty regex element, like (?m).
 *
 * Carries no state. It reports itself as empty and vacuous, and refuses to
 * be repeated. */
class ComponentEmpty : public Component {
friend class DumpVisitor;
public:
ComponentEmpty();
~ComponentEmpty() override;
ComponentEmpty *clone() const override;
/** Leaf node: visit and post are called back to back, and whatever the
 * visitor returned from visit() is passed on to the caller. */
Component *accept(ComponentVisitor &v) override {
Component *c = v.visit(this);
v.post(this);
return c;
}
/** Leaf node: pre, during and post are called in sequence. */
void accept(ConstComponentVisitor &v) const override {
v.pre(*this);
v.during(*this);
v.post(*this);
}
/** Returns the epsilon position only. */
std::vector<PositionInfo> first() const override;
/** Returns an empty set. */
std::vector<PositionInfo> last() const override;
/** Always true. */
bool empty() const override;
/** Always true. */
bool vacuous_everywhere() const override;
/** Always false. */
bool repeatable() const override;
/** No-op. */
void notePositions(GlushkovBuildState &bs) override;
/** No-op. */
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos) override;
/** Returns \a at_start unchanged. */
bool checkEmbeddedStartAnchor(bool at_start) const override;
/** Returns \a at_end unchanged. */
bool checkEmbeddedEndAnchor(bool at_end) const override;
};
} // namespace ue2
#endif // PARSER_COMPONENT_EMPTY_H_

View File

@@ -0,0 +1,393 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Repeats ('*', '+', '?', '{M,N}', etc)
*/
#include "ComponentRepeat.h"
#include "buildstate.h"
#include "nfagraph/ng_builder.h"
#include "parse_error.h"
#include "Parser.h"
#include "position.h"
#include "position_dump.h"
#include "position_info.h"
#include "ue2common.h"
#include "util/make_unique.h"
#include <algorithm>
#include <cassert>
using namespace std;
namespace ue2 {
/** \brief Hard limit on the maximum repeat for bounded repeats.
 *
 * The ComponentRepeat constructor throws ParseError for a finite upper bound
 * greater than this value. */
static const u32 MAX_MAX_BOUND = 32767;
/** \brief If expanding a repeat would lead to more than this many positions
 * being generated, we fail the pattern. */
static const u32 MAX_POSITIONS_EXPANDED = 500000; // arbitrarily huge
/* no edge priorities means that if our subcomponent can be empty, our min
 * extent is effectively zero. */

/** \brief Wrap \a sub_comp_in in a repeat with bounds [\a min, \a max].
 *
 * Takes ownership of the sub-component. Throws ParseError if \a max is a
 * finite bound (below NoLimit) that exceeds MAX_MAX_BOUND. */
ComponentRepeat::ComponentRepeat(unique_ptr<Component> sub_comp_in, u32 min,
                                 u32 max, enum RepeatType t)
    : type(t),
      sub_comp(move(sub_comp_in)),
      m_min(min),
      m_max(max),
      posFirst(GlushkovBuildState::POS_UNINITIALIZED),
      posLast(GlushkovBuildState::POS_UNINITIALIZED) {
    assert(sub_comp);
    assert(max > 0);
    assert(m_min <= m_max);

    const bool is_bounded = m_max < NoLimit;
    if (is_bounded && m_max > MAX_MAX_BOUND) {
        throw ParseError("Bounded repeat is too large.");
    }
}
// The owned sub-component is released by its unique_ptr.
ComponentRepeat::~ComponentRepeat() = default;
/** \brief Return a heap-allocated deep copy; the caller owns the pointer. */
ComponentRepeat *ComponentRepeat::clone() const {
    ComponentRepeat *copy = new ComponentRepeat(*this);
    return copy;
}
/** \brief Copy constructor used by clone(): clones the sub-component and
 * copies the bounds, position bookkeeping and cached firsts. */
ComponentRepeat::ComponentRepeat(const ComponentRepeat &other)
    : Component(other),
      type(other.type),
      sub_comp(other.sub_comp->clone()),
      m_min(other.m_min),
      m_max(other.m_max),
      m_firsts(other.m_firsts),
      m_lasts(other.m_lasts),
      posFirst(other.posFirst),
      posLast(other.posLast),
      firsts_cache(other.firsts_cache) {
}
/** \brief A repeat can match nothing if zero repetitions are allowed or if
 * its sub-component can itself be empty. */
bool ComponentRepeat::empty() const {
    if (m_min == 0) {
        return true;
    }
    return sub_comp->empty();
}
/** \brief A repeat may not itself be the direct target of another repeat. */
bool ComponentRepeat::repeatable() const {
    const bool can_repeat = false;
    return can_repeat;
}
/** \brief Shift every position in \a firsts and \a lasts by \a base.
 *
 * Epsilon entries in \a firsts are left untouched; \a lasts entries are
 * shifted unconditionally. */
static
void addBase(Position base, vector<PositionInfo> &firsts,
             vector<PositionInfo> &lasts) {
    for (auto &first_pos : firsts) {
        if (first_pos.pos == GlushkovBuildState::POS_EPSILON) {
            continue;
        }
        first_pos.pos += base;
    }

    for (auto &last_pos : lasts) {
        last_pos.pos += base;
    }
}
static
void checkPositions(vector<PositionInfo> &v, const GlushkovBuildState &bs) {
const NFABuilder& builder = bs.getBuilder();
for (const auto &e : v) {
if (builder.isSpecialState(e.pos)) {
throw ParseError("Embedded anchors not supported.");
}
}
}
/** \brief Allocate NFA positions for every copy of the repeated region.
 *
 * The sub-component notes its positions once; the remaining copies are
 * plain position ranges of the same size whose firsts/lasts are the
 * originals shifted by a multiple of the region size.
 *
 * \throws ParseError if more than one copy is needed and the region's
 *         firsts/lasts refer to special states (embedded anchors), or if the
 *         expansion would exceed MAX_POSITIONS_EXPANDED positions. */
void ComponentRepeat::notePositions(GlushkovBuildState &bs) {
    assert(m_max > 0);
    // The constructor accepts finite bounds up to and including
    // MAX_MAX_BOUND, so the bound itself must be allowed here.
    assert(m_max == NoLimit || m_max <= MAX_MAX_BOUND);

    /* Note: We can construct smaller subgraphs if we're not maintaining edge
     * priorities. */

    // We create one copy only through a recursive call to notePositions(),
    // first() and last(). Then we clone its positions and store the
    // appropriate firsts and lasts values for the copies.
    posFirst = bs.getBuilder().numVertices();
    sub_comp->notePositions(bs);

    u32 copies = m_max < NoLimit ? m_max : MAX(m_min, 1);
    DEBUG_PRINTF("building %u copies of repeated region\n", copies);
    m_firsts.clear();
    m_lasts.clear();
    m_firsts.resize(copies);
    m_lasts.resize(copies);

    m_firsts[0] = sub_comp->first();
    m_lasts[0] = sub_comp->last();

    postSubNotePositionHook();

    posLast = bs.getBuilder().numVertices() - 1;
    u32 vcount = posLast + 1 - posFirst;

    // If we're making more than one copy, then our firsts and lasts must only
    // contain vertices inside [posFirst, posLast]: anything else means we have
    // an embedded anchor or otherwise weird situation.
    if (copies > 1) {
        checkPositions(m_firsts[0], bs);
        checkPositions(m_lasts[0], bs);
    }

    // Avoid enormous expansions. The product is computed in 64 bits: a
    // 32-bit multiply could wrap around and slip under the limit.
    if (static_cast<unsigned long long>(vcount) * copies >
        MAX_POSITIONS_EXPANDED) {
        throw ParseError("Bounded repeat is too large.");
    }

    // Add positions for the rest of the copies. This cannot overflow: the
    // check above bounds vcount * copies by MAX_POSITIONS_EXPANDED.
    size_t copyPositions = vcount * (copies - 1);
    bs.getBuilder().makePositions(copyPositions);

    // Calculate our firsts and lasts for the copies
    for (u32 i = 1; i < copies; ++i) {
        m_firsts[i] = m_firsts[0];
        m_lasts[i] = m_lasts[0];
        u32 base = i * vcount;
        addBase(base, m_firsts[i], m_lasts[i]);
    }

    recordPosBounds(posFirst, bs.getBuilder().numVertices());
    precalc_firsts(); /* ComponentRepeat requires firsts to be calculated ahead
                       * of time and cached due to expense */
}
/** \brief Return a copy of the firsts cached by precalc_firsts(). */
vector<PositionInfo> ComponentRepeat::first() const {
    DEBUG_PRINTF("firsts = %s\n",
                 dumpPositions(firsts_cache.begin(), firsts_cache.end())
                     .c_str());
    return firsts_cache;
}
/**
 * \brief Build follow sets for the repeat.
 *
 * The sub-component builds the follow set of the first copy; that follow set
 * is then cloned at a fixed offset for each further copy, and finally the
 * copies are connected by wireRepeats().
 *
 * \param bs      Build state receiving the follow-set edges.
 * \param lastPos Lasts of whatever precedes this component.
 */
void ComponentRepeat::buildFollowSet(GlushkovBuildState &bs,
                                     const vector<PositionInfo> &lastPos) {
    if (m_max == 0) {
        return; // nothing to build for a zero-width repeat
    }
    DEBUG_PRINTF("enter\n");

    // Wire up the first (the "real") entry
    DEBUG_PRINTF("initial repeat\n");
    sub_comp->buildFollowSet(bs, lastPos);

    // Replicate the follow set of the first copy for each later copy; copy
    // number `rep` sits `rep` region-widths after the first.
    u32 copies = m_firsts.size();
    DEBUG_PRINTF("cloning %u copies of repeat\n", copies - 1);
    const auto width = posLast + 1 - posFirst;
    for (u32 rep = 1; rep < copies; rep++) {
        u32 offset = width * rep;
        if (offset == 0) {
            continue; // empty region: nothing to clone
        }
        bs.cloneFollowSet(posFirst, posLast, offset);
    }

    wireRepeats(bs, lastPos);
    DEBUG_PRINTF("leave\n");
}
/** \brief When \a connected_to_sds is set, clamp the maximum bound down to
 * the minimum bound; otherwise leave the repeat unchanged. */
void ComponentRepeat::optimise(bool connected_to_sds) {
    DEBUG_PRINTF("opt %d\n", (int)connected_to_sds);
    if (connected_to_sds) {
        DEBUG_PRINTF("setting m_max to %u\n", m_min);
        m_max = m_min;
    }
}
/** \brief A repeat is vacuous everywhere exactly when its minimum bound is
 * zero. */
bool ComponentRepeat::vacuous_everywhere() const {
    return m_min == 0;
}
/** \brief Run the sub-component's start-anchor check, threading \a at_start
 * through it once, or twice when the repeat can match more than once. */
bool ComponentRepeat::checkEmbeddedStartAnchor(bool at_start) const {
    const int passes = (m_max > 1) ? 2 : 1;
    for (int pass = 0; pass < passes; ++pass) {
        at_start = sub_comp->checkEmbeddedStartAnchor(at_start);
    }
    return at_start;
}
/** \brief Run the sub-component's end-anchor check, threading \a at_end
 * through it once, or twice when the repeat can match more than once. */
bool ComponentRepeat::checkEmbeddedEndAnchor(bool at_end) const {
    const int passes = (m_max > 1) ? 2 : 1;
    for (int pass = 0; pass < passes; ++pass) {
        at_end = sub_comp->checkEmbeddedEndAnchor(at_end);
    }
    return at_end;
}
/**
 * \brief Mutating visitor entry point.
 *
 * If the visitor replaces this repeat, post() is called and the replacement
 * is returned without visiting the sub-component. Otherwise the sub-component
 * is visited and swapped for whatever its accept() returned.
 *
 * \return the visitor's replacement, nullptr if the sub-component was
 *         removed, or this.
 */
Component *ComponentRepeat::accept(ComponentVisitor &v) {
    Component *replacement = v.visit(this);
    if (replacement != this) {
        v.post(this);
        return replacement;
    }

    Component *visited_sub = sub_comp->accept(v);
    if (visited_sub != sub_comp.get()) {
        // Replaced (new pointer) or deleted (null): take the new value.
        sub_comp.reset(visited_sub);
    }

    v.post(this);
    // Without a sub-component this repeat is meaningless; ask to be removed.
    return sub_comp ? this : nullptr;
}
/** \brief Read-only visitor entry point: pre(), then the sub-component, then
 * post(). */
void ComponentRepeat::accept(ConstComponentVisitor &v) const {
    v.pre(*this);

    sub_comp->accept(v);

    v.post(*this);
}
/**
 * \brief Compute the lasts of the whole repeat.
 *
 * Returns an empty list when m_max is zero. Otherwise contributes the lasts
 * of the final mandatory copy (if m_min is non-zero) and, when there is no
 * minimum or the number of copies differs from m_min, the lasts of the final
 * copy.
 */
vector<PositionInfo> ComponentRepeat::last() const {
    vector<PositionInfo> result;
    if (m_max == 0) {
        return result;
    }

    assert(!m_firsts.empty()); // notePositions should already have run
    assert(!m_lasts.empty());

    // Optimisation: when we're not maintaining edge priorities, handling
    // optional repeats has been taken care of by our FIRSTS. Thus, only
    // the last mandatory repeat and (if different) the last optional
    // repeat contributes to lasts.
    if (m_min != 0) {
        const vector<PositionInfo> &mandatory = m_lasts[m_min - 1];
        result.insert(result.end(), mandatory.begin(), mandatory.end());
    }

    const bool has_trailing_copy = (m_min == 0) || (m_min != m_lasts.size());
    if (has_trailing_copy) {
        const vector<PositionInfo> &final_copy = m_lasts.back();
        result.insert(result.end(), final_copy.begin(), final_copy.end());
    }

    return result;
}
/**
 * \brief Connect the copies of the repeated region to one another.
 *
 * Mandatory copies are chained in sequence; each optional copy is fed from
 * the preceding copy and (past the first optional copy) also from the lasts
 * of the final mandatory copy. An unbounded repeat additionally gets a
 * self-loop on its final copy.
 *
 * \param bs      Build state receiving the edges.
 * \param lastPos Lasts of whatever precedes this component.
 */
void ComponentRepeat::wireRepeats(GlushkovBuildState &bs,
                                  const vector<PositionInfo> &lastPos) {
    /* note: m_lasts[0] already valid */
    const u32 copies = m_firsts.size();
    const bool isEmpty = sub_comp->empty();
    const vector<PositionInfo> &optLasts = m_min ? m_lasts[m_min - 1] : lastPos;

    if (copies) {
        DEBUG_PRINTF("wiring up %u mand repeats\n", m_min);
        for (u32 rep = 1; rep < m_min; rep++) {
            vector<PositionInfo> &prev = m_lasts[rep - 1];
            bs.connectRegions(prev, m_firsts[rep]);

            if (isEmpty) {
                // An emptiable copy can be skipped, so the previous copy's
                // lasts are lasts of this copy too.
                vector<PositionInfo> &cur = m_lasts[rep];
                cur.insert(cur.end(), prev.begin(), prev.end());
            }
        }

        DEBUG_PRINTF("wiring up %d optional repeats\n", copies - m_min);
        for (u32 rep = MAX(m_min, 1); rep < copies; rep++) {
            vector<PositionInfo> lasts = m_lasts[rep - 1];
            if (m_min && rep != m_min) {
                // Merge in the final mandatory copy's lasts and de-duplicate.
                lasts.insert(lasts.end(), optLasts.begin(), optLasts.end());
                sort(lasts.begin(), lasts.end());
                lasts.erase(unique(lasts.begin(), lasts.end()), lasts.end());
            }
            bs.connectRegions(lasts, m_firsts[rep]);
        }
    }

    // If we have no max bound, we need a self-loop as well.
    if (m_max == NoLimit) {
        DEBUG_PRINTF("final repeat self-loop\n");
        bs.connectRegions(m_lasts.back(), m_firsts.back());
    }
}
void ComponentRepeat::precalc_firsts() {
DEBUG_PRINTF("building firsts for {%u,%u} repeat with %s sub\n", m_min,
m_max, sub_comp->empty() ? "emptiable" : "non-emptiable");
/* For normal repeat, our optional repeats each have an epsilon at the end
* of their firsts lists.
*/
for (u32 i = m_min; i < m_firsts.size();i++) {
m_firsts[i].insert(m_firsts[i].end(), GlushkovBuildState::POS_EPSILON);
}
firsts_cache.clear();
if (!m_max) {
return;
}
assert(!m_firsts.empty()); // notePositions should already have run
const vector<PositionInfo> &f = m_firsts.front();
// If we're running without edge priorities, then we want to generate the
// repeat in such a way that the firsts do all the work. This will minimise
// the number of exceptional states in a LimEx NFA implementation.
if (!m_min || sub_comp->empty()) {
// Emptiable: all our repeats contribute to firsts.
// Each repeat's firsts is spliced in at the location of the epsilon
// (if any) in the previous repeat's firsts.
for (const auto &e : m_firsts) {
replaceEpsilons(firsts_cache, e);
}
} else {
// Not emptiable: firsts come from our first repeat only.
firsts_cache.insert(firsts_cache.end(), f.begin(), f.end());
}
}
static
bool hasPositionFlags(const Component &c) {
for (const auto &e : c.first()) {
if (e.flags) {
return true;
}
}
return false;
}
/** \brief Drop the minimum bound to zero when the sub-component is emptiable
 * and none of its firsts carry flags. */
void ComponentRepeat::postSubNotePositionHook() {
    // UE-444 optimization: we can REWRITE m_min under various circumstances,
    // so that we create smaller NFA graphs. Note that this is _not_ possible
    // if our subcomponent contains a flagged position, e.g. nofloat.
    if (hasPositionFlags(*sub_comp)) {
        return;
    }
    if (!sub_comp->empty()) {
        return;
    }
    m_min = 0;
}
/** \brief Factory: wrap \a sub_comp (ownership transferred) in a new
 * ComponentRepeat with bounds [\a min, \a max] and repeat type \a t. */
unique_ptr<ComponentRepeat> makeComponentRepeat(unique_ptr<Component> sub_comp,
                                                u32 min, u32 max,
                                                ComponentRepeat::RepeatType t) {
    auto repeat =
        ue2::make_unique<ComponentRepeat>(std::move(sub_comp), min, max, t);
    return repeat;
}
} // namespace ue2

View File

@@ -0,0 +1,146 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Repeats ('*', '+', '?', '{M,N}', etc)
*/
#ifndef RE_COMPONENTREPEAT_H
#define RE_COMPONENTREPEAT_H
#include "Component.h"
#include "position.h"
#include "ue2common.h"
#include <memory>
#include <utility>
namespace ue2 {
/** \brief Encapsulates a repeat of a subexpression ('*', '+', '?', '{M,N}',
* etc).
*
* Ascii Art Time:
*
* Our standard representation of standard repeats. Other constructions (fan-in
* vs fan-out) would also be possible and equivalent for our purposes.
*
* {n,m}
*
* S->M->M->M->O->O->O->T
* | ^ ^ ^
* | | | |
* \-----------/
*
* {0,m}
*
* S->O->O->O->T
* | ^ ^ ^
* | | | |
* \-----------/
*
*/
class ComponentRepeat : public Component {
friend class ConstructLiteralVisitor;
friend class DumpVisitor;
friend class PrintVisitor;
friend class SimplifyVisitor;
public:
/** \brief Value representing no maximum bound. */
static constexpr u32 NoLimit = 0xffffffff;
/** \brief Type of this repeat, characterising its
* greediness/possessiveness. */
enum RepeatType {
/** Minimising repeat, like 'a*?'. */
REPEAT_NONGREEDY,
/** Maximising repeat, like 'a*'. This is the default in PCRE. */
REPEAT_GREEDY,
/** Possessive, maximising repeat, like 'a*+'. Possessive repeats are
* only currently supported in prefiltering mode, where we treat them
* the same way we treat normal greedy repeats. */
REPEAT_POSSESSIVE,
};
/** \brief Construct a repeat of \a sub_comp (ownership is taken) with
* bounds \a min and \a max and repeat type \a t. Pass NoLimit as \a max for
* an unbounded repeat. */
ComponentRepeat(std::unique_ptr<Component> sub_comp, u32 min, u32 max,
RepeatType t);
~ComponentRepeat() override;
/** \brief Heap-allocated copy; the sub-component is cloned as well. */
ComponentRepeat *clone() const override;
Component *accept(ComponentVisitor &v) override;
void accept(ConstComponentVisitor &v) const override;
/** \brief Returns the firsts cached by precalc_firsts(). */
std::vector<PositionInfo> first() const override;
std::vector<PositionInfo> last() const override;
/** \brief True if the minimum bound is zero or the sub-component is
* empty. */
bool empty() const override;
/** \brief Always false for a repeat. */
bool repeatable() const override;
/** \brief True exactly when the minimum bound is zero. */
bool vacuous_everywhere() const override;
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos) override;
bool checkEmbeddedStartAnchor(bool at_start) const override;
bool checkEmbeddedEndAnchor(bool at_end) const override;
/** \brief When \a connected_to_sds is true, lowers the maximum bound to the
* minimum bound. */
void optimise(bool connected_to_sds) override;
/** \brief Returns the (min, max) bounds as currently stored. */
virtual std::pair<u32, u32> getBounds() const {
return std::make_pair(m_min, m_max);
}
/** \brief From declared behaviour (not taking into account the
* sub-component). */
enum RepeatType type;
protected:
/** Called at the end of \ref notePositions to build \ref firsts_cache. */
void precalc_firsts();
/** Called by \ref notePositions once the sub-component has noted its
* positions; may rewrite \ref m_min to zero. */
void postSubNotePositionHook();
/** Called by \ref buildFollowSet to connect up the various repeats. */
void wireRepeats(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos);
/** The repeated sub-component, owned by this repeat. */
std::unique_ptr<Component> sub_comp;
/** Minimum bound. */
u32 m_min;
/** Maximum bound, or \ref NoLimit. */
u32 m_max;
/** Firsts of each copy of the repeated region; filled in by
* \ref notePositions. */
std::vector<std::vector<PositionInfo> > m_firsts;
/** Lasts of each copy of the repeated region; filled in by
* \ref notePositions. */
std::vector<std::vector<PositionInfo> > m_lasts;
/** First position belonging to the first copy. */
Position posFirst;
/** Last position belonging to the first copy. */
Position posLast;
/** Value returned by \ref first. */
std::vector<PositionInfo> firsts_cache;
/** Copy constructor, used by \ref clone. */
ComponentRepeat(const ComponentRepeat &other);
};
/** \brief Factory for ComponentRepeat: takes ownership of \a sub_comp and
* returns a new repeat with bounds \a min and \a max and repeat type \a t. */
std::unique_ptr<ComponentRepeat>
makeComponentRepeat(std::unique_ptr<Component> sub_comp, u32 min, u32 max,
ComponentRepeat::RepeatType t);
} // namespace ue2
#endif // RE_COMPONENTREPEAT_H

View File

@@ -0,0 +1,376 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Sequence of Component objects.
*/
#include "ComponentSequence.h"
#include "buildstate.h"
#include "ComponentAlternation.h"
#include "ComponentRepeat.h"
#include "Parser.h"
#include "ue2common.h"
#include "parse_error.h"
#include "position_dump.h"
#include "position_info.h"
#include "nfagraph/ng_builder.h"
#include "util/container.h"
#include "util/make_unique.h"
#include <algorithm>
#include <cassert>
using namespace std;
namespace ue2 {
/** \brief Construct an empty, non-capturing sequence. */
ComponentSequence::ComponentSequence()
    : capture_index(NOT_CAPTURED) {
}
// Children and the pending alternation are held by unique_ptr, so there is
// nothing to release by hand.
ComponentSequence::~ComponentSequence() = default;
/**
 * \brief Copy constructor: deep-copies the children and any pending
 * alternation, and carries over both the capture index and the capture name.
 *
 * The capture name is copied alongside the index so that a clone of a named
 * group does not silently lose its name.
 */
ComponentSequence::ComponentSequence(const ComponentSequence &other)
    : Component(other), capture_index(other.capture_index),
      capture_name(other.capture_name) {
    // Deep copy children.
    children.reserve(other.children.size());
    for (const auto &c : other.children) {
        assert(c);
        children.push_back(unique_ptr<Component>(c->clone()));
    }
    if (other.alternation) {
        const ComponentAlternation &c = *other.alternation;
        alternation.reset(c.clone());
    }
}
/** \brief Allocate a copy of this sequence via the copy constructor. The
 * caller receives a raw owning pointer. */
ComponentSequence *ComponentSequence::clone() const {
    ComponentSequence *copy = new ComponentSequence(*this);
    return copy;
}
/**
 * \brief Mutating visitor entry point.
 *
 * If the visitor replaces this sequence, post() is called and the replacement
 * is returned without visiting the children. Otherwise each child is visited
 * in order and swapped for whatever its accept() returned; children that came
 * back null are then removed.
 */
Component *ComponentSequence::accept(ComponentVisitor &v) {
    assert(!alternation); // Sequence must be finalized first.

    Component *replacement = v.visit(this);
    if (replacement != this) {
        v.post(this);
        return replacement;
    }

    for (auto &slot : children) {
        Component *before = slot.get();
        Component *after = slot->accept(v);
        if (after != before) {
            // Child has been replaced (new Component pointer) or we've been
            // instructed to delete it (null).
            slot.reset(after);
        }
    }

    // Remove deleted children.
    children.erase(remove(children.begin(), children.end(), nullptr),
                   children.end());

    v.post(this);
    return this;
}
/** \brief Read-only visitor entry point: pre(), then every child in order
 * with during() called between consecutive children, then post(). */
void ComponentSequence::accept(ConstComponentVisitor &v) const {
    assert(!alternation); // Sequence must be finalized first.

    v.pre(*this);

    const size_t count = children.size();
    for (size_t idx = 0; idx < count; ++idx) {
        children[idx]->accept(v);
        const bool is_final = (idx + 1 == count);
        if (!is_final) {
            v.during(*this);
        }
    }

    v.post(*this);
}
/** \brief Append \a comp to the end of the sequence, taking ownership. */
void ComponentSequence::addComponent(unique_ptr<Component> comp) {
    children.emplace_back(std::move(comp));
}
/**
 * \brief Wrap the most recently added child in a repeat.
 *
 * \return false (leaving the sequence untouched) if there is no child, the
 *         bounds are inverted, the maximum is zero, or the last child is not
 *         repeatable; true once the child has been replaced by a repeat.
 */
bool ComponentSequence::addRepeat(u32 min, u32 max,
                                  ComponentRepeat::RepeatType type) {
    if (children.empty()) {
        return false;
    }
    if (min > max || max == 0) {
        return false;
    }

    // We can't apply a repeat to some types of component.
    unique_ptr<Component> &tail = children.back();
    assert(tail);
    if (!tail->repeatable()) {
        return false;
    }

    tail = makeComponentRepeat(std::move(tail), min, max, type);
    assert(tail);
    return true;
}
/** \brief Close off the current branch: move the children gathered so far
 * into a fresh sub-sequence and append it to the pending alternation
 * (creating the alternation on first use). */
void ComponentSequence::addAlternation() {
    if (alternation == nullptr) {
        alternation = ue2::make_unique<ComponentAlternation>();
    }

    auto branch = ue2::make_unique<ComponentSequence>();
    // Hand our children over to the branch; we are left with none.
    branch->children.swap(children);
    alternation->append(std::move(branch));
}
/** \brief If an alternation is pending, close its last branch and install
 * the alternation as this sequence's only child; otherwise do nothing. */
void ComponentSequence::finalize() {
    if (!alternation) {
        return;
    }

    addAlternation();
    assert(children.empty());
    children.push_back(std::move(alternation));
    alternation = nullptr;
}
/**
 * \brief Compute the firsts of the sequence.
 *
 * Walks the children in order, splicing each child's firsts in via
 * replaceEpsilons(), and stops after the first child that is not empty. A
 * sequence that yields no firsts reports a single epsilon.
 */
vector<PositionInfo> ComponentSequence::first() const {
    vector<PositionInfo> firsts;

    for (const auto &child : children) {
        vector<PositionInfo> subfirsts = child->first();
        replaceEpsilons(firsts, subfirsts);
        if (!child->empty()) {
            break; // later children cannot start a match
        }
    }

    if (firsts.empty()) {
        DEBUG_PRINTF("trivial empty sequence %zu\n", firsts.size());
        assert(children.empty());
        firsts.push_back(GlushkovBuildState::POS_EPSILON);
    }

    DEBUG_PRINTF("%zu firsts\n", firsts.size());
    return firsts;
}
namespace {

/** \brief Flags accumulated while passing through epsilons; starts out with
 * no flags set. */
struct eps_info {
    u32 flags = 0U;
};

} // namespace
/**
 * \brief Fold the flags of every epsilon found in \a f into \a info.
 *
 * For each epsilon entry in \a f and each existing entry of \a info, the two
 * flag sets are OR-ed together; each distinct resulting flag value is kept
 * once, in first-seen order. \a info is replaced by the result.
 */
static
void epsilonVisit(vector<eps_info> *info, const vector<PositionInfo> &f) {
    assert(!info->empty());

    vector<eps_info> merged;
    merged.reserve(info->size());
    set<u32> seen_flags;

    auto eps = find(f.begin(), f.end(), GlushkovBuildState::POS_EPSILON);
    while (eps != f.end()) {
        for (const eps_info &prior : *info) {
            const u32 flags = prior.flags | eps->flags;
            if (!seen_flags.insert(flags).second) {
                continue; // this flag combination is already recorded
            }
            merged.push_back(prior);
            merged.back().flags = flags;
        }
        eps = find(eps + 1, f.end(), GlushkovBuildState::POS_EPSILON);
    }

    info->swap(merged);
    assert(!info->empty());
}
/** \brief Replace \a lasts with one entry per (last, visit) pair, each
 * carrying the last's flags OR-ed with the visit's flags, then tidy the
 * result with cleanupPositions(). */
static
void applyEpsilonVisits(vector<PositionInfo> &lasts,
                        const vector<eps_info> &eps_visits) {
    vector<PositionInfo> expanded;
    expanded.reserve(lasts.size() * eps_visits.size());

    for (const PositionInfo &last : lasts) {
        for (const eps_info &visit : eps_visits) {
            expanded.push_back(last);
            expanded.back().flags |= visit.flags;
        }
    }

    cleanupPositions(expanded);
    lasts.swap(expanded);
}
/**
 * \brief Compute the lasts of the sequence.
 *
 * Walks the children from the back, collecting each child's lasts, and stops
 * after the first child that is not empty. Flags on the epsilons of the empty
 * children passed over so far are applied to the lasts collected afterwards.
 */
vector<PositionInfo> ComponentSequence::last() const {
    vector<PositionInfo> lasts;
    vector<eps_info> visits(1);

    for (auto it = children.rbegin(); it != children.rend(); ++it) {
        const Component &child = **it;

        vector<PositionInfo> sublasts = child.last();
        applyEpsilonVisits(sublasts, visits);
        lasts.insert(lasts.end(), sublasts.begin(), sublasts.end());

        if (!child.empty()) {
            break;
        }
        // this epsilon's flags should propagate to subsequent lasts'
        // enter/exit lists
        epsilonVisit(&visits, child.first());
    }

    DEBUG_PRINTF("lasts = %s\n",
                 dumpPositions(lasts.begin(), lasts.end()).c_str());
    return lasts;
}
bool ComponentSequence::empty(void) const {
// a sequence can be empty if all its subcomponents can be empty
for (const auto &c : children) {
if (!c->empty()) {
return false;
}
}
return true;
}
/** \brief Let each child note its positions in order, then record the range
 * of builder vertices that were added while doing so. */
void ComponentSequence::notePositions(GlushkovBuildState &bs) {
    const u32 begin_pos = bs.getBuilder().numVertices();

    for (auto &child : children) {
        child->notePositions(bs);
    }

    recordPosBounds(begin_pos, bs.getBuilder().numVertices());
}
/**
 * \brief Build follow sets for the sequence.
 *
 * Each child builds its own follow set; consecutive children are then joined
 * by connecting the running set of lasts to the next child's firsts.
 *
 * \param bs      Build state receiving the follow-set edges.
 * \param lastPos Lasts of whatever precedes this component.
 */
void ComponentSequence::buildFollowSet(GlushkovBuildState &bs,
                                       const vector<PositionInfo> &lastPos) {
    DEBUG_PRINTF("sequence of %zu components\n", children.size());

    // If no components, no work to do.
    if (children.empty()) {
        return;
    }

    // First element
    Component &head = *children.front();
    head.buildFollowSet(bs, lastPos);

    // If our sequence contains precisely one component, then we've done
    // all our work. Hooking up its firsts and lasts will be done by our
    // parent component.
    if (children.size() == 1) {
        return;
    }

    // Remaining elements, wiring last to first in sequence.
    vector<PositionInfo> prevLasts = head.last();

    for (auto it = next(children.begin()), ite = children.end(); it != ite;
         ++it) {
        assert(*it);
        Component &cur = **it;

        // Build subcomponent follow set
        cur.buildFollowSet(bs, prevLasts);

        // LAST(prev) => FIRST(curr)
        vector<PositionInfo> currFirsts = cur.first();
        DEBUG_PRINTF("connecting lasts (|| %zu) to firsts of comp %zd\n",
                     prevLasts.size(), it - children.begin());
        bs.connectRegions(prevLasts, currFirsts);

        // Generate a new LAST(prev) for the next iteration.
        vector<PositionInfo> currLasts = cur.last();
        if (cur.empty()) {
            // The component can be skipped, so the previous lasts stay live
            // and the current lasts are added to them.
            DEBUG_PRINTF("doing stuff for empty comp\n");
            prevLasts.insert(prevLasts.end(), currLasts.begin(),
                             currLasts.end());
            DEBUG_PRINTF("done stuff for empty comp\n");
        } else {
            // The component must be matched, so only its lasts survive.
            prevLasts.swap(currLasts);
            DEBUG_PRINTF("swapped lasts\n");
        }
    }
}
/** \brief Thread \a at_start through every child's start-anchor check, front
 * to back, and return the final value. */
bool ComponentSequence::checkEmbeddedStartAnchor(bool at_start) const {
    bool state = at_start;
    for (auto it = children.begin(); it != children.end(); ++it) {
        state = (*it)->checkEmbeddedStartAnchor(state);
    }
    return state;
}
/** \brief Thread \a at_end through every child's end-anchor check, back to
 * front, and return the final value. */
bool ComponentSequence::checkEmbeddedEndAnchor(bool at_end) const {
    bool state = at_end;
    // Note reversed ordering.
    for (size_t remaining = children.size(); remaining > 0; --remaining) {
        state = children[remaining - 1]->checkEmbeddedEndAnchor(state);
    }
    return state;
}
bool ComponentSequence::vacuous_everywhere() const {
for (const auto &c : children) {
if (!c->vacuous_everywhere()) {
return false;
}
}
return true;
}
/**
 * \brief Optimise each child in order.
 *
 * While \a connected_to_sds still holds, children that turn out to be vacuous
 * everywhere are deleted. The first child that is kept clears the flag for
 * all children after it.
 */
void ComponentSequence::optimise(bool connected_to_sds) {
    DEBUG_PRINTF("opt %d\n", (int)connected_to_sds);

    u32 i = 0;
    while (i < children.size()) {
        DEBUG_PRINTF("opt %u: ctsds: %d\n", i, (int)connected_to_sds);
        Component &sub = *children[i];

        sub.optimise(connected_to_sds);

        const bool vacuous = sub.vacuous_everywhere();
        if (connected_to_sds && vacuous) {
            DEBUG_PRINTF("delete opt %u\n", i);
            // Erasing shifts the next child into slot i, so do not advance.
            children.erase(children.begin() + i);
            continue;
        }

        // Reaching here means (connected_to_sds && vacuous) is false, which
        // is exactly the value the flag takes for the remaining children.
        connected_to_sds = false;
        i++;
    }
}
} // namespace ue2

View File

@@ -0,0 +1,108 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Sequence of Component objects.
*/
#ifndef COMPONENT_SEQUENCE_H
#define COMPONENT_SEQUENCE_H
#include "Component.h"
#include "ComponentRepeat.h" // for ComponentRepeat::RepeatType
#include "ue2common.h"
#include <memory>
#include <set>
#include <vector>
namespace ue2 {
class ComponentAlternation;
class GlushkovBuildState;
// Encapsulates a number of sub expressions to be applied sequentially
class ComponentSequence : public Component {
friend class DumpVisitor;
friend class PrintVisitor;
friend class SimplifyVisitor;
public:
/** \brief capture index representing a sequence that ISN'T capturing */
static constexpr unsigned int NOT_CAPTURED = 65536;
/** \brief Construct an empty sequence with capture index NOT_CAPTURED. */
ComponentSequence();
~ComponentSequence() override;
/** \brief Heap-allocated copy; children and any pending alternation are
* cloned as well. */
ComponentSequence *clone() const override;
Component *accept(ComponentVisitor &v) override;
void accept(ConstComponentVisitor &v) const override;
/** \brief Wrap the last child in a repeat with the given bounds and type.
* Returns false if there is no child, \a min exceeds \a max, \a max is zero,
* or the last child is not repeatable. */
bool addRepeat(u32 min, u32 max, ComponentRepeat::RepeatType type);
// overridden by ComponentCondReference, which can only have 1 or 2
// branches.
/** \brief Move the children gathered so far into a new branch of the
* pending alternation, creating the alternation if needed. */
virtual void addAlternation();
/** \brief If an alternation is pending, close its last branch and make the
* alternation this sequence's only child. */
virtual void finalize();
/** \brief Append \a comp to the sequence, taking ownership. */
void addComponent(std::unique_ptr<Component> comp);
std::vector<PositionInfo> first() const override;
std::vector<PositionInfo> last() const override;
/** \brief True when every child is empty. */
bool empty(void) const override;
/** \brief True when every child is vacuous everywhere. */
bool vacuous_everywhere() const override;
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos) override;
bool checkEmbeddedStartAnchor(bool at_start) const override;
bool checkEmbeddedEndAnchor(bool at_end) const override;
void optimise(bool connected_to_sds) override;
// Accessors for the capture index and capture name.
void setCaptureIndex(unsigned int idx) { capture_index = idx; }
unsigned int getCaptureIndex() const { return capture_index; }
void setCaptureName(const std::string &s) { capture_name = s; }
const std::string &getCaptureName() const { return capture_name; }
/** \brief Read-only access to the child components, in sequence order. */
virtual const std::vector<std::unique_ptr<Component>> &getChildren() const {
return children;
}
protected:
/** Copy constructor, used by clone(). */
ComponentSequence(const ComponentSequence &other);
/** Child components, in the order they are applied. */
std::vector<std::unique_ptr<Component>> children;
/** Alternation under construction; non-null between the first
* addAlternation() call and finalize(). */
std::unique_ptr<ComponentAlternation> alternation;
private:
/** Capture index, or NOT_CAPTURED. */
unsigned int capture_index;
std::string capture_name; //!< empty means no name
};
} // namespace ue2
#endif

View File

@@ -0,0 +1,76 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "AsciiComponentClass.h"
#include "ComponentVisitor.h"
#include "ComponentAlternation.h"
#include "ComponentAssertion.h"
#include "ComponentAtomicGroup.h"
#include "ComponentBackReference.h"
#include "ComponentBoundary.h"
#include "ComponentByte.h"
#include "ComponentCondReference.h"
#include "ComponentClass.h"
#include "ComponentEmpty.h"
#include "ComponentEUS.h"
#include "ComponentRepeat.h"
#include "ComponentSequence.h"
#include "ComponentWordBoundary.h"
#include "Utf8ComponentClass.h"
namespace ue2 {
// Out-of-line definition of the virtual destructor; there is nothing to do.
ComponentVisitor::~ComponentVisitor() = default;
// Default implementations: construction and destruction have nothing to do.
DefaultComponentVisitor::DefaultComponentVisitor() = default;

DefaultComponentVisitor::~DefaultComponentVisitor() = default;
// Defines the two DefaultComponentVisitor members for component type `comp`:
// visit() hands back the pointer it was given, and post() does nothing.
#define DEFAULT_FUNCS(comp) \
Component *DefaultComponentVisitor::visit(comp *c) { return c; } \
void DefaultComponentVisitor::post(comp *) {}
// One instantiation per visit()/post() overload pair declared by
// ComponentVisitor.
DEFAULT_FUNCS(AsciiComponentClass)
DEFAULT_FUNCS(ComponentAlternation)
DEFAULT_FUNCS(ComponentAssertion)
DEFAULT_FUNCS(ComponentAtomicGroup)
DEFAULT_FUNCS(ComponentBackReference)
DEFAULT_FUNCS(ComponentBoundary)
DEFAULT_FUNCS(ComponentByte)
DEFAULT_FUNCS(ComponentCondReference)
DEFAULT_FUNCS(ComponentEmpty)
DEFAULT_FUNCS(ComponentEUS)
DEFAULT_FUNCS(ComponentRepeat)
DEFAULT_FUNCS(ComponentSequence)
DEFAULT_FUNCS(ComponentWordBoundary)
DEFAULT_FUNCS(UTF8ComponentClass)
} // namespace

View File

@@ -0,0 +1,150 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Visitor base class for working with the component tree.
*/
#ifndef COMPONENTVISITOR_H
#define COMPONENTVISITOR_H
namespace ue2 {
class AsciiComponentClass;
class Component;
class ComponentAlternation;
class ComponentAssertion;
class ComponentAtomicGroup;
class ComponentBackReference;
class ComponentBoundary;
class ComponentByte;
class ComponentClass;
class ComponentCondReference;
class ComponentEmpty;
class ComponentEUS;
class ComponentRepeat;
class ComponentSequence;
class ComponentWordBoundary;
class UTF8ComponentClass;
/**
* \brief Visitor base class for working with the component tree.
*
* Our approach to implementing the visitor pattern for traversing (and
* optionally mutating) the Component tree for a pattern. Each _visit_ function
* takes a Component subclass pointer in and returns a Component pointer. That
* pointer can have several values, dictating what the containing Component
* should do:
*
* 1. If ptr == c, then do nothing.
* 2. If ptr == nullptr, then remove c from the tree.
* 3. If ptr != c && ptr != nullptr, then replace c with ptr.
*
* Traversal order is pre-order.
*
* After a Component's subcomponents have been visited, the _post_ function for
* that Component will be called.
*
* Note: as implemented by ComponentRepeat::accept and
* ComponentSequence::accept, when _visit_ returns something other than c,
* _post_ is still called on the original c, but its subcomponents are not
* visited.
*/
class ComponentVisitor {
public:
virtual ~ComponentVisitor();
// Called on a component before any of its subcomponents are visited; the
// return value selects keep / remove / replace as described above.
virtual Component *visit(AsciiComponentClass *c) = 0;
virtual Component *visit(ComponentAlternation *c) = 0;
virtual Component *visit(ComponentAssertion *c) = 0;
virtual Component *visit(ComponentAtomicGroup *c) = 0;
virtual Component *visit(ComponentBackReference *c) = 0;
virtual Component *visit(ComponentBoundary *c) = 0;
virtual Component *visit(ComponentByte *c) = 0;
virtual Component *visit(ComponentCondReference *c) = 0;
virtual Component *visit(ComponentEmpty *c) = 0;
virtual Component *visit(ComponentEUS *c) = 0;
virtual Component *visit(ComponentRepeat *c) = 0;
virtual Component *visit(ComponentSequence *c) = 0;
virtual Component *visit(ComponentWordBoundary *c) = 0;
virtual Component *visit(UTF8ComponentClass *c) = 0;
// Called on a component once the visit of it is complete.
virtual void post(AsciiComponentClass *c) = 0;
virtual void post(ComponentAlternation *c) = 0;
virtual void post(ComponentAssertion *c) = 0;
virtual void post(ComponentAtomicGroup *c) = 0;
virtual void post(ComponentBackReference *c) = 0;
virtual void post(ComponentBoundary *c) = 0;
virtual void post(ComponentByte *c) = 0;
virtual void post(ComponentCondReference *c) = 0;
virtual void post(ComponentEmpty *c) = 0;
virtual void post(ComponentEUS *c) = 0;
virtual void post(ComponentRepeat *c) = 0;
virtual void post(ComponentSequence *c) = 0;
virtual void post(ComponentWordBoundary *c) = 0;
virtual void post(UTF8ComponentClass *c) = 0;
};
/**
* \brief Concrete subclass of ComponentVisitor with default behaviour,
* allowing you to just implement the member functions you need.
*
* The defaults (generated by the DEFAULT_FUNCS macro in the implementation
* file) leave the tree untouched: every _visit_ returns the pointer it was
* given and every _post_ does nothing.
*/
class DefaultComponentVisitor : public ComponentVisitor {
public:
DefaultComponentVisitor();
~DefaultComponentVisitor() override;
// Default visit(): returns c unchanged.
Component *visit(AsciiComponentClass *c) override;
Component *visit(ComponentAlternation *c) override;
Component *visit(ComponentAssertion *c) override;
Component *visit(ComponentAtomicGroup *c) override;
Component *visit(ComponentBackReference *c) override;
Component *visit(ComponentBoundary *c) override;
Component *visit(ComponentByte *c) override;
Component *visit(ComponentCondReference *c) override;
Component *visit(ComponentEmpty *c) override;
Component *visit(ComponentEUS *c) override;
Component *visit(ComponentRepeat *c) override;
Component *visit(ComponentSequence *c) override;
Component *visit(ComponentWordBoundary *c) override;
Component *visit(UTF8ComponentClass *c) override;
// Default post(): no-op.
void post(AsciiComponentClass *c) override;
void post(ComponentAlternation *c) override;
void post(ComponentAssertion *c) override;
void post(ComponentAtomicGroup *c) override;
void post(ComponentBackReference *c) override;
void post(ComponentBoundary *c) override;
void post(ComponentByte *c) override;
void post(ComponentCondReference *c) override;
void post(ComponentEmpty *c) override;
void post(ComponentEUS *c) override;
void post(ComponentRepeat *c) override;
void post(ComponentSequence *c) override;
void post(ComponentWordBoundary *c) override;
void post(UTF8ComponentClass *c) override;
};
} // namespace ue2
#endif // COMPONENTVISITOR_H

View File

@@ -0,0 +1,105 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Word Boundary Assertion (\\b or \\B)
*/
#include "ComponentWordBoundary.h"
#include "buildstate.h"
#include "parse_error.h"
#include "Parser.h"
#include "position_info.h"
#include "nfagraph/ng_builder.h"
using namespace std;
namespace ue2 {
/**
 * \brief Construct a word boundary assertion.
 *
 * \param loc_in Stored as the pattern location of this component.
 * \param neg    Stored as the "negated" flag.
 * \param mode   Only its \c ucp member is read and copied.
 *
 * The position starts out as GlushkovBuildState::POS_UNINITIALIZED (it is
 * assigned later by notePositions()) and the prefilter flag starts out false.
 */
ComponentWordBoundary::ComponentWordBoundary(u32 loc_in, bool neg,
const ParseMode &mode)
: loc(loc_in), position(GlushkovBuildState::POS_UNINITIALIZED),
negated(neg), ucp(mode.ucp), prefilter(false) {}
/** \brief Destructor; there is no work to do here. */
ComponentWordBoundary::~ComponentWordBoundary() {
// empty
}
/**
 * \brief Return a copy of this component allocated with \c new.
 *
 * NOTE(review): the result is a raw pointer; presumably the caller takes
 * ownership of it - confirm against the callers.
 */
ComponentWordBoundary * ComponentWordBoundary::clone() const {
return new ComponentWordBoundary(*this);
}
/**
 * \brief The firsts of a word boundary: a one-element list holding the
 * position of this assertion.
 */
vector<PositionInfo> ComponentWordBoundary::first() const {
    return vector<PositionInfo>(1, PositionInfo(position));
}
/** \brief The lasts of a word boundary; identical to first(). */
vector<PositionInfo> ComponentWordBoundary::last() const {
// The component occupies a single position, so its lasts are its firsts.
return first();
}
/** \brief Always returns false for a word boundary. */
bool ComponentWordBoundary::empty() const {
return false;
}
/** \brief Always returns false for a word boundary. */
bool ComponentWordBoundary::repeatable() const {
return false;
}
/**
 * \brief Allocate the single position used by this assertion and tag it with
 * the matching pair of word-boundary assert flags.
 *
 * A negated assertion gets the word-to-word and nonword-to-nonword flags; a
 * non-negated one gets the word-to-nonword and nonword-to-word flags. The
 * _UCP flavours are used when \c ucp is set, which is only expected in
 * prefiltering mode.
 */
void ComponentWordBoundary::notePositions(GlushkovBuildState &bs) {
    NFABuilder &nfa = bs.getBuilder();
    position = nfa.makePositions(1);

    if (!ucp) {
        nfa.setAssertFlag(position,
                          negated ? (POS_FLAG_ASSERT_WORD_TO_WORD
                                     | POS_FLAG_ASSERT_NONWORD_TO_NONWORD)
                                  : (POS_FLAG_ASSERT_WORD_TO_NONWORD
                                     | POS_FLAG_ASSERT_NONWORD_TO_WORD));
    } else {
        assert(prefilter); // only in prefiltering mode!
        nfa.setAssertFlag(position,
                          negated ? (POS_FLAG_ASSERT_WORD_TO_WORD_UCP
                                     | POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP)
                                  : (POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP
                                     | POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP));
    }

    // The component covers exactly one position: [position, position + 1).
    recordPosBounds(position, position + 1);
}
/**
 * \brief Intentionally does nothing; both arguments are ignored.
 */
void ComponentWordBoundary::buildFollowSet(GlushkovBuildState&,
const vector<PositionInfo>&) {
// No internal connections, nowt to do
}
} // namespace ue2

View File

@@ -0,0 +1,90 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Word Boundary Assertion (\\b or \\B)
*/
#ifndef _RE_COMPONENTWORDBOUNDARY_H_
#define _RE_COMPONENTWORDBOUNDARY_H_
#include "Component.h"
#include "position.h"
namespace ue2 {
struct ParseMode;
/** \brief Encapsulates a positive (\\b) or negative (\\B) word boundary
 * assertion.
 *
 * The component occupies a single position, which is allocated in
 * notePositions().
 */
class ComponentWordBoundary : public Component {
// These visitors are granted access to the private state below.
friend class DumpVisitor;
friend class PrintVisitor;
friend class UnsupportedVisitor;
public:
ComponentWordBoundary(u32 loc, bool negated, const ParseMode &mode);
~ComponentWordBoundary() override;
ComponentWordBoundary *clone() const override;
/** \brief Mutating visit: calls visit() then post() on \a v and returns
 * whatever visit() returned. */
Component *accept(ComponentVisitor &v) override {
Component *c = v.visit(this);
v.post(this);
return c;
}
/** \brief Const visit: calls pre(), during() and post() on \a v, in that
 * order. */
void accept(ConstComponentVisitor &v) const override {
v.pre(*this);
v.during(*this);
v.post(*this);
}
std::vector<PositionInfo> first() const override;
std::vector<PositionInfo> last() const override;
bool empty() const override;
bool repeatable() const override;
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &lastPos) override;
/** \brief Set the prefilter flag. */
void setPrefilter(bool p) { prefilter = p; }
private:
u32 loc; //!< location in pattern for error reporting.
Position position; //!< position of this assertion, set in notePositions()
bool negated; //!< constructor's "negated" argument
bool ucp; //!< copied from ParseMode::ucp at construction
bool prefilter; //!< set by PrefilterVisitor, this is ugly
/** \brief Member-wise copy constructor; private, used by clone(). */
ComponentWordBoundary(const ComponentWordBoundary &other)
: Component(other), loc(other.loc), position(other.position),
negated(other.negated), ucp(other.ucp), prefilter(other.prefilter) {}
};
} // namespace ue2
#endif

View File

@@ -0,0 +1,78 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "ConstComponentVisitor.h"
#include "AsciiComponentClass.h"
#include "ComponentAlternation.h"
#include "ComponentAssertion.h"
#include "ComponentAtomicGroup.h"
#include "ComponentBackReference.h"
#include "ComponentBoundary.h"
#include "ComponentByte.h"
#include "ComponentCondReference.h"
#include "ComponentClass.h"
#include "ComponentEmpty.h"
#include "ComponentEUS.h"
#include "ComponentRepeat.h"
#include "ComponentSequence.h"
#include "ComponentWordBoundary.h"
#include "Utf8ComponentClass.h"
namespace ue2 {
/** \brief Out-of-line definition of the virtual destructor; no work to do. */
ConstComponentVisitor::~ConstComponentVisitor() {
// empty
}
// Default implementations.
// The constructor and destructor of DefaultConstComponentVisitor have empty
// bodies.
DefaultConstComponentVisitor::DefaultConstComponentVisitor() {}
DefaultConstComponentVisitor::~DefaultConstComponentVisitor() {}
// DEFAULT_FUNCS(comp) defines the pre(), during() and post() members of
// DefaultConstComponentVisitor for component type `comp`, each with an empty
// body that ignores its argument. It is expanded once per component type
// below.
#define DEFAULT_FUNCS(comp) \
void DefaultConstComponentVisitor::pre(const comp &) {} \
void DefaultConstComponentVisitor::during(const comp &) {} \
void DefaultConstComponentVisitor::post(const comp &) {}
DEFAULT_FUNCS(AsciiComponentClass)
DEFAULT_FUNCS(ComponentAlternation)
DEFAULT_FUNCS(ComponentAssertion)
DEFAULT_FUNCS(ComponentAtomicGroup)
DEFAULT_FUNCS(ComponentBackReference)
DEFAULT_FUNCS(ComponentBoundary)
DEFAULT_FUNCS(ComponentByte)
DEFAULT_FUNCS(ComponentCondReference)
DEFAULT_FUNCS(ComponentEmpty)
DEFAULT_FUNCS(ComponentEUS)
DEFAULT_FUNCS(ComponentRepeat)
DEFAULT_FUNCS(ComponentSequence)
DEFAULT_FUNCS(ComponentWordBoundary)
DEFAULT_FUNCS(UTF8ComponentClass)
} // namespace ue2

View File

@@ -0,0 +1,170 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Visitor base class for working with the component tree.
*/
#ifndef CONSTCOMPONENTVISITOR_H
#define CONSTCOMPONENTVISITOR_H
namespace ue2 {
class AsciiComponentClass;
class Component;
class ComponentAlternation;
class ComponentAssertion;
class ComponentAtomicGroup;
class ComponentBackReference;
class ComponentBoundary;
class ComponentByte;
class ComponentCondReference;
class ComponentClass;
class ComponentEmpty;
class ComponentEUS;
class ComponentRepeat;
class ComponentSequence;
class ComponentWordBoundary;
class UTF8ComponentClass;
/**
 * \brief Visitor base class for traversing an immutable component tree.
 *
 * Our approach to implementing the visitor pattern for traversing the
 * Component tree for a pattern. This version operates on an immutable tree;
 * use \ref ComponentVisitor if you need to make changes to components during
 * traversal.
 *
 * Three hooks - pre(), during() and post() - are declared for each component
 * type, all pure virtual. Every hook receives the component by const
 * reference.
 */
class ConstComponentVisitor {
public:
virtual ~ConstComponentVisitor();
// pre() hooks, one per component type.
virtual void pre(const AsciiComponentClass &c) = 0;
virtual void pre(const ComponentAlternation &c) = 0;
virtual void pre(const ComponentAssertion &c) = 0;
virtual void pre(const ComponentAtomicGroup &c) = 0;
virtual void pre(const ComponentBackReference &c) = 0;
virtual void pre(const ComponentBoundary &c) = 0;
virtual void pre(const ComponentByte &c) = 0;
virtual void pre(const ComponentCondReference &c) = 0;
virtual void pre(const ComponentEmpty &c) = 0;
virtual void pre(const ComponentEUS &c) = 0;
virtual void pre(const ComponentRepeat &c) = 0;
virtual void pre(const ComponentSequence &c) = 0;
virtual void pre(const ComponentWordBoundary &c) = 0;
virtual void pre(const UTF8ComponentClass &c) = 0;
// during() hooks, one per component type.
virtual void during(const AsciiComponentClass &c) = 0;
virtual void during(const ComponentAlternation &c) = 0;
virtual void during(const ComponentAssertion &c) = 0;
virtual void during(const ComponentAtomicGroup &c) = 0;
virtual void during(const ComponentBackReference &c) = 0;
virtual void during(const ComponentBoundary &c) = 0;
virtual void during(const ComponentByte &c) = 0;
virtual void during(const ComponentCondReference &c) = 0;
virtual void during(const ComponentEmpty &c) = 0;
virtual void during(const ComponentEUS &c) = 0;
virtual void during(const ComponentRepeat &c) = 0;
virtual void during(const ComponentSequence &c) = 0;
virtual void during(const ComponentWordBoundary &c) = 0;
virtual void during(const UTF8ComponentClass &c) = 0;
// post() hooks, one per component type.
virtual void post(const AsciiComponentClass &c) = 0;
virtual void post(const ComponentAlternation &c) = 0;
virtual void post(const ComponentAssertion &c) = 0;
virtual void post(const ComponentAtomicGroup &c) = 0;
virtual void post(const ComponentBackReference &c) = 0;
virtual void post(const ComponentBoundary &c) = 0;
virtual void post(const ComponentByte &c) = 0;
virtual void post(const ComponentCondReference &c) = 0;
virtual void post(const ComponentEmpty &c) = 0;
virtual void post(const ComponentEUS &c) = 0;
virtual void post(const ComponentRepeat &c) = 0;
virtual void post(const ComponentSequence &c) = 0;
virtual void post(const ComponentWordBoundary &c) = 0;
virtual void post(const UTF8ComponentClass &c) = 0;
};
/**
 * \brief Concrete subclass of ConstComponentVisitor with default behaviour,
 * allowing you to just implement the member functions you need.
 *
 * Every pre(), during() and post() hook declared below is overridden here;
 * the bodies are defined out of line. Derive from this class and override
 * only the hooks you are interested in.
 */
class DefaultConstComponentVisitor : public ConstComponentVisitor {
public:
DefaultConstComponentVisitor();
~DefaultConstComponentVisitor() override;
// pre() hooks, one per component type.
void pre(const AsciiComponentClass &c) override;
void pre(const ComponentAlternation &c) override;
void pre(const ComponentAssertion &c) override;
void pre(const ComponentAtomicGroup &c) override;
void pre(const ComponentBackReference &c) override;
void pre(const ComponentBoundary &c) override;
void pre(const ComponentByte &c) override;
void pre(const ComponentCondReference &c) override;
void pre(const ComponentEmpty &c) override;
void pre(const ComponentEUS &c) override;
void pre(const ComponentRepeat &c) override;
void pre(const ComponentSequence &c) override;
void pre(const ComponentWordBoundary &c) override;
void pre(const UTF8ComponentClass &c) override;
// during() hooks, one per component type.
void during(const AsciiComponentClass &c) override;
void during(const ComponentAlternation &c) override;
void during(const ComponentAssertion &c) override;
void during(const ComponentAtomicGroup &c) override;
void during(const ComponentBackReference &c) override;
void during(const ComponentBoundary &c) override;
void during(const ComponentByte &c) override;
void during(const ComponentCondReference &c) override;
void during(const ComponentEmpty &c) override;
void during(const ComponentEUS &c) override;
void during(const ComponentRepeat &c) override;
void during(const ComponentSequence &c) override;
void during(const ComponentWordBoundary &c) override;
void during(const UTF8ComponentClass &c) override;
// post() hooks, one per component type.
void post(const AsciiComponentClass &c) override;
void post(const ComponentAlternation &c) override;
void post(const ComponentAssertion &c) override;
void post(const ComponentAtomicGroup &c) override;
void post(const ComponentBackReference &c) override;
void post(const ComponentBoundary &c) override;
void post(const ComponentByte &c) override;
void post(const ComponentCondReference &c) override;
void post(const ComponentEmpty &c) override;
void post(const ComponentEUS &c) override;
void post(const ComponentRepeat &c) override;
void post(const ComponentSequence &c) override;
void post(const ComponentWordBoundary &c) override;
void post(const UTF8ComponentClass &c) override;
};
} // namespace ue2
#endif // CONSTCOMPONENTVISITOR_H

76
src/parser/Parser.h Normal file
View File

@@ -0,0 +1,76 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Interface to Parser.
*/
#ifndef _RE_PARSER_H_
#define _RE_PARSER_H_
#include "ue2common.h"
#include <memory>
namespace ue2 {
class Component;
/** \brief Represents the current "mode flags" at any point in the parsing
 * process.
 *
 * This is necessary as some modes can be changed part-way through an
 * expression, such as in:
 *
 *     /foo(?i)bar/
 *
 * Every flag defaults to false through its in-class initialiser.
 */
struct ParseMode {
/** \brief Default mode: every flag is left at false. */
ParseMode() {}
/** \brief Build a mode from \a hs_flags; defined out of line.
 *
 * NOTE(review): \a hs_flags is presumably the set of expression compile
 * flags - confirm against the definition. */
explicit ParseMode(u32 hs_flags);
// The individual mode flags, all off by default.
bool caseless = false;
bool dotall = false;
bool ignore_space = false;
bool multiline = false;
bool ucp = false;
bool utf8 = false;
};
/** \brief Parse the given regular expression into a \ref Component tree.
 *
 * The \a mode parameter should contain the initial mode flags, and will be
 * updated by the parser if additional global flags are introduced in the
 * expression (for example, via "(*UTF8)".)
 *
 * This call will throw a ParseError on failure.
 *
 * \param ptr  The regular expression text to parse.
 * \param mode Initial mode flags; may be modified as described above.
 * \return The parsed tree, owned by the returned std::unique_ptr.
 */
std::unique_ptr<Component> parse(const char *const ptr, ParseMode &mode);
} // namespace ue2
#endif // _RE_PARSER_H_

1964
src/parser/Parser.rl Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,115 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Character class in UTF-8 mode.
*/
#ifndef UTF8_COMPONENT_CLASS_H
#define UTF8_COMPONENT_CLASS_H
#include "ComponentClass.h"
#include "ue2common.h"
#include "util/unicode_set.h"
#include <map>
#include <set>
#include <string>
#include <vector>
namespace ue2 {
/** \brief Character class in UTF-8 mode. */
class UTF8ComponentClass : public ComponentClass {
// These visitors are granted access to the private state below.
friend class DumpVisitor;
friend class PrintVisitor;
friend class CaselessVisitor;
friend class SimplifyVisitor;
friend class SimplifyCandidatesVisitor;
public:
explicit UTF8ComponentClass(const ParseMode &mode);
~UTF8ComponentClass() override {}
UTF8ComponentClass *clone() const override;
/** \brief Mutating visit: calls visit() then post() on \a v and returns
 * whatever visit() returned. */
Component *accept(ComponentVisitor &v) override {
Component *c = v.visit(this);
v.post(this);
return c;
}
/** \brief Const visit: calls pre(), during() and post() on \a v, in that
 * order. */
void accept(ConstComponentVisitor &v) const override {
v.pre(*this);
v.during(*this);
v.post(*this);
}
// Overrides of the base class interface; all defined out of line.
bool class_empty(void) const override;
void add(PredefinedClass c, bool negative) override;
void add(unichar c) override;
void finalize(void) override;
void notePositions(GlushkovBuildState &bs) override;
void buildFollowSet(GlushkovBuildState &bs,
const std::vector<PositionInfo> &) override;
std::vector<PositionInfo> first(void) const override;
std::vector<PositionInfo> last(void) const override;
protected:
void createRange(unichar to) override;
private:
// Private build helpers; all defined out of line.
// NOTE(review): from their names these presumably construct the NFA
// positions for one- to four-byte UTF-8 encodings - confirm against the
// implementation file.
Position getHead(NFABuilder &builder, u8 first_byte);
void addToTail(GlushkovBuildState &bs, std::map<Position, Position> &finals,
Position prev, unichar b, unichar e);
void ensureDotTrailer(GlushkovBuildState &bs);
void ensureTwoDotTrailer(GlushkovBuildState &bs);
void ensureThreeDotTrailer(GlushkovBuildState &bs);
void buildOneByte(GlushkovBuildState &bs);
void buildTwoByte(GlushkovBuildState &bs);
void buildThreeByte(GlushkovBuildState &bs);
void buildFourByte(GlushkovBuildState &bs);
// Private state. NOTE(review): the meaning of each member is not visible
// from this header; see the implementation file.
CodePointSet cps;
CodePointSet cps_ucp;
std::map<u8, Position> heads;
Position single_pos;
Position one_dot_trailer;
Position two_dot_trailer;
Position three_dot_trailer;
Position two_char_dot_head;
Position three_char_dot_head;
Position four_char_dot_head;
std::set<Position> tails;
};
// Free helpers declared here and defined out of line.
// NOTE(review): from the names, translateForUcpMode presumably maps a
// predefined class to the one to use under the given mode, and isUcp
// presumably reports whether a predefined class is a UCP one - confirm
// against the definitions.
PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
bool isUcp(PredefinedClass c);
} // namespace
#endif // UTF8_COMPONENT_CLASS_H

527
src/parser/buildstate.cpp Normal file
View File

@@ -0,0 +1,527 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Glushkov construction.
*/
#include "buildstate.h"
#include "position.h"
#include "position_dump.h"
#include "position_info.h"
#include "parse_error.h"
#include "hs_internal.h"
#include "ue2common.h"
#include "nfagraph/ng_builder.h"
#include "util/charreach.h"
#include "util/container.h"
#include "util/make_unique.h"
#include "util/ue2_containers.h"
#include <algorithm>
#include <iterator>
#include <limits>
#include <map>
#include <utility>
#if defined(DEBUG) || defined(DUMP_SUPPORT)
#include <ostream>
#include <sstream>
#endif
using namespace std;
namespace ue2 {
/** \brief Represents an uninitialized state.
 *
 * Defined as the largest value representable by Position. */
const Position GlushkovBuildState::POS_UNINITIALIZED =
numeric_limits<Position>::max();
/** \brief Represents an epsilon transition in the firsts of a component.
 *
 * Defined as the second-largest value representable by Position. */
const Position GlushkovBuildState::POS_EPSILON =
numeric_limits<Position>::max() - 1;
/** \brief Out-of-line definition of the destructor; no work to do. */
GlushkovBuildState::~GlushkovBuildState() { }
namespace /* anonymous */ {
/**
 * \brief Predicate functor: true for a position that carries every bit of
 * the flag mask given at construction.
 */
class CheckPositionFlags {
    int mask; //!< bits that must all be present in PositionInfo::flags

public:
    explicit CheckPositionFlags(int fl) : mask(fl) {}

    bool operator()(const PositionInfo &p) const {
        return mask == (p.flags & mask);
    }
};
/**
 * \brief Predicate functor: true for an epsilon position that has no flags
 * set at all.
 */
class CheckUnflaggedEpsilon {
public:
    bool operator()(const PositionInfo &p) const {
        if (p.flags != 0) {
            return false;
        }
        return p.pos == GlushkovBuildState::POS_EPSILON;
    }
};
/** \brief Concrete impl of the GlushkovBuildState interface.
 *
 * Caches the builder's special positions and accumulates, per position, the
 * set of successors that will later be turned into edges. */
class GlushkovBuildStateImpl : public GlushkovBuildState {
public:
/** \brief Capture the special positions of \a b, wire the start states
 * together, and remember whether we are in prefilter mode. */
GlushkovBuildStateImpl(NFABuilder &b, bool prefilter);
/** \brief Returns a reference to the NFABuilder being used. */
NFABuilder &getBuilder() override { return builder; }
/** \brief Returns a const reference to the NFABuilder being used. */
const NFABuilder &getBuilder() const override { return builder; }
/** \brief Wire up the lasts of one component to the firsts of another. */
void connectRegions(const vector<PositionInfo> &lasts,
const vector<PositionInfo> &firsts) override;
/** \brief Wire the lasts of the main sequence to accepts. */
void connectAccepts(const vector<PositionInfo> &lasts) override;
/** \brief Wire up a single last to a list of firsts.
 *
 * \a firsts is taken by value, so the callee works on its own copy. */
void connectSuccessors(const PositionInfo &last,
vector<PositionInfo> firsts);
/** Wire up a pair of positions. */
void addSuccessor(Position from, Position to) override;
/** \brief Clone the vertex properties and edges of all vertices between
 * two positions. */
void cloneFollowSet(Position from, Position to, unsigned offset) override;
/** \brief Build the prioritised list of edges out of our successor map. */
void buildEdges() override;
/** Construct an edge, called internally by \ref buildEdges. */
void buildEdge(Position from, const PositionInfo &to);
Position startState; //!< builder.getStart() at construction
Position startDotstarState; //!< builder.getStartDotStar() at construction
Position acceptState; //!< builder.getAccept() at construction
Position acceptEodState; //!< builder.getAcceptEOD() at construction
/** \brief Newline position leading to acceptEod; POS_UNINITIALIZED until it
 * is first needed. */
Position acceptNlEodState;
/** \brief Newline position leading to accept; POS_UNINITIALIZED until it is
 * first needed. */
Position acceptNlState;
NFABuilder &builder; //!< \brief builder for the NFAGraph
bool doPrefilter; //!< \brief we're building a prefiltering pattern
/** \brief Map storing successors for each position. */
map<Position, flat_set<PositionInfo>> successors;
};
} // namespace
/**
 * \brief Set up the build state around the NFABuilder \a b.
 *
 * Caches the builder's start, start-dotstar, accept and accept-EOD
 * positions, leaves the two newline-accept positions uninitialised, and
 * wires the special start states together.
 *
 * \param b         Builder for the graph under construction.
 * \param prefilter True if a prefiltering pattern is being built.
 */
GlushkovBuildStateImpl::GlushkovBuildStateImpl(NFABuilder &b,
                                               bool prefilter)
    : startState(b.getStart()),
      startDotstarState(b.getStartDotStar()),
      acceptState(b.getAccept()),
      acceptEodState(b.getAcceptEOD()),
      acceptNlEodState(POS_UNINITIALIZED),
      acceptNlState(POS_UNINITIALIZED),
      builder(b),
      doPrefilter(prefilter) {
    // The special nodes need special relationships: start -> startDs, plus
    // the self-loop on startDs.
    const vector<PositionInfo> sources{PositionInfo(startState),
                                       PositionInfo(startDotstarState)};
    const vector<PositionInfo> targets{PositionInfo(startDotstarState)};
    connectRegions(sources, targets);

    // The accept -> acceptEod edges are already wired.

    // XXX: small hack in support of vacuous NFAs: start and startDs are both
    // given an initial report ID.
    builder.setNodeReportID(startState, 0);
    builder.setNodeReportID(startDotstarState, 0);
}
static
void checkEmbeddedEndAnchor(const PositionInfo &from,
const vector<PositionInfo> &firsts) {
if (!(from.flags & POS_FLAG_ONLY_ENDS)) {
return;
}
for (const auto &first : firsts) {
if (first.pos != GlushkovBuildStateImpl::POS_EPSILON) {
/* can make it through the parse tree */
throw ParseError("Embedded end anchors not supported.");
}
}
}
// Wire up the lasts of one component to the firsts of another
void
GlushkovBuildStateImpl::connectRegions(const vector<PositionInfo> &lasts,
const vector<PositionInfo> &firsts) {
for (const auto &last : lasts) {
checkEmbeddedEndAnchor(last, firsts);
connectSuccessors(last, firsts);
}
}
static
void filterEdges(const GlushkovBuildStateImpl &bs, const PositionInfo &from,
vector<PositionInfo> &tolist) {
if (from.pos == bs.startDotstarState) {
// If we're connecting from start-dotstar, remove all caret flavoured
// positions.
CheckPositionFlags check(POS_FLAG_NOFLOAT);
tolist.erase(remove_if(tolist.begin(), tolist.end(), check),
tolist.end());
if (from.flags & POS_FLAG_NOFLOAT) {
tolist.clear();
}
} else if (from.pos == bs.startState) {
// If we're connecting from start, we should remove any epsilons that
// aren't caret flavoured.
CheckUnflaggedEpsilon check;
tolist.erase(remove_if(tolist.begin(), tolist.end(), check),
tolist.end());
CheckPositionFlags check2(POS_FLAG_MUST_FLOAT | POS_FLAG_NOFLOAT);
tolist.erase(remove_if(tolist.begin(), tolist.end(), check2),
tolist.end());
}
if (bs.builder.getAssertFlag(from.pos) & POS_FLAG_MULTILINE_START) {
// If we have a (mildly boneheaded) pattern like /^$/m, we're right up
// against the edge of what we can do without true assertion support.
// Here we have an evil hack to prevent us plugging the \n generated by
// the caret right into acceptEod (which is in the firsts of the
// dollar).
/* This is due to the 'interesting quirk' that multiline ^ does not
* not match a newline at the end of buffer. */
DEBUG_PRINTF("multiline start - no eod\n");
tolist.erase(remove(tolist.begin(), tolist.end(), bs.acceptEodState),
tolist.end());
}
}
/**
 * \brief Create a fresh position that matches a single newline character,
 * carries POS_FLAG_FIDDLE_ACCEPT and has its report ID set to -1.
 *
 * \return The newly created position.
 */
static
Position makeNewlineAssertPos(GlushkovBuildState &bs) {
    NFABuilder &nfa = bs.getBuilder();

    const Position nl = nfa.makePositions(1);
    nfa.addCharReach(nl, CharReach('\n'));
    nfa.setAssertFlag(nl, POS_FLAG_FIDDLE_ACCEPT);
    nfa.setNodeReportID(nl, -1);

    return nl;
}
/**
 * \brief Append to \a tolist the accept-side targets that the flags on
 * \a from ask for.
 *
 * Targets are appended in this order: acceptEod, the newline-then-accept
 * position, the newline-then-acceptEod position, and finally accept. The two
 * newline positions are created on first use and cached in \a bs.
 */
static
void generateAccepts(GlushkovBuildStateImpl &bs, const PositionInfo &from,
                     vector<PositionInfo> *tolist) {
    NFABuilder &builder = bs.getBuilder();
    const u32 fl = from.flags;

    if (fl & POS_FLAG_WIRE_EOD) {
        tolist->push_back(bs.acceptEodState);
    }

    const bool wantNlAccept =
        (fl & POS_FLAG_WIRE_NL_ACCEPT) && !(fl & POS_FLAG_NO_NL_ACCEPT);
    if (wantNlAccept) {
        if (bs.acceptNlState == GlushkovBuildState::POS_UNINITIALIZED) {
            const Position nl = makeNewlineAssertPos(bs);
            bs.addSuccessor(nl, builder.getAccept());
            bs.acceptNlState = nl;
        }
        tolist->push_back(bs.acceptNlState);
    }

    const bool wantNlEod =
        (fl & POS_FLAG_WIRE_NL_EOD) && !(fl & POS_FLAG_NO_NL_EOD);
    if (wantNlEod) {
        if (bs.acceptNlEodState == GlushkovBuildState::POS_UNINITIALIZED) {
            const Position nl = makeNewlineAssertPos(bs);
            bs.addSuccessor(nl, builder.getAcceptEOD());
            bs.acceptNlEodState = nl;
        }
        tolist->push_back(bs.acceptNlEodState);
    }

    // Plain accept is wired unless the position may only end the pattern.
    if (!(fl & POS_FLAG_ONLY_ENDS)) {
        tolist->push_back(bs.acceptState);
    }
}
void GlushkovBuildStateImpl::connectAccepts(const vector<PositionInfo> &lasts) {
for (const auto &last : lasts) {
vector<PositionInfo> accepts;
generateAccepts(*this, last, &accepts);
connectSuccessors(last, accepts);
}
}
#if defined(DEBUG) || defined(DUMP_SUPPORT)

/** \brief Render the flags of a position as a run of "<tag>" markers, for
 * debug and dump output. Flags not listed in the table are not printed. */
static UNUSED
string dumpCaptures(const PositionInfo &p) {
    // Tags are emitted in table order.
    static const struct {
        u32 flag;
        const char *tag;
    } tags[] = {
        { POS_FLAG_NOFLOAT, "<nofloat>" },
        { POS_FLAG_MUST_FLOAT, "<must_float>" },
        { POS_FLAG_FIDDLE_ACCEPT, "<fiddle_accept>" },
        { POS_FLAG_ONLY_ENDS, "<only_ends>" },
        { POS_FLAG_NO_NL_EOD, "<no_nl_eod>" },
        { POS_FLAG_NO_NL_ACCEPT, "<no_nl_acc>" },
    };

    string out;
    for (const auto &t : tags) {
        if (p.flags & t.flag) {
            out += t.tag;
        }
    }
    return out;
}

#endif // DEBUG || DUMP_SUPPORT
void GlushkovBuildStateImpl::connectSuccessors(const PositionInfo &from,
vector<PositionInfo> tolist) {
/* note: tolist maybe modified for our own internal use -> not a reference */
assert(from.pos != POS_EPSILON);
assert(from.pos != POS_UNINITIALIZED);
assert(find(tolist.begin(), tolist.end(), POS_UNINITIALIZED)
== tolist.end());
DEBUG_PRINTF("FROM = %u%s TO = %s\n", from.pos, dumpCaptures(from).c_str(),
dumpPositions(tolist.begin(), tolist.end()).c_str());
/* prevent creation of edges with invalid assertions */
filterEdges(*this, from, tolist);
if (from.flags & POS_FLAG_FIDDLE_ACCEPT) {
auto accept = find(tolist.begin(), tolist.end(), acceptState);
if (accept != tolist.end()) {
DEBUG_PRINTF("accept through -1 offset-adjusting dot\n");
Position fakedot = builder.makePositions(1);
builder.addCharReach(fakedot, CharReach(0x00, 0xff));
builder.setNodeReportID(fakedot, -1);
addSuccessor(fakedot, acceptState);
*accept = fakedot;
} else {
// We might lead to accept via an assertion vertex, so we add the
// offset adj to this vertex itself. Used for cases like /^\B/m,
// which should match only at 0 for '\n'.
builder.setNodeReportID(from.pos, -1);
}
assert(find(tolist.begin(), tolist.end(), acceptState) == tolist.end());
}
auto &succ = successors[from.pos];
DEBUG_PRINTF("connect %u -> %s\n", from.pos,
dumpPositions(tolist.begin(), tolist.end()).c_str());
DEBUG_PRINTF("%u curr succ: %s\n", from.pos,
dumpPositions(begin(succ), end(succ)).c_str());
for (const auto &to : tolist) {
if (to.pos != POS_EPSILON) {
succ.insert(to);
}
}
DEBUG_PRINTF("%u succ: %s\n", from.pos,
dumpPositions(begin(succ), end(succ)).c_str());
}
/** \brief Record a single edge \p from -> \p to in the successor map.
 *
 * Neither endpoint may be POS_EPSILON or POS_UNINITIALIZED. */
void GlushkovBuildStateImpl::addSuccessor(Position from, Position to) {
    DEBUG_PRINTF("connect %u -> %u\n", from, to);

    assert(from != POS_EPSILON);
    assert(from != POS_UNINITIALIZED);
    assert(to != POS_EPSILON);
    assert(to != POS_UNINITIALIZED);

    auto &targets = successors[from];
    targets.insert(to);

    DEBUG_PRINTF("%u succ: %s\n", from,
                 dumpPositions(begin(targets), end(targets)).c_str());
}
/** \brief Duplicate the region [first, last] at positions shifted by
 * \p offset.
 *
 * Vertex properties are cloned by the builder; successor edges are cloned
 * here. Every successor of a position in the region is expected to lie inside
 * the region as well (asserted); such stray edges are not copied. */
void GlushkovBuildStateImpl::cloneFollowSet(Position first, Position last,
                                            unsigned offset) {
    assert(first <= last);

    // Clone vertex properties (reachability, etc)
    builder.cloneRegion(first, last, offset);

    for (Position src = first; src <= last; ++src) {
        const Position dst = src + offset;

        // The destination should be a brand new position.
        assert(successors[dst].empty());

        for (const PositionInfo &to : successors[src]) {
            const bool inside = to.pos >= first && to.pos <= last;
            if (!inside) {
                // There shouldn't be any stray edges leading out of this
                // region!
                assert(0);
                continue;
            }

            PositionInfo shifted(to);
            shifted.pos += offset;
            DEBUG_PRINTF("clone: %u -> %u\n", dst, shifted.pos);
            successors[dst].insert(shifted);
        }
    }
}
/** \brief Add the edge \p from -> \p to to the builder unless it already
 * exists.
 *
 * \throws ParseError if \p to is the start state (an embedded start anchor).
 */
void GlushkovBuildStateImpl::buildEdge(Position from, const PositionInfo &to) {
    // An edge leading back into start means an anchor was embedded in the
    // pattern; such patterns can make it through the parse tree.
    if (to == startState) {
        throw ParseError("Embedded start anchors not supported.");
    }

    assert(to.pos != POS_UNINITIALIZED);
    assert(to.pos != POS_EPSILON);

    const Position target = to.pos;
    if (!builder.hasEdge(from, target)) {
        builder.addEdge(from, target);
    }
}
void GlushkovBuildStateImpl::buildEdges() {
// Create all the edges and track which vertices are asserts which need to
// be removed later.
for (const auto &m : successors) {
const Position from = m.first;
for (const auto &to : m.second) {
buildEdge(from, to);
}
}
}
/** \brief Factory: build the concrete GlushkovBuildState implementation and
 * hand it back behind the abstract interface. */
unique_ptr<GlushkovBuildState> makeGlushkovBuildState(NFABuilder &b,
                                                      bool prefilter) {
    unique_ptr<GlushkovBuildState> state =
        ue2::make_unique<GlushkovBuildStateImpl>(b, prefilter);
    return state;
}
// free functions for utility use
/** \brief Remove duplicate PositionInfo entries from a list, in place.
 *
 * Two entries are duplicates when both their position and their flags match.
 * The first occurrence in list order is kept and later ones are dropped; the
 * relative order of the survivors is unchanged. */
void cleanupPositions(vector<PositionInfo> &a) {
    ue2::unordered_set<pair<Position, int>> seen; // (pos, flags) already kept

    vector<PositionInfo> unique;
    unique.reserve(a.size()); // usually few dupes, so sizes are similar

    for (auto it = a.begin(); it != a.end(); ++it) {
        const bool is_new = seen.emplace(it->pos, it->flags).second;
        if (!is_new) {
            continue;
        }
        unique.push_back(*it);
    }

    DEBUG_PRINTF("in %zu; out %zu\n", a.size(), unique.size());
    unique.swap(a);
}
/** \brief Replace the element at \p victim with the whole of
 * \p replacement.
 *
 * \return an iterator into \p dest just past the last inserted element.
 * Iterators obtained earlier, including \p victim, must be treated as
 * invalid afterwards. */
static
vector<PositionInfo>::iterator
replaceElemWithSequence(vector<PositionInfo> &dest,
                        vector<PositionInfo>::iterator &victim,
                        const vector<PositionInfo> &replacement) {
    // Work with indices: erase/insert may invalidate iterators.
    const size_t slot = distance(dest.begin(), victim);
    const size_t resume = slot + replacement.size();

    dest.erase(victim);
    dest.insert(dest.begin() + slot, replacement.begin(), replacement.end());

    return dest.begin() + resume;
}
/** \brief Substitute every epsilon in \p target with the positions of
 * \p source.
 *
 * Each substituted copy of \p source has the replaced epsilon's flags OR'd
 * into it, and the result is de-duplicated with cleanupPositions(). If
 * \p target contains no epsilon, \p source is appended to the end and no
 * de-duplication is done. Note: the two firsts lists must come from disjoint
 * sets of components.
 */
void replaceEpsilons(vector<PositionInfo> &target,
                     const vector<PositionInfo> &source) {
    auto eps = find(target.begin(), target.end(),
                    GlushkovBuildState::POS_EPSILON);

    if (eps == target.end()) {
        // nothing to substitute: source just follows target
        target.insert(target.end(), source.begin(), source.end());
        return;
    }

    // At least one epsilon exists, so the body runs at least once.
    do {
        checkEmbeddedEndAnchor(*eps, source);

        // Private copy of source carrying this epsilon's flags as well.
        vector<PositionInfo> patched(source);
        for (auto it = patched.begin(); it != patched.end(); ++it) {
            it->flags |= eps->flags;
        }

        // Continue searching after the freshly inserted run.
        auto resume = replaceElemWithSequence(target, eps, patched);
        eps = find(resume, target.end(), GlushkovBuildState::POS_EPSILON);
    } while (eps != target.end());

    cleanupPositions(target);
}
#ifdef DUMP_SUPPORT

/** \brief Write a position to \p os: "epsilon" or its numeric value, followed
 * by its flag markers. */
void dump(ostream &os, const PositionInfo &p) {
    const bool is_epsilon = (p.pos == GlushkovBuildState::POS_EPSILON);
    if (!is_epsilon) {
        os << p.pos;
    } else {
        os << "epsilon";
    }

    os << dumpCaptures(p);
}

#endif // DUMP_SUPPORT
} // namespace ue2

103
src/parser/buildstate.h Normal file
View File

@@ -0,0 +1,103 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Glushkov construction.
*/
#ifndef BUILDSTATE_H
#define BUILDSTATE_H
#include "ue2common.h"
#include "position.h"
#include <memory>
#include <vector>
#include <boost/core/noncopyable.hpp>
namespace ue2 {
class NFABuilder;
class PositionInfo;
/** \brief Machinery for Glushkov construction.
 *
 * Abstract base class; use \ref makeGlushkovBuildState to get one of these you
 * can use. Instances are non-copyable. */
class GlushkovBuildState : boost::noncopyable {
public:
/** \brief Sentinel position value representing an uninitialized state. */
static const Position POS_UNINITIALIZED;
/** \brief Sentinel position value representing an epsilon transition in the
 * firsts of a component. */
static const Position POS_EPSILON;
virtual ~GlushkovBuildState();
/** \brief Returns a reference to the NFABuilder being used. */
virtual NFABuilder &getBuilder() = 0;
/** \brief Returns a const reference to the NFABuilder being used. */
virtual const NFABuilder &getBuilder() const = 0;
/** \brief Wire up edges from the lasts of one component to the firsts of
 * another.
 *
 * \param lasts Positions from which the new edges originate.
 * \param firsts Positions the new edges lead to. */
virtual void connectRegions(const std::vector<PositionInfo> &lasts,
const std::vector<PositionInfo> &firsts) = 0;
/** \brief Wire the lasts of the main sequence to accepts.
 *
 * \param lasts Last positions of the main sequence. */
virtual void connectAccepts(const std::vector<PositionInfo> &lasts) = 0;
/** \brief Wire up a pair of positions.
 *
 * \param from Source position.
 * \param to Target position. */
virtual void addSuccessor(Position from, Position to) = 0;
/** \brief Clone the vertex properties and edges of all vertices between
 * two positions.
 *
 * \param from First position of the region to clone.
 * \param to Last position of the region to clone.
 * \param offset Amount added to each position to obtain its clone. */
virtual void cloneFollowSet(Position from, Position to, u32 offset) = 0;
/** \brief Build the prioritised list of edges out of our successor map. */
virtual void buildEdges() = 0;
};
/** \brief Returns a new GlushkovBuildState object.
 *
 * \param b NFABuilder the build state operates on.
 * \param prefilter Passed through to the implementation's constructor. */
std::unique_ptr<GlushkovBuildState> makeGlushkovBuildState(NFABuilder &b,
bool prefilter);
/** \brief Replace all epsilons with the given positions.
 *
 * Each epsilon in \p target is replaced by a copy of \p source carrying that
 * epsilon's flags in addition to its own. If \p target has no epsilons,
 * \p source is appended to its end instead. */
void replaceEpsilons(std::vector<PositionInfo> &target,
const std::vector<PositionInfo> &source);
/** \brief Eliminate lower-priority duplicate PositionInfo entries.
 *
 * Scans through a list of positions and, for each (position, flags) entry,
 * retains only the first one encountered. The list is modified in place. */
void cleanupPositions(std::vector<PositionInfo> &a);
} // namespace ue2
#endif

120
src/parser/check_refs.cpp Normal file
View File

@@ -0,0 +1,120 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Component tree analysis that checks that references (such as
* back-refs, conditionals) have valid referents.
*/
#include "check_refs.h"
#include "ComponentBackReference.h"
#include "ComponentCondReference.h"
#include "ConstComponentVisitor.h"
#include "parse_error.h"
#include "util/container.h"
#include "util/ue2_containers.h"
#include <sstream>
using namespace std;
namespace ue2 {
/**
* \brief Visitor that checks the validity of references against a known list
* of indices and labels.
*/
class ReferenceVisitor: public DefaultConstComponentVisitor {
private:
const size_t num_ids;
const flat_set<string> &names;
public:
ReferenceVisitor(size_t num_groups, const flat_set<string> &targets)
: num_ids(num_groups), names(targets) {}
~ReferenceVisitor();
void invalid_index(const char *component, unsigned id) {
assert(component);
ostringstream str;
str << "Invalid " << component << " to expression " << id << ".";
throw ParseError(str.str());
}
void invalid_label(const char *component, const std::string &label) {
assert(component);
ostringstream str;
str << "Invalid " << component << " to label '" << label << "'.";
throw ParseError(str.str());
}
void pre(const ComponentBackReference &c) override {
if (c.ref_id) {
if (c.ref_id >= num_ids) {
invalid_index("back reference", c.ref_id);
}
} else {
if (!contains(names, c.name)) {
invalid_label("back reference", c.name);
}
}
}
void pre(const ComponentCondReference &c) override {
switch (c.kind) {
case ComponentCondReference::CONDITION_NUMBER:
if (c.ref_id >= num_ids) {
invalid_index("conditional reference", c.ref_id);
}
break;
case ComponentCondReference::CONDITION_NAME:
if (c.ref_name == "DEFINE") {
// The string "DEFINE" is a special "always false" condition
// used to define subroutines.
break;
}
if (!contains(names, c.ref_name)) {
invalid_label("conditional reference", c.ref_name);
}
break;
case ComponentCondReference::CONDITION_ASSERTION:
break;
}
}
};
// The destructor is defined here, outside the class body, to silence weak
// vtable warnings.
ReferenceVisitor::~ReferenceVisitor() = default;
/** \brief Walk the component tree under \p root and throw a ParseError on
 * the first back reference or conditional reference that has no valid
 * referent among \p groupIndices groups and \p groupNames labels. */
void checkReferences(const Component &root, unsigned int groupIndices,
                     const ue2::flat_set<std::string> &groupNames) {
    ReferenceVisitor checker(groupIndices, groupNames);
    root.accept(checker);
}
} // namespace ue2

50
src/parser/check_refs.h Normal file
View File

@@ -0,0 +1,50 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Component tree analysis that checks that references (such as
* back-refs, conditionals) have valid referents.
*/
#ifndef PARSER_CHECK_REFS_H_
#define PARSER_CHECK_REFS_H_
#include "util/ue2_containers.h"
#include <string>
namespace ue2 {
class Component;
class ComponentSequence;
/** \brief Check that every back reference and conditional reference in the
 * component tree has a valid referent.
 *
 * \param root Root of the component tree to check.
 * \param groupIndices Group count; numeric references must be below this.
 * \param groupNames Set of labels that named references may use.
 *
 * Throws a ParseError describing the first invalid reference found. */
void checkReferences(const Component &root, unsigned int groupIndices,
const ue2::flat_set<std::string> &groupNames);
} // namespace ue2
#endif // PARSER_CHECK_REFS_H_

303
src/parser/dump.cpp Normal file
View File

@@ -0,0 +1,303 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "dump.h"
#include "position.h"
#include "ConstComponentVisitor.h"
#include "ComponentBackReference.h"
#include "ComponentClass.h"
#include "ComponentCondReference.h"
#include "ComponentRepeat.h"
#include "ComponentAlternation.h"
#include "ComponentAssertion.h"
#include "ComponentAtomicGroup.h"
#include "ComponentBoundary.h"
#include "ComponentByte.h"
#include "ComponentEmpty.h"
#include "ComponentEUS.h"
#include "ComponentSequence.h"
#include "ComponentWordBoundary.h"
#include "Utf8ComponentClass.h"
#include "AsciiComponentClass.h"
#include "util/charreach.h"
#include "util/dump_charclass.h"
#include <ostream>
#include <string>
#ifndef DUMP_SUPPORT
#error No dump support!
#endif
using std::ostream;
using std::string;
using std::endl;
namespace ue2 {
/** \brief Visitor that writes an indented, human-readable description of a
 * component tree to a stream.
 *
 * Each pre() call prints a description of the component and increases the
 * nesting level; the matching post() call decreases it again. The during()
 * hooks are unused. */
class DumpVisitor : public ConstComponentVisitor {
private:
    void indent() { level++; }
    void outdent() {
        assert(level > 0);
        level--;
    }
    /** \brief Leading whitespace for the current nesting level. */
    std::string filler() const { return string(level * 2, ' '); }

public:
    explicit DumpVisitor(ostream &s) : os(s), level(0) {}
    ~DumpVisitor() override;

    void pre(const AsciiComponentClass &c) override {
        os << filler() << "ASCII CLASS" << endl << filler() << " ";
        describeClass(os, c.cr, 256, CC_OUT_TEXT);
        os << endl;
        indent();
    }

    void post(const AsciiComponentClass &) override { outdent(); }

    void pre(const ComponentAlternation &) override {
        os << filler() << "ALTERNATION" << endl;
        indent();
    }

    void post(const ComponentAlternation &) override {
        outdent();
    }

    void pre(const ComponentAssertion &c) override {
        os << filler() << "ASSERTION (";
        switch (c.m_sense) {
        case ComponentAssertion::POS:
            os << "POSITIVE ";
            break;
        case ComponentAssertion::NEG:
            os << "NEGATIVE ";
            break;
        }
        switch (c.m_dir) {
        case ComponentAssertion::LOOKAHEAD:
            os << "LOOKAHEAD";
            break;
        case ComponentAssertion::LOOKBEHIND:
            os << "LOOKBEHIND";
            break;
        }
        os << ")" << endl;
        indent();
    }

    void post(const ComponentAssertion &) override { outdent(); }

    void pre(const ComponentAtomicGroup &) override {
        os << filler() << "ATOMIC GROUP" << endl;
        indent();
    }

    void post(const ComponentAtomicGroup &) override { outdent(); }

    void pre(const ComponentBackReference &c) override {
        // Named references print their name, numbered ones their index.
        if (!c.name.empty()) {
            os << filler() << "BACKREF " << c.name << std::endl;
        } else {
            os << filler() << "BACKREF " << c.ref_id << std::endl;
        }
        indent();
    }

    void post(const ComponentBackReference &) override { outdent(); }

    void pre(const ComponentBoundary &c) override {
        os << filler() << "BOUNDARY" << endl << filler() << " ";
        switch (c.m_bound) {
        case ComponentBoundary::BEGIN_STRING:
            os << "ComponentBoundary::BEGIN_STRING";
            break;
        case ComponentBoundary::END_STRING:
            os << "ComponentBoundary::END_STRING";
            break;
        case ComponentBoundary::END_STRING_OPTIONAL_LF:
            os << "ComponentBoundary::END_STRING_OPTIONAL_LF";
            break;
        case ComponentBoundary::BEGIN_LINE:
            os << "ComponentBoundary::BEGIN_LINE";
            break;
        case ComponentBoundary::END_LINE:
            os << "ComponentBoundary::END_LINE";
            break;
        }
        os << endl;
        indent();
    }

    void post(const ComponentBoundary &) override { outdent(); }

    void pre(const ComponentByte &) override {
        os << filler() << "BYTE" << endl;
        indent();
    }

    void post(const ComponentByte &) override { outdent(); }

    void pre(const ComponentCondReference &c) override {
        os << filler() << "CONDITIONAL REFERENCE" << endl;
        switch (c.kind) {
        case ComponentCondReference::CONDITION_NUMBER:
            os << filler() << "REFERENCES GROUP WITH NUMBER " << c.ref_id
               << endl;
            break;
        case ComponentCondReference::CONDITION_NAME:
            os << filler() << "REFERENCES GROUP WITH NAME " << c.ref_name
               << endl;
            break;
        case ComponentCondReference::CONDITION_ASSERTION:
            os << filler() << "REFERENCES FOLLOWING ASSERTION" << endl;
            break;
        }
        indent();
    }

    void post(const ComponentCondReference &) override { outdent(); }

    void pre(const ComponentEmpty &) override {
        os << filler() << "EMPTY" << endl;
        indent();
    }

    void post(const ComponentEmpty &) override { outdent(); }

    void pre(const ComponentEUS &) override {
        os << filler() << "EUS" << endl;
        indent();
    }

    void post(const ComponentEUS &) override { outdent(); }

    void pre(const ComponentRepeat &c) override {
        os << filler() << "REPEAT (" << c.m_min << ", ";
        if (c.m_max == ComponentRepeat::NoLimit) {
            os << "NoLimit";
        } else {
            os << c.m_max;
        }
        os << ") ";
        switch (c.type) {
        case ComponentRepeat::REPEAT_NONGREEDY:
            os << "non-greedy";
            break;
        case ComponentRepeat::REPEAT_GREEDY:
            os << "greedy";
            break;
        case ComponentRepeat::REPEAT_POSSESSIVE:
            os << "possessive";
            break;
        }
        os << endl;
        indent();
    }

    void post(const ComponentRepeat &) override { outdent(); }

    void pre(const ComponentSequence &c) override {
        os << filler() << "SEQUENCE ";
        if (c.capture_index == ComponentSequence::NOT_CAPTURED) {
            os << "(not captured) ";
        } else {
            os << "(capture index " << c.capture_index << ") ";
        }
        if (!c.capture_name.empty()) {
            os << "(capture name '" << c.capture_name << "')";
        }
        os << endl;
        indent();
        if (c.children.empty()) {
            os << filler() << " <empty>" << endl;
        }
    }

    void post(const ComponentSequence &) override { outdent(); }

    void pre(const ComponentWordBoundary &c) override {
        os << filler() << (c.negated ? "NON-WORD-BOUNDARY ('\\B')"
                                     : "WORD-BOUNDARY ('\\b')") << endl;
        indent();
    }

    void post(const ComponentWordBoundary &) override { outdent(); }

    void pre(const UTF8ComponentClass &c) override {
        os << filler() << "UTF8 CLASS" << endl << filler() << " ";
        if (c.cps.none()) {
            os << "<none>";
        } else {
            // std::hex is sticky on the stream. Save the caller's format
            // flags and restore them afterwards, otherwise every number
            // printed later (repeat bounds, capture indices, ...) would
            // also come out in hexadecimal.
            const std::ios_base::fmtflags saved = os.flags();
            os << std::hex;
            for (auto it = c.cps.begin(), ite = c.cps.end(); it != ite; ++it) {
                os << *it << " ";
            }
            os.flags(saved);
        }
        os << endl;
        indent();
    }

    void post(const UTF8ComponentClass &) override { outdent(); }

    // not used
    void during(const AsciiComponentClass &) override {}
    void during(const ComponentAlternation &) override {}
    void during(const ComponentAssertion &) override {}
    void during(const ComponentAtomicGroup &) override {}
    void during(const ComponentBackReference &) override {}
    void during(const ComponentBoundary &) override {}
    void during(const ComponentByte &) override {}
    void during(const ComponentCondReference &) override {}
    void during(const ComponentEmpty &) override {}
    void during(const ComponentEUS &) override {}
    void during(const ComponentRepeat &) override {}
    void during(const ComponentSequence &) override {}
    void during(const ComponentWordBoundary &) override {}
    void during(const UTF8ComponentClass &) override {}

private:
    ostream &os;    //!< destination stream
    unsigned level; //!< current nesting depth
};
// Out-of-line definition of the destructor declared in the class body.
DumpVisitor::~DumpVisitor() = default;
/** \brief Write a text representation of the component tree at \p root to
 * \p os. \p root must not be null. */
void dumpTree(ostream &os, const Component *const root) {
    assert(root);

    DumpVisitor printer(os);
    root->accept(printer);
}
} // namespace ue2

48
src/parser/dump.h Normal file
View File

@@ -0,0 +1,48 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef PARSER_DUMP_H_
#define PARSER_DUMP_H_
#ifdef DUMP_SUPPORT
#include <ostream>
namespace ue2 {
class Component;
/** \brief Dump a text representation of the given component tree. Only
 * available in DUMP_SUPPORT builds.
 *
 * \param os Stream the representation is written to.
 * \param root Root of the tree to dump; must not be null. */
void dumpTree(std::ostream &os, const Component *const root);
} // namespace ue2
#endif // DUMP_SUPPORT
#endif // PARSER_DUMP_H_

View File

@@ -0,0 +1,52 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Parse/Compile exceptions.
*/
#include "parse_error.h"
#include <sstream>
namespace ue2 {
// These destructors are defined here rather than in the header to avoid weak
// vtables.
ParseError::~ParseError() = default;

LocatedParseError::~LocatedParseError() = default;
/** \brief Append " at index <offset>." to the stored reason. */
void LocatedParseError::locate(size_t offset) {
    std::ostringstream located;
    located << reason;
    located << " at index " << offset << ".";
    reason = located.str();
}
}

65
src/parser/parse_error.h Normal file
View File

@@ -0,0 +1,65 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Parse/Compile exceptions.
*/
#ifndef PARSE_ERROR_H_A02047D1AA16C9
#define PARSE_ERROR_H_A02047D1AA16C9
#include "util/compile_error.h"
#include <string>
namespace ue2 {
/** \brief Error thrown internally by the Parser interface.
 *
 * A thin CompileError subclass: the message is handed straight to the base
 * class constructor. */
class ParseError : public CompileError {
public:
// Note: 'why' should describe why the error occurred and end with a
// full stop, but no line break.
explicit ParseError(const std::string &why) : CompileError(why) {}
// Declared here and defined out of line in parse_error.cpp.
~ParseError() override;
};
/** \brief ParseError whose message can later be tagged, via locate(), with
 * the index at which it occurred. */
class LocatedParseError : public ParseError {
public:
// The base class is constructed with a placeholder "." and the stored
// reason is then overwritten with 'why'.
// NOTE(review): presumably this sidesteps a base-constructor expectation
// that the message ends with a full stop - confirm against CompileError.
explicit LocatedParseError(const std::string &why) : ParseError(".") {
reason = why; // don't use ParseError ctor
}
// Declared here and defined out of line in parse_error.cpp.
~LocatedParseError() override;
/** \brief Append " at index <offset>." to the stored reason. */
void locate(size_t offset);
};
} // namespace ue2
#endif /* PARSE_ERROR_H_A02047D1AA16C9 */

View File

@@ -0,0 +1,48 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Utilities (currently just ParseMode constructor)
*/
#include "hs.h"
#include "Parser.h"
#include "ue2common.h"
namespace ue2 {
/** \brief Builds a ParseMode from the HS_FLAG_* bits in \a hs_flags.
 *
 * Each field other than ignore_space is initialised from the HS_FLAG_* bit of
 * the same name. ignore_space is not derived from \a hs_flags and always
 * starts out false. */
ParseMode::ParseMode(u32 hs_flags) :
caseless(hs_flags & HS_FLAG_CASELESS),
dotall(hs_flags & HS_FLAG_DOTALL),
ignore_space(false),
multiline(hs_flags & HS_FLAG_MULTILINE),
ucp(hs_flags & HS_FLAG_UCP),
utf8(hs_flags & HS_FLAG_UTF8) {}
} // namespace ue2

107
src/parser/position.h Normal file
View File

@@ -0,0 +1,107 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Per-position flags used during Glushkov construction, PositionInfo class.
*/
#ifndef PARSER_POSITION_H
#define PARSER_POSITION_H
#include "ue2common.h"
#include <set>
namespace ue2 {
// Per-position flag bits. Each single flag below occupies its own bit, so
// flags can be combined with bitwise OR.
#define POS_FLAG_NOFLOAT (1 << 0) //!< don't wire to start-dotstar
#define POS_FLAG_MUST_FLOAT (1 << 1) //!< don't wire solely to start
#define POS_FLAG_FIDDLE_ACCEPT (1 << 2) //!< add a dot with an offset adjustment when wiring to accept
#define POS_FLAG_ASSERT_WORD_TO_NONWORD (1 << 3) //!< epsilon for word to nonword transition
#define POS_FLAG_ASSERT_NONWORD_TO_WORD (1 << 4) //!< epsilon for nonword to word transition
#define POS_FLAG_ASSERT_WORD_TO_WORD (1 << 5) //!< epsilon for word to word transition
#define POS_FLAG_ASSERT_NONWORD_TO_NONWORD (1 << 6) //!< epsilon for nonword to nonword transition
/** vertex created by cloning startDs, not considered part of the match.
* mirrors POS_FLAG_FIDDLE_ACCEPT */
#define POS_FLAG_VIRTUAL_START (1 << 7)
/** multi-line ^ does not match \\n at end of buffer. As a result, we must never
* wire the \\n from ^ to eod */
#define POS_FLAG_MULTILINE_START (1 << 8)
// NOTE(review): the four *_UCP flags mirror the four word/nonword assertion
// flags above on separate bits; presumably they are the variants used when
// UCP mode is in effect -- confirm against the users of these flags.
#define POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP (1 << 9)
#define POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP (1 << 10)
#define POS_FLAG_ASSERT_WORD_TO_WORD_UCP (1 << 11)
#define POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP (1 << 12)
// Composite masks: each ORs together the two single-transition flags that
// share the named source side (X_TO_ANY) or destination side (ANY_TO_X).
#define POS_FLAG_ASSERT_NONWORD_TO_ANY (POS_FLAG_ASSERT_NONWORD_TO_NONWORD \
| POS_FLAG_ASSERT_NONWORD_TO_WORD)
#define POS_FLAG_ASSERT_WORD_TO_ANY (POS_FLAG_ASSERT_WORD_TO_NONWORD \
| POS_FLAG_ASSERT_WORD_TO_WORD)
#define POS_FLAG_ASSERT_ANY_TO_NONWORD (POS_FLAG_ASSERT_NONWORD_TO_NONWORD \
| POS_FLAG_ASSERT_WORD_TO_NONWORD)
#define POS_FLAG_ASSERT_ANY_TO_WORD (POS_FLAG_ASSERT_NONWORD_TO_WORD \
| POS_FLAG_ASSERT_WORD_TO_WORD)
// The same four composite masks, built from the *_UCP flags.
#define POS_FLAG_ASSERT_NONWORD_TO_ANY_UCP \
(POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP \
| POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP)
#define POS_FLAG_ASSERT_WORD_TO_ANY_UCP (POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP \
| POS_FLAG_ASSERT_WORD_TO_WORD_UCP)
#define POS_FLAG_ASSERT_ANY_TO_NONWORD_UCP \
(POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP \
| POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP)
#define POS_FLAG_ASSERT_ANY_TO_WORD_UCP (POS_FLAG_ASSERT_WORD_TO_WORD_UCP \
| POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP)
// Mask covering all four *_UCP assertion flags.
#define UCP_ASSERT_FLAGS (POS_FLAG_ASSERT_WORD_TO_ANY_UCP \
| POS_FLAG_ASSERT_NONWORD_TO_ANY_UCP)
// Mask covering all four non-UCP assertion flags.
#define NON_UCP_ASSERT_FLAGS (POS_FLAG_ASSERT_WORD_TO_ANY \
| POS_FLAG_ASSERT_NONWORD_TO_ANY)
/** do not wire to accept or other pos; may still wire to eod, etc if
* instructed */
#define POS_FLAG_ONLY_ENDS (1 << 23)
#define POS_FLAG_WIRE_EOD (1 << 24) /**< wire to accept eod */
#define POS_FLAG_WIRE_NL_EOD (1 << 25) /**< wire to nl before accept eod */
#define POS_FLAG_WIRE_NL_ACCEPT (1 << 26) /**< wire to nl before accept */
#define POS_FLAG_NO_NL_EOD (1 << 27) /**< disallow nl before accept eod */
#define POS_FLAG_NO_NL_ACCEPT (1 << 28) /**< disallow nl before accept */
/** \brief State number within the NFA as it is being constructed. For use
 * during parsing and Glushkov construction only. */
using Position = u32;
} // namespace ue2
#endif // PARSER_POSITION_H

View File

@@ -0,0 +1,63 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef POSITION_DUMP_H
#define POSITION_DUMP_H
#include <sstream>
namespace ue2 {
#ifdef DUMP_SUPPORT
// implemented in buildstate.cpp
// Writes position \a p to the stream \a os. Only declared when DUMP_SUPPORT
// is defined.
void dump(std::ostream &os, const PositionInfo &p);
#endif
#if defined(DUMP_SUPPORT) || defined(DEBUG)
/** \brief Renders the positions in [begin, end) as a single string of the
 * form "[p0 p1 ...]", using dump() for each element and a single space as
 * the separator. */
template<class Iterator>
static UNUSED
std::string dumpPositions(const Iterator &begin, const Iterator &end) {
    std::ostringstream out;
    out << '[';

    bool first = true;
    for (Iterator it = begin; it != end; ++it) {
        // Separator goes before every element except the first one.
        if (!first) {
            out << ' ';
        }
        first = false;
        dump(out, *it);
    }

    out << ']';
    return out.str();
}
#endif
} // namespace ue2
#endif /* POSITION_DUMP_H */

View File

@@ -0,0 +1,57 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef POSITION_INFO_H
#define POSITION_INFO_H
#include "ue2common.h"
#include "position.h"
namespace ue2 {
/** Class representing a component state.
 *
 * Ordering and equality consider the state number only; the flags do not
 * take part in either comparison. */
class PositionInfo {
public:
    Position pos; //!< state number
    int flags;    //!< from POS_FLAG_* above

    /** Creates the state numbered \a p with no flags set. Deliberately not
     * explicit, so a bare state number converts to a PositionInfo. */
    PositionInfo(unsigned int p) : pos(p), flags(0) {}

    bool operator==(const PositionInfo &rhs) const {
        return pos == rhs.pos;
    }

    bool operator<(const PositionInfo &rhs) const {
        return pos < rhs.pos;
    }
};
} // namespace ue2
#endif /* POSITION_INFO_H */

339
src/parser/prefilter.cpp Normal file
View File

@@ -0,0 +1,339 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Prefiltering component tree transformation.
*/
#include "ComponentAssertion.h"
#include "ComponentAtomicGroup.h"
#include "ComponentBackReference.h"
#include "ComponentBoundary.h"
#include "ComponentClass.h"
#include "ComponentCondReference.h"
#include "ComponentRepeat.h"
#include "ComponentSequence.h"
#include "ComponentVisitor.h"
#include "ComponentWordBoundary.h"
#include "ConstComponentVisitor.h"
#include "Parser.h"
#include "prefilter.h"
#include <algorithm>
#include <stack>
using namespace std;
namespace ue2 {
/** \brief Max number of positions a referent can have to be considered safe to
* replace a reference in prefiltering mode.
*
* The limit is inclusive: SafeReferentVisitor::is_safe() accepts a referent
* whose position count is less than or equal to this value. */
static const size_t MAX_REFERENT_POSITIONS = 1;
/** \brief Builds a \ref ComponentClass equivalent to a dot (any
 * byte/codepoint, depending on whether UTF-8).
 *
 * Works on a private copy of \a mode_in with dotall forced on; the caller's
 * mode is left untouched. */
static
unique_ptr<ComponentClass> makeDotClass(const ParseMode &mode_in) {
    ParseMode dotall_mode = mode_in;
    dotall_mode.dotall = true;
    return generateComponent(CLASS_ANY, false, dotall_mode);
}
namespace {
/**
* \brief Visitor used to determine if a given referent component is safe to
* replace its reference in prefiltering mode. Throws
* SafeReferentVisitor::Unsafe to terminate early on unsafe cases. */
class SafeReferentVisitor : public DefaultConstComponentVisitor {
public:
struct Unsafe {};
SafeReferentVisitor() : numPositions(0) {}
bool is_safe() const {
DEBUG_PRINTF("numPositions = %zu\n", numPositions);
return numPositions <= MAX_REFERENT_POSITIONS;
}
void pre(const AsciiComponentClass &) override {
numPositions++;
}
void pre(const UTF8ComponentClass &) override {
// FIXME: we should be able to tell precisely how many positions this
// class will use. Right now, use the worst case.
numPositions += 4;
}
void pre(const ComponentBoundary &) override {
numPositions++;
}
void pre(const ComponentByte &) override {
numPositions++;
}
void pre(const ComponentEUS &) override {
numPositions++;
}
void pre(const ComponentRepeat &) override {
// Record the number of positions used before we visit the contents of
// the repeat.
countStack.push(numPositions);
}
void post(const ComponentRepeat &c) override {
assert(!countStack.empty());
size_t before = countStack.top();
countStack.pop();
assert(before <= numPositions);
std::pair<u32, u32> bounds = c.getBounds();
size_t subPositions = numPositions - before;
size_t copies = bounds.second < ComponentRepeat::NoLimit
? bounds.second
: max(bounds.first, 1U);
numPositions = before + (subPositions * copies);
}
void pre(const ComponentWordBoundary &) override {
// not quite accurate, as these are expanded out in assert
// resolution...
numPositions++;
}
void pre(const ComponentBackReference &) override {
throw Unsafe();
}
void pre(const ComponentCondReference &) override {
throw Unsafe();
}
private:
size_t numPositions;
// For temporary use
std::stack<size_t> countStack;
};
static
bool isSafeReferent(const Component &c) {
try {
SafeReferentVisitor vis;
c.accept(vis);
return vis.is_safe();
}
catch (const SafeReferentVisitor::Unsafe &) {
return false;
}
}
/**
 * \brief Visitor that searches for the \ref ComponentSequence carrying a given
 * capture name or capture index. On a match it throws a const pointer to that
 * sequence, which ends the walk.
 *
 * When constructed from a name, matching is by name only; when constructed
 * from an index, the name is empty and matching is by index.
 */
class FindSequenceVisitor : public DefaultConstComponentVisitor {
public:
    explicit FindSequenceVisitor(unsigned ref_id) : id(ref_id) {}
    explicit FindSequenceVisitor(const std::string &s) : name(s) {}

    void pre(const ComponentSequence &c) override {
        const bool by_name = !name.empty();
        const bool matched = by_name ? c.getCaptureName() == name
                                     : c.getCaptureIndex() == id;
        if (matched) {
            throw &c;
        }
    }

private:
    const std::string name;
    const unsigned id = 0;
};
/** \brief Runs \a vis over the tree rooted at \a root and returns the sequence
 * it throws on a match, or nullptr if the walk finishes without one. */
static
const ComponentSequence *findCapturingGroup(const Component *root,
                                            FindSequenceVisitor &vis) {
    const ComponentSequence *found = nullptr;
    try {
        root->accept(vis);
        DEBUG_PRINTF("group not found\n");
    } catch (const ComponentSequence *seq) {
        found = seq;
    }
    return found;
}
} // namespace
/**
 * \brief Visitor to apply prefilter reductions, swapping components for which
 * we don't have real implementations with implementable ones. Any such
 * replacement should produce a superset of the matches that would be produced
 * by the original.
 *
 * Each visit() returns either the component it was handed (possibly modified
 * in place) or a newly allocated replacement component.
 */
class PrefilterVisitor : public DefaultComponentVisitor {
public:
    PrefilterVisitor(Component *c, const ParseMode &m) : root(c), mode(m) {}
    ~PrefilterVisitor();

    /** \brief Calls the visitor (recursively) on a new replacement component
     * we've just created. Takes care of freeing it if the sequence is itself
     * replaced.
     *
     * \a r is held in a unique_ptr for the duration of the call so that it is
     * also freed if accept() throws. */
    template<class T>
    Component *visit_replacement(T *r) {
        unique_ptr<T> guard(r);
        Component *c = r->accept(*this);
        if (c == r) {
            // Not replaced: the returned pointer is r itself, so give up
            // ownership here rather than deleting it.
            guard.release();
        }
        return c;
    }

    Component *visit(ComponentBackReference *c) override {
        assert(c);

        // If the referent is simple (represents a single position), then we
        // replace the back-reference with a copy of it.
        const ComponentSequence *ref = nullptr;
        const std::string &ref_name = c->getRefName();
        const unsigned ref_id = c->getRefID();

        if (!ref_name.empty()) {
            FindSequenceVisitor vis(ref_name);
            ref = findCapturingGroup(root, vis);
        } else if (ref_id > 0) {
            FindSequenceVisitor vis(ref_id);
            ref = findCapturingGroup(root, vis);
        }

        if (ref && isSafeReferent(*ref)) {
            DEBUG_PRINTF("found safe ref %p\n", ref);
            // Own the clone until it is handed to visit_replacement(), so it
            // cannot leak if relabelling throws.
            unique_ptr<ComponentSequence> seq(ref->clone());
            // Remove labels from cloned sequence.
            seq->setCaptureName("");
            seq->setCaptureIndex(ComponentSequence::NOT_CAPTURED);

            return visit_replacement(seq.release());
        }

        // Replace with ".*".
        auto rep = makeComponentRepeat(makeDotClass(mode), 0,
                                       ComponentRepeat::NoLimit,
                                       ComponentRepeat::REPEAT_GREEDY);
        return rep.release(); // FIXME: owning raw ptr
    }

    Component *visit(UNUSED ComponentAssertion *c) override {
        assert(c);
        // Replace with an empty sequence.
        return new ComponentSequence();
    }

    Component *visit(ComponentRepeat *c) override {
        assert(c);
        // Possessive repeats become greedy.
        if (c->type == ComponentRepeat::REPEAT_POSSESSIVE) {
            c->type = ComponentRepeat::REPEAT_GREEDY;
        }
        return c;
    }

    Component *visit(ComponentAtomicGroup *c) override {
        assert(c);
        // Replace with a plain sequence containing the atomic group's
        // children. The new sequence is owned by a unique_ptr while the
        // children are cloned into it, so it is freed if that throws.
        unique_ptr<ComponentSequence> seq(new ComponentSequence());
        const auto &children = c->getChildren();
        for (const auto &child : children) {
            assert(child);
            seq->addComponent(unique_ptr<Component>(child->clone()));
        }

        return visit_replacement(seq.release());
    }

    Component *visit(UNUSED ComponentEUS *c) override {
        assert(c);
        // Replace with ".+".
        auto rep = makeComponentRepeat(makeDotClass(mode), 1,
                                       ComponentRepeat::NoLimit,
                                       ComponentRepeat::REPEAT_GREEDY);
        return rep.release(); // FIXME: owning raw ptr
    }

    Component *visit(ComponentWordBoundary *c) override {
        assert(c);
        c->setPrefilter(true);
        return c;
    }

    Component *visit(ComponentCondReference *c) override {
        assert(c);
        // Replace with a plain sequence containing the conditional reference's
        // children. The new sequence is owned by a unique_ptr while it is
        // being filled in, so it is freed if that throws.
        unique_ptr<ComponentSequence> seq(new ComponentSequence());
        const auto &children = c->getChildren();

        // Empty children is accepted by PCRE as a "do nothing" case.
        if (children.empty()) {
            return seq.release();
        }

        for (const auto &child : children) {
            assert(child);
            seq->addComponent(unique_ptr<Component>(child->clone()));
        }

        // If the conditional reference had just a YES branch, we want this to
        // be an alternation with an empty sequence (the NO branch).
        if (!c->hasBothBranches) {
            seq->addAlternation();
            seq->finalize();
        }

        return visit_replacement(seq.release());
    }

private:
    Component *root;       //!< tree searched when resolving back-references
    const ParseMode &mode; //!< mode used when building replacement dot classes
};
// Out-of-line definition of the destructor declared in the class body.
PrefilterVisitor::~PrefilterVisitor() = default;
/** \brief Runs the prefilter visitor over the tree owned by \a root. If the
 * visitor hands back a different root component, \a root is reseated to own
 * it. */
void prefilterTree(unique_ptr<Component> &root, const ParseMode &mode) {
    assert(root);

    Component *const original = root.get();
    PrefilterVisitor vis(original, mode);

    Component *const result = original->accept(vis);
    if (result != original) {
        root.reset(result);
    }
}
} // namespace ue2

48
src/parser/prefilter.h Normal file
View File

@@ -0,0 +1,48 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef PARSER_PREFILTER_H
#define PARSER_PREFILTER_H
#include <memory>
namespace ue2 {
class Component;
struct ParseMode;
/**
* \brief Applies prefiltering transformations to the given component.
*
* May reseat the given Component pointer.
*
* \param root Owner of the root of the component tree. Must not be empty
*             (the implementation asserts this).
* \param mode Parse mode handed to the transformation; the implementation
*             uses it when it builds replacement "dot" classes.
*/
void prefilterTree(std::unique_ptr<Component> &root, const ParseMode &mode);
} // namespace ue2
#endif // PARSER_PREFILTER_H

View File

@@ -0,0 +1,201 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Shortcut literal pass: directly add literal components to Rose.
*/
#include "AsciiComponentClass.h"
#include "Utf8ComponentClass.h"
#include "ComponentAssertion.h"
#include "ComponentAtomicGroup.h"
#include "ComponentBackReference.h"
#include "ComponentBoundary.h"
#include "ComponentClass.h"
#include "ComponentCondReference.h"
#include "ComponentRepeat.h"
#include "ComponentSequence.h"
#include "ComponentVisitor.h"
#include "ComponentWordBoundary.h"
#include "ConstComponentVisitor.h"
#include "parse_error.h"
#include "shortcut_literal.h"
#include "grey.h"
#include "nfagraph/ng.h"
#include "compiler/compiler.h"
#include "util/ue2string.h"
#include "ue2common.h"
#include <stack>
using namespace std;
namespace ue2 {
/**
 * \brief Const visitor that walks a component tree and accumulates the
 * equivalent ue2_literal in \ref lit.
 *
 * Any component that cannot form part of a literal makes the visitor throw
 * ConstructLiteralVisitor::NotLiteral.
 */
class ConstructLiteralVisitor : public ConstComponentVisitor {
public:
    ~ConstructLiteralVisitor();

    /** \brief Thrown if this component does not represent a literal. */
    struct NotLiteral {};

    void pre(const AsciiComponentClass &c) override {
        const CharReach &reach = c.cr;
        const size_t width = reach.count();

        // Acceptable classes: exactly one character, or the two cases of a
        // single letter (which is appended as a caseless character).
        const bool caseless_pair = width == 2 && reach.isCaselessChar();
        if (width != 1 && !caseless_pair) {
            throw NotLiteral();
        }
        lit.push_back(reach.find_first(), caseless_pair);
    }

    void pre(const ComponentRepeat &c) override {
        // Only an exact, non-zero repeat count can be unrolled into a literal.
        const bool exact_count = c.m_min != 0 && c.m_min == c.m_max;
        if (!exact_count) {
            throw NotLiteral();
        }

        const bool bounded = c.m_max < ComponentRepeat::NoLimit;
        if (bounded && c.m_max > 32767) {
            throw ParseError("Bounded repeat is too large.");
        }

        // Store the current length of the literal; in this repeat's post()
        // call we will append N-1 more copies of [index..end].
        repeat_stack.push(lit.length());
    }

    void post(const ComponentRepeat &c) override {
        // Everything appended since the matching pre() call is one copy of
        // the repeated text; add the remaining N-1 copies.
        assert(!repeat_stack.empty());
        const ue2_literal suffix = lit.substr(repeat_stack.top());
        repeat_stack.pop();

        for (unsigned remaining = c.m_min; remaining > 1; --remaining) {
            lit += suffix;
        }
    }

    void pre(const ComponentSequence &) override {
        // Pass through.
    }

    // Every other component type rules the expression out as a literal.
    void pre(const ComponentAlternation &) override { throw NotLiteral(); }
    void pre(const ComponentAssertion &) override { throw NotLiteral(); }
    void pre(const ComponentAtomicGroup &) override { throw NotLiteral(); }
    void pre(const ComponentBackReference &) override { throw NotLiteral(); }
    void pre(const ComponentBoundary &) override { throw NotLiteral(); }
    void pre(const ComponentByte &) override { throw NotLiteral(); }
    void pre(const ComponentCondReference &) override { throw NotLiteral(); }
    void pre(const ComponentEmpty &) override { throw NotLiteral(); }
    void pre(const ComponentEUS &) override { throw NotLiteral(); }
    void pre(const ComponentWordBoundary &) override { throw NotLiteral(); }
    void pre(const UTF8ComponentClass &) override { throw NotLiteral(); }

    // No work is needed in the during() phase for any component type.
    void during(const AsciiComponentClass &) override {}
    void during(const ComponentAlternation &) override {}
    void during(const ComponentAssertion &) override {}
    void during(const ComponentAtomicGroup &) override {}
    void during(const ComponentBackReference &) override {}
    void during(const ComponentBoundary &) override {}
    void during(const ComponentByte &) override {}
    void during(const ComponentCondReference &) override {}
    void during(const ComponentEmpty &) override {}
    void during(const ComponentEUS &) override {}
    void during(const ComponentRepeat &) override {}
    void during(const ComponentSequence &) override {}
    void during(const ComponentWordBoundary &) override {}
    void during(const UTF8ComponentClass &) override {}

    // Apart from ComponentRepeat (handled above), post() has nothing to do.
    void post(const AsciiComponentClass &) override {}
    void post(const ComponentAlternation &) override {}
    void post(const ComponentAssertion &) override {}
    void post(const ComponentAtomicGroup &) override {}
    void post(const ComponentBackReference &) override {}
    void post(const ComponentBoundary &) override {}
    void post(const ComponentByte &) override {}
    void post(const ComponentCondReference &) override {}
    void post(const ComponentEmpty &) override {}
    void post(const ComponentEUS &) override {}
    void post(const ComponentSequence &) override {}
    void post(const ComponentWordBoundary &) override {}
    void post(const UTF8ComponentClass &) override {}

    ue2_literal lit; //!< literal built up so far.
    stack<size_t> repeat_stack; //!< index of entry to repeat.
};
// Out-of-line definition of the destructor declared in the class body.
ConstructLiteralVisitor::~ConstructLiteralVisitor() = default;
/** \brief True if the literal expression \a expr could be added to Rose.
 *
 * Returns false, without adding anything, when the allowRose grey setting is
 * off, when \a expr uses extended parameters, when its component tree is not
 * a literal, when the literal is empty, or when it is a highlander literal of
 * at most one character. Otherwise the result of NG::addLiteral() is
 * returned. */
bool shortcutLiteral(NG &ng, const ParsedExpression &expr) {
    assert(expr.component);

    if (!ng.cc.grey.allowRose) {
        return false;
    }

    // XXX: don't shortcut literals with extended params (yet)
    const bool has_extended_params =
        expr.min_offset || expr.max_offset != MAX_OFFSET || expr.min_length;
    if (has_extended_params) {
        DEBUG_PRINTF("extended params not allowed\n");
        return false;
    }

    // Walk the component tree; the visitor throws if it meets anything that
    // cannot be part of a literal.
    ConstructLiteralVisitor builder;
    try {
        assert(expr.component);
        expr.component->accept(builder);
        assert(builder.repeat_stack.empty());
    } catch (const ConstructLiteralVisitor::NotLiteral&) {
        DEBUG_PRINTF("not a literal\n");
        return false;
    }

    const ue2_literal &literal = builder.lit;

    if (literal.empty()) {
        DEBUG_PRINTF("empty literal\n");
        return false;
    }

    if (expr.highlander && literal.length() <= 1) {
        DEBUG_PRINTF("not shortcutting SEP literal\n");
        return false;
    }

    DEBUG_PRINTF("constructed literal %s\n", dumpString(literal).c_str());
    return ng.addLiteral(literal, expr.index, expr.id, expr.highlander,
                         expr.som);
}
} // namespace ue2

View File

@@ -0,0 +1,46 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Shortcut literal pass: directly add literal components to Rose.
*/
#ifndef SHORTCUT_LITERAL_H
#define SHORTCUT_LITERAL_H
namespace ue2 {
class NG;
class ParsedExpression;
/** \brief True if the literal expression \a expr could be added to Rose.
*
* Returns false if the expression is not handled by this pass (for example
* because it is not a plain literal); otherwise returns the result of
* NG::addLiteral(). */
bool shortcutLiteral(NG &ng, const ParsedExpression &expr);
} // namespace ue2
#endif

134
src/parser/ucp_table.cpp Normal file
View File

@@ -0,0 +1,134 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "Utf8ComponentClass.h"
#include <algorithm>
using namespace std;
namespace ue2 {
/* UCP_FN(cat) defines getUcp<cat>(), which builds a CodePointSet from the
 * table ucp_<cat>_def.
 *
 * The table is consumed two entries at a time and each pair is handed to
 * CodePointSet::setRange() -- presumably as inclusive [low, high] bounds;
 * confirm against setRange(). Because entry i + 1 is read for every even i
 * below the array length, the table must hold an even number of entries. */
#define UCP_FN(cat) \
CodePointSet getUcp##cat(void) { \
CodePointSet rv; \
for (u32 i = 0; i < ARRAY_LENGTH(ucp_##cat##_def); i += 2) { \
rv.setRange(ucp_##cat##_def[i], ucp_##cat##_def[i + 1]); \
} \
return rv; \
}
/** \brief One entry of the caseless mapping table (ucp_caseless_def): a code
 * point paired with one of its case variants. */
struct unicase {
// Code point used as the lookup key.
unichar base;
// Case variant of base: added to the set by make_caseless() and substituted
// for the input by flip_case().
unichar caseless;
};
} // namespace ue2
#define UCP_TABLE_DEFINE_FN
#include "ucp_table.h"
namespace ue2 {
/** \brief Strict ordering for unicase entries: primarily by \a base, with
 * \a caseless as the tie-breaker. Used for binary searches over the caseless
 * table. */
static
bool operator<(const unicase &a, const unicase &b) {
    // Different bases decide the ordering on their own.
    if (a.base != b.base) {
        return a.base < b.base;
    }
    // Same base: fall back to comparing the case variant.
    return a.caseless < b.caseless;
}
/** \brief Extend \a cps so that it also contains the case variants of every
 * code point already in it.
 *
 * Empty sets, and sets whose first interval spans [0, MAX_UNICODE], are left
 * untouched as they are already caseless.
 *
 * \param cps Set to extend in place; must not be null.
 */
void make_caseless(CodePointSet *cps) {
    assert(cps);
    DEBUG_PRINTF("hello\n");
    // Cheap optimisation: if we are empty or a dot, we're already caseless.
    if (cps->begin() == cps->end()) {
        DEBUG_PRINTF("empty\n");
        return;
    }
    if (lower(*cps->begin()) == 0 && upper(*cps->begin()) == MAX_UNICODE) {
        DEBUG_PRINTF("dot\n");
        return;
    }

    // Walk a snapshot of the set so that the insertions into *cps below do
    // not affect the iteration.
    CodePointSet base = *cps;

    // uc_begin is a cursor into the caseless table that only ever moves
    // forward. lower_bound() requires the table to be ordered by operator<,
    // and the forward-only cursor assumes the set's intervals are visited in
    // ascending order.
    const unicase *uc_begin = ucp_caseless_def;
    const unicase *const uc_end = ucp_caseless_def
                                + ARRAY_LENGTH(ucp_caseless_def);
    DEBUG_PRINTF("uc len %zd\n", uc_end - uc_begin);

    for (auto it = base.begin(), ite = base.end(); it != ite; ++it) {
        unichar b = lower(*it);
        unichar e = upper(*it) + 1; // one past the interval's last code point

        for (; b < e; b++) {
            DEBUG_PRINTF("decasing %x\n", b);
            unicase test = {b, 0}; /* NUL is not a caseless version of anything,
                                    * so we are ok */
            uc_begin = lower_bound(uc_begin, uc_end, test);
            if (uc_begin == uc_end) {
                // Nothing at or after b remains in the table, so no later
                // code point can have a mapping either.
                DEBUG_PRINTF("EOL\n");
                return;
            }
            // Add every variant listed for b. The cursor must be checked
            // against uc_end before each dereference: when b's entries are
            // the last ones in the table, ++uc_begin lands on uc_end.
            while (uc_begin != uc_end && uc_begin->base == b) {
                DEBUG_PRINTF("at {%x,%x}\n", uc_begin->base, uc_begin->caseless);
                cps->set(uc_begin->caseless);
                ++uc_begin;
            }
        }
    }
}
/** \brief Flip the case of the codepoint in c, if possible.
 *
 * Note that this assumes a one-to-one case mapping, which (though not
 * realistic) is what PCRE does.
 *
 * \param c In/out code point; must not be null. Overwritten with the first
 *        case variant listed for it in the caseless table, if there is one.
 * \return true if \a *c was replaced, false if the table has no entry for it
 *         (in which case \a *c is left unchanged).
 */
bool flip_case(unichar *c) {
    assert(c);

    const unicase *const uc_begin = ucp_caseless_def;
    const unicase *const uc_end =
        ucp_caseless_def + ARRAY_LENGTH(ucp_caseless_def);

    // A caseless value of 0 sorts before any real entry for the same base,
    // so lower_bound() lands on the first entry for *c when one exists.
    const unicase test = { *c, 0 };
    const unicase *f = lower_bound(uc_begin, uc_end, test);

    // lower_bound() returns uc_end when *c is greater than every base in the
    // table; that position must not be dereferenced.
    if (f != uc_end && f->base == *c) {
        DEBUG_PRINTF("flipped c=%x to %x\n", *c, f->caseless);
        *c = f->caseless;
        return true;
    }
    return false;
}
} // namespace ue2

11043
src/parser/ucp_table.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,87 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Checks component trees for unsupported components.
*/
#include "ConstComponentVisitor.h"
#include "ComponentEUS.h"
#include "ComponentRepeat.h"
#include "ComponentWordBoundary.h"
#include "parse_error.h"
#include "unsupported.h"
#include <sstream>
namespace ue2 {
/** \brief Visitor that rejects unsupported constructs: each pre() hook below
 * throws a ParseError when the component it is handed cannot be supported. */
class UnsupportedVisitor : public DefaultConstComponentVisitor {
public:
    ~UnsupportedVisitor();

    // Constructs that are rejected unconditionally.
    void pre(const ComponentAssertion &) override {
        throw ParseError("Zero-width assertions are not supported.");
    }
    void pre(const ComponentAtomicGroup &) override {
        throw ParseError("Atomic groups are unsupported.");
    }
    void pre(const ComponentBackReference &) override {
        throw ParseError("Back-references are unsupported.");
    }
    void pre(const ComponentCondReference &) override {
        throw ParseError("Conditional references are not supported.");
    }

    // \X is always rejected; the message reports where it was found.
    void pre(const ComponentEUS &c) override {
        std::ostringstream msg;
        msg << "\\X unsupported at index " << c.loc << ".";
        throw ParseError(msg.str());
    }

    // Only possessive repeats are rejected; other repeat types pass through.
    void pre(const ComponentRepeat &c) override {
        if (c.type != ComponentRepeat::REPEAT_POSSESSIVE) {
            return;
        }
        throw ParseError("Possessive quantifiers are not supported.");
    }

    // Word boundaries are rejected only in UCP mode, and only when the
    // component is not flagged for prefiltering.
    void pre(const ComponentWordBoundary &c) override {
        if (!c.ucp || c.prefilter) {
            return;
        }
        const char *const escape = c.negated ? "\\B" : "\\b";
        std::ostringstream msg;
        msg << escape << " unsupported in UCP mode at index " << c.loc << ".";
        throw ParseError(msg.str());
    }
};

// Out-of-line definition matching the in-class destructor declaration.
UnsupportedVisitor::~UnsupportedVisitor() = default;
void checkUnsupported(const Component &root) {
UnsupportedVisitor vis;
root.accept(vis);
}
} // namespace ue2

47
src/parser/unsupported.h Normal file
View File

@@ -0,0 +1,47 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Checks component trees for unsupported components.
*/
#ifndef PARSER_UNSUPPORTED_H_
#define PARSER_UNSUPPORTED_H_
#include "parse_error.h"
namespace ue2 {
// Forward declaration: only a reference to Component is needed here.
class Component;
/** \brief Throws a ParseError if this component tree contains an unsupported
 * Component.
 *
 * \param root Root of the component tree to check; not modified.
 */
void checkUnsupported(const Component &root);
} // namespace ue2
#endif // PARSER_UNSUPPORTED_H_

View File

@@ -0,0 +1,163 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "utf8_validate.h"
#include "ue2common.h"
#include "util/unicode_def.h"
#include <cstring>
namespace ue2 {
static
bool hasValidContBytes(const u8 *s, size_t num) {
/* continuer bytes must all be of the form 10xx xxxx */
for (size_t i = 0; i < num; i++) {
if ((s[i] & 0xc0) != UTF_CONT_BYTE_HEADER) {
return false;
}
}
return true;
}
/** \brief True if \a val may appear as a decoded code point: it is neither a
 * surrogate half nor above the RFC 3629 limit. */
static
bool isAllowedCodepoint(u32 val) {
    // High and low surrogate halves
    const bool is_surrogate = val >= 0xd800 && val <= 0xdfff;
    // As per limit in RFC 3629
    const bool beyond_limit = val > 0x10ffff;
    return !is_surrogate && !beyond_limit;
}
/** \brief Validate that \a expression is well-formed UTF-8.
 *
 * The NUL-terminated string is scanned one encoded code point at a time.
 * Scanning stops at the first sequence that is truncated, has a malformed
 * continuation byte, is an overlong encoding, decodes to a code point
 * rejected by isAllowedCodepoint(), or starts with a byte that is not a
 * valid lead byte.
 *
 * \param expression String to check; a null pointer is treated as valid.
 * \return true if the whole string was consumed without finding a defect.
 */
bool isValidUtf8(const char *expression) {
    if (!expression) {
        return true;
    }

    const size_t len = strlen(expression);
    const u8 *s = reinterpret_cast<const u8 *>(expression);
    u32 val;

    size_t i = 0;
    while (i < len) {
        DEBUG_PRINTF("byte %zu: 0x%02x\n", i, s[i]);
        // One octet: 0x00..0x7f inclusive. The bound must include 0x7f
        // (DEL), which is a valid single-octet character.
        if (s[i] <= 0x7f) {
            DEBUG_PRINTF("one octet\n");
            i++;
            continue;
        }

        // Two octets.
        if ((s[i] & 0xe0) == UTF_TWO_BYTE_HEADER) {
            DEBUG_PRINTF("two octets\n");
            if (i + 2 > len) {
                break; // truncated sequence
            }
            if (!hasValidContBytes(&s[i] + 1, 1)) {
                break;
            }
            val = ((s[i] & 0x1f) << 6) | (s[i + 1] & UTF_CONT_BYTE_VALUE_MASK);
            DEBUG_PRINTF("val=0x%x\n", val);
            // Values below 0x80 fit in one octet.
            if (val < 1U << 7) {
                DEBUG_PRINTF("overlong encoding\n");
                break;
            }
            if (!isAllowedCodepoint(val)) {
                DEBUG_PRINTF("codepoint not allowed\n");
                break;
            }
            i += 2;
            continue;
        }

        // Three octets.
        if ((s[i] & 0xf0) == UTF_THREE_BYTE_HEADER) {
            DEBUG_PRINTF("three octets\n");
            if (i + 3 > len) {
                break; // truncated sequence
            }
            if (!hasValidContBytes(&s[i] + 1, 2)) {
                break;
            }
            val = ((s[i] & 0xf) << 12) |
                  ((s[i + 1] & UTF_CONT_BYTE_VALUE_MASK) << 6) |
                  (s[i + 2] & UTF_CONT_BYTE_VALUE_MASK);
            // Values below 0x800 fit in two octets.
            if (val < 1U << 11) {
                DEBUG_PRINTF("overlong encoding\n");
                break;
            }
            if (!isAllowedCodepoint(val)) {
                DEBUG_PRINTF("codepoint not allowed\n");
                break;
            }
            i += 3;
            continue;
        }

        // Four octets.
        if ((s[i] & 0xf8) == UTF_FOUR_BYTE_HEADER) {
            DEBUG_PRINTF("four octets\n");
            if (i + 4 > len) {
                break; // truncated sequence
            }
            if (!hasValidContBytes(&s[i] + 1, 3)) {
                break;
            }
            val = ((s[i] & 0xf) << 18) |
                  ((s[i + 1] & UTF_CONT_BYTE_VALUE_MASK) << 12) |
                  ((s[i + 2] & UTF_CONT_BYTE_VALUE_MASK) << 6) |
                  (s[i + 3] & UTF_CONT_BYTE_VALUE_MASK);
            // Values below 0x10000 fit in three octets.
            if (val < 1U << 16) {
                DEBUG_PRINTF("overlong encoding\n");
                break;
            }
            if (!isAllowedCodepoint(val)) {
                DEBUG_PRINTF("codepoint not allowed\n");
                break;
            }
            i += 4;
            continue;
        }

        // Something else?
        DEBUG_PRINTF("bad byte 0x%02x\n", s[i]);
        break;
    }

    // Every exit on a defect leaves i short of len.
    DEBUG_PRINTF("i=%zu, len=%zu\n", i, len);
    return i == len;
}
} // namespace ue2

View File

@@ -0,0 +1,39 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef PARSER_UTF8_VALIDATE_H
#define PARSER_UTF8_VALIDATE_H
namespace ue2 {
/** \brief Validate that the given expression is well-formed UTF-8.
 *
 * \param expression NUL-terminated string to check; a null pointer is
 *        accepted and reported as valid.
 * \return true if the entire string is well-formed UTF-8, false otherwise.
 */
bool isValidUtf8(const char *expression);
} // namespace ue2
#endif // PARSER_UTF8_VALIDATE_H