mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
2039 lines
79 KiB
Ragel
2039 lines
79 KiB
Ragel
/*
|
|
* Copyright (c) 2015-2017, Intel Corporation
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of Intel Corporation nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/** \file
|
|
* \brief Parser code (generated with Ragel from Parser.rl).
|
|
*/
|
|
|
|
#include "config.h"
|
|
|
|
/* Parser.cpp is a built source, may not be in same dir as parser files */
|
|
#include "parser/check_refs.h"
|
|
#include "parser/control_verbs.h"
|
|
#include "parser/ComponentAlternation.h"
|
|
#include "parser/ComponentAssertion.h"
|
|
#include "parser/ComponentAtomicGroup.h"
|
|
#include "parser/ComponentBackReference.h"
|
|
#include "parser/ComponentBoundary.h"
|
|
#include "parser/ComponentByte.h"
|
|
#include "parser/ComponentClass.h"
|
|
#include "parser/ComponentCondReference.h"
|
|
#include "parser/ComponentEmpty.h"
|
|
#include "parser/ComponentEUS.h"
|
|
#include "parser/Component.h"
|
|
#include "parser/ComponentRepeat.h"
|
|
#include "parser/ComponentSequence.h"
|
|
#include "parser/ComponentWordBoundary.h"
|
|
#include "parser/parse_error.h"
|
|
#include "parser/Parser.h"
|
|
#include "ue2common.h"
|
|
#include "util/compare.h"
|
|
#include "util/flat_containers.h"
|
|
#include "util/unicode_def.h"
|
|
#include "util/verify_types.h"
|
|
|
|
#include <cassert>
|
|
#include <cctype>
|
|
#include <cstring>
|
|
#include <cstdlib>
|
|
#include <map>
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
using namespace std;
|
|
|
|
namespace ue2 {
|
|
|
|
/** \brief Saves the current sequence, the offset of the current token start
 * relative to the pattern start, and the current mode on the 'sequences'
 * stack. Expects currentSeq, ts, ptr, mode and sequences to be in scope. */
#define PUSH_SEQUENCE do {                                                    \
        const size_t seq_start_offset = static_cast<size_t>(ts - ptr);        \
        sequences.push_back(ExprState(currentSeq, seq_start_offset, mode));   \
    } while(0)
|
|
/** \brief Restores currentSeq and mode from the top entry of the 'sequences'
 * stack and then removes that entry. The stack must not be empty. */
#define POP_SEQUENCE do {                              \
        const ExprState &saved = sequences.back();     \
        currentSeq = saved.seq;                        \
        mode = saved.mode;                             \
        sequences.pop_back();                          \
    } while(0)
|
|
|
|
namespace {
|
|
|
|
/** \brief Structure representing current state as we're parsing (current
|
|
* sequence, current options). Stored in the 'sequences' vector. */
|
|
struct ExprState {
|
|
ExprState(ComponentSequence *seq_in, size_t offset,
|
|
const ParseMode &mode_in) :
|
|
seq(seq_in), seqOffset(offset), mode(mode_in) {}
|
|
|
|
ComponentSequence *seq; //!< current sequence
|
|
size_t seqOffset; //!< offset seq was entered, for error reporting
|
|
ParseMode mode; //!< current mode flags
|
|
};
|
|
|
|
} // namespace
|
|
|
|
/**
 * \brief Re-reads a number that was accumulated as octal digits as if the
 * same digits had been decimal.
 *
 * Each group of three bits in \a oct is one original digit (0-7); the digits
 * are re-weighted by powers of ten, least significant first. For example the
 * digits "12" accumulate to 10 as octal and come back as 12.
 *
 * \param oct value built up from octal digits.
 * \return the decimal number spelled by those digits.
 */
static
unsigned parseAsDecimal(unsigned oct) {
    unsigned ret = 0;
    // Peel off one octal digit per iteration and give it its decimal weight.
    for (unsigned multiplier = 1; oct != 0; oct >>= 3, multiplier *= 10) {
        ret += (oct & 0x7) * multiplier;
    }
    return ret;
}
|
|
|
|
/** \brief Maximum value for a positive integer. We use INT_MAX, as that's what
|
|
* PCRE uses. */
|
|
static constexpr u32 MAX_NUMBER = INT_MAX;
|
|
|
|
static
|
|
void pushDec(u32 *acc, char raw_digit) {
|
|
assert(raw_digit >= '0' && raw_digit <= '9');
|
|
u32 digit_val = raw_digit - '0';
|
|
|
|
// Ensure that we don't overflow.
|
|
u64a val = ((u64a)*acc * 10) + digit_val;
|
|
if (val > MAX_NUMBER) {
|
|
throw LocatedParseError("Number is too big");
|
|
}
|
|
|
|
*acc = verify_u32(val);
|
|
}
|
|
|
|
static
|
|
void pushOct(u32 *acc, char raw_digit) {
|
|
assert(raw_digit >= '0' && raw_digit <= '7');
|
|
u32 digit_val = raw_digit - '0';
|
|
|
|
// Ensure that we don't overflow.
|
|
u64a val = ((u64a)*acc * 8) + digit_val;
|
|
if (val > MAX_NUMBER) {
|
|
throw LocatedParseError("Number is too big");
|
|
}
|
|
|
|
*acc = verify_u32(val);
|
|
}
|
|
|
|
static
|
|
void throwInvalidRepeat(void) {
|
|
throw LocatedParseError("Invalid repeat");
|
|
}
|
|
|
|
static
|
|
void throwInvalidUtf8(void) {
|
|
throw ParseError("Expression is not valid UTF-8.");
|
|
}
|
|
|
|
/**
|
|
* Adds the given child component to the parent sequence, returning a pointer
|
|
* to the new (child) "current sequence".
|
|
*/
|
|
static
|
|
ComponentSequence *enterSequence(ComponentSequence *parent,
|
|
unique_ptr<ComponentSequence> child) {
|
|
assert(parent);
|
|
assert(child);
|
|
|
|
ComponentSequence *seq = child.get();
|
|
parent->addComponent(std::move(child));
|
|
return seq;
|
|
}
|
|
|
|
static
|
|
void addLiteral(ComponentSequence *currentSeq, char c, const ParseMode &mode) {
|
|
if (mode.utf8 && mode.caseless) {
|
|
/* leverage ComponentClass to generate the vertices */
|
|
auto cc = getComponentClass(mode);
|
|
assert(cc);
|
|
cc->add(c);
|
|
cc->finalize();
|
|
currentSeq->addComponent(std::move(cc));
|
|
} else {
|
|
currentSeq->addComponent(getLiteralComponentClass(c, mode.caseless));
|
|
}
|
|
}
|
|
|
|
static
|
|
void addEscaped(ComponentSequence *currentSeq, unichar accum,
|
|
const ParseMode &mode, const char *err_msg) {
|
|
if (mode.utf8) {
|
|
/* leverage ComponentClass to generate the vertices */
|
|
auto cc = getComponentClass(mode);
|
|
assert(cc);
|
|
cc->add(accum);
|
|
cc->finalize();
|
|
currentSeq->addComponent(std::move(cc));
|
|
} else {
|
|
if (accum > 255) {
|
|
throw LocatedParseError(err_msg);
|
|
}
|
|
addLiteral(currentSeq, (char)accum, mode);
|
|
}
|
|
}
|
|
|
|
static
|
|
void addEscapedOctal(ComponentSequence *currentSeq, unichar accum,
|
|
const ParseMode &mode) {
|
|
addEscaped(currentSeq, accum, mode, "Octal value is greater than \\377");
|
|
}
|
|
|
|
static
|
|
void addEscapedHex(ComponentSequence *currentSeq, unichar accum,
|
|
const ParseMode &mode) {
|
|
addEscaped(currentSeq, accum, mode,
|
|
"Hexadecimal value is greater than \\xFF");
|
|
}
|
|
|
|
#define SLASH_C_ERROR "\\c must be followed by an ASCII character"
|
|
|
|
static
|
|
u8 decodeCtrl(char raw) {
|
|
if (raw & 0x80) {
|
|
throw LocatedParseError(SLASH_C_ERROR);
|
|
}
|
|
return mytoupper(raw) ^ 0x40;
|
|
}
|
|
|
|
static
|
|
unichar readUtf8CodePoint2c(const char *s) {
|
|
auto *ts = reinterpret_cast<const u8 *>(s);
|
|
assert(ts[0] >= 0xc0 && ts[0] < 0xe0);
|
|
assert(ts[1] >= 0x80 && ts[1] < 0xc0);
|
|
unichar val = ts[0] & 0x1f;
|
|
val <<= 6;
|
|
val |= ts[1] & 0x3f;
|
|
DEBUG_PRINTF("utf8 %02hhx %02hhx ->\\x{%x}\n", ts[0],
|
|
ts[1], val);
|
|
return val;
|
|
}
|
|
|
|
static
|
|
unichar readUtf8CodePoint3c(const char *s) {
|
|
auto *ts = (const u8 *)s;
|
|
assert(ts[0] >= 0xe0 && ts[0] < 0xf0);
|
|
assert(ts[1] >= 0x80 && ts[1] < 0xc0);
|
|
assert(ts[2] >= 0x80 && ts[2] < 0xc0);
|
|
unichar val = ts[0] & 0x0f;
|
|
val <<= 6;
|
|
val |= ts[1] & 0x3f;
|
|
val <<= 6;
|
|
val |= ts[2] & 0x3f;
|
|
DEBUG_PRINTF("utf8 %02hhx %02hhx %02hhx ->\\x{%x}\n", ts[0],
|
|
ts[1], ts[2], val);
|
|
return val;
|
|
}
|
|
|
|
static
|
|
unichar readUtf8CodePoint4c(const char *s) {
|
|
auto *ts = (const u8 *)s;
|
|
assert(ts[0] >= 0xf0 && ts[0] < 0xf8);
|
|
assert(ts[1] >= 0x80 && ts[1] < 0xc0);
|
|
assert(ts[2] >= 0x80 && ts[2] < 0xc0);
|
|
assert(ts[3] >= 0x80 && ts[3] < 0xc0);
|
|
unichar val = ts[0] & 0x07;
|
|
val <<= 6;
|
|
val |= ts[1] & 0x3f;
|
|
val <<= 6;
|
|
val |= ts[2] & 0x3f;
|
|
val <<= 6;
|
|
val |= ts[3] & 0x3f;
|
|
DEBUG_PRINTF("utf8 %02hhx %02hhx %02hhx %02hhx ->\\x{%x}\n", ts[0],
|
|
ts[1], ts[2], ts[3], val);
|
|
return val;
|
|
}
|
|
|
|
%%{
|
|
machine regex;
|
|
alphtype unsigned char;
|
|
|
|
# Reject an escape that is not supported inside a character class. The
# message quotes the character after the backslash and the token offset.
action throwUnsupportedEscape {
    ostringstream msg;
    msg << "'\\" << ts[1] << "' at index " << ts - ptr
        << " not supported in a character class.";
    throw ParseError(msg.str());
}
|
|
# Reject a character property that is not supported.
action unsupportedProperty {
    throw LocatedParseError("Character property not supported");
}
|
|
# Label handling: start a new label / append the current character to it.
action clearLabel {
    label.clear();
}
action appendLabelCharacter {
    label.push_back(fc);
}

# Reset the octal and the general-purpose accumulators to zero.
action clearOctAccumulator {
    octAccumulator = 0;
}
action clearAccumulator {
    accumulator = 0;
}
|
|
# Restart the octal accumulator with the current character as first digit.
action setOctAccumulator {
    octAccumulator = 0;
    pushOct(&octAccumulator, fc);
}

# Restart the accumulator with the current character as first decimal digit.
action setDecAccumulator {
    accumulator = 0;
    pushDec(&accumulator, fc);
}
|
|
# Repeat bounds: reset both bounds, then accumulate decimal digits into
# repeatN or repeatM.
action clearNM {
    repeatN = 0;
    repeatM = 0;
}
action appendN {
    pushDec(&repeatN, fc);
}
action appendM {
    pushDec(&repeatM, fc);
}

# Append the current character as a further octal / decimal digit.
action appendAccumulatorOctDigit {
    pushOct(&octAccumulator, fc);
}
action appendAccumulatorDecDigit {
    pushDec(&accumulator, fc);
}
|
|
# Append one hexadecimal digit to the accumulator. One action per digit
# family: decimal digit, lower-case letter, upper-case letter.
action appendAccumulatorHexDigit {
    accumulator = accumulator * 16 + (fc - '0');
}
action appendAccumulatorHexL {
    accumulator = accumulator * 16 + (10 + fc - 'a');
}
action appendAccumulatorHexU {
    accumulator = accumulator * 16 + (10 + fc - 'A');
}
|
|
|
|
# Start of a comment group: flag that we are in a comment and jump to the
# readComment machine, which just scans for a close paren.
action enterComment {
    inComment = true;
    fgoto readComment;
}
|
|
|
|
# Start of an extended mode comment: flag that we are in a comment and jump
# to the readNewlineTerminatedComment machine, which just scans for a newline.
action enterNewlineTerminatedComment {
    inComment = true;
    fgoto readNewlineTerminatedComment;
}
|
|
|
|
# Open a CAPTURING group ( e.g. '(blah)' ): save the current state, create a
# new sequence carrying the next capture index and continue inside it.
action enterCapturingGroup {
    PUSH_SEQUENCE;
    auto group = std::make_unique<ComponentSequence>();
    group->setCaptureIndex(groupIndex++);
    currentSeq = enterSequence(currentSeq, std::move(group));
}
|
|
|
|
# Open a NAMED CAPTURING group ( e.g. '(?<hatstand>blah)' ). The name is in
# 'label'; it may not start with a digit and may not repeat an earlier name.
action enterNamedGroup {
    assert(!label.empty()); // should be guaranteed by machine
    const char first = label.front();
    if (first >= '0' && first <= '9') {
        throw LocatedParseError("Group name cannot begin with a digit");
    }
    const bool is_new_name = groupNames.insert(label).second;
    if (!is_new_name) {
        throw LocatedParseError("Two named subpatterns use the name '" + label + "'");
    }
    PUSH_SEQUENCE;
    auto group = std::make_unique<ComponentSequence>();
    group->setCaptureIndex(groupIndex++);
    group->setCaptureName(label);
    currentSeq = enterSequence(currentSeq, std::move(group));
}
|
|
|
|
# Open a NON-CAPTURING group that may modify flags ( e.g. '(?i:blah)' ).
# Standard non-capturing groups use this path as well. The outer state is
# saved first, then the pending flags in newMode take effect.
action enterModifiedGroup {
    PUSH_SEQUENCE;
    mode = newMode;
    auto group = std::make_unique<ComponentSequence>();
    currentSeq = enterSequence(currentSeq, std::move(group));
}
|
|
|
|
# Close the current group: finalize its sequence and restore the state saved
# when the group was opened. A close with nothing saved is an error.
action exitGroup {
    const bool nothing_open = sequences.empty();
    if (nothing_open) {
        throw LocatedParseError("Unmatched parentheses");
    }
    currentSeq->finalize();
    POP_SEQUENCE;
}
|
|
# Zero-width assertions. Each action saves the current state and continues
# inside a new ComponentAssertion of the given direction and sense.
action enterZWLookAhead {
    PUSH_SEQUENCE;
    auto assertion = std::make_unique<ComponentAssertion>(
        ComponentAssertion::LOOKAHEAD, ComponentAssertion::POS);
    currentSeq = enterSequence(currentSeq, std::move(assertion));
}
action enterZWNegLookAhead {
    PUSH_SEQUENCE;
    auto assertion = std::make_unique<ComponentAssertion>(
        ComponentAssertion::LOOKAHEAD, ComponentAssertion::NEG);
    currentSeq = enterSequence(currentSeq, std::move(assertion));
}
action enterZWLookBehind {
    PUSH_SEQUENCE;
    auto assertion = std::make_unique<ComponentAssertion>(
        ComponentAssertion::LOOKBEHIND, ComponentAssertion::POS);
    currentSeq = enterSequence(currentSeq, std::move(assertion));
}
action enterZWNegLookBehind {
    PUSH_SEQUENCE;
    auto assertion = std::make_unique<ComponentAssertion>(
        ComponentAssertion::LOOKBEHIND, ComponentAssertion::NEG);
    currentSeq = enterSequence(currentSeq, std::move(assertion));
}
|
|
# Constructs that are recognised only to be rejected with a located error.
action enterEmbeddedCode {
    throw LocatedParseError("Embedded code is not supported");
}
action enterConditionUnsupported {
    throw LocatedParseError("Conditional subpattern unsupported");
}
action enterReferenceUnsupported {
    throw LocatedParseError("Subpattern reference unsupported");
}
|
|
# Open a conditional group whose condition is the numbered reference held in
# the accumulator. A reference number of zero is rejected.
action enterNumberedConditionalRef {
    if (accumulator == 0) {
        throw LocatedParseError("Numbered reference cannot be zero");
    }
    PUSH_SEQUENCE;
    auto cond = std::make_unique<ComponentCondReference>(accumulator);
    currentSeq = enterSequence(currentSeq, std::move(cond));
}
|
|
# Open a conditional group whose condition is the named reference held in
# 'label'.
action enterNamedConditionalRef {
    PUSH_SEQUENCE;
    assert(!label.empty());
    auto cond = std::make_unique<ComponentCondReference>(label);
    currentSeq = enterSequence(currentSeq, std::move(cond));
}
|
|
# Open an atomic group: save the current state and continue inside a new
# ComponentAtomicGroup.
action enterAtomicGroup {
    PUSH_SEQUENCE;
    auto atomic = std::make_unique<ComponentAtomicGroup>();
    currentSeq = enterSequence(currentSeq, std::move(atomic));
}
|
|
# Start of a character class: create the class for the current mode, record
# where it began, set the in-class flags and jump to the readClass machine.
action eatClass {
    assert(!currentCls);
    assert(!inCharClass); // not reentrant
    currentCls = getComponentClass(mode);
    currentClsBegin = ts;
    inCharClass = true;
    inCharClassEarly = true;
    fgoto readClass;
}
|
|
# Start collecting modifiers from a copy of the current mode.
action resetModifiers {
    newMode = mode;
}

# Make the collected modifiers current and add an empty component to the
# current sequence.
action applyModifiers {
    mode = newMode;
    auto placeholder = std::make_unique<ComponentEmpty>();
    currentSeq->addComponent(std::move(placeholder));
}
|
|
# Switch ON the newMode flag named by the current character (one of imsx).
action modifyMatchPositive {
    if (fc == 'i') {
        newMode.caseless = true;
    } else if (fc == 'm') {
        newMode.multiline = true;
    } else if (fc == 's') {
        newMode.dotall = true;
    } else if (fc == 'x') {
        newMode.ignore_space = true;
    } else {
        assert(0); // this action only called for [imsx]
    }
}
|
|
# Switch OFF the newMode flag named by the current character (one of imsx).
action modifyMatchNegative {
    if (fc == 'i') {
        newMode.caseless = false;
    } else if (fc == 'm') {
        newMode.multiline = false;
    } else if (fc == 's') {
        newMode.dotall = false;
    } else if (fc == 'x') {
        newMode.ignore_space = false;
    } else {
        assert(0); // this action only called for [imsx]
    }
}
|
|
# Expression-only actions that read parser state. Presumably these are the
# conditions used by patterns later in the machine (usage not visible here).
action is_utf8 {
    mode.utf8
}
action is_ignore_space {
    mode.ignore_space
}
action is_early_charclass {
    inCharClassEarly
}
|
|
|
|
# Add a back-reference to the group whose number is in the accumulator.
# Group number zero is rejected.
action addNumberedBackRef {
    if (accumulator == 0) {
        throw LocatedParseError("Numbered reference cannot be zero");
    }
    auto backref = std::make_unique<ComponentBackReference>(accumulator);
    currentSeq->addComponent(std::move(backref));
}
|
|
|
|
# Add a relative back-reference: the accumulator holds how many groups to
# count back from groupIndex. Zero, or an offset that is not smaller than
# groupIndex, is rejected.
action addNegativeNumberedBackRef {
    // Accumulator is a negative offset.
    if (accumulator == 0) {
        throw LocatedParseError("Numbered reference cannot be zero");
    }
    if (accumulator >= groupIndex) {
        throw LocatedParseError("Invalid reference");
    }
    const unsigned target = groupIndex - accumulator;
    auto backref = std::make_unique<ComponentBackReference>(target);
    currentSeq->addComponent(std::move(backref));
}
|
|
|
|
# Add a back-reference to the group whose name is in 'label'.
action addNamedBackRef {
    auto backref = std::make_unique<ComponentBackReference>(label);
    currentSeq->addComponent(std::move(backref));
}
|
|
|
|
# Octal escapes; the value is built up in octAccumulator.
#   escapedOctal0:  backslash, '0', then up to two more octal digits
#   escapedOctal2:  backslash, [1-7], then one or two more octal digits
#   escapedOctal2c: backslash, [1-7], then up to two more octal digits
escapedOctal0  = '\\0' @clearOctAccumulator
                 [0-7]{0,2} $appendAccumulatorOctDigit;
escapedOctal2  = '\\' [1-7] $setOctAccumulator
                 [0-7]{1,2} $appendAccumulatorOctDigit;
escapedOctal2c = '\\' [1-7] $setOctAccumulator
                 [0-7]{0,2} $appendAccumulatorOctDigit;

# Back-reference numbers; the value is built up in accumulator.
backRefIdSingle = [1-7] $setDecAccumulator;
backRefId       = [1-9] $setDecAccumulator
                  [0-9]+ $appendAccumulatorDecDigit;

# Backslash, 'x', then up to two hex digits; the value is built up in
# accumulator.
escapedHex = '\\x' @clearAccumulator
             ( [0-9] $appendAccumulatorHexDigit
             | [a-f] $appendAccumulatorHexL
             | [A-F] $appendAccumulatorHexU
             ){0,2};

# Backslash, 'c', optionally followed by any one character.
escapedCtrl = '\\c' any?;

escapedUnsupported = '\\' [NluLU];

# Bounded repeat in braces: N alone sets repeatM to repeatN, N followed by a
# comma sets repeatM to ComponentRepeat::NoLimit, N comma M sets both.
repeatNM1 = '\{' @clearNM [0-9]+ $appendN
            ( '}' @{repeatM = repeatN;}
            | ',' '\}' @{repeatM = ComponentRepeat::NoLimit;}
            | ',' [0-9]+ $appendM '}'
            );
|
|
|
|
# Numbered back-references introduced by backslash 'g': one to three decimal
# digits, accumulated in accumulator.
backReferenceG         = '\\g' @clearAccumulator
                         [0-9]{1,3} $appendAccumulatorDecDigit;
backReferenceGNegative = '\\g-' @clearAccumulator
                         [0-9]{1,3} $appendAccumulatorDecDigit;
backReferenceGBracket  = '\\g{' @clearAccumulator
                         [0-9]{1,3} $appendAccumulatorDecDigit '}';
backReferenceGBracket2 = '\\g{-' @clearAccumulator
                         [0-9]{1,3} $appendAccumulatorDecDigit '}';

# Named back-references: the name is collected into 'label'.
backReferenceGBracketName  = '\\g{' @clearLabel
                             [A-Za-z0-9_]+ $appendLabelCharacter '}';
backReferenceKBracketName  = '\\k{' @clearLabel
                             [A-Za-z0-9_]+ $appendLabelCharacter '}';
backReferenceKBracketName2 = '\\k<' @clearLabel
                             [A-Za-z0-9_]+ $appendLabelCharacter '>';
backReferenceKBracketName3 = '\\k\'' @clearLabel
                             [A-Za-z0-9_]+ $appendLabelCharacter '\'';
backReferenceP             = '(?P=' @clearLabel
                             [A-Za-z0-9_]+ $appendLabelCharacter ')';
|
|
|
|
# Openers of named capturing groups; the name is collected into 'label'.
namedGroup1 = '(?<' @clearLabel
              [A-Za-z0-9_]+ $appendLabelCharacter '>';
namedGroup2 = '(?\'' @clearLabel
              [A-Za-z0-9_]+ $appendLabelCharacter '\'';
namedGroup3 = '(?P<' @clearLabel
              [A-Za-z0-9_]+ $appendLabelCharacter '>';

# Openers of conditional groups that test a named group; the name is
# collected into 'label'.
namedConditionalRef1 = '(?(<' @clearLabel
                       [A-Za-z0-9_]+ $appendLabelCharacter '>)';
namedConditionalRef2 = '(?(\'' @clearLabel
                       [A-Za-z0-9_]+ $appendLabelCharacter '\')';
namedConditionalRef3 = '(?(' @clearLabel
                       [A-Za-z0-9_]+ $appendLabelCharacter ')';

# Sub-expression references by number (optionally signed) or by name.
numberedSubExpression = '(?' [+\-]? [0-9]+ ')';
namedSubExpression    = '(?' ('&'|'P>') [A-Za-z0-9_]+ ')';
|
|
|
|
# Inline flag letters: an optional run that switches flags on, then an
# optional '-' followed by a run that switches flags off.
positiveMatchModifiers = [imsx]+ $modifyMatchPositive;
negativeMatchModifiers = '-' [imsx]+ $modifyMatchNegative;
matchModifiers         = positiveMatchModifiers? negativeMatchModifiers?;

# UTF-8 byte shapes: continuation byte, then 2-, 3- and 4-byte sequences.
utf8_cont = 0x80..0xbf;
utf8_2c   = 0xc0..0xdf utf8_cont;
utf8_3c   = 0xe0..0xef utf8_cont{2};
utf8_4c   = 0xf0..0xf7 utf8_cont{3};

# Any byte with the top bit set.
hi_byte = 0x80..0xff;

whitespace = [\t\n\v\f\r ];
|
|
|
|
#############################################################
# Trivial parser for Perl 5.10+ control verbs, which are
# introduced by '(*'. Every alternative ends in a parse error.
#############################################################
readVerb := |*
    'UTF8)' => {
        throw LocatedParseError(
            "(*UTF8) must be at start of expression, encountered");
    };
    'UTF)' => {
        throw LocatedParseError(
            "(*UTF) must be at start of expression, encountered");
    };
    'UCP)' => {
        throw LocatedParseError(
            "(*UCP) must be at start of expression, encountered");
    };
    # Hand any other verb to the control verb mini-parser so that it reports
    # the error for this unsupported/unknown verb.
    [^)]+ ')' => {
        ParseMode temp_mode;
        // The mini-parser needs the two characters before the token as well.
        const auto *verb_start = ts - 2;
        assert(verb_start >= ptr);
        read_control_verbs(verb_start, te, (verb_start - ptr), temp_mode);
        assert(0); // Should have thrown a parse error.
        throw LocatedParseError("Unknown control verb");
    };
    any => {
        throw LocatedParseError("Unknown control verb");
    };
*|;
|
|
|
|
#############################################################
# Parser to read UCP property and script names. Each name adds
# the matching class to currentCls (honouring 'negated') and
# returns to the calling machine; anything else is an error.
#############################################################
readUCP := |*
    # General categories
    'C' => { currentCls->add(CLASS_UCP_C, negated); fret; };
    'Cc' => { currentCls->add(CLASS_UCP_CC, negated); fret; };
    'Cf' => { currentCls->add(CLASS_UCP_CF, negated); fret; };
    'Cn' => { currentCls->add(CLASS_UCP_CN, negated); fret; };
    'Co' => { currentCls->add(CLASS_UCP_CO, negated); fret; };
    'Cs' => { currentCls->add(CLASS_UCP_CS, negated); fret; };
    'L' => { currentCls->add(CLASS_UCP_L, negated); fret; };
    'Ll' => { currentCls->add(CLASS_UCP_LL, negated); fret; };
    'Lm' => { currentCls->add(CLASS_UCP_LM, negated); fret; };
    'Lo' => { currentCls->add(CLASS_UCP_LO, negated); fret; };
    'Lt' => { currentCls->add(CLASS_UCP_LT, negated); fret; };
    'Lu' => { currentCls->add(CLASS_UCP_LU, negated); fret; };
    'L&' => { currentCls->add(CLASS_UCP_L_AND, negated); fret; };
    'M' => { currentCls->add(CLASS_UCP_M, negated); fret; };
    'Mc' => { currentCls->add(CLASS_UCP_MC, negated); fret; };
    'Me' => { currentCls->add(CLASS_UCP_ME, negated); fret; };
    'Mn' => { currentCls->add(CLASS_UCP_MN, negated); fret; };
    'N' => { currentCls->add(CLASS_UCP_N, negated); fret; };
    'Nd' => { currentCls->add(CLASS_UCP_ND, negated); fret; };
    'Nl' => { currentCls->add(CLASS_UCP_NL, negated); fret; };
    'No' => { currentCls->add(CLASS_UCP_NO, negated); fret; };
    'P' => { currentCls->add(CLASS_UCP_P, negated); fret; };
    'Pc' => { currentCls->add(CLASS_UCP_PC, negated); fret; };
    'Pd' => { currentCls->add(CLASS_UCP_PD, negated); fret; };
    'Pe' => { currentCls->add(CLASS_UCP_PE, negated); fret; };
    'Pf' => { currentCls->add(CLASS_UCP_PF, negated); fret; };
    'Pi' => { currentCls->add(CLASS_UCP_PI, negated); fret; };
    'Po' => { currentCls->add(CLASS_UCP_PO, negated); fret; };
    'Ps' => { currentCls->add(CLASS_UCP_PS, negated); fret; };
    'S' => { currentCls->add(CLASS_UCP_S, negated); fret; };
    'Sc' => { currentCls->add(CLASS_UCP_SC, negated); fret; };
    'Sk' => { currentCls->add(CLASS_UCP_SK, negated); fret; };
    'Sm' => { currentCls->add(CLASS_UCP_SM, negated); fret; };
    'So' => { currentCls->add(CLASS_UCP_SO, negated); fret; };
    'Z' => { currentCls->add(CLASS_UCP_Z, negated); fret; };
    'Zl' => { currentCls->add(CLASS_UCP_ZL, negated); fret; };
    'Zp' => { currentCls->add(CLASS_UCP_ZP, negated); fret; };
    'Zs' => { currentCls->add(CLASS_UCP_ZS, negated); fret; };

    # Special X-properties
    'Xan' => { currentCls->add(CLASS_UCP_XAN, negated); fret; };
    'Xps' => { currentCls->add(CLASS_UCP_XPS, negated); fret; };
    'Xsp' => { currentCls->add(CLASS_UCP_XSP, negated); fret; };
    'Xwd' => { currentCls->add(CLASS_UCP_XWD, negated); fret; };

    # Scripts
    'Arabic' => { currentCls->add(CLASS_SCRIPT_ARABIC, negated); fret; };
    'Armenian' => { currentCls->add(CLASS_SCRIPT_ARMENIAN, negated); fret; };
    'Avestan' => { currentCls->add(CLASS_SCRIPT_AVESTAN, negated); fret; };
    'Balinese' => { currentCls->add(CLASS_SCRIPT_BALINESE, negated); fret; };
    'Bamum' => { currentCls->add(CLASS_SCRIPT_BAMUM, negated); fret; };
    'Batak' => { currentCls->add(CLASS_SCRIPT_BATAK, negated); fret; };
    'Bengali' => { currentCls->add(CLASS_SCRIPT_BENGALI, negated); fret; };
    'Bopomofo' => { currentCls->add(CLASS_SCRIPT_BOPOMOFO, negated); fret; };
    'Brahmi' => { currentCls->add(CLASS_SCRIPT_BRAHMI, negated); fret; };
    'Braille' => { currentCls->add(CLASS_SCRIPT_BRAILLE, negated); fret; };
    'Buginese' => { currentCls->add(CLASS_SCRIPT_BUGINESE, negated); fret; };
    'Buhid' => { currentCls->add(CLASS_SCRIPT_BUHID, negated); fret; };
    'Canadian_Aboriginal' => { currentCls->add(CLASS_SCRIPT_CANADIAN_ABORIGINAL, negated); fret; };
    'Carian' => { currentCls->add(CLASS_SCRIPT_CARIAN, negated); fret; };
    'Cham' => { currentCls->add(CLASS_SCRIPT_CHAM, negated); fret; };
    'Cherokee' => { currentCls->add(CLASS_SCRIPT_CHEROKEE, negated); fret; };
    'Common' => { currentCls->add(CLASS_SCRIPT_COMMON, negated); fret; };
    'Coptic' => { currentCls->add(CLASS_SCRIPT_COPTIC, negated); fret; };
    'Cuneiform' => { currentCls->add(CLASS_SCRIPT_CUNEIFORM, negated); fret; };
    'Cypriot' => { currentCls->add(CLASS_SCRIPT_CYPRIOT, negated); fret; };
    'Cyrillic' => { currentCls->add(CLASS_SCRIPT_CYRILLIC, negated); fret; };
    'Deseret' => { currentCls->add(CLASS_SCRIPT_DESERET, negated); fret; };
    'Devanagari' => { currentCls->add(CLASS_SCRIPT_DEVANAGARI, negated); fret; };
    'Egyptian_Hieroglyphs' => { currentCls->add(CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS, negated); fret; };
    'Ethiopic' => { currentCls->add(CLASS_SCRIPT_ETHIOPIC, negated); fret; };
    'Georgian' => { currentCls->add(CLASS_SCRIPT_GEORGIAN, negated); fret; };
    'Glagolitic' => { currentCls->add(CLASS_SCRIPT_GLAGOLITIC, negated); fret; };
    'Gothic' => { currentCls->add(CLASS_SCRIPT_GOTHIC, negated); fret; };
    'Greek' => { currentCls->add(CLASS_SCRIPT_GREEK, negated); fret; };
    'Gujarati' => { currentCls->add(CLASS_SCRIPT_GUJARATI, negated); fret; };
    'Gurmukhi' => { currentCls->add(CLASS_SCRIPT_GURMUKHI, negated); fret; };
    'Han' => { currentCls->add(CLASS_SCRIPT_HAN, negated); fret; };
    'Hangul' => { currentCls->add(CLASS_SCRIPT_HANGUL, negated); fret; };
    'Hanunoo' => { currentCls->add(CLASS_SCRIPT_HANUNOO, negated); fret; };
    'Hebrew' => { currentCls->add(CLASS_SCRIPT_HEBREW, negated); fret; };
    'Hiragana' => { currentCls->add(CLASS_SCRIPT_HIRAGANA, negated); fret; };
    'Imperial_Aramaic' => { currentCls->add(CLASS_SCRIPT_IMPERIAL_ARAMAIC, negated); fret; };
    'Inherited' => { currentCls->add(CLASS_SCRIPT_INHERITED, negated); fret; };
    'Inscriptional_Pahlavi' => { currentCls->add(CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI, negated); fret; };
    'Inscriptional_Parthian' => { currentCls->add(CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN, negated); fret; };
    'Javanese' => { currentCls->add(CLASS_SCRIPT_JAVANESE, negated); fret; };
    'Kaithi' => { currentCls->add(CLASS_SCRIPT_KAITHI, negated); fret; };
    'Kannada' => { currentCls->add(CLASS_SCRIPT_KANNADA, negated); fret; };
    'Katakana' => { currentCls->add(CLASS_SCRIPT_KATAKANA, negated); fret; };
    'Kayah_Li' => { currentCls->add(CLASS_SCRIPT_KAYAH_LI, negated); fret; };
    'Kharoshthi' => { currentCls->add(CLASS_SCRIPT_KHAROSHTHI, negated); fret; };
    'Khmer' => { currentCls->add(CLASS_SCRIPT_KHMER, negated); fret; };
    'Lao' => { currentCls->add(CLASS_SCRIPT_LAO, negated); fret; };
    'Latin' => { currentCls->add(CLASS_SCRIPT_LATIN, negated); fret; };
    'Lepcha' => { currentCls->add(CLASS_SCRIPT_LEPCHA, negated); fret; };
    'Limbu' => { currentCls->add(CLASS_SCRIPT_LIMBU, negated); fret; };
    'Linear_B' => { currentCls->add(CLASS_SCRIPT_LINEAR_B, negated); fret; };
    'Lisu' => { currentCls->add(CLASS_SCRIPT_LISU, negated); fret; };
    'Lycian' => { currentCls->add(CLASS_SCRIPT_LYCIAN, negated); fret; };
    'Lydian' => { currentCls->add(CLASS_SCRIPT_LYDIAN, negated); fret; };
    'Malayalam' => { currentCls->add(CLASS_SCRIPT_MALAYALAM, negated); fret; };
    'Mandaic' => { currentCls->add(CLASS_SCRIPT_MANDAIC, negated); fret; };
    'Meetei_Mayek' => { currentCls->add(CLASS_SCRIPT_MEETEI_MAYEK, negated); fret; };
    'Mongolian' => { currentCls->add(CLASS_SCRIPT_MONGOLIAN, negated); fret; };
    'Myanmar' => { currentCls->add(CLASS_SCRIPT_MYANMAR, negated); fret; };
    'New_Tai_Lue' => { currentCls->add(CLASS_SCRIPT_NEW_TAI_LUE, negated); fret; };
    'Nko' => { currentCls->add(CLASS_SCRIPT_NKO, negated); fret; };
    'Ogham' => { currentCls->add(CLASS_SCRIPT_OGHAM, negated); fret; };
    'Ol_Chiki' => { currentCls->add(CLASS_SCRIPT_OL_CHIKI, negated); fret; };
    'Old_Italic' => { currentCls->add(CLASS_SCRIPT_OLD_ITALIC, negated); fret; };
    'Old_Persian' => { currentCls->add(CLASS_SCRIPT_OLD_PERSIAN, negated); fret; };
    'Old_South_Arabian' => { currentCls->add(CLASS_SCRIPT_OLD_SOUTH_ARABIAN, negated); fret; };
    'Old_Turkic' => { currentCls->add(CLASS_SCRIPT_OLD_TURKIC, negated); fret; };
    'Oriya' => { currentCls->add(CLASS_SCRIPT_ORIYA, negated); fret; };
    'Osmanya' => { currentCls->add(CLASS_SCRIPT_OSMANYA, negated); fret; };
    'Phags_Pa' => { currentCls->add(CLASS_SCRIPT_PHAGS_PA, negated); fret; };
    'Phoenician' => { currentCls->add(CLASS_SCRIPT_PHOENICIAN, negated); fret; };
    'Rejang' => { currentCls->add(CLASS_SCRIPT_REJANG, negated); fret; };
    'Runic' => { currentCls->add(CLASS_SCRIPT_RUNIC, negated); fret; };
    'Samaritan' => { currentCls->add(CLASS_SCRIPT_SAMARITAN, negated); fret; };
    'Saurashtra' => { currentCls->add(CLASS_SCRIPT_SAURASHTRA, negated); fret; };
    'Shavian' => { currentCls->add(CLASS_SCRIPT_SHAVIAN, negated); fret; };
    'Sinhala' => { currentCls->add(CLASS_SCRIPT_SINHALA, negated); fret; };
    'Sundanese' => { currentCls->add(CLASS_SCRIPT_SUNDANESE, negated); fret; };
    'Syloti_Nagri' => { currentCls->add(CLASS_SCRIPT_SYLOTI_NAGRI, negated); fret; };
    'Syriac' => { currentCls->add(CLASS_SCRIPT_SYRIAC, negated); fret; };
    'Tagalog' => { currentCls->add(CLASS_SCRIPT_TAGALOG, negated); fret; };
    'Tagbanwa' => { currentCls->add(CLASS_SCRIPT_TAGBANWA, negated); fret; };
    'Tai_Le' => { currentCls->add(CLASS_SCRIPT_TAI_LE, negated); fret; };
    'Tai_Tham' => { currentCls->add(CLASS_SCRIPT_TAI_THAM, negated); fret; };
    'Tai_Viet' => { currentCls->add(CLASS_SCRIPT_TAI_VIET, negated); fret; };
    'Tamil' => { currentCls->add(CLASS_SCRIPT_TAMIL, negated); fret; };
    'Telugu' => { currentCls->add(CLASS_SCRIPT_TELUGU, negated); fret; };
    'Thaana' => { currentCls->add(CLASS_SCRIPT_THAANA, negated); fret; };
    'Thai' => { currentCls->add(CLASS_SCRIPT_THAI, negated); fret; };
    'Tibetan' => { currentCls->add(CLASS_SCRIPT_TIBETAN, negated); fret; };
    'Tifinagh' => { currentCls->add(CLASS_SCRIPT_TIFINAGH, negated); fret; };
    'Ugaritic' => { currentCls->add(CLASS_SCRIPT_UGARITIC, negated); fret; };
    'Vai' => { currentCls->add(CLASS_SCRIPT_VAI, negated); fret; };
    'Yi' => { currentCls->add(CLASS_SCRIPT_YI, negated); fret; };
    'Any' => { currentCls->add(CLASS_UCP_ANY, negated); fret; };

    # Must stay last so that the single-letter names above win.
    any => { throw LocatedParseError("Unknown property"); };
*|;
|
|
|
|
# Braced property name: '{', an optional '^' that flips 'negated', the name
# itself (read by readUCP), then '}'. On '}', when not inside a character
# class, the class is finalized and appended to the current sequence before
# returning. The error action reports a malformed property.
readBracedUCP := (
        '{'
        ( '^' ${ negated = !negated; } )?
        ( [^^] ${ fhold; fcall readUCP; } )
        '}' ${
            if (!inCharClass) { // not inside [..]
                currentCls->finalize();
                currentSeq->addComponent(std::move(currentCls));
            }
            fret;
        }
    )
    $^{ throw LocatedParseError("Malformed property"); };
|
|
|
|
# Single-letter property name. Each letter adds the matching class to
# currentCls (honouring 'negated'); when not inside a character class the
# class is finalized and appended to the current sequence. Control then
# returns to the calling machine. Any other character is an error.
readUCPSingle := |*
    'C' => {
        currentCls->add(CLASS_UCP_C, negated);
        if (!inCharClass) {
            currentCls->finalize();
            currentSeq->addComponent(std::move(currentCls));
        }
        fret;
    };
    'L' => {
        currentCls->add(CLASS_UCP_L, negated);
        if (!inCharClass) {
            currentCls->finalize();
            currentSeq->addComponent(std::move(currentCls));
        }
        fret;
    };
    'M' => {
        currentCls->add(CLASS_UCP_M, negated);
        if (!inCharClass) {
            currentCls->finalize();
            currentSeq->addComponent(std::move(currentCls));
        }
        fret;
    };
    'N' => {
        currentCls->add(CLASS_UCP_N, negated);
        if (!inCharClass) {
            currentCls->finalize();
            currentSeq->addComponent(std::move(currentCls));
        }
        fret;
    };
    'P' => {
        currentCls->add(CLASS_UCP_P, negated);
        if (!inCharClass) {
            currentCls->finalize();
            currentSeq->addComponent(std::move(currentCls));
        }
        fret;
    };
    'S' => {
        currentCls->add(CLASS_UCP_S, negated);
        if (!inCharClass) {
            currentCls->finalize();
            currentSeq->addComponent(std::move(currentCls));
        }
        fret;
    };
    'Z' => {
        currentCls->add(CLASS_UCP_Z, negated);
        if (!inCharClass) {
            currentCls->finalize();
            currentSeq->addComponent(std::move(currentCls));
        }
        fret;
    };

    any => { throw LocatedParseError("Unknown property"); };
*|;
|
|
charClassGuts := |*
|
|
# We don't support POSIX collating elements (neither does PCRE
|
|
# or Perl). These look like [.ch.] or [=ch=].
|
|
'\[\.' ( '\\]' | [^\]] )* '\.\]' |
|
|
'\[=' ( '\\]' | [^\]] )* '=\]' => {
|
|
throw LocatedParseError("Unsupported POSIX collating "
|
|
"element");
|
|
};
|
|
# Named sets
|
|
# Adding these may cause the charclass to close, hence the
|
|
# finalized check - UE-2276
|
|
'[:alnum:]' => {
|
|
currentCls->add(CLASS_ALNUM, false);
|
|
};
|
|
'[:^alnum:]' => {
|
|
currentCls->add(CLASS_ALNUM, true);
|
|
};
|
|
'[:alpha:]' => {
|
|
currentCls->add(CLASS_ALPHA, false);
|
|
};
|
|
'[:^alpha:]' => {
|
|
currentCls->add(CLASS_ALPHA, true);
|
|
};
|
|
'[:ascii:]' => {
|
|
currentCls->add(CLASS_ASCII, false);
|
|
};
|
|
'[:^ascii:]' => {
|
|
currentCls->add(CLASS_ASCII, true);
|
|
};
|
|
'[:blank:]' => {
|
|
currentCls->add(CLASS_BLANK, false);
|
|
};
|
|
'[:^blank:]' => {
|
|
currentCls->add(CLASS_BLANK, true);
|
|
};
|
|
'[:cntrl:]' => {
|
|
currentCls->add(CLASS_CNTRL, false);
|
|
};
|
|
'[:^cntrl:]' => {
|
|
currentCls->add(CLASS_CNTRL, true);
|
|
};
|
|
'[:digit:]' => {
|
|
currentCls->add(CLASS_DIGIT, false);
|
|
};
|
|
'[:^digit:]' => {
|
|
currentCls->add(CLASS_DIGIT, true);
|
|
};
|
|
'[:graph:]' => {
|
|
currentCls->add(CLASS_GRAPH, false);
|
|
};
|
|
'[:^graph:]' => {
|
|
currentCls->add(CLASS_GRAPH, true);
|
|
};
|
|
'[:lower:]' => {
|
|
currentCls->add(CLASS_LOWER, false);
|
|
};
|
|
'[:^lower:]' => {
|
|
currentCls->add(CLASS_LOWER, true);
|
|
};
|
|
'[:print:]' => {
|
|
currentCls->add(CLASS_PRINT, false);
|
|
};
|
|
'[:^print:]' => {
|
|
currentCls->add(CLASS_PRINT, true);
|
|
};
|
|
'[:punct:]' => {
|
|
currentCls->add(CLASS_PUNCT, false);
|
|
};
|
|
'[:^punct:]' => {
|
|
currentCls->add(CLASS_PUNCT, true);
|
|
};
|
|
# Posix SPACE covers 9, 10, 11, 12, 13, 32
|
|
'[:space:]' => {
|
|
currentCls->add(CLASS_SPACE, false);
|
|
};
|
|
'[:^space:]' => {
|
|
currentCls->add(CLASS_SPACE, true);
|
|
};
|
|
'[:upper:]' => {
|
|
currentCls->add(CLASS_UPPER, false);
|
|
};
|
|
'[:^upper:]' => {
|
|
currentCls->add(CLASS_UPPER, true);
|
|
};
|
|
'[:word:]' => {
|
|
currentCls->add(CLASS_WORD, false);
|
|
};
|
|
'[:^word:]' => {
|
|
currentCls->add(CLASS_WORD, true);
|
|
};
|
|
'[:xdigit:]' => {
|
|
currentCls->add(CLASS_XDIGIT, false);
|
|
};
|
|
'[:^xdigit:]' => {
|
|
currentCls->add(CLASS_XDIGIT, true);
|
|
};
|
|
# Anything else between "[:" and ":]" is an invalid POSIX class.
|
|
# Note that "\]" counts as a literal char here.
|
|
'\[:' ( '\\]' | [^\]] )* ':\]' => {
|
|
throw LocatedParseError("Invalid POSIX named class");
|
|
};
|
|
'\\Q' => {
|
|
fcall readQuotedClass;
|
|
};
|
|
'\\E' => { /*noop*/};
|
|
# Backspace (this is only valid for \b in char classes)
|
|
'\\b' => {
|
|
currentCls->add('\x08');
|
|
};
|
|
# Tab
|
|
'\\t' => {
|
|
currentCls->add('\x09');
|
|
};
|
|
# Newline
|
|
'\\n' => {
|
|
currentCls->add('\x0a');
|
|
};
|
|
# Carriage return
|
|
'\\r' => {
|
|
currentCls->add('\x0d');
|
|
};
|
|
# Form feed
|
|
'\\f' => {
|
|
currentCls->add('\x0c');
|
|
};
|
|
# Bell
|
|
'\\a' => {
|
|
currentCls->add('\x07');
|
|
};
|
|
# Escape
|
|
'\\e' => {
|
|
currentCls->add('\x1b');
|
|
};
|
|
# Horizontal whitespace
|
|
'\\h' => {
|
|
currentCls->add(CLASS_HORZ, false);
|
|
};
|
|
# Not horizontal whitespace
|
|
'\\H' => {
|
|
currentCls->add(CLASS_HORZ, true);
|
|
};
|
|
# Vertical whitespace
|
|
'\\v' => {
|
|
currentCls->add(CLASS_VERT, false);
|
|
};
|
|
# Not vertical whitespace
|
|
'\\V' => {
|
|
currentCls->add(CLASS_VERT, true);
|
|
};
|
|
|
|
'\\p{' => {
|
|
negated = false;
|
|
fhold;
|
|
fcall readBracedUCP;
|
|
};
|
|
|
|
'\\p' any => {
|
|
negated = false;
|
|
fhold;
|
|
fcall readUCPSingle;
|
|
};
|
|
|
|
'\\P{' => {
|
|
negated = true;
|
|
fhold;
|
|
fcall readBracedUCP;
|
|
};
|
|
|
|
'\\P'any => {
|
|
negated = true;
|
|
fhold;
|
|
fcall readUCPSingle;
|
|
};
|
|
|
|
'\\P' => { throw LocatedParseError("Malformed property"); };
|
|
'\\p' => { throw LocatedParseError("Malformed property"); };
|
|
|
|
# Octal
|
|
escapedOctal0 => {
|
|
currentCls->add(octAccumulator);
|
|
};
|
|
escapedOctal2c => {
|
|
currentCls->add(octAccumulator);
|
|
};
|
|
|
|
'\\o{' [0-7]+ '}' => {
|
|
string oct(ts + 3, te - ts - 4);
|
|
unsigned long val;
|
|
try {
|
|
val = stoul(oct, nullptr, 8);
|
|
} catch (const std::out_of_range &) {
|
|
val = MAX_UNICODE + 1;
|
|
}
|
|
if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) {
|
|
throw LocatedParseError("Value in \\o{...} sequence is too large");
|
|
}
|
|
currentCls->add((unichar)val);
|
|
};
|
|
|
|
# And for when it goes wrong
|
|
'\\o' => {
|
|
throw LocatedParseError("Value in \\o{...} sequence is non-octal or missing braces");
|
|
};
|
|
|
|
# Hex
|
|
escapedHex => {
|
|
currentCls->add(accumulator);
|
|
};
|
|
# not a back-ref, not octal, just PCRE madness
|
|
'\\' [89] => {
|
|
// whatever we found here
|
|
currentCls->add(*(ts + 1));
|
|
|
|
};
|
|
# Unicode Hex
|
|
'\\x{' xdigit+ '}' => {
|
|
string hex(ts + 3, te - ts - 4);
|
|
unsigned long val;
|
|
try {
|
|
val = stoul(hex, nullptr, 16);
|
|
} catch (const std::out_of_range &) {
|
|
val = MAX_UNICODE + 1;
|
|
}
|
|
if (val > MAX_UNICODE) {
|
|
throw LocatedParseError("Value in \\x{...} sequence is too large");
|
|
}
|
|
currentCls->add((unichar)val);
|
|
};
|
|
# And for when it goes wrong
|
|
'\\x{' => {
|
|
throw LocatedParseError("Value in \\x{...} sequence is non-hex or missing }");
|
|
};
|
|
# Control characters
|
|
escapedCtrl => {
|
|
if (te - ts < 3) {
|
|
assert(te - ts == 2);
|
|
throw LocatedParseError(SLASH_C_ERROR);
|
|
} else {
|
|
assert(te - ts == 3);
|
|
currentCls->add(decodeCtrl(ts[2]));
|
|
}
|
|
};
|
|
# Word character
|
|
'\\w' => {
|
|
currentCls->add(CLASS_WORD, false);
|
|
};
|
|
# Non word character
|
|
'\\W' => {
|
|
currentCls->add(CLASS_WORD, true);
|
|
};
|
|
# Whitespace character (except VT)
|
|
'\\s' => {
|
|
currentCls->add(CLASS_SPACE, false);
|
|
};
|
|
# Non whitespace character
|
|
'\\S' => {
|
|
currentCls->add(CLASS_SPACE, true);
|
|
};
|
|
# Digit character
|
|
'\\d' => {
|
|
currentCls->add(CLASS_DIGIT, false);
|
|
};
|
|
# Non digit character
|
|
'\\D' => {
|
|
currentCls->add(CLASS_DIGIT, true);
|
|
};
|
|
'\-' => {
|
|
currentCls->addDash();
|
|
};
|
|
|
|
# A bunch of unsupported (for now) escapes
|
|
escapedUnsupported - '\\X' => throwUnsupportedEscape;
|
|
|
|
# PCRE appears to discard escaped g in a char class (a backref bug?)
|
|
'\\g' => throwUnsupportedEscape;
|
|
|
|
# the too-hard basket: UE-944, UE-1134, UE-1157
|
|
# many escaped single char literals should be benign, but PCRE
|
|
# breaks with them when adding to ranges, so unless they have
|
|
# defined special meaning in a char-class we reject them to be
|
|
# safe.
|
|
'\\' alpha => throwUnsupportedEscape;
|
|
|
|
'\\' any => {
    // Fall-through escape inside a class: the byte after the backslash
    // is added as a literal member. It is cast through u8 first, as the
    // plain literal rule of this scanner does, so that a byte >= 0x80 is
    // not sign-extended on platforms where plain char is signed.
    // NOTE(review): add() is assumed to take a wider unsigned code point
    // type; confirm against ComponentClass.
    currentCls->add((u8)*(ts + 1));
};
|
|
|
|
#unicode chars
|
|
utf8_2c when is_utf8 => {
|
|
assert(mode.utf8);
|
|
currentCls->add(readUtf8CodePoint2c(ts));
|
|
};
|
|
|
|
utf8_3c when is_utf8 => {
|
|
assert(mode.utf8);
|
|
currentCls->add(readUtf8CodePoint3c(ts));
|
|
};
|
|
|
|
utf8_4c when is_utf8 => {
|
|
assert(mode.utf8);
|
|
currentCls->add(readUtf8CodePoint4c(ts));
|
|
};
|
|
|
|
hi_byte when is_utf8 => {
|
|
assert(mode.utf8);
|
|
throwInvalidUtf8();
|
|
};
|
|
|
|
# Literal character
|
|
(any - ']') => {
|
|
currentCls->add((u8)*ts);
|
|
};
|
|
|
|
']' => {
|
|
currentCls->finalize();
|
|
currentSeq->addComponent(std::move(currentCls));
|
|
inCharClass = false;
|
|
fgoto main;
|
|
};
|
|
*|;
|
|
|
|
#############################################################
|
|
# Parser to read stuff from a character class
|
|
#############################################################
|
|
readClass := |*
    # Leading caret: the first one negates the class. A further caret is
    # an ordinary member and is re-read by charClassGuts.
    '\^' when is_early_charclass => {
        if (!currentCls->isNegated()) {
            currentCls->negate();
            // inCharClassEarly deliberately stays set: in /[^]]/ the
            // bracket after the caret must still take the early
            // right-bracket rule below.
        } else {
            // A caret was already consumed, so this one is a literal.
            inCharClassEarly = false;
            fhold;
            fgoto charClassGuts;
        }
    };

    # A right square bracket seen before any real member is a literal
    # right square bracket, not the end of the class.
    ']' when is_early_charclass => {
        inCharClassEarly = false;
        currentCls->add(']');
    };

    # A quoted run may start before any real member has been seen.
    '\\Q' => {
        fcall readQuotedClass;
    };

    # A stray \E is ignored.
    '\\E' => { /*noop*/ };

    # Anything else: leave the early phase and let charClassGuts re-scan
    # the same character.
    any => {
        inCharClassEarly = false;
        fhold;
        fgoto charClassGuts;
    };
*|;
|
|
|
|
#############################################################
|
|
# Parser to read a quoted literal
|
|
#############################################################
|
|
readQuotedLiteral := |*
    # \E ends the quoted literal; control goes back to the main scanner.
    '\\E' => {
        fgoto main;
    };

    # UTF-8 mode: each multi-byte sequence is decoded to one code point
    # and emitted as its own component.
    utf8_2c when is_utf8 => {
        assert(mode.utf8);
        const auto codePoint = readUtf8CodePoint2c(ts);
        /* a ComponentClass is used to generate the vertices */
        auto cls = getComponentClass(mode);
        cls->add(codePoint);
        cls->finalize();
        currentSeq->addComponent(std::move(cls));
    };

    utf8_3c when is_utf8 => {
        assert(mode.utf8);
        const auto codePoint = readUtf8CodePoint3c(ts);
        /* a ComponentClass is used to generate the vertices */
        auto cls = getComponentClass(mode);
        cls->add(codePoint);
        cls->finalize();
        currentSeq->addComponent(std::move(cls));
    };

    utf8_4c when is_utf8 => {
        assert(mode.utf8);
        const auto codePoint = readUtf8CodePoint4c(ts);
        /* a ComponentClass is used to generate the vertices */
        auto cls = getComponentClass(mode);
        cls->add(codePoint);
        cls->finalize();
        currentSeq->addComponent(std::move(cls));
    };

    # A high byte that none of the rules above matched, in UTF-8 mode.
    hi_byte when is_utf8 => {
        assert(mode.utf8);
        throwInvalidUtf8();
    };

    # Every other byte is taken verbatim.
    any => {
        addLiteral(currentSeq, *ts, mode);
    };
*|;
|
|
|
|
#############################################################
|
|
# Parser to read a quoted class
|
|
#############################################################
|
|
readQuotedClass := |*
    # \E closes the quoted run and returns to the scanner that issued the
    # fcall.
    '\\E' => {
        fret;
    };

    # UTF-8 mode: multi-byte sequences are decoded to a single code point
    # before being added to the class under construction.
    utf8_2c when is_utf8 => {
        assert(mode.utf8);
        currentCls->add(readUtf8CodePoint2c(ts));
        inCharClassEarly = false;
    };

    utf8_3c when is_utf8 => {
        assert(mode.utf8);
        currentCls->add(readUtf8CodePoint3c(ts));
        inCharClassEarly = false;
    };

    utf8_4c when is_utf8 => {
        assert(mode.utf8);
        currentCls->add(readUtf8CodePoint4c(ts));
        inCharClassEarly = false;
    };

    # A high byte that none of the rules above matched, in UTF-8 mode.
    hi_byte when is_utf8 => {
        assert(mode.utf8);
        throwInvalidUtf8();
    };

    # Literal character
    any => {
        // Cast through u8, as the unquoted literal rule in charClassGuts
        // does, so that a byte >= 0x80 is not sign-extended on platforms
        // where plain char is signed.
        // NOTE(review): add() is assumed to take a wider unsigned code
        // point type; confirm against ComponentClass.
        currentCls->add((u8)*ts);
        inCharClassEarly = false;
    };
*|;
|
|
|
|
|
|
#############################################################
|
|
# Parser to read (and ignore) a comment block
|
|
#############################################################
|
|
readComment := |*
    # A right paren terminates the comment; resume the main scanner.
    '\)' => {
        inComment = false;
        fgoto main;
    };

    # Any other character belongs to the comment and is discarded.
    any;
*|;
|
|
|
|
#############################################################
|
|
# Parser to read (and ignore) a newline-terminated comment
|
|
# block
|
|
#############################################################
|
|
readNewlineTerminatedComment := |*
    # A newline terminates the comment; resume the main scanner.
    '\n' => {
        inComment = false;
        fgoto main;
    };

    # Any other character belongs to the comment and is discarded.
    any;
*|;
|
|
|
|
#############################################################
|
|
# Parser for standard components
|
|
#############################################################
|
|
main := |*
|
|
#############################################################
|
|
# Standard components
|
|
#############################################################
|
|
# Begin capturing group (non-capturing handled further down)
|
|
'\(' => enterCapturingGroup;
|
|
# End group
|
|
'\)' => exitGroup;
|
|
# Mark alternation
|
|
'\|' => {
|
|
currentSeq->addAlternation();
|
|
};
|
|
# POSIX named elements should only be used inside a class. Note
|
|
# that we need to be able to reject /[:\]:]/ here.
|
|
'\[:' ( '\\]' | [^\]] )* ':\]' => {
|
|
throw LocatedParseError("POSIX named classes are only "
|
|
"supported inside a class");
|
|
};
|
|
# We don't support POSIX collating elements (neither does PCRE
|
|
# or Perl). These look like [.ch.] or [=ch=].
|
|
'\[\.' ( '\\]' | [^\]] )* '\.\]' |
|
|
'\[=' ( '\\]' | [^\]] )* '=\]' => {
|
|
throw LocatedParseError("Unsupported POSIX collating "
|
|
"element");
|
|
};
|
|
# Begin eating characters for class
|
|
'\[' => eatClass;
|
|
# Begin quoted literal
|
|
'\\Q' => {
|
|
fgoto readQuotedLiteral;
|
|
};
|
|
# An \E that is not preceded by a \Q is ignored
|
|
'\\E' => { /* noop */ };
|
|
# Match any character
|
|
'\.' => {
|
|
currentSeq->addComponent(generateComponent(CLASS_ANY, false, mode));
|
|
};
|
|
# Match one byte
|
|
'\\C' => {
|
|
if (mode.utf8) {
|
|
throw LocatedParseError("\\C is unsupported in UTF8");
|
|
}
|
|
currentSeq->addComponent(std::make_unique<ComponentByte>());
|
|
};
|
|
# Match 0 or more times (greedy)
|
|
'\*' => {
|
|
if (!currentSeq->addRepeat(0, ComponentRepeat::NoLimit,
|
|
ComponentRepeat::REPEAT_GREEDY)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
# Match 0 or more times (non-greedy)
|
|
'\*\?' => {
|
|
if (!currentSeq->addRepeat(0, ComponentRepeat::NoLimit,
|
|
ComponentRepeat::REPEAT_NONGREEDY)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
# Match 0 or more times (possessive)
|
|
'\*\+' => {
|
|
if (!currentSeq->addRepeat(0, ComponentRepeat::NoLimit,
|
|
ComponentRepeat::REPEAT_POSSESSIVE)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
# Match 1 or more times (greedy)
|
|
'\+' => {
|
|
if (!currentSeq->addRepeat(1, ComponentRepeat::NoLimit,
|
|
ComponentRepeat::REPEAT_GREEDY)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
# Match 1 or more times (non-greedy)
|
|
'\+\?' => {
|
|
if (!currentSeq->addRepeat(1, ComponentRepeat::NoLimit,
|
|
ComponentRepeat::REPEAT_NONGREEDY)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
# Match 1 or more times (possessive)
|
|
'\+\+' => {
|
|
if (!currentSeq->addRepeat(1, ComponentRepeat::NoLimit,
|
|
ComponentRepeat::REPEAT_POSSESSIVE)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
# Match 0 or 1 times (greedy)
|
|
'\?' => {
|
|
if (!currentSeq->addRepeat(
|
|
0, 1, ComponentRepeat::REPEAT_GREEDY)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
# Match 0 or 1 times (non-greedy)
|
|
'\?\?' => {
|
|
if (!currentSeq->addRepeat(
|
|
0, 1, ComponentRepeat::REPEAT_NONGREEDY)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
# Match 0 or 1 times (possessive)
|
|
'\?\+' => {
|
|
if (!currentSeq->addRepeat(
|
|
0, 1, ComponentRepeat::REPEAT_POSSESSIVE)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
# Match {n}|{n,}|{n,m} times (greedy)
|
|
repeatNM1 => {
|
|
if (repeatN > repeatM || repeatM == 0) {
|
|
throwInvalidRepeat();
|
|
} else if (!currentSeq->addRepeat(
|
|
repeatN, repeatM,
|
|
ComponentRepeat::REPEAT_GREEDY)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
# Match {n}|{n,}|{n,m} times (non-greedy)
|
|
repeatNM1 '\?' => {
|
|
if (repeatN > repeatM || repeatM == 0) {
|
|
throwInvalidRepeat();
|
|
} else if (!currentSeq->addRepeat(
|
|
repeatN, repeatM,
|
|
ComponentRepeat::REPEAT_NONGREEDY)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
# Match {n}|{n,}|{n,m} times (possessive)
|
|
repeatNM1 '\+' => {
|
|
if (repeatN > repeatM || repeatM == 0) {
|
|
throwInvalidRepeat();
|
|
} else if (!currentSeq->addRepeat(
|
|
repeatN, repeatM,
|
|
ComponentRepeat::REPEAT_POSSESSIVE)) {
|
|
throwInvalidRepeat();
|
|
}
|
|
};
|
|
|
|
# In ignore_space mode, an unescaped # character introduces a
|
|
# comment that runs until the next newline or the end of the
|
|
# pattern.
|
|
'\#' when is_ignore_space => enterNewlineTerminatedComment;
|
|
|
|
# Perl 5.10 Special Backtracking Control Verbs: we support
|
|
# UTF8/UCP, none of the others
|
|
'(*' [^)] => { fhold; fcall readVerb; };
|
|
|
|
# Earlier parser code checked for the terminating NULL and exited
|
|
# explicitly.
|
|
'\0' => { assert(0); fbreak; };
|
|
|
|
#############################################################
|
|
# Boundaries
|
|
#############################################################
|
|
|
|
# Start of data; also after internal newline in multiline mode
|
|
'\^' => {
|
|
auto bound = mode.multiline ? ComponentBoundary::BEGIN_LINE
|
|
: ComponentBoundary::BEGIN_STRING;
|
|
currentSeq->addComponent(std::make_unique<ComponentBoundary>(bound));
|
|
};
|
|
# End of data (with optional internal newline); also before
|
|
# internal newline in multiline mode
|
|
'\$' => {
|
|
auto bound = mode.multiline ? ComponentBoundary::END_LINE
|
|
: ComponentBoundary::END_STRING_OPTIONAL_LF;
|
|
currentSeq->addComponent(std::make_unique<ComponentBoundary>(bound));
|
|
};
|
|
# Beginning of data
|
|
'\\A' => {
|
|
auto bound = ComponentBoundary::BEGIN_STRING;
|
|
currentSeq->addComponent(std::make_unique<ComponentBoundary>(bound));
|
|
};
|
|
# End of data (with optional internal newline)
|
|
'\\Z' => {
|
|
auto bound = ComponentBoundary::END_STRING_OPTIONAL_LF;
|
|
currentSeq->addComponent(std::make_unique<ComponentBoundary>(bound));
|
|
};
|
|
# End of data
|
|
'\\z' => {
|
|
auto bound = ComponentBoundary::END_STRING;
|
|
currentSeq->addComponent(std::make_unique<ComponentBoundary>(bound));
|
|
};
|
|
# Word boundary
|
|
'\\b' => {
|
|
currentSeq->addComponent(
|
|
std::make_unique<ComponentWordBoundary>(ts - ptr, false, mode));
|
|
};
|
|
# Non-word boundary
|
|
'\\B' => {
|
|
currentSeq->addComponent(
|
|
std::make_unique<ComponentWordBoundary>(ts - ptr, true, mode));
|
|
};
|
|
|
|
#############################################################
|
|
# Escaped chars
|
|
#############################################################
|
|
|
|
# Tab
|
|
'\\t' => {
|
|
addLiteral(currentSeq, '\x09', mode);
|
|
};
|
|
# Newline
|
|
'\\n' => {
|
|
addLiteral(currentSeq, '\x0a', mode);
|
|
};
|
|
# Carriage return
|
|
'\\r' => {
|
|
addLiteral(currentSeq, '\x0d', mode);
|
|
};
|
|
# Form feed
|
|
'\\f' => {
|
|
addLiteral(currentSeq, '\x0c', mode);
|
|
};
|
|
# Bell
|
|
'\\a' => {
|
|
addLiteral(currentSeq, '\x07', mode);
|
|
};
|
|
# Escape
|
|
'\\e' => {
|
|
addLiteral(currentSeq, '\x1b', mode);
|
|
};
|
|
# Octal
|
|
escapedOctal0 => {
|
|
addLiteral(currentSeq, octAccumulator, mode);
|
|
};
|
|
escapedOctal2 => {
|
|
// If there are enough capturing sub expressions, this may be
|
|
// a back reference
|
|
accumulator = parseAsDecimal(octAccumulator);
|
|
if (accumulator < groupIndex) {
|
|
currentSeq->addComponent(std::make_unique<ComponentBackReference>(accumulator));
|
|
} else {
|
|
addEscapedOctal(currentSeq, octAccumulator, mode);
|
|
}
|
|
};
|
|
|
|
# Numeric back reference
|
|
# everything less than 8 is a straight up back ref, even if
|
|
# it is a forwards backward reference (aieeee!)
|
|
# Note that \8 and \9 are the literal chars '8' and '9'.
|
|
'\\' backRefIdSingle => addNumberedBackRef;
|
|
# otherwise we need to munge through the possible backref
|
|
'\\' backRefId => {
    if (accumulator >= groupIndex) {
        // Not enough left parens so far for this to be a back reference.
        // Up to three leading octal digits form an octal escape, and
        // whatever is left of the token stands for itself as literals.
        const char *cur = ts;
        assert(*cur == '\\'); // token starts at backslash
        ++cur;

        unsigned int octalValue = 0;
        unsigned int numOctal = 0;
        while (cur < te && numOctal < 3) {
            const u8 d = *cur - '0';
            if (d >= 8) {
                break;
            }
            octalValue = octalValue * 8 + d;
            ++numOctal;
            ++cur;
        }

        if (numOctal > 0) {
            addEscapedOctal(currentSeq, octalValue, mode);
        }

        // The remaining characters of the token, if any, are literal.
        while (cur < te) {
            addLiteral(currentSeq, *cur, mode);
            ++cur;
        }
    } else {
        // Enough groups have been opened: treat it as a back reference.
        currentSeq->addComponent(std::make_unique<ComponentBackReference>(accumulator));
    }
};
|
|
backReferenceG => addNumberedBackRef;
|
|
backReferenceGNegative => addNegativeNumberedBackRef;
|
|
backReferenceGBracket => addNumberedBackRef;
|
|
backReferenceGBracket2 => addNegativeNumberedBackRef;
|
|
backReferenceGBracketName => addNamedBackRef;
|
|
backReferenceKBracketName => addNamedBackRef;
|
|
backReferenceKBracketName2 => addNamedBackRef;
|
|
backReferenceKBracketName3 => addNamedBackRef;
|
|
backReferenceP => addNamedBackRef;
|
|
# Oniguruma - either angle braces or single quotes for this one
|
|
('\\g<' [^>]*? '>'|'\\g\'' [^\']*? '\'') => {
|
|
ostringstream str;
|
|
str << "Onigiruma subroutine call at index " << ts - ptr <<
|
|
" not supported.";
|
|
throw ParseError(str.str());
|
|
};
|
|
# Fallthrough: a \g that hasn't been caught by one of the above
|
|
# is invalid syntax. Without this rule, we would accept /A\g/.
|
|
'\\g' => {
|
|
throw LocatedParseError("Invalid reference after \\g");
|
|
};
|
|
'\\o{' [0-7]+ '}' => {
|
|
string oct(ts + 3, te - ts - 4);
|
|
unsigned long val;
|
|
try {
|
|
val = stoul(oct, nullptr, 8);
|
|
} catch (const std::out_of_range &) {
|
|
val = MAX_UNICODE + 1;
|
|
}
|
|
if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) {
|
|
throw LocatedParseError("Value in \\o{...} sequence is too large");
|
|
}
|
|
addEscapedOctal(currentSeq, (unichar)val, mode);
|
|
};
|
|
# And for when it goes wrong
|
|
'\\o' => {
|
|
throw LocatedParseError("Value in \\o{...} sequence is non-octal or missing braces");
|
|
};
|
|
# Hex
|
|
escapedHex => {
|
|
addEscapedHex(currentSeq, accumulator, mode);
|
|
};
|
|
# Unicode Hex
|
|
'\\x{' xdigit+ '}' => {
|
|
string hex(ts + 3, te - ts - 4);
|
|
unsigned long val;
|
|
try {
|
|
val = stoul(hex, nullptr, 16);
|
|
} catch (const std::out_of_range &) {
|
|
val = MAX_UNICODE + 1;
|
|
}
|
|
if (val > MAX_UNICODE) {
|
|
throw LocatedParseError("Value in \\x{...} sequence is too large");
|
|
}
|
|
addEscapedHex(currentSeq, (unichar)val, mode);
|
|
};
|
|
# And for when it goes wrong
|
|
'\\x{' => {
|
|
throw LocatedParseError("Value in \\x{...} sequence is non-hex or missing }");
|
|
};
|
|
# Control characters
|
|
escapedCtrl => {
|
|
if (te - ts < 3) {
|
|
assert(te - ts == 2);
|
|
throw LocatedParseError(SLASH_C_ERROR);
|
|
} else {
|
|
assert(te - ts == 3);
|
|
addLiteral(currentSeq, decodeCtrl(ts[2]), mode);
|
|
}
|
|
};
|
|
# A bunch of unsupported (for now) escapes
|
|
escapedUnsupported => {
|
|
ostringstream str;
|
|
str << "'\\" << *(ts + 1) << "' at index " << ts - ptr
|
|
<< " not supported.";
|
|
throw ParseError(str.str());
|
|
};
|
|
|
|
# Word character
|
|
'\\w' => {
|
|
auto cc = generateComponent(CLASS_WORD, false, mode);
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
# Non word character
|
|
'\\W' => {
|
|
auto cc = generateComponent(CLASS_WORD, true, mode);
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
# Whitespace character
|
|
'\\s' => {
|
|
auto cc = generateComponent(CLASS_SPACE, false, mode);
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
# Non whitespace character
|
|
'\\S' => {
|
|
auto cc = generateComponent(CLASS_SPACE, true, mode);
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
# Digit character
|
|
'\\d' => {
|
|
auto cc = generateComponent(CLASS_DIGIT, false, mode);
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
# Non digit character
|
|
'\\D' => {
|
|
auto cc = generateComponent(CLASS_DIGIT, true, mode);
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
# Horizontal whitespace
|
|
'\\h' => {
|
|
auto cc = generateComponent(CLASS_HORZ, false, mode);
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
# Not horizontal whitespace
|
|
'\\H' => {
|
|
auto cc = generateComponent(CLASS_HORZ, true, mode);
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
# Vertical whitespace
|
|
'\\v' => {
|
|
auto cc = generateComponent(CLASS_VERT, false, mode);
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
# Not vertical whitespace
|
|
'\\V' => {
|
|
auto cc = generateComponent(CLASS_VERT, true, mode);
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
|
|
'\\p{' => {
|
|
assert(!currentCls && !inCharClass);
|
|
currentCls = getComponentClass(mode);
|
|
negated = false;
|
|
fhold;
|
|
fcall readBracedUCP;
|
|
};
|
|
|
|
'\\p' any => {
|
|
assert(!currentCls && !inCharClass);
|
|
currentCls = getComponentClass(mode);
|
|
negated = false;
|
|
fhold;
|
|
fcall readUCPSingle;
|
|
};
|
|
|
|
'\\P{' => {
|
|
assert(!currentCls && !inCharClass);
|
|
currentCls = getComponentClass(mode);
|
|
negated = true;
|
|
fhold;
|
|
fcall readBracedUCP;
|
|
};
|
|
|
|
'\\P' any => {
|
|
assert(!currentCls && !inCharClass);
|
|
currentCls = getComponentClass(mode);
|
|
negated = true;
|
|
fhold;
|
|
fcall readUCPSingle;
|
|
};
|
|
|
|
'\\P' => { throw LocatedParseError("Malformed property"); };
|
|
'\\p' => { throw LocatedParseError("Malformed property"); };
|
|
|
|
# Newline sequence, hairy semantics that we don't do
|
|
'\\R' => {
|
|
ostringstream str;
|
|
str << "\\R at index " << ts - ptr << " not supported.";
|
|
throw ParseError(str.str());
|
|
};
|
|
|
|
# Reset start of match, also hairy semantics that we don't do
|
|
'\\K' => {
|
|
ostringstream str;
|
|
str << "\\K at index " << ts - ptr << " not supported.";
|
|
throw ParseError(str.str());
|
|
};
|
|
|
|
# \k without a backref is bugged in PCRE so we have no
|
|
# idea what our semantics should be on it
|
|
'\\k' => {
|
|
ostringstream str;
|
|
str << "\\k at index " << ts - ptr << " not supported.";
|
|
throw ParseError(str.str());
|
|
};
|
|
|
|
# \G is more hairy pcre-api stuff, DO NOT WANT
|
|
'\\G' => {
|
|
ostringstream str;
|
|
str << "\\G at index " << ts - ptr << " not supported.";
|
|
throw ParseError(str.str());
|
|
};
|
|
|
|
'\\X' => {
|
|
currentSeq->addComponent(std::make_unique<ComponentEUS>(ts - ptr, mode));
|
|
};
|
|
|
|
# Fall through general escaped character
|
|
'\\' any => {
|
|
addLiteral(currentSeq, *(ts + 1), mode);
|
|
};
|
|
|
|
# A backslash with no follower is not allowed
|
|
'\\' => {
|
|
assert(ts + 1 == pe);
|
|
ostringstream str;
|
|
str << "Unescaped \\ at end of input, index " << ts - ptr << ".";
|
|
throw ParseError(str.str());
|
|
};
|
|
|
|
#############################################################
|
|
# Extended patterns
|
|
#############################################################
|
|
|
|
# Comment
|
|
'\(\?\#' => enterComment;
|
|
# Match modifiers
|
|
'\(\?' matchModifiers >resetModifiers ')' => applyModifiers;
|
|
# Non-capturing group, with flag modifiers
|
|
'\(\?' matchModifiers >resetModifiers ':' => enterModifiedGroup;
|
|
# Zero width look ahead assertion
|
|
'\(\?=' => enterZWLookAhead;
|
|
# Zero width negative look ahead assertion
|
|
'\(\?\!' => enterZWNegLookAhead;
|
|
# Zero width look behind assertion
|
|
'\(\?\<=' => enterZWLookBehind;
|
|
# Zero width negative look behind assertion
|
|
'\(\?\<\!' => enterZWNegLookBehind;
|
|
# Code (TOTALLY unsupported... for good reason)
|
|
'\(\?\{' => enterEmbeddedCode;
|
|
'\(\?\?\{' => enterEmbeddedCode;
|
|
# Atomic group
|
|
'\(\?\>' => enterAtomicGroup;
|
|
|
|
# Named capturing groups
|
|
( namedGroup1 |
|
|
namedGroup2 |
|
|
namedGroup3 ) => enterNamedGroup;
|
|
|
|
# named/numbered subroutine references
|
|
numberedSubExpression => enterReferenceUnsupported;
|
|
namedSubExpression => enterReferenceUnsupported;
|
|
|
|
# Conditional reference with a positive lookahead assertion
|
|
'(?(?=' => {
|
|
auto a = std::make_unique<ComponentAssertion>(
|
|
ComponentAssertion::LOOKAHEAD, ComponentAssertion::POS);
|
|
ComponentAssertion *a_seq = a.get();
|
|
PUSH_SEQUENCE;
|
|
currentSeq = enterSequence(currentSeq,
|
|
std::make_unique<ComponentCondReference>(std::move(a)));
|
|
PUSH_SEQUENCE;
|
|
currentSeq = a_seq;
|
|
};
|
|
# Conditional reference with a negative lookahead assertion
|
|
'(?(?!' => {
|
|
auto a = std::make_unique<ComponentAssertion>(
|
|
ComponentAssertion::LOOKAHEAD, ComponentAssertion::NEG);
|
|
ComponentAssertion *a_seq = a.get();
|
|
PUSH_SEQUENCE;
|
|
currentSeq = enterSequence(currentSeq,
|
|
std::make_unique<ComponentCondReference>(std::move(a)));
|
|
PUSH_SEQUENCE;
|
|
currentSeq = a_seq;
|
|
};
|
|
# Conditional reference with a positive lookbehind assertion
|
|
'(?(?<=' => {
|
|
auto a = std::make_unique<ComponentAssertion>(
|
|
ComponentAssertion::LOOKBEHIND, ComponentAssertion::POS);
|
|
ComponentAssertion *a_seq = a.get();
|
|
PUSH_SEQUENCE;
|
|
currentSeq = enterSequence(currentSeq,
|
|
std::make_unique<ComponentCondReference>(std::move(a)));
|
|
PUSH_SEQUENCE;
|
|
currentSeq = a_seq;
|
|
};
|
|
# Conditional reference with a negative lookbehind assertion
|
|
'(?(?<!' => {
|
|
auto a = std::make_unique<ComponentAssertion>(
|
|
ComponentAssertion::LOOKBEHIND, ComponentAssertion::NEG);
|
|
ComponentAssertion *a_seq = a.get();
|
|
PUSH_SEQUENCE;
|
|
currentSeq = enterSequence(currentSeq,
|
|
std::make_unique<ComponentCondReference>(std::move(a)));
|
|
PUSH_SEQUENCE;
|
|
currentSeq = a_seq;
|
|
};
|
|
|
|
# Recursive conditional references (unsupported)
|
|
'(?(R' ( [0-9]+ | ('&' [A-Za-z0-9_]+) ) ? ')' => {
|
|
throw LocatedParseError("Pattern recursion not supported");
|
|
};
|
|
|
|
# Conditional references
|
|
# numbered
|
|
'\(\?\(' (backRefIdSingle | backRefId) ')' => enterNumberedConditionalRef;
|
|
# named
|
|
( namedConditionalRef1 |
|
|
namedConditionalRef2 |
|
|
namedConditionalRef3 ) => enterNamedConditionalRef;
|
|
|
|
# Conditions (unsupported)
|
|
'\(\?\(' => enterConditionUnsupported;
|
|
|
|
# Callouts (unsupported)
|
|
'\(\?C' [0-9]* '\)' => {
|
|
ostringstream str;
|
|
str << "Callout at index " << ts - ptr << " not supported.";
|
|
throw ParseError(str.str());
|
|
};
|
|
|
|
# Any other char after '(?' is a pattern modifier we don't
|
|
# recognise.
|
|
'\(\?' any => {
|
|
throw LocatedParseError("Unrecognised character after (?");
|
|
};
|
|
|
|
#unicode chars
|
|
utf8_2c when is_utf8 => {
|
|
assert(mode.utf8);
|
|
/* leverage ComponentClass to generate the vertices */
|
|
auto cc = getComponentClass(mode);
|
|
cc->add(readUtf8CodePoint2c(ts));
|
|
cc->finalize();
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
|
|
utf8_3c when is_utf8 => {
|
|
assert(mode.utf8);
|
|
/* leverage ComponentClass to generate the vertices */
|
|
auto cc = getComponentClass(mode);
|
|
cc->add(readUtf8CodePoint3c(ts));
|
|
cc->finalize();
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
|
|
utf8_4c when is_utf8 => {
|
|
assert(mode.utf8);
|
|
/* leverage ComponentClass to generate the vertices */
|
|
auto cc = getComponentClass(mode);
|
|
cc->add(readUtf8CodePoint4c(ts));
|
|
cc->finalize();
|
|
currentSeq->addComponent(std::move(cc));
|
|
};
|
|
|
|
hi_byte when is_utf8 => {
|
|
assert(mode.utf8);
|
|
throwInvalidUtf8();
|
|
};
|
|
|
|
#############################################################
|
|
# Literal character
|
|
#############################################################
|
|
# literal character
|
|
whitespace => {
|
|
if (mode.ignore_space == false) {
|
|
addLiteral(currentSeq, *ts, mode);
|
|
}
|
|
};
|
|
any => {
|
|
addLiteral(currentSeq, *ts, mode);
|
|
};
|
|
*|;
|
|
|
|
prepush {
    // Runs ahead of each push onto the call stack used by fcall. When
    // top has reached the current size of the vector there is no free
    // slot left, so the vector is grown first.
    DEBUG_PRINTF("stack %zu top %d\n", stack.size(), top);
    const bool stackFull = ((int)stack.size() == top);
    if (stackFull) {
        stack.resize(2 * (top + 1));
    }
}
|
|
}%%
|
|
|
|
%% write data nofinal;
|
|
|
|
/**
 * \brief Main parser call, returns the root Component of the parsed pattern.
 *
 * Leading control verbs are consumed first (which may set flags in
 * \p globalMode), then the Ragel machine is run over the rest of the pattern.
 * Every failure path in this function throws a ParseError (or a
 * LocatedParseError) rather than returning a null pointer.
 *
 * \param ptr NUL-terminated pattern string; must not be null.
 * \param globalMode Parse mode flags. Handed to read_control_verbs() by
 *        reference and then copied as the mode the machine starts in.
 * \return The root sequence of the parsed pattern.
 */
unique_ptr<Component> parse(const char *ptr, ParseMode &globalMode) {
    assert(ptr);

    const char *p = ptr;
    const char *pe = ptr + strlen(ptr);

    // First, read the control verbs, set any global mode flags and move the
    // ptr forward.
    p = read_control_verbs(p, pe, 0, globalMode);

    // Ragel machine state. "write init" below assigns these again; they are
    // given defined values here as well because ts is read in the exception
    // handler at the bottom of this function.
    const char *eof = pe;
    int cs = 0;
    UNUSED int act = 0;
    int top = 0;
    vector<int> stack;
    const char *ts = nullptr;
    const char *te = nullptr;
    unichar accumulator = 0;
    unichar octAccumulator = 0; /* required as we are also accumulating for
                                 * back ref when looking for octals */
    unsigned repeatN = 0;
    unsigned repeatM = 0;
    string label;

    ParseMode mode = globalMode;
    ParseMode newMode;

    bool negated = false;
    bool inComment = false;

    // Stack of sequences and flags used to store state when we enter
    // sub-sequences.
    vector<ExprState> sequences;

    // Index of the next capturing group. Note that zero is reserved for the
    // root sequence.
    unsigned groupIndex = 1;

    // Set storing group names that are currently in use.
    flat_set<string> groupNames;

    // Root sequence.
    unique_ptr<ComponentSequence> rootSeq = std::make_unique<ComponentSequence>();
    rootSeq->setCaptureIndex(0);

    // Current sequence being appended to
    ComponentSequence *currentSeq = rootSeq.get();

    // The current character class being appended to. This is used as the
    // accumulator for both character class and UCP properties.
    unique_ptr<ComponentClass> currentCls;

    // True if the machine is currently inside a character class, i.e. square
    // brackets [..].
    bool inCharClass = false;

    // True if the machine is inside a character class but it has not processed
    // any "real" elements yet, i.e. it's still processing meta-characters like
    // '^'.
    bool inCharClassEarly = false;

    // Location at which the current character class began.
    const char *currentClsBegin = p;

    // We throw exceptions on various parsing failures beyond this point. The
    // try/catch block exists so that a LocatedParseError can be stamped with
    // the offset of the token being scanned before it is re-thrown to the
    // caller.
    try {
        // Embed the Ragel machine here
        %% write init;
        %% write exec;

        if (p != pe && *p != '\0') {
            // didn't make it to the end of our input, but we didn't throw a ParseError?
            assert(0);
            ostringstream str;
            str << "Parse error at index " << (p - ptr) << ".";
            throw ParseError(str.str());
        }

        // A class still under construction at end of input was never closed.
        if (currentCls) {
            assert(inCharClass);
            assert(currentClsBegin);
            ostringstream oss;
            oss << "Unterminated character class starting at index "
                << currentClsBegin - ptr << ".";
            throw ParseError(oss.str());
        }

        if (inComment) {
            throw ParseError("Unterminated comment.");
        }

        // Any saved sequence state left over means a group was never closed.
        if (!sequences.empty()) {
            ostringstream str;
            str << "Missing close parenthesis for group started at index "
                << sequences.back().seqOffset << ".";
            throw ParseError(str.str());
        }

        // Unlikely, but possible
        if (groupIndex > 65535) {
            throw ParseError("The maximum number of capturing subexpressions is 65535.");
        }

        // Finalize the top-level sequence, which will take care of any
        // top-level alternation.
        currentSeq->finalize();
        assert(currentSeq == rootSeq.get());

        // Ensure that all references are valid.
        checkReferences(*rootSeq, groupIndex, groupNames);

        return rootSeq;
    } catch (LocatedParseError &error) {
        // Report the start of the current token when it lies inside the
        // pattern; otherwise fall back to offset zero.
        if (ts && ts >= ptr && ts <= pe) {
            error.locate(ts - ptr);
        } else {
            error.locate(0);
        }
        throw;
    }
}
|
|
|
|
} // namespace ue2
|