vectorscan/src/parser/Parser.rl
2023-10-03 20:24:39 +03:00

2039 lines
79 KiB
Ragel

/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Parser code (generated with Ragel from Parser.rl).
*/
#include "config.h"
/* Parser.cpp is a built source, may not be in same dir as parser files */
#include "parser/check_refs.h"
#include "parser/control_verbs.h"
#include "parser/ComponentAlternation.h"
#include "parser/ComponentAssertion.h"
#include "parser/ComponentAtomicGroup.h"
#include "parser/ComponentBackReference.h"
#include "parser/ComponentBoundary.h"
#include "parser/ComponentByte.h"
#include "parser/ComponentClass.h"
#include "parser/ComponentCondReference.h"
#include "parser/ComponentEmpty.h"
#include "parser/ComponentEUS.h"
#include "parser/Component.h"
#include "parser/ComponentRepeat.h"
#include "parser/ComponentSequence.h"
#include "parser/ComponentWordBoundary.h"
#include "parser/parse_error.h"
#include "parser/Parser.h"
#include "ue2common.h"
#include "util/compare.h"
#include "util/flat_containers.h"
#include "util/unicode_def.h"
#include "util/verify_types.h"
#include <cassert>
#include <cctype>
#include <cstring>
#include <cstdlib>
#include <map>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
namespace ue2 {
/** \brief Saves the current parse context on the 'sequences' stack.
 *
 * Expands to a push_back of an ExprState built from the \c currentSeq,
 * \c ts, \c ptr and \c mode variables, all of which (together with
 * \c sequences) must be visible at the expansion site. The value (ts - ptr)
 * is stored as the offset at which the sequence was entered. */
#define PUSH_SEQUENCE do {\
sequences.push_back(ExprState(currentSeq, (size_t)(ts - ptr), \
mode)); \
} while(0)
/** \brief Restores the parse context most recently saved by PUSH_SEQUENCE.
 *
 * Assigns \c currentSeq and \c mode from the last element of \c sequences
 * and then removes that element. The macro itself does not check that
 * \c sequences is non-empty; the stored offset is not read here. */
#define POP_SEQUENCE do {\
currentSeq = sequences.back().seq; \
mode = sequences.back().mode; \
sequences.pop_back(); \
} while(0)
namespace {

/**
 * \brief Snapshot of the parser context taken when a nested sequence is
 * entered.
 *
 * Instances live in the 'sequences' vector (see PUSH_SEQUENCE/POP_SEQUENCE)
 * and record which sequence was current, where in the pattern it was entered
 * and which mode flags were in force.
 */
struct ExprState {
    ComponentSequence *seq; //!< current sequence
    size_t seqOffset; //!< offset seq was entered, for error reporting
    ParseMode mode; //!< current mode flags

    /** \brief Captures the given sequence pointer, offset and mode flags. */
    ExprState(ComponentSequence *seq_in, size_t offset,
              const ParseMode &mode_in)
        : seq(seq_in),
          seqOffset(offset),
          mode(mode_in) {}
};

} // namespace
/**
 * \brief Reinterprets a number that was accumulated as octal as though its
 * digits had been decimal.
 *
 * Each 3-bit group of \p oct is treated as one digit, least significant
 * first, and weighted by successive powers of ten; e.g. 0123 (octal) yields
 * 123.
 *
 * \param oct value whose octal digits are to be re-read as decimal digits
 * \return the decimal reinterpretation of those digits
 */
static
unsigned parseAsDecimal(unsigned oct) {
    unsigned decimal = 0;
    for (unsigned place = 1; oct != 0; oct /= 8, place *= 10) {
        decimal += (oct % 8) * place;
    }
    return decimal;
}
/** \brief Maximum value for a positive integer. We use INT_MAX, as that's what
 * PCRE uses.
 *
 * pushDec() and pushOct() throw a LocatedParseError rather than let an
 * accumulated number grow beyond this bound. */
static constexpr u32 MAX_NUMBER = INT_MAX;
/**
 * \brief Appends one decimal digit to the number held in \p acc.
 *
 * \param acc       running value; replaced by (*acc * 10 + digit) on success
 * \param raw_digit ASCII character in '0'..'9' (asserted)
 * \throws LocatedParseError if the new value would exceed MAX_NUMBER
 */
static
void pushDec(u32 *acc, char raw_digit) {
    assert(raw_digit >= '0' && raw_digit <= '9');

    // Do the arithmetic in 64 bits so that the range check sees the true
    // value rather than a wrapped one.
    const u64a widened = (u64a)*acc * 10 + (u32)(raw_digit - '0');
    if (widened > MAX_NUMBER) {
        throw LocatedParseError("Number is too big");
    }

    *acc = verify_u32(widened);
}
/**
 * \brief Appends one octal digit to the number held in \p acc.
 *
 * \param acc       running value; replaced by (*acc * 8 + digit) on success
 * \param raw_digit ASCII character in '0'..'7' (asserted)
 * \throws LocatedParseError if the new value would exceed MAX_NUMBER
 */
static
void pushOct(u32 *acc, char raw_digit) {
    assert(raw_digit >= '0' && raw_digit <= '7');

    // Do the arithmetic in 64 bits so that the range check sees the true
    // value rather than a wrapped one.
    const u64a widened = (u64a)*acc * 8 + (u32)(raw_digit - '0');
    if (widened > MAX_NUMBER) {
        throw LocatedParseError("Number is too big");
    }

    *acc = verify_u32(widened);
}
/** \brief Unconditionally throws a LocatedParseError with the message
 * "Invalid repeat". Never returns normally. */
static
void throwInvalidRepeat(void) {
throw LocatedParseError("Invalid repeat");
}
/** \brief Unconditionally throws a ParseError reporting that the expression
 * is not valid UTF-8. Never returns normally.
 *
 * Note that this raises a plain ParseError, not a LocatedParseError. */
static
void throwInvalidUtf8(void) {
throw ParseError("Expression is not valid UTF-8.");
}
/**
 * Hands ownership of \p child to \p parent (via addComponent) and returns a
 * non-owning pointer to that child so the caller can make it the new
 * "current sequence".
 *
 * Both arguments must be non-null (asserted).
 */
static
ComponentSequence *enterSequence(ComponentSequence *parent,
                                 unique_ptr<ComponentSequence> child) {
    assert(parent);
    assert(child);

    // Grab the raw pointer before the unique_ptr is moved from.
    ComponentSequence *const entered = child.get();
    parent->addComponent(std::move(child));
    return entered;
}
/**
 * \brief Appends the literal character \p c to \p currentSeq.
 *
 * When both UTF-8 and caseless modes are set, the literal is expressed as a
 * one-member ComponentClass built from \p mode; in every other case the
 * component comes from getLiteralComponentClass().
 */
static
void addLiteral(ComponentSequence *currentSeq, char c, const ParseMode &mode) {
    const bool use_class = mode.utf8 && mode.caseless;
    if (!use_class) {
        currentSeq->addComponent(getLiteralComponentClass(c, mode.caseless));
        return;
    }

    /* leverage ComponentClass to generate the vertices */
    auto cls = getComponentClass(mode);
    assert(cls);
    cls->add(c);
    cls->finalize();
    currentSeq->addComponent(std::move(cls));
}
/**
 * \brief Appends the character with value \p accum, produced by a numeric
 * escape, to \p currentSeq.
 *
 * Outside UTF-8 mode the value must fit in a byte and is added through
 * addLiteral(); in UTF-8 mode it is added as a one-member ComponentClass
 * built from \p mode.
 *
 * \param err_msg text of the LocatedParseError thrown when \p accum is
 *                greater than 255 and UTF-8 mode is off
 */
static
void addEscaped(ComponentSequence *currentSeq, unichar accum,
                const ParseMode &mode, const char *err_msg) {
    if (!mode.utf8) {
        if (accum > 255) {
            throw LocatedParseError(err_msg);
        }
        addLiteral(currentSeq, (char)accum, mode);
        return;
    }

    /* leverage ComponentClass to generate the vertices */
    auto cls = getComponentClass(mode);
    assert(cls);
    cls->add(accum);
    cls->finalize();
    currentSeq->addComponent(std::move(cls));
}
/**
 * \brief Adds the character denoted by an octal escape to \p currentSeq.
 *
 * Forwards to addEscaped(), supplying the octal-specific error text that is
 * used when \p accum does not fit in a byte outside UTF-8 mode.
 */
static
void addEscapedOctal(ComponentSequence *currentSeq, unichar accum,
                     const ParseMode &mode) {
    const char *const too_large = "Octal value is greater than \\377";
    addEscaped(currentSeq, accum, mode, too_large);
}
/**
 * \brief Adds the character denoted by a hexadecimal escape to
 * \p currentSeq.
 *
 * Forwards to addEscaped(), supplying the hex-specific error text that is
 * used when \p accum does not fit in a byte outside UTF-8 mode.
 */
static
void addEscapedHex(ComponentSequence *currentSeq, unichar accum,
                   const ParseMode &mode) {
    const char *const too_large = "Hexadecimal value is greater than \\xFF";
    addEscaped(currentSeq, accum, mode, too_large);
}
#define SLASH_C_ERROR "\\c must be followed by an ASCII character"

/**
 * \brief Computes the character denoted by a \\c escape from the character
 * that follows it.
 *
 * The input is passed through mytoupper() and then has bit 0x40 flipped.
 *
 * \param raw the character following "\\c"
 * \throws LocatedParseError (SLASH_C_ERROR) if \p raw has its top bit set
 */
static
u8 decodeCtrl(char raw) {
    const bool top_bit_set = (raw & 0x80) != 0;
    if (top_bit_set) {
        throw LocatedParseError(SLASH_C_ERROR);
    }
    return mytoupper(raw) ^ 0x40;
}
/**
 * \brief Decodes the two-byte UTF-8 sequence starting at \p s.
 *
 * The lead byte must be in [0xc0, 0xe0) and the second byte in [0x80, 0xc0);
 * both are asserted, not validated at runtime. The result is the low five
 * bits of the lead byte followed by the low six bits of the second byte.
 */
static
unichar readUtf8CodePoint2c(const char *s) {
    const u8 *bytes = (const u8 *)s;
    assert(bytes[0] >= 0xc0 && bytes[0] < 0xe0);
    assert(bytes[1] >= 0x80 && bytes[1] < 0xc0);

    const unichar lead = bytes[0] & 0x1f;
    const unichar tail = bytes[1] & 0x3f;
    const unichar val = (lead << 6) | tail;

    DEBUG_PRINTF("utf8 %02hhx %02hhx ->\\x{%x}\n", bytes[0],
                 bytes[1], val);
    return val;
}
/**
 * \brief Decodes the three-byte UTF-8 sequence starting at \p s.
 *
 * The lead byte must be in [0xe0, 0xf0) and both following bytes in
 * [0x80, 0xc0); all are asserted, not validated at runtime. The result is
 * the low four bits of the lead byte followed by the low six bits of each
 * following byte.
 */
static
unichar readUtf8CodePoint3c(const char *s) {
    const u8 *bytes = (const u8 *)s;
    assert(bytes[0] >= 0xe0 && bytes[0] < 0xf0);
    assert(bytes[1] >= 0x80 && bytes[1] < 0xc0);
    assert(bytes[2] >= 0x80 && bytes[2] < 0xc0);

    const unichar lead = bytes[0] & 0x0f;
    const unichar mid = bytes[1] & 0x3f;
    const unichar tail = bytes[2] & 0x3f;
    const unichar val = (((lead << 6) | mid) << 6) | tail;

    DEBUG_PRINTF("utf8 %02hhx %02hhx %02hhx ->\\x{%x}\n", bytes[0],
                 bytes[1], bytes[2], val);
    return val;
}
/**
 * \brief Decodes the four-byte UTF-8 sequence starting at \p s.
 *
 * The lead byte must be in [0xf0, 0xf8) and the three following bytes in
 * [0x80, 0xc0); all are asserted, not validated at runtime. The result is
 * the low three bits of the lead byte followed by the low six bits of each
 * following byte.
 */
static
unichar readUtf8CodePoint4c(const char *s) {
    const u8 *bytes = (const u8 *)s;
    assert(bytes[0] >= 0xf0 && bytes[0] < 0xf8);
    assert(bytes[1] >= 0x80 && bytes[1] < 0xc0);
    assert(bytes[2] >= 0x80 && bytes[2] < 0xc0);
    assert(bytes[3] >= 0x80 && bytes[3] < 0xc0);

    const unichar lead = bytes[0] & 0x07;
    const unichar second = bytes[1] & 0x3f;
    const unichar third = bytes[2] & 0x3f;
    const unichar tail = bytes[3] & 0x3f;
    const unichar val = (((((lead << 6) | second) << 6) | third) << 6) | tail;

    DEBUG_PRINTF("utf8 %02hhx %02hhx %02hhx %02hhx ->\\x{%x}\n", bytes[0],
                 bytes[1], bytes[2], bytes[3], val);
    return val;
}
%%{
machine regex;
alphtype unsigned char;
# Error action for escapes that are rejected inside a character class.
# The message names the character after the backslash (ts + 1) and the
# offset of the token from ptr (ts - ptr); it is thrown as a ParseError.
action throwUnsupportedEscape {
ostringstream str;
str << "'\\" << *(ts + 1) << "' at index " << ts - ptr
<< " not supported in a character class.";
throw ParseError(str.str());
}
# Error action: always throws a LocatedParseError reporting an unsupported
# character property.
action unsupportedProperty {
throw LocatedParseError("Character property not supported");
}
# Reset / append helpers for the 'label' string (group and reference names)
# and for the two numeric accumulators. 'fc' is the character currently
# being processed by the machine.
action clearLabel { label.clear();}
action appendLabelCharacter { label.push_back(fc);}
action clearOctAccumulator { octAccumulator = 0;}
action clearAccumulator { accumulator = 0;}
# Start a fresh octal number whose first digit is the current character.
action setOctAccumulator {
octAccumulator = 0;
pushOct(&octAccumulator, fc);
}
# Start a fresh decimal number whose first digit is the current character.
action setDecAccumulator {
accumulator = 0;
pushDec(&accumulator, fc);
}
# Repeat-bound and accumulator digit helpers. All digit appends go through
# pushDec/pushOct, which throw "Number is too big" once the value would
# exceed MAX_NUMBER.
action clearNM { repeatN = 0; repeatM = 0; }
action appendN { pushDec(&repeatN, fc); }
action appendM { pushDec(&repeatM, fc); }
action appendAccumulatorOctDigit { pushOct(&octAccumulator, fc); }
action appendAccumulatorDecDigit { pushDec(&accumulator, fc); }
# Append one hexadecimal digit to 'accumulator'. There is one action per
# digit class ([0-9], [a-f], [A-F]) so each can subtract the right base
# character. Unlike pushDec/pushOct these perform no range check; the
# escapedHex definition that uses them accepts at most two digits.
action appendAccumulatorHexDigit {
accumulator *= 16;
accumulator += fc - '0';
}
action appendAccumulatorHexL {
accumulator *= 16;
accumulator += 10 + fc - 'a';
}
action appendAccumulatorHexU {
accumulator *= 16;
accumulator += 10 + fc - 'A';
}
# enter a comment group, where we just scan for a close paren.
# Sets the inComment flag and transfers control to the readComment machine.
action enterComment {
inComment = true;
fgoto readComment;
}
# enter an extended mode comment, where we just scan for a newline.
# Sets the inComment flag and transfers control to the
# readNewlineTerminatedComment machine.
action enterNewlineTerminatedComment {
inComment = true;
fgoto readNewlineTerminatedComment;
}
# enter a CAPTURING group ( e.g. '(blah)' )
# The current context is saved with PUSH_SEQUENCE; a new ComponentSequence
# is given the next capture index (groupIndex is post-incremented) and
# becomes the current sequence.
action enterCapturingGroup {
PUSH_SEQUENCE;
auto seq = std::make_unique<ComponentSequence>();
seq->setCaptureIndex(groupIndex++);
currentSeq = enterSequence(currentSeq, std::move(seq));
}
# enter a NAMED CAPTURING group, e.g. (?<hatstand>blah), (?'hatstand'blah)
# or (?P<hatstand>blah); see namedGroup1..3. The name has been collected
# into 'label' by the time this action runs.
# Rejects names that start with a digit and names already present in
# groupNames; otherwise behaves like enterCapturingGroup and additionally
# records the name on the new sequence.
action enterNamedGroup {
assert(!label.empty()); // should be guaranteed by machine
char c = *label.begin();
if (c >= '0' && c <= '9') {
throw LocatedParseError("Group name cannot begin with a digit");
}
if (!groupNames.insert(label).second) {
throw LocatedParseError("Two named subpatterns use the name '" + label + "'");
}
PUSH_SEQUENCE;
auto seq = std::make_unique<ComponentSequence>();
seq->setCaptureIndex(groupIndex++);
seq->setCaptureName(label);
currentSeq = enterSequence(currentSeq, std::move(seq));
}
# enter a NON-CAPTURING group where we're modifying flags
# ( e.g. '(?i:blah)' ). Standard non-capturing groups use this path
# as well.
# The outer mode is saved by PUSH_SEQUENCE before 'newMode' is installed, so
# the flag change lasts only until the matching exitGroup restores it.
action enterModifiedGroup {
PUSH_SEQUENCE;
mode = newMode;
currentSeq =
enterSequence(currentSeq, std::make_unique<ComponentSequence>());
}
# leave the current group: finalize its sequence and restore the sequence
# and mode saved when the group was entered. An empty 'sequences' stack
# means there is no open group to close, which is reported as an error.
action exitGroup {
if (sequences.empty()) {
throw LocatedParseError("Unmatched parentheses");
}
currentSeq->finalize();
POP_SEQUENCE;
}
# enter a zero-width assertion group. The four actions differ only in the
# direction (LOOKAHEAD / LOOKBEHIND) and sense (POS / NEG) passed to
# ComponentAssertion; each saves the current context with PUSH_SEQUENCE and
# makes the new assertion the current sequence.
action enterZWLookAhead {
PUSH_SEQUENCE;
currentSeq = enterSequence(currentSeq,
std::make_unique<ComponentAssertion>(ComponentAssertion::LOOKAHEAD,
ComponentAssertion::POS));
}
action enterZWNegLookAhead {
PUSH_SEQUENCE;
currentSeq = enterSequence(currentSeq,
std::make_unique<ComponentAssertion>(ComponentAssertion::LOOKAHEAD,
ComponentAssertion::NEG));
}
action enterZWLookBehind {
PUSH_SEQUENCE;
currentSeq = enterSequence(currentSeq,
std::make_unique<ComponentAssertion>(ComponentAssertion::LOOKBEHIND,
ComponentAssertion::POS));
}
action enterZWNegLookBehind {
PUSH_SEQUENCE;
currentSeq = enterSequence(currentSeq,
std::make_unique<ComponentAssertion>(ComponentAssertion::LOOKBEHIND,
ComponentAssertion::NEG));
}
# Constructs that are recognised only so they can be rejected with a
# specific message: each action throws a LocatedParseError.
action enterEmbeddedCode {
throw LocatedParseError("Embedded code is not supported");
}
action enterConditionUnsupported {
throw LocatedParseError("Conditional subpattern unsupported");
}
action enterReferenceUnsupported {
throw LocatedParseError("Subpattern reference unsupported");
}
# enter a conditional group whose condition is a numbered group reference;
# the number has been collected in 'accumulator' and must be non-zero.
action enterNumberedConditionalRef {
if (accumulator == 0) {
throw LocatedParseError("Numbered reference cannot be zero");
}
PUSH_SEQUENCE;
currentSeq = enterSequence(currentSeq,
std::make_unique<ComponentCondReference>(accumulator));
}
# enter a conditional group whose condition is a named group reference;
# the name has been collected in 'label' (see namedConditionalRef1..3).
action enterNamedConditionalRef {
PUSH_SEQUENCE;
assert(!label.empty());
currentSeq = enterSequence(currentSeq,
std::make_unique<ComponentCondReference>(label));
}
# enter an atomic group: save the current context and make a new
# ComponentAtomicGroup the current sequence.
action enterAtomicGroup {
PUSH_SEQUENCE;
currentSeq = enterSequence(currentSeq,
std::make_unique<ComponentAtomicGroup>());
}
# begin a character class: create a fresh ComponentClass for the current
# mode, set the in-class flags, remember where the class token started
# (currentClsBegin) and hand control to the readClass machine. Classes do
# not nest, hence the assertions.
action eatClass {
assert(!currentCls);
assert(!inCharClass); // not reentrant
currentCls = getComponentClass(mode);
inCharClass = true;
inCharClassEarly = true;
currentClsBegin = ts;
fgoto readClass;
}
# Mode-flag staging. resetModifiers copies the live mode into 'newMode' so
# that modifyMatchPositive/modifyMatchNegative can edit the copy;
# applyModifiers then makes the edited copy the live mode and appends a
# ComponentEmpty to the current sequence.
action resetModifiers {
newMode = mode;
}
action applyModifiers {
mode = newMode;
currentSeq->addComponent(std::make_unique<ComponentEmpty>());
}
# Switch ON the flag in 'newMode' named by the current character:
# i = caseless, m = multiline, s = dotall, x = ignore_space.
action modifyMatchPositive {
switch (fc) {
case 'i':
newMode.caseless = true;
break;
case 'm':
newMode.multiline = true;
break;
case 's':
newMode.dotall = true;
break;
case 'x':
newMode.ignore_space = true;
break;
default:
assert(0); // this action only called for [imsx]
break;
}
}
# Switch OFF the flag in 'newMode' named by the current character:
# i = caseless, m = multiline, s = dotall, x = ignore_space.
action modifyMatchNegative {
switch (fc) {
case 'i':
newMode.caseless = false;
break;
case 'm':
newMode.multiline = false;
break;
case 's':
newMode.dotall = false;
break;
case 'x':
newMode.ignore_space = false;
break;
default:
assert(0); // this action only called for [imsx]
break;
}
}
# Boolean expressions over parser state rather than statement blocks;
# is_utf8 is attached to patterns with 'when' (e.g. "utf8_2c when is_utf8")
# so that those patterns only match while the flag is set.
action is_utf8 { mode.utf8 }
action is_ignore_space { mode.ignore_space }
action is_early_charclass { inCharClassEarly }
# add a back-reference to the group whose number is in 'accumulator';
# zero is rejected.
action addNumberedBackRef {
if (accumulator == 0) {
throw LocatedParseError("Numbered reference cannot be zero");
}
currentSeq->addComponent(std::make_unique<ComponentBackReference>(accumulator));
}
# add a relative back-reference: 'accumulator' counts backwards from
# groupIndex, so the target group is (groupIndex - accumulator). An offset
# of zero, or one that is not smaller than groupIndex, is rejected.
action addNegativeNumberedBackRef {
// Accumulator is a negative offset.
if (accumulator == 0) {
throw LocatedParseError("Numbered reference cannot be zero");
}
if (accumulator >= groupIndex) {
throw LocatedParseError("Invalid reference");
}
unsigned idx = groupIndex - accumulator;
currentSeq->addComponent(std::make_unique<ComponentBackReference>(idx));
}
# add a back-reference to the group whose name is in 'label'.
action addNamedBackRef {
currentSeq->addComponent(std::make_unique<ComponentBackReference>(label));
}
# Octal escapes. escapedOctal0 is '\0' plus up to two more octal digits;
# escapedOctal2 needs a leading [1-7] and one or two further digits, while
# escapedOctal2c also accepts the single-digit form. The value is built in
# octAccumulator.
escapedOctal0 = '\\0' @clearOctAccumulator [0-7]{0,2} $appendAccumulatorOctDigit;
escapedOctal2 = '\\' [1-7] $setOctAccumulator [0-7]{1,2} $appendAccumulatorOctDigit;
escapedOctal2c = '\\' [1-7] $setOctAccumulator [0-7]{0,2} $appendAccumulatorOctDigit;
# Decimal group numbers (single digit, or two and more digits), built in
# accumulator.
backRefIdSingle = [1-7] $setDecAccumulator;
backRefId = [1-9] $setDecAccumulator [0-9]+ $appendAccumulatorDecDigit;
# '\x' followed by zero, one or two hex digits, built in accumulator.
escapedHex = '\\x' @clearAccumulator ([0-9] $appendAccumulatorHexDigit | [a-f] $appendAccumulatorHexL | [A-F] $appendAccumulatorHexU){0,2};
# '\c' with an optional following character; a missing character has to be
# detected by the action that uses this definition.
escapedCtrl = '\\c' any?;
escapedUnsupported = '\\' [NluLU];
# Bounded repeat: {N}, {N,} or {N,M}. repeatM is set to repeatN for the
# exact form and to ComponentRepeat::NoLimit for the open-ended form.
repeatNM1 = '\{' @clearNM [0-9]+ $appendN ('}' @{repeatM = repeatN;} | ',' '\}' @{repeatM = ComponentRepeat::NoLimit;} | ',' [0-9]+ $appendM '}');
# Numbered \g back-references (absolute and relative, bare and braced);
# at most three digits are accepted.
backReferenceG = '\\g' @clearAccumulator [0-9]{1,3} $appendAccumulatorDecDigit;
backReferenceGNegative = '\\g-' @clearAccumulator [0-9]{1,3} $appendAccumulatorDecDigit;
backReferenceGBracket = '\\g{' @clearAccumulator [0-9]{1,3} $appendAccumulatorDecDigit '}';
backReferenceGBracket2 = '\\g{-' @clearAccumulator [0-9]{1,3} $appendAccumulatorDecDigit '}';
# Named back-references; the name is collected into 'label'.
backReferenceGBracketName = '\\g{' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '}';
backReferenceKBracketName = '\\k{' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '}';
backReferenceKBracketName2 = '\\k<' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '>';
backReferenceKBracketName3 = '\\k\'' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '\'';
backReferenceP = '(?P=' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter ')';
# Openers of named capturing groups; the name is collected into 'label'.
namedGroup1 = '(?<' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '>';
namedGroup2 = '(?\'' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '\'';
namedGroup3 = '(?P<' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '>';
# Openers of conditional groups that test a named group.
namedConditionalRef1 = '(?(<' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '>)';
namedConditionalRef2 = '(?(\'' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '\')';
namedConditionalRef3 = '(?(' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter ')';
# Subpattern calls by number or by name; no value is captured here.
numberedSubExpression = '(?' [+\-]? [0-9]+ ')';
namedSubExpression = '(?' ('&'|'P>') [A-Za-z0-9_]+ ')';
# Inline flag letters, optionally followed by '-' and the letters to clear.
positiveMatchModifiers = [imsx]+ $modifyMatchPositive;
negativeMatchModifiers = '-' [imsx]+ $modifyMatchNegative;
matchModifiers = positiveMatchModifiers ? negativeMatchModifiers ?;
# Byte-level shapes of multi-byte UTF-8 sequences (lead byte plus one to
# three continuation bytes), and any byte with the top bit set.
utf8_cont = 0x80..0xbf;
utf8_2c = 0xc0..0xdf utf8_cont;
utf8_3c = 0xe0..0xef utf8_cont utf8_cont;
utf8_4c = 0xf0..0xf7 utf8_cont utf8_cont utf8_cont;
hi_byte = 0x80..0xff;
whitespace = [\t\n\v\f\r ];
#############################################################
# Trivial parser to read Perl 5.10+ control verbs, introduced
# by '(*'.
#############################################################
# Every alternative below ends in a throw. The three named verbs get a
# dedicated "must be at start of expression" message; anything else that
# runs up to a ')' is passed to read_control_verbs() so that it can produce
# the error.
readVerb := |*
'UTF8)' => {
throw LocatedParseError("(*UTF8) must be at start of "
"expression, encountered");
};
'UTF)' => {
throw LocatedParseError("(*UTF) must be at start of "
"expression, encountered");
};
'UCP)' => {
throw LocatedParseError("(*UCP) must be at start of "
"expression, encountered");
};
# Use the control verb mini-parser to report an error for this
# unsupported/unknown verb.
[^)]+ ')' => {
ParseMode temp_mode;
assert(ts - 2 >= ptr); // parser needs the '(*' at the start too.
read_control_verbs(ts - 2, te, (ts - 2 - ptr), temp_mode);
assert(0); // Should have thrown a parse error.
throw LocatedParseError("Unknown control verb");
};
any => {
throw LocatedParseError("Unknown control verb");
};
*|;
#############################################################
# Parser to read UCP
#############################################################
# Matches one Unicode property name (general category, PCRE special
# category or script name). Each alternative adds the corresponding class
# to currentCls, passing the current 'negated' flag, and returns to the
# calling machine with fret. Any other input is an "Unknown property".
readUCP := |*
'C' => { currentCls->add(CLASS_UCP_C, negated); fret; };
'Cc' => { currentCls->add(CLASS_UCP_CC, negated); fret; };
'Cf' => { currentCls->add(CLASS_UCP_CF, negated); fret; };
'Cn' => { currentCls->add(CLASS_UCP_CN, negated); fret; };
'Co' => { currentCls->add(CLASS_UCP_CO, negated); fret; };
'Cs' => { currentCls->add(CLASS_UCP_CS, negated); fret; };
'L' => { currentCls->add(CLASS_UCP_L, negated); fret; };
'Ll' => { currentCls->add(CLASS_UCP_LL, negated); fret; };
'Lm' => { currentCls->add(CLASS_UCP_LM, negated); fret; };
'Lo' => { currentCls->add(CLASS_UCP_LO, negated); fret; };
'Lt' => { currentCls->add(CLASS_UCP_LT, negated); fret; };
'Lu' => { currentCls->add(CLASS_UCP_LU, negated); fret; };
'L&' => { currentCls->add(CLASS_UCP_L_AND, negated); fret; };
'M' => { currentCls->add(CLASS_UCP_M, negated); fret; };
'Mc' => { currentCls->add(CLASS_UCP_MC, negated); fret; };
'Me' => { currentCls->add(CLASS_UCP_ME, negated); fret; };
'Mn' => { currentCls->add(CLASS_UCP_MN, negated); fret; };
'N' => { currentCls->add(CLASS_UCP_N, negated); fret; };
'Nd' => { currentCls->add(CLASS_UCP_ND, negated); fret; };
'Nl' => { currentCls->add(CLASS_UCP_NL, negated); fret; };
'No' => { currentCls->add(CLASS_UCP_NO, negated); fret; };
'P' => { currentCls->add(CLASS_UCP_P, negated); fret; };
'Pc' => { currentCls->add(CLASS_UCP_PC, negated); fret; };
'Pd' => { currentCls->add(CLASS_UCP_PD, negated); fret; };
'Pe' => { currentCls->add(CLASS_UCP_PE, negated); fret; };
'Pf' => { currentCls->add(CLASS_UCP_PF, negated); fret; };
'Pi' => { currentCls->add(CLASS_UCP_PI, negated); fret; };
'Po' => { currentCls->add(CLASS_UCP_PO, negated); fret; };
'Ps' => { currentCls->add(CLASS_UCP_PS, negated); fret; };
'S' => { currentCls->add(CLASS_UCP_S, negated); fret; };
'Sc' => { currentCls->add(CLASS_UCP_SC, negated); fret; };
'Sk' => { currentCls->add(CLASS_UCP_SK, negated); fret; };
'Sm' => { currentCls->add(CLASS_UCP_SM, negated); fret; };
'So' => { currentCls->add(CLASS_UCP_SO, negated); fret; };
'Z' => { currentCls->add(CLASS_UCP_Z, negated); fret; };
'Zl' => { currentCls->add(CLASS_UCP_ZL, negated); fret; };
'Zp' => { currentCls->add(CLASS_UCP_ZP, negated); fret; };
'Zs' => { currentCls->add(CLASS_UCP_ZS, negated); fret; };
'Xan' => { currentCls->add(CLASS_UCP_XAN, negated); fret; };
'Xps' => { currentCls->add(CLASS_UCP_XPS, negated); fret; };
'Xsp' => { currentCls->add(CLASS_UCP_XSP, negated); fret; };
'Xwd' => { currentCls->add(CLASS_UCP_XWD, negated); fret; };
'Arabic' => { currentCls->add(CLASS_SCRIPT_ARABIC, negated); fret; };
'Armenian' => { currentCls->add(CLASS_SCRIPT_ARMENIAN, negated); fret; };
'Avestan' => { currentCls->add(CLASS_SCRIPT_AVESTAN, negated); fret; };
'Balinese' => { currentCls->add(CLASS_SCRIPT_BALINESE, negated); fret; };
'Bamum' => { currentCls->add(CLASS_SCRIPT_BAMUM, negated); fret; };
'Batak' => { currentCls->add(CLASS_SCRIPT_BATAK, negated); fret; };
'Bengali' => { currentCls->add(CLASS_SCRIPT_BENGALI, negated); fret; };
'Bopomofo' => { currentCls->add(CLASS_SCRIPT_BOPOMOFO, negated); fret; };
'Brahmi' => { currentCls->add(CLASS_SCRIPT_BRAHMI, negated); fret; };
'Braille' => { currentCls->add(CLASS_SCRIPT_BRAILLE, negated); fret; };
'Buginese' => { currentCls->add(CLASS_SCRIPT_BUGINESE, negated); fret; };
'Buhid' => { currentCls->add(CLASS_SCRIPT_BUHID, negated); fret; };
'Canadian_Aboriginal' => { currentCls->add(CLASS_SCRIPT_CANADIAN_ABORIGINAL, negated); fret; };
'Carian' => { currentCls->add(CLASS_SCRIPT_CARIAN, negated); fret; };
'Cham' => { currentCls->add(CLASS_SCRIPT_CHAM, negated); fret; };
'Cherokee' => { currentCls->add(CLASS_SCRIPT_CHEROKEE, negated); fret; };
'Common' => { currentCls->add(CLASS_SCRIPT_COMMON, negated); fret; };
'Coptic' => { currentCls->add(CLASS_SCRIPT_COPTIC, negated); fret; };
'Cuneiform' => { currentCls->add(CLASS_SCRIPT_CUNEIFORM, negated); fret; };
'Cypriot' => { currentCls->add(CLASS_SCRIPT_CYPRIOT, negated); fret; };
'Cyrillic' => { currentCls->add(CLASS_SCRIPT_CYRILLIC, negated); fret; };
'Deseret' => { currentCls->add(CLASS_SCRIPT_DESERET, negated); fret; };
'Devanagari' => { currentCls->add(CLASS_SCRIPT_DEVANAGARI, negated); fret; };
'Egyptian_Hieroglyphs' => { currentCls->add(CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS, negated); fret; };
'Ethiopic' => { currentCls->add(CLASS_SCRIPT_ETHIOPIC, negated); fret; };
'Georgian' => { currentCls->add(CLASS_SCRIPT_GEORGIAN, negated); fret; };
'Glagolitic' => { currentCls->add(CLASS_SCRIPT_GLAGOLITIC, negated); fret; };
'Gothic' => { currentCls->add(CLASS_SCRIPT_GOTHIC, negated); fret; };
'Greek' => { currentCls->add(CLASS_SCRIPT_GREEK, negated); fret; };
'Gujarati' => { currentCls->add(CLASS_SCRIPT_GUJARATI, negated); fret; };
'Gurmukhi' => { currentCls->add(CLASS_SCRIPT_GURMUKHI, negated); fret; };
'Han' => { currentCls->add(CLASS_SCRIPT_HAN, negated); fret; };
'Hangul' => { currentCls->add(CLASS_SCRIPT_HANGUL, negated); fret; };
'Hanunoo' => { currentCls->add(CLASS_SCRIPT_HANUNOO, negated); fret; };
'Hebrew' => { currentCls->add(CLASS_SCRIPT_HEBREW, negated); fret; };
'Hiragana' => { currentCls->add(CLASS_SCRIPT_HIRAGANA, negated); fret; };
'Imperial_Aramaic' => { currentCls->add(CLASS_SCRIPT_IMPERIAL_ARAMAIC, negated); fret; };
'Inherited' => { currentCls->add(CLASS_SCRIPT_INHERITED, negated); fret; };
'Inscriptional_Pahlavi' => { currentCls->add(CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI, negated); fret; };
'Inscriptional_Parthian' => { currentCls->add(CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN, negated); fret; };
'Javanese' => { currentCls->add(CLASS_SCRIPT_JAVANESE, negated); fret; };
'Kaithi' => { currentCls->add(CLASS_SCRIPT_KAITHI, negated); fret; };
'Kannada' => { currentCls->add(CLASS_SCRIPT_KANNADA, negated); fret; };
'Katakana' => { currentCls->add(CLASS_SCRIPT_KATAKANA, negated); fret; };
'Kayah_Li' => { currentCls->add(CLASS_SCRIPT_KAYAH_LI, negated); fret; };
'Kharoshthi' => { currentCls->add(CLASS_SCRIPT_KHAROSHTHI, negated); fret; };
'Khmer' => { currentCls->add(CLASS_SCRIPT_KHMER, negated); fret; };
'Lao' => { currentCls->add(CLASS_SCRIPT_LAO, negated); fret; };
'Latin' => { currentCls->add(CLASS_SCRIPT_LATIN, negated); fret; };
'Lepcha' => { currentCls->add(CLASS_SCRIPT_LEPCHA, negated); fret; };
'Limbu' => { currentCls->add(CLASS_SCRIPT_LIMBU, negated); fret; };
'Linear_B' => { currentCls->add(CLASS_SCRIPT_LINEAR_B, negated); fret; };
'Lisu' => { currentCls->add(CLASS_SCRIPT_LISU, negated); fret; };
'Lycian' => { currentCls->add(CLASS_SCRIPT_LYCIAN, negated); fret; };
'Lydian' => { currentCls->add(CLASS_SCRIPT_LYDIAN, negated); fret; };
'Malayalam' => { currentCls->add(CLASS_SCRIPT_MALAYALAM, negated); fret; };
'Mandaic' => { currentCls->add(CLASS_SCRIPT_MANDAIC, negated); fret; };
'Meetei_Mayek' => { currentCls->add(CLASS_SCRIPT_MEETEI_MAYEK, negated); fret; };
'Mongolian' => { currentCls->add(CLASS_SCRIPT_MONGOLIAN, negated); fret; };
'Myanmar' => { currentCls->add(CLASS_SCRIPT_MYANMAR, negated); fret; };
'New_Tai_Lue' => { currentCls->add(CLASS_SCRIPT_NEW_TAI_LUE, negated); fret; };
'Nko' => { currentCls->add(CLASS_SCRIPT_NKO, negated); fret; };
'Ogham' => { currentCls->add(CLASS_SCRIPT_OGHAM, negated); fret; };
'Ol_Chiki' => { currentCls->add(CLASS_SCRIPT_OL_CHIKI, negated); fret; };
'Old_Italic' => { currentCls->add(CLASS_SCRIPT_OLD_ITALIC, negated); fret; };
'Old_Persian' => { currentCls->add(CLASS_SCRIPT_OLD_PERSIAN, negated); fret; };
'Old_South_Arabian' => { currentCls->add(CLASS_SCRIPT_OLD_SOUTH_ARABIAN, negated); fret; };
'Old_Turkic' => { currentCls->add(CLASS_SCRIPT_OLD_TURKIC, negated); fret; };
'Oriya' => { currentCls->add(CLASS_SCRIPT_ORIYA, negated); fret; };
'Osmanya' => { currentCls->add(CLASS_SCRIPT_OSMANYA, negated); fret; };
'Phags_Pa' => { currentCls->add(CLASS_SCRIPT_PHAGS_PA, negated); fret; };
'Phoenician' => { currentCls->add(CLASS_SCRIPT_PHOENICIAN, negated); fret; };
'Rejang' => { currentCls->add(CLASS_SCRIPT_REJANG, negated); fret; };
'Runic' => { currentCls->add(CLASS_SCRIPT_RUNIC, negated); fret; };
'Samaritan' => { currentCls->add(CLASS_SCRIPT_SAMARITAN, negated); fret; };
'Saurashtra' => { currentCls->add(CLASS_SCRIPT_SAURASHTRA, negated); fret; };
'Shavian' => { currentCls->add(CLASS_SCRIPT_SHAVIAN, negated); fret; };
'Sinhala' => { currentCls->add(CLASS_SCRIPT_SINHALA, negated); fret; };
'Sundanese' => { currentCls->add(CLASS_SCRIPT_SUNDANESE, negated); fret; };
'Syloti_Nagri' => { currentCls->add(CLASS_SCRIPT_SYLOTI_NAGRI, negated); fret; };
'Syriac' => { currentCls->add(CLASS_SCRIPT_SYRIAC, negated); fret; };
'Tagalog' => { currentCls->add(CLASS_SCRIPT_TAGALOG, negated); fret; };
'Tagbanwa' => { currentCls->add(CLASS_SCRIPT_TAGBANWA, negated); fret; };
'Tai_Le' => { currentCls->add(CLASS_SCRIPT_TAI_LE, negated); fret; };
'Tai_Tham' => { currentCls->add(CLASS_SCRIPT_TAI_THAM, negated); fret; };
'Tai_Viet' => { currentCls->add(CLASS_SCRIPT_TAI_VIET, negated); fret; };
'Tamil' => { currentCls->add(CLASS_SCRIPT_TAMIL, negated); fret; };
'Telugu' => { currentCls->add(CLASS_SCRIPT_TELUGU, negated); fret; };
'Thaana' => { currentCls->add(CLASS_SCRIPT_THAANA, negated); fret; };
'Thai' => { currentCls->add(CLASS_SCRIPT_THAI, negated); fret; };
'Tibetan' => { currentCls->add(CLASS_SCRIPT_TIBETAN, negated); fret; };
'Tifinagh' => { currentCls->add(CLASS_SCRIPT_TIFINAGH, negated); fret; };
'Ugaritic' => { currentCls->add(CLASS_SCRIPT_UGARITIC, negated); fret; };
'Vai' => { currentCls->add(CLASS_SCRIPT_VAI, negated); fret; };
'Yi' => { currentCls->add(CLASS_SCRIPT_YI, negated); fret; };
'Any' => { currentCls->add(CLASS_UCP_ANY, negated); fret; };
any => { throw LocatedParseError("Unknown property"); };
*|;
# Braced property, i.e. the '{...}' part of \p{...} / \P{...}.
# A leading '^' inverts the 'negated' flag set by the caller. The first
# character of the name is held back (fhold) and the name is read by
# calling readUCP. On the closing '}' the class is finalized and appended
# to the current sequence only when we are not inside a bracketed class;
# inside one, currentCls stays open for further members. Input that does
# not fit this shape raises "Malformed property" via the error action.
readBracedUCP := ('{'
('^' ${ negated = !negated; }) ?
([^^] ${ fhold; fcall readUCP; })
'}' ${ if (!inCharClass) { // not inside [..]
currentCls->finalize();
currentSeq->addComponent(std::move(currentCls));
}
fret;
})
$^{ throw LocatedParseError("Malformed property"); };
# Unbraced, single-letter property (the letter after \p or \P).
# Each alternative adds the general category to currentCls with the current
# 'negated' flag. When we are not inside a bracketed class (inCharClass is
# false) the class is complete, so it is finalized and appended to the
# current sequence; inside a class it is left open. Control then returns
# to the caller (fret). Any other character is an "Unknown property".
readUCPSingle := |*
'C' => {
currentCls->add(CLASS_UCP_C, negated);
if (!inCharClass) {
currentCls->finalize();
currentSeq->addComponent(std::move(currentCls));
}
fret;
};
'L' => {
currentCls->add(CLASS_UCP_L, negated);
if (!inCharClass) {
currentCls->finalize();
currentSeq->addComponent(std::move(currentCls));
}
fret;
};
'M' => {
currentCls->add(CLASS_UCP_M, negated);
if (!inCharClass) {
currentCls->finalize();
currentSeq->addComponent(std::move(currentCls));
}
fret;
};
'N' => {
currentCls->add(CLASS_UCP_N, negated);
if (!inCharClass) {
currentCls->finalize();
currentSeq->addComponent(std::move(currentCls));
}
fret;
};
'P' => {
currentCls->add(CLASS_UCP_P, negated);
if (!inCharClass) {
currentCls->finalize();
currentSeq->addComponent(std::move(currentCls));
}
fret;
};
'S' => {
currentCls->add(CLASS_UCP_S, negated);
if (!inCharClass) {
currentCls->finalize();
currentSeq->addComponent(std::move(currentCls));
}
fret;
};
'Z' => {
currentCls->add(CLASS_UCP_Z, negated);
if (!inCharClass) {
currentCls->finalize();
currentSeq->addComponent(std::move(currentCls));
}
fret;
};
any => { throw LocatedParseError("Unknown property"); };
*|;
charClassGuts := |*
# We don't support POSIX collating elements (neither does PCRE
# or Perl). These look like [.ch.] or [=ch=].
'\[\.' ( '\\]' | [^\]] )* '\.\]' |
'\[=' ( '\\]' | [^\]] )* '=\]' => {
throw LocatedParseError("Unsupported POSIX collating "
"element");
};
# Named sets
# Adding these may cause the charclass to close, hence the
# finalized check - UE-2276
'[:alnum:]' => {
currentCls->add(CLASS_ALNUM, false);
};
'[:^alnum:]' => {
currentCls->add(CLASS_ALNUM, true);
};
'[:alpha:]' => {
currentCls->add(CLASS_ALPHA, false);
};
'[:^alpha:]' => {
currentCls->add(CLASS_ALPHA, true);
};
'[:ascii:]' => {
currentCls->add(CLASS_ASCII, false);
};
'[:^ascii:]' => {
currentCls->add(CLASS_ASCII, true);
};
'[:blank:]' => {
currentCls->add(CLASS_BLANK, false);
};
'[:^blank:]' => {
currentCls->add(CLASS_BLANK, true);
};
'[:cntrl:]' => {
currentCls->add(CLASS_CNTRL, false);
};
'[:^cntrl:]' => {
currentCls->add(CLASS_CNTRL, true);
};
'[:digit:]' => {
currentCls->add(CLASS_DIGIT, false);
};
'[:^digit:]' => {
currentCls->add(CLASS_DIGIT, true);
};
'[:graph:]' => {
currentCls->add(CLASS_GRAPH, false);
};
'[:^graph:]' => {
currentCls->add(CLASS_GRAPH, true);
};
'[:lower:]' => {
currentCls->add(CLASS_LOWER, false);
};
'[:^lower:]' => {
currentCls->add(CLASS_LOWER, true);
};
'[:print:]' => {
currentCls->add(CLASS_PRINT, false);
};
'[:^print:]' => {
currentCls->add(CLASS_PRINT, true);
};
'[:punct:]' => {
currentCls->add(CLASS_PUNCT, false);
};
'[:^punct:]' => {
currentCls->add(CLASS_PUNCT, true);
};
# Posix SPACE covers 9, 10, 11, 12, 13, 32
'[:space:]' => {
currentCls->add(CLASS_SPACE, false);
};
'[:^space:]' => {
currentCls->add(CLASS_SPACE, true);
};
'[:upper:]' => {
currentCls->add(CLASS_UPPER, false);
};
'[:^upper:]' => {
currentCls->add(CLASS_UPPER, true);
};
'[:word:]' => {
currentCls->add(CLASS_WORD, false);
};
'[:^word:]' => {
currentCls->add(CLASS_WORD, true);
};
'[:xdigit:]' => {
currentCls->add(CLASS_XDIGIT, false);
};
'[:^xdigit:]' => {
currentCls->add(CLASS_XDIGIT, true);
};
# Anything else between "[:" and ":]" is an invalid POSIX class.
# Note that "\]" counts as a literal char here.
'\[:' ( '\\]' | [^\]] )* ':\]' => {
throw LocatedParseError("Invalid POSIX named class");
};
'\\Q' => {
fcall readQuotedClass;
};
'\\E' => { /*noop*/};
# Backspace (this is only valid for \b in char classes)
'\\b' => {
currentCls->add('\x08');
};
# Tab
'\\t' => {
currentCls->add('\x09');
};
# Newline
'\\n' => {
currentCls->add('\x0a');
};
# Carriage return
'\\r' => {
currentCls->add('\x0d');
};
# Form feed
'\\f' => {
currentCls->add('\x0c');
};
# Bell
'\\a' => {
currentCls->add('\x07');
};
# Escape
'\\e' => {
currentCls->add('\x1b');
};
# Horizontal whitespace
'\\h' => {
currentCls->add(CLASS_HORZ, false);
};
# Not horizontal whitespace
'\\H' => {
currentCls->add(CLASS_HORZ, true);
};
# Vertical whitespace
'\\v' => {
currentCls->add(CLASS_VERT, false);
};
# Not vertical whitespace
'\\V' => {
currentCls->add(CLASS_VERT, true);
};
'\\p{' => {
negated = false;
fhold;
fcall readBracedUCP;
};
'\\p' any => {
negated = false;
fhold;
fcall readUCPSingle;
};
'\\P{' => {
negated = true;
fhold;
fcall readBracedUCP;
};
'\\P'any => {
negated = true;
fhold;
fcall readUCPSingle;
};
'\\P' => { throw LocatedParseError("Malformed property"); };
'\\p' => { throw LocatedParseError("Malformed property"); };
# Octal
escapedOctal0 => {
currentCls->add(octAccumulator);
};
escapedOctal2c => {
currentCls->add(octAccumulator);
};
'\\o{' [0-7]+ '}' => {
string oct(ts + 3, te - ts - 4);
unsigned long val;
try {
val = stoul(oct, nullptr, 8);
} catch (const std::out_of_range &) {
val = MAX_UNICODE + 1;
}
if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) {
throw LocatedParseError("Value in \\o{...} sequence is too large");
}
currentCls->add((unichar)val);
};
# And for when it goes wrong
'\\o' => {
throw LocatedParseError("Value in \\o{...} sequence is non-octal or missing braces");
};
# Hex
escapedHex => {
currentCls->add(accumulator);
};
# not a back-ref, not octal, just PCRE madness
'\\' [89] => {
// whatever we found here
currentCls->add(*(ts + 1));
};
# Unicode Hex
'\\x{' xdigit+ '}' => {
string hex(ts + 3, te - ts - 4);
unsigned long val;
try {
val = stoul(hex, nullptr, 16);
} catch (const std::out_of_range &) {
val = MAX_UNICODE + 1;
}
if (val > MAX_UNICODE) {
throw LocatedParseError("Value in \\x{...} sequence is too large");
}
currentCls->add((unichar)val);
};
# And for when it goes wrong
'\\x{' => {
throw LocatedParseError("Value in \\x{...} sequence is non-hex or missing }");
};
# Control characters
escapedCtrl => {
if (te - ts < 3) {
assert(te - ts == 2);
throw LocatedParseError(SLASH_C_ERROR);
} else {
assert(te - ts == 3);
currentCls->add(decodeCtrl(ts[2]));
}
};
# Word character
'\\w' => {
currentCls->add(CLASS_WORD, false);
};
# Non word character
'\\W' => {
currentCls->add(CLASS_WORD, true);
};
# Whitespace character (except VT)
'\\s' => {
currentCls->add(CLASS_SPACE, false);
};
# Non whitespace character
'\\S' => {
currentCls->add(CLASS_SPACE, true);
};
# Digit character
'\\d' => {
currentCls->add(CLASS_DIGIT, false);
};
# Non digit character
'\\D' => {
currentCls->add(CLASS_DIGIT, true);
};
'\-' => {
currentCls->addDash();
};
# A bunch of unsupported (for now) escapes
escapedUnsupported - '\\X' => throwUnsupportedEscape;
# PCRE appears to discard escaped g in a char class (a backref bug?)
'\\g' => throwUnsupportedEscape;
# the too-hard basket: UE-944, UE-1134, UE-1157
# many escaped single char literals should be benign, but PCRE
# breaks with them when adding to ranges, so unless they have
# defined special meaning in a char-class we reject them to be
# safe.
'\\' alpha => throwUnsupportedEscape;
'\\' any => {
    // Add the escaped char as a literal. Going through u8 keeps bytes
    // >= 0x80 from being sign-extended where plain char is signed, the
    // same way the unescaped literal rule of this scanner adds its byte.
    currentCls->add((u8)*(ts + 1));
};
#unicode chars
utf8_2c when is_utf8 => {
assert(mode.utf8);
currentCls->add(readUtf8CodePoint2c(ts));
};
utf8_3c when is_utf8 => {
assert(mode.utf8);
currentCls->add(readUtf8CodePoint3c(ts));
};
utf8_4c when is_utf8 => {
assert(mode.utf8);
currentCls->add(readUtf8CodePoint4c(ts));
};
hi_byte when is_utf8 => {
assert(mode.utf8);
throwInvalidUtf8();
};
# Literal character
(any - ']') => {
currentCls->add((u8)*ts);
};
']' => {
currentCls->finalize();
currentSeq->addComponent(std::move(currentCls));
inCharClass = false;
fgoto main;
};
*|;
#############################################################
# Scanner for the start of a character class: deals with a leading
# '^' and a leading ']' and then hands over to charClassGuts
#############################################################
readClass := |*
    # A caret seen while still "early" in the class negates the class.
    '\^' when is_early_charclass => {
        if (!currentCls->isNegated()) {
            // First caret. Stay in the early state: /[^]]/ still has to
            // reach the literal right square bracket rule below.
            currentCls->negate();
        } else {
            // The class is already negated, so this caret is an ordinary
            // character: leave the early state and replay it through
            // charClassGuts.
            inCharClassEarly = false;
            fhold;
            fgoto charClassGuts;
        }
    };
    # A right square bracket that comes before anything "real" is a
    # literal member of the class, not its terminator.
    ']' when is_early_charclass => {
        currentCls->add(']');
        inCharClassEarly = false;
    };
    # A quote can open before anything "real" has been seen.
    '\\Q' => {
        fcall readQuotedClass;
    };
    # A stray \E is ignored.
    '\\E' => { /*noop*/ };
    # Anything else: leave the early state and let charClassGuts re-read
    # the same character.
    any => {
        inCharClassEarly = false;
        fhold;
        fgoto charClassGuts;
    };
*|;
#############################################################
# Scanner for a quoted literal: every character up to \E is
# taken literally
#############################################################
readQuotedLiteral := |*
    # \E closes the quote and returns to the main scanner.
    '\\E' => { fgoto main; };
    # Multi-byte UTF-8 sequences (UTF-8 mode only). Each decoded code
    # point is wrapped in a one-element ComponentClass, which takes
    # care of generating the vertices.
    utf8_2c when is_utf8 => {
        assert(mode.utf8);
        auto quotedCp = getComponentClass(mode);
        quotedCp->add(readUtf8CodePoint2c(ts));
        quotedCp->finalize();
        currentSeq->addComponent(std::move(quotedCp));
    };
    utf8_3c when is_utf8 => {
        assert(mode.utf8);
        auto quotedCp = getComponentClass(mode);
        quotedCp->add(readUtf8CodePoint3c(ts));
        quotedCp->finalize();
        currentSeq->addComponent(std::move(quotedCp));
    };
    utf8_4c when is_utf8 => {
        assert(mode.utf8);
        auto quotedCp = getComponentClass(mode);
        quotedCp->add(readUtf8CodePoint4c(ts));
        quotedCp->finalize();
        currentSeq->addComponent(std::move(quotedCp));
    };
    # A high byte that did not start one of the sequences above is
    # rejected in UTF-8 mode.
    hi_byte when is_utf8 => {
        assert(mode.utf8);
        throwInvalidUtf8();
    };
    # Everything else is a single literal character.
    any => {
        addLiteral(currentSeq, *ts, mode);
    };
*|;
#############################################################
# Parser to read a quoted class: inside a character class,
# every character between \Q and \E is added to the class
# literally
#############################################################
readQuotedClass := |*
    # \E ends the quote; return to the scanner that called us.
    '\\E' => {
        fret;
    };
    # Multi-byte UTF-8 sequences (UTF-8 mode only) are added as a single
    # code point. Any quoted character counts as a "real" class element,
    # so the early state is left.
    utf8_2c when is_utf8 => {
        assert(mode.utf8);
        currentCls->add(readUtf8CodePoint2c(ts));
        inCharClassEarly = false;
    };
    utf8_3c when is_utf8 => {
        assert(mode.utf8);
        currentCls->add(readUtf8CodePoint3c(ts));
        inCharClassEarly = false;
    };
    utf8_4c when is_utf8 => {
        assert(mode.utf8);
        currentCls->add(readUtf8CodePoint4c(ts));
        inCharClassEarly = false;
    };
    # A high byte that did not start one of the sequences above is
    # rejected in UTF-8 mode.
    hi_byte when is_utf8 => {
        assert(mode.utf8);
        throwInvalidUtf8();
    };
    # Literal character
    any => {
        // Go through u8 so that bytes >= 0x80 are not sign-extended where
        // plain char is signed; the unquoted literal rule for character
        // classes adds its byte the same way.
        currentCls->add((u8)*ts);
        inCharClassEarly = false;
    };
*|;
#############################################################
# Scanner that skips over a comment block terminated by a
# right paren
#############################################################
readComment := |*
    # The closing paren ends the comment; resume normal parsing.
    '\)' => {
        inComment = false;
        fgoto main;
    };
    # Every other character is consumed and discarded.
    any;
*|;
#############################################################
# Scanner that skips over a comment terminated by a newline
#############################################################
readNewlineTerminatedComment := |*
    # The newline ends the comment; resume normal parsing.
    '\n' => {
        inComment = false;
        fgoto main;
    };
    # Every other character is consumed and discarded.
    any;
*|;
#############################################################
# Parser for standard components
#
# This is a Ragel scanner: the longest matching rule wins and,
# when two rules match the same length, the one listed first
# wins. The order of the rules below is therefore significant.
#############################################################
main := |*
#############################################################
# Standard components
#############################################################
# Begin capturing group (non-capturing handled further down)
'\(' => enterCapturingGroup;
# End group
'\)' => exitGroup;
# Mark alternation
'\|' => {
currentSeq->addAlternation();
};
# POSIX named elements should only be used inside a class. Note
# that we need to be able to reject /[:\]:]/ here.
'\[:' ( '\\]' | [^\]] )* ':\]' => {
throw LocatedParseError("POSIX named classes are only "
"supported inside a class");
};
# We don't support POSIX collating elements (neither does PCRE
# or Perl). These look like [.ch.] or [=ch=].
'\[\.' ( '\\]' | [^\]] )* '\.\]' |
'\[=' ( '\\]' | [^\]] )* '=\]' => {
throw LocatedParseError("Unsupported POSIX collating "
"element");
};
# Begin eating characters for class
'\[' => eatClass;
# Begin quoted literal
'\\Q' => {
fgoto readQuotedLiteral;
};
# An \E that is not preceded by a \Q is ignored
'\\E' => { /* noop */ };
# Match any character
'\.' => {
currentSeq->addComponent(generateComponent(CLASS_ANY, false, mode));
};
# Match one byte
'\\C' => {
if (mode.utf8) {
throw LocatedParseError("\\C is unsupported in UTF8");
}
currentSeq->addComponent(std::make_unique<ComponentByte>());
};
# Match 0 or more times (greedy)
'\*' => {
if (!currentSeq->addRepeat(0, ComponentRepeat::NoLimit,
ComponentRepeat::REPEAT_GREEDY)) {
throwInvalidRepeat();
}
};
# Match 0 or more times (non-greedy)
'\*\?' => {
if (!currentSeq->addRepeat(0, ComponentRepeat::NoLimit,
ComponentRepeat::REPEAT_NONGREEDY)) {
throwInvalidRepeat();
}
};
# Match 0 or more times (possessive)
'\*\+' => {
if (!currentSeq->addRepeat(0, ComponentRepeat::NoLimit,
ComponentRepeat::REPEAT_POSSESSIVE)) {
throwInvalidRepeat();
}
};
# Match 1 or more times (greedy)
'\+' => {
if (!currentSeq->addRepeat(1, ComponentRepeat::NoLimit,
ComponentRepeat::REPEAT_GREEDY)) {
throwInvalidRepeat();
}
};
# Match 1 or more times (non-greedy)
'\+\?' => {
if (!currentSeq->addRepeat(1, ComponentRepeat::NoLimit,
ComponentRepeat::REPEAT_NONGREEDY)) {
throwInvalidRepeat();
}
};
# Match 1 or more times (possessive)
'\+\+' => {
if (!currentSeq->addRepeat(1, ComponentRepeat::NoLimit,
ComponentRepeat::REPEAT_POSSESSIVE)) {
throwInvalidRepeat();
}
};
# Match 0 or 1 times (greedy)
'\?' => {
if (!currentSeq->addRepeat(
0, 1, ComponentRepeat::REPEAT_GREEDY)) {
throwInvalidRepeat();
}
};
# Match 0 or 1 times (non-greedy)
'\?\?' => {
if (!currentSeq->addRepeat(
0, 1, ComponentRepeat::REPEAT_NONGREEDY)) {
throwInvalidRepeat();
}
};
# Match 0 or 1 times (possessive)
'\?\+' => {
if (!currentSeq->addRepeat(
0, 1, ComponentRepeat::REPEAT_POSSESSIVE)) {
throwInvalidRepeat();
}
};
# Match {n}|{n,}|{n,m} times (greedy)
repeatNM1 => {
// Reject reversed bounds and an upper bound of zero before asking
// the sequence to apply the repeat; the non-greedy and possessive
// variants below perform the same check.
if (repeatN > repeatM || repeatM == 0) {
throwInvalidRepeat();
} else if (!currentSeq->addRepeat(
repeatN, repeatM,
ComponentRepeat::REPEAT_GREEDY)) {
throwInvalidRepeat();
}
};
# Match {n}|{n,}|{n,m} times (non-greedy)
repeatNM1 '\?' => {
if (repeatN > repeatM || repeatM == 0) {
throwInvalidRepeat();
} else if (!currentSeq->addRepeat(
repeatN, repeatM,
ComponentRepeat::REPEAT_NONGREEDY)) {
throwInvalidRepeat();
}
};
# Match {n}|{n,}|{n,m} times (possessive)
repeatNM1 '\+' => {
if (repeatN > repeatM || repeatM == 0) {
throwInvalidRepeat();
} else if (!currentSeq->addRepeat(
repeatN, repeatM,
ComponentRepeat::REPEAT_POSSESSIVE)) {
throwInvalidRepeat();
}
};
# In ignore_space mode, an unescaped # character introduces a
# comment that runs until the next newline or the end of the
# pattern.
'\#' when is_ignore_space => enterNewlineTerminatedComment;
# Perl 5.10 Special Backtracking Control Verbs: we support
# UTF8/UCP, none of the others
'(*' [^)] => { fhold; fcall readVerb; };
# Earlier parser code checked for the terminating NULL and exited
# explicitly.
'\0' => { assert(0); fbreak; };
#############################################################
# Boundaries
#############################################################
# Start of data; also after internal newline in multiline mode
'\^' => {
auto bound = mode.multiline ? ComponentBoundary::BEGIN_LINE
: ComponentBoundary::BEGIN_STRING;
currentSeq->addComponent(std::make_unique<ComponentBoundary>(bound));
};
# End of data (with optional internal newline); also before
# internal newline in multiline mode
'\$' => {
auto bound = mode.multiline ? ComponentBoundary::END_LINE
: ComponentBoundary::END_STRING_OPTIONAL_LF;
currentSeq->addComponent(std::make_unique<ComponentBoundary>(bound));
};
# Beginning of data
'\\A' => {
auto bound = ComponentBoundary::BEGIN_STRING;
currentSeq->addComponent(std::make_unique<ComponentBoundary>(bound));
};
# End of data (with optional internal newline)
'\\Z' => {
auto bound = ComponentBoundary::END_STRING_OPTIONAL_LF;
currentSeq->addComponent(std::make_unique<ComponentBoundary>(bound));
};
# End of data
'\\z' => {
auto bound = ComponentBoundary::END_STRING;
currentSeq->addComponent(std::make_unique<ComponentBoundary>(bound));
};
# Word boundary
'\\b' => {
currentSeq->addComponent(
std::make_unique<ComponentWordBoundary>(ts - ptr, false, mode));
};
# Non-word boundary
'\\B' => {
currentSeq->addComponent(
std::make_unique<ComponentWordBoundary>(ts - ptr, true, mode));
};
#############################################################
# Escaped chars
#############################################################
# Tab
'\\t' => {
addLiteral(currentSeq, '\x09', mode);
};
# Newline
'\\n' => {
addLiteral(currentSeq, '\x0a', mode);
};
# Carriage return
'\\r' => {
addLiteral(currentSeq, '\x0d', mode);
};
# Form feed
'\\f' => {
addLiteral(currentSeq, '\x0c', mode);
};
# Bell
'\\a' => {
addLiteral(currentSeq, '\x07', mode);
};
# Escape
'\\e' => {
addLiteral(currentSeq, '\x1b', mode);
};
# Octal
escapedOctal0 => {
addLiteral(currentSeq, octAccumulator, mode);
};
escapedOctal2 => {
// If there are enough capturing sub expressions, this may be
// a back reference
accumulator = parseAsDecimal(octAccumulator);
// groupIndex is the index of the next capturing group, so anything
// below it refers to a group that has already been opened.
if (accumulator < groupIndex) {
currentSeq->addComponent(std::make_unique<ComponentBackReference>(accumulator));
} else {
addEscapedOctal(currentSeq, octAccumulator, mode);
}
};
# Numeric back reference
# everything less than 8 is a straight up back ref, even if
# it is a forwards backward reference (aieeee!)
# Note that \8 and \9 are the literal chars '8' and '9'.
'\\' backRefIdSingle => addNumberedBackRef;
# otherwise we need to munge through the possible backref
'\\' backRefId => {
// if there are enough left parens to this point, back ref
if (accumulator < groupIndex) {
currentSeq->addComponent(std::make_unique<ComponentBackReference>(accumulator));
} else {
// Otherwise, we interpret the first three digits as an
// octal escape, and the remaining characters stand for
// themselves as literals.
const char *s = ts;
unsigned int accum = 0;
unsigned int oct_digits = 0;
assert(*s == '\\'); // token starts at backslash
// Consume at most three leading octal digits; stop early at the
// first digit that is 8 or 9.
for (++s; s < te && oct_digits < 3; ++oct_digits, ++s) {
u8 digit = *s - '0';
if (digit < 8) {
accum = digit + accum * 8;
} else {
break;
}
}
if (oct_digits > 0) {
addEscapedOctal(currentSeq, accum, mode);
}
// And then the rest of the digits, if any, are literal.
for (; s < te; ++s) {
addLiteral(currentSeq, *s, mode);
}
}
};
backReferenceG => addNumberedBackRef;
backReferenceGNegative => addNegativeNumberedBackRef;
backReferenceGBracket => addNumberedBackRef;
backReferenceGBracket2 => addNegativeNumberedBackRef;
backReferenceGBracketName => addNamedBackRef;
backReferenceKBracketName => addNamedBackRef;
backReferenceKBracketName2 => addNamedBackRef;
backReferenceKBracketName3 => addNamedBackRef;
backReferenceP => addNamedBackRef;
# Oniguruma - either angle braces or single quotes for this one
('\\g<' [^>]*? '>'|'\\g\'' [^\']*? '\'') => {
ostringstream str;
str << "Onigiruma subroutine call at index " << ts - ptr <<
" not supported.";
throw ParseError(str.str());
};
# Fallthrough: a \g that hasn't been caught by one of the above
# is invalid syntax. Without this rule, we would accept /A\g/.
'\\g' => {
throw LocatedParseError("Invalid reference after \\g");
};
# Braced octal escape
'\\o{' [0-7]+ '}' => {
// The digits sit between the three-character prefix and the closing
// brace of the token.
string oct(ts + 3, te - ts - 4);
unsigned long val;
try {
val = stoul(oct, nullptr, 8);
} catch (const std::out_of_range &) {
// Too big for unsigned long: force the range check below to fail.
val = MAX_UNICODE + 1;
}
if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) {
throw LocatedParseError("Value in \\o{...} sequence is too large");
}
addEscapedOctal(currentSeq, (unichar)val, mode);
};
# And for when it goes wrong
'\\o' => {
throw LocatedParseError("Value in \\o{...} sequence is non-octal or missing braces");
};
# Hex
escapedHex => {
addEscapedHex(currentSeq, accumulator, mode);
};
# Unicode Hex
'\\x{' xdigit+ '}' => {
string hex(ts + 3, te - ts - 4);
unsigned long val;
try {
val = stoul(hex, nullptr, 16);
} catch (const std::out_of_range &) {
// Too big for unsigned long: force the range check below to fail.
val = MAX_UNICODE + 1;
}
if (val > MAX_UNICODE) {
throw LocatedParseError("Value in \\x{...} sequence is too large");
}
addEscapedHex(currentSeq, (unichar)val, mode);
};
# And for when it goes wrong
'\\x{' => {
throw LocatedParseError("Value in \\x{...} sequence is non-hex or missing }");
};
# Control characters
escapedCtrl => {
// A two-character token is the escape without its control letter.
if (te - ts < 3) {
assert(te - ts == 2);
throw LocatedParseError(SLASH_C_ERROR);
} else {
assert(te - ts == 3);
addLiteral(currentSeq, decodeCtrl(ts[2]), mode);
}
};
# A bunch of unsupported (for now) escapes
escapedUnsupported => {
ostringstream str;
str << "'\\" << *(ts + 1) << "' at index " << ts - ptr
<< " not supported.";
throw ParseError(str.str());
};
# Word character
'\\w' => {
auto cc = generateComponent(CLASS_WORD, false, mode);
currentSeq->addComponent(std::move(cc));
};
# Non word character
'\\W' => {
auto cc = generateComponent(CLASS_WORD, true, mode);
currentSeq->addComponent(std::move(cc));
};
# Whitespace character
'\\s' => {
auto cc = generateComponent(CLASS_SPACE, false, mode);
currentSeq->addComponent(std::move(cc));
};
# Non whitespace character
'\\S' => {
auto cc = generateComponent(CLASS_SPACE, true, mode);
currentSeq->addComponent(std::move(cc));
};
# Digit character
'\\d' => {
auto cc = generateComponent(CLASS_DIGIT, false, mode);
currentSeq->addComponent(std::move(cc));
};
# Non digit character
'\\D' => {
auto cc = generateComponent(CLASS_DIGIT, true, mode);
currentSeq->addComponent(std::move(cc));
};
# Horizontal whitespace
'\\h' => {
auto cc = generateComponent(CLASS_HORZ, false, mode);
currentSeq->addComponent(std::move(cc));
};
# Not horizontal whitespace
'\\H' => {
auto cc = generateComponent(CLASS_HORZ, true, mode);
currentSeq->addComponent(std::move(cc));
};
# Vertical whitespace
'\\v' => {
auto cc = generateComponent(CLASS_VERT, false, mode);
currentSeq->addComponent(std::move(cc));
};
# Not vertical whitespace
'\\V' => {
auto cc = generateComponent(CLASS_VERT, true, mode);
currentSeq->addComponent(std::move(cc));
};
# Unicode properties outside a class: a fresh class is installed in
# currentCls and the property scanners are called with the last
# character of the token put back by fhold.
# NOTE(review): readBracedUCP and readUCPSingle are defined outside this
# section; presumably they fill and hand off currentCls -- confirm.
'\\p{' => {
assert(!currentCls && !inCharClass);
currentCls = getComponentClass(mode);
negated = false;
fhold;
fcall readBracedUCP;
};
'\\p' any => {
assert(!currentCls && !inCharClass);
currentCls = getComponentClass(mode);
negated = false;
fhold;
fcall readUCPSingle;
};
'\\P{' => {
assert(!currentCls && !inCharClass);
currentCls = getComponentClass(mode);
negated = true;
fhold;
fcall readBracedUCP;
};
'\\P' any => {
assert(!currentCls && !inCharClass);
currentCls = getComponentClass(mode);
negated = true;
fhold;
fcall readUCPSingle;
};
'\\P' => { throw LocatedParseError("Malformed property"); };
'\\p' => { throw LocatedParseError("Malformed property"); };
# Newline sequence, hairy semantics that we don't do
'\\R' => {
ostringstream str;
str << "\\R at index " << ts - ptr << " not supported.";
throw ParseError(str.str());
};
# Reset start of match, also hairy semantics that we don't do
'\\K' => {
ostringstream str;
str << "\\K at index " << ts - ptr << " not supported.";
throw ParseError(str.str());
};
# \k without a backref is bugged in PCRE so we have no
# idea what our semantics should be on it
'\\k' => {
ostringstream str;
str << "\\k at index " << ts - ptr << " not supported.";
throw ParseError(str.str());
};
# \G is more hairy pcre-api stuff, DO NOT WANT
'\\G' => {
ostringstream str;
str << "\\G at index " << ts - ptr << " not supported.";
throw ParseError(str.str());
};
'\\X' => {
currentSeq->addComponent(std::make_unique<ComponentEUS>(ts - ptr, mode));
};
# Fall through general escaped character
'\\' any => {
addLiteral(currentSeq, *(ts + 1), mode);
};
# A backslash with no follower is not allowed
'\\' => {
assert(ts + 1 == pe);
ostringstream str;
str << "Unescaped \\ at end of input, index " << ts - ptr << ".";
throw ParseError(str.str());
};
#############################################################
# Extended patterns
#############################################################
# Comment
'\(\?\#' => enterComment;
# Match modifiers
'\(\?' matchModifiers >resetModifiers ')' => applyModifiers;
# Non-capturing group, with flag modifiers
'\(\?' matchModifiers >resetModifiers ':' => enterModifiedGroup;
# Zero width look ahead assertion
'\(\?=' => enterZWLookAhead;
# Zero width negative look ahead assertion
'\(\?\!' => enterZWNegLookAhead;
# Zero width look behind assertion
'\(\?\<=' => enterZWLookBehind;
# Zero width negative look behind assertion
'\(\?\<\!' => enterZWNegLookBehind;
# Code (TOTALLY unsupported... for good reason)
'\(\?\{' => enterEmbeddedCode;
'\(\?\?\{' => enterEmbeddedCode;
# Atomic group
'\(\?\>' => enterAtomicGroup;
# Named capturing groups
( namedGroup1 |
namedGroup2 |
namedGroup3 ) => enterNamedGroup;
# named/numbered subroutine references
numberedSubExpression => enterReferenceUnsupported;
namedSubExpression => enterReferenceUnsupported;
# Conditional reference with a positive lookahead assertion
'(?(?=' => {
auto a = std::make_unique<ComponentAssertion>(
ComponentAssertion::LOOKAHEAD, ComponentAssertion::POS);
// Keep a raw pointer: ownership of the assertion moves into the
// ComponentCondReference below, but parsing continues inside it.
ComponentAssertion *a_seq = a.get();
// NOTE(review): PUSH_SEQUENCE is defined outside this section;
// presumably it saves the current sequence state -- confirm.
PUSH_SEQUENCE;
currentSeq = enterSequence(currentSeq,
std::make_unique<ComponentCondReference>(std::move(a)));
PUSH_SEQUENCE;
currentSeq = a_seq;
};
# Conditional reference with a negative lookahead assertion
'(?(?!' => {
auto a = std::make_unique<ComponentAssertion>(
ComponentAssertion::LOOKAHEAD, ComponentAssertion::NEG);
ComponentAssertion *a_seq = a.get();
PUSH_SEQUENCE;
currentSeq = enterSequence(currentSeq,
std::make_unique<ComponentCondReference>(std::move(a)));
PUSH_SEQUENCE;
currentSeq = a_seq;
};
# Conditional reference with a positive lookbehind assertion
'(?(?<=' => {
auto a = std::make_unique<ComponentAssertion>(
ComponentAssertion::LOOKBEHIND, ComponentAssertion::POS);
ComponentAssertion *a_seq = a.get();
PUSH_SEQUENCE;
currentSeq = enterSequence(currentSeq,
std::make_unique<ComponentCondReference>(std::move(a)));
PUSH_SEQUENCE;
currentSeq = a_seq;
};
# Conditional reference with a negative lookbehind assertion
'(?(?<!' => {
auto a = std::make_unique<ComponentAssertion>(
ComponentAssertion::LOOKBEHIND, ComponentAssertion::NEG);
ComponentAssertion *a_seq = a.get();
PUSH_SEQUENCE;
currentSeq = enterSequence(currentSeq,
std::make_unique<ComponentCondReference>(std::move(a)));
PUSH_SEQUENCE;
currentSeq = a_seq;
};
# Recursive conditional references (unsupported)
'(?(R' ( [0-9]+ | ('&' [A-Za-z0-9_]+) ) ? ')' => {
throw LocatedParseError("Pattern recursion not supported");
};
# Conditional references
# numbered
'\(\?\(' (backRefIdSingle | backRefId) ')' => enterNumberedConditionalRef;
# named
( namedConditionalRef1 |
namedConditionalRef2 |
namedConditionalRef3 ) => enterNamedConditionalRef;
# Conditions (unsupported)
'\(\?\(' => enterConditionUnsupported;
# Callouts (unsupported)
'\(\?C' [0-9]* '\)' => {
ostringstream str;
str << "Callout at index " << ts - ptr << " not supported.";
throw ParseError(str.str());
};
# Any other char after '(?' is a pattern modifier we don't
# recognise.
'\(\?' any => {
throw LocatedParseError("Unrecognised character after (?");
};
#unicode chars
utf8_2c when is_utf8 => {
assert(mode.utf8);
/* leverage ComponentClass to generate the vertices */
auto cc = getComponentClass(mode);
cc->add(readUtf8CodePoint2c(ts));
cc->finalize();
currentSeq->addComponent(std::move(cc));
};
utf8_3c when is_utf8 => {
assert(mode.utf8);
/* leverage ComponentClass to generate the vertices */
auto cc = getComponentClass(mode);
cc->add(readUtf8CodePoint3c(ts));
cc->finalize();
currentSeq->addComponent(std::move(cc));
};
utf8_4c when is_utf8 => {
assert(mode.utf8);
/* leverage ComponentClass to generate the vertices */
auto cc = getComponentClass(mode);
cc->add(readUtf8CodePoint4c(ts));
cc->finalize();
currentSeq->addComponent(std::move(cc));
};
# A high byte that did not start one of the sequences above is
# rejected in UTF-8 mode.
hi_byte when is_utf8 => {
assert(mode.utf8);
throwInvalidUtf8();
};
#############################################################
# Literal character
#############################################################
# literal character
# Whitespace is a literal unless ignore_space is set, in which case it
# is dropped.
whitespace => {
if (mode.ignore_space == false) {
addLiteral(currentSeq, *ts, mode);
}
};
any => {
addLiteral(currentSeq, *ts, mode);
};
*|;
prepush {
    // Runs before a call target is pushed: grow the call stack when
    // every slot up to top is already in use.
    DEBUG_PRINTF("stack %zu top %d\n", stack.size(), top);
    const bool stackFull = (top == (int)stack.size());
    if (stackFull) {
        stack.resize((top + 1) * 2);
    }
}
}%%
%% write data nofinal;
/**
 * \brief Main parser call: builds the Component tree for a pattern.
 *
 * Leading control verbs are consumed first by read_control_verbs(), which
 * receives \a globalMode so that it can set global mode flags; the remainder
 * of the pattern is then run through the Ragel machine.
 *
 * \param ptr NUL-terminated pattern string (its length is taken with
 *        strlen); must not be null.
 * \param globalMode Parse mode handed to read_control_verbs() and then copied
 *        as the starting mode for the pattern body.
 * \return The root of the parsed component tree. Every failure path visible
 *         in this function throws instead of returning a null pointer.
 * \throw ParseError on malformed or unsupported input detected here.
 * \throw LocatedParseError raised while parsing; it is annotated with an
 *        offset into the pattern before being re-thrown.
 */
unique_ptr<Component> parse(const char *ptr, ParseMode &globalMode) {
    assert(ptr);

    const char *p = ptr;
    const char *pe = ptr + strlen(ptr);

    // First, read the control verbs, set any global mode flags and move the
    // ptr forward.
    p = read_control_verbs(p, pe, 0, globalMode);

    const char *eof = pe;

    // Ragel machine state. "write init" below assigns these again; they are
    // initialised here as well so that none of them is ever read
    // indeterminate (ts is inspected by the catch handler at the bottom).
    int cs = 0;
    UNUSED int act = 0;
    int top = 0;
    vector<int> stack;
    const char *ts = nullptr;
    const char *te = nullptr;

    unichar accumulator = 0;
    unichar octAccumulator = 0; /* required as we are also accumulating for
                                 * back ref when looking for octals */
    unsigned repeatN = 0;
    unsigned repeatM = 0;
    string label;

    ParseMode mode = globalMode;
    ParseMode newMode;

    bool negated = false;
    bool inComment = false;

    // Stack of sequences and flags used to store state when we enter
    // sub-sequences.
    vector<ExprState> sequences;

    // Index of the next capturing group. Note that zero is reserved for the
    // root sequence.
    unsigned groupIndex = 1;

    // Set storing group names that are currently in use.
    flat_set<string> groupNames;

    // Root sequence.
    unique_ptr<ComponentSequence> rootSeq = std::make_unique<ComponentSequence>();
    rootSeq->setCaptureIndex(0);

    // Current sequence being appended to
    ComponentSequence *currentSeq = rootSeq.get();

    // The current character class being appended to. This is used as the
    // accumulator for both character class and UCP properties.
    unique_ptr<ComponentClass> currentCls;

    // True if the machine is currently inside a character class, i.e. square
    // brackets [..].
    bool inCharClass = false;

    // True if the machine is inside a character class but it has not processed
    // any "real" elements yet, i.e. it's still processing meta-characters like
    // '^'.
    bool inCharClassEarly = false;

    // Location at which the current character class began.
    const char *currentClsBegin = p;

    // We throw exceptions on various parsing failures beyond this point. The
    // try/catch block exists so that a LocatedParseError can be annotated
    // with its offset in the pattern before it is re-thrown to the caller;
    // everything allocated above is owned by RAII objects.
    try {
        // Embed the Ragel machine here
        %% write init;
        %% write exec;

        if (p != pe && *p != '\0') {
            // didn't make it to the end of our input, but we didn't throw a ParseError?
            assert(0);
            ostringstream str;
            str << "Parse error at index " << (p - ptr) << ".";
            throw ParseError(str.str());
        }

        // A class still being accumulated means its closing bracket was
        // never seen.
        if (currentCls) {
            assert(inCharClass);
            assert(currentClsBegin);
            ostringstream oss;
            oss << "Unterminated character class starting at index "
                << currentClsBegin - ptr << ".";
            throw ParseError(oss.str());
        }

        if (inComment) {
            throw ParseError("Unterminated comment.");
        }

        // Anything left on the sequence stack is a group that was opened
        // but never closed.
        if (!sequences.empty()) {
            ostringstream str;
            str << "Missing close parenthesis for group started at index "
                << sequences.back().seqOffset << ".";
            throw ParseError(str.str());
        }

        // Unlikely, but possible
        if (groupIndex > 65535) {
            throw ParseError("The maximum number of capturing subexpressions is 65535.");
        }

        // Finalize the top-level sequence, which will take care of any
        // top-level alternation.
        currentSeq->finalize();
        assert(currentSeq == rootSeq.get());

        // Ensure that all references are valid.
        checkReferences(*rootSeq, groupIndex, groupNames);

        // rootSeq is a local, so it is moved into the returned
        // unique_ptr<Component> implicitly; an explicit std::move here is
        // redundant.
        return rootSeq;
    } catch (LocatedParseError &error) {
        // Report the start of the token being scanned when it lies inside
        // the pattern, and offset zero otherwise.
        if (ts >= ptr && ts <= pe) {
            error.locate(ts - ptr);
        } else {
            error.locate(0);
        }
        throw;
    }
}
} // namespace ue2