/* * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /** \file * \brief Parser code (generated with Ragel from Parser.rl). 
*/ #include "config.h" /* Parser.cpp is a built source, may not be in same dir as parser files */ #include "parser/check_refs.h" #include "parser/control_verbs.h" #include "parser/ComponentAlternation.h" #include "parser/ComponentAssertion.h" #include "parser/ComponentAtomicGroup.h" #include "parser/ComponentBackReference.h" #include "parser/ComponentBoundary.h" #include "parser/ComponentByte.h" #include "parser/ComponentClass.h" #include "parser/ComponentCondReference.h" #include "parser/ComponentEmpty.h" #include "parser/ComponentEUS.h" #include "parser/Component.h" #include "parser/ComponentRepeat.h" #include "parser/ComponentSequence.h" #include "parser/ComponentWordBoundary.h" #include "parser/parse_error.h" #include "parser/Parser.h" #include "ue2common.h" #include "util/compare.h" #include "util/flat_containers.h" #include "util/unicode_def.h" #include "util/verify_types.h" #include #include #include #include #include #include #include #include using namespace std; namespace ue2 { #define PUSH_SEQUENCE do {\ sequences.push_back(ExprState(currentSeq, (size_t)(ts - ptr), \ mode)); \ } while(0) #define POP_SEQUENCE do {\ currentSeq = sequences.back().seq; \ mode = sequences.back().mode; \ sequences.pop_back(); \ } while(0) namespace { /** \brief Structure representing current state as we're parsing (current * sequence, current options). Stored in the 'sequences' vector. */ struct ExprState { ExprState(ComponentSequence *seq_in, size_t offset, const ParseMode &mode_in) : seq(seq_in), seqOffset(offset), mode(mode_in) {} ComponentSequence *seq; //!< current sequence size_t seqOffset; //!< offset seq was entered, for error reporting ParseMode mode; //!< current mode flags }; } // namespace static unsigned parseAsDecimal(unsigned oct) { // The input was parsed as octal, but should have been parsed as decimal. 
// Deconstruct the octal number and reconstruct into decimal unsigned ret = 0; unsigned multiplier = 1; while (oct) { ret += (oct & 0x7) * multiplier; oct >>= 3; multiplier *= 10; } return ret; } /** \brief Maximum value for a positive integer. We use INT_MAX, as that's what * PCRE uses. */ static constexpr u32 MAX_NUMBER = INT_MAX; static void pushDec(u32 *acc, char raw_digit) { assert(raw_digit >= '0' && raw_digit <= '9'); u32 digit_val = raw_digit - '0'; // Ensure that we don't overflow. u64a val = ((u64a)*acc * 10) + digit_val; if (val > MAX_NUMBER) { throw LocatedParseError("Number is too big"); } *acc = verify_u32(val); } static void pushOct(u32 *acc, char raw_digit) { assert(raw_digit >= '0' && raw_digit <= '7'); u32 digit_val = raw_digit - '0'; // Ensure that we don't overflow. u64a val = ((u64a)*acc * 8) + digit_val; if (val > MAX_NUMBER) { throw LocatedParseError("Number is too big"); } *acc = verify_u32(val); } static void throwInvalidRepeat(void) { throw LocatedParseError("Invalid repeat"); } static void throwInvalidUtf8(void) { throw ParseError("Expression is not valid UTF-8."); } /** * Adds the given child component to the parent sequence, returning a pointer * to the new (child) "current sequence". 
*/ static ComponentSequence *enterSequence(ComponentSequence *parent, unique_ptr child) { assert(parent); assert(child); ComponentSequence *seq = child.get(); parent->addComponent(std::move(child)); return seq; } static void addLiteral(ComponentSequence *currentSeq, char c, const ParseMode &mode) { if (mode.utf8 && mode.caseless) { /* leverage ComponentClass to generate the vertices */ auto cc = getComponentClass(mode); assert(cc); cc->add(c); cc->finalize(); currentSeq->addComponent(std::move(cc)); } else { currentSeq->addComponent(getLiteralComponentClass(c, mode.caseless)); } } static void addEscaped(ComponentSequence *currentSeq, unichar accum, const ParseMode &mode, const char *err_msg) { if (mode.utf8) { /* leverage ComponentClass to generate the vertices */ auto cc = getComponentClass(mode); assert(cc); cc->add(accum); cc->finalize(); currentSeq->addComponent(std::move(cc)); } else { if (accum > 255) { throw LocatedParseError(err_msg); } addLiteral(currentSeq, (char)accum, mode); } } static void addEscapedOctal(ComponentSequence *currentSeq, unichar accum, const ParseMode &mode) { addEscaped(currentSeq, accum, mode, "Octal value is greater than \\377"); } static void addEscapedHex(ComponentSequence *currentSeq, unichar accum, const ParseMode &mode) { addEscaped(currentSeq, accum, mode, "Hexadecimal value is greater than \\xFF"); } #define SLASH_C_ERROR "\\c must be followed by an ASCII character" static u8 decodeCtrl(char raw) { if (raw & 0x80) { throw LocatedParseError(SLASH_C_ERROR); } return mytoupper(raw) ^ 0x40; } static unichar readUtf8CodePoint2c(const char *s) { auto *ts = reinterpret_cast(s); assert(ts[0] >= 0xc0 && ts[0] < 0xe0); assert(ts[1] >= 0x80 && ts[1] < 0xc0); unichar val = ts[0] & 0x1f; val <<= 6; val |= ts[1] & 0x3f; DEBUG_PRINTF("utf8 %02hhx %02hhx ->\\x{%x}\n", ts[0], ts[1], val); return val; } static unichar readUtf8CodePoint3c(const char *s) { auto *ts = reinterpret_cast(s); assert(ts[0] >= 0xe0 && ts[0] < 0xf0); assert(ts[1] >= 0x80 
&& ts[1] < 0xc0); assert(ts[2] >= 0x80 && ts[2] < 0xc0); unichar val = ts[0] & 0x0f; val <<= 6; val |= ts[1] & 0x3f; val <<= 6; val |= ts[2] & 0x3f; DEBUG_PRINTF("utf8 %02hhx %02hhx %02hhx ->\\x{%x}\n", ts[0], ts[1], ts[2], val); return val; } static unichar readUtf8CodePoint4c(const char *s) { auto *ts = reinterpret_cast(s); assert(ts[0] >= 0xf0 && ts[0] < 0xf8); assert(ts[1] >= 0x80 && ts[1] < 0xc0); assert(ts[2] >= 0x80 && ts[2] < 0xc0); assert(ts[3] >= 0x80 && ts[3] < 0xc0); unichar val = ts[0] & 0x07; val <<= 6; val |= ts[1] & 0x3f; val <<= 6; val |= ts[2] & 0x3f; val <<= 6; val |= ts[3] & 0x3f; DEBUG_PRINTF("utf8 %02hhx %02hhx %02hhx %02hhx ->\\x{%x}\n", ts[0], ts[1], ts[2], ts[3], val); return val; } %%{ machine regex; alphtype unsigned char; action throwUnsupportedEscape { ostringstream str; str << "'\\" << *(ts + 1) << "' at index " << ts - ptr << " not supported in a character class."; throw ParseError(str.str()); } action unsupportedProperty { throw LocatedParseError("Character property not supported"); } action clearLabel { label.clear();} action appendLabelCharacter { label.push_back(fc);} action clearOctAccumulator { octAccumulator = 0;} action clearAccumulator { accumulator = 0;} action setOctAccumulator { octAccumulator = 0; pushOct(&octAccumulator, fc); } action setDecAccumulator { accumulator = 0; pushDec(&accumulator, fc); } action clearNM { repeatN = 0; repeatM = 0; } action appendN { pushDec(&repeatN, fc); } action appendM { pushDec(&repeatM, fc); } action appendAccumulatorOctDigit { pushOct(&octAccumulator, fc); } action appendAccumulatorDecDigit { pushDec(&accumulator, fc); } action appendAccumulatorHexDigit { accumulator *= 16; accumulator += fc - '0'; } action appendAccumulatorHexL { accumulator *= 16; accumulator += 10 + fc - 'a'; } action appendAccumulatorHexU { accumulator *= 16; accumulator += 10 + fc - 'A'; } # enter a comment group, where we just scan for a close paren. 
action enterComment {
    inComment = true;
    fgoto readComment;
}

# enter an extended mode comment, where we just scan for a newline.
action enterNewlineTerminatedComment {
    inComment = true;
    fgoto readNewlineTerminatedComment;
}

# enter a CAPTURING group ( e.g. '(blah)' )
action enterCapturingGroup {
    // Save the enclosing sequence and mode before descending.
    PUSH_SEQUENCE;
    // The new group is a ComponentSequence: that is the type enterSequence()
    // accepts and the one the capture setters are called on.
    auto seq = std::make_unique<ComponentSequence>();
    seq->setCaptureIndex(groupIndex++);
    currentSeq = enterSequence(currentSeq, std::move(seq));
}

# enter a NAMED CAPTURING group ( e.g. (?'blah) )
action enterNamedGroup {
    assert(!label.empty()); // should be guaranteed by machine
    char c = *label.begin();
    if (c >= '0' && c <= '9') {
        throw LocatedParseError("Group name cannot begin with a digit");
    }
    // insert() reports via .second whether the name was new.
    if (!groupNames.insert(label).second) {
        throw LocatedParseError("Two named subpatterns use the name '" + label + "'");
    }
    PUSH_SEQUENCE;
    auto seq = std::make_unique<ComponentSequence>();
    seq->setCaptureIndex(groupIndex++);
    seq->setCaptureName(label);
    currentSeq = enterSequence(currentSeq, std::move(seq));
}

# enter a NON-CAPTURING group where we're modifying flags
# ( e.g. '(?i:blah)' ). Standard non-capturing groups use this path
# as well.
action enterModifiedGroup {
    // NOTE(review): the std::make_unique template arguments in this group of
    // actions were absent in this copy of the file; they are restored from
    // the constructor arguments, the action names and the included
    // Component*.h headers. Confirm against the original source.
    PUSH_SEQUENCE;
    mode = newMode;
    currentSeq =
        enterSequence(currentSeq, std::make_unique<ComponentSequence>());
}

# close the current group and return to the enclosing sequence and mode
action exitGroup {
    if (sequences.empty()) {
        throw LocatedParseError("Unmatched parentheses");
    }
    currentSeq->finalize();
    POP_SEQUENCE;
}

# zero-width assertions: look-ahead / look-behind, positive / negative
action enterZWLookAhead {
    PUSH_SEQUENCE;
    currentSeq = enterSequence(currentSeq,
        std::make_unique<ComponentAssertion>(ComponentAssertion::LOOKAHEAD,
                                             ComponentAssertion::POS));
}
action enterZWNegLookAhead {
    PUSH_SEQUENCE;
    currentSeq = enterSequence(currentSeq,
        std::make_unique<ComponentAssertion>(ComponentAssertion::LOOKAHEAD,
                                             ComponentAssertion::NEG));
}
action enterZWLookBehind {
    PUSH_SEQUENCE;
    currentSeq = enterSequence(currentSeq,
        std::make_unique<ComponentAssertion>(ComponentAssertion::LOOKBEHIND,
                                             ComponentAssertion::POS));
}
action enterZWNegLookBehind {
    PUSH_SEQUENCE;
    currentSeq = enterSequence(currentSeq,
        std::make_unique<ComponentAssertion>(ComponentAssertion::LOOKBEHIND,
                                             ComponentAssertion::NEG));
}

# constructs that are rejected with a located parse error
action enterEmbeddedCode {
    throw LocatedParseError("Embedded code is not supported");
}
action enterConditionUnsupported {
    throw LocatedParseError("Conditional subpattern unsupported");
}
action enterReferenceUnsupported {
    throw LocatedParseError("Subpattern reference unsupported");
}

# conditional references, by group number (in accumulator) or name (in label)
action enterNumberedConditionalRef {
    if (accumulator == 0) {
        throw LocatedParseError("Numbered reference cannot be zero");
    }
    PUSH_SEQUENCE;
    // NOTE(review): ComponentCondReference inferred from the action name and
    // the ComponentCondReference.h include - confirm.
    currentSeq = enterSequence(currentSeq,
        std::make_unique<ComponentCondReference>(accumulator));
}
action enterNamedConditionalRef {
    PUSH_SEQUENCE;
    assert(!label.empty());
    currentSeq = enterSequence(currentSeq,
        std::make_unique<ComponentCondReference>(label));
}

action enterAtomicGroup {
    PUSH_SEQUENCE;
    currentSeq = enterSequence(currentSeq,
                               std::make_unique<ComponentAtomicGroup>());
}

# start a character class and hand over to the readClass scanner
action eatClass {
    assert(!currentCls);
    assert(!inCharClass); // not reentrant
    currentCls = getComponentClass(mode);
    inCharClass = true;
    inCharClassEarly = true;
    currentClsBegin = ts;
    fgoto readClass;
}

# mode modifiers are collected in newMode and only copied into mode by
# applyModifiers (or by enterModifiedGroup above)
action resetModifiers {
    newMode = mode;
}
action applyModifiers {
    mode = newMode;
    // NOTE(review): ComponentEmpty inferred from the argument-less
    // constructor and the ComponentEmpty.h include - confirm.
    currentSeq->addComponent(std::make_unique<ComponentEmpty>());
}
action modifyMatchPositive {
    switch (fc) {
        case 'i':
            newMode.caseless = true;
            break;
        case 'm':
            newMode.multiline = true;
            break;
        case 's':
            newMode.dotall = true;
            break;
        case 'x':
            newMode.ignore_space = true;
            break;
        default:
            assert(0); // this action only called for [imsx]
            break;
    }
}
action modifyMatchNegative {
    switch (fc) {
        case 'i':
            newMode.caseless = false;
            break;
        case 'm':
            newMode.multiline = false;
            break;
        case 's':
            newMode.dotall = false;
            break;
        case 'x':
            newMode.ignore_space = false;
            break;
        default:
            assert(0); // this action only called for [imsx]
            break;
    }
}

# boolean-valued actions; is_utf8 and is_early_charclass are used as
# 'when' conditions by the character class scanners later in this file
action is_utf8 { mode.utf8 }
action is_ignore_space { mode.ignore_space }
action is_early_charclass { inCharClassEarly }

# back-references
action addNumberedBackRef {
    if (accumulator == 0) {
        throw LocatedParseError("Numbered reference cannot be zero");
    }
    currentSeq->addComponent(
        std::make_unique<ComponentBackReference>(accumulator));
}
action addNegativeNumberedBackRef {
    // Accumulator is a negative offset.
    if (accumulator == 0) {
        throw LocatedParseError("Numbered reference cannot be zero");
    }
    if (accumulator >= groupIndex) {
        throw LocatedParseError("Invalid reference");
    }
    // Count back from the index the next capturing group would receive.
    unsigned idx = groupIndex - accumulator;
    currentSeq->addComponent(std::make_unique<ComponentBackReference>(idx));
}
action addNamedBackRef {
    currentSeq->addComponent(std::make_unique<ComponentBackReference>(label));
}

# octal escapes; the value is built up in octAccumulator
escapedOctal0 = '\\0' @clearOctAccumulator [0-7]{0,2}
                $appendAccumulatorOctDigit;
escapedOctal2 = '\\' [1-7] $setOctAccumulator [0-7]{1,2}
                $appendAccumulatorOctDigit;
escapedOctal2c = '\\' [1-7] $setOctAccumulator [0-7]{0,2}
                 $appendAccumulatorOctDigit;

# numeric back-reference ids; the value is built up in accumulator
backRefIdSingle = [1-7] $setDecAccumulator;
backRefId = [1-9] $setDecAccumulator [0-9]+ $appendAccumulatorDecDigit;

# \x followed by up to two hex digits
escapedHex = '\\x' @clearAccumulator ([0-9] $appendAccumulatorHexDigit
                                     | [a-f] $appendAccumulatorHexL
                                     | [A-F] $appendAccumulatorHexU){0,2};
escapedCtrl = '\\c' any?;
escapedUnsupported = '\\' [NluLU];

# {n}, {n,} and {n,m} repeats; bounds are left in repeatN / repeatM
repeatNM1 = '\{' @clearNM [0-9]+ $appendN ('}' @{repeatM = repeatN;}
                                           | ',' '\}' @{repeatM = ComponentRepeat::NoLimit;}
                                           | ',' [0-9]+ $appendM '}');

# \g back-references by number (number left in accumulator)
backReferenceG = '\\g' @clearAccumulator [0-9]{1,3}
                 $appendAccumulatorDecDigit;
backReferenceGNegative = '\\g-' @clearAccumulator [0-9]{1,3}
                         $appendAccumulatorDecDigit;
backReferenceGBracket = '\\g{' @clearAccumulator [0-9]{1,3}
                        $appendAccumulatorDecDigit '}';
backReferenceGBracket2 = '\\g{-' @clearAccumulator [0-9]{1,3}
                         $appendAccumulatorDecDigit '}';

# back-references by name (name left in label)
backReferenceGBracketName = '\\g{' @clearLabel [A-Za-z0-9_]+
                            $appendLabelCharacter '}';
backReferenceKBracketName = '\\k{' @clearLabel [A-Za-z0-9_]+
                            $appendLabelCharacter '}';
backReferenceKBracketName2 = '\\k<' @clearLabel [A-Za-z0-9_]+
                             $appendLabelCharacter '>';
backReferenceKBracketName3 = '\\k\'' @clearLabel [A-Za-z0-9_]+
                             $appendLabelCharacter '\'';
backReferenceP = '(?P=' @clearLabel [A-Za-z0-9_]+
                 $appendLabelCharacter ')';

# named group openers (name left in label)
namedGroup1 = '(?<' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '>';
namedGroup2 = '(?\'' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '\'';
namedGroup3 = '(?P<' @clearLabel [A-Za-z0-9_]+ $appendLabelCharacter '>';

# conditional reference openers by name (name left in label)
namedConditionalRef1 = '(?(<' @clearLabel [A-Za-z0-9_]+
                       $appendLabelCharacter '>)';
namedConditionalRef2 = '(?(\'' @clearLabel [A-Za-z0-9_]+
                       $appendLabelCharacter '\')';
namedConditionalRef3 = '(?(' @clearLabel [A-Za-z0-9_]+
                       $appendLabelCharacter ')';

numberedSubExpression = '(?' [+\-]? [0-9]+ ')';
namedSubExpression = '(?' ('&'|'P>') [A-Za-z0-9_]+ ')';

# inline mode modifiers: optional set flags, then optional '-' cleared flags
positiveMatchModifiers = [imsx]+ $modifyMatchPositive;
negativeMatchModifiers = '-' [imsx]+ $modifyMatchNegative;
matchModifiers = positiveMatchModifiers ? negativeMatchModifiers ?;

# UTF-8 byte classes: continuation byte, then 2-, 3- and 4-byte sequences
utf8_cont = 0x80..0xbf;
utf8_2c = 0xc0..0xdf utf8_cont;
utf8_3c = 0xe0..0xef utf8_cont utf8_cont;
utf8_4c = 0xf0..0xf7 utf8_cont utf8_cont utf8_cont;
hi_byte = 0x80..0xff;

whitespace = [\t\n\v\f\r ];

#############################################################
# Trivial parser to read Perl 5.10+ control verbs, introduced
# by '(*'.
############################################################# readVerb := |* 'UTF8)' => { throw LocatedParseError("(*UTF8) must be at start of " "expression, encountered"); }; 'UTF)' => { throw LocatedParseError("(*UTF) must be at start of " "expression, encountered"); }; 'UCP)' => { throw LocatedParseError("(*UCP) must be at start of " "expression, encountered"); }; # Use the control verb mini-parser to report an error for this # unsupported/unknown verb. [^)]+ ')' => { ParseMode temp_mode; assert(ts - 2 >= ptr); // parser needs the '(*' at the start too. read_control_verbs(ts - 2, te, (ts - 2 - ptr), temp_mode); assert(0); // Should have thrown a parse error. throw LocatedParseError("Unknown control verb"); }; any => { throw LocatedParseError("Unknown control verb"); }; *|; ############################################################# # Parser to read UCP ############################################################# readUCP := |* 'C' => { currentCls->add(CLASS_UCP_C, negated); fret; }; 'Cc' => { currentCls->add(CLASS_UCP_CC, negated); fret; }; 'Cf' => { currentCls->add(CLASS_UCP_CF, negated); fret; }; 'Cn' => { currentCls->add(CLASS_UCP_CN, negated); fret; }; 'Co' => { currentCls->add(CLASS_UCP_CO, negated); fret; }; 'Cs' => { currentCls->add(CLASS_UCP_CS, negated); fret; }; 'L' => { currentCls->add(CLASS_UCP_L, negated); fret; }; 'Ll' => { currentCls->add(CLASS_UCP_LL, negated); fret; }; 'Lm' => { currentCls->add(CLASS_UCP_LM, negated); fret; }; 'Lo' => { currentCls->add(CLASS_UCP_LO, negated); fret; }; 'Lt' => { currentCls->add(CLASS_UCP_LT, negated); fret; }; 'Lu' => { currentCls->add(CLASS_UCP_LU, negated); fret; }; 'L&' => { currentCls->add(CLASS_UCP_L_AND, negated); fret; }; 'M' => { currentCls->add(CLASS_UCP_M, negated); fret; }; 'Mc' => { currentCls->add(CLASS_UCP_MC, negated); fret; }; 'Me' => { currentCls->add(CLASS_UCP_ME, negated); fret; }; 'Mn' => { currentCls->add(CLASS_UCP_MN, negated); fret; }; 'N' => { currentCls->add(CLASS_UCP_N, negated); fret; 
}; 'Nd' => { currentCls->add(CLASS_UCP_ND, negated); fret; }; 'Nl' => { currentCls->add(CLASS_UCP_NL, negated); fret; }; 'No' => { currentCls->add(CLASS_UCP_NO, negated); fret; }; 'P' => { currentCls->add(CLASS_UCP_P, negated); fret; }; 'Pc' => { currentCls->add(CLASS_UCP_PC, negated); fret; }; 'Pd' => { currentCls->add(CLASS_UCP_PD, negated); fret; }; 'Pe' => { currentCls->add(CLASS_UCP_PE, negated); fret; }; 'Pf' => { currentCls->add(CLASS_UCP_PF, negated); fret; }; 'Pi' => { currentCls->add(CLASS_UCP_PI, negated); fret; }; 'Po' => { currentCls->add(CLASS_UCP_PO, negated); fret; }; 'Ps' => { currentCls->add(CLASS_UCP_PS, negated); fret; }; 'S' => { currentCls->add(CLASS_UCP_S, negated); fret; }; 'Sc' => { currentCls->add(CLASS_UCP_SC, negated); fret; }; 'Sk' => { currentCls->add(CLASS_UCP_SK, negated); fret; }; 'Sm' => { currentCls->add(CLASS_UCP_SM, negated); fret; }; 'So' => { currentCls->add(CLASS_UCP_SO, negated); fret; }; 'Z' => { currentCls->add(CLASS_UCP_Z, negated); fret; }; 'Zl' => { currentCls->add(CLASS_UCP_ZL, negated); fret; }; 'Zp' => { currentCls->add(CLASS_UCP_ZP, negated); fret; }; 'Zs' => { currentCls->add(CLASS_UCP_ZS, negated); fret; }; 'Xan' => { currentCls->add(CLASS_UCP_XAN, negated); fret; }; 'Xps' => { currentCls->add(CLASS_UCP_XPS, negated); fret; }; 'Xsp' => { currentCls->add(CLASS_UCP_XSP, negated); fret; }; 'Xwd' => { currentCls->add(CLASS_UCP_XWD, negated); fret; }; 'Arabic' => { currentCls->add(CLASS_SCRIPT_ARABIC, negated); fret; }; 'Armenian' => { currentCls->add(CLASS_SCRIPT_ARMENIAN, negated); fret; }; 'Avestan' => { currentCls->add(CLASS_SCRIPT_AVESTAN, negated); fret; }; 'Balinese' => { currentCls->add(CLASS_SCRIPT_BALINESE, negated); fret; }; 'Bamum' => { currentCls->add(CLASS_SCRIPT_BAMUM, negated); fret; }; 'Batak' => { currentCls->add(CLASS_SCRIPT_BATAK, negated); fret; }; 'Bengali' => { currentCls->add(CLASS_SCRIPT_BENGALI, negated); fret; }; 'Bopomofo' => { currentCls->add(CLASS_SCRIPT_BOPOMOFO, negated); fret; }; 
'Brahmi' => { currentCls->add(CLASS_SCRIPT_BRAHMI, negated); fret; }; 'Braille' => { currentCls->add(CLASS_SCRIPT_BRAILLE, negated); fret; }; 'Buginese' => { currentCls->add(CLASS_SCRIPT_BUGINESE, negated); fret; }; 'Buhid' => { currentCls->add(CLASS_SCRIPT_BUHID, negated); fret; }; 'Canadian_Aboriginal' => { currentCls->add(CLASS_SCRIPT_CANADIAN_ABORIGINAL, negated); fret; }; 'Carian' => { currentCls->add(CLASS_SCRIPT_CARIAN, negated); fret; }; 'Cham' => { currentCls->add(CLASS_SCRIPT_CHAM, negated); fret; }; 'Cherokee' => { currentCls->add(CLASS_SCRIPT_CHEROKEE, negated); fret; }; 'Common' => { currentCls->add(CLASS_SCRIPT_COMMON, negated); fret; }; 'Coptic' => { currentCls->add(CLASS_SCRIPT_COPTIC, negated); fret; }; 'Cuneiform' => { currentCls->add(CLASS_SCRIPT_CUNEIFORM, negated); fret; }; 'Cypriot' => { currentCls->add(CLASS_SCRIPT_CYPRIOT, negated); fret; }; 'Cyrillic' => { currentCls->add(CLASS_SCRIPT_CYRILLIC, negated); fret; }; 'Deseret' => { currentCls->add(CLASS_SCRIPT_DESERET, negated); fret; }; 'Devanagari' => { currentCls->add(CLASS_SCRIPT_DEVANAGARI, negated); fret; }; 'Egyptian_Hieroglyphs' => { currentCls->add(CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS, negated); fret; }; 'Ethiopic' => { currentCls->add(CLASS_SCRIPT_ETHIOPIC, negated); fret; }; 'Georgian' => { currentCls->add(CLASS_SCRIPT_GEORGIAN, negated); fret; }; 'Glagolitic' => { currentCls->add(CLASS_SCRIPT_GLAGOLITIC, negated); fret; }; 'Gothic' => { currentCls->add(CLASS_SCRIPT_GOTHIC, negated); fret; }; 'Greek' => { currentCls->add(CLASS_SCRIPT_GREEK, negated); fret; }; 'Gujarati' => { currentCls->add(CLASS_SCRIPT_GUJARATI, negated); fret; }; 'Gurmukhi' => { currentCls->add(CLASS_SCRIPT_GURMUKHI, negated); fret; }; 'Han' => { currentCls->add(CLASS_SCRIPT_HAN, negated); fret; }; 'Hangul' => { currentCls->add(CLASS_SCRIPT_HANGUL, negated); fret; }; 'Hanunoo' => { currentCls->add(CLASS_SCRIPT_HANUNOO, negated); fret; }; 'Hebrew' => { currentCls->add(CLASS_SCRIPT_HEBREW, negated); fret; }; 'Hiragana' 
=> { currentCls->add(CLASS_SCRIPT_HIRAGANA, negated); fret; }; 'Imperial_Aramaic' => { currentCls->add(CLASS_SCRIPT_IMPERIAL_ARAMAIC, negated); fret; }; 'Inherited' => { currentCls->add(CLASS_SCRIPT_INHERITED, negated); fret; }; 'Inscriptional_Pahlavi' => { currentCls->add(CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI, negated); fret; }; 'Inscriptional_Parthian' => { currentCls->add(CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN, negated); fret; }; 'Javanese' => { currentCls->add(CLASS_SCRIPT_JAVANESE, negated); fret; }; 'Kaithi' => { currentCls->add(CLASS_SCRIPT_KAITHI, negated); fret; }; 'Kannada' => { currentCls->add(CLASS_SCRIPT_KANNADA, negated); fret; }; 'Katakana' => { currentCls->add(CLASS_SCRIPT_KATAKANA, negated); fret; }; 'Kayah_Li' => { currentCls->add(CLASS_SCRIPT_KAYAH_LI, negated); fret; }; 'Kharoshthi' => { currentCls->add(CLASS_SCRIPT_KHAROSHTHI, negated); fret; }; 'Khmer' => { currentCls->add(CLASS_SCRIPT_KHMER, negated); fret; }; 'Lao' => { currentCls->add(CLASS_SCRIPT_LAO, negated); fret; }; 'Latin' => { currentCls->add(CLASS_SCRIPT_LATIN, negated); fret; }; 'Lepcha' => { currentCls->add(CLASS_SCRIPT_LEPCHA, negated); fret; }; 'Limbu' => { currentCls->add(CLASS_SCRIPT_LIMBU, negated); fret; }; 'Linear_B' => { currentCls->add(CLASS_SCRIPT_LINEAR_B, negated); fret; }; 'Lisu' => { currentCls->add(CLASS_SCRIPT_LISU, negated); fret; }; 'Lycian' => { currentCls->add(CLASS_SCRIPT_LYCIAN, negated); fret; }; 'Lydian' => { currentCls->add(CLASS_SCRIPT_LYDIAN, negated); fret; }; 'Malayalam' => { currentCls->add(CLASS_SCRIPT_MALAYALAM, negated); fret; }; 'Mandaic' => { currentCls->add(CLASS_SCRIPT_MANDAIC, negated); fret; }; 'Meetei_Mayek' => { currentCls->add(CLASS_SCRIPT_MEETEI_MAYEK, negated); fret; }; 'Mongolian' => { currentCls->add(CLASS_SCRIPT_MONGOLIAN, negated); fret; }; 'Myanmar' => { currentCls->add(CLASS_SCRIPT_MYANMAR, negated); fret; }; 'New_Tai_Lue' => { currentCls->add(CLASS_SCRIPT_NEW_TAI_LUE, negated); fret; }; 'Nko' => { currentCls->add(CLASS_SCRIPT_NKO, 
negated); fret; }; 'Ogham' => { currentCls->add(CLASS_SCRIPT_OGHAM, negated); fret; }; 'Ol_Chiki' => { currentCls->add(CLASS_SCRIPT_OL_CHIKI, negated); fret; }; 'Old_Italic' => { currentCls->add(CLASS_SCRIPT_OLD_ITALIC, negated); fret; }; 'Old_Persian' => { currentCls->add(CLASS_SCRIPT_OLD_PERSIAN, negated); fret; }; 'Old_South_Arabian' => { currentCls->add(CLASS_SCRIPT_OLD_SOUTH_ARABIAN, negated); fret; }; 'Old_Turkic' => { currentCls->add(CLASS_SCRIPT_OLD_TURKIC, negated); fret; }; 'Oriya' => { currentCls->add(CLASS_SCRIPT_ORIYA, negated); fret; }; 'Osmanya' => { currentCls->add(CLASS_SCRIPT_OSMANYA, negated); fret; }; 'Phags_Pa' => { currentCls->add(CLASS_SCRIPT_PHAGS_PA, negated); fret; }; 'Phoenician' => { currentCls->add(CLASS_SCRIPT_PHOENICIAN, negated); fret; }; 'Rejang' => { currentCls->add(CLASS_SCRIPT_REJANG, negated); fret; }; 'Runic' => { currentCls->add(CLASS_SCRIPT_RUNIC, negated); fret; }; 'Samaritan' => { currentCls->add(CLASS_SCRIPT_SAMARITAN, negated); fret; }; 'Saurashtra' => { currentCls->add(CLASS_SCRIPT_SAURASHTRA, negated); fret; }; 'Shavian' => { currentCls->add(CLASS_SCRIPT_SHAVIAN, negated); fret; }; 'Sinhala' => { currentCls->add(CLASS_SCRIPT_SINHALA, negated); fret; }; 'Sundanese' => { currentCls->add(CLASS_SCRIPT_SUNDANESE, negated); fret; }; 'Syloti_Nagri' => { currentCls->add(CLASS_SCRIPT_SYLOTI_NAGRI, negated); fret; }; 'Syriac' => { currentCls->add(CLASS_SCRIPT_SYRIAC, negated); fret; }; 'Tagalog' => { currentCls->add(CLASS_SCRIPT_TAGALOG, negated); fret; }; 'Tagbanwa' => { currentCls->add(CLASS_SCRIPT_TAGBANWA, negated); fret; }; 'Tai_Le' => { currentCls->add(CLASS_SCRIPT_TAI_LE, negated); fret; }; 'Tai_Tham' => { currentCls->add(CLASS_SCRIPT_TAI_THAM, negated); fret; }; 'Tai_Viet' => { currentCls->add(CLASS_SCRIPT_TAI_VIET, negated); fret; }; 'Tamil' => { currentCls->add(CLASS_SCRIPT_TAMIL, negated); fret; }; 'Telugu' => { currentCls->add(CLASS_SCRIPT_TELUGU, negated); fret; }; 'Thaana' => { currentCls->add(CLASS_SCRIPT_THAANA, 
negated); fret; }; 'Thai' => { currentCls->add(CLASS_SCRIPT_THAI, negated); fret; }; 'Tibetan' => { currentCls->add(CLASS_SCRIPT_TIBETAN, negated); fret; }; 'Tifinagh' => { currentCls->add(CLASS_SCRIPT_TIFINAGH, negated); fret; }; 'Ugaritic' => { currentCls->add(CLASS_SCRIPT_UGARITIC, negated); fret; }; 'Vai' => { currentCls->add(CLASS_SCRIPT_VAI, negated); fret; }; 'Yi' => { currentCls->add(CLASS_SCRIPT_YI, negated); fret; }; 'Any' => { currentCls->add(CLASS_UCP_ANY, negated); fret; }; any => { throw LocatedParseError("Unknown property"); }; *|; readBracedUCP := ('{' ('^' ${ negated = !negated; }) ? ([^^] ${ fhold; fcall readUCP; }) '}' ${ if (!inCharClass) { // not inside [..] currentCls->finalize(); currentSeq->addComponent(std::move(currentCls)); } fret; }) $^{ throw LocatedParseError("Malformed property"); }; readUCPSingle := |* 'C' => { currentCls->add(CLASS_UCP_C, negated); if (!inCharClass) { currentCls->finalize(); currentSeq->addComponent(std::move(currentCls)); } fret; }; 'L' => { currentCls->add(CLASS_UCP_L, negated); if (!inCharClass) { currentCls->finalize(); currentSeq->addComponent(std::move(currentCls)); } fret; }; 'M' => { currentCls->add(CLASS_UCP_M, negated); if (!inCharClass) { currentCls->finalize(); currentSeq->addComponent(std::move(currentCls)); } fret; }; 'N' => { currentCls->add(CLASS_UCP_N, negated); if (!inCharClass) { currentCls->finalize(); currentSeq->addComponent(std::move(currentCls)); } fret; }; 'P' => { currentCls->add(CLASS_UCP_P, negated); if (!inCharClass) { currentCls->finalize(); currentSeq->addComponent(std::move(currentCls)); } fret; }; 'S' => { currentCls->add(CLASS_UCP_S, negated); if (!inCharClass) { currentCls->finalize(); currentSeq->addComponent(std::move(currentCls)); } fret; }; 'Z' => { currentCls->add(CLASS_UCP_Z, negated); if (!inCharClass) { currentCls->finalize(); currentSeq->addComponent(std::move(currentCls)); } fret; }; any => { throw LocatedParseError("Unknown property"); }; *|; charClassGuts := |* # We 
don't support POSIX collating elements (neither does PCRE # or Perl). These look like [.ch.] or [=ch=]. '\[\.' ( '\\]' | [^\]] )* '\.\]' | '\[=' ( '\\]' | [^\]] )* '=\]' => { throw LocatedParseError("Unsupported POSIX collating " "element"); }; # Named sets # Adding these may cause the charclass to close, hence the # finalized check - UE-2276 '[:alnum:]' => { currentCls->add(CLASS_ALNUM, false); }; '[:^alnum:]' => { currentCls->add(CLASS_ALNUM, true); }; '[:alpha:]' => { currentCls->add(CLASS_ALPHA, false); }; '[:^alpha:]' => { currentCls->add(CLASS_ALPHA, true); }; '[:ascii:]' => { currentCls->add(CLASS_ASCII, false); }; '[:^ascii:]' => { currentCls->add(CLASS_ASCII, true); }; '[:blank:]' => { currentCls->add(CLASS_BLANK, false); }; '[:^blank:]' => { currentCls->add(CLASS_BLANK, true); }; '[:cntrl:]' => { currentCls->add(CLASS_CNTRL, false); }; '[:^cntrl:]' => { currentCls->add(CLASS_CNTRL, true); }; '[:digit:]' => { currentCls->add(CLASS_DIGIT, false); }; '[:^digit:]' => { currentCls->add(CLASS_DIGIT, true); }; '[:graph:]' => { currentCls->add(CLASS_GRAPH, false); }; '[:^graph:]' => { currentCls->add(CLASS_GRAPH, true); }; '[:lower:]' => { currentCls->add(CLASS_LOWER, false); }; '[:^lower:]' => { currentCls->add(CLASS_LOWER, true); }; '[:print:]' => { currentCls->add(CLASS_PRINT, false); }; '[:^print:]' => { currentCls->add(CLASS_PRINT, true); }; '[:punct:]' => { currentCls->add(CLASS_PUNCT, false); }; '[:^punct:]' => { currentCls->add(CLASS_PUNCT, true); }; # Posix SPACE covers 9, 10, 11, 12, 13, 32 '[:space:]' => { currentCls->add(CLASS_SPACE, false); }; '[:^space:]' => { currentCls->add(CLASS_SPACE, true); }; '[:upper:]' => { currentCls->add(CLASS_UPPER, false); }; '[:^upper:]' => { currentCls->add(CLASS_UPPER, true); }; '[:word:]' => { currentCls->add(CLASS_WORD, false); }; '[:^word:]' => { currentCls->add(CLASS_WORD, true); }; '[:xdigit:]' => { currentCls->add(CLASS_XDIGIT, false); }; '[:^xdigit:]' => { currentCls->add(CLASS_XDIGIT, true); }; # Anything else 
between "[:" and ":]" is an invalid POSIX class. # Note that "\]" counts as a literal char here. '\[:' ( '\\]' | [^\]] )* ':\]' => { throw LocatedParseError("Invalid POSIX named class"); }; '\\Q' => { fcall readQuotedClass; }; '\\E' => { /*noop*/}; # Backspace (this is only valid for \b in char classes) '\\b' => { currentCls->add('\x08'); }; # Tab '\\t' => { currentCls->add('\x09'); }; # Newline '\\n' => { currentCls->add('\x0a'); }; # Carriage return '\\r' => { currentCls->add('\x0d'); }; # Form feed '\\f' => { currentCls->add('\x0c'); }; # Bell '\\a' => { currentCls->add('\x07'); }; # Escape '\\e' => { currentCls->add('\x1b'); }; # Horizontal whitespace '\\h' => { currentCls->add(CLASS_HORZ, false); }; # Not horizontal whitespace '\\H' => { currentCls->add(CLASS_HORZ, true); }; # Vertical whitespace '\\v' => { currentCls->add(CLASS_VERT, false); }; # Not vertical whitespace '\\V' => { currentCls->add(CLASS_VERT, true); }; '\\p{' => { negated = false; fhold; fcall readBracedUCP; }; '\\p' any => { negated = false; fhold; fcall readUCPSingle; }; '\\P{' => { negated = true; fhold; fcall readBracedUCP; }; '\\P'any => { negated = true; fhold; fcall readUCPSingle; }; '\\P' => { throw LocatedParseError("Malformed property"); }; '\\p' => { throw LocatedParseError("Malformed property"); }; # Octal escapedOctal0 => { currentCls->add(octAccumulator); }; escapedOctal2c => { currentCls->add(octAccumulator); }; '\\o{' [0-7]+ '}' => { string oct(ts + 3, te - ts - 4); unsigned long val; try { val = stoul(oct, nullptr, 8); } catch (const std::out_of_range &) { val = MAX_UNICODE + 1; } if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) { throw LocatedParseError("Value in \\o{...} sequence is too large"); } currentCls->add((unichar)val); }; # And for when it goes wrong '\\o' => { throw LocatedParseError("Value in \\o{...} sequence is non-octal or missing braces"); }; # Hex escapedHex => { currentCls->add(accumulator); }; # not a back-ref, not octal, just PCRE madness '\\' [89] => { 
// whatever we found here currentCls->add(*(ts + 1)); }; # Unicode Hex '\\x{' xdigit+ '}' => { string hex(ts + 3, te - ts - 4); unsigned long val; try { val = stoul(hex, nullptr, 16); } catch (const std::out_of_range &) { val = MAX_UNICODE + 1; } if (val > MAX_UNICODE) { throw LocatedParseError("Value in \\x{...} sequence is too large"); } currentCls->add((unichar)val); }; # And for when it goes wrong '\\x{' => { throw LocatedParseError("Value in \\x{...} sequence is non-hex or missing }"); }; # Control characters escapedCtrl => { if (te - ts < 3) { assert(te - ts == 2); throw LocatedParseError(SLASH_C_ERROR); } else { assert(te - ts == 3); currentCls->add(decodeCtrl(ts[2])); } }; # Word character '\\w' => { currentCls->add(CLASS_WORD, false); }; # Non word character '\\W' => { currentCls->add(CLASS_WORD, true); }; # Whitespace character (except VT) '\\s' => { currentCls->add(CLASS_SPACE, false); }; # Non whitespace character '\\S' => { currentCls->add(CLASS_SPACE, true); }; # Digit character '\\d' => { currentCls->add(CLASS_DIGIT, false); }; # Non digit character '\\D' => { currentCls->add(CLASS_DIGIT, true); }; '\-' => { currentCls->addDash(); }; # A bunch of unsupported (for now) escapes escapedUnsupported - '\\X' => throwUnsupportedEscape; # PCRE appears to discard escaped g in a char class (a backref bug?) '\\g' => throwUnsupportedEscape; # the too-hard basket: UE-944, UE-1134, UE-1157 # many escaped single char literals shold be benign, but PCRE # breaks with them when adding to ranges, so unless they have # defined special meaning in a char-class we reject them to be # safe. 
'\\' alpha => throwUnsupportedEscape;

        '\\' any => {
            // add the literal char
            currentCls->add(*(ts + 1));
        };

        #unicode chars
        utf8_2c when is_utf8 => {
            assert(mode.utf8);
            currentCls->add(readUtf8CodePoint2c(ts));
        };

        utf8_3c when is_utf8 => {
            assert(mode.utf8);
            currentCls->add(readUtf8CodePoint3c(ts));
        };

        utf8_4c when is_utf8 => {
            assert(mode.utf8);
            currentCls->add(readUtf8CodePoint4c(ts));
        };

        hi_byte when is_utf8 => {
            assert(mode.utf8);
            throwInvalidUtf8();
        };

        # Literal character
        (any - ']') => {
            currentCls->add((u8)*ts);
        };

        ']' => {
            // End of the class: finalize it, move ownership into the current
            // sequence, clear the in-class flag and resume the main scanner.
            currentCls->finalize();
            currentSeq->addComponent(std::move(currentCls));
            inCharClass = false;
            fgoto main;
        };
    *|;

    #############################################################
    # Parser to read stuff from a character class
    #############################################################
    readClass := |*
        # A caret at the beginning of the class means that the rest of the
        # class is negated.
        '\^' when is_early_charclass => {
            if (currentCls->isNegated()) {
                // Already seen a caret; the second one is not a meta-character.
                // fhold backs up one byte so charClassGuts scans this caret
                // again as ordinary class content.
                inCharClassEarly = false;
                fhold;
                fgoto charClassGuts;
            } else {
                currentCls->negate();
                // Note: we cannot switch off inCharClassEarly here, as /[^]]/
                // needs to use the right square bracket path below.
            }
        };

        # A right square bracket before anything "real" is interpreted as a
        # literal right square bracket.
        ']' when is_early_charclass => {
            // The class now has real content, so leave the early state.
            currentCls->add(']');
            inCharClassEarly = false;
        };

        # if we hit a quote before anything "real", handle it
        '\\Q' => { fcall readQuotedClass; };
        '\\E' => { /*noop*/};

        # time for the real work to happen
        any => {
            // Not a leading meta-character: back up one byte with fhold and
            // hand the same byte to charClassGuts.
            inCharClassEarly = false;
            fhold;
            fgoto charClassGuts;
        };
    *|;

    #############################################################
    # Parser to read a quoted literal
    #############################################################
    readQuotedLiteral := |*
        # Escape sequence
        '\\E' => {
            // End of the quoted run; return to normal parsing.
            fgoto main;
        };

        #unicode chars
        utf8_2c when is_utf8 => {
            assert(mode.utf8);
            /* leverage ComponentClass to generate the vertices */
            auto cc = getComponentClass(mode);
            cc->add(readUtf8CodePoint2c(ts));
            cc->finalize();
            currentSeq->addComponent(std::move(cc));
        };

        utf8_3c when is_utf8 => {
            assert(mode.utf8);
            /* leverage ComponentClass to generate the vertices */
            auto cc = getComponentClass(mode);
            cc->add(readUtf8CodePoint3c(ts));
            cc->finalize();
            currentSeq->addComponent(std::move(cc));
        };

        utf8_4c when is_utf8 => {
            assert(mode.utf8);
            /* leverage ComponentClass to generate the vertices */
            auto cc = getComponentClass(mode);
            cc->add(readUtf8CodePoint4c(ts));
            cc->finalize();
            currentSeq->addComponent(std::move(cc));
        };

        hi_byte when is_utf8 => {
            assert(mode.utf8);
            throwInvalidUtf8();
        };

        # Literal character
        any => {
            addLiteral(currentSeq, *ts, mode);
        };
    *|;

    #############################################################
    # Parser to read a quoted class
    #############################################################
    readQuotedClass := |*
        # Escape sequence
        '\\E' => {
            // Entered via fcall from the class scanners, so return to the
            // caller rather than jumping to a fixed scanner.
            fret;
        };

        #unicode chars
        utf8_2c when is_utf8 => {
            assert(mode.utf8);
            currentCls->add(readUtf8CodePoint2c(ts));
            inCharClassEarly = false;
        };

        utf8_3c when is_utf8 => {
            assert(mode.utf8);
            currentCls->add(readUtf8CodePoint3c(ts));
            inCharClassEarly = false;
        };

        utf8_4c when is_utf8 => {
            assert(mode.utf8);
            currentCls->add(readUtf8CodePoint4c(ts));
            inCharClassEarly = false;
        };

        hi_byte when is_utf8 => {
            assert(mode.utf8);
            throwInvalidUtf8();
        };

        # Literal character
        any => {
            // Every quoted byte is added as-is and counts as real class
            // content, which ends the early state.
            currentCls->add(*ts);
            inCharClassEarly = false;
        };
    *|;

    #############################################################
    # Parser to read (and ignore) a comment block
    #############################################################
    readComment := |*
        # Right paren
        '\)' => {
            // Comment closed: clear the flag that parse() checks for an
            // unterminated comment and resume normal parsing.
            inComment = false;
            fgoto main;
        };

        # absolutely everything gets ignored until we see a right
        # paren
        any;
    *|;

    #############################################################
    # Parser to read (and ignore) a newline-terminated comment
    # block
    #############################################################
    readNewlineTerminatedComment := |*
        '\n' => {
            inComment = false;
            fgoto main;
        };

        # absolutely everything gets ignored until we see a
        # newline
        any;
    *|;

    #############################################################
    # Parser for standard components
    #############################################################
    main := |*
        #############################################################
        # Standard components
        #############################################################
        # Begin capturing group (non-capturing handled further down)
        '\(' => enterCapturingGroup;

        # End group
        '\)' => exitGroup;

        # Mark alternation
        '\|' => {
            currentSeq->addAlternation();
        };

        # POSIX named elements should only be used inside a class. Note
        # that we need to be able to reject /[:\]:]/ here.
        '\[:' ( '\\]' | [^\]] )* ':\]' => {
            throw LocatedParseError("POSIX named classes are only "
                                    "supported inside a class");
        };

        # We don't support POSIX collating elements (neither does PCRE
        # or Perl). These look like [.ch.] or [=ch=].
        '\[\.' ( '\\]' | [^\]] )* '\.\]' |
        '\[=' ( '\\]' | [^\]] )* '=\]' => {
            throw LocatedParseError("Unsupported POSIX collating "
                                    "element");
        };

        # Begin eating characters for class
        '\[' => eatClass;

        # Begin quoted literal
        '\\Q' => {
            fgoto readQuotedLiteral;
        };

        # An \E that is not preceded by a \Q is ignored
        '\\E' => { /* noop */ };

        # Match any character
        '\.'
=> {
            currentSeq->addComponent(generateComponent(CLASS_ANY, false, mode));
        };

        # Match one byte
        '\\C' => {
            if (mode.utf8) {
                throw LocatedParseError("\\C is unsupported in UTF8");
            }
            currentSeq->addComponent(std::make_unique());
        };

        # Match 0 or more times (greedy)
        '\*' => {
            // addRepeat returning false is reported as an invalid repeat;
            // the same pattern is used by every quantifier rule below.
            if (!currentSeq->addRepeat(0, ComponentRepeat::NoLimit,
                                       ComponentRepeat::REPEAT_GREEDY)) {
                throwInvalidRepeat();
            }
        };

        # Match 0 or more times (non-greedy)
        '\*\?' => {
            if (!currentSeq->addRepeat(0, ComponentRepeat::NoLimit,
                                       ComponentRepeat::REPEAT_NONGREEDY)) {
                throwInvalidRepeat();
            }
        };

        # Match 0 or more times (possessive)
        '\*\+' => {
            if (!currentSeq->addRepeat(0, ComponentRepeat::NoLimit,
                                       ComponentRepeat::REPEAT_POSSESSIVE)) {
                throwInvalidRepeat();
            }
        };

        # Match 1 or more times (greedy)
        '\+' => {
            if (!currentSeq->addRepeat(1, ComponentRepeat::NoLimit,
                                       ComponentRepeat::REPEAT_GREEDY)) {
                throwInvalidRepeat();
            }
        };

        # Match 1 or more times (non-greedy)
        '\+\?' => {
            if (!currentSeq->addRepeat(1, ComponentRepeat::NoLimit,
                                       ComponentRepeat::REPEAT_NONGREEDY)) {
                throwInvalidRepeat();
            }
        };

        # Match 1 or more times (possessive)
        '\+\+' => {
            if (!currentSeq->addRepeat(1, ComponentRepeat::NoLimit,
                                       ComponentRepeat::REPEAT_POSSESSIVE)) {
                throwInvalidRepeat();
            }
        };

        # Match 0 or 1 times (greedy)
        '\?' => {
            if (!currentSeq->addRepeat(
                    0, 1, ComponentRepeat::REPEAT_GREEDY)) {
                throwInvalidRepeat();
            }
        };

        # Match 0 or 1 times (non-greedy)
        '\?\?' => {
            if (!currentSeq->addRepeat(
                    0, 1, ComponentRepeat::REPEAT_NONGREEDY)) {
                throwInvalidRepeat();
            }
        };

        # Match 0 or 1 times (possessive)
        '\?\+' => {
            if (!currentSeq->addRepeat(
                    0, 1, ComponentRepeat::REPEAT_POSSESSIVE)) {
                throwInvalidRepeat();
            }
        };

        # Match {n}|{n,}|{n,m} times (greedy)
        repeatNM1 => {
            // Reject bounds that are reversed or whose upper bound is zero
            // before handing them to addRepeat.
            if (repeatN > repeatM || repeatM == 0) {
                throwInvalidRepeat();
            } else if (!currentSeq->addRepeat(
                            repeatN, repeatM,
                            ComponentRepeat::REPEAT_GREEDY)) {
                throwInvalidRepeat();
            }
        };

        # Match {n}|{n,}|{n,m} times (non-greedy)
        repeatNM1 '\?' => {
            if (repeatN > repeatM || repeatM == 0) {
                throwInvalidRepeat();
            } else if (!currentSeq->addRepeat(
                            repeatN, repeatM,
                            ComponentRepeat::REPEAT_NONGREEDY)) {
                throwInvalidRepeat();
            }
        };

        # Match {n}|{n,}|{n,m} times (possessive)
        repeatNM1 '\+' => {
            if (repeatN > repeatM || repeatM == 0) {
                throwInvalidRepeat();
            } else if (!currentSeq->addRepeat(
                            repeatN, repeatM,
                            ComponentRepeat::REPEAT_POSSESSIVE)) {
                throwInvalidRepeat();
            }
        };

        # In ignore_space mode, an unescaped # character introduces a
        # comment that runs until the next newline or the end of the
        # pattern.
        '\#' when is_ignore_space => enterNewlineTerminatedComment;

        # Perl 5.10 Special Backtracking Control Verbs: we support
        # UTF8/UCP, none of the others
        '(*' [^)] => {
            // fhold backs up one byte so the byte matched after the opener
            // is scanned again by readVerb.
            fhold;
            fcall readVerb;
        };

        # Earlier parser code checked for the terminating NULL and exited
        # explicitly.
        '\0' => {
            // Not expected to be reached: asserts when assertions are
            // enabled, otherwise just stops the machine.
            assert(0);
            fbreak;
        };

        #############################################################
        # Boundaries
        #############################################################

        # Start of data; also after internal newline in multiline mode
        '\^' => {
            auto bound = mode.multiline ? ComponentBoundary::BEGIN_LINE
                                        : ComponentBoundary::BEGIN_STRING;
            currentSeq->addComponent(std::make_unique(bound));
        };

        # End of data (with optional internal newline); also before
        # internal newline in multiline mode
        '\$' => {
            auto bound = mode.multiline ?
                             ComponentBoundary::END_LINE
                             : ComponentBoundary::END_STRING_OPTIONAL_LF;
            currentSeq->addComponent(std::make_unique(bound));
        };

        # Beginning of data
        '\\A' => {
            auto bound = ComponentBoundary::BEGIN_STRING;
            currentSeq->addComponent(std::make_unique(bound));
        };

        # End of data (with optional internal newline)
        '\\Z' => {
            auto bound = ComponentBoundary::END_STRING_OPTIONAL_LF;
            currentSeq->addComponent(std::make_unique(bound));
        };

        # End of data
        '\\z' => {
            auto bound = ComponentBoundary::END_STRING;
            currentSeq->addComponent(std::make_unique(bound));
        };

        # Word boundary
        '\\b' => {
            // ts - ptr is the offset of this token from the start of the
            // pattern.
            currentSeq->addComponent(
                std::make_unique(ts - ptr, false, mode));
        };

        # Non-word boundary
        '\\B' => {
            currentSeq->addComponent(
                std::make_unique(ts - ptr, true, mode));
        };

        #############################################################
        # Escaped chars
        #############################################################

        # Tab
        '\\t' => {
            addLiteral(currentSeq, '\x09', mode);
        };

        # Newline
        '\\n' => {
            addLiteral(currentSeq, '\x0a', mode);
        };

        # Carriage return
        '\\r' => {
            addLiteral(currentSeq, '\x0d', mode);
        };

        # Form feed
        '\\f' => {
            addLiteral(currentSeq, '\x0c', mode);
        };

        # Bell
        '\\a' => {
            addLiteral(currentSeq, '\x07', mode);
        };

        # Escape
        '\\e' => {
            addLiteral(currentSeq, '\x1b', mode);
        };

        # Octal
        escapedOctal0 => {
            addLiteral(currentSeq, octAccumulator, mode);
        };

        escapedOctal2 => {
            // If there are enough capturing sub expressions, this may be
            // a back reference
            // octAccumulator holds the digits read as octal; parseAsDecimal
            // reinterprets them as decimal to get the candidate group number.
            accumulator = parseAsDecimal(octAccumulator);
            if (accumulator < groupIndex) {
                currentSeq->addComponent(std::make_unique(accumulator));
            } else {
                addEscapedOctal(currentSeq, octAccumulator, mode);
            }
        };

        # Numeric back reference
        # everything less than 8 is a straight up back ref, even if
        # it is a forwards backward reference (aieeee!)
        # Note that \8 and \9 are the literal chars '8' and '9'.
        '\\' backRefIdSingle => addNumberedBackRef;

        # otherwise we need to munge through the possible backref
        '\\' backRefId => {
            // if there are enough left parens to this point, back ref
            if (accumulator < groupIndex) {
                currentSeq->addComponent(std::make_unique(accumulator));
            } else {
                // Otherwise, we interpret the first three digits as an
                // octal escape, and the remaining characters stand for
                // themselves as literals.
                const char *s = ts;
                unsigned int accum = 0;
                unsigned int oct_digits = 0;
                assert(*s == '\\'); // token starts at backslash
                // Consume at most three leading octal digits, stopping at
                // the first byte that is not in the range 0-7.
                for (++s; s < te && oct_digits < 3; ++oct_digits, ++s) {
                    u8 digit = *s - '0';
                    if (digit < 8) {
                        accum = digit + accum * 8;
                    } else {
                        break;
                    }
                }

                if (oct_digits > 0) {
                    addEscapedOctal(currentSeq, accum, mode);
                }

                // And then the rest of the digits, if any, are literal.
                for (; s < te; ++s) {
                    addLiteral(currentSeq, *s, mode);
                }
            }
        };

        backReferenceG => addNumberedBackRef;
        backReferenceGNegative => addNegativeNumberedBackRef;
        backReferenceGBracket => addNumberedBackRef;
        backReferenceGBracket2 => addNegativeNumberedBackRef;
        backReferenceGBracketName => addNamedBackRef;
        backReferenceKBracketName => addNamedBackRef;
        backReferenceKBracketName2 => addNamedBackRef;
        backReferenceKBracketName3 => addNamedBackRef;
        backReferenceP => addNamedBackRef;

        # Oniguruma - either angle braces or single quotes for this one
        ('\\g<' [^>]*? '>'|'\\g\'' [^\']*? '\'') => {
            ostringstream str;
            str << "Onigiruma subroutine call at index " << ts - ptr <<
                   " not supported.";
            throw ParseError(str.str());
        };

        # Fallthrough: a \g that hasn't been caught by one of the above
        # is invalid syntax. Without this rule, we would accept /A\g/.
        '\\g' => {
            throw LocatedParseError("Invalid reference after \\g");
        };

        '\\o{' [0-7]+ '}' => {
            // The digits sit between the three-byte prefix and the one-byte
            // closing brace, hence ts + 3 and a length of te - ts - 4.
            string oct(ts + 3, te - ts - 4);
            unsigned long val;
            try {
                val = stoul(oct, nullptr, 8);
            } catch (const std::out_of_range &) {
                // Overflow in stoul is mapped to a value that the range
                // check below rejects.
                val = MAX_UNICODE + 1;
            }
            if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) {
                throw LocatedParseError("Value in \\o{...} sequence is too large");
            }
            addEscapedOctal(currentSeq, (unichar)val, mode);
        };

        # And for when it goes wrong
        '\\o' => {
            throw LocatedParseError("Value in \\o{...} sequence is non-octal or missing braces");
        };

        # Hex
        escapedHex => {
            addEscapedHex(currentSeq, accumulator, mode);
        };

        # Unicode Hex
        '\\x{' xdigit+ '}' => {
            // Same slicing as the braced octal rule: skip the three-byte
            // prefix and drop the closing brace.
            string hex(ts + 3, te - ts - 4);
            unsigned long val;
            try {
                val = stoul(hex, nullptr, 16);
            } catch (const std::out_of_range &) {
                // Overflow in stoul is mapped to a value that the range
                // check below rejects.
                val = MAX_UNICODE + 1;
            }
            if (val > MAX_UNICODE) {
                throw LocatedParseError("Value in \\x{...} sequence is too large");
            }
            addEscapedHex(currentSeq, (unichar)val, mode);
        };

        # And for when it goes wrong
        '\\x{' => {
            throw LocatedParseError("Value in \\x{...} sequence is non-hex or missing }");
        };

        # Control characters
        escapedCtrl => {
            // NOTE(review): a two-byte token is presumably the escape with
            // no control character after it - confirm against the
            // escapedCtrl definition.
            if (te - ts < 3) {
                assert(te - ts == 2);
                throw LocatedParseError(SLASH_C_ERROR);
            } else {
                assert(te - ts == 3);
                addLiteral(currentSeq, decodeCtrl(ts[2]), mode);
            }
        };

        # A bunch of unsupported (for now) escapes
        escapedUnsupported => {
            ostringstream str;
            str << "'\\" << *(ts + 1) << "' at index " << ts - ptr
                << " not supported.";
            throw ParseError(str.str());
        };

        # Word character
        '\\w' => {
            auto cc = generateComponent(CLASS_WORD, false, mode);
            currentSeq->addComponent(std::move(cc));
        };

        # Non word character
        '\\W' => {
            auto cc = generateComponent(CLASS_WORD, true, mode);
            currentSeq->addComponent(std::move(cc));
        };

        # Whitespace character
        '\\s' => {
            auto cc = generateComponent(CLASS_SPACE, false, mode);
            currentSeq->addComponent(std::move(cc));
        };

        # Non whitespace character
        '\\S' => {
            auto cc = generateComponent(CLASS_SPACE, true, mode);
            currentSeq->addComponent(std::move(cc));
        };

        # Digit character
        '\\d' => {
            auto cc = generateComponent(CLASS_DIGIT, false, mode);
            currentSeq->addComponent(std::move(cc));
        };

        # Non digit character
        '\\D' => {
            auto cc = generateComponent(CLASS_DIGIT, true, mode);
            currentSeq->addComponent(std::move(cc));
        };

        # Horizontal whitespace
        '\\h' => {
            auto cc = generateComponent(CLASS_HORZ, false, mode);
            currentSeq->addComponent(std::move(cc));
        };

        # Not horizontal whitespace
        '\\H' => {
            auto cc = generateComponent(CLASS_HORZ, true, mode);
            currentSeq->addComponent(std::move(cc));
        };

        # Vertical whitespace
        '\\v' => {
            auto cc = generateComponent(CLASS_VERT, false, mode);
            currentSeq->addComponent(std::move(cc));
        };

        # Not vertical whitespace
        '\\V' => {
            auto cc = generateComponent(CLASS_VERT, true, mode);
            currentSeq->addComponent(std::move(cc));
        };

        '\\p{' => {
            // Outside a class, a fresh class object is created to receive
            // the property. fhold backs up one byte so the called scanner
            // starts on the last byte of this token.
            assert(!currentCls && !inCharClass);
            currentCls = getComponentClass(mode);
            negated = false;
            fhold;
            fcall readBracedUCP;
        };

        '\\p' any => {
            assert(!currentCls && !inCharClass);
            currentCls = getComponentClass(mode);
            negated = false;
            fhold;
            fcall readUCPSingle;
        };

        '\\P{' => {
            assert(!currentCls && !inCharClass);
            currentCls = getComponentClass(mode);
            negated = true;
            fhold;
            fcall readBracedUCP;
        };

        '\\P' any => {
            assert(!currentCls && !inCharClass);
            currentCls = getComponentClass(mode);
            negated = true;
            fhold;
            fcall readUCPSingle;
        };

        '\\P' => { throw LocatedParseError("Malformed property"); };

        '\\p' => { throw LocatedParseError("Malformed property"); };

        # Newline sequence, hairy semantics that we don't do
        '\\R' => {
            ostringstream str;
            str << "\\R at index " << ts - ptr << " not supported.";
            throw ParseError(str.str());
        };

        # Reset start of match, also hairy semantics that we don't do
        '\\K' => {
            ostringstream str;
            str << "\\K at index " << ts - ptr << " not supported.";
            throw ParseError(str.str());
        };

        # \k without a backref is bugged in PCRE so we have no
        # idea what our semantics should be on it
        '\\k' => {
            ostringstream str;
            str << "\\k at index " << ts - ptr << " not supported.";
            throw ParseError(str.str());
        };

        # \G is more hairy pcre-api stuff, DO NOT WANT
        '\\G' => {
            ostringstream str;
            str << "\\G at index " << ts - ptr << " not supported.";
            throw ParseError(str.str());
        };

        '\\X' => {
            currentSeq->addComponent(std::make_unique(ts - ptr, mode));
        };

        # Fall through general escaped character
        '\\' any => {
            addLiteral(currentSeq, *(ts + 1), mode);
        };

        # A backslash with no follower is not allowed
        '\\' => {
            // Only expected when the backslash is the final byte of the
            // pattern, as the assertion states.
            assert(ts + 1 == pe);
            ostringstream str;
            str << "Unescaped \\ at end of input, index " << ts - ptr << ".";
            throw ParseError(str.str());
        };

        #############################################################
        # Extended patterns
        #############################################################

        # Comment
        '\(\?\#' => enterComment;

        # Match modifiers
        '\(\?' matchModifiers >resetModifiers ')' => applyModifiers;

        # Non-capturing group, with flag modifiers
        '\(\?' matchModifiers >resetModifiers ':' => enterModifiedGroup;

        # Zero width look ahead assertion
        '\(\?=' => enterZWLookAhead;

        # Zero width negative look ahead assertion
        '\(\?\!' => enterZWNegLookAhead;

        # Zero width look behind assertion
        '\(\?\<=' => enterZWLookBehind;

        # Zero width negative look behind assertion
        '\(\?\<\!' => enterZWNegLookBehind;

        # Code (TOTALLY unsupported... for good reason)
        '\(\?\{' => enterEmbeddedCode;
        '\(\?\?\{' => enterEmbeddedCode;

        # Atomic group
        '\(\?\>' => enterAtomicGroup;

        # Named capturing groups
        ( namedGroup1 |
          namedGroup2 |
          namedGroup3 ) => enterNamedGroup;

        # named/numbered subroutine references
        numberedSubExpression => enterReferenceUnsupported;
        namedSubExpression => enterReferenceUnsupported;

        # Conditional reference with a positive lookahead assertion
        '(?(?=' => {
            // Keep a raw pointer to the assertion: ownership of a moves into
            // the component created below. Two states are pushed, one for
            // the enclosing component and one for the assertion, which then
            // becomes the current sequence.
            auto a = std::make_unique(
                ComponentAssertion::LOOKAHEAD, ComponentAssertion::POS);
            ComponentAssertion *a_seq = a.get();
            PUSH_SEQUENCE;
            currentSeq = enterSequence(currentSeq,
                std::make_unique(std::move(a)));
            PUSH_SEQUENCE;
            currentSeq = a_seq;
        };

        # Conditional reference with a negative lookahead assertion
        '(?(?!' => {
            auto a = std::make_unique(
                ComponentAssertion::LOOKAHEAD, ComponentAssertion::NEG);
            ComponentAssertion *a_seq = a.get();
            PUSH_SEQUENCE;
            currentSeq = enterSequence(currentSeq,
                std::make_unique(std::move(a)));
            PUSH_SEQUENCE;
            currentSeq = a_seq;
        };

        # Conditional reference with a positive lookbehind assertion
        '(?(?<=' => {
            auto a = std::make_unique(
                ComponentAssertion::LOOKBEHIND, ComponentAssertion::POS);
            ComponentAssertion *a_seq = a.get();
            PUSH_SEQUENCE;
            currentSeq = enterSequence(currentSeq,
                std::make_unique(std::move(a)));
            PUSH_SEQUENCE;
            currentSeq = a_seq;
        };

        # Conditional reference with a negative lookbehind assertion
        '(?(? {
            auto a = std::make_unique(
                ComponentAssertion::LOOKBEHIND, ComponentAssertion::NEG);
            ComponentAssertion *a_seq = a.get();
            PUSH_SEQUENCE;
            currentSeq = enterSequence(currentSeq,
                std::make_unique(std::move(a)));
            PUSH_SEQUENCE;
            currentSeq = a_seq;
        };

        # Recursive conditional references (unsupported)
        '(?(R' ( [0-9]+ | ('&' [A-Za-z0-9_]+) ) ? ')' => {
            throw LocatedParseError("Pattern recursion not supported");
        };

        # Conditional references
        # numbered
        '\(\?\(' (backRefIdSingle | backRefId) ')' => enterNumberedConditionalRef;

        # named
        ( namedConditionalRef1 |
          namedConditionalRef2 |
          namedConditionalRef3 ) => enterNamedConditionalRef;

        # Conditions (unsupported)
        '\(\?\(' => enterConditionUnsupported;

        # Callouts (unsupported)
        '\(\?C' [0-9]* '\)' => {
            ostringstream str;
            str << "Callout at index " << ts - ptr << " not supported.";
            throw ParseError(str.str());
        };

        # Any other char after '(?' is a pattern modifier we don't
        # recognise.
        '\(\?'
any => {
            throw LocatedParseError("Unrecognised character after (?");
        };

        #unicode chars
        utf8_2c when is_utf8 => {
            assert(mode.utf8);
            /* leverage ComponentClass to generate the vertices */
            auto cc = getComponentClass(mode);
            cc->add(readUtf8CodePoint2c(ts));
            cc->finalize();
            currentSeq->addComponent(std::move(cc));
        };

        utf8_3c when is_utf8 => {
            assert(mode.utf8);
            /* leverage ComponentClass to generate the vertices */
            auto cc = getComponentClass(mode);
            cc->add(readUtf8CodePoint3c(ts));
            cc->finalize();
            currentSeq->addComponent(std::move(cc));
        };

        utf8_4c when is_utf8 => {
            assert(mode.utf8);
            /* leverage ComponentClass to generate the vertices */
            auto cc = getComponentClass(mode);
            cc->add(readUtf8CodePoint4c(ts));
            cc->finalize();
            currentSeq->addComponent(std::move(cc));
        };

        hi_byte when is_utf8 => {
            assert(mode.utf8);
            throwInvalidUtf8();
        };

        #############################################################
        # Literal character
        #############################################################
        # literal character
        whitespace => {
            // Whitespace is only added as a literal when ignore_space is
            // off; otherwise it is dropped.
            if (mode.ignore_space == false) {
                addLiteral(currentSeq, *ts, mode);
            }
        };

        any => {
            addLiteral(currentSeq, *ts, mode);
        };
    *|;

    prepush {
        // Runs before the machine pushes a return state: when the call
        // stack is full (size equals top), grow it to 2 * (top + 1) slots.
        DEBUG_PRINTF("stack %zu top %d\n", stack.size(), top);
        if ((int)stack.size() == top) {
            stack.resize(2 * (top + 1));
        }
    }
}%%

%% write data nofinal;

/** \brief Main parser call: parses a pattern into a tree of components.
 *
 * \param ptr        NUL-terminated pattern text (its length is taken with
 *                   strlen); must not be null.
 * \param globalMode Mode flags. Passed to read_control_verbs(), which may
 *                   set flags from leading control verbs, and then copied
 *                   as the starting mode for parsing.
 * \return The root sequence (capture index 0) holding the parsed pattern.
 *
 * Failures are reported by throwing ParseError or LocatedParseError; a
 * LocatedParseError has its offset filled in here before it is re-thrown.
 * No return path in this function yields nullptr.
 */
unique_ptr parse(const char *ptr, ParseMode &globalMode) {
    assert(ptr);

    const char *p = ptr;
    const char *pe = ptr + strlen(ptr);

    // First, read the control verbs, set any global mode flags and move the
    // ptr forward.
    p = read_control_verbs(p, pe, 0, globalMode);

    // Variables below with the names p, pe, eof, cs, act, top, stack, ts and
    // te are the ones the generated Ragel code reads and writes.
    const char *eof = pe;
    int cs;
    UNUSED int act;
    int top;
    vector stack;
    const char *ts, *te;
    unichar accumulator = 0;
    unichar octAccumulator = 0; /* required as we are also accumulating for
                                 * back ref when looking for octals */
    unsigned repeatN = 0;
    unsigned repeatM = 0;
    string label;

    ParseMode mode = globalMode;
    ParseMode newMode;

    bool negated = false;
    bool inComment = false;

    // Stack of sequences and flags used to store state when we enter
    // sub-sequences.
    vector sequences;

    // Index of the next capturing group. Note that zero is reserved for the
    // root sequence.
    unsigned groupIndex = 1;

    // Set storing group names that are currently in use.
    flat_set groupNames;

    // Root sequence.
    unique_ptr rootSeq = std::make_unique();
    rootSeq->setCaptureIndex(0);

    // Current sequence being appended to
    ComponentSequence *currentSeq = rootSeq.get();

    // The current character class being appended to. This is used as the
    // accumulator for both character class and UCP properties.
    unique_ptr currentCls;

    // True if the machine is currently inside a character class, i.e. square
    // brackets [..].
    bool inCharClass = false;

    // True if the machine is inside a character class but it has not processed
    // any "real" elements yet, i.e. it's still processing meta-characters like
    // '^'.
    bool inCharClassEarly = false;

    // Location at which the current character class began.
    const char *currentClsBegin = p;

    // Parsing failures beyond this point are reported by throwing. The catch
    // block below performs no manual cleanup (rootSeq and currentCls are
    // unique_ptrs); it only fills in the offset of a LocatedParseError
    // before re-throwing it to the caller.
    try {
        // Embed the Ragel machine here
        %% write init;
        %% write exec;

        if (p != pe && *p != '\0') {
            // didn't make it to the end of our input, but we didn't throw a ParseError?
            assert(0);
            ostringstream str;
            str << "Parse error at index " << (p - ptr) << ".";
            throw ParseError(str.str());
        }

        // The checks below run in a fixed order, so a pattern with several
        // problems reports the first one listed here.

        // currentCls is still set only if no closing bracket moved it into
        // a sequence.
        if (currentCls) {
            assert(inCharClass);
            assert(currentClsBegin);
            ostringstream oss;
            oss << "Unterminated character class starting at index "
                << currentClsBegin - ptr << ".";
            throw ParseError(oss.str());
        }

        if (inComment) {
            throw ParseError("Unterminated comment.");
        }

        // Any state left on the stack belongs to a group that was opened
        // but never closed.
        if (!sequences.empty()) {
            ostringstream str;
            str << "Missing close parenthesis for group started at index "
                << sequences.back().seqOffset << ".";
            throw ParseError(str.str());
        }

        // Unlikely, but possible
        if (groupIndex > 65535) {
            throw ParseError("The maximum number of capturing subexpressions is 65535.");
        }

        // Finalize the top-level sequence, which will take care of any
        // top-level alternation.
        currentSeq->finalize();
        assert(currentSeq == rootSeq.get());

        // Ensure that all references are valid.
        checkReferences(*rootSeq, groupIndex, groupNames);

        return rootSeq;
    } catch (LocatedParseError &error) {
        // ts is the start of the token being scanned when the error was
        // raised. Use its offset if it lies within the pattern, otherwise
        // fall back to offset 0.
        if (ts >= ptr && ts <= pe) {
            error.locate(ts - ptr);
        } else {
            error.locate(0);
        }
        throw;
    }
}

} // namespace ue2