From c68bfe05d860265887101c4196f1c02d15d506b6 Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Mon, 9 Nov 2015 12:50:52 +1100
Subject: [PATCH] Don't use class_empty in early class parsing

Instead, explicitly track whether we're still in the early class parsing
machine.
---
 src/parser/Parser.rl | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl
index a0378dce..96656875 100644
--- a/src/parser/Parser.rl
+++ b/src/parser/Parser.rl
@@ -424,6 +424,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
         assert(!inCharClass); // not reentrant
         currentCls = getComponentClass(mode);
         inCharClass = true;
+        inCharClassEarly = true;
         currentClsBegin = ts;
         fgoto readClass;
     }
@@ -474,6 +475,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
     }
     action is_utf8 { mode.utf8 }
     action is_ignore_space { mode.ignore_space }
+    action is_early_charclass { inCharClassEarly }
 
     action addNumberedBackRef {
         if (accumulator == 0) {
@@ -1109,25 +1111,24 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
     # Parser to read stuff from a character class
     #############################################################
     readClass := |*
-        # the negate and right bracket out the front are special
-        '\^' => {
+        # A caret at the beginning of the class means that the rest of the
+        # class is negated.
+        '\^' when is_early_charclass => {
             if (currentCls->isNegated()) {
+                // Already seen a caret; the second one is not a meta-character.
+                inCharClassEarly = false;
                 fhold; fgoto charClassGuts;
             } else {
                 currentCls->negate();
+                // Note: we cannot switch off inCharClassEarly here, as /[^]]/
+                // needs to use the right square bracket path below.
             }
         };
-        ']' => {
-            // if this is the first thing in the class, add it and move along,
-            // otherwise jump into the char class machine to handle what might
-            // end up as fail
-            if (currentCls->class_empty()) {
-                currentCls->add(']');
-            } else {
-                // leave it for the next machine
-                fhold;
-            }
-            fgoto charClassGuts;
+        # A right square bracket before anything "real" is interpreted as a
+        # literal right square bracket.
+        ']' when is_early_charclass => {
+            currentCls->add(']');
+            inCharClassEarly = false;
         };
         # if we hit a quote before anything "real", handle it
         #'\\Q' => { fcall readQuotedClass; };
@@ -1137,7 +1138,11 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
         '\\E' => { /*noop*/};
 
         # time for the real work to happen
-        any => { fhold; fgoto charClassGuts; };
+        any => {
+            inCharClassEarly = false;
+            fhold;
+            fgoto charClassGuts;
+        };
     *|;
 
     #############################################################
@@ -1885,6 +1890,11 @@ unique_ptr parse(const char *const c_ptr, ParseMode &globalMode) {
     // brackets [..].
     bool inCharClass = false;
 
+    // True if the machine is inside a character class but it has not processed
+    // any "real" elements yet, i.e. it's still processing meta-characters like
+    // '^'.
+    bool inCharClassEarly = false;
+
     // Location at which the current character class began.
     const u8 *currentClsBegin = p;