Don't use class_empty in early class parsing

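Instead, explicitly track, with a new inCharClassEarly flag, whether we're
still in the early phase of character class parsing, i.e. before any "real"
class element has been processed.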
This commit is contained in:
Justin Viiret 2015-11-09 12:50:52 +11:00 committed by Matthew Barr
parent b1f6a539c7
commit c68bfe05d8

View File

@ -424,6 +424,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
assert(!inCharClass); // not reentrant assert(!inCharClass); // not reentrant
currentCls = getComponentClass(mode); currentCls = getComponentClass(mode);
inCharClass = true; inCharClass = true;
inCharClassEarly = true;
currentClsBegin = ts; currentClsBegin = ts;
fgoto readClass; fgoto readClass;
} }
@ -474,6 +475,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
} }
action is_utf8 { mode.utf8 } action is_utf8 { mode.utf8 }
action is_ignore_space { mode.ignore_space } action is_ignore_space { mode.ignore_space }
action is_early_charclass { inCharClassEarly }
action addNumberedBackRef { action addNumberedBackRef {
if (accumulator == 0) { if (accumulator == 0) {
@ -1109,25 +1111,24 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
# Parser to read stuff from a character class # Parser to read stuff from a character class
############################################################# #############################################################
readClass := |* readClass := |*
# the negate and right bracket out the front are special # A caret at the beginning of the class means that the rest of the
'\^' => { # class is negated.
'\^' when is_early_charclass => {
if (currentCls->isNegated()) { if (currentCls->isNegated()) {
// Already seen a caret; the second one is not a meta-character.
inCharClassEarly = false;
fhold; fgoto charClassGuts; fhold; fgoto charClassGuts;
} else { } else {
currentCls->negate(); currentCls->negate();
// Note: we cannot switch off inCharClassEarly here, as /[^]]/
// needs to use the right square bracket path below.
} }
}; };
']' => { # A right square bracket before anything "real" is interpreted as a
// if this is the first thing in the class, add it and move along, # literal right square bracket.
// otherwise jump into the char class machine to handle what might ']' when is_early_charclass => {
// end up as fail currentCls->add(']');
if (currentCls->class_empty()) { inCharClassEarly = false;
currentCls->add(']');
} else {
// leave it for the next machine
fhold;
}
fgoto charClassGuts;
}; };
# if we hit a quote before anything "real", handle it # if we hit a quote before anything "real", handle it
#'\\Q' => { fcall readQuotedClass; }; #'\\Q' => { fcall readQuotedClass; };
@ -1137,7 +1138,11 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
'\\E' => { /*noop*/}; '\\E' => { /*noop*/};
# time for the real work to happen # time for the real work to happen
any => { fhold; fgoto charClassGuts; }; any => {
inCharClassEarly = false;
fhold;
fgoto charClassGuts;
};
*|; *|;
############################################################# #############################################################
@ -1885,6 +1890,11 @@ unique_ptr<Component> parse(const char *const c_ptr, ParseMode &globalMode) {
// brackets [..]. // brackets [..].
bool inCharClass = false; bool inCharClass = false;
// True if the machine is inside a character class but it has not processed
// any "real" elements yet, i.e. it's still processing meta-characters like
// '^'.
bool inCharClassEarly = false;
// Location at which the current character class began. // Location at which the current character class began.
const u8 *currentClsBegin = p; const u8 *currentClsBegin = p;