mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
Don't use class_empty in early class parsing
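Instead, explicitly track, with a dedicated inCharClassEarly flag, whether the parser is still in the early part of the character class machine, i.e. it has not yet processed any "real" class elements and is still handling meta-characters like '^'.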
This commit is contained in:
parent
b1f6a539c7
commit
c68bfe05d8
@ -424,6 +424,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
assert(!inCharClass); // not reentrant
|
assert(!inCharClass); // not reentrant
|
||||||
currentCls = getComponentClass(mode);
|
currentCls = getComponentClass(mode);
|
||||||
inCharClass = true;
|
inCharClass = true;
|
||||||
|
inCharClassEarly = true;
|
||||||
currentClsBegin = ts;
|
currentClsBegin = ts;
|
||||||
fgoto readClass;
|
fgoto readClass;
|
||||||
}
|
}
|
||||||
@ -474,6 +475,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
}
|
}
|
||||||
action is_utf8 { mode.utf8 }
|
action is_utf8 { mode.utf8 }
|
||||||
action is_ignore_space { mode.ignore_space }
|
action is_ignore_space { mode.ignore_space }
|
||||||
|
action is_early_charclass { inCharClassEarly }
|
||||||
|
|
||||||
action addNumberedBackRef {
|
action addNumberedBackRef {
|
||||||
if (accumulator == 0) {
|
if (accumulator == 0) {
|
||||||
@ -1109,25 +1111,24 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
# Parser to read stuff from a character class
|
# Parser to read stuff from a character class
|
||||||
#############################################################
|
#############################################################
|
||||||
readClass := |*
|
readClass := |*
|
||||||
# the negate and right bracket out the front are special
|
# A caret at the beginning of the class means that the rest of the
|
||||||
'\^' => {
|
# class is negated.
|
||||||
|
'\^' when is_early_charclass => {
|
||||||
if (currentCls->isNegated()) {
|
if (currentCls->isNegated()) {
|
||||||
|
// Already seen a caret; the second one is not a meta-character.
|
||||||
|
inCharClassEarly = false;
|
||||||
fhold; fgoto charClassGuts;
|
fhold; fgoto charClassGuts;
|
||||||
} else {
|
} else {
|
||||||
currentCls->negate();
|
currentCls->negate();
|
||||||
|
// Note: we cannot switch off inCharClassEarly here, as /[^]]/
|
||||||
|
// needs to use the right square bracket path below.
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
']' => {
|
# A right square bracket before anything "real" is interpreted as a
|
||||||
// if this is the first thing in the class, add it and move along,
|
# literal right square bracket.
|
||||||
// otherwise jump into the char class machine to handle what might
|
']' when is_early_charclass => {
|
||||||
// end up as fail
|
currentCls->add(']');
|
||||||
if (currentCls->class_empty()) {
|
inCharClassEarly = false;
|
||||||
currentCls->add(']');
|
|
||||||
} else {
|
|
||||||
// leave it for the next machine
|
|
||||||
fhold;
|
|
||||||
}
|
|
||||||
fgoto charClassGuts;
|
|
||||||
};
|
};
|
||||||
# if we hit a quote before anything "real", handle it
|
# if we hit a quote before anything "real", handle it
|
||||||
#'\\Q' => { fcall readQuotedClass; };
|
#'\\Q' => { fcall readQuotedClass; };
|
||||||
@ -1137,7 +1138,11 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
'\\E' => { /*noop*/};
|
'\\E' => { /*noop*/};
|
||||||
|
|
||||||
# time for the real work to happen
|
# time for the real work to happen
|
||||||
any => { fhold; fgoto charClassGuts; };
|
any => {
|
||||||
|
inCharClassEarly = false;
|
||||||
|
fhold;
|
||||||
|
fgoto charClassGuts;
|
||||||
|
};
|
||||||
*|;
|
*|;
|
||||||
|
|
||||||
#############################################################
|
#############################################################
|
||||||
@ -1885,6 +1890,11 @@ unique_ptr<Component> parse(const char *const c_ptr, ParseMode &globalMode) {
|
|||||||
// brackets [..].
|
// brackets [..].
|
||||||
bool inCharClass = false;
|
bool inCharClass = false;
|
||||||
|
|
||||||
|
// True if the machine is inside a character class but it has not processed
|
||||||
|
// any "real" elements yet, i.e. it's still processing meta-characters like
|
||||||
|
// '^'.
|
||||||
|
bool inCharClassEarly = false;
|
||||||
|
|
||||||
// Location at which the current character class began.
|
// Location at which the current character class began.
|
||||||
const u8 *currentClsBegin = p;
|
const u8 *currentClsBegin = p;
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user