mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
Rework parser rejection for POSIX collating elems
Implement rejection of POSIX collating elements ("[.ch.]" and "[=ch=]") entirely in the Ragel parser, using the same approach both inside and outside character classes. Fix buggy rejection of [^.ch.], which we should accept as a character class.
This commit is contained in:
parent
d9efe07125
commit
9a7b912a5d
@ -790,10 +790,12 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
any => { throw LocatedParseError("Unknown property"); };
|
any => { throw LocatedParseError("Unknown property"); };
|
||||||
*|;
|
*|;
|
||||||
charClassGuts := |*
|
charClassGuts := |*
|
||||||
# We don't like POSIX collating elements (neither does PCRE or Perl).
|
# We don't support POSIX collating elements (neither does PCRE
|
||||||
'\[\.' [^\]]* '\.\]' |
|
# or Perl). These look like [.ch.] or [=ch=].
|
||||||
'\[=' [^\]]* '=\]' => {
|
'\[\.' ( '\\]' | [^\]] )* '\.\]' |
|
||||||
throw LocatedParseError("Unsupported POSIX collating element");
|
'\[=' ( '\\]' | [^\]] )* '=\]' => {
|
||||||
|
throw LocatedParseError("Unsupported POSIX collating "
|
||||||
|
"element");
|
||||||
};
|
};
|
||||||
# Named sets
|
# Named sets
|
||||||
# Adding these may cause the charclass to close, hence the
|
# Adding these may cause the charclass to close, hence the
|
||||||
@ -1090,23 +1092,6 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
throwInvalidUtf8();
|
throwInvalidUtf8();
|
||||||
};
|
};
|
||||||
|
|
||||||
# dot or equals at the end of a character class could be the end
|
|
||||||
# of a collating element, like [.blah.] or [=blah=].
|
|
||||||
[.=] ']' => {
|
|
||||||
if (currentCls->getFirstChar() == *ts) {
|
|
||||||
assert(currentClsBegin);
|
|
||||||
ostringstream oss;
|
|
||||||
oss << "Unsupported POSIX collating element at index "
|
|
||||||
<< currentClsBegin - ptr << ".";
|
|
||||||
throw ParseError(oss.str());
|
|
||||||
}
|
|
||||||
currentCls->add(*ts);
|
|
||||||
currentCls->finalize();
|
|
||||||
currentSeq->addComponent(move(currentCls));
|
|
||||||
inCharClass = false;
|
|
||||||
fgoto main;
|
|
||||||
};
|
|
||||||
|
|
||||||
# Literal character
|
# Literal character
|
||||||
(any - ']') => {
|
(any - ']') => {
|
||||||
if (currentCls->class_empty()) {
|
if (currentCls->class_empty()) {
|
||||||
@ -1232,6 +1217,13 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
|
|||||||
throw LocatedParseError("POSIX named classes are only "
|
throw LocatedParseError("POSIX named classes are only "
|
||||||
"supported inside a class");
|
"supported inside a class");
|
||||||
};
|
};
|
||||||
|
# We don't support POSIX collating elements (neither does PCRE
|
||||||
|
# or Perl). These look like [.ch.] or [=ch=].
|
||||||
|
'\[\.' ( '\\]' | [^\]] )* '\.\]' |
|
||||||
|
'\[=' ( '\\]' | [^\]] )* '=\]' => {
|
||||||
|
throw LocatedParseError("Unsupported POSIX collating "
|
||||||
|
"element");
|
||||||
|
};
|
||||||
# Begin eating characters for class
|
# Begin eating characters for class
|
||||||
'\[' => eatClass;
|
'\[' => eatClass;
|
||||||
# Begin quoted literal
|
# Begin quoted literal
|
||||||
|
@ -128,3 +128,7 @@
|
|||||||
128:/(*UTF8)^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/ñññññññññññññññññññññññññññ0}l.{1,60}Car*k|npanomnax+8Wnah/ #Expression is not valid UTF-8.
|
128:/(*UTF8)^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/ñññññññññññññññññññññññññññ0}l.{1,60}Car*k|npanomnax+8Wnah/ #Expression is not valid UTF-8.
|
||||||
129:/bignum \1111111111111111111/ #Number is too big at index 7.
|
129:/bignum \1111111111111111111/ #Number is too big at index 7.
|
||||||
130:/foo|&{5555555,}/ #Bounded repeat is too large.
|
130:/foo|&{5555555,}/ #Bounded repeat is too large.
|
||||||
|
131:/[a[..]]/ #Unsupported POSIX collating element at index 2.
|
||||||
|
132:/[a[==]]/ #Unsupported POSIX collating element at index 2.
|
||||||
|
133:/[a[.\].]]/ #Unsupported POSIX collating element at index 2.
|
||||||
|
134:/[a[=\]=]]/ #Unsupported POSIX collating element at index 2.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user