Rework parser rejection for POSIX collating elems

Implement rejection of POSIX collating elements ("[.ch.]" and "[=ch=]")
entirely in the Ragel parser, using the same approach both inside and
outside character classes.

Fix buggy rejection of [^.ch.], which we should accept as a character
class.
This commit is contained in:
Justin Viiret 2015-11-09 10:37:20 +11:00 committed by Matthew Barr
parent d9efe07125
commit 9a7b912a5d
2 changed files with 17 additions and 21 deletions

View File

@ -790,10 +790,12 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
any => { throw LocatedParseError("Unknown property"); }; any => { throw LocatedParseError("Unknown property"); };
*|; *|;
charClassGuts := |* charClassGuts := |*
# We don't like POSIX collating elements (neither does PCRE or Perl). # We don't support POSIX collating elements (neither does PCRE
'\[\.' [^\]]* '\.\]' | # or Perl). These look like [.ch.] or [=ch=].
'\[=' [^\]]* '=\]' => { '\[\.' ( '\\]' | [^\]] )* '\.\]' |
throw LocatedParseError("Unsupported POSIX collating element"); '\[=' ( '\\]' | [^\]] )* '=\]' => {
throw LocatedParseError("Unsupported POSIX collating "
"element");
}; };
# Named sets # Named sets
# Adding these may cause the charclass to close, hence the # Adding these may cause the charclass to close, hence the
@ -1090,23 +1092,6 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
throwInvalidUtf8(); throwInvalidUtf8();
}; };
# dot or equals at the end of a character class could be the end
# of a collating element, like [.blah.] or [=blah=].
[.=] ']' => {
if (currentCls->getFirstChar() == *ts) {
assert(currentClsBegin);
ostringstream oss;
oss << "Unsupported POSIX collating element at index "
<< currentClsBegin - ptr << ".";
throw ParseError(oss.str());
}
currentCls->add(*ts);
currentCls->finalize();
currentSeq->addComponent(move(currentCls));
inCharClass = false;
fgoto main;
};
# Literal character # Literal character
(any - ']') => { (any - ']') => {
if (currentCls->class_empty()) { if (currentCls->class_empty()) {
@ -1232,6 +1217,13 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
throw LocatedParseError("POSIX named classes are only " throw LocatedParseError("POSIX named classes are only "
"supported inside a class"); "supported inside a class");
}; };
# We don't support POSIX collating elements (neither does PCRE
# or Perl). These look like [.ch.] or [=ch=].
'\[\.' ( '\\]' | [^\]] )* '\.\]' |
'\[=' ( '\\]' | [^\]] )* '=\]' => {
throw LocatedParseError("Unsupported POSIX collating "
"element");
};
# Begin eating characters for class # Begin eating characters for class
'\[' => eatClass; '\[' => eatClass;
# Begin quoted literal # Begin quoted literal

View File

@ -128,3 +128,7 @@
128:/(*UTF8)^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/ñññññññññññññññññññññññññññ0}l.{1,60}Car*k|npanomnax+8Wnah/ #Expression is not valid UTF-8. 128:/(*UTF8)^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/ñññññññññññññññññññññññññññ0}l.{1,60}Car*k|npanomnax+8Wnah/ #Expression is not valid UTF-8.
129:/bignum \1111111111111111111/ #Number is too big at index 7. 129:/bignum \1111111111111111111/ #Number is too big at index 7.
130:/foo|&{5555555,}/ #Bounded repeat is too large. 130:/foo|&{5555555,}/ #Bounded repeat is too large.
131:/[a[..]]/ #Unsupported POSIX collating element at index 2.
132:/[a[==]]/ #Unsupported POSIX collating element at index 2.
133:/[a[.\].]]/ #Unsupported POSIX collating element at index 2.
134:/[a[=\]=]]/ #Unsupported POSIX collating element at index 2.