Rework parser rejection for POSIX collating elems

Implement rejection of POSIX collating elements ("[.ch.]" and "[=ch=]") entirely in the Ragel parser, using the same approach both inside and outside character classes. Fix buggy rejection of [^.ch.], which we should accept as a character class.
2026-01-17 16:00:26 +03:00 · 2015-11-09 10:37:20 +11:00
parent d9efe07125
commit 9a7b912a5d
2 changed files with 17 additions and 21 deletions
--- a/src/parser/Parser.rl
+++ b/src/parser/Parser.rl
@@ -790,10 +790,12 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
        any => { throw LocatedParseError("Unknown property"); };
                     *|;
    charClassGuts := |*
-              # We don't like POSIX collating elements (neither does PCRE or Perl).
+              # We don't support POSIX collating elements (neither does PCRE
-              '\[\.' [^\]]* '\.\]' | 
+              # or Perl). These look like [.ch.] or [=ch=].
-              '\[=' [^\]]* '=\]' => {
+              '\[\.' ( '\\]' | [^\]] )* '\.\]' |
-                  throw LocatedParseError("Unsupported POSIX collating element");
+              '\[=' ( '\\]' | [^\]] )* '=\]' => {
                  throw LocatedParseError("Unsupported POSIX collating "
                                          "element");
              };
              # Named sets
              # Adding these may cause the charclass to close, hence the
@@ -1090,23 +1092,6 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
                  throwInvalidUtf8();
              };
              # dot or equals at the end of a character class could be the end
              # of a collating element, like [.blah.] or [=blah=].
              [.=] ']' => {
                  if (currentCls->getFirstChar() == *ts) {
                      assert(currentClsBegin);
                      ostringstream oss;
                      oss << "Unsupported POSIX collating element at index "
                          << currentClsBegin - ptr << ".";
                      throw ParseError(oss.str());
                  }
                  currentCls->add(*ts);
                  currentCls->finalize();
                  currentSeq->addComponent(move(currentCls));
                  inCharClass = false;
                  fgoto main;
              };
              # Literal character
              (any - ']') => {
                  if (currentCls->class_empty()) {
@@ -1232,6 +1217,13 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
                  throw LocatedParseError("POSIX named classes are only "
                                          "supported inside a class");
              };
              # We don't support POSIX collating elements (neither does PCRE
              # or Perl). These look like [.ch.] or [=ch=].
              '\[\.' ( '\\]' | [^\]] )* '\.\]' |
              '\[=' ( '\\]' | [^\]] )* '=\]' => {
                  throw LocatedParseError("Unsupported POSIX collating "
                                          "element");
              };
              # Begin eating characters for class
              '\[' => eatClass;
              # Begin quoted literal
--- a/unit/hyperscan/bad_patterns.txt
+++ b/unit/hyperscan/bad_patterns.txt
@@ -128,3 +128,7 @@
 128:/(*UTF8)^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>0}l.{1,60}Car*k|npanomnax+8Wnah/ #Expression is not valid UTF-8.
 129:/bignum \1111111111111111111/ #Number is too big at index 7.
 130:/foo|&{5555555,}/ #Bounded repeat is too large.
 131:/[a[..]]/ #Unsupported POSIX collating element at index 2.
 132:/[a[==]]/ #Unsupported POSIX collating element at index 2.
 133:/[a[.\].]]/ #Unsupported POSIX collating element at index 2.
 134:/[a[=\]=]]/ #Unsupported POSIX collating element at index 2.