Fix definition of POSIX graph, print and punct classes

The POSIX classes [:graph:], [:print:] and [:punct:] are handled specially in UCP mode by PCRE. This change matches that behaviour.
2025-06-28 16:41:01 +03:00 · 2015-11-16 16:43:43 +11:00 · 2015-11-16 16:43:43 +11:00 · bdb7a10034
commit bdb7a10034
parent 313822c157
4 changed files with 35 additions and 3 deletions
--- a/src/parser/ComponentClass.cpp
+++ b/src/parser/ComponentClass.cpp
@ -81,8 +81,9 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
    case CLASS_DIGIT:
        return number;
    case CLASS_GRAPH:
-    case CLASS_XGRAPH:
        return CharReach(0x21, 0x7e);
+    case CLASS_XGRAPH:
+        return to_cr(getPredefinedCodePointSet(c, mode));
    case CLASS_HORZ:
        return CharReach("\x09\x20\xA0");
    case CLASS_LOWER:
@ -93,11 +94,15 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
        }
    case CLASS_PRINT:
        return CharReach(0x20, 0x7e);
+    case CLASS_XPRINT:
+        return to_cr(getPredefinedCodePointSet(c, mode));
    case CLASS_PUNCT:
        return CharReach(0x21, '0' - 1)
            | CharReach('9' + 1, 'A' - 1)
            | CharReach('Z' + 1, 'a' - 1)
            | CharReach('z' + 1, 126);
+    case CLASS_XPUNCT:
+        return to_cr(getPredefinedCodePointSet(c, mode));
    case CLASS_SPACE:
        return CharReach("\x09\x0a\x0c\x0b\x0d\x20");
    case CLASS_UPPER:
--- a/src/parser/ComponentClass.h
+++ b/src/parser/ComponentClass.h
@ -63,7 +63,9 @@ enum PredefinedClass {
    CLASS_VERT,
    CLASS_WORD,
    CLASS_XDIGIT,
-    CLASS_XGRAPH,
+    CLASS_XGRAPH, /* [:graph:] in UCP mode */
+    CLASS_XPRINT, /* [:print:] in UCP mode */
+    CLASS_XPUNCT, /* [:punct:] in UCP mode */
    CLASS_UCP_C,
    CLASS_UCP_CC,
    CLASS_UCP_CF,
--- a/src/parser/Utf8ComponentClass.cpp
+++ b/src/parser/Utf8ComponentClass.cpp
@ -75,6 +75,10 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
        } else {
            return CLASS_UCP_LL;
        }
+    case CLASS_PRINT:
+        return CLASS_XPRINT;
+    case CLASS_PUNCT:
+        return CLASS_XPUNCT;
    case CLASS_SPACE:
        return CLASS_UCP_XPS;
    case CLASS_UPPER:
@ -90,7 +94,6 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
    }
 }

-static
 CodePointSet getPredefinedCodePointSet(PredefinedClass c,
                                       const ParseMode &mode) {
    /* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */
@ -117,6 +120,25 @@ CodePointSet getPredefinedCodePointSet(PredefinedClass c,
        rv |= cf;
        return rv;
    }
+    case CLASS_XPRINT: {
+        // Same as graph, plus everything with the Zs property.
+        CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode);
+        rv |= getUcpZs();
+        return rv;
+    }
+    case CLASS_XPUNCT: {
+        // Everything with the P (punctuation) property, plus code points in S
+        // (symbols) that are < 128.
+        // NOTE: PCRE versions 8.37 and earlier erroneously use 256 as the
+        // cut-off here, so we are compatible with that for now. PCRE bug #1718
+        // tracks this; once PCRE 8.38 is released we should correct this
+        // behaviour.
+        CodePointSet rv = getUcpP();
+        CodePointSet symbols = getUcpS();
+        symbols.unsetRange(256, MAX_UNICODE);
+        rv |= symbols;
+        return rv;
+    }
    case CLASS_HORZ: {
        CodePointSet rv;
        rv.set(0x0009); /* Horizontal tab */
--- a/src/parser/Utf8ComponentClass.h
+++ b/src/parser/Utf8ComponentClass.h
@ -110,6 +110,9 @@ private:
 PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
 bool isUcp(PredefinedClass c);

+CodePointSet getPredefinedCodePointSet(PredefinedClass c,
+                                       const ParseMode &mode);
+
 } // namespace

 #endif // UTF8_COMPONENT_CLASS_H