From bdb7a100344d5e500081d061ea42f24e532e1ba4 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 16 Nov 2015 16:43:43 +1100 Subject: [PATCH] Fix defn of POSIX graph, print, punct classes The POSIX classes [:graph:], [:print:] and [:punct:] are handled specially in UCP mode by PCRE. This change matches that behaviour. --- src/parser/ComponentClass.cpp | 7 ++++++- src/parser/ComponentClass.h | 4 +++- src/parser/Utf8ComponentClass.cpp | 24 +++++++++++++++++++++++- src/parser/Utf8ComponentClass.h | 3 +++ 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/parser/ComponentClass.cpp b/src/parser/ComponentClass.cpp index 43c05898..a91ae979 100644 --- a/src/parser/ComponentClass.cpp +++ b/src/parser/ComponentClass.cpp @@ -81,8 +81,9 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) { case CLASS_DIGIT: return number; case CLASS_GRAPH: - case CLASS_XGRAPH: return CharReach(0x21, 0x7e); + case CLASS_XGRAPH: + return to_cr(getPredefinedCodePointSet(c, mode)); case CLASS_HORZ: return CharReach("\x09\x20\xA0"); case CLASS_LOWER: @@ -93,11 +94,15 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) { } case CLASS_PRINT: return CharReach(0x20, 0x7e); + case CLASS_XPRINT: + return to_cr(getPredefinedCodePointSet(c, mode)); case CLASS_PUNCT: return CharReach(0x21, '0' - 1) | CharReach('9' + 1, 'A' - 1) | CharReach('Z' + 1, 'a' - 1) | CharReach('z' + 1, 126); + case CLASS_XPUNCT: + return to_cr(getPredefinedCodePointSet(c, mode)); case CLASS_SPACE: return CharReach("\x09\x0a\x0c\x0b\x0d\x20"); case CLASS_UPPER: diff --git a/src/parser/ComponentClass.h b/src/parser/ComponentClass.h index 1cb1a7d0..040e6d78 100644 --- a/src/parser/ComponentClass.h +++ b/src/parser/ComponentClass.h @@ -63,7 +63,9 @@ enum PredefinedClass { CLASS_VERT, CLASS_WORD, CLASS_XDIGIT, - CLASS_XGRAPH, + CLASS_XGRAPH, /* [:graph:] in UCP mode */ + CLASS_XPRINT, /* [:print:] in UCP mode */ + CLASS_XPUNCT, /* [:punct:] in UCP mode */ CLASS_UCP_C, CLASS_UCP_CC, CLASS_UCP_CF, diff --git a/src/parser/Utf8ComponentClass.cpp b/src/parser/Utf8ComponentClass.cpp index 3a6a85a4..54f9edb9 100644 --- a/src/parser/Utf8ComponentClass.cpp +++ b/src/parser/Utf8ComponentClass.cpp @@ -75,6 +75,10 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) { } else { return CLASS_UCP_LL; } + case CLASS_PRINT: + return CLASS_XPRINT; + case CLASS_PUNCT: + return CLASS_XPUNCT; case CLASS_SPACE: return CLASS_UCP_XPS; case CLASS_UPPER: @@ -90,7 +94,6 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) { } } -static CodePointSet getPredefinedCodePointSet(PredefinedClass c, const ParseMode &mode) { /* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */ @@ -117,6 +120,25 @@ CodePointSet getPredefinedCodePointSet(PredefinedClass c, rv |= cf; return rv; } + case CLASS_XPRINT: { + // Same as graph, plus everything with the Zs property. + CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode); + rv |= getUcpZs(); + return rv; + } + case CLASS_XPUNCT: { + // Everything with the P (punctuation) property, plus code points in S + // (symbols) that are < 128. + // NOTE: PCRE versions 8.37 and earlier erroneously use 256 as the + // cut-off here, so we are compatible with that for now. PCRE bug #1718 + // tracks this; once PCRE 8.38 is released we should correct this + // behaviour. + CodePointSet rv = getUcpP(); + CodePointSet symbols = getUcpS(); + symbols.unsetRange(256, MAX_UNICODE); + rv |= symbols; + return rv; + } case CLASS_HORZ: { CodePointSet rv; rv.set(0x0009); /* Horizontal tab */ diff --git a/src/parser/Utf8ComponentClass.h b/src/parser/Utf8ComponentClass.h index b2c402f9..3d21a278 100644 --- a/src/parser/Utf8ComponentClass.h +++ b/src/parser/Utf8ComponentClass.h @@ -110,6 +110,9 @@ private: PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode); bool isUcp(PredefinedClass c); +CodePointSet getPredefinedCodePointSet(PredefinedClass c, + const ParseMode &mode); + } // namespace #endif // UTF8_COMPONENT_CLASS_H