Fix definition of POSIX graph, print, punct classes

PCRE handles the POSIX classes [:graph:], [:print:] and [:punct:]
specially in UCP mode. This change makes these classes match PCRE's behaviour in that mode.
This commit is contained in:
Justin Viiret 2015-11-16 16:43:43 +11:00 committed by Matthew Barr
parent 313822c157
commit bdb7a10034
4 changed files with 35 additions and 3 deletions

View File

@ -81,8 +81,9 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
case CLASS_DIGIT:
return number;
case CLASS_GRAPH:
case CLASS_XGRAPH:
return CharReach(0x21, 0x7e);
case CLASS_XGRAPH:
return to_cr(getPredefinedCodePointSet(c, mode));
case CLASS_HORZ:
return CharReach("\x09\x20\xA0");
case CLASS_LOWER:
@ -93,11 +94,15 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
}
case CLASS_PRINT:
return CharReach(0x20, 0x7e);
case CLASS_XPRINT:
return to_cr(getPredefinedCodePointSet(c, mode));
case CLASS_PUNCT:
return CharReach(0x21, '0' - 1)
| CharReach('9' + 1, 'A' - 1)
| CharReach('Z' + 1, 'a' - 1)
| CharReach('z' + 1, 126);
case CLASS_XPUNCT:
return to_cr(getPredefinedCodePointSet(c, mode));
case CLASS_SPACE:
return CharReach("\x09\x0a\x0c\x0b\x0d\x20");
case CLASS_UPPER:

View File

@ -63,7 +63,9 @@ enum PredefinedClass {
CLASS_VERT,
CLASS_WORD,
CLASS_XDIGIT,
CLASS_XGRAPH,
CLASS_XGRAPH, /* [:graph:] in UCP mode */
CLASS_XPRINT, /* [:print:] in UCP mode */
CLASS_XPUNCT, /* [:punct:] in UCP mode */
CLASS_UCP_C,
CLASS_UCP_CC,
CLASS_UCP_CF,

View File

@ -75,6 +75,10 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
} else {
return CLASS_UCP_LL;
}
case CLASS_PRINT:
return CLASS_XPRINT;
case CLASS_PUNCT:
return CLASS_XPUNCT;
case CLASS_SPACE:
return CLASS_UCP_XPS;
case CLASS_UPPER:
@ -90,7 +94,6 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
}
}
static
CodePointSet getPredefinedCodePointSet(PredefinedClass c,
const ParseMode &mode) {
/* TODO: properly support both PCRE_UCP and non-PCRE_UCP modes */
@ -117,6 +120,25 @@ CodePointSet getPredefinedCodePointSet(PredefinedClass c,
rv |= cf;
return rv;
}
case CLASS_XPRINT: {
// Same as graph, plus everything with the Zs property.
CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode);
rv |= getUcpZs();
return rv;
}
case CLASS_XPUNCT: {
// Everything with the P (punctuation) property, plus code points in S
// (symbols) that are < 128.
// NOTE: PCRE versions 8.37 and earlier erroneously use 256 as the
// cut-off here, so we are compatible with that for now. PCRE bug #1718
// tracks this; once PCRE 8.38 is released we should correct this
// behaviour.
CodePointSet rv = getUcpP();
CodePointSet symbols = getUcpS();
symbols.unsetRange(256, MAX_UNICODE);
rv |= symbols;
return rv;
}
case CLASS_HORZ: {
CodePointSet rv;
rv.set(0x0009); /* Horizontal tab */

View File

@ -110,6 +110,9 @@ private:
PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
bool isUcp(PredefinedClass c);
CodePointSet getPredefinedCodePointSet(PredefinedClass c,
const ParseMode &mode);
} // namespace
#endif // UTF8_COMPONENT_CLASS_H