mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
Fix definition of POSIX graph, print and punct classes
The POSIX classes [:graph:], [:print:] and [:punct:] are handled specially in UCP mode by PCRE. This change matches that behaviour.
This commit is contained in:
parent
313822c157
commit
bdb7a10034
@ -81,8 +81,9 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
|
||||
case CLASS_DIGIT:
|
||||
return number;
|
||||
case CLASS_GRAPH:
|
||||
case CLASS_XGRAPH:
|
||||
return CharReach(0x21, 0x7e);
|
||||
case CLASS_XGRAPH:
|
||||
return to_cr(getPredefinedCodePointSet(c, mode));
|
||||
case CLASS_HORZ:
|
||||
return CharReach("\x09\x20\xA0");
|
||||
case CLASS_LOWER:
|
||||
@ -93,11 +94,15 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
|
||||
}
|
||||
case CLASS_PRINT:
|
||||
return CharReach(0x20, 0x7e);
|
||||
case CLASS_XPRINT:
|
||||
return to_cr(getPredefinedCodePointSet(c, mode));
|
||||
case CLASS_PUNCT:
|
||||
return CharReach(0x21, '0' - 1)
|
||||
| CharReach('9' + 1, 'A' - 1)
|
||||
| CharReach('Z' + 1, 'a' - 1)
|
||||
| CharReach('z' + 1, 126);
|
||||
case CLASS_XPUNCT:
|
||||
return to_cr(getPredefinedCodePointSet(c, mode));
|
||||
case CLASS_SPACE:
|
||||
return CharReach("\x09\x0a\x0c\x0b\x0d\x20");
|
||||
case CLASS_UPPER:
|
||||
|
@ -63,7 +63,9 @@ enum PredefinedClass {
|
||||
CLASS_VERT,
|
||||
CLASS_WORD,
|
||||
CLASS_XDIGIT,
|
||||
CLASS_XGRAPH,
|
||||
CLASS_XGRAPH, /* [:graph:] in UCP mode */
|
||||
CLASS_XPRINT, /* [:print:] in UCP mode */
|
||||
CLASS_XPUNCT, /* [:punct:] in UCP mode */
|
||||
CLASS_UCP_C,
|
||||
CLASS_UCP_CC,
|
||||
CLASS_UCP_CF,
|
||||
|
@ -75,6 +75,10 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
|
||||
} else {
|
||||
return CLASS_UCP_LL;
|
||||
}
|
||||
case CLASS_PRINT:
|
||||
return CLASS_XPRINT;
|
||||
case CLASS_PUNCT:
|
||||
return CLASS_XPUNCT;
|
||||
case CLASS_SPACE:
|
||||
return CLASS_UCP_XPS;
|
||||
case CLASS_UPPER:
|
||||
@ -90,7 +94,6 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
CodePointSet getPredefinedCodePointSet(PredefinedClass c,
|
||||
const ParseMode &mode) {
|
||||
/* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */
|
||||
@ -117,6 +120,25 @@ CodePointSet getPredefinedCodePointSet(PredefinedClass c,
|
||||
rv |= cf;
|
||||
return rv;
|
||||
}
|
||||
case CLASS_XPRINT: {
|
||||
// Same as graph, plus everything with the Zs property.
|
||||
CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode);
|
||||
rv |= getUcpZs();
|
||||
return rv;
|
||||
}
|
||||
case CLASS_XPUNCT: {
|
||||
// Everything with the P (punctuation) property, plus code points in S
|
||||
// (symbols) that are < 128.
|
||||
// NOTE: PCRE versions 8.37 and earlier erroneously use 256 as the
|
||||
// cut-off here, so we are compatible with that for now. PCRE bug #1718
|
||||
// tracks this; once PCRE 8.38 is released we should correct this
|
||||
// behaviour.
|
||||
CodePointSet rv = getUcpP();
|
||||
CodePointSet symbols = getUcpS();
|
||||
symbols.unsetRange(256, MAX_UNICODE);
|
||||
rv |= symbols;
|
||||
return rv;
|
||||
}
|
||||
case CLASS_HORZ: {
|
||||
CodePointSet rv;
|
||||
rv.set(0x0009); /* Horizontal tab */
|
||||
|
@ -110,6 +110,9 @@ private:
|
||||
PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
|
||||
bool isUcp(PredefinedClass c);
|
||||
|
||||
CodePointSet getPredefinedCodePointSet(PredefinedClass c,
|
||||
const ParseMode &mode);
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // UTF8_COMPONENT_CLASS_H
|
||||
|
Loading…
x
Reference in New Issue
Block a user