mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
Unify handling of caseless flag in class parser
Apply caselessness to each element as it is added to a class, rather than to the whole class at finalize time (which required separate ucp and non-ucp working data). This unifies the behaviour of AsciiComponentClass and UTF8ComponentClass in this respect.
This commit is contained in:
parent
bdb7a10034
commit
25a01e1c3c
@ -61,11 +61,15 @@ void AsciiComponentClass::createRange(unichar to) {
|
||||
unsigned char from = (u8)range_start;
|
||||
if (from > to) {
|
||||
throw LocatedParseError("Range out of order in character class");
|
||||
} else {
|
||||
in_cand_range = false;
|
||||
cr.setRange(from, to);
|
||||
range_start = INVALID_UNICODE;
|
||||
}
|
||||
|
||||
in_cand_range = false;
|
||||
CharReach ncr(from, to);
|
||||
if (mode.caseless) {
|
||||
make_caseless(&ncr);
|
||||
}
|
||||
cr |= ncr;
|
||||
range_start = INVALID_UNICODE;
|
||||
}
|
||||
|
||||
void AsciiComponentClass::notePositions(GlushkovBuildState &bs) {
|
||||
@ -95,16 +99,13 @@ void AsciiComponentClass::add(PredefinedClass c, bool negative) {
|
||||
c = translateForUcpMode(c, mode);
|
||||
}
|
||||
|
||||
// Note: caselessness is handled by getPredefinedCharReach.
|
||||
CharReach pcr = getPredefinedCharReach(c, mode);
|
||||
if (negative) {
|
||||
pcr.flip();
|
||||
}
|
||||
|
||||
if (isUcp(c)) {
|
||||
cr_ucp |= pcr;
|
||||
} else {
|
||||
cr |= pcr;
|
||||
}
|
||||
cr |= pcr;
|
||||
range_start = INVALID_UNICODE;
|
||||
in_cand_range = false;
|
||||
}
|
||||
@ -120,7 +121,12 @@ void AsciiComponentClass::add(unichar c) {
|
||||
return;
|
||||
}
|
||||
|
||||
cr.set(c);
|
||||
CharReach ncr(c, c);
|
||||
if (mode.caseless) {
|
||||
make_caseless(&ncr);
|
||||
}
|
||||
|
||||
cr |= ncr;
|
||||
range_start = c;
|
||||
}
|
||||
|
||||
@ -136,12 +142,6 @@ void AsciiComponentClass::finalize() {
|
||||
in_cand_range = false;
|
||||
}
|
||||
|
||||
if (mode.caseless) {
|
||||
make_caseless(&cr);
|
||||
}
|
||||
|
||||
cr |= cr_ucp; /* characters from ucp props don't participate in caseless */
|
||||
|
||||
if (m_negate) {
|
||||
cr.flip();
|
||||
}
|
||||
|
@ -78,12 +78,10 @@ protected:
|
||||
private:
|
||||
Position position;
|
||||
CharReach cr;
|
||||
CharReach cr_ucp;
|
||||
|
||||
// Private copy ctor. Use clone instead.
|
||||
AsciiComponentClass(const AsciiComponentClass &other)
|
||||
: ComponentClass(other), position(other.position), cr(other.cr),
|
||||
cr_ucp(other.cr_ucp) {}
|
||||
: ComponentClass(other), position(other.position), cr(other.cr) {}
|
||||
};
|
||||
|
||||
} // namespace ue2
|
||||
|
@ -515,16 +515,16 @@ void UTF8ComponentClass::createRange(unichar to) {
|
||||
unichar from = range_start;
|
||||
if (from > to) {
|
||||
throw LocatedParseError("Range out of order in character class");
|
||||
} else {
|
||||
in_cand_range = false;
|
||||
CodePointSet ncps;
|
||||
ncps.setRange(from, to);
|
||||
if (mode.caseless) {
|
||||
make_caseless(&ncps);
|
||||
}
|
||||
cps |= ncps;
|
||||
range_start = INVALID_UNICODE;
|
||||
}
|
||||
|
||||
in_cand_range = false;
|
||||
CodePointSet ncps;
|
||||
ncps.setRange(from, to);
|
||||
if (mode.caseless) {
|
||||
make_caseless(&ncps);
|
||||
}
|
||||
cps |= ncps;
|
||||
range_start = INVALID_UNICODE;
|
||||
}
|
||||
|
||||
void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
|
||||
@ -543,11 +543,7 @@ void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
|
||||
pcps.flip();
|
||||
}
|
||||
|
||||
if (isUcp(c)) {
|
||||
cps_ucp |= pcps;
|
||||
} else {
|
||||
cps |= pcps;
|
||||
}
|
||||
cps |= pcps;
|
||||
|
||||
range_start = INVALID_UNICODE;
|
||||
in_cand_range = false;
|
||||
@ -585,8 +581,6 @@ void UTF8ComponentClass::finalize() {
|
||||
in_cand_range = false;
|
||||
}
|
||||
|
||||
cps |= cps_ucp; /* characters from ucp props always case sensitive */
|
||||
|
||||
if (m_negate) {
|
||||
cps.flip();
|
||||
}
|
||||
@ -594,31 +588,6 @@ void UTF8ComponentClass::finalize() {
|
||||
finalized = true;
|
||||
}
|
||||
|
||||
bool isUcp(PredefinedClass c) {
|
||||
switch (c) {
|
||||
case CLASS_ALNUM:
|
||||
case CLASS_ALPHA:
|
||||
case CLASS_ANY:
|
||||
case CLASS_ASCII:
|
||||
case CLASS_BLANK:
|
||||
case CLASS_CNTRL:
|
||||
case CLASS_DIGIT:
|
||||
case CLASS_GRAPH:
|
||||
case CLASS_HORZ:
|
||||
case CLASS_LOWER:
|
||||
case CLASS_PRINT:
|
||||
case CLASS_PUNCT:
|
||||
case CLASS_SPACE:
|
||||
case CLASS_UPPER:
|
||||
case CLASS_VERT:
|
||||
case CLASS_WORD:
|
||||
case CLASS_XDIGIT:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
Position UTF8ComponentClass::getHead(NFABuilder &builder, u8 first_byte) {
|
||||
map<u8, Position>::const_iterator it = heads.find(first_byte);
|
||||
if (it != heads.end()) {
|
||||
|
@ -93,7 +93,6 @@ private:
|
||||
void buildFourByte(GlushkovBuildState &bs);
|
||||
|
||||
CodePointSet cps;
|
||||
CodePointSet cps_ucp;
|
||||
|
||||
std::map<u8, Position> heads;
|
||||
Position single_pos;
|
||||
@ -108,7 +107,6 @@ private:
|
||||
};
|
||||
|
||||
PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
|
||||
bool isUcp(PredefinedClass c);
|
||||
|
||||
CodePointSet getPredefinedCodePointSet(PredefinedClass c,
|
||||
const ParseMode &mode);
|
||||
|
Loading…
x
Reference in New Issue
Block a user