Unify handling of caseless flag in class parser

Apply caselessness to each element as it is added to a class, rather than
all at once at finalize time (which required separate ucp and non-ucp
working data).

Unifies the behaviour of AsciiComponentClass and UTF8ComponentClass in
this respect.
This commit is contained in:
Justin Viiret 2015-11-17 17:23:52 +11:00 committed by Matthew Barr
parent bdb7a10034
commit 25a01e1c3c
4 changed files with 27 additions and 62 deletions

View File

@ -61,11 +61,15 @@ void AsciiComponentClass::createRange(unichar to) {
unsigned char from = (u8)range_start; unsigned char from = (u8)range_start;
if (from > to) { if (from > to) {
throw LocatedParseError("Range out of order in character class"); throw LocatedParseError("Range out of order in character class");
} else {
in_cand_range = false;
cr.setRange(from, to);
range_start = INVALID_UNICODE;
} }
in_cand_range = false;
CharReach ncr(from, to);
if (mode.caseless) {
make_caseless(&ncr);
}
cr |= ncr;
range_start = INVALID_UNICODE;
} }
void AsciiComponentClass::notePositions(GlushkovBuildState &bs) { void AsciiComponentClass::notePositions(GlushkovBuildState &bs) {
@ -95,16 +99,13 @@ void AsciiComponentClass::add(PredefinedClass c, bool negative) {
c = translateForUcpMode(c, mode); c = translateForUcpMode(c, mode);
} }
// Note: caselessness is handled by getPredefinedCharReach.
CharReach pcr = getPredefinedCharReach(c, mode); CharReach pcr = getPredefinedCharReach(c, mode);
if (negative) { if (negative) {
pcr.flip(); pcr.flip();
} }
if (isUcp(c)) {
cr_ucp |= pcr;
} else {
cr |= pcr; cr |= pcr;
}
range_start = INVALID_UNICODE; range_start = INVALID_UNICODE;
in_cand_range = false; in_cand_range = false;
} }
@ -120,7 +121,12 @@ void AsciiComponentClass::add(unichar c) {
return; return;
} }
cr.set(c); CharReach ncr(c, c);
if (mode.caseless) {
make_caseless(&ncr);
}
cr |= ncr;
range_start = c; range_start = c;
} }
@ -136,12 +142,6 @@ void AsciiComponentClass::finalize() {
in_cand_range = false; in_cand_range = false;
} }
if (mode.caseless) {
make_caseless(&cr);
}
cr |= cr_ucp; /* characters from ucp props don't participate in caseless */
if (m_negate) { if (m_negate) {
cr.flip(); cr.flip();
} }

View File

@ -78,12 +78,10 @@ protected:
private: private:
Position position; Position position;
CharReach cr; CharReach cr;
CharReach cr_ucp;
// Private copy ctor. Use clone instead. // Private copy ctor. Use clone instead.
AsciiComponentClass(const AsciiComponentClass &other) AsciiComponentClass(const AsciiComponentClass &other)
: ComponentClass(other), position(other.position), cr(other.cr), : ComponentClass(other), position(other.position), cr(other.cr) {}
cr_ucp(other.cr_ucp) {}
}; };
} // namespace ue2 } // namespace ue2

View File

@ -515,7 +515,8 @@ void UTF8ComponentClass::createRange(unichar to) {
unichar from = range_start; unichar from = range_start;
if (from > to) { if (from > to) {
throw LocatedParseError("Range out of order in character class"); throw LocatedParseError("Range out of order in character class");
} else { }
in_cand_range = false; in_cand_range = false;
CodePointSet ncps; CodePointSet ncps;
ncps.setRange(from, to); ncps.setRange(from, to);
@ -524,7 +525,6 @@ void UTF8ComponentClass::createRange(unichar to) {
} }
cps |= ncps; cps |= ncps;
range_start = INVALID_UNICODE; range_start = INVALID_UNICODE;
}
} }
void UTF8ComponentClass::add(PredefinedClass c, bool negative) { void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
@ -543,11 +543,7 @@ void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
pcps.flip(); pcps.flip();
} }
if (isUcp(c)) {
cps_ucp |= pcps;
} else {
cps |= pcps; cps |= pcps;
}
range_start = INVALID_UNICODE; range_start = INVALID_UNICODE;
in_cand_range = false; in_cand_range = false;
@ -585,8 +581,6 @@ void UTF8ComponentClass::finalize() {
in_cand_range = false; in_cand_range = false;
} }
cps |= cps_ucp; /* characters from ucp props always case sensitive */
if (m_negate) { if (m_negate) {
cps.flip(); cps.flip();
} }
@ -594,31 +588,6 @@ void UTF8ComponentClass::finalize() {
finalized = true; finalized = true;
} }
bool isUcp(PredefinedClass c) {
switch (c) {
case CLASS_ALNUM:
case CLASS_ALPHA:
case CLASS_ANY:
case CLASS_ASCII:
case CLASS_BLANK:
case CLASS_CNTRL:
case CLASS_DIGIT:
case CLASS_GRAPH:
case CLASS_HORZ:
case CLASS_LOWER:
case CLASS_PRINT:
case CLASS_PUNCT:
case CLASS_SPACE:
case CLASS_UPPER:
case CLASS_VERT:
case CLASS_WORD:
case CLASS_XDIGIT:
return false;
default:
return true;
}
}
Position UTF8ComponentClass::getHead(NFABuilder &builder, u8 first_byte) { Position UTF8ComponentClass::getHead(NFABuilder &builder, u8 first_byte) {
map<u8, Position>::const_iterator it = heads.find(first_byte); map<u8, Position>::const_iterator it = heads.find(first_byte);
if (it != heads.end()) { if (it != heads.end()) {

View File

@ -93,7 +93,6 @@ private:
void buildFourByte(GlushkovBuildState &bs); void buildFourByte(GlushkovBuildState &bs);
CodePointSet cps; CodePointSet cps;
CodePointSet cps_ucp;
std::map<u8, Position> heads; std::map<u8, Position> heads;
Position single_pos; Position single_pos;
@ -108,7 +107,6 @@ private:
}; };
PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode); PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
bool isUcp(PredefinedClass c);
CodePointSet getPredefinedCodePointSet(PredefinedClass c, CodePointSet getPredefinedCodePointSet(PredefinedClass c,
const ParseMode &mode); const ParseMode &mode);