/*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Character class in UTF-8 mode.
 */

#include "Utf8ComponentClass.h"

#include "buildstate.h"
#include "Parser.h"
#include "parse_error.h"
#include "position.h"
#include "position_info.h"
#include "nfagraph/ng_builder.h"
#include "util/compare.h"
#include "util/unicode_def.h"

#include <cassert>

#include "ucp_table.h"

using namespace std;

namespace ue2 {

/**
 * \brief Map a POSIX-style predefined class onto the class used when UCP mode
 * is enabled.
 *
 * Classes without a UCP counterpart are returned unchanged. For the
 * lower/upper classes the result depends on \a mode.caseless.
 */
PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
    /* Note: the mapping used here for mapping posix character classes
     * matches the observed behaviour of PCRE (lower and upper going to \p{L}
     * is not documented by pcre).
     *
     * Note: this mapping is quite different from both of the mappings
     * recommended in the unicode regex tech report (TR-18) appendix C
     */
    switch (in) {
    case CLASS_ALNUM:
        return CLASS_UCP_XAN;
    case CLASS_ALPHA:
        return CLASS_UCP_L;
    case CLASS_BLANK:
        return CLASS_HORZ;
    case CLASS_DIGIT:
        return CLASS_UCP_ND;
    case CLASS_GRAPH:
        return CLASS_XGRAPH;
    case CLASS_LOWER:
        if (mode.caseless) { /* we also pick up uppercase titlecase and others */
            return CLASS_UCP_L;
        } else {
            return CLASS_UCP_LL;
        }
    case CLASS_PRINT:
        return CLASS_XPRINT;
    case CLASS_PUNCT:
        return CLASS_XPUNCT;
    case CLASS_SPACE:
        return CLASS_UCP_XPS;
    case CLASS_UPPER:
        if (mode.caseless) { /* we also pick up lowercase titlecase and others */
            return CLASS_UCP_L;
        } else {
            return CLASS_UCP_LU;
        }
    case CLASS_WORD:
        return CLASS_UCP_XWD;
    default:
        return in;
    }
}

/** \brief Add the horizontal whitespace code points (CLASS_HORZ) to \a rv. */
static void setHorizontalWhitespace(CodePointSet &rv) {
    rv.set(0x0009); /* Horizontal tab */
    rv.set(0x0020); /* Space */
    rv.set(0x00A0); /* Non-break space */
    rv.set(0x1680); /* Ogham space mark */
    rv.set(0x180E); /* Mongolian vowel separator */
    rv.set(0x2000); /* En quad */
    rv.set(0x2001); /* Em quad */
    rv.set(0x2002); /* En space */
    rv.set(0x2003); /* Em space */
    rv.set(0x2004); /* Three-per-em space */
    rv.set(0x2005); /* Four-per-em space */
    rv.set(0x2006); /* Six-per-em space */
    rv.set(0x2007); /* Figure space */
    rv.set(0x2008); /* Punctuation space */
    rv.set(0x2009); /* Thin space */
    rv.set(0x200A); /* Hair space */
    rv.set(0x202F); /* Narrow no-break space */
    rv.set(0x205F); /* Medium mathematical space */
    rv.set(0x3000); /* Ideographic space */
}

/** \brief Add the vertical whitespace code points (CLASS_VERT) to \a rv. */
static void setVerticalWhitespace(CodePointSet &rv) {
    rv.set(0x000A); /* Linefeed */
    rv.set(0x000B); /* Vertical tab */
    rv.set(0x000C); /* Formfeed */
    rv.set(0x000D); /* Carriage return */
    rv.set(0x0085); /* Next line */
    rv.set(0x2028); /* Line separator */
    rv.set(0x2029); /* Paragraph separator */
}

/**
 * \brief Build the set of code points belonging to predefined class \a c.
 *
 * Classes that are not handled explicitly fall back to the byte-oriented
 * definition from getPredefinedCharReach(), copied member by member into a
 * CodePointSet.
 */
CodePointSet getPredefinedCodePointSet(PredefinedClass c,
                                       const ParseMode &mode) {
    /* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */
    switch (c) {
    case CLASS_ANY:
        if (mode.dotall) {
            return CodePointSet(CodePointSet::interval(0, MAX_UNICODE));
        } else {
            /* everything except newline */
            CodePointSet rv;
            rv.set('\n');
            rv.flip();
            return rv;
        }
    case CLASS_XGRAPH: {
        CodePointSet rv;
        rv = getUcpZ();
        rv |= getUcpC();
        rv.flip();
        // most of Cf, except for ...
        CodePointSet cf = getUcpCf();
        cf.unset(0x061c);
        cf.unset(0x180e);
        cf.unsetRange(0x2066, 0x2069);
        rv |= cf;
        return rv;
    }
    case CLASS_XPRINT: {
        // Same as graph, plus everything with the Zs property.
        CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode);
        rv |= getUcpZs();
        rv.set(0x180e); // Also included in this class by PCRE 8.38.
        return rv;
    }
    case CLASS_XPUNCT: {
        // Everything with the P (punctuation) property, plus code points in S
        // (symbols) that are < 128.
        CodePointSet rv = getUcpP();
        CodePointSet symbols = getUcpS();
        symbols.unsetRange(128, MAX_UNICODE);
        rv |= symbols;
        return rv;
    }
    case CLASS_HORZ: {
        CodePointSet rv;
        setHorizontalWhitespace(rv);
        return rv;
    }
    case CLASS_VERT: {
        CodePointSet rv;
        setVerticalWhitespace(rv);
        return rv;
    }
    case CLASS_UCP_XPS:
    case CLASS_UCP_XSP: {
        /* union of the horizontal and vertical whitespace sets */
        CodePointSet rv;
        setHorizontalWhitespace(rv);
        setVerticalWhitespace(rv);
        return rv;
    }
    case CLASS_UCP_C:
        return getUcpC();
    case CLASS_UCP_CC:
        return getUcpCc();
    case CLASS_UCP_CF:
        return getUcpCf();
    case CLASS_UCP_CN:
        return getUcpCn();
    case CLASS_UCP_CO:
        return getUcpCo();
    case CLASS_UCP_CS:
        return getUcpCs();
    case CLASS_UCP_L:
        return getUcpL();
    case CLASS_UCP_L_AND:
        return getUcpL_and();
    case CLASS_UCP_LL:
        return getUcpLl();
    case CLASS_UCP_LM:
        return getUcpLm();
    case CLASS_UCP_LO:
        return getUcpLo();
    case CLASS_UCP_LT:
        return getUcpLt();
    case CLASS_UCP_LU:
        return getUcpLu();
    case CLASS_UCP_M:
        return getUcpM();
    case CLASS_UCP_MC:
        return getUcpMc();
    case CLASS_UCP_ME:
        return getUcpMe();
    case CLASS_UCP_MN:
        return getUcpMn();
    case CLASS_UCP_N:
        return getUcpN();
    case CLASS_UCP_ND:
        return getUcpNd();
    case CLASS_UCP_NL:
        return getUcpNl();
    case CLASS_UCP_NO:
        return getUcpNo();
    case CLASS_UCP_P:
        return getUcpP();
    case CLASS_UCP_PC:
        return getUcpPc();
    case CLASS_UCP_PD:
        return getUcpPd();
    case CLASS_UCP_PE:
        return getUcpPe();
    case CLASS_UCP_PF:
        return getUcpPf();
    case CLASS_UCP_PI:
        return getUcpPi();
    case CLASS_UCP_PO:
        return getUcpPo();
    case CLASS_UCP_PS:
        return getUcpPs();
    case CLASS_UCP_S:
        return getUcpS();
    case CLASS_UCP_SC:
        return getUcpSc();
    case CLASS_UCP_SK:
        return getUcpSk();
    case CLASS_UCP_SM:
        return getUcpSm();
    case CLASS_UCP_SO:
        return getUcpSo();
    case CLASS_UCP_XAN:
        return getUcpXan();
    case CLASS_UCP_XWD:
        return getUcpXwd();
    case CLASS_UCP_Z:
        return getUcpZ();
    case CLASS_UCP_ZL:
        return getUcpZl();
    case CLASS_UCP_ZP:
        return getUcpZp();
    case CLASS_UCP_ZS:
        return getUcpZs();
    case CLASS_SCRIPT_ARABIC:
        return getUcpArabic();
    case CLASS_SCRIPT_ARMENIAN:
        return getUcpArmenian();
    case CLASS_SCRIPT_AVESTAN:
        return getUcpAvestan();
    case CLASS_SCRIPT_BALINESE:
        return getUcpBalinese();
    case CLASS_SCRIPT_BAMUM:
        return getUcpBamum();
    case CLASS_SCRIPT_BATAK:
        return getUcpBatak();
    case CLASS_SCRIPT_BENGALI:
        return getUcpBengali();
    case CLASS_SCRIPT_BOPOMOFO:
        return getUcpBopomofo();
    case CLASS_SCRIPT_BRAHMI:
        return getUcpBrahmi();
    case CLASS_SCRIPT_BRAILLE:
        return getUcpBraille();
    case CLASS_SCRIPT_BUGINESE:
        return getUcpBuginese();
    case CLASS_SCRIPT_BUHID:
        return getUcpBuhid();
    case CLASS_SCRIPT_CANADIAN_ABORIGINAL:
        return getUcpCanadian_Aboriginal();
    case CLASS_SCRIPT_CARIAN:
        return getUcpCarian();
    case CLASS_SCRIPT_CHAM:
        return getUcpCham();
    case CLASS_SCRIPT_CHEROKEE:
        return getUcpCherokee();
    case CLASS_SCRIPT_COMMON:
        return getUcpCommon();
    case CLASS_SCRIPT_COPTIC:
        return getUcpCoptic();
    case CLASS_SCRIPT_CUNEIFORM:
        return getUcpCuneiform();
    case CLASS_SCRIPT_CYPRIOT:
        return getUcpCypriot();
    case CLASS_SCRIPT_CYRILLIC:
        return getUcpCyrillic();
    case CLASS_SCRIPT_DESERET:
        return getUcpDeseret();
    case CLASS_SCRIPT_DEVANAGARI:
        return getUcpDevanagari();
    case CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS:
        return getUcpEgyptian_Hieroglyphs();
    case CLASS_SCRIPT_ETHIOPIC:
        return getUcpEthiopic();
    case CLASS_SCRIPT_GEORGIAN:
        return getUcpGeorgian();
    case CLASS_SCRIPT_GLAGOLITIC:
        return getUcpGlagolitic();
    case CLASS_SCRIPT_GOTHIC:
        return getUcpGothic();
    case CLASS_SCRIPT_GREEK:
        return getUcpGreek();
    case CLASS_SCRIPT_GUJARATI:
        return getUcpGujarati();
    case CLASS_SCRIPT_GURMUKHI:
        return getUcpGurmukhi();
    case CLASS_SCRIPT_HAN:
        return getUcpHan();
    case CLASS_SCRIPT_HANGUL:
        return getUcpHangul();
    case CLASS_SCRIPT_HANUNOO:
        return getUcpHanunoo();
    case CLASS_SCRIPT_HEBREW:
        return getUcpHebrew();
    case CLASS_SCRIPT_HIRAGANA:
        return getUcpHiragana();
    case CLASS_SCRIPT_IMPERIAL_ARAMAIC:
        return getUcpImperial_Aramaic();
    case CLASS_SCRIPT_INHERITED:
        return getUcpInherited();
    case CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI:
        return getUcpInscriptional_Pahlavi();
    case CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN:
        return getUcpInscriptional_Parthian();
    case CLASS_SCRIPT_JAVANESE:
        return getUcpJavanese();
    case CLASS_SCRIPT_KAITHI:
        return getUcpKaithi();
    case CLASS_SCRIPT_KANNADA:
        return getUcpKannada();
    case CLASS_SCRIPT_KATAKANA:
        return getUcpKatakana();
    case CLASS_SCRIPT_KAYAH_LI:
        return getUcpKayah_Li();
    case CLASS_SCRIPT_KHAROSHTHI:
        return getUcpKharoshthi();
    case CLASS_SCRIPT_KHMER:
        return getUcpKhmer();
    case CLASS_SCRIPT_LAO:
        return getUcpLao();
    case CLASS_SCRIPT_LATIN:
        return getUcpLatin();
    case CLASS_SCRIPT_LEPCHA:
        return getUcpLepcha();
    case CLASS_SCRIPT_LIMBU:
        return getUcpLimbu();
    case CLASS_SCRIPT_LINEAR_B:
        return getUcpLinear_B();
    case CLASS_SCRIPT_LISU:
        return getUcpLisu();
    case CLASS_SCRIPT_LYCIAN:
        return getUcpLycian();
    case CLASS_SCRIPT_LYDIAN:
        return getUcpLydian();
    case CLASS_SCRIPT_MALAYALAM:
        return getUcpMalayalam();
    case CLASS_SCRIPT_MANDAIC:
        return getUcpMandaic();
    case CLASS_SCRIPT_MEETEI_MAYEK:
        return getUcpMeetei_Mayek();
    case CLASS_SCRIPT_MONGOLIAN:
        return getUcpMongolian();
    case CLASS_SCRIPT_MYANMAR:
        return getUcpMyanmar();
    case CLASS_SCRIPT_NEW_TAI_LUE:
        return getUcpNew_Tai_Lue();
    case CLASS_SCRIPT_NKO:
        return getUcpNko();
    case CLASS_SCRIPT_OGHAM:
        return getUcpOgham();
    case CLASS_SCRIPT_OL_CHIKI:
        return getUcpOl_Chiki();
    case CLASS_SCRIPT_OLD_ITALIC:
        return getUcpOld_Italic();
    case CLASS_SCRIPT_OLD_PERSIAN:
        return getUcpOld_Persian();
    case CLASS_SCRIPT_OLD_SOUTH_ARABIAN:
        return getUcpOld_South_Arabian();
    case CLASS_SCRIPT_OLD_TURKIC:
        return getUcpOld_Turkic();
    case CLASS_SCRIPT_ORIYA:
        return getUcpOriya();
    case CLASS_SCRIPT_OSMANYA:
        return getUcpOsmanya();
    case CLASS_SCRIPT_PHAGS_PA:
        return getUcpPhags_Pa();
    case CLASS_SCRIPT_PHOENICIAN:
        return getUcpPhoenician();
    case CLASS_SCRIPT_REJANG:
        return getUcpRejang();
    case CLASS_SCRIPT_RUNIC:
        return getUcpRunic();
    case CLASS_SCRIPT_SAMARITAN:
        return getUcpSamaritan();
    case CLASS_SCRIPT_SAURASHTRA:
        return getUcpSaurashtra();
    case CLASS_SCRIPT_SHAVIAN:
        return getUcpShavian();
    case CLASS_SCRIPT_SINHALA:
        return getUcpSinhala();
    case CLASS_SCRIPT_SUNDANESE:
        return getUcpSundanese();
    case CLASS_SCRIPT_SYLOTI_NAGRI:
        return getUcpSyloti_Nagri();
    case CLASS_SCRIPT_SYRIAC:
        return getUcpSyriac();
    case CLASS_SCRIPT_TAGALOG:
        return getUcpTagalog();
    case CLASS_SCRIPT_TAGBANWA:
        return getUcpTagbanwa();
    case CLASS_SCRIPT_TAI_LE:
        return getUcpTai_Le();
    case CLASS_SCRIPT_TAI_THAM:
        return getUcpTai_Tham();
    case CLASS_SCRIPT_TAI_VIET:
        return getUcpTai_Viet();
    case CLASS_SCRIPT_TAMIL:
        return getUcpTamil();
    case CLASS_SCRIPT_TELUGU:
        return getUcpTelugu();
    case CLASS_SCRIPT_THAANA:
        return getUcpThaana();
    case CLASS_SCRIPT_THAI:
        return getUcpThai();
    case CLASS_SCRIPT_TIBETAN:
        return getUcpTibetan();
    case CLASS_SCRIPT_TIFINAGH:
        return getUcpTifinagh();
    case CLASS_SCRIPT_UGARITIC:
        return getUcpUgaritic();
    case CLASS_SCRIPT_VAI:
        return getUcpVai();
    case CLASS_SCRIPT_YI:
        return getUcpYi();
    case CLASS_UCP_ANY:
        return CodePointSet(CodePointSet::interval(0, MAX_UNICODE));

    default: { /* currently uses ascii defns */
        CharReach cr = getPredefinedCharReach(c, mode);
        CodePointSet rv;
        for (u32 i = cr.find_first(); i != CharReach::npos;
             i = cr.find_next(i)) {
            rv.set(i);
        }
        return rv;
    }
    }
}

/**
 * \brief Construct an empty UTF-8 character class.
 *
 * All cached graph positions start out as POS_UNINITIALIZED; they are only
 * created on demand by the build*() / ensure*() members.
 */
UTF8ComponentClass::UTF8ComponentClass(const ParseMode &mode_in)
    : ComponentClass(mode_in),
      single_pos(GlushkovBuildState::POS_UNINITIALIZED),
      one_dot_trailer(GlushkovBuildState::POS_UNINITIALIZED),
      two_dot_trailer(GlushkovBuildState::POS_UNINITIALIZED),
      three_dot_trailer(GlushkovBuildState::POS_UNINITIALIZED),
      two_char_dot_head(GlushkovBuildState::POS_UNINITIALIZED),
      three_char_dot_head(GlushkovBuildState::POS_UNINITIALIZED),
      four_char_dot_head(GlushkovBuildState::POS_UNINITIALIZED) {
    assert(mode.utf8);
}

/** \brief Return a heap-allocated copy of this class. */
UTF8ComponentClass *UTF8ComponentClass::clone() const {
    return new UTF8ComponentClass(*this);
}

/** \brief True if the (finalized) class contains no code points. */
bool UTF8ComponentClass::class_empty(void) const {
    assert(finalized);
    return cps.none();
}

/**
 * \brief Close the pending range [range_start, to] and add it to the class.
 *
 * Throws LocatedParseError if the range end precedes the range start.
 */
void UTF8ComponentClass::createRange(unichar to) {
    assert(range_start != INVALID_UNICODE);
    unichar from = range_start;
    if (from > to) {
        throw LocatedParseError("Range out of order in character class");
    }

    in_cand_range = false;
    CodePointSet ncps;
    ncps.setRange(from, to);
    if (mode.caseless) {
        make_caseless(&ncps);
    }
    cps |= ncps;
    range_start = INVALID_UNICODE;
}

/**
 * \brief Add a predefined class (optionally negated) to this class.
 *
 * Throws LocatedParseError if a range is pending, as a predefined class
 * cannot terminate a range.
 */
void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
    if (in_cand_range) { // can't form a range here
        throw LocatedParseError("Invalid range in character class");
    }

    if (mode.ucp) {
        c = translateForUcpMode(c, mode);
    }

    // caselessness is handled inside this call - don't apply make_caseless
    // to the result
    CodePointSet pcps = getPredefinedCodePointSet(c, mode);
    if (negative) {
        pcps.flip();
    }

    cps |= pcps;

    range_start = INVALID_UNICODE;
    in_cand_range = false;
}

/**
 * \brief Add a single code point, or use it to close a pending range.
 *
 * Throws LocatedParseError if \a c is above MAX_UNICODE.
 */
void UTF8ComponentClass::add(unichar c) {
    DEBUG_PRINTF("adding \\x%08x\n", c);
    if (c > MAX_UNICODE) { // too big!
        throw LocatedParseError("Hexadecimal value is greater than \\x10FFFF");
    }

    if (in_cand_range) {
        createRange(c);
        return;
    }

    CodePointSet ncps;
    ncps.set(c);
    if (mode.caseless) {
        make_caseless(&ncps);
    }
    cps |= ncps;
    /* remember c in case it turns out to be the start of a range */
    range_start = c;
}

/**
 * \brief Complete parsing of the class: resolve a dangling dash and apply
 * negation. Does nothing if already finalized.
 */
void UTF8ComponentClass::finalize() {
    if (finalized) {
        return;
    }

    // Handle unclosed ranges, like '[a-]' and '[a-\Q\E]' -- in these cases the
    // dash is a literal dash.
    if (in_cand_range) {
        cps.set('-');
        in_cand_range = false;
    }

    if (m_negate) {
        cps.flip();
    }

    finalized = true;
}

/**
 * \brief Return the position that matches the leading byte \a first_byte,
 * creating and caching it in \a heads on first use.
 */
Position UTF8ComponentClass::getHead(NFABuilder &builder, u8 first_byte) {
    auto it = heads.find(first_byte);
    if (it != heads.end()) {
        return it->second;
    }

    Position head = builder.makePositions(1);
    builder.addCharReach(head, CharReach(first_byte));
    /* no report id as head can not be directly wired to accept */

    heads[first_byte] = head;
    return head;
}

/**
 * \brief Ensure the shared final continuation-byte position exists.
 *
 * The position has reach [0x80, 0xbf], carries a report id and is recorded in
 * \a tails.
 */
void UTF8ComponentClass::ensureDotTrailer(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    if (one_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED) {
        return;
    }

    one_dot_trailer = builder.makePositions(1);
    builder.setNodeReportID(one_dot_trailer, 0);
    builder.addCharReach(one_dot_trailer, CharReach(0x80, 0xbf));
    tails.insert(one_dot_trailer);
}

/**
 * \brief Ensure the shared position matching any continuation byte followed by
 * the one-byte trailer exists.
 */
void UTF8ComponentClass::ensureTwoDotTrailer(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    if (two_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED) {
        return;
    }

    ensureDotTrailer(bs);

    two_dot_trailer = builder.makePositions(1);
    builder.addCharReach(two_dot_trailer, CharReach(0x80, 0xbf));
    bs.addSuccessor(two_dot_trailer, one_dot_trailer);
}

/**
 * \brief Ensure the shared position matching any continuation byte followed by
 * the two-byte trailer exists.
 */
void UTF8ComponentClass::ensureThreeDotTrailer(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    if (three_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED) {
        return;
    }

    ensureTwoDotTrailer(bs);

    three_dot_trailer = builder.makePositions(1);
    builder.addCharReach(three_dot_trailer, CharReach(0x80, 0xbf));
    bs.addSuccessor(three_dot_trailer, two_dot_trailer);
}

/**
 * \brief Build the position for the code points below UTF_2CHAR_MIN.
 *
 * All such code points share the single position \a single_pos.
 */
void UTF8ComponentClass::buildOneByte(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    for (const auto &interval : cps) {
        unichar b = lower(interval);
        unichar e = upper(interval) + 1; /* exclusive end */
        if (b >= UTF_2CHAR_MIN) {
            continue;
        }

        DEBUG_PRINTF("building vertices for [%u, %u)\n", b, e);

        if (single_pos == GlushkovBuildState::POS_UNINITIALIZED) {
            single_pos = builder.makePositions(1);
            builder.setNodeReportID(single_pos, 0 /* offset adj */);
            tails.insert(single_pos);
        }
        CharReach cr(b, MIN(e, UTF_2CHAR_MIN) - 1);
        builder.addCharReach(single_pos, cr);
    }
}

/**
 * \brief Add the final-byte range for code points [b, e) to the tail position
 * that follows \a prev.
 *
 * The tail for \a prev is created on first use (with a report id, wired as a
 * successor of \a prev and recorded in \a tails) and cached in \a finals.
 */
void UTF8ComponentClass::addToTail(GlushkovBuildState &bs,
                                   map<Position, Position> &finals,
                                   Position prev, unichar b, unichar e) {
    NFABuilder &builder = bs.getBuilder();
    Position tail;
    auto it = finals.find(prev);
    if (it == finals.end()) {
        tail = builder.makePositions(1);
        builder.setNodeReportID(tail, 0 /* offset adj */);
        bs.addSuccessor(prev, tail);
        finals[prev] = tail;
        tails.insert(tail);
    } else {
        tail = it->second;
    }

    u8 bb = makeContByte(b);
    u8 ee = makeContByte(e - 1);
    builder.addCharReach(tail, CharReach(bb, ee));
}

/**
 * \brief Build positions for the code points in
 * [UTF_2CHAR_MIN, UTF_3CHAR_MIN).
 *
 * Each interval is trimmed at both ends to a continuation-byte boundary; the
 * trimmed parts get a dedicated head/tail pair and the aligned middle shares
 * \a two_char_dot_head followed by the one-byte dot trailer.
 */
void UTF8ComponentClass::buildTwoByte(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    map<Position, Position> finals;

    for (const auto &interval : cps) {
        unichar b = lower(interval);
        unichar e = upper(interval) + 1;

        b = MAX(b, UTF_2CHAR_MIN);
        e = MIN(e, UTF_3CHAR_MIN);

        if (b >= e) {
            continue; /* we're done here */
        }

        /* raise b to the start of the next tail byte boundary */
        if (b & UTF_CONT_BYTE_VALUE_MASK) {
            unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE));
            u8 first_byte = UTF_TWO_BYTE_HEADER | (b >> UTF_CONT_SHIFT);
            assert(first_byte > 0xc1 && first_byte <= 0xdf);

            Position head = getHead(builder, first_byte);
            addToTail(bs, finals, head, b, bb);

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the end of a tail byte boundary */
        if (e & UTF_CONT_BYTE_VALUE_MASK) {
            unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK;
            assert(ee >= b);

            u8 first_byte = UTF_TWO_BYTE_HEADER | (ee >> UTF_CONT_SHIFT);
            assert(first_byte > 0xc1 && first_byte <= 0xdf);

            Position head = getHead(builder, first_byte);
            addToTail(bs, finals, head, ee, e);

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* middle section just goes to a common full vertex */
        ensureDotTrailer(bs);

        if (two_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
            two_char_dot_head = builder.makePositions(1);
            bs.addSuccessor(two_char_dot_head, one_dot_trailer);
        }

        u8 min_first_byte = UTF_TWO_BYTE_HEADER | (b >> UTF_CONT_SHIFT);
        u8 max_first_byte = UTF_TWO_BYTE_HEADER | ((e - 1) >> UTF_CONT_SHIFT);

        assert(min_first_byte > 0xc1 && min_first_byte <= 0xdf);
        assert(max_first_byte > 0xc1 && max_first_byte <= 0xdf);

        builder.addCharReach(two_char_dot_head,
                             CharReach(min_first_byte, max_first_byte));
    }
}

/**
 * \brief Return the position matching byte \a byte_val as a successor of
 * \a prev, creating and caching it in \a mids on first use.
 */
static
Position getMid(GlushkovBuildState &bs,
                map<Position, map<u8, Position> > &mids,
                const Position &prev, u8 byte_val) {
    NFABuilder &builder = bs.getBuilder();
    map<u8, Position> &by_byte = mids[prev];

    auto it = by_byte.find(byte_val);
    if (it != by_byte.end()) {
        return it->second;
    }

    Position mid = builder.makePositions(1);
    builder.addCharReach(mid, CharReach(byte_val));
    bs.addSuccessor(prev, mid);
    /* no report id as mid can not be directly wired to accept */

    by_byte[byte_val] = mid;
    return mid;
}

/** \brief True if the low (2 * UTF_CONT_SHIFT) bits of \a raw are all zero. */
static
bool isTwoContAligned(unichar raw) {
    return !(raw & ((1 << (2 * UTF_CONT_SHIFT)) - 1));
}

/** \brief True if the low (3 * UTF_CONT_SHIFT) bits of \a raw are all zero. */
static
bool isThreeContAligned(unichar raw) {
    return !(raw & ((1 << (3 * UTF_CONT_SHIFT)) - 1));
}

/**
 * \brief Build positions for the code points in
 * [UTF_3CHAR_MIN, UTF_4CHAR_MIN).
 *
 * Each interval is peeled from both ends: first to a final-byte boundary, then
 * to a middle-byte boundary. Whatever remains shares \a three_char_dot_head
 * followed by the two-byte dot trailer.
 */
void UTF8ComponentClass::buildThreeByte(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();

    map<Position, map<u8, Position> > mids;
    map<Position, Position> finals;

    for (const auto &interval : cps) {
        unichar b = lower(interval);
        unichar e = upper(interval) + 1;

        b = MAX(b, UTF_3CHAR_MIN);
        e = MIN(e, UTF_4CHAR_MIN);

        if (b >= e) {
            continue; /* we're done here */
        }

        /* raise b to the start of the next tail byte boundary */
        if (b & UTF_CONT_BYTE_VALUE_MASK) {
            unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE));

            u8 first_byte = UTF_THREE_BYTE_HEADER | (b >> (2 * UTF_CONT_SHIFT));
            assert(first_byte >= 0xe0 && first_byte <= 0xef);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(b >> UTF_CONT_SHIFT);
            Position mid = getMid(bs, mids, head, second_byte);

            addToTail(bs, finals, mid, b, bb);

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the end of a tail byte boundary */
        if (e & UTF_CONT_BYTE_VALUE_MASK) {
            unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK;
            assert(ee >= b);

            u8 first_byte
                = UTF_THREE_BYTE_HEADER | (ee >> (2 * UTF_CONT_SHIFT));
            assert(first_byte >= 0xe0 && first_byte <= 0xef);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(ee >> UTF_CONT_SHIFT);
            Position mid = getMid(bs, mids, head, second_byte);

            addToTail(bs, finals, mid, ee, e);

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* from here on in the last byte is always full */
        ensureDotTrailer(bs);

        /* raise b to the start of the next mid byte boundary */
        if (!isTwoContAligned(b)) {
            unichar bb = MIN(e, ROUNDUP_N(b, 1 << (2 * UTF_CONT_SHIFT)));

            u8 first_byte = UTF_THREE_BYTE_HEADER | (b >> (2 * UTF_CONT_SHIFT));
            Position head = getHead(builder, first_byte);

            Position mid = builder.makePositions(1);
            bs.addSuccessor(head, mid);
            bs.addSuccessor(mid, one_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            u8 second_min = makeContByte(b >> UTF_CONT_SHIFT);
            u8 second_max = makeContByte((bb - 1) >> UTF_CONT_SHIFT);

            builder.addCharReach(mid, CharReach(second_min, second_max));

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the end of a mid byte boundary */
        if (!isTwoContAligned(e)) {
            unichar ee = e & ~((1 << (2 * UTF_CONT_SHIFT)) - 1);
            assert(ee >= b);

            u8 first_byte
                = UTF_THREE_BYTE_HEADER | (ee >> (2 * UTF_CONT_SHIFT));
            Position head = getHead(builder, first_byte);

            Position mid = builder.makePositions(1);
            bs.addSuccessor(head, mid);
            bs.addSuccessor(mid, one_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            u8 second_min = makeContByte(ee >> UTF_CONT_SHIFT);
            u8 second_max = makeContByte((e - 1) >> UTF_CONT_SHIFT);

            builder.addCharReach(mid, CharReach(second_min, second_max));

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* now we just have to wire head to a common dot trailer */
        ensureTwoDotTrailer(bs);
        if (three_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
            three_char_dot_head = builder.makePositions(1);
            bs.addSuccessor(three_char_dot_head, two_dot_trailer);
        }

        u8 min_first_byte
            = UTF_THREE_BYTE_HEADER | (b >> (2 * UTF_CONT_SHIFT));
        u8 max_first_byte
            = UTF_THREE_BYTE_HEADER | ((e - 1) >> (2 * UTF_CONT_SHIFT));

        assert(min_first_byte > 0xdf && min_first_byte <= 0xef);
        assert(max_first_byte > 0xdf && max_first_byte <= 0xef);

        builder.addCharReach(three_char_dot_head,
                             CharReach(min_first_byte, max_first_byte));
    }
}

/** \brief Compute the leading byte of the four-byte encoding of \a raw. */
static
u8 makeFirstByteOfFour(unichar raw) {
    u8 first_byte = UTF_FOUR_BYTE_HEADER | (raw >> (3 * UTF_CONT_SHIFT));
    assert(first_byte > 0xef && first_byte <= 0xf7);
    return first_byte;
}

/**
 * \brief Build positions for the code points in
 * [UTF_4CHAR_MIN, MAX_UNICODE].
 *
 * Same peeling scheme as buildThreeByte() with one extra level: final-byte
 * boundary, third-byte boundary, second-byte boundary, and finally
 * \a four_char_dot_head followed by the three-byte dot trailer.
 */
void UTF8ComponentClass::buildFourByte(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    map<Position, map<u8, Position> > mids;
    map<Position, Position> finals;

    for (const auto &interval : cps) {
        unichar b = lower(interval);
        unichar e = upper(interval) + 1;

        b = MAX(b, UTF_4CHAR_MIN);
        e = MIN(e, MAX_UNICODE + 1);

        if (b >= e) {
            continue;
        }

        /* raise b to the start of the next tail byte boundary */
        if (b & UTF_CONT_BYTE_VALUE_MASK) {
            unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE));

            u8 first_byte = makeFirstByteOfFour(b);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(b >> (2 * UTF_CONT_SHIFT));
            Position mid1 = getMid(bs, mids, head, second_byte);

            u8 third_byte = makeContByte(b >> UTF_CONT_SHIFT);
            Position mid2 = getMid(bs, mids, mid1, third_byte);

            addToTail(bs, finals, mid2, b, bb);

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the end of a tail byte boundary */
        if (e & UTF_CONT_BYTE_VALUE_MASK) {
            unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK;
            assert(ee >= b);

            u8 first_byte = makeFirstByteOfFour(ee);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
            Position mid1 = getMid(bs, mids, head, second_byte);

            u8 third_byte = makeContByte(ee >> UTF_CONT_SHIFT);
            Position mid2 = getMid(bs, mids, mid1, third_byte);

            addToTail(bs, finals, mid2, ee, e);

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* from here on in the last byte is always full */
        ensureDotTrailer(bs);

        /* raise b to the start of the next mid byte boundary */
        if (!isTwoContAligned(b)) {
            unichar bb = MIN(e, ROUNDUP_N(b, 1 << (2 * UTF_CONT_SHIFT)));

            u8 first_byte = makeFirstByteOfFour(b);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(b >> (2 * UTF_CONT_SHIFT));
            Position mid1 = getMid(bs, mids, head, second_byte);

            Position mid2 = builder.makePositions(1);
            bs.addSuccessor(mid1, mid2);
            bs.addSuccessor(mid2, one_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            u8 byte_min = makeContByte(b >> UTF_CONT_SHIFT);
            u8 byte_max = makeContByte((bb - 1) >> UTF_CONT_SHIFT);

            builder.addCharReach(mid2, CharReach(byte_min, byte_max));

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the end of a mid byte boundary */
        if (!isTwoContAligned(e)) {
            unichar ee = e & ~((1 << (2 * UTF_CONT_SHIFT)) - 1);
            assert(ee >= b);

            u8 first_byte = makeFirstByteOfFour(ee);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
            Position mid1 = getMid(bs, mids, head, second_byte);

            Position mid2 = builder.makePositions(1);
            bs.addSuccessor(mid1, mid2);
            bs.addSuccessor(mid2, one_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            u8 byte_min = makeContByte(ee >> UTF_CONT_SHIFT);
            u8 byte_max = makeContByte((e - 1) >> UTF_CONT_SHIFT);

            builder.addCharReach(mid2, CharReach(byte_min, byte_max));

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        ensureTwoDotTrailer(bs);

        /* raise b to the next byte boundary */
        if (!isThreeContAligned(b)) {
            unichar bb = MIN(e, ROUNDUP_N(b, 1 << (3 * UTF_CONT_SHIFT)));

            u8 first_byte = makeFirstByteOfFour(b);
            Position head = getHead(builder, first_byte);

            Position mid1 = builder.makePositions(1);
            bs.addSuccessor(head, mid1);
            bs.addSuccessor(mid1, two_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            u8 byte_min = makeContByte(b >> (2 * UTF_CONT_SHIFT));
            u8 byte_max = makeContByte((bb - 1) >> (2 * UTF_CONT_SHIFT));

            builder.addCharReach(mid1, CharReach(byte_min, byte_max));

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the next byte boundary */
        if (!isThreeContAligned(e)) {
            unichar ee = e & ~((1 << (3 * UTF_CONT_SHIFT)) - 1);
            assert(ee >= b);

            u8 first_byte = makeFirstByteOfFour(ee);
            Position head = getHead(builder, first_byte);

            Position mid1 = builder.makePositions(1);
            bs.addSuccessor(head, mid1);
            bs.addSuccessor(mid1, two_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            u8 byte_min = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
            u8 byte_max = makeContByte((e - 1) >> (2 * UTF_CONT_SHIFT));

            builder.addCharReach(mid1, CharReach(byte_min, byte_max));

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* now we just have to wire head to a common dot trailer */
        ensureThreeDotTrailer(bs);
        if (four_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
            four_char_dot_head = builder.makePositions(1);
            bs.addSuccessor(four_char_dot_head, three_dot_trailer);
        }

        u8 min_first_byte = makeFirstByteOfFour(b);
        u8 max_first_byte = makeFirstByteOfFour(e - 1);

        builder.addCharReach(four_char_dot_head,
                             CharReach(min_first_byte, max_first_byte));
    }
}

/**
 * \brief Create every graph position needed to match this class.
 *
 * The class must already be finalized.
 */
void UTF8ComponentClass::notePositions(GlushkovBuildState &bs) {
    // We should always be finalized by now.
    assert(finalized);

    // An empty class is a special case; this would be generated by something
    // like /[\s\S]/8, which can never match. We treat these like we do the non
    // UTF-8 version: add a vertex with empty reach (to ensure we create a
    // connected graph) and pick it up later on.
    if (class_empty()) {
        DEBUG_PRINTF("empty class!\n");
        assert(single_pos == GlushkovBuildState::POS_UNINITIALIZED);
        NFABuilder &builder = bs.getBuilder();
        single_pos = builder.makePositions(1);
        builder.setNodeReportID(single_pos, 0 /* offset adj */);
        builder.addCharReach(single_pos, CharReach());
        tails.insert(single_pos);
        return;
    }

    buildOneByte(bs);
    buildTwoByte(bs);
    buildThreeByte(bs);
    buildFourByte(bs);
}

/** \brief Intentionally a no-op for this component. */
void UTF8ComponentClass::buildFollowSet(GlushkovBuildState &,
                                        const vector<PositionInfo> &) {
    /* states are wired in notePositions as all belong to this component. */
}

/**
 * \brief Return every position that can match the first byte of an encoded
 * code point: the one-byte position, the shared multi-byte dot heads and all
 * per-byte heads, in that order.
 */
vector<PositionInfo> UTF8ComponentClass::first(void) const {
    vector<PositionInfo> rv;
    if (single_pos != GlushkovBuildState::POS_UNINITIALIZED) {
        rv.emplace_back(single_pos);
    }
    if (two_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) {
        rv.emplace_back(two_char_dot_head);
    }
    if (three_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) {
        rv.emplace_back(three_char_dot_head);
    }
    if (four_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) {
        rv.emplace_back(four_char_dot_head);
    }

    for (const auto &head : heads) {
        rv.emplace_back(head.second);
    }
    return rv;
}

/** \brief Return every position recorded in \a tails. */
vector<PositionInfo> UTF8ComponentClass::last(void) const {
    return vector<PositionInfo>(tails.begin(), tails.end());
}

} // namespace ue2