// vectorscan/src/parser/Utf8ComponentClass.cpp
// 2021-10-12 11:51:33 +03:00
//
// 1174 lines
// 37 KiB
// C++

/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Character class in UTF-8 mode.
*/
#include "Utf8ComponentClass.h"
#include "buildstate.h"
#include "Parser.h"
#include "parse_error.h"
#include "position.h"
#include "position_info.h"
#include "nfagraph/ng_builder.h"
#include "util/compare.h"
#include "util/unicode_def.h"
#include <cstring>
#include "ucp_table.h"
using namespace std;
namespace ue2 {
/**
 * \brief Map a POSIX-style predefined class onto the class used in its place
 * when UCP mode is on.
 *
 * The mapping follows the observed behaviour of PCRE (sending "lower" and
 * "upper" to the Unicode letter class under caseless matching is not
 * documented by PCRE). It is quite different from both of the mappings
 * recommended in appendix C of the Unicode regex tech report (TR-18).
 *
 * \param in   class to translate
 * \param mode parse mode; only its caseless flag is consulted
 * \return the replacement class, or the input unchanged if it has no mapping
 */
PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
    /* The two case-sensitive classes widen to "any letter" when caseless, so
     * that uppercase/lowercase, titlecase and other letters are picked up. */
    if (in == CLASS_LOWER) {
        return mode.caseless ? CLASS_UCP_L : CLASS_UCP_LL;
    }
    if (in == CLASS_UPPER) {
        return mode.caseless ? CLASS_UCP_L : CLASS_UCP_LU;
    }

    switch (in) {
    case CLASS_ALNUM: return CLASS_UCP_XAN;
    case CLASS_ALPHA: return CLASS_UCP_L;
    case CLASS_BLANK: return CLASS_HORZ;
    case CLASS_DIGIT: return CLASS_UCP_ND;
    case CLASS_GRAPH: return CLASS_XGRAPH;
    case CLASS_PRINT: return CLASS_XPRINT;
    case CLASS_PUNCT: return CLASS_XPUNCT;
    case CLASS_SPACE: return CLASS_UCP_XPS;
    case CLASS_WORD:  return CLASS_UCP_XWD;
    default:          return in; /* no UCP-specific replacement */
    }
}
/**
 * \brief Build the set of Unicode code points belonging to a predefined class.
 *
 * Classes with an explicit case below are answered from the generated UCP
 * tables (the getUcp*() functions) or from small hand-written tables. Any
 * other class falls back to its byte-oriented CharReach definition, with each
 * member byte value added as the code point of the same value.
 *
 * \param c    class to expand
 * \param mode parse mode; dotall is consulted for CLASS_ANY, and the mode is
 *             forwarded to getPredefinedCharReach() for the fallback case
 * \return the code points in the class
 */
CodePointSet getPredefinedCodePointSet(PredefinedClass c,
                                       const ParseMode &mode) {
    /* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */
    switch (c) {
    case CLASS_ANY:
        if (mode.dotall) {
            return CodePointSet(CodePointSet::interval(0, MAX_UNICODE));
        } else {
            /* everything except newline */
            CodePointSet rv;
            rv.set('\n');
            rv.flip();
            return rv;
        }
    case CLASS_XGRAPH: {
        /* everything that is neither Z nor C ... */
        CodePointSet rv = getUcpZ();
        rv |= getUcpC();
        rv.flip();
        // ... plus most of Cf, except for the code points removed below
        CodePointSet cf = getUcpCf();
        cf.unset(0x061c);
        cf.unset(0x180e);
        cf.unsetRange(0x2066, 0x2069);
        rv |= cf;
        return rv;
    }
    case CLASS_XPRINT: {
        // Same as graph, plus everything with the Zs property.
        CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode);
        rv |= getUcpZs();
        rv.set(0x180e); // Also included in this class by PCRE 8.38.
        return rv;
    }
    case CLASS_XPUNCT: {
        // Everything with the P (punctuation) property, plus code points in S
        // (symbols) that are < 128.
        CodePointSet rv = getUcpP();
        CodePointSet symbols = getUcpS();
        symbols.unsetRange(128, MAX_UNICODE);
        rv |= symbols;
        return rv;
    }
    case CLASS_HORZ: {
        CodePointSet rv;
        rv.set(0x0009); /* Horizontal tab */
        rv.set(0x0020); /* Space */
        rv.set(0x00A0); /* Non-break space */
        rv.set(0x1680); /* Ogham space mark */
        rv.set(0x180E); /* Mongolian vowel separator */
        rv.set(0x2000); /* En quad */
        rv.set(0x2001); /* Em quad */
        rv.set(0x2002); /* En space */
        rv.set(0x2003); /* Em space */
        rv.set(0x2004); /* Three-per-em space */
        rv.set(0x2005); /* Four-per-em space */
        rv.set(0x2006); /* Six-per-em space */
        rv.set(0x2007); /* Figure space */
        rv.set(0x2008); /* Punctuation space */
        rv.set(0x2009); /* Thin space */
        rv.set(0x200A); /* Hair space */
        rv.set(0x202F); /* Narrow no-break space */
        rv.set(0x205F); /* Medium mathematical space */
        rv.set(0x3000); /* Ideographic space */
        return rv;
    }
    case CLASS_VERT: {
        CodePointSet rv;
        rv.set(0x000A); /* Linefeed */
        rv.set(0x000B); /* Vertical tab */
        rv.set(0x000C); /* Formfeed */
        rv.set(0x000D); /* Carriage return */
        rv.set(0x0085); /* Next line */
        rv.set(0x2028); /* Line separator */
        rv.set(0x2029); /* Paragraph separator */
        return rv;
    }
    case CLASS_UCP_XPS:
    case CLASS_UCP_XSP: {
        /* Exactly the horizontal whitespace class plus the vertical
         * whitespace class; composed from those cases so the tables are
         * only written down once. */
        CodePointSet rv = getPredefinedCodePointSet(CLASS_HORZ, mode);
        rv |= getPredefinedCodePointSet(CLASS_VERT, mode);
        return rv;
    }

    /* General category properties */
    case CLASS_UCP_C:
        return getUcpC();
    case CLASS_UCP_CC:
        return getUcpCc();
    case CLASS_UCP_CF:
        return getUcpCf();
    case CLASS_UCP_CN:
        return getUcpCn();
    case CLASS_UCP_CO:
        return getUcpCo();
    case CLASS_UCP_CS:
        return getUcpCs();
    case CLASS_UCP_L:
        return getUcpL();
    case CLASS_UCP_L_AND:
        return getUcpL_and();
    case CLASS_UCP_LL:
        return getUcpLl();
    case CLASS_UCP_LM:
        return getUcpLm();
    case CLASS_UCP_LO:
        return getUcpLo();
    case CLASS_UCP_LT:
        return getUcpLt();
    case CLASS_UCP_LU:
        return getUcpLu();
    case CLASS_UCP_M:
        return getUcpM();
    case CLASS_UCP_MC:
        return getUcpMc();
    case CLASS_UCP_ME:
        return getUcpMe();
    case CLASS_UCP_MN:
        return getUcpMn();
    case CLASS_UCP_N:
        return getUcpN();
    case CLASS_UCP_ND:
        return getUcpNd();
    case CLASS_UCP_NL:
        return getUcpNl();
    case CLASS_UCP_NO:
        return getUcpNo();
    case CLASS_UCP_P:
        return getUcpP();
    case CLASS_UCP_PC:
        return getUcpPc();
    case CLASS_UCP_PD:
        return getUcpPd();
    case CLASS_UCP_PE:
        return getUcpPe();
    case CLASS_UCP_PF:
        return getUcpPf();
    case CLASS_UCP_PI:
        return getUcpPi();
    case CLASS_UCP_PO:
        return getUcpPo();
    case CLASS_UCP_PS:
        return getUcpPs();
    case CLASS_UCP_S:
        return getUcpS();
    case CLASS_UCP_SC:
        return getUcpSc();
    case CLASS_UCP_SK:
        return getUcpSk();
    case CLASS_UCP_SM:
        return getUcpSm();
    case CLASS_UCP_SO:
        return getUcpSo();
    case CLASS_UCP_XAN:
        return getUcpXan();
    case CLASS_UCP_XWD:
        return getUcpXwd();
    case CLASS_UCP_Z:
        return getUcpZ();
    case CLASS_UCP_ZL:
        return getUcpZl();
    case CLASS_UCP_ZP:
        return getUcpZp();
    case CLASS_UCP_ZS:
        return getUcpZs();

    /* Script properties */
    case CLASS_SCRIPT_ARABIC:
        return getUcpArabic();
    case CLASS_SCRIPT_ARMENIAN:
        return getUcpArmenian();
    case CLASS_SCRIPT_AVESTAN:
        return getUcpAvestan();
    case CLASS_SCRIPT_BALINESE:
        return getUcpBalinese();
    case CLASS_SCRIPT_BAMUM:
        return getUcpBamum();
    case CLASS_SCRIPT_BATAK:
        return getUcpBatak();
    case CLASS_SCRIPT_BENGALI:
        return getUcpBengali();
    case CLASS_SCRIPT_BOPOMOFO:
        return getUcpBopomofo();
    case CLASS_SCRIPT_BRAHMI:
        return getUcpBrahmi();
    case CLASS_SCRIPT_BRAILLE:
        return getUcpBraille();
    case CLASS_SCRIPT_BUGINESE:
        return getUcpBuginese();
    case CLASS_SCRIPT_BUHID:
        return getUcpBuhid();
    case CLASS_SCRIPT_CANADIAN_ABORIGINAL:
        return getUcpCanadian_Aboriginal();
    case CLASS_SCRIPT_CARIAN:
        return getUcpCarian();
    case CLASS_SCRIPT_CHAM:
        return getUcpCham();
    case CLASS_SCRIPT_CHEROKEE:
        return getUcpCherokee();
    case CLASS_SCRIPT_COMMON:
        return getUcpCommon();
    case CLASS_SCRIPT_COPTIC:
        return getUcpCoptic();
    case CLASS_SCRIPT_CUNEIFORM:
        return getUcpCuneiform();
    case CLASS_SCRIPT_CYPRIOT:
        return getUcpCypriot();
    case CLASS_SCRIPT_CYRILLIC:
        return getUcpCyrillic();
    case CLASS_SCRIPT_DESERET:
        return getUcpDeseret();
    case CLASS_SCRIPT_DEVANAGARI:
        return getUcpDevanagari();
    case CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS:
        return getUcpEgyptian_Hieroglyphs();
    case CLASS_SCRIPT_ETHIOPIC:
        return getUcpEthiopic();
    case CLASS_SCRIPT_GEORGIAN:
        return getUcpGeorgian();
    case CLASS_SCRIPT_GLAGOLITIC:
        return getUcpGlagolitic();
    case CLASS_SCRIPT_GOTHIC:
        return getUcpGothic();
    case CLASS_SCRIPT_GREEK:
        return getUcpGreek();
    case CLASS_SCRIPT_GUJARATI:
        return getUcpGujarati();
    case CLASS_SCRIPT_GURMUKHI:
        return getUcpGurmukhi();
    case CLASS_SCRIPT_HAN:
        return getUcpHan();
    case CLASS_SCRIPT_HANGUL:
        return getUcpHangul();
    case CLASS_SCRIPT_HANUNOO:
        return getUcpHanunoo();
    case CLASS_SCRIPT_HEBREW:
        return getUcpHebrew();
    case CLASS_SCRIPT_HIRAGANA:
        return getUcpHiragana();
    case CLASS_SCRIPT_IMPERIAL_ARAMAIC:
        return getUcpImperial_Aramaic();
    case CLASS_SCRIPT_INHERITED:
        return getUcpInherited();
    case CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI:
        return getUcpInscriptional_Pahlavi();
    case CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN:
        return getUcpInscriptional_Parthian();
    case CLASS_SCRIPT_JAVANESE:
        return getUcpJavanese();
    case CLASS_SCRIPT_KAITHI:
        return getUcpKaithi();
    case CLASS_SCRIPT_KANNADA:
        return getUcpKannada();
    case CLASS_SCRIPT_KATAKANA:
        return getUcpKatakana();
    case CLASS_SCRIPT_KAYAH_LI:
        return getUcpKayah_Li();
    case CLASS_SCRIPT_KHAROSHTHI:
        return getUcpKharoshthi();
    case CLASS_SCRIPT_KHMER:
        return getUcpKhmer();
    case CLASS_SCRIPT_LAO:
        return getUcpLao();
    case CLASS_SCRIPT_LATIN:
        return getUcpLatin();
    case CLASS_SCRIPT_LEPCHA:
        return getUcpLepcha();
    case CLASS_SCRIPT_LIMBU:
        return getUcpLimbu();
    case CLASS_SCRIPT_LINEAR_B:
        return getUcpLinear_B();
    case CLASS_SCRIPT_LISU:
        return getUcpLisu();
    case CLASS_SCRIPT_LYCIAN:
        return getUcpLycian();
    case CLASS_SCRIPT_LYDIAN:
        return getUcpLydian();
    case CLASS_SCRIPT_MALAYALAM:
        return getUcpMalayalam();
    case CLASS_SCRIPT_MANDAIC:
        return getUcpMandaic();
    case CLASS_SCRIPT_MEETEI_MAYEK:
        return getUcpMeetei_Mayek();
    case CLASS_SCRIPT_MONGOLIAN:
        return getUcpMongolian();
    case CLASS_SCRIPT_MYANMAR:
        return getUcpMyanmar();
    case CLASS_SCRIPT_NEW_TAI_LUE:
        return getUcpNew_Tai_Lue();
    case CLASS_SCRIPT_NKO:
        return getUcpNko();
    case CLASS_SCRIPT_OGHAM:
        return getUcpOgham();
    case CLASS_SCRIPT_OL_CHIKI:
        return getUcpOl_Chiki();
    case CLASS_SCRIPT_OLD_ITALIC:
        return getUcpOld_Italic();
    case CLASS_SCRIPT_OLD_PERSIAN:
        return getUcpOld_Persian();
    case CLASS_SCRIPT_OLD_SOUTH_ARABIAN:
        return getUcpOld_South_Arabian();
    case CLASS_SCRIPT_OLD_TURKIC:
        return getUcpOld_Turkic();
    case CLASS_SCRIPT_ORIYA:
        return getUcpOriya();
    case CLASS_SCRIPT_OSMANYA:
        return getUcpOsmanya();
    case CLASS_SCRIPT_PHAGS_PA:
        return getUcpPhags_Pa();
    case CLASS_SCRIPT_PHOENICIAN:
        return getUcpPhoenician();
    case CLASS_SCRIPT_REJANG:
        return getUcpRejang();
    case CLASS_SCRIPT_RUNIC:
        return getUcpRunic();
    case CLASS_SCRIPT_SAMARITAN:
        return getUcpSamaritan();
    case CLASS_SCRIPT_SAURASHTRA:
        return getUcpSaurashtra();
    case CLASS_SCRIPT_SHAVIAN:
        return getUcpShavian();
    case CLASS_SCRIPT_SINHALA:
        return getUcpSinhala();
    case CLASS_SCRIPT_SUNDANESE:
        return getUcpSundanese();
    case CLASS_SCRIPT_SYLOTI_NAGRI:
        return getUcpSyloti_Nagri();
    case CLASS_SCRIPT_SYRIAC:
        return getUcpSyriac();
    case CLASS_SCRIPT_TAGALOG:
        return getUcpTagalog();
    case CLASS_SCRIPT_TAGBANWA:
        return getUcpTagbanwa();
    case CLASS_SCRIPT_TAI_LE:
        return getUcpTai_Le();
    case CLASS_SCRIPT_TAI_THAM:
        return getUcpTai_Tham();
    case CLASS_SCRIPT_TAI_VIET:
        return getUcpTai_Viet();
    case CLASS_SCRIPT_TAMIL:
        return getUcpTamil();
    case CLASS_SCRIPT_TELUGU:
        return getUcpTelugu();
    case CLASS_SCRIPT_THAANA:
        return getUcpThaana();
    case CLASS_SCRIPT_THAI:
        return getUcpThai();
    case CLASS_SCRIPT_TIBETAN:
        return getUcpTibetan();
    case CLASS_SCRIPT_TIFINAGH:
        return getUcpTifinagh();
    case CLASS_SCRIPT_UGARITIC:
        return getUcpUgaritic();
    case CLASS_SCRIPT_VAI:
        return getUcpVai();
    case CLASS_SCRIPT_YI:
        return getUcpYi();

    case CLASS_UCP_ANY:
        return CodePointSet(CodePointSet::interval(0, MAX_UNICODE));
    default: { /* currently uses ascii defns */
        /* Copy every member of the byte-level class across as the code
         * point with the same value. */
        CharReach cr = getPredefinedCharReach(c, mode);
        CodePointSet rv;
        for (u32 i = cr.find_first(); i != CharReach::npos;
             i = cr.find_next(i)) {
            rv.set(i);
        }
        return rv;
    }
    }
}
/**
 * \brief Construct an empty class for the given parse mode.
 *
 * Every cached position starts out as POS_UNINITIALIZED. The mode must have
 * UTF-8 enabled (checked by assertion).
 */
UTF8ComponentClass::UTF8ComponentClass(const ParseMode &mode_in)
    : ComponentClass(mode_in),
      /* single-byte position */
      single_pos(GlushkovBuildState::POS_UNINITIALIZED),
      /* shared "any continuation byte" chains */
      one_dot_trailer(GlushkovBuildState::POS_UNINITIALIZED),
      two_dot_trailer(GlushkovBuildState::POS_UNINITIALIZED),
      three_dot_trailer(GlushkovBuildState::POS_UNINITIALIZED),
      /* shared lead-byte positions feeding those chains */
      two_char_dot_head(GlushkovBuildState::POS_UNINITIALIZED),
      three_char_dot_head(GlushkovBuildState::POS_UNINITIALIZED),
      four_char_dot_head(GlushkovBuildState::POS_UNINITIALIZED)
{
    assert(mode.utf8);
}
/** \brief Return a heap-allocated copy of this class, made with the copy
 * constructor. */
UTF8ComponentClass *UTF8ComponentClass::clone() const {
    UTF8ComponentClass *duplicate = new UTF8ComponentClass(*this);
    return duplicate;
}
/** \brief True if the class contains no code points. Must only be called
 * once the class has been finalized (checked by assertion). */
bool UTF8ComponentClass::class_empty(void) const {
    assert(finalized);
    const bool has_no_code_points = cps.none();
    return has_no_code_points;
}
/**
 * \brief Close the pending range [range_start, to] and add it to the class.
 *
 * \param to last code point of the range (inclusive)
 * \throws LocatedParseError if \a to is below the recorded range start
 */
void UTF8ComponentClass::createRange(unichar to) {
    assert(range_start != INVALID_UNICODE);

    const unichar first = range_start;
    if (to < first) {
        throw LocatedParseError("Range out of order in character class");
    }

    in_cand_range = false;

    CodePointSet range_cps;
    range_cps.setRange(first, to);
    if (mode.caseless) {
        /* widen the range with its case variants */
        make_caseless(&range_cps);
    }
    cps |= range_cps;

    /* the range has been consumed; there is no start for another one */
    range_start = INVALID_UNICODE;
}
/**
 * \brief Add a predefined class (optionally negated) to this class.
 *
 * \param c        predefined class; translated first when UCP mode is on
 * \param negative if true, the complement of the class is added instead
 * \throws LocatedParseError if a range is currently pending, since a
 *         predefined class cannot be a range endpoint
 */
void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
    if (in_cand_range) { // can't form a range here
        throw LocatedParseError("Invalid range in character class");
    }

    const PredefinedClass effective =
        mode.ucp ? translateForUcpMode(c, mode) : c;

    // Case handling happens inside the lookup, so make_caseless must not be
    // applied to its result.
    CodePointSet members = getPredefinedCodePointSet(effective, mode);
    if (negative) {
        members.flip();
    }
    cps |= members;

    /* a predefined class can neither start nor continue a range */
    range_start = INVALID_UNICODE;
    in_cand_range = false;
}
/**
 * \brief Add a single code point, or finish a pending range with it.
 *
 * \param c code point to add
 * \throws LocatedParseError if \a c is above MAX_UNICODE (createRange() may
 *         also throw when \a c ends an out-of-order range)
 */
void UTF8ComponentClass::add(unichar c) {
    DEBUG_PRINTF("adding \\x%08x\n", c);

    if (c > MAX_UNICODE) { // too big!
        throw LocatedParseError("Hexadecimal value is greater than \\x10FFFF");
    }

    if (!in_cand_range) {
        CodePointSet single;
        single.set(c);
        if (mode.caseless) {
            make_caseless(&single);
        }
        cps |= single;
        /* remember it: it may turn out to be the start of a range */
        range_start = c;
        return;
    }

    /* a dash preceded this character, so it is the end of a range */
    createRange(c);
}
/**
 * \brief Complete the class: resolve a dangling dash, apply negation and mark
 * the class finalized. Calling it again afterwards does nothing.
 */
void UTF8ComponentClass::finalize() {
    if (!finalized) {
        // An unclosed range such as '[a-]' or '[a-\Q\E]' means the dash was
        // a literal dash after all.
        if (in_cand_range) {
            cps.set('-');
            in_cand_range = false;
        }

        if (m_negate) {
            cps.flip();
        }

        finalized = true;
    }
}
/**
 * \brief Return the position matching the given lead byte, creating and
 * caching it in \c heads on first use.
 *
 * \param builder    builder used to create the position
 * \param first_byte lead byte the position matches
 */
Position UTF8ComponentClass::getHead(NFABuilder &builder, u8 first_byte) {
    const auto cached = heads.find(first_byte);
    if (cached == heads.end()) {
        const Position fresh = builder.makePositions(1);
        assert(heads.find(first_byte) == heads.end());
        builder.addCharReach(fresh, CharReach(first_byte));
        /* no report id as head can not be directly wired to accept */
        heads[first_byte] = fresh;
        return fresh;
    }
    return cached->second;
}
/**
 * \brief Create, on first use, the shared final position matching any
 * continuation byte (0x80-0xbf). It is given report id 0 and recorded in
 * \c tails.
 */
void UTF8ComponentClass::ensureDotTrailer(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();

    const bool already_built =
        one_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED;
    if (!already_built) {
        one_dot_trailer = builder.makePositions(1);
        builder.setNodeReportID(one_dot_trailer, 0);
        builder.addCharReach(one_dot_trailer, CharReach(0x80, 0xbf));
        tails.insert(one_dot_trailer);
    }
}
/**
 * \brief Create, on first use, the shared position matching any continuation
 * byte that is followed by the one-byte trailer.
 */
void UTF8ComponentClass::ensureTwoDotTrailer(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();

    const bool already_built =
        two_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED;
    if (!already_built) {
        /* the position we chain onto has to exist first */
        ensureDotTrailer(bs);

        two_dot_trailer = builder.makePositions(1);
        builder.addCharReach(two_dot_trailer, CharReach(0x80, 0xbf));
        bs.addSuccessor(two_dot_trailer, one_dot_trailer);
    }
}
/**
 * \brief Create, on first use, the shared position matching any continuation
 * byte that is followed by the two-byte trailer chain.
 */
void UTF8ComponentClass::ensureThreeDotTrailer(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();

    const bool already_built =
        three_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED;
    if (!already_built) {
        /* the chain we link into has to exist first */
        ensureTwoDotTrailer(bs);

        three_dot_trailer = builder.makePositions(1);
        builder.addCharReach(three_dot_trailer, CharReach(0x80, 0xbf));
        bs.addSuccessor(three_dot_trailer, two_dot_trailer);
    }
}
/**
 * \brief Add the part of the class below UTF_2CHAR_MIN (single-byte
 * encodings) to \c single_pos, creating that position on first use.
 */
void UTF8ComponentClass::buildOneByte(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();

    for (const auto &range : cps) {
        const unichar lo = lower(range);
        const unichar hi = upper(range) + 1; /* exclusive */

        if (lo >= UTF_2CHAR_MIN) {
            continue; /* nothing in this interval fits in one byte */
        }

        DEBUG_PRINTF("building vertices for [%u, %u)\n", lo, hi);

        if (single_pos == GlushkovBuildState::POS_UNINITIALIZED) {
            single_pos = builder.makePositions(1);
            builder.setNodeReportID(single_pos, 0 /* offset adj */);
            tails.insert(single_pos);
        }

        /* clip the interval at the top of the one-byte range */
        builder.addCharReach(single_pos,
                             CharReach(lo, MIN(hi, UTF_2CHAR_MIN) - 1));
    }
}
/**
 * \brief Add the final continuation bytes for code points [b, e) to the tail
 * position that follows \a prev.
 *
 * The tail is created on first use for a given \a prev: it gets report id 0,
 * is wired as a successor of \a prev, and is recorded in both \a finals and
 * \c tails.
 *
 * \param bs     build state used for wiring
 * \param finals cache mapping a predecessor to its tail position
 * \param prev   position matching the preceding byte of the encoding
 * \param b      first code point (inclusive)
 * \param e      last code point (exclusive)
 */
void UTF8ComponentClass::addToTail(GlushkovBuildState &bs,
                                   map<Position, Position> &finals,
                                   Position prev, unichar b, unichar e) {
    NFABuilder &builder = bs.getBuilder();

    Position tail;
    const auto known = finals.find(prev);
    if (known != finals.end()) {
        tail = known->second;
    } else {
        tail = builder.makePositions(1);
        builder.setNodeReportID(tail, 0 /* offset adj */);
        bs.addSuccessor(prev, tail);
        finals[prev] = tail;
        tails.insert(tail);
    }

    const u8 first_cont = makeContByte(b);
    const u8 last_cont = makeContByte(e - 1);
    builder.addCharReach(tail, CharReach(first_cont, last_cont));
}
/**
 * \brief Build positions for the part of the class encoded as two UTF-8
 * bytes, i.e. code points in [UTF_2CHAR_MIN, UTF_3CHAR_MIN).
 *
 * For each interval, a ragged front and/or back chunk (not aligned to a
 * continuation-byte boundary) gets a lead-byte head with its own tail. The
 * aligned remainder is added to a shared lead-byte position wired to the
 * shared continuation-byte trailer.
 */
void UTF8ComponentClass::buildTwoByte(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    map<Position, Position> finals;

    for (const auto &range : cps) {
        unichar lo = lower(range);
        unichar hi = upper(range) + 1; /* exclusive */

        /* restrict the interval to the two-byte window */
        lo = MAX(lo, UTF_2CHAR_MIN);
        hi = MIN(hi, UTF_3CHAR_MIN);
        if (lo >= hi) {
            continue; /* we're done here */
        }

        /* ragged front: consume up to the next tail byte boundary */
        if (lo & UTF_CONT_BYTE_VALUE_MASK) {
            const unichar boundary =
                MIN(hi, ROUNDUP_N(lo, UTF_CONT_BYTE_RANGE));
            const u8 lead = UTF_TWO_BYTE_HEADER | (lo >> UTF_CONT_SHIFT);
            assert(lead > 0xc1 && lead <= 0xdf);
            const Position head = getHead(builder, lead);
            addToTail(bs, finals, head, lo, boundary);
            lo = boundary;
        }

        if (lo == hi) {
            continue; /* we're done here */
        }
        assert(lo < hi);

        /* ragged back: consume down to the previous tail byte boundary */
        if (hi & UTF_CONT_BYTE_VALUE_MASK) {
            const unichar boundary = hi & ~UTF_CONT_BYTE_VALUE_MASK;
            assert(boundary >= lo);
            const u8 lead = UTF_TWO_BYTE_HEADER | (boundary >> UTF_CONT_SHIFT);
            assert(lead > 0xc1 && lead <= 0xdf);
            const Position head = getHead(builder, lead);
            addToTail(bs, finals, head, boundary, hi);
            hi = boundary;
        }

        if (lo == hi) {
            continue; /* we're done here */
        }
        assert(lo < hi);

        /* what remains covers whole continuation-byte ranges, so the lead
         * bytes can all go to one common vertex in front of the trailer */
        ensureDotTrailer(bs);
        if (two_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
            two_char_dot_head = builder.makePositions(1);
            bs.addSuccessor(two_char_dot_head, one_dot_trailer);
        }

        const u8 lead_min = UTF_TWO_BYTE_HEADER | (lo >> UTF_CONT_SHIFT);
        const u8 lead_max = UTF_TWO_BYTE_HEADER | ((hi - 1) >> UTF_CONT_SHIFT);
        assert(lead_min > 0xc1 && lead_min <= 0xdf);
        assert(lead_max > 0xc1 && lead_max <= 0xdf);

        builder.addCharReach(two_char_dot_head, CharReach(lead_min, lead_max));
    }
}
/**
 * \brief Return the position matching \a byte_val that follows \a prev,
 * creating it, wiring it after \a prev and caching it in \a mids on first
 * use.
 *
 * \param bs       build state used for creation and wiring
 * \param mids     cache keyed by predecessor, then by byte value
 * \param prev     position matching the preceding byte
 * \param byte_val byte the returned position matches
 */
static
Position getMid(GlushkovBuildState &bs, map<Position, map<u8, Position> > &mids,
                const Position &prev, u8 byte_val) {
    NFABuilder &builder = bs.getBuilder();

    map<u8, Position> &children = mids[prev];
    const auto known = children.find(byte_val);
    if (known == children.end()) {
        const Position created = builder.makePositions(1);
        builder.addCharReach(created, CharReach(byte_val));
        bs.addSuccessor(prev, created);
        /* no report id as mid can not be directly wired to accept */
        children[byte_val] = created;
        return created;
    }
    return known->second;
}
/**
 * \brief Build positions for the part of the class encoded as three UTF-8
 * bytes, i.e. code points in [UTF_3CHAR_MIN, UTF_4CHAR_MIN).
 *
 * Each interval [b, e) is narrowed from both ends in stages:
 *  - chunks not aligned to a last-byte boundary get head -> mid -> tail
 *    positions (heads via getHead(), mids via getMid(), tails via
 *    addToTail());
 *  - chunks not aligned to a second-byte boundary get a head and a fresh mid
 *    position wired to the shared one_dot_trailer;
 *  - the fully aligned remainder is added to three_char_dot_head, which is
 *    wired to the shared two_dot_trailer.
 */
void UTF8ComponentClass::buildThreeByte(GlushkovBuildState &bs) {
NFABuilder &builder = bs.getBuilder();
/* mid positions created via getMid(), keyed by predecessor then byte */
map<Position, map<u8, Position> > mids;
/* tail positions created via addToTail(), keyed by predecessor */
map<Position, Position> finals;
for (auto it = cps.begin(); it != cps.end(); ++it) {
unichar b = lower(*it);
unichar e = upper(*it) + 1;
/* clamp [b, e) to the three-byte window */
b = MAX(b, UTF_3CHAR_MIN);
e = MIN(e, UTF_4CHAR_MIN);
if (b >= e) {
continue; /* we're done here */
}
/* raise b to the start of the next tail byte boundary */
if (b & UTF_CONT_BYTE_VALUE_MASK) {
unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE));
u8 first_byte = UTF_THREE_BYTE_HEADER | (b >> (2 * UTF_CONT_SHIFT));
assert(first_byte >= 0xe0 && first_byte <= 0xef);
Position head = getHead(builder, first_byte);
u8 second_byte = makeContByte(b >> UTF_CONT_SHIFT);
Position mid = getMid(bs, mids, head, second_byte);
addToTail(bs, finals, mid, b, bb);
b = bb;
}
if (b == e) {
continue; /* we're done here */
}
assert(b < e);
/* lower e to the end of a tail byte boundary */
if (e & UTF_CONT_BYTE_VALUE_MASK) {
unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK;
assert(ee >= b);
u8 first_byte = UTF_THREE_BYTE_HEADER
| (ee >> (2 * UTF_CONT_SHIFT));
assert(first_byte >= 0xe0 && first_byte <= 0xef);
Position head = getHead(builder, first_byte);
u8 second_byte = makeContByte(ee >> UTF_CONT_SHIFT);
Position mid = getMid(bs, mids, head, second_byte);
addToTail(bs, finals, mid, ee, e);
e = ee;
}
if (b == e) {
continue; /* we're done here */
}
assert(b < e);
/* from here on in the last byte is always full */
ensureDotTrailer(bs);
/* raise b to the start of the next mid byte boundary */
if (b & ((1 << (2 * UTF_CONT_SHIFT)) - 1)) {
unichar bb = MIN(e, ROUNDUP_N(b, 1 << (2 * UTF_CONT_SHIFT)));
u8 first_byte = UTF_THREE_BYTE_HEADER | (b >> (2 * UTF_CONT_SHIFT));
Position head = getHead(builder, first_byte);
Position mid = builder.makePositions(1);
bs.addSuccessor(head, mid);
bs.addSuccessor(mid, one_dot_trailer);
/* no report id as mid can not be directly wired to accept,
* not adding to mids as we are completely filling its downstream */
u8 second_min = makeContByte(b >> UTF_CONT_SHIFT);
u8 second_max = makeContByte((bb - 1) >> UTF_CONT_SHIFT);
builder.addCharReach(mid, CharReach(second_min, second_max));
b = bb;
}
if (b == e) {
continue; /* we're done here */
}
assert(b < e);
/* lower e to the end of a mid byte boundary */
if (e & ((1 << (2 * UTF_CONT_SHIFT)) - 1)) {
unichar ee = e & ~((1 << (2 * UTF_CONT_SHIFT)) - 1);
assert(ee >= b);
u8 first_byte = UTF_THREE_BYTE_HEADER
| (ee >> (2 * UTF_CONT_SHIFT));
Position head = getHead(builder, first_byte);
Position mid = builder.makePositions(1);
bs.addSuccessor(head, mid);
bs.addSuccessor(mid, one_dot_trailer);
/* no report id as mid can not be directly wired to accept,
* not adding to mids as we are completely filling its downstream */
u8 second_min = makeContByte(ee >> UTF_CONT_SHIFT);
u8 second_max = makeContByte((e - 1) >> UTF_CONT_SHIFT);
builder.addCharReach(mid, CharReach(second_min, second_max));
e = ee;
}
if (b == e) {
continue; /* we're done here */
}
assert(b < e);
/* now we just have to wire head to a common dot trailer */
ensureTwoDotTrailer(bs);
if (three_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
three_char_dot_head = builder.makePositions(1);
bs.addSuccessor(three_char_dot_head, two_dot_trailer);
}
/* lead bytes spanned by what is left of [b, e) */
u8 min_first_byte = UTF_THREE_BYTE_HEADER
| (b >> (2 * UTF_CONT_SHIFT));
u8 max_first_byte = UTF_THREE_BYTE_HEADER
| ((e - 1) >> (2 * UTF_CONT_SHIFT));
assert(min_first_byte > 0xdf && min_first_byte <= 0xef);
assert(max_first_byte > 0xdf && max_first_byte <= 0xef);
builder.addCharReach(three_char_dot_head,
CharReach(min_first_byte, max_first_byte));
}
}
/**
 * \brief Compute the lead byte of the four-byte encoding of \a raw: the
 * four-byte header combined with the bits above the three continuation
 * fields.
 */
static
u8 makeFirstByteOfFour(unichar raw) {
    const unichar top_bits = raw >> (3 * UTF_CONT_SHIFT);
    u8 first_byte = UTF_FOUR_BYTE_HEADER | top_bits;
    assert(first_byte > 0xef && first_byte <= 0xf7);
    return first_byte;
}
/** \brief True if the low 2 * UTF_CONT_SHIFT bits of \a raw are all zero,
 * i.e. the two lowest continuation-byte fields are empty. */
static
bool isTwoContAligned(unichar raw) {
    const unichar low_bits = raw & ((1 << (2 * UTF_CONT_SHIFT)) - 1);
    return low_bits == 0;
}
/** \brief True if the low 3 * UTF_CONT_SHIFT bits of \a raw are all zero,
 * i.e. the three lowest continuation-byte fields are empty. */
static
bool isThreeContAligned(unichar raw) {
    const unichar low_bits = raw & ((1 << (3 * UTF_CONT_SHIFT)) - 1);
    return low_bits == 0;
}
/**
 * \brief Build positions for the part of the class encoded as four UTF-8
 * bytes, i.e. code points in [UTF_4CHAR_MIN, MAX_UNICODE].
 *
 * Each interval [b, e) is narrowed from both ends in stages, one per
 * continuation byte:
 *  - chunks not aligned on the last byte get head -> mid1 -> mid2 -> tail
 *    positions;
 *  - chunks not aligned on the third byte get head -> mid1 -> fresh mid2,
 *    wired to the shared one_dot_trailer;
 *  - chunks not aligned on the second byte get head -> fresh mid1, wired to
 *    the shared two_dot_trailer;
 *  - the fully aligned remainder is added to four_char_dot_head, which is
 *    wired to the shared three_dot_trailer.
 */
void UTF8ComponentClass::buildFourByte(GlushkovBuildState &bs) {
NFABuilder &builder = bs.getBuilder();
/* mid positions created via getMid(), keyed by predecessor then byte */
map<Position, map<u8, Position> > mids;
/* tail positions created via addToTail(), keyed by predecessor */
map<Position, Position> finals;
for (auto it = cps.begin(); it != cps.end(); ++it) {
unichar b = lower(*it);
unichar e = upper(*it) + 1;
/* clamp [b, e) to the four-byte window */
b = MAX(b, UTF_4CHAR_MIN);
e = MIN(e, MAX_UNICODE + 1);
if (b >= e) {
continue;
}
/* raise b to the start of the next tail byte boundary */
if (b & UTF_CONT_BYTE_VALUE_MASK) {
unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE));
u8 first_byte = makeFirstByteOfFour(b);
Position head = getHead(builder, first_byte);
u8 second_byte = makeContByte(b >> (2 * UTF_CONT_SHIFT));
Position mid1 = getMid(bs, mids, head, second_byte);
u8 third_byte = makeContByte(b >> UTF_CONT_SHIFT);
Position mid2 = getMid(bs, mids, mid1, third_byte);
addToTail(bs, finals, mid2, b, bb);
b = bb;
}
if (b == e) {
continue; /* we're done here */
}
assert(b < e);
/* lower e to the end of a tail byte boundary */
if (e & UTF_CONT_BYTE_VALUE_MASK) {
unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK;
assert(ee >= b);
u8 first_byte = makeFirstByteOfFour(ee);
Position head = getHead(builder, first_byte);
u8 second_byte = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
Position mid1 = getMid(bs, mids, head, second_byte);
u8 third_byte = makeContByte(ee >> UTF_CONT_SHIFT);
Position mid2 = getMid(bs, mids, mid1, third_byte);
addToTail(bs, finals, mid2, ee, e);
e = ee;
}
if (b == e) {
continue; /* we're done here */
}
assert(b < e);
/* from here on in the last byte is always full */
ensureDotTrailer(bs);
/* raise b to the start of the next mid byte boundary */
if (!isTwoContAligned(b)) {
unichar bb = MIN(e, ROUNDUP_N(b, 1 << (2 * UTF_CONT_SHIFT)));
u8 first_byte = makeFirstByteOfFour(b);
Position head = getHead(builder, first_byte);
u8 second_byte = makeContByte(b >> (2 * UTF_CONT_SHIFT));
Position mid1 = getMid(bs, mids, head, second_byte);
Position mid2 = builder.makePositions(1);
bs.addSuccessor(mid1, mid2);
bs.addSuccessor(mid2, one_dot_trailer);
/* no report id as mid can not be directly wired to accept,
* not adding to mids as we are completely filling its downstream */
u8 byte_min = makeContByte(b >> UTF_CONT_SHIFT);
u8 byte_max = makeContByte((bb - 1) >> UTF_CONT_SHIFT);
builder.addCharReach(mid2, CharReach(byte_min, byte_max));
b = bb;
}
if (b == e) {
continue; /* we're done here */
}
assert(b < e);
/* lower e to the end of a mid byte boundary */
if (!isTwoContAligned(e)) {
unichar ee = e & ~((1 << (2 * UTF_CONT_SHIFT)) - 1);
assert(ee >= b);
u8 first_byte = makeFirstByteOfFour(ee);
Position head = getHead(builder, first_byte);
u8 second_byte = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
Position mid1 = getMid(bs, mids, head, second_byte);
Position mid2 = builder.makePositions(1);
bs.addSuccessor(mid1, mid2);
bs.addSuccessor(mid2, one_dot_trailer);
/* no report id as mid can not be directly wired to accept,
* not adding to mids as we are completely filling its downstream */
u8 byte_min = makeContByte(ee >> UTF_CONT_SHIFT);
u8 byte_max = makeContByte((e - 1) >> UTF_CONT_SHIFT);
builder.addCharReach(mid2, CharReach(byte_min, byte_max));
e = ee;
}
if (b == e) {
continue; /* we're done here */
}
assert(b < e);
/* from here on the last two bytes are always full */
ensureTwoDotTrailer(bs);
/* raise b to the next byte boundary */
if (!isThreeContAligned(b)) {
unichar bb = MIN(e, ROUNDUP_N(b, 1 << (3 * UTF_CONT_SHIFT)));
u8 first_byte = makeFirstByteOfFour(b);
Position head = getHead(builder, first_byte);
Position mid1 = builder.makePositions(1);
bs.addSuccessor(head, mid1);
bs.addSuccessor(mid1, two_dot_trailer);
/* no report id as mid can not be directly wired to accept,
* not adding to mids as we are completely filling its downstream */
u8 byte_min = makeContByte(b >> (2 * UTF_CONT_SHIFT));
u8 byte_max = makeContByte((bb - 1) >> (2 * UTF_CONT_SHIFT));
builder.addCharReach(mid1, CharReach(byte_min, byte_max));
b = bb;
}
if (b == e) {
continue; /* we're done here */
}
assert(b < e);
/* lower e to the next byte boundary */
if (!isThreeContAligned(e)) {
unichar ee = e & ~((1 << (3 * UTF_CONT_SHIFT)) - 1);
assert(ee >= b);
u8 first_byte = makeFirstByteOfFour(ee);
Position head = getHead(builder, first_byte);
Position mid1 = builder.makePositions(1);
bs.addSuccessor(head, mid1);
bs.addSuccessor(mid1, two_dot_trailer);
/* no report id as mid can not be directly wired to accept,
* not adding to mids as we are completely filling its downstream */
u8 byte_min = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
u8 byte_max = makeContByte((e - 1) >> (2 * UTF_CONT_SHIFT));
builder.addCharReach(mid1, CharReach(byte_min, byte_max));
e = ee;
}
if (b == e) {
continue; /* we're done here */
}
assert(b < e);
/* now we just have to wire head to a common dot trailer */
ensureThreeDotTrailer(bs);
if (four_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
four_char_dot_head = builder.makePositions(1);
bs.addSuccessor(four_char_dot_head, three_dot_trailer);
}
/* lead bytes spanned by what is left of [b, e) */
u8 min_first_byte = makeFirstByteOfFour(b);
u8 max_first_byte = makeFirstByteOfFour(e - 1);
builder.addCharReach(four_char_dot_head,
CharReach(min_first_byte, max_first_byte));
}
}
/**
 * \brief Create and wire every position needed to match this class.
 *
 * The class must already be finalized. A non-empty class is built one
 * encoding length at a time; an empty class gets a single position with
 * empty reach.
 */
void UTF8ComponentClass::notePositions(GlushkovBuildState &bs) {
    // We should always be finalized by now.
    assert(finalized);

    if (!class_empty()) {
        buildOneByte(bs);
        buildTwoByte(bs);
        buildThreeByte(bs);
        buildFourByte(bs);
        return;
    }

    // Special case: a class with no members (for example /[^\s\S]/8) can
    // never match. As in the non UTF-8 implementation, add a vertex with
    // empty reach so the graph stays connected; it is picked up later on.
    DEBUG_PRINTF("empty class!\n");
    assert(single_pos == GlushkovBuildState::POS_UNINITIALIZED);

    NFABuilder &builder = bs.getBuilder();
    single_pos = builder.makePositions(1);
    builder.setNodeReportID(single_pos, 0 /* offset adj */);
    builder.addCharReach(single_pos, CharReach());
    tails.insert(single_pos);
}
/**
 * \brief Intentionally a no-op: both arguments are ignored.
 */
void UTF8ComponentClass::buildFollowSet(GlushkovBuildState &,
const vector<PositionInfo> &) {
/* states are wired in notePositions as all belong to this component. */
}
/**
 * \brief Collect the entry positions of this class: the single-byte
 * position, the shared two/three/four-byte lead positions (each only if it
 * has been created), then every per-lead-byte head.
 */
vector<PositionInfo> UTF8ComponentClass::first(void) const {
    vector<PositionInfo> rv;

    /* shared entry points, in the order they are reported */
    const Position shared_entries[] = {
        single_pos,
        two_char_dot_head,
        three_char_dot_head,
        four_char_dot_head,
    };
    for (const Position &pos : shared_entries) {
        if (pos != GlushkovBuildState::POS_UNINITIALIZED) {
            rv.emplace_back(pos);
        }
    }

    /* dedicated heads, one per distinct lead byte */
    for (const auto &head : heads) {
        rv.emplace_back(head.second);
    }

    return rv;
}
/** \brief Return every position recorded in \c tails, in the container's
 * iteration order. */
vector<PositionInfo> UTF8ComponentClass::last(void) const {
    return vector<PositionInfo>(tails.begin(), tails.end());
}
} // namespace ue2