vectorscan/src/parser/Utf8ComponentClass.cpp

/*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Character class in UTF-8 mode.
  */


#include "Utf8ComponentClass.h"

#include "buildstate.h"
#include "Parser.h"
#include "parse_error.h"
#include "position.h"
#include "position_info.h"
#include "nfagraph/ng_builder.h"
#include "util/compare.h"
#include "util/unicode_def.h"

#include <cstring>

#include "ucp_table.h"

using namespace std;

namespace ue2 {

/**
 * \brief Map a POSIX-style predefined class onto the class that stands in for
 * it when UCP handling is requested.
 *
 * Note: the mapping used here for the posix character classes matches the
 * observed behaviour of PCRE (lower and upper going to \p{L} is not
 * documented by pcre).
 *
 * Note: this mapping is quite different from both of the mappings
 * recommended in the unicode regex tech report (TR-18) appendix C.
 *
 * \param in    class as written in the pattern.
 * \param mode  parse flags; only \c caseless is consulted, and only for
 *              CLASS_LOWER and CLASS_UPPER.
 * \return the replacement class, or \p in itself when it has no entry below.
 */
PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
    switch (in) {
    case CLASS_ALNUM: return CLASS_UCP_XAN;
    case CLASS_ALPHA: return CLASS_UCP_L;
    case CLASS_BLANK: return CLASS_HORZ;
    case CLASS_DIGIT: return CLASS_UCP_ND;
    case CLASS_GRAPH: return CLASS_XGRAPH;
    case CLASS_LOWER:
        /* caseless: we also pick up uppercase, titlecase and others */
        return mode.caseless ? CLASS_UCP_L : CLASS_UCP_LL;
    case CLASS_PRINT: return CLASS_XPRINT;
    case CLASS_PUNCT: return CLASS_XPUNCT;
    case CLASS_SPACE: return CLASS_UCP_XPS;
    case CLASS_UPPER:
        /* caseless: we also pick up lowercase, titlecase and others */
        return mode.caseless ? CLASS_UCP_L : CLASS_UCP_LU;
    case CLASS_WORD:  return CLASS_UCP_XWD;
    default:
        break;
    }

    /* everything else is left untouched */
    return in;
}

/**
 * \brief Build the set of code points matched by predefined class \p c.
 *
 * \param c     class to expand. Classes without a dedicated case below fall
 *              back to the byte-oriented getPredefinedCharReach() definition.
 * \param mode  parse flags; \c dotall is consulted for CLASS_ANY, and the
 *              whole mode is forwarded to getPredefinedCharReach() and to the
 *              recursive calls made for composite classes.
 * \return the expanded set.
 */
CodePointSet getPredefinedCodePointSet(PredefinedClass c,
                                       const ParseMode &mode) {
    /* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */
    switch (c) {
    case CLASS_ANY: {
        if (mode.dotall) {
            return CodePointSet(CodePointSet::interval(0, MAX_UNICODE));
        }
        /* without dotall, everything except newline */
        CodePointSet all_but_newline;
        all_but_newline.set('\n');
        all_but_newline.flip();
        return all_but_newline;
    }
    case CLASS_XGRAPH: {
        /* start from the complement of (Z | C) ... */
        CodePointSet rv = getUcpZ();
        rv |= getUcpC();
        rv.flip();
        /* ... then add back most of Cf, except for the ones removed here */
        CodePointSet format_chars = getUcpCf();
        format_chars.unset(0x061c);
        format_chars.unset(0x180e);
        format_chars.unsetRange(0x2066, 0x2069);
        rv |= format_chars;
        return rv;
    }
    case CLASS_XPRINT: {
        // Same as graph, plus everything with the Zs property.
        CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode);
        rv |= getUcpZs();
        rv.set(0x180e); // Also included in this class by PCRE 8.38.
        return rv;
    }
    case CLASS_XPUNCT: {
        // Everything with the P (punctuation) property, plus code points in S
        // (symbols) that are < 128.
        CodePointSet low_symbols = getUcpS();
        low_symbols.unsetRange(128, MAX_UNICODE);
        CodePointSet rv = getUcpP();
        rv |= low_symbols;
        return rv;
    }
    case CLASS_HORZ: {
        static const unichar horz_space[] = {
            0x0009, /* Horizontal tab */
            0x0020, /* Space */
            0x00A0, /* Non-break space */
            0x1680, /* Ogham space mark */
            0x180E, /* Mongolian vowel separator */
            0x2000, /* En quad */
            0x2001, /* Em quad */
            0x2002, /* En space */
            0x2003, /* Em space */
            0x2004, /* Three-per-em space */
            0x2005, /* Four-per-em space */
            0x2006, /* Six-per-em space */
            0x2007, /* Figure space */
            0x2008, /* Punctuation space */
            0x2009, /* Thin space */
            0x200A, /* Hair space */
            0x202F, /* Narrow no-break space */
            0x205F, /* Medium mathematical space */
            0x3000, /* Ideographic space */
        };
        CodePointSet rv;
        for (unichar cp : horz_space) {
            rv.set(cp);
        }
        return rv;
    }
    case CLASS_VERT: {
        static const unichar vert_space[] = {
            0x000A, /* Linefeed */
            0x000B, /* Vertical tab */
            0x000C, /* Formfeed */
            0x000D, /* Carriage return */
            0x0085, /* Next line */
            0x2028, /* Line separator */
            0x2029, /* Paragraph separator */
        };
        CodePointSet rv;
        for (unichar cp : vert_space) {
            rv.set(cp);
        }
        return rv;
    }
    case CLASS_UCP_XPS:
    case CLASS_UCP_XSP: {
        /* exactly the horizontal whitespace set plus the vertical one */
        CodePointSet rv = getPredefinedCodePointSet(CLASS_HORZ, mode);
        rv |= getPredefinedCodePointSet(CLASS_VERT, mode);
        return rv;
    }

    /* Unicode general categories: straight lookups in the generated tables */
    case CLASS_UCP_C:     return getUcpC();
    case CLASS_UCP_CC:    return getUcpCc();
    case CLASS_UCP_CF:    return getUcpCf();
    case CLASS_UCP_CN:    return getUcpCn();
    case CLASS_UCP_CO:    return getUcpCo();
    case CLASS_UCP_CS:    return getUcpCs();
    case CLASS_UCP_L:     return getUcpL();
    case CLASS_UCP_L_AND: return getUcpL_and();
    case CLASS_UCP_LL:    return getUcpLl();
    case CLASS_UCP_LM:    return getUcpLm();
    case CLASS_UCP_LO:    return getUcpLo();
    case CLASS_UCP_LT:    return getUcpLt();
    case CLASS_UCP_LU:    return getUcpLu();
    case CLASS_UCP_M:     return getUcpM();
    case CLASS_UCP_MC:    return getUcpMc();
    case CLASS_UCP_ME:    return getUcpMe();
    case CLASS_UCP_MN:    return getUcpMn();
    case CLASS_UCP_N:     return getUcpN();
    case CLASS_UCP_ND:    return getUcpNd();
    case CLASS_UCP_NL:    return getUcpNl();
    case CLASS_UCP_NO:    return getUcpNo();
    case CLASS_UCP_P:     return getUcpP();
    case CLASS_UCP_PC:    return getUcpPc();
    case CLASS_UCP_PD:    return getUcpPd();
    case CLASS_UCP_PE:    return getUcpPe();
    case CLASS_UCP_PF:    return getUcpPf();
    case CLASS_UCP_PI:    return getUcpPi();
    case CLASS_UCP_PO:    return getUcpPo();
    case CLASS_UCP_PS:    return getUcpPs();
    case CLASS_UCP_S:     return getUcpS();
    case CLASS_UCP_SC:    return getUcpSc();
    case CLASS_UCP_SK:    return getUcpSk();
    case CLASS_UCP_SM:    return getUcpSm();
    case CLASS_UCP_SO:    return getUcpSo();
    case CLASS_UCP_XAN:   return getUcpXan();
    case CLASS_UCP_XWD:   return getUcpXwd();
    case CLASS_UCP_Z:     return getUcpZ();
    case CLASS_UCP_ZL:    return getUcpZl();
    case CLASS_UCP_ZP:    return getUcpZp();
    case CLASS_UCP_ZS:    return getUcpZs();

    /* Unicode scripts: straight lookups in the generated tables */
    case CLASS_SCRIPT_ARABIC:                 return getUcpArabic();
    case CLASS_SCRIPT_ARMENIAN:               return getUcpArmenian();
    case CLASS_SCRIPT_AVESTAN:                return getUcpAvestan();
    case CLASS_SCRIPT_BALINESE:               return getUcpBalinese();
    case CLASS_SCRIPT_BAMUM:                  return getUcpBamum();
    case CLASS_SCRIPT_BATAK:                  return getUcpBatak();
    case CLASS_SCRIPT_BENGALI:                return getUcpBengali();
    case CLASS_SCRIPT_BOPOMOFO:               return getUcpBopomofo();
    case CLASS_SCRIPT_BRAHMI:                 return getUcpBrahmi();
    case CLASS_SCRIPT_BRAILLE:                return getUcpBraille();
    case CLASS_SCRIPT_BUGINESE:               return getUcpBuginese();
    case CLASS_SCRIPT_BUHID:                  return getUcpBuhid();
    case CLASS_SCRIPT_CANADIAN_ABORIGINAL:    return getUcpCanadian_Aboriginal();
    case CLASS_SCRIPT_CARIAN:                 return getUcpCarian();
    case CLASS_SCRIPT_CHAM:                   return getUcpCham();
    case CLASS_SCRIPT_CHEROKEE:               return getUcpCherokee();
    case CLASS_SCRIPT_COMMON:                 return getUcpCommon();
    case CLASS_SCRIPT_COPTIC:                 return getUcpCoptic();
    case CLASS_SCRIPT_CUNEIFORM:              return getUcpCuneiform();
    case CLASS_SCRIPT_CYPRIOT:                return getUcpCypriot();
    case CLASS_SCRIPT_CYRILLIC:               return getUcpCyrillic();
    case CLASS_SCRIPT_DESERET:                return getUcpDeseret();
    case CLASS_SCRIPT_DEVANAGARI:             return getUcpDevanagari();
    case CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS:   return getUcpEgyptian_Hieroglyphs();
    case CLASS_SCRIPT_ETHIOPIC:               return getUcpEthiopic();
    case CLASS_SCRIPT_GEORGIAN:               return getUcpGeorgian();
    case CLASS_SCRIPT_GLAGOLITIC:             return getUcpGlagolitic();
    case CLASS_SCRIPT_GOTHIC:                 return getUcpGothic();
    case CLASS_SCRIPT_GREEK:                  return getUcpGreek();
    case CLASS_SCRIPT_GUJARATI:               return getUcpGujarati();
    case CLASS_SCRIPT_GURMUKHI:               return getUcpGurmukhi();
    case CLASS_SCRIPT_HAN:                    return getUcpHan();
    case CLASS_SCRIPT_HANGUL:                 return getUcpHangul();
    case CLASS_SCRIPT_HANUNOO:                return getUcpHanunoo();
    case CLASS_SCRIPT_HEBREW:                 return getUcpHebrew();
    case CLASS_SCRIPT_HIRAGANA:               return getUcpHiragana();
    case CLASS_SCRIPT_IMPERIAL_ARAMAIC:       return getUcpImperial_Aramaic();
    case CLASS_SCRIPT_INHERITED:              return getUcpInherited();
    case CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI:  return getUcpInscriptional_Pahlavi();
    case CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN: return getUcpInscriptional_Parthian();
    case CLASS_SCRIPT_JAVANESE:               return getUcpJavanese();
    case CLASS_SCRIPT_KAITHI:                 return getUcpKaithi();
    case CLASS_SCRIPT_KANNADA:                return getUcpKannada();
    case CLASS_SCRIPT_KATAKANA:               return getUcpKatakana();
    case CLASS_SCRIPT_KAYAH_LI:               return getUcpKayah_Li();
    case CLASS_SCRIPT_KHAROSHTHI:             return getUcpKharoshthi();
    case CLASS_SCRIPT_KHMER:                  return getUcpKhmer();
    case CLASS_SCRIPT_LAO:                    return getUcpLao();
    case CLASS_SCRIPT_LATIN:                  return getUcpLatin();
    case CLASS_SCRIPT_LEPCHA:                 return getUcpLepcha();
    case CLASS_SCRIPT_LIMBU:                  return getUcpLimbu();
    case CLASS_SCRIPT_LINEAR_B:               return getUcpLinear_B();
    case CLASS_SCRIPT_LISU:                   return getUcpLisu();
    case CLASS_SCRIPT_LYCIAN:                 return getUcpLycian();
    case CLASS_SCRIPT_LYDIAN:                 return getUcpLydian();
    case CLASS_SCRIPT_MALAYALAM:              return getUcpMalayalam();
    case CLASS_SCRIPT_MANDAIC:                return getUcpMandaic();
    case CLASS_SCRIPT_MEETEI_MAYEK:           return getUcpMeetei_Mayek();
    case CLASS_SCRIPT_MONGOLIAN:              return getUcpMongolian();
    case CLASS_SCRIPT_MYANMAR:                return getUcpMyanmar();
    case CLASS_SCRIPT_NEW_TAI_LUE:            return getUcpNew_Tai_Lue();
    case CLASS_SCRIPT_NKO:                    return getUcpNko();
    case CLASS_SCRIPT_OGHAM:                  return getUcpOgham();
    case CLASS_SCRIPT_OL_CHIKI:               return getUcpOl_Chiki();
    case CLASS_SCRIPT_OLD_ITALIC:             return getUcpOld_Italic();
    case CLASS_SCRIPT_OLD_PERSIAN:            return getUcpOld_Persian();
    case CLASS_SCRIPT_OLD_SOUTH_ARABIAN:      return getUcpOld_South_Arabian();
    case CLASS_SCRIPT_OLD_TURKIC:             return getUcpOld_Turkic();
    case CLASS_SCRIPT_ORIYA:                  return getUcpOriya();
    case CLASS_SCRIPT_OSMANYA:                return getUcpOsmanya();
    case CLASS_SCRIPT_PHAGS_PA:               return getUcpPhags_Pa();
    case CLASS_SCRIPT_PHOENICIAN:             return getUcpPhoenician();
    case CLASS_SCRIPT_REJANG:                 return getUcpRejang();
    case CLASS_SCRIPT_RUNIC:                  return getUcpRunic();
    case CLASS_SCRIPT_SAMARITAN:              return getUcpSamaritan();
    case CLASS_SCRIPT_SAURASHTRA:             return getUcpSaurashtra();
    case CLASS_SCRIPT_SHAVIAN:                return getUcpShavian();
    case CLASS_SCRIPT_SINHALA:                return getUcpSinhala();
    case CLASS_SCRIPT_SUNDANESE:              return getUcpSundanese();
    case CLASS_SCRIPT_SYLOTI_NAGRI:           return getUcpSyloti_Nagri();
    case CLASS_SCRIPT_SYRIAC:                 return getUcpSyriac();
    case CLASS_SCRIPT_TAGALOG:                return getUcpTagalog();
    case CLASS_SCRIPT_TAGBANWA:               return getUcpTagbanwa();
    case CLASS_SCRIPT_TAI_LE:                 return getUcpTai_Le();
    case CLASS_SCRIPT_TAI_THAM:               return getUcpTai_Tham();
    case CLASS_SCRIPT_TAI_VIET:               return getUcpTai_Viet();
    case CLASS_SCRIPT_TAMIL:                  return getUcpTamil();
    case CLASS_SCRIPT_TELUGU:                 return getUcpTelugu();
    case CLASS_SCRIPT_THAANA:                 return getUcpThaana();
    case CLASS_SCRIPT_THAI:                   return getUcpThai();
    case CLASS_SCRIPT_TIBETAN:                return getUcpTibetan();
    case CLASS_SCRIPT_TIFINAGH:               return getUcpTifinagh();
    case CLASS_SCRIPT_UGARITIC:               return getUcpUgaritic();
    case CLASS_SCRIPT_VAI:                    return getUcpVai();
    case CLASS_SCRIPT_YI:                     return getUcpYi();

    case CLASS_UCP_ANY:
        return CodePointSet(CodePointSet::interval(0, MAX_UNICODE));

    default: { /* currently uses ascii defns */
        /* copy every member of the byte-level class across one by one */
        CharReach byte_class = getPredefinedCharReach(c, mode);
        CodePointSet rv;
        u32 member = byte_class.find_first();
        while (member != CharReach::npos) {
            rv.set(member);
            member = byte_class.find_next(member);
        }
        return rv;
    }
    }
}

/**
 * \brief Construct an empty class for the given parse mode.
 *
 * Every cached position starts out as POS_UNINITIALIZED; the build and
 * ensure*Trailer helpers create the positions on first use. The mode is
 * expected to have \c utf8 set.
 */
UTF8ComponentClass::UTF8ComponentClass(const ParseMode &mode_in)
    : ComponentClass(mode_in),
      single_pos(GlushkovBuildState::POS_UNINITIALIZED),
      one_dot_trailer(GlushkovBuildState::POS_UNINITIALIZED),
      two_dot_trailer(GlushkovBuildState::POS_UNINITIALIZED),
      three_dot_trailer(GlushkovBuildState::POS_UNINITIALIZED),
      two_char_dot_head(GlushkovBuildState::POS_UNINITIALIZED),
      three_char_dot_head(GlushkovBuildState::POS_UNINITIALIZED),
      four_char_dot_head(GlushkovBuildState::POS_UNINITIALIZED)
{
    assert(mode.utf8);
}

/** \brief Return a copy of this object, copy-constructed and allocated with
 * \c new. */
UTF8ComponentClass *UTF8ComponentClass::clone() const {
    UTF8ComponentClass *duplicate = new UTF8ComponentClass(*this);
    return duplicate;
}

/** \brief True when the code point set contains nothing. Must only be called
 * once the class has been finalized. */
bool UTF8ComponentClass::class_empty() const {
    assert(finalized);
    const bool nothing_set = cps.none();
    return nothing_set;
}

/**
 * \brief Close the pending range: add [range_start, to] to the class.
 *
 * \param to  last code point of the range.
 * \throws LocatedParseError if \p to is below the recorded range start.
 *
 * On success the pending-range state (in_cand_range, range_start) is reset.
 */
void UTF8ComponentClass::createRange(unichar to) {
    assert(range_start != INVALID_UNICODE);
    const unichar first = range_start;
    if (to < first) {
        throw LocatedParseError("Range out of order in character class");
    }

    in_cand_range = false;

    CodePointSet span;
    span.setRange(first, to);
    if (mode.caseless) {
        make_caseless(&span);
    }
    cps |= span;

    /* a finished range cannot serve as the start of another one */
    range_start = INVALID_UNICODE;
}

/**
 * \brief Add a predefined class (optionally complemented) to this class.
 *
 * \param c         class to add; translated via translateForUcpMode() first
 *                  when \c mode.ucp is set.
 * \param negative  add the complement of the class instead.
 * \throws LocatedParseError if a range is pending, since a predefined class
 *         cannot be a range endpoint.
 */
void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
    if (in_cand_range) { // can't form a range here
        throw LocatedParseError("Invalid range in character class");
    }

    const PredefinedClass effective =
        mode.ucp ? translateForUcpMode(c, mode) : c;

    // caselessness is handled inside this call - don't apply make_caseless
    // to the result
    CodePointSet members = getPredefinedCodePointSet(effective, mode);
    if (negative) {
        members.flip();
    }
    cps |= members;

    /* a predefined class can neither start nor continue a range */
    range_start = INVALID_UNICODE;
    in_cand_range = false;
}

/**
 * \brief Add a single code point, or finish the pending range with it.
 *
 * \param c  code point to add.
 * \throws LocatedParseError if \p c exceeds MAX_UNICODE (or, via
 *         createRange(), if it closes a range that is out of order).
 */
void UTF8ComponentClass::add(unichar c) {
    DEBUG_PRINTF("adding \\x%08x\n", c);
    if (c > MAX_UNICODE) { // too big!
        throw LocatedParseError("Hexadecimal value is greater than \\x10FFFF");
    }

    if (!in_cand_range) {
        /* plain literal: add it and remember it as a possible range start */
        CodePointSet single;
        single.set(c);
        if (mode.caseless) {
            make_caseless(&single);
        }
        cps |= single;
        range_start = c;
        return;
    }

    /* a range is pending, so c is its upper bound */
    createRange(c);
}

/**
 * \brief Complete the class: resolve a dangling dash and apply negation.
 *
 * Does nothing if the class has already been finalized.
 */
void UTF8ComponentClass::finalize() {
    if (!finalized) {
        // Handle unclosed ranges, like '[a-]' and '[a-\Q\E]' -- in these
        // cases the dash is a literal dash.
        if (in_cand_range) {
            cps.set('-');
            in_cand_range = false;
        }

        if (m_negate) {
            cps.flip();
        }

        finalized = true;
    }
}

/**
 * \brief Fetch, creating it on first request, the head position that matches
 * exactly \p first_byte.
 *
 * Positions are cached in \c heads, so each byte value gets one position.
 */
Position UTF8ComponentClass::getHead(NFABuilder &builder, u8 first_byte) {
    const auto known = heads.find(first_byte);
    if (known != heads.end()) {
        return known->second;
    }

    const Position fresh = builder.makePositions(1);
    assert(heads.find(first_byte) == heads.end());
    builder.addCharReach(fresh, CharReach(first_byte));
    /* no report id as head can not be directly wired to accept */

    heads[first_byte] = fresh;
    return fresh;
}

/**
 * \brief Make sure \c one_dot_trailer exists: a position matching any byte in
 * 0x80-0xbf, with report id 0, registered in \c tails.
 */
void UTF8ComponentClass::ensureDotTrailer(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    if (one_dot_trailer == GlushkovBuildState::POS_UNINITIALIZED) {
        one_dot_trailer = builder.makePositions(1);
        builder.setNodeReportID(one_dot_trailer, 0);
        builder.addCharReach(one_dot_trailer, CharReach(0x80, 0xbf));
        tails.insert(one_dot_trailer);
    }
}

/**
 * \brief Make sure \c two_dot_trailer exists: a position matching any byte in
 * 0x80-0xbf whose successor is \c one_dot_trailer (created first if needed).
 */
void UTF8ComponentClass::ensureTwoDotTrailer(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    if (two_dot_trailer == GlushkovBuildState::POS_UNINITIALIZED) {
        /* the final byte of the chain has to exist before we can link to it */
        ensureDotTrailer(bs);

        two_dot_trailer = builder.makePositions(1);
        builder.addCharReach(two_dot_trailer, CharReach(0x80, 0xbf));
        bs.addSuccessor(two_dot_trailer, one_dot_trailer);
    }
}

/**
 * \brief Make sure \c three_dot_trailer exists: a position matching any byte
 * in 0x80-0xbf whose successor is \c two_dot_trailer (created first if
 * needed).
 */
void UTF8ComponentClass::ensureThreeDotTrailer(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    if (three_dot_trailer == GlushkovBuildState::POS_UNINITIALIZED) {
        /* the rest of the chain has to exist before we can link to it */
        ensureTwoDotTrailer(bs);

        three_dot_trailer = builder.makePositions(1);
        builder.addCharReach(three_dot_trailer, CharReach(0x80, 0xbf));
        bs.addSuccessor(three_dot_trailer, two_dot_trailer);
    }
}

/**
 * \brief Add the part of the class below UTF_2CHAR_MIN to \c single_pos.
 *
 * \c single_pos is created (report id 0, registered in \c tails) the first
 * time an interval reaches into that region; every such interval then
 * contributes its clamped range to the position's reach.
 */
void UTF8ComponentClass::buildOneByte(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    for (const auto &ival : cps) {
        const unichar first = lower(ival);
        const unichar past_last = upper(ival) + 1;
        if (first < UTF_2CHAR_MIN) {
            DEBUG_PRINTF("building vertices for [%u, %u)\n", first, past_last);

            if (single_pos == GlushkovBuildState::POS_UNINITIALIZED) {
                single_pos = builder.makePositions(1);
                builder.setNodeReportID(single_pos, 0 /* offset adj */);
                tails.insert(single_pos);
            }

            /* clip the interval at the top of the one byte region */
            CharReach reach(first, MIN(past_last, UTF_2CHAR_MIN) - 1);
            builder.addCharReach(single_pos, reach);
        }
    }
}

/**
 * \brief Extend the tail position that follows \p prev with the continuation
 * bytes for code points [\p b, \p e).
 *
 * \param bs      build state used to create positions and edges.
 * \param finals  cache of the tail already created for each predecessor.
 * \param prev    position preceding the tail.
 * \param b       first code point covered.
 * \param e       one past the last code point covered.
 *
 * A tail is created on first use for a given \p prev: it gets report id 0,
 * becomes a successor of \p prev and is registered in \c tails.
 */
void UTF8ComponentClass::addToTail(GlushkovBuildState &bs,
                                   map<Position, Position> &finals,
                                   Position prev, unichar b, unichar e) {
    NFABuilder &builder = bs.getBuilder();
    Position tail;
    const auto known = finals.find(prev);
    if (known != finals.end()) {
        tail = known->second;
    } else {
        tail = builder.makePositions(1);
        builder.setNodeReportID(tail, 0 /* offset adj */);
        bs.addSuccessor(prev, tail);
        finals[prev] = tail;
        tails.insert(tail);
    }

    /* e is exclusive, so the last byte comes from e - 1 */
    const u8 low_byte = makeContByte(b);
    const u8 high_byte = makeContByte(e - 1);
    builder.addCharReach(tail, CharReach(low_byte, high_byte));
}

/**
 * \brief Add the positions for the part of this class inside
 * [UTF_2CHAR_MIN, UTF_3CHAR_MIN), i.e. the two-byte encodings.
 *
 * Each interval of cps is clamped to that window and split into up to three
 * pieces:
 *  - a leading piece that starts part-way through a continuation-byte range:
 *    dedicated head for its first byte, followed by a tail (addToTail);
 *  - a trailing piece that ends part-way through a continuation-byte range:
 *    handled the same way;
 *  - the remainder, which covers whole continuation-byte ranges: its first
 *    bytes are added to the shared two_char_dot_head, which is followed by
 *    one_dot_trailer.
 */
void UTF8ComponentClass::buildTwoByte(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    /* tail created so far for each head, so intervals sharing a first byte
     * also share the tail position */
    map<Position, Position> finals;

    for (auto it = cps.begin(); it != cps.end(); ++it) {
        /* work on the half-open range [b, e) */
        unichar b = lower(*it);
        unichar e = upper(*it) + 1;

        /* clamp to the two byte region */
        b = MAX(b, UTF_2CHAR_MIN);
        e = MIN(e, UTF_3CHAR_MIN);

        if (b >= e) {
            continue; /* we're done here */
        }

        /* raise b to the start of the next tail byte boundary */
        if (b & UTF_CONT_BYTE_VALUE_MASK) {
            /* bb: end of this partial piece, capped at e */
            unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE));
            u8 first_byte = UTF_TWO_BYTE_HEADER | (b >> UTF_CONT_SHIFT);
            assert(first_byte > 0xc1 && first_byte <= 0xdf);

            Position head = getHead(builder, first_byte);
            addToTail(bs, finals, head, b, bb);

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the end of a tail byte boundary */
        if (e & UTF_CONT_BYTE_VALUE_MASK) {
            /* ee: start of the partial piece that ends at e */
            unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK;
            assert(ee >= b);

            u8 first_byte = UTF_TWO_BYTE_HEADER | (ee >> UTF_CONT_SHIFT);
            assert(first_byte > 0xc1 && first_byte <= 0xdf);

            Position head = getHead(builder, first_byte);
            addToTail(bs, finals, head, ee, e);

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* middle section just goes to a common full vertex */
        ensureDotTrailer(bs);

        /* the shared head is created once and wired to the full trailer */
        if (two_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
            two_char_dot_head = builder.makePositions(1);
            bs.addSuccessor(two_char_dot_head, one_dot_trailer);
        }

        /* [b, e) is now aligned at both ends, so it maps onto a contiguous
         * run of first bytes */
        u8 min_first_byte =  UTF_TWO_BYTE_HEADER | (b >> UTF_CONT_SHIFT);
        u8 max_first_byte =  UTF_TWO_BYTE_HEADER | ((e - 1) >> UTF_CONT_SHIFT);

        assert(min_first_byte > 0xc1 && min_first_byte <= 0xdf);
        assert(max_first_byte > 0xc1 && max_first_byte <= 0xdf);

        builder.addCharReach(two_char_dot_head,
                             CharReach(min_first_byte, max_first_byte));
    }
}

/**
 * \brief Fetch, creating it on first request, the intermediate position that
 * follows \p prev and matches exactly \p byte_val.
 *
 * \param bs        build state used to create the position and its edge.
 * \param mids      cache of intermediate positions, keyed by predecessor and
 *                  then by byte value.
 * \param prev      position preceding the intermediate one.
 * \param byte_val  byte the intermediate position matches.
 */
static
Position getMid(GlushkovBuildState &bs, map<Position, map<u8, Position> > &mids,
                const Position &prev, u8 byte_val) {
    NFABuilder &builder = bs.getBuilder();
    map<u8, Position> &children = mids[prev];

    const auto known = children.find(byte_val);
    if (known != children.end()) {
        return known->second;
    }

    const Position fresh = builder.makePositions(1);
    builder.addCharReach(fresh, CharReach(byte_val));
    bs.addSuccessor(prev, fresh);
    /* no report id as mid can not be directly wired to accept */

    children[byte_val] = fresh;
    return fresh;
}

/**
 * \brief Add the positions for the part of this class inside
 * [UTF_3CHAR_MIN, UTF_4CHAR_MIN), i.e. the three-byte encodings.
 *
 * Each interval of cps is clamped to that window and then peeled from both
 * ends, finest granularity first:
 *  - a leading/trailing piece that only partly covers a last-byte range gets
 *    a dedicated head -> mid -> tail chain (getHead / getMid / addToTail);
 *  - a leading/trailing piece that covers whole last-byte ranges but only
 *    part of a second-byte range gets head -> new mid -> one_dot_trailer;
 *  - whatever remains covers whole first bytes and is added to the shared
 *    three_char_dot_head, which is followed by two_dot_trailer.
 */
void UTF8ComponentClass::buildThreeByte(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();

    /* positions created so far, so that intervals sharing a byte prefix also
     * share the corresponding mid / tail positions */
    map<Position, map<u8, Position> > mids;
    map<Position, Position> finals;

    for (auto it = cps.begin(); it != cps.end(); ++it) {
        /* work on the half-open range [b, e) */
        unichar b = lower(*it);
        unichar e = upper(*it) + 1;

        /* clamp to the three byte region */
        b = MAX(b, UTF_3CHAR_MIN);
        e = MIN(e, UTF_4CHAR_MIN);

        if (b >= e) {
            continue; /* we're done here */
        }

        /* raise b to the start of the next tail byte boundary */
        if (b & UTF_CONT_BYTE_VALUE_MASK) {
            /* bb: end of this partial piece, capped at e */
            unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE));

            u8 first_byte = UTF_THREE_BYTE_HEADER | (b >> (2 * UTF_CONT_SHIFT));
            assert(first_byte >= 0xe0 && first_byte <= 0xef);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(b >> UTF_CONT_SHIFT);
            Position mid = getMid(bs, mids, head, second_byte);

            addToTail(bs, finals, mid, b, bb);

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the end of a tail byte boundary */
        if (e & UTF_CONT_BYTE_VALUE_MASK) {
            /* ee: start of the partial piece that ends at e */
            unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK;
            assert(ee >= b);

            u8 first_byte = UTF_THREE_BYTE_HEADER
                          | (ee >> (2 * UTF_CONT_SHIFT));
            assert(first_byte >= 0xe0 && first_byte <= 0xef);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(ee >> UTF_CONT_SHIFT);
            Position mid = getMid(bs, mids, head, second_byte);

            addToTail(bs, finals, mid, ee, e);

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* from here on in the last byte is always full */
        ensureDotTrailer(bs);

        /* raise b to the start of the next mid byte boundary */
        if (b & ((1 << (2 * UTF_CONT_SHIFT)) - 1)) {
            /* bb: end of this partial second-byte piece, capped at e */
            unichar bb = MIN(e, ROUNDUP_N(b, 1 << (2 * UTF_CONT_SHIFT)));

            u8 first_byte = UTF_THREE_BYTE_HEADER | (b >> (2 * UTF_CONT_SHIFT));
            Position head = getHead(builder, first_byte);

            Position mid = builder.makePositions(1);
            bs.addSuccessor(head, mid);
            bs.addSuccessor(mid, one_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            u8 second_min = makeContByte(b >> UTF_CONT_SHIFT);
            u8 second_max = makeContByte((bb - 1) >> UTF_CONT_SHIFT);

            builder.addCharReach(mid, CharReach(second_min, second_max));

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the end of a mid byte boundary */
        if (e & ((1 << (2 * UTF_CONT_SHIFT)) - 1)) {
            /* ee: start of the partial second-byte piece that ends at e */
            unichar ee = e & ~((1 << (2 * UTF_CONT_SHIFT)) - 1);
            assert(ee >= b);

            u8 first_byte = UTF_THREE_BYTE_HEADER
                          | (ee >> (2 * UTF_CONT_SHIFT));
            Position head = getHead(builder, first_byte);

            Position mid = builder.makePositions(1);
            bs.addSuccessor(head, mid);
            bs.addSuccessor(mid, one_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            u8 second_min = makeContByte(ee >> UTF_CONT_SHIFT);
            u8 second_max = makeContByte((e - 1) >> UTF_CONT_SHIFT);

            builder.addCharReach(mid, CharReach(second_min, second_max));

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* now we just have to wire head to a common dot trailer */
        ensureTwoDotTrailer(bs);
        /* the shared head is created once and wired to the two byte trailer */
        if (three_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
            three_char_dot_head = builder.makePositions(1);
            bs.addSuccessor(three_char_dot_head, two_dot_trailer);
        }

        /* [b, e) is now aligned at both ends, so it maps onto a contiguous
         * run of first bytes */
        u8 min_first_byte =  UTF_THREE_BYTE_HEADER
                          | (b >> (2 * UTF_CONT_SHIFT));
        u8 max_first_byte =  UTF_THREE_BYTE_HEADER
                          | ((e - 1) >> (2 * UTF_CONT_SHIFT));

        assert(min_first_byte > 0xdf && min_first_byte <= 0xef);
        assert(max_first_byte > 0xdf && max_first_byte <= 0xef);

        builder.addCharReach(three_char_dot_head,
                             CharReach(min_first_byte, max_first_byte));
    }
}

/**
 * \brief First byte of the four-byte encoding of \p raw: the four-byte header
 * combined with the bits of \p raw above the three continuation-byte
 * payloads.
 */
static
u8 makeFirstByteOfFour(unichar raw) {
    const unichar top_bits = raw >> (3 * UTF_CONT_SHIFT);
    const u8 lead = UTF_FOUR_BYTE_HEADER | top_bits;
    assert(lead > 0xef && lead <= 0xf7);
    return lead;
}

/** \brief True when the low (2 * UTF_CONT_SHIFT) bits of \p raw are all
 * zero. */
static
bool isTwoContAligned(unichar raw) {
    const unichar low_bits = (1 << (2 * UTF_CONT_SHIFT)) - 1;
    return (raw & low_bits) == 0;
}

/** \brief True when the low (3 * UTF_CONT_SHIFT) bits of \p raw are all
 * zero. */
static
bool isThreeContAligned(unichar raw) {
    const unichar low_bits = (1 << (3 * UTF_CONT_SHIFT)) - 1;
    return (raw & low_bits) == 0;
}

/**
 * \brief Create and wire the positions for the four-byte UTF-8 sequences in
 * this class.
 *
 * Each interval of cps is clamped to [UTF_4CHAR_MIN, MAX_UNICODE] and treated
 * as the half-open range [b, e). The range is then trimmed from both ends in
 * three rounds, each round handling the unaligned fringe at one granularity:
 *
 *  -# last byte partial: the first three bytes are pinned via
 *     getHead()/getMid() and the fringe is handed to addToTail();
 *  -# third byte partial: the first two bytes are pinned, a fresh position
 *     carries the third-byte range and feeds one_dot_trailer;
 *  -# second byte partial: the first byte is pinned, a fresh position carries
 *     the second-byte range and feeds two_dot_trailer.
 *
 * Whatever remains is aligned on a lead byte boundary at both ends, so its
 * lead byte range is added to the shared four_char_dot_head, which feeds
 * three_dot_trailer.
 *
 * NOTE(review): the *_dot_trailer positions are prepared by the
 * ensure*DotTrailer() helpers defined elsewhere; presumably they accept any
 * continuation byte(s) -- confirm against those helpers.
 */
void UTF8ComponentClass::buildFourByte(GlushkovBuildState &bs) {
    NFABuilder &builder = bs.getBuilder();
    /* scratch maps handed to getMid()/addToTail(); presumably they let
     * intervals share intermediate/final positions -- confirm against those
     * helpers */
    map<Position, map<u8, Position> > mids;
    map<Position, Position> finals;

    for (auto it = cps.begin(); it != cps.end(); ++it) {
        /* work on the half-open range [b, e) */
        unichar b = lower(*it);
        unichar e = upper(*it) + 1;

        /* restrict the interval to the code points handled by this function */
        b = MAX(b, UTF_4CHAR_MIN);
        e = MIN(e, MAX_UNICODE + 1);

        if (b >= e) {
            continue;
        }

        /* raise b to the start of the next tail byte boundary */
        if (b & UTF_CONT_BYTE_VALUE_MASK) {
            /* [b, bb) shares its first three bytes; never run past e */
            unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE));

            u8 first_byte = makeFirstByteOfFour(b);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(b >> (2 * UTF_CONT_SHIFT));
            Position mid1 = getMid(bs, mids, head, second_byte);

            u8 third_byte = makeContByte(b >> UTF_CONT_SHIFT);
            Position mid2 = getMid(bs, mids, mid1, third_byte);

            addToTail(bs, finals, mid2, b, bb);

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the end of a tail byte boundary */
        if (e & UTF_CONT_BYTE_VALUE_MASK) {
            /* [ee, e) shares its first three bytes */
            unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK;
            assert(ee >= b);

            u8 first_byte = makeFirstByteOfFour(ee);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
            Position mid1 = getMid(bs, mids, head, second_byte);

            u8 third_byte = makeContByte(ee >> UTF_CONT_SHIFT);
            Position mid2 = getMid(bs, mids, mid1, third_byte);

            addToTail(bs, finals, mid2, ee, e);

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* from here on in the last byte is always full */
        ensureDotTrailer(bs);

        /* raise b to the start of the next mid byte boundary */
        if (!isTwoContAligned(b)) {
            /* [b, bb) shares its first two bytes; never run past e */
            unichar bb = MIN(e, ROUNDUP_N(b, 1 << (2 * UTF_CONT_SHIFT)));

            u8 first_byte = makeFirstByteOfFour(b);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(b >> (2 * UTF_CONT_SHIFT));
            Position mid1 = getMid(bs, mids, head, second_byte);

            Position mid2 = builder.makePositions(1);
            bs.addSuccessor(mid1, mid2);
            bs.addSuccessor(mid2, one_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            /* range of third bytes covered by [b, bb) */
            u8 byte_min = makeContByte(b >> UTF_CONT_SHIFT);
            u8 byte_max = makeContByte((bb - 1) >> UTF_CONT_SHIFT);

            builder.addCharReach(mid2, CharReach(byte_min, byte_max));

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the end of a mid byte boundary */
        if (!isTwoContAligned(e)) {
            /* [ee, e) shares its first two bytes */
            unichar ee = e & ~((1 << (2 * UTF_CONT_SHIFT)) - 1);
            assert(ee >= b);

            u8 first_byte = makeFirstByteOfFour(ee);
            Position head = getHead(builder, first_byte);

            u8 second_byte = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
            Position mid1 = getMid(bs, mids, head, second_byte);

            Position mid2 = builder.makePositions(1);
            bs.addSuccessor(mid1, mid2);
            bs.addSuccessor(mid2, one_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            /* range of third bytes covered by [ee, e) */
            u8 byte_min = makeContByte(ee >> UTF_CONT_SHIFT);
            u8 byte_max = makeContByte((e - 1) >> UTF_CONT_SHIFT);

            builder.addCharReach(mid2, CharReach(byte_min, byte_max));

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* from here on in the last two bytes are always full */
        ensureTwoDotTrailer(bs);

        /* raise b to the start of the next lead (first) byte boundary */
        if (!isThreeContAligned(b)) {
            /* [b, bb) shares its first byte; never run past e */
            unichar bb = MIN(e, ROUNDUP_N(b, 1 << (3 * UTF_CONT_SHIFT)));

            u8 first_byte = makeFirstByteOfFour(b);
            Position head = getHead(builder, first_byte);

            Position mid1 = builder.makePositions(1);
            bs.addSuccessor(head, mid1);
            bs.addSuccessor(mid1, two_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            /* range of second bytes covered by [b, bb) */
            u8 byte_min = makeContByte(b >> (2 * UTF_CONT_SHIFT));
            u8 byte_max = makeContByte((bb - 1) >> (2 * UTF_CONT_SHIFT));

            builder.addCharReach(mid1, CharReach(byte_min, byte_max));

            b = bb;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* lower e to the previous lead (first) byte boundary */
        if (!isThreeContAligned(e)) {
            /* [ee, e) shares its first byte */
            unichar ee = e & ~((1 << (3 * UTF_CONT_SHIFT)) - 1);
            assert(ee >= b);

            u8 first_byte = makeFirstByteOfFour(ee);
            Position head = getHead(builder, first_byte);
            Position mid1 = builder.makePositions(1);
            bs.addSuccessor(head, mid1);
            bs.addSuccessor(mid1, two_dot_trailer);
            /* no report id as mid can not be directly wired to accept,
             * not adding to mids as we are completely filling its downstream */
            /* range of second bytes covered by [ee, e) */
            u8 byte_min = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
            u8 byte_max = makeContByte((e - 1) >> (2 * UTF_CONT_SHIFT));

            builder.addCharReach(mid1, CharReach(byte_min, byte_max));

            e = ee;
        }

        if (b == e) {
            continue; /* we're done here */
        }
        assert(b < e);

        /* now we just have to wire head to a common dot trailer */
        ensureThreeDotTrailer(bs);
        /* the shared head is created once, on first use */
        if (four_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
            four_char_dot_head = builder.makePositions(1);
            bs.addSuccessor(four_char_dot_head, three_dot_trailer);
        }

        /* [b, e) is now aligned at both ends: only the lead byte range matters */
        u8 min_first_byte = makeFirstByteOfFour(b);
        u8 max_first_byte = makeFirstByteOfFour(e - 1);

        builder.addCharReach(four_char_dot_head,
                             CharReach(min_first_byte, max_first_byte));
    }
}

/** \brief Create the positions for this class and wire them together.
 *
 * A non-empty class is expanded by the one- to four-byte builders. An empty
 * class gets a single placeholder position with empty reach.
 */
void UTF8ComponentClass::notePositions(GlushkovBuildState &bs) {
    // Finalization must already have happened.
    assert(finalized);

    if (!class_empty()) {
        // Usual case: one builder per UTF-8 encoding length.
        buildOneByte(bs);
        buildTwoByte(bs);
        buildThreeByte(bs);
        buildFourByte(bs);
        return;
    }

    // Special case: an empty class, as produced by something like
    // /[^\s\S]/8, can never match. Mirror the non UTF-8 handling: emit one
    // vertex with empty reach so that the graph stays connected; it is picked
    // up later on.
    DEBUG_PRINTF("empty class!\n");
    assert(single_pos == GlushkovBuildState::POS_UNINITIALIZED);

    NFABuilder &nfa = bs.getBuilder();
    single_pos = nfa.makePositions(1);
    nfa.setNodeReportID(single_pos, 0 /* offset adj */);
    nfa.addCharReach(single_pos, CharReach());
    tails.insert(single_pos);
}

/** \brief Deliberate no-op: both arguments are ignored.
 *
 * All successor edges between this component's positions are added while the
 * positions are created in notePositions(), so nothing is left to wire here.
 */
void UTF8ComponentClass::buildFollowSet(GlushkovBuildState &,
                                        const vector<PositionInfo> &) {
    /* states are wired in notePositions as all belong to this component. */
}

/** \brief Collect the entry positions of this class.
 *
 * The result holds, in order: single_pos and the two/three/four byte dot
 * heads (each only if it has been initialised), followed by every position
 * stored in heads.
 */
vector<PositionInfo> UTF8ComponentClass::first(void) const {
    vector<PositionInfo> entry_points;

    /* fixed heads: present only if they were created */
    for (const auto &pos : {single_pos, two_char_dot_head,
                            three_char_dot_head, four_char_dot_head}) {
        if (pos == GlushkovBuildState::POS_UNINITIALIZED) {
            continue;
        }
        entry_points.emplace_back(pos);
    }

    /* per lead byte heads */
    for (const auto &head : heads) {
        entry_points.emplace_back(head.second);
    }

    return entry_points;
}

/** \brief Collect the exit positions of this class: one entry per element of
 * tails, in the iteration order of tails. */
vector<PositionInfo> UTF8ComponentClass::last(void) const {
    return vector<PositionInfo>(tails.begin(), tails.end());
}

} // namespace ue2