From a185be5a4f684c9bdbd90a2b9716ca02dde9e7b2 Mon Sep 17 00:00:00 2001
From: Alex Coyte
Date: Mon, 19 Jun 2017 11:03:05 +1000
Subject: [PATCH] Treat characters between \Q \E as codepoints in UTF8 mode.

fixes github issue #57
---
 src/parser/Parser.rl | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl
index 52b3340c..05a084bb 100644
--- a/src/parser/Parser.rl
+++ b/src/parser/Parser.rl
@@ -1155,6 +1155,35 @@ unichar readUtf8CodePoint4c(const char *s) {
               '\\E' => {
                   fgoto main;
               };
+
+              #unicode chars
+              utf8_2c when is_utf8 => {
+                  assert(mode.utf8);
+                  /* decode the code point at ts via readUtf8CodePoint2c; a ComponentClass holding it generates the vertices */
+                  auto cc = getComponentClass(mode);
+                  cc->add(readUtf8CodePoint2c(ts));
+                  cc->finalize();
+                  currentSeq->addComponent(move(cc));
+              };
+
+              utf8_3c when is_utf8 => {
+                  assert(mode.utf8);
+                  /* decode the code point at ts via readUtf8CodePoint3c; a ComponentClass holding it generates the vertices */
+                  auto cc = getComponentClass(mode);
+                  cc->add(readUtf8CodePoint3c(ts));
+                  cc->finalize();
+                  currentSeq->addComponent(move(cc));
+              };
+
+              utf8_4c when is_utf8 => {
+                  assert(mode.utf8);
+                  /* decode the code point at ts via readUtf8CodePoint4c; a ComponentClass holding it generates the vertices */
+                  auto cc = getComponentClass(mode);
+                  cc->add(readUtf8CodePoint4c(ts));
+                  cc->finalize();
+                  currentSeq->addComponent(move(cc));
+              };
+
               # Literal character
               any => {
                   addLiteral(currentSeq, *ts, mode);