From d317d75615cdc6d0533e290de21a9bd205ffd12e Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Tue, 20 Jun 2017 10:19:32 +1000 Subject: [PATCH] character classes: handle \Q\E and utf8 --- src/parser/Parser.rl | 30 ++++++++++++++++++++++++++++++ unit/hyperscan/bad_patterns.txt | 2 ++ 2 files changed, 32 insertions(+) diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 05a084bb..ce9ca865 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -1184,6 +1184,11 @@ unichar readUtf8CodePoint4c(const char *s) { currentSeq->addComponent(move(cc)); }; + hi_byte when is_utf8 => { + assert(mode.utf8); + throwInvalidUtf8(); + }; + # Literal character any => { addLiteral(currentSeq, *ts, mode); @@ -1198,6 +1203,31 @@ unichar readUtf8CodePoint4c(const char *s) { '\\E' => { fret; }; + + #unicode chars + utf8_2c when is_utf8 => { + assert(mode.utf8); + currentCls->add(readUtf8CodePoint2c(ts)); + inCharClassEarly = false; + }; + + utf8_3c when is_utf8 => { + assert(mode.utf8); + currentCls->add(readUtf8CodePoint3c(ts)); + inCharClassEarly = false; + }; + + utf8_4c when is_utf8 => { + assert(mode.utf8); + currentCls->add(readUtf8CodePoint4c(ts)); + inCharClassEarly = false; + }; + + hi_byte when is_utf8 => { + assert(mode.utf8); + throwInvalidUtf8(); + }; + # Literal character any => { currentCls->add(*ts); diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 3d6d9db9..3042dc82 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -142,3 +142,5 @@ 145:/abc/8{edit_distance=1} #UTF-8 is disallowed for approximate matching. 146:/(*UTF8)abc/{edit_distance=1} #UTF-8 is disallowed for approximate matching. 147:/\b\BMYBt/s{edit_distance=1} #Pattern can never match. +148:/\QÀ\Eaaaa/8 #Expression is not valid UTF-8. +149:/[\QÀ\Eaaaa]/8 #Expression is not valid UTF-8.