mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
character classes: handle \Q\E and utf8
This commit is contained in:
parent
a185be5a4f
commit
d317d75615
@ -1184,6 +1184,11 @@ unichar readUtf8CodePoint4c(const char *s) {
|
||||
currentSeq->addComponent(move(cc));
|
||||
};
|
||||
|
||||
hi_byte when is_utf8 => {
|
||||
// This rule is guarded by the is_utf8 condition; the assert checks that the parse mode agrees.
assert(mode.utf8);
|
||||
// A byte matching hi_byte that reaches this rule is reported as invalid UTF-8.
// NOTE(review): presumably valid multi-byte sequences are consumed by earlier rules not shown here - confirm.
throwInvalidUtf8();
|
||||
};
|
||||
|
||||
# Literal character
|
||||
any => {
|
||||
addLiteral(currentSeq, *ts, mode);
|
||||
@ -1198,6 +1203,31 @@ unichar readUtf8CodePoint4c(const char *s) {
|
||||
'\\E' => {
|
||||
// Backslash-E matched: fret returns from this scanner to the state that called into it.
fret;
|
||||
||||
};
|
||||
|
||||
# Unicode characters (multi-byte UTF-8 sequences)
|
||||
utf8_2c when is_utf8 => {
|
||||
// This rule is guarded by the is_utf8 condition; the assert checks that the parse mode agrees.
assert(mode.utf8);
|
||||
// Add the value read by readUtf8CodePoint2c from the token start (ts) to the current class.
// NOTE(review): presumably this decodes a two-byte UTF-8 sequence into one code point - confirm.
currentCls->add(readUtf8CodePoint2c(ts));
|
||||
// Clear the inCharClassEarly flag now that a member has been added.
inCharClassEarly = false;
|
||||
};
|
||||
|
||||
utf8_3c when is_utf8 => {
|
||||
// This rule is guarded by the is_utf8 condition; the assert checks that the parse mode agrees.
assert(mode.utf8);
|
||||
// Add the value read by readUtf8CodePoint3c from the token start (ts) to the current class.
// NOTE(review): presumably this decodes a three-byte UTF-8 sequence into one code point - confirm.
currentCls->add(readUtf8CodePoint3c(ts));
|
||||
// Clear the inCharClassEarly flag now that a member has been added.
inCharClassEarly = false;
|
||||
};
|
||||
|
||||
utf8_4c when is_utf8 => {
|
||||
// This rule is guarded by the is_utf8 condition; the assert checks that the parse mode agrees.
assert(mode.utf8);
|
||||
// Add the unichar returned by readUtf8CodePoint4c for the token start (ts) to the current class.
// NOTE(review): presumably this decodes a four-byte UTF-8 sequence into one code point - confirm.
currentCls->add(readUtf8CodePoint4c(ts));
|
||||
// Clear the inCharClassEarly flag now that a member has been added.
inCharClassEarly = false;
|
||||
};
|
||||
|
||||
hi_byte when is_utf8 => {
|
||||
// This rule is guarded by the is_utf8 condition; the assert checks that the parse mode agrees.
assert(mode.utf8);
|
||||
// Listed after the utf8_2c/utf8_3c/utf8_4c rules: a byte matching hi_byte that reaches
// this rule is reported as invalid UTF-8 instead of being added to the class.
throwInvalidUtf8();
|
||||
};
|
||||
|
||||
# Literal character
|
||||
any => {
|
||||
currentCls->add(*ts);
|
||||
|
@ -142,3 +142,5 @@
|
||||
145:/abc/8{edit_distance=1} #UTF-8 is disallowed for approximate matching.
|
||||
146:/(*UTF8)abc/{edit_distance=1} #UTF-8 is disallowed for approximate matching.
|
||||
147:/\b\BMYBt/s{edit_distance=1} #Pattern can never match.
|
||||
148:/\QÀ\Eaaaa/8 #Expression is not valid UTF-8.
|
||||
149:/[\QÀ\Eaaaa]/8 #Expression is not valid UTF-8.
|
||||
|
Loading…
x
Reference in New Issue
Block a user