mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
character classes: handle \Q\E and utf8
This commit is contained in:
parent
a185be5a4f
commit
d317d75615
@ -1184,6 +1184,11 @@ unichar readUtf8CodePoint4c(const char *s) {
|
|||||||
currentSeq->addComponent(move(cc));
|
currentSeq->addComponent(move(cc));
|
||||||
};
|
};
|
||||||
|
|
||||||
|
hi_byte when is_utf8 => {
|
||||||
|
assert(mode.utf8);
|
||||||
|
throwInvalidUtf8();
|
||||||
|
};
|
||||||
|
|
||||||
# Literal character
|
# Literal character
|
||||||
any => {
|
any => {
|
||||||
addLiteral(currentSeq, *ts, mode);
|
addLiteral(currentSeq, *ts, mode);
|
||||||
@ -1198,6 +1203,31 @@ unichar readUtf8CodePoint4c(const char *s) {
|
|||||||
'\\E' => {
|
'\\E' => {
|
||||||
fret;
|
fret;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#unicode chars
|
||||||
|
utf8_2c when is_utf8 => {
|
||||||
|
assert(mode.utf8);
|
||||||
|
currentCls->add(readUtf8CodePoint2c(ts));
|
||||||
|
inCharClassEarly = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
utf8_3c when is_utf8 => {
|
||||||
|
assert(mode.utf8);
|
||||||
|
currentCls->add(readUtf8CodePoint3c(ts));
|
||||||
|
inCharClassEarly = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
utf8_4c when is_utf8 => {
|
||||||
|
assert(mode.utf8);
|
||||||
|
currentCls->add(readUtf8CodePoint4c(ts));
|
||||||
|
inCharClassEarly = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
hi_byte when is_utf8 => {
|
||||||
|
assert(mode.utf8);
|
||||||
|
throwInvalidUtf8();
|
||||||
|
};
|
||||||
|
|
||||||
# Literal character
|
# Literal character
|
||||||
any => {
|
any => {
|
||||||
currentCls->add(*ts);
|
currentCls->add(*ts);
|
||||||
|
@ -142,3 +142,5 @@
|
|||||||
145:/abc/8{edit_distance=1} #UTF-8 is disallowed for approximate matching.
|
145:/abc/8{edit_distance=1} #UTF-8 is disallowed for approximate matching.
|
||||||
146:/(*UTF8)abc/{edit_distance=1} #UTF-8 is disallowed for approximate matching.
|
146:/(*UTF8)abc/{edit_distance=1} #UTF-8 is disallowed for approximate matching.
|
||||||
147:/\b\BMYBt/s{edit_distance=1} #Pattern can never match.
|
147:/\b\BMYBt/s{edit_distance=1} #Pattern can never match.
|
||||||
|
148:/\QÀ\Eaaaa/8 #Expression is not valid UTF-8.
|
||||||
|
149:/[\QÀ\Eaaaa]/8 #Expression is not valid UTF-8.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user