regexp: added v flag support - fixed corner cases of case-insensitive matching

This commit is contained in:
Fabrice Bellard
2025-05-16 17:43:03 +02:00
parent a8b2d7c2b2
commit d7cdfdc8d7
13 changed files with 2004 additions and 205 deletions

View File

@@ -44179,6 +44179,9 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
case 'u':
mask = LRE_FLAG_UNICODE;
break;
case 'v':
mask = LRE_FLAG_UNICODE_SETS;
break;
case 'y':
mask = LRE_FLAG_STICKY;
break;
@@ -44188,14 +44191,20 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
if ((re_flags & mask) != 0) {
bad_flags:
JS_FreeCString(ctx, str);
return JS_ThrowSyntaxError(ctx, "invalid regular expression flags");
goto bad_flags1;
}
re_flags |= mask;
}
JS_FreeCString(ctx, str);
}
str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UNICODE));
/* 'u' and 'v' cannot be both set */
if ((re_flags & LRE_FLAG_UNICODE_SETS) && (re_flags & LRE_FLAG_UNICODE)) {
bad_flags1:
return JS_ThrowSyntaxError(ctx, "invalid regular expression flags");
}
str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)));
if (!str)
return JS_EXCEPTION;
re_bytecode_buf = lre_compile(&re_bytecode_len, error_msg,
@@ -44499,49 +44508,34 @@ static JSValue js_regexp_get_flag(JSContext *ctx, JSValueConst this_val, int mas
return JS_NewBool(ctx, flags & mask);
}
#define RE_FLAG_COUNT 8
static JSValue js_regexp_get_flags(JSContext *ctx, JSValueConst this_val)
{
char str[8], *p = str;
int res;
char str[RE_FLAG_COUNT], *p = str;
int res, i;
static const int flag_atom[RE_FLAG_COUNT] = {
JS_ATOM_hasIndices,
JS_ATOM_global,
JS_ATOM_ignoreCase,
JS_ATOM_multiline,
JS_ATOM_dotAll,
JS_ATOM_unicode,
JS_ATOM_unicodeSets,
JS_ATOM_sticky,
};
static const char flag_char[RE_FLAG_COUNT] = { 'd', 'g', 'i', 'm', 's', 'u', 'v', 'y' };
if (JS_VALUE_GET_TAG(this_val) != JS_TAG_OBJECT)
return JS_ThrowTypeErrorNotAnObject(ctx);
res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, "hasIndices"));
if (res < 0)
goto exception;
if (res)
*p++ = 'd';
res = JS_ToBoolFree(ctx, JS_GetProperty(ctx, this_val, JS_ATOM_global));
if (res < 0)
goto exception;
if (res)
*p++ = 'g';
res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, "ignoreCase"));
if (res < 0)
goto exception;
if (res)
*p++ = 'i';
res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, "multiline"));
if (res < 0)
goto exception;
if (res)
*p++ = 'm';
res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, "dotAll"));
if (res < 0)
goto exception;
if (res)
*p++ = 's';
res = JS_ToBoolFree(ctx, JS_GetProperty(ctx, this_val, JS_ATOM_unicode));
if (res < 0)
goto exception;
if (res)
*p++ = 'u';
res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, "sticky"));
if (res < 0)
goto exception;
if (res)
*p++ = 'y';
for(i = 0; i < RE_FLAG_COUNT; i++) {
res = JS_ToBoolFree(ctx, JS_GetProperty(ctx, this_val, flag_atom[i]));
if (res < 0)
goto exception;
if (res)
*p++ = flag_char[i];
}
return JS_NewStringLen(ctx, str, p - str);
exception:
@@ -45026,14 +45020,12 @@ static JSValue js_regexp_Symbol_match(JSContext *ctx, JSValueConst this_val,
goto exception;
p = JS_VALUE_GET_STRING(flags);
// TODO(bnoordhuis) query 'u' flag the same way?
global = (-1 != string_indexof_char(p, 'g', 0));
if (!global) {
A = JS_RegExpExec(ctx, rx, S);
} else {
fullUnicode = JS_ToBoolFree(ctx, JS_GetProperty(ctx, rx, JS_ATOM_unicode));
if (fullUnicode < 0)
goto exception;
fullUnicode = (string_indexof_char(p, 'u', 0) >= 0 ||
string_indexof_char(p, 'v', 0) >= 0);
if (JS_SetProperty(ctx, rx, JS_ATOM_lastIndex, JS_NewInt32(ctx, 0)) < 0)
goto exception;
@@ -45217,7 +45209,8 @@ static JSValue js_regexp_Symbol_matchAll(JSContext *ctx, JSValueConst this_val,
it->iterated_string = S;
strp = JS_VALUE_GET_STRING(flags);
it->global = string_indexof_char(strp, 'g', 0) >= 0;
it->unicode = string_indexof_char(strp, 'u', 0) >= 0;
it->unicode = (string_indexof_char(strp, 'u', 0) >= 0 ||
string_indexof_char(strp, 'v', 0) >= 0);
it->done = FALSE;
JS_SetOpaque(iter, it);
@@ -45364,13 +45357,11 @@ static JSValue js_regexp_Symbol_replace(JSContext *ctx, JSValueConst this_val,
goto exception;
p = JS_VALUE_GET_STRING(flags);
// TODO(bnoordhuis) query 'u' flag the same way?
fullUnicode = 0;
is_global = (-1 != string_indexof_char(p, 'g', 0));
if (is_global) {
fullUnicode = JS_ToBoolFree(ctx, JS_GetProperty(ctx, rx, JS_ATOM_unicode));
if (fullUnicode < 0)
goto exception;
fullUnicode = (string_indexof_char(p, 'u', 0) >= 0 ||
string_indexof_char(p, 'v', 0) >= 0);
if (JS_SetProperty(ctx, rx, JS_ATOM_lastIndex, JS_NewInt32(ctx, 0)) < 0)
goto exception;
}
@@ -45596,7 +45587,8 @@ static JSValue js_regexp_Symbol_split(JSContext *ctx, JSValueConst this_val,
if (JS_IsException(flags))
goto exception;
strp = JS_VALUE_GET_STRING(flags);
unicodeMatching = string_indexof_char(strp, 'u', 0) >= 0;
unicodeMatching = (string_indexof_char(strp, 'u', 0) >= 0 ||
string_indexof_char(strp, 'v', 0) >= 0);
if (string_indexof_char(strp, 'y', 0) < 0) {
flags = JS_ConcatString3(ctx, "", flags, "y");
if (JS_IsException(flags))
@@ -45707,6 +45699,7 @@ static const JSCFunctionListEntry js_regexp_proto_funcs[] = {
JS_CGETSET_MAGIC_DEF("multiline", js_regexp_get_flag, NULL, LRE_FLAG_MULTILINE ),
JS_CGETSET_MAGIC_DEF("dotAll", js_regexp_get_flag, NULL, LRE_FLAG_DOTALL ),
JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE ),
JS_CGETSET_MAGIC_DEF("unicodeSets", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE_SETS ),
JS_CGETSET_MAGIC_DEF("sticky", js_regexp_get_flag, NULL, LRE_FLAG_STICKY ),
JS_CGETSET_MAGIC_DEF("hasIndices", js_regexp_get_flag, NULL, LRE_FLAG_INDICES ),
JS_CFUNC_DEF("exec", 1, js_regexp_exec ),