regexp: removed alloca() in lre_exec() - added specific opcodes for \s and \S to have a smaller bytecode - optimized \b and \B

This commit is contained in:
Fabrice Bellard
2025-12-22 15:12:46 +01:00
parent 7bd1ae2c76
commit f1139494d1
5 changed files with 94 additions and 69 deletions

View File

@@ -31,6 +31,8 @@ DEF(char32, 5)
DEF(char32_i, 5) DEF(char32_i, 5)
DEF(dot, 1) DEF(dot, 1)
DEF(any, 1) /* same as dot but match any character including line terminator */ DEF(any, 1) /* same as dot but match any character including line terminator */
DEF(space, 1)
DEF(not_space, 1) /* must come after */
DEF(line_start, 1) DEF(line_start, 1)
DEF(line_start_m, 1) DEF(line_start_m, 1)
DEF(line_end, 1) DEF(line_end, 1)

View File

@@ -34,7 +34,9 @@
/* /*
TODO: TODO:
- remove REOP_char_i and REOP_range_i by precomputing the case folding.
- add specific opcodes for simple unicode property tests so that the
generated bytecode is smaller.
- Add a lock step execution mode (=linear time execution guaranteed) - Add a lock step execution mode (=linear time execution guaranteed)
when the regular expression is "simple" i.e. no backreference nor when the regular expression is "simple" i.e. no backreference nor
complicated lookahead. The opcodes are designed for this execution complicated lookahead. The opcodes are designed for this execution
@@ -1078,7 +1080,7 @@ static int get_class_atom(REParseState *s, REStringList *cr,
goto default_escape; goto default_escape;
if (cr_init_char_range(s, cr, c)) if (cr_init_char_range(s, cr, c))
return -1; return -1;
c = CLASS_RANGE_BASE; c += CLASS_RANGE_BASE;
break; break;
case 'c': case 'c':
c = *p; c = *p;
@@ -1584,6 +1586,8 @@ static BOOL re_need_check_adv_and_capture_init(BOOL *pneed_capture_init,
case REOP_char32_i: case REOP_char32_i:
case REOP_dot: case REOP_dot:
case REOP_any: case REOP_any:
case REOP_space:
case REOP_not_space:
need_check_adv = FALSE; need_check_adv = FALSE;
break; break;
case REOP_line_start: case REOP_line_start:
@@ -2028,9 +2032,9 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
case 'b': case 'b':
case 'B': case 'B':
if (p[1] != 'b') { if (p[1] != 'b') {
re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary); re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_not_word_boundary_i : REOP_not_word_boundary);
} else { } else {
re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary); re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_word_boundary_i : REOP_word_boundary);
} }
p += 2; p += 2;
break; break;
@@ -2167,8 +2171,15 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
if (is_backward_dir) if (is_backward_dir)
re_emit_op(s, REOP_prev); re_emit_op(s, REOP_prev);
if (c >= CLASS_RANGE_BASE) { if (c >= CLASS_RANGE_BASE) {
int ret; int ret = 0;
ret = re_emit_string_list(s, cr); /* optimize the common 'space' tests */
if (c == (CLASS_RANGE_BASE + CHAR_RANGE_s)) {
re_emit_op(s, REOP_space);
} else if (c == (CLASS_RANGE_BASE + CHAR_RANGE_S)) {
re_emit_op(s, REOP_not_space);
} else {
ret = re_emit_string_list(s, cr);
}
re_string_list_free(cr); re_string_list_free(cr);
if (ret) if (ret)
return -1; return -1;
@@ -2607,14 +2618,6 @@ static BOOL is_line_terminator(uint32_t c)
return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS); return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
} }
/* Return non-zero when 'c' is one of the ASCII characters [0-9a-zA-Z_],
   zero for every other code point. */
static BOOL is_word_char(uint32_t c)
{
    if (c == '_')
        return 1;
    if (c >= '0' && c <= '9')
        return 1;
    if (c >= 'a' && c <= 'z')
        return 1;
    return (c >= 'A' && c <= 'Z');
}
#define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \ #define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \
do { \ do { \
if (cbuf_type == 0) { \ if (cbuf_type == 0) { \
@@ -2769,7 +2772,7 @@ static no_inline int stack_realloc(REExecContext *s, size_t n)
/* return 1 if match, 0 if not match or < 0 if error. */ /* return 1 if match, 0 if not match or < 0 if error. */
static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
uint8_t **regs, const uint8_t *pc, const uint8_t *cptr) const uint8_t *pc, const uint8_t *cptr)
{ {
int opcode; int opcode;
int cbuf_type; int cbuf_type;
@@ -2809,24 +2812,24 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
} }
/* avoid saving the previous value if already saved */ /* avoid saving the previous value if already saved */
#define SAVE_REG(idx, value) \ #define SAVE_CAPTURE_CHECK(idx, value) \
{ \ { \
StackElem *sp1; \ StackElem *sp1; \
sp1 = sp; \ sp1 = sp; \
for(;;) { \ for(;;) { \
if (sp1 > bp) { \ if (sp1 > bp) { \
if (sp1[-2].val == -(int)(idx + 1)) \ if (sp1[-2].val == idx) \
break; \ break; \
sp1 -= 2; \ sp1 -= 2; \
} else { \ } else { \
CHECK_STACK_SPACE(2); \ CHECK_STACK_SPACE(2); \
sp[0].val = -(int)(idx + 1); \ sp[0].val = idx; \
sp[1].ptr = regs[idx]; \ sp[1].ptr = capture[idx]; \
sp += 2; \ sp += 2; \
break; \ break; \
} \ } \
} \ } \
regs[idx] = (value); \ capture[idx] = (value); \
} }
@@ -2851,13 +2854,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
REExecStateEnum type; REExecStateEnum type;
if (bp == s->stack_buf) if (bp == s->stack_buf)
return 0; return 0;
/* undo the modifications to capture[] and regs[] */ /* undo the modifications to capture[] */
while (sp > bp) { while (sp > bp) {
intptr_t idx2 = sp[-2].val; capture[sp[-2].val] = sp[-1].ptr;
if (idx2 >= 0)
capture[idx2] = sp[-1].ptr;
else
regs[-idx2 - 1] = sp[-1].ptr;
sp -= 2; sp -= 2;
} }
@@ -2910,13 +2909,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
for(;;) { for(;;) {
REExecStateEnum type; REExecStateEnum type;
type = bp[-1].bp.type; type = bp[-1].bp.type;
/* undo the modifications to capture[] and regs[] */ /* undo the modifications to capture[] */
while (sp > bp) { while (sp > bp) {
intptr_t idx2 = sp[-2].val; capture[sp[-2].val] = sp[-1].ptr;
if (idx2 >= 0)
capture[idx2] = sp[-1].ptr;
else
regs[-idx2 - 1] = sp[-1].ptr;
sp -= 2; sp -= 2;
} }
pc = sp[-3].ptr; pc = sp[-3].ptr;
@@ -3019,6 +3014,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
goto no_match; goto no_match;
GET_CHAR(c, cptr, cbuf_end, cbuf_type); GET_CHAR(c, cptr, cbuf_end, cbuf_type);
break; break;
case REOP_space:
if (cptr == cbuf_end)
goto no_match;
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
if (!lre_is_space(c))
goto no_match;
break;
case REOP_not_space:
if (cptr == cbuf_end)
goto no_match;
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
if (lre_is_space(c))
goto no_match;
break;
case REOP_save_start: case REOP_save_start:
case REOP_save_end: case REOP_save_end:
val = *pc++; val = *pc++;
@@ -3044,20 +3053,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
} }
break; break;
case REOP_set_i32: case REOP_set_i32:
idx = pc[0]; idx = 2 * s->capture_count + pc[0];
val = get_u32(pc + 1); val = get_u32(pc + 1);
pc += 5; pc += 5;
SAVE_REG(idx, (void *)(uintptr_t)val); SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val);
break; break;
case REOP_loop: case REOP_loop:
{ {
uint32_t val2; uint32_t val2;
idx = pc[0]; idx = 2 * s->capture_count + pc[0];
val = get_u32(pc + 1); val = get_u32(pc + 1);
pc += 5; pc += 5;
val2 = (uintptr_t)regs[idx] - 1; val2 = (uintptr_t)capture[idx] - 1;
SAVE_REG(idx, (void *)(uintptr_t)val2); SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
if (val2 != 0) { if (val2 != 0) {
pc += (int)val; pc += (int)val;
if (lre_poll_timeout(s)) if (lre_poll_timeout(s))
@@ -3072,14 +3081,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
{ {
const uint8_t *pc1; const uint8_t *pc1;
uint32_t val2, limit; uint32_t val2, limit;
idx = pc[0]; idx = 2 * s->capture_count + pc[0];
limit = get_u32(pc + 1); limit = get_u32(pc + 1);
val = get_u32(pc + 5); val = get_u32(pc + 5);
pc += 9; pc += 9;
/* decrement the counter */ /* decrement the counter */
val2 = (uintptr_t)regs[idx] - 1; val2 = (uintptr_t)capture[idx] - 1;
SAVE_REG(idx, (void *)(uintptr_t)val2); SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
if (val2 > limit) { if (val2 > limit) {
/* normal loop if counter > limit */ /* normal loop if counter > limit */
@@ -3090,7 +3099,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
/* check advance */ /* check advance */
if ((opcode == REOP_loop_check_adv_split_goto_first || if ((opcode == REOP_loop_check_adv_split_goto_first ||
opcode == REOP_loop_check_adv_split_next_first) && opcode == REOP_loop_check_adv_split_next_first) &&
regs[idx + 1] == cptr && capture[idx + 1] == cptr &&
val2 != limit) { val2 != limit) {
goto no_match; goto no_match;
} }
@@ -3116,14 +3125,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
} }
break; break;
case REOP_set_char_pos: case REOP_set_char_pos:
idx = pc[0]; idx = 2 * s->capture_count + pc[0];
pc++; pc++;
SAVE_REG(idx, (uint8_t *)cptr); SAVE_CAPTURE_CHECK(idx, (uint8_t *)cptr);
break; break;
case REOP_check_advance: case REOP_check_advance:
idx = pc[0]; idx = 2 * s->capture_count + pc[0];
pc++; pc++;
if (regs[idx] == cptr) if (capture[idx] == cptr)
goto no_match; goto no_match;
break; break;
case REOP_word_boundary: case REOP_word_boundary:
@@ -3139,18 +3148,22 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
v1 = FALSE; v1 = FALSE;
} else { } else {
PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
if (ignore_case) if (c < 256) {
c = lre_canonicalize(c, s->is_unicode); v1 = (lre_is_word_byte(c) != 0);
v1 = is_word_char(c); } else {
v1 = ignore_case && (c == 0x017f || c == 0x212a);
}
} }
/* current char */ /* current char */
if (cptr >= cbuf_end) { if (cptr >= cbuf_end) {
v2 = FALSE; v2 = FALSE;
} else { } else {
PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
if (ignore_case) if (c < 256) {
c = lre_canonicalize(c, s->is_unicode); v2 = (lre_is_word_byte(c) != 0);
v2 = is_word_char(c); } else {
v2 = ignore_case && (c == 0x017f || c == 0x212a);
}
} }
if (v1 ^ v2 ^ is_boundary) if (v1 ^ v2 ^ is_boundary)
goto no_match; goto no_match;
@@ -3315,8 +3328,7 @@ int lre_exec(uint8_t **capture,
int cbuf_type, void *opaque) int cbuf_type, void *opaque)
{ {
REExecContext s_s, *s = &s_s; REExecContext s_s, *s = &s_s;
int re_flags, i, ret, register_count; int re_flags, i, ret;
uint8_t **regs;
const uint8_t *cptr; const uint8_t *cptr;
re_flags = lre_get_flags(bc_buf); re_flags = lre_get_flags(bc_buf);
@@ -3335,10 +3347,6 @@ int lre_exec(uint8_t **capture,
for(i = 0; i < s->capture_count * 2; i++) for(i = 0; i < s->capture_count * 2; i++)
capture[i] = NULL; capture[i] = NULL;
/* XXX: modify the API so that the registers are allocated after
the captures to suppress some tests */
register_count = bc_buf[RE_HEADER_REGISTER_COUNT];
regs = alloca(register_count * sizeof(regs[0]));
cptr = cbuf + (cindex << cbuf_type); cptr = cbuf + (cindex << cbuf_type);
if (0 < cindex && cindex < clen && s->cbuf_type == 2) { if (0 < cindex && cindex < clen && s->cbuf_type == 2) {
@@ -3348,13 +3356,19 @@ int lre_exec(uint8_t **capture,
} }
} }
ret = lre_exec_backtrack(s, capture, regs, bc_buf + RE_HEADER_LEN, ret = lre_exec_backtrack(s, capture, bc_buf + RE_HEADER_LEN, cptr);
cptr);
if (s->stack_buf != s->static_stack_buf) if (s->stack_buf != s->static_stack_buf)
lre_realloc(s->opaque, s->stack_buf, 0); lre_realloc(s->opaque, s->stack_buf, 0);
return ret; return ret;
} }
/* Return the sum of twice the capture count and the register count,
   both read from the bytecode header 'bc_buf'. Callers use this value
   as the number of pointer slots to allocate for the capture array. */
int lre_get_alloc_count(const uint8_t *bc_buf)
{
    int capture_slots = 2 * bc_buf[RE_HEADER_CAPTURE_COUNT];
    int register_slots = bc_buf[RE_HEADER_REGISTER_COUNT];

    return capture_slots + register_slots;
}
int lre_get_capture_count(const uint8_t *bc_buf) int lre_get_capture_count(const uint8_t *bc_buf)
{ {
return bc_buf[RE_HEADER_CAPTURE_COUNT]; return bc_buf[RE_HEADER_CAPTURE_COUNT];
@@ -3393,7 +3407,7 @@ int main(int argc, char **argv)
int len, flags, ret, i; int len, flags, ret, i;
uint8_t *bc; uint8_t *bc;
char error_msg[64]; char error_msg[64];
uint8_t *capture[CAPTURE_COUNT_MAX * 2]; uint8_t *capture;
const char *input; const char *input;
int input_len, capture_count; int input_len, capture_count;
@@ -3412,6 +3426,7 @@ int main(int argc, char **argv)
input = argv[3]; input = argv[3];
input_len = strlen(input); input_len = strlen(input);
capture = malloc(sizeof(capture[0]) * lre_get_alloc_count(bc));
ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
printf("ret=%d\n", ret); printf("ret=%d\n", ret);
if (ret == 1) { if (ret == 1) {
@@ -3427,6 +3442,7 @@ int main(int argc, char **argv)
printf("\n"); printf("\n");
} }
} }
free(capture);
return 0; return 0;
} }
#endif #endif

View File

@@ -46,6 +46,7 @@
uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
const char *buf, size_t buf_len, int re_flags, const char *buf, size_t buf_len, int re_flags,
void *opaque); void *opaque);
int lre_get_alloc_count(const uint8_t *bc_buf);
int lre_get_capture_count(const uint8_t *bc_buf); int lre_get_capture_count(const uint8_t *bc_buf);
int lre_get_flags(const uint8_t *bc_buf); int lre_get_flags(const uint8_t *bc_buf);
const char *lre_get_groupnames(const uint8_t *bc_buf); const char *lre_get_groupnames(const uint8_t *bc_buf);

View File

@@ -147,6 +147,11 @@ static inline int lre_is_id_continue_byte(uint8_t c) {
UNICODE_C_DIGIT); UNICODE_C_DIGIT);
} }
/* Return a non-zero value when the lre_ctype_bits[] entry for byte 'c'
   has any of the UPPER, LOWER, UNDER or DIGIT class bits set, else 0.
   The result is the masked bit pattern, not a normalized 0/1. */
static inline int lre_is_word_byte(uint8_t c) {
    const int word_mask = UNICODE_C_UPPER | UNICODE_C_LOWER |
                          UNICODE_C_UNDER | UNICODE_C_DIGIT;
    return lre_ctype_bits[c] & word_mask;
}
int lre_is_space_non_ascii(uint32_t c); int lre_is_space_non_ascii(uint32_t c);
static inline int lre_is_space(uint32_t c) { static inline int lre_is_space(uint32_t c) {

View File

@@ -45487,7 +45487,6 @@ static JSValue js_string_split(JSContext *ctx, JSValueConst this_val,
goto add_tail; goto add_tail;
goto done; goto done;
} }
q = p;
for (q = p; (q += !r) <= s - r - !r; q = p = e + r) { for (q = p; (q += !r) <= s - r - !r; q = p = e + r) {
e = string_indexof(sp, rp, q); e = string_indexof(sp, rp, q);
if (e < 0) if (e < 0)
@@ -47423,7 +47422,7 @@ static JSValue js_regexp_exec(JSContext *ctx, JSValueConst this_val,
JSValue indices, indices_groups; JSValue indices, indices_groups;
uint8_t *re_bytecode; uint8_t *re_bytecode;
uint8_t **capture, *str_buf; uint8_t **capture, *str_buf;
int rc, capture_count, shift, i, re_flags; int rc, capture_count, shift, i, re_flags, alloc_count;
int64_t last_index; int64_t last_index;
const char *group_name_ptr; const char *group_name_ptr;
JSObject *p_obj; JSObject *p_obj;
@@ -47453,12 +47452,13 @@ static JSValue js_regexp_exec(JSContext *ctx, JSValueConst this_val,
last_index = 0; last_index = 0;
} }
str = JS_VALUE_GET_STRING(str_val); str = JS_VALUE_GET_STRING(str_val);
capture_count = lre_get_capture_count(re_bytecode); alloc_count = lre_get_alloc_count(re_bytecode);
if (capture_count > 0) { if (alloc_count > 0) {
capture = js_malloc(ctx, sizeof(capture[0]) * capture_count * 2); capture = js_malloc(ctx, sizeof(capture[0]) * alloc_count);
if (!capture) if (!capture)
goto fail; goto fail;
} }
capture_count = lre_get_capture_count(re_bytecode);
shift = str->is_wide_char; shift = str->is_wide_char;
str_buf = str->u.str8; str_buf = str->u.str8;
if (last_index > str->len) { if (last_index > str->len) {
@@ -47642,7 +47642,7 @@ static JSValue js_regexp_replace(JSContext *ctx, JSValueConst this_val, JSValueC
uint8_t *re_bytecode; uint8_t *re_bytecode;
int ret; int ret;
uint8_t **capture, *str_buf; uint8_t **capture, *str_buf;
int capture_count, shift, re_flags; int capture_count, alloc_count, shift, re_flags;
int next_src_pos, start, end; int next_src_pos, start, end;
int64_t last_index; int64_t last_index;
StringBuffer b_s, *b = &b_s; StringBuffer b_s, *b = &b_s;
@@ -47676,12 +47676,13 @@ static JSValue js_regexp_replace(JSContext *ctx, JSValueConst this_val, JSValueC
if (js_regexp_get_lastIndex(ctx, &last_index, this_val)) if (js_regexp_get_lastIndex(ctx, &last_index, this_val))
goto fail; goto fail;
} }
capture_count = lre_get_capture_count(re_bytecode); alloc_count = lre_get_alloc_count(re_bytecode);
if (capture_count > 0) { if (alloc_count > 0) {
capture = js_malloc(ctx, sizeof(capture[0]) * capture_count * 2); capture = js_malloc(ctx, sizeof(capture[0]) * alloc_count);
if (!capture) if (!capture)
goto fail; goto fail;
} }
capture_count = lre_get_capture_count(re_bytecode);
fullUnicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0); fullUnicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0);
shift = str->is_wide_char; shift = str->is_wide_char;
str_buf = str->u.str8; str_buf = str->u.str8;