From f1139494d18a2053630c5ed3384a42bb70db3c53 Mon Sep 17 00:00:00 2001 From: Fabrice Bellard Date: Mon, 22 Dec 2025 15:12:46 +0100 Subject: [PATCH] regexp: removed alloca() in lre_exec() - added specific opcodes for \s and \S to have a smaller bytecode - optimized \b and \B --- libregexp-opcode.h | 2 + libregexp.c | 136 +++++++++++++++++++++++++-------------------- libregexp.h | 1 + libunicode.h | 5 ++ quickjs.c | 19 ++++--- 5 files changed, 94 insertions(+), 69 deletions(-) diff --git a/libregexp-opcode.h b/libregexp-opcode.h index 17ec8d6..b3d7b6f 100644 --- a/libregexp-opcode.h +++ b/libregexp-opcode.h @@ -31,6 +31,8 @@ DEF(char32, 5) DEF(char32_i, 5) DEF(dot, 1) DEF(any, 1) /* same as dot but match any character including line terminator */ +DEF(space, 1) +DEF(not_space, 1) /* must come after */ DEF(line_start, 1) DEF(line_start_m, 1) DEF(line_end, 1) diff --git a/libregexp.c b/libregexp.c index 9d9e361..c387f00 100644 --- a/libregexp.c +++ b/libregexp.c @@ -34,7 +34,9 @@ /* TODO: - + - remove REOP_char_i and REOP_range_i by precomputing the case folding. + - add specific opcodes for simple unicode property tests so that the + generated bytecode is smaller. - Add a lock step execution mode (=linear time execution guaranteed) when the regular expression is "simple" i.e. no backreference nor complicated lookahead.
The opcodes are designed for this execution @@ -1078,7 +1080,7 @@ static int get_class_atom(REParseState *s, REStringList *cr, goto default_escape; if (cr_init_char_range(s, cr, c)) return -1; - c = CLASS_RANGE_BASE; + c += CLASS_RANGE_BASE; break; case 'c': c = *p; @@ -1584,6 +1586,8 @@ static BOOL re_need_check_adv_and_capture_init(BOOL *pneed_capture_init, case REOP_char32_i: case REOP_dot: case REOP_any: + case REOP_space: + case REOP_not_space: need_check_adv = FALSE; break; case REOP_line_start: @@ -2028,9 +2032,9 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) case 'b': case 'B': if (p[1] != 'b') { - re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary); + re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_not_word_boundary_i : REOP_not_word_boundary); } else { - re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary); + re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_word_boundary_i : REOP_word_boundary); } p += 2; break; @@ -2167,8 +2171,15 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) if (is_backward_dir) re_emit_op(s, REOP_prev); if (c >= CLASS_RANGE_BASE) { - int ret; - ret = re_emit_string_list(s, cr); + int ret = 0; + /* optimize the common 'space' tests */ + if (c == (CLASS_RANGE_BASE + CHAR_RANGE_s)) { + re_emit_op(s, REOP_space); + } else if (c == (CLASS_RANGE_BASE + CHAR_RANGE_S)) { + re_emit_op(s, REOP_not_space); + } else { + ret = re_emit_string_list(s, cr); + } re_string_list_free(cr); if (ret) return -1; @@ -2607,14 +2618,6 @@ static BOOL is_line_terminator(uint32_t c) return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS); } -static BOOL is_word_char(uint32_t c) -{ - return ((c >= '0' && c <= '9') || - (c >= 'a' && c <= 'z') || - (c >= 'A' && c <= 'Z') || - (c == '_')); -} - #define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \ do { \ if (cbuf_type == 0) { \ @@ -2769,7 +2772,7 @@ static no_inline int stack_realloc(REExecContext *s, size_t n) /* 
return 1 if match, 0 if not match or < 0 if error. */ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, - uint8_t **regs, const uint8_t *pc, const uint8_t *cptr) + const uint8_t *pc, const uint8_t *cptr) { int opcode; int cbuf_type; @@ -2809,24 +2812,24 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, } /* avoid saving the previous value if already saved */ -#define SAVE_REG(idx, value) \ +#define SAVE_CAPTURE_CHECK(idx, value) \ { \ StackElem *sp1; \ sp1 = sp; \ for(;;) { \ if (sp1 > bp) { \ - if (sp1[-2].val == -(int)(idx + 1)) \ + if (sp1[-2].val == idx) \ break; \ sp1 -= 2; \ } else { \ CHECK_STACK_SPACE(2); \ - sp[0].val = -(int)(idx + 1); \ - sp[1].ptr = regs[idx]; \ + sp[0].val = idx; \ + sp[1].ptr = capture[idx]; \ sp += 2; \ break; \ } \ } \ - regs[idx] = (value); \ + capture[idx] = (value); \ } @@ -2851,13 +2854,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, REExecStateEnum type; if (bp == s->stack_buf) return 0; - /* undo the modifications to capture[] and regs[] */ + /* undo the modifications to capture[] */ while (sp > bp) { - intptr_t idx2 = sp[-2].val; - if (idx2 >= 0) - capture[idx2] = sp[-1].ptr; - else - regs[-idx2 - 1] = sp[-1].ptr; + capture[sp[-2].val] = sp[-1].ptr; sp -= 2; } @@ -2910,13 +2909,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, for(;;) { REExecStateEnum type; type = bp[-1].bp.type; - /* undo the modifications to capture[] and regs[] */ + /* undo the modifications to capture[] */ while (sp > bp) { - intptr_t idx2 = sp[-2].val; - if (idx2 >= 0) - capture[idx2] = sp[-1].ptr; - else - regs[-idx2 - 1] = sp[-1].ptr; + capture[sp[-2].val] = sp[-1].ptr; sp -= 2; } pc = sp[-3].ptr; @@ -3019,6 +3014,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, goto no_match; GET_CHAR(c, cptr, cbuf_end, cbuf_type); break; + case REOP_space: + if (cptr == cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end, 
cbuf_type); + if (!lre_is_space(c)) + goto no_match; + break; + case REOP_not_space: + if (cptr == cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end, cbuf_type); + if (lre_is_space(c)) + goto no_match; + break; case REOP_save_start: case REOP_save_end: val = *pc++; @@ -3044,20 +3053,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, } break; case REOP_set_i32: - idx = pc[0]; + idx = 2 * s->capture_count + pc[0]; val = get_u32(pc + 1); pc += 5; - SAVE_REG(idx, (void *)(uintptr_t)val); + SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val); break; case REOP_loop: { uint32_t val2; - idx = pc[0]; + idx = 2 * s->capture_count + pc[0]; val = get_u32(pc + 1); pc += 5; - val2 = (uintptr_t)regs[idx] - 1; - SAVE_REG(idx, (void *)(uintptr_t)val2); + val2 = (uintptr_t)capture[idx] - 1; + SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2); if (val2 != 0) { pc += (int)val; if (lre_poll_timeout(s)) @@ -3072,14 +3081,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, { const uint8_t *pc1; uint32_t val2, limit; - idx = pc[0]; + idx = 2 * s->capture_count + pc[0]; limit = get_u32(pc + 1); val = get_u32(pc + 5); pc += 9; /* decrement the counter */ - val2 = (uintptr_t)regs[idx] - 1; - SAVE_REG(idx, (void *)(uintptr_t)val2); + val2 = (uintptr_t)capture[idx] - 1; + SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2); if (val2 > limit) { /* normal loop if counter > limit */ @@ -3090,7 +3099,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, /* check advance */ if ((opcode == REOP_loop_check_adv_split_goto_first || opcode == REOP_loop_check_adv_split_next_first) && - regs[idx + 1] == cptr && + capture[idx + 1] == cptr && val2 != limit) { goto no_match; } @@ -3116,14 +3125,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, } break; case REOP_set_char_pos: - idx = pc[0]; + idx = 2 * s->capture_count + pc[0]; pc++; - SAVE_REG(idx, (uint8_t *)cptr); + SAVE_CAPTURE_CHECK(idx, (uint8_t 
*)cptr); break; case REOP_check_advance: - idx = pc[0]; + idx = 2 * s->capture_count + pc[0]; pc++; - if (regs[idx] == cptr) + if (capture[idx] == cptr) goto no_match; break; case REOP_word_boundary: @@ -3139,18 +3148,22 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, v1 = FALSE; } else { PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); - if (ignore_case) - c = lre_canonicalize(c, s->is_unicode); - v1 = is_word_char(c); + if (c < 256) { + v1 = (lre_is_word_byte(c) != 0); + } else { + v1 = ignore_case && (c == 0x017f || c == 0x212a); + } } /* current char */ if (cptr >= cbuf_end) { v2 = FALSE; } else { PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); - if (ignore_case) - c = lre_canonicalize(c, s->is_unicode); - v2 = is_word_char(c); + if (c < 256) { + v2 = (lre_is_word_byte(c) != 0); + } else { + v2 = ignore_case && (c == 0x017f || c == 0x212a); + } } if (v1 ^ v2 ^ is_boundary) goto no_match; @@ -3315,8 +3328,7 @@ int lre_exec(uint8_t **capture, int cbuf_type, void *opaque) { REExecContext s_s, *s = &s_s; - int re_flags, i, ret, register_count; - uint8_t **regs; + int re_flags, i, ret; const uint8_t *cptr; re_flags = lre_get_flags(bc_buf); @@ -3335,10 +3347,6 @@ int lre_exec(uint8_t **capture, for(i = 0; i < s->capture_count * 2; i++) capture[i] = NULL; - /* XXX: modify the API so that the registers are allocated after - the captures to suppress some tests */ - register_count = bc_buf[RE_HEADER_REGISTER_COUNT]; - regs = alloca(register_count * sizeof(regs[0])); cptr = cbuf + (cindex << cbuf_type); if (0 < cindex && cindex < clen && s->cbuf_type == 2) { @@ -3348,13 +3356,19 @@ int lre_exec(uint8_t **capture, } } - ret = lre_exec_backtrack(s, capture, regs, bc_buf + RE_HEADER_LEN, - cptr); + ret = lre_exec_backtrack(s, capture, bc_buf + RE_HEADER_LEN, cptr); + if (s->stack_buf != s->static_stack_buf) lre_realloc(s->opaque, s->stack_buf, 0); return ret; } +int lre_get_alloc_count(const uint8_t *bc_buf) +{ + return bc_buf[RE_HEADER_CAPTURE_COUNT] * 2 + + 
bc_buf[RE_HEADER_REGISTER_COUNT]; +} + int lre_get_capture_count(const uint8_t *bc_buf) { return bc_buf[RE_HEADER_CAPTURE_COUNT]; @@ -3393,7 +3407,7 @@ int main(int argc, char **argv) int len, flags, ret, i; uint8_t *bc; char error_msg[64]; - uint8_t *capture[CAPTURE_COUNT_MAX * 2]; + uint8_t *capture; const char *input; int input_len, capture_count; @@ -3412,6 +3426,7 @@ int main(int argc, char **argv) input = argv[3]; input_len = strlen(input); + capture = malloc(sizeof(capture[0]) * lre_get_alloc_count(bc)); ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); printf("ret=%d\n", ret); if (ret == 1) { @@ -3427,6 +3442,7 @@ int main(int argc, char **argv) printf("\n"); } } + free(capture); return 0; } #endif diff --git a/libregexp.h b/libregexp.h index c0ac11c..0905bcb 100644 --- a/libregexp.h +++ b/libregexp.h @@ -46,6 +46,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, const char *buf, size_t buf_len, int re_flags, void *opaque); +int lre_get_alloc_count(const uint8_t *bc_buf); int lre_get_capture_count(const uint8_t *bc_buf); int lre_get_flags(const uint8_t *bc_buf); const char *lre_get_groupnames(const uint8_t *bc_buf); diff --git a/libunicode.h b/libunicode.h index 5d964e4..5b02c82 100644 --- a/libunicode.h +++ b/libunicode.h @@ -147,6 +147,11 @@ static inline int lre_is_id_continue_byte(uint8_t c) { UNICODE_C_DIGIT); } +static inline int lre_is_word_byte(uint8_t c) { + return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | + UNICODE_C_UNDER | UNICODE_C_DIGIT); +} + int lre_is_space_non_ascii(uint32_t c); static inline int lre_is_space(uint32_t c) { diff --git a/quickjs.c b/quickjs.c index e0d60bd..e30d393 100644 --- a/quickjs.c +++ b/quickjs.c @@ -45487,7 +45487,6 @@ static JSValue js_string_split(JSContext *ctx, JSValueConst this_val, goto add_tail; goto done; } - q = p; for (q = p; (q += !r) <= s - r - !r; q = p = e + r) { e = string_indexof(sp, rp, q); if (e < 0) @@ -47423,7 +47422,7 @@ static JSValue 
js_regexp_exec(JSContext *ctx, JSValueConst this_val, JSValue indices, indices_groups; uint8_t *re_bytecode; uint8_t **capture, *str_buf; - int rc, capture_count, shift, i, re_flags; + int rc, capture_count, shift, i, re_flags, alloc_count; int64_t last_index; const char *group_name_ptr; JSObject *p_obj; @@ -47453,12 +47452,13 @@ static JSValue js_regexp_exec(JSContext *ctx, JSValueConst this_val, last_index = 0; } str = JS_VALUE_GET_STRING(str_val); - capture_count = lre_get_capture_count(re_bytecode); - if (capture_count > 0) { - capture = js_malloc(ctx, sizeof(capture[0]) * capture_count * 2); + alloc_count = lre_get_alloc_count(re_bytecode); + if (alloc_count > 0) { + capture = js_malloc(ctx, sizeof(capture[0]) * alloc_count); if (!capture) goto fail; } + capture_count = lre_get_capture_count(re_bytecode); shift = str->is_wide_char; str_buf = str->u.str8; if (last_index > str->len) { @@ -47642,7 +47642,7 @@ static JSValue js_regexp_replace(JSContext *ctx, JSValueConst this_val, JSValueC uint8_t *re_bytecode; int ret; uint8_t **capture, *str_buf; - int capture_count, shift, re_flags; + int capture_count, alloc_count, shift, re_flags; int next_src_pos, start, end; int64_t last_index; StringBuffer b_s, *b = &b_s; @@ -47676,12 +47676,13 @@ static JSValue js_regexp_replace(JSContext *ctx, JSValueConst this_val, JSValueC if (js_regexp_get_lastIndex(ctx, &last_index, this_val)) goto fail; } - capture_count = lre_get_capture_count(re_bytecode); - if (capture_count > 0) { - capture = js_malloc(ctx, sizeof(capture[0]) * capture_count * 2); + alloc_count = lre_get_alloc_count(re_bytecode); + if (alloc_count > 0) { + capture = js_malloc(ctx, sizeof(capture[0]) * alloc_count); if (!capture) goto fail; } + capture_count = lre_get_capture_count(re_bytecode); fullUnicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0); shift = str->is_wide_char; str_buf = str->u.str8;