From f1139494d18a2053630c5ed3384a42bb70db3c53 Mon Sep 17 00:00:00 2001
From: Fabrice Bellard <fabrice@bellard.org>
Date: Mon, 22 Dec 2025 15:12:46 +0100
Subject: [PATCH] regexp: removed alloca() in lre_exec() - added specific
 opcodes for \s and \S to have a smaller bytecode - optimized \b and \B

---
 libregexp-opcode.h |   2 +
 libregexp.c        | 136 +++++++++++++++++++++++++--------------------
 libregexp.h        |   1 +
 libunicode.h       |   5 ++
 quickjs.c          |  19 ++++---
 5 files changed, 94 insertions(+), 69 deletions(-)

diff --git a/libregexp-opcode.h b/libregexp-opcode.h
index 17ec8d6..b3d7b6f 100644
--- a/libregexp-opcode.h
+++ b/libregexp-opcode.h
@@ -31,6 +31,8 @@ DEF(char32, 5)
 DEF(char32_i, 5)
 DEF(dot, 1)
 DEF(any, 1) /* same as dot but match any character including line terminator */
+DEF(space, 1)
+DEF(not_space, 1) /* must come after REOP_space */
 DEF(line_start, 1)
 DEF(line_start_m, 1)
 DEF(line_end, 1)
diff --git a/libregexp.c b/libregexp.c
index 9d9e361..c387f00 100644
--- a/libregexp.c
+++ b/libregexp.c
@@ -34,7 +34,9 @@
 
 /*
   TODO:
-
+  - remove REOP_char_i and REOP_range_i by precomputing the case folding.
+  - add specific opcodes for simple unicode property tests so that the
+    generated bytecode is smaller.
   - Add a lock step execution mode (=linear time execution guaranteed)
     when the regular expression is "simple" i.e. no backreference nor
     complicated lookahead. The opcodes are designed for this execution
@@ -1078,7 +1080,7 @@ static int get_class_atom(REParseState *s, REStringList *cr,
                 goto default_escape;
             if (cr_init_char_range(s, cr, c))
                 return -1;
-            c = CLASS_RANGE_BASE;
+            c += CLASS_RANGE_BASE;
             break;
         case 'c':
             c = *p;
@@ -1584,6 +1586,8 @@ static BOOL re_need_check_adv_and_capture_init(BOOL *pneed_capture_init,
         case REOP_char32_i:
         case REOP_dot:
         case REOP_any:
+        case REOP_space:
+        case REOP_not_space:
             need_check_adv = FALSE;
             break;
         case REOP_line_start:
@@ -2028,9 +2032,9 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
         case 'b':
         case 'B':
             if (p[1] != 'b') {
-                re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary);
+                re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_not_word_boundary_i : REOP_not_word_boundary);
             } else {
-                re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary);
+                re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_word_boundary_i : REOP_word_boundary);
             }
             p += 2;
             break;
@@ -2167,8 +2171,15 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
         if (is_backward_dir)
             re_emit_op(s, REOP_prev);
         if (c >= CLASS_RANGE_BASE) {
-            int ret;
-            ret = re_emit_string_list(s, cr);
+            int ret = 0;
+            /* optimize the common 'space' tests */
+            if (c == (CLASS_RANGE_BASE + CHAR_RANGE_s)) {
+                re_emit_op(s, REOP_space);
+            } else if (c == (CLASS_RANGE_BASE + CHAR_RANGE_S)) {
+                re_emit_op(s, REOP_not_space);
+            } else {
+                ret = re_emit_string_list(s, cr);
+            }
             re_string_list_free(cr);
             if (ret)
                 return -1;
@@ -2607,14 +2618,6 @@ static BOOL is_line_terminator(uint32_t c)
     return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
 }
 
-static BOOL is_word_char(uint32_t c)
-{
-    return ((c >= '0' && c <= '9') ||
-            (c >= 'a' && c <= 'z') ||
-            (c >= 'A' && c <= 'Z') ||
-            (c == '_'));
-}
-
 #define GET_CHAR(c, cptr, cbuf_end, cbuf_type)                          \
     do {                                                                \
         if (cbuf_type == 0) {                                           \
@@ -2769,7 +2772,7 @@ static no_inline int stack_realloc(REExecContext *s, size_t n)
 
 /* return 1 if match, 0 if not match or < 0 if error. */
 static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
-                                   uint8_t **regs, const uint8_t *pc, const uint8_t *cptr)
+                                   const uint8_t *pc, const uint8_t *cptr)
 {
     int opcode;
     int cbuf_type;
@@ -2809,24 +2812,24 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
     }
 
     /* avoid saving the previous value if already saved */
-#define SAVE_REG(idx, value)                    \
+#define SAVE_CAPTURE_CHECK(idx, value)          \
     {                                           \
         StackElem *sp1;                         \
         sp1 = sp;                               \
         for(;;) {                               \
             if (sp1 > bp) {                             \
-                if (sp1[-2].val == -(int)(idx + 1))     \
+                if (sp1[-2].val == idx)                 \
                     break;                              \
                 sp1 -= 2;                               \
             } else {                                    \
                 CHECK_STACK_SPACE(2);                   \
-                sp[0].val = -(int)(idx + 1);            \
-                sp[1].ptr = regs[idx];                  \
+                sp[0].val = idx;                        \
+                sp[1].ptr = capture[idx];               \
                 sp += 2;                                \
                 break;                                  \
             }                                           \
         }                                               \
-        regs[idx] = (value);                            \
+        capture[idx] = (value);                         \
     }
 
 
@@ -2851,13 +2854,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                 REExecStateEnum type;
                 if (bp == s->stack_buf)
                     return 0;
-                /* undo the modifications to capture[] and regs[] */
+                /* undo the modifications to capture[] */
                 while (sp > bp) {
-                    intptr_t idx2 = sp[-2].val;
-                    if (idx2 >= 0)
-                        capture[idx2] = sp[-1].ptr;
-                    else
-                        regs[-idx2 - 1] = sp[-1].ptr;
+                    capture[sp[-2].val] = sp[-1].ptr;
                     sp -= 2;
                 }
                 
@@ -2910,13 +2909,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
             for(;;) {
                 REExecStateEnum type;
                 type = bp[-1].bp.type;
-                /* undo the modifications to capture[] and regs[] */
+                /* undo the modifications to capture[] */
                 while (sp > bp) {
-                    intptr_t idx2 = sp[-2].val;
-                    if (idx2 >= 0)
-                        capture[idx2] = sp[-1].ptr;
-                    else
-                        regs[-idx2 - 1] = sp[-1].ptr;
+                    capture[sp[-2].val] = sp[-1].ptr;
                     sp -= 2;
                 }
                 pc = sp[-3].ptr;
@@ -3019,6 +3014,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                 goto no_match;
             GET_CHAR(c, cptr, cbuf_end, cbuf_type);
             break;
+        case REOP_space:
+            if (cptr == cbuf_end)
+                goto no_match;
+            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+            if (!lre_is_space(c))
+                goto no_match;
+            break;
+        case REOP_not_space:
+            if (cptr == cbuf_end)
+                goto no_match;
+            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+            if (lre_is_space(c))
+                goto no_match;
+            break;
         case REOP_save_start:
         case REOP_save_end:
             val = *pc++;
@@ -3044,20 +3053,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
             }
             break;
         case REOP_set_i32:
-            idx = pc[0];
+            idx = 2 * s->capture_count + pc[0];
             val = get_u32(pc + 1);
             pc += 5;
-            SAVE_REG(idx, (void *)(uintptr_t)val);
+            SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val);
             break;
         case REOP_loop:
             {
                 uint32_t val2;
-                idx = pc[0];
+                idx = 2 * s->capture_count + pc[0];
                 val = get_u32(pc + 1);
                 pc += 5;
 
-                val2 = (uintptr_t)regs[idx] - 1;
-                SAVE_REG(idx, (void *)(uintptr_t)val2);
+                val2 = (uintptr_t)capture[idx] - 1;
+                SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
                 if (val2 != 0) {
                     pc += (int)val;
                     if (lre_poll_timeout(s))
@@ -3072,14 +3081,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
             {
                 const uint8_t *pc1;
                 uint32_t val2, limit;
-                idx = pc[0];
+                idx = 2 * s->capture_count + pc[0];
                 limit = get_u32(pc + 1);
                 val = get_u32(pc + 5);
                 pc += 9;
 
                 /* decrement the counter */
-                val2 = (uintptr_t)regs[idx] - 1;
-                SAVE_REG(idx, (void *)(uintptr_t)val2);
+                val2 = (uintptr_t)capture[idx] - 1;
+                SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
 
                 if (val2 > limit) {
                     /* normal loop if counter > limit */
@@ -3090,7 +3099,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                     /* check advance */
                     if ((opcode == REOP_loop_check_adv_split_goto_first ||
                          opcode == REOP_loop_check_adv_split_next_first) &&
-                        regs[idx + 1] == cptr &&
+                        capture[idx + 1] == cptr &&
                         val2 != limit) {
                         goto no_match;
                     }
@@ -3116,14 +3125,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
             }
             break;
         case REOP_set_char_pos:
-            idx = pc[0];
+            idx = 2 * s->capture_count + pc[0];
             pc++;
-            SAVE_REG(idx, (uint8_t *)cptr);
+            SAVE_CAPTURE_CHECK(idx, (uint8_t *)cptr);
             break;
         case REOP_check_advance:
-            idx = pc[0];
+            idx = 2 * s->capture_count + pc[0];
             pc++;
-            if (regs[idx] == cptr)
+            if (capture[idx] == cptr)
                 goto no_match;
             break;
         case REOP_word_boundary:
@@ -3139,18 +3148,22 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                     v1 = FALSE;
                 } else {
                     PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
-                    if (ignore_case)
-                        c = lre_canonicalize(c, s->is_unicode);
-                    v1 = is_word_char(c);
+                    if (c < 256) {
+                        v1 = (lre_is_word_byte(c) != 0);
+                    } else {
+                        v1 = ignore_case && (c == 0x017f || c == 0x212a);
+                    }
                 }
                 /* current char */
                 if (cptr >= cbuf_end) {
                     v2 = FALSE;
                 } else {
                     PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
-                    if (ignore_case)
-                        c = lre_canonicalize(c, s->is_unicode);
-                    v2 = is_word_char(c);
+                    if (c < 256) {
+                        v2 = (lre_is_word_byte(c) != 0);
+                    } else {
+                        v2 = ignore_case && (c == 0x017f || c == 0x212a);
+                    }
                 }
                 if (v1 ^ v2 ^ is_boundary)
                     goto no_match;
@@ -3315,8 +3328,7 @@ int lre_exec(uint8_t **capture,
              int cbuf_type, void *opaque)
 {
     REExecContext s_s, *s = &s_s;
-    int re_flags, i, ret, register_count;
-    uint8_t **regs;
+    int re_flags, i, ret;
     const uint8_t *cptr;
 
     re_flags = lre_get_flags(bc_buf);
@@ -3335,10 +3347,6 @@ int lre_exec(uint8_t **capture,
 
     for(i = 0; i < s->capture_count * 2; i++)
         capture[i] = NULL;
-    /* XXX: modify the API so that the registers are allocated after
-       the captures to suppress some tests */
-    register_count = bc_buf[RE_HEADER_REGISTER_COUNT];
-    regs = alloca(register_count * sizeof(regs[0]));
 
     cptr = cbuf + (cindex << cbuf_type);
     if (0 < cindex && cindex < clen && s->cbuf_type == 2) {
@@ -3348,13 +3356,19 @@ int lre_exec(uint8_t **capture,
         }
     }
 
-    ret = lre_exec_backtrack(s, capture, regs, bc_buf + RE_HEADER_LEN,
-                             cptr);
+    ret = lre_exec_backtrack(s, capture, bc_buf + RE_HEADER_LEN, cptr);
+
     if (s->stack_buf != s->static_stack_buf)
         lre_realloc(s->opaque, s->stack_buf, 0);
     return ret;
 }
 
+int lre_get_alloc_count(const uint8_t *bc_buf)
+{ /* number of capture[] entries the caller must allocate before lre_exec() */
+    return bc_buf[RE_HEADER_CAPTURE_COUNT] * 2 + /* two slots per capture */
+        bc_buf[RE_HEADER_REGISTER_COUNT]; /* registers are indexed after the captures */
+}
+
 int lre_get_capture_count(const uint8_t *bc_buf)
 {
     return bc_buf[RE_HEADER_CAPTURE_COUNT];
@@ -3393,7 +3407,7 @@ int main(int argc, char **argv)
     int len, flags, ret, i;
     uint8_t *bc;
     char error_msg[64];
-    uint8_t *capture[CAPTURE_COUNT_MAX * 2];
+    uint8_t **capture; /* pointer array sized with lre_get_alloc_count() */
     const char *input;
     int input_len, capture_count;
 
@@ -3412,6 +3426,7 @@ int main(int argc, char **argv)
     input = argv[3];
     input_len = strlen(input);
 
+    capture = malloc(sizeof(capture[0]) * lre_get_alloc_count(bc));
     ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
     printf("ret=%d\n", ret);
     if (ret == 1) {
@@ -3427,6 +3442,7 @@ int main(int argc, char **argv)
             printf("\n");
         }
     }
+    free(capture);
     return 0;
 }
 #endif
diff --git a/libregexp.h b/libregexp.h
index c0ac11c..0905bcb 100644
--- a/libregexp.h
+++ b/libregexp.h
@@ -46,6 +46,7 @@
 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
                      const char *buf, size_t buf_len, int re_flags,
                      void *opaque);
+int lre_get_alloc_count(const uint8_t *bc_buf);
 int lre_get_capture_count(const uint8_t *bc_buf);
 int lre_get_flags(const uint8_t *bc_buf);
 const char *lre_get_groupnames(const uint8_t *bc_buf);
diff --git a/libunicode.h b/libunicode.h
index 5d964e4..5b02c82 100644
--- a/libunicode.h
+++ b/libunicode.h
@@ -147,6 +147,11 @@ static inline int lre_is_id_continue_byte(uint8_t c) {
                                 UNICODE_C_DIGIT);
 }
 
+static inline int lre_is_word_byte(uint8_t c) { /* non-zero if c is flagged upper, lower, underscore or digit */
+    return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | /* result is a bit mask, not 0/1 */
+                                UNICODE_C_UNDER | UNICODE_C_DIGIT);
+}
+
 int lre_is_space_non_ascii(uint32_t c);
 
 static inline int lre_is_space(uint32_t c) {
diff --git a/quickjs.c b/quickjs.c
index e0d60bd..e30d393 100644
--- a/quickjs.c
+++ b/quickjs.c
@@ -45487,7 +45487,6 @@ static JSValue js_string_split(JSContext *ctx, JSValueConst this_val,
             goto add_tail;
         goto done;
     }
-    q = p;
     for (q = p; (q += !r) <= s - r - !r; q = p = e + r) {
         e = string_indexof(sp, rp, q);
         if (e < 0)
@@ -47423,7 +47422,7 @@ static JSValue js_regexp_exec(JSContext *ctx, JSValueConst this_val,
     JSValue indices, indices_groups;
     uint8_t *re_bytecode;
     uint8_t **capture, *str_buf;
-    int rc, capture_count, shift, i, re_flags;
+    int rc, capture_count, shift, i, re_flags, alloc_count;
     int64_t last_index;
     const char *group_name_ptr;
     JSObject *p_obj;
@@ -47453,12 +47452,13 @@ static JSValue js_regexp_exec(JSContext *ctx, JSValueConst this_val,
         last_index = 0;
     }
     str = JS_VALUE_GET_STRING(str_val);
-    capture_count = lre_get_capture_count(re_bytecode);
-    if (capture_count > 0) {
-        capture = js_malloc(ctx, sizeof(capture[0]) * capture_count * 2);
+    alloc_count = lre_get_alloc_count(re_bytecode);
+    if (alloc_count > 0) {
+        capture = js_malloc(ctx, sizeof(capture[0]) * alloc_count);
         if (!capture)
             goto fail;
     }
+    capture_count = lre_get_capture_count(re_bytecode);
     shift = str->is_wide_char;
     str_buf = str->u.str8;
     if (last_index > str->len) {
@@ -47642,7 +47642,7 @@ static JSValue js_regexp_replace(JSContext *ctx, JSValueConst this_val, JSValueC
     uint8_t *re_bytecode;
     int ret;
     uint8_t **capture, *str_buf;
-    int capture_count, shift, re_flags;
+    int capture_count, alloc_count, shift, re_flags;
     int next_src_pos, start, end;
     int64_t last_index;
     StringBuffer b_s, *b = &b_s;
@@ -47676,12 +47676,13 @@ static JSValue js_regexp_replace(JSContext *ctx, JSValueConst this_val, JSValueC
         if (js_regexp_get_lastIndex(ctx, &last_index, this_val))
             goto fail;
     }
-    capture_count = lre_get_capture_count(re_bytecode);
-    if (capture_count > 0) {
-        capture = js_malloc(ctx, sizeof(capture[0]) * capture_count * 2);
+    alloc_count = lre_get_alloc_count(re_bytecode);
+    if (alloc_count > 0) {
+        capture = js_malloc(ctx, sizeof(capture[0]) * alloc_count);
         if (!capture)
             goto fail;
     }
+    capture_count = lre_get_capture_count(re_bytecode);
     fullUnicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0);
     shift = str->is_wide_char;
     str_buf = str->u.str8;