From 371c06e35964a8211b30d94511eb29b8bffe1360 Mon Sep 17 00:00:00 2001 From: Fabrice Bellard Date: Sat, 29 Nov 2025 12:39:52 +0100 Subject: [PATCH] regexp: ensure that the bytecode size grows linearly with respect to the input regexp. This way, pathological regexps such as /(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(?:a|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+/ are no longer an issue. The generated bytecode is also simpler and faster. --- libregexp-opcode.h | 4 ++ libregexp.c | 148 ++++++++++++++++++++++++++++++++------------- 2 files changed, 111 insertions(+), 41 deletions(-) diff --git a/libregexp-opcode.h b/libregexp-opcode.h index 9908cf3..f0e2345 100644 --- a/libregexp-opcode.h +++ b/libregexp-opcode.h @@ -45,6 +45,10 @@ DEF(save_start, 2) /* save start position */ DEF(save_end, 2) /* save end position, must come after saved_start */ DEF(save_reset, 3) /* reset save positions */ DEF(loop, 6) /* decrement the top the stack and goto if != 0 */ +DEF(loop_split_goto_first, 10) +DEF(loop_split_next_first, 10) +DEF(loop_check_adv_split_goto_first, 10) +DEF(loop_check_adv_split_next_first, 10) DEF(push_i32, 6) /* push integer on the stack */ DEF(word_boundary, 1) DEF(word_boundary_i, 1) diff --git a/libregexp.c b/libregexp.c index 28f407b..d880b11 100644 --- a/libregexp.c +++ b/libregexp.c @@ -532,6 +532,19 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, val += (pos + 6); printf(" %u, %u", val2, val); break; + case REOP_loop_split_goto_first: + case REOP_loop_split_next_first: + case REOP_loop_check_adv_split_goto_first: + case REOP_loop_check_adv_split_next_first: + { + uint32_t limit; + val2 = buf[pos + 1]; + limit = get_u32(buf + pos + 2); + val = get_u32(buf + pos + 6); + val += (pos + 10); + printf(" %u, %u, %u", val2, limit, val); + } + break; case REOP_save_start: case REOP_save_end: case REOP_back_reference: @@ -620,6 +633,17 @@ static int re_emit_goto_u8(REParseState *s, int op, uint32_t arg, uint32_t val) return pos; } +static int re_emit_goto_u8_u32(REParseState *s, int op, uint32_t arg0, uint32_t arg1, uint32_t val) +{ + int pos; + dbuf_putc(&s->byte_code, op); + dbuf_putc(&s->byte_code, arg0); + dbuf_put_u32(&s->byte_code, arg1); + pos = s->byte_code.size; + dbuf_put_u32(&s->byte_code, val - (pos + 4)); + return pos; +} + static void re_emit_op_u8(REParseState *s, int op, uint32_t val) { dbuf_putc(&s->byte_code, op); @@ -2183,62 +2207,46 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) if (dbuf_insert(&s->byte_code, last_atom_start, 11 + add_zero_advance_check * 2)) goto out_of_memory; pos = last_atom_start; + s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; + put_u32(s->byte_code.buf + pos, 6 + add_zero_advance_check * 2 + len + 10); + pos += 4; + s->byte_code.buf[pos++] = REOP_push_i32; s->byte_code.buf[pos++] = 0; put_u32(s->byte_code.buf + pos, quant_max); pos += 4; - - s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; - put_u32(s->byte_code.buf + pos, len + 6 + add_zero_advance_check * 2 * 2); - pos += 4; + last_atom_start = pos; if (add_zero_advance_check) { s->byte_code.buf[pos++] = REOP_push_char_pos; s->byte_code.buf[pos++] = 0; - re_emit_op_u8(s, REOP_check_advance, 0); } - re_emit_goto_u8(s, REOP_loop, 0, last_atom_start + 6); + re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max, last_atom_start); } } else if (quant_min == 1 && quant_max == INT32_MAX && !add_zero_advance_check) { re_emit_goto(s, REOP_split_next_first - greedy, last_atom_start); } else { - if (quant_min == 1) { - /* nothing to add */ - } else { - if (dbuf_insert(&s->byte_code, last_atom_start, 6)) - goto out_of_memory; - s->byte_code.buf[last_atom_start++] = REOP_push_i32; - s->byte_code.buf[last_atom_start++] = 0; - put_u32(s->byte_code.buf + last_atom_start, quant_min); - last_atom_start += 4; - re_emit_goto_u8(s, REOP_loop, 0, last_atom_start); + if (quant_min == quant_max) + add_zero_advance_check = FALSE; + if (dbuf_insert(&s->byte_code, last_atom_start, 6 + add_zero_advance_check * 2)) + goto out_of_memory; + /* Note: we assume the string length is < INT32_MAX */ + pos = last_atom_start; + s->byte_code.buf[pos++] = REOP_push_i32; + s->byte_code.buf[pos++] = 0; + put_u32(s->byte_code.buf + pos, quant_max); + pos += 4; + last_atom_start = pos; + if (add_zero_advance_check) { + s->byte_code.buf[pos++] = REOP_push_char_pos; + s->byte_code.buf[pos++] = 0; } - if (quant_max == INT32_MAX) { - pos = s->byte_code.size; - re_emit_op_u32(s, REOP_split_goto_first + greedy, - len + 5 + add_zero_advance_check * 2 * 2); - if (add_zero_advance_check) - re_emit_op_u8(s, REOP_push_char_pos, 0); - /* copy the atom */ - dbuf_put_self(&s->byte_code, last_atom_start, len); - if (add_zero_advance_check) - re_emit_op_u8(s, REOP_check_advance, 0); - re_emit_goto(s, REOP_goto, pos); - } else if (quant_max > quant_min) { - re_emit_op_u8(s, REOP_push_i32, 0); - dbuf_put_u32(&s->byte_code, quant_max - quant_min); - - pos = s->byte_code.size; - re_emit_op_u32(s, REOP_split_goto_first + greedy, - len + 6 + add_zero_advance_check * 2 * 2); - if (add_zero_advance_check) - re_emit_op_u8(s, REOP_push_char_pos, 0); - /* copy the atom */ - dbuf_put_self(&s->byte_code, last_atom_start, len); - if (add_zero_advance_check) - re_emit_op_u8(s, REOP_check_advance, 0); - re_emit_goto_u8(s, REOP_loop, 0, pos); + if (quant_min == quant_max) { + /* a simple loop is enough */ + re_emit_goto_u8(s, REOP_loop, 0, last_atom_start); + } else { + re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max - quant_min, last_atom_start); } } last_atom_start = -1; @@ -2352,10 +2360,18 @@ static int compute_stack_size(uint8_t *bc_buf, int bc_buf_len) break; case REOP_check_advance: case REOP_loop: + case REOP_loop_split_goto_first: + case REOP_loop_split_next_first: assert(stack_size > 0); stack_size--; bc_buf[pos + 1] = stack_size; break; + case REOP_loop_check_adv_split_goto_first: + case REOP_loop_check_adv_split_next_first: + assert(stack_size >= 2); + stack_size -= 2; + bc_buf[pos + 1] = stack_size; + break; case REOP_range: case REOP_range_i: val = get_u16(bc_buf + pos + 1); @@ -2956,6 +2972,56 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, } } break; + case REOP_loop_split_goto_first: + case REOP_loop_split_next_first: + case REOP_loop_check_adv_split_goto_first: + case REOP_loop_check_adv_split_next_first: + { + const uint8_t *pc1; + uint32_t val2, limit; + idx = pc[0]; + limit = get_u32(pc + 1); + val = get_u32(pc + 5); + pc += 9; + + /* decrement the counter */ + val2 = (uintptr_t)aux_stack[idx] - 1; + SAVE_AUX_STACK(idx, (void *)(uintptr_t)val2); + + if (val2 > limit) { + /* normal loop if counter > limit */ + pc += (int)val; + if (lre_poll_timeout(s)) + return LRE_RET_TIMEOUT; + } else { + /* check advance */ + if ((opcode == REOP_loop_check_adv_split_goto_first || + opcode == REOP_loop_check_adv_split_next_first) && + aux_stack[idx + 1] == cptr && + val2 != limit) { + goto no_match; + } + + /* otherwise conditional split */ + if (val2 != 0) { + if (opcode == REOP_loop_split_next_first || + opcode == REOP_loop_check_adv_split_next_first) { + pc1 = pc + (int)val; + } else { + pc1 = pc; + pc = pc + (int)val; + } + CHECK_STACK_SPACE(3); + sp[0].ptr = (uint8_t *)pc1; + sp[1].ptr = (uint8_t *)cptr; + sp[2].bp.val = bp - s->stack_buf; + sp[2].bp.type = RE_EXEC_STATE_SPLIT; + sp += 3; + bp = sp; + } + } + } + break; case REOP_push_char_pos: idx = pc[0]; pc++;