regexp: ensure that the bytecode size grows linearly with respect to the input regexp.

This way, pathological regexps such as
/(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(?:a|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+/ are no longer an issue. The generated bytecode is also simpler and
faster.
This commit is contained in:
Fabrice Bellard
2025-11-29 12:39:52 +01:00
parent a77400796d
commit 371c06e359
2 changed files with 111 additions and 41 deletions

View File

@@ -45,6 +45,10 @@ DEF(save_start, 2) /* save start position */
DEF(save_end, 2) /* save end position, must come after saved_start */
DEF(save_reset, 3) /* reset save positions */
DEF(loop, 6) /* decrement the top of the stack and goto if != 0 */
/* The four opcodes below are 10 bytes long: opcode, u8 stack index of
   the counter, u32 limit, u32 relative offset (relative to the end of
   the instruction). The counter is decremented. If counter > limit,
   the goto is taken (plain loop). Otherwise, if counter != 0, a split
   is done; if counter == 0 execution continues with the next
   instruction. */
DEF(loop_split_goto_first, 10) /* split: the goto target is tried first, the next instruction is the alternative */
DEF(loop_split_next_first, 10) /* split: the next instruction is tried first, the goto target is the alternative */
/* Same as above with an advance check: when counter < limit, the match
   fails if the current position is equal to the value stored in the
   stack slot following the counter. */
DEF(loop_check_adv_split_goto_first, 10)
DEF(loop_check_adv_split_next_first, 10)
DEF(push_i32, 6) /* push integer on the stack */
DEF(word_boundary, 1)
DEF(word_boundary_i, 1)

View File

@@ -532,6 +532,19 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
val += (pos + 6);
printf(" %u, %u", val2, val);
break;
case REOP_loop_split_goto_first:
case REOP_loop_split_next_first:
case REOP_loop_check_adv_split_goto_first:
case REOP_loop_check_adv_split_next_first:
{
uint32_t limit;
val2 = buf[pos + 1];
limit = get_u32(buf + pos + 2);
val = get_u32(buf + pos + 6);
val += (pos + 10);
printf(" %u, %u, %u", val2, limit, val);
}
break;
case REOP_save_start:
case REOP_save_end:
case REOP_back_reference:
@@ -620,6 +633,17 @@ static int re_emit_goto_u8(REParseState *s, int op, uint32_t arg, uint32_t val)
return pos;
}
/* Append a 10 byte jump instruction to the bytecode: the opcode 'op',
   the 8 bit operand 'arg0', the 32 bit operand 'arg1' and a 32 bit
   offset to the absolute bytecode position 'val'. The offset is stored
   relative to the end of the instruction. Return the bytecode position
   of the offset field. */
static int re_emit_goto_u8_u32(REParseState *s, int op, uint32_t arg0, uint32_t arg1, uint32_t val)
{
    int offset_pos;
    uint32_t rel_offset;

    /* opcode followed by its two immediate operands */
    dbuf_putc(&s->byte_code, op);
    dbuf_putc(&s->byte_code, arg0);
    dbuf_put_u32(&s->byte_code, arg1);

    /* the offset field is 4 bytes long, so the instruction ends at
       offset_pos + 4 */
    offset_pos = s->byte_code.size;
    rel_offset = val - (offset_pos + 4);
    dbuf_put_u32(&s->byte_code, rel_offset);

    return offset_pos;
}
static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
{
dbuf_putc(&s->byte_code, op);
@@ -2183,62 +2207,46 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
if (dbuf_insert(&s->byte_code, last_atom_start, 11 + add_zero_advance_check * 2))
goto out_of_memory;
pos = last_atom_start;
s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
put_u32(s->byte_code.buf + pos, 6 + add_zero_advance_check * 2 + len + 10);
pos += 4;
s->byte_code.buf[pos++] = REOP_push_i32;
s->byte_code.buf[pos++] = 0;
put_u32(s->byte_code.buf + pos, quant_max);
pos += 4;
s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
put_u32(s->byte_code.buf + pos, len + 6 + add_zero_advance_check * 2 * 2);
pos += 4;
last_atom_start = pos;
if (add_zero_advance_check) {
s->byte_code.buf[pos++] = REOP_push_char_pos;
s->byte_code.buf[pos++] = 0;
re_emit_op_u8(s, REOP_check_advance, 0);
}
re_emit_goto_u8(s, REOP_loop, 0, last_atom_start + 6);
re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max, last_atom_start);
}
} else if (quant_min == 1 && quant_max == INT32_MAX &&
!add_zero_advance_check) {
re_emit_goto(s, REOP_split_next_first - greedy,
last_atom_start);
} else {
if (quant_min == 1) {
/* nothing to add */
} else {
if (dbuf_insert(&s->byte_code, last_atom_start, 6))
goto out_of_memory;
s->byte_code.buf[last_atom_start++] = REOP_push_i32;
s->byte_code.buf[last_atom_start++] = 0;
put_u32(s->byte_code.buf + last_atom_start, quant_min);
last_atom_start += 4;
re_emit_goto_u8(s, REOP_loop, 0, last_atom_start);
if (quant_min == quant_max)
add_zero_advance_check = FALSE;
if (dbuf_insert(&s->byte_code, last_atom_start, 6 + add_zero_advance_check * 2))
goto out_of_memory;
/* Note: we assume the string length is < INT32_MAX */
pos = last_atom_start;
s->byte_code.buf[pos++] = REOP_push_i32;
s->byte_code.buf[pos++] = 0;
put_u32(s->byte_code.buf + pos, quant_max);
pos += 4;
last_atom_start = pos;
if (add_zero_advance_check) {
s->byte_code.buf[pos++] = REOP_push_char_pos;
s->byte_code.buf[pos++] = 0;
}
if (quant_max == INT32_MAX) {
pos = s->byte_code.size;
re_emit_op_u32(s, REOP_split_goto_first + greedy,
len + 5 + add_zero_advance_check * 2 * 2);
if (add_zero_advance_check)
re_emit_op_u8(s, REOP_push_char_pos, 0);
/* copy the atom */
dbuf_put_self(&s->byte_code, last_atom_start, len);
if (add_zero_advance_check)
re_emit_op_u8(s, REOP_check_advance, 0);
re_emit_goto(s, REOP_goto, pos);
} else if (quant_max > quant_min) {
re_emit_op_u8(s, REOP_push_i32, 0);
dbuf_put_u32(&s->byte_code, quant_max - quant_min);
pos = s->byte_code.size;
re_emit_op_u32(s, REOP_split_goto_first + greedy,
len + 6 + add_zero_advance_check * 2 * 2);
if (add_zero_advance_check)
re_emit_op_u8(s, REOP_push_char_pos, 0);
/* copy the atom */
dbuf_put_self(&s->byte_code, last_atom_start, len);
if (add_zero_advance_check)
re_emit_op_u8(s, REOP_check_advance, 0);
re_emit_goto_u8(s, REOP_loop, 0, pos);
if (quant_min == quant_max) {
/* a simple loop is enough */
re_emit_goto_u8(s, REOP_loop, 0, last_atom_start);
} else {
re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max - quant_min, last_atom_start);
}
}
last_atom_start = -1;
@@ -2352,10 +2360,18 @@ static int compute_stack_size(uint8_t *bc_buf, int bc_buf_len)
break;
case REOP_check_advance:
case REOP_loop:
case REOP_loop_split_goto_first:
case REOP_loop_split_next_first:
assert(stack_size > 0);
stack_size--;
bc_buf[pos + 1] = stack_size;
break;
case REOP_loop_check_adv_split_goto_first:
case REOP_loop_check_adv_split_next_first:
assert(stack_size >= 2);
stack_size -= 2;
bc_buf[pos + 1] = stack_size;
break;
case REOP_range:
case REOP_range_i:
val = get_u16(bc_buf + pos + 1);
@@ -2956,6 +2972,56 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
}
}
break;
case REOP_loop_split_goto_first:
case REOP_loop_split_next_first:
case REOP_loop_check_adv_split_goto_first:
case REOP_loop_check_adv_split_next_first:
{
const uint8_t *pc1;
uint32_t val2, limit;
idx = pc[0];
limit = get_u32(pc + 1);
val = get_u32(pc + 5);
pc += 9;
/* decrement the counter */
val2 = (uintptr_t)aux_stack[idx] - 1;
SAVE_AUX_STACK(idx, (void *)(uintptr_t)val2);
if (val2 > limit) {
/* normal loop if counter > limit */
pc += (int)val;
if (lre_poll_timeout(s))
return LRE_RET_TIMEOUT;
} else {
/* check advance */
if ((opcode == REOP_loop_check_adv_split_goto_first ||
opcode == REOP_loop_check_adv_split_next_first) &&
aux_stack[idx + 1] == cptr &&
val2 != limit) {
goto no_match;
}
/* otherwise conditional split */
if (val2 != 0) {
if (opcode == REOP_loop_split_next_first ||
opcode == REOP_loop_check_adv_split_next_first) {
pc1 = pc + (int)val;
} else {
pc1 = pc;
pc = pc + (int)val;
}
CHECK_STACK_SPACE(3);
sp[0].ptr = (uint8_t *)pc1;
sp[1].ptr = (uint8_t *)cptr;
sp[2].bp.val = bp - s->stack_buf;
sp[2].bp.type = RE_EXEC_STATE_SPLIT;
sp += 3;
bp = sp;
}
}
}
break;
case REOP_push_char_pos:
idx = pc[0];
pc++;