mirror of
https://github.com/bellard/quickjs.git
synced 2025-12-31 05:39:10 +03:00
regexp: ensure that the bytecode size grows linearly with respect to
the input regexp. This way, pathological regexps such as /(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(?:a|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+/ are no longer an issue. The generated bytecode is also simpler and faster.
This commit is contained in:
@@ -45,6 +45,10 @@ DEF(save_start, 2) /* save start position */
|
|||||||
DEF(save_end, 2) /* save end position, must come after save_start */
|
DEF(save_end, 2) /* save end position, must come after save_start */
|
||||||
DEF(save_reset, 3) /* reset save positions */
|
DEF(save_reset, 3) /* reset save positions */
|
||||||
DEF(loop, 6) /* decrement the top of the stack and goto if != 0 */
|
DEF(loop, 6) /* decrement the top of the stack and goto if != 0 */
|
||||||
|
DEF(loop_split_goto_first, 10) /* operands: u8 counter index, u32 limit, u32 offset. Decrement the counter; goto if counter > limit, else if counter != 0 split trying the goto target first */
|
||||||
|
DEF(loop_split_next_first, 10) /* same as loop_split_goto_first, but the split tries the next instruction first */
|
||||||
|
DEF(loop_check_adv_split_goto_first, 10) /* same as loop_split_goto_first, but in the split case fail if the saved char position equals the current one and counter != limit */
|
||||||
|
DEF(loop_check_adv_split_next_first, 10) /* same as loop_split_next_first, with the same advance check */
|
||||||
DEF(push_i32, 6) /* push integer on the stack */
|
DEF(push_i32, 6) /* push integer on the stack */
|
||||||
DEF(word_boundary, 1)
|
DEF(word_boundary, 1)
|
||||||
DEF(word_boundary_i, 1)
|
DEF(word_boundary_i, 1)
|
||||||
|
|||||||
148
libregexp.c
148
libregexp.c
@@ -532,6 +532,19 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
|
|||||||
val += (pos + 6);
|
val += (pos + 6);
|
||||||
printf(" %u, %u", val2, val);
|
printf(" %u, %u", val2, val);
|
||||||
break;
|
break;
|
||||||
|
case REOP_loop_split_goto_first:
|
||||||
|
case REOP_loop_split_next_first:
|
||||||
|
case REOP_loop_check_adv_split_goto_first:
|
||||||
|
case REOP_loop_check_adv_split_next_first:
|
||||||
|
{
|
||||||
|
uint32_t limit;
|
||||||
|
val2 = buf[pos + 1];
|
||||||
|
limit = get_u32(buf + pos + 2);
|
||||||
|
val = get_u32(buf + pos + 6);
|
||||||
|
val += (pos + 10);
|
||||||
|
printf(" %u, %u, %u", val2, limit, val);
|
||||||
|
}
|
||||||
|
break;
|
||||||
case REOP_save_start:
|
case REOP_save_start:
|
||||||
case REOP_save_end:
|
case REOP_save_end:
|
||||||
case REOP_back_reference:
|
case REOP_back_reference:
|
||||||
@@ -620,6 +633,17 @@ static int re_emit_goto_u8(REParseState *s, int op, uint32_t arg, uint32_t val)
|
|||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Append to s->byte_code an instruction laid out as: the opcode 'op',
   'arg0' (written with dbuf_putc, presumably as a single byte, which
   matches the 10 byte size of the opcodes emitted through this helper),
   the 32 bit value 'arg1', then a 32 bit jump displacement.
   'val' is the bytecode offset of the jump target; the displacement is
   stored relative to the first byte following the instruction.
   Return the offset of the displacement field in s->byte_code. */
static int re_emit_goto_u8_u32(REParseState *s, int op, uint32_t arg0, uint32_t arg1, uint32_t val)
|
||||||
|
{
|
||||||
|
int pos;
|
||||||
|
dbuf_putc(&s->byte_code, op);
|
||||||
|
dbuf_putc(&s->byte_code, arg0);
|
||||||
|
dbuf_put_u32(&s->byte_code, arg1);
|
||||||
|
pos = s->byte_code.size; /* offset at which the displacement is written */
|
||||||
|
dbuf_put_u32(&s->byte_code, val - (pos + 4)); /* pos + 4 is the end of the instruction */
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
|
static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
|
||||||
{
|
{
|
||||||
dbuf_putc(&s->byte_code, op);
|
dbuf_putc(&s->byte_code, op);
|
||||||
@@ -2183,62 +2207,46 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
|||||||
if (dbuf_insert(&s->byte_code, last_atom_start, 11 + add_zero_advance_check * 2))
|
if (dbuf_insert(&s->byte_code, last_atom_start, 11 + add_zero_advance_check * 2))
|
||||||
goto out_of_memory;
|
goto out_of_memory;
|
||||||
pos = last_atom_start;
|
pos = last_atom_start;
|
||||||
|
s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
|
||||||
|
put_u32(s->byte_code.buf + pos, 6 + add_zero_advance_check * 2 + len + 10);
|
||||||
|
pos += 4;
|
||||||
|
|
||||||
s->byte_code.buf[pos++] = REOP_push_i32;
|
s->byte_code.buf[pos++] = REOP_push_i32;
|
||||||
s->byte_code.buf[pos++] = 0;
|
s->byte_code.buf[pos++] = 0;
|
||||||
put_u32(s->byte_code.buf + pos, quant_max);
|
put_u32(s->byte_code.buf + pos, quant_max);
|
||||||
pos += 4;
|
pos += 4;
|
||||||
|
last_atom_start = pos;
|
||||||
s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
|
|
||||||
put_u32(s->byte_code.buf + pos, len + 6 + add_zero_advance_check * 2 * 2);
|
|
||||||
pos += 4;
|
|
||||||
if (add_zero_advance_check) {
|
if (add_zero_advance_check) {
|
||||||
s->byte_code.buf[pos++] = REOP_push_char_pos;
|
s->byte_code.buf[pos++] = REOP_push_char_pos;
|
||||||
s->byte_code.buf[pos++] = 0;
|
s->byte_code.buf[pos++] = 0;
|
||||||
re_emit_op_u8(s, REOP_check_advance, 0);
|
|
||||||
}
|
}
|
||||||
re_emit_goto_u8(s, REOP_loop, 0, last_atom_start + 6);
|
re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max, last_atom_start);
|
||||||
}
|
}
|
||||||
} else if (quant_min == 1 && quant_max == INT32_MAX &&
|
} else if (quant_min == 1 && quant_max == INT32_MAX &&
|
||||||
!add_zero_advance_check) {
|
!add_zero_advance_check) {
|
||||||
re_emit_goto(s, REOP_split_next_first - greedy,
|
re_emit_goto(s, REOP_split_next_first - greedy,
|
||||||
last_atom_start);
|
last_atom_start);
|
||||||
} else {
|
} else {
|
||||||
if (quant_min == 1) {
|
if (quant_min == quant_max)
|
||||||
/* nothing to add */
|
add_zero_advance_check = FALSE;
|
||||||
} else {
|
if (dbuf_insert(&s->byte_code, last_atom_start, 6 + add_zero_advance_check * 2))
|
||||||
if (dbuf_insert(&s->byte_code, last_atom_start, 6))
|
goto out_of_memory;
|
||||||
goto out_of_memory;
|
/* Note: we assume the string length is < INT32_MAX */
|
||||||
s->byte_code.buf[last_atom_start++] = REOP_push_i32;
|
pos = last_atom_start;
|
||||||
s->byte_code.buf[last_atom_start++] = 0;
|
s->byte_code.buf[pos++] = REOP_push_i32;
|
||||||
put_u32(s->byte_code.buf + last_atom_start, quant_min);
|
s->byte_code.buf[pos++] = 0;
|
||||||
last_atom_start += 4;
|
put_u32(s->byte_code.buf + pos, quant_max);
|
||||||
re_emit_goto_u8(s, REOP_loop, 0, last_atom_start);
|
pos += 4;
|
||||||
|
last_atom_start = pos;
|
||||||
|
if (add_zero_advance_check) {
|
||||||
|
s->byte_code.buf[pos++] = REOP_push_char_pos;
|
||||||
|
s->byte_code.buf[pos++] = 0;
|
||||||
}
|
}
|
||||||
if (quant_max == INT32_MAX) {
|
if (quant_min == quant_max) {
|
||||||
pos = s->byte_code.size;
|
/* a simple loop is enough */
|
||||||
re_emit_op_u32(s, REOP_split_goto_first + greedy,
|
re_emit_goto_u8(s, REOP_loop, 0, last_atom_start);
|
||||||
len + 5 + add_zero_advance_check * 2 * 2);
|
} else {
|
||||||
if (add_zero_advance_check)
|
re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max - quant_min, last_atom_start);
|
||||||
re_emit_op_u8(s, REOP_push_char_pos, 0);
|
|
||||||
/* copy the atom */
|
|
||||||
dbuf_put_self(&s->byte_code, last_atom_start, len);
|
|
||||||
if (add_zero_advance_check)
|
|
||||||
re_emit_op_u8(s, REOP_check_advance, 0);
|
|
||||||
re_emit_goto(s, REOP_goto, pos);
|
|
||||||
} else if (quant_max > quant_min) {
|
|
||||||
re_emit_op_u8(s, REOP_push_i32, 0);
|
|
||||||
dbuf_put_u32(&s->byte_code, quant_max - quant_min);
|
|
||||||
|
|
||||||
pos = s->byte_code.size;
|
|
||||||
re_emit_op_u32(s, REOP_split_goto_first + greedy,
|
|
||||||
len + 6 + add_zero_advance_check * 2 * 2);
|
|
||||||
if (add_zero_advance_check)
|
|
||||||
re_emit_op_u8(s, REOP_push_char_pos, 0);
|
|
||||||
/* copy the atom */
|
|
||||||
dbuf_put_self(&s->byte_code, last_atom_start, len);
|
|
||||||
if (add_zero_advance_check)
|
|
||||||
re_emit_op_u8(s, REOP_check_advance, 0);
|
|
||||||
re_emit_goto_u8(s, REOP_loop, 0, pos);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
last_atom_start = -1;
|
last_atom_start = -1;
|
||||||
@@ -2352,10 +2360,18 @@ static int compute_stack_size(uint8_t *bc_buf, int bc_buf_len)
|
|||||||
break;
|
break;
|
||||||
case REOP_check_advance:
|
case REOP_check_advance:
|
||||||
case REOP_loop:
|
case REOP_loop:
|
||||||
|
case REOP_loop_split_goto_first:
|
||||||
|
case REOP_loop_split_next_first:
|
||||||
assert(stack_size > 0);
|
assert(stack_size > 0);
|
||||||
stack_size--;
|
stack_size--;
|
||||||
bc_buf[pos + 1] = stack_size;
|
bc_buf[pos + 1] = stack_size;
|
||||||
break;
|
break;
|
||||||
|
case REOP_loop_check_adv_split_goto_first:
|
||||||
|
case REOP_loop_check_adv_split_next_first:
|
||||||
|
assert(stack_size >= 2);
|
||||||
|
stack_size -= 2;
|
||||||
|
bc_buf[pos + 1] = stack_size;
|
||||||
|
break;
|
||||||
case REOP_range:
|
case REOP_range:
|
||||||
case REOP_range_i:
|
case REOP_range_i:
|
||||||
val = get_u16(bc_buf + pos + 1);
|
val = get_u16(bc_buf + pos + 1);
|
||||||
@@ -2956,6 +2972,56 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case REOP_loop_split_goto_first:
|
||||||
|
case REOP_loop_split_next_first:
|
||||||
|
case REOP_loop_check_adv_split_goto_first:
|
||||||
|
case REOP_loop_check_adv_split_next_first:
|
||||||
|
{
|
||||||
|
const uint8_t *pc1;
|
||||||
|
uint32_t val2, limit;
|
||||||
|
idx = pc[0];
|
||||||
|
limit = get_u32(pc + 1);
|
||||||
|
val = get_u32(pc + 5);
|
||||||
|
pc += 9;
|
||||||
|
|
||||||
|
/* decrement the counter */
|
||||||
|
val2 = (uintptr_t)aux_stack[idx] - 1;
|
||||||
|
SAVE_AUX_STACK(idx, (void *)(uintptr_t)val2);
|
||||||
|
|
||||||
|
if (val2 > limit) {
|
||||||
|
/* normal loop if counter > limit */
|
||||||
|
pc += (int)val;
|
||||||
|
if (lre_poll_timeout(s))
|
||||||
|
return LRE_RET_TIMEOUT;
|
||||||
|
} else {
|
||||||
|
/* check advance */
|
||||||
|
if ((opcode == REOP_loop_check_adv_split_goto_first ||
|
||||||
|
opcode == REOP_loop_check_adv_split_next_first) &&
|
||||||
|
aux_stack[idx + 1] == cptr &&
|
||||||
|
val2 != limit) {
|
||||||
|
goto no_match;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* otherwise conditional split */
|
||||||
|
if (val2 != 0) {
|
||||||
|
if (opcode == REOP_loop_split_next_first ||
|
||||||
|
opcode == REOP_loop_check_adv_split_next_first) {
|
||||||
|
pc1 = pc + (int)val;
|
||||||
|
} else {
|
||||||
|
pc1 = pc;
|
||||||
|
pc = pc + (int)val;
|
||||||
|
}
|
||||||
|
CHECK_STACK_SPACE(3);
|
||||||
|
sp[0].ptr = (uint8_t *)pc1;
|
||||||
|
sp[1].ptr = (uint8_t *)cptr;
|
||||||
|
sp[2].bp.val = bp - s->stack_buf;
|
||||||
|
sp[2].bp.type = RE_EXEC_STATE_SPLIT;
|
||||||
|
sp += 3;
|
||||||
|
bp = sp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
case REOP_push_char_pos:
|
case REOP_push_char_pos:
|
||||||
idx = pc[0];
|
idx = pc[0];
|
||||||
pc++;
|
pc++;
|
||||||
|
|||||||
Reference in New Issue
Block a user