regexp: cosmetic: make it clearer that there is now a set of registers instead of an auxiliary stack

This commit is contained in:
Fabrice Bellard
2025-11-29 13:04:47 +01:00
parent 371c06e359
commit 47aac8b2a8
2 changed files with 60 additions and 59 deletions

View File

@@ -45,11 +45,11 @@ DEF(save_start, 2) /* save start position */
DEF(save_end, 2) /* save end position, must come after save_start */ DEF(save_end, 2) /* save end position, must come after save_start */
DEF(save_reset, 3) /* reset save positions */ DEF(save_reset, 3) /* reset save positions */
DEF(loop, 6) /* decrement the top of the stack and goto if != 0 */ DEF(loop, 6) /* decrement the top of the stack and goto if != 0 */
DEF(loop_split_goto_first, 10) DEF(loop_split_goto_first, 10) /* loop and then split */
DEF(loop_split_next_first, 10) DEF(loop_split_next_first, 10)
DEF(loop_check_adv_split_goto_first, 10) DEF(loop_check_adv_split_goto_first, 10) /* loop and then check advance and split */
DEF(loop_check_adv_split_next_first, 10) DEF(loop_check_adv_split_next_first, 10)
DEF(push_i32, 6) /* push integer on the stack */ DEF(set_i32, 6) /* store the immediate value to a register */
DEF(word_boundary, 1) DEF(word_boundary, 1)
DEF(word_boundary_i, 1) DEF(word_boundary_i, 1)
DEF(not_word_boundary, 1) DEF(not_word_boundary, 1)
@@ -64,8 +64,8 @@ DEF(range32, 3) /* variable length */
DEF(range32_i, 3) /* variable length */ DEF(range32_i, 3) /* variable length */
DEF(lookahead, 5) DEF(lookahead, 5)
DEF(negative_lookahead, 5) /* must come after */ DEF(negative_lookahead, 5) /* must come after */
DEF(push_char_pos, 2) /* push the character position on the stack */ DEF(set_char_pos, 2) /* store the character position to a register */
DEF(check_advance, 2) /* pop one stack element and check that it is different from the character position */ DEF(check_advance, 2) /* check that the register is different from the character position */
DEF(prev, 1) /* go to the previous char */ DEF(prev, 1) /* go to the previous char */
#endif /* DEF */ #endif /* DEF */

View File

@@ -55,7 +55,7 @@ typedef enum {
} REOPCodeEnum; } REOPCodeEnum;
#define CAPTURE_COUNT_MAX 255 #define CAPTURE_COUNT_MAX 255
#define STACK_SIZE_MAX 255 #define REGISTER_COUNT_MAX 255
/* must be large enough to have a negligible runtime cost and small /* must be large enough to have a negligible runtime cost and small
enough to call the interrupt callback often. */ enough to call the interrupt callback often. */
#define INTERRUPT_COUNTER_INIT 10000 #define INTERRUPT_COUNTER_INIT 10000
@@ -107,7 +107,7 @@ static const REOpCode reopcode_info[REOP_COUNT] = {
#define RE_HEADER_FLAGS 0 #define RE_HEADER_FLAGS 0
#define RE_HEADER_CAPTURE_COUNT 2 #define RE_HEADER_CAPTURE_COUNT 2
#define RE_HEADER_STACK_SIZE 3 #define RE_HEADER_REGISTER_COUNT 3
#define RE_HEADER_BYTECODE_LEN 4 #define RE_HEADER_BYTECODE_LEN 4
#define RE_HEADER_LEN 8 #define RE_HEADER_LEN 8
@@ -468,8 +468,8 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
re_flags = lre_get_flags(buf); re_flags = lre_get_flags(buf);
bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN); bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
assert(bc_len + RE_HEADER_LEN <= buf_len); assert(bc_len + RE_HEADER_LEN <= buf_len);
printf("flags: 0x%x capture_count=%d aux_stack_size=%d\n", printf("flags: 0x%x capture_count=%d reg_count=%d\n",
re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]); re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_REGISTER_COUNT]);
if (re_flags & LRE_FLAG_NAMED_GROUPS) { if (re_flags & LRE_FLAG_NAMED_GROUPS) {
const char *p; const char *p;
p = (char *)buf + RE_HEADER_LEN + bc_len; p = (char *)buf + RE_HEADER_LEN + bc_len;
@@ -530,7 +530,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
val2 = buf[pos + 1]; val2 = buf[pos + 1];
val = get_u32(buf + pos + 2); val = get_u32(buf + pos + 2);
val += (pos + 6); val += (pos + 6);
printf(" %u, %u", val2, val); printf(" r%u, %u", val2, val);
break; break;
case REOP_loop_split_goto_first: case REOP_loop_split_goto_first:
case REOP_loop_split_next_first: case REOP_loop_split_next_first:
@@ -542,7 +542,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
limit = get_u32(buf + pos + 2); limit = get_u32(buf + pos + 2);
val = get_u32(buf + pos + 6); val = get_u32(buf + pos + 6);
val += (pos + 10); val += (pos + 10);
printf(" %u, %u, %u", val2, limit, val); printf(" r%u, %u, %u", val2, limit, val);
} }
break; break;
case REOP_save_start: case REOP_save_start:
@@ -556,15 +556,15 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
case REOP_save_reset: case REOP_save_reset:
printf(" %u %u", buf[pos + 1], buf[pos + 2]); printf(" %u %u", buf[pos + 1], buf[pos + 2]);
break; break;
case REOP_push_i32: case REOP_set_i32:
val = buf[pos + 1]; val = buf[pos + 1];
val2 = get_u32(buf + pos + 2); val2 = get_u32(buf + pos + 2);
printf(" %u, %d", val, val2); printf(" r%u, %d", val, val2);
break; break;
case REOP_push_char_pos: case REOP_set_char_pos:
case REOP_check_advance: case REOP_check_advance:
val = buf[pos + 1]; val = buf[pos + 1];
printf(" %u", val); printf(" r%u", val);
break; break;
case REOP_range: case REOP_range:
case REOP_range_i: case REOP_range_i:
@@ -1570,8 +1570,8 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
case REOP_line_start_m: case REOP_line_start_m:
case REOP_line_end: case REOP_line_end:
case REOP_line_end_m: case REOP_line_end_m:
case REOP_push_i32: case REOP_set_i32:
case REOP_push_char_pos: case REOP_set_char_pos:
case REOP_word_boundary: case REOP_word_boundary:
case REOP_word_boundary_i: case REOP_word_boundary_i:
case REOP_not_word_boundary: case REOP_not_word_boundary:
@@ -2197,7 +2197,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
put_u32(s->byte_code.buf + last_atom_start + 1, put_u32(s->byte_code.buf + last_atom_start + 1,
len + 5 * has_goto + add_zero_advance_check * 2 * 2); len + 5 * has_goto + add_zero_advance_check * 2 * 2);
if (add_zero_advance_check) { if (add_zero_advance_check) {
s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos; s->byte_code.buf[last_atom_start + 1 + 4] = REOP_set_char_pos;
s->byte_code.buf[last_atom_start + 1 + 4 + 1] = 0; s->byte_code.buf[last_atom_start + 1 + 4 + 1] = 0;
re_emit_op_u8(s, REOP_check_advance, 0); re_emit_op_u8(s, REOP_check_advance, 0);
} }
@@ -2211,13 +2211,13 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
put_u32(s->byte_code.buf + pos, 6 + add_zero_advance_check * 2 + len + 10); put_u32(s->byte_code.buf + pos, 6 + add_zero_advance_check * 2 + len + 10);
pos += 4; pos += 4;
s->byte_code.buf[pos++] = REOP_push_i32; s->byte_code.buf[pos++] = REOP_set_i32;
s->byte_code.buf[pos++] = 0; s->byte_code.buf[pos++] = 0;
put_u32(s->byte_code.buf + pos, quant_max); put_u32(s->byte_code.buf + pos, quant_max);
pos += 4; pos += 4;
last_atom_start = pos; last_atom_start = pos;
if (add_zero_advance_check) { if (add_zero_advance_check) {
s->byte_code.buf[pos++] = REOP_push_char_pos; s->byte_code.buf[pos++] = REOP_set_char_pos;
s->byte_code.buf[pos++] = 0; s->byte_code.buf[pos++] = 0;
} }
re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max, last_atom_start); re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max, last_atom_start);
@@ -2233,13 +2233,13 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
goto out_of_memory; goto out_of_memory;
/* Note: we assume the string length is < INT32_MAX */ /* Note: we assume the string length is < INT32_MAX */
pos = last_atom_start; pos = last_atom_start;
s->byte_code.buf[pos++] = REOP_push_i32; s->byte_code.buf[pos++] = REOP_set_i32;
s->byte_code.buf[pos++] = 0; s->byte_code.buf[pos++] = 0;
put_u32(s->byte_code.buf + pos, quant_max); put_u32(s->byte_code.buf + pos, quant_max);
pos += 4; pos += 4;
last_atom_start = pos; last_atom_start = pos;
if (add_zero_advance_check) { if (add_zero_advance_check) {
s->byte_code.buf[pos++] = REOP_push_char_pos; s->byte_code.buf[pos++] = REOP_set_char_pos;
s->byte_code.buf[pos++] = 0; s->byte_code.buf[pos++] = 0;
} }
if (quant_min == quant_max) { if (quant_min == quant_max) {
@@ -2330,9 +2330,9 @@ static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
return 0; return 0;
} }
/* the control flow is recursive so the analysis can be linear. As a /* Allocate the registers as a stack. The control flow is recursive so
side effect, the auxiliary stack addresses are computed. */ the analysis can be linear. */
static int compute_stack_size(uint8_t *bc_buf, int bc_buf_len) static int compute_register_count(uint8_t *bc_buf, int bc_buf_len)
{ {
int stack_size, stack_size_max, pos, opcode, len; int stack_size, stack_size_max, pos, opcode, len;
uint32_t val; uint32_t val;
@@ -2348,12 +2348,12 @@ static int compute_stack_size(uint8_t *bc_buf, int bc_buf_len)
assert(opcode < REOP_COUNT); assert(opcode < REOP_COUNT);
assert((pos + len) <= bc_buf_len); assert((pos + len) <= bc_buf_len);
switch(opcode) { switch(opcode) {
case REOP_push_i32: case REOP_set_i32:
case REOP_push_char_pos: case REOP_set_char_pos:
bc_buf[pos + 1] = stack_size; bc_buf[pos + 1] = stack_size;
stack_size++; stack_size++;
if (stack_size > stack_size_max) { if (stack_size > stack_size_max) {
if (stack_size > STACK_SIZE_MAX) if (stack_size > REGISTER_COUNT_MAX)
return -1; return -1;
stack_size_max = stack_size; stack_size_max = stack_size;
} }
@@ -2408,7 +2408,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
void *opaque) void *opaque)
{ {
REParseState s_s, *s = &s_s; REParseState s_s, *s = &s_s;
int stack_size; int register_count;
BOOL is_sticky; BOOL is_sticky;
memset(s, 0, sizeof(*s)); memset(s, 0, sizeof(*s));
@@ -2469,14 +2469,14 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
goto error; goto error;
} }
stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size); register_count = compute_register_count(s->byte_code.buf, s->byte_code.size);
if (stack_size < 0) { if (register_count < 0) {
re_parse_error(s, "too many imbricated quantifiers"); re_parse_error(s, "too many imbricated quantifiers");
goto error; goto error;
} }
s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count; s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size; s->byte_code.buf[RE_HEADER_REGISTER_COUNT] = register_count;
put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN, put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
s->byte_code.size - RE_HEADER_LEN); s->byte_code.size - RE_HEADER_LEN);
@@ -2620,7 +2620,6 @@ typedef struct {
/* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */ /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
int cbuf_type; int cbuf_type;
int capture_count; int capture_count;
int stack_size_max;
BOOL is_unicode; BOOL is_unicode;
int interrupt_counter; int interrupt_counter;
void *opaque; /* used for stack overflow check */ void *opaque; /* used for stack overflow check */
@@ -2665,7 +2664,7 @@ static no_inline int stack_realloc(REExecContext *s, size_t n)
/* return 1 if match, 0 if not match or < 0 if error. */ /* return 1 if match, 0 if not match or < 0 if error. */
static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
uint8_t **aux_stack, const uint8_t *pc, const uint8_t *cptr) uint8_t **regs, const uint8_t *pc, const uint8_t *cptr)
{ {
int opcode; int opcode;
int cbuf_type; int cbuf_type;
@@ -2705,7 +2704,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
} }
/* avoid saving the previous value if already saved */ /* avoid saving the previous value if already saved */
#define SAVE_AUX_STACK(idx, value) \ #define SAVE_REG(idx, value) \
{ \ { \
StackElem *sp1; \ StackElem *sp1; \
sp1 = sp; \ sp1 = sp; \
@@ -2717,12 +2716,12 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
} else { \ } else { \
CHECK_STACK_SPACE(2); \ CHECK_STACK_SPACE(2); \
sp[0].val = -(int)(idx + 1); \ sp[0].val = -(int)(idx + 1); \
sp[1].ptr = aux_stack[idx]; \ sp[1].ptr = regs[idx]; \
sp += 2; \ sp += 2; \
break; \ break; \
} \ } \
} \ } \
aux_stack[idx] = (value); \ regs[idx] = (value); \
} }
@@ -2747,13 +2746,13 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
REExecStateEnum type; REExecStateEnum type;
if (bp == s->stack_buf) if (bp == s->stack_buf)
return 0; return 0;
/* undo the modifications to capture[] and aux_stack[] */ /* undo the modifications to capture[] and regs[] */
while (sp > bp) { while (sp > bp) {
intptr_t idx2 = sp[-2].val; intptr_t idx2 = sp[-2].val;
if (idx2 >= 0) if (idx2 >= 0)
capture[idx2] = sp[-1].ptr; capture[idx2] = sp[-1].ptr;
else else
aux_stack[-idx2 - 1] = sp[-1].ptr; regs[-idx2 - 1] = sp[-1].ptr;
sp -= 2; sp -= 2;
} }
@@ -2804,13 +2803,13 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
for(;;) { for(;;) {
REExecStateEnum type; REExecStateEnum type;
type = bp[-1].bp.type; type = bp[-1].bp.type;
/* undo the modifications to capture[] and aux_stack[] */ /* undo the modifications to capture[] and regs[] */
while (sp > bp) { while (sp > bp) {
intptr_t idx2 = sp[-2].val; intptr_t idx2 = sp[-2].val;
if (idx2 >= 0) if (idx2 >= 0)
capture[idx2] = sp[-1].ptr; capture[idx2] = sp[-1].ptr;
else else
aux_stack[-idx2 - 1] = sp[-1].ptr; regs[-idx2 - 1] = sp[-1].ptr;
sp -= 2; sp -= 2;
} }
pc = sp[-3].ptr; pc = sp[-3].ptr;
@@ -2950,11 +2949,11 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
} }
} }
break; break;
case REOP_push_i32: case REOP_set_i32:
idx = pc[0]; idx = pc[0];
val = get_u32(pc + 1); val = get_u32(pc + 1);
pc += 5; pc += 5;
SAVE_AUX_STACK(idx, (void *)(uintptr_t)val); SAVE_REG(idx, (void *)(uintptr_t)val);
break; break;
case REOP_loop: case REOP_loop:
{ {
@@ -2963,8 +2962,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
val = get_u32(pc + 1); val = get_u32(pc + 1);
pc += 5; pc += 5;
val2 = (uintptr_t)aux_stack[idx] - 1; val2 = (uintptr_t)regs[idx] - 1;
SAVE_AUX_STACK(idx, (void *)(uintptr_t)val2); SAVE_REG(idx, (void *)(uintptr_t)val2);
if (val2 != 0) { if (val2 != 0) {
pc += (int)val; pc += (int)val;
if (lre_poll_timeout(s)) if (lre_poll_timeout(s))
@@ -2985,8 +2984,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
pc += 9; pc += 9;
/* decrement the counter */ /* decrement the counter */
val2 = (uintptr_t)aux_stack[idx] - 1; val2 = (uintptr_t)regs[idx] - 1;
SAVE_AUX_STACK(idx, (void *)(uintptr_t)val2); SAVE_REG(idx, (void *)(uintptr_t)val2);
if (val2 > limit) { if (val2 > limit) {
/* normal loop if counter > limit */ /* normal loop if counter > limit */
@@ -2997,7 +2996,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
/* check advance */ /* check advance */
if ((opcode == REOP_loop_check_adv_split_goto_first || if ((opcode == REOP_loop_check_adv_split_goto_first ||
opcode == REOP_loop_check_adv_split_next_first) && opcode == REOP_loop_check_adv_split_next_first) &&
aux_stack[idx + 1] == cptr && regs[idx + 1] == cptr &&
val2 != limit) { val2 != limit) {
goto no_match; goto no_match;
} }
@@ -3022,15 +3021,15 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
} }
} }
break; break;
case REOP_push_char_pos: case REOP_set_char_pos:
idx = pc[0]; idx = pc[0];
pc++; pc++;
SAVE_AUX_STACK(idx, (uint8_t *)cptr); SAVE_REG(idx, (uint8_t *)cptr);
break; break;
case REOP_check_advance: case REOP_check_advance:
idx = pc[0]; idx = pc[0];
pc++; pc++;
if (aux_stack[idx] == cptr) if (regs[idx] == cptr)
goto no_match; goto no_match;
break; break;
case REOP_word_boundary: case REOP_word_boundary:
@@ -3212,14 +3211,13 @@ int lre_exec(uint8_t **capture,
int cbuf_type, void *opaque) int cbuf_type, void *opaque)
{ {
REExecContext s_s, *s = &s_s; REExecContext s_s, *s = &s_s;
int re_flags, i, ret; int re_flags, i, ret, register_count;
uint8_t **aux_stack; uint8_t **regs;
const uint8_t *cptr; const uint8_t *cptr;
re_flags = lre_get_flags(bc_buf); re_flags = lre_get_flags(bc_buf);
s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0; s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0;
s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
s->cbuf = cbuf; s->cbuf = cbuf;
s->cbuf_end = cbuf + (clen << cbuf_type); s->cbuf_end = cbuf + (clen << cbuf_type);
s->cbuf_type = cbuf_type; s->cbuf_type = cbuf_type;
@@ -3233,7 +3231,10 @@ int lre_exec(uint8_t **capture,
for(i = 0; i < s->capture_count * 2; i++) for(i = 0; i < s->capture_count * 2; i++)
capture[i] = NULL; capture[i] = NULL;
aux_stack = alloca(s->stack_size_max * sizeof(aux_stack[0])); /* XXX: modify the API so that the registers are allocated after
the captures to suppress some tests */
register_count = bc_buf[RE_HEADER_REGISTER_COUNT];
regs = alloca(register_count * sizeof(regs[0]));
cptr = cbuf + (cindex << cbuf_type); cptr = cbuf + (cindex << cbuf_type);
if (0 < cindex && cindex < clen && s->cbuf_type == 2) { if (0 < cindex && cindex < clen && s->cbuf_type == 2) {
@@ -3243,7 +3244,7 @@ int lre_exec(uint8_t **capture,
} }
} }
ret = lre_exec_backtrack(s, capture, aux_stack, bc_buf + RE_HEADER_LEN, ret = lre_exec_backtrack(s, capture, regs, bc_buf + RE_HEADER_LEN,
cptr); cptr);
if (s->stack_buf != s->static_stack_buf) if (s->stack_buf != s->static_stack_buf)
lre_realloc(s->opaque, s->stack_buf, 0); lre_realloc(s->opaque, s->stack_buf, 0);