mirror of
https://github.com/bellard/quickjs.git
synced 2025-09-30 23:07:42 +03:00
Improve surrogate handling readability
- add inline function to test and convert surrogates is_surrogate(c), is_hi_surrogate(c), is_lo_surrogate(c), get_hi_surrogate(c), get_lo_surrogate(c), from_surrogate(hi, lo) - use names for BC header offsets and lengths in libregexp.c - remove strict aliasing violations in `lre_exec_backtrack()` - pass all context variables to XXX_CHAR macros in `lre_exec_backtrack()`
This commit is contained in:
159
libregexp.c
159
libregexp.c
@@ -100,6 +100,7 @@ static const REOpCode reopcode_info[REOP_COUNT] = {
|
||||
#define RE_HEADER_FLAGS 0
|
||||
#define RE_HEADER_CAPTURE_COUNT 1
|
||||
#define RE_HEADER_STACK_SIZE 2
|
||||
#define RE_HEADER_BYTECODE_LEN 3
|
||||
|
||||
#define RE_HEADER_LEN 7
|
||||
|
||||
@@ -224,16 +225,16 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
|
||||
|
||||
assert(buf_len >= RE_HEADER_LEN);
|
||||
|
||||
re_flags = buf[0];
|
||||
bc_len = get_u32(buf + 3);
|
||||
re_flags = lre_get_flags(buf);
|
||||
bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
|
||||
assert(bc_len + RE_HEADER_LEN <= buf_len);
|
||||
printf("flags: 0x%x capture_count=%d stack_size=%d\n",
|
||||
re_flags, buf[1], buf[2]);
|
||||
re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]);
|
||||
if (re_flags & LRE_FLAG_NAMED_GROUPS) {
|
||||
const char *p;
|
||||
p = (char *)buf + RE_HEADER_LEN + bc_len;
|
||||
printf("named groups: ");
|
||||
for(i = 1; i < buf[1]; i++) {
|
||||
for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
|
||||
if (i != 1)
|
||||
printf(",");
|
||||
printf("<%s>", p);
|
||||
@@ -494,7 +495,7 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16)
|
||||
}
|
||||
c = (c << 4) | h;
|
||||
}
|
||||
if (c >= 0xd800 && c < 0xdc00 &&
|
||||
if (is_hi_surrogate(c) &&
|
||||
allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
|
||||
/* convert an escaped surrogate pair into a
|
||||
unicode char */
|
||||
@@ -505,9 +506,9 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16)
|
||||
break;
|
||||
c1 = (c1 << 4) | h;
|
||||
}
|
||||
if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) {
|
||||
if (i == 4 && is_lo_surrogate(c1)) {
|
||||
p += 6;
|
||||
c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
|
||||
c = from_surrogate(c, c1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -936,7 +937,7 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
|
||||
case REOP_backward_back_reference:
|
||||
break;
|
||||
default:
|
||||
/* safe behvior: we cannot predict the outcome */
|
||||
/* safe behavior: we cannot predict the outcome */
|
||||
return TRUE;
|
||||
}
|
||||
pos += len;
|
||||
@@ -1005,10 +1006,10 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
|
||||
break;
|
||||
} else if (c >= 128) {
|
||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
|
||||
if (c >= 0xD800 && c <= 0xDBFF) {
|
||||
if (is_hi_surrogate(c)) {
|
||||
d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
|
||||
if (d >= 0xDC00 && d <= 0xDFFF) {
|
||||
c = 0x10000 + 0x400 * (c - 0xD800) + (d - 0xDC00);
|
||||
if (is_lo_surrogate(d)) {
|
||||
c = from_surrogate(c, d);
|
||||
p = p1;
|
||||
}
|
||||
}
|
||||
@@ -1116,9 +1117,10 @@ static int find_group_name(REParseState *s, const char *name)
|
||||
size_t len, name_len;
|
||||
int capture_index;
|
||||
|
||||
name_len = strlen(name);
|
||||
p = (char *)s->group_names.buf;
|
||||
if (!p) return -1;
|
||||
buf_end = (char *)s->group_names.buf + s->group_names.size;
|
||||
name_len = strlen(name);
|
||||
capture_index = 1;
|
||||
while (p < buf_end) {
|
||||
len = strlen(p);
|
||||
@@ -1813,7 +1815,8 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
|
||||
|
||||
s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
|
||||
s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
|
||||
put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN);
|
||||
put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
|
||||
s->byte_code.size - RE_HEADER_LEN);
|
||||
|
||||
/* add the named groups if needed */
|
||||
if (s->group_names.size > (s->capture_count - 1)) {
|
||||
@@ -1844,93 +1847,86 @@ static BOOL is_word_char(uint32_t c)
|
||||
(c == '_'));
|
||||
}
|
||||
|
||||
#define GET_CHAR(c, cptr, cbuf_end) \
|
||||
#define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \
|
||||
do { \
|
||||
if (cbuf_type == 0) { \
|
||||
c = *cptr++; \
|
||||
} else { \
|
||||
uint32_t __c1; \
|
||||
c = *(uint16_t *)cptr; \
|
||||
cptr += 2; \
|
||||
if (c >= 0xd800 && c < 0xdc00 && \
|
||||
cbuf_type == 2 && cptr < cbuf_end) { \
|
||||
__c1 = *(uint16_t *)cptr; \
|
||||
if (__c1 >= 0xdc00 && __c1 < 0xe000) { \
|
||||
c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
|
||||
cptr += 2; \
|
||||
const uint16_t *_p = (const uint16_t *)cptr; \
|
||||
const uint16_t *_end = (const uint16_t *)cbuf_end; \
|
||||
c = *_p++; \
|
||||
if (is_hi_surrogate(c) && cbuf_type == 2) { \
|
||||
if (_p < _end && is_lo_surrogate(*_p)) { \
|
||||
c = from_surrogate(c, *_p++); \
|
||||
} \
|
||||
} \
|
||||
cptr = (const void *)_p; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define PEEK_CHAR(c, cptr, cbuf_end) \
|
||||
#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type) \
|
||||
do { \
|
||||
if (cbuf_type == 0) { \
|
||||
c = cptr[0]; \
|
||||
} else { \
|
||||
uint32_t __c1; \
|
||||
c = ((uint16_t *)cptr)[0]; \
|
||||
if (c >= 0xd800 && c < 0xdc00 && \
|
||||
cbuf_type == 2 && (cptr + 2) < cbuf_end) { \
|
||||
__c1 = ((uint16_t *)cptr)[1]; \
|
||||
if (__c1 >= 0xdc00 && __c1 < 0xe000) { \
|
||||
c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
|
||||
const uint16_t *_p = (const uint16_t *)cptr; \
|
||||
const uint16_t *_end = (const uint16_t *)cbuf_end; \
|
||||
c = *_p++; \
|
||||
if (is_hi_surrogate(c) && cbuf_type == 2) { \
|
||||
if (_p < _end && is_lo_surrogate(*_p)) { \
|
||||
c = from_surrogate(c, *_p); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \
|
||||
#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
|
||||
do { \
|
||||
if (cbuf_type == 0) { \
|
||||
c = cptr[-1]; \
|
||||
} else { \
|
||||
uint32_t __c1; \
|
||||
c = ((uint16_t *)cptr)[-1]; \
|
||||
if (c >= 0xdc00 && c < 0xe000 && \
|
||||
cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \
|
||||
__c1 = ((uint16_t *)cptr)[-2]; \
|
||||
if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
|
||||
c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
|
||||
const uint16_t *_p = (const uint16_t *)cptr - 1; \
|
||||
const uint16_t *_start = (const uint16_t *)cbuf_start; \
|
||||
c = *_p; \
|
||||
if (is_lo_surrogate(c) && cbuf_type == 2) { \
|
||||
if (_p > _start && is_hi_surrogate(_p[-1])) { \
|
||||
c = from_surrogate(*--_p, c); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define GET_PREV_CHAR(c, cptr, cbuf_start) \
|
||||
#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
|
||||
do { \
|
||||
if (cbuf_type == 0) { \
|
||||
cptr--; \
|
||||
c = cptr[0]; \
|
||||
} else { \
|
||||
uint32_t __c1; \
|
||||
cptr -= 2; \
|
||||
c = ((uint16_t *)cptr)[0]; \
|
||||
if (c >= 0xdc00 && c < 0xe000 && \
|
||||
cbuf_type == 2 && cptr > cbuf_start) { \
|
||||
__c1 = ((uint16_t *)cptr)[-1]; \
|
||||
if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
|
||||
cptr -= 2; \
|
||||
c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
|
||||
const uint16_t *_p = (const uint16_t *)cptr - 1; \
|
||||
const uint16_t *_start = (const uint16_t *)cbuf_start; \
|
||||
c = *_p; \
|
||||
if (is_lo_surrogate(c) && cbuf_type == 2) { \
|
||||
if (_p > _start && is_hi_surrogate(_p[-1])) { \
|
||||
c = from_surrogate(*--_p, c); \
|
||||
} \
|
||||
} \
|
||||
cptr = (const void *)_p; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define PREV_CHAR(cptr, cbuf_start) \
|
||||
#define PREV_CHAR(cptr, cbuf_start, cbuf_type) \
|
||||
do { \
|
||||
if (cbuf_type == 0) { \
|
||||
cptr--; \
|
||||
} else { \
|
||||
cptr -= 2; \
|
||||
if (cbuf_type == 2) { \
|
||||
c = ((uint16_t *)cptr)[0]; \
|
||||
if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \
|
||||
c = ((uint16_t *)cptr)[-1]; \
|
||||
if (c >= 0xd800 && c < 0xdc00) \
|
||||
cptr -= 2; \
|
||||
const uint16_t *_p = (const uint16_t *)cptr - 1; \
|
||||
const uint16_t *_start = (const uint16_t *)cbuf_start; \
|
||||
if (is_lo_surrogate(*_p) && cbuf_type == 2) { \
|
||||
if (_p > _start && is_hi_surrogate(_p[-1])) { \
|
||||
--_p; \
|
||||
} \
|
||||
} \
|
||||
cptr = (const void *)_p; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -2070,7 +2066,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
||||
/* go backward */
|
||||
char_count = get_u32(pc + 12);
|
||||
for(i = 0; i < char_count; i++) {
|
||||
PREV_CHAR(cptr, s->cbuf);
|
||||
PREV_CHAR(cptr, s->cbuf, cbuf_type);
|
||||
}
|
||||
pc = (pc + 16) + (int)get_u32(pc);
|
||||
rs->cptr = cptr;
|
||||
@@ -2105,7 +2101,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
||||
test_char:
|
||||
if (cptr >= cbuf_end)
|
||||
goto no_match;
|
||||
GET_CHAR(c, cptr, cbuf_end);
|
||||
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
|
||||
if (s->ignore_case) {
|
||||
c = lre_canonicalize(c, s->is_unicode);
|
||||
}
|
||||
@@ -2152,7 +2148,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
||||
break;
|
||||
if (!s->multi_line)
|
||||
goto no_match;
|
||||
PEEK_PREV_CHAR(c, cptr, s->cbuf);
|
||||
PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
|
||||
if (!is_line_terminator(c))
|
||||
goto no_match;
|
||||
break;
|
||||
@@ -2161,21 +2157,21 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
||||
break;
|
||||
if (!s->multi_line)
|
||||
goto no_match;
|
||||
PEEK_CHAR(c, cptr, cbuf_end);
|
||||
PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
|
||||
if (!is_line_terminator(c))
|
||||
goto no_match;
|
||||
break;
|
||||
case REOP_dot:
|
||||
if (cptr == cbuf_end)
|
||||
goto no_match;
|
||||
GET_CHAR(c, cptr, cbuf_end);
|
||||
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
|
||||
if (is_line_terminator(c))
|
||||
goto no_match;
|
||||
break;
|
||||
case REOP_any:
|
||||
if (cptr == cbuf_end)
|
||||
goto no_match;
|
||||
GET_CHAR(c, cptr, cbuf_end);
|
||||
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
|
||||
break;
|
||||
case REOP_save_start:
|
||||
case REOP_save_end:
|
||||
@@ -2227,14 +2223,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
||||
if (cptr == s->cbuf) {
|
||||
v1 = FALSE;
|
||||
} else {
|
||||
PEEK_PREV_CHAR(c, cptr, s->cbuf);
|
||||
PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
|
||||
v1 = is_word_char(c);
|
||||
}
|
||||
/* current char */
|
||||
if (cptr >= cbuf_end) {
|
||||
v2 = FALSE;
|
||||
} else {
|
||||
PEEK_CHAR(c, cptr, cbuf_end);
|
||||
PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
|
||||
v2 = is_word_char(c);
|
||||
}
|
||||
if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
|
||||
@@ -2259,8 +2255,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
||||
while (cptr1 < cptr1_end) {
|
||||
if (cptr >= cbuf_end)
|
||||
goto no_match;
|
||||
GET_CHAR(c1, cptr1, cptr1_end);
|
||||
GET_CHAR(c2, cptr, cbuf_end);
|
||||
GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
|
||||
GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
|
||||
if (s->ignore_case) {
|
||||
c1 = lre_canonicalize(c1, s->is_unicode);
|
||||
c2 = lre_canonicalize(c2, s->is_unicode);
|
||||
@@ -2273,8 +2269,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
||||
while (cptr1 > cptr1_start) {
|
||||
if (cptr == s->cbuf)
|
||||
goto no_match;
|
||||
GET_PREV_CHAR(c1, cptr1, cptr1_start);
|
||||
GET_PREV_CHAR(c2, cptr, s->cbuf);
|
||||
GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
|
||||
GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
|
||||
if (s->ignore_case) {
|
||||
c1 = lre_canonicalize(c1, s->is_unicode);
|
||||
c2 = lre_canonicalize(c2, s->is_unicode);
|
||||
@@ -2294,7 +2290,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
||||
pc += 2;
|
||||
if (cptr >= cbuf_end)
|
||||
goto no_match;
|
||||
GET_CHAR(c, cptr, cbuf_end);
|
||||
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
|
||||
if (s->ignore_case) {
|
||||
c = lre_canonicalize(c, s->is_unicode);
|
||||
}
|
||||
@@ -2334,7 +2330,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
||||
pc += 2;
|
||||
if (cptr >= cbuf_end)
|
||||
goto no_match;
|
||||
GET_CHAR(c, cptr, cbuf_end);
|
||||
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
|
||||
if (s->ignore_case) {
|
||||
c = lre_canonicalize(c, s->is_unicode);
|
||||
}
|
||||
@@ -2366,7 +2362,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
||||
/* go to the previous char */
|
||||
if (cptr == s->cbuf)
|
||||
goto no_match;
|
||||
PREV_CHAR(cptr, s->cbuf);
|
||||
PREV_CHAR(cptr, s->cbuf, cbuf_type);
|
||||
break;
|
||||
case REOP_simple_greedy_quant:
|
||||
{
|
||||
@@ -2425,7 +2421,7 @@ int lre_exec(uint8_t **capture,
|
||||
int re_flags, i, alloca_size, ret;
|
||||
StackInt *stack_buf;
|
||||
|
||||
re_flags = bc_buf[RE_HEADER_FLAGS];
|
||||
re_flags = lre_get_flags(bc_buf);
|
||||
s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
|
||||
s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
|
||||
s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
|
||||
@@ -2472,8 +2468,8 @@ const char *lre_get_groupnames(const uint8_t *bc_buf)
|
||||
uint32_t re_bytecode_len;
|
||||
if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
|
||||
return NULL;
|
||||
re_bytecode_len = get_u32(bc_buf + 3);
|
||||
return (const char *)(bc_buf + 7 + re_bytecode_len);
|
||||
re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
|
||||
return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
|
||||
}
|
||||
|
||||
#ifdef TEST
|
||||
@@ -2490,25 +2486,26 @@ void *lre_realloc(void *opaque, void *ptr, size_t size)
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int len, ret, i;
|
||||
int len, flags, ret, i;
|
||||
uint8_t *bc;
|
||||
char error_msg[64];
|
||||
uint8_t *capture[CAPTURE_COUNT_MAX * 2];
|
||||
const char *input;
|
||||
int input_len, capture_count;
|
||||
|
||||
if (argc < 3) {
|
||||
printf("usage: %s regexp input\n", argv[0]);
|
||||
exit(1);
|
||||
if (argc < 4) {
|
||||
printf("usage: %s regexp flags input\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
flags = atoi(argv[2]);
|
||||
bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
|
||||
strlen(argv[1]), 0, NULL);
|
||||
strlen(argv[1]), flags, NULL);
|
||||
if (!bc) {
|
||||
fprintf(stderr, "error: %s\n", error_msg);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
input = argv[2];
|
||||
input = argv[3];
|
||||
input_len = strlen(input);
|
||||
|
||||
ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
|
||||
|
Reference in New Issue
Block a user