mirror of
https://github.com/bellard/quickjs.git
synced 2025-09-30 15:04:24 +03:00
Improve unicode table handling (#286)
- Document table and index formats - Add size statistics - Fix UBSAN issue in `get_le24()` Fixes #285
This commit is contained in:
114
unicode_gen.c
114
unicode_gen.c
@@ -33,6 +33,11 @@
|
||||
|
||||
#include "cutils.h"
|
||||
|
||||
uint32_t total_tables;
|
||||
uint32_t total_table_bytes;
|
||||
uint32_t total_index;
|
||||
uint32_t total_index_bytes;
|
||||
|
||||
/* define it to be able to test unicode.c */
|
||||
//#define USE_TEST
|
||||
/* profile tests */
|
||||
@@ -1328,7 +1333,9 @@ void dump_case_conv_table(FILE *f)
|
||||
uint32_t v;
|
||||
const TableEntry *te;
|
||||
|
||||
fprintf(f, "static const uint32_t case_conv_table1[%u] = {", conv_table_len);
|
||||
total_tables++;
|
||||
total_table_bytes += conv_table_len * sizeof(uint32_t);
|
||||
fprintf(f, "static const uint32_t case_conv_table1[%d] = {", conv_table_len);
|
||||
for(i = 0; i < conv_table_len; i++) {
|
||||
if (i % 4 == 0)
|
||||
fprintf(f, "\n ");
|
||||
@@ -1341,7 +1348,9 @@ void dump_case_conv_table(FILE *f)
|
||||
}
|
||||
fprintf(f, "\n};\n\n");
|
||||
|
||||
fprintf(f, "static const uint8_t case_conv_table2[%u] = {", conv_table_len);
|
||||
total_tables++;
|
||||
total_table_bytes += conv_table_len;
|
||||
fprintf(f, "static const uint8_t case_conv_table2[%d] = {", conv_table_len);
|
||||
for(i = 0; i < conv_table_len; i++) {
|
||||
if (i % 8 == 0)
|
||||
fprintf(f, "\n ");
|
||||
@@ -1350,7 +1359,9 @@ void dump_case_conv_table(FILE *f)
|
||||
}
|
||||
fprintf(f, "\n};\n\n");
|
||||
|
||||
fprintf(f, "static const uint16_t case_conv_ext[%u] = {", ext_data_len);
|
||||
total_tables++;
|
||||
total_table_bytes += ext_data_len * sizeof(uint16_t);
|
||||
fprintf(f, "static const uint16_t case_conv_ext[%d] = {", ext_data_len);
|
||||
for(i = 0; i < ext_data_len; i++) {
|
||||
if (i % 8 == 0)
|
||||
fprintf(f, "\n ");
|
||||
@@ -1470,6 +1481,9 @@ void compute_internal_props(void)
|
||||
void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
|
||||
{
|
||||
int i;
|
||||
|
||||
total_tables++;
|
||||
total_table_bytes += len;
|
||||
fprintf(f, "static const uint8_t %s[%d] = {", cname, len);
|
||||
for(i = 0; i < len; i++) {
|
||||
if (i % 8 == 0)
|
||||
@@ -1479,9 +1493,26 @@ void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
|
||||
fprintf(f, "\n};\n\n");
|
||||
}
|
||||
|
||||
void dump_index_table(FILE *f, const char *cname, const uint8_t *tab, int len)
|
||||
{
|
||||
int i, code, offset;
|
||||
|
||||
total_index++;
|
||||
total_index_bytes += len;
|
||||
fprintf(f, "static const uint8_t %s[%d] = {\n", cname, len);
|
||||
for(i = 0; i < len; i += 3) {
|
||||
code = tab[i] + (tab[i+1] << 8) + ((tab[i+2] & 0x1f) << 16);
|
||||
offset = ((i / 3) + 1) * 32 + (tab[i+2] >> 5);
|
||||
fprintf(f, " 0x%02x, 0x%02x, 0x%02x,", tab[i], tab[i+1], tab[i+2]);
|
||||
fprintf(f, " // %6.5X at %d%s\n", code, offset,
|
||||
i == len - 3 ? " (upper bound)" : "");
|
||||
}
|
||||
fprintf(f, "};\n\n");
|
||||
}
|
||||
|
||||
#define PROP_BLOCK_LEN 32
|
||||
|
||||
void build_prop_table(FILE *f, int prop_index, BOOL add_index)
|
||||
void build_prop_table(FILE *f, const char *name, int prop_index, BOOL add_index)
|
||||
{
|
||||
int i, j, n, v, offset, code;
|
||||
DynBuf dbuf_s, *dbuf = &dbuf_s;
|
||||
@@ -1533,6 +1564,14 @@ void build_prop_table(FILE *f, int prop_index, BOOL add_index)
|
||||
block_end_pos += PROP_BLOCK_LEN;
|
||||
}
|
||||
|
||||
/* Compressed byte encoding:
|
||||
00..3F: 2 packed lengths: 3-bit + 3-bit
|
||||
40..5F: 5-bits plus extra byte for length
|
||||
60..7F: 5-bits plus 2 extra bytes for length
|
||||
80..FF: 7-bit length
|
||||
lengths must be incremented to get character count
|
||||
Ranges alternate between false and true return value.
|
||||
*/
|
||||
v = buf[i];
|
||||
code += v + 1;
|
||||
bit ^= 1;
|
||||
@@ -1573,7 +1612,7 @@ void build_prop_table(FILE *f, int prop_index, BOOL add_index)
|
||||
dump_byte_table(f, cname, dbuf->buf, dbuf->size);
|
||||
if (add_index) {
|
||||
snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]);
|
||||
dump_byte_table(f, cname, dbuf2->buf, dbuf2->size);
|
||||
dump_index_table(f, cname, dbuf2->buf, dbuf2->size);
|
||||
}
|
||||
|
||||
dbuf_free(dbuf);
|
||||
@@ -1583,10 +1622,10 @@ void build_prop_table(FILE *f, int prop_index, BOOL add_index)
|
||||
|
||||
void build_flags_tables(FILE *f)
|
||||
{
|
||||
build_prop_table(f, PROP_Cased1, TRUE);
|
||||
build_prop_table(f, PROP_Case_Ignorable, TRUE);
|
||||
build_prop_table(f, PROP_ID_Start, TRUE);
|
||||
build_prop_table(f, PROP_ID_Continue1, TRUE);
|
||||
build_prop_table(f, "Cased1", PROP_Cased1, TRUE);
|
||||
build_prop_table(f, "Case_Ignorable", PROP_Case_Ignorable, TRUE);
|
||||
build_prop_table(f, "ID_Start", PROP_ID_Start, TRUE);
|
||||
build_prop_table(f, "ID_Continue1", PROP_ID_Continue1, TRUE);
|
||||
}
|
||||
|
||||
void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
|
||||
@@ -1845,7 +1884,7 @@ void build_prop_list_table(FILE *f)
|
||||
i == PROP_ID_Continue1) {
|
||||
/* already generated */
|
||||
} else {
|
||||
build_prop_table(f, i, FALSE);
|
||||
build_prop_table(f, unicode_prop_name[i], i, FALSE);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1997,6 +2036,8 @@ void check_flags(void)
|
||||
|
||||
void build_cc_table(FILE *f)
|
||||
{
|
||||
// Compress combining class table
|
||||
// see: https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
|
||||
int i, cc, n, type, n1, block_end_pos;
|
||||
DynBuf dbuf_s, *dbuf = &dbuf_s;
|
||||
DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
|
||||
@@ -2055,6 +2096,13 @@ void build_cc_table(FILE *f)
|
||||
#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
|
||||
cw_start = dbuf->size;
|
||||
#endif
|
||||
/* Compressed run length encoding:
|
||||
- 2 high order bits are combining class type
|
||||
- 0:0, 1:230, 2:extra byte linear progression, 3:extra byte
|
||||
- 00..2F: range length (add 1)
|
||||
- 30..37: 3-bit range-length + 1 extra byte
|
||||
- 38..3F: 3-bit range-length + 2 extra byte
|
||||
*/
|
||||
if (n1 < 48) {
|
||||
dbuf_putc(dbuf, n1 | (type << 6));
|
||||
} else if (n1 < 48 + (1 << 11)) {
|
||||
@@ -2084,7 +2132,7 @@ void build_cc_table(FILE *f)
|
||||
dbuf_putc(dbuf1, v >> 16);
|
||||
|
||||
dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size);
|
||||
dump_byte_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size);
|
||||
dump_index_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size);
|
||||
|
||||
#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
|
||||
printf("CC table: size=%d (%d entries) [",
|
||||
@@ -2765,8 +2813,9 @@ void build_decompose_table(FILE *f)
|
||||
}
|
||||
#endif
|
||||
|
||||
fprintf(f, "static const uint32_t unicode_decomp_table1[%u] = {",
|
||||
array_len);
|
||||
total_tables++;
|
||||
total_table_bytes += array_len * sizeof(uint32_t);
|
||||
fprintf(f, "static const uint32_t unicode_decomp_table1[%d] = {", array_len);
|
||||
count = 0;
|
||||
for(i = 0; i <= code_max; i++) {
|
||||
de = &tab_de[i];
|
||||
@@ -2784,8 +2833,9 @@ void build_decompose_table(FILE *f)
|
||||
}
|
||||
fprintf(f, "\n};\n\n");
|
||||
|
||||
fprintf(f, "static const uint16_t unicode_decomp_table2[%u] = {",
|
||||
array_len);
|
||||
total_tables++;
|
||||
total_table_bytes += array_len * sizeof(uint16_t);
|
||||
fprintf(f, "static const uint16_t unicode_decomp_table2[%d] = {", array_len);
|
||||
count = 0;
|
||||
for(i = 0; i <= code_max; i++) {
|
||||
de = &tab_de[i];
|
||||
@@ -2798,8 +2848,9 @@ void build_decompose_table(FILE *f)
|
||||
}
|
||||
fprintf(f, "\n};\n\n");
|
||||
|
||||
fprintf(f, "static const uint8_t unicode_decomp_data[%u] = {",
|
||||
data_len);
|
||||
total_tables++;
|
||||
total_table_bytes += data_len;
|
||||
fprintf(f, "static const uint8_t unicode_decomp_data[%d] = {", data_len);
|
||||
for(i = 0; i < data_len; i++) {
|
||||
if (i % 8 == 0)
|
||||
fprintf(f, "\n ");
|
||||
@@ -2890,8 +2941,9 @@ void build_compose_table(FILE *f, const DecompEntry *tab_de)
|
||||
}
|
||||
#endif
|
||||
|
||||
fprintf(f, "static const uint16_t unicode_comp_table[%u] = {",
|
||||
tab_ce_len);
|
||||
total_tables++;
|
||||
total_table_bytes += tab_ce_len * sizeof(uint16_t);
|
||||
fprintf(f, "static const uint16_t unicode_comp_table[%u] = {", tab_ce_len);
|
||||
for(i = 0; i < tab_ce_len; i++) {
|
||||
if (i % 8 == 0)
|
||||
fprintf(f, "\n ");
|
||||
@@ -3066,22 +3118,24 @@ void normalization_test(const char *filename)
|
||||
}
|
||||
#endif
|
||||
|
||||
int main(int argc, char **argv)
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *unicode_db_path, *outfilename;
|
||||
char filename[1024];
|
||||
int arg = 1;
|
||||
|
||||
if (argc < 2) {
|
||||
printf("usage: %s unicode_db_path [output_file]\n"
|
||||
"\n"
|
||||
"If no output_file is given, a self test is done using the current unicode library\n",
|
||||
argv[0]);
|
||||
exit(1);
|
||||
if (arg >= argc || (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "--help"))) {
|
||||
printf("usage: %s PATH [OUTPUT]\n"
|
||||
" PATH path to the Unicode database directory\n"
|
||||
" OUTPUT name of the output file. If omitted, a self test is performed\n"
|
||||
" using the files from the Unicode library\n"
|
||||
, argv[0]);
|
||||
return 1;
|
||||
}
|
||||
unicode_db_path = argv[1];
|
||||
unicode_db_path = argv[arg++];
|
||||
outfilename = NULL;
|
||||
if (argc >= 3)
|
||||
outfilename = argv[2];
|
||||
if (arg < argc)
|
||||
outfilename = argv[arg++];
|
||||
|
||||
unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1));
|
||||
|
||||
@@ -3163,6 +3217,8 @@ int main(int argc, char **argv)
|
||||
build_script_ext_table(fo);
|
||||
build_prop_list_table(fo);
|
||||
fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n");
|
||||
fprintf(fo, "/* %u tables / %u bytes, %u index / %u bytes */\n",
|
||||
total_tables, total_table_bytes, total_index, total_index_bytes);
|
||||
fclose(fo);
|
||||
}
|
||||
return 0;
|
||||
|
Reference in New Issue
Block a user