Skip to content

Commit d7cdfdc

Browse files
author
Fabrice Bellard
committed
regexp: added v flag support - fixed corner cases of case insensitive matching
1 parent a8b2d7c commit d7cdfdc

File tree

13 files changed

+2003
-204
lines changed

13 files changed

+2003
-204
lines changed

libregexp.c

Lines changed: 721 additions & 107 deletions
Large diffs are not rendered by default.

libregexp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#define LRE_FLAG_STICKY (1 << 5)
3636
#define LRE_FLAG_INDICES (1 << 6) /* Unused by libregexp, just recorded. */
3737
#define LRE_FLAG_NAMED_GROUPS (1 << 7) /* named groups are present in the regexp */
38+
#define LRE_FLAG_UNICODE_SETS (1 << 8)
3839

3940
#define LRE_RET_MEMORY_ERROR (-1)
4041
#define LRE_RET_TIMEOUT (-2)

libunicode-table.h

Lines changed: 419 additions & 1 deletion
Large diffs are not rendered by default.

libunicode.c

Lines changed: 213 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,9 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
499499
case CR_OP_XOR:
500500
is_in = (a_idx & 1) ^ (b_idx & 1);
501501
break;
502+
case CR_OP_SUB:
503+
is_in = (a_idx & 1) & ((b_idx & 1) ^ 1);
504+
break;
502505
default:
503506
abort();
504507
}
@@ -511,14 +514,14 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
511514
return 0;
512515
}
513516

514-
int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len)
517+
int cr_op1(CharRange *cr, const uint32_t *b_pt, int b_len, int op)
515518
{
516519
CharRange a = *cr;
517520
int ret;
518521
cr->len = 0;
519522
cr->size = 0;
520523
cr->points = NULL;
521-
ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION);
524+
ret = cr_op(cr, a.points, a.len, b_pt, b_len, op);
522525
cr_free(&a);
523526
return ret;
524527
}
@@ -1554,6 +1557,7 @@ static int unicode_prop_ops(CharRange *cr, ...)
15541557
cr2 = &stack[stack_len - 1];
15551558
cr3 = &stack[stack_len++];
15561559
cr_init(cr3, cr->mem_opaque, cr->realloc_func);
1560+
/* CR_OP_XOR may be used here */
15571561
if (cr_op(cr3, cr1->points, cr1->len,
15581562
cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
15591563
goto fail;
@@ -1908,3 +1912,210 @@ BOOL lre_is_space_non_ascii(uint32_t c)
19081912
}
19091913
return FALSE;
19101914
}
1915+
1916+
#define SEQ_MAX_LEN 16
1917+
1918+
static int unicode_sequence_prop1(int seq_prop_idx, UnicodeSequencePropCB *cb, void *opaque,
1919+
CharRange *cr)
1920+
{
1921+
int i, c, j;
1922+
uint32_t seq[SEQ_MAX_LEN];
1923+
1924+
switch(seq_prop_idx) {
1925+
case UNICODE_SEQUENCE_PROP_Basic_Emoji:
1926+
if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji1) < 0)
1927+
return -1;
1928+
for(i = 0; i < cr->len; i += 2) {
1929+
for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1930+
seq[0] = c;
1931+
cb(opaque, seq, 1);
1932+
}
1933+
}
1934+
1935+
cr->len = 0;
1936+
1937+
if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji2) < 0)
1938+
return -1;
1939+
for(i = 0; i < cr->len; i += 2) {
1940+
for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1941+
seq[0] = c;
1942+
seq[1] = 0xfe0f;
1943+
cb(opaque, seq, 2);
1944+
}
1945+
}
1946+
1947+
break;
1948+
case UNICODE_SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence:
1949+
if (unicode_prop1(cr, UNICODE_PROP_Emoji_Modifier_Base) < 0)
1950+
return -1;
1951+
for(i = 0; i < cr->len; i += 2) {
1952+
for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1953+
for(j = 0; j < 5; j++) {
1954+
seq[0] = c;
1955+
seq[1] = 0x1f3fb + j;
1956+
cb(opaque, seq, 2);
1957+
}
1958+
}
1959+
}
1960+
break;
1961+
case UNICODE_SEQUENCE_PROP_RGI_Emoji_Flag_Sequence:
1962+
if (unicode_prop1(cr, UNICODE_PROP_RGI_Emoji_Flag_Sequence) < 0)
1963+
return -1;
1964+
for(i = 0; i < cr->len; i += 2) {
1965+
for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1966+
int c0, c1;
1967+
c0 = c / 26;
1968+
c1 = c % 26;
1969+
seq[0] = 0x1F1E6 + c0;
1970+
seq[1] = 0x1F1E6 + c1;
1971+
cb(opaque, seq, 2);
1972+
}
1973+
}
1974+
break;
1975+
case UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence:
1976+
{
1977+
int len, code, pres, k, mod, mod_count, mod_pos[2], hc_pos, n_mod, n_hc, mod1;
1978+
int mod_idx, hc_idx, i0, i1;
1979+
const uint8_t *tab = unicode_rgi_emoji_zwj_sequence;
1980+
1981+
for(i = 0; i < countof(unicode_rgi_emoji_zwj_sequence);) {
1982+
len = tab[i++];
1983+
k = 0;
1984+
mod = 0;
1985+
mod_count = 0;
1986+
hc_pos = -1;
1987+
for(j = 0; j < len; j++) {
1988+
code = tab[i++];
1989+
code |= tab[i++] << 8;
1990+
pres = code >> 15;
1991+
mod1 = (code >> 13) & 3;
1992+
code &= 0x1fff;
1993+
if (code < 0x1000) {
1994+
c = code + 0x2000;
1995+
} else {
1996+
c = 0x1f000 + (code - 0x1000);
1997+
}
1998+
if (c == 0x1f9b0)
1999+
hc_pos = k;
2000+
seq[k++] = c;
2001+
if (mod1 != 0) {
2002+
assert(mod_count < 2);
2003+
mod = mod1;
2004+
mod_pos[mod_count++] = k;
2005+
seq[k++] = 0; /* will be filled later */
2006+
}
2007+
if (pres) {
2008+
seq[k++] = 0xfe0f;
2009+
}
2010+
if (j < len - 1) {
2011+
seq[k++] = 0x200d;
2012+
}
2013+
}
2014+
2015+
/* generate all the variants */
2016+
switch(mod) {
2017+
case 1:
2018+
n_mod = 5;
2019+
break;
2020+
case 2:
2021+
n_mod = 25;
2022+
break;
2023+
case 3:
2024+
n_mod = 20;
2025+
break;
2026+
default:
2027+
n_mod = 1;
2028+
break;
2029+
}
2030+
if (hc_pos >= 0)
2031+
n_hc = 4;
2032+
else
2033+
n_hc = 1;
2034+
for(hc_idx = 0; hc_idx < n_hc; hc_idx++) {
2035+
for(mod_idx = 0; mod_idx < n_mod; mod_idx++) {
2036+
if (hc_pos >= 0)
2037+
seq[hc_pos] = 0x1f9b0 + hc_idx;
2038+
2039+
switch(mod) {
2040+
case 1:
2041+
seq[mod_pos[0]] = 0x1f3fb + mod_idx;
2042+
break;
2043+
case 2:
2044+
case 3:
2045+
i0 = mod_idx / 5;
2046+
i1 = mod_idx % 5;
2047+
/* avoid identical values */
2048+
if (mod == 3 && i0 >= i1)
2049+
i0++;
2050+
seq[mod_pos[0]] = 0x1f3fb + i0;
2051+
seq[mod_pos[1]] = 0x1f3fb + i1;
2052+
break;
2053+
default:
2054+
break;
2055+
}
2056+
#if 0
2057+
for(j = 0; j < k; j++)
2058+
printf(" %04x", seq[j]);
2059+
printf("\n");
2060+
#endif
2061+
cb(opaque, seq, k);
2062+
}
2063+
}
2064+
}
2065+
}
2066+
break;
2067+
case UNICODE_SEQUENCE_PROP_RGI_Emoji_Tag_Sequence:
2068+
{
2069+
for(i = 0; i < countof(unicode_rgi_emoji_tag_sequence);) {
2070+
j = 0;
2071+
seq[j++] = 0x1F3F4;
2072+
for(;;) {
2073+
c = unicode_rgi_emoji_tag_sequence[i++];
2074+
if (c == 0x00)
2075+
break;
2076+
seq[j++] = 0xe0000 + c;
2077+
}
2078+
seq[j++] = 0xe007f;
2079+
cb(opaque, seq, j);
2080+
}
2081+
}
2082+
break;
2083+
case UNICODE_SEQUENCE_PROP_Emoji_Keycap_Sequence:
2084+
if (unicode_prop1(cr, UNICODE_PROP_Emoji_Keycap_Sequence) < 0)
2085+
return -1;
2086+
for(i = 0; i < cr->len; i += 2) {
2087+
for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
2088+
seq[0] = c;
2089+
seq[1] = 0xfe0f;
2090+
seq[2] = 0x20e3;
2091+
cb(opaque, seq, 3);
2092+
}
2093+
}
2094+
break;
2095+
case UNICODE_SEQUENCE_PROP_RGI_Emoji:
2096+
/* all previous sequences */
2097+
for(i = UNICODE_SEQUENCE_PROP_Basic_Emoji; i <= UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence; i++) {
2098+
int ret;
2099+
ret = unicode_sequence_prop1(i, cb, opaque, cr);
2100+
if (ret < 0)
2101+
return ret;
2102+
cr->len = 0;
2103+
}
2104+
break;
2105+
default:
2106+
return -2;
2107+
}
2108+
return 0;
2109+
}
2110+
2111+
/* build a unicode sequence property */
2112+
/* return -2 if not found, -1 if other error. 'cr' is used as temporary memory. */
2113+
int unicode_sequence_prop(const char *prop_name, UnicodeSequencePropCB *cb, void *opaque,
2114+
CharRange *cr)
2115+
{
2116+
int seq_prop_idx;
2117+
seq_prop_idx = unicode_find_name(unicode_sequence_prop_name_table, prop_name);
2118+
if (seq_prop_idx < 0)
2119+
return -2;
2120+
return unicode_sequence_prop1(seq_prop_idx, cb, opaque, cr);
2121+
}

libunicode.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ typedef enum {
4545
CR_OP_UNION,
4646
CR_OP_INTER,
4747
CR_OP_XOR,
48+
CR_OP_SUB,
4849
} CharRangeOpEnum;
4950

5051
void cr_init(CharRange *cr, void *mem_opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
@@ -73,19 +74,18 @@ static inline int cr_add_interval(CharRange *cr, uint32_t c1, uint32_t c2)
7374
return 0;
7475
}
7576

76-
int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len);
77+
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
78+
const uint32_t *b_pt, int b_len, int op);
79+
int cr_op1(CharRange *cr, const uint32_t *b_pt, int b_len, int op);
7780

7881
static inline int cr_union_interval(CharRange *cr, uint32_t c1, uint32_t c2)
7982
{
8083
uint32_t b_pt[2];
8184
b_pt[0] = c1;
8285
b_pt[1] = c2 + 1;
83-
return cr_union1(cr, b_pt, 2);
86+
return cr_op1(cr, b_pt, 2, CR_OP_UNION);
8487
}
8588

86-
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
87-
const uint32_t *b_pt, int b_len, int op);
88-
8989
int cr_invert(CharRange *cr);
9090

9191
int cr_regexp_canonicalize(CharRange *cr, int is_unicode);
@@ -107,6 +107,10 @@ int unicode_script(CharRange *cr, const char *script_name, int is_ext);
107107
int unicode_general_category(CharRange *cr, const char *gc_name);
108108
int unicode_prop(CharRange *cr, const char *prop_name);
109109

110+
typedef void UnicodeSequencePropCB(void *opaque, const uint32_t *buf, int len);
111+
int unicode_sequence_prop(const char *prop_name, UnicodeSequencePropCB *cb, void *opaque,
112+
CharRange *cr);
113+
110114
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
111115
int lre_canonicalize(uint32_t c, int is_unicode);
112116

quickjs-atom.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,12 @@ DEF(minus_zero, "-0")
177177
DEF(Infinity, "Infinity")
178178
DEF(minus_Infinity, "-Infinity")
179179
DEF(NaN, "NaN")
180+
DEF(hasIndices, "hasIndices")
181+
DEF(ignoreCase, "ignoreCase")
182+
DEF(multiline, "multiline")
183+
DEF(dotAll, "dotAll")
184+
DEF(sticky, "sticky")
185+
DEF(unicodeSets, "unicodeSets")
180186
/* the following 3 atoms are only used with CONFIG_ATOMICS */
181187
DEF(not_equal, "not-equal")
182188
DEF(timed_out, "timed-out")

0 commit comments

Comments
 (0)