@@ -499,6 +499,9 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
499499 case CR_OP_XOR :
500500 is_in = (a_idx & 1 ) ^ (b_idx & 1 );
501501 break ;
502+ case CR_OP_SUB :
503+ is_in = (a_idx & 1 ) & ((b_idx & 1 ) ^ 1 );
504+ break ;
502505 default :
503506 abort ();
504507 }
@@ -511,14 +514,14 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
511514 return 0 ;
512515}
513516
/* Diff hunk: the union-only helper cr_union1() becomes cr_op1(), which takes
   the set operation as the extra 'op' argument and forwards it to cr_op()
   instead of the hard-coded CR_OP_UNION. The function combines the current
   content of 'cr' (first operand) with the b_len points at b_pt (second
   operand) and returns the value returned by cr_op(). */
514- int cr_union1 (CharRange * cr , const uint32_t * b_pt , int b_len )
517+ int cr_op1 (CharRange * cr , const uint32_t * b_pt , int b_len , int op )
515518{
516519 CharRange a = * cr ; /* struct copy: 'a' keeps the previous points/len of 'cr' */
517520 int ret ;
518521 cr -> len = 0 ; /* reset 'cr' to an empty range before calling cr_op() on it */
519522 cr -> size = 0 ;
520523 cr -> points = NULL ;
521- ret = cr_op (cr , a .points , a .len , b_pt , b_len , CR_OP_UNION );
524+ ret = cr_op (cr , a .points , a .len , b_pt , b_len , op ); /* presumably stores the result in 'cr' - cr_op() body not fully visible here */
522525 cr_free (& a ); /* the saved previous content is released whatever 'ret' is */
523526 return ret ;
524527}
@@ -1554,6 +1557,7 @@ static int unicode_prop_ops(CharRange *cr, ...)
15541557 cr2 = & stack [stack_len - 1 ];
15551558 cr3 = & stack [stack_len ++ ];
15561559 cr_init (cr3 , cr -> mem_opaque , cr -> realloc_func );
1560+ /* CR_OP_XOR may be used here */
15571561 if (cr_op (cr3 , cr1 -> points , cr1 -> len ,
15581562 cr2 -> points , cr2 -> len , op - POP_UNION + CR_OP_UNION ))
15591563 goto fail ;
@@ -1908,3 +1912,210 @@ BOOL lre_is_space_non_ascii(uint32_t c)
19081912 }
19091913 return FALSE;
19101914}
1915+
1916+ #define SEQ_MAX_LEN 16
1917+
1918+ static int unicode_sequence_prop1 (int seq_prop_idx , UnicodeSequencePropCB * cb , void * opaque ,
1919+ CharRange * cr )
1920+ {
1921+ int i , c , j ;
1922+ uint32_t seq [SEQ_MAX_LEN ];
1923+
1924+ switch (seq_prop_idx ) {
1925+ case UNICODE_SEQUENCE_PROP_Basic_Emoji :
1926+ if (unicode_prop1 (cr , UNICODE_PROP_Basic_Emoji1 ) < 0 )
1927+ return -1 ;
1928+ for (i = 0 ; i < cr -> len ; i += 2 ) {
1929+ for (c = cr -> points [i ]; c < cr -> points [i + 1 ]; c ++ ) {
1930+ seq [0 ] = c ;
1931+ cb (opaque , seq , 1 );
1932+ }
1933+ }
1934+
1935+ cr -> len = 0 ;
1936+
1937+ if (unicode_prop1 (cr , UNICODE_PROP_Basic_Emoji2 ) < 0 )
1938+ return -1 ;
1939+ for (i = 0 ; i < cr -> len ; i += 2 ) {
1940+ for (c = cr -> points [i ]; c < cr -> points [i + 1 ]; c ++ ) {
1941+ seq [0 ] = c ;
1942+ seq [1 ] = 0xfe0f ;
1943+ cb (opaque , seq , 2 );
1944+ }
1945+ }
1946+
1947+ break ;
1948+ case UNICODE_SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence :
1949+ if (unicode_prop1 (cr , UNICODE_PROP_Emoji_Modifier_Base ) < 0 )
1950+ return -1 ;
1951+ for (i = 0 ; i < cr -> len ; i += 2 ) {
1952+ for (c = cr -> points [i ]; c < cr -> points [i + 1 ]; c ++ ) {
1953+ for (j = 0 ; j < 5 ; j ++ ) {
1954+ seq [0 ] = c ;
1955+ seq [1 ] = 0x1f3fb + j ;
1956+ cb (opaque , seq , 2 );
1957+ }
1958+ }
1959+ }
1960+ break ;
1961+ case UNICODE_SEQUENCE_PROP_RGI_Emoji_Flag_Sequence :
1962+ if (unicode_prop1 (cr , UNICODE_PROP_RGI_Emoji_Flag_Sequence ) < 0 )
1963+ return -1 ;
1964+ for (i = 0 ; i < cr -> len ; i += 2 ) {
1965+ for (c = cr -> points [i ]; c < cr -> points [i + 1 ]; c ++ ) {
1966+ int c0 , c1 ;
1967+ c0 = c / 26 ;
1968+ c1 = c % 26 ;
1969+ seq [0 ] = 0x1F1E6 + c0 ;
1970+ seq [1 ] = 0x1F1E6 + c1 ;
1971+ cb (opaque , seq , 2 );
1972+ }
1973+ }
1974+ break ;
1975+ case UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence :
1976+ {
1977+ int len , code , pres , k , mod , mod_count , mod_pos [2 ], hc_pos , n_mod , n_hc , mod1 ;
1978+ int mod_idx , hc_idx , i0 , i1 ;
1979+ const uint8_t * tab = unicode_rgi_emoji_zwj_sequence ;
1980+
1981+ for (i = 0 ; i < countof (unicode_rgi_emoji_zwj_sequence );) {
1982+ len = tab [i ++ ];
1983+ k = 0 ;
1984+ mod = 0 ;
1985+ mod_count = 0 ;
1986+ hc_pos = -1 ;
1987+ for (j = 0 ; j < len ; j ++ ) {
1988+ code = tab [i ++ ];
1989+ code |= tab [i ++ ] << 8 ;
1990+ pres = code >> 15 ;
1991+ mod1 = (code >> 13 ) & 3 ;
1992+ code &= 0x1fff ;
1993+ if (code < 0x1000 ) {
1994+ c = code + 0x2000 ;
1995+ } else {
1996+ c = 0x1f000 + (code - 0x1000 );
1997+ }
1998+ if (c == 0x1f9b0 )
1999+ hc_pos = k ;
2000+ seq [k ++ ] = c ;
2001+ if (mod1 != 0 ) {
2002+ assert (mod_count < 2 );
2003+ mod = mod1 ;
2004+ mod_pos [mod_count ++ ] = k ;
2005+ seq [k ++ ] = 0 ; /* will be filled later */
2006+ }
2007+ if (pres ) {
2008+ seq [k ++ ] = 0xfe0f ;
2009+ }
2010+ if (j < len - 1 ) {
2011+ seq [k ++ ] = 0x200d ;
2012+ }
2013+ }
2014+
2015+ /* genrate all the variants */
2016+ switch (mod ) {
2017+ case 1 :
2018+ n_mod = 5 ;
2019+ break ;
2020+ case 2 :
2021+ n_mod = 25 ;
2022+ break ;
2023+ case 3 :
2024+ n_mod = 20 ;
2025+ break ;
2026+ default :
2027+ n_mod = 1 ;
2028+ break ;
2029+ }
2030+ if (hc_pos >= 0 )
2031+ n_hc = 4 ;
2032+ else
2033+ n_hc = 1 ;
2034+ for (hc_idx = 0 ; hc_idx < n_hc ; hc_idx ++ ) {
2035+ for (mod_idx = 0 ; mod_idx < n_mod ; mod_idx ++ ) {
2036+ if (hc_pos >= 0 )
2037+ seq [hc_pos ] = 0x1f9b0 + hc_idx ;
2038+
2039+ switch (mod ) {
2040+ case 1 :
2041+ seq [mod_pos [0 ]] = 0x1f3fb + mod_idx ;
2042+ break ;
2043+ case 2 :
2044+ case 3 :
2045+ i0 = mod_idx / 5 ;
2046+ i1 = mod_idx % 5 ;
2047+ /* avoid identical values */
2048+ if (mod == 3 && i0 >= i1 )
2049+ i0 ++ ;
2050+ seq [mod_pos [0 ]] = 0x1f3fb + i0 ;
2051+ seq [mod_pos [1 ]] = 0x1f3fb + i1 ;
2052+ break ;
2053+ default :
2054+ break ;
2055+ }
2056+ #if 0
2057+ for (j = 0 ; j < k ; j ++ )
2058+ printf (" %04x" , seq [j ]);
2059+ printf ("\n" );
2060+ #endif
2061+ cb (opaque , seq , k );
2062+ }
2063+ }
2064+ }
2065+ }
2066+ break ;
2067+ case UNICODE_SEQUENCE_PROP_RGI_Emoji_Tag_Sequence :
2068+ {
2069+ for (i = 0 ; i < countof (unicode_rgi_emoji_tag_sequence );) {
2070+ j = 0 ;
2071+ seq [j ++ ] = 0x1F3F4 ;
2072+ for (;;) {
2073+ c = unicode_rgi_emoji_tag_sequence [i ++ ];
2074+ if (c == 0x00 )
2075+ break ;
2076+ seq [j ++ ] = 0xe0000 + c ;
2077+ }
2078+ seq [j ++ ] = 0xe007f ;
2079+ cb (opaque , seq , j );
2080+ }
2081+ }
2082+ break ;
2083+ case UNICODE_SEQUENCE_PROP_Emoji_Keycap_Sequence :
2084+ if (unicode_prop1 (cr , UNICODE_PROP_Emoji_Keycap_Sequence ) < 0 )
2085+ return -1 ;
2086+ for (i = 0 ; i < cr -> len ; i += 2 ) {
2087+ for (c = cr -> points [i ]; c < cr -> points [i + 1 ]; c ++ ) {
2088+ seq [0 ] = c ;
2089+ seq [1 ] = 0xfe0f ;
2090+ seq [2 ] = 0x20e3 ;
2091+ cb (opaque , seq , 3 );
2092+ }
2093+ }
2094+ break ;
2095+ case UNICODE_SEQUENCE_PROP_RGI_Emoji :
2096+ /* all prevous sequences */
2097+ for (i = UNICODE_SEQUENCE_PROP_Basic_Emoji ; i <= UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence ; i ++ ) {
2098+ int ret ;
2099+ ret = unicode_sequence_prop1 (i , cb , opaque , cr );
2100+ if (ret < 0 )
2101+ return ret ;
2102+ cr -> len = 0 ;
2103+ }
2104+ break ;
2105+ default :
2106+ return -2 ;
2107+ }
2108+ return 0 ;
2109+ }
2110+
2111+ /* build a unicode sequence property */
2112+ /* return -2 if not found, -1 if other error. 'cr' is used as temporary memory. */
2113+ int unicode_sequence_prop (const char * prop_name , UnicodeSequencePropCB * cb , void * opaque ,
2114+ CharRange * cr )
2115+ {
2116+ int seq_prop_idx ;
2117+ seq_prop_idx = unicode_find_name (unicode_sequence_prop_name_table , prop_name );
2118+ if (seq_prop_idx < 0 )
2119+ return -2 ;
2120+ return unicode_sequence_prop1 (seq_prop_idx , cb , opaque , cr );
2121+ }
0 commit comments