4343from itertools import batched
4444from typing import Callable , Iterable
4545
46- UNICODE_VERSION = "15.1 .0"
46+ UNICODE_VERSION = "16.0 .0"
4747"""The version of the Unicode data files to download."""
4848
4949NUM_CODEPOINTS = 0x110000
@@ -175,8 +175,11 @@ class WidthState(enum.IntEnum):
175175 - 4th bit: whether to set top bit on emoji presentation.
176176 If this is set but 3rd is not, the width mode is related to zwj sequences
177177 - 5th from top: whether this is unaffected by ligature-transparent
178+ (if set, should also set 3rd and 4th)
178179 - 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state
179- where no ZWJ has been encountered yet; encountering one flips this on"""
180+ where no ZWJ has been encountered yet; encountering one flips this on
181+ - 7th bit: is VS1 (U+FE00) if CJK, or is VS2 (U+FE01) if not CJK
182+ """
180183
181184 # BASIC WIDTHS
182185
@@ -264,8 +267,17 @@ class WidthState(enum.IntEnum):
264267 TAG_A6_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1110
265268 "(\\ uE0061..=\\ uE007A){6} \\ uE007F \\ u200D `Emoji_Presentation`"
266269
270+ # Kirat Rai
271+ KIRAT_RAI_VOWEL_SIGN_E = 0b0000_0000_0010_0000
272+ "\\ u16D67 (\\ u16D67 \\ u16D67)+ and canonical equivalents"
273+ KIRAT_RAI_VOWEL_SIGN_AI = 0b0000_0000_0010_0001
274+ "(\\ u16D68)+ and canonical equivalents"
275+
267276 # VARIATION SELECTORS
268277
278+ VARIATION_SELECTOR_1_OR_2 = 0b0000_0010_0000_0000
279+ "\\ uFE00 if CJK, or \\ uFE01 otherwise"
280+
269281 # Text presentation sequences (not CJK)
270282 VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000
271283 "\\ uFE0E (text presentation sequences)"
@@ -361,6 +373,7 @@ def width_alone(self) -> int:
361373 | WidthState .COMBINING_LONG_SOLIDUS_OVERLAY
362374 | WidthState .VARIATION_SELECTOR_15
363375 | WidthState .VARIATION_SELECTOR_16
376+ | WidthState .VARIATION_SELECTOR_1_OR_2
364377 ):
365378 return 0
366379 case (
@@ -493,12 +506,6 @@ def load_zero_widths() -> list[bool]:
493506 lambda cp : operator .setitem (zw_map , cp , True ),
494507 )
495508
496- # Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
497- # as they canonically decompose to two characters with this property,
498- # but they aren't.
499- for c in [0x0CC0 , 0x0CC7 , 0x0CC8 , 0x0CCA , 0x0CCB , 0x1B3B , 0x1B3D , 0x1B43 ]:
500- zw_map [c ] = True
501-
502509 # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
503510 # as zero-width. This matches the behavior of glibc `wcwidth`.
504511 #
@@ -639,6 +646,8 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
639646 ([0xA4FD ], WidthState .LISU_TONE_LETTER_MYA_NA_JEU ),
640647 ([0xFE0F ], WidthState .VARIATION_SELECTOR_16 ),
641648 ([0x10C03 ], WidthState .OLD_TURKIC_LETTER_ORKHON_I ),
649+ ([0x16D67 ], WidthState .KIRAT_RAI_VOWEL_SIGN_E ),
650+ ([0x16D68 ], WidthState .KIRAT_RAI_VOWEL_SIGN_AI ),
642651 (emoji_presentation , WidthState .EMOJI_PRESENTATION ),
643652 (emoji_modifiers , WidthState .EMOJI_MODIFIER ),
644653 (regional_indicators , WidthState .REGIONAL_INDICATOR ),
@@ -648,9 +657,11 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
648657 ea [cp ] = width
649658
650659 # East-Asian only
660+ ea [0xFE00 ] = WidthState .VARIATION_SELECTOR_1_OR_2
651661 ea [0x0338 ] = WidthState .COMBINING_LONG_SOLIDUS_OVERLAY
652662
653663 # Not East Asian only
664+ not_ea [0xFE01 ] = WidthState .VARIATION_SELECTOR_1_OR_2
654665 not_ea [0xFE0E ] = WidthState .VARIATION_SELECTOR_15
655666
656667 return (not_ea , ea )
@@ -716,7 +727,7 @@ def load_solidus_transparent(
716727 cjk_width_map : list [WidthState ],
717728) -> list [tuple [Codepoint , Codepoint ]]:
718729 """Characters expanding to a canonical combining class above 1, plus `ligature_transparent`s from above.
719- Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to bechecked also.
730+ Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to be checked also.
720731 """
721732
722733 ccc_above_1 = set ()
@@ -748,7 +759,7 @@ def load_solidus_transparent(
748759 num_chars = len (ccc_above_1 )
749760
750761 for cp in ccc_above_1 :
751- if cp != 0xFE0F :
762+ if cp not in [ 0xFE00 , 0xFE0F ] :
752763 assert (
753764 cjk_width_map [cp ].table_width () != CharWidthInTable .SPECIAL
754765 ), f"U+{ cp :X} "
@@ -1304,8 +1315,17 @@ def lookup_fns(
13041315 return (0, next_info.set_emoji_presentation());
13051316 }"""
13061317
1307- if not is_cjk :
1318+ if is_cjk :
13081319 s += """
1320+ if c == '\\ u{FE00}' {
1321+ return (0, next_info.set_vs1_2());
1322+ }
1323+ """
1324+ else :
1325+ s += """
1326+ if c == '\\ u{FE01}' {
1327+ return (0, next_info.set_vs1_2());
1328+ }
13091329 if c == '\\ u{FE0E}' {
13101330 return (0, next_info.set_text_presentation());
13111331 }
@@ -1315,9 +1335,19 @@ def lookup_fns(
13151335 } else {
13161336 next_info = next_info.unset_text_presentation();
13171337 }
1318- }"""
1338+ } else """
13191339
1320- s += """
1340+ s += """if next_info.is_vs1_2() {
1341+ if matches!(c, '\\ u{2018}' | '\\ u{2019}' | '\\ u{201C}' | '\\ u{201D}') {
1342+ return ("""
1343+
1344+ s += str (2 - is_cjk )
1345+
1346+ s += """, WidthInfo::DEFAULT);
1347+ } else {
1348+ next_info = next_info.unset_vs1_2();
1349+ }
1350+ }
13211351 if next_info.is_ligature_transparent() {
13221352 if c == '\\ u{200D}' {
13231353 return (0, next_info.set_zwj_bit());
@@ -1496,6 +1526,22 @@ def lookup_fns(
14961526 return (0, WidthInfo::EMOJI_PRESENTATION)
14971527 }}
14981528
1529+ (WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\ u{{16D63}}') => {{
1530+ return (0, WidthInfo::DEFAULT);
1531+ }}
1532+ (WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\ u{{16D67}}') => {{
1533+ return (0, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI);
1534+ }}
1535+ (WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\ u{{16D68}}') => {{
1536+ return (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E);
1537+ }}
1538+ (WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\ u{{16D69}}') => {{
1539+ return (0, WidthInfo::DEFAULT);
1540+ }}
1541+ (WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI, '\\ u{{16D63}}') => {{
1542+ return (0, WidthInfo::DEFAULT);
1543+ }}
1544+
14991545 // Fallback
15001546 _ => {{}}
15011547 }}
@@ -1562,6 +1608,8 @@ def emit_module(
15621608#[derive(Clone, Copy, Debug, PartialEq, Eq)]
15631609struct WidthInfo(u16);
15641610
1611+ const LIGATURE_TRANSPARENT_MASK: u16 = 0b0010_0000_0000_0000;
1612+
15651613impl WidthInfo {
15661614 /// No special handling necessary
15671615 const DEFAULT: Self = Self(0);
@@ -1591,51 +1639,84 @@ def emit_module(
15911639
15921640 /// Has top bit set
15931641 fn is_emoji_presentation(self) -> bool {{
1594- (self.0 & 0b1000_0000_0000_0000 ) == 0b1000_0000_0000_0000
1642+ (self.0 & WidthInfo::VARIATION_SELECTOR_16.0 ) == WidthInfo::VARIATION_SELECTOR_16.0
15951643 }}
15961644
1597- /// Has top bit set
15981645 fn is_zwj_emoji_presentation(self) -> bool {{
15991646 (self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000
16001647 }}
16011648
16021649 /// Set top bit
16031650 fn set_emoji_presentation(self) -> Self {{
1604- if (self.0 & 0b0010_0000_0000_0000 ) == 0b0010_0000_0000_0000
1651+ if (self.0 & LIGATURE_TRANSPARENT_MASK ) == LIGATURE_TRANSPARENT_MASK
16051652 || (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000
16061653 {{
1607- Self(self.0 | 0b1000_0000_0000_0000)
1654+ Self(
1655+ self.0
1656+ | WidthInfo::VARIATION_SELECTOR_16.0
1657+ & !WidthInfo::VARIATION_SELECTOR_15.0
1658+ & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1659+ )
16081660 }} else {{
16091661 Self::VARIATION_SELECTOR_16
16101662 }}
16111663 }}
16121664
16131665 /// Clear top bit
16141666 fn unset_emoji_presentation(self) -> Self {{
1615- if (self.0 & 0b0010_0000_0000_0000 ) == 0b0010_0000_0000_0000 {{
1616- Self(self.0 & 0b0111_1111_1111_1111 )
1667+ if (self.0 & LIGATURE_TRANSPARENT_MASK ) == LIGATURE_TRANSPARENT_MASK {{
1668+ Self(self.0 & !WidthInfo::VARIATION_SELECTOR_16.0 )
16171669 }} else {{
16181670 Self::DEFAULT
16191671 }}
16201672 }}
16211673
16221674 /// Has 2nd bit set
16231675 fn is_text_presentation(self) -> bool {{
1624- (self.0 & 0b0100_0000_0000_0000 ) == 0b0100_0000_0000_0000
1676+ (self.0 & WidthInfo::VARIATION_SELECTOR_15.0 ) == WidthInfo::VARIATION_SELECTOR_15.0
16251677 }}
16261678
16271679 /// Set 2nd bit
16281680 fn set_text_presentation(self) -> Self {{
1629- if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
1630- Self(self.0 | 0b0100_0000_0000_0000)
1681+ if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
1682+ Self(
1683+ self.0
1684+ | WidthInfo::VARIATION_SELECTOR_15.0
1685+ & !WidthInfo::VARIATION_SELECTOR_16.0
1686+ & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1687+ )
16311688 }} else {{
1632- Self(0b0100_0000_0000_0000 )
1689+ Self(WidthInfo::VARIATION_SELECTOR_15.0 )
16331690 }}
16341691 }}
16351692
16361693 /// Clear 2nd bit
16371694 fn unset_text_presentation(self) -> Self {{
1638- Self(self.0 & 0b1011_1111_1111_1111)
1695+ Self(self.0 & !WidthInfo::VARIATION_SELECTOR_15.0)
1696+ }}
1697+
1698+ /// Has 7th bit set
1699+ fn is_vs1_2(self) -> bool {{
1700+ (self.0 & WidthInfo::VARIATION_SELECTOR_1_OR_2.0) == WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1701+ }}
1702+
1703+ /// Set 7th bit
1704+ fn set_vs1_2(self) -> Self {{
1705+ if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
1706+ Self(
1707+ self.0
1708+ | WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1709+ & !WidthInfo::VARIATION_SELECTOR_15.0
1710+ & !WidthInfo::VARIATION_SELECTOR_16.0,
1711+ )
1712+ }} else {{
1713+ Self(WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
1714+ }}
1715+ }}
1716+
1717+ /// Clear 7th bit
1718+ fn unset_vs1_2(self) -> Self {{
1719+ Self(self.0 & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
16391720 }}
16401721}}
16411722
0 commit comments