unicode-rs · Manishearth · Feb 24, 2025 · Feb 23, 2025 · Feb 23, 2025 · Feb 23, 2025
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -17,6 +17,7 @@
 # - confusables.txt
 # - ReadMe.txt
 # This script also uses the following Unicode UCD data:
+# - DerivedCoreProperties.txt
 # - Scripts.txt
 #
 # Since this should not require frequent updates, we just store this
@@ -53,6 +54,8 @@ def fetch(f):
         sys.stderr.write("cannot load %s\n" % f)
         exit(1)
 
+    return f
+
 # Download a UCD table file
 def fetch_unidata(f):
     if not os.path.exists(os.path.basename(f)):
@@ -63,14 +66,14 @@ def fetch_unidata(f):
         sys.stderr.write("cannot load %s" % f)
         exit(1)
 
-# Loads code point data from IdentifierStatus.txt and
-# IdentifierType.txt
-# Implementation from unicode-segmentation
+    return f
+
+# Loads code point data from provided filename f
+# Implementation adapted from unicode-segmentation
 def load_properties(f, interestingprops = None):
-    fetch(f)
     props = {}
-    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
-    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
+    re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#\s]+) *#")
+    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#\s]+) *#")
 
     for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
         prop = None
@@ -99,42 +102,6 @@ def load_properties(f, interestingprops = None):
 
     return props
 
-# Loads script data from Scripts.txt
-def load_script_properties(f, interestingprops):
-    fetch_unidata(f)
-    props = {}
-    # Note: these regexes are different from those in unicode-segmentation,
-    # becase we need to handle spaces here
-    re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
-    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")
-
-    for line in fileinput.input(os.path.basename(f)):
-        prop = None
-        d_lo = 0
-        d_hi = 0
-        m = re1.match(line)
-        if m:
-            d_lo = m.group(1)
-            d_hi = m.group(1)
-            prop = m.group(2).strip()
-        else:
-            m = re2.match(line)
-            if m:
-                d_lo = m.group(1)
-                d_hi = m.group(2)
-                prop = m.group(3).strip()
-            else:
-                continue
-        if interestingprops and prop not in interestingprops:
-            continue
-        d_lo = int(d_lo, 16)
-        d_hi = int(d_hi, 16)
-        if prop not in props:
-            props[prop] = []
-        props[prop].append((d_lo, d_hi))
-
-    return props
-
 # Loads confusables data from confusables.txt
 def load_confusables(f):
     fetch(f)
@@ -189,7 +156,7 @@ def load_scripts(f):
     # changes are introduced, update accordingly.
 
     (longforms, shortforms) = aliases()
-    scripts = load_script_properties(f, [])
+    scripts = load_properties(fetch_unidata(f), [])
 
     script_table = []
     script_list = []
@@ -546,10 +513,10 @@ def emit_identifier_module(f):
 """)
 
     f.write("    // Identifier status table:\n")
-    identifier_status_table = load_properties("IdentifierStatus.txt")
+    identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
     emit_table(f, "IDENTIFIER_STATUS", identifier_status_table['Allowed'], "&'static [(char, char)]", is_pub=False,
             pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
-    identifier_type = load_properties("IdentifierType.txt")
+    identifier_type = load_properties(fetch("IdentifierType.txt"))
     type_table = []
     for ty in identifier_type:
         type_table.extend([(x, y, ty) for (x, y) in identifier_type[ty]])
@@ -560,6 +527,26 @@ def emit_identifier_module(f):
             pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
     f.write("}\n\n")
 
+def emit_default_ignorable_detection_module(f):
+    f.write("pub mod default_ignorable_code_point {")
+    f.write("""
+
+    #[inline]
+    pub fn default_ignorable_code_point(c: char) -> bool {
+        match c as usize {
+            _ => super::util::bsearch_range_table(c, DEFAULT_IGNORABLE)
+        }
+    }
+
+""")
+
+    f.write("    // Default ignorable code point table:\n")
+    default_ignorable_table = load_properties(fetch_unidata("DerivedCoreProperties.txt"), ["Default_Ignorable_Code_Point"])
+    emit_table(f, "DEFAULT_IGNORABLE", default_ignorable_table["Default_Ignorable_Code_Point"], "&'static [(char, char)]", is_pub=False,
+            pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
+
+    f.write("}\n\n")
+
 def emit_confusable_detection_module(f):
     f.write("pub mod confusable_detection {")
     f.write("""
@@ -601,7 +588,7 @@ def emit_potiential_mixed_script_confusable(f):
         }
     }
 """)
-    identifier_status_table = load_properties("IdentifierStatus.txt")
+    identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
     _, scripts = load_scripts("Scripts.txt")
     identifier_allowed = identifier_status_table['Allowed']
     (mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)
@@ -688,6 +675,8 @@ def emit_util_mod(f):
         emit_util_mod(rf)
         ### identifier module
         emit_identifier_module(rf)
+        ### default_ignorable_detection module
+        emit_default_ignorable_detection_module(rf)
         ### confusable_detection module
         emit_confusable_detection_module(rf)
         ### mixed_script_confusable_detection module

diff --git a/src/confusable_detection.rs b/src/confusable_detection.rs
@@ -34,6 +34,12 @@ fn char_prototype(c: char) -> OnceOrMore<char, StaticSliceIterCloned> {
 
 /// Calculate skeleton for string, as defined by UTS 39
 pub fn skeleton(s: &str) -> impl Iterator<Item = char> + '_ {
+    use crate::tables::default_ignorable_code_point::default_ignorable_code_point;
     use unicode_normalization::UnicodeNormalization;
-    s.chars().nfd().flat_map(char_prototype).nfd()
+
+    s.chars()
+        .nfd()
+        .filter(|c| !default_ignorable_code_point(*c))
+        .flat_map(char_prototype)
+        .nfd()
 }