|
| 1 | +# Copyright (C) 2020-2023 Free Software Foundation, Inc. |
| 2 | + |
| 3 | +# This file is part of GCC. |
| 4 | + |
| 5 | +# GCC is free software; you can redistribute it and/or modify it under |
| 6 | +# the terms of the GNU General Public License as published by the Free |
| 7 | +# Software Foundation; either version 3, or (at your option) any later |
| 8 | +# version. |
| 9 | + |
| 10 | +# GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
| 11 | +# WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 12 | +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 13 | +# for more details. |
| 14 | + |
| 15 | +# You should have received a copy of the GNU General Public License |
| 16 | +# along with GCC; see the file COPYING3. If not see |
| 17 | +# <http://www.gnu.org/licenses/>. |
| 18 | + |
| 19 | +# Run this program as |
| 20 | +# python ./make-rust-unicode.py UnicodeData.txt \ |
| 21 | +# DerivedNormalizationProps.txt DerivedCoreProperties.txt \ |
| 22 | +# > rust-unicode-data.h |
| 23 | + |
| 24 | +import sys |
| 25 | + |
# License banner printed by main() as the first lines of the generated
# output. It is written as C++ `//` comments and has no trailing newline;
# print() supplies the final one.
COPYRIGHT = (
    "// Copyright (C) 2020-2023 Free Software Foundation, Inc.\n"
    "\n"
    "// This file is part of GCC.\n"
    "\n"
    "// GCC is free software; you can redistribute it and/or modify it under\n"
    "// the terms of the GNU General Public License as published by the Free\n"
    "// Software Foundation; either version 3, or (at your option) any later\n"
    "// version.\n"
    "\n"
    "// GCC is distributed in the hope that it will be useful, but WITHOUT ANY\n"
    "// WARRANTY; without even the implied warranty of MERCHANTABILITY or\n"
    "// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License\n"
    "// for more details.\n"
    "\n"
    "// You should have received a copy of the GNU General Public License\n"
    "// along with GCC; see the file COPYING3. If not see\n"
    "// <http://www.gnu.org/licenses/>."
)
| 45 | + |
# Decomposition_Mapping table: codepoint -> list of one or two codepoints.
# Only canonical mappings are stored; compatibility mappings (those tagged
# with `<...>`) are skipped by read_unicode_data_txt.
decomposition_map = {}
# Canonical_Combining_Class table: codepoint -> class. Codepoints whose
# class is 0 are not stored.
ccc_table = {}
# Ranges of codepoints with the Full_Composition_Exclusion property
composition_exclusion_ranges = []
# Ranges of codepoints with the Alphabetic property
alphabetic_ranges = []
# Ranges of codepoints with NFC_QC=No
nfc_qc_no_ranges = []
# Ranges of codepoints with NFC_QC=Maybe
nfc_qc_maybe_ranges = []
# Codepoints whose general category is Nd, Nl or No
numeric_codepoints = []

# Note that every range above is a two-element list `[m, n]` that denotes
# the half-open interval [m, n), i.e. `n` itself is NOT part of the range.
| 61 | + |
| 62 | + |
def binary_search_ranges(ranges, target):
    """Return the index of the range that contains `target`, or -1.

    `ranges` must be sorted in ascending order and non-overlapping; each
    element is a pair `[start, end]` denoting the half-open interval
    [start, end).
    """
    lo, hi = 0, len(ranges) - 1
    while lo <= hi:
        middle = (lo + hi) // 2
        first, past_last = ranges[middle]
        if target < first:
            # target lies before this range: keep searching the left half.
            hi = middle - 1
        elif target > past_last - 1:
            # target lies after this range: keep searching the right half.
            lo = middle + 1
        else:
            return middle
    # No range contains target.
    return -1
| 77 | + |
| 78 | + |
def parse_codepoint_range(range_str):
    """Parse '<codepoint>..<codepoint>' or '<codepoint>' into `[start, end]`.

    Codepoints are hexadecimal without a prefix. The input range is
    inclusive on both ends, while the returned two-element list denotes the
    half-open interval [start, end):

        'm..n' => [m, n + 1]
        'm'    => [m, m + 1]

    Raises ValueError if `range_str` has more than one '..' separator or if
    a codepoint is not valid hexadecimal.
    """
    parts = range_str.split("..")
    # Raise explicitly instead of asserting, so that malformed input is
    # still rejected when Python runs with -O (which strips asserts).
    if len(parts) not in (1, 2):
        raise ValueError("Invalid format: {!r}".format(range_str))
    start_cp = int(parts[0], 16)
    # A lone codepoint `m` is the range `m..m`, so the last element of
    # `parts` is the inclusive upper bound in both forms.
    last_cp = int(parts[-1], 16)
    return [start_cp, last_cp + 1]
| 93 | + |
| 94 | + |
def read_unicode_data_txt(filepath):
    """Read UnicodeData.txt at `filepath` and fill the module-level tables.

    For every well-formed record (15 `;`-separated fields) this records:
      * the codepoint in `numeric_codepoints` if its general category is
        Nd, Nl or No;
      * its Canonical_Combining_Class in `ccc_table` if it is non-zero;
      * its canonical decomposition in `decomposition_map` if it has one.
    Lines with any other number of fields are skipped.
    """

    def process_line(line):
        fields = line.split(";")
        if len(fields) != 15:
            return
        # Parse codepoint
        cp = int(fields[0], 16)
        # Parse general category
        if fields[2] in ("Nd", "Nl", "No"):
            numeric_codepoints.append(cp)

        # Parse CCC
        ccc = int(fields[3], 10)
        if ccc != 0:
            ccc_table[cp] = ccc

        # Parse decomposition mapping.
        # Compatibility decomposition mappings (tagged with `<...>`) are
        # ignored because they are not required for **NFC** normalization.
        mapping = fields[5]
        if mapping.startswith("<"):
            return
        decomp_cps = [int(s, 16) for s in mapping.split(" ") if s]
        assert (
            len(decomp_cps) <= 2
        ), "Decomposition_Mapping must not contain more than 2 characters."
        if decomp_cps:
            decomposition_map[cp] = decomp_cps

    # Open the path given by the caller; the previous code ignored
    # `filepath` and always opened sys.argv[1].
    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())
| 130 | + |
| 131 | + |
def read_derived_norm_props_txt(filepath):
    """Read DerivedNormalizationProps.txt and collect the ranges we need.

    Ranges with Full_Composition_Exclusion go to
    `composition_exclusion_ranges`; NFC_QC ranges go to `nfc_qc_no_ranges`
    or `nfc_qc_maybe_ranges` depending on their value (N or M). All other
    properties are ignored.
    """

    def process_line(line):
        # Everything after '#' is a comment.
        payload = line.split("#")[0]
        fields = [field.strip() for field in payload.split(";")]
        # Blank or comment-only line: nothing to record.
        if len(fields) < 2:
            return
        cp_range = parse_codepoint_range(fields[0])
        prop = fields[1]
        if prop == "Full_Composition_Exclusion":
            composition_exclusion_ranges.append(cp_range)
            return
        if prop != "NFC_QC":
            return
        assert len(fields) >= 3, "Too few rows for NFC_QC"
        value = fields[2]
        if value == "N":
            nfc_qc_no_ranges.append(cp_range)
        elif value == "M":
            nfc_qc_maybe_ranges.append(cp_range)
        else:
            raise RuntimeError("Value of NFC_QC must be N or M")

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())
| 158 | + |
| 159 | + |
def read_derived_core_props_txt(filepath):
    """Read DerivedCoreProperties.txt and collect the Alphabetic ranges.

    Every range whose property field is `Alphabetic` is appended to
    `alphabetic_ranges`; all other properties are ignored.
    """

    def process_line(line):
        # Everything after '#' is a comment.
        payload = line.split("#")[0]
        fields = payload.split(";")
        # Blank or comment-only line: nothing to record.
        if len(fields) < 2:
            return
        if fields[1].strip() == "Alphabetic":
            alphabetic_ranges.append(parse_codepoint_range(fields[0].strip()))

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())
| 178 | + |
| 179 | + |
def write_decomposition():
    """Print the C++ definition of DECOMPOSITION_MAP to stdout.

    One entry is printed per key of `decomposition_map`, in ascending
    codepoint order.
    """
    print("const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {")
    print(" // clang-format off")
    for codepoint in sorted(decomposition_map):
        # Every element keeps its trailing ", ", including the last one.
        members = "".join(
            "{:#06x}, ".format(part) for part in decomposition_map[codepoint]
        )
        print(" {{{:#06x}, ".format(codepoint) + "{" + members + "}},")
    print(" // clang-format on")
    print("};")
| 191 | + |
| 192 | + |
def write_recomposition():
    """Print the C++ definition of RECOMPOSITION_MAP to stdout.

    The map is the inverse of `decomposition_map`: the key is the pair of
    decomposed codepoints and the value is the composed codepoint. A
    one-element decomposition uses 0 as the second half of the key.
    Codepoints that fall in `composition_exclusion_ranges` are left out.
    """
    print(
        "const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{"
    )
    print(" // clang-format off")
    for composed, parts in decomposition_map.items():
        if binary_search_ranges(composition_exclusion_ranges, composed) != -1:
            continue
        first = parts[0]
        second = 0 if len(parts) == 1 else parts[1]
        print(" {{{{{:#06x}, {:#06x}}}, {:#06x}}},".format(first, second, composed))
    print(" // clang-format on")
    print("}};")
| 210 | + |
| 211 | + |
def write_ccc():
    """Print the C++ definition of CCC_TABLE to stdout.

    One `{codepoint, class}` entry is printed per item of `ccc_table`, in
    the table's iteration order.
    """
    print("const std::map<uint32_t, int32_t> CCC_TABLE = {")
    print(" // clang-format off")
    for codepoint, combining_class in ccc_table.items():
        print(" {{{:#06x}, {}}},".format(codepoint, combining_class))
    print(" // clang-format on")
    print("};")
| 219 | + |
| 220 | + |
def write_alphabetic():
    """Print the C++ definition of ALPHABETIC_RANGES to stdout.

    One `{start, end}` pair is printed per element of `alphabetic_ranges`,
    in list order. Each element is a two-item `[start, end]` list.
    """
    print(
        "const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{"
    )
    print(" // clang-format off")
    for first, past_last in alphabetic_ranges:
        print(" {{{:#06x}, {:#06x}}},".format(first, past_last))
    print(" // clang-format on")
    print("}};")
| 230 | + |
| 231 | + |
def write_numeric():
    """Print the C++ definition of NUMERIC_CODEPOINTS to stdout.

    The entries of `numeric_codepoints` are printed in list order, 16 per
    output line. An empty list produces an array with no entries.
    """
    per_line = 16
    print("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{")
    print(" // clang-format off")
    # Slicing in fixed-size chunks avoids reading a loop variable after the
    # loop, which raised UnboundLocalError when the list was empty.
    for offset in range(0, len(numeric_codepoints), per_line):
        chunk = numeric_codepoints[offset : offset + per_line]
        print(" " + "".join("{:#06x}, ".format(cp) for cp in chunk))
    print(" // clang-format on")
    print("}};")
| 245 | + |
| 246 | + |
def main():
    """Read the three Unicode data files and print the C++ header to stdout.

    Expects exactly three command-line arguments, in this order:
    UnicodeData.txt, DerivedNormalizationProps.txt and
    DerivedCoreProperties.txt. With any other argument count, a usage
    message is written to stderr and the process exits with status -1.
    """
    if len(sys.argv) != 4:
        # The count can be wrong in either direction, so print the expected
        # usage rather than claiming there are too few arguments.
        print(
            "usage: make-rust-unicode.py UnicodeData.txt "
            "DerivedNormalizationProps.txt DerivedCoreProperties.txt",
            file=sys.stderr,
        )
        # sys.exit is always available; the bare `exit` builtin is only
        # installed by the `site` module.
        sys.exit(-1)
    unicode_txt_path = sys.argv[1]
    norm_props_txt_path = sys.argv[2]
    core_props_txt_path = sys.argv[3]

    read_unicode_data_txt(unicode_txt_path)
    read_derived_norm_props_txt(norm_props_txt_path)
    read_derived_core_props_txt(core_props_txt_path)

    print(COPYRIGHT)
    print()

    print('#include "rust-system.h"')
    print()
    print("namespace Rust {")
    print()
    print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
    print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
    print()

    write_decomposition()
    print()
    # NOTE: no composition-exclusion table is emitted; write_recomposition()
    # leaves excluded codepoints out of RECOMPOSITION_MAP instead.
    write_recomposition()
    print()
    write_ccc()
    print()
    write_alphabetic()
    print()
    write_numeric()
    print()

    # TODO: write NFC_QC table

    print("} // namespace Rust")
| 286 | + |
| 287 | + |
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
0 commit comments