22
22
# > rust-unicode-data.h
23
23
24
24
import sys
25
+ from typing import Tuple
26
+
27
+ Codepoint = int
28
+ Range = Tuple [Codepoint , Codepoint ]
25
29
26
30
COPYRIGHT = (
27
31
"// Copyright (C) 2020-2023 Free Software Foundation, Inc.\n "
44
48
)
45
49
46
50
# Decomposition_Mapping table
47
- decomposition_map = {}
51
+ decomposition_map : dict [ Codepoint , list [ Codepoint ]] = {}
48
52
# Canonical_Combining_Class table
49
- ccc_table = {}
53
+ ccc_table : dict [ Codepoint , int ] = {}
50
54
# Ranges of codepoints with the Full_Composition_Exclusion property
51
- composition_exclusion_ranges = []
55
+ composition_exclusion_ranges : list [ Range ] = []
52
56
# Ranges of codepoints with the Alphabetic property
53
- alphabetic_ranges = []
57
+ alphabetic_ranges : list [ Range ] = []
54
58
# Ranges of codepoints with NFC_QC=No
55
- nfc_qc_no_ranges = []
59
+ nfc_qc_no_ranges : list [ Range ] = []
56
60
# Ranges of codepoints with NFC_QC=Maybe
57
- nfc_qc_maybe_ranges = []
58
- numeric_codepoints = []
61
+ nfc_qc_maybe_ranges : list [ Range ] = []
62
+ numeric_codepoints : list [ Codepoint ] = []
59
63
60
64
# Note that each range `(m, n)` (a tuple in Python) represents the half-open interval [m, n)
61
65
62
66
63
- def binary_search_ranges (ranges , target ) :
64
- low = 0
65
- high = len (ranges ) - 1
67
+ def binary_search_ranges (ranges : list [ Range ] , target : Codepoint ) -> int :
68
+ low : int = 0
69
+ high : int = len (ranges ) - 1
66
70
while low <= high :
67
71
mid = (low + high ) // 2
68
72
start , end = ranges [mid ]
@@ -77,8 +81,8 @@ def binary_search_ranges(ranges, target):
77
81
78
82
79
83
# Utility function to parse '<codepoint>..<codepoint>' or '<codepoint>'
80
- def parse_codepoint_range (range_str ) :
81
- codepoint_range = range_str .split (".." )
84
+ def parse_codepoint_range (range_str : str ) -> Range :
85
+ codepoint_range : list [ str ] = range_str .split (".." )
82
86
assert len (codepoint_range ) == 1 or len (codepoint_range ) == 2 , "Invalid format"
83
87
start_cp , end_cp = 0 , 0
84
88
if len (codepoint_range ) == 1 :
@@ -89,11 +93,11 @@ def parse_codepoint_range(range_str):
89
93
# m..n => [m, n+1)
90
94
start_cp = int (codepoint_range [0 ], 16 )
91
95
end_cp = int (codepoint_range [1 ], 16 ) + 1
92
- return [ start_cp , end_cp ]
96
+ return start_cp , end_cp
93
97
94
98
95
- def read_unicode_data_txt (filepath ) :
96
- def process_line (line ) :
99
+ def read_unicode_data_txt (filepath : str ) -> None :
100
+ def process_line (line : str ) -> None :
97
101
rows = line .split (";" )
98
102
if len (rows ) != 15 :
99
103
return
@@ -124,13 +128,13 @@ def process_line(line):
124
128
if len (decomp_cps ) > 0 :
125
129
decomposition_map [cp ] = decomp_cps
126
130
127
- with open (sys . argv [ 1 ] , "r" , encoding = "UTF-8" ) as file :
131
+ with open (filepath , "r" , encoding = "UTF-8" ) as file :
128
132
while line := file .readline ():
129
133
process_line (line .rstrip ())
130
134
131
135
132
- def read_derived_norm_props_txt (filepath ) :
133
- def process_line (line ):
136
+ def read_derived_norm_props_txt (filepath : str ) -> None :
137
+ def process_line (line ) -> None :
134
138
# Ignore comments
135
139
line = line .split ("#" )[0 ]
136
140
rows = line .split (";" )
@@ -157,8 +161,8 @@ def process_line(line):
157
161
process_line (line .rstrip ())
158
162
159
163
160
- def read_derived_core_props_txt (filepath ) :
161
- def process_line (line ) :
164
+ def read_derived_core_props_txt (filepath : str ) -> None :
165
+ def process_line (line : str ) -> None :
162
166
# Ignore comments
163
167
line = line .split ("#" )[0 ]
164
168
rows = line .split (";" )
@@ -169,15 +173,15 @@ def process_line(line):
169
173
rows [1 ] = rows [1 ].lstrip ().rstrip ()
170
174
if rows [1 ] != "Alphabetic" :
171
175
return
172
- cp_range = parse_codepoint_range (rows [0 ])
176
+ cp_range : Range = parse_codepoint_range (rows [0 ])
173
177
alphabetic_ranges .append (cp_range )
174
178
175
179
with open (filepath , "r" , encoding = "UTF-8" ) as file :
176
180
while line := file .readline ():
177
181
process_line (line .rstrip ())
178
182
179
183
180
- def write_decomposition ():
184
+ def write_decomposition () -> None :
181
185
print ("const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {" )
182
186
print (" // clang-format off" )
183
187
for cp in sorted (decomposition_map ):
@@ -190,14 +194,16 @@ def write_decomposition():
190
194
print ("};" )
191
195
192
196
193
- def write_recomposition ():
197
+ def write_recomposition () -> None :
194
198
print (
195
199
"const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{"
196
200
)
197
201
print (" // clang-format off" )
198
202
for cp in decomposition_map :
199
203
if binary_search_ranges (composition_exclusion_ranges , cp ) != - 1 :
200
204
continue
205
+ d1 : Codepoint
206
+ d2 : Codepoint
201
207
if len (decomposition_map [cp ]) == 1 :
202
208
d1 = decomposition_map [cp ][0 ]
203
209
d2 = 0
@@ -209,7 +215,7 @@ def write_recomposition():
209
215
print ("}};" )
210
216
211
217
212
- def write_ccc ():
218
+ def write_ccc () -> None :
213
219
print ("const std::map<uint32_t, int32_t> CCC_TABLE = {" )
214
220
print (" // clang-format off" )
215
221
for cp in ccc_table :
@@ -218,7 +224,7 @@ def write_ccc():
218
224
print ("};" )
219
225
220
226
221
- def write_alphabetic ():
227
+ def write_alphabetic () -> None :
222
228
print (
223
229
"const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{"
224
230
)
@@ -229,7 +235,7 @@ def write_alphabetic():
229
235
print ("}};" )
230
236
231
237
232
- def write_numeric ():
238
+ def write_numeric () -> None :
233
239
print ("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{" )
234
240
print (" // clang-format off" )
235
241
for i , cp in enumerate (numeric_codepoints ):
@@ -244,13 +250,13 @@ def write_numeric():
244
250
print ("}};" )
245
251
246
252
247
- def main ():
253
+ def main () -> None :
248
254
if len (sys .argv ) != 4 :
249
255
print ("too few arguments" , file = sys .stderr )
250
256
exit (- 1 )
251
- unicode_txt_path = sys .argv [1 ]
252
- norm_props_txt_path = sys .argv [2 ]
253
- core_props_txt_path = sys .argv [3 ]
257
+ unicode_txt_path : str = sys .argv [1 ]
258
+ norm_props_txt_path : str = sys .argv [2 ]
259
+ core_props_txt_path : str = sys .argv [3 ]
254
260
255
261
read_unicode_data_txt (unicode_txt_path )
256
262
read_derived_norm_props_txt (norm_props_txt_path )
@@ -271,8 +277,6 @@ def main():
271
277
print ()
272
278
write_recomposition ()
273
279
print ()
274
- # write_composition_exclusion()
275
- # print()
276
280
write_ccc ()
277
281
print ()
278
282
write_alphabetic ()
0 commit comments