Skip to content

Commit d36e4eb

Browse files
committed
Add function for Unicode NFC normalization
gcc/rust/ChangeLog: * Make-lang.in: Add rust-unicode.o * rust-lang.cc (run_rust_tests): Add test. * util/make-rust-unicode.py: Generator of rust-unicode-data.h. * util/rust-unicode-data.h: Auto-generated file. * util/rust-unicode.cc: New file. * util/rust-unicode.h: New file. Signed-off-by: Raiki Tamura <[email protected]>
1 parent 601c289 commit d36e4eb

File tree

6 files changed

+5657
-0
lines changed

6 files changed

+5657
-0
lines changed

gcc/rust/Make-lang.in

+1
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ GRS_OBJS = \
180180
rust/rust-feature.o \
181181
rust/rust-feature-gate.o \
182182
rust/rust-dir-owner.o \
183+
rust/rust-unicode.o \
183184
$(END)
184185
# removed object files from here
185186

gcc/rust/rust-lang.cc

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include "rust-ast-resolve-item.h"
4040
#include "rust-lex.h"
4141
#include "optional.h"
42+
#include "rust-unicode.h"
4243

4344
#include <mpfr.h>
4445
// note: header files must be in this order or else forward declarations don't
@@ -458,6 +459,7 @@ run_rust_tests ()
458459
rust_privacy_ctx_test ();
459460
rust_crate_name_validation_test ();
460461
rust_simple_path_resolve_test ();
462+
rust_utf8_normalize_test ();
461463
}
462464
} // namespace selftest
463465

gcc/rust/util/make-rust-unicode.py

+256
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
# Copyright (C) 2020-2023 Free Software Foundation, Inc.
2+
3+
# This file is part of GCC.
4+
5+
# GCC is free software; you can redistribute it and/or modify it under
6+
# the terms of the GNU General Public License as published by the Free
7+
# Software Foundation; either version 3, or (at your option) any later
8+
# version.
9+
10+
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
12+
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13+
# for more details.
14+
15+
# You should have received a copy of the GNU General Public License
16+
# along with GCC; see the file COPYING3. If not see
17+
# <http://www.gnu.org/licenses/>.
18+
19+
# Run this program as
20+
# python ./make-rust-unicode.py UnicodeData.txt \
21+
# DerivedNormalizationProps.txt DerivedCoreProperties.txt
22+
# > rust-unicode-data.h
23+
24+
import sys
25+
26+
# License header emitted verbatim at the top of the generated C++ file.
# Stored as one entry per output line; empty entries are blank lines.
COPYRIGHT = "\n".join(
    [
        "// Copyright (C) 2020-2023 Free Software Foundation, Inc.",
        "",
        "// This file is part of GCC.",
        "",
        "// GCC is free software; you can redistribute it and/or modify it under",
        "// the terms of the GNU General Public License as published by the Free",
        "// Software Foundation; either version 3, or (at your option) any later",
        "// version.",
        "",
        "// GCC is distributed in the hope that it will be useful, but WITHOUT ANY",
        "// WARRANTY; without even the implied warranty of MERCHANTABILITY or",
        "// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License",
        "// for more details.",
        "",
        "// You should have received a copy of the GNU General Public License",
        "// along with GCC; see the file COPYING3. If not see",
        "// <http://www.gnu.org/licenses/>.",
    ]
)
45+
46+
# Decomposition_Mapping table
47+
decomposition_map = {}
48+
# Canonical_Combining_Class table
49+
ccc_table = {}
50+
# Ranges of codepoints with the Full_Composition_Exclusion property
51+
composition_exclusion_ranges = []
52+
# Ranges of codepoints with the Alphabetic property
53+
alphabetic_ranges = []
54+
# Ranges of codepoints with NFC_QC=No
55+
nfc_qc_no_ranges = []
56+
# Ranges of codepoints with NFC_QC=Maybe
57+
nfc_qc_maybe_ranges = []
58+
59+
60+
def parse_codepoint_range(range_str):
    """Parse '<codepoint>..<codepoint>' or '<codepoint>' into a half-open range.

    Code points are hexadecimal. The result is a two-element list
    [start, end) where `end` is one past the last code point covered, so a
    single code point `m` and the range `m..m` both yield [m, m + 1].

    Raises AssertionError if `range_str` contains more than one '..'
    separator, and ValueError if a code point is not valid hexadecimal.
    """
    codepoint_range = range_str.split("..")
    assert len(codepoint_range) in (1, 2), "Invalid format"
    start_cp = int(codepoint_range[0], 16)
    # For a single code point the first and last elements coincide.
    last_cp = int(codepoint_range[-1], 16)
    # The data files give the last code point inclusively; convert it to an
    # exclusive bound so both input forms use the same convention.
    return [start_cp, last_cp + 1]
72+
73+
74+
def read_unicode_data_txt(filepath):
    """Load combining classes and canonical decompositions from `filepath`.

    `filepath` is a semicolon-separated file with 15 fields per record
    (the UnicodeData.txt layout). For every record:

    * a non-zero value in field 3 is stored in the module-level `ccc_table`
      keyed by the code point in field 0;
    * a field 5 that does not start with '<' and lists at least one code
      point is stored in the module-level `decomposition_map`.

    Lines that do not have exactly 15 fields are skipped.

    Raises AssertionError if a decomposition lists more than two code
    points.
    """

    def process_line(line):
        rows = line.split(";")
        if len(rows) != 15:
            return
        cp = int(rows[0], 16)
        ccc = int(rows[3], 10)
        if ccc != 0:
            ccc_table[cp] = ccc
        # Mappings tagged with '<...>' are compatibility decompositions,
        # which are not required for NFC normalization.
        if rows[5].startswith("<"):
            return
        decomp_cps = [int(s, 16) for s in rows[5].split(" ") if s != ""]
        assert (
            len(decomp_cps) <= 2
        ), "Decomposition_Mapping must not contain more than 2 characters."
        if decomp_cps:
            decomposition_map[cp] = decomp_cps

    # Read from the path the caller passed in, not from sys.argv directly.
    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())
104+
105+
106+
def read_derived_norm_props_txt(filepath):
    """Collect normalization-related code point ranges from `filepath`.

    Each line is read as '<range> ; <property> [; <value>]' with anything
    after '#' discarded. Ranges are appended to module-level lists:

    * Full_Composition_Exclusion -> composition_exclusion_ranges
    * NFC_QC with value N        -> nfc_qc_no_ranges
    * NFC_QC with value M        -> nfc_qc_maybe_ranges

    Lines with fewer than two fields are skipped; other properties are
    ignored. Raises RuntimeError for an NFC_QC value other than N or M.
    """

    def process_line(line):
        # Drop the trailing comment, then split into fields.
        fields = line.split("#")[0].split(";")
        if len(fields) < 2:
            return
        cp_range = parse_codepoint_range(fields[0].strip())
        prop = fields[1].strip()
        if prop == "Full_Composition_Exclusion":
            composition_exclusion_ranges.append(cp_range)
            return
        if prop != "NFC_QC":
            return
        assert len(fields) >= 3, "Too few rows for NFC_QC"
        value = fields[2].strip()
        if value == "N":
            nfc_qc_no_ranges.append(cp_range)
        elif value == "M":
            nfc_qc_maybe_ranges.append(cp_range)
        else:
            raise RuntimeError("Value of NFC_QC must be N or M")

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())
132+
133+
134+
def read_derived_core_props_txt(filepath):
    """Collect the code point ranges marked Alphabetic in `filepath`.

    Each line is read as '<range> ; <property>' with anything after '#'
    discarded. Ranges whose property is exactly 'Alphabetic' are appended
    to the module-level `alphabetic_ranges`; every other line is ignored.
    """

    def process_line(line):
        # Drop the trailing comment, then split into fields.
        fields = line.split("#")[0].split(";")
        if len(fields) < 2:
            return
        if fields[1].strip() != "Alphabetic":
            return
        alphabetic_ranges.append(parse_codepoint_range(fields[0].strip()))

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())
152+
153+
154+
def write_decomposition():
    """Print the C++ definition of DECOMPOSITION_MAP to stdout.

    One entry is written per key of the module-level `decomposition_map`,
    in ascending code point order, each mapping the code point to the list
    of code points it decomposes into.
    """
    print("const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {")
    print(" // clang-format off")
    for cp in sorted(decomposition_map):
        # Every member keeps its trailing ", ", including the last one.
        members = "".join("{:#06x}, ".format(d) for d in decomposition_map[cp])
        print(" {{{:#06x}, ".format(cp) + "{" + members + "}},")
    print(" // clang-format on")
    print("};")
165+
166+
167+
def write_ccc():
    """Print the C++ definition of CCC_TABLE to stdout.

    One {code point, class} entry is written per item of the module-level
    `ccc_table`, in the dictionary's iteration order.
    """
    header = "const std::map<uint32_t, int32_t> CCC_TABLE = {"
    entries = [
        " {{{:#06x}, {}}},".format(cp, ccc) for cp, ccc in ccc_table.items()
    ]
    print(header)
    print(" // clang-format off")
    for entry in entries:
        print(entry)
    print(" // clang-format on")
    print("};")
174+
175+
176+
# TODO: remove this
177+
# def write_composition_exclusion():
178+
# print(
179+
# "const std::array<std::pair<uint32_t, uint32_t>, {}>".format(
180+
# len(composition_exclusion_ranges)
181+
# )
182+
# )
183+
# print("NO_RECOMP_RANGES = {{")
184+
# for r in composition_exclusion_ranges:
185+
# print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
186+
# print("}};")
187+
188+
189+
def write_alphabetic():
    """Print the C++ definition of ALPHABETIC_RANGES to stdout.

    One {start, end} pair is written per entry of the module-level
    `alphabetic_ranges` list, in list order; the declared array size is
    the number of entries in that list.
    """
    # NOTE(review): a std::array of std::pair normally needs doubled outer
    # braces for aggregate initialization -- confirm the generated header
    # compiles with the single-brace form emitted here.
    print(
        "const std::array<std::pair<uint32_t, uint32_t>, {}> ALPHABETIC_RANGES = {{".format(
            len(alphabetic_ranges)
        )
    )
    print(" // clang-format off")
    # Use the Alphabetic table itself, not the composition-exclusion ranges.
    for cp_range in alphabetic_ranges:
        print(" {{{:#06x}, {:#06x}}},".format(cp_range[0], cp_range[1]))
    print(" // clang-format on")
    print("};")
200+
201+
202+
def write_recomposition():
    """Print the C++ definition of RECOMPOSITION_MAP to stdout.

    This is the inverse of the module-level `decomposition_map`: each entry
    maps the pair of decomposed code points back to the composed code
    point. A one-element decomposition is written with 0 as the second
    member of the pair. Entries follow the dictionary's iteration order.
    """
    # The emitted map is intentionally not const.
    # TODO: Exclude `Composition_Exclusion`s
    print("std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{")
    print(" // clang-format off")
    for composed, parts in decomposition_map.items():
        first = parts[0]
        second = 0 if len(parts) == 1 else parts[1]
        print(" {{{{{:#06x}, {:#06x}}}, {:#06x}}},".format(first, second, composed))
    print(" // clang-format on")
    print("}};")
217+
218+
219+
def main():
    """Read the three Unicode data files and print the C++ tables to stdout.

    Expects exactly three command-line arguments, in this order: the paths
    of UnicodeData.txt, DerivedNormalizationProps.txt and
    DerivedCoreProperties.txt. With any other argument count a usage
    message is written to stderr and the process exits with status -1.
    """
    if len(sys.argv) != 4:
        prog = sys.argv[0] if sys.argv else "make-rust-unicode.py"
        # The count can be wrong in either direction, so show the expected
        # invocation rather than claiming there are too few arguments.
        print(
            "usage: {} UnicodeData.txt DerivedNormalizationProps.txt "
            "DerivedCoreProperties.txt".format(prog),
            file=sys.stderr,
        )
        # sys.exit is always available; the bare `exit` helper is injected
        # by the `site` module and may be missing.
        sys.exit(-1)
    unicode_txt_path = sys.argv[1]
    norm_props_txt_path = sys.argv[2]
    core_props_txt_path = sys.argv[3]

    read_unicode_data_txt(unicode_txt_path)
    read_derived_norm_props_txt(norm_props_txt_path)
    read_derived_core_props_txt(core_props_txt_path)

    print(COPYRIGHT)
    print()

    print('#include "rust-system.h"')
    print()
    print("namespace Rust {")
    print()

    write_decomposition()
    print()
    write_recomposition()
    print()
    # TODO: emit the composition exclusion table once its writer is restored
    write_ccc()
    print()
    write_alphabetic()
    print()

    # TODO: write NFC_QC table

    print("} // namespace Rust")
253+
254+
255+
if __name__ == "__main__":
256+
main()

0 commit comments

Comments
 (0)