Skip to content

Commit 7ce263e

Browse files
tamaroning authored and philberty committed
Add function for Unicode NFC normalization
gcc/rust/ChangeLog: * Make-lang.in: Add rust-unicode.o * rust-lang.cc (run_rust_tests): Add test. * rust-system.h: Include <array> * util/make-rust-unicode.py: Generator of rust-unicode-data.h. * util/rust-unicode-data.h: Auto-generated file. * util/rust-unicode.cc: New file. * util/rust-unicode.h: New file. Signed-off-by: Raiki Tamura <[email protected]>
1 parent 42bd81f commit 7ce263e

File tree

7 files changed

+5879
-0
lines changed

7 files changed

+5879
-0
lines changed

gcc/rust/Make-lang.in

+1
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ GRS_OBJS = \
181181
rust/rust-feature.o \
182182
rust/rust-feature-gate.o \
183183
rust/rust-dir-owner.o \
184+
rust/rust-unicode.o \
184185
$(END)
185186
# removed object files from here
186187

gcc/rust/rust-lang.cc

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include "rust-ast-resolve-item.h"
4040
#include "rust-lex.h"
4141
#include "optional.h"
42+
#include "rust-unicode.h"
4243

4344
#include <mpfr.h>
4445
// note: header files must be in this order or else forward declarations don't
@@ -458,6 +459,7 @@ run_rust_tests ()
458459
rust_privacy_ctx_test ();
459460
rust_crate_name_validation_test ();
460461
rust_simple_path_resolve_test ();
462+
rust_utf8_normalize_test ();
461463
}
462464
} // namespace selftest
463465

gcc/rust/rust-system.h

+1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
#include <memory>
4444
#include <utility>
4545
#include <fstream>
46+
#include <array>
4647

4748
// Rust frontend requires C++11 minimum, so will have unordered_map and set
4849
#include <unordered_map>

gcc/rust/util/make-rust-unicode.py

+289
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
# Copyright (C) 2020-2023 Free Software Foundation, Inc.
2+
3+
# This file is part of GCC.
4+
5+
# GCC is free software; you can redistribute it and/or modify it under
6+
# the terms of the GNU General Public License as published by the Free
7+
# Software Foundation; either version 3, or (at your option) any later
8+
# version.
9+
10+
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
12+
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13+
# for more details.
14+
15+
# You should have received a copy of the GNU General Public License
16+
# along with GCC; see the file COPYING3. If not see
17+
# <http://www.gnu.org/licenses/>.
18+
19+
# Run this program as
20+
# python ./make-rust-unicode.py UnicodeData.txt \
21+
# DerivedNormalizationProps.txt DerivedCoreProperties.txt \
22+
# > rust-unicode-data.h
23+
24+
import sys
25+
26+
COPYRIGHT = (
27+
"// Copyright (C) 2020-2023 Free Software Foundation, Inc.\n"
28+
"\n"
29+
"// This file is part of GCC.\n"
30+
"\n"
31+
"// GCC is free software; you can redistribute it and/or modify it under\n"
32+
"// the terms of the GNU General Public License as published by the Free\n"
33+
"// Software Foundation; either version 3, or (at your option) any later\n"
34+
"// version.\n"
35+
"\n"
36+
"// GCC is distributed in the hope that it will be useful, but WITHOUT ANY\n"
37+
"// WARRANTY; without even the implied warranty of MERCHANTABILITY or\n"
38+
"// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License\n"
39+
"// for more details.\n"
40+
"\n"
41+
"// You should have received a copy of the GNU General Public License\n"
42+
"// along with GCC; see the file COPYING3. If not see\n"
43+
"// <http://www.gnu.org/licenses/>."
44+
)
45+
46+
# Decomposition_Mapping table: codepoint -> list of the (at most two)
# codepoints of its canonical decomposition
decomposition_map = {}
# Canonical_Combining_Class table: codepoint -> CCC (only non-zero values)
ccc_table = {}
# Ranges of codepoints with the Full_Composition_Exclusion property
composition_exclusion_ranges = []
# Ranges of codepoints with the Alphabetic property
alphabetic_ranges = []
# Ranges of codepoints with NFC_QC=No
nfc_qc_no_ranges = []
# Ranges of codepoints with NFC_QC=Maybe
nfc_qc_maybe_ranges = []
# Codepoints whose general category is Nd, Nl or No
numeric_codepoints = []
59+
60+
# Note that an element of range `[m, n]` (a list in python) represents [m, n)
61+
62+
63+
def binary_search_ranges(ranges, target):
    """Find the half-open range that contains `target`.

    `ranges` is a list of `[start, end)` pairs.  The bisection only works
    when the pairs are sorted in ascending order and do not overlap.

    Returns the index of the range holding `target`, or -1 if none does.
    """
    lo, hi = 0, len(ranges) - 1
    while lo <= hi:
        middle = (lo + hi) // 2
        first, past_last = ranges[middle]
        if target < first:
            # Target lies before this range: continue in the left half.
            hi = middle - 1
        elif target > past_last - 1:
            # Target is at or beyond the exclusive end: go right.
            lo = middle + 1
        else:
            return middle
    # Search window is empty: no range contains the target.
    return -1
77+
78+
79+
# Utility function to parse '<codepoint>..<codepoint>' or '<codepoint>'
def parse_codepoint_range(range_str):
    """Parse a hexadecimal codepoint or codepoint range into `[start, end)`.

    `range_str` is either a single codepoint (`"0041"`) or an inclusive
    range written with two dots (`"0041..005A"`).

    Returns a two-element list `[start, end]` where `end` is exclusive.

    Raises ValueError if `range_str` has more than one `..` separator or
    if a bound is not valid hexadecimal.
    """
    bounds = range_str.split("..")
    # An explicit raise (rather than assert) keeps the check under `-O`.
    if len(bounds) not in (1, 2):
        raise ValueError("Invalid format: {!r}".format(range_str))
    start_cp = int(bounds[0], 16)
    # m    => [m, m+1)
    # m..n => [m, n+1)
    # For a single codepoint bounds[-1] is bounds[0], so one expression
    # covers both shapes.
    end_cp = int(bounds[-1], 16) + 1
    return [start_cp, end_cp]
93+
94+
95+
def read_unicode_data_txt(filepath):
    """Load numeric codepoints, CCC values and canonical decompositions.

    Reads the `;`-separated file at `filepath` (UnicodeData.txt layout) and
    fills the module-level `numeric_codepoints`, `ccc_table` and
    `decomposition_map`.  Lines that do not have exactly 15 fields are
    skipped.
    """

    def process_line(line):
        rows = line.split(";")
        if len(rows) != 15:
            return
        # Parse codepoint
        cp = int(rows[0], 16)
        # Parse general category
        if rows[2] in ("Nd", "Nl", "No"):
            numeric_codepoints.append(cp)

        # Parse CCC; only non-zero classes are recorded
        ccc = int(rows[3], 10)
        if ccc != 0:
            ccc_table[cp] = ccc
        # Parse decomposition mapping
        # Ignore compatibility decomposition mapping (tagged with `<...>`)
        # because it is not required for **NFC** normalization.
        if not rows[5].startswith("<"):
            decomp_cps = [int(s, 16) for s in rows[5].split(" ") if s != ""]
            assert (
                len(decomp_cps) <= 2
            ), "Decomposition_Mapping must not contain more than 2 characters."
            if decomp_cps:
                decomposition_map[cp] = decomp_cps

    # Use the path handed in by the caller rather than re-reading sys.argv,
    # so the function works for any caller and any argument order.
    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())
130+
131+
132+
def read_derived_norm_props_txt(filepath):
    """Collect composition-exclusion and NFC_QC ranges from `filepath`.

    Each data line has the form `<range> ; <property> [; <value>]` with an
    optional trailing `#` comment.  Ranges are appended to the module-level
    `composition_exclusion_ranges`, `nfc_qc_no_ranges` and
    `nfc_qc_maybe_ranges`; lines for other properties are ignored.

    Raises RuntimeError if an NFC_QC line carries a value other than N or M.
    """

    def process_line(line):
        # Drop the trailing comment, then split into fields.
        fields = line.split("#")[0].split(";")
        if len(fields) < 2:
            # Blank or comment-only line: nothing to record.
            return
        prop_name = fields[1].strip()
        cp_range = parse_codepoint_range(fields[0].strip())
        if prop_name == "Full_Composition_Exclusion":
            composition_exclusion_ranges.append(cp_range)
            return
        if prop_name != "NFC_QC":
            return
        assert len(fields) >= 3, "Too few rows for NFC_QC"
        qc_value = fields[2].strip()
        if qc_value == "N":
            nfc_qc_no_ranges.append(cp_range)
        elif qc_value == "M":
            nfc_qc_maybe_ranges.append(cp_range)
        else:
            raise RuntimeError("Value of NFC_QC must be N or M")

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())
158+
159+
160+
def read_derived_core_props_txt(filepath):
    """Collect the ranges of Alphabetic codepoints from `filepath`.

    Each data line has the form `<range> ; <property>` with an optional
    trailing `#` comment.  Only lines whose property is `Alphabetic` are
    used; their ranges are appended to the module-level `alphabetic_ranges`.
    """

    def process_line(line):
        # Drop the trailing comment, then split into fields.
        fields = line.split("#")[0].split(";")
        if len(fields) < 2:
            # Blank or comment-only line: nothing to record.
            return
        if fields[1].strip() == "Alphabetic":
            alphabetic_ranges.append(parse_codepoint_range(fields[0].strip()))

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())
178+
179+
180+
def write_decomposition():
181+
print("const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {")
182+
print(" // clang-format off")
183+
for cp in sorted(decomposition_map):
184+
print(" {{{:#06x}, ".format(cp), end="")
185+
print("{", end="")
186+
for decomp_cp in decomposition_map[cp]:
187+
print("{:#06x}, ".format(decomp_cp), end="")
188+
print("}},")
189+
print(" // clang-format on")
190+
print("};")
191+
192+
193+
def write_recomposition():
    """Print the C++ definition of RECOMPOSITION_MAP to standard output.

    Every entry of `decomposition_map` whose codepoint is not inside one of
    `composition_exclusion_ranges` is emitted as `{{first, second}, cp}`.
    A one-element decomposition uses 0 as its second component.
    """
    print(
        "const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{"
    )
    print(" // clang-format off")
    entry_template = " {{{{{:#06x}, {:#06x}}}, {:#06x}}},"
    for composed, parts in decomposition_map.items():
        # Excluded codepoints must not be produced by recomposition.
        if binary_search_ranges(composition_exclusion_ranges, composed) != -1:
            continue
        first = parts[0]
        second = 0 if len(parts) == 1 else parts[1]
        print(entry_template.format(first, second, composed))
    print(" // clang-format on")
    print("}};")
210+
211+
212+
def write_ccc():
    """Print the C++ definition of CCC_TABLE to standard output.

    One `{codepoint, ccc}` entry is emitted per item of `ccc_table`, in the
    table's iteration order.
    """
    entry_template = " {{{:#06x}, {}}},"
    print("const std::map<uint32_t, int32_t> CCC_TABLE = {")
    print(" // clang-format off")
    for codepoint, combining_class in ccc_table.items():
        print(entry_template.format(codepoint, combining_class))
    print(" // clang-format on")
    print("};")
219+
220+
221+
def write_alphabetic():
    """Print the C++ definition of ALPHABETIC_RANGES to standard output.

    One `{start, end}` pair is emitted per element of `alphabetic_ranges`,
    in list order.
    """
    entry_template = " {{{:#06x}, {:#06x}}},"
    print(
        "const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{"
    )
    print(" // clang-format off")
    for bounds in alphabetic_ranges:
        print(entry_template.format(*bounds))
    print(" // clang-format on")
    print("}};")
230+
231+
232+
def write_numeric():
    """Print the C++ definition of NUMERIC_CODEPOINTS to standard output.

    The entries of `numeric_codepoints` are emitted in list order, 16 per
    output line.  An empty list produces an array with no entries.
    """
    per_row = 16
    print("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{")
    print(" // clang-format off")
    # Walk the list in fixed-size chunks.  This needs no loop variable after
    # the loop, so an empty list is handled without a NameError.
    for offset in range(0, len(numeric_codepoints), per_row):
        row = numeric_codepoints[offset : offset + per_row]
        print(" " + "".join("{:#06x}, ".format(cp) for cp in row))
    print(" // clang-format on")
    print("}};")
245+
246+
247+
def main():
    """Read the three Unicode data files and print rust-unicode-data.h.

    Expects exactly three command-line arguments, in this order: the paths
    of UnicodeData.txt, DerivedNormalizationProps.txt and
    DerivedCoreProperties.txt.  The generated header is written to standard
    output.  With any other argument count a usage message is printed to
    standard error and the process exits with status -1.
    """
    if len(sys.argv) != 4:
        # Report the real problem: the count may be too high as well as too
        # low.
        print(
            "usage: {} UnicodeData.txt DerivedNormalizationProps.txt "
            "DerivedCoreProperties.txt (expected 3 arguments, got {})".format(
                sys.argv[0], len(sys.argv) - 1
            ),
            file=sys.stderr,
        )
        # sys.exit is always available; the bare `exit` helper is only
        # installed by the `site` module.
        sys.exit(-1)
    unicode_txt_path = sys.argv[1]
    norm_props_txt_path = sys.argv[2]
    core_props_txt_path = sys.argv[3]

    read_unicode_data_txt(unicode_txt_path)
    read_derived_norm_props_txt(norm_props_txt_path)
    read_derived_core_props_txt(core_props_txt_path)

    print(COPYRIGHT)
    print()

    print('#include "rust-system.h"')
    print()
    print("namespace Rust {")
    print()
    # The array sizes must be declared before the std::array tables that
    # use them as template arguments.
    print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
    print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
    print()

    write_decomposition()
    print()
    write_recomposition()
    print()
    # write_composition_exclusion()
    # print()
    write_ccc()
    print()
    write_alphabetic()
    print()
    write_numeric()
    print()

    # TODO: write NFC_QC table

    print("} // namespace Rust")
286+
287+
288+
if __name__ == "__main__":
289+
main()

0 commit comments

Comments
 (0)