Skip to content

Commit 4a9fc52

Browse files
committed
gccrs: Implement quick-check for Unicode
gcc/rust/ChangeLog: * rust-system.h: Add <algorithm>. * util/make-rust-unicode.py: Output NFC_Quick_Check table. * util/rust-codepoint.h (struct Codepoint): Add is_supplementary_character method. * util/rust-unicode-data.h: Generated. * util/rust-unicode.cc (lookup_cc): Modified to use std::lower_bound. (is_alphabetic): Likewise. (nfc_quick_check): New function. (nfc_normalize): Use nfc_quick_check. (is_nfc_qc_maybe): New function. (is_nfc_qc_no): New function. (rust_nfc_qc_test): New test. * util/rust-unicode.h (is_nfc_qc_no): New function. (is_nfc_qc_maybe): New function. (enum class): New enum class. (nfc_quick_check): New function. (rust_nfc_qc_test): New test. * util/DerivedCoreProperties.txt: New file. * util/DerivedNormalizationProps.txt: New file. * util/UnicodeData.txt: New file. Signed-off-by: Raiki Tamura <[email protected]>
1 parent 9864d7f commit 4a9fc52

9 files changed

+64141
-77
lines changed

gcc/rust/rust-system.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#include <utility>
4545
#include <fstream>
4646
#include <array>
47+
#include <algorithm>
4748

4849
// Rust frontend requires C++11 minimum, so will have unordered_map and set
4950
#include <unordered_map>

gcc/rust/util/DerivedCoreProperties.txt

Lines changed: 12832 additions & 0 deletions
Large diffs are not rendered by default.

gcc/rust/util/DerivedNormalizationProps.txt

Lines changed: 16089 additions & 0 deletions
Large diffs are not rendered by default.

gcc/rust/util/UnicodeData.txt

Lines changed: 34931 additions & 0 deletions
Large diffs are not rendered by default.

gcc/rust/util/make-rust-unicode.py

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,15 @@
1616
# along with GCC; see the file COPYING3. If not see
1717
# <http://www.gnu.org/licenses/>.
1818

19-
# Run this program as
19+
# First, download the following files from unicode.org
20+
# curl https://unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt \
21+
# -o gcc/rust/util/DerivedNormalizationProps.txt
22+
# curl https://unicode.org/Public/UNIDATA/UnicodeData.txt \
23+
# -o gcc/rust/util/UnicodeData.txt
24+
# curl https://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt \
25+
# -o gcc/rust/util/DerivedCoreProperties.txt
26+
#
27+
# Then run this program as
2028
# python ./make-rust-unicode.py UnicodeData.txt \
2129
# DerivedNormalizationProps.txt DerivedCoreProperties.txt \
2230
# > rust-unicode-data.h
@@ -250,6 +258,30 @@ def write_numeric() -> None:
250258
print("}};")
251259

252260

261+
def _write_range_table(name: str, ranges) -> None:
    """Print one C++ ``std::array`` of ``uint32_t`` pairs to stdout.

    ``name`` is the identifier given to the generated array and ``ranges`` is
    a sized iterable of two-element sequences.  The array extent is taken
    from ``len(ranges)`` and each element is printed in ``0x``-prefixed hex,
    zero-padded to at least four hex digits.
    """
    print(
        "const std::array<std::pair<uint32_t, uint32_t>, {}> {} = {{{{".format(
            len(ranges), name
        )
    )
    # Keep clang-format from reflowing the one-pair-per-line layout.
    print(" // clang-format off")
    for r in ranges:
        print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
    print(" // clang-format on")
    print("}};")


def write_nfc_qc() -> None:
    """Print the NFC_Quick_Check tables as C++ array definitions.

    Emits ``NFC_QC_NO_RANGES`` from the module-level ``nfc_qc_no_ranges``
    followed directly by ``NFC_QC_MAYBE_RANGES`` from ``nfc_qc_maybe_ranges``;
    no blank line is printed between the two definitions.
    """
    _write_range_table("NFC_QC_NO_RANGES", nfc_qc_no_ranges)
    _write_range_table("NFC_QC_MAYBE_RANGES", nfc_qc_maybe_ranges)
283+
284+
253285
def main() -> None:
254286
if len(sys.argv) != 4:
255287
print("too few arguments", file=sys.stderr)
@@ -265,13 +297,12 @@ def main() -> None:
265297
print(COPYRIGHT)
266298
print()
267299

268-
print('#include "rust-system.h"')
269-
print()
270-
print("namespace Rust {")
271-
print()
300+
print('#include "rust-system.h"\n')
301+
print("namespace Rust {\n")
272302
print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
273-
print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
274-
print()
303+
print(
304+
"const uint32_t NUM_NUMERIC_CODEPOINTS = {};\n".format(len(numeric_codepoints))
305+
)
275306

276307
write_decomposition()
277308
print()
@@ -283,8 +314,8 @@ def main() -> None:
283314
print()
284315
write_numeric()
285316
print()
286-
287-
# TODO: write NFC_QC table
317+
write_nfc_qc()
318+
print()
288319

289320
print("} // namespace Rust")
290321

gcc/rust/util/rust-codepoint.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ struct Codepoint
4040
static Codepoint eof () { return Codepoint (UINT32_MAX); }
4141
bool is_eof () const { return value == UINT32_MAX; }
4242
bool is_ascii () const { return value <= MAX_ASCII_CODEPOINT; }
43+
bool is_supplementary_character () const { return value > 0xFFFF; }
4344

4445
// Returns a C++ string containing string value of codepoint.
4546
std::string as_string ();

gcc/rust/util/rust-unicode-data.h

Lines changed: 154 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
namespace Rust {
2222

23-
const uint32_t NUM_ALPHABETIC_RANGES = 1117;
23+
const uint32_t NUM_ALPHABETIC_RANGES = 1141;
2424
const uint32_t NUM_NUMERIC_CODEPOINTS = 1831;
2525

2626
const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {
@@ -4167,6 +4167,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
41674167
{0x0bd7, 0x0bd8},
41684168
{0x0c00, 0x0c01},
41694169
{0x0c01, 0x0c04},
4170+
{0x0c04, 0x0c05},
41704171
{0x0c05, 0x0c0d},
41714172
{0x0c0e, 0x0c11},
41724173
{0x0c12, 0x0c29},
@@ -4202,6 +4203,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
42024203
{0x0ce0, 0x0ce2},
42034204
{0x0ce2, 0x0ce4},
42044205
{0x0cf1, 0x0cf3},
4206+
{0x0cf3, 0x0cf4},
42054207
{0x0d00, 0x0d02},
42064208
{0x0d02, 0x0d04},
42074209
{0x0d04, 0x0d0d},
@@ -4257,7 +4259,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
42574259
{0x0f49, 0x0f6d},
42584260
{0x0f71, 0x0f7f},
42594261
{0x0f7f, 0x0f80},
4260-
{0x0f80, 0x0f82},
4262+
{0x0f80, 0x0f84},
42614263
{0x0f88, 0x0f8d},
42624264
{0x0f8d, 0x0f98},
42634265
{0x0f99, 0x0fbd},
@@ -4758,6 +4760,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
47584760
{0x11071, 0x11073},
47594761
{0x11073, 0x11075},
47604762
{0x11075, 0x11076},
4763+
{0x11080, 0x11082},
47614764
{0x11082, 0x11083},
47624765
{0x11083, 0x110b0},
47634766
{0x110b0, 0x110b3},
@@ -4794,6 +4797,8 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
47944797
{0x11234, 0x11235},
47954798
{0x11237, 0x11238},
47964799
{0x1123e, 0x1123f},
4800+
{0x1123f, 0x11241},
4801+
{0x11241, 0x11242},
47974802
{0x11280, 0x11287},
47984803
{0x11288, 0x11289},
47994804
{0x1128a, 0x1128e},
@@ -4948,12 +4953,22 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
49484953
{0x11ee0, 0x11ef3},
49494954
{0x11ef3, 0x11ef5},
49504955
{0x11ef5, 0x11ef7},
4956+
{0x11f00, 0x11f02},
4957+
{0x11f02, 0x11f03},
4958+
{0x11f03, 0x11f04},
4959+
{0x11f04, 0x11f11},
4960+
{0x11f12, 0x11f34},
4961+
{0x11f34, 0x11f36},
4962+
{0x11f36, 0x11f3b},
4963+
{0x11f3e, 0x11f40},
4964+
{0x11f40, 0x11f41},
49514965
{0x11fb0, 0x11fb1},
49524966
{0x12000, 0x1239a},
49534967
{0x12400, 0x1246f},
49544968
{0x12480, 0x12544},
49554969
{0x12f90, 0x12ff1},
4956-
{0x13000, 0x1342f},
4970+
{0x13000, 0x13430},
4971+
{0x13441, 0x13447},
49574972
{0x14400, 0x14647},
49584973
{0x16800, 0x16a39},
49594974
{0x16a40, 0x16a5f},
@@ -4980,7 +4995,9 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
49804995
{0x1aff5, 0x1affc},
49814996
{0x1affd, 0x1afff},
49824997
{0x1b000, 0x1b123},
4998+
{0x1b132, 0x1b133},
49834999
{0x1b150, 0x1b153},
5000+
{0x1b155, 0x1b156},
49845001
{0x1b164, 0x1b168},
49855002
{0x1b170, 0x1b2fc},
49865003
{0x1bc00, 0x1bc6b},
@@ -5021,16 +5038,21 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
50215038
{0x1df00, 0x1df0a},
50225039
{0x1df0a, 0x1df0b},
50235040
{0x1df0b, 0x1df1f},
5041+
{0x1df25, 0x1df2b},
50245042
{0x1e000, 0x1e007},
50255043
{0x1e008, 0x1e019},
50265044
{0x1e01b, 0x1e022},
50275045
{0x1e023, 0x1e025},
50285046
{0x1e026, 0x1e02b},
5047+
{0x1e030, 0x1e06e},
5048+
{0x1e08f, 0x1e090},
50295049
{0x1e100, 0x1e12d},
50305050
{0x1e137, 0x1e13e},
50315051
{0x1e14e, 0x1e14f},
50325052
{0x1e290, 0x1e2ae},
50335053
{0x1e2c0, 0x1e2ec},
5054+
{0x1e4d0, 0x1e4eb},
5055+
{0x1e4eb, 0x1e4ec},
50345056
{0x1e7e0, 0x1e7e7},
50355057
{0x1e7e8, 0x1e7ec},
50365058
{0x1e7ed, 0x1e7ef},
@@ -5076,12 +5098,14 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
50765098
{0x1f150, 0x1f16a},
50775099
{0x1f170, 0x1f18a},
50785100
{0x20000, 0x2a6e0},
5079-
{0x2a700, 0x2b739},
5101+
{0x2a700, 0x2b73a},
50805102
{0x2b740, 0x2b81e},
50815103
{0x2b820, 0x2cea2},
50825104
{0x2ceb0, 0x2ebe1},
5105+
{0x2ebf0, 0x2ee5e},
50835106
{0x2f800, 0x2fa1e},
50845107
{0x30000, 0x3134b},
5108+
{0x31350, 0x323b0},
50855109
// clang-format on
50865110
}};
50875111

@@ -5205,4 +5229,130 @@ const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{
52055229
// clang-format on
52065230
}};
52075231

5232+
const std::array<std::pair<uint32_t, uint32_t>, 74> NFC_QC_NO_RANGES = {{
5233+
// clang-format off
5234+
{0x0340, 0x0342},
5235+
{0x0343, 0x0345},
5236+
{0x0374, 0x0375},
5237+
{0x037e, 0x037f},
5238+
{0x0387, 0x0388},
5239+
{0x0958, 0x0960},
5240+
{0x09dc, 0x09de},
5241+
{0x09df, 0x09e0},
5242+
{0x0a33, 0x0a34},
5243+
{0x0a36, 0x0a37},
5244+
{0x0a59, 0x0a5c},
5245+
{0x0a5e, 0x0a5f},
5246+
{0x0b5c, 0x0b5e},
5247+
{0x0f43, 0x0f44},
5248+
{0x0f4d, 0x0f4e},
5249+
{0x0f52, 0x0f53},
5250+
{0x0f57, 0x0f58},
5251+
{0x0f5c, 0x0f5d},
5252+
{0x0f69, 0x0f6a},
5253+
{0x0f73, 0x0f74},
5254+
{0x0f75, 0x0f77},
5255+
{0x0f78, 0x0f79},
5256+
{0x0f81, 0x0f82},
5257+
{0x0f93, 0x0f94},
5258+
{0x0f9d, 0x0f9e},
5259+
{0x0fa2, 0x0fa3},
5260+
{0x0fa7, 0x0fa8},
5261+
{0x0fac, 0x0fad},
5262+
{0x0fb9, 0x0fba},
5263+
{0x1f71, 0x1f72},
5264+
{0x1f73, 0x1f74},
5265+
{0x1f75, 0x1f76},
5266+
{0x1f77, 0x1f78},
5267+
{0x1f79, 0x1f7a},
5268+
{0x1f7b, 0x1f7c},
5269+
{0x1f7d, 0x1f7e},
5270+
{0x1fbb, 0x1fbc},
5271+
{0x1fbe, 0x1fbf},
5272+
{0x1fc9, 0x1fca},
5273+
{0x1fcb, 0x1fcc},
5274+
{0x1fd3, 0x1fd4},
5275+
{0x1fdb, 0x1fdc},
5276+
{0x1fe3, 0x1fe4},
5277+
{0x1feb, 0x1fec},
5278+
{0x1fee, 0x1ff0},
5279+
{0x1ff9, 0x1ffa},
5280+
{0x1ffb, 0x1ffc},
5281+
{0x1ffd, 0x1ffe},
5282+
{0x2000, 0x2002},
5283+
{0x2126, 0x2127},
5284+
{0x212a, 0x212c},
5285+
{0x2329, 0x232a},
5286+
{0x232a, 0x232b},
5287+
{0x2adc, 0x2add},
5288+
{0xf900, 0xfa0e},
5289+
{0xfa10, 0xfa11},
5290+
{0xfa12, 0xfa13},
5291+
{0xfa15, 0xfa1f},
5292+
{0xfa20, 0xfa21},
5293+
{0xfa22, 0xfa23},
5294+
{0xfa25, 0xfa27},
5295+
{0xfa2a, 0xfa6e},
5296+
{0xfa70, 0xfada},
5297+
{0xfb1d, 0xfb1e},
5298+
{0xfb1f, 0xfb20},
5299+
{0xfb2a, 0xfb37},
5300+
{0xfb38, 0xfb3d},
5301+
{0xfb3e, 0xfb3f},
5302+
{0xfb40, 0xfb42},
5303+
{0xfb43, 0xfb45},
5304+
{0xfb46, 0xfb4f},
5305+
{0x1d15e, 0x1d165},
5306+
{0x1d1bb, 0x1d1c1},
5307+
{0x2f800, 0x2fa1e},
5308+
// clang-format on
5309+
}};
5310+
const std::array<std::pair<uint32_t, uint32_t>, 43> NFC_QC_MAYBE_RANGES = {{
5311+
// clang-format off
5312+
{0x0300, 0x0305},
5313+
{0x0306, 0x030d},
5314+
{0x030f, 0x0310},
5315+
{0x0311, 0x0312},
5316+
{0x0313, 0x0315},
5317+
{0x031b, 0x031c},
5318+
{0x0323, 0x0329},
5319+
{0x032d, 0x032f},
5320+
{0x0330, 0x0332},
5321+
{0x0338, 0x0339},
5322+
{0x0342, 0x0343},
5323+
{0x0345, 0x0346},
5324+
{0x0653, 0x0656},
5325+
{0x093c, 0x093d},
5326+
{0x09be, 0x09bf},
5327+
{0x09d7, 0x09d8},
5328+
{0x0b3e, 0x0b3f},
5329+
{0x0b56, 0x0b57},
5330+
{0x0b57, 0x0b58},
5331+
{0x0bbe, 0x0bbf},
5332+
{0x0bd7, 0x0bd8},
5333+
{0x0c56, 0x0c57},
5334+
{0x0cc2, 0x0cc3},
5335+
{0x0cd5, 0x0cd7},
5336+
{0x0d3e, 0x0d3f},
5337+
{0x0d57, 0x0d58},
5338+
{0x0dca, 0x0dcb},
5339+
{0x0dcf, 0x0dd0},
5340+
{0x0ddf, 0x0de0},
5341+
{0x102e, 0x102f},
5342+
{0x1161, 0x1176},
5343+
{0x11a8, 0x11c3},
5344+
{0x1b35, 0x1b36},
5345+
{0x3099, 0x309b},
5346+
{0x110ba, 0x110bb},
5347+
{0x11127, 0x11128},
5348+
{0x1133e, 0x1133f},
5349+
{0x11357, 0x11358},
5350+
{0x114b0, 0x114b1},
5351+
{0x114ba, 0x114bb},
5352+
{0x114bd, 0x114be},
5353+
{0x115af, 0x115b0},
5354+
{0x11930, 0x11931},
5355+
// clang-format on
5356+
}};
5357+
52085358
} // namespace Rust

0 commit comments

Comments
 (0)