Skip to content

Commit b893439

Browse files
committed
Auto merge of #47088 - clarcharr:cleanup_unicode_py, r=alexcrichton
Move static code out of unicode.py. The script in libstd_unicode is a mess and also emits code that shouldn't be generated by a script at all; that code should simply live in regular modules. This change moves it there.
2 parents 0f4ebf9 + b4b3ddd commit b893439

File tree

6 files changed

+143
-225
lines changed

6 files changed

+143
-225
lines changed

src/libstd_unicode/bool_trie.rs

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
/// BoolTrie is a trie for representing a set of Unicode codepoints. It is
12+
/// implemented with postfix compression (sharing of identical child nodes),
13+
/// which gives both compact size and fast lookup.
14+
///
15+
/// The space of Unicode codepoints is divided into 3 subareas, each
16+
/// represented by a trie with different depth. In the first (0..0x800), there
17+
/// is no trie structure at all; each u64 entry corresponds to a bitvector
18+
/// effectively holding 64 bool values.
19+
///
20+
/// In the second (0x800..0x10000), each child of the root node represents a
21+
/// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
22+
/// the trie stores an 8-bit index into a shared table of leaf values. This
23+
/// exploits the fact that in reasonable sets, many such leaves can be shared.
24+
///
25+
/// In the third (0x10000..0x110000), each child of the root node represents a
26+
/// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
27+
/// of a child tree. Each of these 64 bytes represents an index into the table
28+
/// of shared 64-bit leaf values. This exploits the sparse structure in the
29+
/// non-BMP range of most Unicode sets.
30+
pub struct BoolTrie {
31+
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
32+
pub r1: [u64; 32], // leaves
33+
34+
// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
35+
pub r2: [u8; 992], // first level
36+
pub r3: &'static [u64], // leaves
37+
38+
// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
39+
pub r4: [u8; 256], // first level
40+
pub r5: &'static [u8], // second level
41+
pub r6: &'static [u64], // leaves
42+
}
43+
impl BoolTrie {
44+
pub fn lookup(&self, c: char) -> bool {
45+
let c = c as usize;
46+
if c < 0x800 {
47+
trie_range_leaf(c, self.r1[c >> 6])
48+
} else if c < 0x10000 {
49+
let child = self.r2[(c >> 6) - 0x20];
50+
trie_range_leaf(c, self.r3[child as usize])
51+
} else {
52+
let child = self.r4[(c >> 12) - 0x10];
53+
let leaf = self.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
54+
trie_range_leaf(c, self.r6[leaf as usize])
55+
}
56+
}
57+
}
58+
59+
pub struct SmallBoolTrie {
60+
pub(crate) r1: &'static [u8], // first level
61+
pub(crate) r2: &'static [u64], // leaves
62+
}
63+
64+
impl SmallBoolTrie {
65+
pub fn lookup(&self, c: char) -> bool {
66+
let c = c as usize;
67+
match self.r1.get(c >> 6) {
68+
Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
69+
None => false,
70+
}
71+
}
72+
}
73+
74+
/// Tests the bit for `c` inside a 64-bit leaf.
///
/// Only the low six bits of `c` are used; they select which bit of
/// `bitmap_chunk` is examined.
fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
    let mask = 1u64 << (c & 63);
    (bitmap_chunk & mask) != 0
}

src/libstd_unicode/char.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ pub use core::char::CharTryFromError;
4747
#[unstable(feature = "decode_utf8", issue = "33906")]
4848
pub use core::char::{DecodeUtf8, decode_utf8};
4949
#[unstable(feature = "unicode", issue = "27783")]
50-
pub use tables::{UnicodeVersion, UNICODE_VERSION};
50+
pub use tables::{UNICODE_VERSION};
51+
#[unstable(feature = "unicode", issue = "27783")]
52+
pub use version::UnicodeVersion;
5153

5254
/// Returns an iterator that yields the lowercase equivalent of a `char`.
5355
///

src/libstd_unicode/lib.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,18 @@
3535
#![feature(core_char_ext)]
3636
#![feature(str_internals)]
3737
#![feature(decode_utf8)]
38-
#![feature(fused)]
3938
#![feature(fn_traits)]
39+
#![feature(fused)]
4040
#![feature(lang_items)]
41+
#![feature(non_exhaustive)]
4142
#![feature(staged_api)]
4243
#![feature(try_from)]
4344
#![feature(unboxed_closures)]
4445

46+
mod bool_trie;
4547
mod tables;
4648
mod u_str;
49+
mod version;
4750
pub mod char;
4851
pub mod lossy;
4952

src/libstd_unicode/tables.rs

+24-107
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,8 @@
1212

1313
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
1414

15-
/// Represents a Unicode Version.
16-
///
17-
/// See also: <http://www.unicode.org/versions/>
18-
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
19-
pub struct UnicodeVersion {
20-
/// Major version.
21-
pub major: u32,
22-
23-
/// Minor version.
24-
pub minor: u32,
25-
26-
/// Micro (or Update) version.
27-
pub micro: u32,
28-
29-
// Private field to keep struct expandable.
30-
_priv: (),
31-
}
15+
use version::UnicodeVersion;
16+
use bool_trie::{BoolTrie, SmallBoolTrie};
3217

3318
/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
3419
/// `CharExt` and `UnicodeStrPrelude` traits are based on.
@@ -38,76 +23,8 @@ pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
3823
micro: 0,
3924
_priv: (),
4025
};
41-
42-
43-
// BoolTrie is a trie for representing a set of Unicode codepoints. It is
44-
// implemented with postfix compression (sharing of identical child nodes),
45-
// which gives both compact size and fast lookup.
46-
//
47-
// The space of Unicode codepoints is divided into 3 subareas, each
48-
// represented by a trie with different depth. In the first (0..0x800), there
49-
// is no trie structure at all; each u64 entry corresponds to a bitvector
50-
// effectively holding 64 bool values.
51-
//
52-
// In the second (0x800..0x10000), each child of the root node represents a
53-
// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
54-
// the trie stores an 8-bit index into a shared table of leaf values. This
55-
// exploits the fact that in reasonable sets, many such leaves can be shared.
56-
//
57-
// In the third (0x10000..0x110000), each child of the root node represents a
58-
// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
59-
// of a child tree. Each of these 64 bytes represents an index into the table
60-
// of shared 64-bit leaf values. This exploits the sparse structure in the
61-
// non-BMP range of most Unicode sets.
62-
pub struct BoolTrie {
63-
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
64-
r1: [u64; 32], // leaves
65-
66-
// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
67-
r2: [u8; 992], // first level
68-
r3: &'static [u64], // leaves
69-
70-
// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
71-
r4: [u8; 256], // first level
72-
r5: &'static [u8], // second level
73-
r6: &'static [u64], // leaves
74-
}
75-
76-
fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
77-
((bitmap_chunk >> (c & 63)) & 1) != 0
78-
}
79-
80-
fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
81-
let c = c as usize;
82-
if c < 0x800 {
83-
trie_range_leaf(c, r.r1[c >> 6])
84-
} else if c < 0x10000 {
85-
let child = r.r2[(c >> 6) - 0x20];
86-
trie_range_leaf(c, r.r3[child as usize])
87-
} else {
88-
let child = r.r4[(c >> 12) - 0x10];
89-
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
90-
trie_range_leaf(c, r.r6[leaf as usize])
91-
}
92-
}
93-
94-
pub struct SmallBoolTrie {
95-
r1: &'static [u8], // first level
96-
r2: &'static [u64], // leaves
97-
}
98-
99-
impl SmallBoolTrie {
100-
fn lookup(&self, c: char) -> bool {
101-
let c = c as usize;
102-
match self.r1.get(c >> 6) {
103-
Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
104-
None => false,
105-
}
106-
}
107-
}
108-
10926
pub mod general_category {
110-
pub const Cc_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
27+
pub const Cc_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
11128
r1: &[
11229
0, 1, 0
11330
],
@@ -120,7 +37,7 @@ pub mod general_category {
12037
Cc_table.lookup(c)
12138
}
12239

123-
pub const N_table: &'static super::BoolTrie = &super::BoolTrie {
40+
pub const N_table: &super::BoolTrie = &super::BoolTrie {
12441
r1: [
12542
0x03ff000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
12643
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
@@ -212,13 +129,13 @@ pub mod general_category {
212129
};
213130

214131
pub fn N(c: char) -> bool {
215-
super::trie_lookup_range_table(c, N_table)
132+
N_table.lookup(c)
216133
}
217134

218135
}
219136

220137
pub mod derived_property {
221-
pub const Alphabetic_table: &'static super::BoolTrie = &super::BoolTrie {
138+
pub const Alphabetic_table: &super::BoolTrie = &super::BoolTrie {
222139
r1: [
223140
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
224141
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
@@ -397,10 +314,10 @@ pub mod derived_property {
397314
};
398315

399316
pub fn Alphabetic(c: char) -> bool {
400-
super::trie_lookup_range_table(c, Alphabetic_table)
317+
Alphabetic_table.lookup(c)
401318
}
402319

403-
pub const Case_Ignorable_table: &'static super::BoolTrie = &super::BoolTrie {
320+
pub const Case_Ignorable_table: &super::BoolTrie = &super::BoolTrie {
404321
r1: [
405322
0x0400408000000000, 0x0000000140000000, 0x0190a10000000000, 0x0000000000000000,
406323
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
@@ -529,10 +446,10 @@ pub mod derived_property {
529446
};
530447

531448
pub fn Case_Ignorable(c: char) -> bool {
532-
super::trie_lookup_range_table(c, Case_Ignorable_table)
449+
Case_Ignorable_table.lookup(c)
533450
}
534451

535-
pub const Cased_table: &'static super::BoolTrie = &super::BoolTrie {
452+
pub const Cased_table: &super::BoolTrie = &super::BoolTrie {
536453
r1: [
537454
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
538455
0xffffffffffffffff, 0xffffffffffffffff, 0xf7ffffffffffffff, 0xfffffffffffffff0,
@@ -628,10 +545,10 @@ pub mod derived_property {
628545
};
629546

630547
pub fn Cased(c: char) -> bool {
631-
super::trie_lookup_range_table(c, Cased_table)
548+
Cased_table.lookup(c)
632549
}
633550

634-
pub const Lowercase_table: &'static super::BoolTrie = &super::BoolTrie {
551+
pub const Lowercase_table: &super::BoolTrie = &super::BoolTrie {
635552
r1: [
636553
0x0000000000000000, 0x07fffffe00000000, 0x0420040000000000, 0xff7fffff80000000,
637554
0x55aaaaaaaaaaaaaa, 0xd4aaaaaaaaaaab55, 0xe6512d2a4e243129, 0xaa29aaaab5555240,
@@ -725,10 +642,10 @@ pub mod derived_property {
725642
};
726643

727644
pub fn Lowercase(c: char) -> bool {
728-
super::trie_lookup_range_table(c, Lowercase_table)
645+
Lowercase_table.lookup(c)
729646
}
730647

731-
pub const Uppercase_table: &'static super::BoolTrie = &super::BoolTrie {
648+
pub const Uppercase_table: &super::BoolTrie = &super::BoolTrie {
732649
r1: [
733650
0x0000000000000000, 0x0000000007fffffe, 0x0000000000000000, 0x000000007f7fffff,
734651
0xaa55555555555555, 0x2b555555555554aa, 0x11aed2d5b1dbced6, 0x55d255554aaaa490,
@@ -823,10 +740,10 @@ pub mod derived_property {
823740
};
824741

825742
pub fn Uppercase(c: char) -> bool {
826-
super::trie_lookup_range_table(c, Uppercase_table)
743+
Uppercase_table.lookup(c)
827744
}
828745

829-
pub const XID_Continue_table: &'static super::BoolTrie = &super::BoolTrie {
746+
pub const XID_Continue_table: &super::BoolTrie = &super::BoolTrie {
830747
r1: [
831748
0x03ff000000000000, 0x07fffffe87fffffe, 0x04a0040000000000, 0xff7fffffff7fffff,
832749
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
@@ -998,10 +915,10 @@ pub mod derived_property {
998915
};
999916

1000917
pub fn XID_Continue(c: char) -> bool {
1001-
super::trie_lookup_range_table(c, XID_Continue_table)
918+
XID_Continue_table.lookup(c)
1002919
}
1003920

1004-
pub const XID_Start_table: &'static super::BoolTrie = &super::BoolTrie {
921+
pub const XID_Start_table: &super::BoolTrie = &super::BoolTrie {
1005922
r1: [
1006923
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
1007924
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
@@ -1175,13 +1092,13 @@ pub mod derived_property {
11751092
};
11761093

11771094
pub fn XID_Start(c: char) -> bool {
1178-
super::trie_lookup_range_table(c, XID_Start_table)
1095+
XID_Start_table.lookup(c)
11791096
}
11801097

11811098
}
11821099

11831100
pub mod property {
1184-
pub const Pattern_White_Space_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
1101+
pub const Pattern_White_Space_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
11851102
r1: &[
11861103
0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
11871104
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -1198,7 +1115,7 @@ pub mod property {
11981115
Pattern_White_Space_table.lookup(c)
11991116
}
12001117

1201-
pub const White_Space_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
1118+
pub const White_Space_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
12021119
r1: &[
12031120
0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
12041121
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -1238,11 +1155,11 @@ pub mod conversions {
12381155
}
12391156
}
12401157

1241-
fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
1158+
fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
12421159
table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
12431160
}
12441161

1245-
const to_lowercase_table: &'static [(char, [char; 3])] = &[
1162+
const to_lowercase_table: &[(char, [char; 3])] = &[
12461163
('\u{41}', ['\u{61}', '\0', '\0']), ('\u{42}', ['\u{62}', '\0', '\0']), ('\u{43}',
12471164
['\u{63}', '\0', '\0']), ('\u{44}', ['\u{64}', '\0', '\0']), ('\u{45}', ['\u{65}', '\0',
12481165
'\0']), ('\u{46}', ['\u{66}', '\0', '\0']), ('\u{47}', ['\u{67}', '\0', '\0']), ('\u{48}',
@@ -1826,7 +1743,7 @@ pub mod conversions {
18261743
('\u{1e920}', ['\u{1e942}', '\0', '\0']), ('\u{1e921}', ['\u{1e943}', '\0', '\0'])
18271744
];
18281745

1829-
const to_uppercase_table: &'static [(char, [char; 3])] = &[
1746+
const to_uppercase_table: &[(char, [char; 3])] = &[
18301747
('\u{61}', ['\u{41}', '\0', '\0']), ('\u{62}', ['\u{42}', '\0', '\0']), ('\u{63}',
18311748
['\u{43}', '\0', '\0']), ('\u{64}', ['\u{44}', '\0', '\0']), ('\u{65}', ['\u{45}', '\0',
18321749
'\0']), ('\u{66}', ['\u{46}', '\0', '\0']), ('\u{67}', ['\u{47}', '\0', '\0']), ('\u{68}',

0 commit comments

Comments
 (0)