Skip to content

Commit 288f502

Browse files
committed
fix: derive canonicalize result from UCD
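This ensures that the generated case mappings in `data/` always conform to the Unicode version the build script targets, instead of depending on the Unicode support of the JavaScript engine that runs the script.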
1 parent f2f88e6 commit 288f502

File tree

4 files changed

+46
-15
lines changed

4 files changed

+46
-15
lines changed

data/i-bmp-mappings.js

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ module.exports = new Map([
210210
[0x198, 0x199],
211211
[0x199, 0x198],
212212
[0x19A, 0x23D],
213+
[0x19B, 0xA7DC],
213214
[0x19C, 0x26F],
214215
[0x19D, 0x272],
215216
[0x19E, 0x220],
@@ -398,6 +399,7 @@ module.exports = new Map([
398399
[0x260, 0x193],
399400
[0x261, 0xA7AC],
400401
[0x263, 0x194],
402+
[0x264, 0xA7CB],
401403
[0x265, 0xA78D],
402404
[0x266, 0xA7AA],
403405
[0x268, 0x197],
@@ -1155,6 +1157,8 @@ module.exports = new Map([
11551157
[0x1C86, 0x44A],
11561158
[0x1C87, 0x463],
11571159
[0x1C88, 0xA64B],
1160+
[0x1C89, 0x1C8A],
1161+
[0x1C8A, 0x1C89],
11581162
[0x1C90, 0x10D0],
11591163
[0x1C91, 0x10D1],
11601164
[0x1C92, 0x10D2],
@@ -2174,12 +2178,18 @@ module.exports = new Map([
21742178
[0xA7C8, 0xA7C7],
21752179
[0xA7C9, 0xA7CA],
21762180
[0xA7CA, 0xA7C9],
2181+
[0xA7CB, 0x264],
2182+
[0xA7CC, 0xA7CD],
2183+
[0xA7CD, 0xA7CC],
21772184
[0xA7D0, 0xA7D1],
21782185
[0xA7D1, 0xA7D0],
21792186
[0xA7D6, 0xA7D7],
21802187
[0xA7D7, 0xA7D6],
21812188
[0xA7D8, 0xA7D9],
21822189
[0xA7D9, 0xA7D8],
2190+
[0xA7DA, 0xA7DB],
2191+
[0xA7DB, 0xA7DA],
2192+
[0xA7DC, 0x19B],
21832193
[0xA7F5, 0xA7F6],
21842194
[0xA7F6, 0xA7F5],
21852195
[0xAB53, 0xA7B3],

data/iu-mappings.js

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ module.exports = new Map([
88
[0xDF, 0x1E9E],
99
[0xE5, 0x212B],
1010
[0x17F, 0x53],
11-
[0x19B, 0xA7DC],
1211
[0x1C4, 0x1C5],
1312
[0x1C5, 0x1C4],
1413
[0x1C7, 0x1C8],
@@ -17,7 +16,6 @@ module.exports = new Map([
1716
[0x1CB, 0x1CA],
1817
[0x1F1, 0x1F2],
1918
[0x1F2, 0x1F1],
20-
[0x264, 0xA7CB],
2119
[0x345, 0x1FBE],
2220
[0x390, 0x1FD3],
2321
[0x392, 0x3D0],
@@ -66,8 +64,6 @@ module.exports = new Map([
6664
[0x1C86, 0x42A],
6765
[0x1C87, 0x462],
6866
[0x1C88, 0xA64A],
69-
[0x1C89, 0x1C8A],
70-
[0x1C8A, 0x1C89],
7167
[0x1E60, 0x1E9B],
7268
[0x1E9B, 0x1E60],
7369
[0x1E9E, 0xDF],
@@ -141,12 +137,6 @@ module.exports = new Map([
141137
0xE5
142138
]],
143139
[0xA64A, 0x1C88],
144-
[0xA7CB, 0x264],
145-
[0xA7CC, 0xA7CD],
146-
[0xA7CD, 0xA7CC],
147-
[0xA7DA, 0xA7DB],
148-
[0xA7DB, 0xA7DA],
149-
[0xA7DC, 0x19B],
150140
[0xFB05, 0xFB06],
151141
[0xFB06, 0xFB05],
152142
[0x10400, 0x10428],

scripts/case-mappings.js

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,36 @@ const flattenMapping = (mapping, extendFilter) => {
7777
return result;
7878
};
7979

80+
const simpleUppercaseMapping = require('@unicode/unicode-16.0.0/Simple_Case_Mapping/Uppercase/symbols.js');
81+
const specialUppercaseMapping = require('@unicode/unicode-16.0.0/Special_Casing/Uppercase/symbols.js');
82+
83+
/**
 * Looks up the uppercase form of a single character in the Unicode data
 * tables loaded above.
 *
 * The special-casing table is consulted first, then the simple case mapping
 * table; when neither has an entry, the input is returned unchanged.
 *
 * @param {string} character - The character to convert (callers in this file
 *   pass the result of `String.fromCodePoint`).
 * @returns {string} The mapped value, or `character` itself when unmapped.
 */
const characterToUppercase = (character) => {
	// Note: The spec requires the Final_Sigma casing context data (available
	// under ./Special_Casing/Uppercase--Final_Sigma/) for a locale-insensitive
	// full case conversion. It is deliberately left out here, because
	// Final_Sigma should not be activated when the string holds only one
	// character.
	const lookupOrder = [specialUppercaseMapping, simpleUppercaseMapping];
	for (const mapping of lookupOrder) {
		const uppercased = mapping.get(character);
		// `!= null` mirrors nullish coalescing: skip only null/undefined.
		if (uppercased != null) {
			return uppercased;
		}
	}
	return character;
};
94+
95+
// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
96+
/**
 * Implements the ECMAScript `Canonicalize` abstract operation for the case
 * where HasEitherUnicodeFlag is false and rer.[[IgnoreCase]] is true, using
 * `characterToUppercase` instead of the host engine's own case tables.
 *
 * @param {number} codepoint - The code point to canonicalize.
 * @returns {number} The code point of the uppercase form, or `codepoint`
 *   itself when the uppercase form is not exactly one UTF-16 code unit or
 *   when a non-ASCII input would be mapped into the ASCII range.
 */
const canonicalize = (codepoint) => {
	// when HasEitherUnicodeFlag is false and rer.[[IgnoreCase]] is true
	const character = String.fromCodePoint(codepoint);
	const u = characterToUppercase(character);
	// `length` counts UTF-16 code units, so both multi-character expansions
	// and astral results leave the input unchanged.
	if (u.length !== 1) {
		return codepoint;
	}
	const cu = u.codePointAt(0);
	// The spec draws the ASCII boundary at 128 (0x80) for both comparisons:
	// a non-ASCII code point must never canonicalize to an ASCII one.
	if (codepoint >= 0x80 && cu < 0x80) {
		return codepoint;
	}
	return cu;
};
109+
80110
// From <http://unicode.org/Public/UCD/latest/ucd/CaseFolding.txt>:
81111
//
82112
// The status field is:
@@ -161,11 +191,7 @@ for (const [from, to] of oneWayMappings) {
161191
extend(filteredMappings, from, to);
162192
} else {
163193
// https://mths.be/es6#sec-runtime-semantics-canonicalize-abstract-operation
164-
if(
165-
// TODO: Make this not depend on the engine in which this build script
166-
// runs. (If V8 has a bug, then the generated data has the same bug.)
167-
!RegExp(String.fromCodePoint(from), 'i').test(String.fromCodePoint(to))
168-
) {
194+
if (canonicalize(from) !== canonicalize(to)) {
169195
extend(filteredMappings, from, to);
170196
} else if (from > 0x80 || to > 0x80) {
171197
extend(filteredBMPMappings, from, to);

tests/fixtures/modifiers.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,11 @@ const modifiersFixtures = [
156156
'matches': ['k', 'K', '\u{212A}', '\u{0131}'],
157157
'nonMatches': ['0', ',']
158158
},
159+
{
160+
// Unicode 16
161+
'pattern': '(?i:\u1C89)',
162+
'expected': '(?:[\\u1C89\\u1C8A])'
163+
},
159164
// +m
160165
{
161166
'pattern': '(?m:^[a-z])',

0 commit comments

Comments
 (0)