Skip to content

Commit 1ad6bc2

Browse files
committed
Improve Character emulation
1 parent 15609b6 commit 1ad6bc2

File tree

6 files changed

+346
-179
lines changed

6 files changed

+346
-179
lines changed

user/super/com/google/gwt/emul/java/lang/CaseMapper.java

+17-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,23 @@ public static char charToLowerCase(char c) {
2323
}
2424

2525
public static char charToUpperCase(char c) {
26-
return String.valueOf(c).toUpperCase().charAt(0);
26+
String upper = String.valueOf(c).toUpperCase();
27+
return hasExtraCodePoints(upper) ? c : upper.charAt(0);
28+
}
29+
30+
public static int intToLowerCase(int codePoint) {
31+
return String.NativeString.fromCodePoint(codePoint).toLowerCase().codePointAt(0);
32+
}
33+
34+
public static int intToUpperCase(int codePoint) {
35+
String upper = String.NativeString.fromCodePoint(codePoint).toUpperCase();
36+
return hasExtraCodePoints(upper) ? codePoint : upper.codePointAt(0);
37+
}
38+
39+
// If String.toUpperCase produces more than 1 codepoint, Character.toUpperCase should
40+
// act either as identity or title-case conversion (not supported in GWT).
41+
private static boolean hasExtraCodePoints(String str) {
42+
return str.asNativeString().codePointAt(1) > 0;
2743
}
2844

2945
private CaseMapper() {}

user/super/com/google/gwt/emul/java/lang/Character.java

+178-43
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,21 @@
2626
*
2727
* TODO(jat): many of the classification methods implemented here are not
2828
* correct in that they only handle ASCII characters, and many other methods
29-
* are not currently implemented. I think the proper approach is to introduce * a deferred binding parameter which substitutes an implementation using
29+
* are not currently implemented. I think the proper approach is to introduce
30+
* a deferred binding parameter which substitutes an implementation using
3031
* a fully-correct Unicode character database, at the expense of additional
3132
* data being downloaded. That way developers that need the functionality
3233
* can get it without those who don't need it paying for it.
3334
*
3435
* <pre>
3536
* The following methods are still not implemented -- most would require Unicode
3637
* character db to be useful:
37-
* - digit / is* / to*(int codePoint)
38-
* - isDefined(char)
38+
* - digit(int codePoint)
3939
* - isIdentifierIgnorable(char)
4040
* - isJavaIdentifierPart(char)
4141
* - isJavaIdentifierStart(char)
4242
* - isJavaLetter(char) -- deprecated, so probably not
4343
* - isJavaLetterOrDigit(char) -- deprecated, so probably not
44-
* - isISOControl(char)
45-
* - isMirrored(char)
46-
* - isSpaceChar(char)
4744
* - isUnicodeIdentifierPart(char)
4845
* - isUnicodeIdentifierStart(char)
4946
* - getDirectionality(*)
@@ -55,9 +52,6 @@
5552
*
5653
* The following do not properly handle characters outside of ASCII:
5754
* - digit(char c, int radix)
58-
* - isDigit(char c)
59-
* - isLetter(char c)
60-
* - isLetterOrDigit(char c)
6155
* - isLowerCase(char c)
6256
* - isUpperCase(char c)
6357
* </pre>
@@ -72,11 +66,11 @@ static class CharSequenceAdapter implements CharSequence {
7266
private int start;
7367
private int end;
7468

75-
public CharSequenceAdapter(char[] charArray) {
69+
CharSequenceAdapter(char[] charArray) {
7670
this(charArray, 0, charArray.length);
7771
}
7872

79-
public CharSequenceAdapter(char[] charArray, int start, int end) {
73+
CharSequenceAdapter(char[] charArray, int start, int end) {
8074
this.charArray = charArray;
8175
this.start = start;
8276
this.end = end;
@@ -234,57 +228,136 @@ public static boolean isBmpCodePoint(int codePoint) {
234228
return codePoint >= MIN_VALUE && codePoint <= MAX_VALUE;
235229
}
236230

231+
private static NativeRegExp definedRegex;
232+
233+
public static boolean isDefined(char c) {
234+
return isDefined(String.valueOf(c));
235+
}
236+
237+
public static boolean isDefined(int codePoint) {
238+
return isValidCodePoint(codePoint)
239+
&& isDefined(String.NativeString.fromCodePoint(codePoint));
240+
}
241+
242+
private static boolean isDefined(String str) {
243+
if (definedRegex == null) {
244+
definedRegex = new NativeRegExp("\\P{Cn}", "u");
245+
}
246+
return definedRegex.test(str);
247+
}
248+
237249
private static NativeRegExp digitRegex;
238250

239-
/*
240-
* TODO: correct Unicode handling.
241-
*/
242251
public static boolean isDigit(char c) {
252+
return isDigit(String.valueOf(c));
253+
}
254+
255+
// Known differences between Java 17 and Chrome 135
256+
// 11f50 .. 11f59, 16ac0 .. 16ac9, 1e4f0 .. 1e4f9, 1fbf0 .. 1fbf9
257+
public static boolean isDigit(int codePoint) {
258+
return isValidCodePoint(codePoint) && isDigit(String.NativeString.fromCodePoint(codePoint));
259+
}
260+
261+
private static boolean isDigit(String str) {
243262
if (digitRegex == null) {
244-
digitRegex = new NativeRegExp("\\d");
263+
digitRegex = new NativeRegExp("\\p{Nd}", "u");
245264
}
246-
return digitRegex.test(String.valueOf(c));
265+
return digitRegex.test(String.valueOf(str));
247266
}
248267

249268
public static boolean isHighSurrogate(char ch) {
250269
return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
251270
}
252271

272+
private static NativeRegExp ideographicRegex;
273+
274+
public static boolean isIdeographic(int codePoint) {
275+
return isValidCodePoint(codePoint)
276+
&& isIdeographic(String.NativeString.fromCodePoint(codePoint));
277+
}
278+
279+
private static boolean isIdeographic(String str) {
280+
if (ideographicRegex == null) {
281+
ideographicRegex = new NativeRegExp("\\p{Ideographic}", "u");
282+
}
283+
return ideographicRegex.test(str);
284+
}
285+
253286
private static NativeRegExp leterRegex;
254287

255-
/*
256-
* TODO: correct Unicode handling.
257-
*/
258288
public static boolean isLetter(char c) {
289+
return isLetter(String.valueOf(c));
290+
}
291+
292+
public static boolean isLetter(int codePoint) {
293+
return isValidCodePoint(codePoint)
294+
&& isLetter(String.NativeString.fromCodePoint(codePoint));
295+
}
296+
297+
// Tests str against the Unicode letter category (\p{L}), building the regex lazily on first use.
// NOTE(review): this helper is declared public, while the parallel String helpers in this file
// (isDigit, isLowerCase, isUpperCase, isMirrored, isSpaceChar, ...) are private, and
// java.lang.Character has no isLetter(String) -- confirm whether it should be private as well.
public static boolean isLetter(String str) {
259298
if (leterRegex == null) {
260-
leterRegex = new NativeRegExp("[A-Z]", "i");
299+
leterRegex = new NativeRegExp("\\p{L}", "u");
261300
}
262-
return leterRegex.test(String.valueOf(c));
301+
return leterRegex.test(str);
263302
}
264303

265304
private static NativeRegExp isLeterOrDigitRegex;
266305

267-
/*
268-
* TODO: correct Unicode handling.
269-
*/
270306
public static boolean isLetterOrDigit(char c) {
271307
if (isLeterOrDigitRegex == null) {
272-
isLeterOrDigitRegex = new NativeRegExp("[A-Z\\d]", "i");
308+
isLeterOrDigitRegex = new NativeRegExp("[\\p{Nd}\\p{L}]", "u");
273309
}
274310
return isLeterOrDigitRegex.test(String.valueOf(c));
275311
}
276312

277-
/*
278-
* TODO: correct Unicode handling.
279-
*/
313+
private static NativeRegExp lowerCaseRegex;
314+
280315
public static boolean isLowerCase(char c) {
281-
return toLowerCase(c) == c && isLetter(c);
316+
return isLowerCase(String.valueOf(c));
317+
}
318+
319+
public static boolean isLowerCase(int codePoint) {
320+
return isValidCodePoint(codePoint)
321+
&& isLowerCase(String.NativeString.fromCodePoint(codePoint));
322+
}
323+
324+
private static boolean isLowerCase(String str) {
325+
if (lowerCaseRegex == null) {
326+
lowerCaseRegex = new NativeRegExp("\\p{Lowercase}", "u");
327+
}
328+
return lowerCaseRegex.test(str);
282329
}
283330

284331
public static boolean isLowSurrogate(char ch) {
285332
return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
286333
}
287334

335+
private static NativeRegExp mirroredRegex;
336+
337+
public static boolean isMirrored(char c) {
338+
return isMirrored(String.valueOf(c));
339+
}
340+
341+
public static boolean isMirrored(int codePoint) {
342+
return isValidCodePoint(codePoint)
343+
&& isMirrored(String.NativeString.fromCodePoint(codePoint));
344+
}
345+
346+
private static boolean isMirrored(String str) {
347+
if (mirroredRegex == null) {
348+
mirroredRegex = new NativeRegExp("\\p{Bidi_Mirrored}", "u");
349+
}
350+
return mirroredRegex.test(str);
351+
}
352+
353+
public static boolean isISOControl(char ch) {
354+
return ch <= '\u001F' || (ch >= '\u007F' && ch <= '\u009F');
355+
}
356+
357+
public static boolean isISOControl(int codePoint) {
358+
return codePoint <= '\u001F' || (codePoint >= '\u007F' && codePoint <= '\u009F');
359+
}
360+
288361
/**
289362
* Deprecated - see isWhitespace(char).
290363
*/
@@ -306,12 +379,35 @@ public static boolean isSpace(char c) {
306379
}
307380
}
308381

382+
private static NativeRegExp spaceRegex;
383+
384+
public static boolean isSpaceChar(char c) {
385+
return isSpaceChar(String.valueOf(c));
386+
}
387+
388+
public static boolean isSpaceChar(int codePoint) {
389+
return isValidCodePoint(codePoint)
390+
&& isSpaceChar(String.NativeString.fromCodePoint(codePoint));
391+
}
392+
393+
private static boolean isSpaceChar(String str) {
394+
if (spaceRegex == null) {
395+
spaceRegex = new NativeRegExp("\\p{Z}", "u");
396+
}
397+
return spaceRegex.test(str);
398+
}
399+
400+
public static boolean isSurrogate(char ch) {
401+
return ch >= MIN_SURROGATE && ch <= MAX_SURROGATE;
402+
}
403+
309404
public static boolean isWhitespace(char ch) {
310405
return isWhitespace(String.valueOf(ch));
311406
}
312407

313408
public static boolean isWhitespace(int codePoint) {
314-
return isWhitespace(String.fromCodePoint(codePoint));
409+
return isValidCodePoint(codePoint)
410+
&& isWhitespace(String.NativeString.fromCodePoint(codePoint));
315411
}
316412

317413
private static NativeRegExp whitespaceRegex;
@@ -339,14 +435,31 @@ public static boolean isSurrogatePair(char highSurrogate, char lowSurrogate) {
339435

340436
public static boolean isTitleCase(char c) {
341437
// https://www.compart.com/en/unicode/category/Lt
342-
return c != toUpperCase(c) && c != toLowerCase(c);
438+
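// Use the semantics of String.toUpperCase here, not Character.toUpperCase (which falls
// back to identity when upper-casing yields more than one code point).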
439+
return c != String.valueOf(c).toUpperCase().charAt(0) && c != toLowerCase(c);
343440
}
344441

345-
/*
346-
* TODO: correct Unicode handling.
347-
*/
442+
public static boolean isTitleCase(int codePoint) {
443+
// as of Unicode 16 there are no title-case chars beyond 0xffff
444+
return codePoint > 0 && codePoint < 0xffff && isTitleCase((char) codePoint);
445+
}
446+
447+
private static NativeRegExp upperCaseRegex;
448+
348449
public static boolean isUpperCase(char c) {
349-
return toUpperCase(c) == c && isLetter(c);
450+
return isUpperCase(String.valueOf(c));
451+
}
452+
453+
public static boolean isUpperCase(int codePoint) {
454+
return isValidCodePoint(codePoint)
455+
&& isUpperCase(String.NativeString.fromCodePoint(codePoint));
456+
}
457+
458+
private static boolean isUpperCase(String c) {
459+
if (upperCaseRegex == null) {
460+
upperCaseRegex = new NativeRegExp("\\p{Uppercase}", "u");
461+
}
462+
return upperCaseRegex.test(c);
350463
}
351464

352465
public static boolean isValidCodePoint(int codePoint) {
@@ -390,8 +503,8 @@ public static char[] toChars(int codePoint) {
390503

391504
if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) {
392505
return new char[] {
393-
getHighSurrogate(codePoint),
394-
getLowSurrogate(codePoint),
506+
highSurrogate(codePoint),
507+
lowSurrogate(codePoint),
395508
};
396509
} else {
397510
return new char[] {
@@ -404,8 +517,8 @@ public static int toChars(int codePoint, char[] dst, int dstIndex) {
404517
checkCriticalArgument(codePoint >= 0 && codePoint <= MAX_CODE_POINT);
405518

406519
if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) {
407-
dst[dstIndex++] = getHighSurrogate(codePoint);
408-
dst[dstIndex] = getLowSurrogate(codePoint);
520+
dst[dstIndex++] = highSurrogate(codePoint);
521+
dst[dstIndex] = lowSurrogate(codePoint);
409522
return 2;
410523
} else {
411524
dst[dstIndex] = (char) codePoint;
@@ -426,14 +539,36 @@ public static char toLowerCase(char c) {
426539
return CaseMapper.charToLowerCase(c);
427540
}
428541

542+
public static int toLowerCase(int codePoint) {
543+
if (codePoint > MAX_CODE_POINT) {
544+
return codePoint;
545+
}
546+
return CaseMapper.intToLowerCase(codePoint);
547+
}
548+
429549
public static String toString(char x) {
430550
return String.valueOf(x);
431551
}
432552

553+
public static String toString(int codePoint) {
554+
if (isValidCodePoint(codePoint)) {
555+
return String.NativeString.fromCodePoint(codePoint);
556+
} else {
557+
throw new IllegalArgumentException("Invalid code point: " + codePoint);
558+
}
559+
}
560+
433561
public static char toUpperCase(char c) {
434562
return CaseMapper.charToUpperCase(c);
435563
}
436564

565+
public static int toUpperCase(int codePoint) {
566+
if (!isValidCodePoint(codePoint)) {
567+
return codePoint;
568+
}
569+
return CaseMapper.intToUpperCase(codePoint);
570+
}
571+
437572
public static Character valueOf(char c) {
438573
if (c < 128) {
439574
return BoxedValues.get(c);
@@ -473,26 +608,26 @@ static char forDigit(int digit) {
473608

474609
/**
475610
* Computes the high surrogate character of the UTF16 representation of a
476-
* non-BMP code point. See {@link getLowSurrogate}.
611+
* non-BMP code point. See {@link #lowSurrogate}.
477612
*
478613
* @param codePoint requested codePoint, required to be >=
479614
* MIN_SUPPLEMENTARY_CODE_POINT
480615
* @return high surrogate character
481616
*/
482-
static char getHighSurrogate(int codePoint) {
617+
public static char highSurrogate(int codePoint) {
483618
return (char) (MIN_HIGH_SURROGATE
484619
+ (((codePoint - MIN_SUPPLEMENTARY_CODE_POINT) >> 10) & 1023));
485620
}
486621

487622
/**
488623
* Computes the low surrogate character of the UTF16 representation of a
489-
* non-BMP code point. See {@link getHighSurrogate}.
624+
* non-BMP code point. See {@link #highSurrogate}.
490625
*
491626
* @param codePoint requested codePoint, required to be >=
492627
* MIN_SUPPLEMENTARY_CODE_POINT
493628
* @return low surrogate character
494629
*/
495-
static char getLowSurrogate(int codePoint) {
630+
public static char lowSurrogate(int codePoint) {
496631
return (char) (MIN_LOW_SURROGATE + ((codePoint - MIN_SUPPLEMENTARY_CODE_POINT) & 1023));
497632
}
498633

0 commit comments

Comments
 (0)