26
26
*
27
27
* TODO(jat): many of the classification methods implemented here are not
28
28
* correct in that they only handle ASCII characters, and many other methods
29
- * are not currently implemented. I think the proper approach is to introduce * a deferred binding parameter which substitutes an implementation using
29
+ * are not currently implemented. I think the proper approach is to introduce
30
+ * a deferred binding parameter which substitutes an implementation using
30
31
* a fully-correct Unicode character database, at the expense of additional
31
32
* data being downloaded. That way developers that need the functionality
32
33
* can get it without those who don't need it paying for it.
33
34
*
34
35
* <pre>
35
36
* The following methods are still not implemented -- most would require Unicode
36
37
* character db to be useful:
37
- * - digit / is* / to*(int codePoint)
38
- * - isDefined(char)
38
+ * - digit(int codePoint)
39
39
* - isIdentifierIgnorable(char)
40
40
* - isJavaIdentifierPart(char)
41
41
* - isJavaIdentifierStart(char)
42
42
* - isJavaLetter(char) -- deprecated, so probably not
43
43
* - isJavaLetterOrDigit(char) -- deprecated, so probably not
44
- * - isISOControl(char)
45
- * - isMirrored(char)
46
- * - isSpaceChar(char)
47
44
* - isUnicodeIdentifierPart(char)
48
45
* - isUnicodeIdentifierStart(char)
49
46
* - getDirectionality(*)
55
52
*
56
53
* The following do not properly handle characters outside of ASCII:
57
54
* - digit(char c, int radix)
58
- * - isDigit(char c)
59
- * - isLetter(char c)
60
- * - isLetterOrDigit(char c)
61
55
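* - isLowerCase(char c) -- stale entry: now tests the Unicode Lowercase property
62
56
* - isUpperCase(char c) -- stale entry: now tests the Unicode Uppercase property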
63
57
* </pre>
@@ -72,11 +66,11 @@ static class CharSequenceAdapter implements CharSequence {
72
66
private int start ;
73
67
private int end ;
74
68
75
- public CharSequenceAdapter (char [] charArray ) {
69
+ CharSequenceAdapter (char [] charArray ) {
76
70
this (charArray , 0 , charArray .length );
77
71
}
78
72
79
- public CharSequenceAdapter (char [] charArray , int start , int end ) {
73
+ CharSequenceAdapter (char [] charArray , int start , int end ) {
80
74
this .charArray = charArray ;
81
75
this .start = start ;
82
76
this .end = end ;
@@ -234,57 +228,136 @@ public static boolean isBmpCodePoint(int codePoint) {
234
228
return codePoint >= MIN_VALUE && codePoint <= MAX_VALUE ;
235
229
}
236
230
231
+ private static NativeRegExp definedRegex ;
232
+
233
+ public static boolean isDefined (char c ) {
234
+ return isDefined (String .valueOf (c ));
235
+ }
236
+
237
+ public static boolean isDefined (int codePoint ) {
238
+ return isValidCodePoint (codePoint )
239
+ && isDefined (String .NativeString .fromCodePoint (codePoint ));
240
+ }
241
+
242
+ private static boolean isDefined (String str ) {
243
+ if (definedRegex == null ) {
244
+ definedRegex = new NativeRegExp ("\\ P{Cn}" , "u" );
245
+ }
246
+ return definedRegex .test (str );
247
+ }
248
+
237
249
private static NativeRegExp digitRegex ;
238
250
239
- /*
240
- * TODO: correct Unicode handling.
241
- */
242
251
public static boolean isDigit (char c ) {
252
+ return isDigit (String .valueOf (c ));
253
+ }
254
+
255
+ // Known differences between Java 17 and Chrome 135
256
+ // 11f50 .. 11f59, 16ac0 .. 16ac9, 1e4f0 .. 1e4f9, 1fbf0 .. 1fbf9
257
+ public static boolean isDigit (int codePoint ) {
258
+ return isValidCodePoint (codePoint ) && isDigit (String .NativeString .fromCodePoint (codePoint ));
259
+ }
260
+
261
+ private static boolean isDigit (String str ) {
243
262
if (digitRegex == null ) {
244
- digitRegex = new NativeRegExp ("\\ d " );
263
+ digitRegex = new NativeRegExp ("\\ p{Nd}" , "u " );
245
264
}
246
- return digitRegex .test (String .valueOf (c ));
265
+ return digitRegex .test (String .valueOf (str ));
247
266
}
248
267
249
268
public static boolean isHighSurrogate (char ch ) {
250
269
return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE ;
251
270
}
252
271
272
+ private static NativeRegExp ideographicRegex ;
273
+
274
+ public static boolean isIdeographic (int codePoint ) {
275
+ return isValidCodePoint (codePoint )
276
+ && isIdeographic (String .NativeString .fromCodePoint (codePoint ));
277
+ }
278
+
279
+ private static boolean isIdeographic (String str ) {
280
+ if (ideographicRegex == null ) {
281
+ ideographicRegex = new NativeRegExp ("\\ p{Ideographic}" , "u" );
282
+ }
283
+ return ideographicRegex .test (str );
284
+ }
285
+
253
286
private static NativeRegExp leterRegex ;
254
287
255
- /*
256
- * TODO: correct Unicode handling.
257
- */
258
288
public static boolean isLetter (char c ) {
289
+ return isLetter (String .valueOf (c ));
290
+ }
291
+
292
+ public static boolean isLetter (int codePoint ) {
293
+ return isValidCodePoint (codePoint )
294
+ && isLetter (String .NativeString .fromCodePoint (codePoint ));
295
+ }
296
+
297
+ public static boolean isLetter (String str ) {
259
298
if (leterRegex == null ) {
260
- leterRegex = new NativeRegExp ("[A-Z] " , "i " );
299
+ leterRegex = new NativeRegExp ("\\ p{L} " , "u " );
261
300
}
262
- return leterRegex .test (String . valueOf ( c ) );
301
+ return leterRegex .test (str );
263
302
}
264
303
265
304
private static NativeRegExp isLeterOrDigitRegex ;
266
305
267
- /*
268
- * TODO: correct Unicode handling.
269
- */
270
306
public static boolean isLetterOrDigit (char c ) {
271
307
if (isLeterOrDigitRegex == null ) {
272
- isLeterOrDigitRegex = new NativeRegExp ("[A-Z \\ d ]" , "i " );
308
+ isLeterOrDigitRegex = new NativeRegExp ("[\\ p{Nd} \\ p{L} ]" , "u " );
273
309
}
274
310
return isLeterOrDigitRegex .test (String .valueOf (c ));
275
311
}
276
312
277
- /*
278
- * TODO: correct Unicode handling.
279
- */
313
+ private static NativeRegExp lowerCaseRegex ;
314
+
280
315
public static boolean isLowerCase (char c ) {
281
- return toLowerCase (c ) == c && isLetter (c );
316
+ return isLowerCase (String .valueOf (c ));
317
+ }
318
+
319
+ public static boolean isLowerCase (int codePoint ) {
320
+ return isValidCodePoint (codePoint )
321
+ && isLowerCase (String .NativeString .fromCodePoint (codePoint ));
322
+ }
323
+
324
+ private static boolean isLowerCase (String str ) {
325
+ if (lowerCaseRegex == null ) {
326
+ lowerCaseRegex = new NativeRegExp ("\\ p{Lowercase}" , "u" );
327
+ }
328
+ return lowerCaseRegex .test (str );
282
329
}
283
330
284
331
public static boolean isLowSurrogate (char ch ) {
285
332
return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE ;
286
333
}
287
334
335
+ private static NativeRegExp mirroredRegex ;
336
+
337
+ public static boolean isMirrored (char c ) {
338
+ return isMirrored (String .valueOf (c ));
339
+ }
340
+
341
+ public static boolean isMirrored (int codePoint ) {
342
+ return isValidCodePoint (codePoint )
343
+ && isMirrored (String .NativeString .fromCodePoint (codePoint ));
344
+ }
345
+
346
+ private static boolean isMirrored (String str ) {
347
+ if (mirroredRegex == null ) {
348
+ mirroredRegex = new NativeRegExp ("\\ p{Bidi_Mirrored}" , "u" );
349
+ }
350
+ return mirroredRegex .test (str );
351
+ }
352
+
353
+ public static boolean isISOControl (char ch ) {
354
+ return ch <= '\u001F' || (ch >= '\u007F' && ch <= '\u009F' );
355
+ }
356
+
357
+ public static boolean isISOControl (int codePoint ) {
358
+ return codePoint <= '\u001F' || (codePoint >= '\u007F' && codePoint <= '\u009F' );
359
+ }
360
+
288
361
/**
289
362
* Deprecated - see isWhitespace(char).
290
363
*/
@@ -306,12 +379,35 @@ public static boolean isSpace(char c) {
306
379
}
307
380
}
308
381
382
+ private static NativeRegExp spaceRegex ;
383
+
384
+ public static boolean isSpaceChar (char c ) {
385
+ return isSpaceChar (String .valueOf (c ));
386
+ }
387
+
388
+ public static boolean isSpaceChar (int codePoint ) {
389
+ return isValidCodePoint (codePoint )
390
+ && isSpaceChar (String .NativeString .fromCodePoint (codePoint ));
391
+ }
392
+
393
+ private static boolean isSpaceChar (String str ) {
394
+ if (spaceRegex == null ) {
395
+ spaceRegex = new NativeRegExp ("\\ p{Z}" , "u" );
396
+ }
397
+ return spaceRegex .test (str );
398
+ }
399
+
400
+ public static boolean isSurrogate (char ch ) {
401
+ return ch >= MIN_SURROGATE && ch <= MAX_SURROGATE ;
402
+ }
403
+
309
404
public static boolean isWhitespace (char ch ) {
310
405
return isWhitespace (String .valueOf (ch ));
311
406
}
312
407
313
408
public static boolean isWhitespace (int codePoint ) {
314
- return isWhitespace (String .fromCodePoint (codePoint ));
409
+ return isValidCodePoint (codePoint )
410
+ && isWhitespace (String .NativeString .fromCodePoint (codePoint ));
315
411
}
316
412
317
413
private static NativeRegExp whitespaceRegex ;
@@ -339,14 +435,31 @@ public static boolean isSurrogatePair(char highSurrogate, char lowSurrogate) {
339
435
340
436
public static boolean isTitleCase (char c ) {
341
437
// https://www.compart.com/en/unicode/category/Lt
342
- return c != toUpperCase (c ) && c != toLowerCase (c );
438
+ // here we should use the semantic of String.toUpperCase
439
+ return c != String .valueOf (c ).toUpperCase ().charAt (0 ) && c != toLowerCase (c );
343
440
}
344
441
345
- /*
346
- * TODO: correct Unicode handling.
347
- */
442
+ public static boolean isTitleCase (int codePoint ) {
443
+ // as of Unicode 16 there are no title-case chars beyond 0xffff
444
+ return codePoint > 0 && codePoint < 0xffff && isTitleCase ((char ) codePoint );
445
+ }
446
+
447
+ private static NativeRegExp upperCaseRegex ;
448
+
348
449
public static boolean isUpperCase (char c ) {
349
- return toUpperCase (c ) == c && isLetter (c );
450
+ return isUpperCase (String .valueOf (c ));
451
+ }
452
+
453
+ public static boolean isUpperCase (int codePoint ) {
454
+ return isValidCodePoint (codePoint )
455
+ && isUpperCase (String .NativeString .fromCodePoint (codePoint ));
456
+ }
457
+
458
+ private static boolean isUpperCase (String c ) {
459
+ if (upperCaseRegex == null ) {
460
+ upperCaseRegex = new NativeRegExp ("\\ p{Uppercase}" , "u" );
461
+ }
462
+ return upperCaseRegex .test (c );
350
463
}
351
464
352
465
public static boolean isValidCodePoint (int codePoint ) {
@@ -390,8 +503,8 @@ public static char[] toChars(int codePoint) {
390
503
391
504
if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT ) {
392
505
return new char [] {
393
- getHighSurrogate (codePoint ),
394
- getLowSurrogate (codePoint ),
506
+ highSurrogate (codePoint ),
507
+ lowSurrogate (codePoint ),
395
508
};
396
509
} else {
397
510
return new char [] {
@@ -404,8 +517,8 @@ public static int toChars(int codePoint, char[] dst, int dstIndex) {
404
517
checkCriticalArgument (codePoint >= 0 && codePoint <= MAX_CODE_POINT );
405
518
406
519
if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT ) {
407
- dst [dstIndex ++] = getHighSurrogate (codePoint );
408
- dst [dstIndex ] = getLowSurrogate (codePoint );
520
+ dst [dstIndex ++] = highSurrogate (codePoint );
521
+ dst [dstIndex ] = lowSurrogate (codePoint );
409
522
return 2 ;
410
523
} else {
411
524
dst [dstIndex ] = (char ) codePoint ;
@@ -426,14 +539,36 @@ public static char toLowerCase(char c) {
426
539
return CaseMapper .charToLowerCase (c );
427
540
}
428
541
542
+ public static int toLowerCase (int codePoint ) {
543
+ if (codePoint > MAX_CODE_POINT ) {
544
+ return codePoint ;
545
+ }
546
+ return CaseMapper .intToLowerCase (codePoint );
547
+ }
548
+
429
549
public static String toString (char x ) {
430
550
return String .valueOf (x );
431
551
}
432
552
553
+ public static String toString (int codePoint ) {
554
+ if (isValidCodePoint (codePoint )) {
555
+ return String .NativeString .fromCodePoint (codePoint );
556
+ } else {
557
+ throw new IllegalArgumentException ("Invalid code point: " + codePoint );
558
+ }
559
+ }
560
+
433
561
public static char toUpperCase (char c ) {
434
562
return CaseMapper .charToUpperCase (c );
435
563
}
436
564
565
+ public static int toUpperCase (int codePoint ) {
566
+ if (!isValidCodePoint (codePoint )) {
567
+ return codePoint ;
568
+ }
569
+ return CaseMapper .intToUpperCase (codePoint );
570
+ }
571
+
437
572
public static Character valueOf (char c ) {
438
573
if (c < 128 ) {
439
574
return BoxedValues .get (c );
@@ -473,26 +608,26 @@ static char forDigit(int digit) {
473
608
474
609
/**
475
610
* Computes the high surrogate character of the UTF16 representation of a
476
- * non-BMP code point. See {@link getLowSurrogate }.
611
+ * non-BMP code point. See {@link #lowSurrogate }.
477
612
*
478
613
* @param codePoint requested codePoint, required to be >=
479
614
* MIN_SUPPLEMENTARY_CODE_POINT
480
615
* @return high surrogate character
481
616
*/
482
- static char getHighSurrogate (int codePoint ) {
617
+ public static char highSurrogate (int codePoint ) {
483
618
return (char ) (MIN_HIGH_SURROGATE
484
619
+ (((codePoint - MIN_SUPPLEMENTARY_CODE_POINT ) >> 10 ) & 1023 ));
485
620
}
486
621
487
622
/**
488
623
* Computes the low surrogate character of the UTF16 representation of a
489
- * non-BMP code point. See {@link getHighSurrogate }.
624
+ * non-BMP code point. See {@link #highSurrogate }.
490
625
*
491
626
* @param codePoint requested codePoint, required to be >=
492
627
* MIN_SUPPLEMENTARY_CODE_POINT
493
628
* @return low surrogate character
494
629
*/
495
- static char getLowSurrogate (int codePoint ) {
630
+ public static char lowSurrogate (int codePoint ) {
496
631
return (char ) (MIN_LOW_SURROGATE + ((codePoint - MIN_SUPPLEMENTARY_CODE_POINT ) & 1023 ));
497
632
}
498
633
0 commit comments