diff --git a/compiler/test/stdlib/char.test.gr b/compiler/test/stdlib/char.test.gr
index cef8d1af6..f19d07ac8 100644
--- a/compiler/test/stdlib/char.test.gr
+++ b/compiler/test/stdlib/char.test.gr
@@ -40,6 +40,20 @@ assert Char.code(Char.pred(Char.fromCode(0xE000))) == 0xD7FF
 assert Char.toString('f') == "f"
 assert Char.toString('💯') == "💯"
 
+// encodedLength: expected result per encoding for 'a', '©', '☃' and '🌾'
+assert Char.encodedLength(Char.UTF8, 'a') == 1
+assert Char.encodedLength(Char.UTF8, '©') == 2
+assert Char.encodedLength(Char.UTF8, '☃') == 3
+assert Char.encodedLength(Char.UTF8, '🌾') == 4
+assert Char.encodedLength(Char.UTF16, 'a') == 1
+assert Char.encodedLength(Char.UTF16, '©') == 1
+assert Char.encodedLength(Char.UTF16, '☃') == 1
+assert Char.encodedLength(Char.UTF16, '🌾') == 2
+assert Char.encodedLength(Char.UTF32, 'a') == 4
+assert Char.encodedLength(Char.UTF32, '©') == 4
+assert Char.encodedLength(Char.UTF32, '☃') == 4
+assert Char.encodedLength(Char.UTF32, '🌾') == 4
+
 // issue #927
 let chars = [> '\u{1F3F4}', '\u{E0067}']
 let mut charPosition = 0
diff --git a/stdlib/char.gr b/stdlib/char.gr
index 10ef6361b..d7dd2dee7 100644
--- a/stdlib/char.gr
+++ b/stdlib/char.gr
@@ -172,6 +172,42 @@ provide let toString = (char: Char) => {
   WasmI32.toGrain(string): String
 }
 
+/**
+ * Byte encodings
+ *
+ * @since v0.7.0
+ */
+provide enum Encoding {
+  UTF8,
+  UTF16,
+  UTF32,
+}
+
+/**
+ * Returns the byte count of a character if encoded in the given encoding.
+ *
+ * @param encoding: The encoding to check
+ * @param char: The character
+ * @returns The byte count of the character in the given encoding
+ *
+ * @example Char.encodedLength(Char.UTF8, 'a') == 1
+ * @example Char.encodedLength(Char.UTF8, '🌾') == 4
+ * @example Char.encodedLength(Char.UTF16, '©') == 1
+ *
+ * @since v0.7.0
+ */
+@unsafe
+provide let encodedLength = (encoding, char: Char) => {
+  let scalar = untagChar(char) // untagged form consumed by usvEncodeLength
+  let rawLength = usvEncodeLength(scalar) // still untagged at this point
+  let utf8Length = tagSimpleNumber(rawLength) // tagged so it can be returned
+  match (encoding) { // NOTE(review): UTF16 arm yields 1 or 2, not bytes - confirm unit
+    UTF8 => utf8Length,
+    UTF16 => if (utf8Length == 4) 2 else 1, // 2 only when UTF-8 needs 4 bytes
+    UTF32 => 4, // same result for every character
+  }
+}
+
 /**
  * Checks if the first character is less than the second character by Unicode scalar value.
  *
diff --git a/stdlib/char.md b/stdlib/char.md
index f5a2def06..f0dd17111 100644
--- a/stdlib/char.md
+++ b/stdlib/char.md
@@ -27,6 +27,27 @@ from "char" include Char
 '🌾'
 ```
 
+## Types
+
+Type declarations included in the Char module.
+
+### Char.**Encoding**
+
+<details disabled>
+<summary tabindex="-1">Added in <code>next</code></summary>
+No other changes yet.
+</details>
+
+```grain
+enum Encoding {
+  UTF8,
+  UTF16,
+  UTF32,
+}
+```
+
+Byte encodings
+
 ## Values
 
 Functions and constants included in the Char module.
@@ -285,6 +306,46 @@ Char.toString('a') == "a"
 Char.toString('🌾') == "🌾"
 ```
 
+### Char.**encodedLength**
+
+<details disabled>
+<summary tabindex="-1">Added in <code>next</code></summary>
+No other changes yet.
+</details>
+
+```grain
+encodedLength : (encoding: Encoding, char: Char) => Number
+```
+
+Returns the byte count of a character if encoded in the given encoding.
+
+Parameters:
+
+|param|type|description|
+|-----|----|-----------|
+|`encoding`|`Encoding`|The encoding to check|
+|`char`|`Char`|The character|
+
+Returns:
+
+|type|description|
+|----|-----------|
+|`Number`|The byte count of the character in the given encoding|
+
+Examples:
+
+```grain
+Char.encodedLength(Char.UTF8, 'a') == 1
+```
+
+```grain
+Char.encodedLength(Char.UTF8, '🌾') == 4
+```
+
+```grain
+Char.encodedLength(Char.UTF16, '©') == 1
+```
+
 ### Char.**(<)**