Skip to content

Commit 72d85db

Browse files
committed
expose char::encode_utf8_raw for libstd
1 parent 52ed89a commit 72d85db

File tree

2 files changed

+63
-40
lines changed

2 files changed

+63
-40
lines changed

src/libcore/char/methods.rs

+59-40
Original file line numberDiff line numberDiff line change
@@ -593,16 +593,7 @@ impl char {
593593
#[stable(feature = "rust1", since = "1.0.0")]
594594
#[inline]
595595
pub fn len_utf8(self) -> usize {
596-
let code = self as u32;
597-
if code < MAX_ONE_B {
598-
1
599-
} else if code < MAX_TWO_B {
600-
2
601-
} else if code < MAX_THREE_B {
602-
3
603-
} else {
604-
4
605-
}
596+
len_utf8(self as u32)
606597
}
607598

608599
/// Returns the number of 16-bit code units this `char` would need if
@@ -670,36 +661,7 @@ impl char {
670661
#[stable(feature = "unicode_encode_char", since = "1.15.0")]
671662
#[inline]
672663
pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
673-
let code = self as u32;
674-
let len = self.len_utf8();
675-
match (len, &mut dst[..]) {
676-
(1, [a, ..]) => {
677-
*a = code as u8;
678-
}
679-
(2, [a, b, ..]) => {
680-
*a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
681-
*b = (code & 0x3F) as u8 | TAG_CONT;
682-
}
683-
(3, [a, b, c, ..]) => {
684-
*a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
685-
*b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
686-
*c = (code & 0x3F) as u8 | TAG_CONT;
687-
}
688-
(4, [a, b, c, d, ..]) => {
689-
*a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
690-
*b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
691-
*c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
692-
*d = (code & 0x3F) as u8 | TAG_CONT;
693-
}
694-
_ => panic!(
695-
"encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
696-
len,
697-
code,
698-
dst.len(),
699-
),
700-
};
701-
// SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
702-
unsafe { from_utf8_unchecked_mut(&mut dst[..len]) }
664+
encode_utf8_raw(self as u32, dst)
703665
}
704666

705667
/// Encodes this character as UTF-16 into the provided `u16` buffer,
@@ -1673,3 +1635,60 @@ impl char {
16731635
}
16741636
}
16751637
}
1638+
1639+
#[inline]
1640+
fn len_utf8(code: u32) -> usize {
1641+
if code < MAX_ONE_B {
1642+
1
1643+
} else if code < MAX_TWO_B {
1644+
2
1645+
} else if code < MAX_THREE_B {
1646+
3
1647+
} else {
1648+
4
1649+
}
1650+
}
1651+
1652+
/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
1653+
/// and then returns the subslice of the buffer that contains the encoded character.
1654+
///
1655+
/// Unlike `char::encode_utf8`, this method can be called on codepoints in the surrogate range.
1656+
///
1657+
/// # Panics
1658+
///
1659+
/// Panics if the buffer is not large enough.
1660+
/// A buffer of length four is large enough to encode any `char`.
1661+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
1662+
#[doc(hidden)]
1663+
#[inline]
1664+
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str {
1665+
let len = len_utf8(code);
1666+
match (len, &mut dst[..]) {
1667+
(1, [a, ..]) => {
1668+
*a = code as u8;
1669+
}
1670+
(2, [a, b, ..]) => {
1671+
*a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
1672+
*b = (code & 0x3F) as u8 | TAG_CONT;
1673+
}
1674+
(3, [a, b, c, ..]) => {
1675+
*a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
1676+
*b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
1677+
*c = (code & 0x3F) as u8 | TAG_CONT;
1678+
}
1679+
(4, [a, b, c, d, ..]) => {
1680+
*a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
1681+
*b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
1682+
*c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
1683+
*d = (code & 0x3F) as u8 | TAG_CONT;
1684+
}
1685+
_ => panic!(
1686+
"encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
1687+
len,
1688+
code,
1689+
dst.len(),
1690+
),
1691+
};
1692+
// SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
1693+
unsafe { from_utf8_unchecked_mut(&mut dst[..len]) }
1694+
}

src/libcore/char/mod.rs

+4
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error};
3737
#[stable(feature = "unicode_version", since = "1.45.0")]
3838
pub use crate::unicode::UNICODE_VERSION;
3939

40+
// perma-unstable re-exports
41+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
42+
pub use self::methods::encode_utf8_raw;
43+
4044
use crate::fmt::{self, Write};
4145
use crate::iter::FusedIterator;
4246

0 commit comments

Comments
 (0)