speed up String::push and String::insert

lincot · lincot · commit 2e719f2e7524 · 2024-05-06T19:26:16.000+03:00
diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs
@@ -103,6 +103,7 @@
 #![feature(assert_matches)]
 #![feature(async_fn_traits)]
 #![feature(async_iterator)]
+#![feature(char_internals)]
 #![feature(coerce_unsized)]
 #![feature(const_align_of_val)]
 #![feature(const_box)]
diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs
@@ -1354,9 +1354,14 @@ impl String {
     #[inline]
     #[stable(feature = "rust1", since = "1.0.0")]
     pub fn push(&mut self, ch: char) {
-        match ch.len_utf8() {
-            1 => self.vec.push(ch as u8),
-            _ => self.vec.extend_from_slice(ch.encode_utf8(&mut [0; 4]).as_bytes()),
+        let len = self.len();
+        let ch_len = ch.len_utf8();
+        self.reserve(ch_len);
+
+        // SAFETY: just reserved capacity for at least the length needed to encode `ch`
+        unsafe {
+            core::char::encode_utf8_raw_unchecked(ch as u32, self.vec.spare_capacity_mut());
+            self.vec.set_len(len + ch_len);
         }
     }
 
@@ -1655,24 +1660,34 @@ impl String {
     #[rustc_confusables("set")]
     pub fn insert(&mut self, idx: usize, ch: char) {
         assert!(self.is_char_boundary(idx));
-        let mut bits = [0; 4];
-        let bits = ch.encode_utf8(&mut bits).as_bytes();
 
+        let len = self.len();
+        let ch_len = ch.len_utf8();
+        self.reserve(ch_len);
+
+        // SAFETY: the assert above guarantees `idx <= len`; the tail is shifted `ch_len` bytes
+        // to the right, into capacity that was just reserved for at least that many bytes
         unsafe {
-            self.insert_bytes(idx, bits);
+            ptr::copy(
+                self.vec.as_ptr().add(idx),
+                self.vec.as_mut_ptr().add(idx + ch_len),
+                len - idx,
+            );
         }
-    }
 
-    #[cfg(not(no_global_oom_handling))]
-    unsafe fn insert_bytes(&mut self, idx: usize, bytes: &[u8]) {
-        let len = self.len();
-        let amt = bytes.len();
-        self.vec.reserve(amt);
+        // SAFETY: `idx + ch_len <= len + ch_len <= capacity`, so `dst` is in bounds; it covers
+        // the gap left by the shift if `idx != len`, or uninitialized spare capacity otherwise
+        unsafe {
+            let dst = slice::from_raw_parts_mut(
+                self.vec.as_mut_ptr().add(idx) as *mut core::mem::MaybeUninit<u8>,
+                ch_len,
+            );
+            core::char::encode_utf8_raw_unchecked(ch as u32, dst);
+        }
 
+        // SAFETY: `ch_len` initialized bytes have been added
         unsafe {
-            ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx);
-            ptr::copy_nonoverlapping(bytes.as_ptr(), self.vec.as_mut_ptr().add(idx), amt);
-            self.vec.set_len(len + amt);
+            self.vec.set_len(len + ch_len);
         }
     }
 
@@ -1701,8 +1716,14 @@ impl String {
     pub fn insert_str(&mut self, idx: usize, string: &str) {
         assert!(self.is_char_boundary(idx));
 
+        let len = self.len();
+        let amt = string.len();
+        self.reserve(amt);
+
         unsafe {
-            self.insert_bytes(idx, string.as_bytes());
+            ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx);
+            ptr::copy_nonoverlapping(string.as_ptr(), self.vec.as_mut_ptr().add(idx), amt);
+            self.vec.set_len(len + amt);
         }
     }
 
diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs
@@ -1,5 +1,6 @@
 //! impl char {}
 
+use crate::mem::MaybeUninit;
 use crate::slice;
 use crate::str::from_utf8_unchecked_mut;
 use crate::unicode::printable::is_printable;
@@ -1768,33 +1769,66 @@ const fn len_utf8(code: u32) -> usize {
 #[inline]
 pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
     let len = len_utf8(code);
-    match (len, &mut dst[..]) {
-        (1, [a, ..]) => {
-            *a = code as u8;
-        }
-        (2, [a, b, ..]) => {
-            *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
-            *b = (code & 0x3F) as u8 | TAG_CONT;
-        }
-        (3, [a, b, c, ..]) => {
-            *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
-            *b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
-            *c = (code & 0x3F) as u8 | TAG_CONT;
-        }
-        (4, [a, b, c, d, ..]) => {
-            *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
-            *b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
-            *c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
-            *d = (code & 0x3F) as u8 | TAG_CONT;
-        }
-        _ => panic!(
+    if dst.len() < len {
+        panic!(
             "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
             len,
             code,
             dst.len(),
-        ),
-    };
-    &mut dst[..len]
+        );
+    }
+    // SAFETY: layout-compatible cast; the callee only ever writes initialized bytes to `dst`
+    let dst = unsafe { &mut *(dst as *mut [u8] as *mut [MaybeUninit<u8>]) };
+    // SAFETY: `dst` has been checked to be long enough to hold the encoded codepoint
+    unsafe { encode_utf8_raw_unchecked(code, dst) }
+}
+
+/// Encodes a raw u32 value as UTF-8 into the provided possibly uninitialized byte buffer,
+/// and then returns the subslice of the buffer that contains the encoded character.
+///
+/// Unlike `char::encode_utf8`, this function also handles codepoints in the surrogate range.
+/// (Creating a `char` in the surrogate range is UB.)
+/// The result is valid [generalized UTF-8] but not valid UTF-8.
+///
+/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
+///
+/// # Safety
+///
+/// The behavior is undefined if the buffer is not large enough to hold the encoded codepoint.
+/// A buffer of length four is large enough to encode any `char`.
+///
+/// For a safe version of this function, see the [`encode_utf8_raw`] function.
+#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
+#[doc(hidden)]
+#[inline]
+pub unsafe fn encode_utf8_raw_unchecked(code: u32, dst: &mut [MaybeUninit<u8>]) -> &mut [u8] {
+    let len = len_utf8(code);
+    // SAFETY: the caller must guarantee that `dst` is at least `len` bytes long
+    unsafe {
+        match len {
+            1 => {
+                dst.get_unchecked_mut(0).write(code as u8);
+            }
+            2 => {
+                dst.get_unchecked_mut(0).write((code >> 6 & 0x1F) as u8 | TAG_TWO_B);
+                dst.get_unchecked_mut(1).write((code & 0x3F) as u8 | TAG_CONT);
+            }
+            3 => {
+                dst.get_unchecked_mut(0).write((code >> 12 & 0x0F) as u8 | TAG_THREE_B);
+                dst.get_unchecked_mut(1).write((code >> 6 & 0x3F) as u8 | TAG_CONT);
+                dst.get_unchecked_mut(2).write((code & 0x3F) as u8 | TAG_CONT);
+            }
+            4 => {
+                dst.get_unchecked_mut(0).write((code >> 18 & 0x07) as u8 | TAG_FOUR_B);
+                dst.get_unchecked_mut(1).write((code >> 12 & 0x3F) as u8 | TAG_CONT);
+                dst.get_unchecked_mut(2).write((code >> 6 & 0x3F) as u8 | TAG_CONT);
+                dst.get_unchecked_mut(3).write((code & 0x3F) as u8 | TAG_CONT);
+            }
+            _ => unreachable!(),
+        }
+    }
+    // SAFETY: data has been written to the first `len` bytes
+    unsafe { &mut *(dst.get_unchecked_mut(..len) as *mut [MaybeUninit<u8>] as *mut [u8]) }
 }
 
 /// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs
@@ -36,7 +36,7 @@ pub use self::decode::{DecodeUtf16, DecodeUtf16Error};
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
 pub use self::methods::encode_utf16_raw;
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
-pub use self::methods::encode_utf8_raw;
+pub use self::methods::{encode_utf8_raw, encode_utf8_raw_unchecked};
 
 use crate::ascii;
 use crate::error::Error;