diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 23d060d2158c..d6d1295f5039 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -350,6 +350,46 @@ use crate::vec::Vec; /// /// Here, there's no need to allocate more memory inside the loop. /// +/// # Invariant +/// +/// Rust libraries may assume that `String`s are always valid UTF-8, [just like `str`s](prim@str#invariant). +/// +/// Constructing a non-UTF-8 `String` is not immediate undefined behavior, but any function +/// called on a `String` may assume that it is valid UTF-8, which means that a non-UTF-8 `String` +/// can lead to undefined behavior down the road. +/// +/// As an exception, some functions explicitly allow invalid UTF-8, and will not immediately cause undefined +/// behavior if called on a `String` containing invalid UTF-8. In general, all of `String`'s associated +/// functions other than those listed here should be assumed to require their input be valid UTF-8. +/// Note that calling one of these functions on a `String` containing invalid UTF-8, may result in the return value +/// also containing invalid UTF-8, if relevant. +/// +/// * `String::as_bytes` +/// * `String::as_bytes_mut` +/// * `String::as_str` +/// * `String::as_mut_str` +/// * `String::as_ptr` +/// * `String::as_mut_ptr` +/// * `String::as_mut_vec` +/// * `String::capacity` +/// * `String::len` +/// * `String::clear` +/// * `::drop` (i.e. dropping a `String` that contains invalid UTF-8 does not alone cause UB) +/// * `String::leak` +/// * `String::into_boxed_str` +/// * `String::reserve` +/// * `String::reserve_exact` +/// * `String::try_reserve` +/// * `String::try_reserve_exact` +/// * `::deref` +/// * `::deref_mut` +/// * `>::as_ref` +/// * `>::as_mut` +/// * `>::borrow` +/// * `>::borrow_mut` +/// * `::clone` +/// * `::clone_from` +/// /// [str]: prim@str "str" /// [`str`]: prim@str "str" /// [`&str`]: prim@str "&str" @@ -1023,7 +1063,7 @@ impl String { /// This function is unsafe because it does not check that the bytes passed /// to it are valid UTF-8. If this constraint is violated, it may cause /// memory unsafety issues with future users of the `String`, as the rest of - /// the standard library assumes that `String`s are valid UTF-8. + /// the standard library [assumes that `String`s are valid UTF-8](String#invariant). /// /// # Examples /// @@ -1775,8 +1815,8 @@ impl String { /// This function is unsafe because the returned `&mut Vec` allows writing /// bytes which are not valid UTF-8. If this constraint is violated, using /// the original `String` after dropping the `&mut Vec` may violate memory - /// safety, as the rest of the standard library assumes that `String`s are - /// valid UTF-8. + /// safety, as the rest of the standard library [assumes that `String`s are + /// valid UTF-8](String#invariant). /// /// # Examples /// diff --git a/library/core/src/primitive_docs.rs b/library/core/src/primitive_docs.rs index e105ceadff75..5df07a2f1d14 100644 --- a/library/core/src/primitive_docs.rs +++ b/library/core/src/primitive_docs.rs @@ -1015,6 +1015,29 @@ mod prim_slice {} /// Constructing a non-UTF-8 string slice is not immediate undefined behavior, but any function /// called on a string slice may assume that it is valid UTF-8, which means that a non-UTF-8 string /// slice can lead to undefined behavior down the road. +/// +/// As an exception, some functions explicitly allow invalid UTF-8, and will not immediately cause undefined +/// behavior if called on a `str` containing invalid UTF-8. In general, all of `str`'s associated +/// functions other than those listed here should be assumed to require their input be valid UTF-8. +/// Note that calling one of these functions on a `str` containing invalid UTF-8, may result in the return value +/// also containing invalid UTF-8, if relevant. +/// +/// * `str::as_bytes` +/// * `str::as_bytes_mut` +/// * `str::as_str` +/// * `str::as_ptr` +/// * `str::as_mut_ptr` +/// * `str::bytes` +/// * `str::into_string` +/// * `str::into_boxed_bytes` +/// * `str::len` +/// * `>::as_ref` +/// * `>::as_ref` +/// * `>::as_mut` +/// * `>::borrow` +/// * `>::borrow_mut` +/// * ` as Clone>::clone` +/// * ` as Clone>::clone_from` #[stable(feature = "rust1", since = "1.0.0")] mod prim_str {} diff --git a/library/core/src/str/converts.rs b/library/core/src/str/converts.rs index c7bae42765f4..f3d359133388 100644 --- a/library/core/src/str/converts.rs +++ b/library/core/src/str/converts.rs @@ -2,7 +2,7 @@ use super::Utf8Error; use super::validations::run_utf8_validation; -use crate::{mem, ptr}; +use crate::ptr; /// Converts a slice of bytes to a string slice. /// @@ -145,7 +145,10 @@ pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> { /// /// # Safety /// -/// The bytes passed in must be valid UTF-8. +/// This function is unsafe because it does not check that the bytes passed +/// to it are valid UTF-8. If this constraint is violated, it may cause +/// memory unsafety issues with future users of the `str`, as the rest of +/// the standard library [assumes that `str`s are valid UTF-8](prim@str#invariant). /// /// # Examples /// @@ -169,9 +172,11 @@ pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> { #[rustc_const_stable(feature = "const_str_from_utf8_unchecked", since = "1.55.0")] #[rustc_diagnostic_item = "str_from_utf8_unchecked"] pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str { - // SAFETY: the caller must guarantee that the bytes `v` are valid UTF-8. - // Also relies on `&str` and `&[u8]` having the same layout. - unsafe { mem::transmute(v) } + // SAFETY: the pointer dereference is safe because that pointer + // comes from a reference which is guaranteed to be valid for reads. + // If the input bytes are not valid UTF-8, then the returned `&str` will + // have invalid UTF-8, which is unsafe but not immediate UB. + unsafe { &*(v as *const [u8] as *const str) } } /// Converts a slice of bytes to a string slice without checking @@ -197,10 +202,10 @@ pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str { #[rustc_const_stable(feature = "const_str_from_utf8_unchecked_mut", since = "1.83.0")] #[rustc_diagnostic_item = "str_from_utf8_unchecked_mut"] pub const unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { - // SAFETY: the caller must guarantee that the bytes `v` - // are valid UTF-8, thus the cast to `*mut str` is safe. - // Also, the pointer dereference is safe because that pointer + // SAFETY: the pointer dereference is safe because that pointer // comes from a reference which is guaranteed to be valid for writes. + // If the input bytes are not valid UTF-8, then the returned `&mut str` will + // have invalid UTF-8, which is unsafe but not immediate UB. unsafe { &mut *(v as *mut [u8] as *mut str) } } diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 8a473b398bb5..f9af64d7c180 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -309,10 +309,11 @@ impl str { /// /// # Safety /// - /// The caller must ensure that the content of the slice is valid UTF-8 - /// before the borrow ends and the underlying `str` is used. - /// - /// Use of a `str` whose contents are not valid UTF-8 is undefined behavior. + /// This function is unsafe because the returned `&mut [u8]` allows writing + /// bytes which are not valid UTF-8. If this constraint is violated, using + /// the original `str` after the `&mut [u8]` borrow expires may violate memory + /// safety, as the rest of the standard library [assumes that `str`s are + /// valid UTF-8](prim@str#invariant). /// /// # Examples ///