GString: add try_from_bytes(), try_from_cstr(), Encoding

Bromeon · Bromeon · commit 2e3a1204a0a3 · 2025-03-02T12:26:32.000+01:00
diff --git a/godot-core/src/builtin/mod.rs b/godot-core/src/builtin/mod.rs
@@ -168,7 +168,7 @@ pub mod __prelude_reexport {
     pub use rect2i::*;
     pub use rid::*;
     pub use signal::*;
-    pub use string::{GString, NodePath, StringName};
+    pub use string::{Encoding, GString, NodePath, StringName};
     pub use transform2d::*;
     pub use transform3d::*;
     pub use variant::*;
diff --git a/godot-core/src/builtin/string/gstring.rs b/godot-core/src/builtin/string/gstring.rs
@@ -6,14 +6,14 @@
  */
 
 use std::convert::Infallible;
-use std::ffi::c_char;
 use std::fmt;
 use std::fmt::Write;
 
 use godot_ffi as sys;
 use sys::types::OpaqueString;
 use sys::{ffi_methods, interface_fn, GodotFfi};
 
+use crate::builtin::string::Encoding;
 use crate::builtin::{inner, NodePath, StringName, Variant};
 use crate::meta::AsArg;
 use crate::{impl_shared_string_api, meta};
@@ -77,6 +77,61 @@ impl GString {
         Self::default()
     }
 
+    /// Convert string from bytes with given encoding, returning `None` on validation errors.
+    ///
+    /// Intermediate `NUL` characters are not accepted in Godot and always return `None`.
+    ///
+    /// Some notes on the encodings:
+    /// - **Latin-1:** Since every byte is a valid Latin-1 character, no validation besides the `NUL` byte is performed.
+    ///   It is your responsibility to ensure that the input is valid Latin-1.
+    /// - **ASCII**: Subset of Latin-1, which is additionally validated to be valid, non-`NUL` ASCII characters.
+    /// - **UTF-8**: The input is validated to be UTF-8. Note that `NUL` (U+0000) is well-formed UTF-8, but is rejected
+    ///   nonetheless, like for the other encodings.
+    ///
+    /// Specifying incorrect encoding is safe, but may result in unintended string values.
+    pub fn try_from_bytes(bytes: &[u8], encoding: Encoding) -> Option<Self> {
+        // Intermediate NUL bytes are not accepted in Godot. None of the encoding-specific validations below detects them, because
+        // a NUL byte is valid ASCII, valid Latin-1 *and* valid UTF-8 (U+0000). So reject it once, up front, for all encodings.
+        if bytes.contains(&0) {
+            return None;
+        }
+
+        match encoding {
+            // Any byte outside the 7-bit range disqualifies the input as ASCII.
+            Encoding::Ascii if !bytes.is_ascii() => None,
+
+            // ASCII is a subset of Latin-1, so validated ASCII input shares the Latin-1 constructor.
+            Encoding::Ascii | Encoding::Latin1 => {
+                // SAFETY: pointer and length both describe the `bytes` slice, which stays borrowed for the whole constructor call.
+                // A slice length never exceeds `isize::MAX`, so the cast to `i64` is lossless.
+                let s = unsafe {
+                    Self::new_with_string_uninit(|string_ptr| {
+                        let ctor = interface_fn!(string_new_with_latin1_chars_and_len);
+                        ctor(
+                            string_ptr,
+                            bytes.as_ptr() as *const std::ffi::c_char,
+                            bytes.len() as i64,
+                        );
+                    })
+                };
+                Some(s)
+            }
+
+            Encoding::Utf8 => {
+                // from_utf8() only validates well-formedness; it does *not* reject NUL bytes (hence the check at the top).
+                let utf8 = std::str::from_utf8(bytes);
+                utf8.ok().map(GString::from)
+            }
+        }
+    }
+
+    /// Convert a C string with the given encoding into a `GString`, returning `None` on validation errors.
+    ///
+    /// Shorthand for [`try_from_bytes()`](Self::try_from_bytes) applied to the C string's bytes, excluding the `NUL` terminator.
+    pub fn try_from_cstr(cstr: &std::ffi::CStr, encoding: Encoding) -> Option<Self> {
+        GString::try_from_bytes(cstr.to_bytes(), encoding)
+    }
+
     /// Number of characters in the string.
     ///
     /// _Godot equivalent: `length`_
@@ -260,7 +315,7 @@ impl From<&str> for GString {
                 let ctor = interface_fn!(string_new_with_utf8_chars_and_len);
                 ctor(
                     string_ptr,
-                    bytes.as_ptr() as *const c_char,
+                    bytes.as_ptr() as *const std::ffi::c_char,
                     bytes.len() as i64,
                 );
             })
@@ -307,7 +362,7 @@ impl From<&GString> for String {
 
             interface_fn!(string_to_utf8_chars)(
                 string.string_sys(),
-                buf.as_mut_ptr() as *mut c_char,
+                buf.as_mut_ptr() as *mut std::ffi::c_char,
                 len,
             );
 
diff --git a/godot-core/src/builtin/string/mod.rs b/godot-core/src/builtin/string/mod.rs
@@ -54,6 +54,23 @@ impl FromGodot for String {
     }
 }
 
+// ----------------------------------------------------------------------------------------------------------------------------------------------
+// Encoding
+
+/// Specifies string encoding.
+///
+/// Used in functions such as [`GString::try_from_bytes()`][GString::try_from_bytes] to handle multiple input string encodings.
+#[derive(Copy, Clone, Eq, PartialEq, Debug)]
+#[non_exhaustive]
+pub enum Encoding {
+    /// 7-bit ASCII; a subset of both Latin-1 and UTF-8.
+    Ascii,
+    /// Latin-1 (ISO-8859-1): one byte per character, and every byte value denotes a character.
+    Latin1,
+    /// UTF-8.
+    Utf8,
+}
+
 // ----------------------------------------------------------------------------------------------------------------------------------------------
 
 /// Returns a tuple of `(from, len)` from a Rust range.
diff --git a/itest/rust/src/builtin_tests/string/gstring_test.rs b/itest/rust/src/builtin_tests/string/gstring_test.rs
@@ -8,7 +8,7 @@
 use std::collections::HashSet;
 
 use crate::framework::{expect_debug_panic_or_release_ok, itest};
-use godot::builtin::{GString, PackedStringArray};
+use godot::builtin::{Encoding, GString, PackedStringArray};
 
 // TODO use tests from godot-rust/gdnative
 
@@ -267,6 +267,131 @@ fn string_pad() {
 }
 
 // ----------------------------------------------------------------------------------------------------------------------------------------------
+// Byte and C-string conversions
+
+#[itest]
+fn string_from_bytes_ascii() {
+    let from_ascii = |bytes: &[u8]| GString::try_from_bytes(bytes, Encoding::Ascii);
+
+    let ascii = from_ascii(b"Hello").expect("valid ASCII");
+    assert_eq!(ascii, GString::from("Hello"));
+    assert_eq!(ascii.len(), 5);
+
+    // A byte slice has no terminator: the trailing \0 is string content, and thus a (rejected) intermediate NUL byte.
+    assert_eq!(from_ascii(b"Hello\0"), None, "intermediate NUL byte is not valid ASCII");
+    assert_eq!(from_ascii(b"/\xF0\xF5\xBE"), None, "Latin-1 is *not* valid ASCII");
+
+    // Genuine UTF-8 input: the encoded bytes of "ö🍎A💡", the same text that the Latin-1 and UTF-8 tests use.
+    assert_eq!(from_ascii("ö🍎A💡".as_bytes()), None, "UTF-8 is *not* valid ASCII");
+}
+
+#[itest]
+fn string_from_cstr_ascii() {
+    let from_ascii = |cstr: &std::ffi::CStr| GString::try_from_cstr(cstr, Encoding::Ascii);
+
+    let hello = from_ascii(c"Hello").expect("valid ASCII");
+    assert_eq!(hello, GString::from("Hello"));
+    assert_eq!(hello.len(), 5);
+
+    // C-string literals store non-ASCII characters UTF-8 encoded, so both inputs below contain multi-byte sequences.
+    // Either way, any byte above 0x7F is enough to fail ASCII validation.
+    assert_eq!(from_ascii(c"/ðõ¾"), None, "Latin-1 is *not* valid ASCII");
+    assert_eq!(from_ascii(c"ö🍎A💡"), None, "UTF-8 is *not* valid ASCII");
+}
+
+#[itest]
+fn string_from_bytes_latin1() {
+    let from_latin1 = |bytes: &[u8]| GString::try_from_bytes(bytes, Encoding::Latin1);
+
+    let ascii = from_latin1(b"Hello").expect("ASCII is valid Latin-1");
+    assert_eq!(ascii, GString::from("Hello"));
+    assert_eq!(ascii.len(), 5);
+
+    let latin1 = from_latin1(b"/\xF0\xF5\xBE").expect("Latin-1 is valid Latin-1");
+    assert_eq!(latin1, GString::from("/ðõ¾"));
+    assert_eq!(latin1.len(), 4);
+
+    // The NUL byte is the only byte value that Latin-1 conversion rejects.
+    let latin1_nul = from_latin1(b"/\0\xF0\xF5\xBE");
+    assert_eq!(latin1_nul, None, "intermediate NUL byte is not valid Latin-1");
+
+    // UTF-8 -> Latin-1: always succeeds, even if result is garbage, since every byte is a valid Latin-1 character.
+    let utf8 = from_latin1(b"\xC3\xB6\xF0\x9F\x8D\x8E\x41\xF0\x9F\x92\xA1");
+    let utf8 = utf8.expect("UTF-8 is valid Latin-1, even if garbage");
+
+    // Every input byte turns into the code point of the same value. The expectation is spelled with escapes, since five of
+    // the characters (U+009F, U+008D, U+008E, U+009F, U+0092) are invisible C1 controls, easily lost in editors or diffs.
+    // Only the printable characters show up when the string is rendered.
+    let garbage = "\u{C3}\u{B6}\u{F0}\u{9F}\u{8D}\u{8E}A\u{F0}\u{9F}\u{92}\u{A1}";
+    assert_eq!(utf8, GString::from(garbage));
+}
+
+#[itest]
+fn string_from_cstr_latin1() {
+    let from_latin1 = |cstr: &std::ffi::CStr| GString::try_from_cstr(cstr, Encoding::Latin1);
+    let ascii = from_latin1(c"Hello").expect("ASCII is valid Latin-1");
+    assert_eq!(ascii, GString::from("Hello"));
+    assert_eq!(ascii.len(), 5);
+
+    // A C-string literal stores non-ASCII characters UTF-8 encoded, not Latin-1 encoded (see assertion below). Such bytes still
+    // decode as Latin-1, as the last check in this test shows. Genuine Latin-1 input is therefore written as explicit byte escapes.
+    assert_eq!(c"/ðõ¾".to_bytes(), b"/\xC3\xB0\xC3\xB5\xC2\xBE");
+    let latin1 = from_latin1(c"/\xF0\xF5\xBE").expect("Latin-1 is valid Latin-1");
+    assert_eq!(latin1, GString::from("/ðõ¾"));
+    assert_eq!(latin1.len(), 4);
+
+    // UTF-8 -> Latin-1: always succeeds, even if result is garbage, since every byte is a valid Latin-1 character.
+    // The expectation is spelled with escapes, since it contains invisible C1 control characters (U+008D, U+008E, U+0092, U+009F).
+    let utf8 = from_latin1(c"ö🍎A💡").expect("UTF-8 is valid Latin-1, even if garbage");
+    let garbage = "\u{C3}\u{B6}\u{F0}\u{9F}\u{8D}\u{8E}A\u{F0}\u{9F}\u{92}\u{A1}";
+    assert_eq!(utf8, GString::from(garbage));
+}
+
+#[itest]
+fn string_from_bytes_utf8() {
+    let from_utf8 = |bytes: &[u8]| GString::try_from_bytes(bytes, Encoding::Utf8);
+
+    let ascii = from_utf8(b"Hello").expect("ASCII is valid UTF-8");
+    assert_eq!(ascii, GString::from("Hello"));
+    assert_eq!(ascii.len(), 5);
+
+    // 0xF0 opens a 4-byte sequence, but no continuation byte follows: malformed UTF-8.
+    assert_eq!(from_utf8(b"/\xF0\xF5\xBE"), None, "Latin-1 is *not* valid UTF-8");
+
+    let utf8 = from_utf8(b"\xC3\xB6\xF0\x9F\x8D\x8E\x41\xF0\x9F\x92\xA1");
+    let utf8 = utf8.expect("UTF-8 is valid UTF-8");
+    assert_eq!(utf8, GString::from("ö🍎A💡"));
+    assert_eq!(utf8.len(), 4);
+
+    // NOTE(review): `\xC3` followed by `\0` is a truncated 2-byte sequence, i.e. malformed UTF-8 regardless of the NUL byte.
+    // A well-formed input such as b"A\0B" would be needed to exercise NUL rejection in isolation.
+    let utf8_nul = from_utf8(b"\xC3\0A");
+    assert_eq!(utf8_nul, None, "intermediate NUL byte is not valid UTF-8");
+}
+
+#[itest]
+fn string_from_cstr_utf8() {
+    let from_utf8 = |cstr: &std::ffi::CStr| GString::try_from_cstr(cstr, Encoding::Utf8);
+
+    let ascii = from_utf8(c"Hello").expect("ASCII is valid UTF-8");
+    assert_eq!(ascii, GString::from("Hello"));
+    assert_eq!(ascii.len(), 5);
+
+    // Unlike the byte string b"/\xF0\xF5\xBE" (rejected as UTF-8 in the bytes-based test), the C-string literal below succeeds:
+    // its characters are stored UTF-8 encoded, *not* Latin-1 encoded, as the following assertion demonstrates.
+    assert_eq!(c"/ðõ¾".to_bytes(), b"/\xC3\xB0\xC3\xB5\xC2\xBE");
+    let latin1 = from_utf8(c"/ðõ¾");
+    let latin1 = latin1.expect("Characters from Latin-1 set re-encoded as UTF-8 are valid UTF-8");
+    assert_eq!(latin1, GString::from("/ðõ¾"));
+    assert_eq!(latin1.len(), 4);
+
+    let utf8 = from_utf8(c"ö🍎A💡").expect("valid UTF-8");
+    assert_eq!(utf8, GString::from("ö🍎A💡"));
+    assert_eq!(utf8.len(), 4);
+}
+
+// ----------------------------------------------------------------------------------------------------------------------------------------------
+// Helpers
 
 fn packed(strings: &[&str]) -> PackedStringArray {
     strings.iter().map(|&s| GString::from(s)).collect()