diff --git a/godot-core/src/builtin/mod.rs b/godot-core/src/builtin/mod.rs index a72c2672e..83c8b8b48 100644 --- a/godot-core/src/builtin/mod.rs +++ b/godot-core/src/builtin/mod.rs @@ -168,7 +168,7 @@ pub mod __prelude_reexport { pub use rect2i::*; pub use rid::*; pub use signal::*; - pub use string::{GString, NodePath, StringName}; + pub use string::{Encoding, GString, NodePath, StringName}; pub use transform2d::*; pub use transform3d::*; pub use variant::*; diff --git a/godot-core/src/builtin/string/gstring.rs b/godot-core/src/builtin/string/gstring.rs index 4bda7bec5..93f37b770 100644 --- a/godot-core/src/builtin/string/gstring.rs +++ b/godot-core/src/builtin/string/gstring.rs @@ -6,7 +6,6 @@ */ use std::convert::Infallible; -use std::ffi::c_char; use std::fmt; use std::fmt::Write; @@ -14,7 +13,9 @@ use godot_ffi as sys; use sys::types::OpaqueString; use sys::{ffi_methods, interface_fn, GodotFfi}; +use crate::builtin::string::Encoding; use crate::builtin::{inner, NodePath, StringName, Variant}; +use crate::meta::error::StringError; use crate::meta::AsArg; use crate::{impl_shared_string_api, meta}; @@ -77,6 +78,73 @@ impl GString { Self::default() } + /// Convert string from bytes with given encoding, returning `Err` on validation errors. + /// + /// Intermediate `NUL` characters are not accepted in Godot and always return `Err`. + /// + /// Some notes on the encodings: + /// - **Latin-1:** Since every byte is a valid Latin-1 character, no validation besides the `NUL` byte is performed. + /// It is your responsibility to ensure that the input is valid Latin-1. + /// - **ASCII**: Subset of Latin-1, which is additionally validated to be valid, non-`NUL` ASCII characters. + /// - **UTF-8**: The input is validated to be UTF-8. + /// + /// Specifying incorrect encoding is safe, but may result in unintended string values. 
+ pub fn try_from_bytes(bytes: &[u8], encoding: Encoding) -> Result<Self, StringError> { + Self::try_from_bytes_with_nul_check(bytes, encoding, true) + } + + /// Convert string from C-string with given encoding, returning `Err` on validation errors. + /// + /// Convenience function for [`try_from_bytes()`](Self::try_from_bytes); see its docs for more information. + pub fn try_from_cstr(cstr: &std::ffi::CStr, encoding: Encoding) -> Result<Self, StringError> { + Self::try_from_bytes_with_nul_check(cstr.to_bytes(), encoding, false) + } + + pub(super) fn try_from_bytes_with_nul_check( + bytes: &[u8], + encoding: Encoding, + check_nul: bool, + ) -> Result<Self, StringError> { + match encoding { + Encoding::Ascii => { + // If the bytes are ASCII, we can fall back to Latin-1, which is always valid (except for NUL). + // is_ascii() does *not* check for the NUL byte, so the check in the Latin-1 branch is still necessary. + if bytes.is_ascii() { + Self::try_from_bytes_with_nul_check(bytes, Encoding::Latin1, check_nul) + .map_err(|_e| StringError::new("intermediate NUL byte in ASCII string")) + } else { + Err(StringError::new("invalid ASCII")) + } + } + Encoding::Latin1 => { + // Intermediate NUL bytes are not accepted in Godot. Both ASCII + Latin-1 encodings need to explicitly check for this. + if check_nul && bytes.contains(&0) { + // Error overwritten when called from ASCII branch. + return Err(StringError::new("intermediate NUL byte in Latin-1 string")); + } + + let s = unsafe { + Self::new_with_string_uninit(|string_ptr| { + let ctor = interface_fn!(string_new_with_latin1_chars_and_len); + ctor( + string_ptr, + bytes.as_ptr() as *const std::ffi::c_char, + bytes.len() as i64, + ); + }) + }; + Ok(s) + } + Encoding::Utf8 => { + // from_utf8() accepts U+0000 as valid UTF-8, so intermediate NUL bytes are rejected explicitly (after validation, so malformed input still reports "invalid UTF-8"). + let utf8 = std::str::from_utf8(bytes).map_err(|e| StringError::with_source("invalid UTF-8", e))?; + if check_nul && bytes.contains(&0) { return Err(StringError::new("intermediate NUL byte in UTF-8 string")); } + + Ok(GString::from(utf8)) + } + } + } + /// Number of characters in the string.
/// /// _Godot equivalent: `length`_ @@ -260,7 +328,7 @@ impl From<&str> for GString { let ctor = interface_fn!(string_new_with_utf8_chars_and_len); ctor( string_ptr, - bytes.as_ptr() as *const c_char, + bytes.as_ptr() as *const std::ffi::c_char, bytes.len() as i64, ); }) @@ -307,7 +375,7 @@ impl From<&GString> for String { interface_fn!(string_to_utf8_chars)( string.string_sys(), - buf.as_mut_ptr() as *mut c_char, + buf.as_mut_ptr() as *mut std::ffi::c_char, len, ); diff --git a/godot-core/src/builtin/string/mod.rs b/godot-core/src/builtin/string/mod.rs index 93e1751d7..644487b80 100644 --- a/godot-core/src/builtin/string/mod.rs +++ b/godot-core/src/builtin/string/mod.rs @@ -54,6 +54,20 @@ impl FromGodot for String { } } +// ---------------------------------------------------------------------------------------------------------------------------------------------- +// Encoding + +/// Specifies string encoding. +/// +/// Used in functions such as [`GString::try_from_bytes()`][GString::try_from_bytes] to handle multiple input string encodings. +#[non_exhaustive] +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum Encoding { + Ascii, + Latin1, + Utf8, +} + // ---------------------------------------------------------------------------------------------------------------------------------------------- /// Returns a tuple of `(from, len)` from a Rust range. diff --git a/godot-core/src/builtin/string/string_name.rs b/godot-core/src/builtin/string/string_name.rs index ceadd1b61..f3a0921e6 100644 --- a/godot-core/src/builtin/string/string_name.rs +++ b/godot-core/src/builtin/string/string_name.rs @@ -4,13 +4,14 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at https://mozilla.org/MPL/2.0/. 
*/ - use std::fmt; use godot_ffi as sys; +use godot_ffi::interface_fn; use sys::{ffi_methods, GodotFfi}; -use crate::builtin::{inner, GString, NodePath, Variant}; +use crate::builtin::{inner, Encoding, GString, NodePath, Variant}; +use crate::meta::error::StringError; use crate::meta::AsArg; use crate::{impl_shared_string_api, meta}; @@ -60,6 +61,83 @@ impl StringName { Self { opaque } } + /// Convert string from bytes with given encoding, returning `Err` on validation errors. + /// + /// Intermediate `NUL` characters are not accepted in Godot and always return `Err`. + /// + /// Some notes on the encodings: + /// - **Latin-1:** Since every byte is a valid Latin-1 character, no validation besides the `NUL` byte is performed. + /// It is your responsibility to ensure that the input is valid Latin-1. + /// - **ASCII**: Subset of Latin-1, which is additionally validated to be valid, non-`NUL` ASCII characters. + /// - **UTF-8**: The input is validated to be UTF-8. + /// + /// Specifying incorrect encoding is safe, but may result in unintended string values. + pub fn try_from_bytes(bytes: &[u8], encoding: Encoding) -> Result<Self, StringError> { + Self::try_from_bytes_with_nul_check(bytes, encoding, true) + } + + /// Convert string from C-string with given encoding, returning `Err` on validation errors. + /// + /// Convenience function for [`try_from_bytes()`](Self::try_from_bytes); see its docs for more information. + /// + /// When called with `Encoding::Latin1`, this can be slightly more efficient than `try_from_bytes()`. + pub fn try_from_cstr(cstr: &std::ffi::CStr, encoding: Encoding) -> Result<Self, StringError> { + // Short-circuit the direct Godot 4.2 function for Latin-1, which takes a null-terminated C string. + #[cfg(since_api = "4.2")] + if encoding == Encoding::Latin1 { + // Note: CStr guarantees no intermediate NUL bytes, so we don't need to check for them.
+ + let is_static = sys::conv::SYS_FALSE; + let s = unsafe { + Self::new_with_string_uninit(|string_ptr| { + let ctor = interface_fn!(string_name_new_with_latin1_chars); + ctor( + string_ptr, + cstr.as_ptr() as *const std::ffi::c_char, + is_static, + ); + }) + }; + return Ok(s); + } + + Self::try_from_bytes_with_nul_check(cstr.to_bytes(), encoding, false) + } + + fn try_from_bytes_with_nul_check( + bytes: &[u8], + encoding: Encoding, + check_nul: bool, + ) -> Result<Self, StringError> { + match encoding { + Encoding::Ascii => { + // ASCII is a subset of UTF-8, and UTF-8 has a more direct implementation than Latin-1; thus use UTF-8 via `From<&str>`. + if !bytes.is_ascii() { + Err(StringError::new("invalid ASCII")) + } else if check_nul && bytes.contains(&0) { + Err(StringError::new("intermediate NUL byte in ASCII string")) + } else { + // SAFETY: ASCII is a subset of UTF-8 and was verified above. + let ascii = unsafe { std::str::from_utf8_unchecked(bytes) }; + Ok(Self::from(ascii)) + } + } + Encoding::Latin1 => { + // This branch is short-circuited if invoked for CStr and Godot 4.2+, which uses `string_name_new_with_latin1_chars` + // (requires nul-termination). In general, fall back to GString conversion. + GString::try_from_bytes_with_nul_check(bytes, Encoding::Latin1, check_nul) + .map(Self::from) + } + Encoding::Utf8 => { + // from_utf8() accepts U+0000 as valid UTF-8, so intermediate NUL bytes are rejected explicitly (after validation, so malformed input still reports "invalid UTF-8"). + let utf8 = std::str::from_utf8(bytes).map_err(|e| StringError::with_source("invalid UTF-8", e))?; + if check_nul && bytes.contains(&0) { return Err(StringError::new("intermediate NUL byte in UTF-8 string")); } + + Ok(StringName::from(utf8)) + } + } + } + /// Number of characters in the string.
/// /// _Godot equivalent: `length`_ diff --git a/godot-core/src/meta/error/mod.rs b/godot-core/src/meta/error/mod.rs index 4c16addd8..97562634e 100644 --- a/godot-core/src/meta/error/mod.rs +++ b/godot-core/src/meta/error/mod.rs @@ -10,7 +10,9 @@ mod call_error; mod convert_error; mod io_error; +mod string_error; pub use call_error::*; pub use convert_error::*; pub use io_error::*; +pub use string_error::*; diff --git a/godot-core/src/meta/error/string_error.rs b/godot-core/src/meta/error/string_error.rs new file mode 100644 index 000000000..cc1012a14 --- /dev/null +++ b/godot-core/src/meta/error/string_error.rs @@ -0,0 +1,51 @@ +/* + * Copyright (c) godot-rust; Bromeon and contributors. + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ + +use std::error::Error; +use std::fmt; + +/// Error related to string encoding/decoding; `Display` prints the message, followed by `: <source>` when a source error is attached. +#[derive(Debug)] +pub struct StringError { + message: String, + source: Option<Box<dyn Error + 'static>>, +} + +impl fmt::Display for StringError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(source) = self.source() { + write!(f, "{}: {}", self.message, source) + } else { + write!(f, "{}", self.message) + } + } +} + +impl Error for StringError { + fn source(&self) -> Option<&(dyn Error + 'static)> { + self.source.as_deref() + } +} + +impl StringError { + pub(crate) fn new(message: impl Into<String>) -> Self { + Self { + message: message.into(), + source: None, + } + } + + pub(crate) fn with_source( + message: impl Into<String>, + source: impl Into<Box<dyn Error + 'static>>, + ) -> Self { + Self { + message: message.into(), + source: Some(source.into()), + } + } +} diff --git a/itest/rust/src/builtin_tests/mod.rs b/itest/rust/src/builtin_tests/mod.rs index 85cc29ad4..1fb815950 100644 --- a/itest/rust/src/builtin_tests/mod.rs +++ b/itest/rust/src/builtin_tests/mod.rs @@ -38,6 +38,7 @@ mod string { mod gstring_test; mod
node_path_test; mod string_name_test; + mod string_test_macros; } mod script { diff --git a/itest/rust/src/builtin_tests/string/gstring_test.rs b/itest/rust/src/builtin_tests/string/gstring_test.rs index 173657f0d..33622556b 100644 --- a/itest/rust/src/builtin_tests/string/gstring_test.rs +++ b/itest/rust/src/builtin_tests/string/gstring_test.rs @@ -8,7 +8,7 @@ use std::collections::HashSet; use crate::framework::{expect_debug_panic_or_release_ok, itest}; -use godot::builtin::{GString, PackedStringArray}; +use godot::builtin::{Encoding, GString, PackedStringArray}; // TODO use tests from godot-rust/gdnative @@ -150,7 +150,7 @@ fn string_substr() { } #[itest] -fn string_find() { +fn gstring_find() { let s = GString::from("Hello World"); assert_eq!(s.find("o"), Some(4)); @@ -171,7 +171,7 @@ fn string_find() { } #[itest] -fn string_split() { +fn gstring_split() { let s = GString::from("Hello World"); assert_eq!(s.split(" "), packed(&["Hello", "World"])); assert_eq!( @@ -206,7 +206,7 @@ fn string_split() { } #[itest] -fn string_count() { +fn gstring_count() { let s = GString::from("Long sentence with Sentry guns."); assert_eq!(s.count("sent", ..), 1); assert_eq!(s.count("en", 6..), 3); @@ -224,7 +224,7 @@ fn string_count() { } #[itest] -fn string_erase() { +fn gstring_erase() { let s = GString::from("Hello World"); assert_eq!(s.erase(..), GString::new()); assert_eq!(s.erase(4..4), s); @@ -236,7 +236,7 @@ fn string_erase() { } #[itest] -fn string_insert() { +fn gstring_insert() { let s = GString::from("H World"); assert_eq!(s.insert(1, "i"), "Hi World".into()); assert_eq!(s.insert(1, "ello"), "Hello World".into()); @@ -248,7 +248,7 @@ fn string_insert() { } #[itest] -fn string_pad() { +fn gstring_pad() { let s = GString::from("123"); assert_eq!(s.lpad(5, '0'), "00123".into()); assert_eq!(s.lpad(2, ' '), "123".into()); @@ -266,7 +266,21 @@ fn string_pad() { assert_eq!(s.pad_zeros(2), "123.456".into()); } +// Byte and C-string conversions. 
+crate::generate_string_bytes_and_cstr_tests!( + builtin: GString, + tests: [ + gstring_from_bytes_ascii, + gstring_from_cstr_ascii, + gstring_from_bytes_latin1, + gstring_from_cstr_latin1, + gstring_from_bytes_utf8, + gstring_from_cstr_utf8, + ] +); + // ---------------------------------------------------------------------------------------------------------------------------------------------- +// Helpers fn packed(strings: &[&str]) -> PackedStringArray { strings.iter().map(|&s| GString::from(s)).collect() diff --git a/itest/rust/src/builtin_tests/string/string_name_test.rs b/itest/rust/src/builtin_tests/string/string_name_test.rs index 5a32ddce5..1023eef6e 100644 --- a/itest/rust/src/builtin_tests/string/string_name_test.rs +++ b/itest/rust/src/builtin_tests/string/string_name_test.rs @@ -8,7 +8,7 @@ use std::collections::HashSet; use crate::framework::{assert_eq_self, itest}; -use godot::builtin::{GString, NodePath, StringName}; +use godot::builtin::{Encoding, GString, NodePath, StringName}; #[itest] fn string_name_default() { @@ -162,3 +162,16 @@ fn string_name_with_null() { assert_eq!(left, right); } } + +// Byte and C-string conversions. +crate::generate_string_bytes_and_cstr_tests!( + builtin: StringName, + tests: [ + string_name_from_bytes_ascii, + string_name_from_cstr_ascii, + string_name_from_bytes_latin1, + string_name_from_cstr_latin1, + string_name_from_bytes_utf8, + string_name_from_cstr_utf8, + ] +); diff --git a/itest/rust/src/builtin_tests/string/string_test_macros.rs b/itest/rust/src/builtin_tests/string/string_test_macros.rs new file mode 100644 index 000000000..7773cf21e --- /dev/null +++ b/itest/rust/src/builtin_tests/string/string_test_macros.rs @@ -0,0 +1,164 @@ +/* + * Copyright (c) godot-rust; Bromeon and contributors. + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ + +//! 
Byte and C-string conversions. + +#[macro_export] +macro_rules! generate_string_bytes_and_cstr_tests { + ( + builtin: $T:ty, + tests: [ + $from_bytes_ascii:ident, + $from_cstr_ascii:ident, + $from_bytes_latin1:ident, + $from_cstr_latin1:ident, + $from_bytes_utf8:ident, + $from_cstr_utf8:ident, + ] + ) => { + #[itest] + fn $from_bytes_ascii() { + let ascii = <$T>::try_from_bytes(b"Hello", Encoding::Ascii).expect("valid ASCII"); + assert_eq!(ascii, <$T>::from("Hello")); + assert_eq!(ascii.len(), 5); + + let ascii_nul = <$T>::try_from_bytes(b"Hello\0", Encoding::Ascii); + let ascii_nul = ascii_nul.expect_err("intermediate NUL byte is not valid ASCII"); // at end, but still not NUL terminator. + assert_eq!( + ascii_nul.to_string(), + "intermediate NUL byte in ASCII string" + ); + + let latin1 = <$T>::try_from_bytes(b"/\xF0\xF5\xBE", Encoding::Ascii); + let latin1 = latin1.expect_err("Latin-1 is *not* valid ASCII"); + assert_eq!(latin1.to_string(), "invalid ASCII"); + + let utf8 = + <$T>::try_from_bytes(b"\xF6\xF0\x9F\x8D\x8E\xF0\x9F\x92\xA1", Encoding::Ascii); + let utf8 = utf8.expect_err("UTF-8 is *not* valid ASCII"); + assert_eq!(utf8.to_string(), "invalid ASCII"); + } + + #[itest] + fn $from_cstr_ascii() { + let ascii = <$T>::try_from_cstr(c"Hello", Encoding::Ascii); + let ascii = ascii.expect("valid ASCII"); + assert_eq!(ascii, <$T>::from("Hello")); + assert_eq!(ascii.len(), 5); + + let latin1 = <$T>::try_from_cstr(c"/ðõ¾", Encoding::Ascii); + let latin1 = latin1.expect_err("Latin-1 is *not* valid ASCII"); + assert_eq!(latin1.to_string(), "invalid ASCII"); + + let utf8 = <$T>::try_from_cstr(c"ö🍎A💡", Encoding::Ascii); + let utf8 = utf8.expect_err("UTF-8 is *not* valid ASCII"); + assert_eq!(utf8.to_string(), "invalid ASCII"); + } + + #[itest] + fn $from_bytes_latin1() { + let ascii = <$T>::try_from_bytes(b"Hello", Encoding::Latin1); + let ascii = ascii.expect("ASCII is valid Latin-1"); + assert_eq!(ascii, <$T>::from("Hello")); + assert_eq!(ascii.len(), 5); + + let 
latin1 = <$T>::try_from_bytes(b"/\xF0\xF5\xBE", Encoding::Latin1); + let latin1 = latin1.expect("Latin-1 is valid Latin-1"); + assert_eq!(latin1, <$T>::from("/ðõ¾")); + assert_eq!(latin1.len(), 4); + + let latin1_nul = <$T>::try_from_bytes(b"/\0\xF0\xF5\xBE", Encoding::Latin1); + let latin1_nul = latin1_nul.expect_err("intermediate NUL byte is not valid Latin-1"); + assert_eq!( + latin1_nul.to_string(), + "intermediate NUL byte in Latin-1 string" + ); + + // UTF-8 -> Latin-1: always succeeds, even if result is garbage, since every byte is a valid Latin-1 character (each byte becomes the code point of equal value, hence the escapes below). + let utf8 = <$T>::try_from_bytes( + b"\xC3\xB6\xF0\x9F\x8D\x8E\x41\xF0\x9F\x92\xA1", + Encoding::Latin1, + ); + let utf8 = utf8.expect("UTF-8 is valid Latin-1, even if garbage"); + assert_eq!(utf8, <$T>::from("\u{C3}\u{B6}\u{F0}\u{9F}\u{8D}\u{8E}A\u{F0}\u{9F}\u{92}\u{A1}")); + } + + #[itest] + fn $from_cstr_latin1() { + let ascii = <$T>::try_from_cstr(c"Hello", Encoding::Latin1); + let ascii = ascii.expect("ASCII is valid Latin-1"); + assert_eq!(ascii, <$T>::from("Hello")); + assert_eq!(ascii.len(), 5); + + // The C-string literal is interpreted as UTF-8, not Latin-1 (which is btw still valid Latin-1), see last test in this #[itest]. + // So we use explicit bytes in the following tests. + assert_eq!(c"/ðõ¾".to_bytes(), b"/\xC3\xB0\xC3\xB5\xC2\xBE"); + let latin1 = <$T>::try_from_cstr(c"/\xF0\xF5\xBE", Encoding::Latin1); + let latin1 = latin1.expect("Latin-1 is valid Latin-1"); + assert_eq!(latin1, <$T>::from("/ðõ¾")); + assert_eq!(latin1.len(), 4); + + // UTF-8 -> Latin-1: always succeeds, even if result is garbage, since every byte is a valid Latin-1 character.
+ let utf8 = <$T>::try_from_cstr(c"ö🍎A💡", Encoding::Latin1); + let utf8 = utf8.expect("UTF-8 is valid Latin-1, even if garbage"); + assert_eq!(utf8, <$T>::from("\u{C3}\u{B6}\u{F0}\u{9F}\u{8D}\u{8E}A\u{F0}\u{9F}\u{92}\u{A1}")); + } + + #[itest] + fn $from_bytes_utf8() { + let ascii = <$T>::try_from_bytes(b"Hello", Encoding::Utf8); + let ascii = ascii.expect("ASCII is valid UTF-8"); + assert_eq!(ascii, <$T>::from("Hello")); + assert_eq!(ascii.len(), 5); + + let latin1 = <$T>::try_from_bytes(b"/\xF0\xF5\xBE", Encoding::Utf8); + let latin1 = latin1.expect_err("Latin-1 is *not* valid UTF-8"); + // Note: depends on exact output of std's Utf8Error; might need format!() if that changes. + assert_eq!( + latin1.to_string(), + "invalid UTF-8: invalid utf-8 sequence of 1 bytes from index 1" + ); + + let utf8 = <$T>::try_from_bytes( + b"\xC3\xB6\xF0\x9F\x8D\x8E\x41\xF0\x9F\x92\xA1", + Encoding::Utf8, + ); + let utf8 = utf8.expect("UTF-8 is valid UTF-8"); + assert_eq!(utf8, <$T>::from("ö🍎A💡")); + assert_eq!(utf8.len(), 4); + + let utf8_nul = <$T>::try_from_bytes(b"\xC3\0A", Encoding::Utf8); + let utf8_nul = utf8_nul.expect_err("intermediate NUL byte is not valid UTF-8"); + assert_eq!( + utf8_nul.to_string(), + "invalid UTF-8: invalid utf-8 sequence of 1 bytes from index 0" + ); + } + + #[itest] + fn $from_cstr_utf8() { + let ascii = <$T>::try_from_cstr(c"Hello", Encoding::Utf8); + let ascii = ascii.expect("ASCII is valid UTF-8"); + assert_eq!(ascii, <$T>::from("Hello")); + assert_eq!(ascii.len(), 5); + + // The latin1 checks pass even though try_from_bytes() for the Latin-1 string b"/\xF0\xF5\xBE" fails. + // When using a C string literal, the characters are interpreted as UTF-8, *not* Latin-1, see following assertion.
+ assert_eq!(c"/ðõ¾".to_bytes(), b"/\xC3\xB0\xC3\xB5\xC2\xBE"); + let latin1 = <$T>::try_from_cstr(c"/ðõ¾", Encoding::Utf8); + let latin1 = + latin1.expect("Characters from Latin-1 set re-encoded as UTF-8 are valid UTF-8"); + assert_eq!(latin1, <$T>::from("/ðõ¾")); + assert_eq!(latin1.len(), 4); + + let utf8 = <$T>::try_from_cstr(c"ö🍎A💡", Encoding::Utf8); + let utf8 = utf8.expect("valid UTF-8"); + assert_eq!(utf8, <$T>::from("ö🍎A💡")); + assert_eq!(utf8.len(), 4); + } + }; +}