Skip to content

Commit 2e3a120

Browse files
committed
GString: add try_from_bytes(), try_from_cstr(), Encoding
1 parent d1a25b3 commit 2e3a120

File tree

4 files changed

+198
-5
lines changed

4 files changed

+198
-5
lines changed

godot-core/src/builtin/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ pub mod __prelude_reexport {
168168
pub use rect2i::*;
169169
pub use rid::*;
170170
pub use signal::*;
171-
pub use string::{GString, NodePath, StringName};
171+
pub use string::{Encoding, GString, NodePath, StringName};
172172
pub use transform2d::*;
173173
pub use transform3d::*;
174174
pub use variant::*;

godot-core/src/builtin/string/gstring.rs

+58-3
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@
66
*/
77

88
use std::convert::Infallible;
9-
use std::ffi::c_char;
109
use std::fmt;
1110
use std::fmt::Write;
1211

1312
use godot_ffi as sys;
1413
use sys::types::OpaqueString;
1514
use sys::{ffi_methods, interface_fn, GodotFfi};
1615

16+
use crate::builtin::string::Encoding;
1717
use crate::builtin::{inner, NodePath, StringName, Variant};
1818
use crate::meta::AsArg;
1919
use crate::{impl_shared_string_api, meta};
@@ -77,6 +77,61 @@ impl GString {
7777
Self::default()
7878
}
7979

80+
/// Convert string from bytes with given encoding, returning `None` on validation errors.
81+
///
82+
/// Intermediate `NUL` characters are not accepted in Godot and always return `None`.
83+
///
84+
/// Some notes on the encodings:
85+
/// - **Latin-1:** Since every byte is a valid Latin-1 character, no validation besides the `NUL` byte is performed.
86+
/// It is your responsibility to ensure that the input is valid Latin-1.
87+
/// - **ASCII**: Subset of Latin-1, which is additionally validated to be valid, non-`NUL` ASCII characters.
88+
/// - **UTF-8**: The input is validated to be UTF-8.
89+
///
90+
/// Specifying incorrect encoding is safe, but may result in unintended string values.
91+
pub fn try_from_bytes(bytes: &[u8], encoding: Encoding) -> Option<Self> {
92+
match encoding {
93+
Encoding::Ascii => {
94+
// If the bytes are ASCII, we can fall back to Latin-1, which is always valid (except for NUL).
95+
// is_ascii() does *not* check for the NUL byte, so the check in the Latin-1 branch is still necessary.
96+
if bytes.is_ascii() {
97+
Self::try_from_bytes(bytes, Encoding::Latin1)
98+
} else {
99+
None
100+
}
101+
}
102+
Encoding::Latin1 => {
103+
// Intermediate NUL bytes are not accepted in Godot. Both ASCII + Latin-1 encodings need to explicitly check for this.
104+
if bytes.contains(&0) {
105+
return None;
106+
}
107+
108+
let s = unsafe {
109+
Self::new_with_string_uninit(|string_ptr| {
110+
let ctor = interface_fn!(string_new_with_latin1_chars_and_len);
111+
ctor(
112+
string_ptr,
113+
bytes.as_ptr() as *const std::ffi::c_char,
114+
bytes.len() as i64,
115+
);
116+
})
117+
};
118+
Some(s)
119+
}
120+
Encoding::Utf8 => {
121+
// from_utf8() also checks for intermediate NUL bytes.
122+
let utf8 = std::str::from_utf8(bytes);
123+
utf8.ok().map(GString::from)
124+
}
125+
}
126+
}
127+
128+
/// Convert string from bytes with given encoding, returning `None` on validation errors.
129+
///
130+
/// Convenience function for [`try_from_bytes()`](Self::try_from_bytes); see its docs for more information.
131+
pub fn try_from_cstr(cstr: &std::ffi::CStr, encoding: Encoding) -> Option<Self> {
132+
Self::try_from_bytes(cstr.to_bytes(), encoding)
133+
}
134+
80135
/// Number of characters in the string.
81136
///
82137
/// _Godot equivalent: `length`_
@@ -260,7 +315,7 @@ impl From<&str> for GString {
260315
let ctor = interface_fn!(string_new_with_utf8_chars_and_len);
261316
ctor(
262317
string_ptr,
263-
bytes.as_ptr() as *const c_char,
318+
bytes.as_ptr() as *const std::ffi::c_char,
264319
bytes.len() as i64,
265320
);
266321
})
@@ -307,7 +362,7 @@ impl From<&GString> for String {
307362

308363
interface_fn!(string_to_utf8_chars)(
309364
string.string_sys(),
310-
buf.as_mut_ptr() as *mut c_char,
365+
buf.as_mut_ptr() as *mut std::ffi::c_char,
311366
len,
312367
);
313368

godot-core/src/builtin/string/mod.rs

+13
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,19 @@ impl FromGodot for String {
5454
}
5555
}
5656

57+
// ----------------------------------------------------------------------------------------------------------------------------------------------
58+
// Encoding
59+
60+
/// Specifies string encoding.
///
/// Used in functions such as [`GString::try_from_bytes()`][GString::try_from_bytes] to handle multiple input string encodings.
///
/// The enum is a plain tag without payload, so it is cheap to copy, compare and print.
#[non_exhaustive]
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum Encoding {
    /// ASCII; a subset of Latin-1 whose input is additionally validated to consist of ASCII characters only.
    Ascii,

    /// Latin-1; every byte is a valid character, so no validation besides the `NUL` byte is possible.
    Latin1,

    /// UTF-8; input is validated to be well-formed UTF-8.
    Utf8,
}
69+
5770
// ----------------------------------------------------------------------------------------------------------------------------------------------
5871

5972
/// Returns a tuple of `(from, len)` from a Rust range.

itest/rust/src/builtin_tests/string/gstring_test.rs

+126-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
use std::collections::HashSet;
99

1010
use crate::framework::{expect_debug_panic_or_release_ok, itest};
11-
use godot::builtin::{GString, PackedStringArray};
11+
use godot::builtin::{Encoding, GString, PackedStringArray};
1212

1313
// TODO use tests from godot-rust/gdnative
1414

@@ -267,6 +267,131 @@ fn string_pad() {
267267
}
268268

269269
// ----------------------------------------------------------------------------------------------------------------------------------------------
270+
// Byte and C-string conversions
271+
272+
#[itest]
273+
fn string_from_bytes_ascii() {
274+
let ascii = GString::try_from_bytes(b"Hello", Encoding::Ascii).expect("valid ASCII");
275+
assert_eq!(ascii, GString::from("Hello"));
276+
assert_eq!(ascii.len(), 5);
277+
278+
let ascii_nul = GString::try_from_bytes(b"Hello\0", Encoding::Ascii);
279+
assert_eq!(ascii_nul, None, "intermediate NUL byte is not valid ASCII"); // at end, but still not NUL terminator.
280+
281+
let latin1 = GString::try_from_bytes(b"/\xF0\xF5\xBE", Encoding::Ascii);
282+
assert_eq!(latin1, None, "Latin-1 is *not* valid ASCII");
283+
284+
let utf8 = GString::try_from_bytes(b"\xF6\xF0\x9F\x8D\x8E\xF0\x9F\x92\xA1", Encoding::Ascii);
285+
assert_eq!(utf8, None, "UTF-8 is *not* valid ASCII");
286+
}
287+
288+
#[itest]
289+
fn string_from_cstr_ascii() {
290+
let ascii = GString::try_from_cstr(c"Hello", Encoding::Ascii);
291+
let ascii = ascii.expect("valid ASCII");
292+
assert_eq!(ascii, GString::from("Hello"));
293+
assert_eq!(ascii.len(), 5);
294+
295+
let latin1 = GString::try_from_cstr(c"/ðõ¾", Encoding::Ascii);
296+
assert_eq!(latin1, None, "Latin-1 is *not* valid ASCII");
297+
298+
let utf8 = GString::try_from_cstr(c"ö🍎A💡", Encoding::Ascii);
299+
assert_eq!(utf8, None, "UTF-8 is *not* valid ASCII");
300+
}
301+
302+
#[itest]
303+
fn string_from_bytes_latin1() {
304+
let ascii = GString::try_from_bytes(b"Hello", Encoding::Latin1);
305+
let ascii = ascii.expect("ASCII is valid Latin-1");
306+
assert_eq!(ascii, GString::from("Hello"));
307+
assert_eq!(ascii.len(), 5);
308+
309+
let latin1 = GString::try_from_bytes(b"/\xF0\xF5\xBE", Encoding::Latin1);
310+
let latin1 = latin1.expect("Latin-1 is valid Latin-1");
311+
assert_eq!(latin1, GString::from("/ðõ¾"));
312+
assert_eq!(latin1.len(), 4);
313+
314+
let latin1_nul = GString::try_from_bytes(b"/\0\xF0\xF5\xBE", Encoding::Latin1);
315+
assert_eq!(
316+
latin1_nul, None,
317+
"intermediate NUL byte is not valid Latin-1"
318+
);
319+
320+
// UTF-8 -> Latin-1: always succeeds, even if result is garbage, since every byte is a valid Latin-1 character.
321+
let utf8 = GString::try_from_bytes(
322+
b"\xC3\xB6\xF0\x9F\x8D\x8E\x41\xF0\x9F\x92\xA1",
323+
Encoding::Latin1,
324+
);
325+
let utf8 = utf8.expect("UTF-8 is valid Latin-1, even if garbage");
326+
assert_eq!(utf8, GString::from("ö🍎A💡"));
327+
}
328+
329+
#[itest]
330+
fn string_from_cstr_latin1() {
331+
let ascii = GString::try_from_cstr(c"Hello", Encoding::Latin1);
332+
let ascii = ascii.expect("ASCII is valid Latin-1");
333+
assert_eq!(ascii, GString::from("Hello"));
334+
assert_eq!(ascii.len(), 5);
335+
336+
// The C-string literal is interpreted as UTF-8, not Latin-1 (which is btw still valid Latin-1), see last test in this #[itest].
337+
// So we use explicit bytes in the following tests.
338+
assert_eq!(c"/ðõ¾".to_bytes(), b"/\xC3\xB0\xC3\xB5\xC2\xBE");
339+
let latin1 = GString::try_from_cstr(c"/\xF0\xF5\xBE", Encoding::Latin1);
340+
let latin1 = latin1.expect("Latin-1 is valid Latin-1");
341+
assert_eq!(latin1, GString::from("/ðõ¾"));
342+
assert_eq!(latin1.len(), 4);
343+
344+
// UTF-8 -> Latin-1: always succeeds, even if result is garbage, since every byte is a valid Latin-1 character.
345+
let utf8 = GString::try_from_cstr(c"ö🍎A💡", Encoding::Latin1);
346+
let utf8 = utf8.expect("UTF-8 is valid Latin-1, even if garbage");
347+
assert_eq!(utf8, GString::from("ö🍎A💡"));
348+
}
349+
350+
#[itest]
351+
fn string_from_bytes_utf8() {
352+
let ascii = GString::try_from_bytes(b"Hello", Encoding::Utf8);
353+
let ascii = ascii.expect("ASCII is valid UTF-8");
354+
assert_eq!(ascii, GString::from("Hello"));
355+
assert_eq!(ascii.len(), 5);
356+
357+
let latin1 = GString::try_from_bytes(b"/\xF0\xF5\xBE", Encoding::Utf8);
358+
assert_eq!(latin1, None, "Latin-1 is *not* valid UTF-8");
359+
360+
let utf8 = GString::try_from_bytes(
361+
b"\xC3\xB6\xF0\x9F\x8D\x8E\x41\xF0\x9F\x92\xA1",
362+
Encoding::Utf8,
363+
);
364+
let utf8 = utf8.expect("UTF-8 is valid UTF-8");
365+
assert_eq!(utf8, GString::from("ö🍎A💡"));
366+
assert_eq!(utf8.len(), 4);
367+
368+
let utf8_nul = GString::try_from_bytes(b"\xC3\0A", Encoding::Utf8);
369+
assert_eq!(utf8_nul, None, "intermediate NUL byte is not valid UTF-8");
370+
}
371+
372+
#[itest]
373+
fn string_from_cstr_utf8() {
374+
let ascii = GString::try_from_cstr(c"Hello", Encoding::Utf8);
375+
let ascii = ascii.expect("ASCII is valid UTF-8");
376+
assert_eq!(ascii, GString::from("Hello"));
377+
assert_eq!(ascii.len(), 5);
378+
379+
// The latin1 checks pass even though try_from_bytes() for the Latin-1 string b"/\xF0\xF5\xBE" fails.
380+
// When using a C string literal, the characters are interpreted as UTF-8, *not* Latin-1, see following assertion.
381+
assert_eq!(c"/ðõ¾".to_bytes(), b"/\xC3\xB0\xC3\xB5\xC2\xBE");
382+
let latin1 = GString::try_from_cstr(c"/ðõ¾", Encoding::Utf8);
383+
let latin1 = latin1.expect("Characters from Latin-1 set re-encoded as UTF-8 are valid UTF-8");
384+
assert_eq!(latin1, GString::from("/ðõ¾"));
385+
assert_eq!(latin1.len(), 4);
386+
387+
let utf8 = GString::try_from_cstr(c"ö🍎A💡", Encoding::Utf8);
388+
let utf8 = utf8.expect("valid UTF-8");
389+
assert_eq!(utf8, GString::from("ö🍎A💡"));
390+
assert_eq!(utf8.len(), 4);
391+
}
392+
393+
// ----------------------------------------------------------------------------------------------------------------------------------------------
394+
// Helpers
270395

271396
fn packed(strings: &[&str]) -> PackedStringArray {
272397
strings.iter().map(|&s| GString::from(s)).collect()

0 commit comments

Comments
 (0)