Skip to content

Commit 40c815e

Browse files
authored
Merge pull request #247 from dgrunwald/unicode-data
`PyString::data()`: return the internal representation of the Python unicode object
2 parents 284c70d + e3976d1 commit 40c815e

File tree

2 files changed

+196
-13
lines changed

2 files changed

+196
-13
lines changed

python3-sys/src/unicodeobject.rs

Lines changed: 112 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ use libc::{c_char, c_int, c_void, wchar_t};
22

33
use crate::object::*;
44
use crate::pyport::Py_ssize_t;
5+
#[cfg(not(Py_LIMITED_API))]
6+
use crate::pyport::Py_hash_t;
57

68
#[cfg(not(Py_LIMITED_API))]
79
#[deprecated(since = "0.2.1", note = "Deprecated since Python 3.3 / PEP 393")]
@@ -123,7 +125,7 @@ extern "C" {
123125
pub fn PyUnicode_FromOrdinal(ordinal: c_int) -> *mut PyObject;
124126
#[cfg(not(Py_3_9))]
125127
pub fn PyUnicode_ClearFreeList() -> c_int;
126-
#[cfg(not(Py_LIMITED_API))]
128+
#[cfg(any(not(Py_LIMITED_API), Py_3_10))]
127129
pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject, size: *mut Py_ssize_t) -> *const c_char;
128130
#[cfg(not(Py_LIMITED_API))]
129131
pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *const c_char;
@@ -429,4 +431,113 @@ extern "C" {
429431
pub fn PyUnicode_IsIdentifier(s: *mut PyObject) -> c_int;
430432
#[cfg(not(Py_LIMITED_API))]
431433
pub fn PyUnicode_AsUnicodeCopy(unicode: *mut PyObject) -> *mut Py_UNICODE;
434+
435+
#[cfg(not(Py_LIMITED_API))]
436+
fn _PyUnicode_Ready(o: *mut PyObject) -> c_int;
437+
}
438+
439+
#[repr(C)]
440+
#[cfg(not(Py_LIMITED_API))]
441+
pub struct PyASCIIObject {
442+
pub ob_base: PyObject,
443+
pub length: Py_ssize_t,
444+
pub hash: Py_hash_t,
445+
pub state: u32,
446+
pub wstr: *mut c_void
447+
}
448+
449+
#[repr(C)]
450+
#[cfg(not(Py_LIMITED_API))]
451+
pub struct PyCompactUnicodeObject {
452+
_base: PyASCIIObject,
453+
utf8_length: Py_ssize_t,
454+
utf8: *mut u8,
455+
wstr_length: Py_ssize_t
456+
}
457+
458+
#[repr(C)]
459+
#[cfg(not(Py_LIMITED_API))]
460+
pub struct PyUnicodeObject {
461+
_base: PyASCIIObject,
462+
data: *mut c_void
463+
}
464+
465+
#[cfg(not(Py_LIMITED_API))]
466+
#[inline]
467+
unsafe fn PyUnicode_IS_ASCII(o: *mut PyObject) -> bool {
468+
let ascii_bit = 1 << 6;
469+
let state = (*(o as *mut PyASCIIObject)).state;
470+
(state & ascii_bit) != 0
471+
}
472+
473+
#[cfg(not(Py_LIMITED_API))]
474+
#[inline]
475+
unsafe fn PyUnicode_IS_COMPACT(o: *mut PyObject) -> bool {
476+
let compact_bit = 1 << 5;
477+
let state = (*(o as *mut PyASCIIObject)).state;
478+
(state & compact_bit) != 0
479+
}
480+
481+
// Values that `PyUnicode_KIND` can return. The non-zero kinds equal the
// width in bytes of one element of the buffer returned by `PyUnicode_DATA`.

/// Kind value 0: legacy `wchar_t`-based string (per CPython's header, a
/// string that has not been made ready yet).
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_WCHAR_KIND: u32 = 0;

/// Kind value 1: one byte per element.
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_1BYTE_KIND: u32 = 1;

/// Kind value 2: two bytes per element.
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_2BYTE_KIND: u32 = 2;

/// Kind value 4: four bytes per element.
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_4BYTE_KIND: u32 = 4;
489+
490+
#[cfg(not(Py_LIMITED_API))]
491+
#[inline]
492+
pub unsafe fn PyUnicode_KIND(o: *mut PyObject) -> u32 {
493+
debug_assert!(PyUnicode_Check(o) > 0);
494+
debug_assert!(PyUnicode_IS_READY(o));
495+
let state = (*(o as *mut PyASCIIObject)).state;
496+
(state >> 2) & 7
497+
}
498+
499+
#[cfg(not(Py_LIMITED_API))]
500+
pub unsafe fn PyUnicode_DATA(o: *mut PyObject) -> *mut c_void {
501+
debug_assert!(PyUnicode_Check(o) > 0);
502+
debug_assert!(PyUnicode_IS_READY(o));
503+
if PyUnicode_IS_COMPACT(o) {
504+
// fn _PyUnicode_COMPACT_DATA
505+
if PyUnicode_IS_ASCII(o) {
506+
(o as *mut PyASCIIObject).offset(1) as *mut c_void
507+
} else {
508+
(o as *mut PyCompactUnicodeObject).offset(1) as *mut c_void
509+
}
510+
} else {
511+
// fn _PyUnicode_NONCOMPACT_DATA
512+
let data = (*(o as *mut PyUnicodeObject)).data;
513+
debug_assert!(!data.is_null());
514+
data
515+
}
516+
}
517+
518+
#[cfg(not(Py_LIMITED_API))]
519+
#[inline]
520+
pub unsafe fn PyUnicode_GET_LENGTH(o: *mut PyObject) -> Py_ssize_t {
521+
debug_assert!(PyUnicode_Check(o) > 0);
522+
debug_assert!(PyUnicode_IS_READY(o));
523+
(*(o as *mut PyASCIIObject)).length
524+
}
525+
526+
#[cfg(not(Py_LIMITED_API))]
527+
#[inline]
528+
unsafe fn PyUnicode_IS_READY(o: *mut PyObject) -> bool {
529+
let ready_bit = 1 << 7;
530+
let state = (*(o as *mut PyASCIIObject)).state;
531+
(state & ready_bit) != 0
532+
}
533+
534+
#[cfg(not(Py_LIMITED_API))]
535+
#[inline]
536+
pub unsafe fn PyUnicode_READY(o: *mut PyObject) -> c_int {
537+
debug_assert!(PyUnicode_Check(o) > 0);
538+
if PyUnicode_IS_READY(o) {
539+
0
540+
} else {
541+
_PyUnicode_Ready(o)
542+
}
432543
}

src/objects/string.rs

Lines changed: 84 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ impl<'a> PyStringData<'a> {
159159
)),
160160
},
161161
PyStringData::Latin1(data) => {
162-
if data.iter().all(|&b| b.is_ascii()) {
162+
if data.is_ascii() {
163163
Ok(Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) }))
164164
} else {
165165
Ok(Cow::Owned(data.iter().map(|&b| b as char).collect()))
@@ -214,7 +214,7 @@ impl<'a> PyStringData<'a> {
214214
match self {
215215
PyStringData::Utf8(data) => String::from_utf8_lossy(data),
216216
PyStringData::Latin1(data) => {
217-
if data.iter().all(|&b| b.is_ascii()) {
217+
if data.is_ascii() {
218218
Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) })
219219
} else {
220220
Cow::Owned(data.iter().map(|&b| b as char).collect())
@@ -283,17 +283,24 @@ impl PyString {
283283
}
284284

285285
#[cfg(feature = "python3-sys")]
286-
fn data_impl(&self, py: Python) -> PyStringData {
287-
// TODO: return the original representation instead
288-
// of forcing the UTF-8 representation to be created.
289-
let mut size: ffi::Py_ssize_t = 0;
286+
fn data_impl(&self, _py: Python) -> PyStringData {
287+
let ptr = self.as_ptr();
290288
unsafe {
291-
let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size) as *const u8;
292-
if data.is_null() {
293-
PyErr::fetch(py).print(py);
294-
panic!("PyUnicode_AsUTF8AndSize failed");
289+
let ready = ffi::PyUnicode_READY(ptr);
290+
if ready < 0 {
291+
// should fail only on OOM
292+
ffi::PyErr_Print();
293+
panic!("PyUnicode_READY failed");
294+
}
295+
let size = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
296+
let data = ffi::PyUnicode_DATA(ptr);
297+
let kind = ffi::PyUnicode_KIND(ptr);
298+
match kind {
299+
ffi::PyUnicode_1BYTE_KIND => PyStringData::Latin1(std::slice::from_raw_parts(data as *const u8, size)),
300+
ffi::PyUnicode_2BYTE_KIND => PyStringData::Utf16(std::slice::from_raw_parts(data as *const u16, size)),
301+
ffi::PyUnicode_4BYTE_KIND => PyStringData::Utf32(std::slice::from_raw_parts(data as *const u32, size)),
302+
_ => panic!("Unknown PyUnicode_KIND")
295303
}
296-
PyStringData::Utf8(std::slice::from_raw_parts(data, size as usize))
297304
}
298305
}
299306

@@ -306,7 +313,26 @@ impl PyString {
306313
/// (containing unpaired surrogates, or a Python 2.7 byte string that is
307314
/// not valid UTF-8).
308315
pub fn to_string(&self, py: Python) -> PyResult<Cow<str>> {
309-
self.data(py).to_string(py)
316+
#[cfg(feature = "python3-sys")]
317+
unsafe {
318+
// On Python 3, we can use the UTF-8 representation stored
319+
// inside the Python string.
320+
// This should produce identical results to
321+
// `self.data(py).to_string(py)` but avoids
322+
// re-encoding the string on every to_string call.
323+
let mut size: ffi::Py_ssize_t = 0;
324+
let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size);
325+
if data.is_null() {
326+
return Err(PyErr::fetch(py));
327+
} else {
328+
let slice = std::slice::from_raw_parts(data as *const u8, size as usize);
329+
return Ok(Cow::Borrowed(std::str::from_utf8_unchecked(slice)));
330+
}
331+
}
332+
#[cfg(feature = "python27-sys")]
333+
{
334+
return self.data(py).to_string(py);
335+
}
310336
}
311337

312338
/// Convert the `PyString` into a Rust string.
@@ -535,6 +561,7 @@ impl RefFromPyObject for [u8] {
535561
mod test {
536562
use crate::conversion::{RefFromPyObject, ToPyObject};
537563
use crate::python::{Python, PythonObject};
564+
use super::{PyString, PyStringData};
538565

539566
#[test]
540567
fn test_non_bmp() {
@@ -583,4 +610,49 @@ mod test {
583610
let v = py_bytes.extract::<Vec<u8>>(py).unwrap();
584611
assert_eq!(b"Hello", &v[..]);
585612
}
613+
614+
/// A string containing U+00E4 must be exposed as Latin-1 data on Python 3
/// and still extract to the expected Rust `String`.
#[allow(unused_variables)] // when compiling for py2.7
#[test]
fn test_extract_umlaut() {
    let gil = Python::acquire_gil();
    let py = gil.python();
    let py_string = py.eval("u'x=\\u00e4'", None, None).unwrap();
    let as_string = py_string.cast_as::<PyString>(py).unwrap();
    let data = as_string.data(py);
    #[cfg(feature = "python3-sys")]
    {
        match data {
            PyStringData::Latin1(s) => assert_eq!([b'x', b'=', 0xe4], *s),
            _ => panic!("Expected PyStringData::Latin1"),
        }
    }
    let extracted = py_string.extract::<String>(py).unwrap();
    assert_eq!("x=ä", extracted);
}
631+
632+
/// A string containing the lone surrogate U+D800 must be exposed as UTF-16
/// data on Python 3, and strict extraction to `String` must fail.
#[allow(unused_variables)] // when compiling for py2.7
#[test]
fn test_extract_lone_surrogate() {
    let gil = Python::acquire_gil();
    let py = gil.python();
    let py_string = py.eval("u'x=\\ud800'", None, None).unwrap();
    let as_string = py_string.cast_as::<PyString>(py).unwrap();
    let data = as_string.data(py);
    #[cfg(feature = "python3-sys")]
    {
        match data {
            PyStringData::Utf16(s) => assert_eq!(['x' as u16, '=' as u16, 0xd800], *s),
            _ => panic!("Expected PyStringData::Utf16"),
        }
    }
    let extracted = py_string.extract::<String>(py);
    assert!(extracted.is_err());
}
649+
650+
/// Lossy conversion must replace the lone surrogate U+D800 with U+FFFD.
#[test]
fn test_extract_lone_surrogate_lossy() {
    let gil = Python::acquire_gil();
    let py = gil.python();
    let py_string = py.eval("u'x=\\ud800'", None, None).unwrap();
    let as_string = py_string.cast_as::<PyString>(py).unwrap();
    let result = as_string.to_string_lossy(py);
    assert_eq!("x=\u{fffd}", result);
}
586658
}

0 commit comments

Comments
 (0)