@@ -159,7 +159,7 @@ impl<'a> PyStringData<'a> {
159
159
) ) ,
160
160
} ,
161
161
PyStringData :: Latin1 ( data) => {
162
- if data. iter ( ) . all ( | & b| b . is_ascii ( ) ) {
162
+ if data. is_ascii ( ) {
163
163
Ok ( Cow :: Borrowed ( unsafe { str:: from_utf8_unchecked ( data) } ) )
164
164
} else {
165
165
Ok ( Cow :: Owned ( data. iter ( ) . map ( |& b| b as char ) . collect ( ) ) )
@@ -214,7 +214,7 @@ impl<'a> PyStringData<'a> {
214
214
match self {
215
215
PyStringData :: Utf8 ( data) => String :: from_utf8_lossy ( data) ,
216
216
PyStringData :: Latin1 ( data) => {
217
- if data. iter ( ) . all ( | & b| b . is_ascii ( ) ) {
217
+ if data. is_ascii ( ) {
218
218
Cow :: Borrowed ( unsafe { str:: from_utf8_unchecked ( data) } )
219
219
} else {
220
220
Cow :: Owned ( data. iter ( ) . map ( |& b| b as char ) . collect ( ) )
@@ -283,17 +283,24 @@ impl PyString {
283
283
}
284
284
285
285
// Python 3 (`python3-sys`) implementation of `data_impl`, shown as a diff hunk.
// Removed (`-`) side: asked CPython for the UTF-8 form via
// `PyUnicode_AsUTF8AndSize` and returned `PyStringData::Utf8`, panicking after
// printing the Python error when the returned pointer was null.
// Added (`+`) side: returns a `PyStringData::Latin1` / `Utf16` / `Utf32` slice
// picked from `PyUnicode_KIND`, so no UTF-8 conversion is requested here.
// Panics if `PyUnicode_READY` reports failure or the kind is not one of the
// three matched constants.
#[ cfg( feature = "python3-sys" ) ]
286
- fn data_impl ( & self , py : Python ) -> PyStringData {
287
- // TODO: return the original representation instead
288
- // of forcing the UTF-8 representation to be created.
289
- let mut size: ffi:: Py_ssize_t = 0 ;
286
// The added lines never reference the token, hence the `_py` name.
+ fn data_impl ( & self , _py : Python ) -> PyStringData {
287
+ let ptr = self . as_ptr ( ) ;
290
288
unsafe {
291
- let data = ffi:: PyUnicode_AsUTF8AndSize ( self . as_ptr ( ) , & mut size) as * const u8 ;
292
- if data. is_null ( ) {
293
- PyErr :: fetch ( py) . print ( py) ;
294
- panic ! ( "PyUnicode_AsUTF8AndSize failed" ) ;
289
// A negative return from PyUnicode_READY is treated as fatal: the error is
// printed through the C API and the function panics.
+ let ready = ffi:: PyUnicode_READY ( ptr) ;
290
+ if ready < 0 {
291
+ // should fail only on OOM
292
+ ffi:: PyErr_Print ( ) ;
293
+ panic ! ( "PyUnicode_READY failed" ) ;
294
+ }
295
// `size` is used below as the element count of every slice, regardless of
// element width.
+ let size = ffi:: PyUnicode_GET_LENGTH ( ptr) as usize ;
296
+ let data = ffi:: PyUnicode_DATA ( ptr) ;
297
+ let kind = ffi:: PyUnicode_KIND ( ptr) ;
298
// The kind selects the element type (u8 / u16 / u32) that `data` is cast to.
// NOTE(review): the slices borrow the buffer behind `ptr`; presumably it stays
// valid and suitably aligned for as long as `self` is alive — confirm against
// the CPython documentation.
+ match kind {
299
+ ffi:: PyUnicode_1BYTE_KIND => PyStringData :: Latin1 ( std:: slice:: from_raw_parts ( data as * const u8 , size) ) ,
300
+ ffi:: PyUnicode_2BYTE_KIND => PyStringData :: Utf16 ( std:: slice:: from_raw_parts ( data as * const u16 , size) ) ,
301
+ ffi:: PyUnicode_4BYTE_KIND => PyStringData :: Utf32 ( std:: slice:: from_raw_parts ( data as * const u32 , size) ) ,
302
+ _ => panic ! ( "Unknown PyUnicode_KIND" )
295
303
}
296
- PyStringData :: Utf8 ( std:: slice:: from_raw_parts ( data, size as usize ) )
297
304
}
298
305
}
299
306
@@ -306,7 +313,26 @@ impl PyString {
306
313
/// (containing unpaired surrogates, or a Python 2.7 byte string that is
307
314
/// not valid UTF-8).
308
315
// Diff hunk for `to_string`. The removed (`-`) body delegated unconditionally
// to `self.data(py).to_string(py)`. The added (`+`) body keeps that delegation
// only under `python27-sys`; under `python3-sys` it borrows the UTF-8 buffer
// returned by `PyUnicode_AsUTF8AndSize` and returns `Err(PyErr::fetch(py))`
// when that call yields a null pointer.
// NOTE(review): each branch is behind its own `cfg`, so the body presumably
// relies on exactly one of the two features being enabled — confirm.
pub fn to_string ( & self , py : Python ) -> PyResult < Cow < str > > {
309
- self . data ( py) . to_string ( py)
316
+ #[ cfg( feature = "python3-sys" ) ]
317
+ unsafe {
318
+ // On Python 3, we can use the UTF-8 representation stored
319
+ // inside the Python string.
320
+ // This should produce identical results to
321
+ // `self.data(py).to_string(py)` but avoids
322
+ // re-encoding the string on every to_string call.
323
+ let mut size: ffi:: Py_ssize_t = 0 ;
324
+ let data = ffi:: PyUnicode_AsUTF8AndSize ( self . as_ptr ( ) , & mut size) ;
325
// Null pointer: hand the pending Python error to the caller instead of panicking.
+ if data. is_null ( ) {
326
+ return Err ( PyErr :: fetch ( py) ) ;
327
+ } else {
328
// The bytes are wrapped as `str` without validation.
// NOTE(review): this assumes the C API only ever returns well-formed UTF-8
// on success — confirm against the CPython documentation.
+ let slice = std:: slice:: from_raw_parts ( data as * const u8 , size as usize ) ;
329
+ return Ok ( Cow :: Borrowed ( std:: str:: from_utf8_unchecked ( slice) ) ) ;
330
+ }
331
+ }
332
// Python 2.7 build: same delegation as the removed line above.
+ #[ cfg( feature = "python27-sys" ) ]
333
+ {
334
+ return self . data ( py) . to_string ( py) ;
335
+ }
310
336
}
311
337
312
338
/// Convert the `PyString` into a Rust string.
@@ -535,6 +561,7 @@ impl RefFromPyObject for [u8] {
535
561
mod test {
536
562
use crate :: conversion:: { RefFromPyObject , ToPyObject } ;
537
563
use crate :: python:: { Python , PythonObject } ;
564
+ use super :: { PyString , PyStringData } ;
538
565
539
566
#[ test]
540
567
fn test_non_bmp ( ) {
@@ -583,4 +610,49 @@ mod test {
583
610
let v = py_bytes. extract :: < Vec < u8 > > ( py) . unwrap ( ) ;
584
611
assert_eq ! ( b"Hello" , & v[ ..] ) ;
585
612
}
613
+
614
// Test added by this hunk: evaluates a Python unicode literal, checks the raw
// representation reported by `PyString::data` (python3-sys builds only), then
// checks extraction to a Rust `String`.
// `data` is only read inside the `python3-sys` block, which is why the
// unused-variable lint is silenced for the other build.
+ #[ allow( unused_variables) ] // when compiling for py2.7
615
+ #[ test]
616
+ fn test_extract_umlaut ( ) {
617
+ let gil = Python :: acquire_gil ( ) ;
618
+ let py = gil. python ( ) ;
619
// NOTE(review): as rendered, the literal has a space between the backslash
// pair and `u00e4`. With that space the Python source would not produce the
// 0xe4 code unit asserted below — presumably an extraction artifact of this
// diff view; confirm against the upstream file.
+ let py_string = py. eval ( "u'x=\\ u00e4'" , None , None ) . unwrap ( ) ;
620
+ let data = py_string. cast_as :: < PyString > ( py) . unwrap ( ) . data ( py) ;
621
+ #[ cfg( feature = "python3-sys" ) ]
622
+ {
623
// Expects the one-byte-per-code-point variant holding 'x', '=', 0xe4.
+ if let PyStringData :: Latin1 ( s) = data {
624
+ assert_eq ! ( [ b'x' , b'=' , 0xe4 ] , * s) ;
625
+ } else {
626
+ panic ! ( "Expected PyStringData::Latin1" ) ;
627
+ }
628
+ }
629
// Extraction to `String` is checked on every build, not only python3-sys.
+ assert_eq ! ( "x=ä" , py_string. extract:: <String >( py) . unwrap( ) ) ;
630
+ }
631
+
632
// Test added by this hunk: a string containing the unpaired surrogate 0xd800
// must be reported as `PyStringData::Utf16` on python3-sys builds, and strict
// extraction to a Rust `String` must return an error.
// `data` is only read inside the `python3-sys` block, which is why the
// unused-variable lint is silenced for the other build.
+ #[ allow( unused_variables) ] // when compiling for py2.7
633
+ #[ test]
634
+ fn test_extract_lone_surrogate ( ) {
635
+ let gil = Python :: acquire_gil ( ) ;
636
+ let py = gil. python ( ) ;
637
// NOTE(review): as rendered, the literal has a space between the backslash
// pair and `ud800`. With that space the Python source would not produce the
// 0xd800 code unit asserted below — presumably an extraction artifact of this
// diff view; confirm against the upstream file.
+ let py_string = py. eval ( "u'x=\\ ud800'" , None , None ) . unwrap ( ) ;
638
+ let data = py_string. cast_as :: < PyString > ( py) . unwrap ( ) . data ( py) ;
639
+ #[ cfg( feature = "python3-sys" ) ]
640
+ {
641
// Expects 16-bit code units: 'x', '=', then the lone surrogate.
+ if let PyStringData :: Utf16 ( s) = data {
642
+ assert_eq ! ( [ 'x' as u16 , '=' as u16 , 0xd800 ] , * s) ;
643
+ } else {
644
+ panic ! ( "Expected PyStringData::Utf16" ) ;
645
+ }
646
+ }
647
// Strict conversion must fail rather than substitute a replacement character.
+ assert ! ( py_string. extract:: <String >( py) . is_err( ) ) ;
648
+ }
649
+
650
// Test added by this hunk: lossy conversion of a string containing a lone
// surrogate is compared against text ending in U+FFFD.
+ #[ test]
651
+ fn test_extract_lone_surrogate_lossy ( ) {
652
+ let gil = Python :: acquire_gil ( ) ;
653
+ let py = gil. python ( ) ;
654
// NOTE(review): the space after the backslash pair here, and the trailing
// space inside the expected literal below, look like extraction artifacts of
// this diff view — confirm against the upstream file before relying on the
// exact strings.
+ let py_string = py. eval ( "u'x=\\ ud800'" , None , None ) . unwrap ( ) ;
655
+ let result = py_string. cast_as :: < PyString > ( py) . unwrap ( ) . to_string_lossy ( py) ;
656
+ assert_eq ! ( "x=\u{fffd} " , result) ;
657
+ }
586
658
}
0 commit comments