pyo3/types/
string.rs

1#[cfg(not(Py_LIMITED_API))]
2use crate::exceptions::PyUnicodeDecodeError;
3use crate::ffi_ptr_ext::FfiPtrExt;
4use crate::instance::Borrowed;
5use crate::py_result_ext::PyResultExt;
6use crate::types::bytes::PyBytesMethods;
7use crate::types::PyBytes;
8use crate::{ffi, Bound, Py, PyAny, PyResult, Python};
9use std::borrow::Cow;
10use std::ffi::{CStr, CString};
11use std::str;
12
/// Represents raw data backing a Python `str`.
///
/// Python internally stores strings in various representations. This enumeration
/// represents those variations.
///
/// Every variant only borrows the stored values as a slice for the lifetime `'a`.
#[cfg(not(Py_LIMITED_API))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PyStringData<'a> {
    /// UCS1 representation: each stored value is a `u8`.
    Ucs1(&'a [u8]),

    /// UCS2 representation: each stored value is a `u16`.
    Ucs2(&'a [u16]),

    /// UCS4 representation: each stored value is a `u32`.
    Ucs4(&'a [u32]),
}
29
30#[cfg(not(Py_LIMITED_API))]
31impl<'a> PyStringData<'a> {
32    /// Obtain the raw bytes backing this instance as a [u8] slice.
33    pub fn as_bytes(&self) -> &[u8] {
34        match self {
35            Self::Ucs1(s) => s,
36            Self::Ucs2(s) => unsafe {
37                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
38            },
39            Self::Ucs4(s) => unsafe {
40                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
41            },
42        }
43    }
44
45    /// Size in bytes of each value/item in the underlying slice.
46    #[inline]
47    pub fn value_width_bytes(&self) -> usize {
48        match self {
49            Self::Ucs1(_) => 1,
50            Self::Ucs2(_) => 2,
51            Self::Ucs4(_) => 4,
52        }
53    }
54
55    /// Convert the raw data to a Rust string.
56    ///
57    /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4,
58    /// returns an owned string.
59    ///
60    /// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported
61    /// storage format. This should only occur for strings that were created via Python
62    /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should
63    /// never occur for strings that were created from Python code.
64    pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
65        match self {
66            Self::Ucs1(data) => match str::from_utf8(data) {
67                Ok(s) => Ok(Cow::Borrowed(s)),
68                Err(e) => Err(PyUnicodeDecodeError::new_utf8(py, data, e)?.into()),
69            },
70            Self::Ucs2(data) => match String::from_utf16(data) {
71                Ok(s) => Ok(Cow::Owned(s)),
72                Err(e) => {
73                    let mut message = e.to_string().as_bytes().to_vec();
74                    message.push(0);
75
76                    Err(PyUnicodeDecodeError::new(
77                        py,
78                        c"utf-16",
79                        self.as_bytes(),
80                        0..self.as_bytes().len(),
81                        CStr::from_bytes_with_nul(&message).unwrap(),
82                    )?
83                    .into())
84                }
85            },
86            Self::Ucs4(data) => match data.iter().map(|&c| std::char::from_u32(c)).collect() {
87                Some(s) => Ok(Cow::Owned(s)),
88                None => Err(PyUnicodeDecodeError::new(
89                    py,
90                    c"utf-32",
91                    self.as_bytes(),
92                    0..self.as_bytes().len(),
93                    c"error converting utf-32",
94                )?
95                .into()),
96            },
97        }
98    }
99
100    /// Convert the raw data to a Rust string, possibly with data loss.
101    ///
102    /// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`.
103    ///
104    /// Returns a borrow into original data, when possible, or owned data otherwise.
105    ///
106    /// The return value of this function should only disagree with [Self::to_string]
107    /// when that method would error.
108    pub fn to_string_lossy(self) -> Cow<'a, str> {
109        match self {
110            Self::Ucs1(data) => String::from_utf8_lossy(data),
111            Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)),
112            Self::Ucs4(data) => Cow::Owned(
113                data.iter()
114                    .map(|&c| std::char::from_u32(c).unwrap_or('\u{FFFD}'))
115                    .collect(),
116            ),
117        }
118    }
119}
120
/// Represents a Python `str` (a Unicode string object).
///
/// Values of this type are accessed via PyO3's smart pointers, e.g. as
/// [`Py<PyString>`][crate::Py] or [`Bound<'py, PyString>`][Bound].
///
/// For APIs available on `str` objects, see the [`PyStringMethods`] trait which is implemented for
/// [`Bound<'py, PyString>`][Bound].
///
/// # Equality
///
/// For convenience, [`Bound<'py, PyString>`] implements [`PartialEq<str>`] to allow comparing the
/// data in the Python string to a Rust UTF-8 string slice.
///
/// This is not always the most appropriate way to compare Python strings, as Python string
/// subclasses may have different equality semantics. In situations where subclasses overriding
/// equality might be relevant, use [`PyAnyMethods::eq`](crate::types::any::PyAnyMethods::eq), at
/// the cost of the additional overhead of a Python method call.
///
/// ```rust
/// # use pyo3::prelude::*;
/// use pyo3::types::PyString;
///
/// # Python::attach(|py| {
/// let py_string = PyString::new(py, "foo");
/// // via PartialEq<str>
/// assert_eq!(py_string, "foo");
///
/// // via Python equality
/// assert!(py_string.as_any().eq("foo").unwrap());
/// # });
/// ```
#[repr(transparent)]
pub struct PyString(PyAny);
154
// Declares the native-type plumbing for `PyString`, naming `ffi::PyUnicode_Type` as its type
// object, "builtins" / "str" as its module and name, and `ffi::PyUnicode_Check` as its check
// function.
// NOTE(review): the items this expands to are defined by the macro elsewhere — confirm there.
pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), "builtins", "str", #checkfunction=ffi::PyUnicode_Check);
156
157impl PyString {
158    /// Creates a new Python string object.
159    ///
160    /// Panics if out of memory.
161    pub fn new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
162        let ptr = s.as_ptr().cast();
163        let len = s.len() as ffi::Py_ssize_t;
164        unsafe {
165            ffi::PyUnicode_FromStringAndSize(ptr, len)
166                .assume_owned(py)
167                .cast_into_unchecked()
168        }
169    }
170
171    /// Creates a new Python string object from bytes.
172    ///
173    /// Returns PyMemoryError if out of memory.
174    /// Returns [PyUnicodeDecodeError] if the slice is not a valid UTF-8 string.
175    pub fn from_bytes<'py>(py: Python<'py>, s: &[u8]) -> PyResult<Bound<'py, PyString>> {
176        let ptr = s.as_ptr().cast();
177        let len = s.len() as ffi::Py_ssize_t;
178        unsafe {
179            ffi::PyUnicode_FromStringAndSize(ptr, len)
180                .assume_owned_or_err(py)
181                .cast_into_unchecked()
182        }
183    }
184
185    /// Intern the given string
186    ///
187    /// This will return a reference to the same Python string object if called repeatedly with the same string.
188    ///
189    /// Note that while this is more memory efficient than [`PyString::new`], it unconditionally allocates a
190    /// temporary Python string object and is thereby slower than [`PyString::new`].
191    ///
192    /// Panics if out of memory.
193    pub fn intern<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
194        let ptr = s.as_ptr().cast();
195        let len = s.len() as ffi::Py_ssize_t;
196        unsafe {
197            let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len);
198            if !ob.is_null() {
199                ffi::PyUnicode_InternInPlace(&mut ob);
200            }
201            ob.assume_owned(py).cast_into_unchecked()
202        }
203    }
204
205    /// Attempts to create a Python string from a Python [bytes-like object].
206    ///
207    /// The `encoding` and `errors` parameters are optional:
208    /// - If `encoding` is `None`, the default encoding is used (UTF-8).
209    /// - If `errors` is `None`, the default error handling is used ("strict").
210    ///
211    /// See the [Python documentation on codecs] for more information.
212    ///
213    /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object).
214    /// [Python documentation on codecs]: https://docs.python.org/3/library/codecs.html#standard-encodings
215    pub fn from_encoded_object<'py>(
216        src: &Bound<'py, PyAny>,
217        encoding: Option<&CStr>,
218        errors: Option<&CStr>,
219    ) -> PyResult<Bound<'py, PyString>> {
220        let encoding = encoding.map_or(std::ptr::null(), CStr::as_ptr);
221        let errors = errors.map_or(std::ptr::null(), CStr::as_ptr);
222        // Safety:
223        // - `src` is a valid Python object
224        // - `encoding` and `errors` are either null or valid C strings. `encoding` and `errors` are
225        //   documented as allowing null.
226        // - `ffi::PyUnicode_FromEncodedObject` returns a new `str` object, or sets an error.
227        unsafe {
228            ffi::PyUnicode_FromEncodedObject(src.as_ptr(), encoding, errors)
229                .assume_owned_or_err(src.py())
230                .cast_into_unchecked()
231        }
232    }
233
234    /// Deprecated form of `PyString::from_encoded_object`.
235    ///
236    /// This version took `&str` arguments for `encoding` and `errors`, which required a runtime
237    /// conversion to `CString` internally.
238    #[deprecated(
239        since = "0.25.0",
240        note = "replaced with to `PyString::from_encoded_object`"
241    )]
242    pub fn from_object<'py>(
243        src: &Bound<'py, PyAny>,
244        encoding: &str,
245        errors: &str,
246    ) -> PyResult<Bound<'py, PyString>> {
247        let encoding = CString::new(encoding)?;
248        let errors = CString::new(errors)?;
249        PyString::from_encoded_object(src, Some(&encoding), Some(&errors))
250    }
251}
252
/// Implementation of functionality for [`PyString`].
///
/// These methods are defined for the `Bound<'py, PyString>` smart pointer, so to use method call
/// syntax these methods are separated into a trait, because stable Rust does not yet support
/// `arbitrary_self_types`.
#[doc(alias = "PyString")]
pub trait PyStringMethods<'py>: crate::sealed::Sealed {
    /// Gets the Python string as a Rust UTF-8 string slice.
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
    fn to_str(&self) -> PyResult<&str>;

    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    fn to_cow(&self) -> PyResult<Cow<'_, str>>;

    /// Converts the `PyString` into a Rust string.
    ///
    /// Unpaired surrogates, which have no valid UTF-8 encoding, are
    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Encodes this string as a Python `bytes` object, using UTF-8 encoding.
    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;

    /// Obtains the raw data backing the Python string.
    ///
    /// If the Python string object was created through legacy APIs, its internal storage format
    /// will be canonicalized before data is returned.
    ///
    /// # Safety
    ///
    /// This function implementation relies on manually decoding a C bitfield. In practice, this
    /// works well on common little-endian architectures such as x86_64, where the bitfield has a
    /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
    /// x86_64 platforms.
    ///
    /// By using this API, you accept responsibility for testing that PyStringData behaves as
    /// expected on the targets where you plan to distribute your software.
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    unsafe fn data(&self) -> PyResult<PyStringData<'_>>;
}
299
300impl<'py> PyStringMethods<'py> for Bound<'py, PyString> {
301    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
302    fn to_str(&self) -> PyResult<&str> {
303        self.as_borrowed().to_str()
304    }
305
306    fn to_cow(&self) -> PyResult<Cow<'_, str>> {
307        self.as_borrowed().to_cow()
308    }
309
310    fn to_string_lossy(&self) -> Cow<'_, str> {
311        self.as_borrowed().to_string_lossy()
312    }
313
314    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> {
315        unsafe {
316            ffi::PyUnicode_AsUTF8String(self.as_ptr())
317                .assume_owned_or_err(self.py())
318                .cast_into_unchecked::<PyBytes>()
319        }
320    }
321
322    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
323    unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
324        unsafe { self.as_borrowed().data() }
325    }
326}
327
328impl<'a> Borrowed<'a, '_, PyString> {
329    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
330    pub(crate) fn to_str(self) -> PyResult<&'a str> {
331        // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
332        let mut size: ffi::Py_ssize_t = 0;
333        let data: *const u8 =
334            unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
335        if data.is_null() {
336            Err(crate::PyErr::fetch(self.py()))
337        } else {
338            Ok(unsafe {
339                std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize))
340            })
341        }
342    }
343
344    pub(crate) fn to_cow(self) -> PyResult<Cow<'a, str>> {
345        // TODO: this method can probably be deprecated once Python 3.9 support is dropped,
346        // because all versions then support the more efficient `to_str`.
347        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
348        {
349            self.to_str().map(Cow::Borrowed)
350        }
351
352        #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
353        {
354            let bytes = self.encode_utf8()?;
355            Ok(Cow::Owned(
356                unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
357            ))
358        }
359    }
360
361    fn to_string_lossy(self) -> Cow<'a, str> {
362        let ptr = self.as_ptr();
363        let py = self.py();
364
365        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
366        if let Ok(s) = self.to_str() {
367            return Cow::Borrowed(s);
368        }
369
370        let bytes = unsafe {
371            ffi::PyUnicode_AsEncodedString(ptr, c"utf-8".as_ptr(), c"surrogatepass".as_ptr())
372                .assume_owned(py)
373                .cast_into_unchecked::<PyBytes>()
374        };
375        Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned())
376    }
377
378    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
379    unsafe fn data(self) -> PyResult<PyStringData<'a>> {
380        unsafe {
381            let ptr = self.as_ptr();
382
383            #[cfg(not(Py_3_12))]
384            #[allow(deprecated)]
385            {
386                let ready = ffi::PyUnicode_READY(ptr);
387                if ready != 0 {
388                    // Exception was created on failure.
389                    return Err(crate::PyErr::fetch(self.py()));
390                }
391            }
392
393            // The string should be in its canonical form after calling `PyUnicode_READY()`.
394            // And non-canonical form not possible after Python 3.12. So it should be safe
395            // to call these APIs.
396            let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
397            let raw_data = ffi::PyUnicode_DATA(ptr);
398            let kind = ffi::PyUnicode_KIND(ptr);
399
400            match kind {
401                ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts(
402                    raw_data as *const u8,
403                    length,
404                ))),
405                ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts(
406                    raw_data as *const u16,
407                    length,
408                ))),
409                ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts(
410                    raw_data as *const u32,
411                    length,
412                ))),
413                _ => unreachable!(),
414            }
415        }
416    }
417}
418
419impl Py<PyString> {
420    /// Gets the Python string as a Rust UTF-8 string slice.
421    ///
422    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
423    /// (containing unpaired surrogates).
424    ///
425    /// Because `str` objects are immutable, the returned slice is independent of
426    /// the GIL lifetime.
427    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
428    pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> {
429        self.bind_borrowed(py).to_str()
430    }
431
432    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
433    ///
434    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
435    /// (containing unpaired surrogates).
436    ///
437    /// Because `str` objects are immutable, the returned slice is independent of
438    /// the GIL lifetime.
439    pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
440        self.bind_borrowed(py).to_cow()
441    }
442
443    /// Converts the `PyString` into a Rust string.
444    ///
445    /// Unpaired surrogates invalid UTF-8 sequences are
446    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
447    ///
448    /// Because `str` objects are immutable, the returned slice is independent of
449    /// the GIL lifetime.
450    pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> {
451        self.bind_borrowed(py).to_string_lossy()
452    }
453}
454
/// Compares whether the data in the Python string is equal to the given UTF-8 string slice.
///
/// This is the `Bound<PyString> == str` direction; the comparison is carried out on the
/// borrowed form of the Python string.
///
/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
impl PartialEq<str> for Bound<'_, PyString> {
    #[inline]
    fn eq(&self, other: &str) -> bool {
        self.as_borrowed() == *other
    }
}
464
/// Compares whether the data in the Python string is equal to the given UTF-8 string slice.
///
/// This is the `Bound<PyString> == &str` direction; the reference is peeled off and the
/// comparison is carried out on the borrowed form of the Python string.
///
/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
impl PartialEq<&'_ str> for Bound<'_, PyString> {
    #[inline]
    fn eq(&self, other: &&str) -> bool {
        self.as_borrowed() == **other
    }
}
474
/// Compares whether the given UTF-8 string slice is equal to the data in the Python string.
///
/// This is the `str == Bound<PyString>` direction; the comparison is carried out against the
/// borrowed form of the Python string.
///
/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
impl PartialEq<Bound<'_, PyString>> for str {
    #[inline]
    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
        *self == other.as_borrowed()
    }
}
484
/// Compares whether the given UTF-8 string slice is equal to the data in the Python string.
///
/// This is the `str == &Bound<PyString>` direction; the comparison is carried out against the
/// borrowed form of the Python string.
///
/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
impl PartialEq<&'_ Bound<'_, PyString>> for str {
    #[inline]
    fn eq(&self, other: &&Bound<'_, PyString>) -> bool {
        *self == other.as_borrowed()
    }
}
494
/// Compares whether the given UTF-8 string slice is equal to the data in the Python string.
///
/// This is the `&str == Bound<PyString>` direction; the reference is peeled off and the
/// comparison is carried out against the borrowed form of the Python string.
///
/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
impl PartialEq<Bound<'_, PyString>> for &'_ str {
    #[inline]
    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
        **self == other.as_borrowed()
    }
}
504
/// Compares whether the data in the Python string is equal to the given UTF-8 string slice.
///
/// This is the `&Bound<PyString> == str` direction; the comparison is carried out on the
/// borrowed form of the Python string.
///
/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
impl PartialEq<str> for &'_ Bound<'_, PyString> {
    #[inline]
    fn eq(&self, other: &str) -> bool {
        self.as_borrowed() == other
    }
}
514
/// Compares whether the data in the Python string is equal to the given UTF-8 string slice.
///
/// This is the `Borrowed<PyString> == str` direction, which performs the actual comparison.
///
/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
impl PartialEq<str> for Borrowed<'_, '_, PyString> {
    #[inline]
    fn eq(&self, other: &str) -> bool {
        // Before Python 3.13: convert to a Rust string and compare. A string that fails to
        // convert compares as not equal.
        #[cfg(not(Py_3_13))]
        {
            self.to_cow().is_ok_and(|s| s == other)
        }

        // Python 3.13 and later: hand the bytes of `other` to the C API and treat a return
        // value of 1 as "equal".
        #[cfg(Py_3_13)]
        unsafe {
            ffi::PyUnicode_EqualToUTF8AndSize(
                self.as_ptr(),
                other.as_ptr().cast(),
                other.len() as _,
            ) == 1
        }
    }
}
536
/// Compares whether the data in the Python string is equal to the given UTF-8 string slice.
///
/// This is the `Borrowed<PyString> == &str` direction; the reference is peeled off and the
/// comparison against `str` is used.
///
/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
impl PartialEq<&str> for Borrowed<'_, '_, PyString> {
    #[inline]
    fn eq(&self, other: &&str) -> bool {
        *self == **other
    }
}
546
/// Compares whether the given UTF-8 string slice is equal to the data in the Python string.
///
/// This is the `str == Borrowed<PyString>` direction; the operands are swapped so that the
/// comparison implemented on the Python string side is used.
///
/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
impl PartialEq<Borrowed<'_, '_, PyString>> for str {
    #[inline]
    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
        other == self
    }
}
556
/// Compares whether the given UTF-8 string slice is equal to the data in the Python string.
///
/// This is the `&str == Borrowed<PyString>` direction; the operands are swapped so that the
/// comparison implemented on the Python string side is used.
///
/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
impl PartialEq<Borrowed<'_, '_, PyString>> for &'_ str {
    #[inline]
    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
        other == self
    }
}
566
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{exceptions::PyLookupError, types::PyAnyMethods as _, IntoPyObject};

    // A string mixing ASCII and non-ASCII characters round-trips through `to_cow`.
    #[test]
    fn test_to_cow_utf8() {
        Python::attach(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    // A lone surrogate cannot be converted, so `to_cow` must fail.
    #[test]
    fn test_to_cow_surrogate() {
        Python::attach(|py| {
            let py_string = py
                .eval(cr"'\ud800'", None, None)
                .unwrap()
                .cast_into::<PyString>()
                .unwrap();
            assert!(py_string.to_cow().is_err());
        })
    }

    // A string made only of non-ASCII characters round-trips through `to_cow`.
    #[test]
    fn test_to_cow_unicode() {
        Python::attach(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    // `encode_utf8` yields the same bytes as the Rust string.
    #[test]
    fn test_encode_utf8_unicode() {
        Python::attach(|py| {
            let s = "哈哈🐈";
            let obj = PyString::new(py, s);
            assert_eq!(s.as_bytes(), obj.encode_utf8().unwrap().as_bytes());
        })
    }

    // A lone surrogate cannot be encoded as UTF-8.
    #[test]
    fn test_encode_utf8_surrogate() {
        Python::attach(|py| {
            let obj: Py<PyAny> = py.eval(cr"'\ud800'", None, None).unwrap().into();
            assert!(obj
                .bind(py)
                .cast::<PyString>()
                .unwrap()
                .encode_utf8()
                .is_err());
        })
    }

    // The lone surrogate shows up as three replacement characters in the lossy conversion.
    #[test]
    fn test_to_string_lossy() {
        Python::attach(|py| {
            let py_string = py
                .eval(cr"'🐈 Hello \ud800World'", None, None)
                .unwrap()
                .cast_into::<PyString>()
                .unwrap();

            assert_eq!(py_string.to_string_lossy(), "🐈 Hello ���World");
        })
    }

    // `Debug` formatting matches Python's `repr`.
    #[test]
    fn test_debug_string() {
        Python::attach(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{s:?}"), "'Hello\\n'");
        })
    }

    // `Display` formatting prints the string contents unescaped.
    #[test]
    fn test_display_string() {
        Python::attach(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{s}"), "Hello\n");
        })
    }

    #[test]
    fn test_string_from_encoded_object() {
        Python::attach(|py| {
            let py_bytes = PyBytes::new(py, b"ab\xFFcd");

            // default encoding is utf-8, default error handler is strict
            let py_string = PyString::from_encoded_object(&py_bytes, None, None).unwrap_err();
            assert!(py_string
                .get_type(py)
                .is(py.get_type::<crate::exceptions::PyUnicodeDecodeError>()));

            // with `ignore` error handler, the invalid byte is dropped
            let py_string =
                PyString::from_encoded_object(&py_bytes, None, Some(c"ignore")).unwrap();

            let result = py_string.to_cow().unwrap();
            assert_eq!(result, "abcd");

            // the deprecated `&str` based entry point behaves the same way
            #[allow(deprecated)]
            let py_string = PyString::from_object(&py_bytes, "utf-8", "ignore").unwrap();

            let result = py_string.to_cow().unwrap();
            assert_eq!(result, "abcd");
        });
    }

    #[test]
    fn test_string_from_encoded_object_with_invalid_encoding_errors() {
        Python::attach(|py| {
            let py_bytes = PyBytes::new(py, b"abcd");

            // invalid encoding
            let err = PyString::from_encoded_object(&py_bytes, Some(c"wat"), None).unwrap_err();
            assert!(err.is_instance(py, &py.get_type::<PyLookupError>()));
            assert_eq!(err.to_string(), "LookupError: unknown encoding: wat");

            // invalid error handler
            let err =
                PyString::from_encoded_object(&PyBytes::new(py, b"ab\xFFcd"), None, Some(c"wat"))
                    .unwrap_err();
            assert!(err.is_instance(py, &py.get_type::<PyLookupError>()));
            assert_eq!(
                err.to_string(),
                "LookupError: unknown error handler name 'wat'"
            );

            // interior NUL bytes cannot be converted to a C string
            #[allow(deprecated)]
            let result = PyString::from_object(&py_bytes, "utf\0-8", "ignore");
            assert!(result.is_err());

            #[allow(deprecated)]
            let result = PyString::from_object(&py_bytes, "utf-8", "ign\0ore");
            assert!(result.is_err());
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs1() {
        Python::attach(|py| {
            let s = PyString::new(py, "hello, world");
            let data = unsafe { s.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs1_invalid() {
        Python::attach(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f�"));
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs2() {
        Python::attach(|py| {
            let s = py.eval(c"'foo\\ud800'", None, None).unwrap();
            let py_string = s.cast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo�".to_string())
            );
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, PyPy, GraalPy)), target_endian = "little"))]
    fn test_string_data_ucs2_invalid() {
        Python::attach(|py| {
            // U+FF22 (valid) & U+d800 (never valid)
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            // U+FF22 is a fullwidth letter, not ASCII `B`; escapes keep the expectation
            // unambiguous. The lone surrogate becomes U+FFFD.
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{ff22}\u{fffd}".into())
            );
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs4() {
        Python::attach(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, PyPy, GraalPy)), target_endian = "little"))]
    fn test_string_data_ucs4_invalid() {
        Python::attach(|py| {
            // U+20000 (valid) & U+d800 (never valid)
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>("𠀀�".into()));
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_pystring_from_bytes() {
        Python::attach(|py| {
            let result = PyString::from_bytes(py, "\u{2122}".as_bytes());
            assert!(result.is_ok());
            // a lone continuation byte is not valid UTF-8
            let result = PyString::from_bytes(py, b"\x80");
            assert!(result
                .unwrap_err()
                .get_type(py)
                .is(py.get_type::<PyUnicodeDecodeError>()));
        });
    }

    // Interning the same text twice yields the same object; different text does not.
    #[test]
    fn test_intern_string() {
        Python::attach(|py| {
            let py_string1 = PyString::intern(py, "foo");
            assert_eq!(py_string1, "foo");

            let py_string2 = PyString::intern(py, "foo");
            assert_eq!(py_string2, "foo");

            assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());

            let py_string3 = PyString::intern(py, "bar");
            assert_eq!(py_string3, "bar");

            assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
        });
    }

    #[test]
    fn test_py_to_str_utf8() {
        Python::attach(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s).unbind();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert_eq!(s, py_string.to_str(py).unwrap());

            assert_eq!(s, py_string.to_cow(py).unwrap());
        })
    }

    #[test]
    fn test_py_to_str_surrogate() {
        Python::attach(|py| {
            let py_string: Py<PyString> = py
                .eval(cr"'\ud800'", None, None)
                .unwrap()
                .extract()
                .unwrap();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert!(py_string.to_str(py).is_err());

            assert!(py_string.to_cow(py).is_err());
        })
    }

    #[test]
    fn test_py_to_string_lossy() {
        Python::attach(|py| {
            let py_string: Py<PyString> = py
                .eval(cr"'🐈 Hello \ud800World'", None, None)
                .unwrap()
                .extract()
                .unwrap();
            assert_eq!(py_string.to_string_lossy(py), "🐈 Hello ���World");
        })
    }

    // Exercises every `PartialEq` direction implemented for `Bound` and `Borrowed`.
    #[test]
    fn test_comparisons() {
        Python::attach(|py| {
            let s = "hello, world";
            let py_string = PyString::new(py, s);

            assert_eq!(py_string, "hello, world");

            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            assert_eq!(py_string, *s);
            assert_eq!(&py_string, *s);
            assert_eq!(*s, py_string);
            assert_eq!(*s, &py_string);

            let py_string = py_string.as_borrowed();

            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            assert_eq!(py_string, *s);
            assert_eq!(*s, py_string);
        })
    }
}