Skip to main content

pyo3/types/
string.rs

1#[cfg(not(Py_LIMITED_API))]
2use crate::exceptions::PyUnicodeDecodeError;
3use crate::ffi_ptr_ext::FfiPtrExt;
4use crate::instance::Borrowed;
5use crate::py_result_ext::PyResultExt;
6use crate::types::bytes::PyBytesMethods;
7use crate::types::PyBytes;
8use crate::{ffi, Bound, Py, PyAny, PyResult, Python};
9use std::borrow::Cow;
10use std::ffi::CStr;
11use std::{fmt, str};
12
/// Borrowed view of the raw storage behind a Python `str`.
///
/// Python keeps string contents in one of several fixed-width representations;
/// each variant of this enumeration wraps a slice in one of those widths.
#[cfg(not(Py_LIMITED_API))]
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum PyStringData<'a> {
    /// UCS1 storage: one byte per element.
    Ucs1(&'a [u8]),

    /// UCS2 storage: two bytes per element.
    Ucs2(&'a [u16]),

    /// UCS4 storage: four bytes per element.
    Ucs4(&'a [u32]),
}
29
30#[cfg(not(Py_LIMITED_API))]
31impl<'a> PyStringData<'a> {
32    /// Obtain the raw bytes backing this instance as a [u8] slice.
33    pub fn as_bytes(&self) -> &[u8] {
34        match self {
35            Self::Ucs1(s) => s,
36            Self::Ucs2(s) => unsafe {
37                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
38            },
39            Self::Ucs4(s) => unsafe {
40                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
41            },
42        }
43    }
44
45    /// Size in bytes of each value/item in the underlying slice.
46    #[inline]
47    pub fn value_width_bytes(&self) -> usize {
48        match self {
49            Self::Ucs1(_) => 1,
50            Self::Ucs2(_) => 2,
51            Self::Ucs4(_) => 4,
52        }
53    }
54
55    /// Convert the raw data to a Rust string.
56    ///
57    /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4,
58    /// returns an owned string.
59    ///
60    /// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported
61    /// storage format. This should only occur for strings that were created via Python
62    /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should
63    /// never occur for strings that were created from Python code.
64    pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
65        match self {
66            Self::Ucs1(data) => match str::from_utf8(data) {
67                Ok(s) => Ok(Cow::Borrowed(s)),
68                Err(e) => Err(PyUnicodeDecodeError::new_utf8(py, data, e)?.into()),
69            },
70            Self::Ucs2(data) => match String::from_utf16(data) {
71                Ok(s) => Ok(Cow::Owned(s)),
72                Err(e) => {
73                    let mut message = e.to_string().as_bytes().to_vec();
74                    message.push(0);
75
76                    Err(PyUnicodeDecodeError::new(
77                        py,
78                        c"utf-16",
79                        self.as_bytes(),
80                        0..self.as_bytes().len(),
81                        CStr::from_bytes_with_nul(&message).unwrap(),
82                    )?
83                    .into())
84                }
85            },
86            Self::Ucs4(data) => match data.iter().map(|&c| std::char::from_u32(c)).collect() {
87                Some(s) => Ok(Cow::Owned(s)),
88                None => Err(PyUnicodeDecodeError::new(
89                    py,
90                    c"utf-32",
91                    self.as_bytes(),
92                    0..self.as_bytes().len(),
93                    c"error converting utf-32",
94                )?
95                .into()),
96            },
97        }
98    }
99
100    /// Convert the raw data to a Rust string, possibly with data loss.
101    ///
102    /// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`.
103    ///
104    /// Returns a borrow into original data, when possible, or owned data otherwise.
105    ///
106    /// The return value of this function should only disagree with [Self::to_string]
107    /// when that method would error.
108    pub fn to_string_lossy(self) -> Cow<'a, str> {
109        match self {
110            Self::Ucs1(data) => String::from_utf8_lossy(data),
111            Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)),
112            Self::Ucs4(data) => Cow::Owned(
113                data.iter()
114                    .map(|&c| std::char::from_u32(c).unwrap_or('\u{FFFD}'))
115                    .collect(),
116            ),
117        }
118    }
119}
120
/// Represents a Python `str` (a Unicode string object).
///
/// Values of this type are accessed via PyO3's smart pointers, e.g. as
/// [`Py<PyString>`][crate::Py] or [`Bound<'py, PyString>`][Bound].
///
/// For APIs available on `str` objects, see the [`PyStringMethods`] trait which is implemented for
/// [`Bound<'py, PyString>`][Bound].
///
/// # Equality
///
/// For convenience, [`Bound<'py, PyString>`] implements [`PartialEq<str>`] to allow comparing the
/// data in the Python string to a Rust UTF-8 string slice.
///
/// This is not always the most appropriate way to compare Python strings, as Python string
/// subclasses may have different equality semantics. In situations where subclasses overriding
/// equality might be relevant, use [`PyAnyMethods::eq`](crate::types::any::PyAnyMethods::eq), at
/// the cost of the additional overhead of a Python method call.
///
/// ```rust
/// # use pyo3::prelude::*;
/// use pyo3::types::PyString;
///
/// # Python::attach(|py| {
/// let py_string = PyString::new(py, "foo");
/// // via PartialEq<str>
/// assert_eq!(py_string, "foo");
///
/// // via Python equality
/// assert!(py_string.as_any().eq("foo").unwrap());
/// # });
/// ```
#[repr(transparent)]
pub struct PyString(PyAny);

// Ties `PyString` to the interpreter's `PyUnicode_Type` under the name `builtins.str`,
// with `PyUnicode_Check` as its type-check function.
// NOTE(review): the items this expands to come from a macro defined elsewhere in the
// crate — confirm there before relying on specifics.
pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), "builtins", "str", #checkfunction=ffi::PyUnicode_Check);
156
157impl PyString {
158    /// Creates a new Python string object.
159    ///
160    /// Panics if out of memory.
161    pub fn new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
162        let ptr = s.as_ptr().cast();
163        let len = s.len() as ffi::Py_ssize_t;
164        unsafe {
165            ffi::PyUnicode_FromStringAndSize(ptr, len)
166                .assume_owned(py)
167                .cast_into_unchecked()
168        }
169    }
170
171    /// Creates a new Python string object from bytes.
172    ///
173    /// Returns PyMemoryError if out of memory.
174    /// Returns [PyUnicodeDecodeError] if the slice is not a valid UTF-8 string.
175    pub fn from_bytes<'py>(py: Python<'py>, s: &[u8]) -> PyResult<Bound<'py, PyString>> {
176        let ptr = s.as_ptr().cast();
177        let len = s.len() as ffi::Py_ssize_t;
178        unsafe {
179            ffi::PyUnicode_FromStringAndSize(ptr, len)
180                .assume_owned_or_err(py)
181                .cast_into_unchecked()
182        }
183    }
184
185    /// Intern the given string
186    ///
187    /// This will return a reference to the same Python string object if called repeatedly with the same string.
188    ///
189    /// Note that while this is more memory efficient than [`PyString::new`], it unconditionally allocates a
190    /// temporary Python string object and is thereby slower than [`PyString::new`].
191    ///
192    /// Panics if out of memory.
193    pub fn intern<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
194        let ptr = s.as_ptr().cast();
195        let len = s.len() as ffi::Py_ssize_t;
196        unsafe {
197            let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len);
198            if !ob.is_null() {
199                ffi::PyUnicode_InternInPlace(&mut ob);
200            }
201            ob.assume_owned(py).cast_into_unchecked()
202        }
203    }
204
205    /// Attempts to create a Python string from a Python [bytes-like object].
206    ///
207    /// The `encoding` and `errors` parameters are optional:
208    /// - If `encoding` is `None`, the default encoding is used (UTF-8).
209    /// - If `errors` is `None`, the default error handling is used ("strict").
210    ///
211    /// See the [Python documentation on codecs] for more information.
212    ///
213    /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object).
214    /// [Python documentation on codecs]: https://docs.python.org/3/library/codecs.html#standard-encodings
215    pub fn from_encoded_object<'py>(
216        src: &Bound<'py, PyAny>,
217        encoding: Option<&CStr>,
218        errors: Option<&CStr>,
219    ) -> PyResult<Bound<'py, PyString>> {
220        let encoding = encoding.map_or(std::ptr::null(), CStr::as_ptr);
221        let errors = errors.map_or(std::ptr::null(), CStr::as_ptr);
222        // Safety:
223        // - `src` is a valid Python object
224        // - `encoding` and `errors` are either null or valid C strings. `encoding` and `errors` are
225        //   documented as allowing null.
226        // - `ffi::PyUnicode_FromEncodedObject` returns a new `str` object, or sets an error.
227        unsafe {
228            ffi::PyUnicode_FromEncodedObject(src.as_ptr(), encoding, errors)
229                .assume_owned_or_err(src.py())
230                .cast_into_unchecked()
231        }
232    }
233
234    /// Creates a Python string using a format string.
235    ///
236    /// This function is similar to [`format!`], but it returns a Python string object instead of a Rust string.
237    #[inline]
238    pub fn from_fmt<'py>(
239        py: Python<'py>,
240        args: fmt::Arguments<'_>,
241    ) -> PyResult<Bound<'py, PyString>> {
242        if let Some(static_string) = args.as_str() {
243            return Ok(PyString::new(py, static_string));
244        };
245
246        #[cfg(all(Py_3_14, not(Py_LIMITED_API)))]
247        {
248            use crate::fmt::PyUnicodeWriter;
249            use std::fmt::Write as _;
250
251            let mut writer = PyUnicodeWriter::new(py)?;
252            writer
253                .write_fmt(args)
254                .map_err(|_| writer.take_error().expect("expected error"))?;
255            writer.into_py_string()
256        }
257
258        #[cfg(any(not(Py_3_14), Py_LIMITED_API))]
259        {
260            Ok(PyString::new(py, &format!("{args}")))
261        }
262    }
263}
264
/// Implementation of functionality for [`PyString`].
///
/// These methods are defined for the `Bound<'py, PyString>` smart pointer, so to use method call
/// syntax these methods are separated into a trait, because stable Rust does not yet support
/// `arbitrary_self_types`.
#[doc(alias = "PyString")]
pub trait PyStringMethods<'py>: crate::sealed::Sealed {
    /// Gets the Python string as a Rust UTF-8 string slice.
    ///
    /// Only available when `Py_3_10` is set or `Py_LIMITED_API` is not.
    ///
    /// # Errors
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
    fn to_str(&self) -> PyResult<&str>;

    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
    ///
    /// # Errors
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    fn to_cow(&self) -> PyResult<Cow<'_, str>>;

    /// Converts the `PyString` into a Rust string.
    ///
    /// Unpaired surrogates and invalid UTF-8 sequences are
    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Encodes this string as a Python `bytes` object, using UTF-8 encoding.
    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;

    /// Obtains the raw data backing the Python string.
    ///
    /// If the Python string object was created through legacy APIs, its internal storage format
    /// will be canonicalized before data is returned.
    ///
    /// Not available under `Py_LIMITED_API`, GraalPy or PyPy.
    ///
    /// # Safety
    ///
    /// This function implementation relies on manually decoding a C bitfield. In practice, this
    /// works well on common little-endian architectures such as x86_64, where the bitfield has a
    /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
    /// x86_64 platforms.
    ///
    /// By using this API, you accept responsibility for testing that PyStringData behaves as
    /// expected on the targets where you plan to distribute your software.
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    unsafe fn data(&self) -> PyResult<PyStringData<'_>>;
}
311
312impl<'py> PyStringMethods<'py> for Bound<'py, PyString> {
313    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
314    fn to_str(&self) -> PyResult<&str> {
315        self.as_borrowed().to_str()
316    }
317
318    fn to_cow(&self) -> PyResult<Cow<'_, str>> {
319        self.as_borrowed().to_cow()
320    }
321
322    fn to_string_lossy(&self) -> Cow<'_, str> {
323        self.as_borrowed().to_string_lossy()
324    }
325
326    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> {
327        unsafe {
328            ffi::PyUnicode_AsUTF8String(self.as_ptr())
329                .assume_owned_or_err(self.py())
330                .cast_into_unchecked::<PyBytes>()
331        }
332    }
333
334    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
335    unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
336        unsafe { self.as_borrowed().data() }
337    }
338}
339
340impl<'a> Borrowed<'a, '_, PyString> {
341    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
342    pub(crate) fn to_str(self) -> PyResult<&'a str> {
343        // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
344        let mut size: ffi::Py_ssize_t = 0;
345        let data: *const u8 =
346            unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
347        if data.is_null() {
348            Err(crate::PyErr::fetch(self.py()))
349        } else {
350            Ok(unsafe {
351                std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize))
352            })
353        }
354    }
355
356    pub(crate) fn to_cow(self) -> PyResult<Cow<'a, str>> {
357        // TODO: this method can probably be deprecated once Python 3.9 support is dropped,
358        // because all versions then support the more efficient `to_str`.
359        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
360        {
361            self.to_str().map(Cow::Borrowed)
362        }
363
364        #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
365        {
366            let bytes = self.encode_utf8()?;
367            Ok(Cow::Owned(
368                unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
369            ))
370        }
371    }
372
373    fn to_string_lossy(self) -> Cow<'a, str> {
374        let ptr = self.as_ptr();
375        let py = self.py();
376
377        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
378        if let Ok(s) = self.to_str() {
379            return Cow::Borrowed(s);
380        }
381
382        let bytes = unsafe {
383            ffi::PyUnicode_AsEncodedString(ptr, c"utf-8".as_ptr(), c"surrogatepass".as_ptr())
384                .assume_owned(py)
385                .cast_into_unchecked::<PyBytes>()
386        };
387        Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned())
388    }
389
390    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
391    unsafe fn data(self) -> PyResult<PyStringData<'a>> {
392        unsafe {
393            let ptr = self.as_ptr();
394
395            #[cfg(not(Py_3_12))]
396            #[allow(deprecated)]
397            {
398                let ready = ffi::PyUnicode_READY(ptr);
399                if ready != 0 {
400                    // Exception was created on failure.
401                    return Err(crate::PyErr::fetch(self.py()));
402                }
403            }
404
405            // The string should be in its canonical form after calling `PyUnicode_READY()`.
406            // And non-canonical form not possible after Python 3.12. So it should be safe
407            // to call these APIs.
408            let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
409            let raw_data = ffi::PyUnicode_DATA(ptr);
410            let kind = ffi::PyUnicode_KIND(ptr);
411
412            match kind {
413                ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts(
414                    raw_data as *const u8,
415                    length,
416                ))),
417                ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts(
418                    raw_data as *const u16,
419                    length,
420                ))),
421                ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts(
422                    raw_data as *const u32,
423                    length,
424                ))),
425                _ => unreachable!(),
426            }
427        }
428    }
429}
430
431impl Py<PyString> {
432    /// Gets the Python string as a Rust UTF-8 string slice.
433    ///
434    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
435    /// (containing unpaired surrogates).
436    ///
437    /// Because `str` objects are immutable, the returned slice is independent of
438    /// the GIL lifetime.
439    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
440    pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> {
441        self.bind_borrowed(py).to_str()
442    }
443
444    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
445    ///
446    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
447    /// (containing unpaired surrogates).
448    ///
449    /// Because `str` objects are immutable, the returned slice is independent of
450    /// the GIL lifetime.
451    pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
452        self.bind_borrowed(py).to_cow()
453    }
454
455    /// Converts the `PyString` into a Rust string.
456    ///
457    /// Unpaired surrogates invalid UTF-8 sequences are
458    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
459    ///
460    /// Because `str` objects are immutable, the returned slice is independent of
461    /// the GIL lifetime.
462    pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> {
463        self.bind_borrowed(py).to_string_lossy()
464    }
465}
466
467/// Compares whether the data in the Python string is equal to the given UTF8.
468///
469/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
470impl PartialEq<str> for Bound<'_, PyString> {
471    #[inline]
472    fn eq(&self, other: &str) -> bool {
473        self.as_borrowed() == *other
474    }
475}
476
477/// Compares whether the data in the Python string is equal to the given UTF8.
478///
479/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
480impl PartialEq<&'_ str> for Bound<'_, PyString> {
481    #[inline]
482    fn eq(&self, other: &&str) -> bool {
483        self.as_borrowed() == **other
484    }
485}
486
487/// Compares whether the data in the Python string is equal to the given UTF8.
488///
489/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
490impl PartialEq<Bound<'_, PyString>> for str {
491    #[inline]
492    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
493        *self == other.as_borrowed()
494    }
495}
496
497/// Compares whether the data in the Python string is equal to the given UTF8.
498///
499/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
500impl PartialEq<&'_ Bound<'_, PyString>> for str {
501    #[inline]
502    fn eq(&self, other: &&Bound<'_, PyString>) -> bool {
503        *self == other.as_borrowed()
504    }
505}
506
507/// Compares whether the data in the Python string is equal to the given UTF8.
508///
509/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
510impl PartialEq<Bound<'_, PyString>> for &'_ str {
511    #[inline]
512    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
513        **self == other.as_borrowed()
514    }
515}
516
517/// Compares whether the data in the Python string is equal to the given UTF8.
518///
519/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
520impl PartialEq<str> for &'_ Bound<'_, PyString> {
521    #[inline]
522    fn eq(&self, other: &str) -> bool {
523        self.as_borrowed() == other
524    }
525}
526
527/// Compares whether the data in the Python string is equal to the given UTF8.
528///
529/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
530impl PartialEq<str> for Borrowed<'_, '_, PyString> {
531    #[inline]
532    fn eq(&self, other: &str) -> bool {
533        #[cfg(not(Py_3_13))]
534        {
535            self.to_cow().is_ok_and(|s| s == other)
536        }
537
538        #[cfg(Py_3_13)]
539        unsafe {
540            ffi::PyUnicode_EqualToUTF8AndSize(
541                self.as_ptr(),
542                other.as_ptr().cast(),
543                other.len() as _,
544            ) == 1
545        }
546    }
547}
548
549/// Compares whether the data in the Python string is equal to the given UTF8.
550///
551/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
552impl PartialEq<&str> for Borrowed<'_, '_, PyString> {
553    #[inline]
554    fn eq(&self, other: &&str) -> bool {
555        *self == **other
556    }
557}
558
559/// Compares whether the data in the Python string is equal to the given UTF8.
560///
561/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
562impl PartialEq<Borrowed<'_, '_, PyString>> for str {
563    #[inline]
564    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
565        other == self
566    }
567}
568
569/// Compares whether the data in the Python string is equal to the given UTF8.
570///
571/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
572impl PartialEq<Borrowed<'_, '_, PyString>> for &'_ str {
573    #[inline]
574    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
575        other == self
576    }
577}
578
// Unit tests for `PyString`, `PyStringData` and the string comparison impls.
// Each test runs its body inside `Python::attach`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{exceptions::PyLookupError, types::PyAnyMethods as _, IntoPyObject};

    // `to_cow` round-trips text mixing ASCII with a non-BMP character.
    #[test]
    fn test_to_cow_utf8() {
        Python::attach(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    // A lone surrogate makes `to_cow` fail.
    #[test]
    fn test_to_cow_surrogate() {
        Python::attach(|py| {
            let py_string = py
                .eval(cr"'\ud800'", None, None)
                .unwrap()
                .cast_into::<PyString>()
                .unwrap();
            assert!(py_string.to_cow().is_err());
        })
    }

    // `to_cow` round-trips text made only of non-ASCII characters.
    #[test]
    fn test_to_cow_unicode() {
        Python::attach(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    // `encode_utf8` yields the same bytes as the Rust string's UTF-8 encoding.
    #[test]
    fn test_encode_utf8_unicode() {
        Python::attach(|py| {
            let s = "哈哈🐈";
            let obj = PyString::new(py, s);
            assert_eq!(s.as_bytes(), obj.encode_utf8().unwrap().as_bytes());
        })
    }

    // A lone surrogate makes `encode_utf8` fail.
    #[test]
    fn test_encode_utf8_surrogate() {
        Python::attach(|py| {
            let obj: Py<PyAny> = py.eval(cr"'\ud800'", None, None).unwrap().into();
            assert!(obj
                .bind(py)
                .cast::<PyString>()
                .unwrap()
                .encode_utf8()
                .is_err());
        })
    }

    // `to_string_lossy` turns the lone surrogate into three replacement characters.
    #[test]
    fn test_to_string_lossy() {
        Python::attach(|py| {
            let py_string = py
                .eval(cr"'🐈 Hello \ud800World'", None, None)
                .unwrap()
                .cast_into::<PyString>()
                .unwrap();

            assert_eq!(py_string.to_string_lossy(), "🐈 Hello ���World");
        })
    }

    // `{:?}` formatting of a Python string gives its quoted, escaped form.
    #[test]
    fn test_debug_string() {
        Python::attach(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{s:?}"), "'Hello\\n'");
        })
    }

    // `{}` formatting of a Python string gives its contents unchanged.
    #[test]
    fn test_display_string() {
        Python::attach(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{s}"), "Hello\n");
        })
    }

    // `from_encoded_object` with default arguments rejects an invalid byte, and
    // drops it under the `ignore` error handler.
    #[test]
    fn test_string_from_encoded_object() {
        Python::attach(|py| {
            let py_bytes = PyBytes::new(py, b"ab\xFFcd");

            // default encoding is utf-8, default error handler is strict
            // (despite its name, `py_string` holds the returned error here)
            let py_string = PyString::from_encoded_object(&py_bytes, None, None).unwrap_err();
            assert!(py_string
                .get_type(py)
                .is(py.get_type::<crate::exceptions::PyUnicodeDecodeError>()));

            // with `ignore` error handler, the invalid byte is dropped
            let py_string =
                PyString::from_encoded_object(&py_bytes, None, Some(c"ignore")).unwrap();

            let result = py_string.to_cow().unwrap();
            assert_eq!(result, "abcd");
        });
    }

    // Unknown encoding and error-handler names both surface as `LookupError`.
    #[test]
    fn test_string_from_encoded_object_with_invalid_encoding_errors() {
        Python::attach(|py| {
            let py_bytes = PyBytes::new(py, b"abcd");

            // invalid encoding
            let err = PyString::from_encoded_object(&py_bytes, Some(c"wat"), None).unwrap_err();
            assert!(err.is_instance(py, &py.get_type::<PyLookupError>()));
            assert_eq!(err.to_string(), "LookupError: unknown encoding: wat");

            // invalid error handler
            let err =
                PyString::from_encoded_object(&PyBytes::new(py, b"ab\xFFcd"), None, Some(c"wat"))
                    .unwrap_err();
            assert!(err.is_instance(py, &py.get_type::<PyLookupError>()));
            assert_eq!(
                err.to_string(),
                "LookupError: unknown error handler name 'wat'"
            );
        });
    }

    // An ASCII string is exposed as UCS1 data and converts without copying.
    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs1() {
        Python::attach(|py| {
            let s = PyString::new(py, "hello, world");
            let data = unsafe { s.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    // UCS1 data that is not valid UTF-8: `to_string` errors, `to_string_lossy`
    // substitutes the replacement character.
    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs1_invalid() {
        Python::attach(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            // Only the first two bytes are passed as string contents.
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f�"));
        });
    }

    // A string containing a surrogate is exposed as UCS2 data; the lossy conversion
    // replaces the surrogate.
    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs2() {
        Python::attach(|py| {
            let s = py.eval(c"'foo\\ud800'", None, None).unwrap();
            let py_string = s.cast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo�".to_string())
            );
        })
    }

    // UCS2 data with a lone surrogate: `to_string` reports a utf-16 decode error
    // covering the whole buffer. Little-endian only, because the buffer is spelled
    // out byte by byte.
    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, PyPy, GraalPy)), target_endian = "little"))]
    fn test_string_data_ucs2_invalid() {
        Python::attach(|py| {
            // U+FF22 (valid) & U+d800 (never valid)
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>("B�".into()));
        });
    }

    // A string with a non-BMP character is exposed as UCS4 data.
    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs4() {
        Python::attach(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
        })
    }

    // UCS4 data with a surrogate code point: `to_string` reports a utf-32 decode
    // error covering the whole buffer. Little-endian only, because the buffer is
    // spelled out byte by byte.
    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, PyPy, GraalPy)), target_endian = "little"))]
    fn test_string_data_ucs4_invalid() {
        Python::attach(|py| {
            // U+20000 (valid) & U+d800 (never valid)
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>("𠀀�".into()));
        });
    }

    // `from_bytes` accepts valid UTF-8 and raises `UnicodeDecodeError` otherwise.
    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_pystring_from_bytes() {
        Python::attach(|py| {
            let result = PyString::from_bytes(py, "\u{2122}".as_bytes());
            assert!(result.is_ok());
            let result = PyString::from_bytes(py, b"\x80");
            assert!(result
                .unwrap_err()
                .get_type(py)
                .is(py.get_type::<PyUnicodeDecodeError>()));
        });
    }

    // Interning the same text twice yields the same object; different text yields
    // a different object.
    #[test]
    fn test_intern_string() {
        Python::attach(|py| {
            let py_string1 = PyString::intern(py, "foo");
            assert_eq!(py_string1, "foo");

            let py_string2 = PyString::intern(py, "foo");
            assert_eq!(py_string2, "foo");

            assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());

            let py_string3 = PyString::intern(py, "bar");
            assert_eq!(py_string3, "bar");

            assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
        });
    }

    // `Py<PyString>::to_str` (where available) and `to_cow` round-trip valid text.
    #[test]
    fn test_py_to_str_utf8() {
        Python::attach(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s).unbind();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert_eq!(s, py_string.to_str(py).unwrap());

            assert_eq!(s, py_string.to_cow(py).unwrap());
        })
    }

    // `Py<PyString>::to_str` (where available) and `to_cow` fail on a lone surrogate.
    #[test]
    fn test_py_to_str_surrogate() {
        Python::attach(|py| {
            let py_string: Py<PyString> = py
                .eval(cr"'\ud800'", None, None)
                .unwrap()
                .extract()
                .unwrap();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert!(py_string.to_str(py).is_err());

            assert!(py_string.to_cow(py).is_err());
        })
    }

    // `Py<PyString>::to_string_lossy` replaces the lone surrogate like the `Bound` form.
    #[test]
    fn test_py_to_string_lossy() {
        Python::attach(|py| {
            let py_string: Py<PyString> = py
                .eval(cr"'🐈 Hello \ud800World'", None, None)
                .unwrap()
                .extract()
                .unwrap();
            assert_eq!(py_string.to_string_lossy(py), "🐈 Hello ���World");
        })
    }

    // Exercises the `PartialEq` impls between `str`/`&str` and `Bound`/`&Bound`/`Borrowed`,
    // in both operand orders.
    #[test]
    fn test_comparisons() {
        Python::attach(|py| {
            let s = "hello, world";
            let py_string = PyString::new(py, s);

            assert_eq!(py_string, "hello, world");

            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            assert_eq!(py_string, *s);
            assert_eq!(&py_string, *s);
            assert_eq!(*s, py_string);
            assert_eq!(*s, &py_string);

            // Same comparisons again through the `Borrowed` form.
            let py_string = py_string.as_borrowed();

            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            assert_eq!(py_string, *s);
            assert_eq!(*s, py_string);
        })
    }
}