Skip to main content

core/
wtf8.rs

//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
//!
//! This library uses Rust’s type system to maintain
//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
//! like the `String` and `&str` types do for UTF-8.
//!
//! Since [WTF-8 must not be used
//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
//! this library deliberately does not provide access to the underlying bytes
//! of WTF-8 strings,
//! nor can it decode WTF-8 from arbitrary bytes.
//! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
13#![unstable(
14    feature = "wtf8_internals",
15    issue = "none",
16    reason = "this is internal code for representing OsStr on some platforms and not a public API"
17)]
18// rustdoc bug: doc(hidden) on the module won't stop types in the module from showing up in trait
19// implementations, so, we'll have to add more doc(hidden)s anyway
20#![doc(hidden)]
21
22use crate::char::{EscapeDebugExtArgs, encode_utf16_raw};
23use crate::clone::CloneToUninit;
24use crate::fmt::{self, Write};
25use crate::hash::{Hash, Hasher};
26use crate::iter::FusedIterator;
27use crate::num::niche_types::CodePointInner;
28use crate::str::next_code_point;
29use crate::{ops, slice, str};
30
31/// A Unicode code point: from U+0000 to U+10FFFF.
32///
33/// Compares with the `char` type,
34/// which represents a Unicode scalar value:
35/// a code point that is not a surrogate (U+D800 to U+DFFF).
36#[derive(#[automatically_derived]
impl crate::cmp::Eq for CodePoint {
    #[inline]
    #[doc(hidden)]
    #[coverage(off)]
    fn assert_fields_are_eq(&self) {
        let _: crate::cmp::AssertParamIsEq<CodePointInner>;
    }
}Eq, #[automatically_derived]
impl crate::cmp::PartialEq for CodePoint {
    #[inline]
    fn eq(&self, other: &CodePoint) -> bool { self.0 == other.0 }
}PartialEq, #[automatically_derived]
impl crate::cmp::Ord for CodePoint {
    #[inline]
    fn cmp(&self, other: &CodePoint) -> crate::cmp::Ordering {
        crate::cmp::Ord::cmp(&self.0, &other.0)
    }
}Ord, #[automatically_derived]
impl crate::cmp::PartialOrd for CodePoint {
    #[inline]
    fn partial_cmp(&self, other: &CodePoint)
        -> crate::option::Option<crate::cmp::Ordering> {
        crate::option::Option::Some(crate::cmp::Ord::cmp(self, other))
    }
}PartialOrd, #[automatically_derived]
impl crate::clone::Clone for CodePoint {
    #[inline]
    fn clone(&self) -> CodePoint {
        let _: crate::clone::AssertParamIsClone<CodePointInner>;
        *self
    }
}Clone, #[automatically_derived]
impl crate::marker::Copy for CodePoint { }Copy)]
37#[doc(hidden)]
38pub struct CodePoint(CodePointInner);
39
40/// Format the code point as `U+` followed by four to six hexadecimal digits.
41/// Example: `U+1F4A9`
42impl fmt::Debug for CodePoint {
43    #[inline]
44    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
45        formatter.write_fmt(format_args!("U+{0:04X}", self.0.as_inner()))write!(formatter, "U+{:04X}", self.0.as_inner())
46    }
47}
48
49impl CodePoint {
50    /// Unsafely creates a new `CodePoint` without checking the value.
51    ///
52    /// Only use when `value` is known to be less than or equal to 0x10FFFF.
53    #[inline]
54    pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
55        // SAFETY: Guaranteed by caller.
56        CodePoint(unsafe { CodePointInner::new_unchecked(value) })
57    }
58
59    /// Creates a new `CodePoint` if the value is a valid code point.
60    ///
61    /// Returns `None` if `value` is above 0x10FFFF.
62    #[inline]
63    pub fn from_u32(value: u32) -> Option<CodePoint> {
64        Some(CodePoint(CodePointInner::new(value)?))
65    }
66
67    /// Creates a new `CodePoint` from a `char`.
68    ///
69    /// Since all Unicode scalar values are code points, this always succeeds.
70    #[inline]
71    pub fn from_char(value: char) -> CodePoint {
72        // SAFETY: All char are valid for this type.
73        unsafe { CodePoint::from_u32_unchecked(value as u32) }
74    }
75
76    /// Returns the numeric value of the code point.
77    #[inline]
78    pub fn to_u32(&self) -> u32 {
79        self.0.as_inner()
80    }
81
82    /// Returns the numeric value of the code point if it is a leading surrogate.
83    #[inline]
84    pub fn to_lead_surrogate(&self) -> Option<u16> {
85        match self.to_u32() {
86            lead @ 0xD800..=0xDBFF => Some(lead as u16),
87            _ => None,
88        }
89    }
90
91    /// Returns the numeric value of the code point if it is a trailing surrogate.
92    #[inline]
93    pub fn to_trail_surrogate(&self) -> Option<u16> {
94        match self.to_u32() {
95            trail @ 0xDC00..=0xDFFF => Some(trail as u16),
96            _ => None,
97        }
98    }
99
100    /// Optionally returns a Unicode scalar value for the code point.
101    ///
102    /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
103    #[inline]
104    pub fn to_char(&self) -> Option<char> {
105        match self.to_u32() {
106            0xD800..=0xDFFF => None,
107            // SAFETY: We explicitly check that the char is valid.
108            valid => Some(unsafe { char::from_u32_unchecked(valid) }),
109        }
110    }
111
112    /// Returns a Unicode scalar value for the code point.
113    ///
114    /// Returns `'\u{FFFD}'` (the replacement character “�”)
115    /// if the code point is a surrogate (from U+D800 to U+DFFF).
116    #[inline]
117    pub fn to_char_lossy(&self) -> char {
118        self.to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
119    }
120}
121
122/// A borrowed slice of well-formed WTF-8 data.
123///
124/// Similar to `&str`, but can additionally contain surrogate code points
125/// if they’re not in a surrogate pair.
126#[derive(#[automatically_derived]
impl crate::cmp::Eq for Wtf8 {
    #[inline]
    #[doc(hidden)]
    #[coverage(off)]
    fn assert_fields_are_eq(&self) {
        let _: crate::cmp::AssertParamIsEq<[u8]>;
    }
}Eq, #[automatically_derived]
impl crate::cmp::Ord for Wtf8 {
    #[inline]
    fn cmp(&self, other: &Wtf8) -> crate::cmp::Ordering {
        crate::cmp::Ord::cmp(&self.bytes, &other.bytes)
    }
}Ord, #[automatically_derived]
impl crate::cmp::PartialEq for Wtf8 {
    #[inline]
    fn eq(&self, other: &Wtf8) -> bool { self.bytes == other.bytes }
}PartialEq, #[automatically_derived]
impl crate::cmp::PartialOrd for Wtf8 {
    #[inline]
    fn partial_cmp(&self, other: &Wtf8)
        -> crate::option::Option<crate::cmp::Ordering> {
        crate::option::Option::Some(crate::cmp::Ord::cmp(self, other))
    }
}PartialOrd)]
127#[repr(transparent)]
128#[rustc_has_incoherent_inherent_impls]
129#[doc(hidden)]
130pub struct Wtf8 {
131    bytes: [u8],
132}
133
134impl AsRef<[u8]> for Wtf8 {
135    #[inline]
136    fn as_ref(&self) -> &[u8] {
137        &self.bytes
138    }
139}
140
141/// Formats the string in double quotes, with characters escaped according to
142/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
143/// where each `x` is a hexadecimal digit.
144impl fmt::Debug for Wtf8 {
145    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
146        fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
147            use crate::fmt::Write as _;
148            for c in s.chars().flat_map(|c| {
149                c.escape_debug_ext(EscapeDebugExtArgs {
150                    escape_grapheme_extended: true,
151                    escape_single_quote: false,
152                    escape_double_quote: true,
153                })
154            }) {
155                f.write_char(c)?
156            }
157            Ok(())
158        }
159
160        formatter.write_char('"')?;
161        let mut pos = 0;
162        while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
163            // SAFETY: next_surrogate provides an index for a range of valid UTF-8 bytes.
164            write_str_escaped(formatter, unsafe {
165                str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
166            })?;
167            formatter.write_fmt(format_args!("\\u{{{0:x}}}", surrogate))write!(formatter, "\\u{{{:x}}}", surrogate)?;
168            pos = surrogate_pos + 3;
169        }
170
171        // SAFETY: after next_surrogate returns None, the remainder is valid UTF-8.
172        write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?;
173        formatter.write_char('"')
174    }
175}
176
177/// Formats the string with unpaired surrogates substituted with the replacement
178/// character, U+FFFD.
179impl fmt::Display for Wtf8 {
180    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
181        let wtf8_bytes = &self.bytes;
182        let mut pos = 0;
183        loop {
184            match self.next_surrogate(pos) {
185                Some((surrogate_pos, _)) => {
186                    // SAFETY: next_surrogate provides an index for a range of valid UTF-8 bytes.
187                    formatter.write_str(unsafe {
188                        str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
189                    })?;
190                    formatter.write_char(char::REPLACEMENT_CHARACTER)?;
191                    pos = surrogate_pos + 3;
192                }
193                None => {
194                    // SAFETY: after next_surrogate returns None, the remainder is valid UTF-8.
195                    let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
196                    if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) }
197                }
198            }
199        }
200    }
201}
202
203impl Wtf8 {
204    /// Creates a WTF-8 slice from a UTF-8 `&str` slice.
205    #[inline]
206    pub fn from_str(value: &str) -> &Wtf8 {
207        // SAFETY: Since WTF-8 is a superset of UTF-8, this always is valid.
208        unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
209    }
210
211    /// Creates a WTF-8 slice from a WTF-8 byte slice.
212    ///
213    /// Since the byte slice is not checked for valid WTF-8, this functions is
214    /// marked unsafe.
215    #[inline]
216    pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
217        // SAFETY: start with &[u8], end with fancy &[u8]
218        unsafe { &*(value as *const [u8] as *const Wtf8) }
219    }
220
221    /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice.
222    ///
223    /// Since the byte slice is not checked for valid WTF-8, this functions is
224    /// marked unsafe.
225    #[inline]
226    pub unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
227        // SAFETY: start with &mut [u8], end with fancy &mut [u8]
228        unsafe { &mut *(value as *mut [u8] as *mut Wtf8) }
229    }
230
231    /// Returns the length, in WTF-8 bytes.
232    #[inline]
233    pub fn len(&self) -> usize {
234        self.bytes.len()
235    }
236
237    #[inline]
238    pub fn is_empty(&self) -> bool {
239        self.bytes.is_empty()
240    }
241
242    /// Returns the code point at `position` if it is in the ASCII range,
243    /// or `b'\xFF'` otherwise.
244    ///
245    /// # Panics
246    ///
247    /// Panics if `position` is beyond the end of the string.
248    #[inline]
249    pub fn ascii_byte_at(&self, position: usize) -> u8 {
250        match self.bytes[position] {
251            ascii_byte @ 0x00..=0x7F => ascii_byte,
252            _ => 0xFF,
253        }
254    }
255
256    /// Returns an iterator for the string’s code points.
257    #[inline]
258    pub fn code_points(&self) -> Wtf8CodePoints<'_> {
259        Wtf8CodePoints { bytes: self.bytes.iter() }
260    }
261
262    /// Access raw bytes of WTF-8 data
263    #[inline]
264    pub fn as_bytes(&self) -> &[u8] {
265        &self.bytes
266    }
267
268    /// Tries to convert the string to UTF-8 and return a `&str` slice.
269    ///
270    /// Returns `None` if the string contains surrogates.
271    ///
272    /// This does not copy the data.
273    #[inline]
274    pub fn as_str(&self) -> Result<&str, str::Utf8Error> {
275        str::from_utf8(&self.bytes)
276    }
277
278    /// Converts the WTF-8 string to potentially ill-formed UTF-16
279    /// and return an iterator of 16-bit code units.
280    ///
281    /// This is lossless:
282    /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
283    /// would always return the original WTF-8 string.
284    #[inline]
285    pub fn encode_wide(&self) -> EncodeWide<'_> {
286        EncodeWide { code_points: self.code_points(), extra: 0 }
287    }
288
289    #[inline]
290    pub fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
291        let mut iter = self.bytes[pos..].iter();
292        loop {
293            let b = *iter.next()?;
294            if b < 0x80 {
295                pos += 1;
296            } else if b < 0xE0 {
297                iter.next();
298                pos += 2;
299            } else if b == 0xED {
300                match (iter.next(), iter.next()) {
301                    (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
302                        return Some((pos, decode_surrogate(b2, b3)));
303                    }
304                    _ => pos += 3,
305                }
306            } else if b < 0xF0 {
307                iter.next();
308                iter.next();
309                pos += 3;
310            } else {
311                iter.next();
312                iter.next();
313                iter.next();
314                pos += 4;
315            }
316        }
317    }
318
319    #[inline]
320    pub fn final_lead_surrogate(&self) -> Option<u16> {
321        match self.bytes {
322            [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)),
323            _ => None,
324        }
325    }
326
327    #[inline]
328    pub fn initial_trail_surrogate(&self) -> Option<u16> {
329        match self.bytes {
330            [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)),
331            _ => None,
332        }
333    }
334
335    #[inline]
336    pub fn make_ascii_lowercase(&mut self) {
337        self.bytes.make_ascii_lowercase()
338    }
339
340    #[inline]
341    pub fn make_ascii_uppercase(&mut self) {
342        self.bytes.make_ascii_uppercase()
343    }
344
345    #[inline]
346    pub fn is_ascii(&self) -> bool {
347        self.bytes.is_ascii()
348    }
349
350    #[inline]
351    pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
352        self.bytes.eq_ignore_ascii_case(&other.bytes)
353    }
354}
355
356/// Returns a slice of the given string for the byte range \[`begin`..`end`).
357///
358/// # Panics
359///
360/// Panics when `begin` and `end` do not point to code point boundaries,
361/// or point beyond the end of the string.
362impl ops::Index<ops::Range<usize>> for Wtf8 {
363    type Output = Wtf8;
364
365    #[inline]
366    fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
367        if range.start <= range.end
368            && self.is_code_point_boundary(range.start)
369            && self.is_code_point_boundary(range.end)
370        {
371            // SAFETY: is_code_point_boundary checks that the index is valid
372            unsafe { slice_unchecked(self, range.start, range.end) }
373        } else {
374            slice_error_fail(self, range.start, range.end)
375        }
376    }
377}
378
379/// Returns a slice of the given string from byte `begin` to its end.
380///
381/// # Panics
382///
383/// Panics when `begin` is not at a code point boundary,
384/// or is beyond the end of the string.
385impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
386    type Output = Wtf8;
387
388    #[inline]
389    fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
390        if self.is_code_point_boundary(range.start) {
391            // SAFETY: is_code_point_boundary checks that the index is valid
392            unsafe { slice_unchecked(self, range.start, self.len()) }
393        } else {
394            slice_error_fail(self, range.start, self.len())
395        }
396    }
397}
398
399/// Returns a slice of the given string from its beginning to byte `end`.
400///
401/// # Panics
402///
403/// Panics when `end` is not at a code point boundary,
404/// or is beyond the end of the string.
405impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
406    type Output = Wtf8;
407
408    #[inline]
409    fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
410        if self.is_code_point_boundary(range.end) {
411            // SAFETY: is_code_point_boundary checks that the index is valid
412            unsafe { slice_unchecked(self, 0, range.end) }
413        } else {
414            slice_error_fail(self, 0, range.end)
415        }
416    }
417}
418
419impl ops::Index<ops::RangeFull> for Wtf8 {
420    type Output = Wtf8;
421
422    #[inline]
423    fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
424        self
425    }
426}
427
/// Decodes the 16-bit surrogate value from the second and third bytes of its
/// three-byte WTF-8 encoding.
///
/// Each byte contributes its low six bits; they are combined with the fixed
/// `0xD800` prefix.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    // The first byte is assumed to be 0xED
    0xD800 | ((second_byte as u16 & 0x3F) << 6) | (third_byte as u16 & 0x3F)
}
433
434impl Wtf8 {
435    /// Copied from str::is_char_boundary
436    #[inline]
437    pub fn is_code_point_boundary(&self, index: usize) -> bool {
438        if index == 0 {
439            return true;
440        }
441        match self.bytes.get(index) {
442            None => index == self.len(),
443            Some(&b) => (b as i8) >= -0x40,
444        }
445    }
446
447    /// Verify that `index` is at the edge of either a valid UTF-8 codepoint
448    /// (i.e. a codepoint that's not a surrogate) or of the whole string.
449    ///
450    /// These are the cases currently permitted by `OsStr::self_encoded_bytes`.
451    /// Splitting between surrogates is valid as far as WTF-8 is concerned, but
452    /// we do not permit it in the public API because WTF-8 is considered an
453    /// implementation detail.
454    #[track_caller]
455    #[inline]
456    pub fn check_utf8_boundary(&self, index: usize) {
457        let Err(err) = self.try_check_utf8_boundary(index) else { return };
458        match err {
459            Utf8BoundaryError::NotABoundary => {
460                {
    crate::panicking::panic_fmt(format_args!("byte index {0} is not a codepoint boundary",
            index));
}panic!("byte index {index} is not a codepoint boundary")
461            }
462            Utf8BoundaryError::OutOfBounds => {
    crate::panicking::panic_fmt(format_args!("byte index {0} is out of bounds",
            index));
}panic!("byte index {index} is out of bounds"),
463            Utf8BoundaryError::BetweenSurrogates => {
464                {
    crate::panicking::panic_fmt(format_args!("byte index {0} lies between surrogate codepoints",
            index));
}panic!("byte index {index} lies between surrogate codepoints")
465            }
466        }
467    }
468
469    #[track_caller]
470    #[inline]
471    pub fn try_check_utf8_boundary(&self, index: usize) -> Result<(), Utf8BoundaryError> {
472        if index == 0 {
473            return Ok(());
474        }
475        match self.bytes.get(index) {
476            Some(0xED) => (), // Might be a surrogate
477            Some(&b) if (b as i8) >= -0x40 => return Ok(()),
478            Some(_) => return Err(Utf8BoundaryError::NotABoundary),
479            None if index == self.len() => return Ok(()),
480            None => return Err(Utf8BoundaryError::OutOfBounds),
481        }
482        if self.bytes[index + 1] >= 0xA0 {
483            // There's a surrogate after index. Now check before index.
484            if index >= 3 && self.bytes[index - 3] == 0xED && self.bytes[index - 2] >= 0xA0 {
485                return Err(Utf8BoundaryError::BetweenSurrogates);
486            }
487        }
488        Ok(())
489    }
490}
491
// This error type is only used temporarily to provide better panic messages
// It does not implement Error.
/// Reason why a byte index is rejected by `Wtf8::try_check_utf8_boundary`.
#[derive(Debug)]
pub enum Utf8BoundaryError {
    /// The index points into the middle of a code point.
    NotABoundary,
    /// The index is past the end of the string.
    OutOfBounds,
    /// The index separates two adjacent surrogate code points.
    BetweenSurrogates,
}
500
501/// Copied from core::str::raw::slice_unchecked
502#[inline]
503unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
504    // SAFETY: memory layout of a &[u8] and &Wtf8 are the same
505    unsafe {
506        let len = end - begin;
507        let start = s.as_bytes().as_ptr().add(begin);
508        Wtf8::from_bytes_unchecked(slice::from_raw_parts(start, len))
509    }
510}
511
512#[inline(never)]
513fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
514    let len = s.len();
515    if begin > len {
516        {
    crate::panicking::panic_fmt(format_args!("start byte index {0} is out of bounds for string of length {1}",
            begin, len));
};panic!("start byte index {begin} is out of bounds for string of length {len}");
517    }
518    if end > len {
519        {
    crate::panicking::panic_fmt(format_args!("end byte index {0} is out of bounds for string of length {1}",
            end, len));
};panic!("end byte index {end} is out of bounds for string of length {len}");
520    }
521    if begin > end {
522        {
    crate::panicking::panic_fmt(format_args!("byte range starts at {0} but ends at {1}",
            begin, end));
};panic!("byte range starts at {begin} but ends at {end}");
523    }
524    if !s.is_code_point_boundary(begin) {
525        {
    crate::panicking::panic_fmt(format_args!("byte index {0} is not a code point boundary",
            begin));
};panic!("byte index {begin} is not a code point boundary");
526    }
527    {
    crate::panicking::panic_fmt(format_args!("byte index {0} is not a code point boundary",
            end));
};panic!("byte index {end} is not a code point boundary");
528}
529
/// Iterator for the code points of a WTF-8 string.
///
/// Created with the method `.code_points()`.
#[derive(Clone)]
#[doc(hidden)]
pub struct Wtf8CodePoints<'a> {
    /// Remaining, not yet decoded bytes of the string.
    bytes: slice::Iter<'a, u8>,
}
538
539impl Iterator for Wtf8CodePoints<'_> {
540    type Item = CodePoint;
541
542    #[inline]
543    fn next(&mut self) -> Option<CodePoint> {
544        // SAFETY: `self.bytes` has been created from a WTF-8 string
545        unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint::from_u32_unchecked(c)) }
546    }
547
548    #[inline]
549    fn size_hint(&self) -> (usize, Option<usize>) {
550        let len = self.bytes.len();
551        (len.saturating_add(3) / 4, Some(len))
552    }
553}
554
555impl fmt::Debug for Wtf8CodePoints<'_> {
556    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
557        f.debug_tuple("Wtf8CodePoints")
558            // SAFETY: We always leave the string in a valid state after each iteration.
559            .field(&unsafe { Wtf8::from_bytes_unchecked(self.bytes.as_slice()) })
560            .finish()
561    }
562}
563
564/// Generates a wide character sequence for potentially ill-formed UTF-16.
565#[stable(feature = "rust1", since = "1.0.0")]
566#[derive(#[automatically_derived]
#[stable(feature = "rust1", since = "1.0.0")]
impl<'a> crate::clone::Clone for EncodeWide<'a> {
    #[inline]
    fn clone(&self) -> EncodeWide<'a> {
        EncodeWide {
            code_points: crate::clone::Clone::clone(&self.code_points),
            extra: crate::clone::Clone::clone(&self.extra),
        }
    }
}Clone)]
567#[doc(hidden)]
568pub struct EncodeWide<'a> {
569    code_points: Wtf8CodePoints<'a>,
570    extra: u16,
571}
572
573// Copied from libunicode/u_str.rs
574#[stable(feature = "rust1", since = "1.0.0")]
575impl Iterator for EncodeWide<'_> {
576    type Item = u16;
577
578    #[inline]
579    fn next(&mut self) -> Option<u16> {
580        if self.extra != 0 {
581            let tmp = self.extra;
582            self.extra = 0;
583            return Some(tmp);
584        }
585
586        let mut buf = [0; char::MAX_LEN_UTF16];
587        self.code_points.next().map(|code_point| {
588            let n = encode_utf16_raw(code_point.to_u32(), &mut buf).len();
589            if n == 2 {
590                self.extra = buf[1];
591            }
592            buf[0]
593        })
594    }
595
596    #[inline]
597    fn size_hint(&self) -> (usize, Option<usize>) {
598        let (low, high) = self.code_points.size_hint();
599        let ext = (self.extra != 0) as usize;
600        // every code point gets either one u16 or two u16,
601        // so this iterator is between 1 or 2 times as
602        // long as the underlying iterator.
603        (low + ext, high.and_then(|n| n.checked_mul(2)).and_then(|n| n.checked_add(ext)))
604    }
605}
606
607#[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")]
608impl FusedIterator for EncodeWide<'_> {}
609
610#[stable(feature = "encode_wide_debug", since = "1.92.0")]
611impl fmt::Debug for EncodeWide<'_> {
612    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
613        struct CodeUnit(u16);
614        impl fmt::Debug for CodeUnit {
615            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
616                // This output attempts to balance readability with precision.
617                // Render characters which take only one WTF-16 code unit using
618                // `char` syntax and everything else as code units with hex
619                // integer syntax (including paired and unpaired surrogate
620                // halves). Since Rust has no `char`-like type for WTF-16, this
621                // isn't perfect, so if this output isn't suitable, it is open
622                // to being changed (see #140153).
623                match char::from_u32(self.0 as u32) {
624                    Some(c) => f.write_fmt(format_args!("{0:?}", c))write!(f, "{c:?}"),
625                    None => f.write_fmt(format_args!("0x{0:04X}", self.0))write!(f, "0x{:04X}", self.0),
626                }
627            }
628        }
629
630        f.write_fmt(format_args!("EncodeWide("))write!(f, "EncodeWide(")?;
631        f.debug_list().entries(self.clone().map(CodeUnit)).finish()?;
632        f.write_fmt(format_args!(")"))write!(f, ")")?;
633        Ok(())
634    }
635}
636
637impl Hash for CodePoint {
638    #[inline]
639    fn hash<H: Hasher>(&self, state: &mut H) {
640        self.0.hash(state)
641    }
642}
643
644impl Hash for Wtf8 {
645    #[inline]
646    fn hash<H: Hasher>(&self, state: &mut H) {
647        state.write(&self.bytes);
648        0xfeu8.hash(state)
649    }
650}
651
652#[unstable(feature = "clone_to_uninit", issue = "126799")]
653unsafe impl CloneToUninit for Wtf8 {
654    #[inline]
655    #[cfg_attr(debug_assertions, track_caller)]
656    unsafe fn clone_to_uninit(&self, dst: *mut u8) {
657        // SAFETY: we're just a transparent wrapper around [u8]
658        unsafe { self.bytes.clone_to_uninit(dst) }
659    }
660}