1//! Heap-allocated counterpart to core `wtf8` module.
2#![unstable(
3 feature = "wtf8_internals",
4 issue = "none",
5 reason = "this is internal code for representing OsStr on some platforms and not a public API"
6)]
7// rustdoc bug: doc(hidden) on the module won't stop types in the module from showing up in trait
8// implementations, so, we'll have to add more doc(hidden)s anyway
9#![doc(hidden)]
1011// Note: This module is also included in the alloctests crate using #[path] to
12// run the tests. See the comment there for an explanation why this is the case.
1314#[cfg(test)]
15mod tests;
1617use core::char::encode_utf8_raw;
18use core::hash::{Hash, Hasher};
19pub use core::wtf8::{CodePoint, Wtf8};
20#[cfg(not(test))]
21pub use core::wtf8::{EncodeWide, Wtf8CodePoints};
22use core::{fmt, mem, ops, str};
2324use crate::borrow::{Cow, ToOwned};
25use crate::boxed::Box;
26use crate::collections::TryReserveError;
27#[cfg(not(test))]
28use crate::rc::Rc;
29use crate::string::String;
30#[cfg(all(not(test), target_has_atomic = "ptr"))]
31use crate::sync::Arc;
32use crate::vec::Vec;
/// An owned, growable string of well-formed WTF-8 data.
///
/// Similar to `String`, but can additionally contain surrogate code points
/// if they’re not in a surrogate pair.
// NOTE: the derived comparison traits look at *both* fields, so two buffers
// holding identical bytes but differing in `is_known_utf8` compare as unequal
// (and the flag acts as a tie-breaker for ordering).
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
#[doc(hidden)]
pub struct Wtf8Buf {
    /// The WTF-8 encoded contents.
    bytes: Vec<u8>,

    /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily
    /// know this if we're constructed from a `String` or `&str`.
    ///
    /// It is possible for `bytes` to have valid UTF-8 without this being
    /// set, such as when we're concatenating `&Wtf8`'s and surrogates become
    /// paired, as we don't bother to rescan the entire string.
    is_known_utf8: bool,
}
5152impl ops::Dereffor Wtf8Buf {
53type Target = Wtf8;
5455fn deref(&self) -> &Wtf8 {
56self.as_slice()
57 }
58}
5960impl ops::DerefMutfor Wtf8Buf {
61fn deref_mut(&mut self) -> &mut Wtf8 {
62self.as_mut_slice()
63 }
64}
6566/// Formats the string in double quotes, with characters escaped according to
67/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
68/// where each `x` is a hexadecimal digit.
69///
70/// For example, the code units [U+0061, U+D800, U+000A] are formatted as
71/// `"a\u{D800}\n"`.
72impl fmt::Debugfor Wtf8Buf {
73#[inline]
74fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
75 fmt::Debug::fmt(&**self, formatter)
76 }
77}
7879/// Formats the string with unpaired surrogates substituted with the replacement
80/// character, U+FFFD.
81impl fmt::Displayfor Wtf8Buf {
82fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
83if let Some(s) = self.as_known_utf8() {
84 fmt::Display::fmt(s, formatter)
85 } else {
86 fmt::Display::fmt(&**self, formatter)
87 }
88 }
89}
9091#[cfg_attr(test, allow(dead_code))]
92impl Wtf8Buf {
93/// Creates a new, empty WTF-8 string.
94#[inline]
95pub fn new() -> Wtf8Buf {
96Wtf8Buf { bytes: Vec::new(), is_known_utf8: true }
97 }
9899/// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
100#[inline]
101pub fn with_capacity(capacity: usize) -> Wtf8Buf {
102Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true }
103 }
104105/// Creates a WTF-8 string from a WTF-8 byte vec.
106 ///
107 /// Since the byte vec is not checked for valid WTF-8, this function is
108 /// marked unsafe.
109#[inline]
110pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
111Wtf8Buf { bytes: value, is_known_utf8: false }
112 }
113114/// Creates a WTF-8 string from a UTF-8 `String`.
115 ///
116 /// This takes ownership of the `String` and does not copy.
117 ///
118 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
119#[inline]
120pub const fn from_string(string: String) -> Wtf8Buf {
121Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: true }
122 }
123124/// Creates a WTF-8 string from a UTF-8 `&str` slice.
125 ///
126 /// This copies the content of the slice.
127 ///
128 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
129#[inline]
130pub fn from_str(s: &str) -> Wtf8Buf {
131Wtf8Buf { bytes: s.as_bytes().to_vec(), is_known_utf8: true }
132 }
133134pub fn clear(&mut self) {
135self.bytes.clear();
136self.is_known_utf8 = true;
137 }
138139/// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
140 ///
141 /// This is lossless: calling `.encode_wide()` on the resulting string
142 /// will always return the original code units.
143pub fn from_wide(v: &[u16]) -> Wtf8Buf {
144let mut string = Wtf8Buf::with_capacity(v.len());
145for item in char::decode_utf16(v.iter().cloned()) {
146match item {
147Ok(ch) => string.push_char(ch),
148Err(surrogate) => {
149let surrogate = surrogate.unpaired_surrogate();
150// Surrogates are known to be in the code point range.
151let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
152// The string will now contain an unpaired surrogate.
153string.is_known_utf8 = false;
154// Skip the WTF-8 concatenation check,
155 // surrogate pairs are already decoded by decode_utf16
156unsafe {
157 string.push_code_point_unchecked(code_point);
158 }
159 }
160 }
161 }
162string163 }
164165/// Appends the given `char` to the end of this string.
166 /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check.
167 /// Copied from String::push.
168unsafe fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
169let mut bytes = [0; char::MAX_LEN_UTF8];
170let bytes = encode_utf8_raw(code_point.to_u32(), &mut bytes);
171self.bytes.extend_from_slice(bytes)
172 }
173174#[inline]
175pub fn as_slice(&self) -> &Wtf8 {
176unsafe { Wtf8::from_bytes_unchecked(&self.bytes) }
177 }
178179#[inline]
180pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
181// Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
182 // cause them to change from well-formed UTF-8 to ill-formed UTF-8,
183 // which would break the assumptions of the `is_known_utf8` field.
184unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
185 }
186187/// Converts the string to UTF-8 without validation, if it was created from
188 /// valid UTF-8.
189#[inline]
190fn as_known_utf8(&self) -> Option<&str> {
191if self.is_known_utf8 {
192// SAFETY: The buffer is known to be valid UTF-8.
193Some(unsafe { str::from_utf8_unchecked(self.as_bytes()) })
194 } else {
195None196 }
197 }
198199/// Reserves capacity for at least `additional` more bytes to be inserted
200 /// in the given `Wtf8Buf`.
201 /// The collection may reserve more space to avoid frequent reallocations.
202 ///
203 /// # Panics
204 ///
205 /// Panics if the new capacity exceeds `isize::MAX` bytes.
206#[inline]
207pub fn reserve(&mut self, additional: usize) {
208self.bytes.reserve(additional)
209 }
210211/// Tries to reserve capacity for at least `additional` more bytes to be
212 /// inserted in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to
213 /// avoid frequent reallocations. After calling `try_reserve`, capacity will
214 /// be greater than or equal to `self.len() + additional`. Does nothing if
215 /// capacity is already sufficient. This method preserves the contents even
216 /// if an error occurs.
217 ///
218 /// # Errors
219 ///
220 /// If the capacity overflows, or the allocator reports a failure, then an error
221 /// is returned.
222#[inline]
223pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
224self.bytes.try_reserve(additional)
225 }
226227#[inline]
228pub fn reserve_exact(&mut self, additional: usize) {
229self.bytes.reserve_exact(additional)
230 }
231232/// Tries to reserve the minimum capacity for exactly `additional` more
233 /// bytes to be inserted in the given `Wtf8Buf`. After calling
234 /// `try_reserve_exact`, capacity will be greater than or equal to
235 /// `self.len() + additional` if it returns `Ok(())`.
236 /// Does nothing if the capacity is already sufficient.
237 ///
238 /// Note that the allocator may give the `Wtf8Buf` more space than it
239 /// requests. Therefore, capacity can not be relied upon to be precisely
240 /// minimal. Prefer [`try_reserve`] if future insertions are expected.
241 ///
242 /// [`try_reserve`]: Wtf8Buf::try_reserve
243 ///
244 /// # Errors
245 ///
246 /// If the capacity overflows, or the allocator reports a failure, then an error
247 /// is returned.
248#[inline]
249pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
250self.bytes.try_reserve_exact(additional)
251 }
252253#[inline]
254pub fn shrink_to_fit(&mut self) {
255self.bytes.shrink_to_fit()
256 }
257258#[inline]
259pub fn shrink_to(&mut self, min_capacity: usize) {
260self.bytes.shrink_to(min_capacity)
261 }
262263#[inline]
264pub fn leak<'a>(self) -> &'a mut Wtf8 {
265unsafe { Wtf8::from_mut_bytes_unchecked(self.bytes.leak()) }
266 }
267268/// Returns the number of bytes that this string buffer can hold without reallocating.
269#[inline]
270pub fn capacity(&self) -> usize {
271self.bytes.capacity()
272 }
273274/// Append a UTF-8 slice at the end of the string.
275#[inline]
276pub fn push_str(&mut self, other: &str) {
277self.bytes.extend_from_slice(other.as_bytes())
278 }
279280/// Append a WTF-8 slice at the end of the string.
281 ///
282 /// This replaces newly paired surrogates at the boundary
283 /// with a supplementary code point,
284 /// like concatenating ill-formed UTF-16 strings effectively would.
285#[inline]
286pub fn push_wtf8(&mut self, other: &Wtf8) {
287match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) {
288// Replace newly paired surrogates by a supplementary code point.
289(Some(lead), Some(trail)) => {
290let len_without_lead_surrogate = self.len() - 3;
291self.bytes.truncate(len_without_lead_surrogate);
292let other_without_trail_surrogate = &other.as_bytes()[3..];
293// 4 bytes for the supplementary code point
294self.bytes.reserve(4 + other_without_trail_surrogate.len());
295self.push_char(decode_surrogate_pair(lead, trail));
296self.bytes.extend_from_slice(other_without_trail_surrogate);
297 }
298_ => {
299// If we'll be pushing a string containing a surrogate, we may
300 // no longer have UTF-8.
301if self.is_known_utf8 && other.next_surrogate(0).is_some() {
302self.is_known_utf8 = false;
303 }
304305self.bytes.extend_from_slice(other.as_bytes());
306 }
307 }
308 }
309310/// Append a Unicode scalar value at the end of the string.
311#[inline]
312pub fn push_char(&mut self, c: char) {
313// SAFETY: It's always safe to push a char.
314unsafe { self.push_code_point_unchecked(CodePoint::from_char(c)) }
315 }
316317/// Append a code point at the end of the string.
318 ///
319 /// This replaces newly paired surrogates at the boundary
320 /// with a supplementary code point,
321 /// like concatenating ill-formed UTF-16 strings effectively would.
322#[inline]
323pub fn push(&mut self, code_point: CodePoint) {
324if let Some(trail) = code_point.to_trail_surrogate() {
325if let Some(lead) = (&*self).final_lead_surrogate() {
326let len_without_lead_surrogate = self.len() - 3;
327self.bytes.truncate(len_without_lead_surrogate);
328self.push_char(decode_surrogate_pair(lead, trail));
329return;
330 }
331332// We're pushing a trailing surrogate.
333self.is_known_utf8 = false;
334 } else if code_point.to_lead_surrogate().is_some() {
335// We're pushing a leading surrogate.
336self.is_known_utf8 = false;
337 }
338339// No newly paired surrogates at the boundary.
340unsafe { self.push_code_point_unchecked(code_point) }
341 }
342343/// Shortens a string to the specified length.
344 ///
345 /// If `new_len` is greater than the string's current length, this has no
346 /// effect.
347 ///
348 /// # Panics
349 ///
350 /// Panics if `new_len` does not lie on a code point boundary.
351#[inline]
352pub fn truncate(&mut self, new_len: usize) {
353if new_len <= self.len() {
354if !self.is_code_point_boundary(new_len) {
::core::panicking::panic("assertion failed: self.is_code_point_boundary(new_len)")
};assert!(self.is_code_point_boundary(new_len));
355self.bytes.truncate(new_len)
356 }
357 }
358359/// Consumes the WTF-8 string and tries to convert it to a vec of bytes.
360#[inline]
361pub fn into_bytes(self) -> Vec<u8> {
362self.bytes
363 }
364365/// Consumes the WTF-8 string and tries to convert it to UTF-8.
366 ///
367 /// This does not copy the data.
368 ///
369 /// If the contents are not well-formed UTF-8
370 /// (that is, if the string contains surrogates),
371 /// the original WTF-8 string is returned instead.
372pub fn into_string(self) -> Result<String, Wtf8Buf> {
373if self.is_known_utf8 || self.next_surrogate(0).is_none() {
374Ok(unsafe { String::from_utf8_unchecked(self.bytes) })
375 } else {
376Err(self)
377 }
378 }
379380/// Consumes the WTF-8 string and converts it lossily to UTF-8.
381 ///
382 /// This does not copy the data (but may overwrite parts of it in place).
383 ///
384 /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
385pub fn into_string_lossy(mut self) -> String {
386if !self.is_known_utf8 {
387let mut pos = 0;
388while let Some((surrogate_pos, _)) = self.next_surrogate(pos) {
389 pos = surrogate_pos + 3;
390// Surrogates and the replacement character are all 3 bytes, so
391 // they can substituted in-place.
392self.bytes[surrogate_pos..pos].copy_from_slice("\u{FFFD}".as_bytes());
393 }
394 }
395unsafe { String::from_utf8_unchecked(self.bytes) }
396 }
397398/// Converts this `Wtf8Buf` into a boxed `Wtf8`.
399#[inline]
400pub fn into_box(self) -> Box<Wtf8> {
401// SAFETY: relies on `Wtf8` being `repr(transparent)`.
402unsafe { mem::transmute(self.bytes.into_boxed_slice()) }
403 }
404405/// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
406pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
407let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
408Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: false }
409 }
410411/// Provides plumbing to core `Vec::extend_from_slice`.
412 /// More well behaving alternative to allowing outer types
413 /// full mutable access to the core `Vec`.
414#[inline]
415pub unsafe fn extend_from_slice_unchecked(&mut self, other: &[u8]) {
416self.bytes.extend_from_slice(other);
417self.is_known_utf8 = false;
418 }
419}
420421/// Creates a new WTF-8 string from an iterator of code points.
422///
423/// This replaces surrogate code point pairs with supplementary code points,
424/// like concatenating ill-formed UTF-16 strings effectively would.
425impl FromIterator<CodePoint> for Wtf8Buf {
426fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
427let mut string = Wtf8Buf::new();
428string.extend(iter);
429string430 }
431}
432433/// Append code points from an iterator to the string.
434///
435/// This replaces surrogate code point pairs with supplementary code points,
436/// like concatenating ill-formed UTF-16 strings effectively would.
437impl Extend<CodePoint> for Wtf8Buf {
438fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
439let iterator = iter.into_iter();
440let (low, _high) = iterator.size_hint();
441// Lower bound of one byte per code point (ASCII only)
442self.bytes.reserve(low);
443iterator.for_each(move |code_point| self.push(code_point));
444 }
445446#[inline]
447fn extend_one(&mut self, code_point: CodePoint) {
448self.push(code_point);
449 }
450451#[inline]
452fn extend_reserve(&mut self, additional: usize) {
453// Lower bound of one byte per code point (ASCII only)
454self.bytes.reserve(additional);
455 }
456}
457458/// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
459pub(super) fn to_owned(slice: &Wtf8) -> Wtf8Buf {
460Wtf8Buf { bytes: slice.as_bytes().to_vec(), is_known_utf8: false }
461}
462463/// Lossily converts the string to UTF-8.
464/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
465///
466/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
467///
468/// This only copies the data if necessary (if it contains any surrogate).
469pub(super) fn to_string_lossy(slice: &Wtf8) -> Cow<'_, str> {
470let Some((surrogate_pos, _)) = slice.next_surrogate(0) else {
471return Cow::Borrowed(unsafe { str::from_utf8_unchecked(slice.as_bytes()) });
472 };
473let wtf8_bytes = slice.as_bytes();
474let mut utf8_bytes = Vec::with_capacity(slice.len());
475utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
476utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes());
477let mut pos = surrogate_pos + 3;
478loop {
479match slice.next_surrogate(pos) {
480Some((surrogate_pos, _)) => {
481utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
482utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes());
483pos = surrogate_pos + 3;
484 }
485None => {
486utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
487return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
488 }
489 }
490 }
491}
492493#[inline]
494pub(super) fn clone_into(slice: &Wtf8, buf: &mut Wtf8Buf) {
495buf.is_known_utf8 = false;
496slice.as_bytes().clone_into(&mut buf.bytes);
497}
498499#[cfg(not(test))]
500impl Wtf8 {
501#[rustc_allow_incoherent_impl]
502pub fn to_owned(&self) -> Wtf8Buf {
503to_owned(self)
504 }
505506#[rustc_allow_incoherent_impl]
507pub fn clone_into(&self, buf: &mut Wtf8Buf) {
508clone_into(self, buf)
509 }
510511#[rustc_allow_incoherent_impl]
512pub fn to_string_lossy(&self) -> Cow<'_, str> {
513to_string_lossy(self)
514 }
515516#[rustc_allow_incoherent_impl]
517pub fn into_box(&self) -> Box<Wtf8> {
518let boxed: Box<[u8]> = self.as_bytes().into();
519unsafe { mem::transmute(boxed) }
520 }
521522#[rustc_allow_incoherent_impl]
523pub fn empty_box() -> Box<Wtf8> {
524let boxed: Box<[u8]> = Default::default();
525unsafe { mem::transmute(boxed) }
526 }
527528#[cfg(target_has_atomic = "ptr")]
529 #[rustc_allow_incoherent_impl]
530pub fn into_arc(&self) -> Arc<Wtf8> {
531let arc: Arc<[u8]> = Arc::from(self.as_bytes());
532unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) }
533 }
534535#[rustc_allow_incoherent_impl]
536pub fn into_rc(&self) -> Rc<Wtf8> {
537let rc: Rc<[u8]> = Rc::from(self.as_bytes());
538unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) }
539 }
540541#[inline]
542 #[rustc_allow_incoherent_impl]
543pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
544Wtf8Buf { bytes: self.as_bytes().to_ascii_lowercase(), is_known_utf8: false }
545 }
546547#[inline]
548 #[rustc_allow_incoherent_impl]
549pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
550Wtf8Buf { bytes: self.as_bytes().to_ascii_uppercase(), is_known_utf8: false }
551 }
552}
/// Combines a UTF-16 lead/trail surrogate pair into the supplementary code
/// point it encodes.
///
/// `lead` is expected to be in `0xD800..=0xDBFF` and `trail` in
/// `0xDC00..=0xDFFF`; nothing is validated here.
#[inline]
fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    // Each surrogate contributes 10 bits: the lead the upper ones, the trail
    // the lower ones.
    let upper_bits = u32::from(lead - 0xD800) << 10;
    let lower_bits = u32::from(trail - 0xDC00);
    let scalar = 0x10000 + (upper_bits | lower_bits);
    // SAFETY: for a genuine surrogate pair the value is in 0x10000..=0x10FFFF,
    // which is always a valid `char`.
    unsafe { char::from_u32_unchecked(scalar) }
}
559560impl Hashfor Wtf8Buf {
561#[inline]
562fn hash<H: Hasher>(&self, state: &mut H) {
563state.write(&self.bytes);
5640xfeu8.hash(state)
565 }
566}