1#![unstable(
14 feature = "wtf8_internals",
15 issue = "none",
16 reason = "this is internal code for representing OsStr on some platforms and not a public API"
17)]
18#![doc(hidden)]
21
22use crate::char::{EscapeDebugExtArgs, encode_utf16_raw};
23use crate::clone::CloneToUninit;
24use crate::fmt::{self, Write};
25use crate::hash::{Hash, Hasher};
26use crate::iter::FusedIterator;
27use crate::num::niche_types::CodePointInner;
28use crate::str::next_code_point;
29use crate::{ops, slice, str};
30
31#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
37#[doc(hidden)]
38pub struct CodePoint(CodePointInner);
39
40impl fmt::Debug for CodePoint {
43 #[inline]
44 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
45 write!(formatter, "U+{:04X}", self.0.as_inner())
46 }
47}
48
49impl CodePoint {
50 #[inline]
54 pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
55 CodePoint(unsafe { CodePointInner::new_unchecked(value) })
57 }
58
59 #[inline]
63 pub fn from_u32(value: u32) -> Option<CodePoint> {
64 Some(CodePoint(CodePointInner::new(value)?))
65 }
66
67 #[inline]
71 pub fn from_char(value: char) -> CodePoint {
72 unsafe { CodePoint::from_u32_unchecked(value as u32) }
74 }
75
76 #[inline]
78 pub fn to_u32(&self) -> u32 {
79 self.0.as_inner()
80 }
81
82 #[inline]
84 pub fn to_lead_surrogate(&self) -> Option<u16> {
85 match self.to_u32() {
86 lead @ 0xD800..=0xDBFF => Some(lead as u16),
87 _ => None,
88 }
89 }
90
91 #[inline]
93 pub fn to_trail_surrogate(&self) -> Option<u16> {
94 match self.to_u32() {
95 trail @ 0xDC00..=0xDFFF => Some(trail as u16),
96 _ => None,
97 }
98 }
99
100 #[inline]
104 pub fn to_char(&self) -> Option<char> {
105 match self.to_u32() {
106 0xD800..=0xDFFF => None,
107 valid => Some(unsafe { char::from_u32_unchecked(valid) }),
109 }
110 }
111
112 #[inline]
117 pub fn to_char_lossy(&self) -> char {
118 self.to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
119 }
120}
121
122#[derive(Eq, Ord, PartialEq, PartialOrd)]
127#[repr(transparent)]
128#[rustc_has_incoherent_inherent_impls]
129#[doc(hidden)]
130pub struct Wtf8 {
131 bytes: [u8],
132}
133
134impl AsRef<[u8]> for Wtf8 {
135 #[inline]
136 fn as_ref(&self) -> &[u8] {
137 &self.bytes
138 }
139}
140
141impl fmt::Debug for Wtf8 {
145 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
146 fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
147 use crate::fmt::Write as _;
148 for c in s.chars().flat_map(|c| {
149 c.escape_debug_ext(EscapeDebugExtArgs {
150 escape_grapheme_extended: true,
151 escape_single_quote: false,
152 escape_double_quote: true,
153 })
154 }) {
155 f.write_char(c)?
156 }
157 Ok(())
158 }
159
160 formatter.write_char('"')?;
161 let mut pos = 0;
162 while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
163 write_str_escaped(formatter, unsafe {
165 str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
166 })?;
167 write!(formatter, "\\u{{{:x}}}", surrogate)?;
168 pos = surrogate_pos + 3;
169 }
170
171 write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?;
173 formatter.write_char('"')
174 }
175}
176
177impl fmt::Display for Wtf8 {
180 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
181 let wtf8_bytes = &self.bytes;
182 let mut pos = 0;
183 loop {
184 match self.next_surrogate(pos) {
185 Some((surrogate_pos, _)) => {
186 formatter.write_str(unsafe {
188 str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
189 })?;
190 formatter.write_char(char::REPLACEMENT_CHARACTER)?;
191 pos = surrogate_pos + 3;
192 }
193 None => {
194 let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
196 if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) }
197 }
198 }
199 }
200 }
201}
202
203impl Wtf8 {
204 #[inline]
206 pub fn from_str(value: &str) -> &Wtf8 {
207 unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
209 }
210
211 #[inline]
216 pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
217 unsafe { &*(value as *const [u8] as *const Wtf8) }
219 }
220
221 #[inline]
226 pub unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
227 unsafe { &mut *(value as *mut [u8] as *mut Wtf8) }
229 }
230
231 #[inline]
233 pub fn len(&self) -> usize {
234 self.bytes.len()
235 }
236
237 #[inline]
238 pub fn is_empty(&self) -> bool {
239 self.bytes.is_empty()
240 }
241
242 #[inline]
249 pub fn ascii_byte_at(&self, position: usize) -> u8 {
250 match self.bytes[position] {
251 ascii_byte @ 0x00..=0x7F => ascii_byte,
252 _ => 0xFF,
253 }
254 }
255
256 #[inline]
258 pub fn code_points(&self) -> Wtf8CodePoints<'_> {
259 Wtf8CodePoints { bytes: self.bytes.iter() }
260 }
261
262 #[inline]
264 pub fn as_bytes(&self) -> &[u8] {
265 &self.bytes
266 }
267
268 #[inline]
274 pub fn as_str(&self) -> Result<&str, str::Utf8Error> {
275 str::from_utf8(&self.bytes)
276 }
277
278 #[inline]
285 pub fn encode_wide(&self) -> EncodeWide<'_> {
286 EncodeWide { code_points: self.code_points(), extra: 0 }
287 }
288
289 #[inline]
290 pub fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
291 let mut iter = self.bytes[pos..].iter();
292 loop {
293 let b = *iter.next()?;
294 if b < 0x80 {
295 pos += 1;
296 } else if b < 0xE0 {
297 iter.next();
298 pos += 2;
299 } else if b == 0xED {
300 match (iter.next(), iter.next()) {
301 (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
302 return Some((pos, decode_surrogate(b2, b3)));
303 }
304 _ => pos += 3,
305 }
306 } else if b < 0xF0 {
307 iter.next();
308 iter.next();
309 pos += 3;
310 } else {
311 iter.next();
312 iter.next();
313 iter.next();
314 pos += 4;
315 }
316 }
317 }
318
319 #[inline]
320 pub fn final_lead_surrogate(&self) -> Option<u16> {
321 match self.bytes {
322 [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)),
323 _ => None,
324 }
325 }
326
327 #[inline]
328 pub fn initial_trail_surrogate(&self) -> Option<u16> {
329 match self.bytes {
330 [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)),
331 _ => None,
332 }
333 }
334
335 #[inline]
336 pub fn make_ascii_lowercase(&mut self) {
337 self.bytes.make_ascii_lowercase()
338 }
339
340 #[inline]
341 pub fn make_ascii_uppercase(&mut self) {
342 self.bytes.make_ascii_uppercase()
343 }
344
345 #[inline]
346 pub fn is_ascii(&self) -> bool {
347 self.bytes.is_ascii()
348 }
349
350 #[inline]
351 pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
352 self.bytes.eq_ignore_ascii_case(&other.bytes)
353 }
354}
355
356impl ops::Index<ops::Range<usize>> for Wtf8 {
363 type Output = Wtf8;
364
365 #[inline]
366 fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
367 if range.start <= range.end
368 && self.is_code_point_boundary(range.start)
369 && self.is_code_point_boundary(range.end)
370 {
371 unsafe { slice_unchecked(self, range.start, range.end) }
373 } else {
374 slice_error_fail(self, range.start, range.end)
375 }
376 }
377}
378
379impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
386 type Output = Wtf8;
387
388 #[inline]
389 fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
390 if self.is_code_point_boundary(range.start) {
391 unsafe { slice_unchecked(self, range.start, self.len()) }
393 } else {
394 slice_error_fail(self, range.start, self.len())
395 }
396 }
397}
398
399impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
406 type Output = Wtf8;
407
408 #[inline]
409 fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
410 if self.is_code_point_boundary(range.end) {
411 unsafe { slice_unchecked(self, 0, range.end) }
413 } else {
414 slice_error_fail(self, 0, range.end)
415 }
416 }
417}
418
419impl ops::Index<ops::RangeFull> for Wtf8 {
420 type Output = Wtf8;
421
422 #[inline]
423 fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
424 self
425 }
426}
427
/// Reassembles a surrogate from the second and third bytes of its three-byte
/// encoding.
///
/// The low six bits of each byte are combined under the fixed prefix
/// `0xD800`; the first byte of the sequence is not consulted.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    let high = u16::from(second_byte & 0x3F) << 6;
    let low = u16::from(third_byte & 0x3F);
    0xD800 | high | low
}
433
434impl Wtf8 {
435 #[inline]
437 pub fn is_code_point_boundary(&self, index: usize) -> bool {
438 if index == 0 {
439 return true;
440 }
441 match self.bytes.get(index) {
442 None => index == self.len(),
443 Some(&b) => (b as i8) >= -0x40,
444 }
445 }
446
447 #[track_caller]
455 #[inline]
456 pub fn check_utf8_boundary(&self, index: usize) {
457 if index == 0 {
458 return;
459 }
460 match self.bytes.get(index) {
461 Some(0xED) => (), Some(&b) if (b as i8) >= -0x40 => return,
463 Some(_) => panic!("byte index {index} is not a codepoint boundary"),
464 None if index == self.len() => return,
465 None => panic!("byte index {index} is out of bounds"),
466 }
467 if self.bytes[index + 1] >= 0xA0 {
468 if index >= 3 && self.bytes[index - 3] == 0xED && self.bytes[index - 2] >= 0xA0 {
470 panic!("byte index {index} lies between surrogate codepoints");
471 }
472 }
473 }
474}
475
476#[inline]
478unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
479 unsafe {
481 let len = end - begin;
482 let start = s.as_bytes().as_ptr().add(begin);
483 Wtf8::from_bytes_unchecked(slice::from_raw_parts(start, len))
484 }
485}
486
487#[inline(never)]
489fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
490 assert!(begin <= end);
491 panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary");
492}
493
/// Iterator over the code points of a `Wtf8` string, as returned by
/// `Wtf8::code_points`.
///
/// It wraps a byte iterator over the not-yet-decoded part of the string.
#[doc(hidden)]
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    bytes: slice::Iter<'a, u8>,
}
502
503impl Iterator for Wtf8CodePoints<'_> {
504 type Item = CodePoint;
505
506 #[inline]
507 fn next(&mut self) -> Option<CodePoint> {
508 unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint::from_u32_unchecked(c)) }
510 }
511
512 #[inline]
513 fn size_hint(&self) -> (usize, Option<usize>) {
514 let len = self.bytes.len();
515 (len.saturating_add(3) / 4, Some(len))
516 }
517}
518
519impl fmt::Debug for Wtf8CodePoints<'_> {
520 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
521 f.debug_tuple("Wtf8CodePoints")
522 .field(&unsafe { Wtf8::from_bytes_unchecked(self.bytes.as_slice()) })
524 .finish()
525 }
526}
527
528#[stable(feature = "rust1", since = "1.0.0")]
530#[derive(Clone)]
531#[doc(hidden)]
532pub struct EncodeWide<'a> {
533 code_points: Wtf8CodePoints<'a>,
534 extra: u16,
535}
536
537#[stable(feature = "rust1", since = "1.0.0")]
539impl Iterator for EncodeWide<'_> {
540 type Item = u16;
541
542 #[inline]
543 fn next(&mut self) -> Option<u16> {
544 if self.extra != 0 {
545 let tmp = self.extra;
546 self.extra = 0;
547 return Some(tmp);
548 }
549
550 let mut buf = [0; char::MAX_LEN_UTF16];
551 self.code_points.next().map(|code_point| {
552 let n = encode_utf16_raw(code_point.to_u32(), &mut buf).len();
553 if n == 2 {
554 self.extra = buf[1];
555 }
556 buf[0]
557 })
558 }
559
560 #[inline]
561 fn size_hint(&self) -> (usize, Option<usize>) {
562 let (low, high) = self.code_points.size_hint();
563 let ext = (self.extra != 0) as usize;
564 (low + ext, high.and_then(|n| n.checked_mul(2)).and_then(|n| n.checked_add(ext)))
568 }
569}
570
571#[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")]
572impl FusedIterator for EncodeWide<'_> {}
573
574#[stable(feature = "encode_wide_debug", since = "1.92.0")]
575impl fmt::Debug for EncodeWide<'_> {
576 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
577 struct CodeUnit(u16);
578 impl fmt::Debug for CodeUnit {
579 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
580 match char::from_u32(self.0 as u32) {
588 Some(c) => write!(f, "{c:?}"),
589 None => write!(f, "0x{:04X}", self.0),
590 }
591 }
592 }
593
594 write!(f, "EncodeWide(")?;
595 f.debug_list().entries(self.clone().map(CodeUnit)).finish()?;
596 write!(f, ")")?;
597 Ok(())
598 }
599}
600
601impl Hash for CodePoint {
602 #[inline]
603 fn hash<H: Hasher>(&self, state: &mut H) {
604 self.0.hash(state)
605 }
606}
607
608impl Hash for Wtf8 {
609 #[inline]
610 fn hash<H: Hasher>(&self, state: &mut H) {
611 state.write(&self.bytes);
612 0xfeu8.hash(state)
613 }
614}
615
616#[unstable(feature = "clone_to_uninit", issue = "126799")]
617unsafe impl CloneToUninit for Wtf8 {
618 #[inline]
619 #[cfg_attr(debug_assertions, track_caller)]
620 unsafe fn clone_to_uninit(&self, dst: *mut u8) {
621 unsafe { self.bytes.clone_to_uninit(dst) }
623 }
624}