1use super::Utf8Error;
4use crate::intrinsics::const_eval_select;
5
/// Extracts the payload bits carried by the leading byte of a multi-byte
/// sequence, producing the initial value of the code point accumulator.
///
/// The mask is `0x7F >> width`, so a `width` of 2 keeps the low 5 bits,
/// 3 keeps the low 4 bits and 4 keeps the low 3 bits of `byte`.
#[inline]
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    (byte & payload_mask) as u32
}
13
14#[inline]
16const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
17 (ch << 6) | (byte & CONT_MASK) as u32
18}
19
20#[inline]
23pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
24 (byte as i8) < -64
25}
26
27#[unstable(feature = "str_internals", issue = "none")]
34#[inline]
35pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
36 let x = *bytes.next()?;
38 if x < 128 {
39 return Some(x as u32);
40 }
41
42 let init = utf8_first_byte(x, 2);
46 let y = unsafe { *bytes.next().unwrap_unchecked() };
49 let mut ch = utf8_acc_cont_byte(init, y);
50 if x >= 0xE0 {
51 let z = unsafe { *bytes.next().unwrap_unchecked() };
56 let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
57 ch = init << 12 | y_z;
58 if x >= 0xF0 {
59 let w = unsafe { *bytes.next().unwrap_unchecked() };
64 ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
65 }
66 }
67
68 Some(ch)
69}
70
71#[inline]
78pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
79where
80 I: DoubleEndedIterator<Item = &'a u8>,
81{
82 let w = match *bytes.next_back()? {
84 next_byte if next_byte < 128 => return Some(next_byte as u32),
85 back_byte => back_byte,
86 };
87
88 let mut ch;
91 let z = unsafe { *bytes.next_back().unwrap_unchecked() };
94 ch = utf8_first_byte(z, 2);
95 if utf8_is_cont_byte(z) {
96 let y = unsafe { *bytes.next_back().unwrap_unchecked() };
99 ch = utf8_first_byte(y, 3);
100 if utf8_is_cont_byte(y) {
101 let x = unsafe { *bytes.next_back().unwrap_unchecked() };
104 ch = utf8_first_byte(x, 4);
105 ch = utf8_acc_cont_byte(ch, y);
106 }
107 ch = utf8_acc_cont_byte(ch, z);
108 }
109 ch = utf8_acc_cont_byte(ch, w);
110
111 Some(ch)
112}
113
/// Word-sized mask that `contains_nonascii` ANDs against a `usize` read from
/// the input.
///
/// NOTE(review): `usize::repeat_u8` is defined outside this file; presumably
/// it copies `0x80` into every byte of the word, so the mask selects the high
/// bit of each byte — confirm against its definition.
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
115
116#[inline]
118const fn contains_nonascii(x: usize) -> bool {
119 (x & NONASCII_MASK) != 0
120}
121
122#[inline(always)]
125#[rustc_allow_const_fn_unstable(const_eval_select)] pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
127 let mut index = 0;
128 let len = v.len();
129
130 const USIZE_BYTES: usize = size_of::<usize>();
131
132 let ascii_block_size = 2 * USIZE_BYTES;
133 let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
134 let align = {
#[inline]
fn runtime(v: &[u8]) -> usize { { v.as_ptr().align_offset(USIZE_BYTES) } }
#[inline]
const fn compiletime(v: &[u8]) -> usize { let _ = v; { usize::MAX } }
const_eval_select((v,), compiletime, runtime)
}const_eval_select!(
137 @capture { v: &[u8] } -> usize:
138 if const {
139 usize::MAX
140 } else {
141 v.as_ptr().align_offset(USIZE_BYTES)
142 }
143 );
144
145 while index < len {
146 let old_offset = index;
147 macro_rules! err {
148 ($error_len: expr) => {
149 return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len })
150 };
151 }
152
153 macro_rules! next {
154 () => {{
155 index += 1;
156 if index >= len {
158 err!(None)
159 }
160 v[index]
161 }};
162 }
163
164 let first = v[index];
165 if first >= 128 {
166 let w = utf8_char_width(first);
167 match w {
186 2 => {
187 if {
index += 1;
if index >= len {
return Err(Utf8Error { valid_up_to: old_offset, error_len: None })
}
v[index]
}next!() as i8 >= -64 {
188 return Err(Utf8Error { valid_up_to: old_offset, error_len: Some(1) })err!(Some(1))
189 }
190 }
191 3 => {
192 match (first, {
index += 1;
if index >= len {
return Err(Utf8Error { valid_up_to: old_offset, error_len: None })
}
v[index]
}next!()) {
193 (0xE0, 0xA0..=0xBF)
194 | (0xE1..=0xEC, 0x80..=0xBF)
195 | (0xED, 0x80..=0x9F)
196 | (0xEE..=0xEF, 0x80..=0xBF) => {}
197 _ => return Err(Utf8Error { valid_up_to: old_offset, error_len: Some(1) })err!(Some(1)),
198 }
199 if {
index += 1;
if index >= len {
return Err(Utf8Error { valid_up_to: old_offset, error_len: None })
}
v[index]
}next!() as i8 >= -64 {
200 return Err(Utf8Error { valid_up_to: old_offset, error_len: Some(2) })err!(Some(2))
201 }
202 }
203 4 => {
204 match (first, {
index += 1;
if index >= len {
return Err(Utf8Error { valid_up_to: old_offset, error_len: None })
}
v[index]
}next!()) {
205 (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
206 _ => return Err(Utf8Error { valid_up_to: old_offset, error_len: Some(1) })err!(Some(1)),
207 }
208 if {
index += 1;
if index >= len {
return Err(Utf8Error { valid_up_to: old_offset, error_len: None })
}
v[index]
}next!() as i8 >= -64 {
209 return Err(Utf8Error { valid_up_to: old_offset, error_len: Some(2) })err!(Some(2))
210 }
211 if {
index += 1;
if index >= len {
return Err(Utf8Error { valid_up_to: old_offset, error_len: None })
}
v[index]
}next!() as i8 >= -64 {
212 return Err(Utf8Error { valid_up_to: old_offset, error_len: Some(3) })err!(Some(3))
213 }
214 }
215 _ => return Err(Utf8Error { valid_up_to: old_offset, error_len: Some(1) })err!(Some(1)),
216 }
217 index += 1;
218 } else {
219 if align != usize::MAX && align.wrapping_sub(index).is_multiple_of(USIZE_BYTES) {
223 let ptr = v.as_ptr();
224 while index < blocks_end {
225 unsafe {
230 let block = ptr.add(index) as *const usize;
231 let zu = contains_nonascii(*block);
233 let zv = contains_nonascii(*block.add(1));
234 if zu || zv {
235 break;
236 }
237 }
238 index += ascii_block_size;
239 }
240 while index < len && v[index] < 128 {
242 index += 1;
243 }
244 } else {
245 index += 1;
246 }
247 }
248 }
249
250 Ok(())
251}
252
/// Lookup table indexed by a byte value, giving the total length in bytes of
/// a UTF-8 sequence that starts with that byte, or 0 when the byte cannot
/// start a sequence. Each row below covers 16 consecutive byte values; the
/// trailing comment is the high nibble of the row.
const UTF8_CHAR_WIDTH: &[u8; 256] = &[
    // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
];
273
274#[unstable(feature = "str_internals", issue = "none")]
276#[must_use]
277#[inline]
278pub const fn utf8_char_width(b: u8) -> usize {
279 UTF8_CHAR_WIDTH[b as usize] as usize
280}
281
/// Mask of the value bits of a continuation byte: the low six bits
/// (`0b0011_1111`).
const CONT_MASK: u8 = 0x3F;