Skip to content

Commit ddf6cdb

Browse files
committed
syntax: fix utf-8 decoder
We need to know the length of the next codepoint we want to decode; otherwise, it's possible for a naive 'slice[..4]' to fail if the end of the slice happens to split a codepoint.
1 parent f3cd3fe commit ddf6cdb

File tree

1 file changed

+23
-1
lines changed

1 file changed

+23
-1
lines changed

regex-syntax/src/debug.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,32 @@ impl<'a> core::fmt::Debug for Bytes<'a> {
7575
///
7676
/// This returns `None` if and only if `bytes` is empty.
7777
pub(crate) fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
78+
/// Returns the total length in bytes of the UTF-8 sequence that the
/// leading byte `byte` introduces, or `None` if `byte` cannot begin a
/// sequence.
///
/// `None` is returned for continuation bytes (`0b10xx_xxxx`) and for any
/// byte above `0b1111_0111`. Only the bit pattern of the leading byte is
/// inspected, so a `Some` result does not by itself mean the sequence is
/// valid UTF-8; the caller must still validate the full sequence.
fn len(byte: u8) -> Option<usize> {
    // The arms cover every `u8` value, so the compiler checks that no
    // class of leading byte is forgotten.
    match byte {
        // ASCII: a complete one-byte sequence.
        0b0000_0000..=0b0111_1111 => Some(1),
        // Continuation byte: never valid in leading position.
        0b1000_0000..=0b1011_1111 => None,
        0b1100_0000..=0b1101_1111 => Some(2),
        0b1110_0000..=0b1110_1111 => Some(3),
        0b1111_0000..=0b1111_0111 => Some(4),
        // Prefixes of five or more bytes are not part of UTF-8.
        0b1111_1000..=0b1111_1111 => None,
    }
}
93+
7894
if bytes.is_empty() {
7995
return None;
8096
}
81-
match core::str::from_utf8(&bytes[..core::cmp::min(4, bytes.len())]) {
97+
let len = match len(bytes[0]) {
98+
None => return Some(Err(bytes[0])),
99+
Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
100+
Some(1) => return Some(Ok(char::from(bytes[0]))),
101+
Some(len) => len,
102+
};
103+
match core::str::from_utf8(&bytes[..len]) {
82104
Ok(s) => Some(Ok(s.chars().next().unwrap())),
83105
Err(_) => Some(Err(bytes[0])),
84106
}

0 commit comments

Comments
 (0)