syntax: fix utf-8 decoder

BurntSushi · BurntSushi · commit ddf6cdbd5dc1 · 2022-10-09T19:07:47.000-04:00
We need to know the length of the next codepoint we want to decode;
otherwise, decoding a naive 'slice[..4]' can fail when the end of that
slice happens to split a later codepoint.
diff --git a/regex-syntax/src/debug.rs b/regex-syntax/src/debug.rs
@@ -75,10 +75,32 @@ impl<'a> core::fmt::Debug for Bytes<'a> {
 ///
 /// This returns `None` if and only if `bytes` is empty.
 pub(crate) fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
+    fn len(byte: u8) -> Option<usize> { // sequence length from lead byte
+        if byte <= 0x7F { // 0xxxxxxx: ASCII, a single byte
+            return Some(1);
+        } else if byte & 0b1100_0000 == 0b1000_0000 { // 10xxxxxx
+            return None; // continuation byte: cannot start a sequence
+        } else if byte <= 0b1101_1111 { // 110xxxxx
+            Some(2) // 0xC0/0xC1 also land here; from_utf8 rejects them
+        } else if byte <= 0b1110_1111 { // 1110xxxx
+            Some(3)
+        } else if byte <= 0b1111_0111 { // 11110xxx
+            Some(4) // 0xF5..=0xF7 also land here; from_utf8 rejects them
+        } else { // 0xF8..=0xFF
+            None // never a valid lead byte in UTF-8
+        }
+    }
+
     if bytes.is_empty() {
         return None;
     }
-    match core::str::from_utf8(&bytes[..core::cmp::min(4, bytes.len())]) {
+    let len = match len(bytes[0]) {
+        None => return Some(Err(bytes[0])),
+        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
+        Some(1) => return Some(Ok(char::from(bytes[0]))),
+        Some(len) => len,
+    };
+    match core::str::from_utf8(&bytes[..len]) {
         Ok(s) => Some(Ok(s.chars().next().unwrap())),
         Err(_) => Some(Err(bytes[0])),
     }