diff --git a/Changelog.md b/Changelog.md index 0dda87b8..c3053257 100644 --- a/Changelog.md +++ b/Changelog.md @@ -134,6 +134,10 @@ scheme, for example, HEX or Base64 - [#421]: All unescaping functions now accepts and returns strings instead of byte slices +- [#423]: All escaping functions now accepts and returns strings instead of byte slices +- [#423]: Removed `BytesText::from_plain` because it internally did escaping of a byte array, + but since now escaping works on strings. Use `BytesText::from_plain_str` instead + ### New Tests - [#9]: Added tests for incorrect nested tags in input @@ -162,6 +166,7 @@ [#416]: https://github.com/tafia/quick-xml/pull/416 [#418]: https://github.com/tafia/quick-xml/pull/418 [#421]: https://github.com/tafia/quick-xml/pull/421 +[#423]: https://github.com/tafia/quick-xml/pull/423 ## 0.23.0 -- 2022-05-08 diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 893f4aa5..8bbe1a67 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -293,26 +293,26 @@ fn escaping(c: &mut Criterion) { group.bench_function("no_chars_to_escape_long", |b| { b.iter(|| { - criterion::black_box(escape(LOREM_IPSUM_TEXT.as_bytes())); + criterion::black_box(escape(LOREM_IPSUM_TEXT)); }) }); group.bench_function("no_chars_to_escape_short", |b| { b.iter(|| { - criterion::black_box(escape(b"just bit of text")); + criterion::black_box(escape("just bit of text")); }) }); group.bench_function("escaped_chars_short", |b| { b.iter(|| { - criterion::black_box(escape(b"age > 72 && age < 21")); - criterion::black_box(escape(b"\"what's that?\"")); + criterion::black_box(escape("age > 72 && age < 21")); + criterion::black_box(escape("\"what's that?\"")); }) }); group.bench_function("escaped_chars_long", |b| { let lorem_ipsum_with_escape_chars = -b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt +"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. & Hac habitasse platea dictumst vestibulum rhoncus est pellentesque. Risus ultricies tristique nulla aliquet enim tortor at. Fermentum odio eu feugiat pretium nibh ipsum. Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu cursus diff --git a/src/de/mod.rs b/src/de/mod.rs index dc6ab4c5..e564e041 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -1405,7 +1405,7 @@ mod tests { br#"item name="hello" source="world.rs""#, 4 )), - Text(BytesText::from_escaped(b"Some text".as_ref())), + Text(BytesText::from_escaped_str("Some text")), End(BytesEnd::borrowed(b"item")), Start(BytesStart::borrowed(b"item2", 5)), End(BytesEnd::borrowed(b"item2")), diff --git a/src/escapei.rs b/src/escapei.rs index b2d7e5fc..21608b87 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -59,7 +59,7 @@ impl std::fmt::Display for EscapeError { impl std::error::Error for EscapeError {} -/// Escapes a `&[u8]` and replaces all xml special characters (`<`, `>`, `&`, `'`, `"`) +/// Escapes an `&str` and replaces all xml special characters (`<`, `>`, `&`, `'`, `"`) /// with their corresponding xml escaped value. /// /// This function performs following replacements: @@ -71,11 +71,11 @@ impl std::error::Error for EscapeError {} /// | `&` | `&` /// | `'` | `'` /// | `"` | `"` -pub fn escape(raw: &[u8]) -> Cow<[u8]> { +pub fn escape(raw: &str) -> Cow { _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"')) } -/// Escapes a `&[u8]` and replaces xml special characters (`<`, `>`, `&`) +/// Escapes an `&str` and replaces xml special characters (`<`, `>`, `&`) /// with their corresponding xml escaped value. /// /// Should only be used for escaping text content. In XML text content, it is allowed @@ -88,24 +88,25 @@ pub fn escape(raw: &[u8]) -> Cow<[u8]> { /// | `<` | `<` /// | `>` | `>` /// | `&` | `&` -pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> { +pub fn partial_escape(raw: &str) -> Cow { _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&')) } -/// Escapes a `&[u8]` and replaces a subset of xml special characters (`<`, `>`, +/// Escapes an `&str` and replaces a subset of xml special characters (`<`, `>`, /// `&`, `'`, `"`) with their corresponding xml escaped value. -fn _escape bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> { +fn _escape bool>(raw: &str, escape_chars: F) -> Cow { + let bytes = raw.as_bytes(); let mut escaped = None; - let mut bytes = raw.iter(); + let mut iter = bytes.iter(); let mut pos = 0; - while let Some(i) = bytes.position(|&b| escape_chars(b)) { + while let Some(i) = iter.position(|&b| escape_chars(b)) { if escaped.is_none() { escaped = Some(Vec::with_capacity(raw.len())); } let escaped = escaped.as_mut().expect("initialized"); let new_pos = pos + i; - escaped.extend_from_slice(&raw[pos..new_pos]); - match raw[new_pos] { + escaped.extend_from_slice(&bytes[pos..new_pos]); + match bytes[new_pos] { b'<' => escaped.extend_from_slice(b"<"), b'>' => escaped.extend_from_slice(b">"), b'\'' => escaped.extend_from_slice(b"'"), @@ -117,10 +118,14 @@ fn _escape bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> { } if let Some(mut escaped) = escaped { - if let Some(raw) = raw.get(pos..) { + if let Some(raw) = bytes.get(pos..) { escaped.extend_from_slice(raw); } - Cow::Owned(escaped) + // SAFETY: we operate on UTF-8 input and search for an one byte chars only, + // so all slices that was put to the `escaped` is a valid UTF-8 encoded strings + // TODO: Can be replaced with `unsafe { String::from_utf8_unchecked() }` + // if unsafe code will be allowed + Cow::Owned(String::from_utf8(escaped).unwrap()) } else { Cow::Borrowed(raw) } @@ -1717,10 +1722,10 @@ fn parse_decimal(bytes: &str) -> Result { #[test] fn test_unescape() { - assert_eq!(&*unescape("test").unwrap(), "test"); - assert_eq!(&*unescape("<test>").unwrap(), ""); - assert_eq!(&*unescape("0").unwrap(), "0"); - assert_eq!(&*unescape("0").unwrap(), "0"); + assert_eq!(unescape("test").unwrap(), Cow::Borrowed("test")); + assert_eq!(unescape("<test>").unwrap(), ""); + assert_eq!(unescape("0").unwrap(), "0"); + assert_eq!(unescape("0").unwrap(), "0"); assert!(unescape("&foo;").is_err()); } @@ -1731,37 +1736,40 @@ fn test_unescape_with() { _ => None, }; - assert_eq!(&*unescape_with("test", custom_entities).unwrap(), "test"); assert_eq!( - &*unescape_with("<test>", custom_entities).unwrap(), + unescape_with("test", custom_entities).unwrap(), + Cow::Borrowed("test") + ); + assert_eq!( + unescape_with("<test>", custom_entities).unwrap(), "" ); - assert_eq!(&*unescape_with("0", custom_entities).unwrap(), "0"); - assert_eq!(&*unescape_with("0", custom_entities).unwrap(), "0"); - assert_eq!(&*unescape_with("&foo;", custom_entities).unwrap(), "BAR"); + assert_eq!(unescape_with("0", custom_entities).unwrap(), "0"); + assert_eq!(unescape_with("0", custom_entities).unwrap(), "0"); + assert_eq!(unescape_with("&foo;", custom_entities).unwrap(), "BAR"); assert!(unescape_with("&fop;", custom_entities).is_err()); } #[test] fn test_escape() { - assert_eq!(&*escape(b"test"), b"test"); - assert_eq!(&*escape(b""), b"<test>"); - assert_eq!(&*escape(b"\"a\"bc"), b""a"bc"); - assert_eq!(&*escape(b"\"a\"b&c"), b""a"b&c"); + assert_eq!(escape("test"), Cow::Borrowed("test")); + assert_eq!(escape(""), "<test>"); + assert_eq!(escape("\"a\"bc"), ""a"bc"); + assert_eq!(escape("\"a\"b&c"), ""a"b&c"); assert_eq!( - &*escape(b"prefix_\"a\"b&<>c"), - "prefix_"a"b&<>c".as_bytes() + escape("prefix_\"a\"b&<>c"), + "prefix_"a"b&<>c" ); } #[test] fn test_partial_escape() { - assert_eq!(&*partial_escape(b"test"), b"test"); - assert_eq!(&*partial_escape(b""), b"<test>"); - assert_eq!(&*partial_escape(b"\"a\"bc"), b"\"a\"bc"); - assert_eq!(&*partial_escape(b"\"a\"b&c"), b"\"a\"b&c"); + assert_eq!(partial_escape("test"), Cow::Borrowed("test")); + assert_eq!(partial_escape(""), "<test>"); + assert_eq!(partial_escape("\"a\"bc"), "\"a\"bc"); + assert_eq!(partial_escape("\"a\"b&c"), "\"a\"b&c"); assert_eq!( - &*partial_escape(b"prefix_\"a\"b&<>c"), - "prefix_\"a\"b&<>c".as_bytes() + partial_escape("prefix_\"a\"b&<>c"), + "prefix_\"a\"b&<>c" ); } diff --git a/src/events/attributes.rs b/src/events/attributes.rs index d1f1e169..51168d5e 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -143,7 +143,10 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> { fn from(val: (&'a str, &'a str)) -> Attribute<'a> { Attribute { key: QName(val.0.as_bytes()), - value: escape(val.1.as_bytes()), + value: match escape(val.1) { + Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()), + Cow::Owned(s) => Cow::Owned(s.into_bytes()), + }, } } } diff --git a/src/events/mod.rs b/src/events/mod.rs index b6b6b88f..b2672edf 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -681,15 +681,6 @@ impl<'a> BytesText<'a> { } } - /// Creates a new `BytesText` from a byte sequence. The byte sequence is - /// expected not to be escaped. - #[inline] - pub fn from_plain(content: &'a [u8]) -> Self { - Self { - content: escape(content), - } - } - /// Creates a new `BytesText` from an escaped string. #[inline] pub fn from_escaped_str>>(content: C) -> Self { @@ -703,7 +694,12 @@ impl<'a> BytesText<'a> { /// be escaped. #[inline] pub fn from_plain_str(content: &'a str) -> Self { - Self::from_plain(content.as_bytes()) + Self { + content: match escape(content) { + Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()), + Cow::Owned(s) => Cow::Owned(s.into_bytes()), + }, + } } /// Ensures that all data is owned to extend the object's lifetime if @@ -901,11 +897,13 @@ impl<'a> BytesCData<'a> { /// | `&` | `&` /// | `'` | `'` /// | `"` | `"` - pub fn escape(self) -> BytesText<'a> { - BytesText::from_escaped(match escape(&self.content) { + pub fn escape(self, decoder: Decoder) -> Result> { + let decoded = self.decode(decoder)?; + Ok(BytesText::from_escaped(match escape(&decoded) { + // Because result is borrowed, no replacements was done and we can use original content Cow::Borrowed(_) => self.content, - Cow::Owned(escaped) => Cow::Owned(escaped), - }) + Cow::Owned(escaped) => Cow::Owned(escaped.into_bytes()), + })) } /// Converts this CDATA content to an escaped version, that can be written @@ -921,15 +919,16 @@ impl<'a> BytesCData<'a> { /// | `<` | `<` /// | `>` | `>` /// | `&` | `&` - pub fn partial_escape(self) -> BytesText<'a> { - BytesText::from_escaped(match partial_escape(&self.content) { + pub fn partial_escape(self, decoder: Decoder) -> Result> { + let decoded = self.decode(decoder)?; + Ok(BytesText::from_escaped(match partial_escape(&decoded) { + // Because result is borrowed, no replacements was done and we can use original content Cow::Borrowed(_) => self.content, - Cow::Owned(escaped) => Cow::Owned(escaped), - }) + Cow::Owned(escaped) => Cow::Owned(escaped.into_bytes()), + })) } /// Gets content of this text buffer in the specified encoding - #[cfg(feature = "serialize")] pub(crate) fn decode(&self, decoder: Decoder) -> Result> { Ok(match &self.content { Cow::Borrowed(bytes) => decoder.decode(bytes)?, diff --git a/src/reader.rs b/src/reader.rs index 19d6a352..bb496932 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -2563,7 +2563,7 @@ mod test { assert_eq!( reader.read_event_impl($buf).unwrap(), - Event::StartText(BytesText::from_escaped(b"bom".as_ref()).into()) + Event::StartText(BytesText::from_escaped_str("bom").into()) ); } @@ -2583,7 +2583,7 @@ mod test { assert_eq!( reader.read_event_impl($buf).unwrap(), - Event::DocType(BytesText::from_escaped(b"x".as_ref())) + Event::DocType(BytesText::from_escaped_str("x")) ); } @@ -2593,7 +2593,7 @@ mod test { assert_eq!( reader.read_event_impl($buf).unwrap(), - Event::PI(BytesText::from_escaped(b"xml-stylesheet".as_ref())) + Event::PI(BytesText::from_escaped_str("xml-stylesheet")) ); } @@ -2642,7 +2642,7 @@ mod test { assert_eq!( reader.read_event_impl($buf).unwrap(), - Event::Text(BytesText::from_escaped(b"text".as_ref())) + Event::Text(BytesText::from_escaped_str("text")) ); } @@ -2662,7 +2662,7 @@ mod test { assert_eq!( reader.read_event_impl($buf).unwrap(), - Event::Comment(BytesText::from_escaped(b"".as_ref())) + Event::Comment(BytesText::from_escaped_str("")) ); } diff --git a/src/se/mod.rs b/src/se/mod.rs index bb737802..896f0aac 100644 --- a/src/se/mod.rs +++ b/src/se/mod.rs @@ -100,11 +100,11 @@ impl<'r, W: Write> Serializer<'r, W> { value: P, escaped: bool, ) -> Result<(), DeError> { - let value = value.to_string().into_bytes(); + let value = value.to_string(); let event = if escaped { - BytesText::from_escaped(value) + BytesText::from_escaped_str(&value) } else { - BytesText::from_plain(&value) + BytesText::from_plain_str(&value) }; self.writer.write_event(Event::Text(event))?; Ok(()) diff --git a/src/writer.rs b/src/writer.rs index ca29954d..54579808 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -422,7 +422,7 @@ mod indentation { let start = BytesStart::borrowed_name(name) .with_attributes(vec![("attr1", "value1"), ("attr2", "value2")].into_iter()); let end = BytesEnd::borrowed(name); - let text = BytesText::from_plain(b"text"); + let text = BytesText::from_plain_str("text"); writer .write_event(Event::Start(start)) @@ -449,7 +449,7 @@ mod indentation { let start = BytesStart::borrowed_name(name) .with_attributes(vec![("attr1", "value1"), ("attr2", "value2")].into_iter()); let end = BytesEnd::borrowed(name); - let text = BytesText::from_plain(b"text"); + let text = BytesText::from_plain_str("text"); let inner = BytesStart::borrowed_name(b"inner"); writer