Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enforce escape functions to operate over UTF-8 #423

Merged
merged 4 commits into from
Jul 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@
scheme, for example, HEX or Base64
- [#421]: All unescaping functions now accepts and returns strings instead of byte slices

- [#423]: All escaping functions now accepts and returns strings instead of byte slices
- [#423]: Removed `BytesText::from_plain` because it internally did escaping of a byte array,
but since now escaping works on strings. Use `BytesText::from_plain_str` instead

### New Tests

- [#9]: Added tests for incorrect nested tags in input
Expand Down Expand Up @@ -162,6 +166,7 @@
[#416]: https://github.com/tafia/quick-xml/pull/416
[#418]: https://github.com/tafia/quick-xml/pull/418
[#421]: https://github.com/tafia/quick-xml/pull/421
[#423]: https://github.com/tafia/quick-xml/pull/423

## 0.23.0 -- 2022-05-08

Expand Down
10 changes: 5 additions & 5 deletions benches/microbenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -293,26 +293,26 @@ fn escaping(c: &mut Criterion) {

group.bench_function("no_chars_to_escape_long", |b| {
b.iter(|| {
criterion::black_box(escape(LOREM_IPSUM_TEXT.as_bytes()));
criterion::black_box(escape(LOREM_IPSUM_TEXT));
})
});

group.bench_function("no_chars_to_escape_short", |b| {
b.iter(|| {
criterion::black_box(escape(b"just bit of text"));
criterion::black_box(escape("just bit of text"));
})
});

group.bench_function("escaped_chars_short", |b| {
b.iter(|| {
criterion::black_box(escape(b"age > 72 && age < 21"));
criterion::black_box(escape(b"\"what's that?\""));
criterion::black_box(escape("age > 72 && age < 21"));
criterion::black_box(escape("\"what's that?\""));
})
});

group.bench_function("escaped_chars_long", |b| {
let lorem_ipsum_with_escape_chars =
b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
ut labore et dolore magna aliqua. & Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
Risus ultricies tristique nulla aliquet enim tortor at. Fermentum odio eu feugiat pretium nibh ipsum.
Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu cursus
Expand Down
2 changes: 1 addition & 1 deletion src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1405,7 +1405,7 @@ mod tests {
br#"item name="hello" source="world.rs""#,
4
)),
Text(BytesText::from_escaped(b"Some text".as_ref())),
Text(BytesText::from_escaped_str("Some text")),
End(BytesEnd::borrowed(b"item")),
Start(BytesStart::borrowed(b"item2", 5)),
End(BytesEnd::borrowed(b"item2")),
Expand Down
74 changes: 41 additions & 33 deletions src/escapei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ impl std::fmt::Display for EscapeError {

impl std::error::Error for EscapeError {}

/// Escapes a `&[u8]` and replaces all xml special characters (`<`, `>`, `&`, `'`, `"`)
/// Escapes an `&str` and replaces all xml special characters (`<`, `>`, `&`, `'`, `"`)
/// with their corresponding xml escaped value.
///
/// This function performs following replacements:
Expand All @@ -71,11 +71,11 @@ impl std::error::Error for EscapeError {}
/// | `&` | `&amp;`
/// | `'` | `&apos;`
/// | `"` | `&quot;`
pub fn escape(raw: &[u8]) -> Cow<[u8]> {
pub fn escape(raw: &str) -> Cow<str> {
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
}

/// Escapes a `&[u8]` and replaces xml special characters (`<`, `>`, `&`)
/// Escapes an `&str` and replaces xml special characters (`<`, `>`, `&`)
/// with their corresponding xml escaped value.
///
/// Should only be used for escaping text content. In XML text content, it is allowed
Expand All @@ -88,24 +88,25 @@ pub fn escape(raw: &[u8]) -> Cow<[u8]> {
/// | `<` | `&lt;`
/// | `>` | `&gt;`
/// | `&` | `&amp;`
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
pub fn partial_escape(raw: &str) -> Cow<str> {
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
}

/// Escapes a `&[u8]` and replaces a subset of xml special characters (`<`, `>`,
/// Escapes an `&str` and replaces a subset of xml special characters (`<`, `>`,
/// `&`, `'`, `"`) with their corresponding xml escaped value.
fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
fn _escape<F: Fn(u8) -> bool>(raw: &str, escape_chars: F) -> Cow<str> {
let bytes = raw.as_bytes();
let mut escaped = None;
let mut bytes = raw.iter();
let mut iter = bytes.iter();
let mut pos = 0;
while let Some(i) = bytes.position(|&b| escape_chars(b)) {
while let Some(i) = iter.position(|&b| escape_chars(b)) {
if escaped.is_none() {
escaped = Some(Vec::with_capacity(raw.len()));
}
let escaped = escaped.as_mut().expect("initialized");
let new_pos = pos + i;
escaped.extend_from_slice(&raw[pos..new_pos]);
match raw[new_pos] {
escaped.extend_from_slice(&bytes[pos..new_pos]);
match bytes[new_pos] {
b'<' => escaped.extend_from_slice(b"&lt;"),
b'>' => escaped.extend_from_slice(b"&gt;"),
b'\'' => escaped.extend_from_slice(b"&apos;"),
Expand All @@ -117,10 +118,14 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
}

if let Some(mut escaped) = escaped {
if let Some(raw) = raw.get(pos..) {
if let Some(raw) = bytes.get(pos..) {
escaped.extend_from_slice(raw);
}
Cow::Owned(escaped)
// SAFETY: we operate on UTF-8 input and search for an one byte chars only,
// so all slices that was put to the `escaped` is a valid UTF-8 encoded strings
// TODO: Can be replaced with `unsafe { String::from_utf8_unchecked() }`
// if unsafe code will be allowed
Cow::Owned(String::from_utf8(escaped).unwrap())
} else {
Cow::Borrowed(raw)
}
Expand Down Expand Up @@ -1717,10 +1722,10 @@ fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {

#[test]
fn test_unescape() {
assert_eq!(&*unescape("test").unwrap(), "test");
assert_eq!(&*unescape("&lt;test&gt;").unwrap(), "<test>");
assert_eq!(&*unescape("&#x30;").unwrap(), "0");
assert_eq!(&*unescape("&#48;").unwrap(), "0");
assert_eq!(unescape("test").unwrap(), Cow::Borrowed("test"));
assert_eq!(unescape("&lt;test&gt;").unwrap(), "<test>");
assert_eq!(unescape("&#x30;").unwrap(), "0");
assert_eq!(unescape("&#48;").unwrap(), "0");
assert!(unescape("&foo;").is_err());
}

Expand All @@ -1731,37 +1736,40 @@ fn test_unescape_with() {
_ => None,
};

assert_eq!(&*unescape_with("test", custom_entities).unwrap(), "test");
assert_eq!(
&*unescape_with("&lt;test&gt;", custom_entities).unwrap(),
unescape_with("test", custom_entities).unwrap(),
Cow::Borrowed("test")
);
assert_eq!(
unescape_with("&lt;test&gt;", custom_entities).unwrap(),
"<test>"
);
assert_eq!(&*unescape_with("&#x30;", custom_entities).unwrap(), "0");
assert_eq!(&*unescape_with("&#48;", custom_entities).unwrap(), "0");
assert_eq!(&*unescape_with("&foo;", custom_entities).unwrap(), "BAR");
assert_eq!(unescape_with("&#x30;", custom_entities).unwrap(), "0");
assert_eq!(unescape_with("&#48;", custom_entities).unwrap(), "0");
assert_eq!(unescape_with("&foo;", custom_entities).unwrap(), "BAR");
assert!(unescape_with("&fop;", custom_entities).is_err());
}

#[test]
fn test_escape() {
assert_eq!(&*escape(b"test"), b"test");
assert_eq!(&*escape(b"<test>"), b"&lt;test&gt;");
assert_eq!(&*escape(b"\"a\"bc"), b"&quot;a&quot;bc");
assert_eq!(&*escape(b"\"a\"b&c"), b"&quot;a&quot;b&amp;c");
assert_eq!(escape("test"), Cow::Borrowed("test"));
assert_eq!(escape("<test>"), "&lt;test&gt;");
assert_eq!(escape("\"a\"bc"), "&quot;a&quot;bc");
assert_eq!(escape("\"a\"b&c"), "&quot;a&quot;b&amp;c");
assert_eq!(
&*escape(b"prefix_\"a\"b&<>c"),
"prefix_&quot;a&quot;b&amp;&lt;&gt;c".as_bytes()
escape("prefix_\"a\"b&<>c"),
"prefix_&quot;a&quot;b&amp;&lt;&gt;c"
);
}

#[test]
fn test_partial_escape() {
assert_eq!(&*partial_escape(b"test"), b"test");
assert_eq!(&*partial_escape(b"<test>"), b"&lt;test&gt;");
assert_eq!(&*partial_escape(b"\"a\"bc"), b"\"a\"bc");
assert_eq!(&*partial_escape(b"\"a\"b&c"), b"\"a\"b&amp;c");
assert_eq!(partial_escape("test"), Cow::Borrowed("test"));
assert_eq!(partial_escape("<test>"), "&lt;test&gt;");
assert_eq!(partial_escape("\"a\"bc"), "\"a\"bc");
assert_eq!(partial_escape("\"a\"b&c"), "\"a\"b&amp;c");
assert_eq!(
&*partial_escape(b"prefix_\"a\"b&<>c"),
"prefix_\"a\"b&amp;&lt;&gt;c".as_bytes()
partial_escape("prefix_\"a\"b&<>c"),
"prefix_\"a\"b&amp;&lt;&gt;c"
);
}
5 changes: 4 additions & 1 deletion src/events/attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,10 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
fn from(val: (&'a str, &'a str)) -> Attribute<'a> {
Attribute {
key: QName(val.0.as_bytes()),
value: escape(val.1.as_bytes()),
value: match escape(val.1) {
Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
Cow::Owned(s) => Cow::Owned(s.into_bytes()),
},
}
}
}
Expand Down
37 changes: 18 additions & 19 deletions src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -681,15 +681,6 @@ impl<'a> BytesText<'a> {
}
}

/// Creates a new `BytesText` from a byte sequence. The byte sequence is
/// expected not to be escaped.
#[inline]
pub fn from_plain(content: &'a [u8]) -> Self {
Self {
content: escape(content),
}
}

/// Creates a new `BytesText` from an escaped string.
#[inline]
pub fn from_escaped_str<C: Into<Cow<'a, str>>>(content: C) -> Self {
Expand All @@ -703,7 +694,12 @@ impl<'a> BytesText<'a> {
/// be escaped.
#[inline]
pub fn from_plain_str(content: &'a str) -> Self {
Self::from_plain(content.as_bytes())
Self {
content: match escape(content) {
Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
Cow::Owned(s) => Cow::Owned(s.into_bytes()),
},
}
}

/// Ensures that all data is owned to extend the object's lifetime if
Expand Down Expand Up @@ -901,11 +897,13 @@ impl<'a> BytesCData<'a> {
/// | `&` | `&amp;`
/// | `'` | `&apos;`
/// | `"` | `&quot;`
pub fn escape(self) -> BytesText<'a> {
BytesText::from_escaped(match escape(&self.content) {
pub fn escape(self, decoder: Decoder) -> Result<BytesText<'a>> {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because BytesText::escape and BytesText::partial_escape introduced only in new 0.24 release, the changes in their signatures are not reflected in the changelog. Moreover, the next PRs will change that signature back

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

let decoded = self.decode(decoder)?;
Ok(BytesText::from_escaped(match escape(&decoded) {
// Because result is borrowed, no replacements was done and we can use original content
Cow::Borrowed(_) => self.content,
Cow::Owned(escaped) => Cow::Owned(escaped),
})
Cow::Owned(escaped) => Cow::Owned(escaped.into_bytes()),
}))
}

/// Converts this CDATA content to an escaped version, that can be written
Expand All @@ -921,15 +919,16 @@ impl<'a> BytesCData<'a> {
/// | `<` | `&lt;`
/// | `>` | `&gt;`
/// | `&` | `&amp;`
pub fn partial_escape(self) -> BytesText<'a> {
BytesText::from_escaped(match partial_escape(&self.content) {
pub fn partial_escape(self, decoder: Decoder) -> Result<BytesText<'a>> {
let decoded = self.decode(decoder)?;
Ok(BytesText::from_escaped(match partial_escape(&decoded) {
// Because result is borrowed, no replacements was done and we can use original content
Cow::Borrowed(_) => self.content,
Cow::Owned(escaped) => Cow::Owned(escaped),
})
Cow::Owned(escaped) => Cow::Owned(escaped.into_bytes()),
}))
}

/// Gets content of this text buffer in the specified encoding
#[cfg(feature = "serialize")]
pub(crate) fn decode(&self, decoder: Decoder) -> Result<Cow<'a, str>> {
Ok(match &self.content {
Cow::Borrowed(bytes) => decoder.decode(bytes)?,
Expand Down
10 changes: 5 additions & 5 deletions src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2563,7 +2563,7 @@ mod test {

assert_eq!(
reader.read_event_impl($buf).unwrap(),
Event::StartText(BytesText::from_escaped(b"bom".as_ref()).into())
Event::StartText(BytesText::from_escaped_str("bom").into())
);
}

Expand All @@ -2583,7 +2583,7 @@ mod test {

assert_eq!(
reader.read_event_impl($buf).unwrap(),
Event::DocType(BytesText::from_escaped(b"x".as_ref()))
Event::DocType(BytesText::from_escaped_str("x"))
);
}

Expand All @@ -2593,7 +2593,7 @@ mod test {

assert_eq!(
reader.read_event_impl($buf).unwrap(),
Event::PI(BytesText::from_escaped(b"xml-stylesheet".as_ref()))
Event::PI(BytesText::from_escaped_str("xml-stylesheet"))
);
}

Expand Down Expand Up @@ -2642,7 +2642,7 @@ mod test {

assert_eq!(
reader.read_event_impl($buf).unwrap(),
Event::Text(BytesText::from_escaped(b"text".as_ref()))
Event::Text(BytesText::from_escaped_str("text"))
);
}

Expand All @@ -2662,7 +2662,7 @@ mod test {

assert_eq!(
reader.read_event_impl($buf).unwrap(),
Event::Comment(BytesText::from_escaped(b"".as_ref()))
Event::Comment(BytesText::from_escaped_str(""))
);
}

Expand Down
6 changes: 3 additions & 3 deletions src/se/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,11 @@ impl<'r, W: Write> Serializer<'r, W> {
value: P,
escaped: bool,
) -> Result<(), DeError> {
let value = value.to_string().into_bytes();
let value = value.to_string();
let event = if escaped {
BytesText::from_escaped(value)
BytesText::from_escaped_str(&value)
} else {
BytesText::from_plain(&value)
BytesText::from_plain_str(&value)
};
self.writer.write_event(Event::Text(event))?;
Ok(())
Expand Down
4 changes: 2 additions & 2 deletions src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ mod indentation {
let start = BytesStart::borrowed_name(name)
.with_attributes(vec![("attr1", "value1"), ("attr2", "value2")].into_iter());
let end = BytesEnd::borrowed(name);
let text = BytesText::from_plain(b"text");
let text = BytesText::from_plain_str("text");

writer
.write_event(Event::Start(start))
Expand All @@ -449,7 +449,7 @@ mod indentation {
let start = BytesStart::borrowed_name(name)
.with_attributes(vec![("attr1", "value1"), ("attr2", "value2")].into_iter());
let end = BytesEnd::borrowed(name);
let text = BytesText::from_plain(b"text");
let text = BytesText::from_plain_str("text");
let inner = BytesStart::borrowed_name(b"inner");

writer
Expand Down