diff --git a/Changelog.md b/Changelog.md index cf82063d..2deb883c 100644 --- a/Changelog.md +++ b/Changelog.md @@ -20,6 +20,8 @@ - [#180]: Make `Decoder` struct public. You already had access to it via the `Reader::decoder()` method, but could not name it in the code. Now the preferred way to access decoding functionality is via this struct +- [#191]: New event variant `StartText` emitted for bytes before the XML declaration + or a start comment or a tag. For streams with BOM this event will contain a BOM ### Bug Fixes diff --git a/benches/bench.rs b/benches/bench.rs index 389af6f4..29c9f90f 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -1,6 +1,7 @@ use criterion::{self, criterion_group, criterion_main, Criterion}; use pretty_assertions::assert_eq; use quick_xml::events::Event; +use quick_xml::name::QName; use quick_xml::Reader; static SAMPLE: &[u8] = include_bytes!("../tests/sample_rss.xml"); @@ -173,7 +174,7 @@ fn bytes_text_unescaped(c: &mut Criterion) { /// Benchmarks, how fast individual event parsed fn one_event(c: &mut Criterion) { let mut group = c.benchmark_group("One event"); - group.bench_function("Text", |b| { + group.bench_function("StartText", |b| { let src = "Hello world!".repeat(512 / 12).into_bytes(); let mut buf = Vec::with_capacity(1024); b.iter(|| { @@ -181,7 +182,7 @@ fn one_event(c: &mut Criterion) { let mut nbtxt = criterion::black_box(0); r.check_end_names(false).check_comments(false); match r.read_event(&mut buf) { - Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(), + Ok(Event::StartText(e)) => nbtxt += e.unescaped().unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; @@ -310,7 +311,7 @@ fn attributes(c: &mut Criterion) { let mut buf = Vec::new(); loop { match r.read_event(&mut buf) { - Ok(Event::Empty(e)) if e.name() == b"player" => { + Ok(Event::Empty(e)) if e.name() == QName(b"player") => { for name in ["num", "status", "avg"] { if let Some(_attr) = 
e.try_get_attribute(name).unwrap() { count += 1 diff --git a/src/de/mod.rs b/src/de/mod.rs index 3529f329..c7adec3e 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -952,6 +952,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { let event = loop { let e = self.reader.read_event(&mut self.buf)?; match e { + //TODO: Probably not the best idea treat StartText as usual text + // Usually this event will represent a BOM + // Changing this requires review of the serde-de::top_level::one_element test + Event::StartText(e) => break Ok(DeEvent::Text(e.into_owned().into())), Event::Start(e) => break Ok(DeEvent::Start(e.into_owned())), Event::End(e) => break Ok(DeEvent::End(e.into_owned())), Event::Text(e) => break Ok(DeEvent::Text(e.into_owned())), @@ -992,6 +996,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> { loop { let e = self.reader.read_event_unbuffered()?; match e { + //TODO: Probably not the best idea treat StartText as usual text + // Usually this event will represent a BOM + // Changing this requires review of the serde-de::top_level::one_element test + Event::StartText(e) => break Ok(DeEvent::Text(e.into())), Event::Start(e) => break Ok(DeEvent::Start(e)), Event::End(e) => break Ok(DeEvent::End(e)), Event::Text(e) => break Ok(DeEvent::Text(e)), diff --git a/src/events/mod.rs b/src/events/mod.rs index ce6508bb..70460dbc 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -54,6 +54,46 @@ use attributes::{Attribute, Attributes}; #[cfg(feature = "serialize")] use crate::escape::EscapeError; +/// Text that appeared before an XML declaration, a start element or a comment. +/// +/// In well-formed XML it could contain a Byte-Order-Mark (BOM). If this event +/// contains something else except BOM, the XML should be considered ill-formed. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct BytesStartText<'a> { + content: BytesText<'a>, +} + +impl<'a> BytesStartText<'a> { + /// Converts the event into an owned event. 
+ pub fn into_owned(self) -> BytesStartText<'static> { + BytesStartText { + content: self.content.into_owned(), + } + } + + /// Extracts the inner `Cow` from the `BytesStartText` event container. + #[inline] + pub fn into_inner(self) -> Cow<'a, [u8]> { + self.content.into_inner() + } +} + +impl<'a> Deref for BytesStartText<'a> { + type Target = BytesText<'a>; + + fn deref(&self) -> &Self::Target { + &self.content + } +} + +impl<'a> From> for BytesStartText<'a> { + fn from(content: BytesText<'a>) -> Self { + Self { content } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Opening tag data (`Event::Start`), with optional attributes. /// /// ``. @@ -968,6 +1008,12 @@ impl<'a> Deref for BytesText<'a> { } } +impl<'a> From> for BytesText<'a> { + fn from(content: BytesStartText<'a>) -> Self { + content.content + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// CDATA content contains unescaped data from the reader. If you want to write them as a text, @@ -1092,10 +1138,56 @@ impl<'a> Deref for BytesCData<'a> { //////////////////////////////////////////////////////////////////////////////////////////////////// /// Event emitted by [`Reader::read_event`]. -/// -/// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event #[derive(Clone, Debug, Eq, PartialEq)] pub enum Event<'a> { + /// Text that appeared before the first opening tag or an [XML declaration]. + /// [According to the XML standard][std], no text is allowed before the XML + /// declaration. However, if there is a BOM in the stream, some data may be + /// present. + /// + /// When this event is generated, it is the very first event emitted by the + /// [`Reader`], and there can be only one such event. + /// + /// The [`Writer`] writes content of this event "as is" without encoding or + /// escaping.
If you write it, it should be written first and only one time + /// (but writer does not enforce that). + /// + /// # Examples + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use std::borrow::Cow; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// // XML in UTF-8 with BOM + /// let xml = b"\xEF\xBB\xBF"; + /// let mut reader = Reader::from_bytes(xml); + /// let mut events_processed = 0; + /// loop { + /// match reader.read_event_unbuffered() { + /// Ok(Event::StartText(e)) => { + /// assert_eq!(events_processed, 0); + /// // Content contains BOM + /// assert_eq!(e.into_inner(), Cow::Borrowed(b"\xEF\xBB\xBF")); + /// } + /// Ok(Event::Decl(_)) => { + /// assert_eq!(events_processed, 1); + /// } + /// Ok(Event::Eof) => { + /// assert_eq!(events_processed, 2); + /// break; + /// } + /// e => panic!("Unexpected event {:?}", e), + /// } + /// events_processed += 1; + /// } + /// ``` + /// + /// [XML declaration]: Event::Decl + /// [std]: https://www.w3.org/TR/xml11/#NT-document + /// [`Writer`]: crate::writer::Writer + StartText(BytesStartText<'a>), /// Start tag (with attributes) ``. Start(BytesStart<'a>), /// End tag ``. @@ -1123,6 +1215,7 @@ impl<'a> Event<'a> { /// buffer used when reading but incurring a new, separate allocation. 
pub fn into_owned(self) -> Event<'static> { match self { + Event::StartText(e) => Event::StartText(e.into_owned()), Event::Start(e) => Event::Start(e.into_owned()), Event::End(e) => Event::End(e.into_owned()), Event::Empty(e) => Event::Empty(e.into_owned()), @@ -1142,6 +1235,7 @@ impl<'a> Deref for Event<'a> { fn deref(&self) -> &[u8] { match *self { + Event::StartText(ref e) => &*e, Event::Start(ref e) | Event::Empty(ref e) => &*e, Event::End(ref e) => &*e, Event::Text(ref e) => &*e, diff --git a/src/reader.rs b/src/reader.rs index 24209e95..ba8a468f 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -14,12 +14,46 @@ use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult}; use memchr; +/// Possible reader states. The state transition diagram (`true` and `false` show the +/// value of [`Reader::expand_empty_elements()`] option): +/// +/// ```mermaid +/// flowchart LR +/// subgraph _ +/// direction LR +/// +/// Init -- "(no event)"\nStartText --> Opened +/// Opened -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> Closed +/// Closed -- "#lt;false#gt;\n(no event)"\nText --> Opened +/// end +/// Closed -- "#lt;true#gt;"\nStart --> Empty +/// Empty -- End --> Closed +/// _ -. Eof .-> Exit +/// ``` #[derive(Clone)] enum TagState { + /// Initial state in which the reader stays after creation. Transition from that + /// state could produce a `StartText`, `Decl`, `Comment` or `Start` event. + /// The next state is always `Opened`. The reader will never return to this + /// state. The event emitted during transition to `Opened` is a `StartText` + /// if the first symbol is not `<`, otherwise no event is emitted. + Init, + /// State after seeing the `<` symbol. Depending on the next symbol all other + /// events (except `StartText`) could be generated. + /// + /// After generating an event the reader moves to the `Closed` state. Opened, + /// State in which reader searches the `<` symbol of a markup.
All bytes before + /// that symbol will be returned in the [`Event::Text`] event. After that + /// the reader moves to the `Opened` state. Closed, + /// This state is used only if option `expand_empty_elements` is set to `true`. + /// Reader enters this state when it is in a `Closed` state and emits an + /// [`Event::Start`] event. The next event emitted will be an [`Event::End`], + /// after which the reader returns to the `Closed` state. Empty, - /// Either Eof or Errored + /// Reader enters this state when an `Eof` event is generated or an error occurred. + /// This is the last state, the reader stays in it forever. Exit, } @@ -126,7 +160,7 @@ impl Reader { reader, opened_buffer: Vec::new(), opened_starts: Vec::new(), - tag_state: TagState::Closed, + tag_state: TagState::Init, expand_empty_elements: false, trim_text_start: false, trim_text_end: false, @@ -658,7 +692,8 @@ impl Reader { R: XmlSource<'i, B>, { let event = match self.tag_state { - TagState::Closed => self.read_until_open(buf), + TagState::Init => self.read_until_open(buf, true), + TagState::Closed => self.read_until_open(buf, false), TagState::Opened => self.read_until_close(buf), TagState::Empty => self.close_expanded_empty(), TagState::Exit => return Ok(Event::Eof), @@ -670,9 +705,10 @@ impl Reader { event } - /// private function to read until '<' is found - /// return a `Text` event - fn read_until_open<'i, B>(&mut self, buf: B) -> Result> + /// Reads until '<' is found and moves the reader to an `Opened` state.
+ /// + /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise + fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result> where R: XmlSource<'i, B>, { @@ -691,15 +727,24 @@ impl Reader { .reader .read_bytes_until(b'<', buf, &mut self.buf_position) { - Ok(Some(bytes)) if self.trim_text_end => { - // Skip the ending '< - let len = bytes - .iter() - .rposition(|&b| !is_whitespace(b)) - .map_or_else(|| bytes.len(), |p| p + 1); - Ok(Event::Text(BytesText::from_escaped(&bytes[..len]))) + Ok(Some(bytes)) => { + let content = if self.trim_text_end { + // Skip the ending '< + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + &bytes[..len] + } else { + bytes + }; + + Ok(if first { + Event::StartText(BytesText::from_escaped(content).into()) + } else { + Event::Text(BytesText::from_escaped(content)) + }) } - Ok(Some(bytes)) => Ok(Event::Text(BytesText::from_escaped(bytes))), Ok(None) => Ok(Event::Eof), Err(e) => Err(e), } @@ -2250,6 +2295,16 @@ mod test { use crate::reader::Reader; use pretty_assertions::assert_eq; + #[test] + fn start_text() { + let mut reader = Reader::from_str("bom"); + + assert_eq!( + reader.read_event_buffered($buf).unwrap(), + Event::StartText(BytesText::from_escaped(b"bom".as_ref()).into()) + ); + } + #[test] fn declaration() { let mut reader = Reader::from_str(""); @@ -2313,9 +2368,15 @@ mod test { ); } + /// Text event cannot be generated without preceding event of another type #[test] fn text() { - let mut reader = Reader::from_str("text"); + let mut reader = Reader::from_str("text"); + + assert_eq!( + reader.read_event_buffered($buf).unwrap(), + Event::Empty(BytesStart::borrowed_name(b"tag")) + ); assert_eq!( reader.read_event_buffered($buf).unwrap(), diff --git a/src/writer.rs b/src/writer.rs index 9b4f484a..d6fcf960 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -91,6 +91,7 @@ impl Writer { pub fn write_event<'a, E: AsRef>>(&mut self, event: E) -> 
Result<()> { let mut next_should_line_break = true; let result = match *event.as_ref() { + Event::StartText(ref e) => self.write(&e), Event::Start(ref e) => { let result = self.write_wrapped(b"<", e, b">"); if let Some(i) = self.indent.as_mut() { diff --git a/tests/test.rs b/tests/test.rs index 37426391..cf547c8e 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -181,7 +181,10 @@ fn fuzz_101() { fn test_no_trim() { let mut reader = Reader::from_str(" text "); - assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); + assert!(matches!( + reader.read_event_unbuffered().unwrap(), + StartText(_) + )); assert!(matches!(reader.read_event_unbuffered().unwrap(), Start(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), End(_))); @@ -193,7 +196,10 @@ fn test_trim_end() { let mut reader = Reader::from_str(" text "); reader.trim_text_end(true); - assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); + assert!(matches!( + reader.read_event_unbuffered().unwrap(), + StartText(_) + )); assert!(matches!(reader.read_event_unbuffered().unwrap(), Start(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), End(_))); diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index 21f53fb4..ca081079 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -29,7 +29,7 @@ macro_rules! next_eq_content { ($r:expr, $t:tt, $bytes:expr) => { let mut buf = Vec::new(); match $r.read_event(&mut buf).unwrap() { - $t(ref e) if &**e == $bytes => (), + $t(ref e) if e.as_ref() == $bytes => (), e => panic!( "expecting {}({:?}), found {:?}", stringify!($t), @@ -42,6 +42,7 @@ macro_rules! next_eq_content { } macro_rules! 
next_eq { + ($r:expr, StartText, $bytes:expr) => (next_eq_content!($r, StartText, $bytes);); ($r:expr, Start, $bytes:expr) => (next_eq_name!($r, Start, $bytes);); ($r:expr, End, $bytes:expr) => (next_eq_name!($r, End, $bytes);); ($r:expr, Empty, $bytes:expr) => (next_eq_name!($r, Empty, $bytes);); @@ -794,7 +795,9 @@ fn test_unescape_and_decode_without_bom_removes_utf8_bom() { loop { match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&reader).unwrap()), + Ok(Event::StartText(e)) => { + txt.push(e.unescape_and_decode_without_bom(&reader).unwrap()) + } Ok(Event::Eof) => break, _ => (), } @@ -813,12 +816,14 @@ fn test_unescape_and_decode_without_bom_removes_utf16be_bom() { loop { match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()), + Ok(Event::StartText(e)) => { + txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()) + } Ok(Event::Eof) => break, _ => (), } } - assert_eq!(txt[0], ""); + assert_eq!(Some(txt[0].as_ref()), Some("")); } #[test] @@ -832,12 +837,14 @@ fn test_unescape_and_decode_without_bom_removes_utf16le_bom() { loop { match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()), + Ok(Event::StartText(e)) => { + txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()) + } Ok(Event::Eof) => break, _ => (), } } - assert_eq!(txt[0], ""); + assert_eq!(Some(txt[0].as_ref()), Some("")); } #[test] @@ -853,7 +860,9 @@ fn test_unescape_and_decode_without_bom_does_nothing_if_no_bom_exists() { loop { match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()), + Ok(Event::StartText(e)) => { + txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()) + } Ok(Event::Eof) => break, _ => (), } diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index c6eaadc9..a3f66bcb 
100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -461,6 +461,7 @@ fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> { fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String { match opt_event { + Ok((_, Event::StartText(_))) => "StartText".to_string(), Ok((n, Event::Start(ref e))) => { let name = namespace_name(n, e.name(), reader); match make_attrs(e) {