Skip to content

Commit

Permalink
tafia#191: Add new event StartText which contains text before the f…
Browse files Browse the repository at this point in the history
…irst XML element
  • Loading branch information
Mingun committed Jun 20, 2022
1 parent c8236e4 commit 9f1e655
Show file tree
Hide file tree
Showing 9 changed files with 212 additions and 29 deletions.
2 changes: 2 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
- [#180]: Make `Decoder` struct public. You already had access to it via the
`Reader::decoder()` method, but could not name it in the code. Now the preferred
way to access decoding functionality is via this struct
- [#191]: New event variant `StartText` emitted for bytes that appear before the XML
declaration, a comment or a tag. For streams with a BOM this event will contain the BOM

### Bug Fixes

Expand Down
7 changes: 4 additions & 3 deletions benches/bench.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use criterion::{self, criterion_group, criterion_main, Criterion};
use pretty_assertions::assert_eq;
use quick_xml::events::Event;
use quick_xml::name::QName;
use quick_xml::Reader;

static SAMPLE: &[u8] = include_bytes!("../tests/sample_rss.xml");
Expand Down Expand Up @@ -173,15 +174,15 @@ fn bytes_text_unescaped(c: &mut Criterion) {
/// Benchmarks how fast an individual event is parsed
fn one_event(c: &mut Criterion) {
let mut group = c.benchmark_group("One event");
group.bench_function("Text", |b| {
group.bench_function("StartText", |b| {
let src = "Hello world!".repeat(512 / 12).into_bytes();
let mut buf = Vec::with_capacity(1024);
b.iter(|| {
let mut r = Reader::from_reader(src.as_ref());
let mut nbtxt = criterion::black_box(0);
r.check_end_names(false).check_comments(false);
match r.read_event(&mut buf) {
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::StartText(e)) => nbtxt += e.unescaped().unwrap().len(),
something_else => panic!("Did not expect {:?}", something_else),
};

Expand Down Expand Up @@ -310,7 +311,7 @@ fn attributes(c: &mut Criterion) {
let mut buf = Vec::new();
loop {
match r.read_event(&mut buf) {
Ok(Event::Empty(e)) if e.name() == b"player" => {
Ok(Event::Empty(e)) if e.name() == QName(b"player") => {
for name in ["num", "status", "avg"] {
if let Some(_attr) = e.try_get_attribute(name).unwrap() {
count += 1
Expand Down
8 changes: 8 additions & 0 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -952,6 +952,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
let event = loop {
let e = self.reader.read_event(&mut self.buf)?;
match e {
//TODO: Probably not the best idea to treat StartText as usual text.
// Usually this event will represent a BOM.
// Changing this requires a review of the serde-de::top_level::one_element test
Event::StartText(e) => break Ok(DeEvent::Text(e.into_owned().into())),
Event::Start(e) => break Ok(DeEvent::Start(e.into_owned())),
Event::End(e) => break Ok(DeEvent::End(e.into_owned())),
Event::Text(e) => break Ok(DeEvent::Text(e.into_owned())),
Expand Down Expand Up @@ -992,6 +996,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
loop {
let e = self.reader.read_event_unbuffered()?;
match e {
//TODO: Probably not the best idea to treat StartText as usual text.
// Usually this event will represent a BOM.
// Changing this requires a review of the serde-de::top_level::one_element test
Event::StartText(e) => break Ok(DeEvent::Text(e.into())),
Event::Start(e) => break Ok(DeEvent::Start(e)),
Event::End(e) => break Ok(DeEvent::End(e)),
Event::Text(e) => break Ok(DeEvent::Text(e)),
Expand Down
98 changes: 96 additions & 2 deletions src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,46 @@ use attributes::{Attribute, Attributes};
#[cfg(feature = "serialize")]
use crate::escape::EscapeError;

/// Text that appeared before an XML declaration, a start element or a comment.
///
/// In well-formed XML it can contain only a Byte-Order-Mark (BOM). If this event
/// contains anything other than a BOM, the XML should be considered ill-formed.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct BytesStartText<'a> {
/// The leading text itself, stored as an ordinary text event.
content: BytesText<'a>,
}

impl<'a> BytesStartText<'a> {
    /// Turns this event into one with a `'static` lifetime by converting the
    /// wrapped text content into its owned form.
    pub fn into_owned(self) -> BytesStartText<'static> {
        let Self { content } = self;
        let content = content.into_owned();
        BytesStartText { content }
    }

    /// Consumes the `BytesStartText` event container and hands back the inner
    /// `Cow` held by the wrapped text.
    #[inline]
    pub fn into_inner(self) -> Cow<'a, [u8]> {
        let Self { content } = self;
        content.into_inner()
    }
}

/// Gives read-only access to the wrapped [`BytesText`], so that its methods
/// can be called directly on a `BytesStartText`.
impl<'a> Deref for BytesStartText<'a> {
type Target = BytesText<'a>;

/// Returns a reference to the wrapped text content.
fn deref(&self) -> &Self::Target {
&self.content
}
}

/// Wraps an ordinary text event into a `BytesStartText`, keeping its content as is.
impl<'a> From<BytesText<'a>> for BytesStartText<'a> {
/// Stores `content` as the text of the new `BytesStartText`.
fn from(content: BytesText<'a>) -> Self {
Self { content }
}
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Opening tag data (`Event::Start`), with optional attributes.
///
/// `<name attr="value">`.
Expand Down Expand Up @@ -968,6 +1008,12 @@ impl<'a> Deref for BytesText<'a> {
}
}

/// Unwraps a `BytesStartText` back into the `BytesText` it contains.
impl<'a> From<BytesStartText<'a>> for BytesText<'a> {
/// Moves the wrapped text out of the `BytesStartText` container.
fn from(content: BytesStartText<'a>) -> Self {
content.content
}
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// CDATA content contains unescaped data from the reader. If you want to write them as a text,
Expand Down Expand Up @@ -1092,10 +1138,56 @@ impl<'a> Deref for BytesCData<'a> {
////////////////////////////////////////////////////////////////////////////////////////////////////

/// Event emitted by [`Reader::read_event`].
///
/// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Event<'a> {
/// Text that appeared before the first opening tag or an [XML declaration].
/// [According to the XML standard][std], no text is allowed before the XML
/// declaration. However, if there is a BOM in the stream, some data may be
/// present.
///
/// When this event is generated, it is the very first event emitted by the
/// [`Reader`], and there can be only one such event.
///
/// The [`Writer`] writes the content of this event "as is", without encoding or
/// escaping. If you write it, it should be written first and only once
/// (but the writer does not enforce that).
///
/// # Examples
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use std::borrow::Cow;
/// use quick_xml::Reader;
/// use quick_xml::events::Event;
///
/// // XML in UTF-8 with BOM
/// let xml = b"\xEF\xBB\xBF<?xml version='1.0'?>";
/// let mut reader = Reader::from_bytes(xml);
/// let mut events_processed = 0;
/// loop {
/// match reader.read_event_unbuffered() {
/// Ok(Event::StartText(e)) => {
/// assert_eq!(events_processed, 0);
/// // Content contains BOM
/// assert_eq!(e.into_inner(), Cow::Borrowed(b"\xEF\xBB\xBF"));
/// }
/// Ok(Event::Decl(_)) => {
/// assert_eq!(events_processed, 1);
/// }
/// Ok(Event::Eof) => {
/// assert_eq!(events_processed, 2);
/// break;
/// }
/// e => panic!("Unexpected event {:?}", e),
/// }
/// events_processed += 1;
/// }
/// ```
///
/// [XML declaration]: Event::Decl
/// [std]: https://www.w3.org/TR/xml11/#NT-document
/// [`Writer`]: crate::writer::Writer
StartText(BytesStartText<'a>),
/// Start tag (with attributes) `<tag attr="value">`.
Start(BytesStart<'a>),
/// End tag `</tag>`.
Expand Down Expand Up @@ -1123,6 +1215,7 @@ impl<'a> Event<'a> {
/// buffer used when reading but incurring a new, separate allocation.
pub fn into_owned(self) -> Event<'static> {
match self {
Event::StartText(e) => Event::StartText(e.into_owned()),
Event::Start(e) => Event::Start(e.into_owned()),
Event::End(e) => Event::End(e.into_owned()),
Event::Empty(e) => Event::Empty(e.into_owned()),
Expand All @@ -1142,6 +1235,7 @@ impl<'a> Deref for Event<'a> {

fn deref(&self) -> &[u8] {
match *self {
Event::StartText(ref e) => &*e,
Event::Start(ref e) | Event::Empty(ref e) => &*e,
Event::End(ref e) => &*e,
Event::Text(ref e) => &*e,
Expand Down
91 changes: 76 additions & 15 deletions src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,46 @@ use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult};

use memchr;

/// Possible reader states. The state transition diagram (`true` and `false` show
/// the value of the [`Reader::expand_empty_elements()`] option):
///
/// ```mermaid
/// flowchart LR
/// subgraph _
/// direction LR
///
/// Init -- "(no event)"\nStartText --> Opened
/// Opened -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> Closed
/// Closed -- "#lt;false#gt;\n(no event)"\nText --> Opened
/// end
/// Closed -- "#lt;true#gt;"\nStart --> Empty
/// Empty -- End --> Closed
/// _ -. Eof .-> Exit
/// ```
#[derive(Clone)]
enum TagState {
/// Initial state in which the reader stays after creation. A transition from
/// this state could produce a `StartText`, `Decl`, `Comment` or `Start` event.
/// The next state is always `Opened`. The reader will never return to this
/// state. The event emitted during the transition to `Opened` is a `StartText`
/// if the first symbol is not `<`, otherwise no event is emitted.
Init,
/// State after seeing the `<` symbol. Depending on the next symbol all other
/// events (except `StartText`) could be generated.
///
/// After generating an event the reader moves to the `Closed` state.
Opened,
/// State in which the reader searches for the `<` symbol of a markup. All bytes
/// before that symbol will be returned in the [`Event::Text`] event. After that
/// the reader moves to the `Opened` state.
Closed,
/// This state is used only if the `expand_empty_elements` option is set to `true`.
/// The reader enters this state when it is in a `Closed` state and emits an
/// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
/// after which the reader returns to the `Closed` state.
Empty,
/// Either Eof or Errored.
///
/// The reader enters this state when an `Eof` event is generated or an error
/// occurs. This is the last state; the reader stays in it forever.
Exit,
}

Expand Down Expand Up @@ -126,7 +160,7 @@ impl<R: BufRead> Reader<R> {
reader,
opened_buffer: Vec::new(),
opened_starts: Vec::new(),
tag_state: TagState::Closed,
tag_state: TagState::Init,
expand_empty_elements: false,
trim_text_start: false,
trim_text_end: false,
Expand Down Expand Up @@ -658,7 +692,8 @@ impl<R: BufRead> Reader<R> {
R: XmlSource<'i, B>,
{
let event = match self.tag_state {
TagState::Closed => self.read_until_open(buf),
TagState::Init => self.read_until_open(buf, true),
TagState::Closed => self.read_until_open(buf, false),
TagState::Opened => self.read_until_close(buf),
TagState::Empty => self.close_expanded_empty(),
TagState::Exit => return Ok(Event::Eof),
Expand All @@ -670,9 +705,10 @@ impl<R: BufRead> Reader<R> {
event
}

/// private function to read until '<' is found
/// return a `Text` event
fn read_until_open<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
/// Reads until '<' is found and moves the reader to the `Opened` state.
///
/// Returns a `StartText` event if `first` is `true` and a `Text` event otherwise
fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result<Event<'i>>
where
R: XmlSource<'i, B>,
{
Expand All @@ -691,15 +727,24 @@ impl<R: BufRead> Reader<R> {
.reader
.read_bytes_until(b'<', buf, &mut self.buf_position)
{
Ok(Some(bytes)) if self.trim_text_end => {
// Skip the ending '<
let len = bytes
.iter()
.rposition(|&b| !is_whitespace(b))
.map_or_else(|| bytes.len(), |p| p + 1);
Ok(Event::Text(BytesText::from_escaped(&bytes[..len])))
Ok(Some(bytes)) => {
let content = if self.trim_text_end {
// Skip the ending '<
let len = bytes
.iter()
.rposition(|&b| !is_whitespace(b))
.map_or_else(|| bytes.len(), |p| p + 1);
&bytes[..len]
} else {
bytes
};

Ok(if first {
Event::StartText(BytesText::from_escaped(content).into())
} else {
Event::Text(BytesText::from_escaped(content))
})
}
Ok(Some(bytes)) => Ok(Event::Text(BytesText::from_escaped(bytes))),
Ok(None) => Ok(Event::Eof),
Err(e) => Err(e),
}
Expand Down Expand Up @@ -2250,6 +2295,16 @@ mod test {
use crate::reader::Reader;
use pretty_assertions::assert_eq;

#[test]
fn start_text() {
let mut reader = Reader::from_str("bom");

assert_eq!(
reader.read_event_buffered($buf).unwrap(),
Event::StartText(BytesText::from_escaped(b"bom".as_ref()).into())
);
}

#[test]
fn declaration() {
let mut reader = Reader::from_str("<?xml ?>");
Expand Down Expand Up @@ -2313,9 +2368,15 @@ mod test {
);
}

/// A `Text` event cannot be generated without a preceding event of another type
#[test]
fn text() {
let mut reader = Reader::from_str("text");
let mut reader = Reader::from_str("<tag/>text");

assert_eq!(
reader.read_event_buffered($buf).unwrap(),
Event::Empty(BytesStart::borrowed_name(b"tag"))
);

assert_eq!(
reader.read_event_buffered($buf).unwrap(),
Expand Down
1 change: 1 addition & 0 deletions src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ impl<W: Write> Writer<W> {
pub fn write_event<'a, E: AsRef<Event<'a>>>(&mut self, event: E) -> Result<()> {
let mut next_should_line_break = true;
let result = match *event.as_ref() {
Event::StartText(ref e) => self.write(&e),
Event::Start(ref e) => {
let result = self.write_wrapped(b"<", e, b">");
if let Some(i) = self.indent.as_mut() {
Expand Down
10 changes: 8 additions & 2 deletions tests/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,10 @@ fn fuzz_101() {
fn test_no_trim() {
let mut reader = Reader::from_str(" <tag> text </tag> ");

assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_)));
assert!(matches!(
reader.read_event_unbuffered().unwrap(),
StartText(_)
));
assert!(matches!(reader.read_event_unbuffered().unwrap(), Start(_)));
assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_)));
assert!(matches!(reader.read_event_unbuffered().unwrap(), End(_)));
Expand All @@ -193,7 +196,10 @@ fn test_trim_end() {
let mut reader = Reader::from_str(" <tag> text </tag> ");
reader.trim_text_end(true);

assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_)));
assert!(matches!(
reader.read_event_unbuffered().unwrap(),
StartText(_)
));
assert!(matches!(reader.read_event_unbuffered().unwrap(), Start(_)));
assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_)));
assert!(matches!(reader.read_event_unbuffered().unwrap(), End(_)));
Expand Down
Loading

0 comments on commit 9f1e655

Please sign in to comment.