Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ElementParser and probably slightly increase performance #754

Merged
merged 9 commits into from
Jun 9, 2024
2 changes: 2 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ resolve predefined entities.
- `quick_xml::escape::resolve_xml_entity`
- `quick_xml::escape::resolve_html5_entity`
- [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`.
- [#754]: Added parser for elements: `quick_xml::reader::ElementParser`.

### Bug Fixes

Expand Down Expand Up @@ -101,6 +102,7 @@ resolve predefined entities.
[#743]: https://github.com/tafia/quick-xml/pull/743
[#748]: https://github.com/tafia/quick-xml/pull/748
[#753]: https://github.com/tafia/quick-xml/pull/753
[#754]: https://github.com/tafia/quick-xml/pull/754
[`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html
[`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html
[`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html
Expand Down
2 changes: 1 addition & 1 deletion src/reader/async_tokio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::events::Event;
use crate::name::{QName, ResolveResult};
use crate::reader::buffered_reader::impl_buffered_source;
use crate::reader::{
is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span,
is_whitespace, BangType, ElementParser, NsReader, ParseState, Parser, PiParser, Reader, Span,
};

/// A struct for read XML asynchronously from an [`AsyncBufRead`].
Expand Down
118 changes: 39 additions & 79 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@ use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::path::Path;

use crate::errors::{Error, Result, SyntaxError};
use crate::errors::{Error, Result};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource};

macro_rules! impl_buffered_source {
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
#[cfg(not(feature = "encoding"))]
#[inline]
$($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
use crate::encoding::UTF8_BOM;

Expand All @@ -31,6 +32,7 @@ macro_rules! impl_buffered_source {
}

#[cfg(feature = "encoding")]
#[inline]
$($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Expand Down Expand Up @@ -91,49 +93,50 @@ macro_rules! impl_buffered_source {
Ok((&buf[start..], done))
}

$($async)? fn read_pi $(<$lf>)? (
#[inline]
$($async)? fn read_with<$($lf,)? P: Parser>(
&mut self,
mut parser: P,
buf: &'b mut Vec<u8>,
position: &mut usize,
) -> Result<(&'b [u8], bool)> {
let mut parser = super::PiParser::default();

) -> Result<&'b [u8]> {
let mut read = 0;
let mut done = false;
let start = buf.len();
while !done {
let used = {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(n) => n,
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return Err(Error::Io(e.into()));
}
};

match parser.feed(available) {
Some(i) => {
// We does not include `>` in data
buf.extend_from_slice(&available[..i - 1]);
done = true;
i
}
None => {
buf.extend_from_slice(available);
available.len()
}
loop {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(n) => n,
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return Err(Error::Io(e.into()));
}
};

if let Some(i) = parser.feed(available) {
buf.extend_from_slice(&available[..i]);

// +1 for `>` which we do not include
self $(.$reader)? .consume(i + 1);
read += i + 1;

*position += read;
return Ok(&buf[start..]);
}

// The `>` symbol not yet found, continue reading
buf.extend_from_slice(available);

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
}
*position += read;

Ok((&buf[start..], done))
*position += read;
Err(Error::Syntax(P::eof_error()))
}

#[inline]
$($async)? fn read_bang_element $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
Expand Down Expand Up @@ -185,49 +188,6 @@ macro_rules! impl_buffered_source {
}

#[inline]
$($async)? fn read_element $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
position: &mut usize,
) -> Result<&'b [u8]> {
let mut state = ReadElementState::Elem;
let mut read = 0;

let start = buf.len();
loop {
match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(available) => {
if let Some((consumed, used)) = state.change(available) {
buf.extend_from_slice(consumed);

self $(.$reader)? .consume(used);
read += used;

// Position now just after the `>` symbol
*position += read;
return Ok(&buf[start..]);
} else {
// The `>` symbol not yet found, continue reading
buf.extend_from_slice(available);

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
}
}
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return Err(Error::Io(e.into()));
}
};
}

*position += read;
Err(Error::Syntax(SyntaxError::UnclosedTag))
}

$($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Expand All @@ -247,25 +207,25 @@ macro_rules! impl_buffered_source {
}
}

$($async)? fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
#[inline]
$($async)? fn skip_one(&mut self, byte: u8) -> Result<bool> {
// search byte must be within the ascii range
debug_assert!(byte.is_ascii());

match self.peek_one() $(.$await)? ? {
Some(b) if b == byte => {
*position += 1;
self $(.$reader)? .consume(1);
Ok(true)
}
_ => Ok(false),
}
}

#[inline]
$($async)? fn peek_one(&mut self) -> Result<Option<u8>> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => Ok(None),
Ok(n) => Ok(Some(n[0])),
Ok(n) => Ok(n.first().cloned()),
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e.into())),
};
Expand Down
122 changes: 122 additions & 0 deletions src/reader/element.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
//! Contains a parser for an XML element.

use crate::errors::SyntaxError;
use crate::reader::Parser;

/// A parser that searches for a `>` symbol in a slice, ignoring any `>` that
/// occurs inside a quoted region.
///
/// The parser recognizes two kinds of quoted regions: double-quoted (`"..."`)
/// and single-quoted (`'...'`). Matches found inside those regions are not
/// reported as results. Each region starts and ends with its quote symbol,
/// which cannot be escaped (it can be encoded as an XML character entity or
/// a named entity, but such an encoding does not contain literal quotes).
///
/// To use the parser, create an instance and [`feed`] data into it.
/// After a successful search the parser returns [`Some`] with the position of
/// the found symbol in the fed slice. If the search is unsuccessful, [`None`]
/// is returned. You would typically expect the search to succeed eventually,
/// so keep feeding new data until you get a result.
///
/// NOTE: a parser instance is intended for a single search. After a successful
/// match it should not be used anymore. Create a new parser if you want to
/// perform a new search.
///
/// # Example
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use quick_xml::reader::{ElementParser, Parser};
///
/// let mut parser = ElementParser::default();
///
/// // Parse `<my-element with = 'some > inside'>and the text follow...`
/// // split into three chunks
/// assert_eq!(parser.feed(b"<my-element"), None);
/// // ...get new chunk of data
/// assert_eq!(parser.feed(b" with = 'some >"), None);
/// // ...get another chunk of data
/// assert_eq!(parser.feed(b" inside'>and the text follow..."), Some(8));
/// //                       ^       ^
/// //                       0       8
/// ```
///
/// [`feed`]: Self::feed()
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ElementParser {
/// The initial state: inside an element, but outside of any quoted
/// attribute value. This is the only state in which `>` is matched.
Outside,
/// Inside a single-quoted region (`'...'`); left only on the next `'`.
SingleQ,
/// Inside a double-quoted region (`"..."`); left only on the next `"`.
DoubleQ,
}

impl Parser for ElementParser {
    /// Scans `bytes` for a `>` that lies outside of quoted regions.
    ///
    /// Returns the index of that `>` within `bytes`, or `None` if this chunk
    /// does not contain one. The quoting state reached at the end of the chunk
    /// is stored in `self`, so the next chunk continues where this one stopped.
    #[inline]
    fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
        // Only `>` and the two quote characters can change the outcome,
        // so jump directly from one of them to the next.
        for pos in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) {
            let byte = bytes[pos];
            match *self {
                Self::Outside => match byte {
                    // `>` terminates the element only outside of quotes
                    b'>' => return Some(pos),
                    b'\'' => *self = Self::SingleQ,
                    b'"' => *self = Self::DoubleQ,
                    _ => {}
                },
                // A quoted region is closed only by the quote that opened it;
                // every other byte inside it is ignored.
                Self::SingleQ => {
                    if byte == b'\'' {
                        *self = Self::Outside;
                    }
                }
                Self::DoubleQ => {
                    if byte == b'"' {
                        *self = Self::Outside;
                    }
                }
            }
        }
        None
    }

    /// The syntax error for input that ends before the closing `>` of an
    /// element is found.
    #[inline]
    fn eof_error() -> SyntaxError {
        SyntaxError::UnclosedTag
    }
}

impl Default for ElementParser {
#[inline]
fn default() -> Self {
Self::Outside
}
}

#[test]
fn parse() {
    use pretty_assertions::assert_eq;
    use ElementParser::*;

    /// Feeds `bytes` to a parser that starts in the state `initial`.
    ///
    /// Gives `Ok(pos)` with the position in the buffer where the element ends,
    /// or `Err(state)` with the state the parser finished in when the end of
    /// the element was not found.
    fn parse_element(bytes: &[u8], initial: ElementParser) -> Result<usize, ElementParser> {
        let mut parser = initial;
        let found = parser.feed(bytes);
        found.ok_or(parser)
    }

    /// Runs `parse_element` over the same input from each starting state,
    /// in the order `Outside`, `SingleQ`, `DoubleQ`.
    fn from_each_state(bytes: &[u8]) -> [Result<usize, ElementParser>; 3] {
        [
            parse_element(bytes, Outside),
            parse_element(bytes, SingleQ),
            parse_element(bytes, DoubleQ),
        ]
    }

    // Empty input never changes the state
    assert_eq!(
        from_each_state(b""),
        [Err(Outside), Err(SingleQ), Err(DoubleQ)]
    );

    // A single quote opens or closes only the single-quoted region
    assert_eq!(
        from_each_state(b"'"),
        [Err(SingleQ), Err(Outside), Err(DoubleQ)]
    );

    // A double quote opens or closes only the double-quoted region
    assert_eq!(
        from_each_state(b"\""),
        [Err(DoubleQ), Err(SingleQ), Err(Outside)]
    );

    // `>` is matched only outside of quoted regions
    assert_eq!(
        from_each_state(b">"),
        [Ok(0), Err(SingleQ), Err(DoubleQ)]
    );

    // `>` after a complete quoted region is matched; otherwise it is still quoted
    assert_eq!(
        from_each_state(b"''>"),
        [Ok(2), Err(SingleQ), Err(DoubleQ)]
    );
}
Loading