Skip to content

Commit

Permalink
Fix source.match performance without specifying term string (#186)
Browse files Browse the repository at this point in the history
Performance problem of `source.match(regexp)` was recently fixed by
specifying terminator string. However, I think maintaining appropriate
terminator string for a regexp is hard.
I propose solving this performance issue by increasing bytes to read in
each iteration.
  • Loading branch information
tompng committed Aug 1, 2024
1 parent 033d190 commit 6cac15d
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 23 deletions.
22 changes: 7 additions & 15 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,6 @@ class BaseParser
}

module Private
# Terminal requires two or more letters.
INSTRUCTION_TERM = "?>"
COMMENT_TERM = "-->"
CDATA_TERM = "]]>"
DOCTYPE_TERM = "]>"
# Read to the end of DOCTYPE because there is no proper ENTITY termination
ENTITY_TERM = DOCTYPE_TERM

INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
Expand Down Expand Up @@ -253,7 +245,7 @@ def pull_event
return process_instruction(start_position)
elsif @source.match("<!", true)
if @source.match("--", true)
md = @source.match(/(.*?)-->/um, true, term: Private::COMMENT_TERM)
md = @source.match(/(.*?)-->/um, true)
if md.nil?
raise REXML::ParseException.new("Unclosed comment", @source)
end
Expand Down Expand Up @@ -320,7 +312,7 @@ def pull_event
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
return [ :elementdecl, "<!ELEMENT" + md[1] ]
elsif @source.match("ENTITY", true)
match_data = @source.match(Private::ENTITYDECL_PATTERN, true, term: Private::ENTITY_TERM)
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
unless match_data
raise REXML::ParseException.new("Malformed entity declaration", @source)
end
Expand Down Expand Up @@ -389,14 +381,14 @@ def pull_event
raise REXML::ParseException.new(message, @source)
end
return [:notationdecl, name, *id]
elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
elsif md = @source.match(/--(.*?)-->/um, true)
case md[1]
when /--/, /-\z/
raise REXML::ParseException.new("Malformed comment", @source)
end
return [ :comment, md[1] ] if md
end
elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM)
elsif match = @source.match(/(%.*?;)\s*/um, true)
return [ :externalentity, match[1] ]
elsif @source.match(/\]\s*>/um, true)
@document_status = :after_doctype
Expand Down Expand Up @@ -436,15 +428,15 @@ def pull_event
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
raise REXML::ParseException.new("Malformed node", @source) unless md
if md[0][0] == ?-
md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
md = @source.match(/--(.*?)-->/um, true)

if md.nil? || /--|-\z/.match?(md[1])
raise REXML::ParseException.new("Malformed comment", @source)
end

return [ :comment, md[1] ]
else
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM)
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
return [ :cdata, md[1] ] if md
end
raise REXML::ParseException.new( "Declarations can only occur "+
Expand Down Expand Up @@ -673,7 +665,7 @@ def parse_id_invalid_details(accept_external_id:,
end

def process_instruction(start_position)
match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM)
match_data = @source.match(Private::INSTRUCTION_END, true)
unless match_data
message = "Invalid processing instruction node"
@source.position = start_position
Expand Down
26 changes: 18 additions & 8 deletions lib/rexml/source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def read_until(term)
def ensure_buffer
end

def match(pattern, cons=false, term: nil)
def match(pattern, cons=false)
if cons
@scanner.scan(pattern).nil? ? nil : @scanner
else
Expand Down Expand Up @@ -204,10 +204,20 @@ def initialize(arg, block_size=500, encoding=nil)
end
end

def read(term = nil)
def read(term = nil, min_bytes = 1)
term = encode(term) if term
begin
@scanner << readline(term)
str = readline(term)
@scanner << str
read_bytes = str.bytesize
begin
while read_bytes < min_bytes
str = readline(term)
@scanner << str
read_bytes += str.bytesize
end
rescue IOError
end
true
rescue Exception, NameError
@source = nil
Expand Down Expand Up @@ -237,10 +247,9 @@ def ensure_buffer
read if @scanner.eos? && @source
end

# Note: When specifying a string for 'pattern', it must not include '>' except in the following formats:
# - ">"
# - "XXX>" (X is any string excluding '>')
def match( pattern, cons=false, term: nil )
def match( pattern, cons=false )
# To avoid performance issue, we need to increase bytes to read per scan
min_bytes = 1
while true
if cons
md = @scanner.scan(pattern)
Expand All @@ -250,7 +259,8 @@ def match( pattern, cons=false, term: nil )
break if md
return nil if pattern.is_a?(String)
return nil if @source.nil?
return nil unless read(term)
return nil unless read(nil, min_bytes)
min_bytes *= 2
end

md.nil? ? nil : @scanner
Expand Down

0 comments on commit 6cac15d

Please sign in to comment.