diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 54014e57..af06b6df 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -124,14 +124,6 @@ class BaseParser } module Private - # Terminal requires two or more letters. - INSTRUCTION_TERM = "?>" - COMMENT_TERM = "-->" - CDATA_TERM = "]]>" - DOCTYPE_TERM = "]>" - # Read to the end of DOCTYPE because there is no proper ENTITY termination - ENTITY_TERM = DOCTYPE_TERM - INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um @@ -251,7 +243,7 @@ def pull_event return process_instruction(start_position) elsif @source.match("/um, true, term: Private::COMMENT_TERM) + md = @source.match(/(.*?)-->/um, true) if md.nil? raise REXML::ParseException.new("Unclosed comment", @source) end @@ -318,7 +310,7 @@ def pull_event raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? return [ :elementdecl, "/um, true, term: Private::COMMENT_TERM) + elsif md = @source.match(/--(.*?)-->/um, true) case md[1] when /--/, /-\z/ raise REXML::ParseException.new("Malformed comment", @source) end return [ :comment, md[1] ] if md end - elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM) + elsif match = @source.match(/(%.*?;)\s*/um, true) return [ :externalentity, match[1] ] elsif @source.match(/\]\s*>/um, true) @document_status = :after_doctype @@ -434,7 +426,7 @@ def pull_event #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md if md[0][0] == ?- - md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM) + md = @source.match(/--(.*?)-->/um, true) if md.nil? || /--|-\z/.match?(md[1]) raise REXML::ParseException.new("Malformed comment", @source) @@ -442,7 +434,7 @@ def pull_event return [ :comment, md[1] ] else - md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM) + md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ @@ -656,7 +648,7 @@ def parse_id_invalid_details(accept_external_id:, end def process_instruction(start_position) - match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM) + match_data = @source.match(Private::INSTRUCTION_END, true) unless match_data message = "Invalid processing instruction node" @source.position = start_position diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 4c30532a..ff887fc0 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -117,7 +117,7 @@ def read_until(term) def ensure_buffer end - def match(pattern, cons=false, term: nil) + def match(pattern, cons=false) if cons @scanner.scan(pattern).nil? ? nil : @scanner else @@ -204,10 +204,20 @@ def initialize(arg, block_size=500, encoding=nil) end end - def read(term = nil) + def read(term = nil, min_bytes = 1) term = encode(term) if term begin - @scanner << readline(term) + str = readline(term) + @scanner << str + read_bytes = str.bytesize + begin + while read_bytes < min_bytes + str = readline(term) + @scanner << str + read_bytes += str.bytesize + end + rescue IOError + end true rescue Exception, NameError @source = nil @@ -237,10 +247,9 @@ def ensure_buffer read if @scanner.eos? && @source end - # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats: - # - ">" - # - "XXX>" (X is any string excluding '>') - def match( pattern, cons=false, term: nil ) + def match( pattern, cons=false ) + # To avoid performance issue, we need to increase bytes to read per scan + min_bytes = 1 while true if cons md = @scanner.scan(pattern) @@ -250,7 +259,8 @@ def match( pattern, cons=false, term: nil ) break if md return nil if pattern.is_a?(String) return nil if @source.nil? - return nil unless read(term) + return nil unless read(nil, min_bytes) + min_bytes *= 2 end md.nil? ? nil : @scanner