Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 107 additions & 49 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ module Private
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
EQUAL_PATTERN = /\s*=\s*/um
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NAME_PATTERN = /#{NAME}/um
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
Expand All @@ -168,6 +169,7 @@ def initialize( source )
@entity_expansion_limit = Security.entity_expansion_limit
@entity_expansion_text_limit = Security.entity_expansion_text_limit
@source.ensure_buffer
@version = nil
end

def add_listener( listener )
Expand Down Expand Up @@ -280,7 +282,7 @@ def pull_event
return [ :comment, process_comment ]
elsif @source.match?("DOCTYPE", true)
base_error_message = "Malformed DOCTYPE"
unless @source.match?(/\s+/um, true)
unless @source.skip_spaces
if @source.match?(">")
message = "#{base_error_message}: name is missing"
else
Expand All @@ -290,7 +292,7 @@ def pull_event
raise REXML::ParseException.new(message, @source)
end
name = parse_name(base_error_message)
@source.match?(/\s*/um, true) # skip spaces
@source.skip_spaces
if @source.match?("[", true)
id = [nil, nil, nil]
@document_status = :in_doctype
Expand All @@ -306,7 +308,7 @@ def pull_event
# For backward compatibility
id[1], id[2] = id[2], nil
end
@source.match?(/\s*/um, true) # skip spaces
@source.skip_spaces
if @source.match?("[", true)
@document_status = :in_doctype
elsif @source.match?(">", true)
Expand All @@ -319,7 +321,7 @@ def pull_event
end
args = [:start_doctype, name, *id]
if @document_status == :after_doctype
@source.match?(/\s*/um, true)
@source.skip_spaces
@stack << [ :end_doctype ]
end
return args
Expand All @@ -330,7 +332,7 @@ def pull_event
end
end
if @document_status == :in_doctype
@source.match?(/\s*/um, true) # skip spaces
@source.skip_spaces
start_position = @source.position
if @source.match?("<!", true)
if @source.match?("ELEMENT", true)
Expand Down Expand Up @@ -391,7 +393,7 @@ def pull_event
return [ :attlistdecl, element, pairs, contents ]
elsif @source.match?("NOTATION", true)
base_error_message = "Malformed notation declaration"
unless @source.match?(/\s+/um, true)
unless @source.skip_spaces
if @source.match?(">")
message = "#{base_error_message}: name is missing"
else
Expand All @@ -404,7 +406,7 @@ def pull_event
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: true)
@source.match?(/\s*/um, true) # skip spaces
@source.skip_spaces
unless @source.match?(">", true)
message = "#{base_error_message}: garbage before end >"
raise REXML::ParseException.new(message, @source)
Expand All @@ -425,7 +427,7 @@ def pull_event
end
end
if @document_status == :after_doctype
@source.match?(/\s*/um, true)
@source.skip_spaces
end
begin
start_position = @source.position
Expand Down Expand Up @@ -642,6 +644,10 @@ def need_source_encoding_update?(xml_declaration_encoding)
true
end

# Maps an endian-specific UTF-16 encoding name to the generic "UTF-16" label.
#
# Returns "UTF-16" when +xml_declaration_encoding+ is exactly "UTF-16BE" or
# "UTF-16LE" (compared case-insensitively); returns nil for anything else,
# including nil.
def normalize_xml_declaration_encoding(xml_declaration_encoding)
  return nil unless /\AUTF-16(?:BE|LE)\z/i.match?(xml_declaration_encoding)

  "UTF-16"
end

def parse_name(base_error_message)
md = @source.match(Private::NAME_PATTERN, true)
unless md
Expand Down Expand Up @@ -735,37 +741,85 @@ def process_comment

# Reads a processing instruction from @source: first its target name, then
# either the XML declaration (target "xml") or the instruction content.
#
# NOTE(review): this span looks like a diff rendering whose +/- markers and
# indentation were lost, so statements from the previous and the new
# implementation appear interleaved. The per-line notes below are a
# best-effort reading -- confirm against the real file before relying on them.
def process_instruction
name = parse_name("Malformed XML: Invalid processing instruction node")
# NOTE(review): the next four lines presumably belong to the removed
# implementation, which matched the content with a regexp up to "?>".
if @source.match?(/\s+/um, true)
match_data = @source.match(/(.*?)\?>/um, true)
unless match_data
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
# A target named "xml" is handed over to xml_declaration.
if name == "xml"
xml_declaration
else # PITarget
if @source.skip_spaces # e.g. <?name content?>
# Remember where the content starts so the position can be restored when
# no closing "?>" is found.
start_position = @source.position
content = @source.read_until("?>")
# chomp! returns nil when the text read does not end with "?>".
unless content.chomp!("?>")
@source.position = start_position
raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
end
else # e.g. <?name?>
content = nil
unless @source.match?("?>", true)
raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
end
end
# NOTE(review): the next three lines presumably belong to the removed
# implementation as well (they read match_data from the regexp match above).
content = match_data[1]
else
content = nil
# Event for an ordinary processing instruction; content is nil for the
# <?name?> form.
[:processing_instruction, name, content]
end
end

# Parses the attributes of an XML declaration (version, then optional
# encoding, then optional standalone) from @source and returns an
# [:xmldecl, version, encoding, standalone] event.
#
# Raises ParseException when the declaration is duplicated, is not at the
# start, lacks the version, has a standalone value other than "yes"/"no",
# or is not closed with "?>".
#
# NOTE(review): this span looks like a diff rendering whose +/- markers and
# indentation were lost. Lines that use the locals name, content and version
# (none of which are assigned in this method) presumably come from the removed
# implementation in process_instruction -- confirm against the real file.
def xml_declaration
# @version is only assigned further down, so a non-nil value here means a
# declaration was already parsed.
unless @version.nil?
raise ParseException.new("Malformed XML: XML declaration is duplicated", @source)
end
if @document_status
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
end
unless @source.skip_spaces
raise ParseException.new("Malformed XML: XML declaration misses spaces before version", @source)
end
unless @source.match?("version", true)
raise ParseException.new("Malformed XML: XML declaration misses version", @source)
end
@version = parse_attribute_value_with_equal("xml")
# No whitespace after the version value: only the closing "?>" may follow.
unless @source.skip_spaces
unless @source.match?("?>", true)
# NOTE(review): two consecutive raises -- the first presumably is the removed
# message and the second its replacement.
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
end
encoding = normalize_xml_declaration_encoding(@source.encoding)
return [ :xmldecl, @version, encoding, nil ] # e.g. <?xml version="1.0"?>
end
# NOTE(review): the following lines up to the blank line presumably belong to
# the removed regexp-based implementation (VERSION/ENCODING matched against
# content).
if name == "xml"
if @document_status
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
end
version = VERSION.match(content)
version = version[1] unless version.nil?
encoding = ENCODING.match(content)
encoding = encoding[1] unless encoding.nil?
if need_source_encoding_update?(encoding)
@source.encoding = encoding

# Optional encoding attribute.
if @source.match?("encoding", true)
encoding = parse_attribute_value_with_equal("xml")
# No whitespace after the encoding value: only the closing "?>" may follow.
unless @source.skip_spaces
unless @source.match?("?>", true)
raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
end
if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
encoding ||= normalize_xml_declaration_encoding(@source.encoding)
return [ :xmldecl, @version, encoding, nil ] # e.g. <?xml version="1.1" encoding="UTF-8"?>
end
# NOTE(review): the inline UTF-16 check below presumably is the removed code
# that normalize_xml_declaration_encoding now replaces.
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-16"
end

# Optional standalone attribute; only "yes" and "no" are accepted.
if @source.match?("standalone", true)
standalone = parse_attribute_value_with_equal("xml")
case standalone
when "yes", "no"
else
raise ParseException.new("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>", @source)
end
# NOTE(review): the STANDALONE regexp lines, the return using the local
# version, and the :processing_instruction result below presumably belong to
# the removed implementation.
standalone = STANDALONE.match(content)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
end
[:processing_instruction, name, content]
# Trailing whitespace before "?>" is optional here, so the result is ignored.
@source.skip_spaces
unless @source.match?("?>", true)
raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
end

if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
# When no encoding was declared, fall back to the label derived from the
# source's current encoding (nil unless it is UTF-16BE/LE).
encoding ||= normalize_xml_declaration_encoding(@source.encoding)

# e.g. <?xml version="1.0" ?>
# <?xml version="1.1" encoding="UTF-8" ?>
# <?xml version="1.1" standalone="yes"?>
# <?xml version="1.1" encoding="UTF-8" standalone="yes" ?>
[ :xmldecl, @version, encoding, standalone ]
end

if StringScanner::Version < "3.1.1"
Expand All @@ -787,6 +841,25 @@ def scan_quote
end
end

# Consumes `= <quote>value<quote>` from @source and returns the value without
# its surrounding quotes.
#
# +name+ is only used to label error messages.
#
# Raises REXML::ParseException when the equal sign, the opening quote or the
# closing quote is missing. In the last case the source position is moved
# back to the start of the value before raising.
def parse_attribute_value_with_equal(name)
  unless @source.match?(Private::EQUAL_PATTERN, true)
    raise REXML::ParseException.new("Missing attribute equal: <#{name}>", @source)
  end

  quote = scan_quote
  unless quote
    raise REXML::ParseException.new("Missing attribute value start quote: <#{name}>", @source)
  end

  value_start = @source.position
  value = @source.read_until(quote)
  # chomp! returns the string when it stripped the closing quote, nil otherwise.
  return value if value.chomp!(quote)

  @source.position = value_start
  raise REXML::ParseException.new("Missing attribute value end quote: <#{name}>: <#{quote}>", @source)
end

def parse_attributes(prefixes)
attributes = {}
expanded_names = {}
Expand All @@ -801,23 +874,8 @@ def parse_attributes(prefixes)
name = match[1]
prefix = match[2]
local_part = match[3]

unless @source.match?(/\s*=\s*/um, true)
message = "Missing attribute equal: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
unless quote = scan_quote
message = "Missing attribute value start quote: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
start_position = @source.position
value = @source.read_until(quote)
unless value.chomp!(quote)
@source.position = start_position
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
raise REXML::ParseException.new(message, @source)
end
@source.match?(/\s*/um, true)
value = parse_attribute_value_with_equal(name)
@source.skip_spaces
if prefix == "xmlns"
if local_part == "xml"
if value != Private::XML_PREFIXED_NAMESPACE
Expand Down
7 changes: 6 additions & 1 deletion lib/rexml/source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,10 @@ class Source
attr_reader :encoding

module Private
SPACES_PATTERN = /\s+/um
SCANNER_RESET_SIZE = 100000
PRE_DEFINED_TERM_PATTERNS = {}
pre_defined_terms = ["'", '"', "<", "]]>"]
pre_defined_terms = ["'", '"', "<", "]]>", "?>"]
if StringScanner::Version < "3.1.1"
pre_defined_terms.each do |term|
PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
Expand Down Expand Up @@ -150,6 +151,10 @@ def match?(pattern, cons=false)
end
end

# Advances @scanner past a run of whitespace at the current position.
#
# Returns true when at least one whitespace character was skipped and false
# when there was none (the scanner is left where it was).
def skip_spaces
  skipped = @scanner.skip(Private::SPACES_PATTERN)
  !!skipped
end

def position
@scanner.pos
end
Expand Down
6 changes: 3 additions & 3 deletions test/parse/test_document_type_declaration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ def test_no_name
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed DOCTYPE: name is missing
Line: 3
Position: 17
Line: 1
Position: 10
Last 80 unconsumed characters:
<!DOCTYPE> <r/>
<!DOCTYPE>
DETAIL
end
end
Expand Down
Loading
Loading