diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index a87657b5..9304e96d 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -144,6 +144,7 @@ module Private
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
+ EQUAL_PATTERN = /\s*=\s*/um
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NAME_PATTERN = /#{NAME}/um
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
@@ -168,6 +169,7 @@ def initialize( source )
@entity_expansion_limit = Security.entity_expansion_limit
@entity_expansion_text_limit = Security.entity_expansion_text_limit
@source.ensure_buffer
+ @version = nil
end
def add_listener( listener )
@@ -280,7 +282,7 @@ def pull_event
return [ :comment, process_comment ]
elsif @source.match?("DOCTYPE", true)
base_error_message = "Malformed DOCTYPE"
- unless @source.match?(/\s+/um, true)
+ unless @source.skip_spaces
if @source.match?(">")
message = "#{base_error_message}: name is missing"
else
@@ -290,7 +292,7 @@ def pull_event
raise REXML::ParseException.new(message, @source)
end
name = parse_name(base_error_message)
- @source.match?(/\s*/um, true) # skip spaces
+ @source.skip_spaces
if @source.match?("[", true)
id = [nil, nil, nil]
@document_status = :in_doctype
@@ -306,7 +308,7 @@ def pull_event
# For backward compatibility
id[1], id[2] = id[2], nil
end
- @source.match?(/\s*/um, true) # skip spaces
+ @source.skip_spaces
if @source.match?("[", true)
@document_status = :in_doctype
elsif @source.match?(">", true)
@@ -319,7 +321,7 @@ def pull_event
end
args = [:start_doctype, name, *id]
if @document_status == :after_doctype
- @source.match?(/\s*/um, true)
+ @source.skip_spaces
@stack << [ :end_doctype ]
end
return args
@@ -330,7 +332,7 @@ def pull_event
end
end
if @document_status == :in_doctype
- @source.match?(/\s*/um, true) # skip spaces
+ @source.skip_spaces
start_position = @source.position
if @source.match?("")
message = "#{base_error_message}: name is missing"
else
@@ -404,7 +406,7 @@ def pull_event
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: true)
- @source.match?(/\s*/um, true) # skip spaces
+ @source.skip_spaces
unless @source.match?(">", true)
message = "#{base_error_message}: garbage before end >"
raise REXML::ParseException.new(message, @source)
@@ -425,7 +427,7 @@ def pull_event
end
end
if @document_status == :after_doctype
- @source.match?(/\s*/um, true)
+ @source.skip_spaces
end
begin
start_position = @source.position
@@ -642,6 +644,10 @@ def need_source_encoding_update?(xml_declaration_encoding)
true
end
+ def normalize_xml_declaration_encoding(xml_declaration_encoding) # Returns "UTF-16" for UTF-16BE/UTF-16LE, nil for anything else.
+ /\AUTF-16(?:BE|LE)\z/i.match?(xml_declaration_encoding) ? "UTF-16" : nil # match is case-insensitive and anchored to the whole name
+ end
+
def parse_name(base_error_message)
md = @source.match(Private::NAME_PATTERN, true)
unless md
@@ -735,37 +741,85 @@ def process_comment
def process_instruction # Reads the PI name, then delegates "xml" to xml_declaration or returns [:processing_instruction, name, content].
name = parse_name("Malformed XML: Invalid processing instruction node")
- if @source.match?(/\s+/um, true)
- match_data = @source.match(/(.*?)\?>/um, true)
- unless match_data
- raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
+ if name == "xml"
+ xml_declaration
+ else # PITarget
+ if @source.skip_spaces # e.g. <?name content?>
+ start_position = @source.position
+ content = @source.read_until("?>")
+ unless content.chomp!("?>") # chomp! returns nil when the text does not end with "?>"
+ @source.position = start_position # restore the position where the content started before raising
+ raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
+ end
+ else # e.g. <?name?>
+ content = nil
+ unless @source.match?("?>", true)
+ raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
+ end
end
- content = match_data[1]
- else
- content = nil
+ [:processing_instruction, name, content]
+ end
+ end
+
+ def xml_declaration # Parses version, optional encoding and optional standalone (in that order); returns [:xmldecl, version, encoding, standalone].
+ unless @version.nil? # @version is assigned below, so a non-nil value means a declaration was already parsed
+ raise ParseException.new("Malformed XML: XML declaration is duplicated", @source)
+ end
+ if @document_status
+ raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
+ end
+ unless @source.skip_spaces
+ raise ParseException.new("Malformed XML: XML declaration misses spaces before version", @source)
+ end
+ unless @source.match?("version", true)
+ raise ParseException.new("Malformed XML: XML declaration misses version", @source)
+ end
+ @version = parse_attribute_value_with_equal("xml") # "xml" is only used to label error messages
+ unless @source.skip_spaces # no space after the version value: only "?>" may follow
unless @source.match?("?>", true)
- raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
+ raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
end
+ encoding = normalize_xml_declaration_encoding(@source.encoding) # no encoding attribute: "UTF-16" if the source is UTF-16BE/LE, else nil
+ return [ :xmldecl, @version, encoding, nil ] # e.g. <?xml version="1.0"?>
end
- if name == "xml"
- if @document_status
- raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
- end
- version = VERSION.match(content)
- version = version[1] unless version.nil?
- encoding = ENCODING.match(content)
- encoding = encoding[1] unless encoding.nil?
- if need_source_encoding_update?(encoding)
- @source.encoding = encoding
+
+ if @source.match?("encoding", true)
+ encoding = parse_attribute_value_with_equal("xml")
+ unless @source.skip_spaces # no space after the encoding value: only "?>" may follow
+ unless @source.match?("?>", true)
+ raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
+ end
+ if need_source_encoding_update?(encoding)
+ @source.encoding = encoding
+ end
+ encoding ||= normalize_xml_declaration_encoding(@source.encoding) # NOTE(review): encoding is non-nil in this branch, so ||= does not reassign
+ return [ :xmldecl, @version, encoding, nil ] # e.g. <?xml version="1.0" encoding="UTF-8"?>
end
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
- encoding = "UTF-16"
+ end
+
+ if @source.match?("standalone", true)
+ standalone = parse_attribute_value_with_equal("xml")
+ case standalone
+ when "yes", "no" # the only accepted values; nothing to do
+ else
+ raise ParseException.new("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>", @source)
end
- standalone = STANDALONE.match(content)
- standalone = standalone[1] unless standalone.nil?
- return [ :xmldecl, version, encoding, standalone ]
end
- [:processing_instruction, name, content]
+ @source.skip_spaces # optional spaces before the closing "?>"
+ unless @source.match?("?>", true) # anything else here (e.g. an unknown attribute) is reported as unclosed
+ raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
+ end
+
+ if need_source_encoding_update?(encoding)
+ @source.encoding = encoding
+ end
+ encoding ||= normalize_xml_declaration_encoding(@source.encoding) # encoding is nil here when no encoding attribute was present
+
+ # e.g. <?xml version="1.0" ?>
+ # <?xml version="1.0" encoding="UTF-8" ?>
+ # <?xml version="1.0" standalone="yes"?>
+ # <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ [ :xmldecl, @version, encoding, standalone ]
end
if StringScanner::Version < "3.1.1"
@@ -787,6 +841,25 @@ def scan_quote
end
end
+ def parse_attribute_value_with_equal(name) # Consumes "=", an opening quote, the value and the closing quote; returns the value. name only labels errors.
+ unless @source.match?(Private::EQUAL_PATTERN, true) # "=" with optional whitespace on both sides
+ message = "Missing attribute equal: <#{name}>"
+ raise REXML::ParseException.new(message, @source)
+ end
+ unless quote = scan_quote
+ message = "Missing attribute value start quote: <#{name}>"
+ raise REXML::ParseException.new(message, @source)
+ end
+ start_position = @source.position
+ value = @source.read_until(quote)
+ unless value.chomp!(quote) # chomp! returns nil when the text does not end with the closing quote
+ @source.position = start_position # restore the position where the value started before raising
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
+ raise REXML::ParseException.new(message, @source)
+ end
+ value
+ end
+
def parse_attributes(prefixes)
attributes = {}
expanded_names = {}
@@ -801,23 +874,8 @@ def parse_attributes(prefixes)
name = match[1]
prefix = match[2]
local_part = match[3]
-
- unless @source.match?(/\s*=\s*/um, true)
- message = "Missing attribute equal: <#{name}>"
- raise REXML::ParseException.new(message, @source)
- end
- unless quote = scan_quote
- message = "Missing attribute value start quote: <#{name}>"
- raise REXML::ParseException.new(message, @source)
- end
- start_position = @source.position
- value = @source.read_until(quote)
- unless value.chomp!(quote)
- @source.position = start_position
- message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
- raise REXML::ParseException.new(message, @source)
- end
- @source.match?(/\s*/um, true)
+ value = parse_attribute_value_with_equal(name)
+ @source.skip_spaces
if prefix == "xmlns"
if local_part == "xml"
if value != Private::XML_PREFIXED_NAMESPACE
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 3ec1141e..99500072 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -65,9 +65,10 @@ class Source
attr_reader :encoding
module Private
+ SPACES_PATTERN = /\s+/um
SCANNER_RESET_SIZE = 100000
PRE_DEFINED_TERM_PATTERNS = {}
- pre_defined_terms = ["'", '"', "<", "]]>"]
+ pre_defined_terms = ["'", '"', "<", "]]>", "?>"]
if StringScanner::Version < "3.1.1"
pre_defined_terms.each do |term|
PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
@@ -150,6 +151,10 @@ def match?(pattern, cons=false)
end
end
+ def skip_spaces # Consumes one or more whitespace characters at the current position; returns true if any were consumed, false otherwise.
+ @scanner.skip(Private::SPACES_PATTERN) ? true : false # normalizes the scanner's truthy/falsy result to a strict boolean
+ end
+
def position
@scanner.pos
end
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index b22863a9..d4658b9e 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -49,10 +49,10 @@ def test_no_name
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed DOCTYPE: name is missing
-Line: 3
-Position: 17
+Line: 1
+Position: 10
Last 80 unconsumed characters:
-
+
DETAIL
end
end
diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index ba381dc4..70d17747 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -30,7 +30,7 @@ def test_unclosed_content
parse("
Line: 1
Position: 14
Last 80 unconsumed characters:
@@ -43,7 +43,7 @@ def test_unclosed_no_content
parse("
Line: 1
Position: 6
Last 80 unconsumed characters:
@@ -51,6 +51,19 @@ def test_unclosed_no_content
DETAIL
end
+ def test_xml_declaration_duplicated # Expects a ParseException with the "XML declaration is duplicated" message.
+ exception = assert_raise(REXML::ParseException) do
+ parse('')
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed XML: XML declaration is duplicated
+Line: 1
+Position: 42
+Last 80 unconsumed characters:
+ version="1.0"?>
+ DETAIL
+ end
+
def test_xml_declaration_not_at_document_start
exception = assert_raise(REXML::ParseException) do
parser = REXML::Parsers::BaseParser.new('')
@@ -64,7 +77,118 @@ def test_xml_declaration_not_at_document_start
Line: 1
Position: 25
Last 80 unconsumed characters:
+ version="1.0" ?>
+ DETAIL
+ end
+
+ def test_xml_declaration_missing_spaces # Expects the "misses spaces before version" error from the pull parser.
+ exception = assert_raise(REXML::ParseException) do
+ parser = REXML::Parsers::BaseParser.new('')
+ while parser.has_next? # pull every event so the parse error is raised
+ parser.pull
+ end
+ end
+
+ assert_equal(<<~DETAIL.chomp, exception.to_s)
+ Malformed XML: XML declaration misses spaces before version
+ Line: 1
+ Position: 7
+ Last 80 unconsumed characters:
+ ?>
+ DETAIL
+ end
+
+ def test_xml_declaration_missing_version # Expects the "misses version" error from the pull parser.
+ exception = assert_raise(REXML::ParseException) do
+ parser = REXML::Parsers::BaseParser.new('')
+ while parser.has_next? # pull every event so the parse error is raised
+ parser.pull
+ end
+ end
+
+ assert_equal(<<~DETAIL.chomp, exception.to_s)
+ Malformed XML: XML declaration misses version
+ Line: 1
+ Position: 8
+ Last 80 unconsumed characters:
+ ?>
+ DETAIL
+ end
+
+ def test_xml_declaration_unclosed_content
+ exception = assert_raise(REXML::ParseException) do
+ parse('')
+ while parser.has_next?
+ parser.pull
+ end
+ end
+
+ assert_equal(<<~DETAIL.chomp, exception.to_s)
+ Malformed XML: Unclosed XML declaration
+ Line: 1
+ Position: 37
+ Last 80 unconsumed characters:
+ encoding="UTF-8"?>
+ DETAIL
+ end
+
+ def test_xml_declaration_unclosed_content_missing_space_after_encoding # Expects "Unclosed XML declaration" with standalone="no"?> left unconsumed.
+ exception = assert_raise(REXML::ParseException) do
+ parser = REXML::Parsers::BaseParser.new('')
+ while parser.has_next? # pull every event so the parse error is raised
+ parser.pull
+ end
+ end
+
+ assert_equal(<<~DETAIL.chomp, exception.to_s)
+ Malformed XML: Unclosed XML declaration
+ Line: 1
+ Position: 53
+ Last 80 unconsumed characters:
+ standalone="no"?>
+ DETAIL
+ end
+
+ def test_xml_declaration_unclosed_content_with_unknown_attributes # Expects "Unclosed XML declaration" with test="no"?> left unconsumed.
+ exception = assert_raise(REXML::ParseException) do
+ parser = REXML::Parsers::BaseParser.new('')
+ while parser.has_next? # pull every event so the parse error is raised
+ parser.pull
+ end
+ end
+
+ assert_equal(<<~DETAIL.chomp, exception.to_s)
+ Malformed XML: Unclosed XML declaration
+ Line: 1
+ Position: 31
+ Last 80 unconsumed characters:
+ test="no"?>
+ DETAIL
+ end
+
+ def test_xml_declaration_standalone_no_yes_or_no # Expects the "standalone is not yes or no" error.
+ exception = assert_raise(REXML::ParseException) do
+ parse('')
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed XML: XML declaration standalone is not yes or no :
+Line: 1
+Position: 38
+Last 80 unconsumed characters:
+?>
DETAIL
end
end
@@ -113,7 +237,7 @@ def test_content_question
def test_linear_performance_gt # Builds a document from a string repeated n times for each size in seq; presumably asserts linear parse time — confirm against assert_linear_performance.
seq = [10000, 50000, 100000, 150000, 200000]
assert_linear_performance(seq, rehearsal: 10) do |n|
- REXML::Document.new("" * n + " ?>")
+ REXML::Document.new("" * n + " ?>")
end
end
diff --git a/test/test_xml_declaration.rb b/test/test_xml_declaration.rb
index 6a1f4df0..4503a90e 100644
--- a/test/test_xml_declaration.rb
+++ b/test/test_xml_declaration.rb
@@ -7,7 +7,7 @@ module REXMLTests
class TestXmlDeclaration < Test::Unit::TestCase
def setup
xml = <<~XML
-
+
XML