Skip to content

Commit 09cede2

Browse files
committed
Added XML declaration check
## Why?

- The version attribute is required in the XML declaration.
- Only the version, encoding, and standalone attributes are allowed in the XML declaration.
- The XML declaration is only allowed once.

See: https://www.w3.org/TR/xml/#NT-XMLDecl
1 parent 7e2b81c commit 09cede2

File tree

4 files changed

+227
-45
lines changed

4 files changed

+227
-45
lines changed

lib/rexml/parsers/baseparser.rb

Lines changed: 98 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ module Private
144144
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145145
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146146
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147+
EQUAL_PATTERN = /\s*=\s*/um
147148
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148149
NAME_PATTERN = /#{NAME}/um
149150
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
@@ -168,6 +169,7 @@ def initialize( source )
168169
@entity_expansion_limit = Security.entity_expansion_limit
169170
@entity_expansion_text_limit = Security.entity_expansion_text_limit
170171
@source.ensure_buffer
172+
@version = nil
171173
end
172174

173175
def add_listener( listener )
@@ -642,6 +644,10 @@ def need_source_encoding_update?(xml_declaration_encoding)
642644
true
643645
end
644646

647+
def normalize_xml_declaration_encoding(xml_declaration_encoding)
648+
/\AUTF-16(?:BE|LE)\z/i.match?(xml_declaration_encoding) ? "UTF-16" : nil
649+
end
650+
645651
def parse_name(base_error_message)
646652
md = @source.match(Private::NAME_PATTERN, true)
647653
unless md
@@ -735,37 +741,85 @@ def process_comment
735741

736742
def process_instruction
737743
name = parse_name("Malformed XML: Invalid processing instruction node")
738-
if @source.skip_spaces
739-
match_data = @source.match(/(.*?)\?>/um, true)
740-
unless match_data
741-
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
744+
if name == "xml"
745+
xml_declaration
746+
else # PITarget
747+
if @source.skip_spaces # e.g. <?name content?>
748+
start_position = @source.position
749+
content = @source.read_until("?>")
750+
unless content.chomp!("?>")
751+
@source.position = start_position
752+
raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
753+
end
754+
else # e.g. <?name?>
755+
content = nil
756+
unless @source.match?("?>", true)
757+
raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
758+
end
742759
end
743-
content = match_data[1]
744-
else
745-
content = nil
760+
[:processing_instruction, name, content]
761+
end
762+
end
763+
764+
def xml_declaration
765+
unless @version.nil?
766+
raise ParseException.new("Malformed XML: XML declaration is duplicated", @source)
767+
end
768+
if @document_status
769+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
770+
end
771+
unless @source.skip_spaces
772+
raise ParseException.new("Malformed XML: XML declaration misses spaces before version", @source)
773+
end
774+
unless @source.match?("version", true)
775+
raise ParseException.new("Malformed XML: XML declaration misses version", @source)
776+
end
777+
@version = parse_attribute_value_with_equal("xml")
778+
unless @source.skip_spaces
746779
unless @source.match?("?>", true)
747-
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
780+
raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
748781
end
782+
encoding = normalize_xml_declaration_encoding(@source.encoding)
783+
return [ :xmldecl, @version, encoding, nil ] # e.g. <?xml version="1.0"?>
749784
end
750-
if name == "xml"
751-
if @document_status
752-
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
753-
end
754-
version = VERSION.match(content)
755-
version = version[1] unless version.nil?
756-
encoding = ENCODING.match(content)
757-
encoding = encoding[1] unless encoding.nil?
758-
if need_source_encoding_update?(encoding)
759-
@source.encoding = encoding
785+
786+
if @source.match?("encoding", true)
787+
encoding = parse_attribute_value_with_equal("xml")
788+
unless @source.skip_spaces
789+
unless @source.match?("?>", true)
790+
raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
791+
end
792+
if need_source_encoding_update?(encoding)
793+
@source.encoding = encoding
794+
end
795+
encoding ||= normalize_xml_declaration_encoding(@source.encoding)
796+
return [ :xmldecl, @version, encoding, nil ] # e.g. <?xml version="1.1" encoding="UTF-8"?>
760797
end
761-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
762-
encoding = "UTF-16"
798+
end
799+
800+
if @source.match?("standalone", true)
801+
standalone = parse_attribute_value_with_equal("xml")
802+
case standalone
803+
when "yes", "no"
804+
else
805+
raise ParseException.new("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>", @source)
763806
end
764-
standalone = STANDALONE.match(content)
765-
standalone = standalone[1] unless standalone.nil?
766-
return [ :xmldecl, version, encoding, standalone ]
767807
end
768-
[:processing_instruction, name, content]
808+
@source.skip_spaces
809+
unless @source.match?("?>", true)
810+
raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
811+
end
812+
813+
if need_source_encoding_update?(encoding)
814+
@source.encoding = encoding
815+
end
816+
encoding ||= normalize_xml_declaration_encoding(@source.encoding)
817+
818+
# e.g. <?xml version="1.0" ?>
819+
# <?xml version="1.1" encoding="UTF-8" ?>
820+
# <?xml version="1.1" standalone="yes"?>
821+
# <?xml version="1.1" encoding="UTF-8" standalone="yes" ?>
822+
[ :xmldecl, @version, encoding, standalone ]
769823
end
770824

771825
if StringScanner::Version < "3.1.1"
@@ -787,6 +841,25 @@ def scan_quote
787841
end
788842
end
789843

844+
def parse_attribute_value_with_equal(name)
845+
unless @source.match?(Private::EQUAL_PATTERN, true)
846+
message = "Missing attribute equal: <#{name}>"
847+
raise REXML::ParseException.new(message, @source)
848+
end
849+
unless quote = scan_quote
850+
message = "Missing attribute value start quote: <#{name}>"
851+
raise REXML::ParseException.new(message, @source)
852+
end
853+
start_position = @source.position
854+
value = @source.read_until(quote)
855+
unless value.chomp!(quote)
856+
@source.position = start_position
857+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
858+
raise REXML::ParseException.new(message, @source)
859+
end
860+
value
861+
end
862+
790863
def parse_attributes(prefixes)
791864
attributes = {}
792865
expanded_names = {}
@@ -801,22 +874,7 @@ def parse_attributes(prefixes)
801874
name = match[1]
802875
prefix = match[2]
803876
local_part = match[3]
804-
805-
unless @source.match?(/\s*=\s*/um, true)
806-
message = "Missing attribute equal: <#{name}>"
807-
raise REXML::ParseException.new(message, @source)
808-
end
809-
unless quote = scan_quote
810-
message = "Missing attribute value start quote: <#{name}>"
811-
raise REXML::ParseException.new(message, @source)
812-
end
813-
start_position = @source.position
814-
value = @source.read_until(quote)
815-
unless value.chomp!(quote)
816-
@source.position = start_position
817-
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
818-
raise REXML::ParseException.new(message, @source)
819-
end
877+
value = parse_attribute_value_with_equal(name)
820878
@source.skip_spaces
821879
if prefix == "xmlns"
822880
if local_part == "xml"

lib/rexml/source.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ module Private
6868
SPACES_PATTERN = /\s+/um
6969
SCANNER_RESET_SIZE = 100000
7070
PRE_DEFINED_TERM_PATTERNS = {}
71-
pre_defined_terms = ["'", '"', "<", "]]>"]
71+
pre_defined_terms = ["'", '"', "<", "]]>", "?>"]
7272
if StringScanner::Version < "3.1.1"
7373
pre_defined_terms.each do |term|
7474
PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/

test/parse/test_processing_instruction.rb

Lines changed: 127 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def test_unclosed_content
3030
parse("<?name content")
3131
end
3232
assert_equal(<<-DETAIL.chomp, exception.to_s)
33-
Malformed XML: Unclosed processing instruction
33+
Malformed XML: Unclosed processing instruction: <name>
3434
Line: 1
3535
Position: 14
3636
Last 80 unconsumed characters:
@@ -43,14 +43,27 @@ def test_unclosed_no_content
4343
parse("<?name")
4444
end
4545
assert_equal(<<-DETAIL.chomp, exception.to_s)
46-
Malformed XML: Unclosed processing instruction
46+
Malformed XML: Unclosed processing instruction: <name>
4747
Line: 1
4848
Position: 6
4949
Last 80 unconsumed characters:
5050
5151
DETAIL
5252
end
5353

54+
def test_xml_declaration_duplicated
55+
exception = assert_raise(REXML::ParseException) do
56+
parse('<?xml version="1.0"?><?xml version="1.0"?>')
57+
end
58+
assert_equal(<<-DETAIL.chomp, exception.to_s)
59+
Malformed XML: XML declaration is duplicated
60+
Line: 1
61+
Position: 42
62+
Last 80 unconsumed characters:
63+
version="1.0"?>
64+
DETAIL
65+
end
66+
5467
def test_xml_declaration_not_at_document_start
5568
exception = assert_raise(REXML::ParseException) do
5669
parser = REXML::Parsers::BaseParser.new('<a><?xml version="1.0" ?></a>')
@@ -64,7 +77,118 @@ def test_xml_declaration_not_at_document_start
6477
Line: 1
6578
Position: 25
6679
Last 80 unconsumed characters:
80+
version="1.0" ?>
81+
DETAIL
82+
end
83+
84+
def test_xml_declaration_missing_spaces
85+
exception = assert_raise(REXML::ParseException) do
86+
parser = REXML::Parsers::BaseParser.new('<?xml?>')
87+
while parser.has_next?
88+
parser.pull
89+
end
90+
end
91+
92+
assert_equal(<<~DETAIL.chomp, exception.to_s)
93+
Malformed XML: XML declaration misses spaces before version
94+
Line: 1
95+
Position: 7
96+
Last 80 unconsumed characters:
97+
?>
98+
DETAIL
99+
end
100+
101+
def test_xml_declaration_missing_version
102+
exception = assert_raise(REXML::ParseException) do
103+
parser = REXML::Parsers::BaseParser.new('<?xml ?>')
104+
while parser.has_next?
105+
parser.pull
106+
end
107+
end
108+
109+
assert_equal(<<~DETAIL.chomp, exception.to_s)
110+
Malformed XML: XML declaration misses version
111+
Line: 1
112+
Position: 8
113+
Last 80 unconsumed characters:
114+
?>
115+
DETAIL
116+
end
117+
118+
def test_xml_declaration_unclosed_content
119+
exception = assert_raise(REXML::ParseException) do
120+
parse('<?xml version="1.0"')
121+
end
122+
assert_equal(<<-DETAIL.chomp, exception.to_s)
123+
Malformed XML: Unclosed XML declaration
124+
Line: 1
125+
Position: 19
126+
Last 80 unconsumed characters:
127+
128+
DETAIL
129+
end
67130

131+
def test_xml_declaration_unclosed_content_missing_space_after_version
132+
exception = assert_raise(REXML::ParseException) do
133+
parser = REXML::Parsers::BaseParser.new('<?xml version="1.0"encoding="UTF-8"?>')
134+
while parser.has_next?
135+
parser.pull
136+
end
137+
end
138+
139+
assert_equal(<<~DETAIL.chomp, exception.to_s)
140+
Malformed XML: Unclosed XML declaration
141+
Line: 1
142+
Position: 37
143+
Last 80 unconsumed characters:
144+
encoding="UTF-8"?>
145+
DETAIL
146+
end
147+
148+
def test_xml_declaration_unclosed_content_missing_space_after_encoding
149+
exception = assert_raise(REXML::ParseException) do
150+
parser = REXML::Parsers::BaseParser.new('<?xml version="1.0" encoding="UTF-8"standalone="no"?>')
151+
while parser.has_next?
152+
parser.pull
153+
end
154+
end
155+
156+
assert_equal(<<~DETAIL.chomp, exception.to_s)
157+
Malformed XML: Unclosed XML declaration
158+
Line: 1
159+
Position: 53
160+
Last 80 unconsumed characters:
161+
standalone="no"?>
162+
DETAIL
163+
end
164+
165+
def test_xml_declaration_unclosed_content_with_unknown_attributes
166+
exception = assert_raise(REXML::ParseException) do
167+
parser = REXML::Parsers::BaseParser.new('<?xml version="1.0" test="no"?>')
168+
while parser.has_next?
169+
parser.pull
170+
end
171+
end
172+
173+
assert_equal(<<~DETAIL.chomp, exception.to_s)
174+
Malformed XML: Unclosed XML declaration
175+
Line: 1
176+
Position: 31
177+
Last 80 unconsumed characters:
178+
test="no"?>
179+
DETAIL
180+
end
181+
182+
def test_xml_declaration_standalone_no_yes_or_no
183+
exception = assert_raise(REXML::ParseException) do
184+
parse('<?xml version="1.0" standalone="YES"?>')
185+
end
186+
assert_equal(<<-DETAIL.chomp, exception.to_s)
187+
Malformed XML: XML declaration standalone is not yes or no : <YES>
188+
Line: 1
189+
Position: 38
190+
Last 80 unconsumed characters:
191+
?>
68192
DETAIL
69193
end
70194
end
@@ -113,7 +237,7 @@ def test_content_question
113237
def test_linear_performance_gt
114238
seq = [10000, 50000, 100000, 150000, 200000]
115239
assert_linear_performance(seq, rehearsal: 10) do |n|
116-
REXML::Document.new("<?xml version=\"1.0\" " + ">" * n + " ?>")
240+
REXML::Document.new("<?name content " + ">" * n + " ?>")
117241
end
118242
end
119243

test/test_xml_declaration.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ module REXMLTests
77
class TestXmlDeclaration < Test::Unit::TestCase
88
def setup
99
xml = <<~XML
10-
<?xml encoding= 'UTF-8' standalone='yes'?>
10+
<?xml version='1.0' encoding= 'UTF-8' standalone='yes'?>
1111
<root>
1212
</root>
1313
XML

0 commit comments

Comments
 (0)