diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index a87657b5..9304e96d 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -144,6 +144,7 @@ module Private PEREFERENCE_PATTERN = /#{PEREFERENCE}/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um + EQUAL_PATTERN = /\s*=\s*/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um NAME_PATTERN = /#{NAME}/um GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" @@ -168,6 +169,7 @@ def initialize( source ) @entity_expansion_limit = Security.entity_expansion_limit @entity_expansion_text_limit = Security.entity_expansion_text_limit @source.ensure_buffer + @version = nil end def add_listener( listener ) @@ -280,7 +282,7 @@ def pull_event return [ :comment, process_comment ] elsif @source.match?("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" - unless @source.match?(/\s+/um, true) + unless @source.skip_spaces if @source.match?(">") message = "#{base_error_message}: name is missing" else @@ -290,7 +292,7 @@ def pull_event raise REXML::ParseException.new(message, @source) end name = parse_name(base_error_message) - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces if @source.match?("[", true) id = [nil, nil, nil] @document_status = :in_doctype @@ -306,7 +308,7 @@ def pull_event # For backward compatibility id[1], id[2] = id[2], nil end - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces if @source.match?("[", true) @document_status = :in_doctype elsif @source.match?(">", true) @@ -319,7 +321,7 @@ def pull_event end args = [:start_doctype, name, *id] if @document_status == :after_doctype - @source.match?(/\s*/um, true) + @source.skip_spaces @stack << [ :end_doctype ] end return args @@ -330,7 +332,7 @@ def pull_event end end if @document_status == :in_doctype - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces start_position = @source.position if @source.match?("") message = "#{base_error_message}: name is missing" else @@ -404,7 +406,7 @@ def pull_event id = parse_id(base_error_message, accept_external_id: true, accept_public_id: true) - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces unless @source.match?(">", true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) @@ -425,7 +427,7 @@ def pull_event end end if @document_status == :after_doctype - @source.match?(/\s*/um, true) + @source.skip_spaces end begin start_position = @source.position @@ -642,6 +644,10 @@ def need_source_encoding_update?(xml_declaration_encoding) true end + def normalize_xml_declaration_encoding(xml_declaration_encoding) + /\AUTF-16(?:BE|LE)\z/i.match?(xml_declaration_encoding) ? "UTF-16" : nil + end + def parse_name(base_error_message) md = @source.match(Private::NAME_PATTERN, true) unless md @@ -735,37 +741,85 @@ def process_comment def process_instruction name = parse_name("Malformed XML: Invalid processing instruction node") - if @source.match?(/\s+/um, true) - match_data = @source.match(/(.*?)\?>/um, true) - unless match_data - raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + if name == "xml" + xml_declaration + else # PITarget + if @source.skip_spaces # e.g. + start_position = @source.position + content = @source.read_until("?>") + unless content.chomp!("?>") + @source.position = start_position + raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source) + end + else # e.g. + content = nil + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source) + end end - content = match_data[1] - else - content = nil + [:processing_instruction, name, content] + end + end + + def xml_declaration + unless @version.nil? + raise ParseException.new("Malformed XML: XML declaration is duplicated", @source) + end + if @document_status + raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) + end + unless @source.skip_spaces + raise ParseException.new("Malformed XML: XML declaration misses spaces before version", @source) + end + unless @source.match?("version", true) + raise ParseException.new("Malformed XML: XML declaration misses version", @source) + end + @version = parse_attribute_value_with_equal("xml") + unless @source.skip_spaces unless @source.match?("?>", true) - raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) end + encoding = normalize_xml_declaration_encoding(@source.encoding) + return [ :xmldecl, @version, encoding, nil ] # e.g. end - if name == "xml" - if @document_status - raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) - end - version = VERSION.match(content) - version = version[1] unless version.nil? - encoding = ENCODING.match(content) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding + + if @source.match?("encoding", true) + encoding = parse_attribute_value_with_equal("xml") + unless @source.skip_spaces + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) + end + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + encoding ||= normalize_xml_declaration_encoding(@source.encoding) + return [ :xmldecl, @version, encoding, nil ] # e.g. end - if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" + end + + if @source.match?("standalone", true) + standalone = parse_attribute_value_with_equal("xml") + case standalone + when "yes", "no" + else + raise ParseException.new("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>", @source) end - standalone = STANDALONE.match(content) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] end - [:processing_instruction, name, content] + @source.skip_spaces + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) + end + + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + encoding ||= normalize_xml_declaration_encoding(@source.encoding) + + # e.g. + # + # + # + [ :xmldecl, @version, encoding, standalone ] end if StringScanner::Version < "3.1.1" @@ -787,6 +841,25 @@ def scan_quote end end + def parse_attribute_value_with_equal(name) + unless @source.match?(Private::EQUAL_PATTERN, true) + message = "Missing attribute equal: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + unless quote = scan_quote + message = "Missing attribute value start quote: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + start_position = @source.position + value = @source.read_until(quote) + unless value.chomp!(quote) + @source.position = start_position + message = "Missing attribute value end quote: <#{name}>: <#{quote}>" + raise REXML::ParseException.new(message, @source) + end + value + end + def parse_attributes(prefixes) attributes = {} expanded_names = {} @@ -801,23 +874,8 @@ def parse_attributes(prefixes) name = match[1] prefix = match[2] local_part = match[3] - - unless @source.match?(/\s*=\s*/um, true) - message = "Missing attribute equal: <#{name}>" - raise REXML::ParseException.new(message, @source) - end - unless quote = scan_quote - message = "Missing attribute value start quote: <#{name}>" - raise REXML::ParseException.new(message, @source) - end - start_position = @source.position - value = @source.read_until(quote) - unless value.chomp!(quote) - @source.position = start_position - message = "Missing attribute value end quote: <#{name}>: <#{quote}>" - raise REXML::ParseException.new(message, @source) - end - @source.match?(/\s*/um, true) + value = parse_attribute_value_with_equal(name) + @source.skip_spaces if prefix == "xmlns" if local_part == "xml" if value != Private::XML_PREFIXED_NAMESPACE diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 3ec1141e..99500072 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -65,9 +65,10 @@ class Source attr_reader :encoding module Private + SPACES_PATTERN = /\s+/um SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} - pre_defined_terms = ["'", '"', "<", "]]>"] + pre_defined_terms = ["'", '"', "<", "]]>", "?>"] if StringScanner::Version < "3.1.1" pre_defined_terms.each do |term| PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ @@ -150,6 +151,10 @@ def match?(pattern, cons=false) end end + def skip_spaces + @scanner.skip(Private::SPACES_PATTERN) ? true : false + end + def position @scanner.pos end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index b22863a9..d4658b9e 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -49,10 +49,10 @@ def test_no_name end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed DOCTYPE: name is missing -Line: 3 -Position: 17 +Line: 1 +Position: 10 Last 80 unconsumed characters: - + DETAIL end end diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index ba381dc4..70d17747 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -30,7 +30,7 @@ def test_unclosed_content parse(" Line: 1 Position: 14 Last 80 unconsumed characters: @@ -43,7 +43,7 @@ def test_unclosed_no_content parse(" Line: 1 Position: 6 Last 80 unconsumed characters: @@ -51,6 +51,19 @@ def test_unclosed_no_content DETAIL end + def test_xml_declaration_duplicated + exception = assert_raise(REXML::ParseException) do + parse('') + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed XML: XML declaration is duplicated +Line: 1 +Position: 42 +Last 80 unconsumed characters: + version="1.0"?> + DETAIL + end + def test_xml_declaration_not_at_document_start exception = assert_raise(REXML::ParseException) do parser = REXML::Parsers::BaseParser.new('') @@ -64,7 +77,118 @@ def test_xml_declaration_not_at_document_start Line: 1 Position: 25 Last 80 unconsumed characters: + version="1.0" ?> + DETAIL + end + + def test_xml_declaration_missing_spaces + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration misses spaces before version + Line: 1 + Position: 7 + Last 80 unconsumed characters: + ?> + DETAIL + end + + def test_xml_declaration_missing_version + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration misses version + Line: 1 + Position: 8 + Last 80 unconsumed characters: + ?> + DETAIL + end + + def test_xml_declaration_unclosed_content + exception = assert_raise(REXML::ParseException) do + parse('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 37 + Last 80 unconsumed characters: + encoding="UTF-8"?> + DETAIL + end + + def test_xml_declaration_unclosed_content_missing_space_after_encoding + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 53 + Last 80 unconsumed characters: + standalone="no"?> + DETAIL + end + + def test_xml_declaration_unclosed_content_with_unknown_attributes + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 31 + Last 80 unconsumed characters: + test="no"?> + DETAIL + end + + def test_xml_declaration_standalone_no_yes_or_no + exception = assert_raise(REXML::ParseException) do + parse('') + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed XML: XML declaration standalone is not yes or no : +Line: 1 +Position: 38 +Last 80 unconsumed characters: +?> DETAIL end end @@ -113,7 +237,7 @@ def test_content_question def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new("" * n + " ?>") + REXML::Document.new("" * n + " ?>") end end diff --git a/test/test_xml_declaration.rb b/test/test_xml_declaration.rb index 6a1f4df0..4503a90e 100644 --- a/test/test_xml_declaration.rb +++ b/test/test_xml_declaration.rb @@ -7,7 +7,7 @@ module REXMLTests class TestXmlDeclaration < Test::Unit::TestCase def setup xml = <<~XML - + XML