Skip to content

Commit 7e2b81c

Browse files
committed
Add Source#skip_spaces method
## Why? In the case of `@source.match?(/\s+/um, true)`, I want to stop reading immediately if there are no spaces at the beginning. Instead, it continues to read the buffer until it finds a match, and in that case it never finds one. As a result, it continues reading until the end of the file. In the case of large XML files, `drop_parsed_content` occurs frequently until the buffer is cleared, which may affect performance.
1 parent c87bda8 commit 7e2b81c

File tree

3 files changed

+18
-13
lines changed

3 files changed

+18
-13
lines changed

lib/rexml/parsers/baseparser.rb

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def pull_event
280280
return [ :comment, process_comment ]
281281
elsif @source.match?("DOCTYPE", true)
282282
base_error_message = "Malformed DOCTYPE"
283-
unless @source.match?(/\s+/um, true)
283+
unless @source.skip_spaces
284284
if @source.match?(">")
285285
message = "#{base_error_message}: name is missing"
286286
else
@@ -290,7 +290,7 @@ def pull_event
290290
raise REXML::ParseException.new(message, @source)
291291
end
292292
name = parse_name(base_error_message)
293-
@source.match?(/\s*/um, true) # skip spaces
293+
@source.skip_spaces
294294
if @source.match?("[", true)
295295
id = [nil, nil, nil]
296296
@document_status = :in_doctype
@@ -306,7 +306,7 @@ def pull_event
306306
# For backward compatibility
307307
id[1], id[2] = id[2], nil
308308
end
309-
@source.match?(/\s*/um, true) # skip spaces
309+
@source.skip_spaces
310310
if @source.match?("[", true)
311311
@document_status = :in_doctype
312312
elsif @source.match?(">", true)
@@ -319,7 +319,7 @@ def pull_event
319319
end
320320
args = [:start_doctype, name, *id]
321321
if @document_status == :after_doctype
322-
@source.match?(/\s*/um, true)
322+
@source.skip_spaces
323323
@stack << [ :end_doctype ]
324324
end
325325
return args
@@ -330,7 +330,7 @@ def pull_event
330330
end
331331
end
332332
if @document_status == :in_doctype
333-
@source.match?(/\s*/um, true) # skip spaces
333+
@source.skip_spaces
334334
start_position = @source.position
335335
if @source.match?("<!", true)
336336
if @source.match?("ELEMENT", true)
@@ -391,7 +391,7 @@ def pull_event
391391
return [ :attlistdecl, element, pairs, contents ]
392392
elsif @source.match?("NOTATION", true)
393393
base_error_message = "Malformed notation declaration"
394-
unless @source.match?(/\s+/um, true)
394+
unless @source.skip_spaces
395395
if @source.match?(">")
396396
message = "#{base_error_message}: name is missing"
397397
else
@@ -404,7 +404,7 @@ def pull_event
404404
id = parse_id(base_error_message,
405405
accept_external_id: true,
406406
accept_public_id: true)
407-
@source.match?(/\s*/um, true) # skip spaces
407+
@source.skip_spaces
408408
unless @source.match?(">", true)
409409
message = "#{base_error_message}: garbage before end >"
410410
raise REXML::ParseException.new(message, @source)
@@ -425,7 +425,7 @@ def pull_event
425425
end
426426
end
427427
if @document_status == :after_doctype
428-
@source.match?(/\s*/um, true)
428+
@source.skip_spaces
429429
end
430430
begin
431431
start_position = @source.position
@@ -735,7 +735,7 @@ def process_comment
735735

736736
def process_instruction
737737
name = parse_name("Malformed XML: Invalid processing instruction node")
738-
if @source.match?(/\s+/um, true)
738+
if @source.skip_spaces
739739
match_data = @source.match(/(.*?)\?>/um, true)
740740
unless match_data
741741
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
@@ -817,7 +817,7 @@ def parse_attributes(prefixes)
817817
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
818818
raise REXML::ParseException.new(message, @source)
819819
end
820-
@source.match?(/\s*/um, true)
820+
@source.skip_spaces
821821
if prefix == "xmlns"
822822
if local_part == "xml"
823823
if value != Private::XML_PREFIXED_NAMESPACE

lib/rexml/source.rb

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ class Source
6565
attr_reader :encoding
6666

6767
module Private
68+
SPACES_PATTERN = /\s+/um
6869
SCANNER_RESET_SIZE = 100000
6970
PRE_DEFINED_TERM_PATTERNS = {}
7071
pre_defined_terms = ["'", '"', "<", "]]>"]
@@ -150,6 +151,10 @@ def match?(pattern, cons=false)
150151
end
151152
end
152153

154+
def skip_spaces
155+
@scanner.skip(Private::SPACES_PATTERN) ? true : false
156+
end
157+
153158
def position
154159
@scanner.pos
155160
end

test/parse/test_document_type_declaration.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,10 @@ def test_no_name
4949
end
5050
assert_equal(<<-DETAIL.chomp, exception.to_s)
5151
Malformed DOCTYPE: name is missing
52-
Line: 3
53-
Position: 17
52+
Line: 1
53+
Position: 10
5454
Last 80 unconsumed characters:
55-
<!DOCTYPE> <r/>
55+
<!DOCTYPE>
5656
DETAIL
5757
end
5858
end

0 commit comments

Comments
 (0)