From 70c8a7e7f0b6c27a5bba99bcc501882163f5ecd2 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 26 Jan 2025 09:41:21 +0900 Subject: [PATCH] Reduce regular expression processing by skipping whitespace first ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.4.1/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +PRISM [arm64-darwin24] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 19.849 20.109 36.064 38.655 i/s - 100.000 times in 5.038102s 4.972864s 2.772838s 2.586981s sax 30.339 30.449 52.946 54.873 i/s - 100.000 times in 3.296102s 3.284176s 1.888722s 1.822391s pull 34.785 34.916 65.808 65.219 i/s - 100.000 times in 2.874810s 2.863976s 1.519581s 1.533305s stream 34.766 34.921 61.920 63.277 i/s - 100.000 times in 2.876359s 2.863571s 1.615000s 1.580354s Comparison: dom after(YJIT): 38.7 i/s before(YJIT): 36.1 i/s - 1.07x slower after: 20.1 i/s - 1.92x slower before: 19.8 i/s - 1.95x slower sax after(YJIT): 54.9 i/s before(YJIT): 52.9 i/s - 1.04x slower after: 30.4 i/s - 1.80x slower before: 30.3 i/s - 1.81x slower pull before(YJIT): 65.8 i/s after(YJIT): 65.2 i/s - 1.01x slower after: 34.9 i/s - 1.88x slower before: 34.8 i/s - 1.89x slower stream after(YJIT): 63.3 i/s before(YJIT): 61.9 i/s - 1.02x slower after: 34.9 i/s - 1.81x slower before: 34.8 i/s - 1.82x slower ``` - YJIT=ON : 0.99x - 1.07x faster - YJIT=OFF : 1.00x - 1.01x faster --- lib/rexml/parsers/baseparser.rb | 13 ++++++++----- test/parse/test_document_type_declaration.rb | 10 +++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 87f50f09..44aacfa2 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -297,10 +297,11 @@ def pull_event raise REXML::ParseException.new(message, @source) end name = 
parse_name(base_error_message) - if @source.match?(/\s*\[/um, true) + @source.match?(/\s*/um, true) # skip spaces + if @source.match?("[", true) id = [nil, nil, nil] @document_status = :in_doctype - elsif @source.match?(/\s*>/um, true) + elsif @source.match?(">", true) id = [nil, nil, nil] @document_status = :after_doctype @source.ensure_buffer @@ -312,9 +313,10 @@ def pull_event # For backward compatibility id[1], id[2] = id[2], nil end - if @source.match?(/\s*\[/um, true) + @source.match?(/\s*/um, true) # skip spaces + if @source.match?("[", true) @document_status = :in_doctype - elsif @source.match?(/\s*>/um, true) + elsif @source.match?(">", true) @document_status = :after_doctype @source.ensure_buffer else @@ -409,7 +411,8 @@ def pull_event id = parse_id(base_error_message, accept_external_id: true, accept_public_id: true) - unless @source.match?(/\s*>/um, true) + @source.match?(/\s*/um, true) # skip spaces + unless @source.match?(">", true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 99c23745..b22863a9 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -153,7 +153,7 @@ def test_no_literal Line: 3 Position: 26 Last 80 unconsumed characters: - SYSTEM> +SYSTEM> DETAIL end @@ -200,7 +200,7 @@ def test_content_double_quote Line: 3 Position: 62 Last 80 unconsumed characters: - PUBLIC 'double quote " is invalid' "r.dtd"> +PUBLIC 'double quote " is invalid' "r.dtd"> DETAIL end @@ -228,10 +228,10 @@ def test_garbage_after_literal end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed DOCTYPE: garbage after external ID -Line: 3 -Position: 65 +Line: 1 +Position: 58 Last 80 unconsumed characters: -x'> +x'> DETAIL end