Don't extract ERB comments as Ruby comments in herb_extract_ruby (#98)

marcoroth · web-flow · commit 9ee04ae46b25 · 2025-05-03T19:14:08.000+09:00
This pull request updates the `herb_extract_ruby` and `herb_extract_ruby_to_buffer_with_semicolons` functions in `extract.c` so that they no longer extract the content of ERB Comment Nodes (`<%#`) as Ruby code.

Previously, a source file like this:

```html+erb
<%# This is a comment %>
<h1><%= title %></h1>
```

was extracted to Ruby code as:

```ruby
# This is a comment
title
```

With the changes included in this pull request, it is now extracted as:

```ruby
title
```

This resolves #91.

It's valid to have multi-line ERB Comments like:

```html+erb
<%#
  This is
  a comment
  over multiple
  lines
%>
```

Before this pull request, that was extracted to Ruby as:

```ruby
# This is
a comment
over multiple
lines
```

Only the first line of that output is a valid Ruby comment; everything from the second line on is treated as actual Ruby code. If the comment itself included Ruby keywords, it would cause syntax errors.

For now, we don't extract the ERB Comments at all - which is the change this pull request introduces.

In the future, we can implement #100 (and/or #102) and also make sure that multi-line ERB Comments get extracted to multi-line Ruby Comments, like:

```ruby
# This is
# a comment
# over multiple
# lines
```

or maybe even cleverer: replace the `<%#` with a `=begin` and the `%>` with a `=end`:

```ruby
=begin
This is
a comment
over multiple
lines
=end
```

Another case where the old behavior would break Ruby syntax is this example:

```html+erb
<% if true %><%# Comment here %><% end %>
```

Here the extracted comment would comment out the `end` as well:

```ruby
if true # Comment here end
```

This use-case is also fixed with this pull request, since we just skip over the content of ERB Comments:

```ruby
if true end
```

This last example could be solved even more elegantly if Ruby shipped the Inline Comments feature: https://bugs.ruby-lang.org/issues/20405

The following example is still broken, and this pull request does not address that use-case. I opened #101 for this.
```html+erb
<% if true %><% # Comment here %><% end %>
```

Currently, this pull request does not address the case where the comment is part of the Ruby code itself, because such a comment is not seen as an "ERB Comment Node".
diff --git a/src/extract.c b/src/extract.c
@@ -8,58 +8,89 @@
 
 void herb_extract_ruby_to_buffer_with_semicolons(const char* source, buffer_T* output) {
   const array_T* tokens = herb_lex(source);
+  bool skip_erb_content = false;
 
   for (size_t i = 0; i < array_size(tokens); i++) {
     const token_T* token = array_get(tokens, i);
 
     switch (token->type) {
-      case TOKEN_NEWLINE:
-      case TOKEN_ERB_CONTENT: buffer_append(output, token->value); break;
-      case TOKEN_ERB_END: {
-        buffer_append_char(output, ';');
-        buffer_append_whitespace(output, range_length(token->range) - 1);
+      case TOKEN_NEWLINE: {
+        buffer_append(output, token->value);
         break;
       }
 
       case TOKEN_ERB_START: {
-        if (strcmp(token->value, "<%#") == 0) {
-          buffer_append_char(output, ' ');
-          buffer_append_char(output, ' ');
-          buffer_append_char(output, '#');
+        if (strcmp(token->value, "<%#") == 0) { skip_erb_content = true; }
+
+        buffer_append_whitespace(output, range_length(token->range));
+        break;
+      }
+
+      case TOKEN_ERB_CONTENT: {
+        if (skip_erb_content == false) {
+          buffer_append(output, token->value);
         } else {
           buffer_append_whitespace(output, range_length(token->range));
         }
 
         break;
       }
 
-      default: buffer_append_whitespace(output, range_length(token->range));
+      case TOKEN_ERB_END: {
+        skip_erb_content = false;
+
+        buffer_append_char(output, ';');
+        buffer_append_whitespace(output, range_length(token->range) - 1);
+        break;
+      }
+
+      default: {
+        buffer_append_whitespace(output, range_length(token->range));
+      }
     }
   }
 }
 
 void herb_extract_ruby_to_buffer(const char* source, buffer_T* output) {
   const array_T* tokens = herb_lex(source);
+  bool skip_erb_content = false;
 
   for (size_t i = 0; i < array_size(tokens); i++) {
     const token_T* token = array_get(tokens, i);
 
     switch (token->type) {
-      case TOKEN_NEWLINE:
-      case TOKEN_ERB_CONTENT: buffer_append(output, token->value); break;
+      case TOKEN_NEWLINE: {
+        buffer_append(output, token->value);
+        break;
+      }
+
       case TOKEN_ERB_START: {
-        if (strcmp(token->value, "<%#") == 0) {
-          buffer_append_char(output, ' ');
-          buffer_append_char(output, ' ');
-          buffer_append_char(output, '#');
+        if (strcmp(token->value, "<%#") == 0) { skip_erb_content = true; }
+
+        buffer_append_whitespace(output, range_length(token->range));
+        break;
+      }
+
+      case TOKEN_ERB_CONTENT: {
+        if (skip_erb_content == false) {
+          buffer_append(output, token->value);
         } else {
           buffer_append_whitespace(output, range_length(token->range));
         }
 
         break;
       }
 
-      default: buffer_append_whitespace(output, range_length(token->range));
+      case TOKEN_ERB_END: {
+        skip_erb_content = false;
+
+        buffer_append_whitespace(output, range_length(token->range));
+        break;
+      }
+
+      default: {
+        buffer_append_whitespace(output, range_length(token->range));
+      }
     }
   }
 }
diff --git a/test/extractor/extract_ruby_test.rb b/test/extractor/extract_ruby_test.rb
@@ -47,7 +47,70 @@ class ExtractRubyTest < Minitest::Spec
         <%# comment ' %>
       HTML
 
-      expected = "  # comment '   \n"
+      expected = "                \n"
+
+      assert_equal expected, actual
+    end
+
+    test "erb comment with ruby keyword" do
+      actual = Herb.extract_ruby(<<~HTML)
+        <%# end %>
+      HTML
+
+      expected = "          \n"
+
+      assert_equal expected, actual
+    end
+
+    test "erb comment broken up over multiple lines" do
+      actual = Herb.extract_ruby(<<~HTML)
+        <%#
+          end
+        %>
+      HTML
+
+      expected = "            \n"
+
+      # TODO: it should also preserve the newlines in the ERB content
+      # expected = "\n  #\n  end\n  "
+
+      assert_equal expected, actual
+    end
+
+    test "multi-line erb comment" do
+      actual = Herb.extract_ruby(<<~HTML)
+        <%#
+          end
+          end
+          end
+          end
+        %>
+      HTML
+
+      expected = "                              \n"
+
+      # TODO: it should also preserve the newlines in the ERB content
+      # expected = "   \n     \n      \n     \n     \n  \n"
+
+      assert_equal expected, actual
+    end
+
+    test "erb if/end and comment on same line" do
+      actual = Herb.extract_ruby(<<~HTML)
+        <% if %><%# comment %><% end %>
+      HTML
+
+      expected = "   if                    end   \n"
+
+      assert_equal expected, actual
+    end
+
+    xtest "erb if/end and Ruby comment on same line" do
+      actual = Herb.extract_ruby(<<~HTML)
+        <% if %><% # comment %><% end %>
+      HTML
+
+      expected = "   if      # comment      end   \n"
 
       assert_equal expected, actual
     end
diff --git a/test/parser/erb_test.rb b/test/parser/erb_test.rb
@@ -65,5 +65,21 @@ class ERBTest < Minitest::Spec
     test "comment" do
       assert_parsed_snapshot(%(<%# comment with a single qutote(') and double quote (") %>))
     end
+
+    test "multi-line comment" do
+      assert_parsed_snapshot(<<~HTML)
+        <%#
+          comment
+        %>
+      HTML
+    end
+
+    test "multi-line comment with Ruby keyword" do
+      assert_parsed_snapshot(<<~HTML)
+        <%#
+          end
+        %>
+      HTML
+    end
   end
 end
diff --git a/test/snapshots/parser/erb_test/test_0016_multi-line_comment_14c1f84ba7b4627b3e0dc3ba74179bb8.txt b/test/snapshots/parser/erb_test/test_0016_multi-line_comment_14c1f84ba7b4627b3e0dc3ba74179bb8.txt
@@ -0,0 +1,13 @@
+@ DocumentNode (location: (1:0)-(2:0))
+└── children: (2 items)
+    ├── @ ERBContentNode (location: (1:0)-(1:14))
+    │   ├── tag_opening: "<%#" (location: (1:0)-(1:3))
+    │   ├── content: "
+    │     comment
+    │   " (location: (1:3)-(1:12))
+    │   ├── tag_closing: "%>" (location: (1:12)-(1:14))
+    │   ├── parsed: true
+    │   └── valid: true
+    │
+    └── @ HTMLTextNode (location: (1:14)-(2:0))
+        └── content: "\n"
diff --git a/test/snapshots/parser/erb_test/test_0017_multi-line_comment_with_Ruby_keyword_dd7885019a51280cc6cfef734bf1161a.txt b/test/snapshots/parser/erb_test/test_0017_multi-line_comment_with_Ruby_keyword_dd7885019a51280cc6cfef734bf1161a.txt
@@ -0,0 +1,13 @@
+@ DocumentNode (location: (1:0)-(2:0))
+└── children: (2 items)
+    ├── @ ERBContentNode (location: (1:0)-(1:10))
+    │   ├── tag_opening: "<%#" (location: (1:0)-(1:3))
+    │   ├── content: "
+    │     end
+    │   " (location: (1:3)-(1:8))
+    │   ├── tag_closing: "%>" (location: (1:8)-(1:10))
+    │   ├── parsed: true
+    │   └── valid: false
+    │
+    └── @ HTMLTextNode (location: (1:10)-(2:0))
+        └── content: "\n"