diff --git a/CHANGES.md b/CHANGES.md index a34c124c2b..d3b8220fb6 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,7 @@ ### Bug Fixes * Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459) +* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Also clarified in documentation the which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472) ## 1.22.1 (2026-Jan-01) diff --git a/src/main/java/org/jsoup/select/NodeTraversor.java b/src/main/java/org/jsoup/select/NodeTraversor.java index 763be08d2e..b29afd0941 100644 --- a/src/main/java/org/jsoup/select/NodeTraversor.java +++ b/src/main/java/org/jsoup/select/NodeTraversor.java @@ -9,11 +9,18 @@ A depth-first node traversor. Use to walk through all nodes under and including the specified root node, in document order. The {@link NodeVisitor#head(Node, int)} and {@link NodeVisitor#tail(Node, int)} methods will be called for each node. -
During traversal, structural changes to nodes are supported (e.g. {{@link Node#replaceWith(Node)}, - {@link Node#remove()}} -
+During the head() visit, DOM structural changes around the node currently being visited are
+ supported, including e.g. {@link Node#replaceWith(Node)} and {@link Node#remove()}. See
+ {@link NodeVisitor#head(Node, int) head()} for the traversal contract after mutation. Other non-structural node
+ changes are also supported.
DOM structural changes to the current node are not supported during the tail() visit.
This interface provides two methods, {@link #head} and {@link #tail}. The head method is called when a node is first seen, and the tail method when all that node's children have been visited. As an example, {@code head} can be used to - emit a start tag for a node, and {@code tail} to create the end tag. The {@code tail} method defaults to a no-op, so - the {@code head} method is the {@link FunctionalInterface}. + emit a start tag for a node, and {@code tail} to emit the end tag. The {@code tail} method defaults to a no-op, so + this interface can be used as a {@link FunctionalInterface}, with {@code head} as its single abstract method.
Example:
@@ -27,9 +27,19 @@
public interface NodeVisitor {
/**
Callback for when a node is first visited.
- The node may be modified (e.g. {@link Node#attr(String)}, replaced {@link Node#replaceWith(Node)}) or removed
- {@link Node#remove()}. If it's {@code instanceOf Element}, you may cast it to an {@link Element} and access those
- methods.
+ The node may be modified (for example via {@link Node#attr(String)}), removed with
+ {@link Node#remove()}, or replaced with {@link Node#replaceWith(Node)}. If the node is an
+ {@link Element}, you may cast it and access those methods.
+ Traversal uses a forward cursor. After {@code head()} completes:
+
+ - If the current node is still attached, traversal continues into its current children and then its following
+ siblings. Nodes inserted before the current node are not visited.
+ - If the current node was detached and another node now occupies its former sibling position, the node now at
+ that position is not passed to {@code head()} again. Traversal continues from there: its children are visited,
+ then the node is passed to {@link #tail(Node, int)}, then later siblings are visited.
+ - If the current node was detached and no node occupies its former sibling position, the current node is not
+ passed to {@code tail()}, and traversal resumes at the node that originally followed it.
+
@param node the node being visited.
@param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
@@ -39,9 +49,10 @@ The node may be modified (e.g. {@link Node#attr(String)}, replaced {@link Nod
/**
Callback for when a node is last visited, after all of its descendants have been visited.
-
This method has a default no-op implementation.
- Note that neither replacement with {@link Node#replaceWith(Node)} nor removal with {@link Node#remove()} is
- supported during {@code tail()}.
+
This method defaults to a no-op.
+ The node passed to {@code tail()} is the node at the current traversal position when the subtree completes.
+ If {@code head()} replaced the original node, this may be the replacement node instead.
+ Structural changes to the current node are not supported during {@code tail()}.
@param node the node being visited.
@param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
diff --git a/src/test/java/org/jsoup/select/TraversorTest.java b/src/test/java/org/jsoup/select/TraversorTest.java
index 9f57a8d757..2ff1f18f53 100644
--- a/src/test/java/org/jsoup/select/TraversorTest.java
+++ b/src/test/java/org/jsoup/select/TraversorTest.java
@@ -8,6 +8,7 @@
import org.jsoup.nodes.TextNode;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
import org.junit.jupiter.params.provider.ValueSource;
import java.util.HashSet;
@@ -200,6 +201,177 @@ else if (node instanceof TextNode && ((TextNode) node).text().equals("Three"))
assertEquals("Two
", TextUtil.stripNewlines(doc.body().html()));
}
+ @Test
+ void replacementInHeadCanTailReplacementAndVisitChildren() {
+ // if head() replaces the current node, the replacement should be tailed and its children still traversed
+ Document doc = Jsoup.parseBodyFragment("two");
+ Element div = doc.expectFirst("div");
+ StringBuilder headOrder = new StringBuilder();
+ StringBuilder tailOrder = new StringBuilder();
+
+ NodeTraversor.traverse(new NodeVisitor() {
+ @Override
+ public void head(Node node, int depth) {
+ trackSeen(node, headOrder);
+ if ("i".equals(node.nodeName())) {
+ Element replacement = new Element("u").insertChildren(0, node.childNodes());
+ node.replaceWith(replacement);
+ }
+ }
+
+ @Override
+ public void tail(Node node, int depth) {
+ trackSeen(node, tailOrder);
+ }
+ }, div);
+
+ assertEquals("div;i;two;", headOrder.toString());
+ assertEquals("two;u;div;", tailOrder.toString());
+ assertEquals("two", TextUtil.stripNewlines(doc.body().html()));
+ }
+
+ @ParameterizedTest
+ @EnumSource(CurrentPositionMutation.class)
+ void supportsCurrentNodeMutationDuringHead(CurrentPositionMutation mutation) {
+ // supported head() mutations at the current cursor should terminate cleanly and continue traversal in order
+ // https://github.com/jhy/jsoup/issues/2472
+ Document doc = Jsoup.parse("One
aTwo");
+ AtomicInteger visits = new AtomicInteger();
+ StringBuilder headOrder = new StringBuilder();
+ StringBuilder tailOrder = new StringBuilder();
+
+ NodeTraversor.traverse(new NodeVisitor() {
+ @Override public void head(Node node, int depth) {
+ if (visits.incrementAndGet() > MaxHeadVisits)
+ fail(String.format("Likely loop when applying %s in head()", mutation));
+ trackSeen(node, headOrder);
+ if (node instanceof TextNode && ((TextNode) node).text().equals("a"))
+ mutation.apply(node);
+ }
+
+ @Override public void tail(Node node, int depth) {
+ trackSeen(node, tailOrder);
+ }
+ }, doc);
+
+ assertEquals("#root;html;head;body;div;p;One;a;em;Two;", headOrder.toString());
+ assertEquals(mutation.expectedTailOrder, tailOrder.toString());
+ assertEquals(mutation.expectedHtml, TextUtil.stripNewlines(doc.body().html()));
+ }
+
+ private static final int MaxHeadVisits = 20;
+
+ private enum CurrentPositionMutation {
+ Remove("head;One;p;Two;em;div;body;html;#root;", "One
Two") {
+ @Override void apply(Node node) {
+ node.remove();
+ }
+ },
+ Replace("head;One;p;b;Two;em;div;body;html;#root;", "One
bTwo") {
+ @Override void apply(Node node) {
+ node.replaceWith(new TextNode("b"));
+ }
+ },
+ BeforeRemove("head;One;p;b;Two;em;div;body;html;#root;", "One
bTwo") {
+ @Override void apply(Node node) {
+ node.before(new TextNode("b"));
+ node.remove();
+ }
+ },
+ AfterRemove("head;One;p;b;Two;em;div;body;html;#root;", "One
bTwo") {
+ @Override void apply(Node node) {
+ node.after(new TextNode("b"));
+ node.remove();
+ }
+ };
+
+ final String expectedTailOrder;
+ final String expectedHtml;
+
+ CurrentPositionMutation(String expectedTailOrder, String expectedHtml) {
+ this.expectedTailOrder = expectedTailOrder;
+ this.expectedHtml = expectedHtml;
+ }
+
+ abstract void apply(Node node);
+ }
+
+ @ParameterizedTest
+ @EnumSource(SiblingInsertion.class)
+ void siblingInsertionsOnlyVisitFutureNodesDuringHead(SiblingInsertion insertion) {
+ // nodes inserted before the current cursor are not visited; nodes inserted after it are still ahead and are
+ Document doc = Jsoup.parseBodyFragment("a");
+ StringBuilder headTexts = new StringBuilder();
+ StringBuilder tailTexts = new StringBuilder();
+
+ NodeTraversor.traverse(new NodeVisitor() {
+ @Override public void head(Node node, int depth) {
+ if (node instanceof TextNode) {
+ trackSeen(node, headTexts);
+ insertion.apply((TextNode) node);
+ }
+ }
+
+ @Override public void tail(Node node, int depth) {
+ if (node instanceof TextNode)
+ trackSeen(node, tailTexts);
+ }
+ }, doc);
+
+ assertEquals(insertion.expectedTexts, headTexts.toString());
+ assertEquals(insertion.expectedTexts, tailTexts.toString());
+ assertEquals(insertion.expectedHtml, TextUtil.stripNewlines(doc.body().html()));
+ }
+
+ private enum SiblingInsertion {
+ Before("a;", "ba") {
+ @Override void apply(TextNode node) {
+ if (node.text().equals("a"))
+ node.before(new TextNode("b"));
+ }
+ },
+ After("a;b;", "ab") {
+ @Override void apply(TextNode node) {
+ if (node.text().equals("a"))
+ node.after(new TextNode("b"));
+ }
+ };
+
+ final String expectedTexts;
+ final String expectedHtml;
+
+ SiblingInsertion(String expectedTexts, String expectedHtml) {
+ this.expectedTexts = expectedTexts;
+ this.expectedHtml = expectedHtml;
+ }
+
+ abstract void apply(TextNode node);
+ }
+
+ @Test
+ void visitsChildrenInsertedInHead() {
+ // when the current node remains attached, children inserted in head() are visited in the same traversal
+ Document doc = Jsoup.parseBodyFragment("");
+ StringBuilder headOrder = new StringBuilder();
+ StringBuilder tailOrder = new StringBuilder();
+
+ NodeTraversor.traverse(new NodeVisitor() {
+ @Override public void head(Node node, int depth) {
+ trackSeen(node, headOrder);
+ if (node instanceof Element && node.nameIs("p"))
+ ((Element) node).append("child");
+ }
+
+ @Override public void tail(Node node, int depth) {
+ trackSeen(node, tailOrder);
+ }
+ }, doc.body());
+
+ assertEquals("body;div;p;span;child;", headOrder.toString());
+ assertEquals("child;span;p;div;body;", tailOrder.toString());
+ assertEquals("child
", TextUtil.stripNewlines(doc.body().html()));
+ }
+
@Test void elementFunctionalTraverse() {
Document doc = Jsoup.parse("1
2
3");
Element body = doc.body();