diff --git a/CHANGES.md b/CHANGES.md index a34c124c2b..d3b8220fb6 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,7 @@ ### Bug Fixes * Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459) +* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Also clarified in documentation the which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472) ## 1.22.1 (2026-Jan-01) diff --git a/src/main/java/org/jsoup/select/NodeTraversor.java b/src/main/java/org/jsoup/select/NodeTraversor.java index 763be08d2e..b29afd0941 100644 --- a/src/main/java/org/jsoup/select/NodeTraversor.java +++ b/src/main/java/org/jsoup/select/NodeTraversor.java @@ -9,11 +9,18 @@ A depth-first node traversor. Use to walk through all nodes under and including the specified root node, in document order. The {@link NodeVisitor#head(Node, int)} and {@link NodeVisitor#tail(Node, int)} methods will be called for each node. -

During traversal, structural changes to nodes are supported (e.g. {{@link Node#replaceWith(Node)}, - {@link Node#remove()}} -

+

During the head() visit, DOM structural changes around the node currently being visited are + supported, including e.g. {@link Node#replaceWith(Node)} and {@link Node#remove()}. See + {@link NodeVisitor#head(Node, int) head()} for the traversal contract after mutation. Other non-structural node + changes are also supported.

+

DOM structural changes to the current node are not supported during the tail() visit.

*/ public class NodeTraversor { + // cursor state + private static final byte VisitHead = 0; + private static final byte AfterHead = 1; + private static final byte VisitTail = 2; + /** Run a depth-first traverse of the root and all of its descendants. @param visitor Node visitor. @@ -25,56 +32,55 @@ public static void traverse(NodeVisitor visitor, Node root) { Validate.notNull(root); Node node = root; int depth = 0; + byte state = VisitHead; while (node != null) { - Node parent = node.parentNode(); // remember parent to find nodes that get replaced in .head - int origSize = parent != null ? parent.childNodeSize() : 0; - Node next = node.nextSibling(); + if (state == VisitHead) { + // snapshot the current cursor position so we can recover if head() structurally changes it: + Node parent = node.parentNode(); + Node nextSib = node.nextSibling(); + int sibIndex = parent != null ? node.siblingIndex() : 0; + int childCount = parent != null ? parent.childNodeSize() : 0; - visitor.head(node, depth); // visit current node + visitor.head(node, depth); - // check for modifications to the tree - if (parent != null && !node.hasParent()) { // removed or replaced - if (origSize == parent.childNodeSize()) { // replaced - node = parent.childNode(node.siblingIndex()); // replace ditches parent but keeps sibling index - continue; - } - // else, removed - node = next; - if (node == null) { - // was last in parent. need to walk up the tree, tail()ing on the way, until we find a suitable next. Otherwise, would revisit ancestor nodes. - node = parent; - while (true) { + // any structural changes? + if (parent != null && !node.hasParent()) { // node was removed from parent; try to recover by sibling index + if (parent.childNodeSize() == childCount) { // current slot is still occupied + node = parent.childNode(sibIndex); + state = AfterHead; // continue from that slot without re-heading it + } else if (nextSib != null) { // removed; resume from the original next + node = nextSib; + } else { // removed last child; tail the parent next + node = parent; depth--; - visitor.tail(node, depth); - if (node == root) break; - if (node.nextSibling() != null) { - node = node.nextSibling(); - break; - } - node = node.parentNode(); - if (node == null) break; + state = VisitTail; } - if (node == root || node == null) break; // done, break outer + } else { + state = AfterHead; } - continue; // don't tail removed + continue; // next loop handles the updated node/state } - if (node.childNodeSize() > 0) { // descend + if (state == AfterHead && node.childNodeSize() > 0) { // descend into current children node = node.childNode(0); depth++; - } else { - while (true) { - assert node != null; // as depth > 0, will have parent - if (!(node.nextSibling() == null && depth > 0)) break; - visitor.tail(node, depth); // when no more siblings, ascend - node = node.parentNode(); - depth--; - } - visitor.tail(node, depth); - if (node == root) - break; - node = node.nextSibling(); + state = VisitHead; + continue; + } + + visitor.tail(node, depth); + + if (node == root) break; // done + + Node next = node.nextSibling(); + if (next != null) { // traverse siblings + node = next; + state = VisitHead; + } else { // no siblings left, ascend + node = node.parentNode(); + depth--; + state = VisitTail; } } } diff --git a/src/main/java/org/jsoup/select/NodeVisitor.java b/src/main/java/org/jsoup/select/NodeVisitor.java index 0ea03deefe..b2375c49c6 100644 --- a/src/main/java/org/jsoup/select/NodeVisitor.java +++ b/src/main/java/org/jsoup/select/NodeVisitor.java @@ -4,13 +4,13 @@ import org.jsoup.nodes.Node; /** - Node visitor interface, used to walk the DOM, and visit each Node. Execute via {@link #traverse(Node)} or + Node visitor interface, used to walk the DOM and visit each node. Execute via {@link #traverse(Node)} or {@link Node#traverse(NodeVisitor)}. The traversal is depth-first.

This interface provides two methods, {@link #head} and {@link #tail}. The head method is called when a node is first seen, and the tail method when all that node's children have been visited. As an example, {@code head} can be used to - emit a start tag for a node, and {@code tail} to create the end tag. The {@code tail} method defaults to a no-op, so - the {@code head} method is the {@link FunctionalInterface}. + emit a start tag for a node, and {@code tail} to emit the end tag. The {@code tail} method defaults to a no-op, so + this interface can be used as a {@link FunctionalInterface}, with {@code head} as its single abstract method.

Example:


@@ -27,9 +27,19 @@
 public interface NodeVisitor {
     /**
      Callback for when a node is first visited.
-     

The node may be modified (e.g. {@link Node#attr(String)}, replaced {@link Node#replaceWith(Node)}) or removed - {@link Node#remove()}. If it's {@code instanceOf Element}, you may cast it to an {@link Element} and access those - methods.

+

The node may be modified (for example via {@link Node#attr(String)}), removed with + {@link Node#remove()}, or replaced with {@link Node#replaceWith(Node)}. If the node is an + {@link Element}, you may cast it and access those methods.

+

Traversal uses a forward cursor. After {@code head()} completes:

+ @param node the node being visited. @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node @@ -39,9 +49,10 @@

The node may be modified (e.g. {@link Node#attr(String)}, replaced {@link Nod /** Callback for when a node is last visited, after all of its descendants have been visited. -

This method has a default no-op implementation.

-

Note that neither replacement with {@link Node#replaceWith(Node)} nor removal with {@link Node#remove()} is - supported during {@code tail()}. +

This method defaults to a no-op.

+

The node passed to {@code tail()} is the node at the current traversal position when the subtree completes. + If {@code head()} replaced the original node, this may be the replacement node instead.

+

Structural changes to the current node are not supported during {@code tail()}.

@param node the node being visited. @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node diff --git a/src/test/java/org/jsoup/select/TraversorTest.java b/src/test/java/org/jsoup/select/TraversorTest.java index 9f57a8d757..2ff1f18f53 100644 --- a/src/test/java/org/jsoup/select/TraversorTest.java +++ b/src/test/java/org/jsoup/select/TraversorTest.java @@ -8,6 +8,7 @@ import org.jsoup.nodes.TextNode; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; import org.junit.jupiter.params.provider.ValueSource; import java.util.HashSet; @@ -200,6 +201,177 @@ else if (node instanceof TextNode && ((TextNode) node).text().equals("Three")) assertEquals("

Two

", TextUtil.stripNewlines(doc.body().html())); } + @Test + void replacementInHeadCanTailReplacementAndVisitChildren() { + // if head() replaces the current node, the replacement should be tailed and its children still traversed + Document doc = Jsoup.parseBodyFragment("
two
"); + Element div = doc.expectFirst("div"); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override + public void head(Node node, int depth) { + trackSeen(node, headOrder); + if ("i".equals(node.nodeName())) { + Element replacement = new Element("u").insertChildren(0, node.childNodes()); + node.replaceWith(replacement); + } + } + + @Override + public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, div); + + assertEquals("div;i;two;", headOrder.toString()); + assertEquals("two;u;div;", tailOrder.toString()); + assertEquals("
two
", TextUtil.stripNewlines(doc.body().html())); + } + + @ParameterizedTest + @EnumSource(CurrentPositionMutation.class) + void supportsCurrentNodeMutationDuringHead(CurrentPositionMutation mutation) { + // supported head() mutations at the current cursor should terminate cleanly and continue traversal in order + // https://github.com/jhy/jsoup/issues/2472 + Document doc = Jsoup.parse("

One

aTwo
"); + AtomicInteger visits = new AtomicInteger(); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + if (visits.incrementAndGet() > MaxHeadVisits) + fail(String.format("Likely loop when applying %s in head()", mutation)); + trackSeen(node, headOrder); + if (node instanceof TextNode && ((TextNode) node).text().equals("a")) + mutation.apply(node); + } + + @Override public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, doc); + + assertEquals("#root;html;head;body;div;p;One;a;em;Two;", headOrder.toString()); + assertEquals(mutation.expectedTailOrder, tailOrder.toString()); + assertEquals(mutation.expectedHtml, TextUtil.stripNewlines(doc.body().html())); + } + + private static final int MaxHeadVisits = 20; + + private enum CurrentPositionMutation { + Remove("head;One;p;Two;em;div;body;html;#root;", "

One

Two
") { + @Override void apply(Node node) { + node.remove(); + } + }, + Replace("head;One;p;b;Two;em;div;body;html;#root;", "

One

bTwo
") { + @Override void apply(Node node) { + node.replaceWith(new TextNode("b")); + } + }, + BeforeRemove("head;One;p;b;Two;em;div;body;html;#root;", "

One

bTwo
") { + @Override void apply(Node node) { + node.before(new TextNode("b")); + node.remove(); + } + }, + AfterRemove("head;One;p;b;Two;em;div;body;html;#root;", "

One

bTwo
") { + @Override void apply(Node node) { + node.after(new TextNode("b")); + node.remove(); + } + }; + + final String expectedTailOrder; + final String expectedHtml; + + CurrentPositionMutation(String expectedTailOrder, String expectedHtml) { + this.expectedTailOrder = expectedTailOrder; + this.expectedHtml = expectedHtml; + } + + abstract void apply(Node node); + } + + @ParameterizedTest + @EnumSource(SiblingInsertion.class) + void siblingInsertionsOnlyVisitFutureNodesDuringHead(SiblingInsertion insertion) { + // nodes inserted before the current cursor are not visited; nodes inserted after it are still ahead and are + Document doc = Jsoup.parseBodyFragment("
a
"); + StringBuilder headTexts = new StringBuilder(); + StringBuilder tailTexts = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + if (node instanceof TextNode) { + trackSeen(node, headTexts); + insertion.apply((TextNode) node); + } + } + + @Override public void tail(Node node, int depth) { + if (node instanceof TextNode) + trackSeen(node, tailTexts); + } + }, doc); + + assertEquals(insertion.expectedTexts, headTexts.toString()); + assertEquals(insertion.expectedTexts, tailTexts.toString()); + assertEquals(insertion.expectedHtml, TextUtil.stripNewlines(doc.body().html())); + } + + private enum SiblingInsertion { + Before("a;", "
ba
") { + @Override void apply(TextNode node) { + if (node.text().equals("a")) + node.before(new TextNode("b")); + } + }, + After("a;b;", "
ab
") { + @Override void apply(TextNode node) { + if (node.text().equals("a")) + node.after(new TextNode("b")); + } + }; + + final String expectedTexts; + final String expectedHtml; + + SiblingInsertion(String expectedTexts, String expectedHtml) { + this.expectedTexts = expectedTexts; + this.expectedHtml = expectedHtml; + } + + abstract void apply(TextNode node); + } + + @Test + void visitsChildrenInsertedInHead() { + // when the current node remains attached, children inserted in head() are visited in the same traversal + Document doc = Jsoup.parseBodyFragment("

"); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + trackSeen(node, headOrder); + if (node instanceof Element && node.nameIs("p")) + ((Element) node).append("child"); + } + + @Override public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, doc.body()); + + assertEquals("body;div;p;span;child;", headOrder.toString()); + assertEquals("child;span;p;div;body;", tailOrder.toString()); + assertEquals("

child

", TextUtil.stripNewlines(doc.body().html())); + } + @Test void elementFunctionalTraverse() { Document doc = Jsoup.parse("

1

2

3"); Element body = doc.body();