From b6fabf85043750c895294c0f72ee16279c593ddb Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 6 Mar 2026 14:30:09 +1100 Subject: [PATCH] Refactor NodeTraversor.traverse to have a clearer state machine impl Now if a node is replaced in head, we don't revisit it, so we won't loop. Also rewrote the documentation around this so the mutation contract and cursor state is clear. Added test cases to validate it. Fixes #2472 --- CHANGES.md | 1 + .../java/org/jsoup/select/NodeTraversor.java | 90 ++++----- .../java/org/jsoup/select/NodeVisitor.java | 29 ++- .../java/org/jsoup/select/TraversorTest.java | 172 ++++++++++++++++++ 4 files changed, 241 insertions(+), 51 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index a34c124c2b..d3b8220fb6 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,7 @@ ### Bug Fixes * Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459) +* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Also clarified in documentation the which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472) ## 1.22.1 (2026-Jan-01) diff --git a/src/main/java/org/jsoup/select/NodeTraversor.java b/src/main/java/org/jsoup/select/NodeTraversor.java index 763be08d2e..b29afd0941 100644 --- a/src/main/java/org/jsoup/select/NodeTraversor.java +++ b/src/main/java/org/jsoup/select/NodeTraversor.java @@ -9,11 +9,18 @@ A depth-first node traversor. Use to walk through all nodes under and including the specified root node, in document order. The {@link NodeVisitor#head(Node, int)} and {@link NodeVisitor#tail(Node, int)} methods will be called for each node. -

During traversal, structural changes to nodes are supported (e.g. {{@link Node#replaceWith(Node)}, - {@link Node#remove()}} -

+

During the head() visit, DOM structural changes around the node currently being visited are + supported, including e.g. {@link Node#replaceWith(Node)} and {@link Node#remove()}. See + {@link NodeVisitor#head(Node, int) head()} for the traversal contract after mutation. Other non-structural node + changes are also supported.

+

DOM structural changes to the current node are not supported during the tail() visit.

*/ public class NodeTraversor { + // cursor state + private static final byte VisitHead = 0; + private static final byte AfterHead = 1; + private static final byte VisitTail = 2; + /** Run a depth-first traverse of the root and all of its descendants. @param visitor Node visitor. @@ -25,56 +32,55 @@ public static void traverse(NodeVisitor visitor, Node root) { Validate.notNull(root); Node node = root; int depth = 0; + byte state = VisitHead; while (node != null) { - Node parent = node.parentNode(); // remember parent to find nodes that get replaced in .head - int origSize = parent != null ? parent.childNodeSize() : 0; - Node next = node.nextSibling(); + if (state == VisitHead) { + // snapshot the current cursor position so we can recover if head() structurally changes it: + Node parent = node.parentNode(); + Node nextSib = node.nextSibling(); + int sibIndex = parent != null ? node.siblingIndex() : 0; + int childCount = parent != null ? parent.childNodeSize() : 0; - visitor.head(node, depth); // visit current node + visitor.head(node, depth); - // check for modifications to the tree - if (parent != null && !node.hasParent()) { // removed or replaced - if (origSize == parent.childNodeSize()) { // replaced - node = parent.childNode(node.siblingIndex()); // replace ditches parent but keeps sibling index - continue; - } - // else, removed - node = next; - if (node == null) { - // was last in parent. need to walk up the tree, tail()ing on the way, until we find a suitable next. Otherwise, would revisit ancestor nodes. - node = parent; - while (true) { + // any structural changes? + if (parent != null && !node.hasParent()) { // node was removed from parent; try to recover by sibling index + if (parent.childNodeSize() == childCount) { // current slot is still occupied + node = parent.childNode(sibIndex); + state = AfterHead; // continue from that slot without re-heading it + } else if (nextSib != null) { // removed; resume from the original next + node = nextSib; + } else { // removed last child; tail the parent next + node = parent; depth--; - visitor.tail(node, depth); - if (node == root) break; - if (node.nextSibling() != null) { - node = node.nextSibling(); - break; - } - node = node.parentNode(); - if (node == null) break; + state = VisitTail; } - if (node == root || node == null) break; // done, break outer + } else { + state = AfterHead; } - continue; // don't tail removed + continue; // next loop handles the updated node/state } - if (node.childNodeSize() > 0) { // descend + if (state == AfterHead && node.childNodeSize() > 0) { // descend into current children node = node.childNode(0); depth++; - } else { - while (true) { - assert node != null; // as depth > 0, will have parent - if (!(node.nextSibling() == null && depth > 0)) break; - visitor.tail(node, depth); // when no more siblings, ascend - node = node.parentNode(); - depth--; - } - visitor.tail(node, depth); - if (node == root) - break; - node = node.nextSibling(); + state = VisitHead; + continue; + } + + visitor.tail(node, depth); + + if (node == root) break; // done + + Node next = node.nextSibling(); + if (next != null) { // traverse siblings + node = next; + state = VisitHead; + } else { // no siblings left, ascend + node = node.parentNode(); + depth--; + state = VisitTail; } } } diff --git a/src/main/java/org/jsoup/select/NodeVisitor.java b/src/main/java/org/jsoup/select/NodeVisitor.java index 0ea03deefe..b2375c49c6 100644 --- a/src/main/java/org/jsoup/select/NodeVisitor.java +++ b/src/main/java/org/jsoup/select/NodeVisitor.java @@ -4,13 +4,13 @@ import org.jsoup.nodes.Node; /** - Node visitor interface, used to walk the DOM, and visit each Node. Execute via {@link #traverse(Node)} or + Node visitor interface, used to walk the DOM and visit each node. Execute via {@link #traverse(Node)} or {@link Node#traverse(NodeVisitor)}. The traversal is depth-first.

This interface provides two methods, {@link #head} and {@link #tail}. The head method is called when a node is first seen, and the tail method when all that node's children have been visited. As an example, {@code head} can be used to - emit a start tag for a node, and {@code tail} to create the end tag. The {@code tail} method defaults to a no-op, so - the {@code head} method is the {@link FunctionalInterface}. + emit a start tag for a node, and {@code tail} to emit the end tag. The {@code tail} method defaults to a no-op, so + this interface can be used as a {@link FunctionalInterface}, with {@code head} as its single abstract method.

Example:


@@ -27,9 +27,19 @@
 public interface NodeVisitor {
     /**
      Callback for when a node is first visited.
-     

The node may be modified (e.g. {@link Node#attr(String)}, replaced {@link Node#replaceWith(Node)}) or removed - {@link Node#remove()}. If it's {@code instanceOf Element}, you may cast it to an {@link Element} and access those - methods.

+

The node may be modified (for example via {@link Node#attr(String)}), removed with + {@link Node#remove()}, or replaced with {@link Node#replaceWith(Node)}. If the node is an + {@link Element}, you may cast it and access those methods.

+

Traversal uses a forward cursor. After {@code head()} completes:

+
    +
  • If the current node is still attached, traversal continues into its current children and then its following + siblings. Nodes inserted before the current node are not visited.
  • +
  • If the current node was detached and another node now occupies its former sibling position, the node now at + that position is not passed to {@code head()} again. Traversal continues from there: its children are visited, + then the node is passed to {@link #tail(Node, int)}, then later siblings are visited.
  • +
  • If the current node was detached and no node occupies its former sibling position, the current node is not + passed to {@code tail()}, and traversal resumes at the node that originally followed it.
  • +
@param node the node being visited. @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node @@ -39,9 +49,10 @@

The node may be modified (e.g. {@link Node#attr(String)}, replaced {@link Nod /** Callback for when a node is last visited, after all of its descendants have been visited. -

This method has a default no-op implementation.

-

Note that neither replacement with {@link Node#replaceWith(Node)} nor removal with {@link Node#remove()} is - supported during {@code tail()}. +

This method defaults to a no-op.

+

The node passed to {@code tail()} is the node at the current traversal position when the subtree completes. + If {@code head()} replaced the original node, this may be the replacement node instead.

+

Structural changes to the current node are not supported during {@code tail()}.

@param node the node being visited. @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node diff --git a/src/test/java/org/jsoup/select/TraversorTest.java b/src/test/java/org/jsoup/select/TraversorTest.java index 9f57a8d757..2ff1f18f53 100644 --- a/src/test/java/org/jsoup/select/TraversorTest.java +++ b/src/test/java/org/jsoup/select/TraversorTest.java @@ -8,6 +8,7 @@ import org.jsoup.nodes.TextNode; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; import org.junit.jupiter.params.provider.ValueSource; import java.util.HashSet; @@ -200,6 +201,177 @@ else if (node instanceof TextNode && ((TextNode) node).text().equals("Three")) assertEquals("

Two

", TextUtil.stripNewlines(doc.body().html())); } + @Test + void replacementInHeadCanTailReplacementAndVisitChildren() { + // if head() replaces the current node, the replacement should be tailed and its children still traversed + Document doc = Jsoup.parseBodyFragment("
two
"); + Element div = doc.expectFirst("div"); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override + public void head(Node node, int depth) { + trackSeen(node, headOrder); + if ("i".equals(node.nodeName())) { + Element replacement = new Element("u").insertChildren(0, node.childNodes()); + node.replaceWith(replacement); + } + } + + @Override + public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, div); + + assertEquals("div;i;two;", headOrder.toString()); + assertEquals("two;u;div;", tailOrder.toString()); + assertEquals("
two
", TextUtil.stripNewlines(doc.body().html())); + } + + @ParameterizedTest + @EnumSource(CurrentPositionMutation.class) + void supportsCurrentNodeMutationDuringHead(CurrentPositionMutation mutation) { + // supported head() mutations at the current cursor should terminate cleanly and continue traversal in order + // https://github.com/jhy/jsoup/issues/2472 + Document doc = Jsoup.parse("

One

aTwo
"); + AtomicInteger visits = new AtomicInteger(); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + if (visits.incrementAndGet() > MaxHeadVisits) + fail(String.format("Likely loop when applying %s in head()", mutation)); + trackSeen(node, headOrder); + if (node instanceof TextNode && ((TextNode) node).text().equals("a")) + mutation.apply(node); + } + + @Override public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, doc); + + assertEquals("#root;html;head;body;div;p;One;a;em;Two;", headOrder.toString()); + assertEquals(mutation.expectedTailOrder, tailOrder.toString()); + assertEquals(mutation.expectedHtml, TextUtil.stripNewlines(doc.body().html())); + } + + private static final int MaxHeadVisits = 20; + + private enum CurrentPositionMutation { + Remove("head;One;p;Two;em;div;body;html;#root;", "

One

Two
") { + @Override void apply(Node node) { + node.remove(); + } + }, + Replace("head;One;p;b;Two;em;div;body;html;#root;", "

One

bTwo
") { + @Override void apply(Node node) { + node.replaceWith(new TextNode("b")); + } + }, + BeforeRemove("head;One;p;b;Two;em;div;body;html;#root;", "

One

bTwo
") { + @Override void apply(Node node) { + node.before(new TextNode("b")); + node.remove(); + } + }, + AfterRemove("head;One;p;b;Two;em;div;body;html;#root;", "

One

bTwo
") { + @Override void apply(Node node) { + node.after(new TextNode("b")); + node.remove(); + } + }; + + final String expectedTailOrder; + final String expectedHtml; + + CurrentPositionMutation(String expectedTailOrder, String expectedHtml) { + this.expectedTailOrder = expectedTailOrder; + this.expectedHtml = expectedHtml; + } + + abstract void apply(Node node); + } + + @ParameterizedTest + @EnumSource(SiblingInsertion.class) + void siblingInsertionsOnlyVisitFutureNodesDuringHead(SiblingInsertion insertion) { + // nodes inserted before the current cursor are not visited; nodes inserted after it are still ahead and are + Document doc = Jsoup.parseBodyFragment("
a
"); + StringBuilder headTexts = new StringBuilder(); + StringBuilder tailTexts = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + if (node instanceof TextNode) { + trackSeen(node, headTexts); + insertion.apply((TextNode) node); + } + } + + @Override public void tail(Node node, int depth) { + if (node instanceof TextNode) + trackSeen(node, tailTexts); + } + }, doc); + + assertEquals(insertion.expectedTexts, headTexts.toString()); + assertEquals(insertion.expectedTexts, tailTexts.toString()); + assertEquals(insertion.expectedHtml, TextUtil.stripNewlines(doc.body().html())); + } + + private enum SiblingInsertion { + Before("a;", "
ba
") { + @Override void apply(TextNode node) { + if (node.text().equals("a")) + node.before(new TextNode("b")); + } + }, + After("a;b;", "
ab
") { + @Override void apply(TextNode node) { + if (node.text().equals("a")) + node.after(new TextNode("b")); + } + }; + + final String expectedTexts; + final String expectedHtml; + + SiblingInsertion(String expectedTexts, String expectedHtml) { + this.expectedTexts = expectedTexts; + this.expectedHtml = expectedHtml; + } + + abstract void apply(TextNode node); + } + + @Test + void visitsChildrenInsertedInHead() { + // when the current node remains attached, children inserted in head() are visited in the same traversal + Document doc = Jsoup.parseBodyFragment("

"); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + trackSeen(node, headOrder); + if (node instanceof Element && node.nameIs("p")) + ((Element) node).append("child"); + } + + @Override public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, doc.body()); + + assertEquals("body;div;p;span;child;", headOrder.toString()); + assertEquals("child;span;p;div;body;", tailOrder.toString()); + assertEquals("

child

", TextUtil.stripNewlines(doc.body().html())); + } + @Test void elementFunctionalTraverse() { Document doc = Jsoup.parse("

1

2

3"); Element body = doc.body();