Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Bug Fixes
* Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459)
* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Also clarified in documentation the which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472)

## 1.22.1 (2026-Jan-01)

Expand Down
90 changes: 48 additions & 42 deletions src/main/java/org/jsoup/select/NodeTraversor.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,18 @@
A depth-first node traversor. Use to walk through all nodes under and including the specified root node, in document
order. The {@link NodeVisitor#head(Node, int)} and {@link NodeVisitor#tail(Node, int)} methods will be called for
each node.
<p> During traversal, structural changes to nodes are supported (e.g. {{@link Node#replaceWith(Node)},
{@link Node#remove()}}
</p>
<p>During the <code>head()</code> visit, DOM structural changes around the node currently being visited are
supported, including e.g. {@link Node#replaceWith(Node)} and {@link Node#remove()}. See
{@link NodeVisitor#head(Node, int) head()} for the traversal contract after mutation. Other non-structural node
changes are also supported.</p>
<p>DOM structural changes to the current node are not supported during the <code>tail()</code> visit.</p>
*/
public class NodeTraversor {
// cursor state
private static final byte VisitHead = 0;
private static final byte AfterHead = 1;
private static final byte VisitTail = 2;

/**
Run a depth-first traverse of the root and all of its descendants.
@param visitor Node visitor.
Expand All @@ -25,56 +32,55 @@ public static void traverse(NodeVisitor visitor, Node root) {
Validate.notNull(root);
Node node = root;
int depth = 0;
byte state = VisitHead;

while (node != null) {
Node parent = node.parentNode(); // remember parent to find nodes that get replaced in .head
int origSize = parent != null ? parent.childNodeSize() : 0;
Node next = node.nextSibling();
if (state == VisitHead) {
// snapshot the current cursor position so we can recover if head() structurally changes it:
Node parent = node.parentNode();
Node nextSib = node.nextSibling();
int sibIndex = parent != null ? node.siblingIndex() : 0;
int childCount = parent != null ? parent.childNodeSize() : 0;

visitor.head(node, depth); // visit current node
visitor.head(node, depth);

// check for modifications to the tree
if (parent != null && !node.hasParent()) { // removed or replaced
if (origSize == parent.childNodeSize()) { // replaced
node = parent.childNode(node.siblingIndex()); // replace ditches parent but keeps sibling index
continue;
}
// else, removed
node = next;
if (node == null) {
// was last in parent. need to walk up the tree, tail()ing on the way, until we find a suitable next. Otherwise, would revisit ancestor nodes.
node = parent;
while (true) {
// any structural changes?
if (parent != null && !node.hasParent()) { // node was removed from parent; try to recover by sibling index
if (parent.childNodeSize() == childCount) { // current slot is still occupied
node = parent.childNode(sibIndex);
state = AfterHead; // continue from that slot without re-heading it
} else if (nextSib != null) { // removed; resume from the original next
node = nextSib;
} else { // removed last child; tail the parent next
node = parent;
depth--;
visitor.tail(node, depth);
if (node == root) break;
if (node.nextSibling() != null) {
node = node.nextSibling();
break;
}
node = node.parentNode();
if (node == null) break;
state = VisitTail;
}
if (node == root || node == null) break; // done, break outer
} else {
state = AfterHead;
}
continue; // don't tail removed
continue; // next loop handles the updated node/state
}

if (node.childNodeSize() > 0) { // descend
if (state == AfterHead && node.childNodeSize() > 0) { // descend into current children
node = node.childNode(0);
depth++;
} else {
while (true) {
assert node != null; // as depth > 0, will have parent
if (!(node.nextSibling() == null && depth > 0)) break;
visitor.tail(node, depth); // when no more siblings, ascend
node = node.parentNode();
depth--;
}
visitor.tail(node, depth);
if (node == root)
break;
node = node.nextSibling();
state = VisitHead;
continue;
}

visitor.tail(node, depth);

if (node == root) break; // done

Node next = node.nextSibling();
if (next != null) { // traverse siblings
node = next;
state = VisitHead;
} else { // no siblings left, ascend
node = node.parentNode();
depth--;
state = VisitTail;
}
}
}
Expand Down
29 changes: 20 additions & 9 deletions src/main/java/org/jsoup/select/NodeVisitor.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
import org.jsoup.nodes.Node;

/**
Node visitor interface, used to walk the DOM, and visit each Node. Execute via {@link #traverse(Node)} or
Node visitor interface, used to walk the DOM and visit each node. Execute via {@link #traverse(Node)} or
{@link Node#traverse(NodeVisitor)}. The traversal is depth-first.
<p>
This interface provides two methods, {@link #head} and {@link #tail}. The head method is called when a node is first
seen, and the tail method when all that node's children have been visited. As an example, {@code head} can be used to
emit a start tag for a node, and {@code tail} to create the end tag. The {@code tail} method defaults to a no-op, so
the {@code head} method is the {@link FunctionalInterface}.
emit a start tag for a node, and {@code tail} to emit the end tag. The {@code tail} method defaults to a no-op, so
this interface can be used as a {@link FunctionalInterface}, with {@code head} as its single abstract method.
</p>
<p><b>Example:</b></p>
<pre><code>
Expand All @@ -27,9 +27,19 @@
public interface NodeVisitor {
/**
Callback for when a node is first visited.
<p>The node may be modified (e.g. {@link Node#attr(String)}, replaced {@link Node#replaceWith(Node)}) or removed
{@link Node#remove()}. If it's {@code instanceOf Element}, you may cast it to an {@link Element} and access those
methods.</p>
<p>The node may be modified (for example via {@link Node#attr(String)}), removed with
{@link Node#remove()}, or replaced with {@link Node#replaceWith(Node)}. If the node is an
{@link Element}, you may cast it and access those methods.</p>
<p>Traversal uses a forward cursor. After {@code head()} completes:</p>
<ul>
<li>If the current node is still attached, traversal continues into its current children and then its following
siblings. Nodes inserted before the current node are not visited.</li>
<li>If the current node was detached and another node now occupies its former sibling position, the node now at
that position is not passed to {@code head()} again. Traversal continues from there: its children are visited,
then the node is passed to {@link #tail(Node, int)}, then later siblings are visited.</li>
<li>If the current node was detached and no node occupies its former sibling position, the current node is not
passed to {@code tail()}, and traversal resumes at the node that originally followed it.</li>
</ul>

@param node the node being visited.
@param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
Expand All @@ -39,9 +49,10 @@ <p>The node may be modified (e.g. {@link Node#attr(String)}, replaced {@link Nod

/**
Callback for when a node is last visited, after all of its descendants have been visited.
<p>This method has a default no-op implementation.</p>
<p>Note that neither replacement with {@link Node#replaceWith(Node)} nor removal with {@link Node#remove()} is
supported during {@code tail()}.
<p>This method defaults to a no-op.</p>
<p>The node passed to {@code tail()} is the node at the current traversal position when the subtree completes.
If {@code head()} replaced the original node, this may be the replacement node instead.</p>
<p>Structural changes to the current node are not supported during {@code tail()}.</p>

@param node the node being visited.
@param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
Expand Down
172 changes: 172 additions & 0 deletions src/test/java/org/jsoup/select/TraversorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.jsoup.nodes.TextNode;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
import org.junit.jupiter.params.provider.ValueSource;

import java.util.HashSet;
Expand Down Expand Up @@ -200,6 +201,177 @@ else if (node instanceof TextNode && ((TextNode) node).text().equals("Three"))
assertEquals("<div><p id=\"2\">Two</p><p></p></div>", TextUtil.stripNewlines(doc.body().html()));
}

@Test
void replacementInHeadCanTailReplacementAndVisitChildren() {
// if head() replaces the current node, the replacement should be tailed and its children still traversed
Document doc = Jsoup.parseBodyFragment("<div><i>two</i></div>");
Element div = doc.expectFirst("div");
StringBuilder headOrder = new StringBuilder();
StringBuilder tailOrder = new StringBuilder();

NodeTraversor.traverse(new NodeVisitor() {
@Override
public void head(Node node, int depth) {
trackSeen(node, headOrder);
if ("i".equals(node.nodeName())) {
Element replacement = new Element("u").insertChildren(0, node.childNodes());
node.replaceWith(replacement);
}
}

@Override
public void tail(Node node, int depth) {
trackSeen(node, tailOrder);
}
}, div);

assertEquals("div;i;two;", headOrder.toString());
assertEquals("two;u;div;", tailOrder.toString());
assertEquals("<div><u>two</u></div>", TextUtil.stripNewlines(doc.body().html()));
}

@ParameterizedTest
@EnumSource(CurrentPositionMutation.class)
void supportsCurrentNodeMutationDuringHead(CurrentPositionMutation mutation) {
// supported head() mutations at the current cursor should terminate cleanly and continue traversal in order
// https://github.com/jhy/jsoup/issues/2472
Document doc = Jsoup.parse("<div><p>One</p>a<em>Two</em></div>");
AtomicInteger visits = new AtomicInteger();
StringBuilder headOrder = new StringBuilder();
StringBuilder tailOrder = new StringBuilder();

NodeTraversor.traverse(new NodeVisitor() {
@Override public void head(Node node, int depth) {
if (visits.incrementAndGet() > MaxHeadVisits)
fail(String.format("Likely loop when applying %s in head()", mutation));
trackSeen(node, headOrder);
if (node instanceof TextNode && ((TextNode) node).text().equals("a"))
mutation.apply(node);
}

@Override public void tail(Node node, int depth) {
trackSeen(node, tailOrder);
}
}, doc);

assertEquals("#root;html;head;body;div;p;One;a;em;Two;", headOrder.toString());
assertEquals(mutation.expectedTailOrder, tailOrder.toString());
assertEquals(mutation.expectedHtml, TextUtil.stripNewlines(doc.body().html()));
}

private static final int MaxHeadVisits = 20;

private enum CurrentPositionMutation {
Remove("head;One;p;Two;em;div;body;html;#root;", "<div><p>One</p><em>Two</em></div>") {
@Override void apply(Node node) {
node.remove();
}
},
Replace("head;One;p;b;Two;em;div;body;html;#root;", "<div><p>One</p>b<em>Two</em></div>") {
@Override void apply(Node node) {
node.replaceWith(new TextNode("b"));
}
},
BeforeRemove("head;One;p;b;Two;em;div;body;html;#root;", "<div><p>One</p>b<em>Two</em></div>") {
@Override void apply(Node node) {
node.before(new TextNode("b"));
node.remove();
}
},
AfterRemove("head;One;p;b;Two;em;div;body;html;#root;", "<div><p>One</p>b<em>Two</em></div>") {
@Override void apply(Node node) {
node.after(new TextNode("b"));
node.remove();
}
};

final String expectedTailOrder;
final String expectedHtml;

CurrentPositionMutation(String expectedTailOrder, String expectedHtml) {
this.expectedTailOrder = expectedTailOrder;
this.expectedHtml = expectedHtml;
}

abstract void apply(Node node);
}

@ParameterizedTest
@EnumSource(SiblingInsertion.class)
void siblingInsertionsOnlyVisitFutureNodesDuringHead(SiblingInsertion insertion) {
// nodes inserted before the current cursor are not visited; nodes inserted after it are still ahead and are
Document doc = Jsoup.parseBodyFragment("<div>a</div>");
StringBuilder headTexts = new StringBuilder();
StringBuilder tailTexts = new StringBuilder();

NodeTraversor.traverse(new NodeVisitor() {
@Override public void head(Node node, int depth) {
if (node instanceof TextNode) {
trackSeen(node, headTexts);
insertion.apply((TextNode) node);
}
}

@Override public void tail(Node node, int depth) {
if (node instanceof TextNode)
trackSeen(node, tailTexts);
}
}, doc);

assertEquals(insertion.expectedTexts, headTexts.toString());
assertEquals(insertion.expectedTexts, tailTexts.toString());
assertEquals(insertion.expectedHtml, TextUtil.stripNewlines(doc.body().html()));
}

private enum SiblingInsertion {
Before("a;", "<div>ba</div>") {
@Override void apply(TextNode node) {
if (node.text().equals("a"))
node.before(new TextNode("b"));
}
},
After("a;b;", "<div>ab</div>") {
@Override void apply(TextNode node) {
if (node.text().equals("a"))
node.after(new TextNode("b"));
}
};

final String expectedTexts;
final String expectedHtml;

SiblingInsertion(String expectedTexts, String expectedHtml) {
this.expectedTexts = expectedTexts;
this.expectedHtml = expectedHtml;
}

abstract void apply(TextNode node);
}

@Test
void visitsChildrenInsertedInHead() {
// when the current node remains attached, children inserted in head() are visited in the same traversal
Document doc = Jsoup.parseBodyFragment("<div><p></p></div>");
StringBuilder headOrder = new StringBuilder();
StringBuilder tailOrder = new StringBuilder();

NodeTraversor.traverse(new NodeVisitor() {
@Override public void head(Node node, int depth) {
trackSeen(node, headOrder);
if (node instanceof Element && node.nameIs("p"))
((Element) node).append("<span>child</span>");
}

@Override public void tail(Node node, int depth) {
trackSeen(node, tailOrder);
}
}, doc.body());

assertEquals("body;div;p;span;child;", headOrder.toString());
assertEquals("child;span;p;div;body;", tailOrder.toString());
assertEquals("<div><p><span>child</span></p></div>", TextUtil.stripNewlines(doc.body().html()));
}

@Test void elementFunctionalTraverse() {
Document doc = Jsoup.parse("<div><p>1<p>2<p>3");
Element body = doc.body();
Expand Down