Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

### Bug Fixes
* Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459)
* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Also clarified in documentation the which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472)
* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Traversal now continues correctly from nodes that occupy the original position after mutation, and will not advance past the original root subtree. Also, clarified in the documentation which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472)
* Parsing during charset sniffing no longer fails if an advisory `available()` call throws `IOException`, as seen on JDK 8 `HttpURLConnection`. [#2474](https://github.com/jhy/jsoup/issues/2474)

## 1.22.1 (2026-Jan-01)

Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
The MIT License

Copyright (c) 2009-2025 Jonathan Hedley <https://jsoup.org/>
Copyright (c) 2009-2026 Jonathan Hedley <https://jsoup.org/>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
8 changes: 4 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -249,15 +249,15 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.5.4</version>
<version>3.5.5</version>
<configuration>
<!-- smaller stack to find stack overflows. Was 256, but Zulu on MacOS ARM needs >= 640 -->
<argLine>-Xss640k</argLine>
</configuration>
</plugin>
<plugin>
<artifactId>maven-failsafe-plugin</artifactId>
<version>3.5.4</version>
<version>3.5.5</version>
<executions>
<execution>
<goals>
Expand Down Expand Up @@ -484,7 +484,7 @@
<plugins>
<plugin>
<artifactId>maven-failsafe-plugin</artifactId>
<version>3.5.4</version>
<version>3.5.5</version>
<executions>
<execution>
<goals>
Expand All @@ -505,7 +505,7 @@
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>5.14.2</version>
<version>5.14.3</version>
<scope>test</scope>
</dependency>

Expand Down
9 changes: 6 additions & 3 deletions src/main/java/org/jsoup/internal/SimpleBufferedInput.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,11 @@ private void fill() throws IOException {
bufLength = read + bufPos;
capRemaining -= read;
while (byteBuf.length - bufLength > 0 && capRemaining > 0) { // read in more if we have space, without blocking
if (in.available() < 1) break;
try {
if (in.available() < 1) break;
} catch (IOException e) {
break; // available() is advisory; keep the bytes we've already buffered
}
toRead = Math.min(byteBuf.length - bufLength, capRemaining);
if (toRead <= 0) break;
read = in.read(byteBuf, bufLength, toRead);
Expand Down Expand Up @@ -116,8 +120,7 @@ public int available() throws IOException {
if (buffered > 0) {
return buffered; // doesn't include those in.available(), but mostly used as a block test
}
int avail = inReadFully ? 0 : in.available();
return avail;
return inReadFully ? 0 : in.available();
}

void capRemaining(int newRemaining) {
Expand Down
10 changes: 9 additions & 1 deletion src/main/java/org/jsoup/internal/SimpleStreamReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public int read(char[] charArray, int off, int len) throws IOException {
while (true) {
CoderResult result = decoder.decode(byteBuf, charBuf, readFully);
if (result.isUnderflow()) {
if (readFully || !charBuf.hasRemaining() || (charBuf.position() > 0) && !(in.available() > 0))
if (readFully || !charBuf.hasRemaining() || (charBuf.position() > 0) && !hasAvailableBytes())
break;
int read = bufferUp();
if (read < 0) {
Expand All @@ -64,6 +64,14 @@ public int read(char[] charArray, int off, int len) throws IOException {
return charBuf.position();
}

/**
 Tests whether the underlying input currently reports at least one byte via {@code available()}.
 <p>An {@link IOException} thrown by {@code available()} is not propagated; it is reported as "nothing available".</p>

 @return {@code true} if {@code in.available()} returned a value greater than zero; {@code false} if it returned
 zero or less, or threw an {@code IOException}.
 */
private boolean hasAvailableBytes() {
    final int ready;
    try {
        ready = in.available();
    } catch (IOException ignored) {
        // available() is only advisory; a subsequent real read can still consume buffered bytes or reach EOF
        return false;
    }
    return ready > 0;
}

private int bufferUp() throws IOException {
assert byteBuf != null; // already validated ^
byteBuf.compact();
Expand Down
11 changes: 8 additions & 3 deletions src/main/java/org/jsoup/safety/Cleaner.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,24 @@
import static org.jsoup.internal.SharedConstants.DummyUri;

/**
The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
The {@link Safelist}-based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
that you are expecting; no junk, and no cross-site scripting attacks!
<p>
The HTML cleaner parses the input as HTML and then runs it through a safe-list, so the output HTML can only contain
The HTML cleaner parses the input as HTML and then runs it through a safelist, so the output HTML can only contain
HTML that is allowed by the safelist.
</p>
<p>
It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
canned safe-lists only allow body contained tags.
canned safelists only allow body-contained tags.
</p>
<p>
Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
</p>
<p>
A Cleaner may be reused across multiple documents and shared across concurrent threads once its {@link Safelist} has
been configured. The cleaner uses the supplied safelist directly, so later safelist changes affect later cleaning
calls. If you need a variant of an existing configuration, use {@link Safelist#Safelist(Safelist)} to make a copy.
</p>
*/
public class Cleaner {
private final Safelist safelist;
Expand Down
11 changes: 8 additions & 3 deletions src/main/java/org/jsoup/safety/Safelist.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/


/**
Safe-lists define what HTML (elements and attributes) to allow through the cleaner. Everything else is removed.
Safelists define what HTML (elements and attributes) to allow through a {@link Cleaner}. Everything else is removed.
<p>
Start with one of the defaults:
</p>
Expand Down Expand Up @@ -53,15 +53,20 @@ If you need to allow more through (please be careful!), tweak a base safelist wi
</ul>

<p>
The cleaner and these safelists assume that you want to clean a <code>body</code> fragment of HTML (to add user
The {@link Cleaner} and these safelists assume that you want to clean a <code>body</code> fragment of HTML (to add user
supplied HTML into a templated page), and not to clean a full HTML document. If the latter is the case, you could wrap
the templated document HTML around the cleaned body HTML.
</p>
<p>
Safelists are mutable. A {@link Cleaner} uses the supplied safelist directly, so later changes affect later cleaning
calls. If you want to share a safelist across threads, finish configuring it first and do not mutate it while it is in
use. To build a variant from an existing configuration, use {@link #Safelist(Safelist)} to make a copy.
</p>
<p>
If you are going to extend a safelist, please be very careful. Make sure you understand what attributes may lead to
XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See
the <a href="https://owasp.org/www-community/xss-filter-evasion-cheatsheet">XSS Filter Evasion Cheat Sheet</a> for some
XSS attack examples (that jsoup will safegaurd against the default Cleaner and Safelist configuration).
XSS attack examples (that jsoup will safeguard against with the default Cleaner and Safelist configuration).
</p>
*/
public class Safelist {
Expand Down
41 changes: 24 additions & 17 deletions src/main/java/org/jsoup/select/NodeTraversor.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
order. The {@link NodeVisitor#head(Node, int)} and {@link NodeVisitor#tail(Node, int)} methods will be called for
each node.
<p>During the <code>head()</code> visit, DOM structural changes around the node currently being visited are
supported, including e.g. {@link Node#replaceWith(Node)} and {@link Node#remove()}. See
supported, including {@link Node#replaceWith(Node)} and {@link Node#remove()}. See
{@link NodeVisitor#head(Node, int) head()} for the traversal contract after mutation. Other non-structural node
changes are also supported.</p>
<p>DOM structural changes to the current node are not supported during the <code>tail()</code> visit.</p>
Expand All @@ -31,35 +31,40 @@ public static void traverse(NodeVisitor visitor, Node root) {
Validate.notNull(visitor);
Validate.notNull(root);
Node node = root;
final Node rootNext = root.nextSibling(); // don't traverse siblings beyond the original root
int depth = 0;
byte state = VisitHead;

while (node != null) {
while (true) {
if (state == VisitHead) {
// snapshot the current cursor position so we can recover if head() structurally changes it:
Node parent = node.parentNode();
Node nextSib = node.nextSibling();
int sibIndex = parent != null ? node.siblingIndex() : 0;
int childCount = parent != null ? parent.childNodeSize() : 0;
Node parent = node.parentNode();
Node next = node.nextSibling();
int sibIndex = parent != null ? node.siblingIndex() : 0;

visitor.head(node, depth);

// any structural changes?
if (parent != null && !node.hasParent()) { // node was removed from parent; try to recover by sibling index
if (parent.childNodeSize() == childCount) { // current slot is still occupied
node = parent.childNode(sibIndex);
state = AfterHead; // continue from that slot without re-heading it
} else if (nextSib != null) { // removed; resume from the original next
node = nextSib;
} else { // removed last child; tail the parent next
if (parent != null && node.parentNode() != parent) { // removed / replaced / moved
Node occupant = sibIndex < parent.childNodeSize() ? parent.childNode(sibIndex) : null;
// ^^ the node now at this node's former position
Node boundary = depth == 0 ? rootNext : next; // don't advance beyond this node when resuming
if (occupant != null && occupant != boundary) {
node = occupant;
state = AfterHead; // continue from that slot without re-heading it
} else if (depth == 0) { // root detached or replaced
break;
} else if (next != null && next.parentNode() == parent) {
node = next; // old slot is empty or shifted to the original next, visit
} else { // removed last child; tail the parent next
node = parent;
depth--;
state = VisitTail;
}
} else {
state = AfterHead;
}
continue; // next loop handles the updated node/state
continue; // next loop handles the updated node/state
}

if (state == AfterHead && node.childNodeSize() > 0) { // descend into current children
Expand All @@ -71,10 +76,12 @@ public static void traverse(NodeVisitor visitor, Node root) {

visitor.tail(node, depth);

if (node == root) break; // done

Node next = node.nextSibling();
if (next != null) { // traverse siblings
if (depth == 0) {
if (next == null || next == rootNext) break; // done with the original root range
node = next;
state = VisitHead;
} else if (next != null) { // traverse siblings
node = next;
state = VisitHead;
} else { // no siblings left, ascend
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jsoup/select/NodeVisitor.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ <p>The node may be modified (for example via {@link Node#attr(String)}), removed
<li>If the current node was detached and no node occupies its former sibling position, the current node is not
passed to {@code tail()}, and traversal resumes at the node that originally followed it.</li>
</ul>
<p>Traversal never advances outside the original root subtree. If the traversal root is detached during
{@code head()}, traversal stops at the original root boundary.</p>

@param node the node being visited.
@param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
Expand Down
21 changes: 21 additions & 0 deletions src/test/java/org/jsoup/helper/DataUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -418,4 +418,25 @@ public int available() {
}
}

@Test
void charsetSniffingIgnoresAdvisoryAvailableIOException() throws IOException {
    // Regression test for https://github.com/jhy/jsoup/issues/2474
    // On JDK 8, the HttpURLConnection stream may throw from available() after the peer has closed the socket.
    // That failure is advisory only: bytes already buffered can still be consumed, and reads can reach a clean EOF.
    byte[] utf8 = "<!doctype html><html><head><title>One</title></head><body>Two</body></html>"
        .getBytes(StandardCharsets.UTF_8);

    // a stream whose reads work normally, but whose available() always fails
    InputStream throwsOnAvailable = new FilterInputStream(new ByteArrayInputStream(utf8)) {
        @Override
        public int available() throws IOException {
            throw new IOException("Stream closed.");
        }
    };

    Document parsed = DataUtil.parseInputStream(
        ControllableInputStream.wrap(throwsOnAvailable, 0), null, "http://example.com/", Parser.htmlParser());

    // the whole document was still read and parsed
    assertEquals("One", parsed.title());
    assertEquals("Two", parsed.body().text());
}

}
50 changes: 50 additions & 0 deletions src/test/java/org/jsoup/safety/CleanerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@

import java.util.Arrays;
import java.util.Locale;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;

import static org.junit.jupiter.api.Assertions.*;

Expand Down Expand Up @@ -226,6 +229,53 @@ public void safeListedProtocolShouldBeRetained(Locale locale) {
assertFalse(new Cleaner(Safelist.none()).isValid(okDoc));
}

// Checks that a single Cleaner, built once from Safelist.basicWithImages(), produces the same expected output when
// clean() is called from several threads at the same time.
@Test void configuredCleanerMayBeSharedAcrossThreads() throws InterruptedException {
// https://github.com/jhy/jsoup/issues/2473
String html = "<a href='/foo'>Link</a><img src='/bar' alt='Q'>";
String baseUri = "https://example.com/";
String expected = "<a href=\"https://example.com/foo\">Link</a><img src=\"https://example.com/bar\" alt=\"Q\">";
// the one Cleaner instance shared by every worker thread below
Cleaner cleaner = new Cleaner(Safelist.basicWithImages());

int numThreads = 10;
int numLoops = 20;
// one result slot per clean() call, across all threads
String[] cleaned = new String[numThreads * numLoops];
// hands out a distinct slot index for each clean() call
AtomicInteger next = new AtomicInteger();
// holds the first Throwable raised by any worker, if any
AtomicReference<Throwable> failure = new AtomicReference<>();
// gate: workers block on this until the main thread releases them together
CountDownLatch start = new CountDownLatch(1);
// counted down once per worker as it exits
CountDownLatch done = new CountDownLatch(numThreads);
// NOTE(review): this array is filled in the loop below but is not read anywhere in this method
Thread[] threads = new Thread[numThreads];

for (int i = 0; i < numThreads; i++) {
Thread thread = new Thread(() -> {
try {
start.await();
for (int j = 0; j < numLoops; j++) {
// each iteration parses its own fresh input document; only the cleaner is shared
Document dirty = Jsoup.parseBodyFragment(html, baseUri);
cleaned[next.getAndIncrement()] = cleaner.clean(dirty).body().html();
}
} catch (Throwable t) {
// only the first failure is kept; later ones leave the reference unchanged
failure.compareAndSet(null, t);
// restore the interrupt flag if the worker was interrupted
if (t instanceof InterruptedException) Thread.currentThread().interrupt();
} finally {
// always signal completion, so the main thread's done.await() is not left waiting on this worker
done.countDown();
}
});
threads[i] = thread;
thread.start();
}

// release all workers, then wait until every one of them has counted down
start.countDown();
done.await();

// surface any worker failure in the test thread, with the original Throwable as the cause
if (failure.get() != null)
throw new AssertionError("Concurrent cleaner use failed", failure.get());

// the number of clean() calls made must match the number of result slots
assertEquals(cleaned.length, next.get());
for (String clean : cleaned) {
assertEquals(expected, clean);
}
}

@Test public void resolvesRelativeLinks() {
String html = "<a href='/foo'>Link</a><img src='/bar'>";
String clean = Jsoup.clean(html, "http://example.com/", Safelist.basicWithImages());
Expand Down
Loading