dudw · pull · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/CHANGES.md b/CHANGES.md
@@ -4,7 +4,8 @@
 
 ### Bug Fixes
 * Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459)
-* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Also clarified in documentation the which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472)
+* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Traversal now continues correctly from nodes that occupy the original position after mutation, and will not advance past the original root subtree. Also, clarified in the documentation which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472)
+* Parsing during charset sniffing no longer fails if an advisory `available()` call throws `IOException`, as seen on JDK 8 `HttpURLConnection`. [#2474](https://github.com/jhy/jsoup/issues/2474)
 
 ## 1.22.1 (2026-Jan-01)
 

diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 The MIT License
 
-Copyright (c) 2009-2025 Jonathan Hedley <https://jsoup.org/>
+Copyright (c) 2009-2026 Jonathan Hedley <https://jsoup.org/>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/pom.xml b/pom.xml
@@ -249,15 +249,15 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-surefire-plugin</artifactId>
-        <version>3.5.4</version>
+        <version>3.5.5</version>
         <configuration>
           <!-- smaller stack to find stack overflows. Was 256, but Zulu on MacOS ARM needs >= 640 -->
           <argLine>-Xss640k</argLine>
         </configuration>
       </plugin>
       <plugin>
         <artifactId>maven-failsafe-plugin</artifactId>
-        <version>3.5.4</version>
+        <version>3.5.5</version>
         <executions>
           <execution>
             <goals>
@@ -484,7 +484,7 @@
         <plugins>
           <plugin>
             <artifactId>maven-failsafe-plugin</artifactId>
-            <version>3.5.4</version>
+            <version>3.5.5</version>
             <executions>
               <execution>
                 <goals>
@@ -505,7 +505,7 @@
     <dependency>
       <groupId>org.junit.jupiter</groupId>
       <artifactId>junit-jupiter</artifactId>
-      <version>5.14.2</version>
+      <version>5.14.3</version>
       <scope>test</scope>
     </dependency>
 

diff --git a/src/main/java/org/jsoup/internal/SimpleBufferedInput.java b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java
@@ -80,7 +80,11 @@ private void fill() throws IOException {
             bufLength = read + bufPos;
             capRemaining -= read;
             while (byteBuf.length - bufLength > 0 && capRemaining > 0) { // read in more if we have space, without blocking
-                if (in.available() < 1) break;
+                try {
+                    if (in.available() < 1) break;
+                } catch (IOException e) {
+                    break; // available() is advisory; keep the bytes we've already buffered
+                }
                 toRead = Math.min(byteBuf.length - bufLength, capRemaining);
                 if (toRead <= 0) break;
                 read = in.read(byteBuf, bufLength, toRead);
@@ -116,8 +120,7 @@ public int available() throws IOException {
         if (buffered > 0) {
             return buffered; // doesn't include those in.available(), but mostly used as a block test
         }
-        int avail = inReadFully ? 0 : in.available();
-        return avail;
+        return inReadFully ? 0 : in.available();
     }
 
     void capRemaining(int newRemaining) {

diff --git a/src/main/java/org/jsoup/internal/SimpleStreamReader.java b/src/main/java/org/jsoup/internal/SimpleStreamReader.java
@@ -43,7 +43,7 @@ public int read(char[] charArray, int off, int len) throws IOException {
         while (true) {
             CoderResult result = decoder.decode(byteBuf, charBuf, readFully);
             if (result.isUnderflow()) {
-                if (readFully || !charBuf.hasRemaining() || (charBuf.position() > 0) && !(in.available() > 0))
+                if (readFully || !charBuf.hasRemaining() || (charBuf.position() > 0) && !hasAvailableBytes())
                     break;
                 int read = bufferUp();
                 if (read < 0) {
@@ -64,6 +64,14 @@ public int read(char[] charArray, int off, int len) throws IOException {
         return charBuf.position();
     }
 
+    private boolean hasAvailableBytes() {
+        try {
+            return in.available() > 0;
+        } catch (IOException e) {
+            return false; // available() is advisory; a real read can still consume buffered bytes or reach EOF
+        }
+    }
+
     private int bufferUp() throws IOException {
         assert byteBuf != null; // already validated ^
         byteBuf.compact();

diff --git a/src/main/java/org/jsoup/safety/Cleaner.java b/src/main/java/org/jsoup/safety/Cleaner.java
@@ -17,19 +17,24 @@
 import static org.jsoup.internal.SharedConstants.DummyUri;
 
 /**
- The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
+ The {@link Safelist}-based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
  that you are expecting; no junk, and no cross-site scripting attacks!
  <p>
- The HTML cleaner parses the input as HTML and then runs it through a safe-list, so the output HTML can only contain
+ The HTML cleaner parses the input as HTML and then runs it through a safelist, so the output HTML can only contain
  HTML that is allowed by the safelist.
  </p>
  <p>
  It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
- canned safe-lists only allow body contained tags.
+ canned safelists only allow body-contained tags.
  </p>
  <p>
  Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
  </p>
+ <p>
+ A Cleaner may be reused across multiple documents and shared across concurrent threads once its {@link Safelist} has
+ been configured. The cleaner uses the supplied safelist directly, so later safelist changes affect later cleaning
+ calls. If you need a variant of an existing configuration, use {@link Safelist#Safelist(Safelist)} to make a copy.
+ </p>
  */
 public class Cleaner {
     private final Safelist safelist;

diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java
@@ -22,7 +22,7 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/
 
 
 /**
- Safe-lists define what HTML (elements and attributes) to allow through the cleaner. Everything else is removed.
+ Safelists define what HTML (elements and attributes) to allow through a {@link Cleaner}. Everything else is removed.
  <p>
  Start with one of the defaults:
  </p>
@@ -53,15 +53,20 @@ If you need to allow more through (please be careful!), tweak a base safelist wi
  </ul>
 
  <p>
- The cleaner and these safelists assume that you want to clean a <code>body</code> fragment of HTML (to add user
+ The {@link Cleaner} and these safelists assume that you want to clean a <code>body</code> fragment of HTML (to add user
  supplied HTML into a templated page), and not to clean a full HTML document. If the latter is the case, you could wrap
  the templated document HTML around the cleaned body HTML.
  </p>
  <p>
+ Safelists are mutable. A {@link Cleaner} uses the supplied safelist directly, so later changes affect later cleaning
+ calls. If you want to share a safelist across threads, finish configuring it first and do not mutate it while it is in
+ use. To build a variant from an existing configuration, use {@link #Safelist(Safelist)} to make a copy.
+ </p>
+ <p>
  If you are going to extend a safelist, please be very careful. Make sure you understand what attributes may lead to
  XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See 
  the <a href="https://owasp.org/www-community/xss-filter-evasion-cheatsheet">XSS Filter Evasion Cheat Sheet</a> for some
- XSS attack examples (that jsoup will safegaurd against the default Cleaner and Safelist configuration).
+ XSS attack examples (that jsoup will safeguard against with the default Cleaner and Safelist configuration).
  </p>
  */
 public class Safelist {

diff --git a/src/main/java/org/jsoup/select/NodeTraversor.java b/src/main/java/org/jsoup/select/NodeTraversor.java
@@ -10,7 +10,7 @@
  order. The {@link NodeVisitor#head(Node, int)} and {@link NodeVisitor#tail(Node, int)} methods will be called for
  each node.
  <p>During the <code>head()</code> visit, DOM structural changes around the node currently being visited are
- supported, including e.g. {@link Node#replaceWith(Node)} and {@link Node#remove()}. See
+ supported, including {@link Node#replaceWith(Node)} and {@link Node#remove()}. See
  {@link NodeVisitor#head(Node, int) head()} for the traversal contract after mutation. Other non-structural node
  changes are also supported.</p>
  <p>DOM structural changes to the current node are not supported during the <code>tail()</code> visit.</p>
@@ -31,35 +31,40 @@ public static void traverse(NodeVisitor visitor, Node root) {
         Validate.notNull(visitor);
         Validate.notNull(root);
         Node node = root;
+        final Node rootNext = root.nextSibling(); // don't traverse siblings beyond the original root
         int depth = 0;
         byte state = VisitHead;
 
-        while (node != null) {
+        while (true) {
             if (state == VisitHead) {
                 // snapshot the current cursor position so we can recover if head() structurally changes it:
-                Node parent    = node.parentNode();
-                Node nextSib   = node.nextSibling();
-                int sibIndex   = parent != null ? node.siblingIndex()    : 0;
-                int childCount = parent != null ? parent.childNodeSize() : 0;
+                Node parent   = node.parentNode();
+                Node next     = node.nextSibling();
+                int  sibIndex = parent != null ? node.siblingIndex() : 0;
 
                 visitor.head(node, depth);
 
                 // any structural changes?
-                if (parent != null && !node.hasParent()) {      // node was removed from parent; try to recover by sibling index
-                    if (parent.childNodeSize() == childCount) { // current slot is still occupied
-                        node = parent.childNode(sibIndex);
-                        state = AfterHead;                      // continue from that slot without re-heading it
-                    } else if (nextSib != null) {               // removed; resume from the original next
-                        node = nextSib;
-                    } else {                                    // removed last child; tail the parent next
+                if (parent != null && node.parentNode() != parent) { // removed / replaced / moved
+                    Node occupant = sibIndex < parent.childNodeSize() ? parent.childNode(sibIndex) : null;
+                    // ^^ the node now at this node's former position
+                    Node boundary = depth == 0 ? rootNext : next;   // don't advance beyond this node when resuming
+                    if (occupant != null && occupant != boundary) {
+                        node = occupant;
+                        state = AfterHead;                          // continue from that slot without re-heading it
+                    } else if (depth == 0) {                        // root detached or replaced
+                        break;
+                    } else if (next != null && next.parentNode() == parent) {
+                        node = next;                                // old slot is empty or shifted to the original next, visit
+                    } else {                                        // removed last child; tail the parent next
                         node = parent;
                         depth--;
                         state = VisitTail;
                     }
                 } else {
                     state = AfterHead;
                 }
-                continue;                                       // next loop handles the updated node/state
+                continue;                                           // next loop handles the updated node/state
             }
 
             if (state == AfterHead && node.childNodeSize() > 0) { // descend into current children
@@ -71,10 +76,12 @@ public static void traverse(NodeVisitor visitor, Node root) {
 
             visitor.tail(node, depth);
 
-            if (node == root) break; // done
-
             Node next = node.nextSibling();
-            if (next != null) {     // traverse siblings
+            if (depth == 0) {
+                if (next == null || next == rootNext) break; // done with the original root range
+                node = next;
+                state = VisitHead;
+            } else if (next != null) { // traverse siblings
                 node = next;
                 state = VisitHead;
             } else {                // no siblings left, ascend

diff --git a/src/main/java/org/jsoup/select/NodeVisitor.java b/src/main/java/org/jsoup/select/NodeVisitor.java
@@ -40,6 +40,8 @@ <p>The node may be modified (for example via {@link Node#attr(String)}), removed
      <li>If the current node was detached and no node occupies its former sibling position, the current node is not
      passed to {@code tail()}, and traversal resumes at the node that originally followed it.</li>
      </ul>
+     <p>Traversal never advances outside the original root subtree. If the traversal root is detached during
+     {@code head()}, traversal stops at the original root boundary.</p>
 
      @param node the node being visited.
      @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node

diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java
@@ -418,4 +418,25 @@ public int available() {
         }
     }
 
+    @Test
+    void charsetSniffingIgnoresAdvisoryAvailableIOException() throws IOException {
+        // https://github.com/jhy/jsoup/issues/2474
+        // JDK 8's HttpURLConnection stream may throw from available() once the peer has closed the socket;
+        // that advisory failure does not mean we can't still consume bytes already buffered or read to clean EOF.
+        String html = "<!doctype html><html><head><title>One</title></head><body>Two</body></html>";
+        byte[] bytes = html.getBytes(StandardCharsets.UTF_8);
+        InputStream stream = new FilterInputStream(new ByteArrayInputStream(bytes)) {
+            @Override
+            public int available() throws IOException {
+                throw new IOException("Stream closed.");
+            }
+        };
+        ControllableInputStream in = ControllableInputStream.wrap(stream, 0);
+
+        Document doc = DataUtil.parseInputStream(in, null, "http://example.com/", Parser.htmlParser());
+
+        assertEquals("One", doc.title());
+        assertEquals("Two", doc.body().text());
+    }
+
 }
diff --git a/src/test/java/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/safety/CleanerTest.java
@@ -16,6 +16,9 @@
 
 import java.util.Arrays;
 import java.util.Locale;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
 
 import static org.junit.jupiter.api.Assertions.*;
 
@@ -226,6 +229,53 @@ public void safeListedProtocolShouldBeRetained(Locale locale) {
         assertFalse(new Cleaner(Safelist.none()).isValid(okDoc));
     }
 
+    @Test void configuredCleanerMayBeSharedAcrossThreads() throws InterruptedException {
+        // https://github.com/jhy/jsoup/issues/2473
+        String html = "<a href='/foo'>Link</a><img src='/bar' alt='Q'>";
+        String baseUri = "https://example.com/";
+        String expected = "<a href=\"https://example.com/foo\">Link</a><img src=\"https://example.com/bar\" alt=\"Q\">";
+        Cleaner cleaner = new Cleaner(Safelist.basicWithImages());
+
+        int numThreads = 10;
+        int numLoops = 20;
+        String[] cleaned = new String[numThreads * numLoops];
+        AtomicInteger next = new AtomicInteger();
+        AtomicReference<Throwable> failure = new AtomicReference<>();
+        CountDownLatch start = new CountDownLatch(1);
+        CountDownLatch done = new CountDownLatch(numThreads);
+        Thread[] threads = new Thread[numThreads];
+
+        for (int i = 0; i < numThreads; i++) {
+            Thread thread = new Thread(() -> {
+                try {
+                    start.await();
+                    for (int j = 0; j < numLoops; j++) {
+                        Document dirty = Jsoup.parseBodyFragment(html, baseUri);
+                        cleaned[next.getAndIncrement()] = cleaner.clean(dirty).body().html();
+                    }
+                } catch (Throwable t) {
+                    failure.compareAndSet(null, t);
+                    if (t instanceof InterruptedException) Thread.currentThread().interrupt();
+                } finally {
+                    done.countDown();
+                }
+            });
+            threads[i] = thread;
+            thread.start();
+        }
+
+        start.countDown();
+        done.await();
+
+        if (failure.get() != null)
+            throw new AssertionError("Concurrent cleaner use failed", failure.get());
+
+        assertEquals(cleaned.length, next.get());
+        for (String clean : cleaned) {
+            assertEquals(expected, clean);
+        }
+    }
+
     @Test public void resolvesRelativeLinks() {
         String html = "<a href='/foo'>Link</a><img src='/bar'>";
         String clean = Jsoup.clean(html, "http://example.com/", Safelist.basicWithImages());