From a96388d273d176cfd615b4fb89955f5f14c8c613 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 6 Mar 2026 17:06:37 +1100 Subject: [PATCH 1/7] Cover more NodeTraversor mutation cases and tighten recovery I found some additional mutation cases around root detachment, unwrap(), and count-changing sibling edits. Tightened traversal recovery so it stays within the original root boundary, continues correctly from the former sibling position, and add regression tests with full branch coverage. Related to #2742 --- CHANGES.md | 2 +- .../java/org/jsoup/select/NodeTraversor.java | 41 +++-- .../java/org/jsoup/select/NodeVisitor.java | 2 + .../java/org/jsoup/select/TraversorTest.java | 174 ++++++++++++++++++ 4 files changed, 201 insertions(+), 18 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index d3b8220fb6..bb60f30196 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,7 +4,7 @@ ### Bug Fixes * Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459) -* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Also clarified in documentation the which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472) +* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Traversal now continues correctly from nodes that occupy the original position after mutation, and will not advance past the original root subtree. Also, clarified in the documentation which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472) ## 1.22.1 (2026-Jan-01) diff --git a/src/main/java/org/jsoup/select/NodeTraversor.java b/src/main/java/org/jsoup/select/NodeTraversor.java index b29afd0941..32ac44194f 100644 --- a/src/main/java/org/jsoup/select/NodeTraversor.java +++ b/src/main/java/org/jsoup/select/NodeTraversor.java @@ -10,7 +10,7 @@ order. The {@link NodeVisitor#head(Node, int)} and {@link NodeVisitor#tail(Node, int)} methods will be called for each node.

During the head() visit, DOM structural changes around the node currently being visited are - supported, including e.g. {@link Node#replaceWith(Node)} and {@link Node#remove()}. See + supported, including {@link Node#replaceWith(Node)} and {@link Node#remove()}. See {@link NodeVisitor#head(Node, int) head()} for the traversal contract after mutation. Other non-structural node changes are also supported.

DOM structural changes to the current node are not supported during the tail() visit.

@@ -31,27 +31,32 @@ public static void traverse(NodeVisitor visitor, Node root) { Validate.notNull(visitor); Validate.notNull(root); Node node = root; + final Node rootNext = root.nextSibling(); // don't traverse siblings beyond the original root int depth = 0; byte state = VisitHead; - while (node != null) { + while (true) { if (state == VisitHead) { // snapshot the current cursor position so we can recover if head() structurally changes it: - Node parent = node.parentNode(); - Node nextSib = node.nextSibling(); - int sibIndex = parent != null ? node.siblingIndex() : 0; - int childCount = parent != null ? parent.childNodeSize() : 0; + Node parent = node.parentNode(); + Node next = node.nextSibling(); + int sibIndex = parent != null ? node.siblingIndex() : 0; visitor.head(node, depth); // any structural changes? - if (parent != null && !node.hasParent()) { // node was removed from parent; try to recover by sibling index - if (parent.childNodeSize() == childCount) { // current slot is still occupied - node = parent.childNode(sibIndex); - state = AfterHead; // continue from that slot without re-heading it - } else if (nextSib != null) { // removed; resume from the original next - node = nextSib; - } else { // removed last child; tail the parent next + if (parent != null && node.parentNode() != parent) { // removed / replaced / moved + Node occupant = sibIndex < parent.childNodeSize() ? parent.childNode(sibIndex) : null; + // ^^ the node now at this node's former position + Node boundary = depth == 0 ? rootNext : next; // don't advance beyond this node when resuming + if (occupant != null && occupant != boundary) { + node = occupant; + state = AfterHead; // continue from that slot without re-heading it + } else if (depth == 0) { // root detached or replaced + break; + } else if (next != null && next.parentNode() == parent) { + node = next; // old slot is empty or shifted to the original next, visit + } else { // removed last child; tail the parent next node = parent; depth--; state = VisitTail; @@ -59,7 +64,7 @@ public static void traverse(NodeVisitor visitor, Node root) { } else { state = AfterHead; } - continue; // next loop handles the updated node/state + continue; // next loop handles the updated node/state } if (state == AfterHead && node.childNodeSize() > 0) { // descend into current children @@ -71,10 +76,12 @@ public static void traverse(NodeVisitor visitor, Node root) { visitor.tail(node, depth); - if (node == root) break; // done - Node next = node.nextSibling(); - if (next != null) { // traverse siblings + if (depth == 0) { + if (next == null || next == rootNext) break; // done with the original root range + node = next; + state = VisitHead; + } else if (next != null) { // traverse siblings node = next; state = VisitHead; } else { // no siblings left, ascend diff --git a/src/main/java/org/jsoup/select/NodeVisitor.java b/src/main/java/org/jsoup/select/NodeVisitor.java index b2375c49c6..479d79be0a 100644 --- a/src/main/java/org/jsoup/select/NodeVisitor.java +++ b/src/main/java/org/jsoup/select/NodeVisitor.java @@ -40,6 +40,8 @@

The node may be modified (for example via {@link Node#attr(String)}), removed

  • If the current node was detached and no node occupies its former sibling position, the current node is not passed to {@code tail()}, and traversal resumes at the node that originally followed it.
  • +

    Traversal never advances outside the original root subtree. If the traversal root is detached during + {@code head()}, traversal stops at the original root boundary.

    @param node the node being visited. @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node diff --git a/src/test/java/org/jsoup/select/TraversorTest.java b/src/test/java/org/jsoup/select/TraversorTest.java index 2ff1f18f53..cb18b5106f 100644 --- a/src/test/java/org/jsoup/select/TraversorTest.java +++ b/src/test/java/org/jsoup/select/TraversorTest.java @@ -372,6 +372,180 @@ void visitsChildrenInsertedInHead() { assertEquals("

    child

    ", TextUtil.stripNewlines(doc.body().html())); } + @Test + void removingTraversalRootInHeadDoesNotEscapeOriginalSubtree() { + Document doc = Jsoup.parseBodyFragment("
    a

    b

    "); + Element root = doc.expectFirst("div"); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + trackSeen(node, headOrder); + if (node == root) + node.remove(); + } + + @Override public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, root); + + assertEquals("div;", headOrder.toString()); + assertEquals("", tailOrder.toString()); + assertEquals("

    b

    ", TextUtil.stripNewlines(doc.body().html())); + } + + @Test + void removingOnlyTraversalRootInHeadStopsWhenOriginalSlotIsEmpty() { + Document doc = Jsoup.parseBodyFragment("
    a
    "); + Element root = doc.expectFirst("div"); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + trackSeen(node, headOrder); + if (node == root) + node.remove(); + } + + @Override public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, root); + + assertEquals("div;", headOrder.toString()); + assertEquals("", tailOrder.toString()); + assertEquals("", TextUtil.stripNewlines(doc.body().html())); + } + + @Test + void replacingTraversalRootInHeadStaysWithinReplacementSubtree() { + Document doc = Jsoup.parseBodyFragment("
    x

    y

    "); + Element root = doc.expectFirst("div"); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + trackSeen(node, headOrder); + if (node == root) { + Element replacement = new Element("section").insertChildren(0, node.childNodes()); + node.replaceWith(replacement); + } + } + + @Override public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, root); + + assertEquals("div;span;x;", headOrder.toString()); + assertEquals("x;span;section;", tailOrder.toString()); + assertEquals("
    x

    y

    ", TextUtil.stripNewlines(doc.body().html())); + } + + @Test + void unwrappingTraversalRootInHeadVisitsExposedTopLevelNodesUntilOriginalBoundary() { + Document doc = Jsoup.parseBodyFragment("
    xy

    z

    "); + Element root = doc.expectFirst("div"); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + trackSeen(node, headOrder); + if (node == root) + node.unwrap(); + } + + @Override public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, root); + + assertEquals("div;x;i;y;", headOrder.toString()); + assertEquals("x;b;y;i;", tailOrder.toString()); + assertEquals("xy

    z

    ", TextUtil.stripNewlines(doc.body().html())); + } + + @Test + void unwrapInHeadContinuesFromExposedChildren() { + Document doc = Jsoup.parseBodyFragment("
    xyz
    "); + Element root = doc.expectFirst("div"); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + trackSeen(node, headOrder); + if (node instanceof Element && node.nameIs("span")) + node.unwrap(); + } + + @Override public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, root); + + assertEquals("div;span;x;i;y;u;z;", headOrder.toString()); + assertEquals("x;b;y;i;z;u;div;", tailOrder.toString()); + assertEquals("
    xyz
    ", TextUtil.stripNewlines(doc.body().html())); + } + + @Test + void removingCurrentAndOriginalNextInHeadTailsParent() { + Document doc = Jsoup.parseBodyFragment("
    aTwo
    "); + Element root = doc.expectFirst("div"); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + trackSeen(node, headOrder); + if (node instanceof TextNode && ((TextNode) node).text().equals("a")) { + node.nextSibling().remove(); + node.remove(); + } + } + + @Override public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, root); + + assertEquals("div;a;", headOrder.toString()); + assertEquals("div;", tailOrder.toString()); + assertEquals("
    ", TextUtil.stripNewlines(doc.body().html())); + } + + @Test + void beforeAfterRemoveInHeadTailsCurrentSlotAndHeadsFutureSiblings() { + Document doc = Jsoup.parse("

    One

    aTwo
    "); + StringBuilder headOrder = new StringBuilder(); + StringBuilder tailOrder = new StringBuilder(); + + NodeTraversor.traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { + trackSeen(node, headOrder); + if (node instanceof TextNode && ((TextNode) node).text().equals("a")) { + node.before(new TextNode("b")); + node.after(new TextNode("c")); + node.remove(); + } + } + + @Override public void tail(Node node, int depth) { + trackSeen(node, tailOrder); + } + }, doc); + + assertEquals("#root;html;head;body;div;p;One;a;c;em;Two;", headOrder.toString()); + assertEquals("head;One;p;b;c;Two;em;div;body;html;#root;", tailOrder.toString()); + assertEquals("

    One

    bcTwo
    ", TextUtil.stripNewlines(doc.body().html())); + } + @Test void elementFunctionalTraverse() { Document doc = Jsoup.parse("

    1

    2

    3"); Element body = doc.body(); From 75a09696ddcb8bb2969f3d715702e2a0715bdc75 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 6 Mar 2026 17:11:30 +1100 Subject: [PATCH 2/7] Yes it's 2026 now --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index e4bf2be9fb..0b2f502e99 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License -Copyright (c) 2009-2025 Jonathan Hedley +Copyright (c) 2009-2026 Jonathan Hedley Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From b77e51bac1191faf328fa52ae29fcaf9d0456954 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 6 Mar 2026 18:13:36 +1100 Subject: [PATCH 3/7] Bump org.junit.jupiter:junit-jupiter from 5.14.2 to 5.14.3 (#2469) Bumps [org.junit.jupiter:junit-jupiter](https://github.com/junit-team/junit-framework) from 5.14.2 to 5.14.3. - [Release notes](https://github.com/junit-team/junit-framework/releases) - [Commits](https://github.com/junit-team/junit-framework/compare/r5.14.2...r5.14.3) --- updated-dependencies: - dependency-name: org.junit.jupiter:junit-jupiter dependency-version: 5.14.3 dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 707cfd3d00..934d1527fc 100644 --- a/pom.xml +++ b/pom.xml @@ -505,7 +505,7 @@ org.junit.jupiter junit-jupiter - 5.14.2 + 5.14.3 test From 49e58c202883648925db5bb2b7d3f3d010a518f8 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 6 Mar 2026 19:03:51 +1100 Subject: [PATCH 4/7] Catch spurious `available()` throws Fixes #2474 Was causing CI flakes on the JDK 8 runner; quite possibly in other environments too. --- CHANGES.md | 1 + .../jsoup/internal/SimpleBufferedInput.java | 9 +++++--- .../jsoup/internal/SimpleStreamReader.java | 10 ++++++++- .../java/org/jsoup/helper/DataUtilTest.java | 21 +++++++++++++++++++ 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index bb60f30196..c2520ece21 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,6 +5,7 @@ ### Bug Fixes * Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459) * In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Traversal now continues correctly from nodes that occupy the original position after mutation, and will not advance past the original root subtree. Also, clarified in the documentation which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472) +* Parsing during charset sniffing no longer fails if an advisory `available()` call throws `IOException`, as seen on JDK 8 `HttpURLConnection`. [#2474](https://github.com/jhy/jsoup/issues/2474) ## 1.22.1 (2026-Jan-01) diff --git a/src/main/java/org/jsoup/internal/SimpleBufferedInput.java b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java index dafbd3af4a..daeeaed9c5 100644 --- a/src/main/java/org/jsoup/internal/SimpleBufferedInput.java +++ b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java @@ -80,7 +80,11 @@ private void fill() throws IOException { bufLength = read + bufPos; capRemaining -= read; while (byteBuf.length - bufLength > 0 && capRemaining > 0) { // read in more if we have space, without blocking - if (in.available() < 1) break; + try { + if (in.available() < 1) break; + } catch (IOException e) { + break; // available() is advisory; keep the bytes we've already buffered + } toRead = Math.min(byteBuf.length - bufLength, capRemaining); if (toRead <= 0) break; read = in.read(byteBuf, bufLength, toRead); @@ -116,8 +120,7 @@ public int available() throws IOException { if (buffered > 0) { return buffered; // doesn't include those in.available(), but mostly used as a block test } - int avail = inReadFully ? 0 : in.available(); - return avail; + return inReadFully ? 0 : in.available(); } void capRemaining(int newRemaining) { diff --git a/src/main/java/org/jsoup/internal/SimpleStreamReader.java b/src/main/java/org/jsoup/internal/SimpleStreamReader.java index 96fc9079db..15cff23d99 100644 --- a/src/main/java/org/jsoup/internal/SimpleStreamReader.java +++ b/src/main/java/org/jsoup/internal/SimpleStreamReader.java @@ -43,7 +43,7 @@ public int read(char[] charArray, int off, int len) throws IOException { while (true) { CoderResult result = decoder.decode(byteBuf, charBuf, readFully); if (result.isUnderflow()) { - if (readFully || !charBuf.hasRemaining() || (charBuf.position() > 0) && !(in.available() > 0)) + if (readFully || !charBuf.hasRemaining() || (charBuf.position() > 0) && !hasAvailableBytes()) break; int read = bufferUp(); if (read < 0) { @@ -64,6 +64,14 @@ public int read(char[] charArray, int off, int len) throws IOException { return charBuf.position(); } + private boolean hasAvailableBytes() { + try { + return in.available() > 0; + } catch (IOException e) { + return false; // available() is advisory; a real read can still consume buffered bytes or reach EOF + } + } + private int bufferUp() throws IOException { assert byteBuf != null; // already validated ^ byteBuf.compact(); diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java index f557d2895c..1936871bc3 100644 --- a/src/test/java/org/jsoup/helper/DataUtilTest.java +++ b/src/test/java/org/jsoup/helper/DataUtilTest.java @@ -418,4 +418,25 @@ public int available() { } } + @Test + void charsetSniffingIgnoresAdvisoryAvailableIOException() throws IOException { + // https://github.com/jhy/jsoup/issues/2474 + // JDK 8's HttpURLConnection stream may throw from available() once the peer has closed the socket; + // that advisory failure does not mean we can't still consume bytes already buffered or read to clean EOF. + String html = "OneTwo"; + byte[] bytes = html.getBytes(StandardCharsets.UTF_8); + InputStream stream = new FilterInputStream(new ByteArrayInputStream(bytes)) { + @Override + public int available() throws IOException { + throw new IOException("Stream closed."); + } + }; + ControllableInputStream in = ControllableInputStream.wrap(stream, 0); + + Document doc = DataUtil.parseInputStream(in, null, "http://example.com/", Parser.htmlParser()); + + assertEquals("One", doc.title()); + assertEquals("Two", doc.body().text()); + } + } From 6a03db835273990489b4d12c5949d52ec8bd2e0f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 6 Mar 2026 19:06:13 +1100 Subject: [PATCH 5/7] Bump org.apache.maven.plugins:maven-surefire-plugin from 3.5.4 to 3.5.5 (#2471) Bumps [org.apache.maven.plugins:maven-surefire-plugin](https://github.com/apache/maven-surefire) from 3.5.4 to 3.5.5. - [Release notes](https://github.com/apache/maven-surefire/releases) - [Commits](https://github.com/apache/maven-surefire/compare/surefire-3.5.4...surefire-3.5.5) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-surefire-plugin dependency-version: 3.5.5 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 934d1527fc..88be43f469 100644 --- a/pom.xml +++ b/pom.xml @@ -249,7 +249,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.5.4 + 3.5.5 -Xss640k From 99bbdee251624a3c0af03792959c70a365d7df20 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 6 Mar 2026 19:07:34 +1100 Subject: [PATCH 6/7] Bump org.apache.maven.plugins:maven-failsafe-plugin from 3.5.4 to 3.5.5 (#2470) Bumps [org.apache.maven.plugins:maven-failsafe-plugin](https://github.com/apache/maven-surefire) from 3.5.4 to 3.5.5. - [Release notes](https://github.com/apache/maven-surefire/releases) - [Commits](https://github.com/apache/maven-surefire/compare/surefire-3.5.4...surefire-3.5.5) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-failsafe-plugin dependency-version: 3.5.5 dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 88be43f469..d7e4b2b89f 100644 --- a/pom.xml +++ b/pom.xml @@ -257,7 +257,7 @@ maven-failsafe-plugin - 3.5.4 + 3.5.5 @@ -484,7 +484,7 @@ maven-failsafe-plugin - 3.5.4 + 3.5.5 From be7ca538fef7aff52fafa26cb6d6fcaaa9e4c5a7 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 6 Mar 2026 19:37:09 +1100 Subject: [PATCH 7/7] Document Cleaner and Safelist thread safety And add a guard test Fixes #2473 --- src/main/java/org/jsoup/safety/Cleaner.java | 11 ++-- src/main/java/org/jsoup/safety/Safelist.java | 11 ++-- .../java/org/jsoup/safety/CleanerTest.java | 50 +++++++++++++++++++ 3 files changed, 66 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/jsoup/safety/Cleaner.java b/src/main/java/org/jsoup/safety/Cleaner.java index 6ea53d084b..f5fd8766c1 100644 --- a/src/main/java/org/jsoup/safety/Cleaner.java +++ b/src/main/java/org/jsoup/safety/Cleaner.java @@ -17,19 +17,24 @@ import static org.jsoup.internal.SharedConstants.DummyUri; /** - The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes + The {@link Safelist}-based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes that you are expecting; no junk, and no cross-site scripting attacks!

    - The HTML cleaner parses the input as HTML and then runs it through a safe-list, so the output HTML can only contain + The HTML cleaner parses the input as HTML and then runs it through a safelist, so the output HTML can only contain HTML that is allowed by the safelist.

    It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the - canned safe-lists only allow body contained tags. + canned safelists only allow body-contained tags.

    Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.

    +

    + A Cleaner may be reused across multiple documents and shared across concurrent threads once its {@link Safelist} has + been configured. The cleaner uses the supplied safelist directly, so later safelist changes affect later cleaning + calls. If you need a variant of an existing configuration, use {@link Safelist#Safelist(Safelist)} to make a copy. +

    */ public class Cleaner { private final Safelist safelist; diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java index cf76c08d3b..e24ab860ab 100644 --- a/src/main/java/org/jsoup/safety/Safelist.java +++ b/src/main/java/org/jsoup/safety/Safelist.java @@ -22,7 +22,7 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/ /** - Safe-lists define what HTML (elements and attributes) to allow through the cleaner. Everything else is removed. + Safelists define what HTML (elements and attributes) to allow through a {@link Cleaner}. Everything else is removed.

    Start with one of the defaults:

    @@ -53,15 +53,20 @@ If you need to allow more through (please be careful!), tweak a base safelist wi

    - The cleaner and these safelists assume that you want to clean a body fragment of HTML (to add user + The {@link Cleaner} and these safelists assume that you want to clean a body fragment of HTML (to add user supplied HTML into a templated page), and not to clean a full HTML document. If the latter is the case, you could wrap the templated document HTML around the cleaned body HTML.

    + Safelists are mutable. A {@link Cleaner} uses the supplied safelist directly, so later changes affect later cleaning + calls. If you want to share a safelist across threads, finish configuring it first and do not mutate it while it is in + use. To build a variant from an existing configuration, use {@link #Safelist(Safelist)} to make a copy. +

    +

    If you are going to extend a safelist, please be very careful. Make sure you understand what attributes may lead to XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See the XSS Filter Evasion Cheat Sheet for some - XSS attack examples (that jsoup will safegaurd against the default Cleaner and Safelist configuration). + XSS attack examples (that jsoup will safeguard against with the default Cleaner and Safelist configuration).

    */ public class Safelist { diff --git a/src/test/java/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/safety/CleanerTest.java index 30c05edba6..7d80d8bced 100644 --- a/src/test/java/org/jsoup/safety/CleanerTest.java +++ b/src/test/java/org/jsoup/safety/CleanerTest.java @@ -16,6 +16,9 @@ import java.util.Arrays; import java.util.Locale; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; import static org.junit.jupiter.api.Assertions.*; @@ -226,6 +229,53 @@ public void safeListedProtocolShouldBeRetained(Locale locale) { assertFalse(new Cleaner(Safelist.none()).isValid(okDoc)); } + @Test void configuredCleanerMayBeSharedAcrossThreads() throws InterruptedException { + // https://github.com/jhy/jsoup/issues/2473 + String html = "LinkQ"; + String baseUri = "https://example.com/"; + String expected = "Link\"Q\""; + Cleaner cleaner = new Cleaner(Safelist.basicWithImages()); + + int numThreads = 10; + int numLoops = 20; + String[] cleaned = new String[numThreads * numLoops]; + AtomicInteger next = new AtomicInteger(); + AtomicReference failure = new AtomicReference<>(); + CountDownLatch start = new CountDownLatch(1); + CountDownLatch done = new CountDownLatch(numThreads); + Thread[] threads = new Thread[numThreads]; + + for (int i = 0; i < numThreads; i++) { + Thread thread = new Thread(() -> { + try { + start.await(); + for (int j = 0; j < numLoops; j++) { + Document dirty = Jsoup.parseBodyFragment(html, baseUri); + cleaned[next.getAndIncrement()] = cleaner.clean(dirty).body().html(); + } + } catch (Throwable t) { + failure.compareAndSet(null, t); + if (t instanceof InterruptedException) Thread.currentThread().interrupt(); + } finally { + done.countDown(); + } + }); + threads[i] = thread; + thread.start(); + } + + start.countDown(); + done.await(); + + if (failure.get() != null) + throw new AssertionError("Concurrent cleaner use failed", failure.get()); + + assertEquals(cleaned.length, next.get()); + for (String clean : cleaned) { + assertEquals(expected, clean); + } + } + @Test public void resolvesRelativeLinks() { String html = "Link"; String clean = Jsoup.clean(html, "http://example.com/", Safelist.basicWithImages());