@@ -505,7 +505,7 @@
org.junit.jupiter
junit-jupiter
- 5.14.2
+ 5.14.3
test
diff --git a/src/main/java/org/jsoup/internal/SimpleBufferedInput.java b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java
index dafbd3af4a..daeeaed9c5 100644
--- a/src/main/java/org/jsoup/internal/SimpleBufferedInput.java
+++ b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java
@@ -80,7 +80,11 @@ private void fill() throws IOException {
bufLength = read + bufPos;
capRemaining -= read;
while (byteBuf.length - bufLength > 0 && capRemaining > 0) { // read in more if we have space, without blocking
- if (in.available() < 1) break;
+ try {
+ if (in.available() < 1) break;
+ } catch (IOException e) {
+ break; // available() is advisory; keep the bytes we've already buffered
+ }
toRead = Math.min(byteBuf.length - bufLength, capRemaining);
if (toRead <= 0) break;
read = in.read(byteBuf, bufLength, toRead);
@@ -116,8 +120,7 @@ public int available() throws IOException {
if (buffered > 0) {
return buffered; // doesn't include those in.available(), but mostly used as a block test
}
- int avail = inReadFully ? 0 : in.available();
- return avail;
+ return inReadFully ? 0 : in.available();
}
void capRemaining(int newRemaining) {
diff --git a/src/main/java/org/jsoup/internal/SimpleStreamReader.java b/src/main/java/org/jsoup/internal/SimpleStreamReader.java
index 96fc9079db..15cff23d99 100644
--- a/src/main/java/org/jsoup/internal/SimpleStreamReader.java
+++ b/src/main/java/org/jsoup/internal/SimpleStreamReader.java
@@ -43,7 +43,7 @@ public int read(char[] charArray, int off, int len) throws IOException {
while (true) {
CoderResult result = decoder.decode(byteBuf, charBuf, readFully);
if (result.isUnderflow()) {
- if (readFully || !charBuf.hasRemaining() || (charBuf.position() > 0) && !(in.available() > 0))
+ if (readFully || !charBuf.hasRemaining() || (charBuf.position() > 0) && !hasAvailableBytes())
break;
int read = bufferUp();
if (read < 0) {
@@ -64,6 +64,14 @@ public int read(char[] charArray, int off, int len) throws IOException {
return charBuf.position();
}
+ private boolean hasAvailableBytes() {
+ try {
+ return in.available() > 0;
+ } catch (IOException e) {
+ return false; // available() is advisory; a real read can still consume buffered bytes or reach EOF
+ }
+ }
+
private int bufferUp() throws IOException {
assert byteBuf != null; // already validated ^
byteBuf.compact();
diff --git a/src/main/java/org/jsoup/safety/Cleaner.java b/src/main/java/org/jsoup/safety/Cleaner.java
index 6ea53d084b..f5fd8766c1 100644
--- a/src/main/java/org/jsoup/safety/Cleaner.java
+++ b/src/main/java/org/jsoup/safety/Cleaner.java
@@ -17,19 +17,24 @@
import static org.jsoup.internal.SharedConstants.DummyUri;
/**
- The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
+ The {@link Safelist}-based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
that you are expecting; no junk, and no cross-site scripting attacks!
- The HTML cleaner parses the input as HTML and then runs it through a safe-list, so the output HTML can only contain
+ The HTML cleaner parses the input as HTML and then runs it through a safelist, so the output HTML can only contain
HTML that is allowed by the safelist.
It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
- canned safe-lists only allow body contained tags.
+ canned safelists only allow body-contained tags.
Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
+
+ A Cleaner may be reused across multiple documents and shared across concurrent threads once its {@link Safelist} has
+ been configured. The cleaner uses the supplied safelist directly, so later safelist changes affect later cleaning
+ calls. If you need a variant of an existing configuration, use {@link Safelist#Safelist(Safelist)} to make a copy.
+
*/
public class Cleaner {
private final Safelist safelist;
diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java
index cf76c08d3b..e24ab860ab 100644
--- a/src/main/java/org/jsoup/safety/Safelist.java
+++ b/src/main/java/org/jsoup/safety/Safelist.java
@@ -22,7 +22,7 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/
/**
- Safe-lists define what HTML (elements and attributes) to allow through the cleaner. Everything else is removed.
+ Safelists define what HTML (elements and attributes) to allow through a {@link Cleaner}. Everything else is removed.
Start with one of the defaults:
@@ -53,15 +53,20 @@ If you need to allow more through (please be careful!), tweak a base safelist wi
- The cleaner and these safelists assume that you want to clean a body fragment of HTML (to add user
+ The {@link Cleaner} and these safelists assume that you want to clean a body fragment of HTML (to add user
supplied HTML into a templated page), and not to clean a full HTML document. If the latter is the case, you could wrap
the templated document HTML around the cleaned body HTML.
+ Safelists are mutable. A {@link Cleaner} uses the supplied safelist directly, so later changes affect later cleaning
+ calls. If you want to share a safelist across threads, finish configuring it first and do not mutate it while it is in
+ use. To build a variant from an existing configuration, use {@link #Safelist(Safelist)} to make a copy.
+
+
If you are going to extend a safelist, please be very careful. Make sure you understand what attributes may lead to
XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See
the XSS Filter Evasion Cheat Sheet for some
- XSS attack examples (that jsoup will safegaurd against the default Cleaner and Safelist configuration).
+ XSS attack examples (that jsoup will safeguard against with the default Cleaner and Safelist configuration).
*/
public class Safelist {
diff --git a/src/main/java/org/jsoup/select/NodeTraversor.java b/src/main/java/org/jsoup/select/NodeTraversor.java
index b29afd0941..32ac44194f 100644
--- a/src/main/java/org/jsoup/select/NodeTraversor.java
+++ b/src/main/java/org/jsoup/select/NodeTraversor.java
@@ -10,7 +10,7 @@
order. The {@link NodeVisitor#head(Node, int)} and {@link NodeVisitor#tail(Node, int)} methods will be called for
each node.
During the head() visit, DOM structural changes around the node currently being visited are
- supported, including e.g. {@link Node#replaceWith(Node)} and {@link Node#remove()}. See
+ supported, including {@link Node#replaceWith(Node)} and {@link Node#remove()}. See
{@link NodeVisitor#head(Node, int) head()} for the traversal contract after mutation. Other non-structural node
changes are also supported.
DOM structural changes to the current node are not supported during the tail() visit.
@@ -31,27 +31,32 @@ public static void traverse(NodeVisitor visitor, Node root) {
Validate.notNull(visitor);
Validate.notNull(root);
Node node = root;
+ final Node rootNext = root.nextSibling(); // don't traverse siblings beyond the original root
int depth = 0;
byte state = VisitHead;
- while (node != null) {
+ while (true) {
if (state == VisitHead) {
// snapshot the current cursor position so we can recover if head() structurally changes it:
- Node parent = node.parentNode();
- Node nextSib = node.nextSibling();
- int sibIndex = parent != null ? node.siblingIndex() : 0;
- int childCount = parent != null ? parent.childNodeSize() : 0;
+ Node parent = node.parentNode();
+ Node next = node.nextSibling();
+ int sibIndex = parent != null ? node.siblingIndex() : 0;
visitor.head(node, depth);
// any structural changes?
- if (parent != null && !node.hasParent()) { // node was removed from parent; try to recover by sibling index
- if (parent.childNodeSize() == childCount) { // current slot is still occupied
- node = parent.childNode(sibIndex);
- state = AfterHead; // continue from that slot without re-heading it
- } else if (nextSib != null) { // removed; resume from the original next
- node = nextSib;
- } else { // removed last child; tail the parent next
+ if (parent != null && node.parentNode() != parent) { // removed / replaced / moved
+ Node occupant = sibIndex < parent.childNodeSize() ? parent.childNode(sibIndex) : null;
+ // ^^ the node now at this node's former position
+ Node boundary = depth == 0 ? rootNext : next; // don't advance beyond this node when resuming
+ if (occupant != null && occupant != boundary) {
+ node = occupant;
+ state = AfterHead; // continue from that slot without re-heading it
+ } else if (depth == 0) { // root detached or replaced
+ break;
+ } else if (next != null && next.parentNode() == parent) {
+ node = next; // old slot is empty or shifted to the original next, visit
+ } else { // removed last child; tail the parent next
node = parent;
depth--;
state = VisitTail;
@@ -59,7 +64,7 @@ public static void traverse(NodeVisitor visitor, Node root) {
} else {
state = AfterHead;
}
- continue; // next loop handles the updated node/state
+ continue; // next loop handles the updated node/state
}
if (state == AfterHead && node.childNodeSize() > 0) { // descend into current children
@@ -71,10 +76,12 @@ public static void traverse(NodeVisitor visitor, Node root) {
visitor.tail(node, depth);
- if (node == root) break; // done
-
Node next = node.nextSibling();
- if (next != null) { // traverse siblings
+ if (depth == 0) {
+ if (next == null || next == rootNext) break; // done with the original root range
+ node = next;
+ state = VisitHead;
+ } else if (next != null) { // traverse siblings
node = next;
state = VisitHead;
} else { // no siblings left, ascend
diff --git a/src/main/java/org/jsoup/select/NodeVisitor.java b/src/main/java/org/jsoup/select/NodeVisitor.java
index b2375c49c6..479d79be0a 100644
--- a/src/main/java/org/jsoup/select/NodeVisitor.java
+++ b/src/main/java/org/jsoup/select/NodeVisitor.java
@@ -40,6 +40,8 @@ The node may be modified (for example via {@link Node#attr(String)}), removed
If the current node was detached and no node occupies its former sibling position, the current node is not
passed to {@code tail()}, and traversal resumes at the node that originally followed it.
+ Traversal never advances outside the original root subtree. If the traversal root is detached during
+ {@code head()}, traversal stops at the original root boundary.
@param node the node being visited.
@param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java
index f557d2895c..1936871bc3 100644
--- a/src/test/java/org/jsoup/helper/DataUtilTest.java
+++ b/src/test/java/org/jsoup/helper/DataUtilTest.java
@@ -418,4 +418,25 @@ public int available() {
}
}
+ @Test
+ void charsetSniffingIgnoresAdvisoryAvailableIOException() throws IOException {
+ // https://github.com/jhy/jsoup/issues/2474
+ // JDK 8's HttpURLConnection stream may throw from available() once the peer has closed the socket;
+ // that advisory failure does not mean we can't still consume bytes already buffered or read to clean EOF.
+ String html = "OneTwo";
+ byte[] bytes = html.getBytes(StandardCharsets.UTF_8);
+ InputStream stream = new FilterInputStream(new ByteArrayInputStream(bytes)) {
+ @Override
+ public int available() throws IOException {
+ throw new IOException("Stream closed.");
+ }
+ };
+ ControllableInputStream in = ControllableInputStream.wrap(stream, 0);
+
+ Document doc = DataUtil.parseInputStream(in, null, "http://example.com/", Parser.htmlParser());
+
+ assertEquals("One", doc.title());
+ assertEquals("Two", doc.body().text());
+ }
+
}
diff --git a/src/test/java/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/safety/CleanerTest.java
index 30c05edba6..7d80d8bced 100644
--- a/src/test/java/org/jsoup/safety/CleanerTest.java
+++ b/src/test/java/org/jsoup/safety/CleanerTest.java
@@ -16,6 +16,9 @@
import java.util.Arrays;
import java.util.Locale;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
import static org.junit.jupiter.api.Assertions.*;
@@ -226,6 +229,53 @@ public void safeListedProtocolShouldBeRetained(Locale locale) {
assertFalse(new Cleaner(Safelist.none()).isValid(okDoc));
}
+ @Test void configuredCleanerMayBeSharedAcrossThreads() throws InterruptedException {
+ // https://github.com/jhy/jsoup/issues/2473
+ String html = "Link
";
+ String baseUri = "https://example.com/";
+ String expected = "Link
";
+ Cleaner cleaner = new Cleaner(Safelist.basicWithImages());
+
+ int numThreads = 10;
+ int numLoops = 20;
+ String[] cleaned = new String[numThreads * numLoops];
+ AtomicInteger next = new AtomicInteger();
+ AtomicReference failure = new AtomicReference<>();
+ CountDownLatch start = new CountDownLatch(1);
+ CountDownLatch done = new CountDownLatch(numThreads);
+ Thread[] threads = new Thread[numThreads];
+
+ for (int i = 0; i < numThreads; i++) {
+ Thread thread = new Thread(() -> {
+ try {
+ start.await();
+ for (int j = 0; j < numLoops; j++) {
+ Document dirty = Jsoup.parseBodyFragment(html, baseUri);
+ cleaned[next.getAndIncrement()] = cleaner.clean(dirty).body().html();
+ }
+ } catch (Throwable t) {
+ failure.compareAndSet(null, t);
+ if (t instanceof InterruptedException) Thread.currentThread().interrupt();
+ } finally {
+ done.countDown();
+ }
+ });
+ threads[i] = thread;
+ thread.start();
+ }
+
+ start.countDown();
+ done.await();
+
+ if (failure.get() != null)
+ throw new AssertionError("Concurrent cleaner use failed", failure.get());
+
+ assertEquals(cleaned.length, next.get());
+ for (String clean : cleaned) {
+ assertEquals(expected, clean);
+ }
+ }
+
@Test public void resolvesRelativeLinks() {
String html = "Link
";
String clean = Jsoup.clean(html, "http://example.com/", Safelist.basicWithImages());
diff --git a/src/test/java/org/jsoup/select/TraversorTest.java b/src/test/java/org/jsoup/select/TraversorTest.java
index 2ff1f18f53..cb18b5106f 100644
--- a/src/test/java/org/jsoup/select/TraversorTest.java
+++ b/src/test/java/org/jsoup/select/TraversorTest.java
@@ -372,6 +372,180 @@ void visitsChildrenInsertedInHead() {
assertEquals("", TextUtil.stripNewlines(doc.body().html()));
}
+ @Test
+ void removingTraversalRootInHeadDoesNotEscapeOriginalSubtree() {
+ Document doc = Jsoup.parseBodyFragment("a
b
");
+ Element root = doc.expectFirst("div");
+ StringBuilder headOrder = new StringBuilder();
+ StringBuilder tailOrder = new StringBuilder();
+
+ NodeTraversor.traverse(new NodeVisitor() {
+ @Override public void head(Node node, int depth) {
+ trackSeen(node, headOrder);
+ if (node == root)
+ node.remove();
+ }
+
+ @Override public void tail(Node node, int depth) {
+ trackSeen(node, tailOrder);
+ }
+ }, root);
+
+ assertEquals("div;", headOrder.toString());
+ assertEquals("", tailOrder.toString());
+ assertEquals("b
", TextUtil.stripNewlines(doc.body().html()));
+ }
+
+ @Test
+ void removingOnlyTraversalRootInHeadStopsWhenOriginalSlotIsEmpty() {
+ Document doc = Jsoup.parseBodyFragment("a
");
+ Element root = doc.expectFirst("div");
+ StringBuilder headOrder = new StringBuilder();
+ StringBuilder tailOrder = new StringBuilder();
+
+ NodeTraversor.traverse(new NodeVisitor() {
+ @Override public void head(Node node, int depth) {
+ trackSeen(node, headOrder);
+ if (node == root)
+ node.remove();
+ }
+
+ @Override public void tail(Node node, int depth) {
+ trackSeen(node, tailOrder);
+ }
+ }, root);
+
+ assertEquals("div;", headOrder.toString());
+ assertEquals("", tailOrder.toString());
+ assertEquals("", TextUtil.stripNewlines(doc.body().html()));
+ }
+
+ @Test
+ void replacingTraversalRootInHeadStaysWithinReplacementSubtree() {
+ Document doc = Jsoup.parseBodyFragment("x
y
");
+ Element root = doc.expectFirst("div");
+ StringBuilder headOrder = new StringBuilder();
+ StringBuilder tailOrder = new StringBuilder();
+
+ NodeTraversor.traverse(new NodeVisitor() {
+ @Override public void head(Node node, int depth) {
+ trackSeen(node, headOrder);
+ if (node == root) {
+ Element replacement = new Element("section").insertChildren(0, node.childNodes());
+ node.replaceWith(replacement);
+ }
+ }
+
+ @Override public void tail(Node node, int depth) {
+ trackSeen(node, tailOrder);
+ }
+ }, root);
+
+ assertEquals("div;span;x;", headOrder.toString());
+ assertEquals("x;span;section;", tailOrder.toString());
+ assertEquals("y
", TextUtil.stripNewlines(doc.body().html()));
+ }
+
+ @Test
+ void unwrappingTraversalRootInHeadVisitsExposedTopLevelNodesUntilOriginalBoundary() {
+ Document doc = Jsoup.parseBodyFragment("xy
z
");
+ Element root = doc.expectFirst("div");
+ StringBuilder headOrder = new StringBuilder();
+ StringBuilder tailOrder = new StringBuilder();
+
+ NodeTraversor.traverse(new NodeVisitor() {
+ @Override public void head(Node node, int depth) {
+ trackSeen(node, headOrder);
+ if (node == root)
+ node.unwrap();
+ }
+
+ @Override public void tail(Node node, int depth) {
+ trackSeen(node, tailOrder);
+ }
+ }, root);
+
+ assertEquals("div;x;i;y;", headOrder.toString());
+ assertEquals("x;b;y;i;", tailOrder.toString());
+ assertEquals("xyz
", TextUtil.stripNewlines(doc.body().html()));
+ }
+
+ @Test
+ void unwrapInHeadContinuesFromExposedChildren() {
+ Document doc = Jsoup.parseBodyFragment("xyz
");
+ Element root = doc.expectFirst("div");
+ StringBuilder headOrder = new StringBuilder();
+ StringBuilder tailOrder = new StringBuilder();
+
+ NodeTraversor.traverse(new NodeVisitor() {
+ @Override public void head(Node node, int depth) {
+ trackSeen(node, headOrder);
+ if (node instanceof Element && node.nameIs("span"))
+ node.unwrap();
+ }
+
+ @Override public void tail(Node node, int depth) {
+ trackSeen(node, tailOrder);
+ }
+ }, root);
+
+ assertEquals("div;span;x;i;y;u;z;", headOrder.toString());
+ assertEquals("x;b;y;i;z;u;div;", tailOrder.toString());
+ assertEquals("xyz
", TextUtil.stripNewlines(doc.body().html()));
+ }
+
+ @Test
+ void removingCurrentAndOriginalNextInHeadTailsParent() {
+ Document doc = Jsoup.parseBodyFragment("aTwo
");
+ Element root = doc.expectFirst("div");
+ StringBuilder headOrder = new StringBuilder();
+ StringBuilder tailOrder = new StringBuilder();
+
+ NodeTraversor.traverse(new NodeVisitor() {
+ @Override public void head(Node node, int depth) {
+ trackSeen(node, headOrder);
+ if (node instanceof TextNode && ((TextNode) node).text().equals("a")) {
+ node.nextSibling().remove();
+ node.remove();
+ }
+ }
+
+ @Override public void tail(Node node, int depth) {
+ trackSeen(node, tailOrder);
+ }
+ }, root);
+
+ assertEquals("div;a;", headOrder.toString());
+ assertEquals("div;", tailOrder.toString());
+ assertEquals("", TextUtil.stripNewlines(doc.body().html()));
+ }
+
+ @Test
+ void beforeAfterRemoveInHeadTailsCurrentSlotAndHeadsFutureSiblings() {
+ Document doc = Jsoup.parse("");
+ StringBuilder headOrder = new StringBuilder();
+ StringBuilder tailOrder = new StringBuilder();
+
+ NodeTraversor.traverse(new NodeVisitor() {
+ @Override public void head(Node node, int depth) {
+ trackSeen(node, headOrder);
+ if (node instanceof TextNode && ((TextNode) node).text().equals("a")) {
+ node.before(new TextNode("b"));
+ node.after(new TextNode("c"));
+ node.remove();
+ }
+ }
+
+ @Override public void tail(Node node, int depth) {
+ trackSeen(node, tailOrder);
+ }
+ }, doc);
+
+ assertEquals("#root;html;head;body;div;p;One;a;c;em;Two;", headOrder.toString());
+ assertEquals("head;One;p;b;c;Two;em;div;body;html;#root;", tailOrder.toString());
+ assertEquals("", TextUtil.stripNewlines(doc.body().html()));
+ }
+
@Test void elementFunctionalTraverse() {
Document doc = Jsoup.parse("1
2
3");
Element body = doc.body();