From 03f9ed13391005583b5cb760ca55657a4a5ae51f Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sun, 8 Mar 2026 12:16:44 +1100 Subject: [PATCH 1/4] Make isSafeAttribute side effect free Move URL absolutizing from Safelist to Cleaner, so cleaning and validating a document no longer modify the input document. Fixes #2475 --- CHANGES.md | 1 + src/main/java/org/jsoup/safety/Cleaner.java | 14 ++++- src/main/java/org/jsoup/safety/Safelist.java | 51 ++++++++++++----- .../java/org/jsoup/safety/CleanerTest.java | 55 +++++++++++++++++++ .../java/org/jsoup/safety/SafelistTest.java | 21 +++++++ 5 files changed, 124 insertions(+), 18 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index c2520ece21..7a73a8c445 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,7 @@ * Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459) * In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Traversal now continues correctly from nodes that occupy the original position after mutation, and will not advance past the original root subtree. Also, clarified in the documentation which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472) * Parsing during charset sniffing no longer fails if an advisory `available()` call throws `IOException`, as seen on JDK 8 `HttpURLConnection`. [#2474](https://github.com/jhy/jsoup/issues/2474) +* Cleaner no longer makes relative URL attributes in the input document absolute when cleaning or validating a Document. URL normalization now applies only to the cleaned output, and `Safelist.isSafeAttribute()` is side effect free. [#2475](https://github.com/jhy/jsoup/issues/2475) ## 1.22.1 (2026-Jan-01) diff --git a/src/main/java/org/jsoup/safety/Cleaner.java b/src/main/java/org/jsoup/safety/Cleaner.java index f5fd8766c1..32ae23f9ee 100644 --- a/src/main/java/org/jsoup/safety/Cleaner.java +++ b/src/main/java/org/jsoup/safety/Cleaner.java @@ -190,9 +190,17 @@ private ElementMeta createSafeElement(Element sourceEl) { int numDiscarded = 0; Attributes sourceAttrs = sourceEl.attributes(); for (Attribute sourceAttr : sourceAttrs) { - if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) - destAttrs.put(sourceAttr); - else + if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) { // will keep this attr + String key = sourceAttr.getKey(); + String value = sourceAttr.getValue(); + + if (safelist.shouldAbsUrl(sourceTag, key)) { // configured to make absolute urls for this key (href) + value = sourceEl.absUrl(key); + if (value.isEmpty()) // could not be made abs; leave as-is to allow custom unknown protocols + value = sourceAttr.getValue(); + } + destAttrs.put(key, value); + } else numDiscarded++; } diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java index e24ab860ab..05e5f7001d 100644 --- a/src/main/java/org/jsoup/safety/Safelist.java +++ b/src/main/java/org/jsoup/safety/Safelist.java @@ -71,6 +71,7 @@ XSS attack examples (that jsoup will safeguard against with the default Cleaner */ public class Safelist { private static final String All = ":all"; + private static final TagName AllTag = TagName.valueOf(All); private final Set tagNames; // tags allowed, lower case. e.g. [p, br, span] private final Map> attributes; // tag -> attribute[]. allowed attributes [href] for a tag. private final Map> enforcedAttributes; // always set these attribute values @@ -519,6 +520,7 @@ public boolean isSafeTag(String tag) { /** * Test if the supplied attribute is allowed by this safelist for this tag. + *

This method does not modify the input element or attribute.

* @param tagName tag to consider allowing the attribute in * @param el element under test, to confirm protocol * @param attr attribute under test @@ -533,33 +535,29 @@ public boolean isSafeAttribute(String tagName, Element el, Attribute attr) { if (protocols.containsKey(tag)) { Map> attrProts = protocols.get(tag); // ok if not defined protocol; otherwise test - return !attrProts.containsKey(key) || testValidProtocol(el, attr, attrProts.get(key)); + return !attrProts.containsKey(key) || isSafeProtocol(getProtocolValue(el, attr), attrProts.get(key)); } else { // attribute found, no protocols defined, so OK return true; } } - // might be an enforced attribute? Map enforcedSet = enforcedAttributes.get(tag); - if (enforcedSet != null) { - Attributes expect = getEnforcedAttributes(tagName); - String attrKey = attr.getKey(); - if (expect.hasKeyIgnoreCase(attrKey)) { - return expect.getIgnoreCase(attrKey).equals(attr.getValue()); - } + if (enforcedSet != null && enforcedSet.containsKey(key)) { + // enforced attr key was LCed via AttributeKey.valueOf(attr.getKey()), + // if the input already has that exact value, treat it as safe + return enforcedSet.get(key).equals(AttributeValue.valueOf(attr.getValue())); } // no attributes defined for tag, try :all tag return !tagName.equals(All) && isSafeAttribute(All, el, attr); } - private boolean testValidProtocol(Element el, Attribute attr, Set protocols) { - // try to resolve relative urls to abs, and optionally update the attribute so output html has abs. - // rels without a baseuri get removed + private String getProtocolValue(Element el, Attribute attr) { String value = el.absUrl(attr.getKey()); - if (value.length() == 0) + if (value.isEmpty()) value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown protocols - if (!preserveRelativeLinks) - attr.setValue(value); - + return value; + } + + private boolean isSafeProtocol(String value, Set protocols) { for (Protocol protocol : protocols) { String prot = protocol.toString(); @@ -580,6 +578,29 @@ private boolean testValidProtocol(Element el, Attribute attr, Set prot return false; } + /** + Check if a URL attribute should be normalized to an absolute URL in the cleaned output. Uses the configured + protocols for that tag+attribute pair, falling back to {@code :all} only if the tag does not define the + attribute. + */ + boolean shouldAbsUrl(String tagName, String attrKey) { + if (preserveRelativeLinks) return false; + return shouldAbsUrl(TagName.valueOf(tagName), AttributeKey.valueOf(attrKey)); + } + + private boolean shouldAbsUrl(TagName tag, AttributeKey key) { + Set allowedAttrs = attributes.get(tag); + if (allowedAttrs != null && allowedAttrs.contains(key)) { + Map> protocolsByAttr = protocols.get(tag); + return protocolsByAttr != null && protocolsByAttr.containsKey(key); + } + + Map enforcedAttrs = enforcedAttributes.get(tag); + if (enforcedAttrs != null && enforcedAttrs.containsKey(key)) return false; + + return !tag.equals(AllTag) && shouldAbsUrl(AllTag, key); + } + private static boolean isValidAnchor(String value) { return value.startsWith("#") && !value.matches(".*\\s.*"); } diff --git a/src/test/java/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/safety/CleanerTest.java index 7d80d8bced..dc6d953424 100644 --- a/src/test/java/org/jsoup/safety/CleanerTest.java +++ b/src/test/java/org/jsoup/safety/CleanerTest.java @@ -7,6 +7,7 @@ import org.jsoup.nodes.Element; import org.jsoup.nodes.Entities; import org.jsoup.nodes.Range; +import org.jsoup.parser.ParseSettings; import org.jsoup.parser.Parser; import org.jsoup.parser.Tag; import org.jsoup.parser.TagSet; @@ -282,6 +283,48 @@ public void safeListedProtocolShouldBeRetained(Locale locale) { assertEquals("Link", clean); } + @Test void cleanDoesNotModifyInputDocumentWhenResolvingRelativeLinks() { + String html = "Link"; + Cleaner cleaner = new Cleaner(Safelist.basic()); + Document dirty = Jsoup.parseBodyFragment(html, "http://example.com/"); + Document clean = cleaner.clean(dirty); + + assertEquals("Link", clean.body().html()); + assertEquals("/foo", dirty.expectFirst("a").attr("href")); + } + + @Test void isValidDoesNotModifyInputDocumentWhenResolvingRelativeLinks() { + String html = "Link"; + Cleaner cleaner = new Cleaner(Safelist.basic()); + Document dirty = Jsoup.parseBodyFragment(html, "http://example.com/"); + + assertTrue(cleaner.isValid(dirty)); + assertEquals("Link", + cleaner.clean(Jsoup.parseBodyFragment(html, "http://example.com/")).body().html()); + assertEquals("/foo", dirty.expectFirst("a").attr("href")); + } + + @Test void allPseudoTagProtocolsNormalizeRelativeLinks() { + Safelist safelist = new Safelist() + .addTags("a") + .addAttributes(":all", "href") + .addProtocols(":all", "href", "http", "https"); + Document dirty = Jsoup.parseBodyFragment("Link", "http://example.com/"); + + assertEquals("Link", new Cleaner(safelist).clean(dirty).body().html()); + } + + @Test void tagSpecificAttributesDoNotInheritAllPseudoTagProtocols() { + Safelist safelist = new Safelist() + .addTags("a") + .addAttributes("a", "href") + .addAttributes(":all", "href") + .addProtocols(":all", "href", "http", "https"); + Document dirty = Jsoup.parseBodyFragment("Link", "http://example.com/"); + + assertEquals("Link", new Cleaner(safelist).clean(dirty).body().html()); + } + @Test public void preservesRelativeLinksIfConfigured() { String html = "Link "; String clean = Jsoup.clean(html, "http://example.com/", Safelist.basicWithImages().preserveRelativeLinks(true)); @@ -428,6 +471,18 @@ public void bailsIfRemovingProtocolThatsNotSet() { assertEquals("One Two", clean); } + @Test void cleanerPreservesCaseVariantOfMatchingEnforcedAttribute() { + Document dirty = Jsoup.parse("Link", "", + Parser.htmlParser().settings(ParseSettings.preserveCase)); + Cleaner cleaner = new Cleaner(Safelist.basic()); + + Document clean = cleaner.clean(dirty); + Element link = clean.expectFirst("a"); + assertTrue(link.hasAttr("REL")); + assertEquals("nofollow", link.attr("REL")); + assertTrue(cleaner.isValid(dirty)); + } + @Test public void handlesNestedQuotesInAttribute() { // https://github.com/jhy/jsoup/issues/1243 - no repro String orig = "
Will (not) fail
"; diff --git a/src/test/java/org/jsoup/safety/SafelistTest.java b/src/test/java/org/jsoup/safety/SafelistTest.java index 796ddc7225..90b4fd61d2 100644 --- a/src/test/java/org/jsoup/safety/SafelistTest.java +++ b/src/test/java/org/jsoup/safety/SafelistTest.java @@ -61,6 +61,27 @@ public void testCopyConstructor_noSideEffectOnProtocols() { assertFalse(safelist2.isSafeAttribute(TEST_TAG, invalidElement, invalidAttribute)); } + @Test + void isSafeAttributeDoesNotModifyLiveAttribute() { + Attributes attributes = new Attributes().put("href", "/foo"); + Element link = new Element(Tag.valueOf("a"), "https://example.com/", attributes); + Attribute href = link.attributes().attribute("href"); + assertNotNull(href); + + assertTrue(Safelist.basic().isSafeAttribute("a", link, href)); + assertEquals("/foo", href.getValue()); + assertEquals("/foo", link.attr("href")); + } + + @Test + void enforcedAttributeMatchesInputKeyCaseInsensitively() { + Attribute rel = new Attribute("REL", "nofollow"); + Attributes attributes = new Attributes().put(rel); + Element link = new Element(Tag.valueOf("a"), "", attributes); + + assertTrue(Safelist.basic().isSafeAttribute("a", link, rel)); + } + @Test void noscriptIsBlocked() { boolean threw = false; From e640ca8a72415e305227175c6289cca994502727 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sun, 8 Mar 2026 12:30:10 +1100 Subject: [PATCH 2/4] Changelog format tweak --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 7a73a8c445..4a6e978c25 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,7 +6,7 @@ * Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459) * In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Traversal now continues correctly from nodes that occupy the original position after mutation, and will not advance past the original root subtree. Also, clarified in the documentation which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472) * Parsing during charset sniffing no longer fails if an advisory `available()` call throws `IOException`, as seen on JDK 8 `HttpURLConnection`. [#2474](https://github.com/jhy/jsoup/issues/2474) -* Cleaner no longer makes relative URL attributes in the input document absolute when cleaning or validating a Document. URL normalization now applies only to the cleaned output, and `Safelist.isSafeAttribute()` is side effect free. [#2475](https://github.com/jhy/jsoup/issues/2475) +* `Cleaner` no longer makes relative URL attributes in the input document absolute when cleaning or validating a `Document`. URL normalization now applies only to the cleaned output, and `Safelist.isSafeAttribute()` is side effect free. [#2475](https://github.com/jhy/jsoup/issues/2475) ## 1.22.1 (2026-Jan-01) From 7c32b7e105184ee2e2027b2dd25bec8dac85f934 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sun, 8 Mar 2026 12:54:21 +1100 Subject: [PATCH 3/4] Canonicalize enforced attrs in Cleaner Prevent duplicate enforced attributes in cleaned preserve-case documents by replacing case-variant source attrs during the enforced-attr merge. Fixes #2476 --- CHANGES.md | 1 + src/main/java/org/jsoup/safety/Cleaner.java | 6 ++++- .../java/org/jsoup/safety/CleanerTest.java | 23 +++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 4a6e978c25..b6c22d24d2 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -7,6 +7,7 @@ * In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Traversal now continues correctly from nodes that occupy the original position after mutation, and will not advance past the original root subtree. Also, clarified in the documentation which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472) * Parsing during charset sniffing no longer fails if an advisory `available()` call throws `IOException`, as seen on JDK 8 `HttpURLConnection`. [#2474](https://github.com/jhy/jsoup/issues/2474) * `Cleaner` no longer makes relative URL attributes in the input document absolute when cleaning or validating a `Document`. URL normalization now applies only to the cleaned output, and `Safelist.isSafeAttribute()` is side effect free. [#2475](https://github.com/jhy/jsoup/issues/2475) +* `Cleaner` no longer duplicates enforced attributes when the input `Document` preserves attribute case. A case-variant source attribute is now replaced by the enforced attribute in the cleaned output. [#2476](https://github.com/jhy/jsoup/issues/2476) ## 1.22.1 (2026-Jan-01) diff --git a/src/main/java/org/jsoup/safety/Cleaner.java b/src/main/java/org/jsoup/safety/Cleaner.java index 32ae23f9ee..b80b6ddb79 100644 --- a/src/main/java/org/jsoup/safety/Cleaner.java +++ b/src/main/java/org/jsoup/safety/Cleaner.java @@ -215,7 +215,11 @@ private ElementMeta createSafeElement(Element sourceEl) { } } - destAttrs.addAll(enforcedAttrs); + // apply enforced attributes case-insensitively, so a preserved-case source attr is canonicalized to the enforced key + for (Attribute enforcedAttr : enforcedAttrs) { + destAttrs.removeIgnoreCase(enforcedAttr.getKey()); + destAttrs.put(enforcedAttr.getKey(), enforcedAttr.getValue()); + } dest.attributes().addAll(destAttrs); // re-attach, if removed in clear return new ElementMeta(dest, numDiscarded); } diff --git a/src/test/java/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/safety/CleanerTest.java index dc6d953424..cb3bbabfcb 100644 --- a/src/test/java/org/jsoup/safety/CleanerTest.java +++ b/src/test/java/org/jsoup/safety/CleanerTest.java @@ -579,6 +579,29 @@ void cleansCaseSensitiveElements(boolean preserveCase) { assertEquals("One Two Three Four", clean4); } + @Test void canonicalizesEnforcedAttributes() { + Document customDirty = Jsoup.parse("One", "", + Parser.htmlParser().settings(ParseSettings.preserveCase)); + Cleaner customCleaner = new Cleaner(Safelist.none() + .addTags("a") + .addEnforcedAttribute("a", "rel", "external")); + assertEquals("One", customCleaner.clean(customDirty).body().html()); + } + + @Test void canonicalizesNofollowEnforcedAttribute() { + Document dirty = Jsoup.parse("One", "", + Parser.htmlParser().settings(ParseSettings.preserveCase)); + Cleaner cleaner = new Cleaner(Safelist.basic()); + assertEquals("One", cleaner.clean(dirty).body().html()); + } + + @Test void preservesMatchingSourceNofollowWhenEnforcementSuppressed() { + Document dirty = Jsoup.parse("One", "http://example.com/", + Parser.htmlParser().settings(ParseSettings.preserveCase)); + Cleaner cleaner = new Cleaner(Safelist.basic()); + assertEquals("One", cleaner.clean(dirty).body().html()); + } + @Test void discardsSvgScriptData() { // https://github.com/jhy/jsoup/issues/2320 Safelist svgOk = Safelist.none().addTags("svg"); From c1d08123de2945d80fd479bbad42973ce5b10188 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sun, 8 Mar 2026 13:07:46 +1100 Subject: [PATCH 4/4] Update CHANGES.md for NodeTraversor fixes Improve notes for #2472 --- CHANGES.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index b6c22d24d2..35d810d020 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,9 +2,12 @@ ## 1.22.2 (PENDING) +### Improvements +* Expanded and clarified `NodeTraversor` support for in-place DOM rewrites during `NodeVisitor.head()`. Current-node edits such as `remove`, `replace`, and `unwrap` now recover more predictably, while traversal stays within the original root subtree. This makes single-pass tree cleanup and normalization visitors easier to write, for example when unwrapping presentational elements or replacing text nodes as you walk the DOM. [#2472](https://github.com/jhy/jsoup/issues/2472) + ### Bug Fixes * Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459) -* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Traversal now continues correctly from nodes that occupy the original position after mutation, and will not advance past the original root subtree. Also, clarified in the documentation which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472) +* Fixed a `NodeTraversor` regression in 1.21.2 where removing or replacing the current node during `head()` could revisit the replacement node and loop indefinitely. The traversal docs now also clarify which inserted nodes are visited in the current pass. [#2472](https://github.com/jhy/jsoup/issues/2472) * Parsing during charset sniffing no longer fails if an advisory `available()` call throws `IOException`, as seen on JDK 8 `HttpURLConnection`. [#2474](https://github.com/jhy/jsoup/issues/2474) * `Cleaner` no longer makes relative URL attributes in the input document absolute when cleaning or validating a `Document`. URL normalization now applies only to the cleaned output, and `Safelist.isSafeAttribute()` is side effect free. [#2475](https://github.com/jhy/jsoup/issues/2475) * `Cleaner` no longer duplicates enforced attributes when the input `Document` preserves attribute case. A case-variant source attribute is now replaced by the enforced attribute in the cleaned output. [#2476](https://github.com/jhy/jsoup/issues/2476)