diff --git a/CHANGES.md b/CHANGES.md index c2520ece21..35d810d020 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,10 +2,15 @@ ## 1.22.2 (PENDING) +### Improvements +* Expanded and clarified `NodeTraversor` support for in-place DOM rewrites during `NodeVisitor.head()`. Current-node edits such as `remove`, `replace`, and `unwrap` now recover more predictably, while traversal stays within the original root subtree. This makes single-pass tree cleanup and normalization visitors easier to write, for example when unwrapping presentational elements or replacing text nodes as you walk the DOM. [#2472](https://github.com/jhy/jsoup/issues/2472) + ### Bug Fixes * Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459) -* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Traversal now continues correctly from nodes that occupy the original position after mutation, and will not advance past the original root subtree. Also, clarified in the documentation which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472) +* Fixed a `NodeTraversor` regression in 1.21.2 where removing or replacing the current node during `head()` could revisit the replacement node and loop indefinitely. The traversal docs now also clarify which inserted nodes are visited in the current pass. [#2472](https://github.com/jhy/jsoup/issues/2472) * Parsing during charset sniffing no longer fails if an advisory `available()` call throws `IOException`, as seen on JDK 8 `HttpURLConnection`. [#2474](https://github.com/jhy/jsoup/issues/2474) +* `Cleaner` no longer makes relative URL attributes in the input document absolute when cleaning or validating a `Document`. URL normalization now applies only to the cleaned output, and `Safelist.isSafeAttribute()` is side effect free. [#2475](https://github.com/jhy/jsoup/issues/2475) +* `Cleaner` no longer duplicates enforced attributes when the input `Document` preserves attribute case. A case-variant source attribute is now replaced by the enforced attribute in the cleaned output. [#2476](https://github.com/jhy/jsoup/issues/2476) ## 1.22.1 (2026-Jan-01) diff --git a/src/main/java/org/jsoup/safety/Cleaner.java b/src/main/java/org/jsoup/safety/Cleaner.java index f5fd8766c1..b80b6ddb79 100644 --- a/src/main/java/org/jsoup/safety/Cleaner.java +++ b/src/main/java/org/jsoup/safety/Cleaner.java @@ -190,9 +190,17 @@ private ElementMeta createSafeElement(Element sourceEl) { int numDiscarded = 0; Attributes sourceAttrs = sourceEl.attributes(); for (Attribute sourceAttr : sourceAttrs) { - if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) - destAttrs.put(sourceAttr); - else + if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) { // will keep this attr + String key = sourceAttr.getKey(); + String value = sourceAttr.getValue(); + + if (safelist.shouldAbsUrl(sourceTag, key)) { // configured to make absolute urls for this key (href) + value = sourceEl.absUrl(key); + if (value.isEmpty()) // could not be made abs; leave as-is to allow custom unknown protocols + value = sourceAttr.getValue(); + } + destAttrs.put(key, value); + } else numDiscarded++; } @@ -207,7 +215,11 @@ private ElementMeta createSafeElement(Element sourceEl) { } } - destAttrs.addAll(enforcedAttrs); + // apply enforced attributes case-insensitively, so a preserved-case source attr is canonicalized to the enforced key + for (Attribute enforcedAttr : enforcedAttrs) { + destAttrs.removeIgnoreCase(enforcedAttr.getKey()); + destAttrs.put(enforcedAttr.getKey(), enforcedAttr.getValue()); + } dest.attributes().addAll(destAttrs); // re-attach, if removed in clear return new ElementMeta(dest, numDiscarded); } diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java index e24ab860ab..05e5f7001d 100644 --- a/src/main/java/org/jsoup/safety/Safelist.java +++ b/src/main/java/org/jsoup/safety/Safelist.java @@ -71,6 +71,7 @@ XSS attack examples (that jsoup will safeguard against with the default Cleaner */ public class Safelist { private static final String All = ":all"; + private static final TagName AllTag = TagName.valueOf(All); private final Set tagNames; // tags allowed, lower case. e.g. [p, br, span] private final Map> attributes; // tag -> attribute[]. allowed attributes [href] for a tag. private final Map> enforcedAttributes; // always set these attribute values @@ -519,6 +520,7 @@ public boolean isSafeTag(String tag) { /** * Test if the supplied attribute is allowed by this safelist for this tag. + *

This method does not modify the input element or attribute.

* @param tagName tag to consider allowing the attribute in * @param el element under test, to confirm protocol * @param attr attribute under test @@ -533,33 +535,29 @@ public boolean isSafeAttribute(String tagName, Element el, Attribute attr) { if (protocols.containsKey(tag)) { Map> attrProts = protocols.get(tag); // ok if not defined protocol; otherwise test - return !attrProts.containsKey(key) || testValidProtocol(el, attr, attrProts.get(key)); + return !attrProts.containsKey(key) || isSafeProtocol(getProtocolValue(el, attr), attrProts.get(key)); } else { // attribute found, no protocols defined, so OK return true; } } - // might be an enforced attribute? Map enforcedSet = enforcedAttributes.get(tag); - if (enforcedSet != null) { - Attributes expect = getEnforcedAttributes(tagName); - String attrKey = attr.getKey(); - if (expect.hasKeyIgnoreCase(attrKey)) { - return expect.getIgnoreCase(attrKey).equals(attr.getValue()); - } + if (enforcedSet != null && enforcedSet.containsKey(key)) { + // enforced attr key was LCed via AttributeKey.valueOf(attr.getKey()), + // if the input already has that exact value, treat it as safe + return enforcedSet.get(key).equals(AttributeValue.valueOf(attr.getValue())); } // no attributes defined for tag, try :all tag return !tagName.equals(All) && isSafeAttribute(All, el, attr); } - private boolean testValidProtocol(Element el, Attribute attr, Set protocols) { - // try to resolve relative urls to abs, and optionally update the attribute so output html has abs. - // rels without a baseuri get removed + private String getProtocolValue(Element el, Attribute attr) { String value = el.absUrl(attr.getKey()); - if (value.length() == 0) + if (value.isEmpty()) value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown protocols - if (!preserveRelativeLinks) - attr.setValue(value); - + return value; + } + + private boolean isSafeProtocol(String value, Set protocols) { for (Protocol protocol : protocols) { String prot = protocol.toString(); @@ -580,6 +578,29 @@ private boolean testValidProtocol(Element el, Attribute attr, Set prot return false; } + /** + Check if a URL attribute should be normalized to an absolute URL in the cleaned output. Uses the configured + protocols for that tag+attribute pair, falling back to {@code :all} only if the tag does not define the + attribute. + */ + boolean shouldAbsUrl(String tagName, String attrKey) { + if (preserveRelativeLinks) return false; + return shouldAbsUrl(TagName.valueOf(tagName), AttributeKey.valueOf(attrKey)); + } + + private boolean shouldAbsUrl(TagName tag, AttributeKey key) { + Set allowedAttrs = attributes.get(tag); + if (allowedAttrs != null && allowedAttrs.contains(key)) { + Map> protocolsByAttr = protocols.get(tag); + return protocolsByAttr != null && protocolsByAttr.containsKey(key); + } + + Map enforcedAttrs = enforcedAttributes.get(tag); + if (enforcedAttrs != null && enforcedAttrs.containsKey(key)) return false; + + return !tag.equals(AllTag) && shouldAbsUrl(AllTag, key); + } + private static boolean isValidAnchor(String value) { return value.startsWith("#") && !value.matches(".*\\s.*"); } diff --git a/src/test/java/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/safety/CleanerTest.java index 7d80d8bced..cb3bbabfcb 100644 --- a/src/test/java/org/jsoup/safety/CleanerTest.java +++ b/src/test/java/org/jsoup/safety/CleanerTest.java @@ -7,6 +7,7 @@ import org.jsoup.nodes.Element; import org.jsoup.nodes.Entities; import org.jsoup.nodes.Range; +import org.jsoup.parser.ParseSettings; import org.jsoup.parser.Parser; import org.jsoup.parser.Tag; import org.jsoup.parser.TagSet; @@ -282,6 +283,48 @@ public void safeListedProtocolShouldBeRetained(Locale locale) { assertEquals("Link", clean); } + @Test void cleanDoesNotModifyInputDocumentWhenResolvingRelativeLinks() { + String html = "Link"; + Cleaner cleaner = new Cleaner(Safelist.basic()); + Document dirty = Jsoup.parseBodyFragment(html, "http://example.com/"); + Document clean = cleaner.clean(dirty); + + assertEquals("Link", clean.body().html()); + assertEquals("/foo", dirty.expectFirst("a").attr("href")); + } + + @Test void isValidDoesNotModifyInputDocumentWhenResolvingRelativeLinks() { + String html = "Link"; + Cleaner cleaner = new Cleaner(Safelist.basic()); + Document dirty = Jsoup.parseBodyFragment(html, "http://example.com/"); + + assertTrue(cleaner.isValid(dirty)); + assertEquals("Link", + cleaner.clean(Jsoup.parseBodyFragment(html, "http://example.com/")).body().html()); + assertEquals("/foo", dirty.expectFirst("a").attr("href")); + } + + @Test void allPseudoTagProtocolsNormalizeRelativeLinks() { + Safelist safelist = new Safelist() + .addTags("a") + .addAttributes(":all", "href") + .addProtocols(":all", "href", "http", "https"); + Document dirty = Jsoup.parseBodyFragment("Link", "http://example.com/"); + + assertEquals("Link", new Cleaner(safelist).clean(dirty).body().html()); + } + + @Test void tagSpecificAttributesDoNotInheritAllPseudoTagProtocols() { + Safelist safelist = new Safelist() + .addTags("a") + .addAttributes("a", "href") + .addAttributes(":all", "href") + .addProtocols(":all", "href", "http", "https"); + Document dirty = Jsoup.parseBodyFragment("Link", "http://example.com/"); + + assertEquals("Link", new Cleaner(safelist).clean(dirty).body().html()); + } + @Test public void preservesRelativeLinksIfConfigured() { String html = "Link "; String clean = Jsoup.clean(html, "http://example.com/", Safelist.basicWithImages().preserveRelativeLinks(true)); @@ -428,6 +471,18 @@ public void bailsIfRemovingProtocolThatsNotSet() { assertEquals("One Two", clean); } + @Test void cleanerPreservesCaseVariantOfMatchingEnforcedAttribute() { + Document dirty = Jsoup.parse("Link", "", + Parser.htmlParser().settings(ParseSettings.preserveCase)); + Cleaner cleaner = new Cleaner(Safelist.basic()); + + Document clean = cleaner.clean(dirty); + Element link = clean.expectFirst("a"); + assertTrue(link.hasAttr("REL")); + assertEquals("nofollow", link.attr("REL")); + assertTrue(cleaner.isValid(dirty)); + } + @Test public void handlesNestedQuotesInAttribute() { // https://github.com/jhy/jsoup/issues/1243 - no repro String orig = "
Will (not) fail
"; @@ -524,6 +579,29 @@ void cleansCaseSensitiveElements(boolean preserveCase) { assertEquals("One Two Three Four", clean4); } + @Test void canonicalizesEnforcedAttributes() { + Document customDirty = Jsoup.parse("One", "", + Parser.htmlParser().settings(ParseSettings.preserveCase)); + Cleaner customCleaner = new Cleaner(Safelist.none() + .addTags("a") + .addEnforcedAttribute("a", "rel", "external")); + assertEquals("One", customCleaner.clean(customDirty).body().html()); + } + + @Test void canonicalizesNofollowEnforcedAttribute() { + Document dirty = Jsoup.parse("One", "", + Parser.htmlParser().settings(ParseSettings.preserveCase)); + Cleaner cleaner = new Cleaner(Safelist.basic()); + assertEquals("One", cleaner.clean(dirty).body().html()); + } + + @Test void preservesMatchingSourceNofollowWhenEnforcementSuppressed() { + Document dirty = Jsoup.parse("One", "http://example.com/", + Parser.htmlParser().settings(ParseSettings.preserveCase)); + Cleaner cleaner = new Cleaner(Safelist.basic()); + assertEquals("One", cleaner.clean(dirty).body().html()); + } + @Test void discardsSvgScriptData() { // https://github.com/jhy/jsoup/issues/2320 Safelist svgOk = Safelist.none().addTags("svg"); diff --git a/src/test/java/org/jsoup/safety/SafelistTest.java b/src/test/java/org/jsoup/safety/SafelistTest.java index 796ddc7225..90b4fd61d2 100644 --- a/src/test/java/org/jsoup/safety/SafelistTest.java +++ b/src/test/java/org/jsoup/safety/SafelistTest.java @@ -61,6 +61,27 @@ public void testCopyConstructor_noSideEffectOnProtocols() { assertFalse(safelist2.isSafeAttribute(TEST_TAG, invalidElement, invalidAttribute)); } + @Test + void isSafeAttributeDoesNotModifyLiveAttribute() { + Attributes attributes = new Attributes().put("href", "/foo"); + Element link = new Element(Tag.valueOf("a"), "https://example.com/", attributes); + Attribute href = link.attributes().attribute("href"); + assertNotNull(href); + + assertTrue(Safelist.basic().isSafeAttribute("a", link, href)); + assertEquals("/foo", href.getValue()); + assertEquals("/foo", link.attr("href")); + } + + @Test + void enforcedAttributeMatchesInputKeyCaseInsensitively() { + Attribute rel = new Attribute("REL", "nofollow"); + Attributes attributes = new Attributes().put(rel); + Element link = new Element(Tag.valueOf("a"), "", attributes); + + assertTrue(Safelist.basic().isSafeAttribute("a", link, rel)); + } + @Test void noscriptIsBlocked() { boolean threw = false;