Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@

## 1.22.2 (PENDING)

### Improvements
* Expanded and clarified `NodeTraversor` support for in-place DOM rewrites during `NodeVisitor.head()`. Current-node edits such as `remove`, `replace`, and `unwrap` now recover more predictably, while traversal stays within the original root subtree. This makes single-pass tree cleanup and normalization visitors easier to write, for example when unwrapping presentational elements or replacing text nodes as you walk the DOM. [#2472](https://github.com/jhy/jsoup/issues/2472)

### Bug Fixes
* Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459)
* In `NodeTraversor`, removing or replacing the current node during `head()` no longer re-visits the replacement node, preventing loops. Traversal now continues correctly from nodes that occupy the original position after mutation, and will not advance past the original root subtree. Also, clarified in the documentation which inserted nodes are visited during the current traversal. [#2472](https://github.com/jhy/jsoup/issues/2472)
* Fixed a `NodeTraversor` regression in 1.21.2 where removing or replacing the current node during `head()` could revisit the replacement node and loop indefinitely. The traversal docs now also clarify which inserted nodes are visited in the current pass. [#2472](https://github.com/jhy/jsoup/issues/2472)
* Parsing during charset sniffing no longer fails if an advisory `available()` call throws `IOException`, as seen on JDK 8 `HttpURLConnection`. [#2474](https://github.com/jhy/jsoup/issues/2474)
* `Cleaner` no longer makes relative URL attributes in the input document absolute when cleaning or validating a `Document`. URL normalization now applies only to the cleaned output, and `Safelist.isSafeAttribute()` is side effect free. [#2475](https://github.com/jhy/jsoup/issues/2475)
* `Cleaner` no longer duplicates enforced attributes when the input `Document` preserves attribute case. A case-variant source attribute is now replaced by the enforced attribute in the cleaned output. [#2476](https://github.com/jhy/jsoup/issues/2476)

## 1.22.1 (2026-Jan-01)

Expand Down
20 changes: 16 additions & 4 deletions src/main/java/org/jsoup/safety/Cleaner.java
Original file line number Diff line number Diff line change
Expand Up @@ -190,9 +190,17 @@ private ElementMeta createSafeElement(Element sourceEl) {
int numDiscarded = 0;
Attributes sourceAttrs = sourceEl.attributes();
for (Attribute sourceAttr : sourceAttrs) {
if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
destAttrs.put(sourceAttr);
else
if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) { // will keep this attr
String key = sourceAttr.getKey();
String value = sourceAttr.getValue();

if (safelist.shouldAbsUrl(sourceTag, key)) { // configured to make absolute urls for this key (href)
value = sourceEl.absUrl(key);
if (value.isEmpty()) // could not be made abs; leave as-is to allow custom unknown protocols
value = sourceAttr.getValue();
}
destAttrs.put(key, value);
} else
numDiscarded++;
}

Expand All @@ -207,7 +215,11 @@ private ElementMeta createSafeElement(Element sourceEl) {
}
}

destAttrs.addAll(enforcedAttrs);
// apply enforced attributes case-insensitively, so a preserved-case source attr is canonicalized to the enforced key
for (Attribute enforcedAttr : enforcedAttrs) {
destAttrs.removeIgnoreCase(enforcedAttr.getKey());
destAttrs.put(enforcedAttr.getKey(), enforcedAttr.getValue());
}
dest.attributes().addAll(destAttrs); // re-attach, if removed in clear
return new ElementMeta(dest, numDiscarded);
}
Expand Down
51 changes: 36 additions & 15 deletions src/main/java/org/jsoup/safety/Safelist.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ XSS attack examples (that jsoup will safeguard against with the default Cleaner
*/
public class Safelist {
private static final String All = ":all";
private static final TagName AllTag = TagName.valueOf(All);
private final Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]
private final Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.
private final Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values
Expand Down Expand Up @@ -519,6 +520,7 @@ public boolean isSafeTag(String tag) {

/**
* Test if the supplied attribute is allowed by this safelist for this tag.
* <p>This method does not modify the input element or attribute.</p>
* @param tagName tag to consider allowing the attribute in
* @param el element under test, to confirm protocol
* @param attr attribute under test
Expand All @@ -533,33 +535,29 @@ public boolean isSafeAttribute(String tagName, Element el, Attribute attr) {
if (protocols.containsKey(tag)) {
Map<AttributeKey, Set<Protocol>> attrProts = protocols.get(tag);
// ok if not defined protocol; otherwise test
return !attrProts.containsKey(key) || testValidProtocol(el, attr, attrProts.get(key));
return !attrProts.containsKey(key) || isSafeProtocol(getProtocolValue(el, attr), attrProts.get(key));
} else { // attribute found, no protocols defined, so OK
return true;
}
}
// might be an enforced attribute?
Map<AttributeKey, AttributeValue> enforcedSet = enforcedAttributes.get(tag);
if (enforcedSet != null) {
Attributes expect = getEnforcedAttributes(tagName);
String attrKey = attr.getKey();
if (expect.hasKeyIgnoreCase(attrKey)) {
return expect.getIgnoreCase(attrKey).equals(attr.getValue());
}
if (enforcedSet != null && enforcedSet.containsKey(key)) {
// enforced attr key was LCed via AttributeKey.valueOf(attr.getKey()),
// if the input already has that exact value, treat it as safe
return enforcedSet.get(key).equals(AttributeValue.valueOf(attr.getValue()));
}
// no attributes defined for tag, try :all tag
return !tagName.equals(All) && isSafeAttribute(All, el, attr);
}

private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) {
// try to resolve relative urls to abs, and optionally update the attribute so output html has abs.
// rels without a baseuri get removed
private String getProtocolValue(Element el, Attribute attr) {
String value = el.absUrl(attr.getKey());
if (value.length() == 0)
if (value.isEmpty())
value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown protocols
if (!preserveRelativeLinks)
attr.setValue(value);

return value;
}

private boolean isSafeProtocol(String value, Set<Protocol> protocols) {
for (Protocol protocol : protocols) {
String prot = protocol.toString();

Expand All @@ -580,6 +578,29 @@ private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> prot
return false;
}

/**
Check if a URL attribute should be normalized to an absolute URL in the cleaned output. Uses the configured
protocols for that tag+attribute pair, falling back to {@code :all} only if the tag does not define the
attribute.
*/
boolean shouldAbsUrl(String tagName, String attrKey) {
if (preserveRelativeLinks) return false;
return shouldAbsUrl(TagName.valueOf(tagName), AttributeKey.valueOf(attrKey));
}

private boolean shouldAbsUrl(TagName tag, AttributeKey key) {
Set<AttributeKey> allowedAttrs = attributes.get(tag);
if (allowedAttrs != null && allowedAttrs.contains(key)) {
Map<AttributeKey, Set<Protocol>> protocolsByAttr = protocols.get(tag);
return protocolsByAttr != null && protocolsByAttr.containsKey(key);
}

Map<AttributeKey, AttributeValue> enforcedAttrs = enforcedAttributes.get(tag);
if (enforcedAttrs != null && enforcedAttrs.containsKey(key)) return false;

return !tag.equals(AllTag) && shouldAbsUrl(AllTag, key);
}

private static boolean isValidAnchor(String value) {
return value.startsWith("#") && !value.matches(".*\\s.*");
}
Expand Down
78 changes: 78 additions & 0 deletions src/test/java/org/jsoup/safety/CleanerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.nodes.Range;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.parser.TagSet;
Expand Down Expand Up @@ -282,6 +283,48 @@ public void safeListedProtocolShouldBeRetained(Locale locale) {
assertEquals("<a href=\"http://example.com/foo\">Link</a><img src=\"http://example.com/bar\">", clean);
}

@Test void cleanDoesNotModifyInputDocumentWhenResolvingRelativeLinks() {
String html = "<a href='/foo'>Link</a>";
Cleaner cleaner = new Cleaner(Safelist.basic());
Document dirty = Jsoup.parseBodyFragment(html, "http://example.com/");
Document clean = cleaner.clean(dirty);

assertEquals("<a href=\"http://example.com/foo\">Link</a>", clean.body().html());
assertEquals("/foo", dirty.expectFirst("a").attr("href"));
}

@Test void isValidDoesNotModifyInputDocumentWhenResolvingRelativeLinks() {
String html = "<a href='/foo'>Link</a>";
Cleaner cleaner = new Cleaner(Safelist.basic());
Document dirty = Jsoup.parseBodyFragment(html, "http://example.com/");

assertTrue(cleaner.isValid(dirty));
assertEquals("<a href=\"http://example.com/foo\">Link</a>",
cleaner.clean(Jsoup.parseBodyFragment(html, "http://example.com/")).body().html());
assertEquals("/foo", dirty.expectFirst("a").attr("href"));
}

@Test void allPseudoTagProtocolsNormalizeRelativeLinks() {
Safelist safelist = new Safelist()
.addTags("a")
.addAttributes(":all", "href")
.addProtocols(":all", "href", "http", "https");
Document dirty = Jsoup.parseBodyFragment("<a href='/foo'>Link</a>", "http://example.com/");

assertEquals("<a href=\"http://example.com/foo\">Link</a>", new Cleaner(safelist).clean(dirty).body().html());
}

@Test void tagSpecificAttributesDoNotInheritAllPseudoTagProtocols() {
Safelist safelist = new Safelist()
.addTags("a")
.addAttributes("a", "href")
.addAttributes(":all", "href")
.addProtocols(":all", "href", "http", "https");
Document dirty = Jsoup.parseBodyFragment("<a href='/foo'>Link</a>", "http://example.com/");

assertEquals("<a href=\"/foo\">Link</a>", new Cleaner(safelist).clean(dirty).body().html());
}

@Test public void preservesRelativeLinksIfConfigured() {
String html = "<a href='/foo'>Link</a><img src='/bar'> <img src='javascript:alert()'>";
String clean = Jsoup.clean(html, "http://example.com/", Safelist.basicWithImages().preserveRelativeLinks(true));
Expand Down Expand Up @@ -428,6 +471,18 @@ public void bailsIfRemovingProtocolThatsNotSet() {
assertEquals("<a>One</a> <a>Two</a>", clean);
}

@Test void cleanerPreservesCaseVariantOfMatchingEnforcedAttribute() {
Document dirty = Jsoup.parse("<a href='http://example.com/' REL='nofollow'>Link</a>", "",
Parser.htmlParser().settings(ParseSettings.preserveCase));
Cleaner cleaner = new Cleaner(Safelist.basic());

Document clean = cleaner.clean(dirty);
Element link = clean.expectFirst("a");
assertTrue(link.hasAttr("REL"));
assertEquals("nofollow", link.attr("REL"));
assertTrue(cleaner.isValid(dirty));
}

@Test public void handlesNestedQuotesInAttribute() {
// https://github.com/jhy/jsoup/issues/1243 - no repro
String orig = "<div style=\"font-family: 'Calibri'\">Will (not) fail</div>";
Expand Down Expand Up @@ -524,6 +579,29 @@ void cleansCaseSensitiveElements(boolean preserveCase) {
assertEquals("<a href=\"http://external.com/\" rel=\"nofollow\">One</a> <a href=\"/relative/\">Two</a> <a href=\"../other/\">Three</a> <a href=\"http://example.com/bar\" rel=\"nofollow\">Four</a>", clean4);
}

@Test void canonicalizesEnforcedAttributes() {
Document customDirty = Jsoup.parse("<a REL='external'>One</a>", "",
Parser.htmlParser().settings(ParseSettings.preserveCase));
Cleaner customCleaner = new Cleaner(Safelist.none()
.addTags("a")
.addEnforcedAttribute("a", "rel", "external"));
assertEquals("<a rel=\"external\">One</a>", customCleaner.clean(customDirty).body().html());
}

@Test void canonicalizesNofollowEnforcedAttribute() {
Document dirty = Jsoup.parse("<a href='http://external.com/' REL='nofollow'>One</a>", "",
Parser.htmlParser().settings(ParseSettings.preserveCase));
Cleaner cleaner = new Cleaner(Safelist.basic());
assertEquals("<a href=\"http://external.com/\" rel=\"nofollow\">One</a>", cleaner.clean(dirty).body().html());
}

@Test void preservesMatchingSourceNofollowWhenEnforcementSuppressed() {
Document dirty = Jsoup.parse("<a href='http://example.com/foo' REL='nofollow'>One</a>", "http://example.com/",
Parser.htmlParser().settings(ParseSettings.preserveCase));
Cleaner cleaner = new Cleaner(Safelist.basic());
assertEquals("<a href=\"http://example.com/foo\" REL=\"nofollow\">One</a>", cleaner.clean(dirty).body().html());
}

@Test void discardsSvgScriptData() {
// https://github.com/jhy/jsoup/issues/2320
Safelist svgOk = Safelist.none().addTags("svg");
Expand Down
21 changes: 21 additions & 0 deletions src/test/java/org/jsoup/safety/SafelistTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,27 @@ public void testCopyConstructor_noSideEffectOnProtocols() {
assertFalse(safelist2.isSafeAttribute(TEST_TAG, invalidElement, invalidAttribute));
}

@Test
void isSafeAttributeDoesNotModifyLiveAttribute() {
Attributes attributes = new Attributes().put("href", "/foo");
Element link = new Element(Tag.valueOf("a"), "https://example.com/", attributes);
Attribute href = link.attributes().attribute("href");
assertNotNull(href);

assertTrue(Safelist.basic().isSafeAttribute("a", link, href));
assertEquals("/foo", href.getValue());
assertEquals("/foo", link.attr("href"));
}

@Test
void enforcedAttributeMatchesInputKeyCaseInsensitively() {
Attribute rel = new Attribute("REL", "nofollow");
Attributes attributes = new Attributes().put(rel);
Element link = new Element(Tag.valueOf("a"), "", attributes);

assertTrue(Safelist.basic().isSafeAttribute("a", link, rel));
}

@Test
void noscriptIsBlocked() {
boolean threw = false;
Expand Down