diff --git a/changelog.d/20250527_twm_srcset.rst b/changelog.d/20250527_twm_srcset.rst
new file mode 100644
index 000000000..725afb24b
--- /dev/null
+++ b/changelog.d/20250527_twm_srcset.rst
@@ -0,0 +1,4 @@
+Added
+-----
+
+* Resolve relative URLs in ``srcset`` attributes and pass through ``srcset`` when sanitizing.
diff --git a/feedparser/sanitizer.py b/feedparser/sanitizer.py
index 0008e6d59..5cb09ebb9 100644
--- a/feedparser/sanitizer.py
+++ b/feedparser/sanitizer.py
@@ -259,6 +259,7 @@ class HTMLSanitizer(BaseHTMLProcessor):
         "size",
         "span",
         "src",
+        "srcset",
         "start",
         "step",
         "style",
diff --git a/feedparser/urls.py b/feedparser/urls.py
index 26ca1255b..cdf059133 100644
--- a/feedparser/urls.py
+++ b/feedparser/urls.py
@@ -1,4 +1,5 @@
 # Copyright 2010-2025 Kurt McKee
+# Copyright 2025 Tom Most
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -116,6 +117,56 @@ def make_safe_absolute_uri(base, rel=None):
     return uri
 
 
+# Matches image candidate strings within a srcset attribute value as
+# described in https://html.spec.whatwg.org/multipage/images.html#srcset-attributes
+_srcset_candidate = re.compile(
+    r"""
+    # ASCII whitespace: https://infra.spec.whatwg.org/#ascii-whitespace
+    [\t\n\f\r ]*
+    (
+        # URL that doesn't start or end with a comma
+        (?!,)
+        [^\t\n\f\r ]+
+        (?<!,)
+    )
+    (
+        # Width descriptor
+        [\t\n\f\r ]+
+        [0-9]+w
+    |
+        # Pixel density descriptor
+        [\t\n\f\r ]+
+        (?:[0-9]+(?:\.[0-9]+)?|\.[0-9]+)(?:[eE][+-]?[0-9]+)?x
+    |
+        # No descriptor
+    )
+    [\t\n\f\r ]*
+    # Candidates end at a comma or at the end of the value
+    (?:,|\Z)
+    """,
+    re.VERBOSE,
+)
+
+
+def srcset_candidates(value: str) -> list[tuple[str, str]]:
+    """
+    Split a ``srcset`` attribute value into candidates:
+
+    >>> srcset_candidates("/foo.jpg, /foo.2x.jpg 2x")
+    [("/foo.jpg", ""), ("/foo.2x.jpg", "2x")]
+
+    This doesn't validate the URLs, nor check for duplicate or conflicting
+    descriptors. It returns an empty list when parsing fails.
+    """
+    pos = 0
+    candidates = []
+    while m := _srcset_candidate.match(value, pos):
+        desc = m[2].strip("\t\n\f\r ")
+        candidates.append((m[1], desc))
+        pos = m.end(0)
+    return candidates
+
+
 class RelativeURIResolver(BaseHTMLProcessor):
     relative_uris = {
         ("a", "href"),
@@ -156,15 +207,23 @@ def __init__(self, baseuri, encoding, _type):
     def resolve_uri(self, uri):
         return make_safe_absolute_uri(self.baseuri, uri.strip())
 
+    def resolve_srcset(self, srcset):
+        candidates = []
+        for uri, desc in srcset_candidates(srcset):
+            uri = self.resolve_uri(uri)
+            if desc:
+                candidates.append(f"{uri} {desc}")
+            else:
+                candidates.append(uri)
+        return ", ".join(candidates)
+
     def unknown_starttag(self, tag, attrs):
         attrs = self.normalize_attrs(attrs)
-        attrs = [
-            (
-                key,
-                ((tag, key) in self.relative_uris) and self.resolve_uri(value) or value,
-            )
-            for key, value in attrs
-        ]
+        for i, (key, value) in enumerate(attrs):
+            if (tag, key) in self.relative_uris:
+                attrs[i] = (key, self.resolve_uri(value))
+            elif tag in {"img", "source"} and key == "srcset":
+                attrs[i] = (key, self.resolve_srcset(value))
         super().unknown_starttag(tag, attrs)
 
 
diff --git a/tests/test_srcset_candidates.py b/tests/test_srcset_candidates.py
new file mode 100644
index 000000000..485a0530d
--- /dev/null
+++ b/tests/test_srcset_candidates.py
@@ -0,0 +1,63 @@
+import pytest
+
+from feedparser.urls import srcset_candidates
+
+
+def test_empty():
+    assert srcset_candidates("") == []
+    assert srcset_candidates(" \n") == []
+
+
+def test_default():
+    assert srcset_candidates("/1x.jpg") == [("/1x.jpg", "")]
+
+
+def test_pixel_density_descriptor_one():
+    assert srcset_candidates("/1x.jpg 1x") == [("/1x.jpg", "1x")]
+
+
+def test_pixel_density_descriptor_two():
+    assert srcset_candidates("/1x.jpg 1x,/2x.jpg\t2.0x") == [
+        ("/1x.jpg", "1x"),
+        ("/2x.jpg", "2.0x"),
+    ]
+
+
+def test_pixel_density_descriptor_three():
+    assert srcset_candidates("/1x.jpg, /2x.jpg 2x , /3x.jpg 3x ") == [
+        ("/1x.jpg", ""),
+        ("/2x.jpg", "2x"),
+        ("/3x.jpg", "3x"),
+    ]
+
+
+@pytest.mark.parametrize(
+    "pd", ["1x", "1.0x", "9.5x", "36x", "39.95x", "100x", "1e1x", "2E2x"]
+)
+def test_pixel_density_descriptor_floats(pd):
+    """A pixel density descriptor allows all the valid float formats."""
+    assert [("/foo.jpg", pd)] == srcset_candidates("/foo.jpg " + pd)
+
+
+def test_url_comma():
+    """A URL containing a comma is not broken."""
+    assert srcset_candidates(" /,.jpg 6x,\n /,,,,.webp \t1e100x") == [
+        ("/,.jpg", "6x"),
+        ("/,,,,.webp", "1e100x"),
+    ]
+
+
+def test_width_one():
+    assert srcset_candidates("/a.png 600w") == [("/a.png", "600w")]
+
+
+def test_width_two():
+    assert srcset_candidates("a.jpg 123w, b.jpg 1234w") == [
+        ("a.jpg", "123w"),
+        ("b.jpg", "1234w"),
+    ]
+
+
+@pytest.mark.parametrize("pd", ["1.5w", "9000X", "-23w", "-60x"])
+def test_invalid(pd):
+    assert srcset_candidates("/x.gif " + pd) == []
diff --git a/tests/wellformed/base/http_entry_content_base_srcset.xml b/tests/wellformed/base/http_entry_content_base_srcset.xml
new file mode 100644
index 000000000..4de530473
--- /dev/null
+++ b/tests/wellformed/base/http_entry_content_base_srcset.xml
@@ -0,0 +1,10 @@
+
+
+
+<img srcset="/rel/img.png, /rel/img.2x.png 2x">
+
+
+