From 09e2c0fd9777013a174a013004cf927172568440 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Tue, 27 May 2025 17:08:12 -0700 Subject: [PATCH 1/4] Avoid a copy of the attribute list normalize_attributes always does a copy when the list is non-empty, so there's no need to copy again. --- feedparser/urls.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/feedparser/urls.py b/feedparser/urls.py index e4e83d16..cfea402c 100644 --- a/feedparser/urls.py +++ b/feedparser/urls.py @@ -158,13 +158,9 @@ def resolve_uri(self, uri): def unknown_starttag(self, tag, attrs): attrs = self.normalize_attrs(attrs) - attrs = [ - ( - key, - ((tag, key) in self.relative_uris) and self.resolve_uri(value) or value, - ) - for key, value in attrs - ] + for i, (key, value) in enumerate(attrs): + if (tag, key) in self.relative_uris: + attrs[i] = (key, self.resolve_uri(value)) super().unknown_starttag(tag, attrs) From 6558d8de55cb7920d84c7684f8692820c9ee2f3c Mon Sep 17 00:00:00 2001 From: Tom Most Date: Tue, 27 May 2025 22:06:10 -0700 Subject: [PATCH 2/4] Resolve relative srcset URLs --- feedparser/sanitizer.py | 1 + feedparser/urls.py | 63 +++++++++++++++++++ tests/test_srcset_candidates.py | 63 +++++++++++++++++++ .../base/http_entry_content_base_srcset.xml | 10 +++ 4 files changed, 137 insertions(+) create mode 100644 tests/test_srcset_candidates.py create mode 100644 tests/wellformed/base/http_entry_content_base_srcset.xml diff --git a/feedparser/sanitizer.py b/feedparser/sanitizer.py index 5b3014e1..7e7884fb 100644 --- a/feedparser/sanitizer.py +++ b/feedparser/sanitizer.py @@ -259,6 +259,7 @@ class HTMLSanitizer(BaseHTMLProcessor): "size", "span", "src", + "srcset", "start", "step", "style", diff --git a/feedparser/urls.py b/feedparser/urls.py index cfea402c..d33a8d52 100644 --- a/feedparser/urls.py +++ b/feedparser/urls.py @@ -1,3 +1,4 @@ +# Copyright 2025 Tom Most # Copyright 2010-2024 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. @@ -116,6 +117,56 @@ def make_safe_absolute_uri(base, rel=None): return uri +# Matches image candidate strings within a srcset attribute value as +# described in https://html.spec.whatwg.org/multipage/images.html#srcset-attributes +_srcset_candidate = re.compile( + r""" + # ASCII whitespace: https://infra.spec.whatwg.org/#ascii-whitespace + [\t\n\f\r ]* + ( + # URL that doesn't start or end with a comma + (?!,) + [^\t\n\f\r ]+ + (? list[tuple[str, str]]: + """ + Split a ``srcset`` attribute value into candidates: + + >>> srcset_candidates("/foo.jpg, /foo.2x.jpg 2x") + [("/foo.jpg", ""), ("/foo.2x.jpg", "2x")] + + This doesn't validate the URLs, nor check for duplicate or conflicting + descriptors. It returns an empty list when parsing fails. + """ + pos = 0 + candidates = [] + while m := _srcset_candidate.match(value, pos): + desc = m[2].strip("\t\n\f\r ") + candidates.append((m[1], desc)) + pos = m.end(0) + return candidates + + class RelativeURIResolver(BaseHTMLProcessor): relative_uris = { ("a", "href"), @@ -156,11 +207,23 @@ def __init__(self, baseuri, encoding, _type): def resolve_uri(self, uri): return make_safe_absolute_uri(self.baseuri, uri.strip()) + def resolve_srcset(self, srcset): + candidates = [] + for uri, desc in srcset_candidates(srcset): + uri = self.resolve_uri(uri) + if desc: + candidates.append("{} {}".format(uri, desc)) + else: + candidates.append(uri) + return ", ".join(candidates) + def unknown_starttag(self, tag, attrs): attrs = self.normalize_attrs(attrs) for i, (key, value) in enumerate(attrs): if (tag, key) in self.relative_uris: attrs[i] = (key, self.resolve_uri(value)) + elif tag in {"img", "source"} and key == "srcset": + attrs[i] = (key, self.resolve_srcset(value)) super().unknown_starttag(tag, attrs) diff --git a/tests/test_srcset_candidates.py b/tests/test_srcset_candidates.py new file mode 100644 index 00000000..485a0530 --- /dev/null +++ b/tests/test_srcset_candidates.py @@ -0,0 +1,63 @@ +import pytest + +from feedparser.urls import srcset_candidates + + +def test_empty(): + assert srcset_candidates("") == [] + assert srcset_candidates(" \n") == [] + + +def test_default(): + assert srcset_candidates("/1x.jpg") == [("/1x.jpg", "")] + + +def test_pixel_density_descriptor_one(): + assert srcset_candidates("/1x.jpg 1x") == [("/1x.jpg", "1x")] + + +def test_pixel_density_descriptor_two(): + assert srcset_candidates("/1x.jpg 1x,/2x.jpg\t2.0x") == [ + ("/1x.jpg", "1x"), + ("/2x.jpg", "2.0x"), + ] + + +def test_pixel_density_descriptor_three(): + assert srcset_candidates("/1x.jpg, /2x.jpg 2x , /3x.jpg 3x ") == [ + ("/1x.jpg", ""), + ("/2x.jpg", "2x"), + ("/3x.jpg", "3x"), + ] + + +@pytest.mark.parametrize( + "pd", ["1x", "1.0x", "9.5x", "36x", "39.95x", "100x", "1e1x", "2E2x"] +) +def test_pixel_density_descriptor_floats(pd): + """A pixel density descriptor allows all the valid float formats.""" + assert [("/foo.jpg", pd)] == srcset_candidates("/foo.jpg " + pd) + + +def test_url_comma(): + """A URL containing a comma is not broken.""" + assert srcset_candidates(" /,.jpg 6x,\n /,,,,.webp \t1e100x") == [ + ("/,.jpg", "6x"), + ("/,,,,.webp", "1e100x"), + ] + + +def test_width_one(): + assert srcset_candidates("/a.png 600w") == [("/a.png", "600w")] + + +def test_width_two(): + assert srcset_candidates("a.jpg 123w, b.jpg 1234w") == [ + ("a.jpg", "123w"), + ("b.jpg", "1234w"), + ] + + +@pytest.mark.parametrize("pd", ["1.5w", "9000X", "-23w", "-60x"]) +def test_invalid(pd): + assert srcset_candidates("/x.gif " + pd) == [] diff --git a/tests/wellformed/base/http_entry_content_base_srcset.xml b/tests/wellformed/base/http_entry_content_base_srcset.xml new file mode 100644 index 00000000..4de53047 --- /dev/null +++ b/tests/wellformed/base/http_entry_content_base_srcset.xml @@ -0,0 +1,10 @@ + + + + <img srcset="/rel/img.png, /rel/img.2x.png 2x"> + + + From ca863e5be8795204c2c9f701b3fc39d91ba562c2 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Tue, 27 May 2025 22:19:15 -0700 Subject: [PATCH 3/4] Add changefragment --- changelog.d/20250527_twm_srcset.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 changelog.d/20250527_twm_srcset.rst diff --git a/changelog.d/20250527_twm_srcset.rst b/changelog.d/20250527_twm_srcset.rst new file mode 100644 index 00000000..725afb24 --- /dev/null +++ b/changelog.d/20250527_twm_srcset.rst @@ -0,0 +1,4 @@ +Added +----- + +* Resolve relative URLs in ``srcset`` attributes and pass through ``srcset`` when sanitizing. From 95ad1e542a1de2e58ec78dcea1f9b9281dda364b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 May 2025 06:05:27 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedparser/urls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedparser/urls.py b/feedparser/urls.py index d33a8d52..9679d530 100644 --- a/feedparser/urls.py +++ b/feedparser/urls.py @@ -212,7 +212,7 @@ def resolve_srcset(self, srcset): for uri, desc in srcset_candidates(srcset): uri = self.resolve_uri(uri) if desc: - candidates.append("{} {}".format(uri, desc)) + candidates.append(f"{uri} {desc}") else: candidates.append(uri) return ", ".join(candidates)