Skip to content

Commit 701fe7d

Browse files
authored
Merge pull request #207 from Gallaecio/stripping
safe_url_string, canonicalize_url: apply stripping from the URL living standard
2 parents a6e8c8d + 7ecd6d0 commit 701fe7d

File tree

2 files changed

+43
-24
lines changed

2 files changed

+43
-24
lines changed

tests/test_url.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -90,13 +90,6 @@
9090
)
9191
)
9292

93-
# Remove any leading and trailing C0 control or space from input.
94-
SAFE_URL_URL_STRIP_CASES = tuple(
95-
(f"{char}https://example.com{char}", "https://example.com")
96-
for char in _C0_CONTROL_OR_SPACE
97-
if char not in _ASCII_TAB_OR_NEWLINE
98-
)
99-
10093
SCHEME_NON_FIRST = _ASCII_ALPHANUMERIC + "+-."
10194

10295
# Username and password characters that do not need escaping.
@@ -177,7 +170,12 @@
177170
(object(), Exception),
178171
# Empty string
179172
("", ValueError),
180-
*SAFE_URL_URL_STRIP_CASES,
173+
# Remove any leading and trailing C0 control or space from input.
174+
*(
175+
(f"{char}https://example.com{char}", "https://example.com")
176+
for char in _C0_CONTROL_OR_SPACE
177+
if char not in _ASCII_TAB_OR_NEWLINE
178+
),
181179
# Remove all ASCII tab or newline from input.
182180
(
183181
(
@@ -379,7 +377,6 @@ def test_safe_url_string_encoding(
379377

380378
KNOWN_SAFE_URL_STRING_URL_ISSUES = {
381379
"", # Invalid URL
382-
*(case[0] for case in SAFE_URL_URL_STRIP_CASES),
383380
*(case[0] for case in SAFE_URL_URL_INVALID_SCHEME_CASES),
384381
# Userinfo characters that the URL living standard requires escaping (:;=)
385382
# are not escaped.

w3lib/url.py

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from urllib.request import pathname2url, url2pathname
3838

3939
from .util import to_unicode
40+
from ._infra import _ASCII_TAB_OR_NEWLINE, _C0_CONTROL_OR_SPACE
4041
from ._types import AnyUnicodeError, StrOrBytes
4142
from ._url import _SPECIAL_SCHEMES
4243

@@ -77,9 +78,15 @@ def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
7778
_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS
7879

7980

80-
_ascii_tab_newline_re = re.compile(
81-
r"[\t\n\r]"
82-
) # see https://infra.spec.whatwg.org/#ascii-tab-or-newline
81+
_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE = {
82+
ord(char): None for char in _ASCII_TAB_OR_NEWLINE
83+
}
84+
85+
86+
def _strip(url: str) -> str:
87+
return url.strip(_C0_CONTROL_OR_SPACE).translate(
88+
_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE
89+
)
8390

8491

8592
def safe_url_string( # pylint: disable=too-many-locals
@@ -88,9 +95,29 @@ def safe_url_string( # pylint: disable=too-many-locals
8895
path_encoding: str = "utf8",
8996
quote_path: bool = True,
9097
) -> str:
91-
"""Convert the given URL into a legal URL by escaping unsafe characters
92-
according to RFC-3986. Also, ASCII tabs and newlines are removed
93-
as per https://url.spec.whatwg.org/#url-parsing.
98+
"""Return a URL equivalent to *url* that a wide range of web browsers and
99+
web servers consider valid.
100+
101+
*url* is parsed according to the rules of the `URL living standard`_,
102+
and during serialization additional characters are percent-encoded to make
103+
the URL valid by additional URL standards.
104+
105+
.. _URL living standard: https://url.spec.whatwg.org/
106+
107+
The returned URL should be valid by *all* of the following URL standards
108+
known to be enforced by modern-day web browsers and web servers:
109+
110+
- `URL living standard`_
111+
112+
- `RFC 3986`_
113+
114+
- `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI
115+
class`_.
116+
117+
.. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html
118+
.. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt
119+
.. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt
120+
.. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt
94121
95122
If a bytes URL is given, it is first converted to `str` using the given
96123
encoding (which defaults to 'utf-8'). If quote_path is True (default),
@@ -104,17 +131,15 @@ def safe_url_string( # pylint: disable=too-many-locals
104131
105132
Calling this function on an already "safe" URL will return the URL
106133
unmodified.
107-
108-
Always returns a native `str` (bytes in Python2, unicode in Python3).
109134
"""
110-
# Python3's urlsplit() chokes on bytes input with non-ASCII chars,
135+
# urlsplit() chokes on bytes input with non-ASCII chars,
111136
# so let's decode (to Unicode) using page encoding:
112137
# - it is assumed that a raw bytes input comes from a document
113138
# encoded with the supplied encoding (or UTF8 by default)
114139
# - if the supplied (or default) encoding chokes,
115140
# percent-encode offending bytes
116141
decoded = to_unicode(url, encoding=encoding, errors="percentencode")
117-
parts = urlsplit(_ascii_tab_newline_re.sub("", decoded))
142+
parts = urlsplit(_strip(decoded))
118143

119144
username, password, hostname, port = (
120145
parts.username,
@@ -531,11 +556,8 @@ def canonicalize_url(
531556
) -> str:
532557
r"""Canonicalize the given url by applying the following procedures:
533558
559+
- make the URL safe
534560
- sort query arguments, first by key, then by value
535-
- percent encode paths ; non-ASCII characters are percent-encoded
536-
using UTF-8 (RFC-3986)
537-
- percent encode query arguments ; non-ASCII characters are percent-encoded
538-
using passed `encoding` (UTF-8 by default)
539561
- normalize all spaces (in query arguments) to '+' (plus symbol)
540562
- normalize percent encodings case (%2f -> %2F)
541563
- remove query arguments with blank values (unless `keep_blank_values` is True)
@@ -563,7 +585,7 @@ def canonicalize_url(
563585
# so we should be covered regarding URL normalization,
564586
# if not for proper URL expected by remote website.
565587
if isinstance(url, str):
566-
url = url.strip()
588+
url = _strip(url)
567589
try:
568590
scheme, netloc, path, params, query, fragment = _safe_ParseResult(
569591
parse_url(url), encoding=encoding or "utf8"

0 commit comments

Comments
 (0)