@@ -95,9 +95,29 @@ def safe_url_string( # pylint: disable=too-many-locals
9595 path_encoding : str = "utf8" ,
9696 quote_path : bool = True ,
9797) -> str :
98- """Convert the given URL into a legal URL by escaping unsafe characters
99- according to RFC-3986. Also, ASCII tabs and newlines are removed
100- as per https://url.spec.whatwg.org/#url-parsing.
98+ """Return a URL equivalent to *url* that a wide range of web browsers and
99+ web servers consider valid.
100+
101+ *url* is parsed according to the rules of the `URL living standard`_,
102+ and during serialization additional characters are percent-encoded to make
103+ the URL valid by additional URL standards.
104+
105+ .. _URL living standard: https://url.spec.whatwg.org/
106+
107+ The returned URL is valid by *all* of the following URL standards known to
108+ be enforced by modern-day web browsers and web servers:
109+
110+ - `URL living standard`_
111+
112+ - `RFC 3986`_
113+
114+ - `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI
115+ class`_.
116+
117+ .. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html
118+ .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt
119+ .. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt
120+ .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt
101121
102122 If a bytes URL is given, it is first converted to `str` using the given
103123 encoding (which defaults to 'utf-8'). If quote_path is True (default),
@@ -111,10 +131,8 @@ def safe_url_string( # pylint: disable=too-many-locals
111131
112132 Calling this function on an already "safe" URL will return the URL
113133 unmodified.
114-
115- Always returns a native `str` (bytes in Python2, unicode in Python3).
116134 """
117- # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
135+ # urlsplit() chokes on bytes input with non-ASCII chars,
118136 # so let's decode (to Unicode) using page encoding:
119137 # - it is assumed that a raw bytes input comes from a document
120138 # encoded with the supplied encoding (or UTF8 by default)
@@ -538,11 +556,8 @@ def canonicalize_url(
538556) -> str :
539557 r"""Canonicalize the given url by applying the following procedures:
540558
559+ - make the URL safe (see :func:`safe_url_string`)
541560 - sort query arguments, first by key, then by value
542- - percent encode paths ; non-ASCII characters are percent-encoded
543- using UTF-8 (RFC-3986)
544- - percent encode query arguments ; non-ASCII characters are percent-encoded
545- using passed `encoding` (UTF-8 by default)
546561 - normalize all spaces (in query arguments) to '+' (plus symbol)
547562 - normalize the case of percent-encodings (%2f -> %2F)
548563 - remove query arguments with blank values (unless `keep_blank_values` is True)
0 commit comments