3737from urllib .request import pathname2url , url2pathname
3838
3939from .util import to_unicode
40+ from ._infra import _ASCII_TAB_OR_NEWLINE , _C0_CONTROL_OR_SPACE
4041from ._types import AnyUnicodeError , StrOrBytes
4142from ._url import _SPECIAL_SCHEMES
4243
@@ -77,9 +78,15 @@ def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
7778_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS
7879
7980
80- _ascii_tab_newline_re = re .compile (
81- r"[\t\n\r]"
82- ) # see https://infra.spec.whatwg.org/#ascii-tab-or-newline
# Deletion table for ``str.translate``: every code point of a character in
# ``_ASCII_TAB_OR_NEWLINE`` maps to ``None``, which makes ``translate`` drop
# that character from the string.
# See https://infra.spec.whatwg.org/#ascii-tab-or-newline
_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE = dict.fromkeys(
    map(ord, _ASCII_TAB_OR_NEWLINE)
)
84+
85+
def _strip(url: str) -> str:
    """Return *url* trimmed and with ASCII tabs and newlines removed.

    Two steps are applied, in this order:

    1. Characters found in ``_C0_CONTROL_OR_SPACE`` are stripped from both
       ends of *url*.
    2. Every character that ``_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE`` maps
       to ``None`` is deleted, wherever it appears in the remaining string.
    """
    trimmed = url.strip(_C0_CONTROL_OR_SPACE)
    return trimmed.translate(_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE)
8390
8491
8592def safe_url_string ( # pylint: disable=too-many-locals
@@ -88,9 +95,29 @@ def safe_url_string( # pylint: disable=too-many-locals
8895 path_encoding : str = "utf8" ,
8996 quote_path : bool = True ,
9097) -> str :
91- """Convert the given URL into a legal URL by escaping unsafe characters
92- according to RFC-3986. Also, ASCII tabs and newlines are removed
93- as per https://url.spec.whatwg.org/#url-parsing.
98+ """Return a URL equivalent to *url* that a wide range of web browsers and
99+ web servers consider valid.
100+
101+ *url* is parsed according to the rules of the `URL living standard`_,
102+ and during serialization additional characters are percent-encoded to make
103+ the URL valid by additional URL standards.
104+
105+ .. _URL living standard: https://url.spec.whatwg.org/
106+
107+ The returned URL should be valid by *all* of the following URL standards
108+ known to be enforced by modern-day web browsers and web servers:
109+
110+ - `URL living standard`_
111+
112+ - `RFC 3986`_
113+
114+ - `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI
115+ class`_.
116+
117+ .. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html
118+ .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt
119+ .. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt
120+ .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt
94121
95122 If a bytes URL is given, it is first converted to `str` using the given
96123 encoding (which defaults to 'utf-8'). If quote_path is True (default),
@@ -104,17 +131,15 @@ def safe_url_string( # pylint: disable=too-many-locals
104131
105132 Calling this function on an already "safe" URL will return the URL
106133 unmodified.
107-
108- Always returns a native `str` (bytes in Python2, unicode in Python3).
109134 """
110- # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
135+ # urlsplit() chokes on bytes input with non-ASCII chars,
111136 # so let's decode (to Unicode) using page encoding:
112137 # - it is assumed that a raw bytes input comes from a document
113138 # encoded with the supplied encoding (or UTF8 by default)
114139 # - if the supplied (or default) encoding chokes,
115140 # percent-encode offending bytes
116141 decoded = to_unicode (url , encoding = encoding , errors = "percentencode" )
117- parts = urlsplit (_ascii_tab_newline_re . sub ( "" , decoded ))
142+ parts = urlsplit (_strip ( decoded ))
118143
119144 username , password , hostname , port = (
120145 parts .username ,
@@ -531,11 +556,8 @@ def canonicalize_url(
531556) -> str :
532557 r"""Canonicalize the given url by applying the following procedures:
533558
559+ - make the URL safe
534560 - sort query arguments, first by key, then by value
535- - percent encode paths ; non-ASCII characters are percent-encoded
536- using UTF-8 (RFC-3986)
537- - percent encode query arguments ; non-ASCII characters are percent-encoded
538- using passed `encoding` (UTF-8 by default)
539561 - normalize all spaces (in query arguments) to '+' (plus symbol)
540562 - normalize percent encodings case (%2f -> %2F)
541563 - remove query arguments with blank values (unless `keep_blank_values` is True)
@@ -563,7 +585,7 @@ def canonicalize_url(
563585 # so we should be covered regarding URL normalization,
564586 # if not for proper URL expected by remote website.
565587 if isinstance (url , str ):
566- url = url . strip ( )
588+ url = _strip ( url )
567589 try :
568590 scheme , netloc , path , params , query , fragment = _safe_ParseResult (
569591 parse_url (url ), encoding = encoding or "utf8"
0 commit comments