Skip to content

Commit 6d9851d

Browse files
committed
Simplify unresolve code by using stdlib url(un)parse
1 parent 0e483ae commit 6d9851d

File tree

2 files changed

+33
-40
lines changed

2 files changed

+33
-40
lines changed

lib/pyld/iri_resolver.py

Lines changed: 30 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
- The 'unresolve()' function is a move and rename of the 'remove_base()' function from 'jsonld.py'
44
"""
55

6-
from collections import namedtuple
7-
import re
6+
from urllib.parse import urlparse, urlunparse
87

98

109
def is_character_allowed_after_relative_path_segment(ch: str) -> bool:
@@ -220,20 +219,24 @@ def unresolve(absolute_iri: str, base_iri: str = ""):
220219
221220
:return: the relative IRI if relative to base, otherwise the absolute IRI.
222221
"""
223-
# TODO: better sync with jsonld.js version
224222
# skip IRI processing
225223
if not base_iri:
226224
return absolute_iri
227225

228-
base = parse_url(base_iri)
226+
base = urlparse(base_iri)
229227

230228
if not base.scheme:
231229
raise ValueError(f"Found invalid baseIRI '{base_iri}' for value '{absolute_iri}'")
232-
233-
rel = parse_url(absolute_iri)
230+
231+
# compute authority (netloc) and strip default ports
232+
base_authority = parse_authority(base)
233+
234+
rel = urlparse(absolute_iri)
235+
# compute authority (netloc) and strip default ports
236+
rel_authority = parse_authority(rel)
234237

235238
# schemes and network locations (authorities) don't match, don't alter IRI
236-
if not (base.scheme == rel.scheme and base.authority == rel.authority):
239+
if not (base.scheme == rel.scheme and base_authority == rel_authority):
237240
return absolute_iri
238241

239242
# remove path segments that match (do not remove last segment unless there
@@ -257,36 +260,25 @@ def unresolve(absolute_iri: str, base_iri: str = ""):
257260
# prepend remaining segments
258261
rval += '/'.join(iri_segments)
259262

260-
return unparse_url((None, None, rval, rel.query, rel.fragment)) or './'
261-
262-
ParsedUrl = namedtuple(
263-
'ParsedUrl', ['scheme', 'authority', 'path', 'query', 'fragment'])
263+
# build relative IRI using urlunparse with empty scheme/netloc
264+
return urlunparse(('', '', rval, '', rel.query or '', rel.fragment or '')) or './'
264265

265-
def parse_url(url):
266-
# regex from RFC 3986
267-
p = r'^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?'
268-
m = re.match(p, url)
269-
# remove default http and https ports
270-
g = list(m.groups())
266+
def parse_authority(parsed_iri) -> str:
267+
"""
268+
Compute authority (netloc) and strip default ports
271269
272-
if g[1] is not None and ((g[0] == 'https' and g[1].endswith(':443')) or
273-
(g[0] == 'http' and g[1].endswith(':80'))):
274-
g[1] = g[1][:g[1].rfind(':')]
275-
return ParsedUrl(*g)
276-
277-
def unparse_url(parsed):
278-
if isinstance(parsed, dict):
279-
parsed = ParsedUrl(**parsed)
280-
elif isinstance(parsed, list) or isinstance(parsed, tuple):
281-
parsed = ParsedUrl(*parsed)
282-
rval = ''
283-
if parsed.scheme:
284-
rval += parsed.scheme + ':'
285-
if parsed.authority is not None:
286-
rval += '//' + parsed.authority
287-
rval += parsed.path
288-
if parsed.query is not None:
289-
rval += '?' + parsed.query
290-
if parsed.fragment is not None:
291-
rval += '#' + parsed.fragment
292-
return rval
270+
:param parsed_iri: parsed IRI as returned by urlparse(); its scheme, netloc and port are read
271+
:return: the netloc with a default port (80 for http, 443 for https) removed, or None if the netloc is empty
272+
:rtype: str or None
273+
"""
274+
base_authority = parsed_iri.netloc or None
275+
276+
try:
277+
base_port = parsed_iri.port
278+
except Exception:
279+
base_port = None
280+
281+
if base_authority is not None and base_port is not None:
282+
if (parsed_iri.scheme == 'https' and base_port == 443) or (parsed_iri.scheme == 'http' and base_port == 80):
283+
base_authority = base_authority.rsplit(':', 1)[0]
284+
return base_authority

lib/pyld/jsonld.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import re
2121
import sys
2222
import traceback
23+
from urllib.parse import urlparse
2324
import warnings
2425
import uuid
2526
from .context_resolver import ContextResolver
@@ -31,7 +32,7 @@
3132
from numbers import Integral, Real
3233
from frozendict import frozendict
3334
from pyld.__about__ import (__copyright__, __license__, __version__)
34-
from .iri_resolver import parse_url, resolve, unresolve
35+
from .iri_resolver import resolve, unresolve
3536

3637
def cmp(a, b):
3738
return (a > b) - (a < b)
@@ -6445,7 +6446,7 @@ def load_html(input, url, profile, options):
64456446
html_base = resolve(html_base[0], effective_base)
64466447
options['base'] = html_base
64476448

6448-
url_elements = parse_url(url)
6449+
url_elements = urlparse(url)
64496450
if url_elements.fragment:
64506451
# FIXME: CGI decode
64516452
id = url_elements.fragment

0 commit comments

Comments
 (0)