Skip to content

Commit f0dd9cb

Browse files
committed
Move unresolve function
1 parent 65201a2 commit f0dd9cb

File tree

2 files changed

+127
-3
lines changed

2 files changed

+127
-3
lines changed

lib/pyld/iri_resolver.py

Lines changed: 82 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
"""
2-
The functions 'remove_dot_segments()', 'resolve()' and 'is_character_allowed_after_relative_path_segment()' are direct ports from [relative-to-absolute-iri.js](https://github.com/rubensworks/relative-to-absolute-iri.js)
2+
- The functions 'remove_dot_segments()', 'resolve()' and 'is_character_allowed_after_relative_path_segment()' are direct ports from [relative-to-absolute-iri.js](https://github.com/rubensworks/relative-to-absolute-iri.js) (c) Ruben Taelman <ruben.taelman@ugent.be>
3+
- The 'unresolve()' function is a move and rename of the 'remove_base()' function from 'jsonld.py'
34
"""
45

6+
from collections import namedtuple
7+
import re
8+
9+
510
def is_character_allowed_after_relative_path_segment(ch: str) -> bool:
    """
    Tell whether ``ch`` may directly follow a '.' or '..' path segment.

    :param ch: the character following the segment; an empty (falsy) value
        means there is nothing after the segment.

    :return: True if ``ch`` is empty or is one of '#', '?' or '/'.
    """
    # Nothing after the segment is always acceptable.
    if not ch:
        return True
    return ch in ('/', '?', '#')
@@ -204,4 +209,79 @@ def resolve(relative_iri: str, base_iri: str = None) -> str:
204209
relative_iri = base_path + relative_iri
205210
relative_iri = remove_dot_segments(relative_iri)
206211

207-
return base_iri[:base_slash_after_colon_pos] + relative_iri
212+
return base_iri[:base_slash_after_colon_pos] + relative_iri
213+
214+
def unresolve(absolute_iri: str, base_iri: str = ""):
    """
    Express an absolute IRI relative to a base IRI.

    :param absolute_iri: the absolute IRI to make relative.
    :param base_iri: the base IRI; when it is None no processing is done.

    :return: ``absolute_iri`` unchanged when ``base_iri`` is None or when
        the scheme or authority of the two IRIs differ; otherwise the
        relative form ('./' when that relative form would be empty).
    """
    # TODO: better sync with jsonld.js version
    # No base at all: skip IRI processing.
    if base_iri is None:
        return absolute_iri

    base = parse_url(base_iri)
    target = parse_url(absolute_iri)

    # Different scheme or network location (authority): leave the IRI alone.
    if base.scheme != target.scheme or base.authority != target.authority:
        return absolute_iri

    base_parts = remove_dot_segments(base.path).split('/')
    target_parts = remove_dot_segments(target.path).split('/')

    # The last target segment is only allowed to be consumed when a query
    # or fragment remains to identify the resource.
    if target.fragment or target.query:
        keep = 0
    else:
        keep = 1

    # Count the leading path segments shared by both IRIs.
    shared = 0
    while (shared < len(base_parts)
           and len(target_parts) - shared > keep
           and base_parts[shared] == target_parts[shared]):
        shared += 1
    base_rest = base_parts[shared:]
    target_rest = target_parts[shared:]

    # One '../' per unmatched base segment, not counting the last one (if
    # the base ends with '/' the last segment is empty, and if it doesn't
    # the last segment isn't a directory).
    if base_rest:
        climb = '../' * (len(base_rest) - 1)
    else:
        climb = ''

    relative_path = climb + '/'.join(target_rest)

    relative = unparse_url(
        (None, None, relative_path, target.query, target.fragment))
    return relative or './'
257+
258+
ParsedUrl = namedtuple(
    'ParsedUrl', ['scheme', 'authority', 'path', 'query', 'fragment'])

# Port suffixes that are redundant for a scheme and get stripped from the
# authority while parsing.
_DEFAULT_PORT_SUFFIXES = {'https': ':443', 'http': ':80'}


def parse_url(url):
    """
    Split a URL into its five generic components.

    :param url: the URL string to parse.

    :return: a ParsedUrl(scheme, authority, path, query, fragment); a
        component that is absent from the URL is None, except the path,
        which is then the empty string. A default port (':80' for http,
        ':443' for https) is removed from the authority.
    """
    # regex from RFC 3986
    p = r'^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?'
    m = re.match(p, url)
    scheme, authority, path, query, fragment = m.groups()

    # remove default http and https ports; the authority is None for URLs
    # such as 'http:abc', so it must be checked before inspecting it
    suffix = _DEFAULT_PORT_SUFFIXES.get(scheme)
    if suffix and authority is not None and authority.endswith(suffix):
        authority = authority[:-len(suffix)]

    return ParsedUrl(scheme, authority, path, query, fragment)
271+
272+
def unparse_url(parsed):
    """
    Assemble a URL string from its components.

    :param parsed: the components, given as a dict keyed by the ParsedUrl
        field names, as a list/tuple in ParsedUrl field order, or as any
        object exposing scheme, authority, path, query and fragment
        attributes.

    :return: the URL string. A falsy scheme is omitted; authority, query
        and fragment are omitted only when they are None.
    """
    # Normalise dict / sequence input to a ParsedUrl.
    if isinstance(parsed, dict):
        parsed = ParsedUrl(**parsed)
    elif isinstance(parsed, (list, tuple)):
        parsed = ParsedUrl(*parsed)

    pieces = []
    if parsed.scheme:
        pieces.append(parsed.scheme + ':')
    if parsed.authority is not None:
        pieces.append('//' + parsed.authority)
    pieces.append(parsed.path)
    if parsed.query is not None:
        pieces.append('?' + parsed.query)
    if parsed.fragment is not None:
        pieces.append('#' + parsed.fragment)
    return ''.join(pieces)

tests/test_iri_resolver.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import pytest
2-
from pyld.iri_resolver import resolve, remove_dot_segments
2+
from pyld.iri_resolver import resolve, unresolve, remove_dot_segments
33

44
# Tests ported from relative-to-absolute-iri.js: https://github.com/rubensworks/relative-to-absolute-iri.js/blob/master/test/Resolve-test.ts
5+
# (c) Ruben Taelman <ruben.taelman@ugent.be>
56

67
# ---------- Tests for resolve() ----------
78
class TestResolve:
@@ -275,6 +276,49 @@ def test_relative_with_triple_dot_segment_and_2x_double_dot_and_base(self):
275276
def test_questionmark_prefix_relative_with_complex_base_with_dot(self):
276277
assert resolve('?y','http://a/bb/ccc/./d;p?q') == 'http://a/bb/ccc/./d;p?y'
277278

279+
# ---------- Tests for unresolve() ----------
280+
class TestUnresolve:
    """Checks for unresolve(): making an absolute IRI relative to a base."""

    def test_absolute_iri_no_base(self):
        """Without a base argument the IRI comes back unchanged."""
        result = unresolve('http://example.org/')
        assert result == 'http://example.org/'

    def test_absolute_iri_empty_base(self):
        """An empty base leaves the IRI unchanged."""
        result = unresolve('http://example.org/', '')
        assert result == 'http://example.org/'

    def test_absolute_iri_with_base(self):
        """A base on a different authority leaves the IRI unchanged."""
        result = unresolve('http://example.org/', 'http://base.org/')
        assert result == 'http://example.org/'

    def test_empty_value_uses_base(self):
        """An empty value stays empty."""
        result = unresolve('', 'http://base.org/')
        assert result == ''

    def test_absolute_with_base(self):
        """The part shared with the base is stripped."""
        result = unresolve('http://base.org/abc', 'http://base.org/')
        assert result == 'abc'

    def test_absolute_with_fragment_base(self):
        """A fragment on the base does not affect the result."""
        result = unresolve('http://base.org/abc', 'http://base.org/#frag')
        assert result == 'abc'

    def test_hash_absolute(self):
        """Only the fragment remains when the paths are identical."""
        result = unresolve('http://base.org/#abc', 'http://base.org/')
        assert result == '#abc'

    def test_colon_in_value_ignores_base(self):
        """A value without an authority is returned unchanged."""
        result = unresolve('http:abc', 'http://base.org/')
        assert result == 'http:abc'

    def test_colon_in_value_removes_dots(self):
        """Expects dot segments to be removed from a non-matching value."""
        result = unresolve('http://abc/../../', 'http://base.org/')
        assert result == 'http://abc/'

    # Disabled: expectations for a ValueError on a non-absolute base.
    # def test_non_absolute_base_error(self):
    #     with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value 'abc'"):
    #         unresolve('abc', 'def')

    # def test_non_absolute_base_empty_value_error(self):
    #     with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value ''"):
    #         unresolve('', 'def')

    def test_base_without_path_slash(self):
        """A base with an empty path behaves like one ending in '/'."""
        result = unresolve('http://base.org/abc', 'http://base.org')
        assert result == 'abc'

    def test_base_with_path_slash(self):
        """A trailing slash on the value is preserved."""
        result = unresolve('http://base.org/abc/', 'http://base.org')
        assert result == 'abc/'
321+
278322
# ---------- Tests for remove_dot_segments() ----------
279323
class TestRemoveDotSegments:
280324
def test_no_slash(self):

0 commit comments

Comments
 (0)