Skip to content

Commit 2f16a83

Browse files
authored
Merge pull request #321 from SciCatProject/expands-ontologies
Support setting techniques from ExPaNDS ontology label
2 parents 9d681d4 + 38ce792 commit 2f16a83

File tree

10 files changed

+400
-11
lines changed

10 files changed

+400
-11
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,6 @@ __pycache__/
2626
Pipfile
2727
venv
2828
.venv
29+
30+
# Ontologies
31+
*.jsonld

docs/reference/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ Submodules
7272
:recursive:
7373

7474
model
75+
ontology
7576
testing
7677
typing
7778

src/scitacean/_dataset_fields.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from __future__ import annotations
1313

14+
from collections.abc import Iterable
1415
from dataclasses import dataclass
1516
from datetime import UTC, datetime
1617
from typing import Any, ClassVar, Literal, TypeVar
@@ -29,6 +30,7 @@
2930
Technique,
3031
construct,
3132
)
33+
from .ontology import find_technique
3234
from .pid import PID
3335

3436
M = TypeVar("M", bound=BaseModel)
@@ -54,6 +56,12 @@ def _parse_remote_path(path: str | RemotePath | None) -> RemotePath | None:
5456
return RemotePath(path)
5557

5658

59+
def _parse_techniques(arg: Iterable[str | Technique] | None) -> list[Technique] | None:
    """Convert user-supplied techniques into a list of ``Technique`` models.

    Parameters
    ----------
    arg:
        ``None``, or an iterable whose items are either ``Technique`` models
        (kept as they are) or strings (resolved with ``find_technique``).
        A single bare string or ``Technique`` is treated as one technique.

    Returns
    -------
    :
        ``None`` if ``arg`` is ``None``, otherwise a new list with one
        ``Technique`` per input item.

    Raises
    ------
    ValueError
        Raised by ``find_technique`` if a string cannot be resolved.
    """
    if arg is None:
        return None
    if isinstance(arg, (str, Technique)):
        # A str is itself an Iterable[str]; iterating it would look up every
        # single character as a technique label and fail with a confusing
        # error. A lone Technique is wrapped for the same reason
        # (presumably it is an iterable pydantic model -- TODO confirm).
        arg = (arg,)
    return [
        item if isinstance(item, Technique) else find_technique(item) for item in arg
    ]
63+
64+
5765
def _validate_checksum_algorithm(algorithm: str | None) -> str | None:
5866
if algorithm is None:
5967
return algorithm
@@ -615,7 +623,7 @@ def __init__(
615623
source_folder: RemotePath | str | None = None,
616624
source_folder_host: str | None = None,
617625
start_time: datetime | None = None,
618-
techniques: list[Technique] | None = None,
626+
techniques: Iterable[str | Technique] | None = None,
619627
used_software: list[str] | None = None,
620628
validation_status: str | None = None,
621629
meta: dict[str, Any] | None = None,
@@ -656,7 +664,7 @@ def __init__(
656664
self._source_folder = _parse_remote_path(source_folder)
657665
self._source_folder_host = source_folder_host
658666
self._start_time = start_time
659-
self._techniques = techniques
667+
self._techniques = _parse_techniques(techniques)
660668
self._used_software = used_software
661669
self._validation_status = validation_status
662670
self._api_version = None
@@ -1033,9 +1041,9 @@ def techniques(self) -> list[Technique] | None:
10331041
return self._techniques
10341042

10351043
@techniques.setter
1036-
def techniques(self, techniques: list[Technique] | None) -> None:
1044+
def techniques(self, techniques: Iterable[str | Technique] | None) -> None:
10371045
"""Stores the metadata information for techniques."""
1038-
self._techniques = techniques
1046+
self._techniques = _parse_techniques(techniques)
10391047

10401048
@property
10411049
def updated_at(self) -> datetime | None:

src/scitacean/ontology/__init__.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
# SPDX-License-Identifier: BSD-3-Clause
2+
# Copyright (c) 2025 SciCat Project (https://github.com/SciCatProject/scitacean)
3+
"""Tools for working with ontologies."""
4+
5+
import bz2
6+
import importlib.resources
7+
import json
8+
import re
9+
from functools import cache
10+
11+
from ..model import Technique
12+
13+
14+
def _load_ontology(name: str) -> object:
    """Read and decode a bundled ontology file.

    The file ``{name}.json.bz2`` is looked up in the resources of the
    ``scitacean.ontology`` package, decompressed with bz2, and parsed as JSON.

    Note that the ontology file was generated using a script in
    ``tools/ontologies`` in the Scitacean repository.
    """
    resource = importlib.resources.files("scitacean.ontology").joinpath(
        f"{name}.json.bz2"
    )
    with resource.open("rb") as compressed, bz2.open(compressed, "rb") as stream:
        payload = stream.read()
    return json.loads(payload)
27+
28+
29+
@cache
def expands_techniques() -> dict[str, list[str]]:
    """Return the ExPaNDS experimental techniques ontology.

    The result is cached, so repeated calls return the same dict object.

    Returns
    -------
    :
        A dict mapping from technique ids (IRIs) to lists of labels.
        The first label in each list is the primary one.
        All labels are lowercase and have no leading or trailing whitespace.
    """
    ontology = _load_ontology("expands_techniques")
    return ontology  # type: ignore[return-value]
41+
42+
43+
def find_technique(label_or_iri: str) -> Technique:
    """Build a Technique model for an entry of the ExPaNDS ontology.

    The entry is selected either by label or by IRI from the
    `ExPaNDS experimental techniques ontology <https://expands-eu.github.io/ExPaNDS-experimental-techniques-ontology/index-en.html>`_.

    Parameters
    ----------
    label_or_iri:
        One of:

        - A technique *label* from the ExPaNDS ontology. Leading and trailing
          whitespace is removed and the label is converted to lowercase
          before the lookup.
        - A technique *IRI* from the ExPaNDS ontology. It must exactly match
          an IRI in the ontology.

    Returns
    -------
    :
        The technique that was found, encoded as:

        .. code-block:: python

            Technique(name=label, pid=iri)

    Raises
    ------
    ValueError
        If the ontology contains no technique with the given label or IRI.
    """
    # Strings shaped like a PaNET IRI are looked up as ids,
    # everything else is interpreted as a label.
    lookup = _lookup_iri if _is_iri(label_or_iri) else _lookup_label
    return lookup(label_or_iri)
76+
77+
78+
def _lookup_label(label: str) -> Technique:
    """Find the single technique that carries the given label.

    The label is stripped and lowercased and then compared against all
    labels of every technique returned by ``expands_techniques``.

    Returns
    -------
    :
        A ``Technique`` whose ``pid`` is the IRI and whose ``name`` is the
        first label of the matching ontology entry.

    Raises
    ------
    ValueError
        If no technique, or more than one technique, has the label.
    """
    normalized = label.strip().lower()

    matches = []
    for candidate_iri, candidate_labels in expands_techniques().items():
        if normalized in candidate_labels:
            matches.append((candidate_iri, candidate_labels[0]))

    if not matches:
        raise ValueError(
            f"Unknown technique label: '{normalized}'\n"
            "See the ExPaNDS experimental technique ontology for allowed labels at "
            "https://expands-eu.github.io/ExPaNDS-experimental-techniques-ontology/index-en.html"
        )

    if len(matches) > 1:
        iris = [iri for iri, _ in matches]
        raise ValueError(
            f"Found multiple techniques with label '{normalized}': {iris}. "
            "Please specify the exact IRI instead or construct a Technique model "
            "manually.\n"
            "See the ExPaNDS experimental technique ontology for allowed labels at "
            "https://expands-eu.github.io/ExPaNDS-experimental-techniques-ontology/index-en.html"
        )

    # Exactly one match is left.
    [(iri, primary_label)] = matches
    return Technique(pid=iri, name=primary_label)
101+
102+
103+
def _lookup_iri(iri: str) -> Technique:
    """Find the technique with the given IRI.

    Returns
    -------
    :
        A ``Technique`` whose ``pid`` is ``iri`` and whose ``name`` is the
        first label stored for that IRI in ``expands_techniques``.

    Raises
    ------
    ValueError
        If the IRI is not a key of the ontology.
    """
    try:
        labels = expands_techniques()[iri]
    except KeyError:
        # Hide the KeyError; the ValueError carries all relevant information.
        raise ValueError(
            f"Unknown technique IRI: '{iri}'\n"
            "See the ExPaNDS experimental technique ontology for allowed labels at "
            "https://expands-eu.github.io/ExPaNDS-experimental-techniques-ontology/index-en.html"
        ) from None
    primary_label = labels[0]
    return Technique(pid=iri, name=primary_label)
113+
114+
115+
_IRI_REGEX = re.compile(r"^https?://purl\.org/pan-science/PaNET/PaNET\d+$")
116+
117+
118+
def _is_iri(iri: str) -> bool:
119+
return bool(_IRI_REGEX.match(iri))
120+
121+
122+
__all__ = ["expands_techniques", "find_technique"]
4.24 KB
Binary file not shown.

tests/dataset_fields_test.py

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
DownloadDataFile,
1919
DownloadDataset,
2020
DownloadOrigDatablock,
21+
Technique,
2122
UploadDerivedDataset,
2223
UploadRawDataset,
2324
)
@@ -430,12 +431,12 @@ def test_orcid_validation_valid(good_orcid: str) -> None:
430431
dset = Dataset(
431432
type="raw",
432433
name="test ORCID",
433-
contact_email="jan-lukas.wynen@ess.eu",
434+
contact_email="mail.person@sci.uni",
434435
creation_location="scitacean/tests",
435436
creation_time="2142-04-02T16:44:56",
436-
owner="Jan-Lukas Wynen",
437+
owner="Mustrum Ridcully",
437438
owner_group="ess",
438-
principal_investigator="jan-lukas.wynen@ess.eu",
439+
principal_investigator="mail.person@sci.uni",
439440
source_folder=RemotePath("/hex/source62"),
440441
orcid_of_owner=good_orcid,
441442
)
@@ -454,16 +455,64 @@ def test_orcid_validation_valid(good_orcid: str) -> None:
454455
def test_orcid_validation_missing_url(bad_orcid: str) -> None:
455456
dset = Dataset(
456457
type="raw",
457-
contact_email="jan-lukas.wynen@ess.eu",
458+
contact_email="mail.person@sci.uni",
458459
creation_time="2142-04-02T16:44:56",
459-
owner="Jan-Lukas Wynen",
460+
owner="Mustrum Ridcully",
460461
owner_group="ess",
461-
principal_investigator="jan-lukas.wynen@ess.eu",
462+
principal_investigator="mail.person@sci.uni",
462463
source_folder=RemotePath("/hex/source62"),
463464
orcid_of_owner=bad_orcid,
464465
)
465466
with pytest.raises(pydantic.ValidationError):
466467
dset.make_upload_model()
467468

468469

469-
# TODO technique
470+
def test_technique_set_model() -> None:
    """A Technique model given to the constructor is returned by the property."""
    custom_technique = Technique(pid="test/technique", name="Test Technique")
    dataset = Dataset(
        type="raw",
        contact_email="mail.person@sci.uni",
        creation_time="2142-04-02T16:44:56",
        owner="Mustrum Ridcully",
        owner_group="ess",
        principal_investigator="mail.person@sci.uni",
        source_folder=RemotePath("/hex/source62"),
        techniques=[custom_technique],
    )
    stored = dataset.techniques
    assert stored == [custom_technique]
483+
484+
485+
def test_technique_set_label() -> None:
    """A technique given as an ontology label is converted to a Technique model."""
    dset = Dataset(
        type="raw",
        contact_email="mail.person@sci.uni",
        creation_time="2142-04-02T16:44:56",
        owner="Mustrum Ridcully",
        owner_group="ess",
        principal_investigator="mail.person@sci.uni",
        source_folder=RemotePath("/hex/source62"),
        techniques=["neutron powder diffraction"],
    )
    # The label must be resolved to the matching ontology entry.
    expected = Technique(
        pid="http://purl.org/pan-science/PaNET/PaNET01100",
        name="neutron powder diffraction",
    )
    assert dset.techniques == [expected]
505+
506+
507+
def test_technique_set_invalid_label_raises_value_error() -> None:
    """Assigning an unknown label through the setter raises a ValueError."""
    dataset = Dataset(
        type="raw",
        contact_email="mail.person@sci.uni",
        creation_time="2142-04-02T16:44:56",
        owner="Mustrum Ridcully",
        owner_group="ess",
        principal_investigator="mail.person@sci.uni",
        source_folder=RemotePath("/hex/source62"),
    )
    unknown_labels = ["bad technique"]
    with pytest.raises(ValueError, match="Unknown technique"):
        dataset.techniques = unknown_labels

tests/ontology_test.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# SPDX-License-Identifier: BSD-3-Clause
2+
# Copyright (c) 2025 SciCat Project (https://github.com/SciCatProject/scitacean)
3+
import pytest
4+
5+
from scitacean import model, ontology
6+
7+
8+
def test_can_load_expands_technique_ontology() -> None:
    """The bundled ontology is non-empty and its keys and labels are normalized."""
    loaded = ontology.expands_techniques()
    assert len(loaded) > 0

    # Every key looks like an IRI.
    for iri in loaded:
        assert iri.startswith("http")

    # Every label is lowercase and has no surrounding whitespace.
    all_labels = [label for labels in loaded.values() for label in labels]
    for label in all_labels:
        assert label.islower()
    for label in all_labels:
        assert label.strip() == label
23+
24+
25+
def test_can_look_up_technique_by_label() -> None:
    """A primary label resolves to the technique with the matching IRI."""
    label = "small angle neutron scattering"
    expected = model.Technique(
        pid="http://purl.org/pan-science/PaNET/PaNET01189",
        name=label,
    )
    found = ontology.find_technique(label)
    assert found == expected
32+
33+
34+
def test_can_look_up_technique_by_label_is_case_insensitive() -> None:
    """A label with uppercase letters resolves to the lowercase ontology entry."""
    expected = model.Technique(
        pid="http://purl.org/pan-science/PaNET/PaNET01190",
        name="total scattering",
    )
    found = ontology.find_technique("Total Scattering")
    assert found == expected
41+
42+
43+
def test_can_look_up_technique_by_alternative_label() -> None:
    """Primary and alternative labels all resolve to the same technique."""
    expected = model.Technique(
        pid="http://purl.org/pan-science/PaNET/PaNET01102",
        name="x-ray single crystal diffraction",
    )
    queries = (
        "x-ray single crystal diffraction",
        "SXRD",
        "sxrd",
        # The trailing space is intentional: whitespace must be ignored.
        "single crystal x-ray diffraction ",
    )
    results = [ontology.find_technique(query) for query in queries]
    for result in results:
        assert result == expected
56+
57+
58+
def test_can_look_up_technique_by_iri() -> None:
    """An IRI resolves to the technique with that IRI and its primary label."""
    iri = "http://purl.org/pan-science/PaNET/PaNET01239"
    expected = model.Technique(
        pid=iri,
        name="neutron reflectometry",
    )
    found = ontology.find_technique(iri)
    assert found == expected
65+
66+
67+
def test_lookup_rejects_ambiguous_label() -> None:
    """A label shared by several techniques is rejected with a ValueError."""
    ambiguous_label = "diffraction"
    with pytest.raises(ValueError, match="multiple techniques"):
        ontology.find_technique(ambiguous_label)

tools/model-generation/spec/dataset-fields.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ conversions:
3434
sourceFolder:
3535
func: _parse_remote_path
3636
arg_type: RemotePath | str
37+
techniques:
38+
func: _parse_techniques
39+
arg_type: Iterable[str | Technique]
3740

3841
# Mark those fields as read-only in addition to those identified as read only from the schema.
3942
# Read-only fields must be None in uploads.

0 commit comments

Comments
 (0)