Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ __pycache__/
Pipfile
venv
.venv

# Ontologies
*.jsonld
1 change: 1 addition & 0 deletions docs/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ Submodules
:recursive:

model
ontology
testing
typing

Expand Down
16 changes: 12 additions & 4 deletions src/scitacean/_dataset_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from __future__ import annotations

from collections.abc import Iterable
from dataclasses import dataclass
from datetime import UTC, datetime
from typing import Any, ClassVar, Literal, TypeVar
Expand All @@ -29,6 +30,7 @@
Technique,
construct,
)
from .ontology import find_technique
from .pid import PID

M = TypeVar("M", bound=BaseModel)
Expand All @@ -54,6 +56,12 @@ def _parse_remote_path(path: str | RemotePath | None) -> RemotePath | None:
return RemotePath(path)


def _parse_techniques(arg: Iterable[str | Technique] | None) -> list[Technique] | None:
    """Convert user-supplied techniques into a list of ``Technique`` models.

    Parameters
    ----------
    arg:
        ``None``, or an iterable whose items are either ``Technique`` models
        (kept as they are) or strings (resolved with ``find_technique``).
        A single string or a single ``Technique`` is accepted as well and is
        treated as a one-element collection.

    Returns
    -------
    :
        ``None`` if ``arg`` is ``None``, otherwise a new list with one
        ``Technique`` per input item, in input order.

    Raises
    ------
    ValueError
        Propagated from ``find_technique`` when a string item cannot be resolved.
    """
    if arg is None:
        return None
    # A bare ``str`` is itself an iterable of one-character strings, so iterating
    # it would look up every character separately and fail with a misleading
    # error. Wrap a lone item so that it is processed as a single technique.
    if isinstance(arg, (str, Technique)):
        arg = (arg,)
    return [t if isinstance(t, Technique) else find_technique(t) for t in arg]


def _validate_checksum_algorithm(algorithm: str | None) -> str | None:
if algorithm is None:
return algorithm
Expand Down Expand Up @@ -615,7 +623,7 @@ def __init__(
source_folder: RemotePath | str | None = None,
source_folder_host: str | None = None,
start_time: datetime | None = None,
techniques: list[Technique] | None = None,
techniques: Iterable[str | Technique] | None = None,
used_software: list[str] | None = None,
validation_status: str | None = None,
meta: dict[str, Any] | None = None,
Expand Down Expand Up @@ -656,7 +664,7 @@ def __init__(
self._source_folder = _parse_remote_path(source_folder)
self._source_folder_host = source_folder_host
self._start_time = start_time
self._techniques = techniques
self._techniques = _parse_techniques(techniques)
self._used_software = used_software
self._validation_status = validation_status
self._api_version = None
Expand Down Expand Up @@ -1033,9 +1041,9 @@ def techniques(self) -> list[Technique] | None:
return self._techniques

@techniques.setter
def techniques(self, techniques: list[Technique] | None) -> None:
def techniques(self, techniques: Iterable[str | Technique] | None) -> None:
"""Stores the metadata information for techniques."""
self._techniques = techniques
self._techniques = _parse_techniques(techniques)

@property
def updated_at(self) -> datetime | None:
Expand Down
122 changes: 122 additions & 0 deletions src/scitacean/ontology/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2025 SciCat Project (https://github.com/SciCatProject/scitacean)
"""Tools for working with ontologies."""

import bz2
import importlib.resources
import json
import re
from functools import cache

from ..model import Technique


def _load_ontology(name: str) -> object:
    """Read and decode an ontology file bundled with the package.

    The resource ``<name>.json.bz2`` is looked up in the ``scitacean.ontology``
    package, decompressed with bz2, and parsed as JSON.

    Note that the ontology file was generated using a script in
    ``tools/ontologies`` in the Scitacean repository.
    """
    resource = importlib.resources.files("scitacean.ontology").joinpath(
        f"{name}.json.bz2"
    )
    with resource.open("rb") as compressed, bz2.open(compressed, "rb") as stream:
        return json.load(stream)


@cache
def expands_techniques() -> dict[str, list[str]]:
    """Return the ExPaNDS experimental techniques ontology.

    The ontology is loaded on the first call. Because of the ``cache``
    decorator, later calls return the very same dict object.

    Returns
    -------
    :
        A dict that maps each technique id (IRI) to the list of labels of that
        technique. The first element of each list is the primary label.
        All labels are lowercase and contain no leading or trailing whitespace.
    """
    ontology = _load_ontology("expands_techniques")
    return ontology  # type: ignore[return-value]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this function is for notebooks, maybe we can wrap it in html repr, otherwise ignore me.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is meant as a 'backend' implementation. I don't think it is very useful to show this list in a notebook directly. The link to the actual ontology seems more useful.



def find_technique(label_or_iri: str) -> Technique:
    """Look up a technique in the ExPaNDS ontology and return it as a model.

    The technique is identified either by one of its labels or by its IRI in the
    `ExPaNDS experimental techniques ontology <https://expands-eu.github.io/ExPaNDS-experimental-techniques-ontology/index-en.html>`_.

    Parameters
    ----------
    label_or_iri:
        One of:

        - A technique *label* from the ExPaNDS ontology. Before the lookup,
          the input is converted to lowercase and leading and trailing
          whitespace is removed.
        - A technique *IRI* from the ExPaNDS ontology. It must exactly match
          an IRI in the ontology.

    Returns
    -------
    :
        The technique that was found, encoded as:

        .. code-block:: python

            Technique(name=label, pid=iri)

    Raises
    ------
    ValueError
        If the label or IRI is not found in the ontology.
    """
    # Inputs shaped like a PaNET IRI are matched against the ontology keys;
    # everything else is treated as a label.
    lookup = _lookup_iri if _is_iri(label_or_iri) else _lookup_label
    return lookup(label_or_iri)


def _lookup_label(label: str) -> Technique:
    """Build the ``Technique`` that carries the given label.

    The label is stripped and lowercased before it is compared with the labels
    of every technique in the ontology. The returned model uses the primary
    (first) label of the technique as its name.

    Raises
    ------
    ValueError
        If no technique or more than one technique has the label.
    """
    normalized = label.strip().lower()
    matches: list[tuple[str, str]] = []
    for iri, labels in expands_techniques().items():
        if normalized in labels:
            matches.append((iri, labels[0]))

    # Shared tail of both error messages.
    see_ontology = (
        "See the ExPaNDS experimental technique ontology for allowed labels at "
        "https://expands-eu.github.io/ExPaNDS-experimental-techniques-ontology/index-en.html"
    )
    if not matches:
        raise ValueError(f"Unknown technique label: '{normalized}'\n{see_ontology}")
    if len(matches) > 1:
        iris = [iri for iri, _ in matches]
        raise ValueError(
            f"Found multiple techniques with label '{normalized}': {iris}. "
            "Please specify the exact IRI instead or construct a Technique model "
            f"manually.\n{see_ontology}"
        )

    [(iri, primary_label)] = matches
    return Technique(pid=iri, name=primary_label)


def _lookup_iri(iri: str) -> Technique:
    """Build the ``Technique`` whose id is exactly ``iri``.

    The returned model uses the primary (first) label of the technique as its
    name.

    Raises
    ------
    ValueError
        If ``iri`` is not a key of the ExPaNDS techniques ontology.
    """
    try:
        primary_label = expands_techniques()[iri][0]
    except KeyError:
        message = (
            f"Unknown technique IRI: '{iri}'\n"
            "See the ExPaNDS experimental technique ontology for allowed labels at "
            "https://expands-eu.github.io/ExPaNDS-experimental-techniques-ontology/index-en.html"
        )
        # ``from None`` keeps the internal KeyError out of the reported traceback.
        raise ValueError(message) from None
    else:
        return Technique(pid=iri, name=primary_label)


# Shape of a technique IRI: the PaNET prefix (http or https) followed by digits.
_IRI_REGEX = re.compile(r"^https?://purl\.org/pan-science/PaNET/PaNET\d+$")


def _is_iri(iri: str) -> bool:
    """Return True if ``iri`` has the shape of a PaNET technique IRI.

    Only the form of the string is checked, not whether the IRI exists in
    the ontology.
    """
    return _IRI_REGEX.match(iri) is not None


__all__ = ["expands_techniques", "find_technique"]
Binary file not shown.
63 changes: 56 additions & 7 deletions tests/dataset_fields_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
DownloadDataFile,
DownloadDataset,
DownloadOrigDatablock,
Technique,
UploadDerivedDataset,
UploadRawDataset,
)
Expand Down Expand Up @@ -430,12 +431,12 @@ def test_orcid_validation_valid(good_orcid: str) -> None:
dset = Dataset(
type="raw",
name="test ORCID",
contact_email="jan-lukas.wynen@ess.eu",
contact_email="mail.person@sci.uni",
creation_location="scitacean/tests",
creation_time="2142-04-02T16:44:56",
owner="Jan-Lukas Wynen",
owner="Mustrum Ridcully",
owner_group="ess",
principal_investigator="jan-lukas.wynen@ess.eu",
principal_investigator="mail.person@sci.uni",
source_folder=RemotePath("/hex/source62"),
orcid_of_owner=good_orcid,
)
Expand All @@ -454,16 +455,64 @@ def test_orcid_validation_valid(good_orcid: str) -> None:
def test_orcid_validation_missing_url(bad_orcid: str) -> None:
dset = Dataset(
type="raw",
contact_email="jan-lukas.wynen@ess.eu",
contact_email="mail.person@sci.uni",
creation_time="2142-04-02T16:44:56",
owner="Jan-Lukas Wynen",
owner="Mustrum Ridcully",
owner_group="ess",
principal_investigator="jan-lukas.wynen@ess.eu",
principal_investigator="mail.person@sci.uni",
source_folder=RemotePath("/hex/source62"),
orcid_of_owner=bad_orcid,
)
with pytest.raises(pydantic.ValidationError):
dset.make_upload_model()


# TODO technique
def test_technique_set_model() -> None:
    """A ``Technique`` model given to ``Dataset`` is returned as given."""
    custom = Technique(pid="test/technique", name="Test Technique")
    dataset = Dataset(
        techniques=[custom],
        type="raw",
        owner="Mustrum Ridcully",
        owner_group="ess",
        contact_email="mail.person@sci.uni",
        principal_investigator="mail.person@sci.uni",
        creation_time="2142-04-02T16:44:56",
        source_folder=RemotePath("/hex/source62"),
    )
    assert dataset.techniques == [custom]


def test_technique_set_label() -> None:
    """A technique given as an ontology label is converted to a ``Technique``."""
    dset = Dataset(
        type="raw",
        contact_email="mail.person@sci.uni",
        creation_time="2142-04-02T16:44:56",
        owner="Mustrum Ridcully",
        owner_group="ess",
        principal_investigator="mail.person@sci.uni",
        source_folder=RemotePath("/hex/source62"),
        techniques=["neutron powder diffraction"],
    )
    # The label must resolve to its IRI in the ExPaNDS ontology.
    expected = Technique(
        pid="http://purl.org/pan-science/PaNET/PaNET01100",
        name="neutron powder diffraction",
    )
    assert dset.techniques == [expected]


def test_technique_set_invalid_label_raises_value_error() -> None:
    """Assigning a label that is not in the ontology raises ``ValueError``."""
    dataset = Dataset(
        type="raw",
        owner="Mustrum Ridcully",
        owner_group="ess",
        contact_email="mail.person@sci.uni",
        principal_investigator="mail.person@sci.uni",
        creation_time="2142-04-02T16:44:56",
        source_folder=RemotePath("/hex/source62"),
    )
    unknown_labels = ["bad technique"]
    with pytest.raises(ValueError, match="Unknown technique"):
        dataset.techniques = unknown_labels
69 changes: 69 additions & 0 deletions tests/ontology_test.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can add a check in the CI that the ontology file is up to date, or will it never be changed?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will change. I wouldn't want that check in the regular tests, but I can try and set it up in CI.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really sure how to do this. The JSONLD file does not encode the version. I would have to parse that out of the HTML, TTL, or N Triples file. Or clone the github repo. But then I don't know if the deployed version is the same as the repo.
Do you think this is worth pursuing?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we simply check if the binary files that we make from the jsonld files are exactly the same?

But we can skip it I guess, if we ever want it to be updated, we will probably be asked to do so by instrument scientists or SIMS team.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we download and parse the whole ontology every time we use it, then we don't even need to store it in the package. I wanted to avoid that so that we don't need an internet connection to make local datasets. Admittedly, that is a niche use case.

Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2025 SciCat Project (https://github.com/SciCatProject/scitacean)
import pytest

from scitacean import model, ontology


def test_can_load_expands_technique_ontology() -> None:
    """The bundled ontology is non-empty and its IRIs and labels are normalized."""
    loaded = ontology.expands_techniques()
    assert len(loaded) > 0

    # Every key must start like an HTTP(S) IRI.
    for iri in loaded:
        assert iri.startswith("http")

    # Every label must be lowercase and free of surrounding whitespace.
    every_label = [label for labels in loaded.values() for label in labels]
    assert all(label.islower() for label in every_label)
    assert all(label == label.strip() for label in every_label)


def test_can_look_up_technique_by_label() -> None:
    """A primary label resolves to the technique with the matching IRI."""
    found = ontology.find_technique("small angle neutron scattering")
    assert found == model.Technique(
        name="small angle neutron scattering",
        pid="http://purl.org/pan-science/PaNET/PaNET01189",
    )


def test_can_look_up_technique_by_label_is_case_insensitive() -> None:
    """A label written with capital letters resolves to the lowercase label."""
    found = ontology.find_technique("Total Scattering")
    assert found == model.Technique(
        name="total scattering",
        pid="http://purl.org/pan-science/PaNET/PaNET01190",
    )


def test_can_look_up_technique_by_alternative_label() -> None:
    """Primary and alternative labels all resolve to the same technique."""
    expected = model.Technique(
        name="x-ray single crystal diffraction",
        pid="http://purl.org/pan-science/PaNET/PaNET01102",
    )
    # Primary label first, then alternatives that differ in case and whitespace.
    spellings = (
        "x-ray single crystal diffraction",
        "SXRD",
        "sxrd",
        "single crystal x-ray diffraction ",
    )
    for spelling in spellings:
        assert ontology.find_technique(spelling) == expected


def test_can_look_up_technique_by_iri() -> None:
    """An IRI resolves to the technique with that IRI and its primary label."""
    iri = "http://purl.org/pan-science/PaNET/PaNET01239"
    found = ontology.find_technique(iri)
    assert found == model.Technique(
        name="neutron reflectometry",
        pid="http://purl.org/pan-science/PaNET/PaNET01239",
    )


def test_lookup_rejects_ambiguous_label() -> None:
    """A label that matches several techniques is rejected with ``ValueError``."""
    ambiguous = "diffraction"
    with pytest.raises(ValueError, match="multiple techniques"):
        ontology.find_technique(ambiguous)
3 changes: 3 additions & 0 deletions tools/model-generation/spec/dataset-fields.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ conversions:
sourceFolder:
func: _parse_remote_path
arg_type: RemotePath | str
techniques:
func: _parse_techniques
arg_type: Iterable[str | Technique]

# Mark those fields as read-only in addition to those identified as read only from the schema.
# Read-only fields must be None in uploads.
Expand Down
Loading
Loading