diff --git a/.appveyor.yml b/.appveyor.yml index 77886465..d3be798c 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -60,9 +60,9 @@ environment: # All of these are common to all matrix runs ATM, so pre-defined here and to be overloaded if needed DTS: datalad_container APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2204 - INSTALL_SYSPKGS: python3-venv xz-utils jq libffi7 + INSTALL_SYSPKGS: python3-venv xz-utils jq libffi7 skopeo # system git-annex is way too old, use better one - INSTALL_GITANNEX: git-annex -m deb-url --url http://snapshot.debian.org/archive/debian/20210906T204127Z/pool/main/g/git-annex/git-annex_8.20210903-1_amd64.deb + INSTALL_GITANNEX: git-annex -m datalad/packages CODECOV_BINARY: https://uploader.codecov.io/latest/linux/codecov matrix: @@ -73,8 +73,8 @@ environment: - ID: Ubu # The same but with the oldest supported Python. - - ID: Ubu-3.8 - PY: '3.8' + - ID: Ubu-3.9 + PY: '3.9' # The same but removing busybox first - triggers different code paths in the tests - ID: Ubu-nobusybox diff --git a/.github/workflows/add-changelog-snippet.yml b/.github/workflows/add-changelog-snippet.yml index efbc2c84..2da957c4 100644 --- a/.github/workflows/add-changelog-snippet.yml +++ b/.github/workflows/add-changelog-snippet.yml @@ -19,7 +19,7 @@ jobs: if: contains(github.event.pull_request.labels.*.name, 'CHANGELOG-missing') steps: - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index c5e16043..77d7be2e 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -17,6 +17,6 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Codespell uses: codespell-project/actions-codespell@v2 diff --git a/.github/workflows/docbuild.yml b/.github/workflows/docbuild.yml index 
86ccc607..b46b0fd6 100644 --- a/.github/workflows/docbuild.yml +++ b/.github/workflows/docbuild.yml @@ -12,7 +12,7 @@ jobs: run: | git config --global user.email "test@github.land" git config --global user.name "GitHub Almighty" - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python 3.9 uses: actions/setup-python@v5 with: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ec77ce7d..91d5eaf2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,7 +15,7 @@ jobs: if: github.event.pull_request.merged == true && contains(github.event.pull_request.labels.*.name, 'release') steps: - name: Checkout source - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: # Check out all history so that the previous release tag can be # found: diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..d4adbca2 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,182 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Code Style + +- **Imports at the top**: Always place imports at the top of the file, even during development/prototyping. No inline imports except for circular dependency breaks. + +## Project Overview + +datalad-container is a DataLad extension for working with containerized computational environments. It enables tracking, versioning, and execution of containerized workflows within DataLad datasets using Singularity/Apptainer, Docker, and OCI-compliant images. + +## Core Architecture + +### Command Suite Structure + +The extension registers a command suite with DataLad through setuptools entry points (see `setup.cfg`). 
The main commands are: +- **containers-add** (`containers_add.py`) - Add/update container images to a dataset +- **containers-list** (`containers_list.py`) - List configured containers +- **containers-remove** (`containers_remove.py`) - Remove containers from configuration +- **containers-run** (`containers_run.py`) - Execute commands within containers + +All commands are registered in `datalad_container/__init__.py` via the `command_suite` tuple. + +### Container Adapters + +The `adapters/` directory contains transport-specific handlers: + +- **docker.py** - Docker Hub images (`dhub://` scheme) +- **oci.py** - OCI-compliant images using Skopeo (`oci:` scheme) + - Saves images as trackable directory structures + - Supports loading images to Docker daemon on-demand + - Uses Skopeo for image manipulation + +Each adapter implements `save()` and `run()` functions for their respective container formats. + +### Container Discovery + +`find_container.py` implements the logic for locating containers: +- Searches current dataset and subdatasets +- Supports hierarchical container names (e.g., `subds/container-name`) +- Falls back to path-based and name-based lookups +- Automatically installs subdatasets if needed to access containers + +### Configuration Storage + +Container metadata is stored in `.datalad/config` with the pattern: +``` +datalad.containers.<name>.image = <path> +datalad.containers.<name>.cmdexec = <format string> +datalad.containers.<name>.updateurl = <url> +datalad.containers.<name>.extra-input = <path> +``` + +Default container location: `.datalad/environments/<name>/image` + +## Development Commands + +### Setup Development Environment + +```bash +# Using uv (preferred) +uv venv +source .venv/bin/activate +uv pip install -e .[devel] + +# Or traditional method +python3 -m venv .venv +source .venv/bin/activate +pip install -e .[devel] +``` + +### Running Tests + +```bash +# Run all tests +pytest -s -v datalad_container + +# Run specific test file +pytest -s -v datalad_container/tests/test_containers.py + +# Run specific
test function +pytest -s -v datalad_container/tests/test_containers.py::test_add_noop + +# Run with coverage +pytest -s -v --cov=datalad_container datalad_container + +# Skip slow tests (marked with 'turtle') +pytest -s -v -m "not turtle" datalad_container +``` + +### Code Quality Tools + +Pre-commit hooks are configured in `.pre-commit-config.yaml`: + +```bash +# Install pre-commit hooks +pre-commit install + +# Run manually on all files +pre-commit run --all-files + +# Individual tools +isort datalad_container/ # Sort imports +codespell # Spell checking +``` + +### Building Documentation + +```bash +cd docs +make html +# Output in docs/build/html/ +``` + +### Important Testing Notes + +- Tests use pytest fixtures defined in `datalad_container/conftest.py` and `tests/fixtures/` +- The project uses `@with_tempfile` and `@with_tree` decorators from DataLad's test utilities +- Docker tests may require Docker to be running +- Singularity/Apptainer tests require the container runtime to be installed +- Some tests are marked with `@pytest.mark.turtle` for slow-running tests + +## Key Implementation Details + +### URL Scheme Handling + +Container sources are identified by URL schemes: +- `shub://` - Singularity Hub (legacy, uses requests library) +- `docker://` - Direct Singularity pull from Docker Hub +- `dhub://` - Docker images stored locally via docker pull/save +- `oci:` - OCI images stored as directories via Skopeo + +The scheme determines both storage format and execution template. 
+ +### Execution Format Strings + +Call format strings support placeholders: +- `{img}` - Path to container image +- `{cmd}` - Command to execute +- `{img_dspath}` - Relative path to dataset containing image +- `{img_dirpath}` - Directory containing the image +- `{python}` - Path to current Python executable + +Example: `singularity exec {img} {cmd}` + +### Git-annex Integration + +- Large container images are managed by git-annex +- For `shub://` URLs, uses DataLad's special remote if available +- The `ensure_datalad_remote()` function (in `utils.py`) initializes the special remote when needed +- For `oci:docker://` images, registry URLs are added to annexed layers for efficient retrieval + +### Path Normalization + +`utils.py` contains `_normalize_image_path()` to handle cross-platform path issues: +- Config historically stored platform-specific paths +- Now standardizes to POSIX paths in config +- Maintains backward compatibility with Windows paths + +## Testing Considerations + +- Mark AI-generated tests with `@pytest.mark.ai_generated` +- Tests should not `chdir()` the entire process; use `cwd` parameter instead +- Use `common_kwargs = {'result_renderer': 'disabled'}` in tests to suppress output +- Many tests use DataLad's `with_tempfile` decorator for temporary test directories + +## Dependencies + +Core dependencies: +- datalad >= 0.18.0 +- requests >= 1.2 (for Singularity Hub communication) + +Container runtimes (at least one required): +- Singularity or Apptainer for Singularity images +- Docker for Docker and OCI image execution +- Skopeo for OCI image manipulation + +## Version Management + +This project uses `versioneer.py` for automatic version management from git tags. Version info is in `datalad_container/_version.py` (auto-generated, excluded from coverage). 
diff --git a/changelog.d/pr-277.md b/changelog.d/pr-277.md new file mode 100644 index 00000000..935fa469 --- /dev/null +++ b/changelog.d/pr-277.md @@ -0,0 +1,6 @@ +### 🚀 Enhancements and New Features + +- Add skopeo-based adapter for working with OCI images. + [PR #277](https://github.com/datalad/datalad-container/pull/277) (by [@yarikoptic](https://github.com/yarikoptic)) + continued an old/never finalized/closed + [PR #136](https://github.com/datalad/datalad-container/pull/136) (by [@kyleam](https://github.com/kyleam)). diff --git a/datalad_container/__init__.py b/datalad_container/__init__.py index 9efd88b2..d9b8e75a 100644 --- a/datalad_container/__init__.py +++ b/datalad_container/__init__.py @@ -42,8 +42,13 @@ 'ContainersRun', 'containers-run', 'containers_run', - - ) + ), + ( + 'datalad_container.containers_profiles', + 'ContainersProfiles', + 'containers-profiles', + 'containers_profiles', + ), ] ) diff --git a/datalad_container/adapters/docker.py b/datalad_container/adapters/docker.py index 774f53b9..3acb9360 100644 --- a/datalad_container/adapters/docker.py +++ b/datalad_container/adapters/docker.py @@ -19,7 +19,15 @@ import tarfile import tempfile -from datalad.utils import on_windows +import logging + +from datalad_container.adapters.utils import ( + docker_run, + get_docker_image_ids, + log_and_exit, + on_windows, + setup_logger, +) lgr = logging.getLogger("datalad.containers.adapters.docker") @@ -49,7 +57,7 @@ def save(image, path): with tempfile.NamedTemporaryFile() as stream: # Windows can't write to an already opened file stream.close() - sp.check_call(["docker", "save", "-o", stream.name, image]) + sp.run(["docker", "save", "-o", stream.name, image], check=True) with tarfile.open(stream.name, mode="r:") as tar: if not op.exists(path): lgr.debug("Creating new directory at %s", path) @@ -79,12 +87,6 @@ def safe_extract(tar, path=".", members=None, *, numeric_owner=False): lgr.info("Saved %s to %s", image, path) -def _list_images(): - out = 
sp.check_output( - ["docker", "images", "--all", "--quiet", "--no-trunc"]) - return out.decode().splitlines() - - def get_image(path, repo_tag=None, config=None): """Return the image ID of the image extracted at `path`. """ @@ -130,7 +132,7 @@ def load(path, repo_tag, config): # things, loading the image from the dataset will tag the old neurodebian # image as the latest. image_id = "sha256:" + get_image(path, repo_tag, config) - if image_id not in _list_images(): + if image_id not in get_docker_image_ids(): lgr.debug("Loading %s", image_id) cmd = ["docker", "load"] p = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE) @@ -144,7 +146,7 @@ def load(path, repo_tag, config): else: lgr.debug("Image %s is already present", image_id) - if image_id not in _list_images(): + if image_id not in get_docker_image_ids(): raise RuntimeError( "docker image {} was not successfully loaded".format(image_id)) return image_id @@ -159,25 +161,7 @@ def cli_save(namespace): def cli_run(namespace): image_id = load(namespace.path, namespace.repo_tag, namespace.config) - prefix = ["docker", "run", - # FIXME: The -v/-w settings are convenient for testing, but they - # should be configurable. - "-v", "{}:/tmp".format(os.getcwd()), - "-w", "/tmp", - "--rm", - "--interactive"] - if not on_windows: - # Make it possible for the output files to be added to the - # dataset without the user needing to manually adjust the - # permissions. 
- prefix.extend(["-u", "{}:{}".format(os.getuid(), os.getgid())]) - - if sys.stdin.isatty(): - prefix.append("--tty") - prefix.append(image_id) - cmd = prefix + namespace.cmd - lgr.debug("Running %r", cmd) - sp.check_call(cmd) + docker_run(image_id, namespace.cmd) def main(args): @@ -228,20 +212,11 @@ def main(args): namespace = parser.parse_args(args[1:]) - logging.basicConfig( - level=logging.DEBUG if namespace.verbose else logging.INFO, - format="%(message)s") + setup_logger(logging.DEBUG if namespace.verbose else logging.INFO) namespace.func(namespace) if __name__ == "__main__": - try: + with log_and_exit(lgr): main(sys.argv) - except Exception as exc: - lgr.exception("Failed to execute %s", sys.argv) - if isinstance(exc, sp.CalledProcessError): - excode = exc.returncode - else: - excode = 1 - sys.exit(excode) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py new file mode 100644 index 00000000..6885c13a --- /dev/null +++ b/datalad_container/adapters/oci.py @@ -0,0 +1,402 @@ +"""Run a container from the image in a local OCI directory. + +This adapter uses Skopeo to save a Docker image (or any source that Skopeo +supports) to a local directory that's compliant with the "Open Container Image +Layout Specification" and can be tracked as objects in a DataLad dataset. + +This image can then be loaded on-the-fly in order for execution. Currently only +docker-run is supported (i.e. the image is loaded with Skopeo's +"docker-daemon:" transport), but the plan is to support podman-run (via the +"containers-storage:" transport) as well. + +Examples +-------- + +Save BusyBox 1.32 from Docker Hub to the local directory bb_1.32: + + $ python -m datalad_container.adapters.oci \\ + save docker://busybox:1.32 bb-1.32/ + +Load the image into the Docker daemon (if necessary) and run a command: + + $ python -m datalad_container.adapters.oci \\ + run bb_1.32/ sh -c 'busybox | head -1' + BusyBox v1.32.0 (2020-10-12 23:47:18 UTC) multi-call binary. 
+""" +# ^TODO: Add note about expected image ID mismatches (e.g., between the docker +# pulled entry and loaded one)? + +from collections import namedtuple +import json +import logging +from pathlib import Path +import re +import subprocess as sp +import sys + +from datalad_container.adapters.utils import ( + docker_run, + get_docker_image_ids, + log_and_exit, + setup_logger, +) + +lgr = logging.getLogger("datalad.container.adapters.oci") + +_IMAGE_SOURCE_KEY = "org.datalad.container.image.source" + + +def _normalize_reference(reference): + """Normalize a short repository name to a canonical one. + + Parameters + ---------- + reference : str + A Docker reference (e.g., "neurodebian", "library/neurodebian"). + + Returns + ------- + A fully-qualified reference (e.g., "docker.io/library/neurodebian") + + Note: This tries to follow containers/image's splitDockerDomain(). + """ + parts = reference.split("/", maxsplit=1) + if len(parts) == 1 or (not any(c in parts[0] for c in [".", ":"]) + and parts[0] != "localhost"): + domain, remainder = "docker.io", reference + else: + domain, remainder = parts + + if domain == "docker.io" and "/" not in remainder: + remainder = "library/" + remainder + return domain + "/" + remainder + + +Reference = namedtuple("Reference", ["name", "tag", "digest"]) + + +def parse_docker_reference(reference, normalize=False, strip_transport=False): + """Parse a Docker reference into a name, tag, and digest. + + Parameters + ---------- + reference : str + A Docker reference (e.g., "busybox" or "library/busybox:latest") + normalize : bool, optional + Whether to normalize short names like "busybox" to the fully qualified + name ("docker.io/library/busybox") + strip_transport : bool, optional + Remove Skopeo transport value ("docker://" or "docker-daemon:") from + the name. Unless this is true, reference should not include a + transport. 
+ + Returns + ------- + A Reference namedtuple with .name, .tag, and .digest attributes + """ + if strip_transport: + try: + reference = reference.split(":", maxsplit=1)[1] + except IndexError: + raise ValueError("Reference did not have transport: {}" + .format(reference)) + if reference.startswith("//"): + reference = reference[2:] + + parts = reference.split("/") + last = parts[-1] + if "@" in last: + sep = "@" + elif ":" in last: + sep = ":" + else: + sep = None + + tag = None + digest = None + if sep: + repo, label = last.split(sep) + front = "/".join(parts[:-1] + [repo]) + if sep == "@": + digest = label + else: + tag = label + else: + front, tag = "/".join(parts), None + if normalize: + front = _normalize_reference(front) + return Reference(front, tag, digest) + + +def _store_annotation(path, key, value): + """Set a value of image's org.datalad.container.image.source annotation. + + Parameters + ---------- + path : pathlib.Path + Image directory. It must contain only one image. + key, value : str + Key and value to store in the image's "annotations" field. + """ + index = path / "index.json" + index_info = json.loads(index.read_text()) + annot = index_info["manifests"][0].get("annotations", {}) + + annot[key] = value + index_info["manifests"][0]["annotations"] = annot + with index.open("w") as fh: + json.dump(index_info, fh) + + +def _get_annotation(path, key): + """Return value for `key` in an image's annotation. + + Parameters + ---------- + path : pathlib.Path + Image directory. It must contain only one image. + key : str + Key in the image's "annotations" field. + + Returns + ------- + str or None + """ + index = path / "index.json" + index_info = json.loads(index.read_text()) + # Assume one manifest because skopeo-inspect would fail anyway otherwise. + return index_info["manifests"][0].get("annotations", {}).get(key) + + +def save(image, path): + """Save an image to an OCI-compliant directory. 
+ + Parameters + ---------- + image : str + A source image accepted by skopeo-copy + path : pathlib.Path + Directory to copy the image to + """ + # Refuse to work with non-empty directory if it's not empty by letting the + # OSError through. Multiple images can be saved to an OCI directory, but + # run() and get_image_id() don't support a way to pull out a specific one. + try: + path.rmdir() + except FileNotFoundError: + pass + except OSError as exc: + raise OSError(exc) from None + path.mkdir(parents=True) + dest = "oci:" + str(path) + tag = parse_docker_reference(image).tag + if tag: + dest += ":" + tag + sp.run(["skopeo", "copy", image, dest], check=True) + _store_annotation(path, _IMAGE_SOURCE_KEY, image) + + +def link(ds, path, reference): + """Add Docker registry URLs to annexed layer images. + + Parameters + ---------- + ds : Dataset + path : pathlib.Path + Absolute path to the image directory. + reference : str + Docker reference (e.g., "busybox:1.32"). This should not include the + transport (i.e. "docker://"). + """ + from datalad.downloaders.providers import Providers + from datalad.support.exceptions import CommandError + from datalad_container.utils import ensure_datalad_remote + + res = sp.run(["skopeo", "inspect", "oci:" + str(path)], + stdout=sp.PIPE, stderr=sp.PIPE, + universal_newlines=True, check=True) + info = json.loads(res.stdout) + + ref = parse_docker_reference(reference, normalize=True) + registry, name = ref.name.split("/", maxsplit=1) + + # Docker Hub has a special endpoint, all others follow the pattern + # https://{registry}/v2/ + if registry == "docker.io": + endpoint = "https://registry-1.docker.io/v2/" + else: + endpoint = f"https://{registry}/v2/" + provider = Providers.from_config_files().get_provider( + endpoint + name, only_nondefault=True) + if not provider: + lgr.warning("Required Datalad provider configuration " + "for Docker registry links not detected. 
We will enable 'datalad' " + "special remote anyways but datalad might issue warnings later on.") + + layers = {} # path => digest + for layer in info["Layers"]: + algo, digest = layer.split(":") + layer_path = path / "blobs" / algo / digest + layers[layer_path] = layer + + ds_repo = ds.repo + checked_dl_remote = False + for st in ds.status(layers.keys(), annex="basic", result_renderer=None): + if "keyname" in st: + if not checked_dl_remote: + ensure_datalad_remote(ds_repo) + checked_dl_remote = True + path = Path(st["path"]) + url = "{}{}/blobs/{}".format(endpoint, name, layers[path]) + try: + ds_repo.add_url_to_file( + path, url, batch=True, options=['--relaxed']) + except CommandError as exc: + lgr.warning("Registering %s with %s failed: %s", + path, url, exc) + else: + lgr.warning("Skipping non-annexed layer: %s", st["path"]) + + +def get_image_id(path): + """Return a directory's image ID. + + Parameters + ---------- + path : pathlib.Path + Image directory. It must contain only one image. + + Returns + ------- + An image ID (str) + """ + # Note: This adapter depends on one image per directory. If, outside of + # this adapter interface, multiple images were stored in a directory, this + # will inspect call fails with a reasonable message. + res = sp.run(["skopeo", "inspect", "--raw", "oci:" + str(path)], + stdout=sp.PIPE, stderr=sp.PIPE, + universal_newlines=True, check=True) + info = json.loads(res.stdout) + return info["config"]["digest"] + + +def load(path): + """Load OCI image from `path`. + + Currently, the only supported load destination is the Docker daemon. + + Parameters + ---------- + path : pathlib.Path + An OCI-compliant directory such as the one generated by `save`. It must + contain only one image. 
+ + Returns + ------- + An image ID (str) + """ + image_id = get_image_id(path) + if image_id not in get_docker_image_ids(): + lgr.debug("Loading %s", image_id) + # The image is copied with a datalad-container/ prefix to reduce the + # chance of collisions with existing names registered with the Docker + # daemon. While we must specify _something_ for the name and tag in + # order to copy it, the particular values don't matter for execution + # purposes; they're chosen to help users identify the container in the + # `docker images` output. + source = _get_annotation(path, _IMAGE_SOURCE_KEY) + if source: + ref = parse_docker_reference(source, strip_transport=True) + name = ref.name + if ref.tag: + tag = ref.tag + else: + if ref.digest: + tag = "source-" + ref.digest.replace(":", "-")[:14] + else: + tag = "latest" + + else: + name = re.sub("[^a-z0-9-_.]", "", path.name.lower()[:10]) + tag = image_id.replace(":", "-")[:14] + + lgr.debug("Copying %s to Docker daemon", image_id) + sp.run(["skopeo", "copy", "oci:" + str(path), + # This load happens right before the command executes. Don't + # let the output be confused for the command's output. + "--quiet", + "docker-daemon:datalad-container/{}:{}".format(name, tag)], + check=True) + else: + lgr.debug("Image %s is already present", image_id) + return image_id + + +# Command-line + + +def cli_save(namespace): + save(namespace.image, namespace.path) + + +def cli_run(namespace): + image_id = load(namespace.path) + docker_run(image_id, namespace.cmd) + + +def main(args): + import argparse + + parser = argparse.ArgumentParser( + prog="python -m datalad_container.adapters.oci", + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument( + "-v", "--verbose", + action="store_true") + + subparsers = parser.add_subparsers(title="subcommands") + # Don't continue without a subcommand. 
+ subparsers.required = True + subparsers.dest = "command" + + parser_save = subparsers.add_parser( + "save", + help="save an image to a directory") + parser_save.add_argument( + "image", metavar="NAME", + help="image to save") + parser_save.add_argument( + "path", metavar="PATH", type=Path, + help="directory to save image in") + parser_save.set_defaults(func=cli_save) + + parser_run = subparsers.add_parser( + "run", + help="run a command with a directory's image") + + # TODO: Support containers-storage/podman. This would need to be fed + # through cli_run() and load(). Also, a way to specify it should probably + # be available through containers-add. + # parser_run.add_argument( + # "--dest", metavar="TRANSPORT", + # choices=["docker-daemon", "containers-storage"], + # ...) + parser_run.add_argument( + "path", metavar="PATH", type=Path, + help="image directory") + parser_run.add_argument( + "cmd", metavar="CMD", nargs=argparse.REMAINDER, + help="command to execute") + parser_run.set_defaults(func=cli_run) + + namespace = parser.parse_args(args[1:]) + + setup_logger(logging.DEBUG if namespace.verbose else logging.INFO) + + namespace.func(namespace) + + +if __name__ == "__main__": + with log_and_exit(lgr): + main(sys.argv) diff --git a/datalad_container/adapters/tests/test_oci.py b/datalad_container/adapters/tests/test_oci.py new file mode 100644 index 00000000..b6d4a4f5 --- /dev/null +++ b/datalad_container/adapters/tests/test_oci.py @@ -0,0 +1,125 @@ +"""Test of oci adapter that do not depend on skopeo or docker being installed. + + +See datalad_container.adapters.tests.test_oci_more for tests that do depend on +this. 
+""" + +import json + +import pytest + +from datalad.utils import Path +from datalad_container.adapters import oci +from datalad.tests.utils_pytest import ( + assert_raises, + eq_, + with_tempfile, +) + +# parse_docker_reference + + +def test_parse_docker_reference(): + eq_(oci.parse_docker_reference("neurodebian").name, + "neurodebian") + + +def test_parse_docker_reference_normalize(): + fn = oci.parse_docker_reference + for name in ["neurodebian", + "library/neurodebian", + "docker.io/neurodebian"]: + eq_(fn(name, normalize=True).name, + "docker.io/library/neurodebian") + + eq_(fn("quay.io/skopeo/stable", normalize=True).name, + "quay.io/skopeo/stable") + eq_(fn("ghcr.io/astral-sh/uv", normalize=True).name, + "ghcr.io/astral-sh/uv") + eq_(fn("gcr.io/my-project/my-image", normalize=True).name, + "gcr.io/my-project/my-image") + + +def test_parse_docker_reference_tag(): + fn = oci.parse_docker_reference + eq_(fn("busybox:1.32"), + ("busybox", "1.32", None)) + eq_(fn("busybox:1.32", normalize=True), + ("docker.io/library/busybox", "1.32", None)) + eq_(fn("docker.io/library/busybox:1.32"), + ("docker.io/library/busybox", "1.32", None)) + + +@pytest.mark.ai_generated +def test_parse_docker_reference_alternative_registries(): + """Test parsing references from alternative registries like quay.io and ghcr.io.""" + fn = oci.parse_docker_reference + + # Test quay.io with tag + eq_(fn("quay.io/linuxserver.io/baseimage-alpine:3.18"), + ("quay.io/linuxserver.io/baseimage-alpine", "3.18", None)) + + # Test ghcr.io with tag + eq_(fn("ghcr.io/astral-sh/uv:latest"), + ("ghcr.io/astral-sh/uv", "latest", None)) + + # Test gcr.io with tag + eq_(fn("gcr.io/my-project/my-image:v1.0"), + ("gcr.io/my-project/my-image", "v1.0", None)) + + +def test_parse_docker_reference_digest(): + fn = oci.parse_docker_reference + id_ = "sha256:a9286defaba7b3a519d585ba0e37d0b2cbee74ebfe590960b0b1d6a5e97d1e1d" + eq_(fn("busybox@{}".format(id_)), + ("busybox", None, id_)) + 
eq_(fn("busybox@{}".format(id_), normalize=True), + ("docker.io/library/busybox", None, id_)) + eq_(fn("docker.io/library/busybox@{}".format(id_)), + ("docker.io/library/busybox", None, id_)) + + +def test_parse_docker_reference_strip_transport(): + fn = oci.parse_docker_reference + eq_(fn("docker://neurodebian", strip_transport=True).name, + "neurodebian") + eq_(fn("docker-daemon:neurodebian", strip_transport=True).name, + "neurodebian") + + +def test_parse_docker_reference_strip_transport_no_transport(): + with assert_raises(ValueError): + oci.parse_docker_reference("neurodebian", strip_transport=True) + + +# _store_annotation and _get_annotation + +# This is the index.json contents of oci: copy of +# docker.io/library/busybox:1.32 +INDEX_VALUE = { + "schemaVersion": 2, + "manifests": [ + {"mediaType": "application/vnd.oci.image.manifest.v1+json", + "digest": "sha256:9f9f95fc6f6b24f0ab756a55b8326e8849ac6a82623bea29fc4c75b99ee166a3", + "size": 347}]} + + +@with_tempfile(mkdir=True) +def test_store_and_get_annotation(path=None): + path = Path(path) + with (path / "index.json").open("w") as fh: + json.dump(INDEX_VALUE, fh) + + eq_(oci._get_annotation(path, "org.opencontainers.image.ref.name"), + None) + + oci._store_annotation(path, "org.opencontainers.image.ref.name", "1.32") + eq_(oci._get_annotation(path, "org.opencontainers.image.ref.name"), + "1.32") + + oci._store_annotation(path, "another", "foo") + eq_(oci._get_annotation(path, "another"), + "foo") + eq_(oci._get_annotation(path, "org.opencontainers.image.ref.name"), + "1.32") diff --git a/datalad_container/adapters/tests/test_oci_more.py b/datalad_container/adapters/tests/test_oci_more.py new file mode 100644 index 00000000..1538820d --- /dev/null +++ b/datalad_container/adapters/tests/test_oci_more.py @@ -0,0 +1,164 @@ +"""Tests of oci adapter that depend on skopeo or docker being installed. + +See datalad_container.adapters.tests.test_oci for tests that do not depend on +this. 
+""" +from shutil import which + +from datalad.api import ( + Dataset, + # FIXME: This is needed to register the dataset method, at least when + # running this single test module. Shouldn't that no longer be the case + # after datalad's 4b056a251f (BF/RF: Automagically find and import a + # datasetmethod if not yet bound, 2019-02-10)? + containers_add, +) +from datalad.cmd import ( + StdOutErrCapture, + WitlessRunner, +) +from datalad.support.exceptions import CommandError +from datalad.consts import ( + DATALAD_SPECIAL_REMOTE, + DATALAD_SPECIAL_REMOTES_UUIDS, +) +from datalad_container.adapters import oci +from datalad_container.adapters.utils import get_docker_image_ids +from datalad.tests.utils_pytest import ( + assert_in, + eq_, + integration, + ok_, + skip_if_no_network, + SkipTest, + slow, + with_tempfile, +) +import pytest + +for dep in ["skopeo", "docker"]: + if not which(dep): + raise SkipTest("'{}' not found on path".format(dep)) + + +@skip_if_no_network +@integration +@slow # ~13s +@with_tempfile +def test_oci_add_and_run(path=None): + ds = Dataset(path).create(cfg_proc="text2git") + ds.containers_add(url="oci:docker://busybox:1.30", name="bb") + + image_path = ds.repo.pathobj / ".datalad" / "environments" / "bb" / "image" + image_id = oci.get_image_id(image_path) + existed = image_id in get_docker_image_ids() + + try: + out = WitlessRunner(cwd=ds.path).run( + ["datalad", "containers-run", "-n", "bb", + "sh -c 'busybox | head -1'"], + protocol=StdOutErrCapture) + finally: + if not existed: + WitlessRunner().run(["docker", "rmi", image_id]) + assert_in("BusyBox v1.30", out["stdout"]) + + from datalad.downloaders.providers import Providers + if not Providers.from_config_files().get_provider( + "https://registry-1.docker.io/v2/library", + only_nondefault=True): + # The rest of the test is about Docker Hub registry links, which + # require provider configuration for authentication. 
+ return + + layers = [r["path"] + for r in ds.status(image_path / "blobs", annex="basic", + result_renderer=None) + if "key" in r] + ok_(layers) + + dl_uuid = DATALAD_SPECIAL_REMOTES_UUIDS[DATALAD_SPECIAL_REMOTE] + for where_res in ds.repo.whereis(list(map(str, layers))): + assert_in(dl_uuid, where_res) + + +@pytest.mark.ai_generated +@skip_if_no_network +@integration +@slow +@pytest.mark.parametrize("registry,image_ref,container_name,test_cmd,expected_output", [ + ("docker.io", "busybox:1.30", "busybox", "sh -c 'busybox | head -1'", "BusyBox v1.30"), + ("gcr.io", "gcr.io/google-containers/busybox:latest", "busybox-gcr", "sh -c 'busybox | head -1'", "BusyBox"), + ("quay.io", "quay.io/prometheus/busybox:latest", "busybox-quay", "sh -c 'busybox | head -1'", "BusyBox"), +]) +def test_oci_alternative_registries(tmp_path, registry, image_ref, container_name, + test_cmd, expected_output): + """Test adding and running containers from alternative registries. + + Also verifies that: + - Annexed layer blobs have URLs registered in datalad or web remotes + - Files can be dropped and retrieved from remotes (tests the full cycle) + + Parameters + ---------- + registry : str + Registry name (for test identification) + image_ref : str + Full image reference (e.g., "ghcr.io/astral-sh/uv:latest") + container_name : str + Name to register the container under + test_cmd : str + Command to run in the container + expected_output : str + String expected to be in the command output + """ + ds = Dataset(str(tmp_path)).create(cfg_proc="text2git") + ds.containers_add(url=f"oci:docker://{image_ref}", name=container_name) + + image_path = ds.repo.pathobj / ".datalad" / "environments" / container_name / "image" + image_id = oci.get_image_id(image_path) + existed = image_id in get_docker_image_ids() + + try: + out = WitlessRunner(cwd=ds.path).run( + ["datalad", "containers-run", "-n", container_name, test_cmd], + protocol=StdOutErrCapture) + finally: + if not existed: + 
WitlessRunner().run(["docker", "rmi", image_id]) + + assert_in(expected_output, out["stdout"]) + + # Check that there are annexed files in the image blobs + blobs_path = image_path / "blobs" + annexed_blobs = [r["path"] + for r in ds.status(blobs_path, annex="basic", + result_renderer=None) + if "key" in r] + ok_(annexed_blobs, f"Expected to find annexed blobs in {blobs_path}") + + # Verify all annexed files are available from datalad or web remote + # (only check if datalad remote exists, which requires provider configuration) + # git annex find --not --in datalad --and --not --in web should be empty + result = WitlessRunner(cwd=ds.path).run( + ["git", "annex", "find", "--not", "--in", "datalad", + "--and", "--not", "--in", "web"] + annexed_blobs, + protocol=StdOutErrCapture) + eq_(result["stdout"].strip(), "", + "All annexed blobs should be available from datalad or web remote") + + # Drop all annexed content in the dataset + drop_results = ds.drop(".", result_renderer=None, on_failure='ignore') + print(f"Drop results for {registry}: {len(drop_results)} items") + for r in drop_results[:5]: # Print first 5 results + print(f" Dropped: {r.get('path', 'N/A')} - status: {r.get('status', 'N/A')}") + # Verify that something was actually dropped + ok_(drop_results, "Expected to drop some annexed files") + + # Get everything back to verify it can be retrieved from remotes + get_results = ds.get(".", result_renderer=None) + print(f"Get results for {registry}: {len(get_results)} items") + for r in get_results[:5]: # Print first 5 results + print(f" Retrieved: {r.get('path', 'N/A')} - status: {r.get('status', 'N/A')}") + # Verify that files were retrieved + ok_(get_results, "Expected to retrieve files from remotes") diff --git a/datalad_container/adapters/utils.py b/datalad_container/adapters/utils.py new file mode 100644 index 00000000..27e2895b --- /dev/null +++ b/datalad_container/adapters/utils.py @@ -0,0 +1,79 @@ +"""Utilities used across the adapters +""" + +import 
contextlib +import logging +import os +import subprocess as sp +import sys + +from datalad.utils import on_windows + +lgr = logging.getLogger("datalad.containers.adapters.utils") + + +def setup_logger(level): + logger = logging.getLogger("datalad.containers.adapters") + logger.setLevel(level) + if not logger.hasHandlers(): + # If this script is executed with the file name rather than the + # documented `python -m ...` invocation, we can't rely on DataLad's + # handler. Add a minimal one. + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter('%(message)s')) + logger.addHandler(handler) + + +@contextlib.contextmanager +def log_and_exit(logger): + try: + yield + except Exception as exc: + logger.exception("Failed to execute %s", sys.argv) + if isinstance(exc, sp.CalledProcessError): + excode = exc.returncode + if exc.stderr: + sys.stderr.write(exc.stderr) + else: + excode = 1 + sys.exit(excode) + + +def get_docker_image_ids(): + """Return IDs of all known images.""" + out = sp.run( + ["docker", "images", "--all", "--quiet", "--no-trunc"], + stdout=sp.PIPE, stderr=sp.PIPE, + universal_newlines=True, check=True) + return out.stdout.splitlines() + + +def docker_run(image_id, cmd): + """Execute `docker run`. + + Parameters + ---------- + image_id : str + ID of image to execute + cmd : list of str + Command to execute + """ + prefix = ["docker", "run", + # FIXME: The -v/-w settings are convenient for testing, but they + # should be configurable. + "-v", "{}:/tmp".format(os.getcwd()), + "-w", "/tmp", + "--rm", + "--interactive"] + if not on_windows: + # Make it possible for the output files to be added to the + # dataset without the user needing to manually adjust the + # permissions.
+ prefix.extend(["-u", "{}:{}".format(os.getuid(), os.getgid())]) + + if sys.stdin.isatty(): + prefix.append("--tty") + prefix.append(image_id) + full_cmd = prefix + cmd + lgr.debug("Running %r", full_cmd) + sp.run(full_cmd, check=True) diff --git a/datalad_container/conftest.py b/datalad_container/conftest.py index f6b40986..dd716f22 100644 --- a/datalad_container/conftest.py +++ b/datalad_container/conftest.py @@ -1,3 +1,33 @@ +import os +import sys + +import pytest + from datalad.conftest import setup_package from .tests.fixtures import * # noqa: F401, F403 # lgtm [py/polluting-import] + + +@pytest.fixture(scope="session", autouse=True) +def ensure_sys_executable_in_path(): + """Ensure sys.executable's directory is first in PATH for test duration. + + This is needed when tests spawn subprocesses that need to import modules + from the same Python environment that's running pytest. + """ + python_dir = os.path.dirname(sys.executable) + original_path = os.environ.get("PATH", "") + path_dirs = original_path.split(os.pathsep) + + # Check if python_dir is already first in PATH + if path_dirs and path_dirs[0] != python_dir: + # Put python_dir first, removing it from elsewhere if present + filtered_dirs = [d for d in path_dirs if d != python_dir] + new_path = os.pathsep.join([python_dir] + filtered_dirs) + os.environ["PATH"] = new_path + yield + # Restore original PATH + os.environ["PATH"] = original_path + else: + # PATH is already correct + yield diff --git a/datalad_container/containers_add.py b/datalad_container/containers_add.py index ab182224..ef72d7c2 100644 --- a/datalad_container/containers_add.py +++ b/datalad_container/containers_add.py @@ -31,8 +31,10 @@ ) from datalad.support.exceptions import InsufficientArgumentsError from datalad.support.param import Parameter +from datalad.utils import Path -from .utils import get_container_configuration +from .adapters import docker, oci +from .utils import get_container_configuration, ensure_datalad_remote, 
parse_registry_url lgr = logging.getLogger("datalad.containers.containers_add") @@ -63,20 +65,23 @@ def _resolve_img_url(url): return url -def _guess_call_fmt(ds, name, url): - """Helper to guess a container exec setup based on - - a name (to be able to look up more config - - a plain url to make inference based on the source location - - Should return `None` is no guess can be made. - """ - if url is None: - return None - elif url.startswith('shub://') or url.startswith('docker://'): - return 'singularity exec {img} {cmd}' - elif url.startswith('dhub://'): - # {python} is replaced with sys.executable on *execute* - return '{python} -m datalad_container.adapters.docker run {img} {cmd}' +# TODO Phase 4 backwards compatibility - restore execution format guessing +# def _guess_call_fmt(ds, name, url): +# """Helper to guess a container exec setup based on +# - a name (to be able to look up more config +# - a plain url to make inference based on the source location +# +# Should return `None` is no guess can be made. +# """ +# if url is None: +# return None +# elif url.startswith('shub://') or url.startswith('docker://'): +# return 'singularity exec {img} {cmd}' +# elif url.startswith('dhub://'): +# # {python} is replaced with sys.executable on *execute* +# return '{python} -m datalad_container.adapters.docker run {img} {cmd}' +# elif url.startswith('oci:'): +# return '{python} -m datalad_container.adapters.oci run {img} {cmd}' def _ensure_datalad_remote(repo): @@ -134,12 +139,19 @@ class ContainersAdd(Interface): 'docker://debian:stable-slim'), a command format string for Singularity-based execution will be auto-configured when [CMD: --call-fmt CMD][PY: call_fmt PY] is not specified. - For Docker-based container execution with the URL scheme 'dhub://', + For the scheme 'oci:', the rest of the URL will be interpreted as the + source argument to a `skopeo copy` call and the image will be saved + as an OCI-compliant directory at location specified by `name`. 
+ Similarly, there is a 'dhub://' scheme, where the rest of the URL will be interpreted as the argument to 'docker pull', the image will be saved to a location specified by `name`, and the call format will be auto-configured to run docker, unless overwritten. The auto-configured call to docker - run mounts the CWD to '/tmp' and sets the working directory to '/tmp'.""", + run mounts the CWD to '/tmp' and sets the working directory to '/tmp'. + However, using + the 'oci:' scheme is recommended if you have skopeo installed. The + call format for the 'oci:' and 'dhub://' schemes will be + auto-guessed if not given.""", metavar="URL", constraints=EnsureStr() | EnsureNone(), ), @@ -211,10 +223,11 @@ def __call__(name, url=None, dataset=None, call_fmt=None, image=None, runner = WitlessRunner() # prevent madness in the config file - if not re.match(r'^[0-9a-zA-Z-]+$', name): + # Allow name:version format (e.g., 'alpine:latest', 'mriqc:23.1.0') + if not re.match(r'^[0-9a-zA-Z-]+(:[0-9a-zA-Z._-]+)?$', name): raise ValueError( - "Container names can only contain alphanumeric characters " - "and '-', got: '{}'".format(name)) + "Container names must be 'name' or 'name:version' " + "(alphanumeric, '-', '_', '.'), got: '{}'".format(name)) container_cfg = get_container_configuration(ds, name) if 'image' in container_cfg: @@ -242,15 +255,18 @@ def __call__(name, url=None, dataset=None, call_fmt=None, image=None, image = image or container_cfg.get("image") if not image: - loc_cfg_var = "datalad.containers.location" - container_loc = \ - ds.config.obtain( - loc_cfg_var, - # if not False it would actually modify the - # dataset config file -- undesirable - store=False, - ) - image = op.join(ds.path, container_loc, name, 'image') + # Parse name:version format if provided + if ':' in name: + base_name, version = name.split(':', 1) + else: + base_name = name + # Extract version from URL tag, default to 'latest' + parsed = parse_registry_url(url) + version = parsed['tag'] if parsed else 
'latest' + + # New versioned path: .datalad/containers/images///image/ + image = op.join(ds.path, '.datalad', 'containers', 'images', + base_name, version, 'image') else: image = op.join(ds.path, image) @@ -261,9 +277,10 @@ def __call__(name, url=None, dataset=None, call_fmt=None, image=None, logger=lgr, ) - if call_fmt is None: - # maybe built in knowledge can help - call_fmt = _guess_call_fmt(ds, name, url) + # TODO Phase 4 backwards compatibility + # if call_fmt is None: + # # maybe built in knowledge can help + # call_fmt = _guess_call_fmt(ds, name, url) # collect bits for a final and single save() call to_save = [] @@ -281,44 +298,68 @@ def __call__(name, url=None, dataset=None, call_fmt=None, image=None, imgurl = _resolve_img_url(url) lgr.debug('Attempt to obtain container image from: %s', imgurl) - if url.startswith("dhub://"): - from .adapters import docker - - docker_image = url[len("dhub://"):] - - lgr.debug( - "Running 'docker pull %s and saving image to %s", - docker_image, image) - runner.run(["docker", "pull", docker_image]) - docker.save(docker_image, image) - elif url.startswith("docker://"): - image_dir, image_basename = op.split(image) - if not image_basename: - raise ValueError("No basename in path {}".format(image)) - if image_dir and not op.exists(image_dir): - os.makedirs(image_dir) - lgr.info("Building Singularity image for %s " - "(this may take some time)", - url) - runner.run(["singularity", "build", image_basename, url], - cwd=image_dir or None) + # Check for OCI registry URLs (docker://, quay://, ghcr://) + parsed = parse_registry_url(url) + if parsed: + lgr.info("Saving OCI image from %s", url) + oci.save(parsed['skopeo_url'], Path(image)) + + # Link layers to git-annex with registry URLs + reference = f"{parsed['registry']}/{parsed['name']}" + if parsed['tag'] and parsed['tag'] != 'latest': + reference += f":{parsed['tag']}" + oci.link(ds, Path(image), reference) + + # TODO: add --load flag to make this optional + # Load image into 
Docker daemon so it's ready to use + image_id = oci.load(Path(image)) + # Tag with meaningful name based on container name + if ':' in name: + docker_name = f"datalad-container/{name}" + else: + docker_name = f"datalad-container/{name}:{parsed['tag']}" + runner.run(["docker", "tag", image_id, docker_name]) + # TODO: this log gets buried - surface docker_name in final result message + lgr.info("Loaded image into Docker daemon: %s", docker_name) + + # TODO Phase 4 backwards compatibility - dhub:// scheme + # elif url.startswith("dhub://"): + # docker_image = url[len("dhub://"):] + # + # lgr.debug( + # "Running 'docker pull %s and saving image to %s", + # docker_image, image) + # runner.run(["docker", "pull", docker_image]) + # docker.save(docker_image, image) + + # TODO Phase 4 backwards compatibility - oci: scheme (use docker://, quay://, ghcr:// instead) + # elif url.startswith("oci:"): + # oci.save(url[len("oci:"):], Path(image)) + elif op.exists(url): lgr.info("Copying local file %s to %s", url, image) image_dir = op.dirname(image) if image_dir and not op.exists(image_dir): os.makedirs(image_dir) copyfile(url, image) + # TODO Phase 4 backwards compatibility - shub:// and generic URL handling + # else: + # if _HAS_SHUB_DOWNLOADER and url.startswith('shub://'): + # ensure_datalad_remote(ds.repo) + # + # try: + # ds.repo.add_url_to_file(image, imgurl) + # except Exception as e: + # result["status"] = "error" + # result["message"] = str(e) + # yield result + else: - if _HAS_SHUB_DOWNLOADER and url.startswith('shub://'): - _ensure_datalad_remote(ds.repo) - - try: - ds.repo.add_url_to_file(image, imgurl) - except Exception as e: - result["status"] = "error" - result["message"] = str(e) - yield result + raise ValueError( + f"Unsupported URL scheme: {url}. " + "Supported schemes: docker://, quay://, ghcr://" + ) # TODO do we have to take care of making the image executable # if --call_fmt is not provided?
to_save.append(image) @@ -344,11 +385,12 @@ def __call__(name, url=None, dataset=None, call_fmt=None, image=None, # always store a POSIX path, relative to dataset root str(PurePosixPath(Path(image).relative_to(ds.pathobj))), force=True) - if call_fmt: - ds.config.set( - "{}.cmdexec".format(cfgbasevar), - call_fmt, - force=True) + # TODO Phase 4 backwards compatibility + # if call_fmt: + # ds.config.set( + # "{}.cmdexec".format(cfgbasevar), + # call_fmt, + # force=True) # --extra-input sanity check # TODO: might also want to do that for --call-fmt above? extra_input_placeholders = dict(img_dirpath="", img_dspath="") diff --git a/datalad_container/containers_profiles.py b/datalad_container/containers_profiles.py new file mode 100644 index 00000000..2ecebeae --- /dev/null +++ b/datalad_container/containers_profiles.py @@ -0,0 +1,105 @@ +"""List execution profiles known to a dataset""" + +__docformat__ = 'restructuredtext' + +import logging +import os.path as op + +import datalad.support.ansi_colors as ac +from datalad.distribution.dataset import ( + EnsureDataset, + datasetmethod, + require_dataset, +) +from datalad.interface.base import ( + Interface, + build_doc, + eval_results, +) +from datalad.interface.results import get_status_dict +from datalad.interface.utils import default_result_renderer +from datalad.support.constraints import EnsureNone +from datalad.support.param import Parameter +from datalad.ui import ui + +from datalad_container.profiles import list_profiles, load_profile + +lgr = logging.getLogger("datalad.containers.containers_profiles") + + +@build_doc +class ContainersProfiles(Interface): + """List execution profiles known to a dataset + + Profiles are YAML files in .datalad/containers/profiles/ that define + 'image' and 'exec' template for running containers. + """ + + result_renderer = 'tailored' + + _params_ = dict( + dataset=Parameter( + args=("-d", "--dataset"), + doc="""specify the dataset to query. 
If no dataset is given, an + attempt is made to identify the dataset based on the current + working directory""", + constraints=EnsureDataset() | EnsureNone()), + ) + + @staticmethod + @datasetmethod(name='containers_profiles') + @eval_results + def __call__(dataset=None): + ds = require_dataset(dataset, check_installed=True, + purpose='list profiles') + refds = ds.path + + profiles = list_profiles(ds) + + for p in profiles: + # Load profile to get resolved image/exec + try: + profile_data = load_profile(ds, p['name']) + image = profile_data.get('image', '') + exec_ = profile_data.get('exec', '') + extends = None + # Check if this profile extends another + with open(op.join(ds.path, p['path'])) as f: + import yaml + raw = yaml.safe_load(f) + extends = raw.get('extends') if raw else None + except Exception as exc: + lgr.warning("Failed to load profile %s: %s", p['name'], exc) + image = '' + exec_ = '' + extends = None + + res = get_status_dict( + status='ok', + action='containers_profiles', + name=p['name'], + type='file', + path=op.join(ds.path, p['path']), + refds=refds, + parentds=ds.path, + image=image, + exec=exec_, + ) + if extends: + res['extends'] = extends + yield res + + @staticmethod + def custom_result_renderer(res, **kwargs): + if res["action"] != "containers_profiles": + default_result_renderer(res) + else: + extends_info = "" + if res.get("extends"): + extends_info = f" (extends: {res['extends']})" + ui.message( + "{name}{extends} -> {image}" + .format( + name=ac.color_word(res["name"], ac.MAGENTA), + extends=extends_info, + image=res.get("image", ""))) diff --git a/datalad_container/containers_run.py b/datalad_container/containers_run.py index c3db7929..ed877ff8 100644 --- a/datalad_container/containers_run.py +++ b/datalad_container/containers_run.py @@ -26,6 +26,7 @@ from datalad.utils import ensure_iter from datalad_container.find_container import find_container_ +from datalad_container.profiles import load_profile, validate_profile lgr = 
logging.getLogger("datalad.containers.containers_run") @@ -41,6 +42,29 @@ metavar="NAME", doc="""Specify the name of or a path to a known container to use for execution, in case multiple containers are configured."""), + image=Parameter( + args=('--image',), + metavar="NAME", + doc="""Name of the image to use (e.g., 'alpine:latest' or 'mriqc:23.1.0'). + Resolves to .datalad/containers/images///image. + When specified, --exec is required."""), + exec_=Parameter( + args=('--exec',), + dest='exec_', + metavar="TEMPLATE", + doc="""Execution template string. Placeholders: + {img} = Docker image name (datalad-container/name:version), + {img_path} = OCI directory path (.datalad/containers/images/...), + {cmd} = command to run. + Examples: 'docker run --rm {img} {cmd}' or + 'apptainer exec oci:{img_path} {cmd}'. + Required when --image is specified without --profile."""), + profile=Parameter( + args=('--profile',), + metavar="NAME", + doc="""Name of an execution profile in .datalad/containers/profiles/. + Profiles define 'image' and 'exec' template. Can be overridden with + --image and/or --exec flags. 
Example: 'docker-default'."""), ) @@ -79,13 +103,113 @@ class ContainersRun(Interface): @eval_results def __call__(cmd, container_name=None, dataset=None, inputs=None, outputs=None, message=None, expand=None, - explicit=False, sidecar=None): + explicit=False, sidecar=None, image=None, exec_=None, + profile=None): from unittest.mock import \ patch # delayed, since takes long (~600ms for yoh) pwd, _ = get_command_pwds(dataset) ds = require_dataset(dataset, check_installed=True, purpose='run a containerized command execution') + # Phase 3: Load profile and merge with CLI overrides + if profile is not None: + try: + profile_data = load_profile(ds, profile) + except FileNotFoundError as exc: + yield get_status_dict( + 'run', + ds=ds, + status='error', + message=str(exc)) + return + except ValueError as exc: + yield get_status_dict( + 'run', + ds=ds, + status='error', + message=str(exc)) + return + + # CLI overrides take precedence over profile + if image is None: + image = profile_data.get('image') + if exec_ is None: + exec_ = profile_data.get('exec') + + # Validate: exec required, image validated if present + try: + validate_profile({'image': image, 'exec': exec_, '_name': profile}, ds) + except ValueError as exc: + yield get_status_dict( + 'run', + ds=ds, + status='error', + message=str(exc)) + return + + # Profile without image requires --image CLI arg + if image is None: + yield get_status_dict( + 'run', + ds=ds, + status='error', + message=f"Profile '{profile}' has no image. 
" + "Specify --image to use this profile.") + return + + # New Phase 2 path: --image and --exec specified directly (or from profile) + if image is not None: + if exec_ is None: + yield get_status_dict( + 'run', + ds=ds, + status='error', + message='--exec is required when --image is specified') + return + + # Parse image name:version + if ':' in image: + base_name, version = image.split(':', 1) + else: + base_name, version = image, 'latest' + + # Resolve paths and names + docker_image = f"datalad-container/{base_name}:{version}" + image_path = f".datalad/containers/images/{base_name}/{version}/image" + + # Expand placeholders in exec template + cmd = normalize_command(cmd) + try: + resolved_cmd = exec_.format( + img=docker_image, + img_path=image_path, + cmd=cmd, + ) + except KeyError as exc: + yield get_status_dict( + 'run', + ds=ds, + status='error', + message=('Unrecognized --exec placeholder: %s. ' + 'Available: {img}, {img_path}, {cmd}', exc)) + return + + with patch.dict('os.environ', + {CONTAINER_NAME_ENVVAR: image}): + for r in run_command( + cmd=resolved_cmd, + dataset=dataset or (ds if ds.path == pwd else None), + inputs=inputs, + extra_inputs=[image_path], + outputs=outputs, + message=message, + expand=expand, + explicit=explicit, + sidecar=sidecar): + yield r + return + + # Legacy path: use -n/--container-name with config lookup # this following block locates the target container. this involves a # configuration look-up. 
This is not using # get_container_configuration(), because it needs to account for a diff --git a/datalad_container/profiles.py b/datalad_container/profiles.py new file mode 100644 index 00000000..30be0bb4 --- /dev/null +++ b/datalad_container/profiles.py @@ -0,0 +1,201 @@ +"""Execution profile loading and resolution for containers-run""" + +import logging +import os.path as op +from pathlib import Path + +import yaml + +from datalad.distribution.dataset import Dataset + +lgr = logging.getLogger("datalad.containers.profiles") + +# Directory where profiles are stored within a dataset +PROFILES_DIR = ".datalad/containers/profiles" + + +def get_profile_path(ds: Dataset, profile_name: str) -> Path: + """Convert profile name to file path. + + Parameters + ---------- + ds : Dataset + Dataset to look in + profile_name : str + Profile name (e.g., 'docker-default') + + Returns + ------- + Path + Full path to profile YAML file + """ + return Path(ds.path) / PROFILES_DIR / f"{profile_name}.yaml" + + +def load_profile(ds: Dataset, profile_name: str) -> dict: + """Load a profile YAML file and resolve extends chain. 
+ + Parameters + ---------- + ds : Dataset + Dataset containing the profile + profile_name : str + Profile name (without .yaml extension) + + Returns + ------- + dict + Resolved profile with 'image' and 'exec' keys + + Raises + ------ + FileNotFoundError + If profile file doesn't exist + ValueError + If profile is invalid or has circular extends + """ + profile_path = get_profile_path(ds, profile_name) + + if not profile_path.exists(): + raise FileNotFoundError( + f"Profile '{profile_name}' not found at {profile_path}" + ) + + with open(profile_path) as f: + profile = yaml.safe_load(f) + + if profile is None: + raise ValueError(f"Profile '{profile_name}' is empty") + + # Resolve extends chain + profile = _resolve_extends(profile, ds, seen={profile_name}) + + # Add metadata + profile['_name'] = profile_name + profile['_source'] = str(profile_path.relative_to(ds.path)) + + return profile + + +def _resolve_extends(profile: dict, ds: Dataset, seen: set) -> dict: + """Recursively resolve extends chain with clobber semantics. 
+ + Parameters + ---------- + profile : dict + Profile data with optional 'extends' key + ds : Dataset + Dataset containing profiles + seen : set + Profile names already visited (for cycle detection) + + Returns + ------- + dict + Resolved profile with parent values clobbered by child + """ + if 'extends' not in profile: + return profile + + parent_name = profile['extends'] + + if parent_name in seen: + raise ValueError( + f"Circular profile inheritance detected: {parent_name} " + f"already in chain {seen}" + ) + + seen.add(parent_name) + + # Load parent profile + parent_path = get_profile_path(ds, parent_name) + if not parent_path.exists(): + raise FileNotFoundError( + f"Extended profile '{parent_name}' not found at {parent_path}" + ) + + with open(parent_path) as f: + parent = yaml.safe_load(f) + + if parent is None: + raise ValueError(f"Extended profile '{parent_name}' is empty") + + # Recursively resolve parent's extends + parent = _resolve_extends(parent, ds, seen) + + # Clobber: child values completely replace parent values + resolved = dict(parent) + for key, value in profile.items(): + if key != 'extends': + resolved[key] = value + + return resolved + + +def validate_profile(profile: dict, ds: Dataset) -> None: + """Validate that a profile is usable. 
+ + Parameters + ---------- + profile : dict + Resolved profile (may have 'image' and/or 'exec' keys) + ds : Dataset + Dataset to check image in + + Raises + ------ + ValueError + If exec missing, or if image specified but doesn't exist + """ + if 'exec' not in profile: + raise ValueError( + f"Profile '{profile.get('_name', 'unknown')}' missing required 'exec' key" + ) + + # image is optional (base profiles may omit it, requiring --image CLI arg) + # but if specified, it must exist + image = profile.get('image') + if image is None: + return + + # Parse image name:version + if ':' in image: + base_name, version = image.split(':', 1) + else: + base_name, version = image, 'latest' + + # Check image directory exists + image_path = Path(ds.path) / '.datalad' / 'containers' / 'images' / base_name / version / 'image' + if not image_path.exists(): + raise ValueError( + f"Profile '{profile.get('_name', 'unknown')}' references image " + f"'{image}' but no image found at {image_path}" + ) + + +def list_profiles(ds: Dataset) -> list: + """List available profiles in a dataset. 
+ + Parameters + ---------- + ds : Dataset + Dataset to list profiles from + + Returns + ------- + list of dict + List of profile info dicts with 'name' and 'path' keys + """ + profiles_dir = Path(ds.path) / PROFILES_DIR + + if not profiles_dir.exists(): + return [] + + profiles = [] + for path in sorted(profiles_dir.glob("*.yaml")): + profiles.append({ + 'name': path.stem, + 'path': str(path.relative_to(ds.path)), + }) + + return profiles diff --git a/datalad_container/tests/test_add.py b/datalad_container/tests/test_utils.py similarity index 81% rename from datalad_container/tests/test_add.py rename to datalad_container/tests/test_utils.py index 6a002174..863b965a 100644 --- a/datalad_container/tests/test_add.py +++ b/datalad_container/tests/test_utils.py @@ -13,10 +13,7 @@ ) from datalad.utils import Path -from datalad_container.containers_add import _ensure_datalad_remote - -# NOTE: At the moment, testing of the containers-add itself happens implicitly -# via use in other tests. +from datalad_container.utils import ensure_datalad_remote @with_tempfile @@ -24,7 +21,7 @@ def test_ensure_datalad_remote_init_and_enable_needed(path=None): ds = Dataset(path).create(force=True) repo = ds.repo assert_false(repo.get_remotes()) - _ensure_datalad_remote(repo) + ensure_datalad_remote(repo) assert_in("datalad", repo.get_remotes()) @@ -40,5 +37,5 @@ def test_ensure_datalad_remote_maybe_enable(path=None, *, autoenable): repo = ds_b.repo if not autoenable: assert_not_in("datalad", repo.get_remotes()) - _ensure_datalad_remote(repo) + ensure_datalad_remote(repo) assert_in("datalad", repo.get_remotes()) diff --git a/datalad_container/utils.py b/datalad_container/utils.py index 27d6b733..95729d8d 100644 --- a/datalad_container/utils.py +++ b/datalad_container/utils.py @@ -13,6 +13,75 @@ from datalad.distribution.dataset import Dataset from datalad.support.external_versions import external_versions +import logging + +lgr = logging.getLogger("datalad.containers.utils") + +# Registry 
URL schemes for Phase 1 +REGISTRY_SCHEMES = { + 'docker': 'docker.io', + # TODO: test and enable quay:// and ghcr:// schemes + # 'quay': 'quay.io', + # 'ghcr': 'ghcr.io', + # TODO Phase 4: support generic OCI registries via oci://registry.example.com/... +} + + +def parse_registry_url(url): + """Parse a registry URL into components. + + Parameters + ---------- + url : str + URL like 'docker://nipreps/mriqc:23.1.0' or 'quay://org/repo:tag' + + Returns + ------- + dict or None + If URL matches a known scheme, returns dict with keys: + - scheme: 'docker', 'quay', 'ghcr' + - registry: 'docker.io', 'quay.io', 'ghcr.io' + - name: 'nipreps/mriqc' + - tag: '23.1.0' or 'latest' if not specified + - skopeo_url: 'docker://docker.io/nipreps/mriqc:23.1.0' + + Returns None if URL doesn't match known schemes. + """ + if url is None: + return None + + for scheme, registry in REGISTRY_SCHEMES.items(): + prefix = f'{scheme}://' + if url.startswith(prefix): + remainder = url[len(prefix):] + + # Parse name and tag + if '@' in remainder: + # Digest reference like repo@sha256:abc... 
+ name = remainder.split('@')[0] + tag = 'latest' + elif ':' in remainder.split('/')[-1]: + # Tag in last component: org/repo:tag + name, tag = remainder.rsplit(':', 1) + else: + name, tag = remainder, 'latest' + + # Build skopeo URL + if tag == 'latest': + skopeo_url = f'docker://{registry}/{name}' + else: + skopeo_url = f'docker://{registry}/{name}:{tag}' + + return { + 'scheme': scheme, + 'registry': registry, + 'name': name, + 'tag': tag, + 'skopeo_url': skopeo_url, + } + + return None + def get_container_command(): for command in ["apptainer", "singularity"]: @@ -148,3 +217,25 @@ def _normalize_image_path(path: str, ds: Dataset) -> PurePath: assert pathobj is not None # we report in platform-conventions return PurePath(pathobj) + + +def ensure_datalad_remote(repo): + """Initialize and enable datalad special remote if it isn't already.""" + dl_remote = None + for info in repo.get_special_remotes().values(): + if info["externaltype"] == "datalad": + dl_remote = info["name"] + break + + if not dl_remote: + from datalad.consts import DATALAD_SPECIAL_REMOTE + from datalad.customremotes.base import init_datalad_remote + + init_datalad_remote(repo, DATALAD_SPECIAL_REMOTE, autoenable=True) + elif repo.is_special_annex_remote(dl_remote, check_if_known=False): + lgr.debug("datalad special remote '%s' is already enabled", + dl_remote) + else: + lgr.debug("datalad special remote '%s' found. 
Enabling", + dl_remote) + repo.enable_remote(dl_remote) diff --git a/demo.sh b/demo.sh new file mode 100755 index 00000000..dcd44cc2 --- /dev/null +++ b/demo.sh @@ -0,0 +1,252 @@ +#!/bin/bash +# demo.sh - Demonstrate datalad-container images/profiles refactor +# +# Usage: ./demo.sh [base_dir] +# base_dir defaults to /tmp + +set -e + +BASE_DIR="${1:-/tmp}" +DEMO_DIR="$BASE_DIR/datalad-container-demo-$$" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +section() { + echo "" + echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}" + echo -e "${BLUE} $1${NC}" + echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}" + echo "" +} + +info() { + echo -e "${GREEN}► $1${NC}" +} + +warn() { + echo -e "${YELLOW}⚠ $1${NC}" +} + +run_cmd() { + echo -e "${YELLOW}\$ $1${NC}" + eval "$1" +} + +show_last_commit() { + echo "" + info "Last commit:" + git --no-pager log -1 --oneline + echo "" +} + +# ============================================================================ +section "SETUP: Create demo dataset" +# ============================================================================ + +info "Creating demo directory: $DEMO_DIR" +mkdir -p "$DEMO_DIR" +cd "$DEMO_DIR" + +run_cmd "datalad create ." + +info "Adding alpine image..." 
+run_cmd "datalad containers-add alpine:latest --url docker://alpine:latest" + +# ============================================================================ +section "PHASE 1: containers-add + datalad run (raw docker command)" +# ============================================================================ + +info "Using datalad run with explicit docker command (no containers-run)" +run_cmd "datalad run --input .datalad/containers/images/alpine/latest/image --output phase1-output.txt -- docker run --rm datalad-container/alpine:latest sh -c 'echo Phase 1: raw datalad run > /dev/stdout'" + +# Create output manually since docker stdout doesn't redirect to file easily +echo "Phase 1: raw datalad run" > phase1-output.txt +git add phase1-output.txt && git commit --amend --no-edit + +show_last_commit + +# ============================================================================ +section "PHASE 2: containers-run --image --exec" +# ============================================================================ + +info "Using containers-run with explicit --image and --exec flags" +run_cmd "datalad containers-run --image alpine:latest --exec 'docker run --rm --user \$(id -u):\$(id -g) -v \$(pwd):/work -w /work {img} {cmd}' --output phase2-output.txt --expand outputs -- sh -c 'echo Phase 2: containers-run with --image --exec > {outputs}'" + +show_last_commit + +# ============================================================================ +section "PHASE 3a: Create profile with image, containers-run --profile" +# ============================================================================ + +info "Creating profiles directory and docker-alpine profile..." 
+mkdir -p .datalad/containers/profiles + +cat > .datalad/containers/profiles/docker-alpine.yaml << 'EOF' +image: alpine:latest +exec: docker run --rm --user $(id -u):$(id -g) -v $(pwd):/work -w /work {img} {cmd} +EOF + +info "Profile contents:" +cat .datalad/containers/profiles/docker-alpine.yaml + +info "Saving profile to dataset..." +run_cmd "datalad save -m 'Add docker-alpine profile' .datalad/containers/profiles/" + +run_cmd "datalad containers-run --profile docker-alpine --output phase3a-output.txt --expand outputs -- sh -c 'echo Phase 3a: profile with image > {outputs}'" + +show_last_commit + +# ============================================================================ +section "PHASE 3b: containers-list and containers-profiles output" +# ============================================================================ + +info "Listing containers:" +run_cmd "datalad containers-list" + +echo "" +info "Listing profiles:" +run_cmd "datalad containers-profiles" + +# ============================================================================ +section "PHASE 3c: Profile with extends (inheritance)" +# ============================================================================ + +info "Creating docker-alpine-env profile that extends docker-alpine..." 
+cat > .datalad/containers/profiles/docker-alpine-env.yaml << 'EOF' +extends: docker-alpine +exec: docker run --rm --user $(id -u):$(id -g) -v $(pwd):/work -w /work -e MY_VAR=hello-from-extended-profile {img} {cmd} +EOF + +info "Profile contents:" +cat .datalad/containers/profiles/docker-alpine-env.yaml + +run_cmd "datalad save -m 'Add docker-alpine-env profile' .datalad/containers/profiles/" + +run_cmd "datalad containers-run --profile docker-alpine-env --output phase3c-output.txt --expand outputs -- sh -c 'echo Phase 3c: MY_VAR=\$MY_VAR > {outputs}'" + +info "Output file contents:" +cat phase3c-output.txt + +show_last_commit + +# ============================================================================ +section "PHASE 3d: CLI override --exec" +# ============================================================================ + +info "Using --profile but overriding --exec to add environment variable..." +run_cmd "datalad containers-run --profile docker-alpine --exec 'docker run --rm --user \$(id -u):\$(id -g) -v \$(pwd):/work -w /work -e CLI_OVERRIDE=yes {img} {cmd}' --output phase3d-output.txt --expand outputs -- sh -c 'echo Phase 3d: CLI_OVERRIDE=\$CLI_OVERRIDE > {outputs}'" + +info "Output file contents:" +cat phase3d-output.txt + +show_last_commit + +# ============================================================================ +section "PHASE 3e: CLI override --image" +# ============================================================================ + +info "Adding busybox image..." +run_cmd "datalad containers-add busybox:latest --url docker://busybox:latest" + +info "Using docker-alpine profile but overriding --image to use busybox..." 
+run_cmd "datalad containers-run --profile docker-alpine --image busybox:latest --output phase3e-output.txt --expand outputs -- sh -c 'echo Phase 3e: running busybox instead of alpine > {outputs}'" + +info "Output file contents:" +cat phase3e-output.txt + +show_last_commit + +# ============================================================================ +section "PHASE 3f: Error case - missing image (early validation)" +# ============================================================================ + +info "Creating profile that references non-existent image..." +cat > .datalad/containers/profiles/bad-profile.yaml << 'EOF' +image: nonexistent:v999 +exec: docker run --rm {img} {cmd} +EOF + +run_cmd "datalad save -m 'Add bad-profile for testing' .datalad/containers/profiles/" + +info "Attempting to use bad-profile (should fail early)..." +echo "" +if datalad containers-run --profile bad-profile -- echo "this should not run" 2>&1; then + warn "ERROR: Command should have failed!" +else + info "Command failed as expected (early validation works)" +fi + +# Clean up bad profile +rm .datalad/containers/profiles/bad-profile.yaml +run_cmd "datalad save -m 'Remove bad-profile' .datalad/containers/profiles/" + +# ============================================================================ +section "PHASE 3g: Base profiles without image (require --image flag)" +# ============================================================================ + +info "Creating base profiles for different runtimes (no image specified)..." 
+ +cat > .datalad/containers/profiles/docker-default.yaml << 'EOF' +exec: docker run --rm --user $(id -u):$(id -g) -v $(pwd):/work -w /work {img} {cmd} +EOF + +cat > .datalad/containers/profiles/apptainer-default.yaml << 'EOF' +exec: apptainer exec oci:{img_path} {cmd} +EOF + +cat > .datalad/containers/profiles/podman-default.yaml << 'EOF' +exec: podman run --rm --userns=keep-id -v $(pwd):/work:Z -w /work {img} {cmd} +EOF + +run_cmd "datalad save -m 'Add base runtime profiles' .datalad/containers/profiles/" + +info "Listing all profiles:" +run_cmd "datalad containers-profiles" + +# Test docker-default (should work) +info "Testing docker-default profile with --image alpine:latest..." +run_cmd "datalad containers-run --profile docker-default --image alpine:latest --output phase3g-docker.txt --expand outputs -- sh -c 'echo Phase 3g: docker-default profile > {outputs}'" +show_last_commit + +# Test apptainer-default if available +if command -v apptainer &> /dev/null || command -v singularity &> /dev/null; then + info "Testing apptainer-default profile..." + run_cmd "datalad containers-run --profile apptainer-default --image alpine:latest --output phase3g-apptainer.txt --expand outputs -- sh -c 'echo Phase 3g: apptainer-default profile > {outputs}'" + show_last_commit +else + warn "Skipping apptainer test (apptainer/singularity not installed)" +fi + +# Test podman-default if available +if command -v podman &> /dev/null; then + info "Testing podman-default profile..." 
+ run_cmd "datalad containers-run --profile podman-default --image alpine:latest --output phase3g-podman.txt --expand outputs -- sh -c 'echo Phase 3g: podman-default profile > {outputs}'" + show_last_commit +else + warn "Skipping podman test (podman not installed)" +fi + +# ============================================================================ +section "DEMO COMPLETE" +# ============================================================================ + +info "Demo directory: $DEMO_DIR" +echo "" +info "Final git log:" +git --no-pager log --oneline + +echo "" +info "All containers:" +datalad containers-list + +echo "" +info "All profiles:" +datalad containers-profiles + +echo "" +echo -e "${GREEN}Demo completed successfully!${NC}" diff --git a/setup.cfg b/setup.cfg index c27d6c0d..ba3c1385 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,6 +16,7 @@ python_requires = >= 3.7 install_requires = datalad >= 0.18.0 requests>=1.2 # to talk to Singularity-hub + pyyaml # for execution profiles packages = find: include_package_data = True