From f6661bf2859b1ed804ea0e9c0c1084289b3c1d0f Mon Sep 17 00:00:00 2001 From: Vasu Jaganath Date: Wed, 25 Mar 2026 12:13:29 -0400 Subject: [PATCH 1/2] add cwl_builder and remove stale dependencies --- .github/workflows/fuzzy_compile_weekly.yml | 6 +- .github/workflows/lint_and_test.yml | 2 +- .github/workflows/lint_and_test_macos.yml | 2 +- .github/workflows/run_workflows.yml | 8 +- .github/workflows/run_workflows_weekly.yml | 6 +- pyproject.toml | 3 - src/sophios/apis/python/__init__.py | 78 + src/sophios/apis/python/cwl_builder.py | 1667 ++++++++++++++++++++ tests/test_compile_python_workflows.py | 74 +- tests/test_cwl_builder.py | 167 ++ 10 files changed, 1993 insertions(+), 20 deletions(-) create mode 100644 src/sophios/apis/python/cwl_builder.py create mode 100644 tests/test_cwl_builder.py diff --git a/.github/workflows/fuzzy_compile_weekly.yml b/.github/workflows/fuzzy_compile_weekly.yml index 421e1ed4..3b9215c6 100644 --- a/.github/workflows/fuzzy_compile_weekly.yml +++ b/.github/workflows/fuzzy_compile_weekly.yml @@ -108,7 +108,11 @@ jobs: # WIC Python API workflows as well as the WIC Python API itself. - name: Validate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest -k test_compile_python_workflows + run: cd sophios/ && pytest tests/test_compile_python_workflows.py -k test_validate_generated_python_workflows + + - name: PyTest CWL Builder + if: always() + run: cd sophios/ && pytest tests/test_cwl_builder.py -k test_cwl_builder # Since a randomly chosen subschema is used every time, repeat 10X for more coverage. diff --git a/.github/workflows/lint_and_test.yml b/.github/workflows/lint_and_test.yml index b62b03cb..f505f342 100644 --- a/.github/workflows/lint_and_test.yml +++ b/.github/workflows/lint_and_test.yml @@ -204,7 +204,7 @@ jobs: # Sophios Python API workflows as well as the Sophios Python API itself. 
- name: Validate sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest -k test_compile_python_workflows + run: cd sophios/ && pytest tests/test_compile_python_workflows.py -k test_validate_generated_python_workflows - name: Build Documentation if: always() diff --git a/.github/workflows/lint_and_test_macos.yml b/.github/workflows/lint_and_test_macos.yml index 4597a968..06dc0ee8 100644 --- a/.github/workflows/lint_and_test_macos.yml +++ b/.github/workflows/lint_and_test_macos.yml @@ -114,7 +114,7 @@ jobs: # Sophios Python API workflows as well as the Sophios Python API itself. - name: Validate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest -k test_compile_python_workflows + run: cd sophios/ && pytest tests/test_compile_python_workflows.py -k test_validate_generated_python_workflows - name: Build Documentation if: always() diff --git a/.github/workflows/run_workflows.yml b/.github/workflows/run_workflows.yml index f6cd92a6..e356ea3b 100644 --- a/.github/workflows/run_workflows.yml +++ b/.github/workflows/run_workflows.yml @@ -165,7 +165,11 @@ jobs: # WIC Python API workflows as well as the WIC Python API itself. - name: Validate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest -k test_compile_python_workflows + run: cd sophios/ && pytest tests/test_compile_python_workflows.py -k test_validate_generated_python_workflows + + - name: PyTest CWL Builder + if: always() + run: cd sophios/ && pytest tests/test_cwl_builder.py -k test_cwl_builder - name: cwl-docker-extract (i.e. 
recursively docker pull) if: always() @@ -242,4 +246,4 @@ jobs: commit_message: ${{ inputs.commit_message }} mm_workflows_ref: ${{ inputs.mm-workflows_ref }} workflow_success: ${{ env.workflow_success }} - access_token: ${{ steps.generate_token.outputs.token }} \ No newline at end of file + access_token: ${{ steps.generate_token.outputs.token }} diff --git a/.github/workflows/run_workflows_weekly.yml b/.github/workflows/run_workflows_weekly.yml index 5fbd1666..96368913 100644 --- a/.github/workflows/run_workflows_weekly.yml +++ b/.github/workflows/run_workflows_weekly.yml @@ -123,7 +123,11 @@ jobs: # Sophios Python API workflows as well as the WIC Python API itself. - name: Validate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest -k test_compile_python_workflows + run: cd sophios/ && pytest tests/test_compile_python_workflows.py -k test_validate_generated_python_workflows + + - name: PyTest CWL Builder + if: always() + run: cd sophios/ && pytest tests/test_cwl_builder.py -k test_cwl_builder - name: cwl-docker-extract (i.e. 
recursively docker pull) if: always() diff --git a/pyproject.toml b/pyproject.toml index 1dcad37e..67417b77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,9 +32,7 @@ dependencies = [ # CommandInputParameter attr changed from `type_` to `type` and back to `type_` # between versions 0.30, 0.31, 0.32 # See https://github.com/common-workflow-language/cwl-utils/releases/ - "typeguard", "pydantic>=2.6", - "pydantic-settings", "pydantic[email]", "docker", # FYI also need uidmap to run podman rootless @@ -42,7 +40,6 @@ dependencies = [ # We are using the official release for these packages for now "toil[cwl]", "fastapi", - "python-jose", "uvicorn", "referencing", "aiofiles" diff --git a/src/sophios/apis/python/__init__.py b/src/sophios/apis/python/__init__.py index bcbfbe27..62c36be6 100644 --- a/src/sophios/apis/python/__init__.py +++ b/src/sophios/apis/python/__init__.py @@ -15,19 +15,97 @@ global_config, set_input_Step_Workflow, ) +from .cwl_builder import ( + CWLBuilderValidationError, + CommandArgument, + CommandInput, + CommandLineBinding, + CommandLineToolBuilder, + CommandOutput, + CommandOutputBinding, + Dirent, + DockerRequirement, + EnvironmentDef, + EnvVarRequirement, + Field, + FieldSpec, + InitialWorkDirRequirement, + Input, + InputSpec, + InlineJavascriptRequirement, + InplaceUpdateRequirement, + LoadListingRequirement, + NetworkAccess, + Output, + OutputSpec, + ResourceRequirement, + SchemaDefRequirement, + SecondaryFile, + ShellCommandRequirement, + SoftwarePackage, + SoftwareRequirement, + ToolTimeLimit, + Type, + ValidationResult, + WorkReuse, + array_type, + enum_type, + record_field, + record_type, + secondary_file, + validate_cwl_document, +) __all__ = [ + "CWLBuilderValidationError", + "CommandArgument", + "CommandInput", + "CommandLineBinding", + "CommandLineToolBuilder", + "CommandOutput", + "CommandOutputBinding", + "Dirent", + "DockerRequirement", + "EnvironmentDef", + "EnvVarRequirement", + "Field", + "FieldSpec", "InvalidCLTError", 
"InvalidInputValueError", "InvalidLinkError", "InvalidStepError", + "InitialWorkDirRequirement", + "Input", + "InputSpec", + "InlineJavascriptRequirement", + "InplaceUpdateRequirement", + "LoadListingRequirement", "MissingRequiredValueError", + "NetworkAccess", + "Output", + "OutputSpec", "ProcessInput", "ProcessOutput", + "ResourceRequirement", + "SchemaDefRequirement", + "SecondaryFile", + "ShellCommandRequirement", + "SoftwarePackage", + "SoftwareRequirement", "Step", + "ToolTimeLimit", + "Type", "Workflow", "WorkflowInputReference", + "WorkReuse", + "array_type", + "enum_type", "extract_tools_paths_NONPORTABLE", "global_config", + "record_field", + "record_type", + "secondary_file", "set_input_Step_Workflow", + "ValidationResult", + "validate_cwl_document", ] diff --git a/src/sophios/apis/python/cwl_builder.py b/src/sophios/apis/python/cwl_builder.py new file mode 100644 index 00000000..1e7e0342 --- /dev/null +++ b/src/sophios/apis/python/cwl_builder.py @@ -0,0 +1,1667 @@ +"""Cleanroom CWL v1.2 CommandLineTool builder. + +This module is intentionally separate from the workflow DSL. It is a plain +Python authoring layer for CWL CommandLineTool documents with three goals: + +1. cover the common 90% of real CLT authoring cleanly, +2. validate generated documents through the cwltool/schema-salad stack, and +3. leave raw escape hatches for the remaining awkward corners of the spec. + +Recommended style +----------------- +Prefer the structured helpers: + +```python +tool = ( + CommandLineToolBuilder("custom-tool") + .inputs(message=Input.string()) + .outputs(out=Output.file(glob="out.txt")) + .time_limit(60) +) +``` + +The lower-level `.input(...)`, `.output(...)`, `.requirement(...)`, and +`.hint(...)` methods are still available as escape hatches. + +Deliberate gaps +--------------- +- SALAD authoring features such as `$import`, `$include`, `$mixin`, and `$graph` + are not first-class builder concepts. 
They are document-assembly features, not + CLT-structure features. Use `extra()` or post-process the rendered dict if you + need them. +- The builder normalizes `requirements` and `hints` to map form keyed by class. + That covers typical CLT usage, but it does not preserve array ordering. +- Expressions are treated as opaque CWL strings. Schema validation is delegated + to cwltool/schema-salad; expression linting is intentionally out of scope. +- Implementation-specific extension objects are supported through `extra()` and + raw dict payloads, but they do not get typed wrappers by default. +""" + +# pylint: disable=missing-function-docstring,redefined-builtin,too-few-public-methods,too-many-arguments +# pylint: disable=too-many-instance-attributes,too-many-lines,too-many-locals,too-many-public-methods + +import tempfile +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, ClassVar, overload + +import yaml + +from sophios import utils_cwl + + +_UNSET = object() + + +def _render(value: Any) -> Any: + match value: + case Path(): + return str(value) + case list() as values: + return [_render(item) for item in values] + case tuple() as values: + return [_render(item) for item in values] + case dict() as values: + return {key: _render(item) for key, item in values.items()} + case _ if hasattr(value, "to_dict") and callable(value.to_dict): + return _render(value.to_dict()) + case _: + return value + + +def _merge_if_set(target: dict[str, Any], key: str, value: Any) -> None: + if value is not None: + target[key] = _render(value) + + +def _merge_if_present(target: dict[str, Any], key: str, value: Any) -> None: + if value is not _UNSET: + target[key] = _render(value) + + +def _canonicalize_type(type_: Any) -> Any: + return utils_cwl.canonicalize_type(_render(type_)) + + +def _render_doc(value: str | list[str] | None) -> str | list[str] | None: + match value: + case None: + return None + case str() as text: + return text + case list() 
as texts: + return [str(text) for text in texts] + + +def _render_mapping(value: dict[str, Any]) -> dict[str, Any]: + return {key: _render(item) for key, item in value.items()} + + +def _render_secondary_files(value: Any) -> Any: + if value is None: + return None + return _render(value) + + +@overload +def _optional_binding(binding: "CommandLineBinding") -> "CommandLineBinding | None": + ... + + +@overload +def _optional_binding(binding: "CommandOutputBinding") -> "CommandOutputBinding | None": + ... + + +def _optional_binding( + binding: "CommandLineBinding | CommandOutputBinding", +) -> "CommandLineBinding | CommandOutputBinding | None": + if binding.to_dict(): + return binding + return None + + +def _import_cwltool_load_tool() -> Any: + try: + from cwltool import load_tool # pylint: disable=import-outside-toplevel + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( + "cwltool/schema_salad is required to validate generated CommandLineTools" + ) from exc + return load_tool + + +@dataclass(slots=True) +class SecondaryFile: + """A CWL secondary file pattern.""" + + pattern: Any + required: bool | str | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> str | dict[str, Any]: + if self.required is None and not self.extra and isinstance(self.pattern, str): + return self.pattern + data: dict[str, Any] = {"pattern": _render(self.pattern)} + _merge_if_set(data, "required", self.required) + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class CommandLineBinding: + """CWL CommandLineBinding fields shared by inputs and arguments.""" + + position: int | float | None = None + prefix: str | None = None + separate: bool | None = None + item_separator: str | None = None + value_from: Any = None + shell_quote: bool | None = None + load_contents: bool | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + data: dict[str, Any] = {} + 
_merge_if_set(data, "position", self.position) + _merge_if_set(data, "prefix", self.prefix) + _merge_if_set(data, "separate", self.separate) + _merge_if_set(data, "itemSeparator", self.item_separator) + _merge_if_set(data, "valueFrom", self.value_from) + _merge_if_set(data, "shellQuote", self.shell_quote) + _merge_if_set(data, "loadContents", self.load_contents) + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class CommandOutputBinding: + """CWL CommandOutputBinding fields.""" + + glob: Any = None + load_contents: bool | None = None + output_eval: str | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + data: dict[str, Any] = {} + _merge_if_set(data, "glob", self.glob) + _merge_if_set(data, "loadContents", self.load_contents) + _merge_if_set(data, "outputEval", self.output_eval) + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class Dirent: + """InitialWorkDirRequirement listing entry.""" + + entry: Any + entryname: str | None = None + writable: bool | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + data = {"entry": _render(self.entry)} + _merge_if_set(data, "entryname", self.entryname) + _merge_if_set(data, "writable", self.writable) + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class EnvironmentDef: + """EnvVarRequirement entry.""" + + env_name: str + env_value: str + + def to_dict(self) -> dict[str, str]: + return {"envName": self.env_name, "envValue": self.env_value} + + +@dataclass(slots=True) +class SoftwarePackage: + """SoftwareRequirement package entry.""" + + package: str + version: list[str] | None = None + specs: list[str] | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + data = {"package": self.package} + _merge_if_set(data, "version", self.version) + _merge_if_set(data, "specs", 
self.specs) + data.update(_render(self.extra)) + return data + + +class _RequirementSpec: + class_name: ClassVar[str] + + def to_fields(self) -> dict[str, Any]: + raise NotImplementedError + + +@dataclass(slots=True) +class DockerRequirement(_RequirementSpec): + class_name: ClassVar[str] = "DockerRequirement" + + docker_pull: str | None = None + docker_load: str | None = None + docker_file: str | dict[str, Any] | None = None + docker_import: str | None = None + docker_image_id: str | None = None + docker_output_directory: str | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + data: dict[str, Any] = {} + _merge_if_set(data, "dockerPull", self.docker_pull) + _merge_if_set(data, "dockerLoad", self.docker_load) + _merge_if_set(data, "dockerFile", self.docker_file) + _merge_if_set(data, "dockerImport", self.docker_import) + _merge_if_set(data, "dockerImageId", self.docker_image_id) + _merge_if_set(data, "dockerOutputDirectory", self.docker_output_directory) + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class ResourceRequirement(_RequirementSpec): + class_name: ClassVar[str] = "ResourceRequirement" + + cores_min: int | float | str | None = None + cores_max: int | float | str | None = None + ram_min: int | float | str | None = None + ram_max: int | float | str | None = None + tmpdir_min: int | float | str | None = None + tmpdir_max: int | float | str | None = None + outdir_min: int | float | str | None = None + outdir_max: int | float | str | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + numeric_pairs = [ + ("cores", self.cores_min, self.cores_max), + ("ram", self.ram_min, self.ram_max), + ("tmpdir", self.tmpdir_min, self.tmpdir_max), + ("outdir", self.outdir_min, self.outdir_max), + ] + for resource, minimum, maximum in numeric_pairs: + if isinstance(minimum, (int, float)) and minimum < 0: + raise 
ValueError(f"{resource} minimum cannot be negative") + if isinstance(maximum, (int, float)) and maximum < 0: + raise ValueError(f"{resource} maximum cannot be negative") + if isinstance(minimum, (int, float)) and isinstance(maximum, (int, float)) and maximum < minimum: + raise ValueError(f"{resource} maximum cannot be smaller than minimum") + + data: dict[str, Any] = {} + _merge_if_set(data, "coresMin", self.cores_min) + _merge_if_set(data, "coresMax", self.cores_max) + _merge_if_set(data, "ramMin", self.ram_min) + _merge_if_set(data, "ramMax", self.ram_max) + _merge_if_set(data, "tmpdirMin", self.tmpdir_min) + _merge_if_set(data, "tmpdirMax", self.tmpdir_max) + _merge_if_set(data, "outdirMin", self.outdir_min) + _merge_if_set(data, "outdirMax", self.outdir_max) + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class InitialWorkDirRequirement(_RequirementSpec): + class_name: ClassVar[str] = "InitialWorkDirRequirement" + + listing: Any + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + data = {"listing": _render(self.listing)} + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class EnvVarRequirement(_RequirementSpec): + class_name: ClassVar[str] = "EnvVarRequirement" + + env_defs: list[EnvironmentDef | dict[str, Any]] = field(default_factory=list) + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + data = {"envDef": _render(self.env_defs)} + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class ShellCommandRequirement(_RequirementSpec): + class_name: ClassVar[str] = "ShellCommandRequirement" + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + return _render_mapping(self.extra) + + +@dataclass(slots=True) +class InlineJavascriptRequirement(_RequirementSpec): + class_name: ClassVar[str] = "InlineJavascriptRequirement" + + expression_lib: 
list[str] | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + data: dict[str, Any] = {} + _merge_if_set(data, "expressionLib", self.expression_lib) + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class SchemaDefRequirement(_RequirementSpec): + class_name: ClassVar[str] = "SchemaDefRequirement" + + types: list[Any] + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + data = {"types": _render(self.types)} + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class LoadListingRequirement(_RequirementSpec): + class_name: ClassVar[str] = "LoadListingRequirement" + + load_listing: str + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + data = {"loadListing": self.load_listing} + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class SoftwareRequirement(_RequirementSpec): + class_name: ClassVar[str] = "SoftwareRequirement" + + packages: list[SoftwarePackage | dict[str, Any]] | dict[str, Any] + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + data = {"packages": _render(self.packages)} + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class WorkReuse(_RequirementSpec): + class_name: ClassVar[str] = "WorkReuse" + + enable_reuse: bool | str + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + data = {"enableReuse": _render(self.enable_reuse)} + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class NetworkAccess(_RequirementSpec): + class_name: ClassVar[str] = "NetworkAccess" + + network_access: bool | str + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + data = {"networkAccess": _render(self.network_access)} + data.update(_render(self.extra)) + 
return data + + +@dataclass(slots=True) +class InplaceUpdateRequirement(_RequirementSpec): + class_name: ClassVar[str] = "InplaceUpdateRequirement" + + inplace_update: bool + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + data = {"inplaceUpdate": self.inplace_update} + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class ToolTimeLimit(_RequirementSpec): + class_name: ClassVar[str] = "ToolTimeLimit" + + timelimit: int | str + extra: dict[str, Any] = field(default_factory=dict) + + def to_fields(self) -> dict[str, Any]: + if isinstance(self.timelimit, int) and self.timelimit < 0: + raise ValueError("timelimit cannot be negative") + data = {"timelimit": _render(self.timelimit)} + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class CommandInput: + """A single CLT input parameter.""" + + name: str + type_: Any + binding: CommandLineBinding | None = None + label: str | None = None + doc: str | list[str] | None = None + format: Any = None + secondary_files: Any = None + streamable: bool | None = None + load_contents: bool | None = None + load_listing: str | None = None + default: Any = field(default=_UNSET, repr=False) + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + data: dict[str, Any] = {"type": _canonicalize_type(self.type_)} + _merge_if_set(data, "label", self.label) + _merge_if_set(data, "doc", _render_doc(self.doc)) + _merge_if_set(data, "format", self.format) + _merge_if_set(data, "streamable", self.streamable) + _merge_if_set(data, "loadContents", self.load_contents) + _merge_if_set(data, "loadListing", self.load_listing) + secondary_files = _render_secondary_files(self.secondary_files) + if secondary_files is not None: + data["secondaryFiles"] = secondary_files + _merge_if_present(data, "default", self.default) + if self.binding is not None: + binding = self.binding.to_dict() + if binding: + 
data["inputBinding"] = binding + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class CommandOutput: + """A single CLT output parameter.""" + + name: str + type_: Any + binding: CommandOutputBinding | None = None + label: str | None = None + doc: str | list[str] | None = None + format: Any = None + secondary_files: Any = None + streamable: bool | None = None + load_listing: str | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + data: dict[str, Any] = {"type": _canonicalize_type(self.type_)} + _merge_if_set(data, "label", self.label) + _merge_if_set(data, "doc", _render_doc(self.doc)) + _merge_if_set(data, "format", self.format) + _merge_if_set(data, "streamable", self.streamable) + _merge_if_set(data, "loadListing", self.load_listing) + secondary_files = _render_secondary_files(self.secondary_files) + if secondary_files is not None: + data["secondaryFiles"] = secondary_files + if self.binding is not None: + binding = self.binding.to_dict() + if binding: + data["outputBinding"] = binding + data.update(_render(self.extra)) + return data + + +@dataclass(slots=True) +class FieldSpec: + """Structured record field specification.""" + + type_: Any + label: str | None = None + doc: str | list[str] | None = None + input_binding: CommandLineBinding | None = None + output_binding: CommandOutputBinding | None = None + secondary_files: Any = None + streamable: bool | None = None + format: Any = None + extra: dict[str, Any] = field(default_factory=dict) + + def named(self, name: str) -> dict[str, Any]: + return record_field( + name, + self.type_, + label=self.label, + doc=self.doc, + input_binding=self.input_binding, + output_binding=self.output_binding, + secondary_files=self.secondary_files, + streamable=self.streamable, + format=self.format, + extra=self.extra, + ) + + +@dataclass(slots=True) +class InputSpec: + """Structured input specification without repeating the input name.""" + + 
type_: Any + binding: CommandLineBinding | None = None + label: str | None = None + doc: str | list[str] | None = None + format: Any = None + secondary_files: Any = None + streamable: bool | None = None + load_contents: bool | None = None + load_listing: str | None = None + default: Any = field(default=_UNSET, repr=False) + extra: dict[str, Any] = field(default_factory=dict) + + def named(self, name: str) -> CommandInput: + return CommandInput( + name=name, + type_=self.type_, + binding=self.binding, + label=self.label, + doc=self.doc, + format=self.format, + secondary_files=self.secondary_files, + streamable=self.streamable, + load_contents=self.load_contents, + load_listing=self.load_listing, + default=self.default, + extra=self.extra, + ) + + +@dataclass(slots=True) +class OutputSpec: + """Structured output specification without repeating the output name.""" + + type_: Any + binding: CommandOutputBinding | None = None + label: str | None = None + doc: str | list[str] | None = None + format: Any = None + secondary_files: Any = None + streamable: bool | None = None + load_listing: str | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def named(self, name: str) -> CommandOutput: + return CommandOutput( + name=name, + type_=self.type_, + binding=self.binding, + label=self.label, + doc=self.doc, + format=self.format, + secondary_files=self.secondary_files, + streamable=self.streamable, + load_listing=self.load_listing, + extra=self.extra, + ) + + +@dataclass(slots=True) +class CommandArgument: + """A CWL command line argument entry.""" + + value: Any = None + binding: CommandLineBinding = field(default_factory=CommandLineBinding) + extra: dict[str, Any] = field(default_factory=dict) + + def to_yaml(self) -> str | dict[str, Any]: + binding = self.binding.to_dict() + if not binding and isinstance(self.value, str) and not self.extra: + return self.value + if self.value is not None and "valueFrom" not in binding: + binding["valueFrom"] = 
_render(self.value) + binding.update(_render(self.extra)) + return binding + + +def secondary_file( + pattern: Any, + *, + required: bool | str | None = None, + extra: dict[str, Any] | None = None, +) -> SecondaryFile: + return SecondaryFile(pattern=pattern, required=required, extra=dict(extra or {})) + + +def array_type( + items: Any, + *, + name: str | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + input_binding: CommandLineBinding | None = None, + extra: dict[str, Any] | None = None, +) -> dict[str, Any]: + data: dict[str, Any] = {"type": "array", "items": _canonicalize_type(items)} + _merge_if_set(data, "name", name) + _merge_if_set(data, "label", label) + _merge_if_set(data, "doc", _render_doc(doc)) + if input_binding is not None: + binding = input_binding.to_dict() + if binding: + data["inputBinding"] = binding + data.update(_render(extra or {})) + return data + + +def enum_type( + symbols: list[str], + *, + name: str | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + input_binding: CommandLineBinding | None = None, + extra: dict[str, Any] | None = None, +) -> dict[str, Any]: + data: dict[str, Any] = {"type": "enum", "symbols": list(symbols)} + _merge_if_set(data, "name", name) + _merge_if_set(data, "label", label) + _merge_if_set(data, "doc", _render_doc(doc)) + if input_binding is not None: + binding = input_binding.to_dict() + if binding: + data["inputBinding"] = binding + data.update(_render(extra or {})) + return data + + +def record_field( + name: str, + type_: Any, + *, + label: str | None = None, + doc: str | list[str] | None = None, + input_binding: CommandLineBinding | None = None, + output_binding: CommandOutputBinding | None = None, + secondary_files: Any = None, + streamable: bool | None = None, + format: Any = None, + extra: dict[str, Any] | None = None, +) -> dict[str, Any]: + data: dict[str, Any] = {"name": name, "type": _canonicalize_type(type_)} + _merge_if_set(data, "label", 
label) + _merge_if_set(data, "doc", _render_doc(doc)) + _merge_if_set(data, "format", format) + _merge_if_set(data, "streamable", streamable) + secondary_files_value = _render_secondary_files(secondary_files) + if secondary_files_value is not None: + data["secondaryFiles"] = secondary_files_value + if input_binding is not None: + binding = input_binding.to_dict() + if binding: + data["inputBinding"] = binding + if output_binding is not None: + binding = output_binding.to_dict() + if binding: + data["outputBinding"] = binding + data.update(_render(extra or {})) + return data + + +def record_type( + fields: list[Any] | dict[str, Any], + *, + name: str | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + input_binding: CommandLineBinding | None = None, + extra: dict[str, Any] | None = None, +) -> dict[str, Any]: + data: dict[str, Any] = {"type": "record", "fields": _render(fields)} + _merge_if_set(data, "name", name) + _merge_if_set(data, "label", label) + _merge_if_set(data, "doc", _render_doc(doc)) + if input_binding is not None: + binding = input_binding.to_dict() + if binding: + data["inputBinding"] = binding + data.update(_render(extra or {})) + return data + + +class Type: + """Structured CWL type helpers.""" + + @staticmethod + def null() -> str: + return "null" + + @staticmethod + def boolean() -> str: + return "boolean" + + @staticmethod + def int() -> str: + return "int" + + @staticmethod + def long() -> str: + return "long" + + @staticmethod + def float() -> str: + return "float" + + @staticmethod + def double() -> str: + return "double" + + @staticmethod + def string() -> str: + return "string" + + @staticmethod + def file() -> str: + return "File" + + @staticmethod + def directory() -> str: + return "Directory" + + @staticmethod + def stdout() -> str: + return "stdout" + + @staticmethod + def stderr() -> str: + return "stderr" + + @staticmethod + def any() -> str: + return "Any" + + @staticmethod + def array( + items: Any, + 
*, + name: str | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + input_binding: CommandLineBinding | None = None, + extra: dict[str, Any] | None = None, + ) -> dict[str, Any]: + return array_type( + items, + name=name, + label=label, + doc=doc, + input_binding=input_binding, + extra=extra, + ) + + @staticmethod + def enum( + *symbols: str, + name: str | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + input_binding: CommandLineBinding | None = None, + extra: dict[str, Any] | None = None, + ) -> dict[str, Any]: + return enum_type( + list(symbols), + name=name, + label=label, + doc=doc, + input_binding=input_binding, + extra=extra, + ) + + @staticmethod + def record( + fields: dict[str, FieldSpec] | list[Any], + *, + name: str | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + input_binding: CommandLineBinding | None = None, + extra: dict[str, Any] | None = None, + ) -> dict[str, Any]: + match fields: + case dict() as mapping: + rendered_fields = [ + spec.named(field_name) if isinstance(spec, FieldSpec) else record_field(field_name, spec) + for field_name, spec in mapping.items() + ] + case list() as items: + rendered_fields = _render(items) + case _: + raise TypeError("record fields must be a mapping or a list") + return record_type( + rendered_fields, + name=name, + label=label, + doc=doc, + input_binding=input_binding, + extra=extra, + ) + + @staticmethod + def optional(inner: Any) -> list[Any]: + return ["null", _canonicalize_type(inner)] + + +class Field: + """Structured record field helpers.""" + + @staticmethod + def of( + type_: Any, + *, + label: str | None = None, + doc: str | list[str] | None = None, + input_binding: CommandLineBinding | None = None, + output_binding: CommandOutputBinding | None = None, + secondary_files: Any = None, + streamable: bool | None = None, + format: Any = None, + extra: dict[str, Any] | None = None, + ) -> FieldSpec: + return 
FieldSpec( + type_=type_, + label=label, + doc=doc, + input_binding=input_binding, + output_binding=output_binding, + secondary_files=secondary_files, + streamable=streamable, + format=format, + extra=dict(extra or {}), + ) + + @staticmethod + def string(**kwargs: Any) -> FieldSpec: + return Field.of(Type.string(), **kwargs) + + @staticmethod + def int(**kwargs: Any) -> FieldSpec: + return Field.of(Type.int(), **kwargs) + + @staticmethod + def long(**kwargs: Any) -> FieldSpec: + return Field.of(Type.long(), **kwargs) + + @staticmethod + def float(**kwargs: Any) -> FieldSpec: + return Field.of(Type.float(), **kwargs) + + @staticmethod + def double(**kwargs: Any) -> FieldSpec: + return Field.of(Type.double(), **kwargs) + + @staticmethod + def boolean(**kwargs: Any) -> FieldSpec: + return Field.of(Type.boolean(), **kwargs) + + @staticmethod + def file(**kwargs: Any) -> FieldSpec: + return Field.of(Type.file(), **kwargs) + + @staticmethod + def directory(**kwargs: Any) -> FieldSpec: + return Field.of(Type.directory(), **kwargs) + + @staticmethod + def array(items: Any, **kwargs: Any) -> FieldSpec: + return Field.of(Type.array(items), **kwargs) + + @staticmethod + def enum(*symbols: str, **kwargs: Any) -> FieldSpec: + return Field.of(Type.enum(*symbols), **kwargs) + + @staticmethod + def record(fields: dict[str, FieldSpec] | list[Any], **kwargs: Any) -> FieldSpec: + return Field.of(Type.record(fields), **kwargs) + + +# pylint: disable=too-many-public-methods +class Input: + """Structured CLT input helpers.""" + + @staticmethod + def of( + type_: Any, + *, + position: int | float | None = None, + prefix: str | None = None, + separate: bool | None = None, + item_separator: str | None = None, + value_from: Any = None, + shell_quote: bool | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + format: Any = None, + secondary_files: Any = None, + streamable: bool | None = None, + load_contents: bool | None = None, + load_listing: str | None = None, 
+ default: Any = _UNSET, + binding_extra: dict[str, Any] | None = None, + extra: dict[str, Any] | None = None, + ) -> InputSpec: + binding = _optional_binding(CommandLineBinding( + position=position, + prefix=prefix, + separate=separate, + item_separator=item_separator, + value_from=value_from, + shell_quote=shell_quote, + extra=dict(binding_extra or {}), + )) + return InputSpec( + type_=type_, + binding=binding, + label=label, + doc=doc, + format=format, + secondary_files=secondary_files, + streamable=streamable, + load_contents=load_contents, + load_listing=load_listing, + default=default, + extra=dict(extra or {}), + ) + + @staticmethod + def string(**kwargs: Any) -> InputSpec: + return Input.of(Type.string(), **kwargs) + + @staticmethod + def int(**kwargs: Any) -> InputSpec: + return Input.of(Type.int(), **kwargs) + + @staticmethod + def long(**kwargs: Any) -> InputSpec: + return Input.of(Type.long(), **kwargs) + + @staticmethod + def float(**kwargs: Any) -> InputSpec: + return Input.of(Type.float(), **kwargs) + + @staticmethod + def double(**kwargs: Any) -> InputSpec: + return Input.of(Type.double(), **kwargs) + + @staticmethod + def boolean(**kwargs: Any) -> InputSpec: + return Input.of(Type.boolean(), **kwargs) + + @staticmethod + def file(**kwargs: Any) -> InputSpec: + return Input.of(Type.file(), **kwargs) + + @staticmethod + def directory(**kwargs: Any) -> InputSpec: + return Input.of(Type.directory(), **kwargs) + + @staticmethod + def array(items: Any, **kwargs: Any) -> InputSpec: + return Input.of(Type.array(items), **kwargs) + + @staticmethod + def enum(*symbols: str, **kwargs: Any) -> InputSpec: + return Input.of(Type.enum(*symbols), **kwargs) + + @staticmethod + def record(fields: dict[str, FieldSpec] | list[Any], **kwargs: Any) -> InputSpec: + return Input.of(Type.record(fields), **kwargs) + + +# pylint: disable=too-many-public-methods +class Output: + """Structured CLT output helpers.""" + + @staticmethod + def of( + type_: Any, + *, + glob: Any = 
None, + load_contents: bool | None = None, + output_eval: str | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + format: Any = None, + secondary_files: Any = None, + streamable: bool | None = None, + load_listing: str | None = None, + binding_extra: dict[str, Any] | None = None, + extra: dict[str, Any] | None = None, + ) -> OutputSpec: + binding = _optional_binding(CommandOutputBinding( + glob=glob, + load_contents=load_contents, + output_eval=output_eval, + extra=dict(binding_extra or {}), + )) + return OutputSpec( + type_=type_, + binding=binding, + label=label, + doc=doc, + format=format, + secondary_files=secondary_files, + streamable=streamable, + load_listing=load_listing, + extra=dict(extra or {}), + ) + + @staticmethod + def string(**kwargs: Any) -> OutputSpec: + return Output.of(Type.string(), **kwargs) + + @staticmethod + def int(**kwargs: Any) -> OutputSpec: + return Output.of(Type.int(), **kwargs) + + @staticmethod + def long(**kwargs: Any) -> OutputSpec: + return Output.of(Type.long(), **kwargs) + + @staticmethod + def float(**kwargs: Any) -> OutputSpec: + return Output.of(Type.float(), **kwargs) + + @staticmethod + def double(**kwargs: Any) -> OutputSpec: + return Output.of(Type.double(), **kwargs) + + @staticmethod + def boolean(**kwargs: Any) -> OutputSpec: + return Output.of(Type.boolean(), **kwargs) + + @staticmethod + def file(**kwargs: Any) -> OutputSpec: + return Output.of(Type.file(), **kwargs) + + @staticmethod + def directory(**kwargs: Any) -> OutputSpec: + return Output.of(Type.directory(), **kwargs) + + @staticmethod + def stdout(**kwargs: Any) -> OutputSpec: + return Output.of(Type.stdout(), **kwargs) + + @staticmethod + def stderr(**kwargs: Any) -> OutputSpec: + return Output.of(Type.stderr(), **kwargs) + + @staticmethod + def array(items: Any, **kwargs: Any) -> OutputSpec: + return Output.of(Type.array(items), **kwargs) + + @staticmethod + def enum(*symbols: str, **kwargs: Any) -> OutputSpec: + return 
Output.of(Type.enum(*symbols), **kwargs) + + @staticmethod + def record(fields: dict[str, FieldSpec] | list[Any], **kwargs: Any) -> OutputSpec: + return Output.of(Type.record(fields), **kwargs) + + +@dataclass(frozen=True, slots=True) +class ValidationResult: + """Result of validating a generated CLT with cwltool/schema-salad.""" + + path: Path + uri: str + process: Any + + +class CWLBuilderValidationError(ValueError): + """Raised when a generated CLT fails schema validation.""" + + +def validate_cwl_document( + document: dict[str, Any], + *, + filename: str = "tool.cwl", + skip_schemas: bool = False, +) -> ValidationResult: + with tempfile.TemporaryDirectory(prefix="sophios-cwl-builder-") as tmpdir: + temp_path = Path(tmpdir) / filename + temp_path.write_text( + yaml.safe_dump(_render(document), sort_keys=False, line_break="\n"), + encoding="utf-8", + ) + return _validate_path(temp_path, skip_schemas=skip_schemas) + + +def _validate_path(path: Path, *, skip_schemas: bool = False) -> ValidationResult: + del skip_schemas # Reserved for parity with the rest of the codebase. 
+ load_tool = _import_cwltool_load_tool() + try: + loading_context, workflowobj, uri = load_tool.fetch_document(str(path)) + loading_context, uri = load_tool.resolve_and_validate_document( + loading_context, + workflowobj, + uri, + preprocess_only=False, + ) + process = load_tool.make_tool(uri, loading_context) + except Exception as exc: + raise CWLBuilderValidationError(f"Generated CommandLineTool failed validation: {path}") from exc + return ValidationResult(path=path, uri=uri, process=process) + + +def _normalize_requirement( + requirement: str | _RequirementSpec | dict[str, Any], + value: dict[str, Any] | None = None, +) -> tuple[str, dict[str, Any]]: + match requirement: + case str() as class_name: + payload = {} if value is None else dict(_render(value)) + return class_name, payload + case _RequirementSpec() as spec: + return spec.class_name, spec.to_fields() + case dict() as payload: + if "class" not in payload: + raise ValueError("raw requirement dicts must include a 'class' key") + payload_copy = dict(_render(payload)) + class_name = str(payload_copy.pop("class")) + return class_name, payload_copy + case _: + raise TypeError("requirement must be a class name, requirement spec, or raw dict") + + +@dataclass(slots=True) +class CommandLineToolBuilder: + """Fluent builder for CWL v1.2 `CommandLineTool` documents.""" + + tool_id: str + cwl_version: str = "v1.2" + label_text: str | None = None + doc_text: str | list[str] | None = None + _base_command: list[str] = field(default_factory=list) + _arguments: list[str | dict[str, Any]] = field(default_factory=list) + _inputs: dict[str, CommandInput] = field(default_factory=dict) + _outputs: dict[str, CommandOutput] = field(default_factory=dict) + _requirements: dict[str, dict[str, Any]] = field(default_factory=dict) + _hints: dict[str, dict[str, Any]] = field(default_factory=dict) + _stdin: str | None = None + _stdout: str | None = None + _stderr: str | None = None + _intent: list[str] = field(default_factory=list) + 
_namespaces: dict[str, str] = field(default_factory=dict) + _schemas: list[str] = field(default_factory=list) + _success_codes: list[int] = field(default_factory=list) + _temporary_fail_codes: list[int] = field(default_factory=list) + _permanent_fail_codes: list[int] = field(default_factory=list) + _extra: dict[str, Any] = field(default_factory=dict) + + def label(self, text: str) -> "CommandLineToolBuilder": + self.label_text = text + return self + + def doc(self, text: str | list[str]) -> "CommandLineToolBuilder": + self.doc_text = text + return self + + def namespace(self, prefix: str, iri: str) -> "CommandLineToolBuilder": + self._namespaces[prefix] = iri + return self + + def schema(self, iri: str) -> "CommandLineToolBuilder": + self._schemas.append(iri) + return self + + def intent(self, *identifiers: str) -> "CommandLineToolBuilder": + self._intent.extend(identifiers) + return self + + def base_command(self, *parts: str) -> "CommandLineToolBuilder": + self._base_command = list(parts) + return self + + def stdin(self, value: str) -> "CommandLineToolBuilder": + self._stdin = value + return self + + def stdout(self, value: str) -> "CommandLineToolBuilder": + self._stdout = value + return self + + def stderr(self, value: str) -> "CommandLineToolBuilder": + self._stderr = value + return self + + def add_input(self, input_spec: CommandInput) -> "CommandLineToolBuilder": + self._inputs[input_spec.name] = input_spec + return self + + def inputs(self, **input_specs: InputSpec) -> "CommandLineToolBuilder": + for name, spec in input_specs.items(): + if not isinstance(spec, InputSpec): + raise TypeError(f"input {name!r} must be an InputSpec") + self.add_input(spec.named(name)) + return self + + def input( + self, + name: str, + *, + type_: Any, + position: int | float | None = None, + prefix: str | None = None, + separate: bool | None = None, + item_separator: str | None = None, + value_from: Any = None, + shell_quote: bool | None = None, + load_contents: bool | None = 
None, + load_listing: str | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + format: Any = None, + secondary_files: Any = None, + streamable: bool | None = None, + default: Any = _UNSET, + binding_extra: dict[str, Any] | None = None, + extra: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + return self.inputs( + **{ + name: Input.of( + type_, + position=position, + prefix=prefix, + separate=separate, + item_separator=item_separator, + value_from=value_from, + shell_quote=shell_quote, + load_contents=load_contents, + load_listing=load_listing, + label=label, + doc=doc, + format=format, + secondary_files=secondary_files, + streamable=streamable, + default=default, + binding_extra=binding_extra, + extra=extra, + ) + } + ) + + def add_output(self, output_spec: CommandOutput) -> "CommandLineToolBuilder": + self._outputs[output_spec.name] = output_spec + return self + + def outputs(self, **output_specs: OutputSpec) -> "CommandLineToolBuilder": + for name, spec in output_specs.items(): + if not isinstance(spec, OutputSpec): + raise TypeError(f"output {name!r} must be an OutputSpec") + self.add_output(spec.named(name)) + return self + + def output( + self, + name: str, + *, + type_: Any, + glob: Any = None, + load_contents: bool | None = None, + output_eval: str | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + format: Any = None, + secondary_files: Any = None, + streamable: bool | None = None, + load_listing: str | None = None, + binding_extra: dict[str, Any] | None = None, + extra: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + return self.outputs( + **{ + name: Output.of( + type_, + glob=glob, + load_contents=load_contents, + output_eval=output_eval, + label=label, + doc=doc, + format=format, + secondary_files=secondary_files, + streamable=streamable, + load_listing=load_listing, + binding_extra=binding_extra, + extra=extra, + ) + } + ) + + def add_argument(self, argument: 
str | CommandArgument | dict[str, Any]) -> "CommandLineToolBuilder": + match argument: + case str() as literal: + self._arguments.append(literal) + case CommandArgument() as structured: + self._arguments.append(structured.to_yaml()) + case dict() as raw: + self._arguments.append(_render(raw)) + case _: + raise TypeError("argument must be a string, CommandArgument, or raw dict") + return self + + def argument( + self, + value: Any = None, + *, + position: int | float | None = None, + prefix: str | None = None, + separate: bool | None = None, + item_separator: str | None = None, + value_from: Any = None, + shell_quote: bool | None = None, + load_contents: bool | None = None, + binding_extra: dict[str, Any] | None = None, + extra: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + binding = CommandLineBinding( + position=position, + prefix=prefix, + separate=separate, + item_separator=item_separator, + value_from=value_from, + shell_quote=shell_quote, + load_contents=load_contents, + extra=dict(binding_extra or {}), + ) + return self.add_argument(CommandArgument(value=value, binding=binding, extra=dict(extra or {}))) + + def requirement( + self, + requirement: str | _RequirementSpec | dict[str, Any], + value: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + class_name, payload = _normalize_requirement(requirement, value) + self._requirements[class_name] = payload + return self + + def hint( + self, + requirement: str | _RequirementSpec | dict[str, Any], + value: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + class_name, payload = _normalize_requirement(requirement, value) + self._hints[class_name] = payload + return self + + def docker( + self, + *, + docker_pull: str | None = None, + docker_load: str | None = None, + docker_file: str | dict[str, Any] | None = None, + docker_import: str | None = None, + docker_image_id: str | None = None, + docker_output_directory: str | None = None, + as_hint: bool = False, + extra: 
dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + spec = DockerRequirement( + docker_pull=docker_pull, + docker_load=docker_load, + docker_file=docker_file, + docker_import=docker_import, + docker_image_id=docker_image_id, + docker_output_directory=docker_output_directory, + extra=dict(extra or {}), + ) + return self.hint(spec) if as_hint else self.requirement(spec) + + def inline_javascript( + self, + *expression_lib: str, + as_hint: bool = False, + extra: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + spec = InlineJavascriptRequirement( + expression_lib=list(expression_lib) or None, + extra=dict(extra or {}), + ) + return self.hint(spec) if as_hint else self.requirement(spec) + + def schema_definitions( + self, + *types: Any, + as_hint: bool = False, + extra: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + spec = SchemaDefRequirement(types=list(types), extra=dict(extra or {})) + return self.hint(spec) if as_hint else self.requirement(spec) + + def load_listing( + self, + value: str, + *, + as_hint: bool = False, + extra: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + spec = LoadListingRequirement(load_listing=value, extra=dict(extra or {})) + return self.hint(spec) if as_hint else self.requirement(spec) + + def shell_command(self, *, as_hint: bool = False, extra: dict[str, Any] | None = None) -> "CommandLineToolBuilder": + spec = ShellCommandRequirement(extra=dict(extra or {})) + return self.hint(spec) if as_hint else self.requirement(spec) + + def software( + self, + packages: list[SoftwarePackage | dict[str, Any]] | dict[str, Any], + *, + as_hint: bool = False, + extra: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + spec = SoftwareRequirement(packages=packages, extra=dict(extra or {})) + return self.hint(spec) if as_hint else self.requirement(spec) + + def initial_workdir( + self, + listing: Any, + *, + as_hint: bool = False, + extra: dict[str, Any] | None = None, + ) -> 
"CommandLineToolBuilder": + spec = InitialWorkDirRequirement(listing=listing, extra=dict(extra or {})) + return self.hint(spec) if as_hint else self.requirement(spec) + + def env_var(self, name: str, value: str, *, as_hint: bool = False) -> "CommandLineToolBuilder": + target = self._hints if as_hint else self._requirements + payload = target.setdefault("EnvVarRequirement", {"envDef": []}) + env_defs = payload.setdefault("envDef", []) + env_defs.append(EnvironmentDef(name, value).to_dict()) + return self + + def resources( + self, + *, + cores_min: int | float | str | None = None, + cores_max: int | float | str | None = None, + ram_min: int | float | str | None = None, + ram_max: int | float | str | None = None, + tmpdir_min: int | float | str | None = None, + tmpdir_max: int | float | str | None = None, + outdir_min: int | float | str | None = None, + outdir_max: int | float | str | None = None, + as_hint: bool = False, + extra: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + spec = ResourceRequirement( + cores_min=cores_min, + cores_max=cores_max, + ram_min=ram_min, + ram_max=ram_max, + tmpdir_min=tmpdir_min, + tmpdir_max=tmpdir_max, + outdir_min=outdir_min, + outdir_max=outdir_max, + extra=dict(extra or {}), + ) + return self.hint(spec) if as_hint else self.requirement(spec) + + def work_reuse( + self, + enable: bool | str, + *, + as_hint: bool = False, + extra: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + spec = WorkReuse(enable_reuse=enable, extra=dict(extra or {})) + return self.hint(spec) if as_hint else self.requirement(spec) + + def network_access( + self, + enable: bool | str, + *, + as_hint: bool = False, + extra: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + spec = NetworkAccess(network_access=enable, extra=dict(extra or {})) + return self.hint(spec) if as_hint else self.requirement(spec) + + def inplace_update( + self, + enable: bool = True, + *, + as_hint: bool = True, + extra: dict[str, Any] | 
None = None, + ) -> "CommandLineToolBuilder": + spec = InplaceUpdateRequirement(inplace_update=enable, extra=dict(extra or {})) + return self.hint(spec) if as_hint else self.requirement(spec) + + def time_limit( + self, + seconds: int | str, + *, + as_hint: bool = False, + extra: dict[str, Any] | None = None, + ) -> "CommandLineToolBuilder": + spec = ToolTimeLimit(timelimit=seconds, extra=dict(extra or {})) + return self.hint(spec) if as_hint else self.requirement(spec) + + def success_codes(self, *codes: int) -> "CommandLineToolBuilder": + self._success_codes = list(codes) + return self + + def temporary_fail_codes(self, *codes: int) -> "CommandLineToolBuilder": + self._temporary_fail_codes = list(codes) + return self + + def permanent_fail_codes(self, *codes: int) -> "CommandLineToolBuilder": + self._permanent_fail_codes = list(codes) + return self + + def extra(self, **values: Any) -> "CommandLineToolBuilder": + self._extra.update(_render(values)) + return self + + def build(self) -> dict[str, Any]: + document: dict[str, Any] = { + "cwlVersion": self.cwl_version, + "class": "CommandLineTool", + "id": self.tool_id, + "inputs": {name: input_spec.to_dict() for name, input_spec in self._inputs.items()}, + "outputs": {name: output_spec.to_dict() for name, output_spec in self._outputs.items()}, + } + if self._namespaces: + document["$namespaces"] = dict(self._namespaces) + if self._schemas: + document["$schemas"] = list(self._schemas) + _merge_if_set(document, "label", self.label_text) + _merge_if_set(document, "doc", _render_doc(self.doc_text)) + if self._intent: + document["intent"] = list(self._intent) + if self._base_command: + document["baseCommand"] = self._base_command[0] if len( + self._base_command) == 1 else list(self._base_command) + if self._arguments: + document["arguments"] = list(self._arguments) + if self._requirements: + document["requirements"] = _render(self._requirements) + if self._hints: + document["hints"] = _render(self._hints) + 
_merge_if_set(document, "stdin", self._stdin) + _merge_if_set(document, "stdout", self._stdout) + _merge_if_set(document, "stderr", self._stderr) + if self._success_codes: + document["successCodes"] = list(self._success_codes) + if self._temporary_fail_codes: + document["temporaryFailCodes"] = list(self._temporary_fail_codes) + if self._permanent_fail_codes: + document["permanentFailCodes"] = list(self._permanent_fail_codes) + document.update(_render(self._extra)) + return document + + def to_dict(self) -> dict[str, Any]: + return self.build() + + def to_yaml(self) -> str: + rendered_yaml = yaml.safe_dump(self.build(), sort_keys=False, line_break="\n") + return str(rendered_yaml) + + def save(self, path: str | Path, *, validate: bool = False, skip_schemas: bool = False) -> Path: + output_path = Path(path) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(self.to_yaml(), encoding="utf-8") + if validate: + _validate_path(output_path, skip_schemas=skip_schemas) + return output_path + + def validate(self, *, skip_schemas: bool = False) -> ValidationResult: + return validate_cwl_document(self.build(), filename=f"{self.tool_id}.cwl", skip_schemas=skip_schemas) + + +__all__ = [ + "CWLBuilderValidationError", + "CommandArgument", + "CommandInput", + "CommandLineBinding", + "CommandLineToolBuilder", + "CommandOutput", + "CommandOutputBinding", + "Dirent", + "DockerRequirement", + "EnvironmentDef", + "EnvVarRequirement", + "Field", + "FieldSpec", + "InitialWorkDirRequirement", + "InlineJavascriptRequirement", + "Input", + "InputSpec", + "InplaceUpdateRequirement", + "LoadListingRequirement", + "NetworkAccess", + "Output", + "OutputSpec", + "ResourceRequirement", + "SchemaDefRequirement", + "SecondaryFile", + "ShellCommandRequirement", + "SoftwarePackage", + "SoftwareRequirement", + "ToolTimeLimit", + "Type", + "ValidationResult", + "WorkReuse", + "array_type", + "enum_type", + "record_field", + "record_type", + "secondary_file", + 
"validate_cwl_document", +] diff --git a/tests/test_compile_python_workflows.py b/tests/test_compile_python_workflows.py index c390d90f..0158612e 100644 --- a/tests/test_compile_python_workflows.py +++ b/tests/test_compile_python_workflows.py @@ -1,12 +1,43 @@ import json import traceback from pathlib import Path - import pytest +import yaml + import sophios import sophios.plugins from sophios import input_output as io +from sophios import utils, utils_cwl from sophios.python_cwl_adapter import import_python_file +from sophios.schemas import wic_schema +from sophios.utils_yaml import wic_loader +from sophios.wic_types import Json + + +REPO_ROOT = Path(__file__).resolve().parent.parent +AUTOGENERATED_DIR = REPO_ROOT / "autogenerated" +PYTHON_WORKFLOW_MANIFEST = AUTOGENERATED_DIR / "python_workflow_manifest.json" + + +def _load_global_config() -> Json: + config_file = Path().home() / "wic" / "global_config.json" + return io.read_config_from_disk(config_file) + + +def _iter_python_workflow_paths(global_config: Json) -> list[tuple[str, Path]]: + paths = sophios.plugins.get_py_paths(global_config) + return [ + (path_str, path) + for _, paths_dict in paths.items() + for path_str, path in paths_dict.items() + if "mm-workflows" not in str(path) and "docs/tutorials/" not in str(path) + ] + + +def _write_manifest(workflow_paths: list[Path]) -> None: + AUTOGENERATED_DIR.mkdir(parents=True, exist_ok=True) + manifest = sorted({str(path) for path in workflow_paths}) + PYTHON_WORKFLOW_MANIFEST.write_text(json.dumps(manifest, indent=2), encoding="utf-8") @pytest.mark.fast @@ -20,19 +51,12 @@ def test_compile_python_workflows() -> None: from sophios.apis.python import api # pylint: disable=C0415:import-outside-toplevel # Since this is completely different test path we have to copy # default .txt files to default global_config.json - config_file = Path().home()/'wic'/'global_config.json' - global_config = io.read_config_from_disk(config_file) + global_config = _load_global_config() 
api.global_config = sophios.plugins.get_tools_cwl(global_config) # Use path fallback in the CI - paths = sophios.plugins.get_py_paths(global_config) - # Above we are assuming that config is default - paths_tuples = [(path_str, path) - for namespace, paths_dict in paths.items() - for path_str, path in paths_dict.items()] + paths_tuples = _iter_python_workflow_paths(global_config) import_errors: list[str] = [] + generated_workflows: list[Path] = [] for path_stem, path in paths_tuples: - if 'mm-workflows' in str(path) or 'docs/tutorials/' in str(path): - # Exclude paths that only contain 'regular' python files. - continue # NOTE: Use anything (unique?) for the python_module_name. try: module = import_python_file(path_stem, path) @@ -48,6 +72,7 @@ def test_compile_python_workflows() -> None: # This lets us use path.parent to write a *.wic file in the # auto-discovery path, and thus reuse the existing wic CI retval.write_ast_to_disk(path.parent) + generated_workflows.extend(path.parent / f"{wf.process_name}.wic" for wf in retval.flatten_subworkflows()) # Programmatically blacklist subworkflows from running in config_ci.json # (Again, because subworkflows are missing inputs and cannot run.) 
@@ -69,3 +94,30 @@ def test_compile_python_workflows() -> None: traceback.print_exception(type(e), value=e, tb=None) if import_errors: pytest.fail("Python workflow imports failed:\n" + "\n".join(import_errors)) + _write_manifest(generated_workflows) + + +@pytest.mark.fast +def test_validate_generated_python_workflows() -> None: + if not PYTHON_WORKFLOW_MANIFEST.exists(): + pytest.fail(f"Missing generated workflow manifest: {PYTHON_WORKFLOW_MANIFEST}") + + global_config = _load_global_config() + tools_cwl = sophios.plugins.get_tools_cwl(global_config) + yml_paths = sophios.plugins.get_yml_paths(global_config) + yaml_stems = utils.flatten([list(paths) for paths in yml_paths.values()]) + validator = wic_schema.get_validator(tools_cwl, yaml_stems, {}, write_to_disk=False) + + workflow_paths = json.loads(PYTHON_WORKFLOW_MANIFEST.read_text(encoding="utf-8")) + validation_errors: list[str] = [] + for workflow_path_str in workflow_paths: + workflow_path = Path(workflow_path_str) + try: + with workflow_path.open("r", encoding="utf-8") as handle: + yaml_tree = yaml.load(handle.read(), Loader=wic_loader()) + validator.validate(utils_cwl.desugar_into_canonical_normal_form(yaml_tree)) + except Exception as exc: + validation_errors.append(f"{workflow_path}: {type(exc).__name__}: {exc}") + + if validation_errors: + pytest.fail("Generated workflow validation failed:\n" + "\n".join(validation_errors)) diff --git a/tests/test_cwl_builder.py b/tests/test_cwl_builder.py new file mode 100644 index 00000000..e1156a51 --- /dev/null +++ b/tests/test_cwl_builder.py @@ -0,0 +1,167 @@ +from pathlib import Path + +import pytest +import yaml + +import sophios.apis.python.cwl_builder as cwl_builder +from sophios.apis.python.cwl_builder import ( + CommandLineToolBuilder, + Dirent, + Field, + Input, + Output, + Type, + secondary_file, +) + + +def _rich_builder() -> CommandLineToolBuilder: + mode_type = Type.enum("fast", "accurate", name="Mode") + settings_type = Type.record( + { + "threads": 
Field.int(), + "preset": Field.of(mode_type), + "tags": Field.array(Type.string()), + }, + name="Settings", + ) + + return ( + CommandLineToolBuilder("aligner") + .label("Align reads") + .doc(["Toy CLT", "for serialization coverage"]) + .namespace("edam", "https://edamontology.org/") + .schema("https://example.org/formats.rdf") + .intent("edam:operation_3198") + .base_command("bash", "-lc") + .shell_command() + .inline_javascript("function passthrough(x) { return x; }") + .schema_definitions(mode_type, settings_type) + .docker(docker_pull="alpine:3.20") + .resources(cores_min=1.5, ram_min=1024, outdir_min=256) + .env_var("LC_ALL", "C") + .initial_workdir([Dirent("threads=4\n", entryname="config.txt")]) + .work_reuse(False, as_hint=True) + .network_access(False) + .argument("run-aligner", position=0) + .inputs( + reads=Input.array( + Type.file(), + prefix="--reads", + format="edam:format_2572", + secondary_files=[secondary_file(".bai", required=False)], + ), + mode=Input.of(mode_type, prefix="--mode"), + settings=Input.of(settings_type, load_listing="shallow_listing"), + ) + .outputs(sam=Output.stdout()) + .stdout("aligned.sam") + .success_codes(0, 2) + ) + + +@pytest.mark.fast +def test_cwl_builder_covers_common_clt_surface() -> None: + tool = _rich_builder().to_dict() + + assert tool["$namespaces"] == {"edam": "https://edamontology.org/"} + assert tool["$schemas"] == ["https://example.org/formats.rdf"] + assert tool["intent"] == ["edam:operation_3198"] + assert tool["baseCommand"] == ["bash", "-lc"] + assert tool["arguments"] == [{"position": 0, "valueFrom": "run-aligner"}] + assert tool["stdout"] == "aligned.sam" + assert tool["successCodes"] == [0, 2] + assert tool["inputs"]["reads"]["secondaryFiles"] == [{"pattern": ".bai", "required": False}] + assert tool["inputs"]["settings"]["loadListing"] == "shallow_listing" + assert tool["outputs"]["sam"]["type"] == "stdout" + assert tool["requirements"]["ShellCommandRequirement"] == {} + assert 
tool["requirements"]["DockerRequirement"] == {"dockerPull": "alpine:3.20"} + assert tool["requirements"]["ResourceRequirement"] == { + "coresMin": 1.5, + "ramMin": 1024, + "outdirMin": 256, + } + assert tool["requirements"]["EnvVarRequirement"] == { + "envDef": [{"envName": "LC_ALL", "envValue": "C"}] + } + assert tool["requirements"]["InitialWorkDirRequirement"] == { + "listing": [{"entry": "threads=4\n", "entryname": "config.txt"}] + } + assert tool["requirements"]["NetworkAccess"] == {"networkAccess": False} + assert tool["requirements"]["InlineJavascriptRequirement"] == { + "expressionLib": ["function passthrough(x) { return x; }"] + } + assert len(tool["requirements"]["SchemaDefRequirement"]["types"]) == 2 + assert tool["hints"]["WorkReuse"] == {"enableReuse": False} + + +@pytest.mark.fast +def test_cwl_builder_accepts_raw_extensions() -> None: + tool = ( + CommandLineToolBuilder("custom-tool") + .inputs(message=Input.string()) + .outputs(out=Output.file(glob="out.txt")) + .time_limit(60) + .extra(sbol_intent="example:custom", customExtension={"enabled": True}) + .to_dict() + ) + + assert tool["requirements"]["ToolTimeLimit"] == {"timelimit": 60} + assert tool["sbol_intent"] == "example:custom" + assert tool["customExtension"] == {"enabled": True} + + +@pytest.mark.fast +def test_cwl_builder_save_round_trips_yaml(tmp_path: Path) -> None: + builder = _rich_builder() + output_path = tmp_path / "aligner.cwl" + + saved_path = builder.save(output_path) + + assert saved_path == output_path + assert yaml.safe_load(output_path.read_text(encoding="utf-8")) == builder.to_dict() + + +@pytest.mark.fast +def test_cwl_builder_validate_uses_cwltool_stack(monkeypatch: pytest.MonkeyPatch) -> None: + class FakeLoadTool: + def __init__(self) -> None: + self.calls: list[tuple[str, object]] = [] + + def fetch_document(self, path: str) -> tuple[str, dict[str, str], str]: + self.calls.append(("fetch_document", Path(path).suffix)) + return "loading-context", {"class": 
"CommandLineTool"}, "file:///aligner.cwl" + + def resolve_and_validate_document( + self, + loading_context: str, + workflowobj: dict[str, str], + uri: str, + preprocess_only: bool = False, + ) -> tuple[str, str]: + self.calls.append(("resolve_and_validate_document", preprocess_only)) + assert loading_context == "loading-context" + assert workflowobj == {"class": "CommandLineTool"} + assert uri == "file:///aligner.cwl" + return "validated-context", "file:///validated-aligner.cwl" + + def make_tool(self, uri: str, loading_context: str) -> dict[str, str]: + self.calls.append(("make_tool", uri)) + assert loading_context == "validated-context" + return {"uri": uri, "loading_context": loading_context} + + fake_load_tool = FakeLoadTool() + monkeypatch.setattr(cwl_builder, "_import_cwltool_load_tool", lambda: fake_load_tool) + + result = _rich_builder().validate() + + assert result.uri == "file:///validated-aligner.cwl" + assert result.process == { + "uri": "file:///validated-aligner.cwl", + "loading_context": "validated-context", + } + assert [name for name, _ in fake_load_tool.calls] == [ + "fetch_document", + "resolve_and_validate_document", + "make_tool", + ] From c5a8b8597f0decc02c1ae247aa4b11016216fff3 Mon Sep 17 00:00:00 2001 From: Vasu Jaganath Date: Wed, 25 Mar 2026 21:28:48 -0400 Subject: [PATCH 2/2] pythonic cwl_builder --- .github/workflows/fuzzy_compile_weekly.yml | 4 +- .github/workflows/lint_and_test.yml | 20 +- .github/workflows/lint_and_test_macos.yml | 4 +- .github/workflows/run_workflows.yml | 45 +- .github/workflows/run_workflows_weekly.yml | 4 +- docs/cwl_builder_sam3.md | 336 +++ docs/cwl_builder_workflow.md | 328 +++ docs/index.rst | 2 + docs/tutorials/tutorials.rst | 2 +- docs/userguide.md | 4 + examples/scripts/cwl_builder_workflow.py | 152 ++ examples/scripts/sam3_cwl_builder.py | 88 + src/sophios/apis/python/__init__.py | 182 +- src/sophios/apis/python/_api_config.py | 9 + .../apis/python/_cwl_builder_namespaces.py | 103 + 
src/sophios/apis/python/_cwl_builder_specs.py | 765 +++++++ .../apis/python/_cwl_builder_step_bridge.py | 58 + .../apis/python/_cwl_builder_support.py | 346 ++++ src/sophios/apis/python/_ports.py | 410 ++-- src/sophios/apis/python/_types.py | 15 +- src/sophios/apis/python/_utils.py | 114 +- src/sophios/apis/python/_workflow_runtime.py | 521 +++++ src/sophios/apis/python/api.py | 1009 +++++---- src/sophios/apis/python/api_config.py | 8 - src/sophios/apis/python/cwl_builder.py | 1802 ++++------------- src/sophios/apis/rest/api.py | 7 +- src/sophios/apis/utils/ict/ict_spec/model.py | 13 - src/sophios/ast.py | 11 +- src/sophios/compiler.py | 4 +- src/sophios/cwl_subinterpreter.py | 2 +- src/sophios/inlineing.py | 4 +- src/sophios/input_output.py | 2 +- src/sophios/main.py | 15 +- src/sophios/plugins.py | 25 +- src/sophios/post_compile.py | 19 +- src/sophios/python_cwl_adapter.py | 18 +- src/sophios/run_local.py | 73 +- src/sophios/schemas/wic_schema.py | 10 +- src/sophios/utils_cwl.py | 12 +- src/sophios/utils_graphs.py | 4 +- tests/test_cli_flags.py | 31 - tests/test_compile_python_workflows.py | 123 -- tests/test_cwl_builder.py | 216 +- tests/test_examples.py | 4 +- tests/test_fuzzy_compile.py | 6 +- tests/test_python_api.py | 451 +++++ tests/test_python_api_redesign.py | 63 - tests/test_rest_api.py | 181 ++ tests/test_rest_core.py | 212 -- tests/test_rest_wfb.py | 34 - tests/test_setup.py | 33 +- 51 files changed, 5100 insertions(+), 2804 deletions(-) create mode 100644 docs/cwl_builder_sam3.md create mode 100644 docs/cwl_builder_workflow.md create mode 100644 examples/scripts/cwl_builder_workflow.py create mode 100644 examples/scripts/sam3_cwl_builder.py create mode 100644 src/sophios/apis/python/_api_config.py create mode 100644 src/sophios/apis/python/_cwl_builder_namespaces.py create mode 100644 src/sophios/apis/python/_cwl_builder_specs.py create mode 100644 src/sophios/apis/python/_cwl_builder_step_bridge.py create mode 100644 
src/sophios/apis/python/_cwl_builder_support.py create mode 100644 src/sophios/apis/python/_workflow_runtime.py delete mode 100644 src/sophios/apis/python/api_config.py delete mode 100644 tests/test_cli_flags.py delete mode 100644 tests/test_compile_python_workflows.py create mode 100644 tests/test_python_api.py delete mode 100644 tests/test_python_api_redesign.py create mode 100644 tests/test_rest_api.py delete mode 100644 tests/test_rest_core.py delete mode 100644 tests/test_rest_wfb.py diff --git a/.github/workflows/fuzzy_compile_weekly.yml b/.github/workflows/fuzzy_compile_weekly.yml index 3b9215c6..8e6588e9 100644 --- a/.github/workflows/fuzzy_compile_weekly.yml +++ b/.github/workflows/fuzzy_compile_weekly.yml @@ -97,7 +97,7 @@ jobs: - name: Generate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest -k test_compile_python_workflows + run: cd sophios/ && pytest tests/test_python_api.py -k test_compile_python_workflows - name: Generate Sophios Validation Jsonschema if: always() @@ -108,7 +108,7 @@ jobs: # WIC Python API workflows as well as the WIC Python API itself. - name: Validate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest tests/test_compile_python_workflows.py -k test_validate_generated_python_workflows + run: cd sophios/ && pytest tests/test_python_api.py -k test_validate_generated_python_workflows - name: PyTest CWL Builder if: always() diff --git a/.github/workflows/lint_and_test.yml b/.github/workflows/lint_and_test.yml index f505f342..3e5e76d5 100644 --- a/.github/workflows/lint_and_test.yml +++ b/.github/workflows/lint_and_test.yml @@ -148,13 +148,13 @@ jobs: python-version: "3.11.*" - name: ShellCheck Script Quality - if: always() + if: runner.os == 'Linux' # "SC1017 (error): Literal carriage return. Run script through tr -d '\r' ." 
run: shellcheck -e SC1017 $(find sophios/ -name "*.sh" -and -not -path "./3/*") - name: Install Sophios if: always() - run: cd sophios/ && pip install ".[all_except_runner_src]" + run: cd sophios/ && pip install -e ".[all_except_runner_src]" - name: Update Sophios Config if: always() @@ -167,33 +167,33 @@ jobs: # NOTE: Use ".[test]" instead of ".[all_except_runner_src]" # We do not want or need to install the workflow_deps extra. # (Many of the packages conflict with pypy.) - run: cd mm-workflows/ && pip install ".[test]" && mm-workflows --generate_schemas + run: cd mm-workflows/ && pip install -e ".[test]" && mm-workflows --generate_schemas # Do the static analysis, type and style check first - name: MyPy Check Type Annotations - if: always() + if: runner.os == 'Linux' run: cd sophios/ && mypy src/ examples/ tests/ # NOTE: Do not use `mypy .` because then mypy will check both src/ and build/ causing: # src/sophios/__init__.py: error: Duplicate module named "wic" # (also at "./build/lib/sophios/__init__.py") - name: PyLint Check Code Quality - if: always() + if: runner.os == 'Linux' run: cd sophios/ && pylint src/ examples/**/*.py tests/ # NOTE: See fail-under threshold in .pylintrc - name: PEP8 Code Formatting - if: always() + if: runner.os == 'Linux' id: autopep8 run: cd sophios/ && autopep8 --exit-code --recursive --diff --max-line-length 120 examples/ src/ tests/ - name: Fail if autopep8 made changes - if: steps.autopep8.outputs.exit-code == 2 + if: runner.os == 'Linux' && steps.autopep8.outputs.exit-code == 2 run: exit 1 - name: Generate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest -k test_compile_python_workflows + run: cd sophios/ && pytest tests/test_python_api.py -k test_compile_python_workflows - name: Generate Sophios Validation Jsonschema if: always() @@ -204,10 +204,10 @@ jobs: # Sophios Python API workflows as well as the Sophios Python API itself. 
- name: Validate sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest tests/test_compile_python_workflows.py -k test_validate_generated_python_workflows + run: cd sophios/ && pytest tests/test_python_api.py -k test_validate_generated_python_workflows - name: Build Documentation - if: always() + if: runner.os == 'Linux' run: cd sophios/docs && make html # NOTE: Do NOT add coverage to PYPY CI runs https://github.com/tox-dev/tox/issues/2252 diff --git a/.github/workflows/lint_and_test_macos.yml b/.github/workflows/lint_and_test_macos.yml index 06dc0ee8..e2307b78 100644 --- a/.github/workflows/lint_and_test_macos.yml +++ b/.github/workflows/lint_and_test_macos.yml @@ -103,7 +103,7 @@ jobs: - name: Generate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest -k test_compile_python_workflows + run: cd sophios/ && pytest tests/test_python_api.py -k test_compile_python_workflows - name: Generate Sophios Validation Jsonschema if: always() @@ -114,7 +114,7 @@ jobs: # Sophios Python API workflows as well as the Sophios Python API itself. 
- name: Validate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest tests/test_compile_python_workflows.py -k test_validate_generated_python_workflows + run: cd sophios/ && pytest tests/test_python_api.py -k test_validate_generated_python_workflows - name: Build Documentation if: always() diff --git a/.github/workflows/run_workflows.yml b/.github/workflows/run_workflows.yml index e356ea3b..0936b34d 100644 --- a/.github/workflows/run_workflows.yml +++ b/.github/workflows/run_workflows.yml @@ -106,18 +106,11 @@ jobs: run: rm -rf "/home/$(whoami)/wic/" && rm -rf "/home/$(whoami)/.toil/" # For self-hosted runners, make sure we use new global config settings - # Completely moving away from pypy - name: Remove old mamba environment if: always() run: rm -rf "/home/$(whoami)/miniconda3/envs/wic_github_actions/" # For self-hosted runners, make sure we install into a new mamba environment - # NOTE: Every time the github self-hosted runner executes, it sets "set -e" in ~/.bash_profile - # So if we rm -rf the old mamba environment without also removing the mamba init code in ~/.bash_profile - # (or removing the file altogether), then unless we immediately re-create the environment, - # (i.e. if we try to run any other commands between removing and re-creating the environment) - # we will get "EnvironmentNameNotFound: Could not find conda environment: wic_github_actions" - # and (again, due to "set -e") the workflow step will fail. - name: Setup miniforge (linux, macos) if: always() @@ -137,7 +130,7 @@ jobs: - name: Install Sophios if: always() - run: cd sophios/ && pip install ".[all_except_runner_src]" + run: cd sophios/ && pip install -e ".[all_except_runner_src]" - name: Update Sophios Config if: always() @@ -150,11 +143,23 @@ jobs: # NOTE: Use ".[test]" instead of ".[all_except_runner_src]" # We do not want or need to install the workflow_deps extra. # (Many of the packages conflict with pypy.) 
- run: cd mm-workflows/ && pip install ".[test]" && mm-workflows --generate_schemas + run: cd mm-workflows/ && pip install -e ".[test]" && mm-workflows --generate_schemas + + - name: Configure pytest worker count + if: always() + run: | + PYTEST_WORKERS="$(python - <<'PY' + import os + + cpu_count = os.cpu_count() or 8 + print(max(8, min(16, cpu_count - 2))) + PY + )" + echo "PYTEST_WORKERS=${PYTEST_WORKERS}" >> "$GITHUB_ENV" - name: Generate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest -k test_compile_python_workflows + run: cd sophios/ && pytest tests/test_python_api.py -k test_compile_python_workflows - name: Generate Sophios Validation Jsonschema if: always() @@ -165,26 +170,30 @@ jobs: # WIC Python API workflows as well as the WIC Python API itself. - name: Validate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest tests/test_compile_python_workflows.py -k test_validate_generated_python_workflows + run: cd sophios/ && pytest tests/test_python_api.py -k test_validate_generated_python_workflows + + - name: cwl-docker-extract (i.e. recursively docker pull) + if: always() + run: cd sophios/ && pytest tests/test_examples.py -k test_cwl_docker_extract + # For self-hosted runners, make sure the docker cache is up-to-date. - name: PyTest CWL Builder if: always() run: cd sophios/ && pytest tests/test_cwl_builder.py -k test_cwl_builder - - name: cwl-docker-extract (i.e. recursively docker pull) + - name: PyTest Python API if: always() - run: cd sophios/ && pytest tests/test_examples.py -k test_cwl_docker_extract - # For self-hosted runners, make sure the docker cache is up-to-date. 
+ run: cd sophios/ && pytest tests/test_python_api.py -k "not test_compile_python_workflows and not test_validate_generated_python_workflows" - name: PyTest Run REST Core Tests if: always() # NOTE: Do NOT add coverage to PYPY CI runs https://github.com/tox-dev/tox/issues/2252 - run: cd sophios/ && pytest tests/test_rest_core.py -k test_rest_core --cwl_runner cwltool + run: cd sophios/ && pytest tests/test_rest_api.py -k test_rest_core --cwl_runner cwltool - name: PyTest Run REST WFB Tests if: always() # NOTE: Do NOT add coverage to PYPY CI runs https://github.com/tox-dev/tox/issues/2252 - run: cd sophios/ && pytest tests/test_rest_wfb.py -k test_rest_wfb --cwl_runner cwltool + run: cd sophios/ && pytest tests/test_rest_api.py -k test_rest_wfb --cwl_runner cwltool - name: PyTest Run ICT to CLT conversion Tests if: always() @@ -193,13 +202,11 @@ jobs: - name: PyTest Run update wfb payload Tests if: always() - # NOTE: Do NOT add coverage to PYPY CI runs https://github.com/tox-dev/tox/issues/2252 run: cd sophios/ && pytest tests/test_fix_payload.py -k test_fix - name: PyTest Run Workflows if: always() - # NOTE: Do NOT add coverage to PYPY CI runs https://github.com/tox-dev/tox/issues/2252 - run: cd sophios/ && pytest tests/test_examples.py -k test_run_workflows_on_push --workers 8 --cwl_runner cwltool # --cov + run: cd sophios/ && pytest tests/test_examples.py -k test_run_workflows_on_push --workers "${PYTEST_WORKERS}" --cwl_runner cwltool # --cov # NOTE: The steps below are for repository_dispatch only. For all other steps, please insert above # this comment. 
diff --git a/.github/workflows/run_workflows_weekly.yml b/.github/workflows/run_workflows_weekly.yml index 96368913..50937512 100644 --- a/.github/workflows/run_workflows_weekly.yml +++ b/.github/workflows/run_workflows_weekly.yml @@ -112,7 +112,7 @@ jobs: - name: Generate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest -k test_compile_python_workflows + run: cd sophios/ && pytest tests/test_python_api.py -k test_compile_python_workflows - name: Generate Sophios Validation Jsonschema if: always() @@ -123,7 +123,7 @@ jobs: # Sophios Python API workflows as well as the WIC Python API itself. - name: Validate Sophios Python API Workflows (*.py -> *.wic) if: always() - run: cd sophios/ && pytest tests/test_compile_python_workflows.py -k test_validate_generated_python_workflows + run: cd sophios/ && pytest tests/test_python_api.py -k test_validate_generated_python_workflows - name: PyTest CWL Builder if: always() diff --git a/docs/cwl_builder_sam3.md b/docs/cwl_builder_sam3.md new file mode 100644 index 00000000..2f6a846e --- /dev/null +++ b/docs/cwl_builder_sam3.md @@ -0,0 +1,336 @@ +# Building a CWL CommandLineTool in Python + +This walkthrough shows how to build a real CWL `CommandLineTool` using +`sophios.apis.python.cwl_builder`. + +The design goal is simple: + +- the required structure of the tool should be obvious at a glance, +- input and output names should come from Python names rather than raw string keys, +- and optional CWL details should feel like optional add-ons, not required boilerplate. + +The full working example lives in +[examples/scripts/sam3_cwl_builder.py](https://github.com/PolusAI/workflow-inference-compiler/blob/master/examples/scripts/sam3_cwl_builder.py). + +## The core idea + +There are only three required pieces: + +1. a tool name, +2. an `Inputs(...)` collection, +3. an `Outputs(...)` collection. 
+ +That means the basic shape always looks like this: + +```python +inputs = Inputs( + input=Input(cwl.directory, position=1), + output=Input(cwl.directory, position=2), +) + +outputs = Outputs( + output=Output(cwl.directory, from_input=inputs.output), +) + +tool = CommandLineTool("example", inputs, outputs) +``` + +Everything else is optional and chainable: + +```python +tool = ( + CommandLineTool("example", inputs, outputs) + .base_command("python", "main.py") + .docker("python:3.12") + .resources(cores=2, ram=4096) +) +``` + +That split is intentional. The constructor shows the tool contract. The chained calls describe the runtime and metadata details around that contract. + +## Why this is easier to read + +The old builder style asked you to mentally assemble the CLT while reading a long chain. + +The new style makes the shape visible immediately: + +- `Inputs(...)` gives names to inputs using Python keywords, +- `Outputs(...)` gives names to outputs the same way, +- `CommandLineTool(...)` requires those named collections up front. + +That helps non-experts because the code now reads more like: + +"this tool has these inputs and these outputs" + +and less like: + +"start a builder, keep chaining methods, and hope the required bits showed up somewhere in the middle." + +## Named inputs without raw string keys + +One of the important design constraints is that users should not have to write raw string names for input and output definitions. + +So instead of: + +```python +inputs = { + "input": ..., + "output": ..., +} +``` + +you write: + +```python +inputs = Inputs( + input=Input(cwl.directory, position=1), + output=Input(cwl.directory, position=2), +) +``` + +Those Python keyword names become the CWL parameter names. + +The same thing applies to outputs: + +```python +outputs = Outputs( + output=Output(cwl.directory, from_input=inputs.output), +) +``` + +Notice that `from_input=inputs.output` uses a real named input reference, not a raw string like `"output"`. 
+ +The other important convention is that CWL types live under the `cwl` namespace: + +```python +cwl.int +cwl.float +cwl.file +cwl.directory +``` + +That keeps CWL vocabulary visually separate from Python builtins and makes intent easier to scan. + +## How to think about inputs + +Each input answers two questions: + +1. what type of thing is this? +2. how does the underlying application receive it? + +Examples: + +```python +Input(cwl.directory, position=1) +Input(cwl.file, flag="--model", required=False) +Input(cwl.int, flag="--tile-size", required=False) +``` + +These read very close to application intent: + +- positional directory argument, +- optional file passed with `--model`, +- optional integer passed with `--tile-size`. + +Optional metadata can then be chained: + +```python +Input(cwl.file, flag="--model", required=False).label("Model override file").doc("Path to sam3.pt") +``` + +That is the intended use of chaining in this API: optional polish on top of a complete required core. + +## How to think about outputs + +Outputs follow the same pattern: + +```python +Output(cwl.directory, from_input=inputs.output) +Output(cwl.file, glob="results.json") +Output.stdout() +``` + +Again, the goal is to describe what the output means, not to hand-assemble `outputBinding` YAML. 
+ +## The SAM3 example + +```python +from pathlib import Path + +from sophios.apis.python.cwl_builder import CommandLineTool, Input, Inputs, Output, Outputs, cwl + + +inputs = Inputs( + input=Input(cwl.directory, position=1).label("Input Zarr dataset").doc("Path to input zarr dataset"), + output=Input(cwl.directory, position=2).label("Output segmentation Zarr").doc( + "Path for output segmentation zarr" + ), + model=Input(cwl.file, flag="--model", required=False) + .label("Model override file") + .doc("Path containing sam3.pt to override baked-in models/sam3"), + tile_size=Input(cwl.int, flag="--tile-size", required=False) + .label("Tile size") + .doc("Tile size for large slices (default 1024)"), + overlap=Input(cwl.int, flag="--overlap", required=False) + .label("Tile overlap") + .doc("Overlap between adjacent tiles in pixels (default 128)"), + iou_threshold=Input(cwl.float, flag="--iou-threshold", required=False) + .label("IoU threshold") + .doc("IoU threshold for matching labels across tiles (default 0.5)"), + batch_size=Input(cwl.int, flag="--batch-size", required=False) + .label("Batch size") + .doc("Number of tiles per GPU forward pass (default 8)"), + lora_weights=Input(cwl.file, flag="--lora-weights", required=False) + .label("LoRA weights") + .doc("Path to LoRA adapter weights (.pt file) - optional"), + lora_rank=Input(cwl.int, flag="--lora-rank", required=False) + .label("LoRA rank") + .doc("LoRA rank used when lora_weights is set (default 16)"), + lora_alpha=Input(cwl.int, flag="--lora-alpha", required=False) + .label("LoRA alpha") + .doc("LoRA alpha scaling factor used when lora_weights is set (default 32)"), +) + +outputs = Outputs( + output=Output(cwl.directory, from_input=inputs.output).label("Output segmentation Zarr"), +) + +tool = ( + CommandLineTool("sam3_ome_zarr_autosegmentation", inputs, outputs) + .describe( + "SAM3 OME Zarr autosegmentation", + "Run SAM3 autosegmentation on a zarr volume.\n" + "Models are baked into the container image at 
models/sam3, " + "so no model staging is required.", + ) + .edam() + .gpu(cuda_version_min="11.7", compute_capability="3.0", device_count_min=2) + .docker("polusai/ichnaea-api:latest") + .stage(inputs.output, writable=True) + .stage(inputs.input) + .resources(cores=4, ram=64000) + .base_command( + "/backend/.venv/bin/python", + "/backend/dagster_pipelines/jobs/autosegmentation/logic.py", + ) +) + +output_path = Path("sam3_ome_zarr_autosegmentation.cwl") +tool.save(output_path, validate=True) +``` + +## What the builder is hiding for you + +This API is supposed to absorb the repetitive CWL details that regularly trip people up: + +- nullable unions for optional values, +- `inputBinding.prefix` vs `inputBinding.position`, +- `outputBinding.glob` expressions derived from input names, +- `InitialWorkDirRequirement` entries for staged inputs, +- `InlineJavascriptRequirement` when helper-generated expressions are present, +- namespaced hints such as `cwltool:CUDARequirement`. + +That means most users only need to think about: + +- what command runs, +- what the inputs are, +- what the outputs are, +- and which optional runtime constraints apply. + +## Sane defaults + +For most tools, the happy path is: + +```python +CommandLineTool(name, inputs, outputs).base_command(...).docker(...).resources(...) +``` + +Everything else is optional. + +In particular: + +- `label` is optional, +- `doc` is optional, +- namespaces and schemas are optional, +- `InlineJavascriptRequirement` is added automatically when helper-generated expressions are present, +- resource requirements are optional, +- EDAM metadata is optional and available through `.edam()`. + +## Why you can trust the result + +There are two separate sources of confidence. + +### 1. 
The API narrows the common error surface + +The builder gives you named operations rather than raw nested dictionaries, which means fewer routine mistakes: + +- malformed optional types, +- incorrect binding placement, +- incorrect output glob expressions, +- missing requirement wrappers, +- missing namespace setup for common hints. + +### 2. Validation is built in + +When you call: + +```python +tool.save(output_path, validate=True) +``` + +or: + +```python +tool.validate() +``` + +the generated CLT is validated through the `cwltool` and schema-salad stack. + +That is a much stronger guarantee than "this happened to produce YAML". It means the generated document has gone through the same validation path users already trust. + +## Escape hatches + +The main API is intentionally structured, but escape hatches still exist for advanced cases: + +- `requirement(...)` +- `hint(...)` +- `argument(...)` +- `extra(...)` + +Those are for the unusual edges of CWL. They should be the exception, not the starting point. + +## Using a built CLT in the workflow DSL + +The CLT builder can also hand off directly to the workflow Python API without writing a `.cwl` file: + +```python +tool = CommandLineTool( + "echo_tool", + Inputs(message=Input(cwl.string, position=1)), + Outputs(out=Output.stdout()), +).stdout("stdout.txt") + +step = tool.to_step(step_name="say_hello") +step.inputs.message = "hello" + +workflow = Workflow([step], "wf") +``` + +That bridge stays intentionally small: + +- the builder still only knows how to render a CLT, +- the workflow API still only knows how to work with a parsed CLT document, +- and the in-memory handoff is handled through a tiny adapter layer. + +## Run the example + +From the repository root: + +```bash +PYTHONPATH=src python examples/scripts/sam3_cwl_builder.py +PYTHONPATH=src python examples/scripts/sam3_cwl_builder.py --validate +``` + +The first command writes the CLT. The second also validates it. 
+ +Validation requires `cwltool` and schema-salad to be installed in your Python environment. diff --git a/docs/cwl_builder_workflow.md b/docs/cwl_builder_workflow.md new file mode 100644 index 00000000..0b4611b0 --- /dev/null +++ b/docs/cwl_builder_workflow.md @@ -0,0 +1,328 @@ +# Using `cwl_builder` and the Workflow Python API Together + +Sophios now has two related Python surfaces: + +- `sophios.apis.python.cwl_builder` for authoring a single CWL `CommandLineTool` +- `sophios.apis.python.api` for wiring tools into a workflow with `Step` and `Workflow` + +Those APIs are intentionally separate, but they can be combined cleanly. + +This guide shows the intended end-to-end pattern: + +1. define a new tool in Python, +2. validate that tool as a real CWL `CommandLineTool`, +3. convert it into an in-memory `Step`, +4. compose it with the normal Sophios workflow DSL. + +The important part is that the handoff stays **in memory**. You do not need to write a temporary `.cwl` file just to use a freshly built tool inside a workflow. + +A runnable version of this pattern lives in +[examples/scripts/cwl_builder_workflow.py](https://github.com/PolusAI/workflow-inference-compiler/blob/master/examples/scripts/cwl_builder_workflow.py). + +## When to use this pattern + +This hybrid style is useful when: + +- a tool does not exist yet as a checked-in `.cwl` file, +- you want to generate a family of similar tools from Python, +- you want to validate the generated CLT before putting it into a workflow, +- or you want a workflow to mix generated tools with ordinary file-backed `Step(...)` objects. + +If you only need to build a single standalone CLT, start with {doc}`cwl_builder_sam3`. + +If you already have checked-in `.cwl` tools and only need to compose them, the workflow examples in {doc}`userguide` are still the right starting point. 
+ +## Mental model + +The cleanest way to think about the boundary is: + +- `CommandLineTool(...)` defines a **tool contract** +- `tool.validate()` checks that contract as real CWL +- `tool.to_step()` turns that contract into a **workflow node** +- `Workflow(...)` composes that node with other steps + +That separation is deliberate. + +The builder does not need to know about workflows. +The workflow DSL does not need to know how the tool was authored. +The bridge is small: it passes a normal CWL document from one side to the other. + +## What we will build + +We will build a tiny tool called `emit_text`: + +- it accepts one string input named `message`, +- it runs `echo`, +- it captures stdout into a file, +- and it exposes that file as a normal CWL `File` output. + +Then we will: + +- convert that built tool into a Sophios `Step`, +- feed its file output into the existing checked-in [`cat.cwl`](https://github.com/PolusAI/workflow-inference-compiler/blob/master/cwl_adapters/cat.cwl), +- expose a workflow input called `message`, +- and expose a workflow output called `result`. + +So the final workflow shape is: + +```text +workflow input "message" + -> emit_text (generated in memory) + -> cat.cwl (file-backed step) + -> workflow output "result" +``` + +## Full example + +The snippet below assumes you are running from the repository root, so the checked-in adapter path `cwl_adapters/cat.cwl` is valid as written. 
+ +```python +from pathlib import Path + +from sophios.apis.python import ( + CommandLineTool, + Input, + Inputs, + Output, + Outputs, + Step, + Workflow, + cwl, +) + + +def build_emit_text_tool() -> CommandLineTool: + inputs = Inputs( + message=Input(cwl.string, position=1) + .label("Message") + .doc("Text to print to stdout"), + ) + + outputs = Outputs( + file=Output(cwl.file, glob="stdout") + .label("Captured stdout") + .doc("The file produced by redirecting stdout"), + ) + + return ( + CommandLineTool("emit_text", inputs, outputs) + .describe( + "Emit a message", + "Small example CLT built in Python and consumed by the workflow DSL.", + ) + .base_command("echo") + .stdout("stdout") + ) + + +def build_workflow() -> Workflow: + emit_tool = build_emit_text_tool() + + # Optional but recommended while developing new generated tools. + # This requires cwltool/schema-salad in your Python environment. + emit_tool.validate() + + # No temporary file is needed here. The CLT is handed to Step in memory. + emit_step = emit_tool.to_step(step_name="emit_text") + + # This is an ordinary checked-in CWL adapter. + cat_step = Step(Path("cwl_adapters") / "cat.cwl") + + workflow = Workflow([emit_step, cat_step], "builder_and_pyapi_demo") + + # Be explicit about the workflow interface. + workflow.add_input("message", cwl.string) + + # Recommended explicit binding style. + emit_step.inputs.message = workflow.inputs.message + cat_step.inputs.file = emit_step.outputs.file + + # Expose a workflow output. + workflow.outputs.result = cat_step.outputs.output + return workflow + + +workflow = build_workflow() +compiler_info = workflow.compile(write_to_disk=True) +``` + +## Why this example is structured this way + +There are a few details worth calling out. + +### 1. 
The CLT is complete before it becomes a step + +The `emit_text` tool is a real `CommandLineTool` first: + +```python +inputs = Inputs( + message=Input(cwl.string, position=1), +) + +outputs = Outputs( + file=Output(cwl.file, glob="stdout"), +) + +tool = ( + CommandLineTool("emit_text", inputs, outputs) + .base_command("echo") + .stdout("stdout") +) +``` + +That matters because the builder API is responsible for answering tool-level questions: + +- what are the inputs, +- what are the outputs, +- what command runs, +- how are stdout/stderr/files represented. + +The workflow API should not need to rebuild that information later. + +### 2. `tool.validate()` happens at the tool boundary + +Validation belongs naturally on the builder side: + +```python +emit_tool.validate() +``` + +That gives you confidence that the generated CLT is valid CWL **before** it participates in a larger workflow. + +For self-authored tools, that is usually the best debugging boundary: + +- first make the tool valid, +- then compose it into the workflow. + +### 3. `tool.to_step()` is the bridge + +This is the key handoff: + +```python +emit_step = emit_tool.to_step(step_name="emit_text") +``` + +That call: + +- renders the CLT to a standard CWL document, +- parses it through the Python workflow API, +- and returns a normal `Step`. + +After that, you work with the object exactly like any other `Step`: + +```python +emit_step.inputs.message = workflow.inputs.message +cat_step.inputs.file = emit_step.outputs.file +``` + +That is the main design goal of the bridge: once a built tool becomes a step, it should feel boring. + +### 4. 
Workflow bindings should stay explicit + +This guide uses the explicit form: + +```python +emit_step.inputs.message = workflow.inputs.message +cat_step.inputs.file = emit_step.outputs.file +workflow.outputs.result = cat_step.outputs.output +``` + +That is easier to read than the legacy shorthand and makes directionality obvious: + +- `inputs.*` are places you can bind values, +- `outputs.*` are places you can read values from. + +The old shorthand still exists for compatibility, but it is not the best style to teach. + +### 5. Workflow interface should be declared deliberately + +This line is important: + +```python +workflow.add_input("message", cwl.string) +``` + +Yes, the compatibility layer can still create workflow inputs implicitly in some situations. +But explicit workflow inputs are much easier to reason about, especially when the workflow is meant to be reused or reviewed by someone else. + +## What gets written to disk + +Only the compiled workflow artifacts are written when you call: + +```python +workflow.compile(write_to_disk=True) +``` + +The generated `emit_text` CLT does **not** need to be written as a standalone `.cwl` file first. + +That means this pattern is suitable for: + +- generated tools, +- parameterized tools, +- short-lived tools used only inside a larger workflow, +- and tests that want to build tools programmatically. + +## How to trust this pattern + +There are two separate confidence checks here, and they complement each other. + +### 1. Tool confidence + +`emit_tool.validate()` checks the generated CLT as a real CWL document. + +That tells you: + +- the tool structure is valid, +- the CWL fields are in the right shape, +- and the generated CLT is not just "some YAML that looks plausible". + +### 2. Workflow confidence + +`workflow.compile(...)` checks that the generated step can participate in the normal Sophios compilation path. 
+ +That tells you: + +- the workflow DSL can consume the built tool, +- the step ports are wired correctly, +- and the result compiles into the same pipeline machinery as any other Sophios workflow. + +Those are different guarantees, and you usually want both. + +## Recommended workflow for teams + +For day-to-day development, this sequence tends to work well: + +1. build the tool with `CommandLineTool(...)` +2. call `tool.validate()` +3. convert it with `tool.to_step()` +4. wire it into a `Workflow(...)` +5. call `workflow.compile(...)` +6. only then move on to full execution + +That keeps failures close to the layer that caused them. + +## Summary + +The combined Python story is now: + +- use `cwl_builder` to define a proper CWL tool, +- validate it while it is still a tool, +- turn it into a `Step` in memory, +- compose it with ordinary Sophios workflow steps. + +That gives you the best of both worlds: + +- the rigor of a real CWL `CommandLineTool`, +- and the composability of the Sophios workflow Python API. + +## Run the example script + +From the repository root: + +```bash +PYTHONPATH=src python examples/scripts/cwl_builder_workflow.py --validate +PYTHONPATH=src python examples/scripts/cwl_builder_workflow.py --run +``` + +The first command validates the generated CLTs and compiles the workflow. +The second runs the full demo workflow through `Workflow.run()`. 
diff --git a/docs/index.rst b/docs/index.rst index 68363ec2..ff839c1a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,6 +9,8 @@ Sophios documentation installguide.md tutorials/tutorials.rst userguide.md + cwl_builder_sam3.md + cwl_builder_workflow.md advanced.md validation.md dev/installguide.md diff --git a/docs/tutorials/tutorials.rst b/docs/tutorials/tutorials.rst index ce51b320..6144793b 100644 --- a/docs/tutorials/tutorials.rst +++ b/docs/tutorials/tutorials.rst @@ -8,4 +8,4 @@ Tutorials helloworld.md multistep.md subworkflows.md - naming_conventions.md \ No newline at end of file + naming_conventions.md diff --git a/docs/userguide.md b/docs/userguide.md index aef6caf9..585ebade 100644 --- a/docs/userguide.md +++ b/docs/userguide.md @@ -82,6 +82,10 @@ Note that this is one key difference between sophios and CWL. In CWL, all inputs In addition to YAML based language for building workflows Sophios also provides a Python API. The aspirational goal of this API is to be close to regular usage of Python. This API leverages YAML based syntax by transforming the Python workflow internally into a regular Sophios YAML workflow. All the Python API examples discussed here can be found in directory [`examples/scripts`](https://github.com/PolusAI/workflow-inference-compiler/tree/master/examples/scripts) in the Sophios repository. +Sophios also provides a separate Python API for authoring a single CWL `CommandLineTool` directly, without going through the workflow DSL. If you want to generate CLTs from Python and validate the result with `cwltool` and schema-salad, see {doc}`Building a CWL CommandLineTool in Python `. + +If you want to build a CLT in Python and then compose it directly into a `Workflow` without writing an intermediate `.cwl` file, see {doc}`Using cwl_builder and the Workflow Python API Together `. + ### basics Let us take the most basic workflow *`hello world`*. This is how we write it in YAML syntax. 
diff --git a/examples/scripts/cwl_builder_workflow.py b/examples/scripts/cwl_builder_workflow.py new file mode 100644 index 00000000..49c4fc75 --- /dev/null +++ b/examples/scripts/cwl_builder_workflow.py @@ -0,0 +1,152 @@ +"""Build and compose in-memory CWL tools with the Sophios workflow Python API.""" + +from __future__ import annotations + +from argparse import ArgumentParser +from pathlib import Path +import sys + +from sophios.apis.python import ( + CommandLineTool, + Input, + Inputs, + Output, + Outputs, + Workflow, + cwl, +) + + +def build_emit_text_tool(python_executable: str | None = None) -> CommandLineTool: + """Create a tiny CLT that writes a message to stdout.""" + interpreter = python_executable or sys.executable + inputs = Inputs( + message=Input(cwl.string, position=1) + .label("Message") + .doc("Text to print to stdout."), + ) + outputs = Outputs( + text_file=Output(cwl.file, glob="stdout.txt") + .label("Captured stdout") + .doc("Text emitted by the tool, captured as a file."), + ) + return ( + CommandLineTool("emit_text", inputs, outputs) + .describe( + "Emit text", + "Small generated CLT that prints one message and captures stdout.", + ) + .base_command(interpreter, "-c") + .argument("import sys; print(sys.argv[1])", position=0) + .stdout("stdout.txt") + ) + + +def build_read_text_tool(python_executable: str | None = None) -> CommandLineTool: + """Create a CLT that reads a text file and returns its contents as a string.""" + interpreter = python_executable or sys.executable + inputs = Inputs( + text_file=Input(cwl.file, position=1) + .label("Text file") + .doc("Input file whose contents should be returned."), + ) + outputs = Outputs( + result=Output( + cwl.string, + glob="stdout.txt", + load_contents=True, + output_eval="$(self[0].contents)", + ) + .label("Result text") + .doc("String value loaded from the generated stdout file."), + ) + return ( + CommandLineTool("read_text_file", inputs, outputs) + .describe( + "Read text", + "Generated CLT 
that loads a file and prints its text content.", + ) + .base_command(interpreter, "-c") + .argument( + ( + "from pathlib import Path; import sys; " + "print(Path(sys.argv[1]).read_text(encoding='utf-8').strip())" + ), + position=0, + ) + .stdout("stdout.txt") + ) + + +def build_workflow(message: str = "hello from cwl_builder") -> Workflow: + """Create a workflow that chains two generated CLTs entirely in memory.""" + emit_step = build_emit_text_tool().to_step(step_name="emit_text") + read_step = build_read_text_tool().to_step(step_name="read_text") + + emit_step.inputs.message = message + read_step.inputs.text_file = emit_step.outputs.text_file + + workflow = Workflow([emit_step, read_step], "cwl_builder_workflow_demo") + workflow.outputs.result = read_step.outputs.result + return workflow + + +def main() -> int: + """Compile or run the builder-plus-workflow demo.""" + parser = ArgumentParser(description=__doc__) + parser.add_argument( + "--message", + default="hello from cwl_builder", + help="Message passed through the generated tools.", + ) + parser.add_argument( + "--validate", + action="store_true", + help="Validate the generated CLTs with cwltool/schema-salad first.", + ) + parser.add_argument( + "--run", + action="store_true", + help="Execute the workflow locally after building it.", + ) + parser.add_argument( + "--basepath", + default="autogenerated", + help="Directory used when running the workflow locally.", + ) + parser.add_argument( + "--write-tools", + type=Path, + help="Optional directory where the generated CLTs should be written for inspection.", + ) + args = parser.parse_args() + + emit_tool = build_emit_text_tool() + read_tool = build_read_text_tool() + + if args.validate: + emit_tool.validate() + read_tool.validate() + print("Validated generated CLTs.") + + if args.write_tools is not None: + args.write_tools.mkdir(parents=True, exist_ok=True) + emit_path = emit_tool.save(args.write_tools / "emit_text.cwl") + read_path = 
read_tool.save(args.write_tools / "read_text_file.cwl") + print(f"Wrote {emit_path}") + print(f"Wrote {read_path}") + + workflow = build_workflow(message=args.message) + + if not args.run: + workflow.compile(write_to_disk=True) + print(f"Compiled workflow {workflow.process_name} to autogenerated/") + return 0 + + workflow.run(basepath=args.basepath) + print(f"Ran workflow {workflow.process_name}.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/scripts/sam3_cwl_builder.py b/examples/scripts/sam3_cwl_builder.py new file mode 100644 index 00000000..3d231c93 --- /dev/null +++ b/examples/scripts/sam3_cwl_builder.py @@ -0,0 +1,88 @@ +"""Build a CWL CommandLineTool for SAM3 OME Zarr autosegmentation.""" + +from argparse import ArgumentParser +from pathlib import Path + +from sophios.apis.python.cwl_builder import CommandLineTool, Input, Inputs, Output, Outputs, cwl + + +def build_tool() -> CommandLineTool: + """Create the SAM3 autosegmentation CLT using the declarative API.""" + inputs = Inputs( + input=Input(cwl.directory, position=1).label("Input Zarr dataset").doc("Path to input zarr dataset"), + output=Input(cwl.directory, position=2).label("Output segmentation Zarr").doc( + "Path for output segmentation zarr" + ), + model=Input(cwl.file, flag="--model", required=False) + .label("Model override file") + .doc("Path containing sam3.pt to override baked-in models/sam3"), + tile_size=Input(cwl.int, flag="--tile-size", required=False) + .label("Tile size") + .doc("Tile size for large slices (default 1024)"), + overlap=Input(cwl.int, flag="--overlap", required=False) + .label("Tile overlap") + .doc("Overlap between adjacent tiles in pixels (default 128)"), + iou_threshold=Input(cwl.float, flag="--iou-threshold", required=False) + .label("IoU threshold") + .doc("IoU threshold for matching labels across tiles (default 0.5)"), + batch_size=Input(cwl.int, flag="--batch-size", required=False) + .label("Batch size") + .doc("Number of 
tiles per GPU forward pass (default 8)"), + lora_weights=Input(cwl.file, flag="--lora-weights", required=False) + .label("LoRA weights") + .doc("Path to LoRA adapter weights (.pt file) - optional"), + lora_rank=Input(cwl.int, flag="--lora-rank", required=False) + .label("LoRA rank") + .doc("LoRA rank used when lora_weights is set (default 16)"), + lora_alpha=Input(cwl.int, flag="--lora-alpha", required=False) + .label("LoRA alpha") + .doc("LoRA alpha scaling factor used when lora_weights is set (default 32)"), + ) + outputs = Outputs(output=Output(cwl.directory, from_input=inputs.output).label("Output segmentation Zarr")) + + return ( + CommandLineTool("sam3_ome_zarr_autosegmentation", inputs, outputs) + .describe( + "SAM3 OME Zarr autosegmentation", + "Run SAM3 autosegmentation on a zarr volume.\n" + "Models are baked into the container image at models/sam3, " + "so no model staging is required.", + ) + .edam() + .gpu(cuda_version_min="11.7", compute_capability="3.0", device_count_min=2) + .docker("polusai/ichnaea-api:latest") + .stage(inputs.output, writable=True) + .stage(inputs.input) + .resources(cores=4, ram=64000) + .base_command( + "/backend/.venv/bin/python", + "/backend/dagster_pipelines/jobs/autosegmentation/logic.py", + ) + ) + + +def main() -> int: + """Write the generated CLT to disk and optionally validate it.""" + parser = ArgumentParser(description=__doc__) + parser.add_argument( + "--output", + type=Path, + default=Path(__file__).with_name("sam3_ome_zarr_autosegmentation.cwl"), + help="Where to write the generated CWL file.", + ) + parser.add_argument( + "--validate", + action="store_true", + help="Validate the generated CLT with cwltool/schema-salad before returning.", + ) + args = parser.parse_args() + + output_path = build_tool().save(args.output, validate=args.validate) + print(f"Wrote {output_path}") + if args.validate: + print("Validation succeeded.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git 
a/src/sophios/apis/python/__init__.py b/src/sophios/apis/python/__init__.py index 62c36be6..1087a8ea 100644 --- a/src/sophios/apis/python/__init__.py +++ b/src/sophios/apis/python/__init__.py @@ -1,68 +1,30 @@ -"""Python workflow API exports.""" +"""Python workflow and CWL builder API exports.""" -from .api import ( - InvalidCLTError, - InvalidInputValueError, - InvalidLinkError, - InvalidStepError, - MissingRequiredValueError, - ProcessInput, - ProcessOutput, - Step, - Workflow, - WorkflowInputReference, - extract_tools_paths_NONPORTABLE, - global_config, - set_input_Step_Workflow, -) -from .cwl_builder import ( - CWLBuilderValidationError, - CommandArgument, - CommandInput, - CommandLineBinding, - CommandLineToolBuilder, - CommandOutput, - CommandOutputBinding, - Dirent, - DockerRequirement, - EnvironmentDef, - EnvVarRequirement, - Field, - FieldSpec, - InitialWorkDirRequirement, - Input, - InputSpec, - InlineJavascriptRequirement, - InplaceUpdateRequirement, - LoadListingRequirement, - NetworkAccess, - Output, - OutputSpec, - ResourceRequirement, - SchemaDefRequirement, - SecondaryFile, - ShellCommandRequirement, - SoftwarePackage, - SoftwareRequirement, - ToolTimeLimit, - Type, - ValidationResult, - WorkReuse, - array_type, - enum_type, - record_field, - record_type, - secondary_file, - validate_cwl_document, -) +from importlib import import_module +from typing import TYPE_CHECKING, Any -__all__ = [ + +_API_EXPORTS = { + "InvalidLinkError", + "InvalidStepError", + "MissingRequiredValueError", + "Step", + "Workflow", +} + +_ERROR_EXPORTS = { + "InvalidCLTError", + "InvalidInputValueError", + "InvalidLinkError", + "InvalidStepError", + "MissingRequiredValueError", +} + +_CWL_BUILDER_EXPORTS = { "CWLBuilderValidationError", "CommandArgument", - "CommandInput", "CommandLineBinding", - "CommandLineToolBuilder", - "CommandOutput", + "CommandLineTool", "CommandOutputBinding", "Dirent", "DockerRequirement", @@ -70,42 +32,106 @@ "EnvVarRequirement", "Field", 
"FieldSpec", - "InvalidCLTError", - "InvalidInputValueError", - "InvalidLinkError", - "InvalidStepError", "InitialWorkDirRequirement", - "Input", - "InputSpec", "InlineJavascriptRequirement", "InplaceUpdateRequirement", + "Input", + "InputSpec", + "Inputs", "LoadListingRequirement", - "MissingRequiredValueError", "NetworkAccess", "Output", "OutputSpec", - "ProcessInput", - "ProcessOutput", + "Outputs", "ResourceRequirement", "SchemaDefRequirement", "SecondaryFile", "ShellCommandRequirement", "SoftwarePackage", "SoftwareRequirement", - "Step", "ToolTimeLimit", - "Type", - "Workflow", - "WorkflowInputReference", + "ValidationResult", "WorkReuse", "array_type", + "cwl", "enum_type", - "extract_tools_paths_NONPORTABLE", - "global_config", "record_field", "record_type", "secondary_file", - "set_input_Step_Workflow", - "ValidationResult", + "step_from_command_line_tool", "validate_cwl_document", -] +} + +__all__ = sorted(_API_EXPORTS | _ERROR_EXPORTS | _CWL_BUILDER_EXPORTS) + + +if TYPE_CHECKING: + from ._errors import ( + InvalidCLTError, + InvalidInputValueError, + InvalidLinkError, + InvalidStepError, + MissingRequiredValueError, + ) + from .api import ( + Step, + Workflow, + ) + from .cwl_builder import ( + CWLBuilderValidationError, + CommandArgument, + CommandLineBinding, + CommandLineTool, + CommandOutputBinding, + Dirent, + DockerRequirement, + EnvironmentDef, + EnvVarRequirement, + Field, + FieldSpec, + InitialWorkDirRequirement, + InlineJavascriptRequirement, + InplaceUpdateRequirement, + Input, + InputSpec, + Inputs, + LoadListingRequirement, + NetworkAccess, + Output, + OutputSpec, + Outputs, + ResourceRequirement, + SchemaDefRequirement, + SecondaryFile, + ShellCommandRequirement, + SoftwarePackage, + SoftwareRequirement, + ToolTimeLimit, + ValidationResult, + WorkReuse, + array_type, + cwl, + enum_type, + record_field, + record_type, + secondary_file, + step_from_command_line_tool, + validate_cwl_document, + ) + + +def __getattr__(name: str) -> Any: + if 
name in _ERROR_EXPORTS: + module = import_module("._errors", __name__) + return getattr(module, name) + if name in _API_EXPORTS: + module = import_module(".api", __name__) + return getattr(module, name) + if name in _CWL_BUILDER_EXPORTS: + module = import_module(".cwl_builder", __name__) + return getattr(module, name) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: + return sorted(set(globals()) | set(__all__)) diff --git a/src/sophios/apis/python/_api_config.py b/src/sophios/apis/python/_api_config.py new file mode 100644 index 00000000..ff604b99 --- /dev/null +++ b/src/sophios/apis/python/_api_config.py @@ -0,0 +1,9 @@ +"""Default runtime values for the Python workflow API.""" + +from pathlib import Path + +DEFAULT_RUN_ARGS: dict[str, str] = { + "cwl_runner": "cwltool", + "container_engine": "docker", + "pull_dir": str(Path().cwd()), +} diff --git a/src/sophios/apis/python/_cwl_builder_namespaces.py b/src/sophios/apis/python/_cwl_builder_namespaces.py new file mode 100644 index 00000000..08e46246 --- /dev/null +++ b/src/sophios/apis/python/_cwl_builder_namespaces.py @@ -0,0 +1,103 @@ +"""Private namespace objects for the public CWL builder. 
+ +The builder surface is intentionally small: + +- ``cwl`` is the only CWL vocabulary namespace +- ``Field``, ``Input``, and ``Output`` are the actual spec classes +- ``Inputs`` and ``Outputs`` are named collections that derive parameter names + from Python keyword arguments +""" + +from typing import Any, Iterator, Mapping, TypeVar + +from ._cwl_builder_specs import FieldSpec, InputSpec, OutputSpec +from ._cwl_builder_support import _canonicalize_type, _merge_if_set, _record_type_payload + + +class _CWLNamespace: + """Namespace for CWL type vocabulary and composite types.""" + + __slots__ = () + + null = "null" + boolean = "boolean" + int = "int" + long = "long" + float = "float" + double = "double" + string = "string" + file = "File" + directory = "Directory" + + def optional(self, type_: Any) -> list[Any]: + """Wrap a CWL type in a nullable union.""" + canonical = _canonicalize_type(type_) + if isinstance(canonical, list) and self.null in canonical: + return canonical + return [self.null, canonical] + + def array(self, items: Any) -> dict[str, Any]: + """Create a CWL array type.""" + return {"type": "array", "items": _canonicalize_type(items)} + + def enum(self, *symbols: str, name: str | None = None) -> dict[str, Any]: + """Create a CWL enum type.""" + payload: dict[str, Any] = {"type": "enum", "symbols": list(symbols)} + _merge_if_set(payload, "name", name) + return payload + + def record( + self, + fields: Mapping[str, FieldSpec] | list[FieldSpec | dict[str, Any]], + *, + name: str | None = None, + ) -> dict[str, Any]: + """Create a CWL record type.""" + return _record_type_payload(fields, name=name) + + +cwl = _CWLNamespace() + +# Intentional aliasing: these are the real immutable spec objects, not thin +# wrapper namespaces. Making them directly callable keeps the required shape +# obvious: Input(type, ...), Output(type, ...), Field(type, ...). 
+Field = FieldSpec +Input = InputSpec +Output = OutputSpec + + +SpecT = TypeVar("SpecT", InputSpec, OutputSpec) + + +class _NamedCollection(Mapping[str, SpecT]): + _items: dict[str, SpecT] + + def __init__(self, **specs: SpecT) -> None: + self._items = {name: spec.named(name) for name, spec in specs.items()} + + def __getitem__(self, key: str) -> SpecT: + return self._items[key] + + def __iter__(self) -> Iterator[str]: + return iter(self._items) + + def __len__(self) -> int: + return len(self._items) + + def __getattr__(self, name: str) -> SpecT: + try: + return self._items[name] + except KeyError as exc: + raise AttributeError(name) from exc + + def to_dict(self) -> dict[str, Any]: + """Render the named collection into a CWL parameter mapping.""" + return {name: spec.to_dict() for name, spec in self._items.items()} + + +class Inputs(_NamedCollection[InputSpec]): + """Named CLT inputs. Names come from Python keyword arguments.""" + + +class Outputs(_NamedCollection[OutputSpec]): + """Named CLT outputs. Names come from Python keyword arguments.""" diff --git a/src/sophios/apis/python/_cwl_builder_specs.py b/src/sophios/apis/python/_cwl_builder_specs.py new file mode 100644 index 00000000..f9182733 --- /dev/null +++ b/src/sophios/apis/python/_cwl_builder_specs.py @@ -0,0 +1,765 @@ +"""Private dataclasses for the CWL builder.""" + +from __future__ import annotations + +# pylint: disable=missing-function-docstring,too-few-public-methods +# pylint: disable=too-many-instance-attributes,too-many-arguments +# pylint: disable=too-many-locals,redefined-builtin,too-many-lines +# These frozen dataclasses mirror the CWL schema closely, so field-rich +# constructors and small fluent helpers are intentional rather than accidental. 
+ +from dataclasses import dataclass, field, fields as dataclass_fields +from typing import Any, ClassVar, Mapping, TypeVar, cast + +from ._cwl_builder_support import ( + _SUPPORT, + _apply_required, + _basename_expression, + _canonicalize_type, + _input_expression, + _merge_if_present, + _merge_if_set, + _named_parameter, + _optional_binding, + _record_type_payload, + _render, + _render_doc, +) + + +FrozenSpecT = TypeVar("FrozenSpecT") + + +def _replace_frozen(obj: FrozenSpecT, **changes: Any) -> FrozenSpecT: + """Copy a frozen dataclass-like object while overriding selected fields.""" + clone = object.__new__(obj.__class__) + values = { + item.name: getattr(obj, item.name) + for item in dataclass_fields(cast(Any, obj)) + } + values.update(changes) + for name, value in values.items(): + object.__setattr__(clone, name, value) + return clone + + +@dataclass(frozen=True, slots=True) +class SecondaryFile: + """A CWL secondary file pattern.""" + + pattern: Any + required: bool | str | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> str | dict[str, Any]: + if self.required is None and not self.extra and isinstance(self.pattern, str): + return self.pattern + payload = {"pattern": _render(self.pattern)} + _merge_if_set(payload, "required", self.required) + payload.update(_render(self.extra)) + return payload + + +def secondary_file(pattern: Any, *, required: bool | str | None = None, **extra: Any) -> SecondaryFile: + """Create a secondary file specification.""" + return SecondaryFile(pattern=pattern, required=required, extra=dict(extra)) + + +@dataclass(frozen=True, slots=True) +class Dirent: + """A CWL InitialWorkDirRequirement listing entry.""" + + entry: Any + entryname: str | None = None + writable: bool | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + payload = {"entry": _render(self.entry)} + _merge_if_set(payload, "entryname", self.entryname) + 
_merge_if_set(payload, "writable", self.writable) + payload.update(_render(self.extra)) + return payload + + @classmethod + def from_input( + cls, + reference: Any, + *, + writable: bool = False, + entryname: str | None = None, + extra: dict[str, Any] | None = None, + ) -> Dirent: + name = _named_parameter(reference, kind="input") + return cls( + entry=_input_expression(name), + entryname=entryname or _basename_expression(name), + writable=writable, + extra=dict(extra or {}), + ) + + +@dataclass(frozen=True, slots=True) +class EnvironmentDef: + """An EnvVarRequirement entry.""" + + env_name: str + env_value: str + + def to_dict(self) -> dict[str, str]: + return {"envName": self.env_name, "envValue": self.env_value} + + +@dataclass(frozen=True, slots=True) +class CommandLineBinding: + """A CWL input binding or argument binding.""" + + position: int | float | None = None + prefix: str | None = None + separate: bool | None = None + item_separator: str | None = None + value_from: Any = None + shell_quote: bool | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + payload: dict[str, Any] = {} + _merge_if_set(payload, "position", self.position) + _merge_if_set(payload, "prefix", self.prefix) + _merge_if_set(payload, "separate", self.separate) + _merge_if_set(payload, "itemSeparator", self.item_separator) + _merge_if_set(payload, "valueFrom", self.value_from) + _merge_if_set(payload, "shellQuote", self.shell_quote) + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class CommandOutputBinding: + """A CWL output binding.""" + + glob: Any = None + load_contents: bool | None = None + output_eval: str | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + payload: dict[str, Any] = {} + _merge_if_set(payload, "glob", self.glob) + _merge_if_set(payload, "loadContents", self.load_contents) + _merge_if_set(payload, 
"outputEval", self.output_eval) + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class CommandArgument: + """A structured CWL command-line argument.""" + + value: Any = None + binding: CommandLineBinding | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_yaml(self) -> str | dict[str, Any]: + binding_dict = {} if self.binding is None else self.binding.to_dict() + if self.value is None and not binding_dict and not self.extra: + return "" + if self.value is not None and not binding_dict and not self.extra and isinstance(self.value, str): + return str(self.value) + payload = dict(binding_dict) + _merge_if_set(payload, "valueFrom", self.value) + payload.update(_render(self.extra)) + return payload + + +class _RequirementSpec: + class_name: ClassVar[str] + + def to_fields(self) -> dict[str, Any]: + raise NotImplementedError + + +@dataclass(frozen=True, slots=True) +class DockerRequirement(_RequirementSpec): + """DockerRequirement helper.""" + + docker_pull: str | None = None + docker_load: str | None = None + docker_file: str | dict[str, Any] | None = None + docker_import: str | None = None + docker_image_id: str | None = None + docker_output_directory: str | None = None + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "DockerRequirement" + + def to_fields(self) -> dict[str, Any]: + payload: dict[str, Any] = {} + _merge_if_set(payload, "dockerPull", self.docker_pull) + _merge_if_set(payload, "dockerLoad", self.docker_load) + _merge_if_set(payload, "dockerFile", self.docker_file) + _merge_if_set(payload, "dockerImport", self.docker_import) + _merge_if_set(payload, "dockerImageId", self.docker_image_id) + _merge_if_set(payload, "dockerOutputDirectory", self.docker_output_directory) + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class InlineJavascriptRequirement(_RequirementSpec): + """InlineJavascriptRequirement 
helper.""" + + expression_lib: list[str] | None = None + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "InlineJavascriptRequirement" + + def to_fields(self) -> dict[str, Any]: + payload: dict[str, Any] = {} + if self.expression_lib: + payload["expressionLib"] = list(self.expression_lib) + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class SchemaDefRequirement(_RequirementSpec): + """SchemaDefRequirement helper.""" + + types: list[Any] + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "SchemaDefRequirement" + + def to_fields(self) -> dict[str, Any]: + payload = {"types": [_canonicalize_type(type_) for type_ in self.types]} + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class LoadListingRequirement(_RequirementSpec): + """LoadListingRequirement helper.""" + + load_listing: str + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "LoadListingRequirement" + + def to_fields(self) -> dict[str, Any]: + payload = {"loadListing": self.load_listing} + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class ShellCommandRequirement(_RequirementSpec): + """ShellCommandRequirement helper.""" + + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "ShellCommandRequirement" + + def to_fields(self) -> dict[str, Any]: + return {key: _render(value) for key, value in self.extra.items()} + + +@dataclass(frozen=True, slots=True) +class SoftwarePackage: + """A SoftwareRequirement package entry.""" + + package: str + version: list[str] | None = None + specs: list[str] | None = None + extra: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + payload = {"package": self.package} + _merge_if_set(payload, "version", self.version) + _merge_if_set(payload, "specs", self.specs) + 
payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class SoftwareRequirement(_RequirementSpec): + """SoftwareRequirement helper.""" + + packages: list[SoftwarePackage | dict[str, Any]] + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "SoftwareRequirement" + + def to_fields(self) -> dict[str, Any]: + payload = {"packages": [_render(package) for package in self.packages]} + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class InitialWorkDirRequirement(_RequirementSpec): + """InitialWorkDirRequirement helper.""" + + listing: Any + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "InitialWorkDirRequirement" + + def to_fields(self) -> dict[str, Any]: + payload = {"listing": _render(self.listing)} + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class EnvVarRequirement(_RequirementSpec): + """EnvVarRequirement helper.""" + + env_def: list[EnvironmentDef | dict[str, Any]] + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "EnvVarRequirement" + + def to_fields(self) -> dict[str, Any]: + payload = {"envDef": [_render(item) for item in self.env_def]} + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class ResourceRequirement(_RequirementSpec): + """ResourceRequirement helper.""" + + cores_min: int | float | str | None = None + cores_max: int | float | str | None = None + ram_min: int | float | str | None = None + ram_max: int | float | str | None = None + tmpdir_min: int | float | str | None = None + tmpdir_max: int | float | str | None = None + outdir_min: int | float | str | None = None + outdir_max: int | float | str | None = None + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "ResourceRequirement" + + def to_fields(self) -> dict[str, Any]: + 
payload: dict[str, Any] = {} + _merge_if_set(payload, "coresMin", self.cores_min) + _merge_if_set(payload, "coresMax", self.cores_max) + _merge_if_set(payload, "ramMin", self.ram_min) + _merge_if_set(payload, "ramMax", self.ram_max) + _merge_if_set(payload, "tmpdirMin", self.tmpdir_min) + _merge_if_set(payload, "tmpdirMax", self.tmpdir_max) + _merge_if_set(payload, "outdirMin", self.outdir_min) + _merge_if_set(payload, "outdirMax", self.outdir_max) + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class NetworkAccess(_RequirementSpec): + """NetworkAccess helper.""" + + network_access: bool | str + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "NetworkAccess" + + def to_fields(self) -> dict[str, Any]: + payload = {"networkAccess": self.network_access} + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class WorkReuse(_RequirementSpec): + """WorkReuse helper.""" + + enable_reuse: bool | str + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "WorkReuse" + + def to_fields(self) -> dict[str, Any]: + payload = {"enableReuse": self.enable_reuse} + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class InplaceUpdateRequirement(_RequirementSpec): + """InplaceUpdateRequirement helper.""" + + inplace_update: bool = True + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "InplaceUpdateRequirement" + + def to_fields(self) -> dict[str, Any]: + payload = {"inplaceUpdate": self.inplace_update} + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True) +class ToolTimeLimit(_RequirementSpec): + """ToolTimeLimit helper.""" + + timelimit: int | str + extra: dict[str, Any] = field(default_factory=dict) + + class_name: ClassVar[str] = "ToolTimeLimit" + + def to_fields(self) -> dict[str, Any]: + payload = 
{"timelimit": self.timelimit} + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True, init=False) +class FieldSpec: + """A record field definition.""" + + type_: Any + name: str | None = None + label_text: str | None = None + doc_text: str | list[str] | None = None + default_value: Any = _SUPPORT.unset + extra: dict[str, Any] = field(default_factory=dict) + + def __init__( + self, + type_: Any, + *, + name: str | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + default: Any = _SUPPORT.unset, + extra: dict[str, Any] | None = None, + ) -> None: + object.__setattr__(self, "type_", type_) + object.__setattr__(self, "name", name) + object.__setattr__(self, "label_text", label) + object.__setattr__(self, "doc_text", doc) + object.__setattr__(self, "default_value", default) + object.__setattr__(self, "extra", dict(extra or {})) + + @classmethod + def array(cls, items: Any, **kwargs: Any) -> FieldSpec: + return cls({"type": "array", "items": _canonicalize_type(items)}, **kwargs) + + @classmethod + def enum(cls, *symbols: str, name: str | None = None, **kwargs: Any) -> FieldSpec: + payload: dict[str, Any] = {"type": "enum", "symbols": list(symbols)} + _merge_if_set(payload, "name", name) + return cls(payload, **kwargs) + + @classmethod + def record( + cls, + fields: Mapping[str, FieldSpec] | list[Any], + *, + name: str | None = None, + **kwargs: Any, + ) -> FieldSpec: + return cls(_record_type_payload(fields, name=name), **kwargs) + + def named(self, name: str) -> FieldSpec: + return _replace_frozen(self, name=name) + + def label(self, text: str) -> FieldSpec: + return _replace_frozen(self, label_text=text) + + def doc(self, text: str | list[str]) -> FieldSpec: + return _replace_frozen(self, doc_text=text) + + def default(self, value: Any) -> FieldSpec: + return _replace_frozen(self, default_value=value) + + def to_dict(self) -> dict[str, Any]: + if self.name is None: + raise ValueError("Record fields 
must have a name before serialization") + payload = {"name": self.name, "type": _canonicalize_type(self.type_)} + _merge_if_set(payload, "label", self.label_text) + _merge_if_set(payload, "doc", _render_doc(self.doc_text)) + _merge_if_present(payload, "default", self.default_value) + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True, init=False) +class InputSpec: + """A CWL CommandLineTool input.""" + + type_: Any + position: int | float | None = None + flag: str | None = None + required: bool = True + separate: bool | None = None + item_separator: str | None = None + binding_value_from: Any = None + shell_quote: bool | None = None + label_text: str | None = None + doc_text: str | list[str] | None = None + format_value: Any = None + secondary_files_value: Any = None + streamable_value: bool | None = None + load_contents_value: bool | None = None + load_listing_value: str | None = None + default_value: Any = _SUPPORT.unset + binding_extra: dict[str, Any] = field(default_factory=dict) + extra: dict[str, Any] = field(default_factory=dict) + name: str | None = None + + def __init__( + self, + type_: Any, + *, + position: int | float | None = None, + flag: str | None = None, + required: bool = True, + separate: bool | None = None, + item_separator: str | None = None, + value_from: Any = None, + shell_quote: bool | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + format: Any = None, + secondary_files: Any = None, + streamable: bool | None = None, + load_contents: bool | None = None, + load_listing: str | None = None, + default: Any = _SUPPORT.unset, + binding_extra: dict[str, Any] | None = None, + extra: dict[str, Any] | None = None, + name: str | None = None, + ) -> None: + object.__setattr__(self, "type_", type_) + object.__setattr__(self, "position", position) + object.__setattr__(self, "flag", flag) + object.__setattr__(self, "required", required) + object.__setattr__(self, "separate", 
separate) + object.__setattr__(self, "item_separator", item_separator) + object.__setattr__(self, "binding_value_from", value_from) + object.__setattr__(self, "shell_quote", shell_quote) + object.__setattr__(self, "label_text", label) + object.__setattr__(self, "doc_text", doc) + object.__setattr__(self, "format_value", format) + object.__setattr__(self, "secondary_files_value", secondary_files) + object.__setattr__(self, "streamable_value", streamable) + object.__setattr__(self, "load_contents_value", load_contents) + object.__setattr__(self, "load_listing_value", load_listing) + object.__setattr__(self, "default_value", default) + object.__setattr__(self, "binding_extra", dict(binding_extra or {})) + object.__setattr__(self, "extra", dict(extra or {})) + object.__setattr__(self, "name", name) + + @classmethod + def array(cls, items: Any, **kwargs: Any) -> InputSpec: + return cls({"type": "array", "items": _canonicalize_type(items)}, **kwargs) + + @classmethod + def enum(cls, *symbols: str, name: str | None = None, **kwargs: Any) -> InputSpec: + payload: dict[str, Any] = {"type": "enum", "symbols": list(symbols)} + _merge_if_set(payload, "name", name) + return cls(payload, **kwargs) + + @classmethod + def record( + cls, + fields: Mapping[str, FieldSpec] | list[Any], + *, + name: str | None = None, + **kwargs: Any, + ) -> InputSpec: + return cls(_record_type_payload(fields, name=name), **kwargs) + + def named(self, name: str) -> InputSpec: + return _replace_frozen(self, name=name) + + def label(self, text: str) -> InputSpec: + return _replace_frozen(self, label_text=text) + + def doc(self, text: str | list[str]) -> InputSpec: + return _replace_frozen(self, doc_text=text) + + def default(self, value: Any) -> InputSpec: + return _replace_frozen(self, default_value=value) + + def format(self, value: Any) -> InputSpec: + return _replace_frozen(self, format_value=value) + + def secondary_files(self, *values: Any) -> InputSpec: + return _replace_frozen(self, 
secondary_files_value=list(values)) + + def streamable(self, value: bool) -> InputSpec: + return _replace_frozen(self, streamable_value=value) + + def load_contents(self, value: bool) -> InputSpec: + return _replace_frozen(self, load_contents_value=value) + + def load_listing(self, value: str) -> InputSpec: + return _replace_frozen(self, load_listing_value=value) + + def value_from(self, expression: Any) -> InputSpec: + return _replace_frozen(self, binding_value_from=expression) + + def to_dict(self) -> dict[str, Any]: + payload = {"type": _apply_required(self.type_, self.required)} + binding = _optional_binding( + CommandLineBinding( + position=self.position, + prefix=self.flag, + separate=self.separate, + item_separator=self.item_separator, + value_from=self.binding_value_from, + shell_quote=self.shell_quote, + extra=dict(self.binding_extra), + ) + ) + if binding is not None: + payload["inputBinding"] = binding.to_dict() + _merge_if_set(payload, "label", self.label_text) + _merge_if_set(payload, "doc", _render_doc(self.doc_text)) + _merge_if_set(payload, "format", self.format_value) + _merge_if_set(payload, "secondaryFiles", self.secondary_files_value) + _merge_if_set(payload, "streamable", self.streamable_value) + _merge_if_set(payload, "loadContents", self.load_contents_value) + _merge_if_set(payload, "loadListing", self.load_listing_value) + _merge_if_present(payload, "default", self.default_value) + payload.update(_render(self.extra)) + return payload + + +@dataclass(frozen=True, slots=True, init=False) +class OutputSpec: + """A CWL CommandLineTool output.""" + + type_: Any + required: bool = True + glob: Any = None + load_contents_value: bool | None = None + output_eval: str | None = None + label_text: str | None = None + doc_text: str | list[str] | None = None + format_value: Any = None + secondary_files_value: Any = None + streamable_value: bool | None = None + load_listing_value: str | None = None + binding_extra: dict[str, Any] = 
field(default_factory=dict) + extra: dict[str, Any] = field(default_factory=dict) + name: str | None = None + + def __init__( + self, + type_: Any, + *, + glob: Any = None, + from_input: Any = None, + required: bool = True, + load_contents: bool | None = None, + output_eval: str | None = None, + label: str | None = None, + doc: str | list[str] | None = None, + format: Any = None, + secondary_files: Any = None, + streamable: bool | None = None, + load_listing: str | None = None, + binding_extra: dict[str, Any] | None = None, + extra: dict[str, Any] | None = None, + name: str | None = None, + ) -> None: + if glob is not None and from_input is not None: + raise ValueError("Specify either glob= or from_input=, not both") + glob_value = ( + _basename_expression(_named_parameter(from_input, kind="input")) + if from_input is not None + else glob + ) + object.__setattr__(self, "type_", type_) + object.__setattr__(self, "required", required) + object.__setattr__(self, "glob", glob_value) + object.__setattr__(self, "load_contents_value", load_contents) + object.__setattr__(self, "output_eval", output_eval) + object.__setattr__(self, "label_text", label) + object.__setattr__(self, "doc_text", doc) + object.__setattr__(self, "format_value", format) + object.__setattr__(self, "secondary_files_value", secondary_files) + object.__setattr__(self, "streamable_value", streamable) + object.__setattr__(self, "load_listing_value", load_listing) + object.__setattr__(self, "binding_extra", dict(binding_extra or {})) + object.__setattr__(self, "extra", dict(extra or {})) + object.__setattr__(self, "name", name) + + @classmethod + def array(cls, items: Any, **kwargs: Any) -> OutputSpec: + return cls({"type": "array", "items": _canonicalize_type(items)}, **kwargs) + + @classmethod + def enum(cls, *symbols: str, name: str | None = None, **kwargs: Any) -> OutputSpec: + payload: dict[str, Any] = {"type": "enum", "symbols": list(symbols)} + _merge_if_set(payload, "name", name) + return 
cls(payload, **kwargs) + + @classmethod + def record( + cls, + fields: Mapping[str, FieldSpec] | list[Any], + *, + name: str | None = None, + **kwargs: Any, + ) -> OutputSpec: + return cls(_record_type_payload(fields, name=name), **kwargs) + + @classmethod + def stdout(cls, **kwargs: Any) -> OutputSpec: + return cls("stdout", **kwargs) + + @classmethod + def stderr(cls, **kwargs: Any) -> OutputSpec: + return cls("stderr", **kwargs) + + def named(self, name: str) -> OutputSpec: + return _replace_frozen(self, name=name) + + def label(self, text: str) -> OutputSpec: + return _replace_frozen(self, label_text=text) + + def doc(self, text: str | list[str]) -> OutputSpec: + return _replace_frozen(self, doc_text=text) + + def format(self, value: Any) -> OutputSpec: + return _replace_frozen(self, format_value=value) + + def secondary_files(self, *values: Any) -> OutputSpec: + return _replace_frozen(self, secondary_files_value=list(values)) + + def streamable(self, value: bool) -> OutputSpec: + return _replace_frozen(self, streamable_value=value) + + def load_listing(self, value: str) -> OutputSpec: + return _replace_frozen(self, load_listing_value=value) + + def load_contents(self, value: bool) -> OutputSpec: + return _replace_frozen(self, load_contents_value=value) + + def to_dict(self) -> dict[str, Any]: + payload = {"type": _apply_required(self.type_, self.required)} + binding = _optional_binding( + CommandOutputBinding( + glob=self.glob, + load_contents=self.load_contents_value, + output_eval=self.output_eval, + extra=dict(self.binding_extra), + ) + ) + if binding is not None: + payload["outputBinding"] = binding.to_dict() + _merge_if_set(payload, "label", self.label_text) + _merge_if_set(payload, "doc", _render_doc(self.doc_text)) + _merge_if_set(payload, "format", self.format_value) + _merge_if_set(payload, "secondaryFiles", self.secondary_files_value) + _merge_if_set(payload, "streamable", self.streamable_value) + _merge_if_set(payload, "loadListing", 
self.load_listing_value) + payload.update(_render(self.extra)) + return payload diff --git a/src/sophios/apis/python/_cwl_builder_step_bridge.py b/src/sophios/apis/python/_cwl_builder_step_bridge.py new file mode 100644 index 00000000..ead383bf --- /dev/null +++ b/src/sophios/apis/python/_cwl_builder_step_bridge.py @@ -0,0 +1,58 @@ +"""Small bridge between the CLT builder and the workflow Python API. + +This module is intentionally the only place that imports both surfaces. +Keeping the bridge narrow lets the builder and workflow DSL evolve mostly +independently while still supporting an in-memory handoff. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any, Protocol + +from sophios.wic_types import Tools + +if TYPE_CHECKING: + from .api import Step + + +class _CommandLineToolLike(Protocol): # pylint: disable=too-few-public-methods + """Minimal protocol needed to turn a built CLT into a workflow `Step`.""" + + name: str + + def to_dict(self) -> dict[str, Any]: + """Render the CLT to a plain CWL document.""" + + +def step_from_command_line_tool( + tool: _CommandLineToolLike, + *, + step_name: str | None = None, + run_path: str | Path | None = None, + config: dict[str, Any] | None = None, + tool_registry: Tools | None = None, +) -> Step: + """Convert a built CLT into a workflow `Step` without touching disk. + + Args: + tool (_CommandLineToolLike): Built CLT-like object with `name` and `to_dict()`. + step_name (str | None): Optional workflow step name override. + run_path (str | Path | None): Optional virtual `.cwl` path for compiler bookkeeping. + config (dict[str, Any] | None): Optional input values to pre-bind on the step. + tool_registry (Tools | None): Optional tool registry retained on the step. + + Returns: + Step: An in-memory workflow step backed by the built CLT. 
+ """ + from .api import Step # pylint: disable=C0415:import-outside-toplevel + + resolved_name = step_name or tool.name + resolved_run_path = run_path or Path(f"{resolved_name}.cwl") + return Step.from_cwl( + tool.to_dict(), + process_name=resolved_name, + run_path=resolved_run_path, + config=config, + tool_registry=tool_registry, + ) diff --git a/src/sophios/apis/python/_cwl_builder_support.py b/src/sophios/apis/python/_cwl_builder_support.py new file mode 100644 index 00000000..cd45ed80 --- /dev/null +++ b/src/sophios/apis/python/_cwl_builder_support.py @@ -0,0 +1,346 @@ +"""Private support code for the public CWL builder façade. + +The public module deliberately keeps the visible API small. This helper module +holds the repetitive rendering, validation, and sanitization logic so the +main `cwl_builder.py` file can stay focused on the user-facing surface. +""" + +from argparse import Namespace +from dataclasses import dataclass +from pathlib import Path +import re +import tempfile +from types import MappingProxyType +import warnings +from typing import Any + +import yaml + + +@dataclass(frozen=True, slots=True) +class _BuilderRules: + """Immutable support namespace for CWL builder internals.""" + + unset: object + expression_markers: tuple[str, ...] 
+ known_namespaces: MappingProxyType[str, str] + known_schemas: MappingProxyType[str, str] + dangerous_raw_keys: frozenset[str] + raw_class_name_pattern: str + reserved_document_keys: frozenset[str] + + +_SUPPORT = _BuilderRules( + unset=object(), + expression_markers=("$(", "${"), + known_namespaces=MappingProxyType( + { + "cwltool": "http://commonwl.org/cwltool#", + "edam": "https://edamontology.org/", + } + ), + known_schemas=MappingProxyType( + { + "edam": "https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl", + } + ), + dangerous_raw_keys=frozenset({"$graph", "$import", "$include", "$mixin"}), + raw_class_name_pattern=r"^[A-Za-z0-9_.-]+(?::[A-Za-z0-9_.-]+)?$", + reserved_document_keys=frozenset( + { + "$namespaces", + "$schemas", + "arguments", + "baseCommand", + "class", + "cwlVersion", + "doc", + "hints", + "id", + "inputs", + "intent", + "label", + "outputs", + "permanentFailCodes", + "requirements", + "stderr", + "stdin", + "stdout", + "successCodes", + "temporaryFailCodes", + } + ), +) + + +def _render(value: Any) -> Any: + """Render helper objects to plain CWL/YAML-compatible values.""" + match value: + case Path() as path: + return str(path) + case list() as items: + return [_render(item) for item in items] + case tuple() as items: + return [_render(item) for item in items] + case dict() as mapping: + return {key: _render(item) for key, item in mapping.items()} + case _ if hasattr(value, "to_dict") and callable(value.to_dict): + return _render(value.to_dict()) + case _: + return value + + +def _merge_if_set(target: dict[str, Any], key: str, value: Any) -> None: + if value is not None: + target[key] = _render(value) + + +def _merge_if_present(target: dict[str, Any], key: str, value: Any) -> None: + if value is not _SUPPORT.unset: + target[key] = _render(value) + + +def _render_doc(value: str | list[str] | None) -> str | list[str] | None: + match value: + case None: + return None + case str() as text: + return text + case 
list() as items: + return [str(item) for item in items] + + +def _record_type_payload( + fields: Any, + *, + name: str | None = None, +) -> dict[str, Any]: + """Build a CWL record schema payload from named or positional field specs.""" + field_defs = ( + [spec.named(field_name).to_dict() for field_name, spec in fields.items()] + if isinstance(fields, dict) + else [_render(field_spec) for field_spec in fields] + ) + payload: dict[str, Any] = {"type": "record", "fields": field_defs} + _merge_if_set(payload, "name", name) + return payload + + +def _canonicalize_type(type_: Any) -> Any: + rendered = _render(type_) + match rendered: + case str() as text if text.endswith("?"): + return ["null", _canonicalize_type(text[:-1])] + case str() as text if text.endswith("[]"): + return {"type": "array", "items": _canonicalize_type(text[:-2])} + case dict() as mapping if mapping.get("type") == "array" and "items" in mapping: + return {**mapping, "items": _canonicalize_type(mapping["items"])} + case _: + return rendered + + +def _apply_required(type_: Any, required: bool) -> Any: + if required: + return _canonicalize_type(type_) + canonical = _canonicalize_type(type_) + if isinstance(canonical, list) and "null" in canonical: + return canonical + return ["null", canonical] + + +def _contains_expression(value: Any) -> bool: + match value: + case str() as text: + return any(marker in text for marker in _SUPPORT.expression_markers) + case list() as items: + return any(_contains_expression(item) for item in items) + case tuple() as items: + return any(_contains_expression(item) for item in items) + case dict() as mapping: + return any(_contains_expression(item) for item in mapping.values()) + case _ if hasattr(value, "to_dict") and callable(value.to_dict): + return _contains_expression(value.to_dict()) + case _: + return False + + +def _input_expression(name: str) -> str: + return f"$(inputs.{name})" + + +def _basename_expression(name: str) -> str: + return f"$(inputs.{name}.basename)" 
+ + +def _warn_raw_escape_hatch(context: str) -> None: + warnings.warn( + ( + f"{context} is using raw CWL injection. Structured helpers are safer; " + "this path is sanitized for common misuse but still bypasses most type-level guidance." + ), + UserWarning, + stacklevel=3, + ) + + +def _sanitize_raw_mapping( + mapping: dict[str, Any], + *, + context: str, + allow_class_key: bool = False, + reserved_keys: set[str] | None = None, +) -> dict[str, Any]: + rendered = _render(mapping) + if not isinstance(rendered, dict): + raise TypeError(f"{context} must be a mapping") + if any(not isinstance(key, str) or not key for key in rendered): + raise TypeError(f"{context} keys must be non-empty strings") + blocked = sorted(key for key in rendered if key in _SUPPORT.dangerous_raw_keys) + if blocked: + raise ValueError( + f"{context} does not accept SALAD document-assembly keys: {', '.join(blocked)}" + ) + if reserved_keys is not None: + collisions = sorted(key for key in rendered if key in reserved_keys) + if collisions: + raise ValueError( + f"{context} cannot override builder-managed keys: {', '.join(collisions)}" + ) + if not allow_class_key and "class" in rendered: + raise ValueError(f"{context} cannot set 'class' directly") + if any(key.startswith("$") for key in rendered): + raise ValueError(f"{context} does not accept raw '$'-prefixed document keys") + return rendered + + +def _named_parameter(reference: Any, *, kind: str) -> str: + match reference: + case str() as name: + return name + case _ if isinstance(getattr(reference, "name", None), str): + return str(reference.name) + case _: + raise TypeError(f"{kind} reference must be a named Input/Output or a string") + + +def _optional_binding(binding: Any) -> Any: + rendered = binding.to_dict() + return binding if rendered else None + + +@dataclass(frozen=True, slots=True) +class ValidationResult: + """Result of validating a generated CLT with cwltool/schema-salad.""" + + path: Path + uri: str + process: Any + + +class 
CWLBuilderValidationError(ValueError): + """Raised when a generated CLT fails schema validation.""" + + +def _import_cwltool_load_tool() -> Any: + try: + from cwltool import load_tool # pylint: disable=import-outside-toplevel + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( + "cwltool/schema_salad is required to validate generated CommandLineTools" + ) from exc + return load_tool + + +def _import_cwltool_validation_support() -> tuple[Any, Any, Any]: + try: + from cwltool.context import RuntimeContext # pylint: disable=import-outside-toplevel + from cwltool.main import ( # pylint: disable=import-outside-toplevel + get_default_args, + setup_loadingContext, + ) + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( + "cwltool/schema_salad is required to validate generated CommandLineTools" + ) from exc + return RuntimeContext, get_default_args, setup_loadingContext + + +def _build_validation_loading_context(path: Path, *, skip_schemas: bool = False) -> Any: + runtime_context_cls, get_default_args, setup_loading_context = _import_cwltool_validation_support() + args_dict = get_default_args() + args_dict.update( + { + "skip_schemas": skip_schemas, + "validate": True, + "workflow": str(path), + } + ) + args = Namespace(**args_dict) + runtime_context = runtime_context_cls(args_dict) + return setup_loading_context(None, runtime_context, args) + + +def validate_cwl_document( + document: dict[str, Any], + *, + filename: str = "tool.cwl", + skip_schemas: bool = False, +) -> ValidationResult: + """Validate a generated CLT document through cwltool/schema-salad.""" + with tempfile.TemporaryDirectory(prefix="sophios-cwl-builder-") as tmpdir: + temp_path = Path(tmpdir) / filename + temp_path.write_text( + yaml.safe_dump(_render(document), sort_keys=False, line_break="\n"), + encoding="utf-8", + ) + return _validate_path(temp_path, skip_schemas=skip_schemas) + + +def _validate_path(path: Path, *, skip_schemas: bool = False) -> ValidationResult: + 
load_tool = _import_cwltool_load_tool()
+    try:
+        loading_context = _build_validation_loading_context(path, skip_schemas=skip_schemas)
+        loading_context, workflowobj, uri = load_tool.fetch_document(str(path), loading_context)
+        loading_context, uri = load_tool.resolve_and_validate_document(
+            loading_context,
+            workflowobj,
+            uri,
+            preprocess_only=False,
+        )
+        process = load_tool.make_tool(uri, loading_context)
+    except Exception as exc:  # pylint: disable=broad-exception-caught
+        raise CWLBuilderValidationError(f"Generated CommandLineTool failed validation: {path}") from exc
+    return ValidationResult(path=path, uri=uri, process=process)
+
+
+def _is_requirement_spec(value: Any) -> bool:
+    return hasattr(value, "class_name") and callable(getattr(value, "to_fields", None))
+
+
+def _normalize_requirement(
+    requirement: Any,
+    value: dict[str, Any] | None = None,
+) -> tuple[str, dict[str, Any]]:
+    match requirement:
+        case str() as class_name:
+            if re.fullmatch(_SUPPORT.raw_class_name_pattern, class_name) is None:
+                raise ValueError(f"invalid requirement class name {class_name!r}")
+            payload = {} if value is None else _sanitize_raw_mapping(value, context=f"payload for {class_name}")
+            return class_name, payload
+        case _ if _is_requirement_spec(requirement):
+            return str(requirement.class_name), requirement.to_fields()
+        case dict() as payload:
+            _warn_raw_escape_hatch("requirement()/hint()")
+            payload_copy = _sanitize_raw_mapping(
+                payload,
+                context="raw requirement mapping",
+                allow_class_key=True,
+            )
+            if "class" not in payload_copy:
+                raise ValueError("raw requirement dicts must include a 'class' key")
+            class_name = str(payload_copy.pop("class"))
+            if re.fullmatch(_SUPPORT.raw_class_name_pattern, class_name) is None:
+                raise ValueError(f"invalid requirement class name {class_name!r}")
+            return class_name, payload_copy
+        case _:
+            raise TypeError("requirement must be a class name, requirement spec, or raw dict")
diff --git
a/src/sophios/apis/python/_ports.py b/src/sophios/apis/python/_ports.py index 57c66469..f7b70962 100644 --- a/src/sophios/apis/python/_ports.py +++ b/src/sophios/apis/python/_ports.py @@ -1,59 +1,153 @@ -"""Port and collection models for the Python API.""" +"""Parameter and namespace helpers for the Python workflow API.""" from __future__ import annotations -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Iterator, Optional, Union +from dataclasses import dataclass, field +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Callable, Generic, Iterator, TypeVar -from ._utils import normalize_port_name, normalize_port_type, serialize_value +from ._utils import (infer_literal_parameter_type, + is_array_type, + normalize_parameter_name, + normalize_parameter_type, + serialize_value) if TYPE_CHECKING: - from .api import Step, Workflow + from .api import Workflow -@dataclass(frozen=True) +ParameterT = TypeVar("ParameterT") +ViewT = TypeVar("ViewT") + + +@dataclass(frozen=True, slots=True) class InlineBinding: + """Inline literal bound to an input parameter.""" + value: Any -@dataclass(frozen=True) +@dataclass(frozen=True, slots=True) class AliasBinding: + """Reference to an upstream step output anchor.""" + alias: Any -@dataclass(frozen=True) +@dataclass(frozen=True, slots=True) class WorkflowBinding: + """Reference to a formal workflow input.""" + name: str -InputBinding = Union[InlineBinding, AliasBinding, WorkflowBinding] +InputBinding = InlineBinding | AliasBinding | WorkflowBinding -class ProcessInput: - """Input of a CWL CommandLineTool or Workflow.""" +@dataclass(frozen=True, slots=True) +class OutputSourceBinding: + """Source exposed as a formal workflow output.""" - inp_type: Any - name: str - parent_obj: Any - required: bool - linked: bool - _binding: Optional[InputBinding] - - def __init__(self, name: str, inp_type: Any, parent_obj: Any = None) -> None: - normalized_type, required = 
normalize_port_type(inp_type) - self.inp_type = normalized_type - self.name = normalize_port_name(name) - self.parent_obj = parent_obj - self.required = required - self.linked = False - self._binding = None + step_id: str | None + source_name: str + + def to_output_source(self, step_id_overrides: Mapping[str, str] | None = None) -> str: + """Render the CWL `outputSource` string for a workflow output. + + Args: + step_id_overrides (Mapping[str, str] | None): Optional mapping from + user-facing step names to compiler-assigned concrete step ids. + + Returns: + str: The serialized CWL `outputSource` value. + """ + if self.step_id is None: + return self.source_name + resolved_step_id = step_id_overrides.get(self.step_id, self.step_id) if step_id_overrides else self.step_id + return f"{resolved_step_id}/{self.source_name}" + + +@dataclass(slots=True) +class ParameterStore(Generic[ParameterT]): + """Ordered name -> parameter mapping. + + Python dicts preserve insertion order, so one mapping is enough to support + both explicit lookup and list-like indexing for the `.inputs[...]` style. 
+ """ + + parameters: dict[str, ParameterT] = field(default_factory=dict) + + def add(self, parameter: ParameterT, *, name: str | None = None) -> ParameterT: + self.parameters[name or getattr(parameter, "name")] = parameter + return parameter + + def get(self, name: str) -> ParameterT: + return self.parameters[name] + + def ensure(self, name: str, factory: Callable[[str], ParameterT]) -> ParameterT: + if name not in self.parameters: + self.parameters[name] = factory(name) + return self.parameters[name] + + def __contains__(self, name: object) -> bool: + return name in self.parameters + + def __iter__(self) -> Iterator[ParameterT]: + return iter(self.parameters.values()) + + def __len__(self) -> int: + return len(self.parameters) + + def __getitem__(self, index: int) -> ParameterT: + return tuple(self.parameters.values())[index] def __repr__(self) -> str: - return f"ProcessInput(name={self.name!r}, inp_type={self.inp_type!r})" + return repr(tuple(self.parameters.values())) + + +@dataclass(slots=True) +class _ParameterBase: + """Shared state for named workflow/tool interface parameters.""" + + name: str + parameter_type: Any + parent_obj: Any = None + required: bool = field(init=False) + linked: bool = field(default=False, init=False) + + def __post_init__(self) -> None: + self.set_parameter_type(self.parameter_type) + self.name = normalize_parameter_name(self.name) + + def set_parameter_type(self, value: Any) -> None: + """Normalize and assign a parameter type expression.""" + self.parameter_type, self.required = normalize_parameter_type(value) + + def cwl_type(self) -> Any: + """Return the CWL type expression including optionality.""" + if self.parameter_type is None: + return None + if self.required: + return serialize_value(self.parameter_type) + match self.parameter_type: + case list() as options if "null" in options: + return serialize_value(options) + case list() as options: + return ["null", *serialize_value(options)] + case _: + return ["null", 
serialize_value(self.parameter_type)] + + +@dataclass(slots=True) +class InputParameter(_ParameterBase): + """Input parameter of a CWL `CommandLineTool` or `Workflow`.""" + + _binding: InputBinding | None = field(default=None, init=False, repr=False) + _bound_parameter_type: Any = field(default=None, init=False, repr=False) @property def value(self) -> Any: - """Compatibility view of the current binding.""" + """Return the bound value in the legacy compatibility shape.""" match self._binding: case None: return None @@ -63,31 +157,40 @@ def value(self) -> Any: return {"wic_alias": serialize_value(alias)} case WorkflowBinding(name=name): return name - return None def _set_value(self, value: Any, linked: bool = False) -> None: - """Compatibility helper used by older internal code paths.""" + """Translate legacy serialized values into the internal binding model.""" match value: case {"wic_alias": alias} if linked: - self._binding = AliasBinding(alias) - self.linked = True - case {"wic_inline_input": inline_value} if not linked: - self._binding = InlineBinding(inline_value) - self.linked = False + self._set_binding(AliasBinding(alias)) + case {"wic_inline_input": inline_value}: + self._set_binding(InlineBinding(inline_value)) + self.set_bound_parameter_type(infer_literal_parameter_type(inline_value)) case str() as workflow_name if linked: - self._binding = WorkflowBinding(workflow_name) - self.linked = True + self._set_binding(WorkflowBinding(workflow_name)) case _: - self._binding = InlineBinding(value) + self._set_binding(InlineBinding(value)) + self.set_bound_parameter_type(infer_literal_parameter_type(value)) self.linked = linked - def _set_binding(self, binding: Optional[InputBinding]) -> None: + def _set_binding(self, binding: InputBinding | None) -> None: self._binding = binding - match binding: - case AliasBinding() | WorkflowBinding(): - self.linked = True + self.linked = isinstance(binding, (AliasBinding, WorkflowBinding)) + + def 
set_bound_parameter_type(self, value: Any) -> None: + """Record the type of the bound value when it is known.""" + normalized, _required = normalize_parameter_type(value) + self._bound_parameter_type = normalized + + def is_scatterable(self) -> bool: + """Return whether the current binding can be scattered safely.""" + match self._binding: + case InlineBinding(value=list() | tuple()): + return True + case None: + return False case _: - self.linked = False + return is_array_type(self._bound_parameter_type) def is_bound(self) -> bool: return self._binding is not None @@ -102,36 +205,18 @@ def to_yaml_value(self) -> Any: return {"wic_alias": serialize_value(alias)} case WorkflowBinding(name=name): return name - return None -class ProcessOutput: - """Output of a CWL CommandLineTool or Workflow.""" +@dataclass(slots=True) +class OutputParameter(_ParameterBase): + """Output parameter of a CWL `CommandLineTool` or `Workflow`.""" - out_type: Any - name: str - parent_obj: Any - required: bool - linked: bool - _anchor_name: Optional[str] - - def __init__(self, name: str, out_type: Any, parent_obj: Any = None) -> None: - normalized_type, required = normalize_port_type(out_type) - self.out_type = normalized_type - self.name = normalize_port_name(name) - self.parent_obj = parent_obj - self.required = required - self.linked = False - self._anchor_name = None - - def __repr__(self) -> str: - return f"ProcessOutput(name={self.name!r}, out_type={self.out_type!r})" + _anchor_name: str | None = field(default=None, init=False, repr=False) + _source: OutputSourceBinding | None = field(default=None, init=False, repr=False) @property def value(self) -> Any: - if self._anchor_name is None: - return None - return {"wic_anchor": self._anchor_name} + return None if self._anchor_name is None else {"wic_anchor": self._anchor_name} def ensure_anchor(self, suggested_name: str) -> str: if self._anchor_name is None: @@ -139,140 +224,105 @@ def ensure_anchor(self, suggested_name: str) -> str: 
self.linked = True return self._anchor_name + def bind_source(self, source: OutputSourceBinding) -> None: + self._source = source + self.linked = True + + def has_source(self) -> bool: + return self._source is not None + + def to_workflow_output( + self, + *, + step_id_overrides: Mapping[str, str] | None = None, + ) -> dict[str, Any]: + """Serialize this workflow output parameter to CWL. + + Args: + step_id_overrides (Mapping[str, str] | None): Optional mapping from + user-facing step names to compiler-assigned concrete step ids. + + Raises: + ValueError: If the output has no source or no resolved type. + + Returns: + dict[str, Any]: Serialized CWL workflow output definition. + """ + if self._source is None: + raise ValueError(f"workflow output {self.name!r} has no source binding") + cwl_type = self.cwl_type() + if cwl_type is None: + raise ValueError(f"workflow output {self.name!r} has no resolved type") + return { + "type": cwl_type, + "outputSource": self._source.to_output_source(step_id_overrides), + } + def _set_value(self, value: Any, linked: bool = False) -> None: match value: case {"wic_anchor": anchor_name}: self._anchor_name = str(anchor_name) case str() as anchor_name: - self._anchor_name = anchor_name + self._anchor_name = str(anchor_name) case None: self._anchor_name = None self.linked = linked or self._anchor_name is not None +@dataclass(frozen=True, slots=True) class WorkflowInputReference: - """A symbolic reference to a workflow input variable.""" + """Symbolic reference to a workflow input variable.""" workflow: Workflow name: str - - def __init__(self, workflow: Workflow, name: str) -> None: - self.workflow = workflow - self.name = name - - def __repr__(self) -> str: - return f"WorkflowInputReference(workflow={self.workflow.process_name!r}, name={self.name!r})" - - -class StepInputs: - """List-like view of a Step's inputs with explicit named access.""" - - _step: Step - - def __init__(self, step: Step) -> None: - object.__setattr__(self, "_step", 
step) - - def __iter__(self) -> Iterator[ProcessInput]: - return iter(self._step._inputs) - - def __len__(self) -> int: - return len(self._step._inputs) - - def __getitem__(self, index: int) -> ProcessInput: - return self._step._inputs[index] - - def __getattr__(self, name: str) -> ProcessInput: - return self._step.get_inp_attr(name) - - def __setattr__(self, name: str, value: Any) -> None: - if name == "_step": - object.__setattr__(self, name, value) - return - self._step.bind_input(name, value) - - def __repr__(self) -> str: - return repr(self._step._inputs) - - -class StepOutputs: - """List-like view of a Step's outputs with explicit named access.""" - - _step: Step - - def __init__(self, step: Step) -> None: - object.__setattr__(self, "_step", step) - - def __iter__(self) -> Iterator[ProcessOutput]: - return iter(self._step._outputs) - - def __len__(self) -> int: - return len(self._step._outputs) - - def __getitem__(self, index: int) -> ProcessOutput: - return self._step._outputs[index] - - def __getattr__(self, name: str) -> ProcessOutput: - return self._step.get_output(name) - - def __setattr__(self, name: str, value: Any) -> None: - raise AttributeError(f"Step outputs are read-only; cannot set {name!r}") - - def __repr__(self) -> str: - return repr(self._step._outputs) - - -class WorkflowInputs: - """List-like view of a Workflow's inputs with explicit named access.""" - - _workflow: Workflow - - def __init__(self, workflow: Workflow) -> None: - object.__setattr__(self, "_workflow", workflow) - - def __iter__(self) -> Iterator[ProcessInput]: - return iter(self._workflow._inputs) + implicit: bool = False + + +class ParameterNamespace(Generic[ParameterT, ViewT]): + """List-like attribute namespace for input and output parameters. + + The "magic" lives here: `step.inputs.foo`, `workflow.inputs.foo`, and + `step.outputs.bar` all route through the same tiny proxy instead of four + near-duplicate wrapper classes. 
+ """ + + _store: ParameterStore[ParameterT] + _getter: Callable[[str], ViewT] + _setter: Callable[[str, Any], None] | None + _read_only_error: str + + def __init__( + self, + store: ParameterStore[ParameterT], + getter: Callable[[str], ViewT], + setter: Callable[[str, Any], None] | None, + *, + read_only_error: str, + ) -> None: + object.__setattr__(self, "_store", store) + object.__setattr__(self, "_getter", getter) + object.__setattr__(self, "_setter", setter) + object.__setattr__(self, "_read_only_error", read_only_error) + + def __iter__(self) -> Iterator[ParameterT]: + return iter(self._store) def __len__(self) -> int: - return len(self._workflow._inputs) + return len(self._store) - def __getitem__(self, index: int) -> ProcessInput: - return self._workflow._inputs[index] + def __getitem__(self, index: int) -> ParameterT: + return self._store[index] - def __getattr__(self, name: str) -> WorkflowInputReference: - return self._workflow._ensure_input_reference(name) + def __getattr__(self, name: str) -> ViewT: + return self._getter(name) def __setattr__(self, name: str, value: Any) -> None: - if name == "_workflow": + if name.startswith("_"): object.__setattr__(self, name, value) return - self._workflow.bind_input(name, value) - - def __repr__(self) -> str: - return repr(self._workflow._inputs) - - -class WorkflowOutputs: - """List-like view of a Workflow's declared outputs.""" - - _workflow: Workflow - - def __init__(self, workflow: Workflow) -> None: - object.__setattr__(self, "_workflow", workflow) - - def __iter__(self) -> Iterator[ProcessOutput]: - return iter(self._workflow._outputs) - - def __len__(self) -> int: - return len(self._workflow._outputs) - - def __getitem__(self, index: int) -> ProcessOutput: - return self._workflow._outputs[index] - - def __getattr__(self, name: str) -> ProcessOutput: - return self._workflow.add_output(name) - - def __setattr__(self, name: str, value: Any) -> None: - raise AttributeError(f"Workflow outputs are read-only; 
cannot set {name!r}") + if self._setter is None: + raise AttributeError(self._read_only_error.format(name=name)) + self._setter(name, value) def __repr__(self) -> str: - return repr(self._workflow._outputs) + return repr(self._store) diff --git a/src/sophios/apis/python/_types.py b/src/sophios/apis/python/_types.py index 6d5f6f34..38f9c702 100644 --- a/src/sophios/apis/python/_types.py +++ b/src/sophios/apis/python/_types.py @@ -1,8 +1,16 @@ -"""CWL Types.""" +"""CWL type definitions used by the Python APIs.""" + from enum import Enum -class CWLTypesEnum(str, Enum): +class CWLAtomicType(str, Enum): + """Atomic CWL type names. + + These are the string-valued leaf types that may appear directly in CWL + input/output declarations. Structured types such as arrays, enums, and + records are represented separately as schema objects. + """ + NULL = "null" BOOLEAN = "boolean" INT = "int" @@ -12,9 +20,10 @@ class CWLTypesEnum(str, Enum): STRING = "string" FILE = "File" DIRECTORY = "Directory" + ANY = "Any" -class ScatterMethod(Enum): +class ScatterMethod(str, Enum): dotproduct = "dotproduct" flat_crossproduct = "flat_crossproduct" nested_crossproduct = "nested_crossproduct" diff --git a/src/sophios/apis/python/_utils.py b/src/sophios/apis/python/_utils.py index 7f23c897..4e488f4a 100644 --- a/src/sophios/apis/python/_utils.py +++ b/src/sophios/apis/python/_utils.py @@ -1,7 +1,5 @@ """Internal helpers for the Python API.""" -from __future__ import annotations - from pathlib import Path from typing import Any @@ -10,30 +8,41 @@ from sophios import utils_cwl from ._errors import InvalidInputValueError +from ._types import CWLAtomicType -def default_dict() -> dict[str, Any]: - return {} - - -def normalize_port_name(cwl_id: str) -> str: - """Return the local port name from a CWL id.""" +def normalize_parameter_name(cwl_id: str) -> str: + """Return the local parameter name from a CWL id.""" return cwl_id.split("#")[-1] -def normalize_port_type(port_type: Any) -> tuple[Any, bool]: 
- """Return the canonicalized port type and whether it is required.""" - canonical = utils_cwl.canonicalize_type(port_type) +def normalize_parameter_type(parameter_type: Any) -> tuple[Any, bool]: + """Return the canonicalized parameter type and whether it is required.""" + if parameter_type is None: + return None, True + canonical = utils_cwl.canonicalize_type(parameter_type) match canonical: case list() as options: - required = "null" not in options - non_null_types = [entry for entry in options if entry != "null"] - canonical = non_null_types[0] if non_null_types else options[0] + null_type = CWLAtomicType.NULL.value + required = null_type not in options + non_null_types = [entry for entry in options if entry != null_type] + canonical = non_null_types[0] if len(non_null_types) == 1 else non_null_types case _: required = True return canonical, required +def is_array_type(parameter_type: Any) -> bool: + """Return whether a normalized CWL type expression represents an array.""" + match parameter_type: + case {"type": "array"}: + return True + case list() as options: + return any(is_array_type(option) for option in options if option != CWLAtomicType.NULL.value) + case _: + return False + + def serialize_value(value: Any) -> Any: """Convert Path objects into YAML-safe values while preserving structure.""" match value: @@ -49,18 +58,81 @@ def serialize_value(value: Any) -> Any: return value +def infer_literal_parameter_type(value: Any) -> Any: + """Infer a CWL type expression from a Python literal when practical.""" + match value: + case None: + return CWLAtomicType.NULL.value + case bool(): + return CWLAtomicType.BOOLEAN.value + case int(): + return CWLAtomicType.INT.value + case float(): + return CWLAtomicType.FLOAT.value + case str(): + return CWLAtomicType.STRING.value + case Path() as path if path.exists(): + return CWLAtomicType.DIRECTORY.value if path.is_dir() else CWLAtomicType.FILE.value + case Path() as path if path.suffix: + return CWLAtomicType.FILE.value + 
case Path(): + return CWLAtomicType.DIRECTORY.value + case list() | tuple() as items: + if not items: + return None + inferred_item_types = [] + for item in items: + inferred = infer_literal_parameter_type(item) + if inferred is None: + return None + inferred_item_types.append(inferred) + unique_types = [] + for inferred in inferred_item_types: + if inferred not in unique_types: + unique_types.append(inferred) + if len(unique_types) != 1: + return None + return {"type": "array", "items": unique_types[0]} + case {"class": "File" | "Directory" as class_name}: + return class_name + case _: + return None + + +def _validate_fs_object(path_value: Path, *, class_name: str) -> Path: + if class_name == "Directory": + if not path_value.is_dir(): + raise InvalidInputValueError(f"{str(path_value)} is not a directory") + return path_value + if class_name == "File": + if not path_value.is_file(): + raise InvalidInputValueError(f"{str(path_value)} is not a file") + return path_value + raise InvalidInputValueError(f"Unsupported CWL object class {class_name!r}") + + def get_value_from_cfg(value: Any) -> Any: + """Normalize config values into Python values accepted by the DSL. + + This supports the common CWL input-object shapes users put in YAML config + files, notably `File`, `Directory`, and arrays/records containing them. 
+ """ match value: - case dict() as data if "Directory" in data.values(): + case list() as items: + return [get_value_from_cfg(item) for item in items] + case tuple() as items: + return [get_value_from_cfg(item) for item in items] + case dict() as data if data.get("class") in {"Directory", "File"}: try: - value_ = Path(data["location"]) + path_text = data.get("location", data.get("path")) + if path_text is None: + raise KeyError("location") + path_value = Path(path_text) except Exception as exc: raise InvalidInputValueError() from exc - if not value_.is_dir(): - raise InvalidInputValueError(f"{str(value_)} is not a directory") - return value_ - case dict(): - return value + return _validate_fs_object(path_value, class_name=str(data["class"])) + case dict() as data: + return {key: get_value_from_cfg(item) for key, item in data.items()} case _: return value diff --git a/src/sophios/apis/python/_workflow_runtime.py b/src/sophios/apis/python/_workflow_runtime.py new file mode 100644 index 00000000..a5ff5a2a --- /dev/null +++ b/src/sophios/apis/python/_workflow_runtime.py @@ -0,0 +1,521 @@ +"""Internal runtime helpers for the Python workflow API. + +This module keeps filesystem loading, compilation, and execution details out +of `api.py` so the public `Step` and `Workflow` classes stay focused on the +Python-facing DSL. +""" + +from __future__ import annotations + +# pylint: disable=protected-access +# This module is the private adapter layer between the DSL objects and the +# legacy compiler/runtime internals, so reaching internal state is intentional. 
+ +import logging +from pathlib import Path, PurePath +from typing import TYPE_CHECKING, Any, Mapping, Protocol, TypeVar + +import yaml +from cwl_utils.parser import CommandLineTool as CWLCommandLineTool +from cwl_utils.parser import load_document_by_uri, load_document_by_yaml + +from sophios import compiler, input_output, plugins, post_compile as pc, run_local as rl +from sophios.cli import get_dicts_for_compilation, get_known_and_unknown_args +from sophios.utils import convert_args_dict_to_args_list, step_name_str +from sophios.utils_graphs import get_graph_reps +from sophios.wic_types import CompilerInfo, Json, RoseTree, StepId, Tool, Tools, YamlTree + +from ._errors import InvalidCLTError, InvalidStepError +from ._ports import InputParameter, OutputParameter, ParameterStore +from ._types import ScatterMethod +from ._utils import load_yaml as _load_yaml +from ._api_config import DEFAULT_RUN_ARGS + +if TYPE_CHECKING: + from .api import Step, Workflow + + +logger = logging.getLogger("WIC Python API") + +ParameterT = TypeVar("ParameterT") + + +class _CWLParameterDefinition(Protocol): # pylint: disable=too-few-public-methods + """Minimal structural type shared by parsed CWL input/output parameters.""" + + id: Any + type_: Any + + +def _parameter_name(parameter_id: Any) -> str: + """Normalize a CWL parameter id to its public parameter name.""" + text = str(parameter_id) + return text.rsplit("#", maxsplit=1)[-1].rsplit("/", maxsplit=1)[-1] + + +def coerce_path(value: str | Path | None, *, field_name: str, allow_none: bool = False) -> Path | None: + """Normalize string-like path input to `Path`. + + Args: + value (str | Path | None): Incoming path-like value. + field_name (str): User-facing parameter name for error messages. + allow_none (bool): Whether `None` should be accepted. + + Raises: + TypeError: If the value is neither a `Path`, `str`, nor allowed `None`. + + Returns: + Path | None: The normalized path or `None`. 
+ """ + match value: + case Path() as path: + return path + case str() as path_str: + return Path(path_str) + case None if allow_none: + return None + case _: + allowed = "Path or str, or None" if allow_none else "Path or str" + raise TypeError(f"{field_name} must be a {allowed}") + + +def normalize_workflow_name(workflow_name: str) -> str: + """Convert a user-facing workflow name into a filesystem-safe id. + + Args: + workflow_name (str): Original workflow name. + + Returns: + str: Normalized workflow id used by the Python API. + """ + normalized_name = workflow_name.lstrip("/").lstrip(" ") + parts = PurePath(normalized_name).parts + return "_".join(part for part in parts if part).lstrip("_").replace(" ", "_") + + +def lookup_parameter( + parameters: ParameterStore[ParameterT], + name: str, + *, + owner_name: str, + kind: str, +) -> ParameterT: + """Return a parameter from a named parameter store. + + Args: + parameters (ParameterStore[ParameterT]): Store holding the available parameters. + name (str): Requested parameter name. + owner_name (str): Human-readable process name for error messages. + kind (str): Parameter kind, such as `"input"` or `"output"`. + + Raises: + AttributeError: If the parameter does not exist. + + Returns: + ParameterT: The requested parameter object. 
+ """ + try: + return parameters.get(name) + except KeyError as exc: + raise AttributeError(f"{owner_name!r} has no {kind} named {name!r}") from exc + + +def _validate_scatter_assignment(items: list[Any], owner: Any | None = None) -> None: + """Validate `scatter` assignments on a step.""" + if not all(isinstance(item, InputParameter) for item in items): + raise TypeError("all scatter inputs must be InputParameter type") + if len({id(item) for item in items}) != len(items): + raise ValueError("scatter inputs must be unique") + if owner is None: + return + for item in items: + if item.parent_obj is not owner: + raise ValueError("scatter inputs must belong to the same step") + if not item.is_bound(): + raise ValueError("scatter inputs must be bound before scattering") + if not item.is_scatterable(): + raise ValueError("scatter inputs must be bound to array-valued data") + + +def _validate_scatter_method_assignment(scatter_method: str) -> None: + """Validate the `scatterMethod` special step attribute.""" + allowed = {member.value for member in ScatterMethod} + if scatter_method not in allowed: + raise ValueError( + "Invalid value for scatterMethod. " + f"Valid values are: {', '.join(sorted(allowed))}" + ) + + +def _validate_when_assignment(condition: str) -> None: + """Validate the `when` JavaScript expression wrapper.""" + if not condition.startswith("$(") or not condition.endswith(")"): + raise ValueError("Invalid input to when. The js string must start with '$(' and end with ')'") + + +def validate_step_assignment(name: str, value: Any, *, owner: Any | None = None) -> None: + """Validate assignments to special step attributes. + + Args: + name (str): Attribute name being assigned. + value (Any): Candidate value for that attribute. + owner (Any | None): Optional `Step` owning the assignment. + + Raises: + TypeError: If `scatter` is not a list of `InputParameter` values. + ValueError: If `scatterMethod` or `when` receive invalid values. 
+ + Returns: + None: Validation happens for its side effect of raising on invalid input. + """ + match name, value: + case "scatter", list() as items: + _validate_scatter_assignment(items, owner=owner) + case "scatter", invalid if invalid: + raise TypeError("scatter must be assigned a list of InputParameter values") + case "scatterMethod", str() as scatter_method if scatter_method: + _validate_scatter_method_assignment(scatter_method) + case "when", str() as condition if condition: + _validate_when_assignment(condition) + case "when", invalid if invalid: + raise ValueError("Invalid input to when. The js string must start with '$(' and end with ')'") + + +def populate_parameters( + cwl_parameters: list[_CWLParameterDefinition], + store: ParameterStore[Any], + parameter_cls: type[InputParameter] | type[OutputParameter], + *, + parent: Any, +) -> None: + """Populate a parameter store from CWL input or output declarations. + + Args: + cwl_parameters (list[_CWLParameterDefinition]): Parsed CWL parameters. + store (ParameterStore[Any]): Destination store for Python API parameter wrappers. + parameter_cls (type[InputParameter] | type[OutputParameter]): Wrapper type to instantiate. + parent (Any): Owning `Step` or `Workflow`. + + Returns: + None: The destination store is populated in place. + """ + for parameter in cwl_parameters: + store.add(parameter_cls(_parameter_name(parameter.id), parameter.type_, parent_obj=parent)) + + +def load_clt(clt_path: Path, tool_registry: Tools) -> tuple[CWLCommandLineTool, dict[str, Any]]: + """Load a CWL CommandLineTool from disk or a fallback registry. + + Args: + clt_path (Path): Filesystem path to the CWL tool. + tool_registry (Tools): Registry used when the file is unavailable on disk. + + Raises: + InvalidCLTError: If the tool cannot be loaded from disk or the registry. + + Returns: + tuple[CWLCommandLineTool, dict[str, Any]]: Parsed CWL object and raw YAML. 
+ """ + stepid = StepId(clt_path.stem, "global") + + if clt_path.exists(): + try: + clt = load_document_by_uri(clt_path) + except Exception as exc: + raise InvalidCLTError(f"invalid cwl file: {clt_path}") from exc + yaml_file = _load_yaml(clt_path) + tool_registry[stepid] = Tool(str(clt_path), yaml_file) + return clt, yaml_file + + if stepid in tool_registry: + tool = tool_registry[stepid] + logger.info("%s does not exist, but %s was found in the provided tool registry.", clt_path, clt_path.stem) + logger.info("Using file contents from %s", tool.run_path) + yaml_file = tool.cwl + clt = load_document_by_yaml(yaml_file, tool.run_path) + return clt, yaml_file + + logger.warning("Warning! %s does not exist, and", clt_path) + logger.warning("%s was not found in the provided tool registry.", clt_path.stem) + raise InvalidCLTError(f"invalid cwl file: {clt_path}") + + +def load_clt_document( + document: Mapping[str, Any], + *, + run_path: Path, +) -> tuple[CWLCommandLineTool, dict[str, Any]]: + """Load an in-memory CWL CommandLineTool document. + + Args: + document (Mapping[str, Any]): Parsed CWL document. + run_path (Path): Virtual run path used as the tool base URI. + + Raises: + TypeError: If `document` does not normalize to a mapping. + InvalidCLTError: If the CWL document cannot be parsed. + + Returns: + tuple[CWLCommandLineTool, dict[str, Any]]: Parsed CWL object and normalized YAML. 
+ """ + yaml_file = yaml.safe_load(yaml.safe_dump(dict(document), sort_keys=False)) + if not isinstance(yaml_file, dict): + raise TypeError("document must be a mapping of CWL fields") + try: + clt = load_document_by_yaml(yaml_file, str(run_path)) + except Exception as exc: + raise InvalidCLTError(f"invalid cwl document for: {run_path}") from exc + return clt, yaml_file + + +def workflow_document( + workflow: Workflow, + *, + inline_subtrees: bool, + directory: Path | None = None, + concrete_step_ids: bool = False, +) -> dict[str, Any]: + """Render a workflow into its in-memory WIC YAML representation. + + Args: + workflow (Workflow): Workflow to serialize. + inline_subtrees (bool): Whether nested workflows should be embedded inline. + directory (Path | None): Output directory for sibling `.wic` files. + concrete_step_ids (bool): Whether workflow outputs should use the + compiler's concrete step ids instead of the user-facing step names. + + Returns: + dict[str, Any]: Serialized workflow document. 
+ """ + from .api import Workflow # pylint: disable=import-outside-toplevel + + workflow_inputs: dict[str, dict[str, Any]] = {} + for parameter in workflow._inputs: + cwl_type = parameter.cwl_type() + if cwl_type is None: + raise InvalidStepError( + f"workflow input {workflow.process_name}.{parameter.name} has no resolved type" + ) + workflow_inputs[parameter.name] = {"type": cwl_type} + + compiled_step_ids = ( + { + step.process_name: step_name_str( + workflow.process_name, + index, + f"{step.process_name}.wic" if isinstance(step, Workflow) else step.process_name, + ) + for index, step in enumerate(workflow.steps) + } + if concrete_step_ids + else None + ) + + workflow_outputs: dict[str, dict[str, Any]] = {} + for output_parameter in workflow._outputs: + workflow_outputs[output_parameter.name] = output_parameter.to_workflow_output( + step_id_overrides=compiled_step_ids + ) + + steps_yaml = [ + step._as_workflow_step(inline_subtrees=inline_subtrees, directory=directory) + for step in workflow.steps + ] + document: dict[str, Any] = {"steps": steps_yaml} + if workflow_inputs: + document["inputs"] = workflow_inputs + if workflow_outputs: + document["outputs"] = workflow_outputs + return document + + +def write_workflow_ast_to_disk(workflow: Workflow, directory: Path) -> None: + """Write a workflow tree to disk as `.wic` files. + + Args: + workflow (Workflow): Workflow to serialize. + directory (Path): Destination directory. + + Returns: + None: Files are written to disk as a side effect. 
+ """ + yaml_contents = workflow_document(workflow, inline_subtrees=False, directory=directory) + directory.mkdir(exist_ok=True, parents=True) + output_path = directory / f"{workflow.process_name}.wic" + with output_path.open(mode="w", encoding="utf-8") as file_handle: + file_handle.write(yaml.dump(yaml_contents, sort_keys=False, line_break="\n", indent=2)) + + +def _extract_tools_paths_nonportable(steps: list[Step]) -> Tools: + """Extract concrete tool definitions from instantiated steps. + + Args: + steps (list[Step]): Steps whose backing CWL tools should be collected. + + Returns: + Tools: A registry keyed by `StepId` that preserves local, non-portable paths. + """ + return {StepId(step.process_name, "global"): Tool(str(step.clt_path), step.yaml) for step in steps} + + +def _step_registries(steps: list[Step]) -> Tools: + merged_tools: Tools = {} + for step in steps: + merged_tools.update(step._tool_registry) + return merged_tools + + +def _merged_known_tools(steps: list[Step], tool_registry: Tools | None = None) -> Tools: + merged_tools = dict(_extract_tools_paths_nonportable(steps)) + merged_tools.update(_step_registries(steps)) + if tool_registry is not None: + merged_tools.update(tool_registry) + return merged_tools + + +def compile_workflow( + workflow: Workflow, + *, + write_to_disk: bool = False, + tool_registry: Tools | None = None, +) -> CompilerInfo: + """Compile a Python API workflow into CWL. + + Args: + workflow (Workflow): Workflow to compile. + write_to_disk (bool): Whether to also emit generated files under `autogenerated/`. + tool_registry (Tools | None): Optional tool registry override. + + Returns: + CompilerInfo: The compiler output for the workflow. 
+ """ + workflow._validate() + + graph = get_graph_reps(workflow.process_name) + yaml_tree = YamlTree( + StepId(workflow.process_name, "global"), + workflow_document(workflow, inline_subtrees=True, concrete_step_ids=True), + ) + merged_tools = _merged_known_tools(workflow.flatten_steps(), tool_registry) + + compiler_options, graph_settings, yaml_tag_paths = get_dicts_for_compilation() + compiler_info = compiler.compile_workflow( + yaml_tree, + compiler_options, + graph_settings, + yaml_tag_paths, + [], + [graph], + {}, + {}, + {}, + {}, + merged_tools, + True, + relative_run_path=True, + testing=False, + ) + if write_to_disk: + input_output.write_to_disk(compiler_info.rose, Path("autogenerated/"), True) + + return compiler_info + + +def runtime_rose_tree(workflow: Workflow, *, tool_registry: Tools | None = None) -> RoseTree: + """Compile a workflow and inline runtime tags for local execution. + + Args: + workflow (Workflow): Workflow to prepare for execution. + tool_registry (Tools | None): Optional tool registry override. + + Returns: + RoseTree: Runtime-ready rose tree. + """ + return pc.cwl_inline_runtag(compile_workflow(workflow, tool_registry=tool_registry).rose) + + +def compiled_cwl_json(workflow: Workflow, *, tool_registry: Tools | None = None) -> Json: + """Return the compiled CWL workflow document plus generated inputs. + + Args: + workflow (Workflow): Workflow to compile. + tool_registry (Tools | None): Optional tool registry override. + + Returns: + Json: JSON-serializable compiled workflow payload. + """ + rose_tree = runtime_rose_tree(workflow, tool_registry=tool_registry) + sub_node_data = rose_tree.data + return { + "name": workflow.process_name, + "yaml_inputs": sub_node_data.workflow_inputs_file, + **sub_node_data.compiled_cwl, + } + + +def effective_run_args(run_args_dict: dict[str, str] | None = None) -> dict[str, str]: + """Merge user runtime arguments with the default local-run settings. 
+ + Args: + run_args_dict (dict[str, str] | None): User-supplied runtime overrides. + + Returns: + dict[str, str]: Effective runtime argument mapping. + """ + effective = dict(DEFAULT_RUN_ARGS) + if run_args_dict: + effective.update(run_args_dict) + return effective + + +def run_workflow( + workflow: Workflow, + *, + run_args_dict: dict[str, str] | None = None, + user_env_vars: dict[str, str] | None = None, + basepath: str = "autogenerated", + tool_registry: Tools | None = None, +) -> None: + """Compile and execute a workflow locally. + + Args: + workflow (Workflow): Workflow to execute. + run_args_dict (dict[str, str] | None): Runtime CLI options for local execution. + user_env_vars (dict[str, str] | None): Environment variables to expose to the run. + basepath (str): Directory used for generated files and execution artifacts. + tool_registry (Tools | None): Optional tool registry override. + + Returns: + None: The workflow is executed as a side effect. + """ + logger.info("Running %s", workflow.process_name) + plugins.logging_filters() + + resolved_run_args = effective_run_args(run_args_dict) + rose_tree = runtime_rose_tree(workflow, tool_registry=tool_registry) + pc.find_and_create_output_dirs(rose_tree) + pc.verify_container_engine_config(resolved_run_args["container_engine"], False) + input_output.write_to_disk( + rose_tree, + Path(basepath), + True, + resolved_run_args.get("inputs_file", ""), + ) + pc.cwl_docker_extract( + resolved_run_args["container_engine"], + resolved_run_args["pull_dir"], + Path(basepath) / f"{workflow.process_name}.cwl", + ) + if resolved_run_args.get("docker_remove_entrypoints"): + rose_tree = pc.remove_entrypoints(resolved_run_args["container_engine"], rose_tree) + user_args = convert_args_dict_to_args_list(resolved_run_args) + + _, unknown_args = get_known_and_unknown_args(workflow.process_name, user_args) + rl.run_local( + resolved_run_args, + False, + workflow_name=workflow.process_name, + basepath=basepath, + 
passthrough_args=unknown_args, + user_env_vars=dict(user_env_vars or {}), + ) diff --git a/src/sophios/apis/python/api.py b/src/sophios/apis/python/api.py index e4690fd2..12820477 100644 --- a/src/sophios/apis/python/api.py +++ b/src/sophios/apis/python/api.py @@ -1,55 +1,55 @@ -# pylint: disable=W1203 +# pylint: disable=logging-fstring-interpolation,too-many-lines,protected-access """Python API for building CWL/WIC workflows.""" from __future__ import annotations import logging -import os -from pathlib import Path, PurePath -from typing import Any, Dict, Optional, TypeVar, Union +import warnings +from collections.abc import Sequence +from pathlib import Path +from typing import Any, ClassVar, Mapping -import cwl_utils.parser as cu_parser -import yaml from cwl_utils.parser import CommandLineTool as CWLCommandLineTool -from cwl_utils.parser import load_document_by_uri, load_document_by_yaml -from sophios import compiler, input_output, plugins -from sophios import post_compile as pc -from sophios import run_local as rl -from sophios.cli import get_dicts_for_compilation, get_known_and_unknown_args -from sophios.utils import convert_args_dict_to_args_list -from sophios.utils_graphs import get_graph_reps -from sophios.wic_types import CompilerInfo, Json, RoseTree, StepId, Tool, Tools, YamlTree +from sophios.inference import types_match +from sophios.wic_types import CompilerInfo, Json, Tools from ._errors import ( - InvalidCLTError, - InvalidInputValueError, InvalidLinkError, InvalidStepError, MissingRequiredValueError, ) from ._ports import ( - ProcessInput, - ProcessOutput, - StepInputs, - StepOutputs, - WorkflowInputReference, - WorkflowInputs, - WorkflowOutputs, AliasBinding as _AliasBinding, + InputParameter, InlineBinding as _InlineBinding, + OutputSourceBinding, + OutputParameter, + ParameterNamespace, + ParameterStore, WorkflowBinding as _WorkflowBinding, + WorkflowInputReference, ) -from ._types import ScatterMethod from ._utils import ( - default_dict as 
_default_dict, + infer_literal_parameter_type as _infer_literal_parameter_type, get_value_from_cfg as _get_value_from_cfg, load_yaml as _load_yaml, ) -from .api_config import default_values - - -global_config: Tools = {} +from ._types import ScatterMethod +from ._workflow_runtime import ( + coerce_path as _coerce_path, + compile_workflow as _compile_workflow, + load_clt_document as _load_clt_document, + load_clt as _load_clt, + lookup_parameter as _lookup_parameter, + normalize_workflow_name as _normalize_workflow_name, + populate_parameters as _populate_parameters, + run_workflow as _run_workflow, + validate_step_assignment as _validate_step_assignment, + workflow_document as _workflow_document, + compiled_cwl_json as _compiled_cwl_json, + write_workflow_ast_to_disk as _write_workflow_ast_to_disk, +) logger = logging.getLogger("WIC Python API") @@ -67,74 +67,132 @@ def filter(self, record: logging.LogRecord) -> bool: logger_wicad.addFilter(DisableEverythingFilter()) -CWLInputParameter = Union[ - cu_parser.cwl_v1_0.CommandInputParameter, - cu_parser.cwl_v1_1.CommandInputParameter, - cu_parser.cwl_v1_2.CommandInputParameter, -] - -CWLOutputParameter = Union[ - cu_parser.cwl_v1_0.CommandOutputParameter, - cu_parser.cwl_v1_1.CommandOutputParameter, - cu_parser.cwl_v1_2.CommandOutputParameter, -] - -StrPath = TypeVar("StrPath", str, Path) - - -def _load_clt(clt_path: Path) -> tuple[CWLCommandLineTool, dict[str, Any]]: - stepid = StepId(clt_path.stem, "global") - - if clt_path.exists(): - try: - clt = load_document_by_uri(clt_path) - except Exception as exc: - raise InvalidCLTError(f"invalid cwl file: {clt_path}") from exc - yaml_file = _load_yaml(clt_path) - global_config[stepid] = Tool(str(clt_path), yaml_file) - return clt, yaml_file - - if stepid in global_config: - tool = global_config[stepid] - logger.info( - "%s does not exist, but %s was found in the global config.", - clt_path, - clt_path.stem, +StrPath = str | Path + + +def _parameter_namespace( + store: 
ParameterStore[Any], + getter: Any, + setter: Any, + *, + read_only_error: str, +) -> ParameterNamespace[Any, Any]: + """Create the list-like attribute proxy used for `.inputs` and `.outputs`.""" + return ParameterNamespace(store, getter, setter, read_only_error=read_only_error) + + +def _resolve_parameter_type( + parameter: InputParameter | OutputParameter, + candidate_type: Any, + *, + context: str, +) -> None: + """Infer or validate a parameter type against a new candidate.""" + if candidate_type is None: + return + if parameter.parameter_type is None: + parameter.set_parameter_type(candidate_type) + return + if not types_match(parameter.parameter_type, candidate_type): + raise InvalidLinkError( + f"{context} has incompatible types: expected {parameter.parameter_type!r}, got {candidate_type!r}" ) - logger.info("Using file contents from %s", tool.run_path) - yaml_file = tool.cwl - clt = load_document_by_yaml(yaml_file, tool.run_path) - return clt, yaml_file - logger.warning("Warning! %s does not exist, and", clt_path) - logger.warning("%s was not found in the global config.", clt_path.stem) - raise InvalidCLTError(f"invalid cwl file: {clt_path}") + +def _warn_implicit_workflow_parameter(workflow: Workflow, name: str, kind: str) -> None: + """Warn when compatibility syntax implicitly declares workflow interface.""" + warnings.warn( + ( + f"Implicitly declaring workflow {kind} {name!r} on {workflow.process_name!r}. " + f"Prefer explicit {kind}s via workflow.add_{kind}(...), workflow.{kind}s.{name}, " + f"or typed bindings so interface drift is easier to spot." 
+ ), + UserWarning, + stacklevel=3, + ) def _bind_process_input(process_self: Any, input_name: str, value: Any) -> None: input_port = process_self.get_inp_attr(input_name) + # This is the central compatibility switchboard for the Python API: + # - workflow.input_name means "formal workflow parameter" + # - step.output_name means "link to upstream step output" + # - everything else is treated as a literal inline value match value: - case WorkflowInputReference(workflow=workflow, name=name): - workflow._ensure_input(name) - input_port._set_binding(_WorkflowBinding(name)) - case ProcessOutput(parent_obj=Workflow(), name=name) as output: + case WorkflowInputReference(workflow=workflow, name=name, implicit=implicit): + workflow_input = workflow._ensure_input(name, parameter_type=input_port.parameter_type, implicit=implicit) input_port._set_binding(_WorkflowBinding(name)) - output.linked = True - case ProcessOutput() as output: + input_port.set_bound_parameter_type(workflow_input.parameter_type) + case OutputParameter(parent_obj=Workflow(), name=name): + raise InvalidLinkError( + f"Workflow output {name!r} cannot be bound as an input. " + f"Use workflow.inputs.{name} for formal inputs or workflow.outputs.{name} = ... for outputs." 
+ ) + case OutputParameter() as output: + _resolve_parameter_type( + input_port, + output.parameter_type, + context=f"{process_self.process_name}.{input_name}", + ) anchor_name = output.ensure_anchor(f"{input_name}{process_self.process_name}") input_port._set_binding(_AliasBinding(anchor_name)) + input_port.set_bound_parameter_type(output.parameter_type) case _: input_port._set_binding(_InlineBinding(value)) + input_port.set_bound_parameter_type(_infer_literal_parameter_type(value)) -def set_input_Step_Workflow(process_self: Any, __name: str, __value: Any) -> None: - """Compatibility wrapper for the legacy helper name.""" - _bind_process_input(process_self, __name, __value) +def _bind_workflow_output(workflow: Workflow, output_name: str, value: Any) -> None: + output_parameter = workflow.add_output(output_name, implicit=True) + match value: + case OutputParameter(parent_obj=Step(process_name=process_name), name=name) as source: + _resolve_parameter_type( + output_parameter, + source.parameter_type, + context=f"{workflow.process_name}.outputs.{output_name}", + ) + output_parameter.bind_source(OutputSourceBinding(process_name, name)) + source.linked = True + case WorkflowInputReference(workflow=source_workflow, name=name) if source_workflow is workflow: + input_parameter = workflow._ensure_input(name) + _resolve_parameter_type( + output_parameter, + input_parameter.parameter_type, + context=f"{workflow.process_name}.outputs.{output_name}", + ) + output_parameter.bind_source(OutputSourceBinding(None, name)) + case _: + raise InvalidLinkError( + "workflow outputs must be bound to a step output or a workflow input reference" + ) class Step: - """A workflow step backed by a CWL CommandLineTool.""" + """A workflow step backed by a CWL `CommandLineTool`. + + Attribute writes like `step.message = "hi"` bind named step inputs. + Attribute reads like `step.output_file` resolve named step outputs. 
The + same ports are also available through the explicit `step.inputs.*` and + `step.outputs.*` namespaces. + """ + + _SYSTEM_ATTRS: ClassVar[set[str]] = { + "clt", + "clt_path", + "process_name", + "cwl_version", + "yaml", + "cfg_yaml", + "_tool_registry", + "_inputs", + "_outputs", + "inputs", + "outputs", + "scatter", + "scatterMethod", + "when", + } clt: CWLCommandLineTool clt_path: Path @@ -142,146 +200,230 @@ class Step: cwl_version: str yaml: dict[str, Any] cfg_yaml: dict[str, Any] - _inputs: list[ProcessInput] - _outputs: list[ProcessOutput] - _input_map: dict[str, ProcessInput] - _output_map: dict[str, ProcessOutput] - _input_names: list[str] - _output_names: list[str] - inputs: StepInputs - outputs: StepOutputs - scatter: list[ProcessInput] + _tool_registry: Tools + _inputs: ParameterStore[InputParameter] + _outputs: ParameterStore[OutputParameter] + inputs: ParameterNamespace[InputParameter, InputParameter] + outputs: ParameterNamespace[OutputParameter, OutputParameter] + scatter: list[InputParameter] scatterMethod: str when: str - def __init__(self, clt_path: StrPath, config_path: Optional[StrPath] = None): - match clt_path: - case Path(): - clt_path_ = clt_path - case str(): - clt_path_ = Path(clt_path) - case _: - raise TypeError("cwl_path must be a Path or str") - clt, yaml_file = _load_clt(clt_path_) - - cfg_yaml: dict[str, Any] - match config_path: - case Path(): - cfg_yaml = _load_yaml(config_path) - case str(): - cfg_yaml = _load_yaml(Path(config_path)) - case None: - cfg_yaml = _default_dict() - case _: - raise TypeError("config_path must be a Path, str, or None") + def __init__( + self, + clt_path: StrPath, + config_path: StrPath | None = None, + *, + tool_registry: Tools | None = None, + ): + """Create a `Step` from a CWL CommandLineTool file. + + Args: + clt_path (StrPath): Path to the CWL tool definition. + config_path (StrPath | None): Optional YAML config used to pre-bind inputs. 
+ tool_registry (Tools | None): Optional fallback registry for known tools. + + Raises: + TypeError: If `clt_path` or `config_path` uses an unsupported type. + InvalidCLTError: If the CWL tool cannot be loaded from disk or the registry. + + Returns: + None: The step is initialized in place. + """ + clt_path_ = _coerce_path(clt_path, field_name="clt_path") + config_path_ = _coerce_path(config_path, field_name="config_path", allow_none=True) + assert clt_path_ is not None + resolved_registry = {} if tool_registry is None else tool_registry + clt, yaml_file = _load_clt(clt_path_, resolved_registry) + cfg_yaml = _load_yaml(config_path_) if config_path_ is not None else {} + + self._initialize_loaded_tool( + clt=clt, + yaml_file=yaml_file, + clt_path=clt_path_, + cfg_yaml=cfg_yaml, + tool_registry=resolved_registry, + ) + + @classmethod + def from_cwl( + cls, + document: Mapping[str, Any], + *, + process_name: str | None = None, + run_path: StrPath | None = None, + config: Mapping[str, Any] | None = None, + tool_registry: Tools | None = None, + ) -> Step: + # pylint: disable=too-many-arguments + """Create a `Step` from an in-memory CWL CommandLineTool document. + + Args: + document (Mapping[str, Any]): Parsed CWL CommandLineTool fields. + process_name (str | None): Optional step name override. + run_path (StrPath | None): Optional virtual `.cwl` path for compiler bookkeeping. + config (Mapping[str, Any] | None): Optional input values to pre-bind. + tool_registry (Tools | None): Optional tool registry retained on the step. + + Raises: + TypeError: If `run_path` uses an unsupported type. + InvalidCLTError: If the CWL document cannot be parsed. + + Returns: + Step: A fully initialized step backed by the in-memory tool. 
+ """ + default_name = process_name or str(document.get("id") or "in_memory_tool") + run_path_value = run_path or f"{default_name}.cwl" + clt_path = _coerce_path(run_path_value, field_name="run_path") + assert clt_path is not None + resolved_registry = {} if tool_registry is None else tool_registry + clt, yaml_file = _load_clt_document(document, run_path=clt_path) + + step = cls.__new__(cls) + step._initialize_loaded_tool( + clt=clt, + yaml_file=yaml_file, + clt_path=clt_path, + cfg_yaml=dict(config or {}), + tool_registry=resolved_registry, + process_name=process_name, + ) + return step + + def _initialize_loaded_tool( + self, + *, + clt: CWLCommandLineTool, + yaml_file: dict[str, Any], + clt_path: Path, + cfg_yaml: Mapping[str, Any], + tool_registry: Tools, + process_name: str | None = None, + ) -> None: + # pylint: disable=too-many-arguments + """Populate a step from an already parsed CLT and optional config. + + Args: + clt (CWLCommandLineTool): Parsed CWL tool object. + yaml_file (dict[str, Any]): Raw CWL document. + clt_path (Path): Filesystem or virtual path representing the tool. + cfg_yaml (Mapping[str, Any]): Optional input bindings to apply. + tool_registry (Tools): Tool registry preserved on the step. + process_name (str | None): Optional explicit step name override. + + Returns: + None: The step is initialized in place. 
+ """ + resolved_name = process_name or clt_path.stem object.__setattr__(self, "clt", clt) - object.__setattr__(self, "clt_path", clt_path_) - object.__setattr__(self, "process_name", clt_path_.stem) + object.__setattr__(self, "clt_path", clt_path) + object.__setattr__(self, "process_name", resolved_name) object.__setattr__(self, "cwl_version", clt.cwlVersion) object.__setattr__(self, "yaml", yaml_file) - object.__setattr__(self, "cfg_yaml", cfg_yaml) - object.__setattr__(self, "_inputs", []) - object.__setattr__(self, "_outputs", []) - object.__setattr__(self, "_input_map", {}) - object.__setattr__(self, "_output_map", {}) - object.__setattr__(self, "_input_names", []) - object.__setattr__(self, "_output_names", []) - object.__setattr__(self, "inputs", StepInputs(self)) - object.__setattr__(self, "outputs", StepOutputs(self)) + object.__setattr__(self, "cfg_yaml", dict(cfg_yaml)) + object.__setattr__(self, "_tool_registry", tool_registry) + object.__setattr__(self, "_inputs", ParameterStore()) + object.__setattr__(self, "_outputs", ParameterStore()) + # This proxy is the main bit of API "magic": it supports both + # list-style access (`step.inputs[0]`) and named attribute access + # (`step.inputs.message`) without duplicating wrapper classes. 
+ object.__setattr__( + self, + "inputs", + _parameter_namespace(self._inputs, self.get_inp_attr, self.bind_input, read_only_error=""), + ) + object.__setattr__( + self, + "outputs", + _parameter_namespace( + self._outputs, + self.get_output, + None, + read_only_error="Step outputs are read-only; cannot set {name!r}", + ), + ) object.__setattr__(self, "scatter", []) object.__setattr__(self, "scatterMethod", "") object.__setattr__(self, "when", "") - self._populate_inputs(clt.inputs) - self._populate_outputs(clt.outputs) + _populate_parameters(clt.inputs, self._inputs, InputParameter, parent=self) + _populate_parameters(clt.outputs, self._outputs, OutputParameter, parent=self) - if config_path: + if self.cfg_yaml: self._set_from_io_cfg() - def _populate_inputs(self, cwl_inputs: list[CWLInputParameter]) -> None: - for input_param in cwl_inputs: - port = ProcessInput(str(input_param.id), input_param.type_, parent_obj=self) - self._inputs.append(port) - self._input_map[port.name] = port - self._input_names.append(port.name) - - def _populate_outputs(self, cwl_outputs: list[CWLOutputParameter]) -> None: - for output_param in cwl_outputs: - port = ProcessOutput(str(output_param.id), output_param.type_, parent_obj=self) - self._outputs.append(port) - self._output_map[port.name] = port - self._output_names.append(port.name) - def __repr__(self) -> str: return f"Step(clt_path={self.clt_path!r})" def __setattr__(self, name: str, value: Any) -> None: - if name in { - "clt", - "clt_path", - "process_name", - "cwl_version", - "yaml", - "cfg_yaml", - "_inputs", - "_outputs", - "_input_map", - "_output_map", - "_input_names", - "_output_names", - "inputs", - "outputs", - "scatter", - "scatterMethod", - "when", - }: - match name, value: - case "scatter", list() as items: - if not all(isinstance(item, ProcessInput) for item in items): - raise TypeError("all scatter inputs must be ProcessInput type") - case "scatterMethod", scatter_method if scatter_method: - allowed = {member.value 
for member in ScatterMethod} - if scatter_method not in allowed: - raise ValueError( - "Invalid value for scatterMethod. " - f"Valid values are: {', '.join(sorted(allowed))}" - ) - case "when", str() as condition if condition: - if not condition.startswith("$(") or not condition.endswith(")"): - raise ValueError("Invalid input to when. The js string must start with '$(' and end with ')'") - case "when", invalid if invalid: - raise ValueError("Invalid input to when. The js string must start with '$(' and end with ')'") + if name in self._SYSTEM_ATTRS: + _validate_step_assignment(name, value, owner=self) object.__setattr__(self, name, value) return - if "_input_map" in self.__dict__ and name in self._input_map: + # Legacy sugar is intentionally preserved: assigning to a known input + # parameter name binds that input instead of setting a plain attribute. + if "_inputs" in self.__dict__ and name in self._inputs: self.bind_input(name, value) return - - object.__setattr__(self, name, value) + if "_outputs" in self.__dict__ and name in self._outputs: + raise AttributeError(f"Step outputs are read-only; cannot set {name!r}") + raise AttributeError( + f"{self.process_name!r} has no input named {name!r}. " + "Use step.inputs. for declared inputs only." + ) def __getattr__(self, name: str) -> Any: if name.startswith("__"): raise AttributeError(name) - if name in self._output_map: - return self._output_map[name] + if name in self._outputs: + return self._outputs.get(name) raise AttributeError(f"{self.__class__.__name__!s} has no attribute {name!r}") def bind_input(self, name: str, value: Any) -> None: - if name not in self._input_map: - raise AttributeError(f"{self.process_name!r} has no input named {name!r}") + """Bind a value or upstream output to a named step input parameter. + + Args: + name (str): The input parameter name. + value (Any): A literal value, a workflow input reference, or a step output. 
+ + Raises: + AttributeError: If the named input does not exist on the step. + + Returns: + None: The step is mutated in place. + """ + _lookup_parameter(self._inputs, name, owner_name=self.process_name, kind="input") _bind_process_input(self, name, value) - def get_inp_attr(self, name: str) -> ProcessInput: - if name not in self._input_map: - raise AttributeError(f"{self.process_name!r} has no input named {name!r}") - return self._input_map[name] + def get_inp_attr(self, name: str) -> InputParameter: + """Return a named input parameter from this step. + + Args: + name (str): The input parameter name. + + Raises: + AttributeError: If the input does not exist. + + Returns: + InputParameter: The requested step input parameter. + """ + return _lookup_parameter(self._inputs, name, owner_name=self.process_name, kind="input") + + def get_output(self, name: str) -> OutputParameter: + """Return a named output parameter from this step. - def get_output(self, name: str) -> ProcessOutput: - if name not in self._output_map: - raise AttributeError(f"{self.process_name!r} has no output named {name!r}") - return self._output_map[name] + Args: + name (str): The output parameter name. + + Raises: + AttributeError: If the output does not exist. + + Returns: + OutputParameter: The requested step output parameter. 
+ """ + return _lookup_parameter(self._outputs, name, owner_name=self.process_name, kind="output") def _set_from_io_cfg(self) -> None: for name, value in self.cfg_yaml.items(): @@ -292,24 +434,25 @@ def _validate(self) -> None: if input_port.required and not input_port.is_bound(): raise MissingRequiredValueError(f"{input_port.name} is required") - @property - def _yml(self) -> dict[str, Any]: - in_dict = { - input_port.name: input_port.to_yaml_value() - for input_port in self._inputs - if input_port.is_bound() - } + def flatten_steps(self) -> list[Step]: + """Return this step as a single-item list for recursive traversal.""" + return [self] - out_list = [ - {output_port.name: output_port.value} - for output_port in self._outputs - if output_port.value is not None - ] + def flatten_subworkflows(self) -> list[Workflow]: + """Return an empty subworkflow list because steps do not nest workflows.""" + return [] + + def _as_workflow_step(self, *, inline_subtrees: bool, directory: Path | None = None) -> dict[str, Any]: + del inline_subtrees, directory + return self._yml + @property + def _yml(self) -> dict[str, Any]: + """Return the internal WIC step representation for this step.""" step_yaml: dict[str, Any] = { "id": self.process_name, - "in": in_dict, - "out": out_list, + "in": {port.name: port.to_yaml_value() for port in self._inputs if port.is_bound()}, + "out": [{port.name: port.value} for port in self._outputs if port.value is not None], } if self.scatter: @@ -322,67 +465,75 @@ def _yml(self) -> dict[str, Any]: return step_yaml -def extract_tools_paths_NONPORTABLE(steps: list[Step]) -> Tools: - """Extract the non-portable tool paths from the instantiated steps.""" - return {StepId(step.process_name, "global"): Tool(str(step.clt_path), step.yaml) for step in steps} - - class Workflow: - """A WIC workflow composed from Steps and/or nested Workflows.""" - - steps: list[Any] + """A WIC workflow composed from `Step` objects and nested `Workflow`s.""" + + _SYSTEM_ATTRS: 
ClassVar[set[str]] = { + "steps", + "process_name", + "_inputs", + "_outputs", + "inputs", + "outputs", + "yml_path", + } + + steps: list[Step | Workflow] process_name: str - _inputs: list[ProcessInput] - _outputs: list[ProcessOutput] - _input_map: dict[str, ProcessInput] - _output_map: dict[str, ProcessOutput] - _input_names: list[str] - _output_names: list[str] - _input_references: dict[str, WorkflowInputReference] - inputs: WorkflowInputs - outputs: WorkflowOutputs - yml_path: Optional[Path] - - def __init__(self, steps: list[Any], workflow_name: str): - normalized_name = workflow_name.lstrip("/").lstrip(" ") - parts = PurePath(normalized_name).parts - normalized_name = "_".join(part for part in parts if part).lstrip("_").replace(" ", "_") - + _inputs: ParameterStore[InputParameter] + _outputs: ParameterStore[OutputParameter] + inputs: ParameterNamespace[InputParameter, WorkflowInputReference] + outputs: ParameterNamespace[OutputParameter, OutputParameter] + yml_path: Path | None + + def __init__(self, steps: Sequence[Step | Workflow], workflow_name: str): + """Create a workflow from steps and/or nested subworkflows. + + Args: + steps (Sequence[Step | Workflow]): Child workflow nodes in execution order. + workflow_name (str): User-facing workflow name. + + Returns: + None: The workflow is initialized in place. 
+ """ object.__setattr__(self, "steps", list(steps)) - object.__setattr__(self, "process_name", normalized_name) - object.__setattr__(self, "_inputs", []) - object.__setattr__(self, "_outputs", []) - object.__setattr__(self, "_input_map", {}) - object.__setattr__(self, "_output_map", {}) - object.__setattr__(self, "_input_names", []) - object.__setattr__(self, "_output_names", []) - object.__setattr__(self, "_input_references", {}) - object.__setattr__(self, "inputs", WorkflowInputs(self)) - object.__setattr__(self, "outputs", WorkflowOutputs(self)) + object.__setattr__(self, "process_name", _normalize_workflow_name(workflow_name)) + object.__setattr__(self, "_inputs", ParameterStore()) + object.__setattr__(self, "_outputs", ParameterStore()) + object.__setattr__( + self, + "inputs", + _parameter_namespace( + self._inputs, + self._input_reference, + self._bind_input_from_namespace, + read_only_error="", + ), + ) + object.__setattr__( + self, + "outputs", + _parameter_namespace( + self._outputs, + self.add_output, + self._bind_output_from_namespace, + read_only_error="", + ), + ) object.__setattr__(self, "yml_path", None) def __repr__(self) -> str: return f"Workflow(process_name={self.process_name!r}, steps={len(self.steps)})" def __setattr__(self, name: str, value: Any) -> None: - if name in { - "steps", - "process_name", - "_inputs", - "_outputs", - "_input_map", - "_output_map", - "_input_names", - "_output_names", - "_input_references", - "inputs", - "outputs", - "yml_path", - }: + if name in self._SYSTEM_ATTRS: object.__setattr__(self, name, value) return - if "_input_map" in self.__dict__: + if "_inputs" in self.__dict__: + if name in self._outputs: + self.bind_output(name, value) + return self.bind_input(name, value) return @@ -391,43 +542,128 @@ def __setattr__(self, name: str, value: Any) -> None: def __getattr__(self, name: str) -> Any: if name.startswith("__"): raise AttributeError(name) - return self._ensure_input_reference(name) - - def 
_ensure_input(self, name: str) -> ProcessInput: - if name not in self._input_map: - logger.warning("Adding a new input %s to workflow %s", name, self.process_name) - port = ProcessInput(name, "Any", parent_obj=self) - self._inputs.append(port) - self._input_map[name] = port - self._input_names.append(name) - return self._input_map[name] - - def _ensure_input_reference(self, name: str) -> WorkflowInputReference: - self._ensure_input(name) - if name not in self._input_references: - self._input_references[name] = WorkflowInputReference(self, name) - return self._input_references[name] + return self._input_reference(name, implicit=True) + + def _ensure_input(self, name: str, parameter_type: Any = None, *, implicit: bool = True) -> InputParameter: + def create_input(parameter_name: str) -> InputParameter: + logger.warning("Adding a new input %s to workflow %s", parameter_name, self.process_name) + if implicit: + _warn_implicit_workflow_parameter(self, parameter_name, "input") + return InputParameter(parameter_name, parameter_type, parent_obj=self) + + input_parameter = self._inputs.ensure(name, create_input) + _resolve_parameter_type( + input_parameter, + parameter_type, + context=f"{self.process_name}.inputs.{name}", + ) + return input_parameter - def add_input(self, name: str) -> ProcessInput: - return self._ensure_input(name) + def _input_reference(self, name: str, *, implicit: bool = False) -> WorkflowInputReference: + return WorkflowInputReference(self, name, implicit=implicit) + + def add_input(self, name: str, parameter_type: Any = None) -> InputParameter: + """Declare a workflow input explicitly. + + Args: + name (str): The workflow input name. + parameter_type (Any): Optional CWL type expression for the input. 
- def add_output(self, name: str) -> ProcessOutput: - if name not in self._output_map: - logger.warning("Adding a new output %s to workflow %s", name, self.process_name) - port = ProcessOutput(name, "Any", parent_obj=self) - self._outputs.append(port) - self._output_map[name] = port - self._output_names.append(name) - return self._output_map[name] + Returns: + InputParameter: The created or existing workflow input parameter. + """ + return self._ensure_input(name, parameter_type=parameter_type, implicit=False) + + def add_output( + self, + name: str, + source: Any = None, + *, + parameter_type: Any = None, + implicit: bool = False, + ) -> OutputParameter: + """Declare a workflow output explicitly. + + Args: + name (str): The workflow output name. + source (Any): Optional step output or workflow input reference to expose. + parameter_type (Any): Optional CWL type expression for the output. + + Returns: + OutputParameter: The created or existing workflow output parameter. + """ + def create_output(parameter_name: str) -> OutputParameter: + logger.warning("Adding a new output %s to workflow %s", parameter_name, self.process_name) + if implicit: + _warn_implicit_workflow_parameter(self, parameter_name, "output") + return OutputParameter(parameter_name, parameter_type, parent_obj=self) + + output_parameter = self._outputs.ensure(name, create_output) + _resolve_parameter_type( + output_parameter, + parameter_type, + context=f"{self.process_name}.outputs.{name}", + ) + if source is not None: + self.bind_output(name, source) + return output_parameter def bind_input(self, name: str, value: Any) -> None: + """Bind a literal value or upstream output to a workflow input. + + Args: + name (str): The workflow input name. + value (Any): A literal value, workflow reference, or step output. + + Returns: + None: The workflow is mutated in place. 
+ """ self._ensure_input(name) _bind_process_input(self, name, value) - def get_inp_attr(self, name: str) -> ProcessInput: + def _bind_input_from_namespace(self, name: str, value: Any) -> None: + self._ensure_input(name, implicit=False) + _bind_process_input(self, name, value) + + def bind_output(self, name: str, value: Any) -> None: + """Bind a named workflow output to a step output or workflow input. + + Args: + name (str): The workflow output name. + value (Any): A step output or workflow input reference to expose. + + Returns: + None: The workflow is mutated in place. + """ + _bind_workflow_output(self, name, value) + + def _bind_output_from_namespace(self, name: str, value: Any) -> None: + self.add_output(name, implicit=False) + _bind_workflow_output(self, name, value) + + def get_inp_attr(self, name: str) -> InputParameter: + """Return a named workflow input, creating it if needed. + + Args: + name (str): The workflow input name. + + Returns: + InputParameter: The created or existing workflow input parameter. + """ return self._ensure_input(name) def append(self, step_: Any) -> None: + """Append a step or nested workflow to this workflow. + + Args: + step_ (Any): The `Step` or `Workflow` to append. + + Raises: + TypeError: If `step_` is neither a `Step` nor a `Workflow`. + + Returns: + None: The workflow is mutated in place. 
+ """ match step_: case Step() | Workflow(): self.steps.append(step_) @@ -435,170 +671,109 @@ def append(self, step_: Any) -> None: raise TypeError("step must be either a Step or a Workflow") def _validate(self) -> None: + for output_parameter in self._outputs: + if not output_parameter.has_source(): + raise InvalidStepError(f"{self.process_name} has unbound output {output_parameter.name!r}") for step in self.steps: try: - match step: - case Step(): - step._validate() - case Workflow(): - step._validate() + step._validate() except Exception as exc: raise InvalidStepError(f"{step.process_name} is missing required inputs") from exc @property def yaml(self) -> dict[str, Any]: - workflow_inputs = {input_port.name: {"type": "Any"} for input_port in self._inputs} - steps_yaml = [] + """Return the in-memory WIC YAML representation of this workflow. - for step in self.steps: - match step: - case Step(): - steps_yaml.append(step._yml) - case Workflow(): - bound_inputs = { - input_port.name: input_port.to_yaml_value() - for input_port in step._inputs - if input_port.is_bound() - } - parentargs = {"in": bound_inputs} if bound_inputs else {} - steps_yaml.append( - { - "id": f"{step.process_name}.wic", - "subtree": step.yaml, - "parentargs": parentargs, - } - ) - - return {"inputs": workflow_inputs, "steps": steps_yaml} if workflow_inputs else {"steps": steps_yaml} + Returns: + dict[str, Any]: A WIC-compatible YAML tree represented as a Python dict. + """ + return _workflow_document(self, inline_subtrees=True) def write_ast_to_disk(self, directory: Path) -> None: - workflow_inputs = {input_port.name: {"type": "Any"} for input_port in self._inputs} - steps_yaml = [] + """Write this workflow tree to disk as `.wic` files. 
- for step in self.steps: - match step: - case Step(): - steps_yaml.append(step._yml) - case Workflow(): - bound_inputs = { - input_port.name: input_port.to_yaml_value() - for input_port in step._inputs - if input_port.is_bound() - } - parentargs = {"in": bound_inputs} if bound_inputs else {} - step.write_ast_to_disk(directory) - steps_yaml.append({"id": f"{step.process_name}.wic", **parentargs}) - - yaml_contents = {"inputs": workflow_inputs, "steps": steps_yaml} if workflow_inputs else {"steps": steps_yaml} - directory.mkdir(exist_ok=True, parents=True) - output_path = directory / f"{self.process_name}.wic" - with output_path.open(mode="w", encoding="utf-8") as file_handle: - file_handle.write(yaml.dump(yaml_contents, sort_keys=False, line_break="\n", indent=2)) + Args: + directory (Path): Directory where the workflow AST should be written. + + Returns: + None: Files are written to disk as a side effect. + """ + _write_workflow_ast_to_disk(self, directory) def flatten_steps(self) -> list[Step]: - steps: list[Step] = [] - for step in self.steps: - match step: - case Step(): - steps.append(step) - case Workflow(): - steps.extend(step.flatten_steps()) - return steps + """Return every concrete step in this workflow tree. + + Returns: + list[Step]: All `Step` instances reachable from this workflow. 
+ """ + return [step for child in self.steps for step in child.flatten_steps()] def flatten_subworkflows(self) -> list[Workflow]: - subworkflows = [self] - for step in self.steps: - match step: - case Workflow(): - subworkflows.extend(step.flatten_subworkflows()) - return subworkflows - - def compile(self, write_to_disk: bool = False) -> CompilerInfo: - self._validate() - - graph = get_graph_reps(self.process_name) - yaml_tree = YamlTree(StepId(self.process_name, "global"), self.yaml) - - steps_config = extract_tools_paths_NONPORTABLE(self.flatten_steps()) - merged_tools = dict(steps_config) - merged_tools.update(global_config) - - compiler_options, graph_settings, yaml_tag_paths = get_dicts_for_compilation() - compiler_info = compiler.compile_workflow( - yaml_tree, - compiler_options, - graph_settings, - yaml_tag_paths, - [], - [graph], - {}, - {}, - {}, - {}, - merged_tools, - True, - relative_run_path=True, - testing=False, - ) + """Return this workflow and all nested subworkflows. + + Returns: + list[Workflow]: This workflow followed by nested subworkflows. + """ + return [self, *[workflow for child in self.steps for workflow in child.flatten_subworkflows()]] + + def compile(self, write_to_disk: bool = False, *, tool_registry: Tools | None = None) -> CompilerInfo: + """Compile this workflow into CWL. - if write_to_disk: - rose_tree: RoseTree = compiler_info.rose - input_output.write_to_disk(rose_tree, Path("autogenerated/"), True) + Args: + write_to_disk (bool): Whether to also write generated CWL to `autogenerated/`. + tool_registry (Tools | None): Optional tool registry override. - return compiler_info + Returns: + CompilerInfo: The compiler result tree for this workflow. 
+ """ + return _compile_workflow(self, write_to_disk=write_to_disk, tool_registry=tool_registry) - def get_cwl_workflow(self) -> Json: - compiler_info = self.compile(write_to_disk=False) - rose_tree = compiler_info.rose + def get_cwl_workflow(self, *, tool_registry: Tools | None = None) -> Json: + """Return the compiled CWL workflow JSON and generated input object. - rose_tree = pc.cwl_inline_runtag(rose_tree) - sub_node_data = rose_tree.data - cwl_ast = sub_node_data.compiled_cwl - yaml_inputs = sub_node_data.workflow_inputs_file - return {"name": self.process_name, "yaml_inputs": yaml_inputs, **cwl_ast} + Args: + tool_registry (Tools | None): Optional tool registry override. + + Returns: + Json: A JSON-serializable representation of the compiled CWL workflow. + """ + return _compiled_cwl_json(self, tool_registry=tool_registry) def run( self, - run_args_dict: Optional[Dict[str, str]] = None, - user_env_vars: Optional[Dict[str, str]] = None, + run_args_dict: dict[str, str] | None = None, + user_env_vars: dict[str, str] | None = None, basepath: str = "autogenerated", + tool_registry: Tools | None = None, ) -> None: - logger.info("Running %s", self.process_name) - plugins.logging_filters() - - effective_run_args = dict(default_values.default_run_args_dict) - if run_args_dict: - effective_run_args.update(run_args_dict) - - effective_env_vars = dict(user_env_vars or {}) - - compiler_info = self.compile(write_to_disk=False) - rose_tree: RoseTree = compiler_info.rose - rose_tree = pc.cwl_inline_runtag(rose_tree) - pc.find_and_create_output_dirs(rose_tree) - pc.verify_container_engine_config(effective_run_args["container_engine"], False) - input_output.write_to_disk( - rose_tree, - Path(basepath), - True, - effective_run_args.get("inputs_file", ""), - ) - pc.cwl_docker_extract( - effective_run_args["container_engine"], - effective_run_args["pull_dir"], - self.process_name, - ) - if effective_run_args.get("docker_remove_entrypoints"): - rose_tree = 
pc.remove_entrypoints(effective_run_args["container_engine"], rose_tree) - user_args = convert_args_dict_to_args_list(effective_run_args) - - os.environ.update(rl.sanitize_env_vars(effective_env_vars)) - - _, unknown_args = get_known_and_unknown_args(self.process_name, user_args) - rl.run_local( - effective_run_args, - False, - workflow_name=self.process_name, + """Compile and execute this workflow locally. + + Args: + run_args_dict (dict[str, str] | None): Runtime CLI options for local execution. + user_env_vars (dict[str, str] | None): Environment variables to expose to the run. + basepath (str): Directory used for generated files and execution artifacts. + tool_registry (Tools | None): Optional tool registry override. + + Returns: + None: The workflow is executed as a side effect. + """ + _run_workflow( + self, + run_args_dict=run_args_dict, + user_env_vars=user_env_vars, basepath=basepath, - passthrough_args=unknown_args, + tool_registry=tool_registry, ) + + def _as_workflow_step(self, *, inline_subtrees: bool, directory: Path | None = None) -> dict[str, Any]: + # Nested workflows are serialized in one of two ways: + # 1. inline during in-memory compilation (`subtree`) + # 2. 
as sibling `.wic` files when writing an AST to disk + bound_inputs = {port.name: port.to_yaml_value() for port in self._inputs if port.is_bound()} + parentargs = {"in": bound_inputs} if bound_inputs else {} + if inline_subtrees: + return {"id": f"{self.process_name}.wic", "subtree": self.yaml, "parentargs": parentargs} + if directory is None: + raise ValueError("directory is required when serializing subworkflows to disk") + self.write_ast_to_disk(directory) + return {"id": f"{self.process_name}.wic", **parentargs} diff --git a/src/sophios/apis/python/api_config.py b/src/sophios/apis/python/api_config.py deleted file mode 100644 index de6df2bc..00000000 --- a/src/sophios/apis/python/api_config.py +++ /dev/null @@ -1,8 +0,0 @@ -from typing import Dict -from pathlib import Path - - -class default_values: - default_run_args_dict: Dict[str, str] = {'cwl_runner': 'cwltool', - 'container_engine': 'docker', - 'pull_dir': str(Path().cwd())} diff --git a/src/sophios/apis/python/cwl_builder.py b/src/sophios/apis/python/cwl_builder.py index 1e7e0342..d0cb8e8c 100644 --- a/src/sophios/apis/python/cwl_builder.py +++ b/src/sophios/apis/python/cwl_builder.py @@ -1,1197 +1,87 @@ -"""Cleanroom CWL v1.2 CommandLineTool builder. +"""Public CWL v1.2 CommandLineTool authoring API. -This module is intentionally separate from the workflow DSL. It is a plain -Python authoring layer for CWL CommandLineTool documents with three goals: - -1. cover the common 90% of real CLT authoring cleanly, -2. validate generated documents through the cwltool/schema-salad stack, and -3. leave raw escape hatches for the remaining awkward corners of the spec. 
- -Recommended style ------------------ -Prefer the structured helpers: +The required core is intentionally small: ```python -tool = ( - CommandLineToolBuilder("custom-tool") - .inputs(message=Input.string()) - .outputs(out=Output.file(glob="out.txt")) - .time_limit(60) -) +inputs = Inputs(input=Input(cwl.directory, position=1)) +outputs = Outputs(output=Output(cwl.directory, from_input=inputs.input)) +tool = CommandLineTool("example", inputs, outputs) ``` -The lower-level `.input(...)`, `.output(...)`, `.requirement(...)`, and -`.hint(...)` methods are still available as escape hatches. - -Deliberate gaps ---------------- -- SALAD authoring features such as `$import`, `$include`, `$mixin`, and `$graph` - are not first-class builder concepts. They are document-assembly features, not - CLT-structure features. Use `extra()` or post-process the rendered dict if you - need them. -- The builder normalizes `requirements` and `hints` to map form keyed by class. - That covers typical CLT usage, but it does not preserve array ordering. -- Expressions are treated as opaque CWL strings. Schema validation is delegated - to cwltool/schema-salad; expression linting is intentionally out of scope. -- Implementation-specific extension objects are supported through `extra()` and - raw dict payloads, but they do not get typed wrappers by default. +Everything else is optional and chainable. """ -# pylint: disable=missing-function-docstring,redefined-builtin,too-few-public-methods,too-many-arguments -# pylint: disable=too-many-instance-attributes,too-many-lines,too-many-locals,too-many-public-methods +# pylint: disable=missing-function-docstring +# The fluent builder intentionally exposes many small self-descriptive methods. 
+ +from __future__ import annotations -import tempfile from dataclasses import dataclass, field from pathlib import Path -from typing import Any, ClassVar, overload +from typing import TYPE_CHECKING, Any, Mapping import yaml +from sophios.wic_types import Tools + +from ._cwl_builder_namespaces import Field, Input, Inputs, Output, Outputs, cwl +from ._cwl_builder_specs import ( + CommandArgument, + CommandLineBinding, + CommandOutputBinding, + Dirent, + DockerRequirement, + EnvironmentDef, + EnvVarRequirement, + FieldSpec, + InitialWorkDirRequirement, + InlineJavascriptRequirement, + InplaceUpdateRequirement, + InputSpec, + LoadListingRequirement, + NetworkAccess, + OutputSpec, + ResourceRequirement, + SchemaDefRequirement, + SecondaryFile, + ShellCommandRequirement, + SoftwarePackage, + SoftwareRequirement, + ToolTimeLimit, + WorkReuse, + secondary_file, +) +from ._cwl_builder_support import ( + _validate_path, + _SUPPORT, + _contains_expression, + _merge_if_set, + _normalize_requirement, + _render, + _render_doc, + _sanitize_raw_mapping, + _warn_raw_escape_hatch, + CWLBuilderValidationError, + ValidationResult, + validate_cwl_document, +) -from sophios import utils_cwl - - -_UNSET = object() - - -def _render(value: Any) -> Any: - match value: - case Path(): - return str(value) - case list() as values: - return [_render(item) for item in values] - case tuple() as values: - return [_render(item) for item in values] - case dict() as values: - return {key: _render(item) for key, item in values.items()} - case _ if hasattr(value, "to_dict") and callable(value.to_dict): - return _render(value.to_dict()) - case _: - return value - - -def _merge_if_set(target: dict[str, Any], key: str, value: Any) -> None: - if value is not None: - target[key] = _render(value) - - -def _merge_if_present(target: dict[str, Any], key: str, value: Any) -> None: - if value is not _UNSET: - target[key] = _render(value) - - -def _canonicalize_type(type_: Any) -> Any: - return 
utils_cwl.canonicalize_type(_render(type_)) - - -def _render_doc(value: str | list[str] | None) -> str | list[str] | None: - match value: - case None: - return None - case str() as text: - return text - case list() as texts: - return [str(text) for text in texts] - - -def _render_mapping(value: dict[str, Any]) -> dict[str, Any]: - return {key: _render(item) for key, item in value.items()} - - -def _render_secondary_files(value: Any) -> Any: - if value is None: - return None - return _render(value) - - -@overload -def _optional_binding(binding: "CommandLineBinding") -> "CommandLineBinding | None": - ... - - -@overload -def _optional_binding(binding: "CommandOutputBinding") -> "CommandOutputBinding | None": - ... - - -def _optional_binding( - binding: "CommandLineBinding | CommandOutputBinding", -) -> "CommandLineBinding | CommandOutputBinding | None": - if binding.to_dict(): - return binding - return None - - -def _import_cwltool_load_tool() -> Any: - try: - from cwltool import load_tool # pylint: disable=import-outside-toplevel - except ModuleNotFoundError as exc: - raise ModuleNotFoundError( - "cwltool/schema_salad is required to validate generated CommandLineTools" - ) from exc - return load_tool - - -@dataclass(slots=True) -class SecondaryFile: - """A CWL secondary file pattern.""" - - pattern: Any - required: bool | str | None = None - extra: dict[str, Any] = field(default_factory=dict) - - def to_dict(self) -> str | dict[str, Any]: - if self.required is None and not self.extra and isinstance(self.pattern, str): - return self.pattern - data: dict[str, Any] = {"pattern": _render(self.pattern)} - _merge_if_set(data, "required", self.required) - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class CommandLineBinding: - """CWL CommandLineBinding fields shared by inputs and arguments.""" - - position: int | float | None = None - prefix: str | None = None - separate: bool | None = None - item_separator: str | None = None - value_from: Any 
= None - shell_quote: bool | None = None - load_contents: bool | None = None - extra: dict[str, Any] = field(default_factory=dict) - - def to_dict(self) -> dict[str, Any]: - data: dict[str, Any] = {} - _merge_if_set(data, "position", self.position) - _merge_if_set(data, "prefix", self.prefix) - _merge_if_set(data, "separate", self.separate) - _merge_if_set(data, "itemSeparator", self.item_separator) - _merge_if_set(data, "valueFrom", self.value_from) - _merge_if_set(data, "shellQuote", self.shell_quote) - _merge_if_set(data, "loadContents", self.load_contents) - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class CommandOutputBinding: - """CWL CommandOutputBinding fields.""" - - glob: Any = None - load_contents: bool | None = None - output_eval: str | None = None - extra: dict[str, Any] = field(default_factory=dict) - - def to_dict(self) -> dict[str, Any]: - data: dict[str, Any] = {} - _merge_if_set(data, "glob", self.glob) - _merge_if_set(data, "loadContents", self.load_contents) - _merge_if_set(data, "outputEval", self.output_eval) - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class Dirent: - """InitialWorkDirRequirement listing entry.""" - - entry: Any - entryname: str | None = None - writable: bool | None = None - extra: dict[str, Any] = field(default_factory=dict) - - def to_dict(self) -> dict[str, Any]: - data = {"entry": _render(self.entry)} - _merge_if_set(data, "entryname", self.entryname) - _merge_if_set(data, "writable", self.writable) - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class EnvironmentDef: - """EnvVarRequirement entry.""" - - env_name: str - env_value: str - - def to_dict(self) -> dict[str, str]: - return {"envName": self.env_name, "envValue": self.env_value} - - -@dataclass(slots=True) -class SoftwarePackage: - """SoftwareRequirement package entry.""" - - package: str - version: list[str] | None = None - specs: list[str] | None = None - extra: 
dict[str, Any] = field(default_factory=dict) - - def to_dict(self) -> dict[str, Any]: - data = {"package": self.package} - _merge_if_set(data, "version", self.version) - _merge_if_set(data, "specs", self.specs) - data.update(_render(self.extra)) - return data - - -class _RequirementSpec: - class_name: ClassVar[str] - - def to_fields(self) -> dict[str, Any]: - raise NotImplementedError - - -@dataclass(slots=True) -class DockerRequirement(_RequirementSpec): - class_name: ClassVar[str] = "DockerRequirement" - - docker_pull: str | None = None - docker_load: str | None = None - docker_file: str | dict[str, Any] | None = None - docker_import: str | None = None - docker_image_id: str | None = None - docker_output_directory: str | None = None - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - data: dict[str, Any] = {} - _merge_if_set(data, "dockerPull", self.docker_pull) - _merge_if_set(data, "dockerLoad", self.docker_load) - _merge_if_set(data, "dockerFile", self.docker_file) - _merge_if_set(data, "dockerImport", self.docker_import) - _merge_if_set(data, "dockerImageId", self.docker_image_id) - _merge_if_set(data, "dockerOutputDirectory", self.docker_output_directory) - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class ResourceRequirement(_RequirementSpec): - class_name: ClassVar[str] = "ResourceRequirement" - - cores_min: int | float | str | None = None - cores_max: int | float | str | None = None - ram_min: int | float | str | None = None - ram_max: int | float | str | None = None - tmpdir_min: int | float | str | None = None - tmpdir_max: int | float | str | None = None - outdir_min: int | float | str | None = None - outdir_max: int | float | str | None = None - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - numeric_pairs = [ - ("cores", self.cores_min, self.cores_max), - ("ram", self.ram_min, self.ram_max), - ("tmpdir", self.tmpdir_min, 
self.tmpdir_max), - ("outdir", self.outdir_min, self.outdir_max), - ] - for resource, minimum, maximum in numeric_pairs: - if isinstance(minimum, (int, float)) and minimum < 0: - raise ValueError(f"{resource} minimum cannot be negative") - if isinstance(maximum, (int, float)) and maximum < 0: - raise ValueError(f"{resource} maximum cannot be negative") - if isinstance(minimum, (int, float)) and isinstance(maximum, (int, float)) and maximum < minimum: - raise ValueError(f"{resource} maximum cannot be smaller than minimum") - - data: dict[str, Any] = {} - _merge_if_set(data, "coresMin", self.cores_min) - _merge_if_set(data, "coresMax", self.cores_max) - _merge_if_set(data, "ramMin", self.ram_min) - _merge_if_set(data, "ramMax", self.ram_max) - _merge_if_set(data, "tmpdirMin", self.tmpdir_min) - _merge_if_set(data, "tmpdirMax", self.tmpdir_max) - _merge_if_set(data, "outdirMin", self.outdir_min) - _merge_if_set(data, "outdirMax", self.outdir_max) - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class InitialWorkDirRequirement(_RequirementSpec): - class_name: ClassVar[str] = "InitialWorkDirRequirement" - - listing: Any - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - data = {"listing": _render(self.listing)} - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class EnvVarRequirement(_RequirementSpec): - class_name: ClassVar[str] = "EnvVarRequirement" - - env_defs: list[EnvironmentDef | dict[str, Any]] = field(default_factory=list) - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - data = {"envDef": _render(self.env_defs)} - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class ShellCommandRequirement(_RequirementSpec): - class_name: ClassVar[str] = "ShellCommandRequirement" - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - return 
_render_mapping(self.extra) - - -@dataclass(slots=True) -class InlineJavascriptRequirement(_RequirementSpec): - class_name: ClassVar[str] = "InlineJavascriptRequirement" - - expression_lib: list[str] | None = None - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - data: dict[str, Any] = {} - _merge_if_set(data, "expressionLib", self.expression_lib) - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class SchemaDefRequirement(_RequirementSpec): - class_name: ClassVar[str] = "SchemaDefRequirement" - - types: list[Any] - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - data = {"types": _render(self.types)} - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class LoadListingRequirement(_RequirementSpec): - class_name: ClassVar[str] = "LoadListingRequirement" - - load_listing: str - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - data = {"loadListing": self.load_listing} - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class SoftwareRequirement(_RequirementSpec): - class_name: ClassVar[str] = "SoftwareRequirement" - - packages: list[SoftwarePackage | dict[str, Any]] | dict[str, Any] - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - data = {"packages": _render(self.packages)} - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class WorkReuse(_RequirementSpec): - class_name: ClassVar[str] = "WorkReuse" - - enable_reuse: bool | str - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - data = {"enableReuse": _render(self.enable_reuse)} - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class NetworkAccess(_RequirementSpec): - class_name: ClassVar[str] = "NetworkAccess" - - network_access: bool | str - 
extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - data = {"networkAccess": _render(self.network_access)} - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class InplaceUpdateRequirement(_RequirementSpec): - class_name: ClassVar[str] = "InplaceUpdateRequirement" - - inplace_update: bool - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - data = {"inplaceUpdate": self.inplace_update} - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class ToolTimeLimit(_RequirementSpec): - class_name: ClassVar[str] = "ToolTimeLimit" - - timelimit: int | str - extra: dict[str, Any] = field(default_factory=dict) - - def to_fields(self) -> dict[str, Any]: - if isinstance(self.timelimit, int) and self.timelimit < 0: - raise ValueError("timelimit cannot be negative") - data = {"timelimit": _render(self.timelimit)} - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class CommandInput: - """A single CLT input parameter.""" - - name: str - type_: Any - binding: CommandLineBinding | None = None - label: str | None = None - doc: str | list[str] | None = None - format: Any = None - secondary_files: Any = None - streamable: bool | None = None - load_contents: bool | None = None - load_listing: str | None = None - default: Any = field(default=_UNSET, repr=False) - extra: dict[str, Any] = field(default_factory=dict) - - def to_dict(self) -> dict[str, Any]: - data: dict[str, Any] = {"type": _canonicalize_type(self.type_)} - _merge_if_set(data, "label", self.label) - _merge_if_set(data, "doc", _render_doc(self.doc)) - _merge_if_set(data, "format", self.format) - _merge_if_set(data, "streamable", self.streamable) - _merge_if_set(data, "loadContents", self.load_contents) - _merge_if_set(data, "loadListing", self.load_listing) - secondary_files = _render_secondary_files(self.secondary_files) - if secondary_files is not None: - 
data["secondaryFiles"] = secondary_files - _merge_if_present(data, "default", self.default) - if self.binding is not None: - binding = self.binding.to_dict() - if binding: - data["inputBinding"] = binding - data.update(_render(self.extra)) - return data +if TYPE_CHECKING: + from .api import Step @dataclass(slots=True) -class CommandOutput: - """A single CLT output parameter.""" +# pylint: disable=too-many-instance-attributes,too-many-public-methods +class CommandLineTool: + """Declarative CWL CommandLineTool authoring object.""" name: str - type_: Any - binding: CommandOutputBinding | None = None - label: str | None = None - doc: str | list[str] | None = None - format: Any = None - secondary_files: Any = None - streamable: bool | None = None - load_listing: str | None = None - extra: dict[str, Any] = field(default_factory=dict) - - def to_dict(self) -> dict[str, Any]: - data: dict[str, Any] = {"type": _canonicalize_type(self.type_)} - _merge_if_set(data, "label", self.label) - _merge_if_set(data, "doc", _render_doc(self.doc)) - _merge_if_set(data, "format", self.format) - _merge_if_set(data, "streamable", self.streamable) - _merge_if_set(data, "loadListing", self.load_listing) - secondary_files = _render_secondary_files(self.secondary_files) - if secondary_files is not None: - data["secondaryFiles"] = secondary_files - if self.binding is not None: - binding = self.binding.to_dict() - if binding: - data["outputBinding"] = binding - data.update(_render(self.extra)) - return data - - -@dataclass(slots=True) -class FieldSpec: - """Structured record field specification.""" - - type_: Any - label: str | None = None - doc: str | list[str] | None = None - input_binding: CommandLineBinding | None = None - output_binding: CommandOutputBinding | None = None - secondary_files: Any = None - streamable: bool | None = None - format: Any = None - extra: dict[str, Any] = field(default_factory=dict) - - def named(self, name: str) -> dict[str, Any]: - return record_field( - name, - 
self.type_, - label=self.label, - doc=self.doc, - input_binding=self.input_binding, - output_binding=self.output_binding, - secondary_files=self.secondary_files, - streamable=self.streamable, - format=self.format, - extra=self.extra, - ) - - -@dataclass(slots=True) -class InputSpec: - """Structured input specification without repeating the input name.""" - - type_: Any - binding: CommandLineBinding | None = None - label: str | None = None - doc: str | list[str] | None = None - format: Any = None - secondary_files: Any = None - streamable: bool | None = None - load_contents: bool | None = None - load_listing: str | None = None - default: Any = field(default=_UNSET, repr=False) - extra: dict[str, Any] = field(default_factory=dict) - - def named(self, name: str) -> CommandInput: - return CommandInput( - name=name, - type_=self.type_, - binding=self.binding, - label=self.label, - doc=self.doc, - format=self.format, - secondary_files=self.secondary_files, - streamable=self.streamable, - load_contents=self.load_contents, - load_listing=self.load_listing, - default=self.default, - extra=self.extra, - ) - - -@dataclass(slots=True) -class OutputSpec: - """Structured output specification without repeating the output name.""" - - type_: Any - binding: CommandOutputBinding | None = None - label: str | None = None - doc: str | list[str] | None = None - format: Any = None - secondary_files: Any = None - streamable: bool | None = None - load_listing: str | None = None - extra: dict[str, Any] = field(default_factory=dict) - - def named(self, name: str) -> CommandOutput: - return CommandOutput( - name=name, - type_=self.type_, - binding=self.binding, - label=self.label, - doc=self.doc, - format=self.format, - secondary_files=self.secondary_files, - streamable=self.streamable, - load_listing=self.load_listing, - extra=self.extra, - ) - - -@dataclass(slots=True) -class CommandArgument: - """A CWL command line argument entry.""" - - value: Any = None - binding: CommandLineBinding = 
field(default_factory=CommandLineBinding) - extra: dict[str, Any] = field(default_factory=dict) - - def to_yaml(self) -> str | dict[str, Any]: - binding = self.binding.to_dict() - if not binding and isinstance(self.value, str) and not self.extra: - return self.value - if self.value is not None and "valueFrom" not in binding: - binding["valueFrom"] = _render(self.value) - binding.update(_render(self.extra)) - return binding - - -def secondary_file( - pattern: Any, - *, - required: bool | str | None = None, - extra: dict[str, Any] | None = None, -) -> SecondaryFile: - return SecondaryFile(pattern=pattern, required=required, extra=dict(extra or {})) - - -def array_type( - items: Any, - *, - name: str | None = None, - label: str | None = None, - doc: str | list[str] | None = None, - input_binding: CommandLineBinding | None = None, - extra: dict[str, Any] | None = None, -) -> dict[str, Any]: - data: dict[str, Any] = {"type": "array", "items": _canonicalize_type(items)} - _merge_if_set(data, "name", name) - _merge_if_set(data, "label", label) - _merge_if_set(data, "doc", _render_doc(doc)) - if input_binding is not None: - binding = input_binding.to_dict() - if binding: - data["inputBinding"] = binding - data.update(_render(extra or {})) - return data - - -def enum_type( - symbols: list[str], - *, - name: str | None = None, - label: str | None = None, - doc: str | list[str] | None = None, - input_binding: CommandLineBinding | None = None, - extra: dict[str, Any] | None = None, -) -> dict[str, Any]: - data: dict[str, Any] = {"type": "enum", "symbols": list(symbols)} - _merge_if_set(data, "name", name) - _merge_if_set(data, "label", label) - _merge_if_set(data, "doc", _render_doc(doc)) - if input_binding is not None: - binding = input_binding.to_dict() - if binding: - data["inputBinding"] = binding - data.update(_render(extra or {})) - return data - - -def record_field( - name: str, - type_: Any, - *, - label: str | None = None, - doc: str | list[str] | None = None, - 
input_binding: CommandLineBinding | None = None, - output_binding: CommandOutputBinding | None = None, - secondary_files: Any = None, - streamable: bool | None = None, - format: Any = None, - extra: dict[str, Any] | None = None, -) -> dict[str, Any]: - data: dict[str, Any] = {"name": name, "type": _canonicalize_type(type_)} - _merge_if_set(data, "label", label) - _merge_if_set(data, "doc", _render_doc(doc)) - _merge_if_set(data, "format", format) - _merge_if_set(data, "streamable", streamable) - secondary_files_value = _render_secondary_files(secondary_files) - if secondary_files_value is not None: - data["secondaryFiles"] = secondary_files_value - if input_binding is not None: - binding = input_binding.to_dict() - if binding: - data["inputBinding"] = binding - if output_binding is not None: - binding = output_binding.to_dict() - if binding: - data["outputBinding"] = binding - data.update(_render(extra or {})) - return data - - -def record_type( - fields: list[Any] | dict[str, Any], - *, - name: str | None = None, - label: str | None = None, - doc: str | list[str] | None = None, - input_binding: CommandLineBinding | None = None, - extra: dict[str, Any] | None = None, -) -> dict[str, Any]: - data: dict[str, Any] = {"type": "record", "fields": _render(fields)} - _merge_if_set(data, "name", name) - _merge_if_set(data, "label", label) - _merge_if_set(data, "doc", _render_doc(doc)) - if input_binding is not None: - binding = input_binding.to_dict() - if binding: - data["inputBinding"] = binding - data.update(_render(extra or {})) - return data - - -class Type: - """Structured CWL type helpers.""" - - @staticmethod - def null() -> str: - return "null" - - @staticmethod - def boolean() -> str: - return "boolean" - - @staticmethod - def int() -> str: - return "int" - - @staticmethod - def long() -> str: - return "long" - - @staticmethod - def float() -> str: - return "float" - - @staticmethod - def double() -> str: - return "double" - - @staticmethod - def string() -> str: 
- return "string" - - @staticmethod - def file() -> str: - return "File" - - @staticmethod - def directory() -> str: - return "Directory" - - @staticmethod - def stdout() -> str: - return "stdout" - - @staticmethod - def stderr() -> str: - return "stderr" - - @staticmethod - def any() -> str: - return "Any" - - @staticmethod - def array( - items: Any, - *, - name: str | None = None, - label: str | None = None, - doc: str | list[str] | None = None, - input_binding: CommandLineBinding | None = None, - extra: dict[str, Any] | None = None, - ) -> dict[str, Any]: - return array_type( - items, - name=name, - label=label, - doc=doc, - input_binding=input_binding, - extra=extra, - ) - - @staticmethod - def enum( - *symbols: str, - name: str | None = None, - label: str | None = None, - doc: str | list[str] | None = None, - input_binding: CommandLineBinding | None = None, - extra: dict[str, Any] | None = None, - ) -> dict[str, Any]: - return enum_type( - list(symbols), - name=name, - label=label, - doc=doc, - input_binding=input_binding, - extra=extra, - ) - - @staticmethod - def record( - fields: dict[str, FieldSpec] | list[Any], - *, - name: str | None = None, - label: str | None = None, - doc: str | list[str] | None = None, - input_binding: CommandLineBinding | None = None, - extra: dict[str, Any] | None = None, - ) -> dict[str, Any]: - match fields: - case dict() as mapping: - rendered_fields = [ - spec.named(field_name) if isinstance(spec, FieldSpec) else record_field(field_name, spec) - for field_name, spec in mapping.items() - ] - case list() as items: - rendered_fields = _render(items) - case _: - raise TypeError("record fields must be a mapping or a list") - return record_type( - rendered_fields, - name=name, - label=label, - doc=doc, - input_binding=input_binding, - extra=extra, - ) - - @staticmethod - def optional(inner: Any) -> list[Any]: - return ["null", _canonicalize_type(inner)] - - -class Field: - """Structured record field helpers.""" - - @staticmethod - 
def of( - type_: Any, - *, - label: str | None = None, - doc: str | list[str] | None = None, - input_binding: CommandLineBinding | None = None, - output_binding: CommandOutputBinding | None = None, - secondary_files: Any = None, - streamable: bool | None = None, - format: Any = None, - extra: dict[str, Any] | None = None, - ) -> FieldSpec: - return FieldSpec( - type_=type_, - label=label, - doc=doc, - input_binding=input_binding, - output_binding=output_binding, - secondary_files=secondary_files, - streamable=streamable, - format=format, - extra=dict(extra or {}), - ) - - @staticmethod - def string(**kwargs: Any) -> FieldSpec: - return Field.of(Type.string(), **kwargs) - - @staticmethod - def int(**kwargs: Any) -> FieldSpec: - return Field.of(Type.int(), **kwargs) - - @staticmethod - def long(**kwargs: Any) -> FieldSpec: - return Field.of(Type.long(), **kwargs) - - @staticmethod - def float(**kwargs: Any) -> FieldSpec: - return Field.of(Type.float(), **kwargs) - - @staticmethod - def double(**kwargs: Any) -> FieldSpec: - return Field.of(Type.double(), **kwargs) - - @staticmethod - def boolean(**kwargs: Any) -> FieldSpec: - return Field.of(Type.boolean(), **kwargs) - - @staticmethod - def file(**kwargs: Any) -> FieldSpec: - return Field.of(Type.file(), **kwargs) - - @staticmethod - def directory(**kwargs: Any) -> FieldSpec: - return Field.of(Type.directory(), **kwargs) - - @staticmethod - def array(items: Any, **kwargs: Any) -> FieldSpec: - return Field.of(Type.array(items), **kwargs) - - @staticmethod - def enum(*symbols: str, **kwargs: Any) -> FieldSpec: - return Field.of(Type.enum(*symbols), **kwargs) - - @staticmethod - def record(fields: dict[str, FieldSpec] | list[Any], **kwargs: Any) -> FieldSpec: - return Field.of(Type.record(fields), **kwargs) - - -# pylint: disable=too-many-public-methods -class Input: - """Structured CLT input helpers.""" - - @staticmethod - def of( - type_: Any, - *, - position: int | float | None = None, - prefix: str | None = None, - 
separate: bool | None = None, - item_separator: str | None = None, - value_from: Any = None, - shell_quote: bool | None = None, - label: str | None = None, - doc: str | list[str] | None = None, - format: Any = None, - secondary_files: Any = None, - streamable: bool | None = None, - load_contents: bool | None = None, - load_listing: str | None = None, - default: Any = _UNSET, - binding_extra: dict[str, Any] | None = None, - extra: dict[str, Any] | None = None, - ) -> InputSpec: - binding = _optional_binding(CommandLineBinding( - position=position, - prefix=prefix, - separate=separate, - item_separator=item_separator, - value_from=value_from, - shell_quote=shell_quote, - extra=dict(binding_extra or {}), - )) - return InputSpec( - type_=type_, - binding=binding, - label=label, - doc=doc, - format=format, - secondary_files=secondary_files, - streamable=streamable, - load_contents=load_contents, - load_listing=load_listing, - default=default, - extra=dict(extra or {}), - ) - - @staticmethod - def string(**kwargs: Any) -> InputSpec: - return Input.of(Type.string(), **kwargs) - - @staticmethod - def int(**kwargs: Any) -> InputSpec: - return Input.of(Type.int(), **kwargs) - - @staticmethod - def long(**kwargs: Any) -> InputSpec: - return Input.of(Type.long(), **kwargs) - - @staticmethod - def float(**kwargs: Any) -> InputSpec: - return Input.of(Type.float(), **kwargs) - - @staticmethod - def double(**kwargs: Any) -> InputSpec: - return Input.of(Type.double(), **kwargs) - - @staticmethod - def boolean(**kwargs: Any) -> InputSpec: - return Input.of(Type.boolean(), **kwargs) - - @staticmethod - def file(**kwargs: Any) -> InputSpec: - return Input.of(Type.file(), **kwargs) - - @staticmethod - def directory(**kwargs: Any) -> InputSpec: - return Input.of(Type.directory(), **kwargs) - - @staticmethod - def array(items: Any, **kwargs: Any) -> InputSpec: - return Input.of(Type.array(items), **kwargs) - - @staticmethod - def enum(*symbols: str, **kwargs: Any) -> InputSpec: - return 
Input.of(Type.enum(*symbols), **kwargs) - - @staticmethod - def record(fields: dict[str, FieldSpec] | list[Any], **kwargs: Any) -> InputSpec: - return Input.of(Type.record(fields), **kwargs) - - -# pylint: disable=too-many-public-methods -class Output: - """Structured CLT output helpers.""" - - @staticmethod - def of( - type_: Any, - *, - glob: Any = None, - load_contents: bool | None = None, - output_eval: str | None = None, - label: str | None = None, - doc: str | list[str] | None = None, - format: Any = None, - secondary_files: Any = None, - streamable: bool | None = None, - load_listing: str | None = None, - binding_extra: dict[str, Any] | None = None, - extra: dict[str, Any] | None = None, - ) -> OutputSpec: - binding = _optional_binding(CommandOutputBinding( - glob=glob, - load_contents=load_contents, - output_eval=output_eval, - extra=dict(binding_extra or {}), - )) - return OutputSpec( - type_=type_, - binding=binding, - label=label, - doc=doc, - format=format, - secondary_files=secondary_files, - streamable=streamable, - load_listing=load_listing, - extra=dict(extra or {}), - ) - - @staticmethod - def string(**kwargs: Any) -> OutputSpec: - return Output.of(Type.string(), **kwargs) - - @staticmethod - def int(**kwargs: Any) -> OutputSpec: - return Output.of(Type.int(), **kwargs) - - @staticmethod - def long(**kwargs: Any) -> OutputSpec: - return Output.of(Type.long(), **kwargs) - - @staticmethod - def float(**kwargs: Any) -> OutputSpec: - return Output.of(Type.float(), **kwargs) - - @staticmethod - def double(**kwargs: Any) -> OutputSpec: - return Output.of(Type.double(), **kwargs) - - @staticmethod - def boolean(**kwargs: Any) -> OutputSpec: - return Output.of(Type.boolean(), **kwargs) - - @staticmethod - def file(**kwargs: Any) -> OutputSpec: - return Output.of(Type.file(), **kwargs) - - @staticmethod - def directory(**kwargs: Any) -> OutputSpec: - return Output.of(Type.directory(), **kwargs) - - @staticmethod - def stdout(**kwargs: Any) -> OutputSpec: - 
return Output.of(Type.stdout(), **kwargs) - - @staticmethod - def stderr(**kwargs: Any) -> OutputSpec: - return Output.of(Type.stderr(), **kwargs) - - @staticmethod - def array(items: Any, **kwargs: Any) -> OutputSpec: - return Output.of(Type.array(items), **kwargs) - - @staticmethod - def enum(*symbols: str, **kwargs: Any) -> OutputSpec: - return Output.of(Type.enum(*symbols), **kwargs) - - @staticmethod - def record(fields: dict[str, FieldSpec] | list[Any], **kwargs: Any) -> OutputSpec: - return Output.of(Type.record(fields), **kwargs) - - -@dataclass(frozen=True, slots=True) -class ValidationResult: - """Result of validating a generated CLT with cwltool/schema-salad.""" - - path: Path - uri: str - process: Any - - -class CWLBuilderValidationError(ValueError): - """Raised when a generated CLT fails schema validation.""" - - -def validate_cwl_document( - document: dict[str, Any], - *, - filename: str = "tool.cwl", - skip_schemas: bool = False, -) -> ValidationResult: - with tempfile.TemporaryDirectory(prefix="sophios-cwl-builder-") as tmpdir: - temp_path = Path(tmpdir) / filename - temp_path.write_text( - yaml.safe_dump(_render(document), sort_keys=False, line_break="\n"), - encoding="utf-8", - ) - return _validate_path(temp_path, skip_schemas=skip_schemas) - - -def _validate_path(path: Path, *, skip_schemas: bool = False) -> ValidationResult: - del skip_schemas # Reserved for parity with the rest of the codebase. 
- load_tool = _import_cwltool_load_tool() - try: - loading_context, workflowobj, uri = load_tool.fetch_document(str(path)) - loading_context, uri = load_tool.resolve_and_validate_document( - loading_context, - workflowobj, - uri, - preprocess_only=False, - ) - process = load_tool.make_tool(uri, loading_context) - except Exception as exc: - raise CWLBuilderValidationError(f"Generated CommandLineTool failed validation: {path}") from exc - return ValidationResult(path=path, uri=uri, process=process) - - -def _normalize_requirement( - requirement: str | _RequirementSpec | dict[str, Any], - value: dict[str, Any] | None = None, -) -> tuple[str, dict[str, Any]]: - match requirement: - case str() as class_name: - payload = {} if value is None else dict(_render(value)) - return class_name, payload - case _RequirementSpec() as spec: - return spec.class_name, spec.to_fields() - case dict() as payload: - if "class" not in payload: - raise ValueError("raw requirement dicts must include a 'class' key") - payload_copy = dict(_render(payload)) - class_name = str(payload_copy.pop("class")) - return class_name, payload_copy - case _: - raise TypeError("requirement must be a class name, requirement spec, or raw dict") - - -@dataclass(slots=True) -class CommandLineToolBuilder: - """Fluent builder for CWL v1.2 `CommandLineTool` documents.""" - - tool_id: str + inputs: Inputs + outputs: Outputs cwl_version: str = "v1.2" label_text: str | None = None doc_text: str | list[str] | None = None _base_command: list[str] = field(default_factory=list) _arguments: list[str | dict[str, Any]] = field(default_factory=list) - _inputs: dict[str, CommandInput] = field(default_factory=dict) - _outputs: dict[str, CommandOutput] = field(default_factory=dict) _requirements: dict[str, dict[str, Any]] = field(default_factory=dict) _hints: dict[str, dict[str, Any]] = field(default_factory=dict) _stdin: str | None = None @@ -1205,245 +95,173 @@ class CommandLineToolBuilder: _permanent_fail_codes: list[int] = 
field(default_factory=list) _extra: dict[str, Any] = field(default_factory=dict) - def label(self, text: str) -> "CommandLineToolBuilder": - self.label_text = text - return self + def __post_init__(self) -> None: + if not isinstance(self.inputs, Inputs): + raise TypeError("inputs must be an Inputs(...) collection") + if not isinstance(self.outputs, Outputs): + raise TypeError("outputs must be an Outputs(...) collection") - def doc(self, text: str | list[str]) -> "CommandLineToolBuilder": - self.doc_text = text + def _store_requirement( + self, + bucket: dict[str, dict[str, Any]], + requirement: Any, + value: dict[str, Any] | None, + ) -> None: + class_name, payload = _normalize_requirement(requirement, value) + if ":" in class_name: + prefix, _ = class_name.split(":", 1) + if prefix in _SUPPORT.known_namespaces and prefix not in self._namespaces: + self._namespaces[prefix] = _SUPPORT.known_namespaces[prefix] + bucket[class_name] = payload + + def _apply_spec(self, spec: Any, *, as_hint: bool) -> CommandLineTool: + self._store_requirement(self._hints if as_hint else self._requirements, spec, None) return self - def namespace(self, prefix: str, iri: str) -> "CommandLineToolBuilder": - self._namespaces[prefix] = iri + def _append_requirement_entry( + self, + class_name: str, + list_key: str, + item: Any, + *, + as_hint: bool = False, + ) -> CommandLineTool: + target = self._hints if as_hint else self._requirements + payload = target.setdefault(class_name, {list_key: []}) + listing = payload.setdefault(list_key, []) + if not isinstance(listing, list): + raise TypeError(f"{class_name} {list_key} must be a list") + listing.append(_render(item)) return self - def schema(self, iri: str) -> "CommandLineToolBuilder": - self._schemas.append(iri) + def describe( + self, + label: str | None = None, + doc: str | list[str] | None = None, + ) -> CommandLineTool: + if label is not None: + self.label_text = label + if doc is not None: + self.doc_text = doc return self - def 
intent(self, *identifiers: str) -> "CommandLineToolBuilder": - self._intent.extend(identifiers) + def label(self, text: str) -> CommandLineTool: + self.label_text = text return self - def base_command(self, *parts: str) -> "CommandLineToolBuilder": - self._base_command = list(parts) + def doc(self, text: str | list[str]) -> CommandLineTool: + self.doc_text = text return self - def stdin(self, value: str) -> "CommandLineToolBuilder": - self._stdin = value + def namespace(self, prefix: str, iri: str | None = None) -> CommandLineTool: + namespace_iri = iri if iri is not None else _SUPPORT.known_namespaces.get(prefix) + if namespace_iri is None: + raise ValueError( + f"Unknown namespace prefix {prefix!r}; please provide an explicit iri" + ) + self._namespaces[prefix] = namespace_iri return self - def stdout(self, value: str) -> "CommandLineToolBuilder": - self._stdout = value + def schema(self, iri: str) -> CommandLineTool: + schema_iri = _SUPPORT.known_schemas.get(iri, iri) + if schema_iri not in self._schemas: + self._schemas.append(schema_iri) return self - def stderr(self, value: str) -> "CommandLineToolBuilder": - self._stderr = value - return self + def edam(self) -> CommandLineTool: + return self.namespace("edam").schema("edam") - def add_input(self, input_spec: CommandInput) -> "CommandLineToolBuilder": - self._inputs[input_spec.name] = input_spec + def intent(self, *identifiers: str) -> CommandLineTool: + self._intent.extend(identifiers) return self - def inputs(self, **input_specs: InputSpec) -> "CommandLineToolBuilder": - for name, spec in input_specs.items(): - if not isinstance(spec, InputSpec): - raise TypeError(f"input {name!r} must be an InputSpec") - self.add_input(spec.named(name)) + def base_command(self, *parts: str) -> CommandLineTool: + self._base_command = list(parts) return self - def input( - self, - name: str, - *, - type_: Any, - position: int | float | None = None, - prefix: str | None = None, - separate: bool | None = None, - 
item_separator: str | None = None, - value_from: Any = None, - shell_quote: bool | None = None, - load_contents: bool | None = None, - load_listing: str | None = None, - label: str | None = None, - doc: str | list[str] | None = None, - format: Any = None, - secondary_files: Any = None, - streamable: bool | None = None, - default: Any = _UNSET, - binding_extra: dict[str, Any] | None = None, - extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - return self.inputs( - **{ - name: Input.of( - type_, - position=position, - prefix=prefix, - separate=separate, - item_separator=item_separator, - value_from=value_from, - shell_quote=shell_quote, - load_contents=load_contents, - load_listing=load_listing, - label=label, - doc=doc, - format=format, - secondary_files=secondary_files, - streamable=streamable, - default=default, - binding_extra=binding_extra, - extra=extra, - ) - } - ) + def stdin(self, value: str) -> CommandLineTool: + self._stdin = value + return self - def add_output(self, output_spec: CommandOutput) -> "CommandLineToolBuilder": - self._outputs[output_spec.name] = output_spec + def stdout(self, value: str) -> CommandLineTool: + self._stdout = value return self - def outputs(self, **output_specs: OutputSpec) -> "CommandLineToolBuilder": - for name, spec in output_specs.items(): - if not isinstance(spec, OutputSpec): - raise TypeError(f"output {name!r} must be an OutputSpec") - self.add_output(spec.named(name)) + def stderr(self, value: str) -> CommandLineTool: + self._stderr = value return self - def output( + def add_argument( self, - name: str, - *, - type_: Any, - glob: Any = None, - load_contents: bool | None = None, - output_eval: str | None = None, - label: str | None = None, - doc: str | list[str] | None = None, - format: Any = None, - secondary_files: Any = None, - streamable: bool | None = None, - load_listing: str | None = None, - binding_extra: dict[str, Any] | None = None, - extra: dict[str, Any] | None = None, - ) -> 
"CommandLineToolBuilder": - return self.outputs( - **{ - name: Output.of( - type_, - glob=glob, - load_contents=load_contents, - output_eval=output_eval, - label=label, - doc=doc, - format=format, - secondary_files=secondary_files, - streamable=streamable, - load_listing=load_listing, - binding_extra=binding_extra, - extra=extra, - ) - } - ) - - def add_argument(self, argument: str | CommandArgument | dict[str, Any]) -> "CommandLineToolBuilder": + argument: str | CommandArgument | dict[str, Any], + ) -> CommandLineTool: match argument: case str() as literal: self._arguments.append(literal) case CommandArgument() as structured: self._arguments.append(structured.to_yaml()) case dict() as raw: - self._arguments.append(_render(raw)) + _warn_raw_escape_hatch("add_argument()") + self._arguments.append( + _sanitize_raw_mapping(raw, context="raw argument mapping") + ) case _: raise TypeError("argument must be a string, CommandArgument, or raw dict") return self - def argument( - self, - value: Any = None, - *, - position: int | float | None = None, - prefix: str | None = None, - separate: bool | None = None, - item_separator: str | None = None, - value_from: Any = None, - shell_quote: bool | None = None, - load_contents: bool | None = None, - binding_extra: dict[str, Any] | None = None, - extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - binding = CommandLineBinding( - position=position, - prefix=prefix, - separate=separate, - item_separator=item_separator, - value_from=value_from, - shell_quote=shell_quote, - load_contents=load_contents, - extra=dict(binding_extra or {}), + def argument(self, value: Any = None, **kwargs: Any) -> CommandLineTool: + binding_extra = dict(kwargs.pop("binding_extra", {}) or {}) + argument_extra = dict(kwargs.pop("extra", {}) or {}) + binding = CommandLineBinding(extra=binding_extra, **kwargs) + return self.add_argument( + CommandArgument(value=value, binding=binding, extra=argument_extra) ) - return 
self.add_argument(CommandArgument(value=value, binding=binding, extra=dict(extra or {}))) - def requirement( - self, - requirement: str | _RequirementSpec | dict[str, Any], - value: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - class_name, payload = _normalize_requirement(requirement, value) - self._requirements[class_name] = payload + def requirement(self, requirement: Any, value: dict[str, Any] | None = None) -> CommandLineTool: + self._store_requirement(self._requirements, requirement, value) return self - def hint( - self, - requirement: str | _RequirementSpec | dict[str, Any], - value: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - class_name, payload = _normalize_requirement(requirement, value) - self._hints[class_name] = payload + def hint(self, requirement: Any, value: dict[str, Any] | None = None) -> CommandLineTool: + self._store_requirement(self._hints, requirement, value) return self def docker( self, + image: str | None = None, *, - docker_pull: str | None = None, - docker_load: str | None = None, - docker_file: str | dict[str, Any] | None = None, - docker_import: str | None = None, - docker_image_id: str | None = None, - docker_output_directory: str | None = None, as_hint: bool = False, - extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - spec = DockerRequirement( - docker_pull=docker_pull, - docker_load=docker_load, - docker_file=docker_file, - docker_import=docker_import, - docker_image_id=docker_image_id, - docker_output_directory=docker_output_directory, - extra=dict(extra or {}), + **kwargs: Any, + ) -> CommandLineTool: + return self._apply_spec( + DockerRequirement( + docker_pull=kwargs.pop("docker_pull", None) or image, + extra=dict(kwargs.pop("extra", {}) or {}), + **kwargs, + ), + as_hint=as_hint, ) - return self.hint(spec) if as_hint else self.requirement(spec) def inline_javascript( self, *expression_lib: str, as_hint: bool = False, extra: dict[str, Any] | None = None, - ) -> 
"CommandLineToolBuilder": - spec = InlineJavascriptRequirement( - expression_lib=list(expression_lib) or None, - extra=dict(extra or {}), + ) -> CommandLineTool: + return self._apply_spec( + InlineJavascriptRequirement(list(expression_lib) or None, dict(extra or {})), + as_hint=as_hint, ) - return self.hint(spec) if as_hint else self.requirement(spec) def schema_definitions( self, *types: Any, as_hint: bool = False, extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - spec = SchemaDefRequirement(types=list(types), extra=dict(extra or {})) - return self.hint(spec) if as_hint else self.requirement(spec) + ) -> CommandLineTool: + return self._apply_spec( + SchemaDefRequirement(list(types), dict(extra or {})), + as_hint=as_hint, + ) def load_listing( self, @@ -1451,23 +269,25 @@ def load_listing( *, as_hint: bool = False, extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - spec = LoadListingRequirement(load_listing=value, extra=dict(extra or {})) - return self.hint(spec) if as_hint else self.requirement(spec) + ) -> CommandLineTool: + return self._apply_spec(LoadListingRequirement(value, dict(extra or {})), as_hint=as_hint) - def shell_command(self, *, as_hint: bool = False, extra: dict[str, Any] | None = None) -> "CommandLineToolBuilder": - spec = ShellCommandRequirement(extra=dict(extra or {})) - return self.hint(spec) if as_hint else self.requirement(spec) + def shell_command( + self, + *, + as_hint: bool = False, + extra: dict[str, Any] | None = None, + ) -> CommandLineTool: + return self._apply_spec(ShellCommandRequirement(dict(extra or {})), as_hint=as_hint) def software( self, - packages: list[SoftwarePackage | dict[str, Any]] | dict[str, Any], + packages: list[SoftwarePackage | dict[str, Any]], *, as_hint: bool = False, extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - spec = SoftwareRequirement(packages=packages, extra=dict(extra or {})) - return self.hint(spec) if as_hint else self.requirement(spec) 
+ ) -> CommandLineTool: + return self._apply_spec(SoftwareRequirement(packages, dict(extra or {})), as_hint=as_hint) def initial_workdir( self, @@ -1475,43 +295,85 @@ def initial_workdir( *, as_hint: bool = False, extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - spec = InitialWorkDirRequirement(listing=listing, extra=dict(extra or {})) - return self.hint(spec) if as_hint else self.requirement(spec) + ) -> CommandLineTool: + return self._apply_spec(InitialWorkDirRequirement(listing, dict(extra or {})), as_hint=as_hint) - def env_var(self, name: str, value: str, *, as_hint: bool = False) -> "CommandLineToolBuilder": - target = self._hints if as_hint else self._requirements - payload = target.setdefault("EnvVarRequirement", {"envDef": []}) - env_defs = payload.setdefault("envDef", []) - env_defs.append(EnvironmentDef(name, value).to_dict()) - return self + # This helper deliberately bundles the common staging knobs into one call. + # The slightly wider signature is easier to use than forcing nested objects. 
+ def stage( # pylint: disable=too-many-arguments + self, + reference: Any, + *, + writable: bool = False, + entryname: str | None = None, + as_hint: bool = False, + extra: dict[str, Any] | None = None, + ) -> CommandLineTool: + return self._append_requirement_entry( + "InitialWorkDirRequirement", + "listing", + Dirent.from_input( + reference, + writable=writable, + entryname=entryname, + extra=extra, + ).to_dict(), + as_hint=as_hint, + ) + + def env_var(self, name: str, value: str, *, as_hint: bool = False) -> CommandLineTool: + return self._append_requirement_entry( + "EnvVarRequirement", + "envDef", + EnvironmentDef(name, value).to_dict(), + as_hint=as_hint, + ) def resources( self, *, - cores_min: int | float | str | None = None, - cores_max: int | float | str | None = None, - ram_min: int | float | str | None = None, - ram_max: int | float | str | None = None, - tmpdir_min: int | float | str | None = None, - tmpdir_max: int | float | str | None = None, - outdir_min: int | float | str | None = None, - outdir_max: int | float | str | None = None, as_hint: bool = False, extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - spec = ResourceRequirement( - cores_min=cores_min, - cores_max=cores_max, - ram_min=ram_min, - ram_max=ram_max, - tmpdir_min=tmpdir_min, - tmpdir_max=tmpdir_max, - outdir_min=outdir_min, - outdir_max=outdir_max, - extra=dict(extra or {}), + **kwargs: Any, + ) -> CommandLineTool: + cores_min = kwargs.pop("cores_min", None) + cores = kwargs.pop("cores", None) + ram_min = kwargs.pop("ram_min", None) + ram = kwargs.pop("ram", None) + tmpdir_min = kwargs.pop("tmpdir_min", None) + tmpdir = kwargs.pop("tmpdir", None) + outdir_min = kwargs.pop("outdir_min", None) + outdir = kwargs.pop("outdir", None) + aliases = { + "cores_min": cores if cores_min is None else cores_min, + "ram_min": ram if ram_min is None else ram_min, + "tmpdir_min": tmpdir if tmpdir_min is None else tmpdir_min, + "outdir_min": outdir if outdir_min is None else 
outdir_min, + } + aliases.update(kwargs) + return self._apply_spec( + ResourceRequirement(extra=dict(extra or {}), **aliases), + as_hint=as_hint, ) - return self.hint(spec) if as_hint else self.requirement(spec) + + # GPU hints naturally need a few related knobs, so this stays slightly wide. + def gpu( # pylint: disable=too-many-arguments + self, + *, + cuda_version_min: str | None = None, + compute_capability: str | None = None, + device_count_min: int | str | None = None, + as_hint: bool = True, + extra: dict[str, Any] | None = None, + ) -> CommandLineTool: + payload: dict[str, Any] = {} + _merge_if_set(payload, "cudaVersionMin", cuda_version_min) + _merge_if_set(payload, "cudaComputeCapability", compute_capability) + _merge_if_set(payload, "cudaDeviceCountMin", device_count_min) + payload.update(_render(extra or {})) + if as_hint: + return self.hint("cwltool:CUDARequirement", payload) + return self.requirement("cwltool:CUDARequirement", payload) def work_reuse( self, @@ -1519,9 +381,8 @@ def work_reuse( *, as_hint: bool = False, extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - spec = WorkReuse(enable_reuse=enable, extra=dict(extra or {})) - return self.hint(spec) if as_hint else self.requirement(spec) + ) -> CommandLineTool: + return self._apply_spec(WorkReuse(enable, dict(extra or {})), as_hint=as_hint) def network_access( self, @@ -1529,9 +390,8 @@ def network_access( *, as_hint: bool = False, extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - spec = NetworkAccess(network_access=enable, extra=dict(extra or {})) - return self.hint(spec) if as_hint else self.requirement(spec) + ) -> CommandLineTool: + return self._apply_spec(NetworkAccess(enable, dict(extra or {})), as_hint=as_hint) def inplace_update( self, @@ -1539,9 +399,11 @@ def inplace_update( *, as_hint: bool = True, extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - spec = InplaceUpdateRequirement(inplace_update=enable, extra=dict(extra or 
{})) - return self.hint(spec) if as_hint else self.requirement(spec) + ) -> CommandLineTool: + return self._apply_spec( + InplaceUpdateRequirement(enable, dict(extra or {})), + as_hint=as_hint, + ) def time_limit( self, @@ -1549,33 +411,66 @@ def time_limit( *, as_hint: bool = False, extra: dict[str, Any] | None = None, - ) -> "CommandLineToolBuilder": - spec = ToolTimeLimit(timelimit=seconds, extra=dict(extra or {})) - return self.hint(spec) if as_hint else self.requirement(spec) + ) -> CommandLineTool: + return self._apply_spec(ToolTimeLimit(seconds, dict(extra or {})), as_hint=as_hint) - def success_codes(self, *codes: int) -> "CommandLineToolBuilder": + def success_codes(self, *codes: int) -> CommandLineTool: self._success_codes = list(codes) return self - def temporary_fail_codes(self, *codes: int) -> "CommandLineToolBuilder": + def temporary_fail_codes(self, *codes: int) -> CommandLineTool: self._temporary_fail_codes = list(codes) return self - def permanent_fail_codes(self, *codes: int) -> "CommandLineToolBuilder": + def permanent_fail_codes(self, *codes: int) -> CommandLineTool: self._permanent_fail_codes = list(codes) return self - def extra(self, **values: Any) -> "CommandLineToolBuilder": - self._extra.update(_render(values)) + def extra(self, **values: Any) -> CommandLineTool: + _warn_raw_escape_hatch("extra()") + self._extra.update( + _sanitize_raw_mapping( + values, + context="extra()", + reserved_keys=set(_SUPPORT.reserved_document_keys), + ) + ) return self + def to_step( + self, + *, + step_name: str | None = None, + run_path: str | Path | None = None, + config: dict[str, Any] | None = None, + tool_registry: Tools | None = None, + ) -> Step: + """Convert this built CLT into an in-memory workflow `Step`. + + Args: + step_name (str | None): Optional workflow step name override. + run_path (str | Path | None): Optional virtual `.cwl` path for compiler bookkeeping. + config (dict[str, Any] | None): Optional input values to pre-bind. 
+ tool_registry (Tools | None): Optional tool registry retained on the step. + + Returns: + Step: A workflow step backed by this CLT without writing to disk. + """ + return step_from_command_line_tool( + self, + step_name=step_name, + run_path=run_path, + config=config, + tool_registry=tool_registry, + ) + def build(self) -> dict[str, Any]: document: dict[str, Any] = { - "cwlVersion": self.cwl_version, "class": "CommandLineTool", - "id": self.tool_id, - "inputs": {name: input_spec.to_dict() for name, input_spec in self._inputs.items()}, - "outputs": {name: output_spec.to_dict() for name, output_spec in self._outputs.items()}, + "cwlVersion": self.cwl_version, + "id": self.name, + "inputs": self.inputs.to_dict(), + "outputs": self.outputs.to_dict(), } if self._namespaces: document["$namespaces"] = dict(self._namespaces) @@ -1586,8 +481,11 @@ def build(self) -> dict[str, Any]: if self._intent: document["intent"] = list(self._intent) if self._base_command: - document["baseCommand"] = self._base_command[0] if len( - self._base_command) == 1 else list(self._base_command) + document["baseCommand"] = ( + self._base_command[0] + if len(self._base_command) == 1 + else list(self._base_command) + ) if self._arguments: document["arguments"] = list(self._arguments) if self._requirements: @@ -1604,14 +502,20 @@ def build(self) -> dict[str, Any]: if self._permanent_fail_codes: document["permanentFailCodes"] = list(self._permanent_fail_codes) document.update(_render(self._extra)) + if _contains_expression(document): + requirements = document.setdefault("requirements", {}) + if ( + "InlineJavascriptRequirement" not in requirements + and "InlineJavascriptRequirement" not in document.get("hints", {}) + ): + requirements["InlineJavascriptRequirement"] = {} return document def to_dict(self) -> dict[str, Any]: return self.build() def to_yaml(self) -> str: - rendered_yaml = yaml.safe_dump(self.build(), sort_keys=False, line_break="\n") - return str(rendered_yaml) + return 
str(yaml.safe_dump(self.build(), sort_keys=False, line_break="\n")) def save(self, path: str | Path, *, validate: bool = False, skip_schemas: bool = False) -> Path: output_path = Path(path) @@ -1622,16 +526,71 @@ def save(self, path: str | Path, *, validate: bool = False, skip_schemas: bool = return output_path def validate(self, *, skip_schemas: bool = False) -> ValidationResult: - return validate_cwl_document(self.build(), filename=f"{self.tool_id}.cwl", skip_schemas=skip_schemas) + return validate_cwl_document(self.build(), filename=f"{self.name}.cwl", skip_schemas=skip_schemas) + + +def array_type(items: Any) -> dict[str, Any]: + """Return a CWL array type expression.""" + return cwl.array(items) + + +def enum_type(*symbols: str, name: str | None = None) -> dict[str, Any]: + """Return a CWL enum type expression.""" + return cwl.enum(*symbols, name=name) + + +def record_type( + fields: Mapping[str, FieldSpec] | list[FieldSpec | dict[str, Any]], + *, + name: str | None = None, +) -> dict[str, Any]: + """Return a CWL record type expression.""" + return cwl.record(fields, name=name) + + +def record_field(type_: Any, **kwargs: Any) -> FieldSpec: + """Return a named CWL record field helper.""" + return Field(type_, **kwargs) + + +def step_from_command_line_tool( + tool: CommandLineTool, + *, + step_name: str | None = None, + run_path: str | Path | None = None, + config: dict[str, Any] | None = None, + tool_registry: Tools | None = None, +) -> Step: + """Convert a built CLT into a workflow `Step` entirely in memory. + + Args: + tool (CommandLineTool): Built CLT to wrap as a workflow step. + step_name (str | None): Optional workflow step name override. + run_path (str | Path | None): Optional virtual `.cwl` path for compiler bookkeeping. + config (dict[str, Any] | None): Optional input values to pre-bind. + tool_registry (Tools | None): Optional tool registry retained on the step. + + Returns: + Step: A workflow step backed by the CLT without touching disk. 
+ """ + from ._cwl_builder_step_bridge import ( # pylint: disable=import-outside-toplevel + step_from_command_line_tool as _step_from_command_line_tool, + ) + + return _step_from_command_line_tool( + tool, + step_name=step_name, + run_path=run_path, + config=config, + tool_registry=tool_registry, + ) __all__ = [ "CWLBuilderValidationError", "CommandArgument", - "CommandInput", "CommandLineBinding", - "CommandLineToolBuilder", - "CommandOutput", + "CommandLineTool", "CommandOutputBinding", "Dirent", "DockerRequirement", @@ -1641,13 +600,15 @@ def validate(self, *, skip_schemas: bool = False) -> ValidationResult: "FieldSpec", "InitialWorkDirRequirement", "InlineJavascriptRequirement", + "InplaceUpdateRequirement", "Input", "InputSpec", - "InplaceUpdateRequirement", + "Inputs", "LoadListingRequirement", "NetworkAccess", "Output", "OutputSpec", + "Outputs", "ResourceRequirement", "SchemaDefRequirement", "SecondaryFile", @@ -1655,13 +616,14 @@ def validate(self, *, skip_schemas: bool = False) -> ValidationResult: "SoftwarePackage", "SoftwareRequirement", "ToolTimeLimit", - "Type", "ValidationResult", "WorkReuse", "array_type", + "cwl", "enum_type", "record_field", "record_type", "secondary_file", + "step_from_command_line_tool", "validate_cwl_document", ] diff --git a/src/sophios/apis/rest/api.py b/src/sophios/apis/rest/api.py index 1143d606..d7339d0c 100644 --- a/src/sophios/apis/rest/api.py +++ b/src/sophios/apis/rest/api.py @@ -55,7 +55,7 @@ def remove_dot_dollar(tree: Cwl) -> Cwl: @app.get("/", status_code=status.HTTP_200_OK) # @authenticate -async def root(request: Request) -> Json: +async def root() -> Json: """The api has 1 route: compile Returns: @@ -95,10 +95,7 @@ async def compile_wf(request: Request) -> Json: # The default list tools_cwl: Tools = {} global_config = input_output.get_config(Path(args.config_file), Path(args.homedir)/'wic'/'global_config.json') - tools_cwl = plugins.get_tools_cwl(global_config, - args.validate_plugins, - not 
args.no_skip_dollar_schemas, - args.quiet) + tools_cwl = plugins.get_tools_cwl(global_config, args.validate_plugins, args.quiet) # Add to the default list if the tool is 'inline' in run tag # run tag will have the actual CommandLineTool for can_step in workflow_can["steps"]: diff --git a/src/sophios/apis/utils/ict/ict_spec/model.py b/src/sophios/apis/utils/ict/ict_spec/model.py index 14183b6b..d8329e50 100644 --- a/src/sophios/apis/utils/ict/ict_spec/model.py +++ b/src/sophios/apis/utils/ict/ict_spec/model.py @@ -35,19 +35,6 @@ def validate_ui(self) -> "ICT": ui_keys = [ui.key.root.split(".") for ui in self.ui] for ui_ in ui_keys: io_dict[ui_[0]].append(ui_[1]) - input_names = [io.name for io in self.inputs] - output_names = [io.name for io in self.outputs] - inp_bool = [x in input_names for x in io_dict["inputs"]] - out_bool = [x in output_names for x in io_dict["outputs"]] - - # if not all(inp_bool): - # raise ValueError( - # f"The ui keys must match the inputs and outputs keys. Unmatched: inputs.{set(io_dict['inputs'])-set(input_names)}" - # ) - # if not all(out_bool): - # raise ValueError( - # f"The ui keys must match the inputs and outputs keys. Unmatched: outputs.{set(io_dict['outputs'])-set(output_names)}" - # ) return self diff --git a/src/sophios/ast.py b/src/sophios/ast.py index 516b1a9d..154089ed 100644 --- a/src/sophios/ast.py +++ b/src/sophios/ast.py @@ -90,7 +90,6 @@ def read_ast_from_disk(homedir: str, paths_ns_i = yml_paths.get(plugin_ns, {}) if paths_ns_i == {}: - wicdir = Path(homedir) / 'wic' raise Exception( f"Error! namespace {plugin_ns} not found in yaml paths. 
Check 'search_paths_wic' in your config file") if stem not in paths_ns_i: @@ -108,8 +107,8 @@ def read_ast_from_disk(homedir: str, sub_yaml_tree_raw: Yaml = yaml.load(y.read(), Loader=wic_loader()) y_t = YamlTree(StepId(step_key, plugin_ns), sub_yaml_tree_raw) - (step_id_, sub_yml_tree) = read_ast_from_disk(homedir, y_t, yml_paths, tools, validator, - ignore_validation_errors) + (_, sub_yml_tree) = read_ast_from_disk(homedir, y_t, yml_paths, tools, validator, + ignore_validation_errors) steps_i_copy = {**steps[i]} step_i_id = steps[i]['id'] @@ -171,7 +170,7 @@ def merge_yml_trees(yaml_tree_tuple: YamlTree, sub_wic = wic_steps.get(f'({i+1}, {step_key})', {}) y_t = YamlTree(StepId(step_key, step_id.plugin_ns), sub_yml_tree_initial) - (step_key_, sub_yml_tree) = merge_yml_trees(y_t, sub_wic, tools) + (_, sub_yml_tree) = merge_yml_trees(y_t, sub_wic, tools) # Now mutably overwrite the self args with the merged args steps[i]['subtree'] = sub_yml_tree @@ -234,7 +233,7 @@ def tree_to_forest(yaml_tree_tuple: YamlTree, tools: Tools) -> YamlForest: sub_yaml_tree = steps[i]['subtree'] sub_yml_forest = tree_to_forest(YamlTree(StepId(step_key, plugin_ns_i), sub_yaml_tree), tools) - (sub_yml_tree_step_id, sub_yml_tree_) = sub_yml_forest.yaml_tree + sub_yml_tree_step_id, _sub_yml_tree = sub_yml_forest.yaml_tree yaml_forest_list.append((sub_yml_tree_step_id, sub_yml_forest)) return YamlForest(YamlTree(step_id, yaml_tree), yaml_forest_list) @@ -274,7 +273,7 @@ def python_script_generate_cwl(yaml_tree_tuple: YamlTree, if step_key in subkeys: sub_yml_tree_initial = steps[i]['subtree'] y_t = YamlTree(StepId(step_key, step_id.plugin_ns), sub_yml_tree_initial) - (step_key_, sub_yml_tree) = python_script_generate_cwl(y_t, root_yml_dir_abs, tools) + (_, sub_yml_tree) = python_script_generate_cwl(y_t, root_yml_dir_abs, tools) steps[i]['subtree'] = sub_yml_tree if step_key not in subkeys: diff --git a/src/sophios/compiler.py b/src/sophios/compiler.py index d5cfec57..bbaacc16 100644 --- 
a/src/sophios/compiler.py +++ b/src/sophios/compiler.py @@ -148,7 +148,7 @@ def compile_workflow_once(yaml_tree_ast: YamlTree, yaml_path = step_id.stem # We also want another copy of the original AST so that if we need to modify it, # we can return the modified AST to the call site and re-compile. - (yaml_path_orig, yaml_tree_orig) = copy.deepcopy(yaml_tree_ast) + (_, yaml_tree_orig) = copy.deepcopy(yaml_tree_ast) if not testing: print(' starting compilation of', (' ' * len(namespaces)) + yaml_path) @@ -162,7 +162,7 @@ def compile_workflow_once(yaml_tree_ast: YamlTree, yaml_stem = Path(yaml_path).stem - (back_name_, yaml_tree) = utils.extract_implementation(yaml_tree, wic['wic'], Path(yaml_path)) + (_, yaml_tree) = utils.extract_implementation(yaml_tree, wic['wic'], Path(yaml_path)) steps: List[Yaml] = yaml_tree['steps'] steps_keys = utils.get_steps_keys(steps) diff --git a/src/sophios/cwl_subinterpreter.py b/src/sophios/cwl_subinterpreter.py index 68a76983..81627928 100644 --- a/src/sophios/cwl_subinterpreter.py +++ b/src/sophios/cwl_subinterpreter.py @@ -145,7 +145,7 @@ def rerun_cwltool(homedir: str, _directory_realtime: Path, cachedir_path: Path, # proc = sub.run(self.cmd, cwd=working_dir) # cmd = self.cmd print('Running', cmd) - proc = sub.run(cmd, cwd=working_dir, check=False) # See below! + sub.run(cmd, cwd=working_dir, check=False) # See below! print('inner cwltool completed') # Don't check the return code because the file may not exist yet, or # because speculative execution may fail for any number of reasons. 
diff --git a/src/sophios/inlineing.py b/src/sophios/inlineing.py index a8928cad..e614d83e 100644 --- a/src/sophios/inlineing.py +++ b/src/sophios/inlineing.py @@ -147,7 +147,7 @@ def inline_subworkflow(yaml_tree_tuple: YamlTree, namespaces: Namespaces) -> Tup else: # Strip off one initial namespace y_t = YamlTree(StepId(step_key, step_id.plugin_ns), sub_yml_tree) - (step_key_, sub_yml_tree), len_substeps = inline_subworkflow(y_t, namespaces[1:]) + (_, sub_yml_tree), len_substeps = inline_subworkflow(y_t, namespaces[1:]) # TODO: re-index wic: steps: ? We probably should, although # inlineing after merging should not affect CWL args. # Re-indexing could be tricky w.r.t. overloading. @@ -187,7 +187,7 @@ def apply_args(sub_yml_tree: Yaml, sub_parentargs: Yaml) -> Yaml: if not argkey in inputs_workflow: raise Exception(f'Error while inlineing {argkey}\n{yaml.dump(sub_yml_tree)}\n{yaml.dump(sub_parentargs)}') - for i, step_key in enumerate(steps_keys): + for i, _step_key in enumerate(steps_keys): # NOTE: We should probably be using # sub_keys = utils.get_subkeys(steps_keys, tools) # to check whether or not `step_key in sub_keys` and thus diff --git a/src/sophios/input_output.py b/src/sophios/input_output.py index bd548898..9066a16f 100644 --- a/src/sophios/input_output.py +++ b/src/sophios/input_output.py @@ -67,7 +67,7 @@ def write_to_disk(rose_tree: RoseTree, path: Path, relative_run_path: bool, inpu if inputs_file: with open(inputs_file, mode='r', encoding='utf-8') as f: inputs = yaml.safe_load(f.read()) - for key, val in inputs.items(): + for val in inputs.values(): if 'location' in val and not Path(val['location']).is_absolute(): # Change relative paths for class: File and class: Dir # to be w.r.t. 
autogenerated/ diff --git a/src/sophios/main.py b/src/sophios/main.py index 26b7b451..1514f0ae 100644 --- a/src/sophios/main.py +++ b/src/sophios/main.py @@ -4,6 +4,7 @@ import subprocess as sub import traceback from typing import Dict +import json import graphviz import networkx as nx @@ -35,10 +36,7 @@ def main() -> None: sys.exit(0) global_config: Json = io.get_config(Path(args.config_file), default_config_file) - tools_cwl = plugins.get_tools_cwl(global_config, - args.validate_plugins, - not args.no_skip_dollar_schemas, - args.quiet) + tools_cwl = plugins.get_tools_cwl(global_config, args.validate_plugins, args.quiet) # pass around config object instead of reading from the disk! yml_paths = plugins.get_yml_paths(global_config) @@ -65,14 +63,17 @@ def main() -> None: for yml_path_str, yml_path in yml_paths_tuples: schema = wic_schema.compile_workflow_generate_schema(args.homedir, yml_path_str, yml_path, tools_cwl, yml_paths, validator, - args.ignore_validation_errors, - args.allow_raw_cwl) + args.ignore_validation_errors) # overwrite placeholders in schema_store. See comment in get_validator() schema_store[schema['$id']] = schema # Now that we compiled all of the subworkflows once with the permissive/weak schema, # compile the root yml workflow again with the restrictive/strict schema. validator = wic_schema.get_validator(tools_cwl, yaml_stems, schema_store, write_to_disk=True) + schema_store_path = Path('autogenerated/schemas/schema_store.json') + schema_store_path.parent.mkdir(parents=True, exist_ok=True) + with open(schema_store_path, mode='w', encoding='utf-8') as f: + json.dump(schema_store, f, indent=2) if args.generate_schemas: print('Finished generating schemas. 
Exiting.') @@ -199,7 +200,7 @@ def main() -> None: basepath = 'autogenerated' io.write_to_disk(rose_tree, Path(basepath), True, args.inputs_file) # extract the container images - pc.cwl_docker_extract(args.container_engine, args.pull_dir, yaml_stem) + pc.cwl_docker_extract(args.container_engine, args.pull_dir, Path(basepath) / f'{yaml_stem}.cwl') if args.docker_remove_entrypoints: rose_tree = pc.remove_entrypoints(args.container_engine, rose_tree) pc.find_and_create_output_dirs(rose_tree) diff --git a/src/sophios/plugins.py b/src/sophios/plugins.py index d645d063..2eaacd79 100644 --- a/src/sophios/plugins.py +++ b/src/sophios/plugins.py @@ -54,40 +54,31 @@ def logging_filters(allow_pf: bool = False) -> None: logger_wicad = logging.getLogger("wicautodiscovery") -def validate_cwl(cwl_path_str: str, skip_schemas: bool) -> None: - """This is the body of cwltool.load_tool.load_tool but exposes skip_schemas for performance. - Skipping significantly improves initial validation performance, but this is not always desired. - See https://github.com/common-workflow-language/cwltool/issues/623 +def validate_cwl(cwl_path_str: str) -> None: + """Validate a CWL file using `cwltool.load_tool`. Args: cwl_path_str (str): The path to the CWL file. - skip_schemas (bool): Skips processing $schemas tags. """ # NOTE: This uses NoResolvedFilter to suppress the info messages to stdout. loading_context, workflowobj, uri = cwltool.load_tool.fetch_document(cwl_path_str) - # NOTE: There has been a breaking change in the API for skip_schemas. - # TODO: re-enable skip_schemas while satisfying mypy - # loading_context.skip_schemas = skip_schemas loading_context, uri = cwltool.load_tool.resolve_and_validate_document( - loading_context, workflowobj, uri, preprocess_only=False # , skip_schemas=skip_schemas + loading_context, workflowobj, uri, preprocess_only=False ) # NOTE: Although resolve_and_validate_document does some validation, # some additional validation is done in make_tool, i.e. 
# resolve_and_validate_document does not in fact throw an exception for # some invalid CWL files, but make_tool does! - process_ = cwltool.load_tool.make_tool(uri, loading_context) - # return process_ # ignore process_ for now + cwltool.load_tool.make_tool(uri, loading_context) -def get_tools_cwl(config: Json, validate_plugins: bool = False, - skip_schemas: bool = False, quiet: bool = False) -> Tools: +def get_tools_cwl(config: Json, validate_plugins: bool = False, quiet: bool = False) -> Tools: """Uses glob() to find all of the CWL CommandLineTool definition files within any subdirectory of cwl_dir Args: config_file (Json): The user specified (or default generated) config json object cwl_dirs_file (Path): The subdirectories in which to search for CWL CommandLineTools validate_plugins (bool, optional): Performs validation on all CWL CommandLiineTools. Defaults to False. - skip_schemas (bool, optional): Skips processing $schemas tags. Defaults to False. quiet (bool, optional): Determines whether it captures stdout or stderr. Defaults to False. Returns: @@ -119,7 +110,7 @@ def get_tools_cwl(config: Json, validate_plugins: bool = False, stem = Path(cwl_path_str).stem if validate_plugins: - validate_cwl(cwl_path_str, skip_schemas) + validate_cwl(cwl_path_str) tool = utils_cwl.desugar_into_canonical_normal_form(tool) @@ -156,7 +147,7 @@ def cwl_update_outputs_optional(cwl: Cwl, failure_code_range: List[int], f"lower {failure_code_range[0]} value can't be greater than higher {failure_code_range[1]} value" cwl_mod['successCodes'] = list(set([0] + direct_failure_codes + codes_from_range)) # Update outputs optional - for out_key, out_val_dict in cwl_mod['outputs'].items(): + for out_val_dict in cwl_mod['outputs'].values(): if isinstance(out_val_dict['type'], str) and out_val_dict['type'][-1] != '?': out_val_dict['type'] += '?' 
return cwl_mod @@ -207,7 +198,7 @@ def remove_entrypoints(client: Client, build: Any) -> None: f.write(dockerfile_content) # Build the new Docker image from the Dockerfile - new_image, build_logs = build.build( + build.build( path=tempdir, dockerfile="Dockerfile_tmp", tag=f"{tag}-noentrypoint" diff --git a/src/sophios/post_compile.py b/src/sophios/post_compile.py index be4303dc..e47176be 100644 --- a/src/sophios/post_compile.py +++ b/src/sophios/post_compile.py @@ -24,9 +24,9 @@ def find_output_dirs(data: Union[RoseTree, Dict, list]) -> list: match data: case dict() as data_dict: match data_dict: - case {"class": "Directory", "location": {"wic_inline_input": val}, **rest_data_dict}: + case {"class": "Directory", "location": {"wic_inline_input": val}, **_rest_data_dict}: results.append(val) - case {"class": "Directory", "location": dl, **rest_data_dict}: + case {"class": "Directory", "location": dl, **_rest_data_dict}: results.append(dl) case _: pass @@ -133,8 +133,15 @@ def verify_container_engine_config(container_engine: str, ignore_container_insta sys.exit(1) -def cwl_docker_extract(container_engine: str, pull_dir: str, file_name: str) -> None: - """Helper function to do the cwl_docker_extract""" +def cwl_docker_extract(container_engine: str, pull_dir: str, cwl_path: str | Path) -> None: + """Run `cwl-docker-extract` against a compiled CWL document. + + Args: + container_engine (str): Container engine used for execution. + pull_dir (str): Directory used by singularity for image pulls. + cwl_path (str | Path): Path to the compiled CWL workflow file. + """ + cwl_path_str = str(Path(cwl_path)) # cwl-docker-extract recursively `docker pull`s all images in all subworkflows. 
# This is important because cwltool only uses `docker run` when executing # workflows, and if there is a local image available, @@ -142,9 +149,9 @@ def cwl_docker_extract(container_engine: str, pull_dir: str, file_name: str) -> # cwltool has a --force-docker-pull option, but this may cause multiple pulls in parallel. if container_engine == 'singularity': cmd = ['cwl-docker-extract', '-s', '--dir', - f'{pull_dir}', f'autogenerated/{file_name}.cwl'] + f'{pull_dir}', cwl_path_str] else: - cmd = ['cwl-docker-extract', '--force-download', f'autogenerated/{file_name}.cwl'] + cmd = ['cwl-docker-extract', '--force-download', cwl_path_str] sub.run(cmd, check=True) diff --git a/src/sophios/python_cwl_adapter.py b/src/sophios/python_cwl_adapter.py index a15589bf..5b5c79c2 100644 --- a/src/sophios/python_cwl_adapter.py +++ b/src/sophios/python_cwl_adapter.py @@ -79,7 +79,6 @@ def get_main_args(module_: ModuleType) -> Dict[str, Any]: import inspect # pylint: disable=import-outside-toplevel anns = inspect.getfullargspec(module_.main).annotations - ret = {'return': anns.get('return')} # Separate out the return type if 'return' in anns: del anns['return'] # print(anns) @@ -131,11 +130,8 @@ def generate_CWL_CommandLineTool(module_inputs: Dict[str, Any], module_outputs: yaml_tree['$schemas'] = ['https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl'] yaml_tree['baseCommand'] = 'python3' - types_entry = '$(inputs.workflow_types)' - driver_entry = '$(inputs.driver_script)' - script_entry = '$(inputs.script)' requirements: Dict[str, Any] = {} - requirements = { # 'InitialWorkDirRequirement': {'listing': [script_entry]}, #[types_entry,driver_entry,script_entry] + requirements = { 'InlineJavascriptRequirement': {}} if python_script_docker_pull: requirements['DockerRequirement'] = {'dockerPull': python_script_docker_pull} @@ -160,16 +156,8 @@ def input_binding(position: int, prefix: str = '') -> Dict[str, Any]: yaml_tree['inputs'] = inputs outputs: Dict[str, Any] 
= {} - for i, (arg_key, (glob_pattern, arg_val)) in enumerate(module_outputs.items()): + for arg_key, (glob_pattern, arg_val) in module_outputs.items(): outputs[arg_key] = {**arg_val, 'outputBinding': {'glob': glob_pattern}} - # output_all is optional, but good for debugging bad glob patterns - output_all = {'type': - {'type': 'array', - 'items': ['Directory', 'File']}, - 'outputBinding': {'glob': '.'}, - 'format': 'edam:format_2330'} # 'Textual format' - # This crashes toil-cwl-runner, but not cwltool. - # outputs['output_all'] = output_all yaml_tree['outputs'] = outputs yaml_tree['stdout'] = 'stdout' @@ -217,7 +205,7 @@ def get_inputs_workflow(module_inputs: Dict[str, Any], python_script_path: str, """ inputs_workflow = {} inputs_workflow['script'] = {'class': 'File', 'format': 'edam:format_2330', 'path': python_script_path} - for i, (arg, yml_val) in enumerate(yml_args.items()): + for arg, yml_val in yml_args.items(): if module_inputs[arg]['type'] == 'string': inputs_workflow[arg] = yml_val else: diff --git a/src/sophios/run_local.py b/src/sophios/run_local.py index e6f3691b..ef05f230 100644 --- a/src/sophios/run_local.py +++ b/src/sophios/run_local.py @@ -5,12 +5,13 @@ import os import re import stat +from contextlib import contextmanager from pathlib import Path from pprint import pprint import shutil import traceback from datetime import datetime -from typing import List, Optional, Dict +from typing import Iterator, List, Optional, Dict import requests from sophios.wic_types import Json @@ -75,6 +76,29 @@ def create_safe_env(user_env: Dict[str, str]) -> dict: return {**os.environ, **sanitized_user_env} +@contextmanager +def temporary_env(user_env: Dict[str, str]) -> Iterator[dict[str, str]]: + """Temporarily apply sanitized environment variables and restore them after use. + + Args: + user_env (Dict[str, str]): User-defined environment variables. + + Yields: + dict: The sanitized environment mapping applied for the duration of the context. 
+ """ + sanitized_user_env = sanitize_env_vars(user_env) + previous_values = {key: os.environ.get(key) for key in sanitized_user_env} + os.environ.update(sanitized_user_env) + try: + yield {**os.environ} + finally: + for key, previous_value in previous_values.items(): + if previous_value is None: + os.environ.pop(key, None) + else: + os.environ[key] = previous_value + + def generate_run_script(cmdline: str) -> None: """Writes the command used to invoke the cwl-runner to run.sh Does not actually invoke ./run.sh @@ -157,7 +181,7 @@ def build_cmd(workflow_name: str, basepath: str, cwl_runner: str, def run_local(run_args_dict: Dict[str, str], use_subprocess: bool, passthrough_args: List[str], workflow_name: str, - basepath: str) -> Optional[int]: + basepath: str, user_env_vars: Optional[Dict[str, str]] = None) -> Optional[int]: """This function runs the compiled workflow locally. Args: @@ -165,20 +189,22 @@ def run_local(run_args_dict: Dict[str, str], use_subprocess: bool, use_subprocess (bool): When using cwltool, determines whether to use subprocess.run(...) or use the cwltool python api. basepath (str): The path at which the workflow to be executed + user_env_vars (Optional[Dict[str, str]]): User supplied environment variables. 
Returns: retval (Optional[int]): The return value indicating if run succeeded (0) or not """ retval = 1 # overwrite if successful - container_engine = run_args_dict['container_engine'] yaml_path = Path(basepath) / workflow_name cwl_runner = run_args_dict['cwl_runner'] cachedir = run_args_dict.get('cachedir', 'cachedir') # 'cachedir' is the default value + container_engine = run_args_dict['container_engine'] # build the runner command cmd = build_cmd(workflow_name, basepath, cwl_runner, container_engine, passthrough_args) cmdline = ' '.join(cmd) + exec_env = create_safe_env(user_env_vars or {}) if run_args_dict.get('generate_run_script', 'no') == 'yes': generate_run_script(cmdline) @@ -189,22 +215,23 @@ def run_local(run_args_dict: Dict[str, str], use_subprocess: bool, # To run in parallel (i.e. pytest ... --workers 8 ...), we need to # use separate processes. Otherwise: # "signal only works in main thread or with __pypy__.thread.enable_signals()" - proc = sub.run(cmd, check=False) + proc = sub.run(cmd, check=False, env=exec_env) retval = proc.returncode return retval # Skip copying files to outdir/ for CI else: try: - if cwl_runner == 'cwltool': - print('via cwltool.main.main python API') - retval = cwltool.main.main(cmd[1:]) - print(f'Final output json metadata blob is in output_{workflow_name}.json') - if run_args_dict.get('copy_output_files', 'no') == 'yes': - copy_output_files(workflow_name) - elif cwl_runner == 'toil-cwl-runner': - print('via toil.cwl.cwltoil.main python API') - retval = toil.cwl.cwltoil.main(cmd[1:]) - else: - raise ValueError('unsupported cwl_runner') + with temporary_env(user_env_vars or {}): + if cwl_runner == 'cwltool': + print('via cwltool.main.main python API') + retval = cwltool.main.main(cmd[1:]) + print(f'Final output json metadata blob is in output_{workflow_name}.json') + if run_args_dict.get('copy_output_files', 'no') == 'yes': + copy_output_files(workflow_name) + elif cwl_runner == 'toil-cwl-runner': + print('via 
toil.cwl.cwltoil.main python API') + retval = toil.cwl.cwltoil.main(cmd[1:]) + else: + raise ValueError('unsupported cwl_runner') except Exception as e: retval = 1 @@ -248,9 +275,6 @@ def run_compute(workflow_name: str, workflow: Json, workflow_inputs: Json, Returns: retval (Optional[int]): The return value indicating if run succeeded (0) or not """ - # update the environment with user supplied env args - os.environ.update(sanitize_env_vars(user_env_vars)) - connect_timeout = 5 # in seconds read_timeout = 30 # in seconds timeout_tuple = (connect_timeout, read_timeout) @@ -271,13 +295,14 @@ def run_compute(workflow_name: str, workflow: Json, workflow_inputs: Json, print("Ill-formed URL string detected! Please provide a valid URL") return 1 - print('Sending request to Compute') - res = requests.post(submit_url, json=compute_workflow, timeout=timeout_tuple) - print('Post response code: ' + str(res.status_code)) + with temporary_env(user_env_vars): + print('Sending request to Compute') + res = requests.post(submit_url, json=compute_workflow, timeout=timeout_tuple) + print('Post response code: ' + str(res.status_code)) - res = requests.get(submit_url + f'{jobid}/outputs/', timeout=timeout_tuple) - print('Output response code: ' + str(res.status_code)) - retval = 0 if res.status_code == 200 else 1 + res = requests.get(submit_url + f'{jobid}/outputs/', timeout=timeout_tuple) + print('Output response code: ' + str(res.status_code)) + retval = 0 if res.status_code == 200 else 1 print('Toil output: ' + str(res.text)) res = requests.get(submit_url + f'{jobid}/logs/', timeout=timeout_tuple) diff --git a/src/sophios/schemas/wic_schema.py b/src/sophios/schemas/wic_schema.py index 437f6228..4f2f9c5b 100644 --- a/src/sophios/schemas/wic_schema.py +++ b/src/sophios/schemas/wic_schema.py @@ -12,7 +12,7 @@ import sophios from sophios import ast, compiler, utils_cwl -from sophios.cli import get_args, get_dicts_for_compilation +from sophios.cli import get_dicts_for_compilation from 
sophios.utils_yaml import wic_loader from sophios.wic_types import GraphData, GraphReps, NodeData, StepId, Yaml, YamlTree from ..wic_types import Json, Tools @@ -403,8 +403,6 @@ def wic_tag_schema(hypothesis: bool = False) -> Json: schema['title'] = 'Metadata annotations' schema['description'] = 'Use steps: to recursively overload / pass parameters.\nUse graphviz: to modify the DAGs.' - pat_semver = "^[0-9]+\\.[0-9]+\\.[0-9]+$" - version = {'type': 'string', 'pattern': pat_semver} driver = {'type': 'string', 'enum': ['slurm', 'argo']} schema_props = {'graphviz': graphviz_schema, 'steps': steps, 'implementation': implementation, @@ -595,8 +593,7 @@ def compile_workflow_generate_schema(homedir: str, tools_cwl: Tools, yml_paths: Dict[str, Dict[str, Path]], validator: Draft202012Validator, - ignore_validation_errors: bool, - allow_raw_cwl: bool) -> Json: + ignore_validation_errors: bool) -> Json: """Compiles a workflow and generates a schema which (recursively) includes the inputs/outputs from subworkflows. Args: @@ -607,7 +604,6 @@ def compile_workflow_generate_schema(homedir: str, yml_paths (Dict[str, Dict[str, Path]]): The yml workflow definitions found using get_yml_paths() validator (Draft202012Validator): Used to validate the yml files against the autogenerated schema. ignore_validation_errors (bool): Temporarily ignore validation errors. Do not use this permanently! - allow_raw_cwl (bool): Do not check whether the input to a workflow step refers to the workflow inputs: tag Returns: Json: An autogenerated, documented schema based on the inputs and outputs of the Workflow. 
@@ -638,8 +634,6 @@ def compile_workflow_generate_schema(homedir: str, graph_nx = nx.DiGraph() graphdata = GraphData(str(yml_path)) graph = GraphReps(graph_gv, graph_nx, graphdata) - args = get_args(str(yml_path), ['--allow_raw_cwl'] if allow_raw_cwl else []) - compiler_options, graph_settings, yaml_tag_paths = get_dicts_for_compilation() compiler_info = compiler.compile_workflow(yaml_tree, compiler_options, graph_settings, yaml_tag_paths, [], [graph], diff --git a/src/sophios/utils_cwl.py b/src/sophios/utils_cwl.py index 83185f56..a6d4681a 100644 --- a/src/sophios/utils_cwl.py +++ b/src/sophios/utils_cwl.py @@ -207,14 +207,12 @@ def get_workflow_outputs(graph_settings: Dict[str, Any], # Note that this error is not detected using --validate. # One workaround is to simply output all files. # TODO: glob "." is still returning null; need to use InitialWorkDirRequirement?? - output_all = {'output_all': - {'type': - {'type': 'array', - 'items': ['Directory', 'File']}, - 'outputBinding': {'glob': '\".\"'}, - 'format': 'edam:format_2330'}} # 'Textual format' # This crashes toil-cwl-runner, but not cwltool. 
- # workflow_outputs.update(output_all) # type: ignore + # workflow_outputs['output_all'] = { + # 'type': {'type': 'array', 'items': ['Directory', 'File']}, + # 'outputBinding': {'glob': '\".\"'}, + # 'format': 'edam:format_2330', + # } return workflow_outputs diff --git a/src/sophios/utils_graphs.py b/src/sophios/utils_graphs.py index 5d2baecc..ae6d5458 100644 --- a/src/sophios/utils_graphs.py +++ b/src/sophios/utils_graphs.py @@ -175,7 +175,7 @@ def make_plugins_dag(tools: Tools, graph_dark_theme: bool) -> None: font_edge_color = 'black' if graph_dark_theme else 'white' graph.attr(fontcolor=font_edge_color) for tool in list(tools)[i*num_tools_half:(i+1)*num_tools_half]: - (tool_path, tool_cwl) = tools[tool] + tool_path, _tool_cwl = tools[tool] attrs = {'shape': 'box', 'style': 'rounded, filled'} graph.node(Path(tool_path).stem, fillcolor='lightblue', fontsize="24", width='0.75', **attrs) # NOTE: Since there are no edges in this DAG and thus no edge constraints, @@ -205,7 +205,7 @@ def add_subgraphs(graph_settings: Dict[str, Any], # Add the cluster subgraphs to the main graph, but we need to add them in # reverse order to trick the graphviz layout algorithm. for sibling in sibling_subgraphs[::-1]: # Reverse! 
- (sib_graph_gv, sib_graph_nx, sib_graphdata) = sibling + sib_graph_gv, sib_graph_nx, _sib_graphdata = sibling if len(namespaces) < graph_settings['graph_inline_depth']: graph_gv.subgraph(sib_graph_gv) graph_nx.add_nodes_from(sib_graph_nx.nodes) diff --git a/tests/test_cli_flags.py b/tests/test_cli_flags.py deleted file mode 100644 index d18bf68c..00000000 --- a/tests/test_cli_flags.py +++ /dev/null @@ -1,31 +0,0 @@ -import pathlib -import subprocess -import shutil -import yaml - - -def test_generate_cwl_workflow() -> None: - """ - Test that running sophios with --generate_cwl_workflow produces the correct cwl file - """ - # remove output directory in case it exists to avoid false test pass - out_path = pathlib.Path('autogenerated') - - if out_path.exists() and out_path.is_dir(): - shutil.rmtree(out_path) - - yaml_path = str(pathlib.Path(__file__).parent.parent.resolve() / "docs/tutorials/helloworld.wic") - - cmd = ["sophios", "--yaml", yaml_path, "--generate_cwl_workflow"] - - # run sophios with args - subprocess.run(cmd, check=False) - - with open("autogenerated/helloworld.cwl", "r", encoding='utf-8') as cwl_file: - result_dict = yaml.safe_load(cwl_file) - - with open(str(pathlib.Path(__file__).parent.resolve() / "data/cwl/helloworld.cwl"), "r", - encoding='utf-8') as cwl_file: - actual_dict = yaml.safe_load(cwl_file) - - assert result_dict == actual_dict diff --git a/tests/test_compile_python_workflows.py b/tests/test_compile_python_workflows.py deleted file mode 100644 index 0158612e..00000000 --- a/tests/test_compile_python_workflows.py +++ /dev/null @@ -1,123 +0,0 @@ -import json -import traceback -from pathlib import Path -import pytest -import yaml - -import sophios -import sophios.plugins -from sophios import input_output as io -from sophios import utils, utils_cwl -from sophios.python_cwl_adapter import import_python_file -from sophios.schemas import wic_schema -from sophios.utils_yaml import wic_loader -from sophios.wic_types import Json - - -REPO_ROOT = 
Path(__file__).resolve().parent.parent -AUTOGENERATED_DIR = REPO_ROOT / "autogenerated" -PYTHON_WORKFLOW_MANIFEST = AUTOGENERATED_DIR / "python_workflow_manifest.json" - - -def _load_global_config() -> Json: - config_file = Path().home() / "wic" / "global_config.json" - return io.read_config_from_disk(config_file) - - -def _iter_python_workflow_paths(global_config: Json) -> list[tuple[str, Path]]: - paths = sophios.plugins.get_py_paths(global_config) - return [ - (path_str, path) - for _, paths_dict in paths.items() - for path_str, path in paths_dict.items() - if "mm-workflows" not in str(path) and "docs/tutorials/" not in str(path) - ] - - -def _write_manifest(workflow_paths: list[Path]) -> None: - AUTOGENERATED_DIR.mkdir(parents=True, exist_ok=True) - manifest = sorted({str(path) for path in workflow_paths}) - PYTHON_WORKFLOW_MANIFEST.write_text(json.dumps(manifest, indent=2), encoding="utf-8") - - -@pytest.mark.fast -def test_compile_python_workflows() -> None: - """This function imports (read: blindly executes) all python files in 'search_paths_wic' - The python files are assumed to have a top-level workflow() function - which returns a sophios.apis.python.api.Workflow object. - The python files should NOT call the .run() method! - (from any code path that is automatically executed on import) - """ - from sophios.apis.python import api # pylint: disable=C0415:import-outside-toplevel - # Since this is completely different test path we have to copy - # default .txt files to default global_config.json - global_config = _load_global_config() - api.global_config = sophios.plugins.get_tools_cwl(global_config) # Use path fallback in the CI - paths_tuples = _iter_python_workflow_paths(global_config) - import_errors: list[str] = [] - generated_workflows: list[Path] = [] - for path_stem, path in paths_tuples: - # NOTE: Use anything (unique?) for the python_module_name. 
- try: - module = import_python_file(path_stem, path) - # Let's require all python API files to define a function, say - # def workflow() -> Workflow - # so we can programmatically call it here: - retval: api.Workflow = module.workflow() # no arguments - # which allows us to programmatically call Workflow methods: - compiler_info = retval.compile() # hopefully retval is actually a Workflow object! - # But since this is python (i.e. not Haskell) that in no way eliminates - # the above security considerations. - - # This lets us use path.parent to write a *.wic file in the - # auto-discovery path, and thus reuse the existing wic CI - retval.write_ast_to_disk(path.parent) - generated_workflows.extend(path.parent / f"{wf.process_name}.wic" for wf in retval.flatten_subworkflows()) - - # Programmatically blacklist subworkflows from running in config_ci.json - # (Again, because subworkflows are missing inputs and cannot run.) - config_ci = path.parent / 'config_ci.json' - json_contents = {} - if config_ci.exists(): - with open(config_ci, mode='r', encoding='utf-8') as r: - json_contents = json.load(r) - run_blacklist: list[str] = json_contents.get('run_blacklist', []) - # Use [1:] for proper subworkflows only - subworkflows: list[api.Workflow] = retval.flatten_subworkflows()[1:] - run_blacklist += [wf.process_name for wf in subworkflows] - json_contents['run_blacklist'] = run_blacklist - with open(config_ci, mode='w', encoding='utf-8') as f: - json.dump(json_contents, f) - - except Exception as e: - import_errors.append(f"{path_stem}: {type(e).__name__}: {e}") - traceback.print_exception(type(e), value=e, tb=None) - if import_errors: - pytest.fail("Python workflow imports failed:\n" + "\n".join(import_errors)) - _write_manifest(generated_workflows) - - -@pytest.mark.fast -def test_validate_generated_python_workflows() -> None: - if not PYTHON_WORKFLOW_MANIFEST.exists(): - pytest.fail(f"Missing generated workflow manifest: {PYTHON_WORKFLOW_MANIFEST}") - - global_config = 
_load_global_config() - tools_cwl = sophios.plugins.get_tools_cwl(global_config) - yml_paths = sophios.plugins.get_yml_paths(global_config) - yaml_stems = utils.flatten([list(paths) for paths in yml_paths.values()]) - validator = wic_schema.get_validator(tools_cwl, yaml_stems, {}, write_to_disk=False) - - workflow_paths = json.loads(PYTHON_WORKFLOW_MANIFEST.read_text(encoding="utf-8")) - validation_errors: list[str] = [] - for workflow_path_str in workflow_paths: - workflow_path = Path(workflow_path_str) - try: - with workflow_path.open("r", encoding="utf-8") as handle: - yaml_tree = yaml.load(handle.read(), Loader=wic_loader()) - validator.validate(utils_cwl.desugar_into_canonical_normal_form(yaml_tree)) - except Exception as exc: - validation_errors.append(f"{workflow_path}: {type(exc).__name__}: {exc}") - - if validation_errors: - pytest.fail("Generated workflow validation failed:\n" + "\n".join(validation_errors)) diff --git a/tests/test_cwl_builder.py b/tests/test_cwl_builder.py index e1156a51..04b09f82 100644 --- a/tests/test_cwl_builder.py +++ b/tests/test_cwl_builder.py @@ -3,66 +3,72 @@ import pytest import yaml -import sophios.apis.python.cwl_builder as cwl_builder +import sophios.apis.python._cwl_builder_support as cwl_builder_support from sophios.apis.python.cwl_builder import ( - CommandLineToolBuilder, + CommandLineTool, Dirent, Field, Input, + Inputs, Output, - Type, + Outputs, + cwl, secondary_file, ) +from sophios.apis.python.api import Step -def _rich_builder() -> CommandLineToolBuilder: - mode_type = Type.enum("fast", "accurate", name="Mode") - settings_type = Type.record( +def _rich_tool() -> CommandLineTool: + mode_type = cwl.enum("fast", "accurate", name="Mode") + settings_type = cwl.record( { - "threads": Field.int(), - "preset": Field.of(mode_type), - "tags": Field.array(Type.string()), + "threads": Field(cwl.int), + "preset": Field(mode_type), + "tags": Field.array(cwl.string), }, name="Settings", ) + inputs = Inputs( + 
reads=Input.array(cwl.file, flag="--reads") + .format("edam:format_2572") + .secondary_files(secondary_file(".bai", required=False)), + mode=Input(mode_type, flag="--mode"), + settings=Input(settings_type).load_listing("shallow_listing"), + ) + outputs = Outputs(sam=Output.stdout()) return ( - CommandLineToolBuilder("aligner") + CommandLineTool("aligner", inputs, outputs) .label("Align reads") .doc(["Toy CLT", "for serialization coverage"]) - .namespace("edam", "https://edamontology.org/") + .namespace("edam") .schema("https://example.org/formats.rdf") .intent("edam:operation_3198") .base_command("bash", "-lc") .shell_command() .inline_javascript("function passthrough(x) { return x; }") .schema_definitions(mode_type, settings_type) - .docker(docker_pull="alpine:3.20") + .docker("alpine:3.20") .resources(cores_min=1.5, ram_min=1024, outdir_min=256) .env_var("LC_ALL", "C") .initial_workdir([Dirent("threads=4\n", entryname="config.txt")]) .work_reuse(False, as_hint=True) .network_access(False) .argument("run-aligner", position=0) - .inputs( - reads=Input.array( - Type.file(), - prefix="--reads", - format="edam:format_2572", - secondary_files=[secondary_file(".bai", required=False)], - ), - mode=Input.of(mode_type, prefix="--mode"), - settings=Input.of(settings_type, load_listing="shallow_listing"), - ) - .outputs(sam=Output.stdout()) .stdout("aligned.sam") .success_codes(0, 2) ) +@pytest.mark.fast +def test_cwl_builder_requires_structural_core() -> None: + with pytest.raises(TypeError): + CommandLineTool("missing-inputs") # type: ignore[call-arg] + + @pytest.mark.fast def test_cwl_builder_covers_common_clt_surface() -> None: - tool = _rich_builder().to_dict() + tool = _rich_tool().to_dict() assert tool["$namespaces"] == {"edam": "https://edamontology.org/"} assert tool["$schemas"] == ["https://example.org/formats.rdf"] @@ -97,40 +103,117 @@ def test_cwl_builder_covers_common_clt_surface() -> None: @pytest.mark.fast def test_cwl_builder_accepts_raw_extensions() -> 
None: + tool = CommandLineTool( + "custom-tool", + Inputs(message=Input(cwl.string)), + Outputs(out=Output(cwl.file, glob="out.txt")), + ) + + with pytest.warns(UserWarning, match="raw CWL injection"): + rendered = tool.time_limit(60).extra(sbol_intent="example:custom", customExtension={"enabled": True}).to_dict() + + assert rendered["requirements"]["ToolTimeLimit"] == {"timelimit": 60} + assert rendered["sbol_intent"] == "example:custom" + assert rendered["customExtension"] == {"enabled": True} + + +@pytest.mark.fast +def test_cwl_builder_rejects_reserved_or_salad_raw_keys() -> None: + tool = CommandLineTool( + "custom-tool", + Inputs(message=Input(cwl.string)), + Outputs(out=Output(cwl.file, glob="out.txt")), + ) + + with pytest.raises(ValueError, match="builder-managed keys"): + tool.extra(inputs={"bad": "idea"}) + + with pytest.raises(ValueError, match="SALAD document-assembly keys"): + tool.requirement({"class": "EnvVarRequirement", "$import": "bad"}) + + +@pytest.mark.fast +def test_cwl_builder_high_level_helpers_hide_cwl_plumbing() -> None: + inputs = Inputs( + input=Input(cwl.directory, position=1).label("Input Zarr dataset").doc("Path to input zarr dataset"), + output=Input(cwl.directory, position=2).label("Output segmentation Zarr").doc( + "Path for output segmentation zarr" + ), + model=Input(cwl.file, flag="--model", required=False).label("Model override file"), + tile_size=Input(cwl.int, flag="--tile-size", required=False).label("Tile size"), + iou_threshold=Input(cwl.float, flag="--iou-threshold", required=False).label("IoU threshold"), + ) + outputs = Outputs(output=Output(cwl.directory, from_input=inputs.output).label("Output segmentation Zarr")) tool = ( - CommandLineToolBuilder("custom-tool") - .inputs(message=Input.string()) - .outputs(out=Output.file(glob="out.txt")) - .time_limit(60) - .extra(sbol_intent="example:custom", customExtension={"enabled": True}) + CommandLineTool("sam3", inputs, outputs) + .describe("SAM3 OME Zarr autosegmentation", 
"Run SAM3 autosegmentation on a zarr volume.") + .edam() + .gpu(cuda_version_min="11.7", compute_capability="3.0", device_count_min=2) + .docker("polusai/ichnaea-api:latest") + .stage(inputs.output, writable=True) + .stage(inputs.input) + .resources(cores=4, ram=64000) + .base_command("/backend/.venv/bin/python", "/backend/dagster_pipelines/jobs/autosegmentation/logic.py") .to_dict() ) - assert tool["requirements"]["ToolTimeLimit"] == {"timelimit": 60} - assert tool["sbol_intent"] == "example:custom" - assert tool["customExtension"] == {"enabled": True} + assert tool["$namespaces"]["edam"] == "https://edamontology.org/" + assert tool["$namespaces"]["cwltool"] == "http://commonwl.org/cwltool#" + assert tool["$schemas"] == [ + "https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl" + ] + assert tool["hints"]["cwltool:CUDARequirement"] == { + "cudaVersionMin": "11.7", + "cudaComputeCapability": "3.0", + "cudaDeviceCountMin": 2, + } + assert tool["requirements"]["ResourceRequirement"] == {"coresMin": 4, "ramMin": 64000} + assert tool["requirements"]["InitialWorkDirRequirement"] == { + "listing": [ + { + "entry": "$(inputs.output)", + "entryname": "$(inputs.output.basename)", + "writable": True, + }, + { + "entry": "$(inputs.input)", + "entryname": "$(inputs.input.basename)", + "writable": False, + }, + ] + } + assert tool["requirements"]["InlineJavascriptRequirement"] == {} + assert tool["inputs"]["input"]["inputBinding"] == {"position": 1} + assert tool["inputs"]["model"]["type"] == ["null", "File"] + assert tool["inputs"]["model"]["inputBinding"] == {"prefix": "--model"} + assert tool["inputs"]["tile_size"]["type"] == ["null", "int"] + assert tool["outputs"]["output"]["outputBinding"] == {"glob": "$(inputs.output.basename)"} @pytest.mark.fast def test_cwl_builder_save_round_trips_yaml(tmp_path: Path) -> None: - builder = _rich_builder() + tool = _rich_tool() output_path = tmp_path / "aligner.cwl" - saved_path = builder.save(output_path) + 
saved_path = tool.save(output_path) assert saved_path == output_path - assert yaml.safe_load(output_path.read_text(encoding="utf-8")) == builder.to_dict() + assert yaml.safe_load(output_path.read_text(encoding="utf-8")) == tool.to_dict() @pytest.mark.fast def test_cwl_builder_validate_uses_cwltool_stack(monkeypatch: pytest.MonkeyPatch) -> None: + class FakeRuntimeContext: + def __init__(self, kwargs: dict[str, object]) -> None: + self.kwargs = kwargs + class FakeLoadTool: def __init__(self) -> None: self.calls: list[tuple[str, object]] = [] - def fetch_document(self, path: str) -> tuple[str, dict[str, str], str]: - self.calls.append(("fetch_document", Path(path).suffix)) - return "loading-context", {"class": "CommandLineTool"}, "file:///aligner.cwl" + def fetch_document(self, path: str, loading_context: str) -> tuple[str, dict[str, str], str]: + self.calls.append(("fetch_document", (Path(path).suffix, loading_context))) + return loading_context, {"class": "CommandLineTool"}, "file:///aligner.cwl" def resolve_and_validate_document( self, @@ -140,7 +223,7 @@ def resolve_and_validate_document( preprocess_only: bool = False, ) -> tuple[str, str]: self.calls.append(("resolve_and_validate_document", preprocess_only)) - assert loading_context == "loading-context" + assert loading_context == "prepared-context" assert workflowobj == {"class": "CommandLineTool"} assert uri == "file:///aligner.cwl" return "validated-context", "file:///validated-aligner.cwl" @@ -150,18 +233,59 @@ def make_tool(self, uri: str, loading_context: str) -> dict[str, str]: assert loading_context == "validated-context" return {"uri": uri, "loading_context": loading_context} + def fake_get_default_args() -> dict[str, object]: + return { + "validate": False, + "skip_schemas": False, + "workflow": None, + "do_validate": True, + } + + def fake_setup_loading_context( + loading_context: None, + runtime_context: FakeRuntimeContext, + args: object, + ) -> str: + assert loading_context is None + assert 
Path(str(runtime_context.kwargs["workflow"])).name == "aligner.cwl" + assert runtime_context.kwargs["validate"] is True + assert runtime_context.kwargs["skip_schemas"] is False + assert Path(str(getattr(args, "workflow"))).name == "aligner.cwl" + assert getattr(args, "validate") is True + assert getattr(args, "skip_schemas") is False + return "prepared-context" + fake_load_tool = FakeLoadTool() - monkeypatch.setattr(cwl_builder, "_import_cwltool_load_tool", lambda: fake_load_tool) + monkeypatch.setattr(cwl_builder_support, "_import_cwltool_load_tool", lambda: fake_load_tool) + monkeypatch.setattr( + cwl_builder_support, + "_import_cwltool_validation_support", + lambda: (FakeRuntimeContext, fake_get_default_args, fake_setup_loading_context), + ) - result = _rich_builder().validate() + result = _rich_tool().validate() assert result.uri == "file:///validated-aligner.cwl" assert result.process == { "uri": "file:///validated-aligner.cwl", "loading_context": "validated-context", } - assert [name for name, _ in fake_load_tool.calls] == [ - "fetch_document", - "resolve_and_validate_document", - "make_tool", - ] + + +@pytest.mark.fast +def test_cwl_builder_converts_to_in_memory_step() -> None: + tool = CommandLineTool( + "echo_tool", + Inputs(message=Input(cwl.string, position=1)), + Outputs(out=Output.stdout()), + ).stdout("stdout.txt") + + step = tool.to_step(step_name="say_hello") + step.inputs.message = "hello" + + assert isinstance(step, Step) + assert step.process_name == "say_hello" + assert step.clt_path.name == "say_hello.cwl" + assert step.yaml["inputs"]["message"]["type"] == "string" + assert step.yaml["outputs"]["out"]["type"] == "stdout" + assert step._yml["in"]["message"] == {"wic_inline_input": "hello"} diff --git a/tests/test_examples.py b/tests/test_examples.py index 001fd6a7..18025d1c 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -158,7 +158,7 @@ def test_run_inlined_workflows_weekly(yml_path_str: str, yml_path: Path, cwl_run def 
test_cwl_docker_extract(yml_path_str: str, yml_path: Path) -> None: """ Uses cwl-docker-extract to recursively `docker pull`""" args = get_args(str(yml_path)) - run_workflows(yml_path_str, yml_path, 'cwltool', args, True) + run_workflows(yml_path_str, yml_path, 'cwltool', args, docker_pull_only=True) return @@ -220,7 +220,7 @@ def run_workflows(yml_path_str: str, yml_path: Path, cwl_runner: str, args: argp verify_container_engine_config(args.container_engine, args.ignore_docker_install) if docker_pull_only: - cwl_docker_extract(args.container_engine, args.pull_dir, Path(yml_path).stem) + cwl_docker_extract(args.container_engine, args.pull_dir, Path(basepath) / f'{Path(yml_path).stem}.cwl') return if args.docker_remove_entrypoints: diff --git a/tests/test_fuzzy_compile.py b/tests/test_fuzzy_compile.py index 623c47b5..58301068 100644 --- a/tests/test_fuzzy_compile.py +++ b/tests/test_fuzzy_compile.py @@ -64,9 +64,9 @@ def test_fuzzy_compile(self, yml: Yaml) -> None: compiler_options, graph_settings, yaml_tag_paths = sophios.cli.get_dicts_for_compilation() try: - compiler_info = sophios.compiler.compile_workflow(yaml_tree, compiler_options, graph_settings, - yaml_tag_paths, [], [graph], {}, {}, {}, {}, - tools_cwl, True, relative_run_path=True, testing=True) + sophios.compiler.compile_workflow(yaml_tree, compiler_options, graph_settings, + yaml_tag_paths, [], [graph], {}, {}, {}, {}, + tools_cwl, True, relative_run_path=True, testing=True) except Exception as e: multi_def_str = 'Error! Multiple definitions of &' unbound_lit_var = 'Error! 
Unbound literal variable ~' diff --git a/tests/test_python_api.py b/tests/test_python_api.py new file mode 100644 index 00000000..f901c528 --- /dev/null +++ b/tests/test_python_api.py @@ -0,0 +1,451 @@ +from contextlib import contextmanager +import json +import os +from pathlib import Path +import traceback +from types import SimpleNamespace +from typing import Any, Iterator +from unittest.mock import patch + +import pytest +import yaml + +import sophios +import sophios.apis.python as python_api_package +import sophios.apis.python._workflow_runtime as python_runtime +import sophios.plugins +import sophios.run_local as run_local +from sophios import input_output as io +from sophios import utils, utils_cwl +from sophios.apis.python.cwl_builder import CommandLineTool, Input, Inputs, Output, Outputs, cwl +from sophios.apis.python.api import InvalidLinkError, Step, Workflow +from sophios.python_cwl_adapter import import_python_file +from sophios.schemas import wic_schema +from sophios.utils_yaml import wic_loader +from sophios.wic_types import Json, Tools + + +REPO_ROOT = Path(__file__).resolve().parent.parent +ADAPTERS = REPO_ROOT / "cwl_adapters" +AUTOGENERATED_DIR = REPO_ROOT / "autogenerated" +PYTHON_WORKFLOW_MANIFEST = AUTOGENERATED_DIR / "python_workflow_manifest.json" + + +def _adapter(name: str) -> Path: + return ADAPTERS / f"{name}.cwl" + + +def _load_global_config() -> Json: + config_file = Path().home() / "wic" / "global_config.json" + return io.read_config_from_disk(config_file) + + +def _iter_python_workflow_paths(global_config: Json) -> list[tuple[str, Path]]: + paths = sophios.plugins.get_py_paths(global_config) + return [ + (path_str, path) + for _, paths_dict in paths.items() + for path_str, path in paths_dict.items() + if "mm-workflows" not in str(path) and "docs/tutorials/" not in str(path) + ] + + +@contextmanager +def _step_registry_injected(tool_registry: Tools) -> Iterator[None]: + """Inject a default tool registry into imported Python workflow 
scripts. + + Args: + tool_registry (Tools): Registry of known CWL tools used for path fallback. + + Yields: + Iterator[None]: Context where imported scripts see the patched ``Step``. + """ + from sophios.apis.python import api # pylint: disable=C0415:import-outside-toplevel + + step_class = api.Step + + def step_factory(*args: Any, **kwargs: Any) -> Any: + kwargs.setdefault("tool_registry", tool_registry) + return step_class(*args, **kwargs) + + with patch.object(api, "Step", step_factory): + yield + + +def _write_manifest(workflow_paths: list[Path]) -> None: + AUTOGENERATED_DIR.mkdir(parents=True, exist_ok=True) + manifest = sorted({str(path) for path in workflow_paths}) + PYTHON_WORKFLOW_MANIFEST.write_text(json.dumps(manifest, indent=2), encoding="utf-8") + + +@pytest.mark.fast +def test_explicit_step_ports_match_legacy_yaml() -> None: + touch_legacy = Step(_adapter("touch")) + touch_legacy.filename = "empty.txt" + append_legacy = Step(_adapter("append")) + append_legacy.file = touch_legacy.file + append_legacy.str = "Hello" + + touch_explicit = Step(_adapter("touch")) + touch_explicit.inputs.filename = "empty.txt" + append_explicit = Step(_adapter("append")) + append_explicit.inputs.file = touch_explicit.outputs.file + append_explicit.inputs.str = "Hello" + + legacy_yaml = Workflow([touch_legacy, append_legacy], "wf").yaml + explicit_yaml = Workflow([touch_explicit, append_explicit], "wf").yaml + + assert legacy_yaml == explicit_yaml + + +@pytest.mark.fast +def test_in_memory_cwl_step_compiles_through_workflow_api() -> None: + tool = ( + CommandLineTool( + "echo_tool", + Inputs(message=Input(cwl.string, position=1)), + Outputs(out=Output.stdout()), + ) + .base_command("echo") + .stdout("stdout.txt") + ) + step = Step.from_cwl(tool.to_dict(), process_name="say_hello") + step.inputs.message = "hello" + + compiled = Workflow([step], "wf").get_cwl_workflow() + + assert compiled["class"] == "Workflow" + assert compiled["steps"][0]["id"].endswith("say_hello") + 
assert compiled["steps"][0]["run"]["class"] == "CommandLineTool" + assert compiled["steps"][0]["run"]["baseCommand"] == "echo" + + +@pytest.mark.fast +def test_cwl_builder_workflow_example_compiles() -> None: + example_path = REPO_ROOT / "examples" / "scripts" / "cwl_builder_workflow.py" + module = import_python_file(example_path.stem, example_path.resolve()) + + workflow = module.build_workflow("hello from test") + compiled = workflow.get_cwl_workflow() + + assert compiled["class"] == "Workflow" + step_ids = [step["id"] for step in compiled["steps"]] + assert step_ids[0].endswith("emit_text") + assert step_ids[1].endswith("read_text") + assert compiled["outputs"]["result"]["outputSource"] == f"{step_ids[1]}/result" + + +@pytest.mark.fast +def test_falsey_inline_values_are_preserved() -> None: + echo = Step(_adapter("echo")) + echo.inputs.message = "" + + workflow_yaml = Workflow([echo], "wf").yaml + assert workflow_yaml["steps"][0]["in"]["message"] == {"wic_inline_input": ""} + + +@pytest.mark.fast +def test_subworkflow_inputs_use_child_workflow_name_and_formal_parameters() -> None: + touch = Step(_adapter("touch")) + touch.inputs.filename = "empty.txt" + + sub_step = Step(_adapter("append")) + subworkflow = Workflow([sub_step], "child") + sub_step.inputs.file = subworkflow.inputs.file + sub_step.inputs.str = subworkflow.inputs.str + + subworkflow.inputs.file = touch.outputs.file + subworkflow.inputs.str = "Hello" + + root_yaml = Workflow([touch, subworkflow], "root").yaml + subworkflow_step = root_yaml["steps"][1] + + assert subworkflow_step["id"] == "child.wic" + assert subworkflow_step["parentargs"] == { + "in": { + "file": {"wic_alias": "filechild"}, + "str": {"wic_inline_input": "Hello"}, + } + } + assert subworkflow_step["subtree"]["inputs"] == { + "file": {"type": "File"}, + "str": {"type": "string"}, + } + assert subworkflow_step["subtree"]["steps"][0]["in"]["file"] == "file" + assert subworkflow_step["subtree"]["steps"][0]["in"]["str"] == "str" + + 
+@pytest.mark.fast +def test_inline_subworkflow_always_emits_parentargs_key() -> None: + sub_step = Step(_adapter("append")) + subworkflow = Workflow([sub_step], "child") + + root_yaml = Workflow([subworkflow], "root").yaml + subworkflow_step = root_yaml["steps"][0] + + assert subworkflow_step["id"] == "child.wic" + assert subworkflow_step["parentargs"] == {} + + +@pytest.mark.fast +def test_step_unknown_attribute_raises_immediately() -> None: + append = Step(_adapter("append")) + + with pytest.raises(AttributeError, match="has no input named"): + append.misspelled = "Hello" + + +@pytest.mark.fast +def test_incompatible_step_link_raises_invalid_link_error() -> None: + touch = Step(_adapter("touch")) + touch.inputs.filename = "empty.txt" + append = Step(_adapter("append")) + + with pytest.raises(InvalidLinkError, match="incompatible types"): + append.inputs.str = touch.outputs.file + + +@pytest.mark.fast +def test_workflow_outputs_are_serialized_with_type_and_source() -> None: + touch = Step(_adapter("touch")) + touch.inputs.filename = "empty.txt" + + append = Step(_adapter("append")) + append.inputs.file = touch.outputs.file + append.inputs.str = "Hello" + + workflow = Workflow([touch, append], "wf") + workflow.outputs.file = append.outputs.file + + workflow_yaml = workflow.yaml + + assert workflow_yaml["outputs"] == { + "file": {"type": "File", "outputSource": "append/file"}, + } + + +@pytest.mark.fast +def test_config_yaml_normalizes_cwl_file_and_directory_objects(tmp_path: Path) -> None: + input_dir = tmp_path / "input-dir" + input_dir.mkdir() + input_file = tmp_path / "input.txt" + input_file.write_text("hello", encoding="utf-8") + + subdirectory_cfg = tmp_path / "subdirectory.yml" + subdirectory_cfg.write_text( + yaml.safe_dump( + { + "directory": {"class": "Directory", "location": str(input_dir)}, + "glob_pattern": ".", + }, + sort_keys=False, + ), + encoding="utf-8", + ) + subdirectory = Step(_adapter("subdirectory"), config_path=subdirectory_cfg) + assert 
subdirectory._yml["in"]["directory"] == {"wic_inline_input": str(input_dir)} + + append_cfg = tmp_path / "append.yml" + append_cfg.write_text( + yaml.safe_dump( + { + "file": {"class": "File", "location": str(input_file)}, + "str": "Hello", + }, + sort_keys=False, + ), + encoding="utf-8", + ) + append = Step(_adapter("append"), config_path=append_cfg) + assert append._yml["in"]["file"] == {"wic_inline_input": str(input_file)} + + +@pytest.mark.fast +def test_scatter_rejects_unbound_foreign_or_scalar_inputs() -> None: + echo = Step(_adapter("echo")) + other_echo = Step(_adapter("echo")) + + with pytest.raises(ValueError, match="bound before scattering"): + echo.scatter = [echo.inputs.message] + + echo.inputs.message = "hello" + with pytest.raises(ValueError, match="array-valued data"): + echo.scatter = [echo.inputs.message] + + other_echo.inputs.message = ["a", "b"] + with pytest.raises(ValueError, match="belong to the same step"): + echo.scatter = [other_echo.inputs.message] + + +@pytest.mark.fast +def test_top_level_python_api_exports_only_user_facing_names() -> None: + assert not hasattr(python_api_package, "WorkflowInputReference") + assert not hasattr(python_api_package, "set_input_Step_Workflow") + assert not hasattr(python_api_package, "extract_tools_paths_NONPORTABLE") + + +@pytest.mark.fast +def test_run_local_subprocess_uses_explicit_env_without_global_mutation( + monkeypatch: pytest.MonkeyPatch, +) -> None: + run_args = { + "container_engine": "docker", + "cwl_runner": "cwltool", + } + seen_env: dict[str, str] = {} + sentinel_key = "SOPHIOS_TEST_SUBPROCESS_ENV" + os.environ.pop(sentinel_key, None) + + def fake_run(cmd: list[str], check: bool, env: dict[str, str]) -> SimpleNamespace: + del cmd, check + seen_env.update(env) + return SimpleNamespace(returncode=0) + + monkeypatch.setattr(run_local.sub, "run", fake_run) + + retval = run_local.run_local( + run_args, + True, + passthrough_args=[], + workflow_name="wf", + basepath="autogenerated", + 
user_env_vars={sentinel_key: "hello"}, + ) + + assert retval == 0 + assert seen_env[sentinel_key] == "hello" + assert sentinel_key not in os.environ + + +@pytest.mark.fast +def test_run_local_python_api_restores_environment_after_run(monkeypatch: pytest.MonkeyPatch) -> None: + run_args = { + "container_engine": "docker", + "cwl_runner": "cwltool", + } + sentinel_key = "SOPHIOS_TEST_PYAPI_ENV" + os.environ.pop(sentinel_key, None) + seen_inside: dict[str, str | None] = {"value": None} + + def fake_main(args: list[str]) -> int: + del args + seen_inside["value"] = os.environ.get(sentinel_key) + return 0 + + monkeypatch.setattr(run_local.cwltool.main, "main", fake_main) + + retval = run_local.run_local( + run_args, + False, + passthrough_args=[], + workflow_name="wf", + basepath="autogenerated", + user_env_vars={sentinel_key: "inside"}, + ) + + assert retval == 0 + assert seen_inside["value"] == "inside" + assert sentinel_key not in os.environ + + +@pytest.mark.fast +def test_workflow_run_uses_basepath_for_docker_extract( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + example_path = REPO_ROOT / "examples" / "scripts" / "cwl_builder_workflow.py" + module = import_python_file(example_path.stem, example_path.resolve()) + workflow = module.build_workflow("hello from test") + calls: dict[str, list[tuple[Any, ...]]] = {"verify": [], "extract": []} + + def fake_verify(container_engine: str, ignore_install: bool) -> None: + calls["verify"].append((container_engine, ignore_install)) + + def fake_extract(container_engine: str, pull_dir: str, cwl_path: Path) -> None: + calls["extract"].append((container_engine, pull_dir, cwl_path)) + + def fake_run_local( + run_args_dict: dict[str, str], + use_subprocess: bool, + passthrough_args: list[str], + workflow_name: str, + basepath: str, + user_env_vars: dict[str, str] | None = None, + ) -> int: + del run_args_dict, use_subprocess, passthrough_args, workflow_name, basepath, user_env_vars + return 0 + + 
monkeypatch.setattr(python_runtime.pc, "verify_container_engine_config", fake_verify) + monkeypatch.setattr(python_runtime.pc, "cwl_docker_extract", fake_extract) + monkeypatch.setattr(python_runtime.rl, "run_local", fake_run_local) + + workflow.run(basepath=str(tmp_path)) + + assert calls["verify"] == [("docker", False)] + assert calls["extract"] == [("docker", str(Path.cwd()), tmp_path / "cwl_builder_workflow_demo.cwl")] + + +@pytest.mark.fast +def test_compile_python_workflows() -> None: + """Import and compile all auto-discovered Python workflow scripts.""" + from sophios.apis.python import api # pylint: disable=C0415:import-outside-toplevel + + global_config = _load_global_config() + tools_cwl = sophios.plugins.get_tools_cwl(global_config) + paths_tuples = _iter_python_workflow_paths(global_config) + import_errors: list[str] = [] + generated_workflows: list[Path] = [] + for path_stem, path in paths_tuples: + try: + with _step_registry_injected(tools_cwl): + module = import_python_file(path_stem, path) + retval: api.Workflow = module.workflow() + + retval.compile() + retval.write_ast_to_disk(path.parent) + generated_workflows.extend(path.parent / f"{wf.process_name}.wic" for wf in retval.flatten_subworkflows()) + + config_ci = path.parent / "config_ci.json" + json_contents = {} + if config_ci.exists(): + with open(config_ci, mode="r", encoding="utf-8") as r: + json_contents = json.load(r) + run_blacklist: list[str] = json_contents.get("run_blacklist", []) + subworkflows: list[api.Workflow] = retval.flatten_subworkflows()[1:] + run_blacklist += [wf.process_name for wf in subworkflows] + json_contents["run_blacklist"] = run_blacklist + with open(config_ci, mode="w", encoding="utf-8") as f: + json.dump(json_contents, f) + + except Exception as exc: # pylint: disable=W0718:broad-exception-caught + import_errors.append(f"{path_stem}: {type(exc).__name__}: {exc}") + traceback.print_exception(type(exc), value=exc, tb=None) + if import_errors: + pytest.fail("Python 
workflow imports failed:\n" + "\n".join(import_errors)) + _write_manifest(generated_workflows) + + +@pytest.mark.fast +def test_validate_generated_python_workflows() -> None: + if not PYTHON_WORKFLOW_MANIFEST.exists(): + pytest.fail(f"Missing generated workflow manifest: {PYTHON_WORKFLOW_MANIFEST}") + + global_config = _load_global_config() + tools_cwl = sophios.plugins.get_tools_cwl(global_config) + yml_paths = sophios.plugins.get_yml_paths(global_config) + yaml_stems = utils.flatten([list(paths) for paths in yml_paths.values()]) + validator = wic_schema.get_validator(tools_cwl, yaml_stems, {}, write_to_disk=False) + + workflow_paths = json.loads(PYTHON_WORKFLOW_MANIFEST.read_text(encoding="utf-8")) + validation_errors: list[str] = [] + for workflow_path_str in workflow_paths: + workflow_path = Path(workflow_path_str) + try: + with workflow_path.open("r", encoding="utf-8") as handle: + yaml_tree = yaml.load(handle.read(), Loader=wic_loader()) + validator.validate(utils_cwl.desugar_into_canonical_normal_form(yaml_tree)) + except Exception as exc: # pylint: disable=W0718:broad-exception-caught + validation_errors.append(f"{workflow_path}: {type(exc).__name__}: {exc}") + + if validation_errors: + pytest.fail("Generated workflow validation failed:\n" + "\n".join(validation_errors)) diff --git a/tests/test_python_api_redesign.py b/tests/test_python_api_redesign.py deleted file mode 100644 index 532a650e..00000000 --- a/tests/test_python_api_redesign.py +++ /dev/null @@ -1,63 +0,0 @@ -from pathlib import Path - -import pytest - -from sophios.apis.python.api import Step, Workflow - - -REPO_ROOT = Path(__file__).resolve().parent.parent -ADAPTERS = REPO_ROOT / "cwl_adapters" - - -def _adapter(name: str) -> Path: - return ADAPTERS / f"{name}.cwl" - - -@pytest.mark.fast -def test_explicit_step_ports_match_legacy_yaml() -> None: - touch_legacy = Step(_adapter("touch")) - touch_legacy.filename = "empty.txt" - append_legacy = Step(_adapter("append")) - append_legacy.file = 
touch_legacy.file - append_legacy.str = "Hello" - - touch_explicit = Step(_adapter("touch")) - touch_explicit.inputs.filename = "empty.txt" - append_explicit = Step(_adapter("append")) - append_explicit.inputs.file = touch_explicit.outputs.file - append_explicit.inputs.str = "Hello" - - legacy_yaml = Workflow([touch_legacy, append_legacy], "wf").yaml - explicit_yaml = Workflow([touch_explicit, append_explicit], "wf").yaml - - assert legacy_yaml == explicit_yaml - - -@pytest.mark.fast -def test_falsey_inline_values_are_preserved() -> None: - echo = Step(_adapter("echo")) - echo.inputs.message = "" - - workflow_yaml = Workflow([echo], "wf").yaml - assert workflow_yaml["steps"][0]["in"]["message"] == {"wic_inline_input": ""} - - -@pytest.mark.fast -def test_subworkflow_inputs_use_child_workflow_name_and_formal_parameters() -> None: - touch = Step(_adapter("touch")) - touch.inputs.filename = "empty.txt" - - sub_step = Step(_adapter("append")) - subworkflow = Workflow([sub_step], "child") - sub_step.inputs.file = subworkflow.inputs.file - sub_step.inputs.str = subworkflow.inputs.str - - subworkflow.inputs.file = touch.outputs.file - subworkflow.inputs.str = "Hello" - - root_yaml = Workflow([touch, subworkflow], "root").yaml - subworkflow_step = root_yaml["steps"][1] - - assert subworkflow_step["id"] == "child.wic" - assert subworkflow_step["subtree"]["steps"][0]["in"]["file"] == "file" - assert subworkflow_step["subtree"]["steps"][0]["in"]["str"] == "str" diff --git a/tests/test_rest_api.py b/tests/test_rest_api.py new file mode 100644 index 00000000..fea47d50 --- /dev/null +++ b/tests/test_rest_api.py @@ -0,0 +1,181 @@ +import asyncio +import copy +import json +from pathlib import Path +import subprocess as sub +import traceback + +from fastapi import Request +import pytest +import yaml + +import sophios.post_compile as pc +from sophios.apis.rest import api +from sophios.wic_types import Json, List + +try: + import cwltool.main + import toil.cwl.cwltoil # transitively 
imports cwltool +except ImportError as exc: + print("Could not import cwltool.main and/or toil.cwl.cwltoil") + print(exc) + if exc.msg == "No module named 'pwd'": + print("Windows does not have a pwd module") + print("If you want to run on windows, you need to install") + print("Windows Subsystem for Linux") + print("See https://pypi.org/project/cwltool/#ms-windows-users") + else: + raise exc + + +REST_OBJECTS = Path(__file__).parent / "rest_wfb_objects" + + +def run_cwl_local(workflow_name: str, cwl_runner: str, docker_cmd: str, use_subprocess: bool) -> int: + """Run compiled CWL output via a supported CWL runner. + + Args: + workflow_name (str): Stem of the generated workflow files in ``autogenerated/``. + cwl_runner (str): CWL runner entrypoint to invoke. + docker_cmd (str): Container engine selector passed through to the runner. + use_subprocess (bool): Whether to invoke the runner as a subprocess. + + Returns: + int: Exit code returned by the CWL runner. + """ + quiet = ["--quiet"] + skip_schemas = ["--skip-schemas"] + provenance = ["--provenance", f"provenance/{workflow_name}"] + docker_cmd_: List[str] = [] + if docker_cmd == "docker": + docker_cmd_ = [] + elif docker_cmd == "singularity": + docker_cmd_ = ["--singularity"] + else: + docker_cmd_ = ["--user-space-docker-cmd", docker_cmd] + write_summary = ["--write-summary", f"output_{workflow_name}.json"] + path_check = ["--relax-path-checks"] + docker_pull = ["--disable-pull"] + script = "cwltool_filterlog" if cwl_runner == "cwltool" else cwl_runner + cmd = [script] + docker_pull + quiet + provenance + docker_cmd_ + write_summary + skip_schemas + path_check + if cwl_runner == "cwltool": + cmd += [ + "--leave-outputs", + f"autogenerated/{workflow_name}.cwl", + f"autogenerated/{workflow_name}_inputs.yml", + ] + elif cwl_runner == "toil-cwl-runner": + cmd += [ + "--outdir", + "outdir_toil", + "--jobStore", + f"file:./jobStore_{workflow_name}", + "--clean", + "always", + f"autogenerated/{workflow_name}.cwl", + 
f"autogenerated/{workflow_name}_inputs.yml", + ] + + retval = 1 + print("Running " + " ".join(cmd)) + if use_subprocess: + proc = sub.run(cmd, check=False) + retval = proc.returncode + else: + print("via cwltool.main.main python API") + try: + if cwl_runner == "cwltool": + retval = cwltool.main.main(cmd[1:]) + elif cwl_runner == "toil-cwl-runner": + retval = toil.cwl.cwltoil.main(cmd[1:]) + else: + raise Exception("Invalid cwl_runner!") + + print(f"Final output json metadata blob is in output_{workflow_name}.json") + except Exception as exc: # pylint: disable=W0718:broad-exception-caught + print("Failed to execute", workflow_name) + print(f"See error_{workflow_name}.txt for detailed technical information.") + with open(f"error_{workflow_name}.txt", mode="w", encoding="utf-8") as f: + traceback.print_exception(type(exc), value=exc, tb=None, file=f) + print(exc) + return retval + + +def write_out_to_disk(res: Json, workflow_name: str) -> None: + """Write compiled REST output to disk before invoking a CWL runner. + + Args: + res (Json): REST API response containing compiled CWL and input bindings. + workflow_name (str): Stem to use for generated output files. + """ + res_cwl = copy.deepcopy(res) + res_cwl.pop("retval", None) + res_cwl.pop("cwlJobInputs", None) + res_cwl.pop("name", None) + compiled_cwl = workflow_name + ".cwl" + inputs_yml = workflow_name + "_inputs.yml" + with open(Path.cwd() / "autogenerated" / compiled_cwl, "w", encoding="utf-8") as f: + yaml.dump(res_cwl, f) + with open(Path.cwd() / "autogenerated" / inputs_yml, "w", encoding="utf-8") as f: + yaml.dump(res["cwlJobInputs"], f) + + +def prepare_call_rest_api(inp_path: Path) -> Json: + """Load a REST payload fixture and compile it through the REST API. + + Args: + inp_path (Path): Path to the JSON fixture to submit. + + Returns: + Json: Response returned by ``compile_wf``. 
+ """ + with open(inp_path, "r", encoding="utf-8") as f: + inp = json.load(f) + scope = {"type": "http"} + + async def receive() -> Json: + inp_byte = json.dumps(inp).encode("utf-8") + return {"type": "http.request", "body": inp_byte} + + req: Request = Request(scope) + req._receive = receive + return asyncio.run(api.compile_wf(req)) + + +@pytest.mark.skip_pypi_ci +@pytest.mark.parametrize( + "inp_file", + [ + "single_node.json", + "single_node_bbbc_download.json", + "bbbc_download_wfb.json", + "multi_node.json", + "multi_node_inline_cwl.json", + ], +) +def test_rest_core_runs_workflow(inp_file: str) -> None: + """Compile representative REST payloads and run the resulting workflows. + + Args: + inp_file (str): Fixture filename under ``tests/rest_wfb_objects``. + """ + basepath = "autogenerated" + inp_path = REST_OBJECTS / inp_file + workflow_name = inp_file.split(".", maxsplit=1)[0] + res = prepare_call_rest_api(inp_path) + output_dirs = pc.find_output_dirs(res) + pc.create_output_dirs(output_dirs, basepath) + write_out_to_disk(res, workflow_name) + retval = run_cwl_local(workflow_name, "cwltool", "docker", False) + assert retval == 0 + + +def test_rest_wfb_compile() -> None: + """Compile a WFB-originated payload through the REST API. + + Returns: + None: The assertion verifies a successful compile. 
+ """ + inp_path = REST_OBJECTS / "multi_node_wfb.json" + res = prepare_call_rest_api(inp_path) + assert int(res["retval"]) == 0 diff --git a/tests/test_rest_core.py b/tests/test_rest_core.py deleted file mode 100644 index b8691dab..00000000 --- a/tests/test_rest_core.py +++ /dev/null @@ -1,212 +0,0 @@ -import json -import copy -from pathlib import Path -import asyncio -import subprocess as sub -import traceback -import yaml - - -from fastapi import Request - -import pytest -from sophios.wic_types import Json, List -import sophios.post_compile as pc - - -from sophios.apis.rest import api - -try: - import cwltool.main - import toil.cwl.cwltoil # transitively imports cwltool -except ImportError as exc: - print('Could not import cwltool.main and/or toil.cwl.cwltoil') - # (pwd is imported transitively in cwltool.provenance) - print(exc) - if exc.msg == "No module named 'pwd'": - print('Windows does not have a pwd module') - print('If you want to run on windows, you need to install') - print('Windows Subsystem for Linux') - print('See https://pypi.org/project/cwltool/#ms-windows-users') - else: - raise exc - - -def run_cwl_local(workflow_name: str, cwl_runner: str, docker_cmd: str, use_subprocess: bool) -> int: - """A helper function to run the compiled cwl output""" - quiet = ['--quiet'] - skip_schemas = ['--skip-schemas'] - provenance = ['--provenance', f'provenance/{workflow_name}'] - docker_cmd_: List[str] = [] - if docker_cmd == 'docker': - docker_cmd_ = [] - elif docker_cmd == 'singularity': - docker_cmd_ = ['--singularity'] - else: - docker_cmd_ = ['--user-space-docker-cmd', docker_cmd] - write_summary = ['--write-summary', f'output_{workflow_name}.json'] - path_check = ['--relax-path-checks'] - # See https://github.com/common-workflow-language/cwltool/blob/5a645dfd4b00e0a704b928cc0bae135b0591cc1a/cwltool/command_line_tool.py#L94 - # NOTE: Using --leave-outputs to disable --outdir - # See https://github.com/dnanexus/dx-cwl/issues/20 - # --outdir has one or more 
bugs which will cause workflows to fail!!! - docker_pull = ['--disable-pull'] # Use cwl-docker-extract to pull images - script = 'cwltool_filterlog' if cwl_runner == 'cwltool' else cwl_runner - cmd = [script] + docker_pull + quiet + provenance + \ - docker_cmd_ + write_summary + skip_schemas + path_check - if cwl_runner == 'cwltool': - cmd += ['--leave-outputs', - f'autogenerated/{workflow_name}.cwl', f'autogenerated/{workflow_name}_inputs.yml'] - elif cwl_runner == 'toil-cwl-runner': - cmd += ['--outdir', 'outdir_toil', - '--jobStore', f'file:./jobStore_{workflow_name}', # NOTE: This is the equivalent of --cachedir - '--clean', 'always', # This effectively disables caching, but is reproducible - f'autogenerated/{workflow_name}.cwl', f'autogenerated/{workflow_name}_inputs.yml'] - else: - pass - cmdline = ' '.join(cmd) - - retval = 1 # overwrite on success - print('Running ' + cmdline) - if use_subprocess: - # To run in parallel (i.e. pytest ... --workers 8 ...), we need to - # use separate processes. Otherwise: - # "signal only works in main thread or with __pypy__.thread.enable_signals()" - proc = sub.run(cmd, check=False) - retval = proc.returncode - else: - print('via cwltool.main.main python API') - try: - if cwl_runner == 'cwltool': - retval = cwltool.main.main(cmd[1:]) - elif cwl_runner == 'toil-cwl-runner': - retval = toil.cwl.cwltoil.main(cmd[1:]) - else: - raise Exception("Invalid cwl_runner!") - - print(f'Final output json metadata blob is in output_{workflow_name}.json') - except Exception as e: - print('Failed to execute', workflow_name) - print(f'See error_{workflow_name}.txt for detailed technical information.') - # Do not display a nasty stack trace to the user; hide it in a file. 
- with open(f'error_{workflow_name}.txt', mode='w', encoding='utf-8') as f: - traceback.print_exception(type(e), value=e, tb=None, file=f) - print(e) # we are always running this on CI - return retval - - -def write_out_to_disk(res: Json, workflow_name: str) -> None: - "write compiled output to before running through cwl_runner entrypoints" - res_cwl = copy.deepcopy(res) - res_cwl.pop('retval', None) - res_cwl.pop('cwlJobInputs', None) - res_cwl.pop('name', None) - compiled_cwl = workflow_name + '.cwl' - inputs_yml = workflow_name + '_inputs.yml' - # write compiled .cwl file - with open(Path.cwd() / 'autogenerated' / compiled_cwl, 'w', encoding='utf-8') as f: - yaml.dump(res_cwl, f) - # write _input.yml file - with open(Path.cwd() / 'autogenerated' / inputs_yml, 'w', encoding='utf-8') as f: - yaml.dump(res['cwlJobInputs'], f) - - -def prepare_call_rest_api(inp_path: Path) -> Json: - """prepare payload and call rest api""" - with open(inp_path, 'r', encoding='utf-8') as f: - inp = json.load(f) - scope = {} - scope['type'] = 'http' - - async def receive() -> Json: - inp_byte = json.dumps(inp).encode('utf-8') - return {"type": "http.request", "body": inp_byte} - - # create a request object and pack it with our json payload - req: Request = Request(scope) - req._receive = receive - res: Json = asyncio.run(api.compile_wf(req)) # call to rest api - return res - - -@pytest.mark.skip_pypi_ci -@pytest.mark.fast -def test_rest_core_single_node() -> None: - """A simple single node sophios/restapi test""" - basepath = 'autogenerated' - inp_file = "single_node.json" - inp_path = Path(__file__).parent / 'rest_wfb_objects' / inp_file - workflow_name = inp_file.split('.', maxsplit=1)[0] - # write compiled_cwl and inputs_yml - res = prepare_call_rest_api(inp_path) - output_dirs = pc.find_output_dirs(res) - pc.create_output_dirs(output_dirs, basepath) - write_out_to_disk(res, workflow_name) - retval = run_cwl_local(workflow_name, 'cwltool', 'docker', False) - assert retval == 0 - - 
-@pytest.mark.skip_pypi_ci -def test_rest_core_single_node_bbbc() -> None: - """A simple single node sophios/restapi test""" - basepath = 'autogenerated' - inp_file = "single_node_bbbc_download.json" - inp_path = Path(__file__).parent / 'rest_wfb_objects' / inp_file - workflow_name = inp_file.split('.', maxsplit=1)[0] - # write compiled_cwl and inputs_yml - res = prepare_call_rest_api(inp_path) - output_dirs = pc.find_output_dirs(res) - pc.create_output_dirs(output_dirs, basepath) - write_out_to_disk(res, workflow_name) - retval = run_cwl_local(workflow_name, 'cwltool', 'docker', False) - assert retval == 0 - - -@pytest.mark.skip_pypi_ci -def test_rest_core_bbbc_download_wfb() -> None: - """A simple multi node (inline cwl) sophios/restapi test""" - basepath = 'autogenerated' - inp_file = "bbbc_download_wfb.json" - inp_path = Path(__file__).parent / 'rest_wfb_objects' / inp_file - workflow_name = inp_file.split('.', maxsplit=1)[0] - # write compiled_cwl and inputs_yml - res = prepare_call_rest_api(inp_path) - output_dirs = pc.find_output_dirs(res) - pc.create_output_dirs(output_dirs, basepath) - write_out_to_disk(res, workflow_name) - retval = run_cwl_local(workflow_name, 'cwltool', 'docker', False) - assert retval == 0 - - -@pytest.mark.fast -@pytest.mark.skip_pypi_ci -def test_rest_core_multi_node_file() -> None: - """A simple multi node sophios/restapi test""" - basepath = 'autogenerated' - inp_file = "multi_node.json" - inp_path = Path(__file__).parent / 'rest_wfb_objects' / inp_file - workflow_name = inp_file.split('.', maxsplit=1)[0] - # write compiled_cwl and inputs_yml - res = prepare_call_rest_api(inp_path) - output_dirs = pc.find_output_dirs(res) - pc.create_output_dirs(output_dirs, basepath) - write_out_to_disk(res, workflow_name) - retval = run_cwl_local(workflow_name, 'cwltool', 'docker', False) - assert retval == 0 - - -@pytest.mark.fast -@pytest.mark.skip_pypi_ci -def test_rest_core_multi_node_inline_cwl() -> None: - """A simple multi node (inline 
cwl) sophios/restapi test""" - basepath = 'autogenerated' - inp_file = "multi_node_inline_cwl.json" - inp_path = Path(__file__).parent / 'rest_wfb_objects' / inp_file - workflow_name = inp_file.split('.', maxsplit=1)[0] - # write compiled_cwl and inputs_yml - res = prepare_call_rest_api(inp_path) - output_dirs = pc.find_output_dirs(res) - pc.create_output_dirs(output_dirs, basepath) - write_out_to_disk(res, workflow_name) - retval = run_cwl_local(workflow_name, 'cwltool', 'docker', False) - assert retval == 0 diff --git a/tests/test_rest_wfb.py b/tests/test_rest_wfb.py deleted file mode 100644 index 56f9da8e..00000000 --- a/tests/test_rest_wfb.py +++ /dev/null @@ -1,34 +0,0 @@ -import copy -import json -from pathlib import Path -import asyncio - -from fastapi import Request - -import pytest -from sophios.wic_types import Json - - -from sophios.apis.rest import api - - -def test_rest_multinode_wfb() -> None: - """A multi node (with plugins) wfb -> sophios/restapi test""" - inp_file = "multi_node_wfb.json" - inp: Json = {} - inp_path = Path(__file__).parent / 'rest_wfb_objects' / inp_file - with open(inp_path, 'r', encoding='utf-8') as f: - inp = json.load(f) - print('----------- from rest api ----------- \n\n') - scope = {} - scope['type'] = 'http' - - async def receive() -> Json: - inp_byte = json.dumps(inp).encode('utf-8') - return {"type": "http.request", "body": inp_byte} - - # create a request object and pack it with our json payload - req: Request = Request(scope) - req._receive = receive - res: Json = asyncio.run(api.compile_wf(req)) # call to rest api - assert int(res['retval']) == 0 diff --git a/tests/test_setup.py b/tests/test_setup.py index aaf7cd38..77f0a629 100644 --- a/tests/test_setup.py +++ b/tests/test_setup.py @@ -1,3 +1,4 @@ +import json from pathlib import Path import time from typing import Dict @@ -21,26 +22,36 @@ # Just read from the disk and pass around config object global_config = io.get_config(Path(args.config_file), 
Path(args.config_file)) tools_cwl = sophios.plugins.get_tools_cwl(global_config, quiet=args.quiet) -sophios.apis.python.api.global_config = tools_cwl # Use path fallback in the CI yml_paths = sophios.plugins.get_yml_paths(global_config) +schema_store_path = Path('autogenerated/schemas/schema_store.json') yaml_stems = sophios.utils.flatten([list(p) for p in yml_paths.values()]) schema_store: Dict[str, Json] = {} -validator = sophios.schemas.wic_schema.get_validator(tools_cwl, yaml_stems, schema_store, write_to_disk=True) - yml_paths_tuples = [(yml_path_str, yml_path) for yml_namespace, yml_paths_dict in yml_paths.items() for yml_path_str, yml_path in yml_paths_dict.items()] - -for yml_path_str, yml_path in yml_paths_tuples: - schema = sophios.schemas.wic_schema.compile_workflow_generate_schema(args.homedir, yml_path_str, yml_path, - tools_cwl, yml_paths, validator, - args.ignore_validation_errors, - args.allow_raw_cwl) - # overwrite placeholders in schema_store. See comment in get_validator() - schema_store[schema['$id']] = schema +if schema_store_path.exists(): + with open(schema_store_path, mode='r', encoding='utf-8') as f: + schema_store = json.load(f) + expected_schema_ids = {f'workflows/{yml_path_str}.json' for yml_path_str, _ in yml_paths_tuples} + if not expected_schema_ids.issubset(schema_store): + schema_store = {} + +if not schema_store: + validator = sophios.schemas.wic_schema.get_validator(tools_cwl, yaml_stems, schema_store, write_to_disk=True) + + for yml_path_str, yml_path in yml_paths_tuples: + schema = sophios.schemas.wic_schema.compile_workflow_generate_schema(args.homedir, yml_path_str, yml_path, + tools_cwl, yml_paths, validator, + args.ignore_validation_errors) + # overwrite placeholders in schema_store. 
See comment in get_validator() + schema_store[schema['$id']] = schema + + schema_store_path.parent.mkdir(parents=True, exist_ok=True) + with open(schema_store_path, mode='w', encoding='utf-8') as f: + json.dump(schema_store, f, indent=2) validator = sophios.schemas.wic_schema.get_validator(tools_cwl, yaml_stems, schema_store, write_to_disk=True)