diff --git a/docs/examples/quickstart/README.md b/docs/examples/quickstart/README.md new file mode 100644 index 000000000..196a7c08e --- /dev/null +++ b/docs/examples/quickstart/README.md @@ -0,0 +1,41 @@ +# StreamFlow Quickstart Example + +This directory contains a minimal working example for the StreamFlow quickstart guide. + +## Files + +- `hello-workflow.cwl` - Simple CWL workflow that echoes a message +- `inputs.yml` - Input parameters for the workflow +- `streamflow.yml` - StreamFlow configuration for local execution +- `streamflow-docker.yml` - Alternative configuration using Docker (optional) + +## Running the Example + +```bash +# Run locally +streamflow run streamflow.yml + +# Check output +cat output.txt + +# Run with Docker (optional - requires Docker) +streamflow run streamflow-docker.yml +``` + +## Expected Output + +The workflow should create an `output.txt` file containing: +``` +Hello from StreamFlow! +``` + +## What It Demonstrates + +- Basic CWL CommandLineTool definition +- StreamFlow configuration structure +- Local execution (default) +- Docker deployment (optional) + +## Documentation + +See the complete quickstart guide: `docs/source/user-guide/quickstart.rst` diff --git a/docs/examples/quickstart/hello-workflow.cwl b/docs/examples/quickstart/hello-workflow.cwl new file mode 100644 index 000000000..704a76e1a --- /dev/null +++ b/docs/examples/quickstart/hello-workflow.cwl @@ -0,0 +1,12 @@ +cwlVersion: v1.2 +class: CommandLineTool +baseCommand: echo +inputs: + message: + type: string + inputBinding: + position: 1 +outputs: + output: + type: stdout +stdout: output.txt diff --git a/docs/examples/quickstart/inputs.yml b/docs/examples/quickstart/inputs.yml new file mode 100644 index 000000000..9675f26f7 --- /dev/null +++ b/docs/examples/quickstart/inputs.yml @@ -0,0 +1 @@ +message: "Hello from StreamFlow!" 
diff --git a/docs/examples/quickstart/streamflow.yml b/docs/examples/quickstart/streamflow.yml new file mode 100644 index 000000000..d202a9602 --- /dev/null +++ b/docs/examples/quickstart/streamflow.yml @@ -0,0 +1,8 @@ +version: v1.0 + +workflows: + hello-workflow: + type: cwl + config: + file: hello-workflow.cwl + settings: inputs.yml diff --git a/docs/source/conf.py b/docs/source/conf.py index b20f5ee93..0f7f69393 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -22,11 +22,11 @@ import streamflow.config.schema -project = 'StreamFlow' -copyright = '2023, Alpha Research Group, Computer Science Dept., University of Torino' -author = 'Iacopo Colonnelli' -version = '0.2' -release = '0.2.0' +project = "StreamFlow" +copyright = "2023, Alpha Research Group, Computer Science Dept., University of Torino" +author = "Iacopo Colonnelli" +version = "0.2" +release = "0.2.0" # -- General configuration --------------------------------------------------- @@ -34,53 +34,66 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosectionlabel', - 'sphinx.ext.extlinks', - 'sphinx-jsonschema', - 'sphinx_llms_txt', - 'sphinx_rtd_theme' + "sphinx.ext.autodoc", + "sphinx.ext.autosectionlabel", + "sphinx.ext.extlinks", + "sphinx-jsonschema", + "sphinx_llms_txt", + "sphinx_rtd_theme", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [ -] +exclude_patterns = [] # -- Options for HTML output ------------------------------------------------- -html_logo = 'images/streamflow_logo.png' +html_logo = "images/streamflow_logo.png" # The theme to use for HTML and HTML Help pages. 
See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] + +# Control table of contents depth +toctree_collapse = True def setup(app): - app.add_css_file('theme_overrides.css') + app.add_css_file("theme_overrides.css") # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. html_theme_options = { - "logo_only": True + "logo_only": True, + "navigation_depth": 2, # Show 2-level TOC in sidebar + "collapse_navigation": False, # Keep navigation expanded + "includehidden": False, # Do not show hidden toctrees + "titles_only": True, # Show only titles } extlinks = { - 'config-schema': ('https://raw.githubusercontent.com/alpha-unito/streamflow/' + release + - '/streamflow/config/schemas/v1.0/%s', 'GH#'), - 'repo': ('https://github.com/alpha-unito/streamflow/tree/' + release + '/%s', 'GH#') + "config-schema": ( + "https://raw.githubusercontent.com/alpha-unito/streamflow/" + + release + + "/streamflow/config/schemas/v1.0/%s", + "GH#", + ), + "repo": ( + "https://github.com/alpha-unito/streamflow/tree/" + release + "/%s", + "GH#", + ), } @@ -107,18 +120,18 @@ def setup(app): def _patched_simpletype(self, schema): rows = [] - if 'title' in schema and (not self.options['lift_title'] or self.nesting > 1): - del schema['title'] + if "title" in schema and (not self.options["lift_title"] or self.nesting > 1): + del schema["title"] self._check_description(schema, rows) - if 'type' in schema: - if '$ref' in schema: + if "type" in schema: + if "$ref" in schema: ref = self._reference(schema) - 
rows.extend(self._prepend(self._cell('type'), ref)) - del schema['type'] - elif type(schema['type']) == list: - cells = [self._line(self._decodetype(t)) for t in schema['type']] - rows.extend(self._prepend(self._cell('type'), cells)) - del schema['type'] + rows.extend(self._prepend(self._cell("type"), ref)) + del schema["type"] + elif type(schema["type"]) == list: + cells = [self._line(self._decodetype(t)) for t in schema["type"]] + rows.extend(self._prepend(self._cell("type"), cells)) + del schema["type"] rows.extend(_original_simpletype(self, schema)) return rows @@ -128,18 +141,18 @@ def _patched_simpletype(self, schema): def _patched_arraytype(self, schema): - if 'items' in schema: - if type(schema['items']) == list: + if "items" in schema: + if type(schema["items"]) == list: return _original_arraytype(self, schema) else: - schema['unique'] = 'uniqueItems' in schema['items'] - if 'type' in schema['items']: - schema['type'] = schema['items']['type'] + '[]' + schema["unique"] = "uniqueItems" in schema["items"] + if "type" in schema["items"]: + schema["type"] = schema["items"]["type"] + "[]" rows = self._simpletype(schema) return rows else: rows = _original_arraytype(self, schema) - rows.extend(self._bool_or_object(schema, 'unique')) + rows.extend(self._bool_or_object(schema, "unique")) return rows else: return _original_arraytype(self, schema) @@ -150,8 +163,8 @@ def _patched_arraytype(self, schema): def _patched_objecttype(self, schema): - if 'additionalProperties' in schema: - del schema['additionalProperties'] + if "additionalProperties" in schema: + del schema["additionalProperties"] return _original_objecttype(self, schema) @@ -167,11 +180,11 @@ def _patched_objectproperties(self, schema, key): for prop in schema[key].keys(): # insert spaces around the regexp OR operator # allowing the regexp to be split over multiple lines. 
- proplist = prop.split('|') - dispprop = self._escape(' | '.join(proplist)) - if 'required' in schema: - if prop in schema['required']: - dispprop = f'**{dispprop}**\n(required)' + proplist = prop.split("|") + dispprop = self._escape(" | ".join(proplist)) + if "required" in schema: + if prop in schema["required"]: + dispprop = f"**{dispprop}**\n(required)" label = self._cell(dispprop) if isinstance(schema[key][prop], dict): @@ -188,18 +201,18 @@ def _patched_objectproperties(self, schema, key): def _patched_complexstructures(self, schema): rows = [] - if 'oneOf' in schema: + if "oneOf" in schema: types = [] - for obj in schema['oneOf']: - if 'type' in obj: - if obj['type'] == 'object' and '$ref' in obj: + for obj in schema["oneOf"]: + if "type" in obj: + if obj["type"] == "object" and "$ref" in obj: types.extend(self._reference(obj)) - elif obj['type'] != 'null': - types.append(self._line(self._decodetype(obj['type']))) - del obj['type'] - if not list(filter(bool, schema['oneOf'])): - del schema['oneOf'] - rows.extend(self._prepend(self._cell('type'), types)) + elif obj["type"] != "null": + types.append(self._line(self._decodetype(obj["type"]))) + del obj["type"] + if not list(filter(bool, schema["oneOf"])): + del schema["oneOf"] + rows.extend(self._prepend(self._cell("type"), types)) rows.extend(_original_complexstructures(self, schema)) return rows @@ -210,7 +223,7 @@ def _patched_complexstructures(self, schema): def patched_transform(self, schema): table, definitions = original_transform(self, schema) - table['classes'] += ['jsonschema-table'] + table["classes"] += ["jsonschema-table"] return table, definitions @@ -218,20 +231,20 @@ def patched_transform(self, schema): sjs_wide_format.WideFormat.transform = patched_transform -def patched_run(self, schema, pointer=''): - if 'id' in schema: - del schema['id'] - elif '$id' in schema: - del schema['$id'] - if 'type' in schema: - del schema['type'] - if 'required' in schema and 'properties' in schema: +def 
patched_run(self, schema, pointer=""): + if "id" in schema: + del schema["id"] + elif "$id" in schema: + del schema["$id"] + if "type" in schema: + del schema["type"] + if "required" in schema and "properties" in schema: props = {} - for prop in schema['required']: - if prop in schema['properties']: - props[prop] = schema['properties'][prop] - del schema['properties'][prop] - schema['properties'] = {**props, **schema['properties']} + for prop in schema["required"]: + if prop in schema["properties"]: + props[prop] = schema["properties"][prop] + del schema["properties"][prop] + schema["properties"] = {**props, **schema["properties"]} return original_run(self, schema, pointer) @@ -240,19 +253,23 @@ def patched_run(self, schema, pointer=''): def patched_from_url(self, url): - root_schema = json.loads(streamflow.config.schema.SfSchema().dump(version='v1.0')) - defs = root_schema.get('$defs', root_schema.get('definitions', {})) + root_schema = json.loads(streamflow.config.schema.SfSchema().dump(version="v1.0")) + defs = root_schema.get("$defs", root_schema.get("definitions", {})) if url in defs: schema = defs[url] - if '$ref' in schema and not schema['$ref'].startswith('#'): - ref_id, ref_pointer = self._splitpointer(urljoin(schema['$id'], schema['$ref'])) + if "$ref" in schema and not schema["$ref"].startswith("#"): + ref_id, ref_pointer = self._splitpointer( + urljoin(schema["$id"], schema["$ref"]) + ) ref_schema = defs[ref_id] if ref_pointer: ref_schema = self.resolve_pointer(ref_schema, ref_pointer) schema = ref_schema | schema - schema['properties'] = ref_schema.get('properties', {}) | schema.get('properties', {}) - schema['properties'] = dict(sorted(schema['properties'].items())) - del schema['$ref'] + schema["properties"] = ref_schema.get("properties", {}) | schema.get( + "properties", {} + ) + schema["properties"] = dict(sorted(schema["properties"].items())) + del schema["$ref"] return json.dumps(schema), url else: raise self.error( diff --git 
a/docs/source/developer-guide/architecture/index.rst b/docs/source/developer-guide/architecture/index.rst new file mode 100644 index 000000000..dcbcad4ab --- /dev/null +++ b/docs/source/developer-guide/architecture/index.rst @@ -0,0 +1,108 @@ +============ +Architecture +============ + +.. meta:: + :keywords: StreamFlow, architecture, design, execution model, modules + :description: StreamFlow architectural design, execution model, and module structure + +Overview +======== + +This section provides a comprehensive overview of StreamFlow's architecture, from high-level design principles to detailed execution flows and module organization. + +Quick Reference +=============== + +============ ==================================== +Audience Developers and architects +Purpose Understand StreamFlow design +Time 2-3 hours +Difficulty Intermediate +============ ==================================== + +Table of Contents +================= + +.. toctree:: + :maxdepth: 2 + :titlesonly: + + overview + execution-model + data-management + module-structure + plugin-architecture + +Architecture Topics +=================== + +**Overview** + High-level architectural design and the three-layer environment stack (deployment → service → location). + + :doc:`overview` + +**Execution Model** + Detailed workflow execution flow from CWL parsing through task scheduling and remote execution. + + :doc:`execution-model` + +**Data Management** + How StreamFlow tracks data locations, optimizes transfers, and manages remote paths. + + :doc:`data-management` + +**Module Structure** + Python package organization and the role of each module in the codebase. + + :doc:`module-structure` + +**Plugin Architecture** + How the plugin system works, including discovery, loading, and extension point registration. 
+ + :doc:`plugin-architecture` + +Key Architectural Principles +============================= + +**Container-Native Design** + StreamFlow is built from the ground up to support multi-container environments and service-oriented architectures. + +**Hybrid Infrastructure Support** + Relaxes the shared data space requirement to enable execution across heterogeneous cloud and HPC resources. + +**CWL Standard Compliance** + Implements the Common Workflow Language standard (v1.0-v1.2) for workflow portability. + +**Extensibility via Plugins** + Core functionality can be extended through a well-defined plugin system with multiple extension points. + +Related Documentation +===================== + +**User Guide:** + For using StreamFlow: + + - :doc:`/user-guide/quickstart` - Get started quickly + - :doc:`/user-guide/writing-workflows` - Writing CWL workflows + +**Core Interfaces:** + For detailed interface documentation: + + - :doc:`/developer-guide/core-interfaces/index` - Core abstractions + +**Extension Points:** + For creating plugins: + + - :doc:`/developer-guide/extension-points/index` - Plugin development + +Next Steps +========== + +Start with the overview and progress through the execution model: + +1. :doc:`overview` - Understand the high-level design +2. :doc:`execution-model` - Learn how workflows execute +3. :doc:`data-management` - Understand data handling +4. :doc:`module-structure` - Explore the codebase organization +5. :doc:`plugin-architecture` - Learn the extension system diff --git a/docs/source/developer-guide/core-interfaces/index.rst b/docs/source/developer-guide/core-interfaces/index.rst new file mode 100644 index 000000000..6574a68af --- /dev/null +++ b/docs/source/developer-guide/core-interfaces/index.rst @@ -0,0 +1,132 @@ +=============== +Core Interfaces +=============== + +.. 
meta:: + :keywords: StreamFlow, core, interfaces, API, abstractions + :description: StreamFlow core interface documentation for plugin developers + +Overview +======== + +This section documents StreamFlow's core interfaces defined in the ``streamflow.core`` module. These abstractions form the foundation for all StreamFlow functionality and are essential for plugin development. + +Quick Reference +=============== + +============== ==================================== +Audience Plugin developers +Purpose Understand core abstractions +Difficulty Intermediate to Advanced +Prerequisites Python async programming +============== ==================================== + +Core Interfaces +=============== + +.. toctree:: + :maxdepth: 2 + :titlesonly: + + streamflow-context + workflow + deployment + data + persistence + scheduling + recovery + +Interface Descriptions +====================== + +**StreamFlowContext** + The central coordinator managing all StreamFlow components and their lifecycle. Entry point for accessing managers and services. + + :doc:`streamflow-context` + +**Workflow Interfaces** + Workflow, Step, Port, and Token abstractions for representing computational workflows. + + :doc:`workflow` + +**Deployment Interfaces** + Connector, Target, Service, and Location for interacting with execution environments. + + :doc:`deployment` + +**Data Interfaces** + DataManager and DataLocation for tracking and transferring data across environments. + + :doc:`data` + +**Persistence Interfaces** + Database abstraction for storing workflow metadata and provenance. + + :doc:`persistence` + +**Scheduling Interfaces** + Scheduler and Policy for resource allocation and task assignment. + + :doc:`scheduling` + +**Recovery Interfaces** + CheckpointManager and FailureManager for fault tolerance. 
+ + :doc:`recovery` + +Using Core Interfaces +====================== + +**For Plugin Development:** + +When creating plugins, you'll implement one or more of these interfaces: + +* **Connector plugin:** Implement ``Connector`` interface from :doc:`deployment` +* **Scheduler plugin:** Implement ``Scheduler`` and ``Policy`` from :doc:`scheduling` +* **Database plugin:** Implement ``Database`` from :doc:`persistence` +* **DataManager plugin:** Implement ``DataManager`` from :doc:`data` + +**For Core Development:** + +When contributing to StreamFlow core, understand these interfaces to maintain consistency and avoid breaking changes. + +Interface Contracts +=================== + +All core interfaces define contracts that implementations must honor: + +* **Async Methods:** Most methods are async and must be awaited +* **Type Hints:** All methods have complete type annotations +* **Error Handling:** Raise appropriate exceptions from ``streamflow.core.exception`` +* **Context Management:** Use StreamFlowContext for accessing other components + +Related Documentation +===================== + +**Extension Points:** + For step-by-step plugin development: + + - :doc:`/developer-guide/extension-points/creating-plugins` - Plugin tutorial + - :doc:`/developer-guide/extension-points/index` - All extension points + +**Architecture:** + For understanding how interfaces fit together: + + - :doc:`/developer-guide/architecture/overview` - System architecture + - :doc:`/developer-guide/architecture/module-structure` - Code organization + +**API Reference:** + For complete API documentation: + + - :doc:`/reference/api/index` - Auto-generated API docs + +Next Steps +========== + +Start with the context, then explore interfaces relevant to your needs: + +1. :doc:`streamflow-context` - Central coordinator (start here) +2. :doc:`deployment` - If creating connectors +3. :doc:`scheduling` - If creating schedulers +4. :doc:`data` - If working with data management +5. 
:doc:`workflow` - If extending workflow capabilities diff --git a/docs/source/developer-guide/extension-points/index.rst b/docs/source/developer-guide/extension-points/index.rst new file mode 100644 index 000000000..09f217950 --- /dev/null +++ b/docs/source/developer-guide/extension-points/index.rst @@ -0,0 +1,183 @@ +================ +Extension Points +================ + +.. meta:: + :keywords: StreamFlow, plugins, extensions, connector, scheduler, database + :description: Guide to extending StreamFlow through plugins and custom implementations + +Overview +======== + +StreamFlow provides multiple extension points that allow you to customize and extend its functionality. This section documents all available extension points and provides guidance for creating custom plugins. + +Quick Reference +=============== + +============== ========================================= +Audience Plugin developers +Purpose Create custom StreamFlow extensions +Time 4-6 hours for complete guide +Difficulty Advanced +Prerequisites Python 3.10+, async programming +============== ========================================= + +Available Extension Points +========================== + +.. 
toctree:: + :maxdepth: 2 + :titlesonly: + + creating-plugins + connector + binding-filter + cwl-docker-translator + scheduler + database + data-manager + deployment-manager + fault-tolerance + +Extension Point Overview +======================== + +======================== ============================================ ============== +Extension Point Purpose Difficulty +======================== ============================================ ============== +**Connector** Support new execution environments Intermediate +**Scheduler** Custom task scheduling policies Advanced +**Database** Alternative persistence backends Intermediate +**DataManager** Custom data transfer strategies Advanced +**BindingFilter** Custom binding selection logic Intermediate +**CWLDockerTranslator** Custom CWL DockerRequirement handling Advanced +**DeploymentManager** Alternative deployment lifecycle management Advanced +**CheckpointManager** Custom checkpointing strategies Advanced +**FailureManager** Custom failure handling policies Advanced +======================== ============================================ ============== + +Getting Started +=============== + +**New to Plugin Development?** + +Start with the step-by-step tutorial: + +:doc:`creating-plugins` - Complete guide to creating your first plugin + +**Choose Your Extension Point:** + +* **Want to support a new execution environment?** → :doc:`connector` +* **Need custom scheduling logic?** → :doc:`scheduler` +* **Want different data transfer behavior?** → :doc:`data-manager` +* **Need custom persistence?** → :doc:`database` +* **Want to customize binding selection?** → :doc:`binding-filter` + +Plugin Development Workflow +============================ + +1. **Choose Extension Point:** Identify which interface to implement +2. **Study the Interface:** Read the documentation and existing implementations +3. **Implement the Plugin:** Create your plugin class implementing the interface +4. 
**Register the Plugin:** Use StreamFlow's plugin registration mechanism +5. **Test Locally:** Verify your plugin works as expected +6. **Package & Distribute:** Share your plugin with others (optional) + +Example: Connector Plugin Structure +==================================== + +.. code-block:: python + + from streamflow.core.deployment import Connector + from streamflow.core.context import StreamFlowContext + + class MyConnector(Connector): + def __init__( + self, + deployment_name: str, + config_dir: str, + # ... your config parameters + ) -> None: + super().__init__(deployment_name, config_dir) + # Initialize your connector + + async def deploy(self, external: bool) -> None: + # Implement deployment logic + pass + + async def run( + self, + location: ExecutionLocation, + command: MutableSequence[str], + # ... other parameters + ) -> tuple[Any, int]: + # Implement command execution + pass + + # Implement other required methods... + +Built-in Implementations +======================== + +StreamFlow includes several built-in implementations you can reference: + +**Connectors:** + - Docker, DockerCompose, Kubernetes, Helm3 + - SSH, Slurm, PBS, Flux + - Singularity, OccAM + + See :doc:`/reference/index` + +**Schedulers:** + - DataLocalityPolicy (default) + + See :doc:`scheduler` + +**Databases:** + - SQLite (default) + + See :doc:`database` + +**Binding Filters:** + - ShuffleBindingFilter, MatchBindingFilter + + See :doc:`binding-filter` + +Related Documentation +===================== + +**Core Interfaces:** + For detailed interface documentation: + + - :doc:`/developer-guide/core-interfaces/index` - Core abstractions + +**Architecture:** + For understanding the plugin system: + + - :doc:`/developer-guide/architecture/plugin-architecture` - Plugin mechanism + +**Reference:** + For configuration schemas: + + - :doc:`/reference/configuration/index` - Plugin configuration + +Contributing Plugins +==================== + +If you've created a useful plugin, consider 
contributing it: + +* **To StreamFlow Core:** Open a pull request with your connector +* **As Separate Package:** Publish to PyPI as ``streamflow-plugin-{name}`` +* **Examples/Documentation:** Share in GitHub discussions + +See :doc:`/developer-guide/contributing` for contribution guidelines. + +Next Steps +========== + +Start building your plugin: + +1. :doc:`creating-plugins` - Step-by-step tutorial +2. Choose your extension point from the list above +3. Study :doc:`/developer-guide/core-interfaces/index` for interface details +4. Review :doc:`/developer-guide/code-style` for coding standards diff --git a/docs/source/developer-guide/index.rst b/docs/source/developer-guide/index.rst new file mode 100644 index 000000000..d2a37ce98 --- /dev/null +++ b/docs/source/developer-guide/index.rst @@ -0,0 +1,152 @@ +=============== +Developer Guide +=============== + +.. meta:: + :keywords: StreamFlow, developer, architecture, plugin, extension, API + :description: Comprehensive guide for StreamFlow developers: architecture, core interfaces, and plugin development + +Overview +======== + +The Developer Guide provides in-depth information about StreamFlow's architecture, core abstractions, and extension mechanisms. This guide is essential for developers who want to understand StreamFlow's internals, contribute to the codebase, or create custom extensions. 
+ +Quick Reference +=============== + +============== ==================================== +Audience Developers and contributors +Purpose Understand and extend StreamFlow +Time 4-6 hours for complete guide +Difficulty Intermediate to Advanced +Prerequisites Python 3.10+, async programming +============== ==================================== + +Who Should Read This +==================== + +This guide is for: + +* **Core Contributors:** Developers contributing to StreamFlow codebase +* **Plugin Developers:** Creating custom connectors, schedulers, or other extensions +* **Architecture Enthusiasts:** Understanding StreamFlow's design and implementation +* **System Developers:** Integrating StreamFlow into larger systems + +Table of Contents +================= + +Architecture +------------ + +Understand StreamFlow's design, execution model, and module structure: + +.. toctree:: + :maxdepth: 2 + :titlesonly: + + architecture/index + +Development Setup +----------------- + +Set up your development environment and learn the contribution workflow: + +.. toctree:: + :maxdepth: 2 + :titlesonly: + + development-setup + testing-guide + code-style + contributing + +Core Interfaces +--------------- + +Deep dive into StreamFlow's core abstractions for plugin development: + +.. toctree:: + :maxdepth: 2 + :titlesonly: + + core-interfaces/index + +Extension Points +---------------- + +Learn how to create custom plugins and extend StreamFlow: + +.. toctree:: + :maxdepth: 2 + :titlesonly: + + extension-points/index + +Quick Navigation +================ + +**New to StreamFlow Development?** + Start with :doc:`architecture/overview` to understand the system design. + +**Want to Create a Plugin?** + Go to :doc:`extension-points/creating-plugins` for a step-by-step tutorial. + +**Contributing Code?** + Read :doc:`development-setup` and :doc:`contributing` for guidelines. + +**Need API Reference?** + See :doc:`/reference/api/index` for complete API documentation. 
+ +Key Architecture Concepts +========================= + +**StreamFlowContext** + The central coordinator managing all StreamFlow components and their lifecycle. + +**Connector** + Interface for interacting with execution environments (Docker, Kubernetes, HPC, etc.). + +**Scheduler** + Component responsible for assigning workflow tasks to available resources. + +**DataManager** + Handles data transfer and location tracking across different execution environments. + +**Database** + Persistence layer for workflow metadata, provenance, and checkpointing. + +Related Documentation +===================== + +**User Guide:** + For using StreamFlow (not developing it): + + - :doc:`/user-guide/quickstart` - Get started in 10 minutes + - :doc:`/user-guide/installation` - Installation instructions + - :doc:`/user-guide/writing-workflows` - Writing CWL workflows + +**Reference:** + For detailed API and configuration reference: + + - :doc:`/reference/api/index` - Auto-generated API documentation + - :doc:`/reference/configuration/index` - Configuration schemas + +Contributing to StreamFlow +=========================== + +We welcome contributions! Please see: + +* :doc:`contributing` - Contribution guidelines +* :doc:`code-style` - Coding standards and best practices +* :doc:`testing-guide` - How to write and run tests +* `GitHub Repository <https://github.com/alpha-unito/streamflow>`_ - Source code + +Next Steps +========== + +Choose your path: + +* :doc:`architecture/overview` - Understand the architecture +* :doc:`development-setup` - Set up your environment +* :doc:`extension-points/creating-plugins` - Build your first plugin +* :doc:`core-interfaces/index` - Learn the core abstractions diff --git a/docs/source/index.rst b/docs/source/index.rst index 40d54ace4..3ac9e1b53 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -35,8 +35,42 @@ For LaTeX users, the following BibTeX entry can be used: } ..
toctree:: - :caption: Getting Started + :caption: User Guide + :maxdepth: 2 + :titlesonly: + + user-guide/installation + user-guide/quickstart + user-guide/writing-workflows + user-guide/configuring-deployments + user-guide/binding-workflows + user-guide/running-workflows + user-guide/inspecting-results + user-guide/advanced-patterns/index + user-guide/troubleshooting + +.. toctree:: + :caption: Developer Guide + :maxdepth: 2 + :titlesonly: + + developer-guide/index + +.. toctree:: + :caption: Reference + :maxdepth: 2 + :titlesonly: + + reference/cli/index + reference/configuration/index + reference/connectors/index + reference/cwl-docker-translators/index + reference/glossary + +.. toctree:: + :caption: Getting Started (Legacy) :hidden: + :titlesonly: guide/install.rst guide/architecture.rst @@ -47,24 +81,27 @@ For LaTeX users, the following BibTeX entry can be used: guide/inspect.rst .. toctree:: - :caption: Advanced Features + :caption: Advanced Features (Legacy) :hidden: + :titlesonly: advanced/multiple-targets.rst advanced/port-targets.rst advanced/stacked-locations.rst .. toctree:: - :caption: CWL Standard + :caption: CWL Standard (Legacy) :hidden: + :titlesonly: cwl/cwl-conformance.rst cwl/cwl-runner.rst cwl/docker-requirement.rst .. toctree:: - :caption: Extension Points + :caption: Extension Points (Legacy) :hidden: + :titlesonly: ext/plugins.rst ext/binding-filter.rst @@ -77,8 +114,9 @@ For LaTeX users, the following BibTeX entry can be used: ext/scheduling.rst .. toctree:: - :caption: Connectors + :caption: Connectors (Legacy) :hidden: + :titlesonly: connector/container.rst connector/docker.rst @@ -94,8 +132,9 @@ For LaTeX users, the following BibTeX entry can be used: connector/ssh.rst .. 
toctree:: - :caption: CWL Docker Translators + :caption: CWL Docker Translators (Legacy) :hidden: + :titlesonly: cwl/docker/docker.rst cwl/docker/kubernetes.rst diff --git a/docs/source/reference/cli/cwl-runner.rst b/docs/source/reference/cli/cwl-runner.rst new file mode 100644 index 000000000..ca966f8a4 --- /dev/null +++ b/docs/source/reference/cli/cwl-runner.rst @@ -0,0 +1,127 @@ +=========== +cwl-runner +=========== + +.. meta:: + :keywords: StreamFlow, CWL, cwl-runner, Common Workflow Language + :description: CWL standard runner interface for StreamFlow + +Synopsis +======== + +.. code-block:: bash + + cwl-runner [OPTIONS] WORKFLOW INPUTS + +Description +=========== + +Execute CWL workflows using the CWL standard ``cwl-runner`` interface. StreamFlow implements this interface for compatibility with CWL-based tools and pipelines. + +Arguments +========= + +``WORKFLOW`` + Path to the CWL workflow file (``.cwl``). + + **Required:** Yes + +``INPUTS`` + Path to the inputs file (typically YAML or JSON). + + **Required:** Yes + +Options +======= + +``--streamflow-file FILE`` + Path to StreamFlow configuration file for deployment and binding configuration. + + **Optional:** If not specified, runs locally + +``--debug`` + Enable debug output. + +``--quiet`` + Suppress informational messages. + +``--outdir DIRECTORY`` + Output directory for results. + + **Default:** Current directory + +Examples +======== + +**Basic CWL Execution:** + +.. code-block:: bash + + cwl-runner workflow.cwl inputs.yml + +**With StreamFlow Configuration:** + +.. code-block:: bash + + cwl-runner --streamflow-file streamflow.yml workflow.cwl inputs.yml + +**Custom Output Directory:** + +.. code-block:: bash + + cwl-runner --outdir /path/to/results workflow.cwl inputs.yml + +**Debug Mode:** + +.. 
code-block:: bash + + cwl-runner --debug workflow.cwl inputs.yml + +CWL Compatibility +================= + +StreamFlow implements CWL standard versions: + +* **CWL v1.0** - Fully supported +* **CWL v1.1** - Fully supported +* **CWL v1.2** - Fully supported + +For conformance details, see the CWL conformance documentation. + +Differences from streamflow run +================================ + +The ``cwl-runner`` interface: + +* **CWL Standard:** Implements the CWL reference interface +* **Compatibility:** Works with CWL tools expecting ``cwl-runner`` +* **Simpler:** Takes workflow + inputs directly +* **Limited Config:** Uses ``--streamflow-file`` for StreamFlow-specific configuration + +The ``streamflow run`` command: + +* **StreamFlow Native:** Uses StreamFlow configuration format +* **Full Features:** Access to all StreamFlow features +* **Configuration:** Single YAML file with workflow, deployments, bindings + +Related Commands +================ + +* :doc:`run` - Native StreamFlow execution interface + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/writing-workflows` - Writing CWL workflows + - :doc:`/user-guide/running-workflows` - Running workflows + +**External Resources:** + - `Common Workflow Language <https://www.commonwl.org/>`_ + - `CWL User Guide <https://www.commonwl.org/user_guide/>`_ + +See Also +======== + +* :doc:`run` - For StreamFlow native interface +* `cwltool <https://github.com/common-workflow-language/cwltool>`_ - CWL reference implementation diff --git a/docs/source/reference/cli/ext.rst b/docs/source/reference/cli/ext.rst new file mode 100644 index 000000000..6b51a58e8 --- /dev/null +++ b/docs/source/reference/cli/ext.rst @@ -0,0 +1,80 @@ +=============== +streamflow ext +=============== + +.. meta:: + :keywords: StreamFlow, CLI, extensions, connectors + :description: List StreamFlow extensions + +Synopsis +======== + +..
code-block:: bash + + streamflow ext {list,show} [OPTIONS] + +Description +=========== + +List and inspect available StreamFlow extensions including built-in connectors, schedulers, and other extension points. + +Subcommands +=========== + +``list`` + List all available StreamFlow extensions. + + .. code-block:: bash + + streamflow ext list + +``show`` + Show details of a specific StreamFlow extension. + + .. code-block:: bash + + streamflow ext show EXTENSION_NAME + +Examples +======== + +**List All Extensions:** + +.. code-block:: bash + + streamflow ext list + +**Show Extension Details:** + +.. code-block:: bash + + streamflow ext show docker + +Output +====== + +Lists available extension types: + +* **Connectors:** Docker, Kubernetes, Slurm, SSH, etc. +* **Schedulers:** Data locality scheduler, etc. +* **Binding Filters:** Shuffle, match filters +* **Other Extensions:** Custom plugins + +Related Commands +================ + +* :doc:`plugin` - Manage installed plugins + +Related Documentation +===================== + +**Reference:** + - :doc:`/reference/connectors/index` - Connector documentation + +**Developer Guide:** + - :doc:`/developer-guide/extension-points/index` - Extension points + +See Also +======== + +* :doc:`plugin` - For plugin management diff --git a/docs/source/reference/cli/index.rst b/docs/source/reference/cli/index.rst new file mode 100644 index 000000000..e32bbdd3e --- /dev/null +++ b/docs/source/reference/cli/index.rst @@ -0,0 +1,160 @@ +====================== +Command-Line Interface +====================== + +.. meta:: + :keywords: StreamFlow, CLI, command line, streamflow run, commands + :description: Complete StreamFlow command-line interface reference + +Overview +======== + +StreamFlow provides a comprehensive command-line interface (CLI) for executing workflows, inspecting results, and managing plugins. This section documents all available commands and their options. 
+ +Quick Reference +=============== + +============ ==================================== +Purpose CLI command reference +Audience All users +Organization By command +Usage Lookup command details +============ ==================================== + +Available Commands +================== + +.. toctree:: + :maxdepth: 2 + :titlesonly: + + run + list + report + prov + plugin + ext + schema + cwl-runner + +Command Summary +=============== + +Core Commands +------------- + +===================== =========================================== +Command Purpose +===================== =========================================== +``streamflow run`` Execute a workflow +``streamflow list`` List executed workflows +``streamflow report`` Generate execution report +``streamflow prov`` Export provenance archive +===================== =========================================== + +Utility Commands +---------------- + +======================== =========================================== +Command Purpose +======================== =========================================== +``streamflow plugin`` Manage installed plugins +``streamflow ext`` List available extensions +``streamflow schema`` Dump configuration schema +``streamflow version`` Show StreamFlow version +======================== =========================================== + +Alternative Interfaces +---------------------- + +===================== =========================================== +Command Purpose +===================== =========================================== +``cwl-runner`` CWL standard runner interface +===================== =========================================== + +Getting Help +============ + +For any command, use ``--help`` to see detailed usage: + +.. 
code-block:: bash + + streamflow --help + streamflow run --help + streamflow list --help + +Common Options +============== + +Many commands share these common options: + +``--log-level`` + Set logging verbosity (DEBUG, INFO, WARNING, ERROR, CRITICAL) + +``--config`` + Path to StreamFlow configuration file (default: ``streamflow.yml``) + +Exit Codes +========== + +StreamFlow uses standard exit codes: + +==== =========================================== +Code Meaning +==== =========================================== +0 Success +1 General error +2 Configuration error +3 Workflow execution error +==== =========================================== + +Related Documentation +===================== + +**User Guide:** + For tutorials on using commands: + + - :doc:`/user-guide/running-workflows` - Running workflows + - :doc:`/user-guide/inspecting-results` - Inspection commands + +**Configuration:** + For configuration file reference: + + - :doc:`/reference/configuration/streamflow-yml` - Main config file + +Examples +======== + +**Run a workflow:** + +.. code-block:: bash + + streamflow run streamflow.yml + +**List all workflows:** + +.. code-block:: bash + + streamflow list + +**Generate report:** + +.. code-block:: bash + + streamflow report + +**Debug mode:** + +.. code-block:: bash + + streamflow run --log-level DEBUG streamflow.yml + +Next Steps +========== + +Explore command documentation: + +* :doc:`run` - Most commonly used command +* :doc:`list` - Inspect workflow history +* :doc:`report` - Generate execution reports diff --git a/docs/source/reference/cli/list.rst b/docs/source/reference/cli/list.rst new file mode 100644 index 000000000..a27eeac1f --- /dev/null +++ b/docs/source/reference/cli/list.rst @@ -0,0 +1,89 @@ +================ +streamflow list +================ + +.. meta:: + :keywords: StreamFlow, CLI, list, workflows + :description: List executed StreamFlow workflows + +Synopsis +======== + +.. 
code-block:: bash + + streamflow list [OPTIONS] [NAME] + +Description +=========== + +List all executed workflows stored in the StreamFlow database. Can list all workflows or filter by workflow name. + +Arguments +========= + +``NAME`` + List all executions for the given workflow name. + + **Optional:** If omitted, lists all workflows + +Options +======= + +``-h, --help`` + Show help message and exit. + +``--file FILE, -f FILE`` + Path to the StreamFlow configuration file. Uses the database configured in that file. + + **Default:** Uses default database location + +Examples +======== + +**List All Workflows:** + +.. code-block:: bash + + streamflow list + +**List Executions for Specific Workflow:** + +.. code-block:: bash + + streamflow list my-workflow + +**Use Custom Configuration File:** + +.. code-block:: bash + + streamflow list --file streamflow.yml my-workflow + +Output Format +============= + +The command outputs a table with the following columns: + +* **Name:** Workflow name +* **Status:** Execution status (completed, failed, running) +* **Start Time:** When execution started +* **End Time:** When execution finished +* **Duration:** Total execution time + +Related Commands +================ + +* :doc:`run` - Execute a workflow +* :doc:`report` - Generate detailed execution report + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/inspecting-results` - Inspecting workflow results + - :doc:`/user-guide/running-workflows` - Running workflows + +See Also +======== + +* :doc:`report` - For detailed reports +* :doc:`prov` - For provenance data diff --git a/docs/source/reference/cli/plugin.rst b/docs/source/reference/cli/plugin.rst new file mode 100644 index 000000000..ba72fce1e --- /dev/null +++ b/docs/source/reference/cli/plugin.rst @@ -0,0 +1,67 @@ +================== +streamflow plugin +================== + +.. 
meta:: + :keywords: StreamFlow, CLI, plugin, extensions + :description: Manage StreamFlow plugins + +Synopsis +======== + +.. code-block:: bash + + streamflow plugin {list,show} [OPTIONS] + +Description +=========== + +Manage installed StreamFlow plugins. View available plugins and their details. + +Subcommands +=========== + +``list`` + List all installed StreamFlow plugins. + + .. code-block:: bash + + streamflow plugin list + +``show`` + Show details of a specific StreamFlow plugin. + + .. code-block:: bash + + streamflow plugin show PLUGIN_NAME + +Examples +======== + +**List All Plugins:** + +.. code-block:: bash + + streamflow plugin list + +**Show Plugin Details:** + +.. code-block:: bash + + streamflow plugin show my-connector-plugin + +Related Commands +================ + +* :doc:`ext` - List available extensions + +Related Documentation +===================== + +**Developer Guide:** + - :doc:`/developer-guide/extension-points/index` - Creating plugins + +See Also +======== + +* :doc:`ext` - For extension management diff --git a/docs/source/reference/cli/prov.rst b/docs/source/reference/cli/prov.rst new file mode 100644 index 000000000..ac79aac98 --- /dev/null +++ b/docs/source/reference/cli/prov.rst @@ -0,0 +1,157 @@ +================ +streamflow prov +================ + +.. meta:: + :keywords: StreamFlow, CLI, provenance, RO-Crate + :description: Export provenance data for StreamFlow workflows + +Synopsis +======== + +.. code-block:: bash + + streamflow prov [OPTIONS] WORKFLOW + +Description +=========== + +Export workflow provenance data as a structured archive. StreamFlow supports RO-Crate format for capturing comprehensive workflow execution provenance. + +Arguments +========= + +``WORKFLOW`` + Name of the workflow to process. + + **Required:** Yes + +Options +======= + +``-h, --help`` + Show help message and exit. + +``--add-file ADD_FILE`` + Add an external file to the provenance archive. 
File properties are specified as comma-separated key-value pairs. + + **Required properties:** + + * ``src`` - Source file path (mandatory) + * ``dst`` - Destination path in archive (default: ``/``) + + Additional properties can be specified as strings or JSON objects. + +``--add-property ADD_PROPERTY`` + Add a property to the archive manifest. Properties are specified as comma-separated key-value pairs and can be strings or JSON objects. + +``--all, -a`` + Include all executions of the selected workflow. If false, include only the last execution. + + **Default:** false + +``--file FILE, -f FILE`` + Path to the StreamFlow configuration file. + + **Default:** Uses default database location + +``--name NAME`` + Name of the generated archive. + + **Default:** ``${WORKFLOW_NAME}.crate.zip`` + +``--outdir OUTDIR`` + Directory where the archive should be created. + + **Default:** Current directory + +``--type {run_crate}, -t {run_crate}`` + Type of provenance archive to generate. + + **Available types:** + + * ``run_crate`` - RO-Crate workflow run provenance + + **Default:** run_crate + +Examples +======== + +**Export Provenance Archive:** + +.. code-block:: bash + + streamflow prov my-workflow + +**Include All Executions:** + +.. code-block:: bash + + streamflow prov --all my-workflow + +**Custom Archive Name:** + +.. code-block:: bash + + streamflow prov --name my-archive.zip my-workflow + +**Custom Output Directory:** + +.. code-block:: bash + + streamflow prov --outdir /path/to/archives my-workflow + +**Add External File:** + +.. code-block:: bash + + streamflow prov --add-file src=/path/to/file.txt,dst=/docs/file.txt my-workflow + +**Add Metadata Property:** + +.. 
code-block:: bash + + streamflow prov --add-property author="John Doe",license=MIT my-workflow + +Archive Contents +================ + +The generated RO-Crate archive includes: + +* **Workflow Definition:** CWL workflow files +* **Execution Metadata:** Timestamps, status, parameters +* **Input/Output Data:** Links or copies of data files +* **RO-Crate Metadata:** JSON-LD manifest describing archive contents +* **Provenance Information:** Complete execution trace + +RO-Crate Format +=============== + +StreamFlow generates provenance archives following the `RO-Crate <https://www.researchobject.org/ro-crate/>`_ specification, which provides: + +* **Interoperability:** Standard format for workflow provenance +* **Reproducibility:** Captures complete execution context +* **FAIR Principles:** Findable, Accessible, Interoperable, Reusable data + +Related Commands +================ + +* :doc:`report` - Generate execution report +* :doc:`list` - List executed workflows +* :doc:`run` - Execute a workflow + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/inspecting-results` - Inspecting workflow results + +**External Resources:** + - `RO-Crate Specification <https://www.researchobject.org/ro-crate/>`_ + - `Workflow Run Crate <https://www.researchobject.org/workflow-run-crate/>`_ + +See Also +======== + +* :doc:`report` - For execution reports +* :doc:`list` - For listing workflows diff --git a/docs/source/reference/cli/report.rst b/docs/source/reference/cli/report.rst new file mode 100644 index 000000000..35b37fa9d --- /dev/null +++ b/docs/source/reference/cli/report.rst @@ -0,0 +1,145 @@ +================== +streamflow report +================== + +.. meta:: + :keywords: StreamFlow, CLI, report, execution report + :description: Generate execution reports for StreamFlow workflows + +Synopsis +======== + +.. code-block:: bash + + streamflow report [OPTIONS] WORKFLOWS + +Description +=========== + +Generate execution reports for one or more workflows. Reports can be generated in multiple formats including HTML, PDF, and JSON.
+ +Arguments +========= + +``WORKFLOWS`` + Comma-separated list of workflow names to process. + + **Required:** Yes + +Options +======= + +``-h, --help`` + Show help message and exit. + +``--all, -a`` + Include all executions of the selected workflow(s). If false, include only the last execution. + + **Default:** false + +``--file FILE, -f FILE`` + Path to the StreamFlow configuration file. + + **Default:** Uses default database location + +``--format [{html,pdf,eps,png,jpg,webp,svg,csv,json} ...]`` + Report output format(s). Can specify multiple formats. + + **Available formats:** + + * ``html`` - Interactive HTML report (default) + * ``pdf`` - PDF document + * ``eps`` - Encapsulated PostScript + * ``png`` - PNG image + * ``jpg`` - JPEG image + * ``webp`` - WebP image + * ``svg`` - SVG vector graphic + * ``csv`` - CSV data export + * ``json`` - JSON data export + + **Default:** html + +``--group-by-step`` + Group execution of multiple instances of the same step on a single line. + + **Default:** Disabled + +``--name NAME`` + Name of the report folder. + + **Default:** ``${WORKFLOW}-report`` + +``--outdir OUTDIR`` + Output directory to store the report file. + + **Default:** Current directory + +Examples +======== + +**Generate HTML Report:** + +.. code-block:: bash + + streamflow report my-workflow + +**Generate Multiple Format Reports:** + +.. code-block:: bash + + streamflow report --format html pdf json my-workflow + +**Include All Executions:** + +.. code-block:: bash + + streamflow report --all my-workflow + +**Custom Output Directory:** + +.. code-block:: bash + + streamflow report --outdir /path/to/reports my-workflow + +**Multiple Workflows:** + +.. code-block:: bash + + streamflow report workflow1,workflow2,workflow3 + +**Group by Step:** + +.. 
code-block:: bash + + streamflow report --group-by-step my-workflow + +Report Contents +=============== + +The generated report includes: + +* **Workflow Overview:** Name, status, duration +* **Execution Timeline:** Visual timeline of step executions +* **Step Details:** Individual step execution times and statuses +* **Resource Usage:** CPU, memory, and other resource metrics +* **Data Transfers:** Information about data movements between locations + +Related Commands +================ + +* :doc:`list` - List executed workflows +* :doc:`prov` - Export provenance data +* :doc:`run` - Execute a workflow + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/inspecting-results` - Inspecting workflow results + - :doc:`/user-guide/troubleshooting` - Troubleshooting guide + +See Also +======== + +* :doc:`list` - For listing workflows +* :doc:`prov` - For provenance archives diff --git a/docs/source/reference/cli/run.rst b/docs/source/reference/cli/run.rst new file mode 100644 index 000000000..f5d455a3c --- /dev/null +++ b/docs/source/reference/cli/run.rst @@ -0,0 +1,126 @@ +============== +streamflow run +============== + +.. meta:: + :keywords: StreamFlow, CLI, run, execute workflow + :description: Execute a StreamFlow workflow + +Synopsis +======== + +.. code-block:: bash + + streamflow run [OPTIONS] STREAMFLOW_FILE + +Description +=========== + +Execute a workflow defined in a StreamFlow configuration file. This is the primary command for running workflows. + +Arguments +========= + +``STREAMFLOW_FILE`` + Path to the StreamFlow configuration file (typically ``streamflow.yml``) describing the workflow execution. + + **Required:** Yes + +Options +======= + +``-h, --help`` + Show help message and exit. + +``--color`` + Print log output with colors related to the logging level. + + **Default:** Disabled + +``--debug`` + Print debug-level diagnostic output. Useful for troubleshooting workflow execution issues. 
+ + **Default:** Disabled + +``--name [NAME]`` + Name of the current workflow. Used for search and indexing in the database. + + **Default:** Derived from workflow file name + +``--outdir OUTDIR`` + Output directory to store final results of the workflow. + + **Default:** Current directory + +``--quiet`` + Only print results, warnings, and errors. Suppresses informational messages. + + **Default:** Disabled + +Examples +======== + +**Basic Execution:** + +.. code-block:: bash + + streamflow run streamflow.yml + +**With Debug Output:** + +.. code-block:: bash + + streamflow run --debug streamflow.yml + +**Custom Output Directory:** + +.. code-block:: bash + + streamflow run --outdir /path/to/results streamflow.yml + +**Named Workflow:** + +.. code-block:: bash + + streamflow run --name my-workflow streamflow.yml + +**Quiet Mode:** + +.. code-block:: bash + + streamflow run --quiet streamflow.yml + +Exit Codes +========== + +==== ================================ +Code Meaning +==== ================================ +0 Workflow completed successfully +1 General error +2 Configuration error +3 Workflow execution error +==== ================================ + +Related Commands +================ + +* :doc:`list` - List executed workflows +* :doc:`report` - Generate execution report +* :doc:`prov` - Export provenance data + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/running-workflows` - Running workflows tutorial + - :doc:`/user-guide/troubleshooting` - Troubleshooting guide + +**Configuration:** + - :doc:`/reference/configuration/streamflow-yml` - Configuration file reference + +See Also +======== + +* ``streamflow --help`` - Show general help +* ``cwl-runner`` - CWL standard runner interface diff --git a/docs/source/reference/cli/schema.rst b/docs/source/reference/cli/schema.rst new file mode 100644 index 000000000..d44338bd4 --- /dev/null +++ b/docs/source/reference/cli/schema.rst @@ -0,0 +1,96 @@ +================== 
+streamflow schema +================== + +.. meta:: + :keywords: StreamFlow, CLI, schema, JSON schema, validation + :description: Dump StreamFlow configuration schema + +Synopsis +======== + +.. code-block:: bash + + streamflow schema [OPTIONS] [VERSION] + +Description +=========== + +Dump the JSON schema for StreamFlow configuration files. Useful for validation and IDE autocomplete. + +Arguments +========= + +``VERSION`` + Version of the StreamFlow schema to print. + + **Optional:** Defaults to latest version + +Options +======= + +``-h, --help`` + Show help message and exit. + +``--pretty`` + Format JSON output with indentation for readability. + + **Default:** Disabled + +Examples +======== + +**Dump Current Schema:** + +.. code-block:: bash + + streamflow schema + +**Pretty-Printed Schema:** + +.. code-block:: bash + + streamflow schema --pretty + +**Specific Version:** + +.. code-block:: bash + + streamflow schema v1.0 + +**Save to File:** + +.. code-block:: bash + + streamflow schema --pretty > streamflow-schema.json + +Usage +===== + +The generated JSON schema can be used for: + +* **Validation:** Validate configuration files against the schema +* **IDE Support:** Enable autocomplete in editors with JSON schema support +* **Documentation:** Generate configuration documentation +* **Tools:** Build configuration tools and validators + +JSON Schema Support +=================== + +Many editors support JSON schema for YAML files: + +* **VS Code:** Add ``$schema`` field to your YAML files +* **IntelliJ IDEA:** Configure schema mapping +* **vim/neovim:** Use YAML language server with schema support + +Related Documentation +===================== + +**Configuration:** + - :doc:`/reference/configuration/index` - Configuration reference + - :doc:`/reference/configuration/streamflow-yml` - Main config file + +See Also +======== + +* `JSON Schema <https://json-schema.org/>`_ - JSON Schema specification diff --git a/docs/source/reference/configuration/binding-config.rst
b/docs/source/reference/configuration/binding-config.rst new file mode 100644 index 000000000..834f4bff0 --- /dev/null +++ b/docs/source/reference/configuration/binding-config.rst @@ -0,0 +1,174 @@ +===================== +Binding Configuration +===================== + +.. meta:: + :keywords: StreamFlow, binding, configuration, step mapping + :description: Binding configuration reference for StreamFlow + +Overview +======== + +Binding configuration associates workflow steps with deployment targets, specifying where each step executes. + +Configuration Structure +======================= + +.. code-block:: yaml + + bindings: + - step: /step-name + target: + deployment: deployment-name + # Additional target options + +Fields Reference +================ + +``step`` + CWL step name to bind. + + **Type:** String + **Required:** Yes + **Format:** Must start with ``/``, use ``/*`` for all steps + +``target`` + Target deployment specification. + + **Type:** Object or Array + **Required:** Yes + +``target.deployment`` + Name of the deployment defined in the ``deployments`` section. + + **Type:** String + **Required:** Yes + +``target.service`` + Specific service within the deployment (for multi-service deployments). + + **Type:** String + **Required:** No + +``target.locations`` + Number of parallel locations to create. + + **Type:** Integer + **Required:** No + **Default:** 1 + +Examples +======== + +**Basic Binding:** + +.. code-block:: yaml + + bindings: + - step: /my-step + target: + deployment: docker-env + +**Bind All Steps:** + +.. code-block:: yaml + + bindings: + - step: /* + target: + deployment: docker-env + +**Multiple Bindings:** + +.. code-block:: yaml + + bindings: + - step: /preprocess + target: + deployment: local-env + + - step: /compute + target: + deployment: hpc-cluster + + - step: /postprocess + target: + deployment: docker-env + +**Parallel Locations:** + +..
code-block:: yaml + + bindings: + - step: /parallel-task + target: + deployment: docker-env + locations: 10 + +**Multiple Targets:** + +.. code-block:: yaml + + bindings: + - step: /distributed-task + target: + - deployment: cloud-env-1 + - deployment: cloud-env-2 + - deployment: hpc-cluster + +**Port-Specific Targets:** + +.. code-block:: yaml + + bindings: + - step: /data-processing + target: + deployment: compute-env + - step: /data-processing/input-data + target: + deployment: storage-env + +Advanced Patterns +================= + +For advanced binding patterns, see: + +* :doc:`/user-guide/advanced-patterns/multiple-targets` - Multiple deployment targets +* :doc:`/user-guide/advanced-patterns/port-targets` - Port-specific bindings +* :doc:`/user-guide/advanced-patterns/stacked-locations` - Nested deployments + +Binding Filters +=============== + +Filters can customize binding behavior: + +.. code-block:: yaml + + bindings: + - step: /my-step + target: + - deployment: env-1 + - deployment: env-2 + filters: + - my-filter + + filters: + my-filter: + type: shuffle + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/binding-workflows` - Binding guide + - :doc:`/user-guide/advanced-patterns/index` - Advanced patterns + +**Configuration:** + - :doc:`streamflow-yml` - Main configuration file + - :doc:`deployment-config` - Deployment configuration + +See Also +======== + +* :doc:`/user-guide/binding-workflows` - Complete binding guide +* :doc:`/user-guide/advanced-patterns/index` - Advanced binding patterns diff --git a/docs/source/reference/configuration/deployment-config.rst b/docs/source/reference/configuration/deployment-config.rst new file mode 100644 index 000000000..309b223ad --- /dev/null +++ b/docs/source/reference/configuration/deployment-config.rst @@ -0,0 +1,162 @@ +======================== +Deployment Configuration +======================== + +..
meta:: + :keywords: StreamFlow, deployment, configuration, connectors + :description: Deployment configuration reference for StreamFlow + +Overview +======== + +Deployment configuration defines execution environments where workflow tasks run. + +Configuration Structure +======================= + +.. code-block:: yaml + + deployments: + deployment-name: + type: connector-type + config: + # Connector-specific configuration + wraps: other-deployment # Optional + +Fields Reference +================ + +``type`` + Connector type identifier. + + **Type:** String + **Required:** Yes + **Values:** ``local``, ``docker``, ``kubernetes``, ``ssh``, ``slurm``, ``pbs``, ``singularity``, etc. + +``config`` + Connector-specific configuration. + + **Type:** Object + **Required:** Depends on connector + **Format:** See connector documentation + +``wraps`` + Name of another deployment to wrap (for stacked deployments). + + **Type:** String + **Required:** No + +Connector Types +=============== + +**Container Connectors:** + +* ``docker`` - Docker containers +* ``singularity`` - Singularity/Apptainer containers + +**Cloud Connectors:** + +* ``kubernetes`` - Kubernetes pods + +**HPC Connectors:** + +* ``ssh`` - Remote SSH hosts +* ``slurm`` - Slurm batch scheduler +* ``pbs`` - PBS/Torque scheduler + +**Other:** + +* ``local`` - Local execution + +Examples +======== + +**Local Deployment:** + +.. code-block:: yaml + + deployments: + local-env: + type: local + +**Docker Deployment:** + +.. code-block:: yaml + + deployments: + docker-env: + type: docker + config: + image: python:3.10 + volume: + - /data:/data + +**Kubernetes Deployment:** + +.. code-block:: yaml + + deployments: + k8s-env: + type: kubernetes + config: + kubeconfig: ~/.kube/config + namespace: streamflow + +**Slurm Deployment:** + +.. 
code-block:: yaml + + deployments: + slurm-env: + type: slurm + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + maxConcurrentJobs: 10 + +**Stacked Deployment:** + +.. code-block:: yaml + + deployments: + base-slurm: + type: slurm + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + + container-on-slurm: + type: singularity + config: + image: docker://python:3.10 + wraps: base-slurm + +Connector Configuration +======================= + +For detailed connector-specific configuration, see: + +* :doc:`/reference/connectors/docker` - Docker configuration +* :doc:`/reference/connectors/singularity` - Singularity configuration +* :doc:`/reference/connectors/kubernetes` - Kubernetes configuration +* :doc:`/reference/connectors/ssh` - SSH configuration +* :doc:`/reference/connectors/slurm` - Slurm configuration +* :doc:`/reference/connectors/pbs` - PBS configuration +* :doc:`/reference/connectors/index` - All connectors + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/configuring-deployments` - Deployment guide + - :doc:`/user-guide/advanced-patterns/stacked-locations` - Stacked deployments + +**Configuration:** + - :doc:`streamflow-yml` - Main configuration file + +See Also +======== + +* :doc:`/reference/connectors/index` - Complete connector documentation diff --git a/docs/source/reference/configuration/environment-variables.rst b/docs/source/reference/configuration/environment-variables.rst new file mode 100644 index 000000000..8f0b5c2a6 --- /dev/null +++ b/docs/source/reference/configuration/environment-variables.rst @@ -0,0 +1,109 @@ +====================== +Environment Variables +====================== + +.. meta:: + :keywords: StreamFlow, environment variables, configuration + :description: Environment variable reference for StreamFlow + +Overview +======== + +StreamFlow can be configured using environment variables for certain runtime behaviors. 
+ +Available Variables +=================== + +``STREAMFLOW_CONFIG`` + Default path to StreamFlow configuration file. + + **Type:** String + **Default:** ``streamflow.yml`` + **Usage:** + + .. code-block:: bash + + export STREAMFLOW_CONFIG=/path/to/config.yml + streamflow run + +``STREAMFLOW_DATABASE`` + Path to StreamFlow database file. + + **Type:** String + **Default:** ``~/.streamflow/streamflow.db`` + **Usage:** + + .. code-block:: bash + + export STREAMFLOW_DATABASE=/path/to/database.db + +``STREAMFLOW_LOG_LEVEL`` + Logging level for StreamFlow. + + **Type:** String + **Values:** ``DEBUG``, ``INFO``, ``WARNING``, ``ERROR``, ``CRITICAL`` + **Default:** ``INFO`` + **Usage:** + + .. code-block:: bash + + export STREAMFLOW_LOG_LEVEL=DEBUG + streamflow run streamflow.yml + +``STREAMFLOW_WORKING_DIR`` + Working directory for StreamFlow temporary files. + + **Type:** String + **Default:** ``~/.streamflow`` + **Usage:** + + .. code-block:: bash + + export STREAMFLOW_WORKING_DIR=/tmp/streamflow + +Examples +======== + +**Set Multiple Variables:** + +.. code-block:: bash + + export STREAMFLOW_LOG_LEVEL=DEBUG + export STREAMFLOW_DATABASE=/data/streamflow.db + streamflow run workflow.yml + +**Temporary Configuration:** + +.. code-block:: bash + + STREAMFLOW_LOG_LEVEL=DEBUG streamflow run workflow.yml + +**In Docker:** + +.. code-block:: bash + + docker run -e STREAMFLOW_LOG_LEVEL=DEBUG alphaunito/streamflow:latest + +**In Shell Script:** + +.. 
code-block:: bash + + #!/bin/bash + export STREAMFLOW_LOG_LEVEL=DEBUG + export STREAMFLOW_DATABASE=/data/streamflow.db + streamflow run streamflow.yml + +Related Documentation +===================== + +**Configuration:** + - :doc:`streamflow-yml` - Main configuration file + +**CLI:** + - :doc:`/reference/cli/run` - Run command options + +See Also +======== + +* Many settings can also be configured via CLI options +* Database location can be specified in ``streamflow.yml`` diff --git a/docs/source/reference/configuration/index.rst b/docs/source/reference/configuration/index.rst new file mode 100644 index 000000000..70495adfd --- /dev/null +++ b/docs/source/reference/configuration/index.rst @@ -0,0 +1,181 @@ +============= +Configuration +============= + +.. meta:: + :keywords: StreamFlow, configuration, schema, YAML, streamflow.yml + :description: Complete StreamFlow configuration reference with schemas and examples + +Overview +======== + +StreamFlow uses YAML configuration files to define workflows, deployments, and bindings. This section provides complete reference documentation for all configuration options, generated from JSON schemas. + +Quick Reference +=============== + +============ ==================================== +Purpose Configuration file reference +Audience All users +Format YAML (validated against JSON Schema) +Main File ``streamflow.yml`` +============ ==================================== + +Configuration Documentation +=========================== + +.. toctree:: + :maxdepth: 2 + :titlesonly: + + streamflow-yml + workflow-config + deployment-config + binding-config + environment-variables + +Configuration Structure +======================= + +The main ``streamflow.yml`` file has this structure: + +.. 
code-block:: yaml + + version: v1.0 + + workflows: + # Workflow configurations + + deployments: + # Deployment configurations + + bindings: + # Step-to-deployment bindings + + filters: + # Binding filters (optional) + +See :doc:`streamflow-yml` for complete documentation. + +Configuration by Topic +====================== + +**Workflows** + Define CWL workflows and their execution parameters. + + :doc:`workflow-config` + +**Deployments** + Configure execution environments (Docker, Kubernetes, HPC, etc.). + + :doc:`deployment-config` + +**Bindings** + Associate workflow steps with deployments. + + :doc:`binding-config` + +**Environment Variables** + Configure StreamFlow behavior via environment variables. + + :doc:`environment-variables` + +Schema Validation +================= + +Validate your configuration: + +.. code-block:: bash + + # Dump the JSON schema + streamflow schema + + # Validate during run + streamflow run streamflow.yml + +Invalid configurations will produce detailed error messages indicating the problem location. + +Configuration Examples +====================== + +**Minimal Configuration:** + +.. code-block:: yaml + + version: v1.0 + + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + settings: {} + +**With Deployment:** + +.. 
code-block:: yaml + + version: v1.0 + + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + + deployments: + docker-env: + type: docker + config: + image: alpine:latest + + bindings: + - step: /my-step + target: + deployment: docker-env + +Connector-Specific Configuration +================================= + +Each connector type has its own configuration schema: + +* :doc:`/reference/connectors/docker` - Docker configuration +* :doc:`/reference/connectors/kubernetes` - Kubernetes configuration +* :doc:`/reference/connectors/slurm` - Slurm configuration +* :doc:`/reference/index` - All connectors + +Related Documentation +===================== + +**User Guide:** + For configuration tutorials: + + - :doc:`/user-guide/configuring-deployments` - Deployment setup + - :doc:`/user-guide/binding-workflows` - Binding configuration + +**Reference:** + For connector details: + + - :doc:`/reference/index` - Available connectors + +Best Practices +============== + +**Use Version Control:** + Store ``streamflow.yml`` in version control with your workflows. + +**Validate Early:** + Run ``streamflow run`` with ``--dry-run`` to validate before execution. + +**Use Comments:** + YAML supports comments - document complex configurations. + +**External Files:** + Reference external CWL and deployment files for better organization. + +Next Steps +========== + +* :doc:`streamflow-yml` - Complete configuration file reference +* :doc:`deployment-config` - Deployment configuration details +* :doc:`binding-config` - Binding configuration details diff --git a/docs/source/reference/configuration/streamflow-yml.rst b/docs/source/reference/configuration/streamflow-yml.rst new file mode 100644 index 000000000..d496de6d4 --- /dev/null +++ b/docs/source/reference/configuration/streamflow-yml.rst @@ -0,0 +1,259 @@ +=================== +The StreamFlow File +=================== + +.. 
meta:: + :keywords: StreamFlow, configuration, YAML, streamflow.yml + :description: Main StreamFlow configuration file reference + +Overview +======== + +The ``streamflow.yml`` file is the main configuration file for StreamFlow. It defines workflows, deployments, and bindings in a single YAML document. + +File Structure +============== + +.. code-block:: yaml + + version: v1.0 + + workflows: + # Workflow definitions + + deployments: + # Deployment configurations + + bindings: + # Step-to-deployment bindings + + filters: + # Binding filters (optional) + +Top-Level Fields +================ + +``version`` + StreamFlow configuration schema version. + + **Type:** String + **Required:** Yes + **Values:** ``v1.0`` + +``workflows`` + Map of workflow definitions. Keys are workflow names, values are workflow configurations. + + **Type:** Object + **Required:** Yes + +``deployments`` + Map of deployment configurations. Keys are deployment names, values are deployment configurations. + + **Type:** Object + **Required:** No + +``bindings`` + List of bindings associating workflow steps with deployments. + + **Type:** Array + **Required:** No + +``filters`` + Map of binding filter configurations. + + **Type:** Object + **Required:** No + +Workflow Configuration +====================== + +Each workflow entry has the following structure: + +.. code-block:: yaml + + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + settings: input.yml + +**Fields:** + +``type`` + Workflow type. Currently only ``cwl`` is supported. + + **Type:** String + **Required:** Yes + **Values:** ``cwl`` + +``config`` + Workflow-specific configuration. + + **Type:** Object + **Required:** Yes + +``config.file`` + Path to CWL workflow file. + + **Type:** String + **Required:** Yes + +``config.settings`` + Path to workflow inputs file (YAML or JSON). 
+ + **Type:** String + **Required:** Yes + +Deployment Configuration +======================== + +Each deployment entry has the following structure: + +.. code-block:: yaml + + deployments: + my-deployment: + type: docker + config: + image: python:3.10 + +**Fields:** + +``type`` + Deployment connector type. + + **Type:** String + **Required:** Yes + **Values:** ``local``, ``docker``, ``kubernetes``, ``ssh``, ``slurm``, ``pbs``, ``singularity``, etc. + +``config`` + Connector-specific configuration. + + **Type:** Object + **Required:** Depends on connector + +For connector-specific configuration options, see: + +* :doc:`/reference/connectors/docker` - Docker configuration +* :doc:`/reference/connectors/kubernetes` - Kubernetes configuration +* :doc:`/reference/connectors/slurm` - Slurm configuration +* :doc:`/reference/connectors/ssh` - SSH configuration +* :doc:`/reference/connectors/index` - All connectors + +Binding Configuration +===================== + +Each binding associates a workflow step with a deployment: + +.. code-block:: yaml + + bindings: + - step: /my-step + target: + deployment: my-deployment + +**Fields:** + +``step`` + CWL step name (with leading ``/``). + + **Type:** String + **Required:** Yes + +``target`` + Target deployment specification. + + **Type:** Object + **Required:** Yes + +``target.deployment`` + Name of the deployment to use. + + **Type:** String + **Required:** Yes + +For advanced binding patterns, see: + +* :doc:`/user-guide/binding-workflows` - Binding workflows guide +* :doc:`/user-guide/advanced-patterns/index` - Advanced patterns + +Filter Configuration +==================== + +Filters customize binding behavior: + +.. code-block:: yaml + + filters: + my-filter: + type: shuffle + +**Built-in filter types:** + +* ``shuffle`` - Randomize target selection +* ``match`` - Pattern-based target matching + +Complete Example +================ + +.. 
code-block:: yaml + + version: v1.0 + + workflows: + example-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + + deployments: + local-env: + type: local + + docker-env: + type: docker + config: + image: python:3.10 + + hpc-cluster: + type: slurm + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + + bindings: + - step: /preprocess + target: + deployment: docker-env + + - step: /compute + target: + deployment: hpc-cluster + + - step: /postprocess + target: + deployment: docker-env + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/configuring-deployments` - Deployment configuration + - :doc:`/user-guide/binding-workflows` - Binding configuration + +**Configuration:** + - :doc:`workflow-config` - Workflow configuration details + - :doc:`deployment-config` - Deployment configuration details + - :doc:`binding-config` - Binding configuration details + +**CLI:** + - :doc:`/reference/cli/schema` - Dump JSON schema + +See Also +======== + +* Use ``streamflow schema`` to view the complete JSON schema +* See :doc:`/reference/connectors/index` for connector-specific configuration diff --git a/docs/source/reference/configuration/workflow-config.rst b/docs/source/reference/configuration/workflow-config.rst new file mode 100644 index 000000000..d5113d145 --- /dev/null +++ b/docs/source/reference/configuration/workflow-config.rst @@ -0,0 +1,106 @@ +================== +Workflow Configuration +================== + +.. meta:: + :keywords: StreamFlow, workflow, configuration, CWL + :description: Workflow configuration reference for StreamFlow + +Overview +======== + +Workflow configuration defines how CWL workflows are executed in StreamFlow. + +Configuration Structure +======================= + +.. 
code-block:: yaml + + workflows: + workflow-name: + type: cwl + config: + file: path/to/workflow.cwl + settings: path/to/inputs.yml + +Fields Reference +================ + +``type`` + Workflow type identifier. + + **Type:** String + **Required:** Yes + **Values:** ``cwl`` (only supported type) + +``config`` + Workflow-specific configuration object. + + **Type:** Object + **Required:** Yes + +``config.file`` + Path to CWL workflow definition file. + + **Type:** String + **Required:** Yes + **Format:** Relative or absolute path to ``.cwl`` file + +``config.settings`` + Path to workflow inputs file. + + **Type:** String or Object + **Required:** Yes + **Format:** Path to YAML/JSON file, or inline object + +Examples +======== + +**Basic Configuration:** + +.. code-block:: yaml + + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + +**Inline Settings:** + +.. code-block:: yaml + + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + settings: + input_file: data.txt + num_threads: 4 + +**Absolute Paths:** + +.. code-block:: yaml + + workflows: + my-workflow: + type: cwl + config: + file: /path/to/workflow.cwl + settings: /path/to/inputs.yml + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/writing-workflows` - Writing CWL workflows + +**Configuration:** + - :doc:`streamflow-yml` - Main configuration file + +See Also +======== + +* `CWL Specification `_ - CWL workflow format diff --git a/docs/source/reference/connectors/docker.rst b/docs/source/reference/connectors/docker.rst new file mode 100644 index 000000000..a9299c713 --- /dev/null +++ b/docs/source/reference/connectors/docker.rst @@ -0,0 +1,104 @@ +=============== +Docker Connector +=============== + +.. 
meta:: + :keywords: StreamFlow, docker, container, deployment + :description: Docker connector reference for StreamFlow + +Overview +======== + +The Docker connector executes workflow tasks in Docker containers, providing isolation, reproducibility, and portability for local and single-node deployments. + +Quick Reference +=============== + +============ ==================================== +Type ``docker`` +Category Container +Scalability Single host +Best For Local development, CI/CD pipelines +============ ==================================== + +Examples +======== + +With Volume Mounts +------------------ + +.. code-block:: yaml + + deployments: + docker-volumes: + type: docker + config: + image: ubuntu:22.04 + volumes: + - /host/data:/container/data:ro + - /host/output:/container/output:rw + +With GPU Support +---------------- + +.. code-block:: yaml + + deployments: + docker-gpu: + type: docker + config: + image: nvidia/cuda:11.8.0-runtime-ubuntu22.04 + gpus: all + +With Resource Limits +-------------------- + +.. code-block:: yaml + + deployments: + docker-limited: + type: docker + config: + image: python:3.10 + cpus: 4.0 + memory: 8g + +Prerequisites +============= + +* Docker installed and running +* User has Docker permissions (member of ``docker`` group on Linux) +* Required images available or pullable from registry + +Platform Support +================ + +**Linux:** Full support +**macOS:** Full support +**Windows:** Not supported + +.. note:: + StreamFlow only supports Linux and macOS. Windows is not supported. + +Configuration +============= + +.. 
jsonschema:: https://streamflow.di.unito.it/schemas/deployment/connector/docker.json + :lift_description: true + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/configuring-deployments` - Deployment configuration guide + - :doc:`/user-guide/binding-workflows` - Binding workflows to deployments + - :doc:`/user-guide/troubleshooting` - Docker troubleshooting + +**Connectors:** + - :doc:`index` - All container connectors + - :doc:`docker-compose` - Multi-container orchestration + - :doc:`singularity` - For HPC container execution + - :doc:`/reference/connectors/kubernetes` - For cloud-native container orchestration + +**External Resources:** + - :doc:`/reference/cwl-docker-translators/docker` - CWL Docker translator configuration diff --git a/docs/source/reference/connectors/index.rst b/docs/source/reference/connectors/index.rst new file mode 100644 index 000000000..0a32a8a14 --- /dev/null +++ b/docs/source/reference/connectors/index.rst @@ -0,0 +1,165 @@ +========== +Connectors +========== + +.. meta:: + :keywords: StreamFlow, connectors, docker, kubernetes, slurm, ssh, deployment + :description: Complete reference for all StreamFlow execution environment connectors + +Overview +======== + +Connectors are StreamFlow's interface to execution environments. This section provides comprehensive reference documentation for all built-in connectors. + +Quick Reference +=============== + +============ ==================================== +Purpose Connector reference +Audience Users configuring deployments +Total Count 12+ built-in connectors +============ ==================================== + +Available Connectors +==================== + +.. 
toctree:: + :maxdepth: 1 + + docker + singularity + kubernetes + ssh + slurm + pbs + +Connector Comparison +==================== + +==================== ============= ============== ============== ================ +Connector Category Scalability Complexity Best For +==================== ============= ============== ============== ================ +**local** Native Single node Very Simple Development +**docker** Container Single node Simple Local testing +**docker-compose** Container Multi-service Simple Service apps +**singularity** Container HPC-friendly Medium HPC containers +**kubernetes** Cloud Multi-node Medium Cloud native +**helm3** Cloud Multi-node Medium K8s packages +**ssh** Remote Remote nodes Simple Remote exec +**slurm** HPC HPC clusters Medium HPC batch +**pbs** HPC HPC clusters Medium HPC batch +**flux** HPC HPC clusters Medium Modern HPC +**occam** HPC Specific HPC Complex Torino HPC +==================== ============= ============== ============== ================ + +Choosing a Connector +==================== + +**For Development:** + Use ``local`` connector for simplicity + +**For Testing:** + Use ``docker`` for reproducibility + +**For Production:** + * **Cloud workloads:** Use ``kubernetes`` or ``helm3`` + * **HPC workloads:** Use ``slurm``, ``pbs``, or ``flux`` + * **Hybrid:** Combine connectors with bindings + +**For Containers on HPC:** + Use ``singularity`` connector or stack ``singularity`` on ``slurm`` + +**For Remote Execution:** + Use ``ssh`` connector for general remote hosts + +Common Configuration Patterns +============================== + +**Local Deployment:** + +.. code-block:: yaml + + deployments: + local-env: + type: local + +**Docker with Volume:** + +.. code-block:: yaml + + deployments: + docker-env: + type: docker + config: + image: alpine:latest + volume: + - /host/path:/container/path + +**Kubernetes with Pod Spec:** + +.. 
code-block:: yaml + + deployments: + k8s-env: + type: kubernetes + config: + files: + - pod-spec.yaml + +**Slurm with Queue:** + +.. code-block:: yaml + + deployments: + slurm-env: + type: slurm + config: + file: ssh-config.yaml + maxConcurrentJobs: 10 + slurmConfig: + partition: gpu-queue + +Connector Features Matrix +========================== + +================= ====== ============ =========== ============= +Feature Local Container Cloud HPC +================= ====== ============ =========== ============= +Multi-node No Depends Yes Yes +Resource limits No Yes Yes Yes +Queue management No No Yes Yes +Container support No Native Native Via Singularity +GPU support Yes Yes Yes Yes +Network isolation No Yes Yes Depends +================= ====== ============ =========== ============= + +Related Documentation +===================== + +**User Guide:** + For deployment tutorials: + + - :doc:`/user-guide/configuring-deployments` - Deployment setup guide + +**Configuration:** + For schema reference: + + - :doc:`/reference/configuration/deployment-config` - Deployment schema + +**Developer Guide:** + For creating custom connectors: + + - :doc:`/developer-guide/extension-points/connector` - Connector plugin + +Advanced Topics +=============== + +**Stacked Deployments:** + Combine connectors for complex scenarios: + + - :doc:`/user-guide/advanced-patterns/stacked-locations` + +**Custom Connectors:** + Create your own connector plugin: + + - :doc:`/developer-guide/extension-points/creating-plugins` diff --git a/docs/source/reference/connectors/kubernetes.rst b/docs/source/reference/connectors/kubernetes.rst new file mode 100644 index 000000000..7cd9e8e0e --- /dev/null +++ b/docs/source/reference/connectors/kubernetes.rst @@ -0,0 +1,176 @@ +==================== +Kubernetes Connector +==================== + +.. 
meta:: + :keywords: StreamFlow, kubernetes, k8s, cloud, orchestration + :description: Kubernetes connector reference for StreamFlow + +Overview +======== + +The Kubernetes connector executes workflow tasks as Kubernetes pods, providing multi-node scalability, resource management, and cloud-native integration. + +Quick Reference +=============== + +============ ==================================== +Type ``kubernetes`` +Category Cloud +Scalability Multi-node, auto-scaling +Best For Cloud environments, production +============ ==================================== + +Examples +======== + +With Resource Requests +----------------------- + +.. code-block:: yaml + + deployments: + k8s-workers: + type: kubernetes + config: + kubeconfig: ~/.kube/config + namespace: streamflow + services: + compute: + replicas: 5 + template: + spec: + containers: + - name: worker + image: python:3.10 + resources: + requests: + memory: "4Gi" + cpu: "2" + limits: + memory: "8Gi" + cpu: "4" + +With GPU Support +---------------- + +.. code-block:: yaml + + deployments: + k8s-gpu: + type: kubernetes + config: + kubeconfig: ~/.kube/config + services: + gpu-workers: + replicas: 2 + template: + spec: + containers: + - name: gpu-worker + image: tensorflow/tensorflow:latest-gpu + resources: + limits: + nvidia.com/gpu: 1 + +With Node Affinity +------------------ + +.. code-block:: yaml + + deployments: + k8s-affinity: + type: kubernetes + config: + kubeconfig: ~/.kube/config + services: + specific-nodes: + replicas: 3 + template: + spec: + containers: + - name: worker + image: python:3.10 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-type + operator: In + values: + - compute + +Multiple Services +----------------- + +.. 
code-block:: yaml + + deployments: + k8s-mixed: + type: kubernetes + config: + kubeconfig: ~/.kube/config + services: + cpu-workers: + replicas: 10 + template: + spec: + containers: + - name: cpu-worker + image: python:3.10 + + gpu-workers: + replicas: 2 + template: + spec: + containers: + - name: gpu-worker + image: tensorflow/tensorflow:latest-gpu + resources: + limits: + nvidia.com/gpu: 1 + +Prerequisites +============= + +* Kubernetes cluster access +* ``kubectl`` installed and configured +* Valid kubeconfig file +* Appropriate RBAC permissions +* Namespace exists (or permission to create) + +Pod Specifications +================== + +The ``template`` field accepts standard Kubernetes pod template specifications. See `Kubernetes Pod documentation <https://kubernetes.io/docs/concepts/workloads/pods/>`_ for complete options. + +Platform Support +================ + +**Linux:** Full support +**macOS:** Full support +**Windows:** Not supported + +Configuration +============= + +.. jsonschema:: https://streamflow.di.unito.it/schemas/deployment/connector/kubernetes.json + :lift_description: true + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/configuring-deployments` - Deployment configuration guide + - :doc:`/user-guide/running-workflows` - Workflow execution + - :doc:`/user-guide/troubleshooting` - Kubernetes troubleshooting + +**Connectors:** + - :doc:`index` - All cloud connectors + - :doc:`helm3` - For Helm chart deployments + - :doc:`/reference/connectors/docker` - For local Docker execution + +**External Resources:** + - :doc:`/reference/cwl-docker-translators/kubernetes` - CWL Kubernetes translator + - `Kubernetes Documentation <https://kubernetes.io/docs/>`_ diff --git a/docs/source/reference/connectors/pbs.rst b/docs/source/reference/connectors/pbs.rst new file mode 100644 index 000000000..5647d47a3 --- /dev/null +++ b/docs/source/reference/connectors/pbs.rst @@ -0,0 +1,153 @@ +============= +PBS Connector +============= + +.. 
meta:: + :keywords: StreamFlow, pbs, torque, HPC, batch scheduler + :description: PBS/Torque connector reference for StreamFlow + +Overview +======== + +The PBS connector executes workflow tasks via PBS Pro or OpenPBS batch schedulers, commonly found in traditional HPC environments. + +Quick Reference +=============== + +============ ==================================== +Type ``pbs`` +Category HPC +Scalability HPC clusters +Best For PBS-managed HPC systems +============ ==================================== + +Examples +======== + +Basic CPU Job +------------- + +.. code-block:: yaml + + deployments: + pbs-cpu: + type: pbs + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + workdir: /home/user/jobs + services: + compute: + queue: batch + nodes: 2 + cpus: 32 + mem: 128gb + walltime: "04:00:00" + +With Resource Selection +----------------------- + +.. code-block:: yaml + + deployments: + pbs-select: + type: pbs + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + workdir: /scratch/user + services: + custom: + queue: longrun + select: "2:ncpus=16:mem=64gb:ngpus=1" + walltime: "48:00:00" + +With GPU Allocation +------------------- + +.. code-block:: yaml + + deployments: + pbs-gpu: + type: pbs + config: + hostname: gpu-login.hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + workdir: /gpfs/scratch/user + services: + gpu-jobs: + queue: gpu-queue + nodes: 1 + cpus: 16 + ngpus: 2 + mem: 128gb + walltime: "08:00:00" + +Prerequisites +============= + +* HPC system access +* SSH access to PBS login nodes +* Job submission permissions +* Queue access +* Resource quota available +* Working directory exists on HPC filesystem + +PBS Directives +============== + +StreamFlow service configuration maps to PBS qsub directives. For complete PBS options, see PBS documentation. + +Common PBS Commands +=================== + +**Check queue status:** + +.. code-block:: bash + + qstat -u $USER + +**Job details:** + +.. 
code-block:: bash + + qstat -f <job_id> + +**Cancel job:** + +.. code-block:: bash + + qdel <job_id> + +Platform Support +================ + +**Linux:** Full support +**macOS:** Full support (via SSH to Linux HPC) +**Windows:** Not supported + +Configuration +============= + +.. jsonschema:: https://streamflow.di.unito.it/schemas/deployment/connector/pbs.json + :lift_description: true + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/configuring-deployments` - Deployment configuration guide + - :doc:`/user-guide/troubleshooting` - HPC troubleshooting + - :doc:`/user-guide/advanced-patterns/stacked-locations` - Container stacking + +**Connectors:** + - :doc:`index` - All HPC connectors + - :doc:`slurm` - For Slurm-managed systems + - :doc:`ssh` - For simple remote execution + - :doc:`/reference/connectors/singularity` - Container integration + +**External Resources:** + - `PBS Professional Documentation <https://altair.com/pbs-professional>`_ diff --git a/docs/source/reference/connectors/singularity.rst b/docs/source/reference/connectors/singularity.rst new file mode 100644 index 000000000..fe5ee708f --- /dev/null +++ b/docs/source/reference/connectors/singularity.rst @@ -0,0 +1,116 @@ +===================== +Singularity Connector +===================== + +.. meta:: + :keywords: StreamFlow, singularity, apptainer, container, HPC + :description: Singularity/Apptainer connector reference for StreamFlow + +Overview +======== + +The Singularity connector executes workflow tasks in Singularity (now Apptainer) containers, designed for HPC environments with rootless execution and enhanced security. + +Quick Reference +=============== + +============ ==================================== +Type ``singularity`` +Category Container +Scalability HPC environments +Best For HPC clusters, rootless containers +============ ==================================== + +Examples +======== + +From Docker Hub +--------------- + +.. 
code-block:: yaml + + deployments: + singularity-docker: + type: singularity + config: + image: docker://tensorflow/tensorflow:latest + +With Bind Mounts +---------------- + +.. code-block:: yaml + + deployments: + singularity-binds: + type: singularity + config: + image: docker://python:3.10 + bindPaths: + - /data:/data + - /scratch:/scratch + +Stacked on Slurm +---------------- + +.. code-block:: yaml + + deployments: + slurm-batch: + type: slurm + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + + singularity-on-slurm: + type: singularity + config: + image: docker://python:3.10 + wraps: slurm-batch + +Prerequisites +============= + +* Singularity/Apptainer installed on execution host +* Appropriate image access permissions +* Sufficient disk space for image cache + +HPC Integration +=============== + +Singularity is designed for HPC systems and provides: + +* **Rootless execution** - No privileged access required +* **MPI support** - Native HPC application support +* **GPU access** - Direct GPU passthrough +* **Shared filesystems** - Automatic bind mounting + +Platform Support +================ + +**Linux:** Full support +**macOS:** Limited (via VM) +**Windows:** Not supported + +Configuration +============= + +.. 
jsonschema:: https://streamflow.di.unito.it/schemas/deployment/connector/singularity.json + :lift_description: true + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/configuring-deployments` - Deployment configuration guide + - :doc:`/user-guide/advanced-patterns/stacked-locations` - Stacking with batch schedulers + - :doc:`/user-guide/troubleshooting` - Container troubleshooting + +**Connectors:** + - :doc:`index` - All container connectors + - :doc:`docker` - For local Docker containers + - :doc:`/reference/connectors/slurm` - Slurm integration + - :doc:`/reference/connectors/index` - HPC connectors + +**External Resources:** + - :doc:`/reference/cwl-docker-translators/singularity` - CWL Singularity translator diff --git a/docs/source/reference/connectors/slurm.rst b/docs/source/reference/connectors/slurm.rst new file mode 100644 index 000000000..98f7e605e --- /dev/null +++ b/docs/source/reference/connectors/slurm.rst @@ -0,0 +1,206 @@ +=============== +Slurm Connector +=============== + +.. meta:: + :keywords: StreamFlow, slurm, HPC, batch scheduler + :description: Slurm connector reference for StreamFlow + +Overview +======== + +The Slurm connector executes workflow tasks via the Slurm workload manager, the most widely used HPC batch scheduler, providing resource allocation, queue management, and fair-share scheduling. + +Quick Reference +=============== + +============ ==================================== +Type ``slurm`` +Category HPC +Scalability HPC clusters (1000s of nodes) +Best For Large-scale HPC computations +============ ==================================== + +Examples +======== + +Basic CPU Job +------------- + +.. 
code-block:: yaml + + deployments: + slurm-cpu: + type: slurm + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + workdir: /scratch/user/jobs + services: + compute: + partition: standard + nodes: 2 + ntasks: 64 + mem: 128G + time: "04:00:00" + +With GPU Allocation +------------------- + +.. code-block:: yaml + + deployments: + slurm-gpu: + type: slurm + config: + hostname: gpu-login.hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + workdir: /gpfs/scratch/user/jobs + services: + gpu-jobs: + partition: gpu + nodes: 1 + ntasks: 8 + gres: gpu:v100:2 # 2 V100 GPUs + mem: 128G + time: "08:00:00" + +With Account and QoS +-------------------- + +.. code-block:: yaml + + deployments: + slurm-priority: + type: slurm + config: + hostname: hpc.example.com + username: user + sshKey: ~/.ssh/id_rsa + workdir: /scratch/user + services: + high-priority: + partition: priority + account: research-grant-123 + qos: high + nodes: 4 + ntasksPerNode: 32 + time: "24:00:00" + +With Singularity Container +--------------------------- + +.. code-block:: yaml + + deployments: + slurm-batch: + type: slurm + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + workdir: /scratch/user + services: + compute: + partition: standard + time: "04:00:00" + + singularity-on-slurm: + type: singularity + config: + image: docker://python:3.10 + wraps: slurm-batch + +Multiple Services +----------------- + +.. 
code-block:: yaml + + deployments: + slurm-mixed: + type: slurm + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + workdir: /scratch/user + services: + cpu-large: + partition: compute + nodes: 10 + ntasksPerNode: 64 + time: "12:00:00" + + gpu-small: + partition: gpu + nodes: 1 + gres: gpu:4 + time: "02:00:00" + +Prerequisites +============= + +* HPC system access +* SSH access to Slurm login nodes +* Job submission permissions +* Partition/queue access +* Resource quota available +* Working directory exists on HPC filesystem + +Slurm Directives +================ + +StreamFlow service configuration maps directly to Slurm sbatch directives. For complete Slurm options, see the `Slurm documentation <https://slurm.schedmd.com/sbatch.html>`_. + +Common Slurm Commands +===================== + +**Check queue status:** + +.. code-block:: bash + + squeue -u $USER + +**Job details:** + +.. code-block:: bash + + scontrol show job <job_id> + +**Cancel job:** + +.. code-block:: bash + + scancel <job_id> + +Platform Support +================ + +**Linux:** Full support +**macOS:** Full support (via SSH to Linux HPC) +**Windows:** Not supported + +Configuration +============= + +.. 
jsonschema:: https://streamflow.di.unito.it/schemas/deployment/connector/slurm.json + :lift_description: true + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/configuring-deployments` - Deployment configuration guide + - :doc:`/user-guide/advanced-patterns/stacked-locations` - Container stacking + - :doc:`/user-guide/troubleshooting` - HPC troubleshooting + +**Connectors:** + - :doc:`index` - All HPC connectors + - :doc:`ssh` - For simple remote execution + - :doc:`pbs` - For PBS-managed systems + - :doc:`/reference/connectors/singularity` - Container integration + +**External Resources:** + - `Slurm Documentation <https://slurm.schedmd.com/>`_ diff --git a/docs/source/reference/connectors/ssh.rst b/docs/source/reference/connectors/ssh.rst new file mode 100644 index 000000000..edc72e58f --- /dev/null +++ b/docs/source/reference/connectors/ssh.rst @@ -0,0 +1,143 @@ +============= +SSH Connector +============= + +.. meta:: + :keywords: StreamFlow, ssh, remote, deployment + :description: SSH connector reference for StreamFlow + +Overview +======== + +The SSH connector executes workflow tasks on remote hosts via SSH, providing simple remote execution without batch scheduling systems. + +Quick Reference +=============== + +============ ==================================== +Type ``ssh`` +Category HPC +Scalability Single or multiple remote hosts +Best For Remote execution, simple clusters +============ ==================================== + +Examples +======== + +With SSH Key Authentication +---------------------------- + +.. code-block:: yaml + + deployments: + ssh-key: + type: ssh + config: + hostname: hpc.example.com + username: researcher + sshKey: ~/.ssh/id_rsa + sshKeyPassphrase: my-passphrase # Optional + +With Password Authentication +----------------------------- + +.. 
code-block:: yaml + + deployments: + ssh-password: + type: ssh + config: + hostname: 192.168.1.100 + username: user + password: secret # Not recommended for production + +Multiple Hosts +-------------- + +.. code-block:: yaml + + deployments: + ssh-cluster: + type: ssh + config: + nodes: + - hostname: node1.example.com + username: user + sshKey: ~/.ssh/id_rsa + - hostname: node2.example.com + username: user + sshKey: ~/.ssh/id_rsa + - hostname: node3.example.com + username: user + sshKey: ~/.ssh/id_rsa + +With Custom Port and Timeout +----------------------------- + +.. code-block:: yaml + + deployments: + ssh-custom: + type: ssh + config: + hostname: bastion.example.com + username: admin + sshKey: ~/.ssh/id_ed25519 + port: 2222 + connectionTimeout: 30 + maxConnections: 10 + +Prerequisites +============= + +* SSH access to remote host(s) +* SSH key authentication configured (recommended) +* Appropriate file permissions on SSH keys (``chmod 600``) +* Remote host in SSH known_hosts or host key verification disabled + +Security Considerations +======================= + +* **Use SSH keys** instead of passwords for authentication +* **Protect private keys** with passphrases +* **Limit maxConnections** to avoid overwhelming remote systems +* **Use jump hosts/bastions** for accessing secured networks + +Connection Pooling +================== + +The ``maxConnections`` option controls concurrent SSH connections: + +* **Low values (1-5):** Conservative, safe for small systems +* **Medium values (10-20):** Balanced for typical workloads +* **High values (50+):** For large-scale parallel execution + +Platform Support +================ + +**Linux:** Full support +**macOS:** Full support +**Windows:** Not supported + +Configuration +============= + +.. 
jsonschema:: https://streamflow.di.unito.it/schemas/deployment/connector/ssh.json + :lift_description: true + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/configuring-deployments` - Deployment configuration guide + - :doc:`/user-guide/troubleshooting` - SSH connection troubleshooting + - :doc:`/user-guide/advanced-patterns/stacked-locations` - Container stacking with SSH + +**Connectors:** + - :doc:`index` - All HPC connectors + - :doc:`slurm` - For batch-scheduled execution + - :doc:`pbs` - For PBS-managed systems + - :doc:`/reference/connectors/singularity` - Container integration + +**External Resources:** + - `OpenSSH Documentation `_ diff --git a/docs/source/reference/cwl-docker-translators/docker.rst b/docs/source/reference/cwl-docker-translators/docker.rst new file mode 100644 index 000000000..c01af06b6 --- /dev/null +++ b/docs/source/reference/cwl-docker-translators/docker.rst @@ -0,0 +1,113 @@ +================ +Docker Translator +================ + +.. meta:: + :keywords: StreamFlow, Docker, CWL, DockerRequirement, containers + :description: Docker translator for CWL DockerRequirement in StreamFlow + +Overview +======== + +The **Docker Translator** is the default CWL Docker Translator in StreamFlow. It instantiates a :doc:`/reference/connectors/docker` deployment for every CWL ``DockerRequirement`` specification. + +**Use Cases:** + +* Local workflow development +* CI/CD pipelines +* Systems with Docker Engine installed + +Configuration +============= + +.. jsonschema:: https://streamflow.di.unito.it/schemas/cwl/requirement/docker/docker.json + :lift_description: true + +Examples +======== + +Basic Usage (Automatic) +----------------------- + +StreamFlow automatically uses Docker for ``DockerRequirement``: + +.. 
code-block:: yaml + :caption: CWL workflow with DockerRequirement + + cwlVersion: v1.2 + class: CommandLineTool + baseCommand: python + requirements: + DockerRequirement: + dockerPull: python:3.10 + inputs: + script: + type: File + inputBinding: + position: 1 + outputs: + result: + type: stdout + +No ``streamflow.yml`` configuration needed - Docker is used automatically. + +Explicit Configuration +---------------------- + +Override Docker settings in ``streamflow.yml``: + +.. code-block:: yaml + :caption: streamflow.yml - Custom Docker config + + version: v1.0 + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: docker + config: + addHost: + - "database.local:192.168.1.10" + transferBufferSize: 65536 + +Custom Network Settings +----------------------- + +Add custom DNS and hosts: + +.. code-block:: yaml + :caption: Custom networking + + workflows: + network-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: /process + deployment: + type: docker + config: + addHost: + - "api.internal:10.0.0.5" + - "cache.internal:10.0.0.6" + +Related Documentation +===================== + +**Connectors:** + - :doc:`/reference/connectors/docker` - Docker connector reference + +**CWL Docker Translators:** + - :doc:`index` - CWL Docker Translators overview + - :doc:`kubernetes` - Kubernetes translator + - :doc:`singularity` - Singularity translator + +**User Guide:** + - :doc:`/user-guide/writing-workflows` - Writing CWL workflows diff --git a/docs/source/reference/cwl-docker-translators/index.rst b/docs/source/reference/cwl-docker-translators/index.rst new file mode 100644 index 000000000..325611d32 --- /dev/null +++ b/docs/source/reference/cwl-docker-translators/index.rst @@ -0,0 +1,326 @@ +===================== +CWL Docker Translators +===================== + +.. 
meta:: + :keywords: StreamFlow, CWL, Docker, DockerRequirement, translators, containers + :description: CWL DockerRequirement translators for StreamFlow + +Overview +======== + +StreamFlow uses **CWL Docker Translators** to convert CWL `DockerRequirement `_ specifications into StreamFlow deployment bindings. This allows you to run CWL workflows with Docker requirements on different container runtimes without modifying the workflow. + +**Default Behavior:** + +By default, StreamFlow automatically maps steps with ``DockerRequirement`` to Docker deployments using the specified image. + +**Supported Translators:** + +StreamFlow provides translators for multiple container runtimes: + +* :doc:`docker` - Docker containers (default) +* :doc:`kubernetes` - Kubernetes pods +* :doc:`singularity` - Singularity/Apptainer containers +* :doc:`no-container` - Skip containerization (use with caution) + +Quick Reference +=============== + +================ ======================================== ====================================== +Translator Runtime Use Case +================ ======================================== ====================================== +``docker`` Docker Engine Local development, CI/CD +``kubernetes`` Kubernetes Cloud-native, scalable deployments +``singularity`` Singularity/Apptainer HPC clusters (rootless containers) +``none`` No container (local) Testing, pre-configured environments +================ ======================================== ====================================== + +How It Works +============ + +CWL Docker Translators convert CWL ``DockerRequirement`` specifications into StreamFlow bindings: + +**CWL Workflow:** + +.. 
code-block:: yaml + :caption: workflow.cwl + + cwlVersion: v1.2 + class: CommandLineTool + baseCommand: python + requirements: + DockerRequirement: + dockerPull: python:3.10 + inputs: + script: + type: File + inputBinding: + position: 1 + outputs: + result: + type: stdout + stdout: output.txt + +**Default Translation (Docker):** + +StreamFlow automatically creates this equivalent binding: + +.. code-block:: yaml + :caption: Automatic Docker binding + + deployments: + auto-docker: + type: docker + config: + image: python:3.10 + bindings: + - step: / + target: + deployment: auto-docker + +**Custom Translation (Singularity):** + +Override the default translator in ``streamflow.yml``: + +.. code-block:: yaml + :caption: streamflow.yml - Use Singularity + + version: v1.0 + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: singularity + config: + image: docker://python:3.10 + +This runs the workflow in Singularity instead of Docker. + +Configuration Format +==================== + +Docker translators are configured in the workflow's ``docker`` section: + +.. code-block:: yaml + :caption: Basic structure + + workflows: + workflow-name: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: /step-name # Step to translate + deployment: + type: translator-type + config: + # Translator-specific configuration + +**Fields:** + +``step`` + CWL step name (with leading ``/``). Use ``/`` for entire workflow. + + **Type:** String + **Required:** Yes + +``deployment.type`` + Translator type. + + **Type:** String + **Required:** Yes + **Values:** ``docker``, ``kubernetes``, ``singularity``, ``none`` + +``deployment.config`` + Translator-specific configuration. 
+ + **Type:** Object + **Required:** Depends on translator + +Common Use Cases +================ + +Run on HPC with Singularity +---------------------------- + +Convert Docker requirements to Singularity for HPC clusters: + +.. code-block:: yaml + :caption: HPC with Singularity + + version: v1.0 + workflows: + hpc-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: singularity + config: + image: docker://python:3.10 + +Run on Kubernetes +----------------- + +Deploy to Kubernetes pods: + +.. code-block:: yaml + :caption: Kubernetes deployment + + version: v1.0 + workflows: + k8s-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: kubernetes + config: + namespace: default + +Skip Containerization +--------------------- + +Run without containers (requires pre-configured environment): + +.. code-block:: yaml + :caption: No container (use with caution) + + version: v1.0 + workflows: + local-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: none + +.. warning:: + + When using ``type: none``, you must manually ensure all required software is installed in the execution environment. + +Per-Step Translation +-------------------- + +Apply different translators to different steps: + +.. code-block:: yaml + :caption: Mixed translators + + version: v1.0 + workflows: + hybrid-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: /preprocess + deployment: + type: docker + config: + image: python:3.10 + - step: /analyze + deployment: + type: kubernetes + config: + namespace: compute + - step: /visualize + deployment: + type: singularity + config: + image: docker://r-base:latest + +Available Translators +===================== + +.. 
toctree:: + :maxdepth: 1 + + docker + kubernetes + singularity + no-container + +Implementation Details +====================== + +The ``CWLDockerTranslator`` interface is defined in ``streamflow.cwl.requirement.docker.translator`` and exposes a single method: + +.. code-block:: python + + def get_target( + self, + image: str, + output_directory: str | None, + network_access: bool, + target: Target, + ) -> Target: + ... + +**Parameters:** + +``image`` + Docker image name from CWL ``DockerRequirement``. + +``output_directory`` + Value of CWL ``dockerOutputDirectory`` option. + +``network_access`` + Value from CWL `NetworkAccess `_ requirement. + +``target`` + Original target object from step binding. + +**Returns:** + +``Target`` object with auto-generated deployment configuration. + +Custom Translators +================== + +You can implement custom translators via the plugin system. See :doc:`/developer-guide/extension-points/index` for details. + +**Registration:** + +.. code-block:: python + + from streamflow.core.context import StreamFlowContext + + def register_cwl_docker_translator(context: StreamFlowContext): + context.register_translator('my-translator', MyCustomTranslator) + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/writing-workflows` - Writing CWL workflows + - :doc:`/user-guide/binding-workflows` - Binding workflows to deployments + +**Reference:** + - :doc:`/reference/connectors/docker` - Docker connector + - :doc:`/reference/connectors/singularity` - Singularity connector + - :doc:`/reference/connectors/kubernetes` - Kubernetes connector + +**Developer Guide:** + - :doc:`/developer-guide/extension-points/index` - Creating custom translators + +**External Resources:** + - `CWL DockerRequirement `_ + - `CWL NetworkAccess `_ diff --git a/docs/source/reference/cwl-docker-translators/kubernetes.rst b/docs/source/reference/cwl-docker-translators/kubernetes.rst new file mode 100644 index 000000000..d092b9cc0 --- 
/dev/null +++ b/docs/source/reference/cwl-docker-translators/kubernetes.rst @@ -0,0 +1,147 @@ +==================== +Kubernetes Translator +==================== + +.. meta:: + :keywords: StreamFlow, Kubernetes, CWL, DockerRequirement, cloud, pods + :description: Kubernetes translator for CWL DockerRequirement in StreamFlow + +Overview +======== + +The **Kubernetes Translator** converts CWL ``DockerRequirement`` specifications into :doc:`/reference/connectors/kubernetes` deployments. It runs each containerized step as a Kubernetes pod. + +**Use Cases:** + +* Cloud-native workflows +* Scalable compute clusters +* Multi-tenant environments +* Enterprise Kubernetes platforms + +Configuration +============= + +.. jsonschema:: https://streamflow.di.unito.it/schemas/cwl/requirement/docker/kubernetes.json + :lift_description: true + +Examples +======== + +Basic Usage +----------- + +Run workflow on Kubernetes: + +.. code-block:: yaml + :caption: streamflow.yml - Basic Kubernetes + + version: v1.0 + workflows: + k8s-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: kubernetes + config: + namespace: workflows + +Custom Namespace +---------------- + +Deploy to specific namespace: + +.. code-block:: yaml + :caption: Custom namespace + + workflows: + compute-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: /compute + deployment: + type: kubernetes + config: + namespace: compute-intensive + maxCores: 8 + maxMemory: 16Gi + +In-Cluster Execution +-------------------- + +Run StreamFlow inside Kubernetes cluster: + +.. 
code-block:: yaml + :caption: In-cluster configuration + + workflows: + in-cluster-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: kubernetes + config: + inCluster: true + namespace: default + +**Requirements:** + +* StreamFlow must run inside a Kubernetes pod +* ServiceAccount with appropriate RBAC permissions +* See :doc:`/reference/connectors/kubernetes` for RBAC configuration + +Multi-Namespace Workflow +------------------------- + +Different steps in different namespaces: + +.. code-block:: yaml + :caption: Multi-namespace deployment + + workflows: + multi-ns-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: /preprocess + deployment: + type: kubernetes + config: + namespace: preprocessing + - step: /analyze + deployment: + type: kubernetes + config: + namespace: compute + - step: /visualize + deployment: + type: kubernetes + config: + namespace: visualization + +Related Documentation +===================== + +**Connectors:** + - :doc:`/reference/connectors/kubernetes` - Kubernetes connector reference + +**CWL Docker Translators:** + - :doc:`index` - CWL Docker Translators overview + - :doc:`docker` - Docker translator + - :doc:`singularity` - Singularity translator + +**User Guide:** + - :doc:`/user-guide/writing-workflows` - Writing CWL workflows + - :doc:`/user-guide/running-workflows` - Running on Kubernetes diff --git a/docs/source/reference/cwl-docker-translators/no-container.rst b/docs/source/reference/cwl-docker-translators/no-container.rst new file mode 100644 index 000000000..2d1cd54df --- /dev/null +++ b/docs/source/reference/cwl-docker-translators/no-container.rst @@ -0,0 +1,307 @@ +==================== +No Container Translator +==================== + +.. 
meta:: + :keywords: StreamFlow, CWL, DockerRequirement, local, no container + :description: No-container translator for CWL DockerRequirement in StreamFlow + +Overview +======== + +The **No Container Translator** (``none``) bypasses CWL ``DockerRequirement`` specifications and runs workflow steps directly on the execution environment **without containers**. The local connector is used by default unless the step is explicitly bound to a different deployment. + +.. warning:: + + **Use with Caution!** + + This translator skips containerization entirely. You must manually ensure: + + * All required software is installed + * Correct versions are available + * Environment is properly configured + * Dependencies are satisfied + + Step execution **will fail** if requirements are not met. + +**Use Cases:** + +* Testing workflows in pre-configured environments +* Debugging container issues +* Legacy systems without container support +* Environments where containers are prohibited + +**When NOT to Use:** + +* Production workflows (use proper containers) +* Workflows with complex dependencies +* Multi-user/multi-tenant systems +* Reproducibility-critical workflows + +Configuration +============= + +The no-container translator has no configurable options. Simply specify ``type: none``: + +.. code-block:: yaml + + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: none + +Examples +======== + +Skip All Containerization +-------------------------- + +Run entire workflow without containers: + +.. code-block:: yaml + :caption: streamflow.yml - No containers + + version: v1.0 + workflows: + local-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: none + +**Effect:** + +All CWL ``DockerRequirement`` specifications are ignored. Steps run directly on the local machine. 
+ +Skip Specific Steps +------------------- + +Use containers for some steps, skip for others: + +.. code-block:: yaml + :caption: Mixed containerization + + workflows: + mixed-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: /preprocess + deployment: + type: docker + config: + image: python:3.10 + - step: /analyze + deployment: + type: none # Run directly (no container) + - step: /visualize + deployment: + type: docker + config: + image: r-base:latest + +Testing Environment +------------------- + +Test workflow without containers: + +.. code-block:: yaml + :caption: Development/testing configuration + + version: v1.0 + workflows: + test-workflow: + type: cwl + config: + file: workflow.cwl + settings: test-inputs.yml + docker: + - step: / + deployment: + type: none + +**Prerequisites:** + +You must manually install all workflow requirements: + +.. code-block:: bash + + # Install Python dependencies + pip install numpy pandas scikit-learn + + # Install system tools + apt-get install samtools bcftools + + # Verify installations + python --version + samtools --version + +Requirements +============ + +When using ``type: none``, you are responsible for: + +1. **Software Installation** + + All tools referenced in CWL ``baseCommand`` must be installed and in ``$PATH``. + +2. **Correct Versions** + + Installed versions must match workflow expectations. + +3. **Dependencies** + + All runtime dependencies (libraries, system packages) must be present. + +4. **Environment Configuration** + + Environment variables, paths, and configuration files must be set correctly. + +5. **Permissions** + + File system permissions must allow workflow execution. + +Example Checklist +----------------- + +Before using ``type: none``, verify: + +.. 
code-block:: bash + + # Check required tools are installed + which python + which samtools + which bcftools + + # Check versions + python --version # Should match workflow requirements + samtools --version + + # Check Python packages + pip list | grep numpy + pip list | grep pandas + + # Test execution + python --help + samtools --help + +Advantages & Disadvantages +========================== + +**Advantages:** + +* No container overhead - Faster startup +* Direct execution - Simpler debugging +* Flexibility - Use system-specific optimizations +* Legacy compatibility - Works without container runtime + +**Disadvantages:** + +* No isolation - Execution can affect system +* Not reproducible - Environment-dependent +* Manual setup - Requires pre-configuration +* Error-prone - Missing dependencies cause failures +* Not portable - Won't work on other systems + +Best Practices +============== + +1. **Document Requirements** + + Maintain a list of required software and versions. + +2. **Use for Testing Only** + + Prefer containers for production workflows. + +3. **Automate Setup** + + Create scripts to install/verify requirements: + + .. code-block:: bash + + #!/bin/bash + # setup-environment.sh + pip install -r requirements.txt + apt-get install -y samtools bcftools + + # Verify installations + python -c "import numpy, pandas, sklearn" + samtools --version + +4. **Version Lock** + + Pin exact versions to match production: + + .. code-block:: text + :caption: requirements.txt + + numpy==1.24.3 + pandas==2.0.2 + scikit-learn==1.3.0 + +5. **Test Before Production** + + Always test with ``type: none`` before deploying without containers. + +Troubleshooting +=============== + +**Command Not Found:** + +.. code-block:: text + + Error: command not found: python + +**Solution:** Install the missing command: + +.. code-block:: bash + + apt-get install python3 + ln -s /usr/bin/python3 /usr/bin/python + +**Module Not Found:** + +.. 
code-block:: text + + ModuleNotFoundError: No module named 'numpy' + +**Solution:** Install Python packages: + +.. code-block:: bash + + pip install numpy pandas + +**Version Mismatch:** + +Workflow expects Python 3.10 but system has Python 3.8. + +**Solution:** Install correct version or use containers instead. + +Related Documentation +===================== + +**CWL Docker Translators:** + - :doc:`index` - CWL Docker Translators overview + - :doc:`docker` - Docker translator (recommended) + - :doc:`singularity` - Singularity translator (HPC) + +**User Guide:** + - :doc:`/user-guide/writing-workflows` - Writing CWL workflows + - :doc:`/user-guide/troubleshooting` - Troubleshooting guide + +**Connectors:** + - :doc:`/reference/connectors/docker` - Use Docker instead + - :doc:`/reference/connectors/singularity` - Use Singularity instead diff --git a/docs/source/reference/cwl-docker-translators/singularity.rst b/docs/source/reference/cwl-docker-translators/singularity.rst new file mode 100644 index 000000000..5e9b97a6a --- /dev/null +++ b/docs/source/reference/cwl-docker-translators/singularity.rst @@ -0,0 +1,260 @@ +===================== +Singularity Translator +===================== + +.. meta:: + :keywords: StreamFlow, Singularity, Apptainer, CWL, DockerRequirement, HPC + :description: Singularity translator for CWL DockerRequirement in StreamFlow + +Overview +======== + +The **Singularity Translator** converts CWL ``DockerRequirement`` specifications into :doc:`/reference/connectors/singularity` deployments. It enables running Docker-based CWL workflows on HPC systems using Singularity/Apptainer. + +**Use Cases:** + +* HPC cluster workflows +* Rootless container execution +* Systems without Docker +* Security-conscious environments + +**Key Benefits:** + +* No root/sudo required +* OCI/Docker image compatibility +* HPC-optimized performance +* Enhanced security model + +Configuration +============= + +.. 
jsonschema:: https://streamflow.di.unito.it/schemas/cwl/requirement/docker/singularity.json + :lift_description: true + +Image Formats +============= + +Singularity supports multiple image sources: + +**Docker Hub (Automatic):** + +CWL ``dockerPull: python:3.10`` automatically converts to ``docker://python:3.10`` + +**Explicit Docker:** + +.. code-block:: yaml + + config: + image: docker://nvidia/cuda:11.8.0-base + +**Singularity Library:** + +.. code-block:: yaml + + config: + image: library://lolcow + +**Singularity Hub:** + +.. code-block:: yaml + + config: + image: shub://vsoch/hello-world + +**Local SIF File:** + +.. code-block:: yaml + + config: + image: /path/to/image.sif + +Examples +======== + +Basic Usage (Automatic) +----------------------- + +StreamFlow automatically converts Docker images: + +.. code-block:: yaml + :caption: CWL with DockerRequirement + + cwlVersion: v1.2 + class: CommandLineTool + baseCommand: python + requirements: + DockerRequirement: + dockerPull: python:3.10 + +.. code-block:: yaml + :caption: streamflow.yml - Use Singularity + + version: v1.0 + workflows: + hpc-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: singularity + +The Docker image ``python:3.10`` is automatically converted to ``docker://python:3.10`` for Singularity. + +HPC Cluster Deployment +---------------------- + +Run on HPC with Singularity: + +.. code-block:: yaml + :caption: HPC configuration + + version: v1.0 + workflows: + hpc-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: singularity + config: + image: docker://python:3.10 + +Pre-Built SIF Image +------------------- + +Use pre-built Singularity image: + +.. 
code-block:: yaml + :caption: Local SIF file + + workflows: + cached-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: / + deployment: + type: singularity + config: + image: /shared/containers/python-3.10.sif + +GPU-Enabled Containers +---------------------- + +Run GPU workloads with Singularity: + +.. code-block:: yaml + :caption: GPU container + + workflows: + gpu-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: /compute + deployment: + type: singularity + config: + image: docker://nvidia/cuda:11.8.0-runtime + +Mixed Container Runtimes +------------------------ + +Use different runtimes for different steps: + +.. code-block:: yaml + :caption: Docker for local, Singularity for HPC + + workflows: + hybrid-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + docker: + - step: /preprocess + deployment: + type: docker + config: + image: python:3.10 + - step: /compute + deployment: + type: singularity + config: + image: docker://python:3.10 + +Best Practices +============== + +1. **Pre-Build Images** + + Build Singularity images ahead of time for faster execution: + + .. code-block:: bash + + singularity build python-3.10.sif docker://python:3.10 + +2. **Use Shared Storage** + + Store SIF files on shared HPC storage for all nodes to access. + +3. **Cache Docker Pulls** + + Set ``SINGULARITY_CACHEDIR`` to cache Docker image conversions: + + .. code-block:: bash + + export SINGULARITY_CACHEDIR=/shared/singularity-cache + +4. **Test Locally First** + + Test with Docker locally, then switch to Singularity for HPC deployment. + +Troubleshooting +=============== + +**Image Conversion Issues:** + +If automatic Docker→Singularity conversion fails, pre-build the image: + +.. code-block:: bash + + singularity build myimage.sif docker://python:3.10 + +Then use the local SIF file in configuration. + +**Permission Errors:** + +Singularity runs as your user. 
Ensure proper permissions on: + +* Input/output directories +* Cache directories +* Image files + +Related Documentation +===================== + +**Connectors:** + - :doc:`/reference/connectors/singularity` - Singularity connector reference + +**CWL Docker Translators:** + - :doc:`index` - CWL Docker Translators overview + - :doc:`docker` - Docker translator + - :doc:`kubernetes` - Kubernetes translator + +**User Guide:** + - :doc:`/user-guide/writing-workflows` - Writing CWL workflows + +**External Resources:** + - `Singularity Documentation `_ + - `Apptainer Documentation `_ diff --git a/docs/source/reference/glossary.rst b/docs/source/reference/glossary.rst new file mode 100644 index 000000000..24f7a5ed4 --- /dev/null +++ b/docs/source/reference/glossary.rst @@ -0,0 +1,66 @@ +======== +Glossary +======== + +.. meta:: + :keywords: StreamFlow, glossary, terms, definitions, terminology + :description: Complete glossary of StreamFlow terminology and concepts + +Overview +======== + +This glossary provides definitions for key terms and concepts used throughout StreamFlow documentation. + +Terms +===== + +.. glossary:: + + Binding + Associates a workflow step with a deployment target, specifying where the step should execute. + + Connector + A StreamFlow component that interfaces with an execution environment (Docker, Kubernetes, HPC, etc.). + + CWL + Common Workflow Language - a standard for describing computational workflows. + + Deployment + An execution environment configured in StreamFlow where workflow tasks can run. + + Location + The lowest level in StreamFlow's three-tier execution model (Deployment → Service → Location). + + Port + An input or output interface of a workflow step that handles data flow. + + Scheduler + Component responsible for assigning workflow tasks to available resources. + + Service + The middle level in StreamFlow's execution model, representing a logical grouping of locations. + + Step + An individual task or operation in a workflow. 
+ + StreamFlowContext + The central coordinator managing all StreamFlow components and their lifecycle. + + Target + A deployment destination specified in a binding. + + Token + A data unit flowing through workflow ports during execution. + + Workflow + A computational process defined as a directed acyclic graph of steps. + +Related Documentation +===================== + +**User Guide:** + - :doc:`/user-guide/quickstart` - Get started with StreamFlow + - :doc:`/user-guide/writing-workflows` - Workflow concepts + +**Reference:** + - :doc:`/reference/index` - Complete reference documentation diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst new file mode 100644 index 000000000..3885453c4 --- /dev/null +++ b/docs/source/reference/index.rst @@ -0,0 +1,142 @@ +========= +Reference +========= + +.. meta:: + :keywords: StreamFlow, reference, API, CLI, configuration, schema + :description: Complete reference documentation for StreamFlow: CLI commands, configuration schemas, connector reference, and API documentation + +Overview +======== + +The Reference section provides comprehensive technical documentation for StreamFlow. This includes command-line interface details, configuration schemas, connector specifications, and complete API documentation. 
+ +Quick Reference +=============== + +============ ==================================== +Audience All users (lookup reference) +Purpose Detailed technical specifications +Organization By component type +Updates Generated from source code/schemas +============ ==================================== + +Using This Reference +==================== + +This reference is organized by component type: + +**Need to run a command?** + See :doc:`cli/index` for complete command-line reference + +**Configuring StreamFlow?** + See :doc:`configuration/index` for all configuration options + +**Choosing a connector?** + Browse the connector pages below for available execution environments + +**Using CWL features?** + See :doc:`cwl-support/index` for CWL-specific documentation + +**Programming with StreamFlow?** + See :doc:`api/index` for complete API documentation + +Table of Contents +================= + +.. toctree:: + :maxdepth: 2 + :titlesonly: + + cli/index + configuration/index + connectors/index + cwl-docker-translators/index + glossary + +Quick Links +=========== + +**Most Common References:** + +* :doc:`cli/run` - Run workflows +* :doc:`configuration/streamflow-yml` - The StreamFlow file +* :doc:`connectors/docker` - Docker connector +* :doc:`connectors/slurm` - Slurm connector +* :doc:`glossary` - Term definitions + +**Connector Reference:** + +* **Container:** :doc:`connectors/docker`, :doc:`connectors/singularity` +* **Cloud:** :doc:`connectors/kubernetes` +* **HPC:** :doc:`connectors/ssh`, :doc:`connectors/slurm`, :doc:`connectors/pbs` + +**CWL-Specific:** + +* :doc:`cwl-docker-translators/index` - Docker requirement handling +* :doc:`cwl-docker-translators/docker` - Docker translator +* :doc:`cwl-docker-translators/kubernetes` - Kubernetes translator +* :doc:`cwl-docker-translators/singularity` - Singularity translator + +Related Documentation +===================== + +**User Guide:** + For tutorials and usage examples: + + - :doc:`/user-guide/quickstart` - Get started 
in 10 minutes + - :doc:`/user-guide/installation` - Installation instructions + - :doc:`/user-guide/writing-workflows` - Writing CWL workflows + +**Developer Guide:** + For extending StreamFlow: + + - :doc:`/developer-guide/extension-points/index` - Creating plugins + +Finding Information +=================== + +**By Task:** + +* **Installing:** :doc:`/user-guide/installation` +* **Running workflows:** :doc:`cli/index` +* **Configuring deployments:** :doc:`configuration/index` +* **Inspecting results:** :doc:`cli/index` + +**By Connector Type:** + +* **Container:** :doc:`connectors/docker`, :doc:`connectors/singularity` +* **Cloud:** :doc:`connectors/kubernetes` +* **HPC:** :doc:`connectors/ssh`, :doc:`connectors/slurm`, :doc:`connectors/pbs` +* **Schedulers:** :doc:`/developer-guide/extension-points/scheduler` +* **Data Managers:** :doc:`/developer-guide/extension-points/data-manager` +* **Databases:** :doc:`/developer-guide/extension-points/database` + +**By Format:** + +* **Configuration Schemas:** :doc:`configuration/index` +* **Python API:** :doc:`api/index` +* **CLI Commands:** :doc:`cli/index` + +Conventions Used +================ + +**Configuration Examples:** + +All configuration examples use YAML format for streamflow.yml files and follow the schema defined in :doc:`configuration/streamflow-yml`. + +**Command Examples:** + +Command-line examples show the prompt with ``$`` and use ``streamflow`` as the command name. Commands are assumed to run from the project directory. + +**Code Examples:** + +Python code examples use type hints and follow StreamFlow's coding standards documented in :doc:`/developer-guide/code-style`. + +Need Help? 
+==========
+
+* **Can't find something?** Check :doc:`glossary` for terminology
+* **Configuration not working?** Validate with ``streamflow schema``
+* **CLI questions?** Run ``streamflow --help`` or ``streamflow <command> --help``
+* **API questions?** See :doc:`api/index` for complete documentation
diff --git a/docs/source/user-guide/advanced-patterns/index.rst b/docs/source/user-guide/advanced-patterns/index.rst
new file mode 100644
index 000000000..abf62d7bd
--- /dev/null
+++ b/docs/source/user-guide/advanced-patterns/index.rst
@@ -0,0 +1,87 @@
+=================
+Advanced Patterns
+=================
+
+.. meta::
+   :keywords: StreamFlow, advanced, patterns, multiple targets, port targets, stacked locations
+   :description: Advanced StreamFlow configuration patterns for complex workflow scenarios
+
+Overview
+========
+
+This section covers advanced StreamFlow binding patterns that enable sophisticated workflow configurations. These patterns are useful for complex scenarios requiring fine-grained control over task placement and data locality.
+
+Patterns Covered
+================
+
+.. toctree::
+   :maxdepth: 2
+   :titlesonly:
+
+   multiple-targets
+   port-targets
+   stacked-locations
+
+When to Use Advanced Patterns
+==============================
+
+**Multiple Targets:**
+   Use when a workflow step should execute on different deployment targets based on runtime conditions or data characteristics.
+
+   *Example:* Process different data subsets on different clouds, or route tasks based on size/type.
+
+**Port Targets:**
+   Use when input/output data resides on specific deployments separate from where computation occurs.
+
+   *Example:* Data stored in cloud storage while computation happens on HPC, or distributed data sources.
+
+**Stacked Locations:**
+   Use when you need to wrap one deployment inside another, creating layered execution environments.
+
+   *Example:* Run Singularity containers inside Slurm jobs accessed via SSH, or Docker-in-Docker scenarios.
+ +Pattern Selection Guide +======================= + +============================================ ==================================== +Scenario Pattern to Use +============================================ ==================================== +Different clouds for different tasks :doc:`multiple-targets` +Load balancing across resources :doc:`multiple-targets` +Data on storage, compute elsewhere :doc:`port-targets` +Optimize data transfer costs :doc:`port-targets` +Container in batch scheduler in remote host :doc:`stacked-locations` +Complex multi-layer environments :doc:`stacked-locations` +============================================ ==================================== + +Prerequisites +============= + +Before exploring advanced patterns, ensure you understand: + +* :doc:`../binding-workflows` - Basic binding concepts +* :doc:`../configuring-deployments` - Deployment configuration +* :doc:`/reference/configuration/binding-config` - Binding schema reference + +Related Topics +============== + +**User Guide:** + - :doc:`../binding-workflows` - Basic binding patterns + - :doc:`../configuring-deployments` - Deployment setup + +**Reference:** + - :doc:`/reference/configuration/binding-config` - Complete binding schema + - :doc:`/developer-guide/extension-points/binding-filter` - Custom filters + +**Examples:** + Working examples for each pattern are available in the ``docs/examples/advanced/`` directory. 
+ +Next Steps +========== + +Choose the pattern that matches your use case: + +* :doc:`multiple-targets` - Multiple deployment targets per step +* :doc:`port-targets` - Data placement separate from computation +* :doc:`stacked-locations` - Nested deployment environments diff --git a/docs/source/user-guide/advanced-patterns/multiple-targets.rst b/docs/source/user-guide/advanced-patterns/multiple-targets.rst new file mode 100644 index 000000000..f3d3cad79 --- /dev/null +++ b/docs/source/user-guide/advanced-patterns/multiple-targets.rst @@ -0,0 +1,644 @@ +================ +Multiple Targets +================ + +.. meta:: + :keywords: StreamFlow, multiple targets, binding filters, load balancing, scheduling + :description: Learn how to bind workflow steps to multiple execution targets for load balancing and flexibility + +Overview +======== + +StreamFlow allows binding workflow steps to multiple execution targets, enabling load balancing, failover, and flexible resource allocation. This pattern is especially useful for scatter operations and workflows with variable workloads. + +Use Cases +========= + +======================== ======================================== +Scenario Benefit +======================== ======================================== +**Scatter Operations** Distribute parallel tasks across clusters +**Load Balancing** Spread work across multiple resources +**Failover** Use backup resources if primary unavailable +**Hybrid Execution** Use mix of cloud and HPC resources +**Cost Optimization** Use cheaper resources when available +======================== ======================================== + +Basic Multiple Target Binding +============================== + +Simple Configuration +-------------------- + +Bind a step to multiple deployments: + +.. 
code-block:: yaml + :caption: Multiple targets configuration + + version: v1.0 + + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /process + target: + - deployment: cluster-1 + - deployment: cluster-2 + - deployment: cluster-3 + + deployments: + cluster-1: + type: slurm + config: + hostname: hpc1.example.edu + username: user + sshKey: ~/.ssh/id_rsa + + cluster-2: + type: slurm + config: + hostname: hpc2.example.edu + username: user + sshKey: ~/.ssh/id_rsa + + cluster-3: + type: slurm + config: + hostname: hpc3.example.edu + username: user + sshKey: ~/.ssh/id_rsa + +**Behavior:** + +* StreamFlow scheduler evaluates targets in order +* Tasks scheduled to first available target +* Subsequent tasks may use different targets + +Scatter with Multiple Targets +------------------------------ + +Particularly useful for scatter operations: + +.. code-block:: yaml + :caption: workflow.cwl - Scatter workflow + + cwlVersion: v1.2 + class: Workflow + + requirements: + ScatterFeatureRequirement: {} + + inputs: + input_files: File[] + + outputs: + results: + type: File[] + outputSource: process/output + + steps: + process: + run: process-tool.cwl + scatter: input_file + in: + input_file: input_files + out: [output] + +.. code-block:: yaml + :caption: streamflow.yml - Bind to multiple targets + + bindings: + - step: /process + target: + - deployment: cluster-1 + - deployment: cluster-2 + - deployment: cluster-3 + +**Result:** + +* If ``input_files`` has 30 items, creates 30 scattered tasks +* Tasks distributed across all three clusters +* Maximizes resource utilization + +Service-Level Multiple Targets +=============================== + +Target multiple services within or across deployments: + +.. 
code-block:: yaml + :caption: Multiple service targets + + bindings: + - step: /compute + target: + - deployment: k8s-cluster + service: cpu-workers + - deployment: k8s-cluster + service: gpu-workers + - deployment: hpc-cluster + service: compute-nodes + + deployments: + k8s-cluster: + type: kubernetes + config: + kubeconfig: ~/.kube/config + services: + cpu-workers: + replicas: 10 + gpu-workers: + replicas: 2 + + hpc-cluster: + type: slurm + config: + hostname: hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + services: + compute-nodes: + partition: standard + nodes: 4 + +Binding Filters +=============== + +Filters control how StreamFlow selects among multiple targets. + +Default Behavior +---------------- + +Without filters, targets are evaluated in order of appearance: + +.. code-block:: yaml + + bindings: + - step: /process + target: + - deployment: cluster-1 # Tried first + - deployment: cluster-2 # Tried second + - deployment: cluster-3 # Tried third + +**Selection Logic:** + +1. Try cluster-1 +2. If unavailable/busy, try cluster-2 +3. If unavailable/busy, try cluster-3 +4. If all unavailable, wait and retry + +Shuffle Filter +-------------- + +Randomize target evaluation order: + +.. code-block:: yaml + :caption: Shuffle filter + + bindings: + - step: /process + target: + - deployment: cluster-1 + - deployment: cluster-2 + - deployment: cluster-3 + filters: + - shuffle + +**Benefits:** + +* Distributes load randomly +* Prevents overloading first target +* Better load balancing for bursty workloads + +**Use Cases:** + +* Multiple equivalent resources +* Load balancing across identical clusters +* Avoiding hotspots + +Multiple Filters +---------------- + +Apply multiple filters in sequence: + +.. 
code-block:: yaml + :caption: Multiple filters + + bindings: + - step: /process + target: + - deployment: cluster-1 + - deployment: cluster-2 + - deployment: cluster-3 + - deployment: cluster-4 + filters: + - filter-type-1 + - filter-type-2 + - shuffle + +Filters are applied in order, each transforming the target list. + +Custom Filters +-------------- + +Create custom binding filters for advanced selection logic. See :doc:`/developer-guide/extension-points/binding-filter`. + +**Example Custom Logic:** + +* Select based on current queue wait times +* Prefer targets with local data +* Use cost-based selection +* Time-of-day routing + +Practical Examples +================== + +Hybrid Cloud-HPC +---------------- + +.. code-block:: yaml + :caption: Hybrid execution + + workflows: + data-pipeline: + type: cwl + config: + file: pipeline.cwl + bindings: + # Lightweight preprocessing on cloud + - step: /preprocess + target: + - deployment: aws-cluster + - deployment: gcp-cluster + filters: + - shuffle + + # Heavy computation on HPC + - step: /compute + target: + - deployment: hpc-primary + - deployment: hpc-backup + + # Visualization back on cloud + - step: /visualize + target: + - deployment: aws-cluster + + deployments: + aws-cluster: + type: kubernetes + config: { ... } + + gcp-cluster: + type: kubernetes + config: { ... } + + hpc-primary: + type: slurm + config: { ... } + + hpc-backup: + type: slurm + config: { ... } + +**Strategy:** + +* Preprocessing uses either cloud provider +* Computation uses HPC with backup +* Visualization returns to cloud + +Cost-Optimized Execution +------------------------- + +.. code-block:: yaml + :caption: Cost optimization + + bindings: + - step: /analysis + target: + - deployment: on-premise # Free + - deployment: spot-instances # Cheap + - deployment: on-demand # Expensive backup + + deployments: + on-premise: + type: local + + spot-instances: + type: kubernetes + config: + # Kubernetes with spot/preemptible instances + ... 
+ + on-demand: + type: kubernetes + config: + # Kubernetes with on-demand instances + ... + +**Cost Strategy:** + +1. Use free on-premise if available +2. Use cheap spot instances +3. Fall back to expensive on-demand only if needed + +Geographic Distribution +----------------------- + +.. code-block:: yaml + :caption: Geographic targets + + bindings: + - step: /process + target: + - deployment: us-east + - deployment: us-west + - deployment: eu-central + - deployment: asia-pacific + filters: + - shuffle + + deployments: + us-east: + type: kubernetes + config: + kubeconfig: ~/.kube/config-us-east + + us-west: + type: kubernetes + config: + kubeconfig: ~/.kube/config-us-west + + eu-central: + type: kubernetes + config: + kubeconfig: ~/.kube/config-eu + + asia-pacific: + type: kubernetes + config: + kubeconfig: ~/.kube/config-apac + +**Benefits:** + +* Global load distribution +* Reduced latency for distributed data +* Regulatory compliance (data locality) + +Advanced Patterns +================= + +Tiered Resource Strategy +------------------------- + +.. code-block:: yaml + :caption: Resource tiers + + bindings: + - step: /light_task + target: + - deployment: small-vms + - deployment: medium-vms + - deployment: large-vms + + - step: /medium_task + target: + - deployment: medium-vms + - deployment: large-vms + + - step: /heavy_task + target: + - deployment: large-vms + - deployment: gpu-nodes + +**Strategy:** Match task requirements to appropriate resources. + +Per-Step Multiple Targets +-------------------------- + +Different steps use different target sets: + +.. 
code-block:: yaml + + bindings: + # I/O intensive: use fast storage nodes + - step: /read_data + target: + - deployment: storage-node-1 + - deployment: storage-node-2 + + # CPU intensive: use compute nodes + - step: /compute + target: + - deployment: compute-node-1 + - deployment: compute-node-2 + - deployment: compute-node-3 + + # Memory intensive: use bigmem nodes + - step: /analyze + target: + - deployment: bigmem-node-1 + - deployment: bigmem-node-2 + +Conditional Targeting +--------------------- + +While StreamFlow doesn't support conditional bindings directly, use CWL conditional execution with multiple target bindings: + +.. code-block:: yaml + :caption: workflow.cwl with conditions + + cwlVersion: v1.2 + class: Workflow + requirements: + InlineJavascriptRequirement: {} + + inputs: + use_gpu: boolean + data: File + + steps: + cpu_process: + when: $(inputs.use_gpu == false) + run: cpu-tool.cwl + in: + use_gpu: use_gpu + input: data + out: [output] + + gpu_process: + when: $(inputs.use_gpu == true) + run: gpu-tool.cwl + in: + use_gpu: use_gpu + input: data + out: [output] + +.. code-block:: yaml + :caption: streamflow.yml with separate bindings + + bindings: + - step: /cpu_process + target: + - deployment: cpu-cluster-1 + - deployment: cpu-cluster-2 + + - step: /gpu_process + target: + - deployment: gpu-cluster-1 + - deployment: gpu-cluster-2 + +Monitoring and Debugging +========================= + +Track Target Usage +------------------ + +Generate reports to see which targets were used: + +.. code-block:: bash + + streamflow report workflow-name --format json | \ + jq '.steps[] | {step: .name, location: .location}' + +Debug Target Selection +---------------------- + +Enable debug logging to see selection decisions: + +.. code-block:: bash + + streamflow run streamflow.yml --debug + +Logs show: + +* Target evaluation order +* Why targets were selected/rejected +* Load balancing decisions + +Best Practices +============== + +1. 
**Use Shuffle for Equivalent Targets** + + .. code-block:: yaml + + # Good for load balancing + target: + - deployment: node-1 + - deployment: node-2 + - deployment: node-3 + filters: + - shuffle + +2. **Order by Preference Without Shuffle** + + .. code-block:: yaml + + # Prefer fast-cluster, fall back to others + target: + - deployment: fast-cluster + - deployment: medium-cluster + - deployment: slow-cluster + +3. **Match Resources to Tasks** + + Don't bind lightweight tasks to expensive GPU clusters. + +4. **Consider Data Locality** + + Prefer targets where data already resides: + + .. code-block:: yaml + + bindings: + - step: /process + target: + - deployment: hpc-with-data + - deployment: cloud-cluster + - port: /input_data + target: + deployment: hpc-with-data + workdir: /data + +5. **Test Failover Behavior** + + Verify workflow continues if a target becomes unavailable. + +6. **Monitor Resource Utilization** + + Use reports to verify load is distributed as expected. + +Limitations +=========== + +**No User-Defined Selection Logic:** + +StreamFlow doesn't support custom selection logic in configuration. Use binding filter plugins for advanced selection. + +**All Targets Must Support Step:** + +All targets must have required tools and environment for the step. + +**No Dynamic Target Addition:** + +Target list is fixed at workflow start. Cannot add targets during execution. 
+ +Troubleshooting +=============== + +All Targets Unavailable +----------------------- + +**Problem:** ``No available targets for step`` + +**Solution:** + +* Check deployment connectivity +* Verify resource availability +* Review logs for specific failures +* Add more targets + +Uneven Load Distribution +------------------------ + +**Problem:** Some targets overloaded, others idle + +**Solution:** + +* Add shuffle filter +* Check target capabilities +* Review scheduling logs +* Verify targets have equal capacity + +Tasks Only Use First Target +---------------------------- + +**Problem:** All tasks scheduled to first target + +**Cause:** First target has excess capacity + +**Solution:** + +* Add shuffle filter for random distribution +* Reduce target capacity to force spillover +* Use custom binding filter + +Next Steps +========== + +After mastering multiple targets: + +* :doc:`port-targets` - Advanced port binding patterns +* :doc:`stacked-locations` - Complex deployment stacking +* :doc:`/developer-guide/extension-points/binding-filter` - Create custom filters +* :doc:`/reference/configuration/binding-config` - Complete binding reference + +Related Topics +============== + +* :doc:`/user-guide/binding-workflows` - Basic binding concepts +* :doc:`/user-guide/configuring-deployments` - Deployment configuration +* :doc:`/developer-guide/core-interfaces/scheduling` - Scheduling internals diff --git a/docs/source/user-guide/advanced-patterns/port-targets.rst b/docs/source/user-guide/advanced-patterns/port-targets.rst new file mode 100644 index 000000000..7bcb9a68b --- /dev/null +++ b/docs/source/user-guide/advanced-patterns/port-targets.rst @@ -0,0 +1,720 @@ +============ +Port Targets +============ + +.. 
meta:: + :keywords: StreamFlow, port binding, data staging, remote files, input output + :description: Learn how to bind input/output ports to specific locations for optimal data management + +Overview +======== + +Port targets allow you to specify where StreamFlow should look for input files or place output files, overriding default behavior. This is essential for workflows with data already on remote systems or when you need to control data placement. + +When to Use Port Targets +========================= + +Use port targets when: + +========================== ======================================== +Scenario Solution +========================== ======================================== +Data on Remote System Avoid transferring large datasets +Shared Storage Access files on shared filesystem +Output Placement Write results to specific location +Data Locality Execute where data resides +Archive Integration Read from/write to archive storage +========================== ======================================== + +Default Behavior +================ + +Without port targets: + +**Input Files:** + StreamFlow looks for inputs in the local filesystem + +**Output Files:** + StreamFlow retrieves outputs from the execution location + +**Intermediate Files:** + StreamFlow transfers files between locations as needed + +Port Target Syntax +================== + +Basic Structure +--------------- + +.. 
code-block:: yaml + :caption: Port target configuration + + bindings: + - port: /path/to/port + target: + deployment: deployment-name + workdir: /remote/directory + +**Required Fields:** + +* ``port`` - POSIX-like path to the port +* ``target.deployment`` - Deployment where files are located +* ``target.workdir`` - Base directory for files + +Port Path Format +---------------- + +Ports use POSIX-like paths: + +================== ======================================== +Path Description +================== ======================================== +``/input_file`` Workflow input port +``/step/in_port`` Input port of a step +``/step/out_port`` Output port of a step +================== ======================================== + +**Path Structure:** ``/[step-name]/port-name`` + +**Root-level ports:** Omit step name (``/port-name``) + +Basic Examples +============== + +Remote Input File +----------------- + +Data already exists on HPC system: + +.. code-block:: yaml + :caption: Access remote input file + + version: v1.0 + + workflows: + process-data: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /process + target: + deployment: hpc-cluster + + - port: /input_data + target: + deployment: hpc-cluster + workdir: /scratch/user/datasets + + deployments: + hpc-cluster: + type: slurm + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + +**Behavior:** + +* ``/input_data`` file is accessed directly on HPC +* No transfer from local machine +* Step executes on HPC where data resides + +Remote Output Location +---------------------- + +Write results to specific remote location: + +.. code-block:: yaml + :caption: Direct output to archive + + bindings: + - step: /analyze + target: + deployment: compute-cluster + + - port: /results + target: + deployment: archive-storage + workdir: /archive/project-123/results + + deployments: + compute-cluster: + type: kubernetes + config: { ... 
} + + archive-storage: + type: ssh + config: + hostname: archive.example.com + username: archiver + sshKey: ~/.ssh/archive_key + +**Behavior:** + +* Workflow executes on compute-cluster +* Final results are placed directly on archive-storage +* No intermediate transfer through local machine + +Step-Specific Port Targets +=========================== + +Input Port of Specific Step +---------------------------- + +.. code-block:: yaml + :caption: Step input from remote + + workflows: + compile-workflow: + type: cwl + config: + file: compile.cwl + bindings: + - step: /compile + target: + deployment: build-server + + - port: /compile/src + target: + deployment: source-repository + workdir: /repos/project/src + +**CWL Workflow:** + +.. code-block:: yaml + :caption: compile.cwl + + cwlVersion: v1.2 + class: Workflow + + inputs: + tarball: File + source_file: string + + steps: + untar: + run: untar.cwl + in: + archive: tarball + out: [files] + + compile: + run: javac.cwl + in: + src: untar/files # This input + out: [classfile] + +**Result:** + +* The ``src`` input port of the ``compile`` step reads from ``source-repository`` +* Other inputs use default behavior + +Output Port of Specific Step +----------------------------- + +.. code-block:: yaml + :caption: Step output to specific location + + bindings: + - step: /process + target: + deployment: compute-cluster + + - port: /process/results + target: + deployment: results-storage + workdir: /results/experiment-42 + +Complex Data Flow +================= + +Multi-Step with Different Locations +------------------------------------ + +.. 
code-block:: yaml + :caption: Complex data flow + + workflows: + pipeline: + type: cwl + config: + file: pipeline.cwl + bindings: + # Steps + - step: /preprocess + target: + deployment: cloud-cluster + + - step: /analyze + target: + deployment: hpc-cluster + + - step: /visualize + target: + deployment: local + + # Ports + - port: /raw_data + target: + deployment: hpc-storage + workdir: /data/raw + + - port: /analyze/preprocessed + target: + deployment: hpc-storage + workdir: /data/preprocessed + + - port: /final_plots + target: + deployment: web-server + workdir: /var/www/html/plots + + deployments: + cloud-cluster: + type: kubernetes + config: { ... } + + hpc-cluster: + type: slurm + config: { ... } + + hpc-storage: + type: ssh + config: + hostname: storage.hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + + local: + type: local + + web-server: + type: ssh + config: + hostname: webserver.example.com + username: www-data + sshKey: ~/.ssh/web_key + +**Data Flow:** + +1. ``/raw_data`` read from HPC storage +2. ``/preprocess`` executes on cloud, writes to HPC storage +3. ``/analyze`` reads from HPC storage, executes on HPC +4. ``/visualize`` executes locally +5. ``/final_plots`` written to web server + +Avoiding Large Transfers +------------------------- + +.. code-block:: yaml + :caption: Minimize data movement + + bindings: + # Execute where data is + - step: /process_large_dataset + target: + deployment: data-center + + # Data already there + - port: /large_dataset + target: + deployment: data-center + workdir: /mnt/datasets/project + + # Results stay there + - port: /processed_results + target: + deployment: data-center + workdir: /mnt/results/project + +**Benefits:** + +* No transfer of multi-TB dataset to local machine +* No transfer back to storage +* Execution happens where data resides + +Shared Filesystem Scenarios +============================ + +HPC with Shared Storage +----------------------- + +Common HPC pattern with shared filesystem: + +.. 
code-block:: yaml + :caption: HPC shared storage + + deployments: + hpc-login: + type: ssh + config: + hostname: login.hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + + hpc-compute: + type: slurm + config: { ... } + wraps: hpc-login + + bindings: + - step: /compute + target: + deployment: hpc-compute + + # Data on shared filesystem + - port: /input_data + target: + deployment: hpc-login + workdir: /home/user/data + + - port: /results + target: + deployment: hpc-login + workdir: /home/user/results + +**Shared Filesystem:** + +* ``/home/user`` visible to both login and compute nodes +* No data transfer needed between locations +* StreamFlow recognizes shared storage + +Network Filesystem (NFS) +------------------------- + +.. code-block:: yaml + :caption: NFS-mounted storage + + bindings: + - step: /process + target: + deployment: worker-nodes + + - port: /nfs_data + target: + deployment: nfs-server + workdir: /exports/shared/data + +Multiple Input Ports +===================== + +Different inputs from different locations: + +.. code-block:: yaml + :caption: Multiple input sources + + workflows: + merge-data: + type: cwl + config: + file: merge.cwl + bindings: + - step: /merge + target: + deployment: processor + + - port: /dataset_a + target: + deployment: source-a + workdir: /data/a + + - port: /dataset_b + target: + deployment: source-b + workdir: /data/b + + - port: /reference + target: + deployment: reference-db + workdir: /db/reference + +**CWL Workflow:** + +.. code-block:: yaml + + inputs: + dataset_a: File + dataset_b: File + reference: File + + steps: + merge: + run: merge-tool.cwl + in: + input_a: dataset_a + input_b: dataset_b + ref: reference + out: [merged] + +Secondary Files +=============== + +Handle index files and companions: + +.. code-block:: yaml + :caption: Port with secondary files + + bindings: + - port: /reference_genome + target: + deployment: genomics-data + workdir: /genomes + +**CWL Input with Secondary Files:** + +.. 
code-block:: yaml + + inputs: + reference_genome: + type: File + secondaryFiles: + - .fai + - .amb + - .ann + +**Behavior:** + +* Main file: ``/genomes/hg38.fa`` +* Secondary files automatically found: + + * ``/genomes/hg38.fa.fai`` + * ``/genomes/hg38.fa.amb`` + * ``/genomes/hg38.fa.ann`` + +Array Ports +=========== + +Port binding with array inputs: + +.. code-block:: yaml + :caption: Array port target + + bindings: + - port: /input_files + target: + deployment: file-server + workdir: /data/inputs + +**CWL Array Input:** + +.. code-block:: yaml + + inputs: + input_files: + type: File[] + +**Behavior:** + +All files in the array are accessed from the specified location. + +Best Practices +============== + +1. **Bind Step and Port Together** + + Execute where data resides: + + .. code-block:: yaml + + bindings: + - step: /process + target: + deployment: hpc-cluster + - port: /input + target: + deployment: hpc-cluster + workdir: /data + +2. **Use Absolute Paths** + + Specify full paths in ``workdir``: + + .. code-block:: yaml + + # Good + workdir: /home/user/data + + # Avoid + workdir: data # Relative path + +3. **Document Data Locations** + + Comment your port bindings: + + .. code-block:: yaml + + bindings: + # Large genomics dataset stored on HPC shared filesystem + - port: /genome_data + target: + deployment: hpc-storage + workdir: /gpfs/genomics/references + +4. **Consider Data Lifecycle** + + Plan where data is at each workflow stage. + +5. **Test File Accessibility** + + Verify files exist at specified locations: + + .. code-block:: bash + + ssh user@host ls -la /data/input.txt + +Troubleshooting +=============== + +File Not Found +-------------- + +**Problem:** ``No such file or directory`` + +**Solutions:** + +1. **Verify file exists at remote location:** + + .. code-block:: bash + + ssh user@hostname ls -la /workdir/file.txt + +2. **Check path is absolute:** + + .. code-block:: yaml + + workdir: /absolute/path # Not relative + +3. 
**Verify permissions:** + + .. code-block:: bash + + ssh user@hostname test -r /workdir/file.txt && echo "readable" + +4. **Check CWL input name matches:** + + Port name must match CWL workflow input name. + +Unexpected Transfer +------------------- + +**Problem:** StreamFlow still transfers files + +**Cause:** Port binding not recognized + +**Solutions:** + +* Verify port path matches CWL definition +* Check spelling of port name +* Ensure ``workdir`` is correct + +Permission Denied +----------------- + +**Problem:** ``Permission denied`` accessing remote file + +**Solutions:** + +* Verify SSH key has access +* Check file permissions on remote system +* Ensure user has read/write permissions +* Test manually: ``ssh user@host cat /path/to/file`` + +Wrong Deployment +---------------- + +**Problem:** Port bound to wrong deployment + +**Solution:** + +Carefully check deployment names: + +.. code-block:: yaml + + # Deployment definition + deployments: + hpc-storage: # Name here + type: ssh + config: { ... } + + # Port binding - must match + bindings: + - port: /data + target: + deployment: hpc-storage # Must match exactly + +Examples +======== + +Genomics Pipeline +----------------- + +.. code-block:: yaml + :caption: Genomics workflow with port targets + + bindings: + - step: /align + target: + deployment: hpc-cluster + + - step: /call_variants + target: + deployment: hpc-cluster + + # Reference genome on shared storage + - port: /reference + target: + deployment: hpc-storage + workdir: /genomes/hg38 + + # Raw reads on shared storage + - port: /reads + target: + deployment: hpc-storage + workdir: /sequencing/project-123/raw + + # Results to project directory + - port: /variants + target: + deployment: hpc-storage + workdir: /sequencing/project-123/results + +Machine Learning Training +-------------------------- + +.. 
code-block:: yaml + :caption: ML training with remote data + + bindings: + - step: /train_model + target: + deployment: gpu-cluster + + # Training data on fast storage + - port: /training_data + target: + deployment: nvme-storage + workdir: /fast-storage/datasets/imagenet + + # Checkpoints to persistent storage + - port: /checkpoints + target: + deployment: persistent-storage + workdir: /models/experiment-42/checkpoints + +Next Steps +========== + +After mastering port targets: + +* :doc:`stacked-locations` - Complex deployment hierarchies +* :doc:`multiple-targets` - Multiple target strategies +* :doc:`/user-guide/configuring-deployments` - Deployment details +* :doc:`/developer-guide/core-interfaces/data` - Data management internals + +Related Topics +============== + +* :doc:`/user-guide/binding-workflows` - Basic binding concepts +* :doc:`/user-guide/writing-workflows` - CWL workflow syntax +* :doc:`/reference/configuration/binding-config` - Complete binding reference diff --git a/docs/source/user-guide/advanced-patterns/stacked-locations.rst b/docs/source/user-guide/advanced-patterns/stacked-locations.rst new file mode 100644 index 000000000..5b08d630c --- /dev/null +++ b/docs/source/user-guide/advanced-patterns/stacked-locations.rst @@ -0,0 +1,1044 @@ +================= +Stacked Locations +================= + +.. meta:: + :keywords: StreamFlow, stacked locations, wraps, deployment hierarchy, HPC, containers + :description: Learn how to create complex execution environments by stacking deployments using the wraps directive + +Overview +======== + +Stacked locations enable you to describe complex, layered execution environments by composing multiple deployments. For example, you can run Singularity containers inside Slurm jobs accessed through SSH—a common pattern in HPC environments. 
+ +Understanding Stacking +====================== + +**Stacking Concept:** + +Deployments can "wrap" other deployments, creating execution hierarchies that match real-world infrastructure. + +**Common Pattern:** + +``Container → Queue Manager → SSH → Local`` + +**Benefits:** + +* Separation of concerns (networking vs. scheduling vs. environment) +* Reusable deployment definitions +* Match actual infrastructure topology +* Simplify complex configurations + +When to Use Stacked Locations +============================== + +============================ ======================================== +Scenario Example +============================ ======================================== +**HPC Access** SSH to login node, submit to queue manager +**Containerized HPC** Containers launched by queue managers +**Complex Microservices** Target specific services in Docker Compose +**Multi-Hop Access** Jump hosts to reach compute resources +**Environment Layering** Python virtualenv in Singularity on Slurm +============================ ======================================== + +Basic Stacking with wraps +========================== + +Simple Two-Layer Stack +---------------------- + +Connect to HPC via SSH, then submit to Slurm: + +.. code-block:: yaml + :caption: SSH wrapping example + + version: v1.0 + + workflows: + compute-job: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /compute + target: + deployment: slurm-hpc + + deployments: + ssh-hpc: + type: ssh + config: + hostname: login.hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + maxConnections: 5 + + slurm-hpc: + type: slurm + config: + partition: compute + nodes: 1 + ntasks: 16 + wraps: ssh-hpc + +**Execution Flow:** + +1. StreamFlow opens SSH connection to ``login.hpc.edu`` +2. Through SSH, submits Slurm job to ``compute`` partition +3. Job executes on compute node +4. Results return through SSH connection + +**Key Point:** The ``wraps`` directive tells StreamFlow that ``slurm-hpc`` wraps ``ssh-hpc``. 
+ +Three-Layer Stack +----------------- + +Singularity container in Slurm job via SSH: + +.. code-block:: yaml + :caption: Three-layer stack + + deployments: + ssh-hpc: + type: ssh + config: + hostname: login.hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + + slurm-hpc: + type: slurm + config: + partition: gpu + nodes: 1 + ntasks: 8 + gres: gpu:1 + wraps: ssh-hpc + + singularity-env: + type: singularity + config: + image: docker://tensorflow/tensorflow:latest-gpu + wraps: slurm-hpc + + workflows: + ml-training: + type: cwl + config: + file: train.cwl + bindings: + - step: /train + target: + deployment: singularity-env + +**Execution Flow:** + +1. SSH to login node (``ssh-hpc``) +2. Submit Slurm job requesting GPU (``slurm-hpc``) +3. Within job, launch Singularity container (``singularity-env``) +4. Execute training inside container + +**Result:** TensorFlow job runs in GPU-enabled container on Slurm-managed node. + +Practical Examples +================== + +Classic HPC Pattern +------------------- + +Most HPC facilities use this structure: + +.. 
code-block:: yaml + :caption: Standard HPC configuration + + deployments: + # Network layer: Access to HPC + hpc-login: + type: ssh + config: + hostname: login.supercomputer.edu + username: researcher + sshKey: ~/.ssh/hpc_key + maxConnections: 10 + # Connection through firewall/VPN + proxyJump: gateway.university.edu + + # Scheduling layer: Resource management + hpc-scheduler: + type: slurm + config: + partition: high-memory + nodes: 2 + ntasksPerNode: 32 + mem: 256GB + time: 24:00:00 + account: project-12345 + wraps: hpc-login + + # Environment layer: Software stack + hpc-container: + type: singularity + config: + image: /apps/containers/bioinformatics.sif + bind: + - /scratch:/scratch + - /projects:/projects + wraps: hpc-scheduler + + workflows: + genome-assembly: + type: cwl + config: + file: assembly.cwl + bindings: + - step: /assemble + target: + deployment: hpc-container + +**Why Three Layers:** + +* **SSH Layer:** Handles network access, authentication +* **Slurm Layer:** Manages compute resources, scheduling +* **Singularity Layer:** Provides consistent software environment + +**Best Practice:** Keep each layer focused on single responsibility. + +Docker Compose with Slurm +-------------------------- + +Run Slurm jobs that target services in Docker Compose: + +.. code-block:: yaml + :caption: Slurm wrapping Docker Compose service + + deployments: + # Complex deployment: microservices architecture + microservices: + type: docker-compose + config: + file: docker-compose.yml + + # Target specific service for compute jobs + batch-processor: + type: slurm + config: + partition: batch + nodes: 1 + ntasks: 4 + wraps: + deployment: microservices + service: controller + +**docker-compose.yml:** + +.. 
code-block:: yaml + :caption: Docker Compose with multiple services + + version: '3' + services: + controller: + image: batch-controller:latest + ports: + - "5000:5000" + + database: + image: postgres:13 + volumes: + - db-data:/var/lib/postgresql/data + + worker: + image: batch-worker:latest + depends_on: + - database + +**Behavior:** + +* Slurm jobs target only the ``controller`` service +* Other services (database, worker) run independently +* Enables complex architectures with targeted execution + +Kubernetes with SSH +------------------- + +Access remote Kubernetes cluster: + +.. code-block:: yaml + :caption: Kubernetes through SSH tunnel + + deployments: + k8s-gateway: + type: ssh + config: + hostname: k8s-master.cloud.example.com + username: admin + sshKey: ~/.ssh/k8s_key + + k8s-cluster: + type: kubernetes + config: + kubeconfig: /home/admin/.kube/config + namespace: streamflow-jobs + wraps: k8s-gateway + +**Use Case:** Kubernetes API not directly accessible, must tunnel through gateway. + +ConnectorWrapper Interface +========================== + +Which Connectors Support Wrapping? +----------------------------------- + +Only connectors implementing ``ConnectorWrapper`` interface support the ``wraps`` directive: + +=================== ===================== =========================== +Connector Supports wraps Default Wraps +=================== ===================== =========================== +``local`` No N/A +``ssh`` No N/A +``docker`` Yes LocalConnector +``docker-compose`` Yes LocalConnector +``kubernetes`` Yes LocalConnector +``singularity`` Yes LocalConnector +``slurm`` Yes LocalConnector +``pbs`` Yes LocalConnector +=================== ===================== =========================== + +**Important Rules:** + +1. **Only wrappers can use wraps directive** + + .. code-block:: yaml + + # ERROR: ssh does not implement ConnectorWrapper + deployments: + invalid: + type: ssh + wraps: local # Will fail during initialization + +2. **Default wrapping** + + .. 
code-block:: yaml + + # These are equivalent + deployments: + explicit: + type: docker + wraps: local + + implicit: + type: docker + # Automatically wraps LocalConnector + +3. **Single inner location** + + .. code-block:: yaml + + # ERROR: Cannot wrap multiple deployments + deployments: + invalid: + type: slurm + wraps: [ssh-1, ssh-2] # Not supported + +Multiple Wrappers, Single Inner +-------------------------------- + +One deployment can be wrapped by multiple outer deployments: + +.. code-block:: yaml + :caption: Shared SSH connection + + deployments: + # Shared inner layer + hpc-access: + type: ssh + config: + hostname: hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + + # Multiple queue managers wrap same SSH + slurm-short: + type: slurm + config: + partition: short + time: 01:00:00 + wraps: hpc-access + + slurm-long: + type: slurm + config: + partition: long + time: 48:00:00 + wraps: hpc-access + + pbs-gpu: + type: pbs + config: + queue: gpu + walltime: 24:00:00 + wraps: hpc-access + +**Benefit:** Single SSH connection shared by multiple schedulers. + +Service-Level Wrapping +====================== + +Target Specific Service +----------------------- + +Wrap individual services in complex deployments: + +.. code-block:: yaml + :caption: Service-level wrapping + + deployments: + app-stack: + type: docker-compose + config: + file: stack.yml + services: + frontend: + ports: ["80:80"] + backend: + ports: ["3000:3000"] + worker: + replicas: 3 + + # Wrap only the worker service + batch-jobs: + type: slurm + config: + partition: batch + wraps: + deployment: app-stack + service: worker + +**Use Case:** In a microservices architecture, only the worker service needs batch processing, while frontend/backend run continuously. + +Full Syntax +----------- + +.. 
code-block:: yaml + :caption: Service wrapping syntax + + deployments: + outer-deployment: + type: connector-type + config: + # Outer configuration + wraps: + deployment: inner-deployment-name + service: service-name + +**When to Use:** + +* Docker Compose with multiple services +* Kubernetes deployments with multiple containers +* Complex architectures where only specific services need wrapping + +Deployment Order +================ + +StreamFlow's DeploymentManager guarantees correct deployment order: + +.. code-block:: yaml + :caption: Complex stack + + deployments: + ssh-access: + type: ssh + config: { ... } + + slurm-scheduler: + type: slurm + config: { ... } + wraps: ssh-access + + container-env: + type: singularity + config: { ... } + wraps: slurm-scheduler + +**Deployment Order:** (innermost to outermost) + +1. ``ssh-access`` - Establish SSH connection +2. ``slurm-scheduler`` - Set up Slurm submission +3. ``container-env`` - Prepare container environment + +**Undeployment Order:** (outermost to innermost) + +1. ``container-env`` - Clean up container +2. ``slurm-scheduler`` - Cancel/cleanup Slurm jobs +3. ``ssh-access`` - Close SSH connection + +**Automatic:** StreamFlow handles ordering automatically based on ``wraps`` relationships. + +Advanced Patterns +================= + +Conditional Stacking +-------------------- + +Use different stacks for different steps: + +.. code-block:: yaml + :caption: Different stacks per step + + deployments: + ssh-hpc: + type: ssh + config: { ... 
} + + slurm-cpu: + type: slurm + config: + partition: cpu + wraps: ssh-hpc + + slurm-gpu: + type: slurm + config: + partition: gpu + gres: gpu:1 + wraps: ssh-hpc + + singularity-cpu: + type: singularity + config: + image: /apps/cpu-tools.sif + wraps: slurm-cpu + + singularity-gpu: + type: singularity + config: + image: /apps/gpu-tools.sif + nv: true # NVIDIA GPU support + wraps: slurm-gpu + + workflows: + pipeline: + type: cwl + config: + file: pipeline.cwl + bindings: + - step: /preprocess + target: + deployment: singularity-cpu + + - step: /train_model + target: + deployment: singularity-gpu + + - step: /evaluate + target: + deployment: singularity-cpu + +**Pattern:** CPU and GPU workloads use different stacks with appropriate resources. + +Reusable Base Layers +-------------------- + +Define common base layers, specialize on top: + +.. code-block:: yaml + :caption: Reusable base + + deployments: + # Base: SSH access (reusable) + hpc-ssh: + type: ssh + config: + hostname: hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + + # Specializations: Different queues + quick-jobs: + type: slurm + config: + partition: quick + time: 00:15:00 + wraps: hpc-ssh + + standard-jobs: + type: slurm + config: + partition: standard + time: 24:00:00 + wraps: hpc-ssh + + bigmem-jobs: + type: slurm + config: + partition: bigmem + mem: 500GB + wraps: hpc-ssh + +**Benefit:** Don't repeat SSH configuration for each queue. + +Migration from StreamFlow v0.1 +=============================== + +Deprecated Pattern (Still Supported) +------------------------------------- + +StreamFlow v0.1 embedded SSH config in queue manager: + +.. code-block:: yaml + :caption: Old pattern (deprecated, will be removed in v0.3) + + deployments: + slurm-hpc: + type: slurm + config: + # SSH properties directly in Slurm config + hostname: hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + # Slurm properties + partition: compute + nodes: 1 + +**Status:** Works in v0.2, but deprecated. Will be removed in v0.3. 
+ +New Pattern (Recommended) +-------------------------- + +Use explicit stacking: + +.. code-block:: yaml + :caption: New pattern (recommended) + + deployments: + ssh-hpc: + type: ssh + config: + hostname: hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + + slurm-hpc: + type: slurm + config: + partition: compute + nodes: 1 + wraps: ssh-hpc + +**Migration:** Separate SSH config into dedicated deployment, use ``wraps``. + +Best Practices +============== + +1. **Separate Concerns** + + Each layer should have single responsibility: + + .. code-block:: yaml + + # Good: Clear separation + ssh-layer: # Networking + slurm-layer: # Scheduling + container-layer: # Environment + +2. **Name Layers Descriptively** + + .. code-block:: yaml + + # Good + deployments: + hpc-ssh-access: + type: ssh + hpc-gpu-queue: + type: slurm + cuda-environment: + type: singularity + + # Avoid + deployments: + deployment1: + type: ssh + deployment2: + type: slurm + +3. **Document Stacking Relationships** + + .. code-block:: yaml + + deployments: + # Layer 1: Network access to HPC facility + hpc-login: + type: ssh + config: { ... } + + # Layer 2: Slurm scheduler (wraps hpc-login) + hpc-slurm: + type: slurm + config: { ... } + wraps: hpc-login + + # Layer 3: Singularity container (wraps hpc-slurm) + analysis-env: + type: singularity + config: { ... } + wraps: hpc-slurm + +4. **Test Each Layer Independently** + + Verify each deployment works before stacking: + + .. code-block:: bash + + # Test SSH connection + ssh -i ~/.ssh/id_rsa user@hpc.edu "hostname" + + # Test Slurm submission (through SSH) + ssh -i ~/.ssh/id_rsa user@hpc.edu "sbatch --wrap 'hostname'" + + # Test Singularity container + singularity exec container.sif command + +5. **Use Defaults Wisely** + + If wrapping local execution, omit ``wraps`` directive: + + .. code-block:: yaml + + # These are equivalent + deployments: + explicit: + type: docker + wraps: local + + implicit: + type: docker + # Implicitly wraps local + +6. 
**Reuse Base Deployments** + + Create shared base layers for consistency: + + .. code-block:: yaml + + deployments: + shared-ssh: + type: ssh + config: { ... } + + # Multiple deployments reuse base + slurm-cpu: + type: slurm + config: { ... } + wraps: shared-ssh + + slurm-gpu: + type: slurm + config: { ... } + wraps: shared-ssh + +Troubleshooting +=============== + +Invalid wraps Directive +----------------------- + +**Problem:** ``Error: Connector 'X' does not support wraps directive`` + +**Cause:** Attempting to use ``wraps`` on connector that doesn't implement ConnectorWrapper + +**Solution:** + +Check if connector supports wrapping: + +.. code-block:: yaml + + # ERROR: ssh cannot wrap + deployments: + invalid: + type: ssh + wraps: local + +**Fix:** Remove ``wraps`` directive or use connector that supports it. + +Cannot Connect Through Stack +----------------------------- + +**Problem:** Deployment succeeds but cannot execute commands + +**Cause:** Connectivity issue in one layer + +**Solution:** + +Test each layer: + +.. code-block:: bash + + # Test inner layer (SSH) + ssh user@host "echo success" + + # Test middle layer (Slurm through SSH) + ssh user@host "sinfo" # Check Slurm is accessible + + # Test outer layer (container) + ssh user@host "singularity exec image.sif echo success" + +**Check logs:** + +.. code-block:: bash + + streamflow run streamflow.yml --debug + +Look for connection failures at specific layers. + +Wrong Deployment Order +---------------------- + +**Problem:** Deployment fails with initialization errors + +**Cause:** StreamFlow may have incorrect dependency order + +**Solution:** + +Verify ``wraps`` relationships form valid tree: + +.. code-block:: yaml + + # Valid tree structure + A wraps: local + B wraps: A + C wraps: B + + # Invalid: circular + A wraps: B + B wraps: A # ERROR: circular dependency + +**Fix:** Ensure no circular dependencies in wrapping. 
+ +Service Not Found +----------------- + +**Problem:** ``Service 'X' not found in deployment 'Y'`` + +**Cause:** Service name doesn't exist in wrapped deployment + +**Solution:** + +1. **Check service name in deployment config:** + + .. code-block:: yaml + + deployments: + compose-app: + type: docker-compose + config: + file: app.yml + services: + web: # Service name + worker: # Service name + database: # Service name + +2. **Match exactly in wraps:** + + .. code-block:: yaml + + batch: + type: slurm + wraps: + deployment: compose-app + service: worker # Must match exactly + +Container Not Starting +---------------------- + +**Problem:** Container fails to start in stacked environment + +**Cause:** Resource constraints or image access issues + +**Solution:** + +1. **Check Slurm job has sufficient resources:** + + .. code-block:: yaml + + slurm-layer: + type: slurm + config: + nodes: 1 + ntasks: 8 + mem: 32GB # Ensure sufficient memory + +2. **Verify container image accessible from compute node:** + + .. code-block:: bash + + # On compute node (in Slurm job) + ssh user@host 'srun singularity exec /path/to/image.sif ls' + +3. **Check bind mounts exist:** + + .. code-block:: yaml + + singularity-layer: + type: singularity + config: + image: /apps/image.sif + bind: + - /data:/data # Verify /data exists on compute node + - /scratch:/scratch + +Real-World Examples +=================== + +Bioinformatics HPC Workflow +---------------------------- + +.. 
code-block:: yaml + :caption: Complete bioinformatics stack + + version: v1.0 + + deployments: + # Layer 1: SSH to HPC login node + bioinfo-hpc: + type: ssh + config: + hostname: login.biocompute.edu + username: researcher + sshKey: ~/.ssh/bioinfo_key + maxConnections: 5 + + # Layer 2: Slurm scheduler + slurm-highmem: + type: slurm + config: + partition: highmem + nodes: 1 + ntasksPerNode: 16 + mem: 128GB + time: 72:00:00 + account: bio-project-001 + wraps: bioinfo-hpc + + # Layer 3: Singularity with bioinformatics tools + biotools-container: + type: singularity + config: + image: /apps/containers/bioinfo-suite-2024.sif + bind: + - /gpfs/genomics:/data + - /gpfs/scratch:/scratch + wraps: slurm-highmem + + workflows: + genome-analysis: + type: cwl + config: + file: analysis.cwl + bindings: + - step: /quality-control + target: + deployment: biotools-container + + - step: /alignment + target: + deployment: biotools-container + + - step: /variant-calling + target: + deployment: biotools-container + + # Reference genome on shared storage + - port: /reference + target: + deployment: bioinfo-hpc + workdir: /gpfs/genomics/references/hg38 + +**Architecture:** + +* All tools execute in consistent Singularity environment +* Slurm manages compute resources +* SSH provides access through firewall +* Shared storage accessible at all layers + +Cloud-HPC Hybrid +---------------- + +.. 
code-block:: yaml + :caption: Hybrid cloud-HPC deployment + + deployments: + # Cloud: Direct Kubernetes access + cloud-k8s: + type: kubernetes + config: + kubeconfig: ~/.kube/config-cloud + namespace: preprocessing + + # HPC: SSH access + hpc-ssh: + type: ssh + config: + hostname: hpc.university.edu + username: user + sshKey: ~/.ssh/hpc_key + + # HPC: Slurm for computation + hpc-slurm: + type: slurm + config: + partition: compute + nodes: 4 + ntasksPerNode: 32 + wraps: hpc-ssh + + # HPC: Container environment + hpc-container: + type: singularity + config: + image: /shared/apps/compute-env.sif + wraps: hpc-slurm + + workflows: + hybrid-pipeline: + type: cwl + config: + file: pipeline.cwl + bindings: + # Light preprocessing on cloud + - step: /preprocess + target: + deployment: cloud-k8s + + # Heavy computation on HPC + - step: /simulate + target: + deployment: hpc-container + + # Visualization back on cloud + - step: /visualize + target: + deployment: cloud-k8s + +**Pattern:** + +* Cloud (single layer): Kubernetes for lightweight tasks +* HPC (three layers): SSH → Slurm → Singularity for heavy computation +* Data automatically transferred between environments + +Next Steps +========== + +After mastering stacked locations: + +* :doc:`multiple-targets` - Distribute work across resources +* :doc:`port-targets` - Control data flow between layers +* :doc:`/developer-guide/extension-points/connector` - Create custom connectors +* :doc:`/reference/configuration/deployment-config` - Complete deployment reference + +Related Topics +============== + +* :doc:`/user-guide/configuring-deployments` - Deployment fundamentals +* :doc:`/user-guide/binding-workflows` - Binding concepts +* :doc:`/developer-guide/core-interfaces/deployment` - Deployment internals +* :ref:`ConnectorWrapper ` - Wrapper interface details diff --git a/docs/source/user-guide/binding-workflows.rst b/docs/source/user-guide/binding-workflows.rst new file mode 100644 index 000000000..09a8c3189 --- /dev/null +++ 
b/docs/source/user-guide/binding-workflows.rst @@ -0,0 +1,605 @@ +================== +Binding Workflows +================== + +.. meta:: + :keywords: StreamFlow, binding, deployment, workflow, target, filter + :description: Learn how to bind workflow steps to execution environments in StreamFlow + +Overview +======== + +Bindings connect workflow steps to execution environments. This guide explains how to configure bindings in the StreamFlow configuration file to control where each step executes. + +Binding Concepts +================ + +======================== ======================================== +Concept Description +======================== ======================================== +**Step** A single computational task in a workflow +**Binding** Association between step and deployment +**Target** Deployment/service where step executes +**Filter** Strategy for selecting among multiple targets +======================== ======================================== + +StreamFlow Configuration File +============================== + +The ``streamflow.yml`` file is the entrypoint for StreamFlow execution. It connects workflows with deployments through bindings. + +Basic Structure +--------------- + +.. 
code-block:: yaml + :caption: streamflow.yml - Basic structure + + version: v1.0 + + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + bindings: + - step: /step-name + target: + deployment: deployment-name + + deployments: + deployment-name: + type: connector-type + config: + # Connector configuration + +**Required Fields:** + +* ``version`` - StreamFlow configuration version (currently ``v1.0``) +* ``workflows`` - Dictionary of workflow definitions +* ``deployments`` - Dictionary of deployment definitions + +Workflow Configuration +====================== + +Each workflow entry contains: + +============== ==================================== +Field Description +============== ==================================== +``type`` Workflow language (``cwl``) +``config`` Workflow-specific configuration +``bindings`` List of step-to-deployment mappings +============== ==================================== + +CWL Workflow Config +------------------- + +.. code-block:: yaml + :caption: CWL workflow configuration + + workflows: + my-cwl-workflow: + type: cwl + config: + file: workflow.cwl # Required: CWL workflow file + settings: inputs.yml # Optional: Input values file + +Step Identification +=================== + +Steps are identified using POSIX-like paths: + +================== ======================================== +Path Meaning +================== ======================================== +``/`` Entire workflow (root) +``/step-name`` Top-level step +``/sub/nested`` Nested sub-workflow step +================== ======================================== + +**Example Workflow:** + +.. code-block:: yaml + :caption: workflow.cwl - Multi-step workflow + + cwlVersion: v1.2 + class: Workflow + steps: + preprocess: + run: preprocess.cwl + in: { ... } + out: [...] + + analyze: + run: analyze.cwl + in: { ... } + out: [...] + + visualize: + run: visualize.cwl + in: { ... } + out: [...] 
+ +**Step Paths:** + +* ``/preprocess`` - The preprocess step +* ``/analyze`` - The analyze step +* ``/visualize`` - The visualize step +* ``/`` - The entire workflow + +Basic Bindings +============== + +Single Step Binding +------------------- + +Bind a specific step to a deployment: + +.. code-block:: yaml + :caption: Single step binding + + workflows: + example: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + bindings: + - step: /compile + target: + deployment: docker-java + + deployments: + docker-java: + type: docker + config: + image: openjdk:11 + +**Result:** The ``/compile`` step executes in the ``docker-java`` deployment. + +Whole Workflow Binding +---------------------- + +Bind all steps to one deployment: + +.. code-block:: yaml + :caption: Bind entire workflow + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - step: / + target: + deployment: my-cluster + + deployments: + my-cluster: + type: kubernetes + config: + kubeconfig: ~/.kube/config + +**Result:** All steps execute on the Kubernetes cluster. + +Multiple Step Bindings +---------------------- + +Bind different steps to different deployments: + +.. code-block:: yaml + :caption: Multiple bindings + + workflows: + pipeline: + type: cwl + config: + file: pipeline.cwl + bindings: + - step: /preprocess + target: + deployment: fast-cloud + + - step: /heavy_compute + target: + deployment: hpc-cluster + + - step: /visualize + target: + deployment: local + + deployments: + fast-cloud: + type: kubernetes + config: { ... } + + hpc-cluster: + type: slurm + config: { ... } + + local: + type: local + +**Result:** Hybrid execution across cloud, HPC, and local environments. + +Service-Level Bindings +====================== + +Target specific services within deployments: + +.. 
code-block:: yaml + :caption: Service-level binding + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /cpu_task + target: + deployment: k8s-cluster + service: cpu-workers + + - step: /gpu_task + target: + deployment: k8s-cluster + service: gpu-workers + + deployments: + k8s-cluster: + type: kubernetes + config: + kubeconfig: ~/.kube/config + services: + cpu-workers: + replicas: 10 + template: + spec: + containers: + - name: worker + image: python:3.10 + + gpu-workers: + replicas: 2 + template: + spec: + containers: + - name: gpu-worker + image: tensorflow/tensorflow:latest-gpu + resources: + limits: + nvidia.com/gpu: 1 + +**Result:** CPU tasks run on CPU workers, GPU tasks run on GPU workers. + +Multiple Targets +================ + +Bind a step to multiple targets for load balancing: + +.. code-block:: yaml + :caption: Multiple targets + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /process + target: + - deployment: cluster-1 + - deployment: cluster-2 + - deployment: cluster-3 + + deployments: + cluster-1: + type: slurm + config: { ... } + + cluster-2: + type: slurm + config: { ... } + + cluster-3: + type: slurm + config: { ... } + +**Result:** + +* Step instances can execute on any of the three clusters +* Useful for scatter operations that generate multiple tasks +* StreamFlow scheduler selects target based on availability + +Binding Filters +=============== + +Filters control target selection among multiple options. + +Shuffle Filter +-------------- + +Evaluate targets in random order: + +.. code-block:: yaml + :caption: Shuffle filter + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /process + target: + - deployment: cluster-1 + - deployment: cluster-2 + - deployment: cluster-3 + filters: + - shuffle + +**Use Case:** Distribute load randomly across clusters. 
+ +Custom Filters +-------------- + +StreamFlow supports custom binding filters through plugins. See :doc:`/developer-guide/extension-points/binding-filter` for creating custom filters. + +Port Bindings +============= + +Bind input/output ports to specific locations for data staging. + +Basic Port Binding +------------------ + +.. code-block:: yaml + :caption: Port target configuration + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - port: /compile/src + target: + deployment: hpc-storage + workdir: /scratch/user/data + + deployments: + hpc-storage: + type: ssh + config: + hostname: storage.hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + +**Result:** StreamFlow looks for the ``src`` input file on the remote HPC storage instead of locally. + +Port Path Syntax +---------------- + +Ports use POSIX-like paths: ``/step-name/port-name`` + +================== ======================================== +Path Meaning +================== ======================================== +``/tarball`` Workflow input port +``/compile/src`` Input port ``src`` of step ``compile`` +``/compile/class`` Output port ``class`` of step ``compile`` +================== ======================================== + +Use Cases for Port Bindings +---------------------------- + +1. **Data Already on Remote System:** + + .. code-block:: yaml + + bindings: + - port: /input_data + target: + deployment: hpc-cluster + workdir: /data/project + +2. **Avoid Large Data Transfers:** + + .. code-block:: yaml + + bindings: + - port: /large_dataset + target: + deployment: storage-server + workdir: /mnt/datasets + +3. **Stage Outputs to Specific Location:** + + .. code-block:: yaml + + bindings: + - port: /final_results + target: + deployment: archive-storage + workdir: /archive/project-123 + +Advanced Binding Patterns +========================== + +Stacked Locations +----------------- + +Wrap deployments for complex execution environments: + +.. 
code-block:: yaml + :caption: Singularity on Slurm via SSH + + deployments: + ssh-hpc: + type: ssh + config: + hostname: login.hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + + slurm-hpc: + type: slurm + config: + partition: standard + wraps: ssh-hpc + + singularity-container: + type: singularity + config: + image: docker://python:3.10 + wraps: slurm-hpc + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /analyze + target: + deployment: singularity-container + +**Result:** + +1. Connect via SSH to HPC login node +2. Submit Slurm job +3. Run Singularity container in the job +4. Execute workflow step inside container + +For details, see :doc:`advanced-patterns/stacked-locations`. + +Wrap Specific Service +--------------------- + +.. code-block:: yaml + :caption: Wrap specific service + + deployments: + microservices: + type: docker-compose + config: + file: docker-compose.yml + + compute-in-container: + type: slurm + config: + partition: batch + wraps: + deployment: microservices + service: compute-node + +Conditional Bindings +-------------------- + +Use different deployments based on conditions (requires custom logic in workflow): + +.. 
code-block:: yaml + :caption: Environment-specific bindings + + workflows: + adaptive-workflow: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /small_task + target: + deployment: local + + - step: /medium_task + target: + deployment: docker-cluster + + - step: /large_task + target: + deployment: hpc-cluster + +Binding Best Practices +====================== + +* **Start simple:** Begin with ``step: /`` binding to single deployment, then optimize for specific steps +* **Group similar steps:** Bind related steps to the same deployment to minimize data transfers +* **Consider data locality:** Execute steps close to where data resides using port bindings +* **Use services for resource differentiation:** Define separate services for CPU vs GPU tasks +* **Test locally first:** Validate workflow logic with local execution before remote deployment + +Complete Example +================ + +Here's a complete ``streamflow.yml`` example demonstrating hybrid execution: + +.. code-block:: yaml + :caption: Complete StreamFlow configuration + + version: v1.0 + + workflows: + data-pipeline: + type: cwl + config: + file: pipeline.cwl + settings: inputs.yml + bindings: + - step: /ingest + target: + deployment: cloud-workers + - step: /process + target: + deployment: hpc-slurm + service: compute-nodes + - step: /visualize + target: + deployment: local + - port: /raw_data + target: + deployment: hpc-storage + workdir: /scratch/project/data + + deployments: + cloud-workers: + type: kubernetes + config: + kubeconfig: ~/.kube/config + + hpc-storage: + type: ssh + config: + hostname: storage.hpc.edu + username: researcher + sshKey: ~/.ssh/hpc_key + + hpc-slurm: + type: slurm + config: + partition: standard + wraps: hpc-storage + services: + compute-nodes: + nodes: 4 + time: "08:00:00" + + local: + type: local + +Validation +========== + +Generate JSON Schema for IDE validation: + +.. 
code-block:: bash + + streamflow schema > streamflow-schema.json + +Configure your IDE to use the schema for auto-completion and validation of ``streamflow.yml`` files. + +For binding troubleshooting, see :doc:`troubleshooting`. + +Next: :doc:`running-workflows` to execute your configured workflows, or :doc:`advanced-patterns/index` for complex binding patterns. diff --git a/docs/source/user-guide/binding-workflows.rst.backup b/docs/source/user-guide/binding-workflows.rst.backup new file mode 100644 index 000000000..5848dd369 --- /dev/null +++ b/docs/source/user-guide/binding-workflows.rst.backup @@ -0,0 +1,762 @@ +================== +Binding Workflows +================== + +.. meta:: + :keywords: StreamFlow, binding, deployment, workflow, target, filter + :description: Learn how to bind workflow steps to execution environments in StreamFlow + +Overview +======== + +Bindings connect workflow steps to execution environments. This guide explains how to configure bindings in the StreamFlow configuration file to control where each step executes. + +Binding Concepts +================ + +======================== ======================================== +Concept Description +======================== ======================================== +**Step** A single computational task in a workflow +**Binding** Association between step and deployment +**Target** Deployment/service where step executes +**Filter** Strategy for selecting among multiple targets +======================== ======================================== + +StreamFlow Configuration File +============================== + +The ``streamflow.yml`` file is the entrypoint for StreamFlow execution. It connects workflows with deployments through bindings. + +Basic Structure +--------------- + +.. 
code-block:: yaml + :caption: streamflow.yml - Basic structure + + version: v1.0 + + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + bindings: + - step: /step-name + target: + deployment: deployment-name + + deployments: + deployment-name: + type: connector-type + config: + # Connector configuration + +**Required Fields:** + +* ``version`` - StreamFlow configuration version (currently ``v1.0``) +* ``workflows`` - Dictionary of workflow definitions +* ``deployments`` - Dictionary of deployment definitions + +Workflow Configuration +====================== + +Each workflow entry contains: + +============== ==================================== +Field Description +============== ==================================== +``type`` Workflow language (``cwl``) +``config`` Workflow-specific configuration +``bindings`` List of step-to-deployment mappings +============== ==================================== + +CWL Workflow Config +------------------- + +.. code-block:: yaml + :caption: CWL workflow configuration + + workflows: + my-cwl-workflow: + type: cwl + config: + file: workflow.cwl # Required: CWL workflow file + settings: inputs.yml # Optional: Input values file + +Step Identification +=================== + +Steps are identified using POSIX-like paths: + +================== ======================================== +Path Meaning +================== ======================================== +``/`` Entire workflow (root) +``/step-name`` Top-level step +``/sub/nested`` Nested sub-workflow step +================== ======================================== + +**Example Workflow:** + +.. code-block:: yaml + :caption: workflow.cwl - Multi-step workflow + + cwlVersion: v1.2 + class: Workflow + steps: + preprocess: + run: preprocess.cwl + in: { ... } + out: [...] + + analyze: + run: analyze.cwl + in: { ... } + out: [...] + + visualize: + run: visualize.cwl + in: { ... } + out: [...] 
+ +**Step Paths:** + +* ``/preprocess`` - The preprocess step +* ``/analyze`` - The analyze step +* ``/visualize`` - The visualize step +* ``/`` - The entire workflow + +Basic Bindings +============== + +Single Step Binding +------------------- + +Bind a specific step to a deployment: + +.. code-block:: yaml + :caption: Single step binding + + workflows: + example: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + bindings: + - step: /compile + target: + deployment: docker-java + + deployments: + docker-java: + type: docker + config: + image: openjdk:11 + +**Result:** The ``/compile`` step executes in the ``docker-java`` deployment. + +Whole Workflow Binding +---------------------- + +Bind all steps to one deployment: + +.. code-block:: yaml + :caption: Bind entire workflow + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - step: / + target: + deployment: my-cluster + + deployments: + my-cluster: + type: kubernetes + config: + kubeconfig: ~/.kube/config + +**Result:** All steps execute on the Kubernetes cluster. + +Multiple Step Bindings +---------------------- + +Bind different steps to different deployments: + +.. code-block:: yaml + :caption: Multiple bindings + + workflows: + pipeline: + type: cwl + config: + file: pipeline.cwl + bindings: + - step: /preprocess + target: + deployment: fast-cloud + + - step: /heavy_compute + target: + deployment: hpc-cluster + + - step: /visualize + target: + deployment: local + + deployments: + fast-cloud: + type: kubernetes + config: { ... } + + hpc-cluster: + type: slurm + config: { ... } + + local: + type: local + +**Result:** Hybrid execution across cloud, HPC, and local environments. + +Service-Level Bindings +====================== + +Target specific services within deployments: + +.. 
code-block:: yaml + :caption: Service-level binding + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /cpu_task + target: + deployment: k8s-cluster + service: cpu-workers + + - step: /gpu_task + target: + deployment: k8s-cluster + service: gpu-workers + + deployments: + k8s-cluster: + type: kubernetes + config: + kubeconfig: ~/.kube/config + services: + cpu-workers: + replicas: 10 + template: + spec: + containers: + - name: worker + image: python:3.10 + + gpu-workers: + replicas: 2 + template: + spec: + containers: + - name: gpu-worker + image: tensorflow/tensorflow:latest-gpu + resources: + limits: + nvidia.com/gpu: 1 + +**Result:** CPU tasks run on CPU workers, GPU tasks run on GPU workers. + +Multiple Targets +================ + +Bind a step to multiple targets for load balancing: + +.. code-block:: yaml + :caption: Multiple targets + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /process + target: + - deployment: cluster-1 + - deployment: cluster-2 + - deployment: cluster-3 + + deployments: + cluster-1: + type: slurm + config: { ... } + + cluster-2: + type: slurm + config: { ... } + + cluster-3: + type: slurm + config: { ... } + +**Result:** + +* Step instances can execute on any of the three clusters +* Useful for scatter operations that generate multiple tasks +* StreamFlow scheduler selects target based on availability + +Binding Filters +=============== + +Filters control target selection among multiple options. + +Shuffle Filter +-------------- + +Evaluate targets in random order: + +.. code-block:: yaml + :caption: Shuffle filter + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /process + target: + - deployment: cluster-1 + - deployment: cluster-2 + - deployment: cluster-3 + filters: + - shuffle + +**Use Case:** Distribute load randomly across clusters. 
+ +Custom Filters +-------------- + +StreamFlow supports custom binding filters through plugins. See :doc:`/developer-guide/extension-points/binding-filter` for creating custom filters. + +Port Bindings +============= + +Bind input/output ports to specific locations for data staging. + +Basic Port Binding +------------------ + +.. code-block:: yaml + :caption: Port target configuration + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - port: /compile/src + target: + deployment: hpc-storage + workdir: /scratch/user/data + + deployments: + hpc-storage: + type: ssh + config: + hostname: storage.hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + +**Result:** StreamFlow looks for the ``src`` input file on the remote HPC storage instead of locally. + +Port Path Syntax +---------------- + +Ports use POSIX-like paths: ``/step-name/port-name`` + +================== ======================================== +Path Meaning +================== ======================================== +``/tarball`` Workflow input port +``/compile/src`` Input port ``src`` of step ``compile`` +``/compile/class`` Output port ``class`` of step ``compile`` +================== ======================================== + +Use Cases for Port Bindings +---------------------------- + +1. **Data Already on Remote System:** + + .. code-block:: yaml + + bindings: + - port: /input_data + target: + deployment: hpc-cluster + workdir: /data/project + +2. **Avoid Large Data Transfers:** + + .. code-block:: yaml + + bindings: + - port: /large_dataset + target: + deployment: storage-server + workdir: /mnt/datasets + +3. **Stage Outputs to Specific Location:** + + .. code-block:: yaml + + bindings: + - port: /final_results + target: + deployment: archive-storage + workdir: /archive/project-123 + +Advanced Binding Patterns +========================== + +Stacked Locations +----------------- + +Wrap deployments for complex execution environments: + +.. 
code-block:: yaml + :caption: Singularity on Slurm via SSH + + deployments: + ssh-hpc: + type: ssh + config: + hostname: login.hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + + slurm-hpc: + type: slurm + config: + partition: standard + wraps: ssh-hpc + + singularity-container: + type: singularity + config: + image: docker://python:3.10 + wraps: slurm-hpc + + workflows: + example: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /analyze + target: + deployment: singularity-container + +**Result:** + +1. Connect via SSH to HPC login node +2. Submit Slurm job +3. Run Singularity container in the job +4. Execute workflow step inside container + +For details, see :doc:`advanced-patterns/stacked-locations`. + +Wrap Specific Service +--------------------- + +.. code-block:: yaml + :caption: Wrap specific service + + deployments: + microservices: + type: docker-compose + config: + file: docker-compose.yml + + compute-in-container: + type: slurm + config: + partition: batch + wraps: + deployment: microservices + service: compute-node + +Conditional Bindings +-------------------- + +Use different deployments based on conditions (requires custom logic in workflow): + +.. code-block:: yaml + :caption: Environment-specific bindings + + workflows: + adaptive-workflow: + type: cwl + config: + file: workflow.cwl + bindings: + - step: /small_task + target: + deployment: local + + - step: /medium_task + target: + deployment: docker-cluster + + - step: /large_task + target: + deployment: hpc-cluster + +Binding Best Practices +====================== + +1. **Start Simple** + + Begin with single deployment, then expand: + + .. code-block:: yaml + + # Start with this + bindings: + - step: / + target: + deployment: local + + # Then optimize + bindings: + - step: /preprocess + target: + deployment: cloud + - step: /compute + target: + deployment: hpc + +2. **Group Similar Steps** + + Bind related steps to the same deployment to minimize data transfers. + +3. 
**Consider Data Locality** + + Execute steps close to where data resides: + + .. code-block:: yaml + + bindings: + - step: /process_hpc_data + target: + deployment: hpc-cluster + - port: /input_data + target: + deployment: hpc-cluster + workdir: /data + +4. **Use Services for Resource Differentiation** + + Define separate services for different resource needs. + +5. **Test Locally First** + + Validate workflow logic with local execution before remote deployment. + +6. **Document Binding Rationale** + + Add comments explaining why steps are bound to specific targets: + + .. code-block:: yaml + + bindings: + # CPU-only preprocessing on cloud for scalability + - step: /preprocess + target: + deployment: kubernetes-cloud + + # GPU-intensive computation on HPC + - step: /train_model + target: + deployment: hpc-gpu-cluster + +JSON Schema Validation +====================== + +Generate JSON Schema for IDE validation and autocomplete: + +.. code-block:: bash + :caption: Generate StreamFlow schema + + streamflow schema > streamflow-schema.json + +Then configure your IDE to use the schema for ``streamflow.yml`` files. + +**Benefits:** + +* Auto-completion of configuration keys +* Real-time validation +* Inline documentation + +Complete Example +================ + +Here's a complete ``streamflow.yml`` example: + +.. 
code-block:: yaml + :caption: Complete StreamFlow configuration + + version: v1.0 + + workflows: + data-pipeline: + type: cwl + config: + file: pipeline.cwl + settings: inputs.yml + bindings: + # Data ingestion on cloud + - step: /ingest + target: + deployment: cloud-workers + + # Heavy computation on HPC + - step: /process + target: + deployment: hpc-slurm + service: compute-nodes + + # Visualization locally + - step: /visualize + target: + deployment: local + + # Data already on HPC storage + - port: /raw_data + target: + deployment: hpc-storage + workdir: /scratch/project/data + + deployments: + cloud-workers: + type: kubernetes + config: + kubeconfig: ~/.kube/config + namespace: data-pipeline + services: + workers: + replicas: 10 + template: + spec: + containers: + - name: worker + image: python:3.10 + + hpc-storage: + type: ssh + config: + hostname: storage.hpc.edu + username: researcher + sshKey: ~/.ssh/hpc_key + + hpc-slurm: + type: slurm + config: + partition: standard + wraps: hpc-storage + services: + compute-nodes: + nodes: 4 + ntasks-per-node: 32 + time: "08:00:00" + + local: + type: local + +Troubleshooting +=============== + +Invalid Step Path +----------------- + +**Problem:** ``Step '/step-name' not found in workflow`` + +**Solution:** + +* Verify step name matches CWL workflow definition +* Check for typos +* Use ``/`` for entire workflow +* For nested workflows, use full path: ``/subworkflow/step`` + +Deployment Not Found +-------------------- + +**Problem:** ``Deployment 'name' not found`` + +**Solution:** + +* Verify deployment name in ``deployments`` section matches binding +* Check for typos +* Ensure deployment is properly configured + +Service Not Found +----------------- + +**Problem:** ``Service 'name' not found in deployment`` + +**Solution:** + +* Verify service is defined in deployment's ``services`` section +* For simple deployments (docker, local), don't specify service +* Check connector documentation for service support + +Invalid 
Wraps Configuration +--------------------------- + +**Problem:** ``Connector type does not support wrapping`` + +**Solution:** + +* Only certain connectors support ``wraps`` (see :doc:`/reference/connectors/index`) +* Remove ``wraps`` directive or use compatible connector +* Check connector inherits from ``ConnectorWrapper`` + +Schema Validation Errors +------------------------- + +**Problem:** IDE shows validation errors + +**Solution:** + +* Generate schema: ``streamflow schema > schema.json`` +* Configure IDE to use the schema +* Update schema after installing new plugins + +Next Steps +========== + +After configuring bindings: + +* :doc:`running-workflows` - Execute your workflows +* :doc:`inspecting-results` - Analyze workflow execution +* :doc:`advanced-patterns/index` - Learn advanced binding patterns + +Related Topics +============== + +* :doc:`configuring-deployments` - Deployment configuration reference +* :doc:`/reference/configuration/binding-config` - Binding configuration schema +* :doc:`/developer-guide/extension-points/binding-filter` - Custom binding filters +* :doc:`advanced-patterns/multiple-targets` - Multiple target patterns +* :doc:`advanced-patterns/port-targets` - Port binding patterns +* :doc:`advanced-patterns/stacked-locations` - Stacked location patterns diff --git a/docs/source/user-guide/configuring-deployments.rst b/docs/source/user-guide/configuring-deployments.rst new file mode 100644 index 000000000..087666528 --- /dev/null +++ b/docs/source/user-guide/configuring-deployments.rst @@ -0,0 +1,462 @@ +======================= +Configuring Deployments +======================= + +.. meta:: + :keywords: StreamFlow, deployment, connector, docker, kubernetes, slurm, ssh, configuration + :description: Learn how to configure execution environments and deployments in StreamFlow + +Overview +======== + +Deployments define where and how workflow steps execute. 
StreamFlow supports diverse execution environments including containers, cloud platforms, and HPC systems through a unified connector interface. + +Deployment Concepts +=================== + +Understanding the deployment hierarchy: + +======================== ======================================== +Concept Description +======================== ======================================== +**Deployment** An entire infrastructure (unit of deployment) +**Service** A type of compute resource within a deployment +**Location** A single instance of a service (unit of scheduling) +**Connector** Implementation that manages deployment lifecycle +======================== ======================================== + +**Example:** + +* Deployment: ``my-k8s-cluster`` +* Service: ``gpu-workers`` (Kubernetes deployment with GPU nodes) +* Locations: Individual pods created by Kubernetes + +Deployment Configuration Structure +=================================== + +Deployments are defined in the StreamFlow configuration file (``streamflow.yml``): + +.. 
code-block:: yaml + :caption: streamflow.yml structure + + version: v1.0 + + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + + deployments: + deployment-name: + type: connector-type + config: + # Connector-specific configuration + services: + service-name: + # Service-specific configuration + +Available Connectors +==================== + +StreamFlow provides connectors for various environments: + +======================== ======================================== +Connector Type Use Case +======================== ======================================== +``local`` Local machine execution +``docker`` Docker containers +``docker-compose`` Docker Compose multi-container +``singularity`` Singularity/Apptainer containers +``kubernetes`` Kubernetes clusters +``helm`` Helm charts on Kubernetes +``ssh`` SSH to remote machines +``slurm`` Slurm HPC scheduler +``pbs`` PBS/Torque HPC scheduler +``flux`` Flux Framework scheduler +``occam`` OCCAM connector +======================== ======================================== + +For complete connector reference, see :doc:`/reference/index`. + +Local Connector +=============== + +Execute on the local machine without containers: + +.. code-block:: yaml + :caption: Local deployment + + deployments: + local: + type: local + +**Use Cases:** + +* Testing workflows locally +* Running lightweight tools +* Development and debugging + +**Notes:** + +* No isolation between tasks +* Shares filesystem with StreamFlow process +* Fastest option for small-scale testing + +Docker Connector +================ + +Execute in Docker containers: + +.. 
code-block:: yaml + :caption: Docker deployment with common options + + deployments: + docker-python: + type: docker + config: + image: python:3.10 + volumes: + - /host/data:/container/data:ro + environment: + DATABASE_URL: postgresql://localhost/mydb + gpus: all # GPU access (optional) + cpus: 4.0 # CPU limit (optional) + memory: 8g # Memory limit (optional) + +See :doc:`/reference/connectors/docker` for all configuration options. + +Docker Compose Connector +======================== + +Manage multi-container deployments: + +.. code-block:: yaml + :caption: Docker Compose deployment + + deployments: + app-stack: + type: docker-compose + config: + file: docker-compose.yml + services: + web: + # Target the 'web' service from docker-compose.yml + worker: + # Target the 'worker' service + +.. code-block:: yaml + :caption: docker-compose.yml + + version: '3.8' + services: + web: + image: nginx:latest + ports: + - "8080:80" + worker: + image: python:3.10 + volumes: + - ./app:/app + +**Use Cases:** + +* Multi-container applications +* Service dependencies (database + application) +* Complex network configurations + +Kubernetes Connector +==================== + +Deploy on Kubernetes clusters: + +.. code-block:: yaml + :caption: Kubernetes deployment + + deployments: + k8s-cluster: + type: kubernetes + config: + kubeconfig: ~/.kube/config + namespace: streamflow + services: + compute: + replicas: 5 + template: + spec: + containers: + - name: worker + image: python:3.10 + resources: + requests: + memory: "4Gi" + cpu: "2" + limits: + memory: "8Gi" + cpu: "4" + nvidia.com/gpu: 1 # For GPU support + +See :doc:`/reference/connectors/kubernetes` for node affinity, tolerations, and advanced configuration. + +Helm Connector +============== + +Deploy using Helm charts: + +.. 
code-block:: yaml + :caption: Helm deployment + + deployments: + spark-cluster: + type: helm + config: + chart: bitnami/spark + release: streamflow-spark + namespace: analytics + values: + worker: + replicaCount: 3 + resources: + limits: + cpu: 2 + memory: 4Gi + +**Use Cases:** + +* Deploying complex applications with Helm charts +* Managing application lifecycle +* Using community charts (Spark, Airflow, etc.) + +SSH Connector +============= + +Execute on remote machines via SSH: + +.. code-block:: yaml + :caption: SSH deployment + + deployments: + remote-server: + type: ssh + config: + hostname: compute.example.com + username: myuser + sshKey: ~/.ssh/id_rsa + sshKeyPassphrase: passphrase # Optional + port: 22 # Optional (default: 22) + maxConnections: 10 # Optional (default: 1) + nodes: # For multiple hosts + - hostname: node1.example.com + username: user + sshKey: ~/.ssh/id_rsa + - hostname: node2.example.com + username: user + sshKey: ~/.ssh/id_rsa + +See :doc:`/reference/connectors/ssh` for password authentication and additional options. + +Slurm Connector +=============== + +Submit jobs to Slurm HPC schedulers: + +.. code-block:: yaml + :caption: Slurm deployment + + deployments: + hpc-slurm: + type: slurm + config: + hostname: login.hpc.example.edu + username: researcher + sshKey: ~/.ssh/hpc_key + workdir: /scratch/researcher/streamflow + services: + compute: + partition: standard + nodes: 1 + ntasks: 16 + mem: 64G + time: 02:00:00 + gres: gpu:v100:2 # For GPU allocation (optional) + account: research-grant-123 # For account tracking (optional) + qos: high # For QoS (optional) + +See :doc:`/reference/connectors/slurm` for all Slurm directives and batch options. + +PBS Connector +============= + +Submit jobs to PBS/Torque schedulers: + +.. 
code-block:: yaml + :caption: PBS deployment + + deployments: + hpc-pbs: + type: pbs + config: + hostname: pbs-login.hpc.edu + username: researcher + sshKey: ~/.ssh/id_rsa + workdir: /home/researcher/jobs + services: + compute: + queue: batch + nodes: 2 + cpus: 32 + mem: 128gb + walltime: "04:00:00" + select: "2:ncpus=16:mem=64gb:ngpus=1" # Alternative resource spec + +See :doc:`/reference/connectors/pbs` for all PBS directives. + +Singularity Connector +===================== + +Execute in Singularity/Apptainer containers: + +.. code-block:: yaml + :caption: Singularity deployment + + deployments: + singularity-hpc: + type: singularity + config: + image: library://library/default/ubuntu:20.04 + # or image: docker://python:3.10 + # or image: /path/to/image.sif + +Singularity is commonly used on HPC systems. See :doc:`/reference/connectors/singularity` for integration with Slurm/PBS. + +Service Configuration +===================== + +Services define resource subsets within deployments: + +.. code-block:: yaml + :caption: Multiple services in one deployment + + deployments: + mixed-resources: + type: kubernetes + config: + kubeconfig: ~/.kube/config + services: + cpu-workers: + replicas: 10 + template: + spec: + containers: + - name: worker + image: python:3.10 + resources: + requests: + cpu: "2" + memory: "4Gi" + + gpu-workers: + replicas: 2 + template: + spec: + containers: + - name: gpu-worker + image: tensorflow/tensorflow:latest-gpu + resources: + limits: + nvidia.com/gpu: 1 + + memory-intensive: + replicas: 3 + template: + spec: + containers: + - name: bigmem + image: r-base:latest + resources: + requests: + memory: "64Gi" + +Then bind different workflow steps to different services based on resource needs. + +Multi-Deployment Workflows +=========================== + +Use multiple deployments in one workflow: + +.. 
code-block:: yaml
+   :caption: Hybrid cloud-HPC workflow
+
+   deployments:
+     cloud-preprocessing:
+       type: kubernetes
+       config:
+         kubeconfig: ~/.kube/config
+       services:
+         workers:
+           replicas: 20
+
+     hpc-computation:
+       type: slurm
+       config:
+         hostname: supercomputer.edu
+         username: researcher
+         sshKey: ~/.ssh/id_rsa
+       services:
+         compute-nodes:
+           partition: standard
+           nodes: 10
+
+     cloud-postprocessing:
+       type: docker
+       config:
+         image: python:3.10
+
+   bindings:
+     - step: /preprocess
+       target:
+         deployment: cloud-preprocessing
+     - step: /heavy_computation
+       target:
+         deployment: hpc-computation
+     - step: /visualize
+       target:
+         deployment: cloud-postprocessing
+
+This enables hybrid workflows across cloud and HPC, cost optimization, and data locality.
+
+Validation
+==========
+
+After configuring deployments, verify they work by running a simple test workflow:
+
+.. code-block:: yaml
+   :caption: test-deployment.yml
+
+   version: v1.0
+
+   workflows:
+     test:
+       type: cwl
+       config:
+         file: test.cwl
+
+   deployments:
+     docker-test:
+       type: docker
+       config:
+         image: python:3.10
+
+   bindings:
+     - step: /
+       target:
+         deployment: docker-test
+
+.. code-block:: bash
+
+   $ streamflow run test-deployment.yml --debug
+
+If the workflow completes successfully, your deployment is configured correctly. For troubleshooting deployment issues, see :doc:`troubleshooting`.
+
+Next: :doc:`binding-workflows` to bind workflow steps to deployments, or see :doc:`/reference/index` for complete connector documentation.
diff --git a/docs/source/user-guide/configuring-deployments.rst.backup b/docs/source/user-guide/configuring-deployments.rst.backup
new file mode 100644
index 000000000..33b761c63
--- /dev/null
+++ b/docs/source/user-guide/configuring-deployments.rst.backup
@@ -0,0 +1,850 @@
+=======================
+Configuring Deployments
+=======================
+
+..
meta:: + :keywords: StreamFlow, deployment, connector, docker, kubernetes, slurm, ssh, configuration + :description: Learn how to configure execution environments and deployments in StreamFlow + +Overview +======== + +Deployments define where and how workflow steps execute. StreamFlow supports diverse execution environments including containers, cloud platforms, and HPC systems through a unified connector interface. + +Deployment Concepts +=================== + +Understanding the deployment hierarchy: + +======================== ======================================== +Concept Description +======================== ======================================== +**Deployment** An entire infrastructure (unit of deployment) +**Service** A type of compute resource within a deployment +**Location** A single instance of a service (unit of scheduling) +**Connector** Implementation that manages deployment lifecycle +======================== ======================================== + +**Example:** + +* Deployment: ``my-k8s-cluster`` +* Service: ``gpu-workers`` (Kubernetes deployment with GPU nodes) +* Locations: Individual pods created by Kubernetes + +Deployment Configuration Structure +=================================== + +Deployments are defined in the StreamFlow configuration file (``streamflow.yml``): + +.. 
code-block:: yaml + :caption: streamflow.yml structure + + version: v1.0 + + workflows: + my-workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + + deployments: + deployment-name: + type: connector-type + config: + # Connector-specific configuration + services: + service-name: + # Service-specific configuration + +Available Connectors +==================== + +StreamFlow provides connectors for various environments: + +======================== ======================================== +Connector Type Use Case +======================== ======================================== +``local`` Local machine execution +``docker`` Docker containers +``docker-compose`` Docker Compose multi-container +``singularity`` Singularity/Apptainer containers +``kubernetes`` Kubernetes clusters +``helm`` Helm charts on Kubernetes +``ssh`` SSH to remote machines +``slurm`` Slurm HPC scheduler +``pbs`` PBS/Torque HPC scheduler +``flux`` Flux Framework scheduler +``occam`` OCCAM connector +======================== ======================================== + +For complete connector reference, see :doc:`/reference/connectors/index`. + +Local Connector +=============== + +Execute on the local machine without containers: + +.. code-block:: yaml + :caption: Local deployment + + deployments: + local: + type: local + +**Use Cases:** + +* Testing workflows locally +* Running lightweight tools +* Development and debugging + +**Notes:** + +* No isolation between tasks +* Shares filesystem with StreamFlow process +* Fastest option for small-scale testing + +Docker Connector +================ + +Execute in Docker containers: + +Basic Configuration +------------------- + +.. code-block:: yaml + :caption: Docker deployment + + deployments: + docker-python: + type: docker + config: + image: python:3.10 + +**With Volume Mounts:** + +.. 
code-block:: yaml + :caption: Docker with volume mounts + + deployments: + docker-data: + type: docker + config: + image: ubuntu:22.04 + volumes: + - /host/data:/container/data:ro + - /host/output:/container/output:rw + +**With Environment Variables:** + +.. code-block:: yaml + :caption: Docker with environment variables + + deployments: + docker-configured: + type: docker + config: + image: myapp:latest + environment: + DATABASE_URL: postgresql://localhost/mydb + DEBUG: "true" + +**Advanced Configuration:** + +.. code-block:: yaml + :caption: Docker with all options + + deployments: + docker-advanced: + type: docker + config: + image: nvidia/cuda:11.8.0-runtime-ubuntu22.04 + pull_policy: always # always, never, missing + network: host + privileged: false + gpus: all # GPU access + shm_size: 2gb # Shared memory + cpus: 4.0 # CPU limit + memory: 8g # Memory limit + user: "1000:1000" # User:group + workdir: /workspace + entrypoint: /bin/bash + +**Use Cases:** + +* Containerized applications +* Reproducible environments +* Dependency isolation + +**Prerequisites:** + +* Docker installed and running +* Appropriate images available or pullable +* User has Docker permissions + +Docker Compose Connector +======================== + +Manage multi-container deployments: + +.. code-block:: yaml + :caption: Docker Compose deployment + + deployments: + app-stack: + type: docker-compose + config: + file: docker-compose.yml + services: + web: + # Target the 'web' service from docker-compose.yml + worker: + # Target the 'worker' service + +.. 
code-block:: yaml + :caption: docker-compose.yml + + version: '3.8' + services: + web: + image: nginx:latest + ports: + - "8080:80" + worker: + image: python:3.10 + volumes: + - ./app:/app + +**Use Cases:** + +* Multi-container applications +* Service dependencies (database + application) +* Complex network configurations + +Kubernetes Connector +==================== + +Deploy on Kubernetes clusters: + +Basic Configuration +------------------- + +.. code-block:: yaml + :caption: Kubernetes deployment + + deployments: + k8s-cluster: + type: kubernetes + config: + kubeconfig: ~/.kube/config + namespace: streamflow + services: + compute: + replicas: 5 + template: + spec: + containers: + - name: worker + image: python:3.10 + resources: + requests: + memory: "4Gi" + cpu: "2" + limits: + memory: "8Gi" + cpu: "4" + +**With Node Affinity:** + +.. code-block:: yaml + :caption: Kubernetes with node affinity + + deployments: + k8s-gpu: + type: kubernetes + config: + kubeconfig: ~/.kube/config + namespace: gpu-workloads + services: + gpu-workers: + replicas: 2 + template: + spec: + containers: + - name: gpu-worker + image: tensorflow/tensorflow:latest-gpu + resources: + limits: + nvidia.com/gpu: 1 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu-type + operator: In + values: + - nvidia-v100 + +**Use Cases:** + +* Cloud-native applications +* Auto-scaling workloads +* Production deployments +* Multi-tenant environments + +Helm Connector +============== + +Deploy using Helm charts: + +.. code-block:: yaml + :caption: Helm deployment + + deployments: + spark-cluster: + type: helm + config: + chart: bitnami/spark + release: streamflow-spark + namespace: analytics + values: + worker: + replicaCount: 3 + resources: + limits: + cpu: 2 + memory: 4Gi + +**Use Cases:** + +* Deploying complex applications with Helm charts +* Managing application lifecycle +* Using community charts (Spark, Airflow, etc.) 
+ +SSH Connector +============= + +Execute on remote machines via SSH: + +Basic Configuration +------------------- + +.. code-block:: yaml + :caption: SSH deployment + + deployments: + remote-server: + type: ssh + config: + hostname: compute.example.com + username: myuser + sshKey: ~/.ssh/id_rsa + sshKeyPassphrase: passphrase # Optional + +**With Password Authentication:** + +.. code-block:: yaml + :caption: SSH with password + + deployments: + remote-password: + type: ssh + config: + hostname: 192.168.1.100 + username: user + password: secret # Not recommended for production + +**Multiple Hosts:** + +.. code-block:: yaml + :caption: SSH to multiple hosts + + deployments: + ssh-cluster: + type: ssh + config: + nodes: + - hostname: node1.example.com + username: user + sshKey: ~/.ssh/id_rsa + - hostname: node2.example.com + username: user + sshKey: ~/.ssh/id_rsa + - hostname: node3.example.com + username: user + sshKey: ~/.ssh/id_rsa + +**With Custom SSH Config:** + +.. code-block:: yaml + :caption: SSH with custom options + + deployments: + ssh-advanced: + type: ssh + config: + hostname: bastion.example.com + username: admin + sshKey: ~/.ssh/id_ed25519 + sshKeyPassphrase: passphrase + port: 2222 + connectionTimeout: 30 + maxConnections: 10 + +**Use Cases:** + +* Legacy systems without container support +* Direct remote execution +* Bridging to on-premise infrastructure + +Slurm Connector +=============== + +Submit jobs to Slurm HPC schedulers: + +Basic Configuration +------------------- + +.. code-block:: yaml + :caption: Slurm deployment + + deployments: + hpc-slurm: + type: slurm + config: + hostname: login.hpc.example.edu + username: researcher + sshKey: ~/.ssh/hpc_key + workdir: /scratch/researcher/streamflow + services: + compute: + partition: standard + nodes: 1 + ntasks: 16 + mem: 64G + time: 02:00:00 + +**With GPU:** + +.. 
code-block:: yaml + :caption: Slurm with GPU allocation + + deployments: + slurm-gpu: + type: slurm + config: + hostname: gpu-login.hpc.edu + username: user + sshKey: ~/.ssh/id_rsa + workdir: /gpfs/scratch/user/jobs + services: + gpu-jobs: + partition: gpu + nodes: 1 + ntasks: 8 + gres: gpu:v100:2 # 2 V100 GPUs + mem: 128G + time: 08:00:00 + +**With QoS and Account:** + +.. code-block:: yaml + :caption: Slurm with QoS + + deployments: + slurm-priority: + type: slurm + config: + hostname: hpc.example.com + username: user + sshKey: ~/.ssh/id_rsa + services: + high-priority: + partition: priority + account: research-grant-123 + qos: high + nodes: 4 + ntasks-per-node: 32 + time: 24:00:00 + +**Use Cases:** + +* HPC cluster job submission +* Batch processing on supercomputers +* Resource-intensive computations + +PBS Connector +============= + +Submit jobs to PBS/Torque schedulers: + +.. code-block:: yaml + :caption: PBS deployment + + deployments: + hpc-pbs: + type: pbs + config: + hostname: pbs-login.hpc.edu + username: researcher + sshKey: ~/.ssh/id_rsa + workdir: /home/researcher/jobs + services: + compute: + queue: batch + nodes: 2 + cpus: 32 + mem: 128gb + walltime: "04:00:00" + + +**With Resource Selection:** + +.. code-block:: yaml + :caption: PBS with resource selection + + deployments: + pbs-custom: + type: pbs + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + services: + custom-resources: + queue: longrun + select: "2:ncpus=16:mem=64gb:ngpus=1" + walltime: "48:00:00" + +**Use Cases:** + +* PBS-managed HPC systems +* Traditional batch processing +* Legacy HPC infrastructure + +Singularity Connector +===================== + +Execute in Singularity/Apptainer containers: + +.. 
code-block:: yaml + :caption: Singularity deployment + + deployments: + singularity-hpc: + type: singularity + config: + image: library://library/default/ubuntu:20.04 + # or + # image: docker://python:3.10 + # or + # image: /path/to/image.sif + +**On HPC with Slurm:** + +.. code-block:: yaml + :caption: Singularity on Slurm + + deployments: + slurm-singularity: + type: slurm + config: + hostname: hpc.example.edu + username: user + sshKey: ~/.ssh/id_rsa + container: + type: singularity + image: docker://tensorflow/tensorflow:latest + services: + gpu-container: + partition: gpu + gres: gpu:1 + time: 04:00:00 + +**Use Cases:** + +* HPC systems without Docker +* Reproducible environments on shared systems +* Security-constrained environments + +Service Configuration +===================== + +Services define resource subsets within deployments: + +.. code-block:: yaml + :caption: Multiple services in one deployment + + deployments: + mixed-resources: + type: kubernetes + config: + kubeconfig: ~/.kube/config + services: + cpu-workers: + replicas: 10 + template: + spec: + containers: + - name: worker + image: python:3.10 + resources: + requests: + cpu: "2" + memory: "4Gi" + + gpu-workers: + replicas: 2 + template: + spec: + containers: + - name: gpu-worker + image: tensorflow/tensorflow:latest-gpu + resources: + limits: + nvidia.com/gpu: 1 + + memory-intensive: + replicas: 3 + template: + spec: + containers: + - name: bigmem + image: r-base:latest + resources: + requests: + memory: "64Gi" + +Then bind different workflow steps to different services based on resource needs. + +Multi-Deployment Workflows +=========================== + +Use multiple deployments in one workflow: + +.. 
code-block:: yaml + :caption: Hybrid cloud-HPC workflow + + deployments: + cloud-preprocessing: + type: kubernetes + config: + kubeconfig: ~/.kube/config + services: + workers: + replicas: 20 + + hpc-computation: + type: slurm + config: + hostname: supercomputer.edu + username: researcher + sshKey: ~/.ssh/id_rsa + services: + compute-nodes: + partition: standard + nodes: 10 + ntasks-per-node: 128 + + cloud-postprocessing: + type: docker + config: + image: python:3.10 + + bindings: + - step: preprocess + target: + deployment: cloud-preprocessing + - step: heavy_computation + target: + deployment: hpc-computation + - step: visualize + target: + deployment: cloud-postprocessing + +**Use Cases:** + +* Hybrid workflows across cloud and HPC +* Cost optimization (cheap preprocessing, expensive computation) +* Data locality (process data where it resides) + +Configuration Best Practices +============================= + +1. **Use Environment Variables** + + Avoid hardcoding credentials: + + .. code-block:: yaml + + deployments: + secure: + type: ssh + config: + hostname: ${SSH_HOST} + username: ${SSH_USER} + sshKey: ${SSH_KEY_PATH} + +2. **Separate Configurations** + + Keep deployment configs in separate files: + + .. code-block:: yaml + + # streamflow.yml + deployments: !include deployments/production.yml + +3. **Test Incrementally** + + Start with local, then docker, then remote deployments. + +4. **Resource Limits** + + Always specify resource requirements to avoid oversubscription. + +5. **Timeout Settings** + + Set appropriate timeouts for long-running jobs: + + .. code-block:: yaml + + deployments: + long-running: + config: + connectionTimeout: 300 + jobTimeout: 86400 # 24 hours + +6. **Connection Pooling** + + Limit concurrent connections to avoid overwhelming remote systems: + + .. 
code-block:: yaml + + deployments: + ssh-limited: + type: ssh + config: + maxConnections: 5 + +Troubleshooting +=============== + +Connection Issues +----------------- + +**Problem:** ``Connection refused`` or ``Connection timeout`` + +**Solution:** + +* Verify hostname/IP is correct +* Check network connectivity (``ping``, ``telnet``) +* Verify SSH key permissions (``chmod 600 ~/.ssh/id_rsa``) +* Check firewall rules +* Verify service is running (Docker daemon, Kubernetes API, etc.) + +Authentication Issues +--------------------- + +**Problem:** ``Permission denied (publickey)`` + +**Solution:** + +* Verify SSH key is correct +* Add public key to ``~/.ssh/authorized_keys`` on remote host +* Check SSH key passphrase is correct +* Try password authentication temporarily for testing + +**Problem:** ``Unauthorized`` (Kubernetes) + +**Solution:** + +* Verify kubeconfig file path +* Check cluster credentials: ``kubectl cluster-info`` +* Ensure namespace exists: ``kubectl get namespaces`` +* Verify RBAC permissions + +Resource Issues +--------------- + +**Problem:** ``Insufficient resources`` or pods stuck ``Pending`` + +**Solution:** + +* Check cluster capacity: ``kubectl describe nodes`` +* Reduce resource requests +* Check resource quotas: ``kubectl get resourcequota`` +* Scale cluster if needed + +**Problem:** Slurm/PBS job stays in queue + +**Solution:** + +* Check queue status: ``squeue`` or ``qstat`` +* Verify partition/queue exists and is active +* Reduce resource requests (nodes, time, memory) +* Check account limits and QoS + +Image Issues +------------ + +**Problem:** ``Failed to pull image`` or ``ImagePullBackOff`` + +**Solution:** + +* Verify image name and tag are correct +* Check image exists: ``docker pull `` +* Verify registry credentials if private registry +* Check network connectivity to registry + +Validation +========== + +After configuring deployments, verify they work by running a simple test workflow. For example, test a Docker deployment: + +.. 
code-block:: yaml + :caption: test-deployment.yml - Test configuration + + version: v1.0 + + workflows: + test: + type: cwl + config: + file: test.cwl + + deployments: + docker-test: + type: docker + config: {} + + bindings: + - step: / + target: + deployment: docker-test + +.. code-block:: bash + :caption: Run test workflow + + $ streamflow run test-deployment.yml --debug + +If the workflow completes successfully, your deployment is configured correctly. + +Next Steps +========== + +After configuring deployments: + +* :doc:`binding-workflows` - Bind workflow steps to deployments +* :doc:`running-workflows` - Execute workflows +* :doc:`/reference/connectors/index` - Complete connector reference +* :doc:`/user-guide/advanced-patterns/index` - Advanced binding patterns + +Related Topics +============== + +* :doc:`/reference/configuration/deployment-config` - Deployment configuration schema +* :doc:`/reference/connectors/container/index` - Container connectors +* :doc:`/reference/connectors/cloud/index` - Cloud connectors +* :doc:`/reference/connectors/hpc/index` - HPC connectors +* :doc:`/developer-guide/extension-points/connector` - Creating custom connectors diff --git a/docs/source/user-guide/inspecting-results.rst b/docs/source/user-guide/inspecting-results.rst new file mode 100644 index 000000000..12264ffd9 --- /dev/null +++ b/docs/source/user-guide/inspecting-results.rst @@ -0,0 +1,710 @@ +================== +Inspecting Results +================== + +.. meta:: + :keywords: StreamFlow, results, provenance, report, metadata, inspection + :description: Learn how to inspect workflow execution results and generate reports in StreamFlow + +Overview +======== + +After workflow execution, StreamFlow provides tools to inspect results, analyze execution metadata, and generate reports. This guide covers the inspection capabilities of the StreamFlow CLI. 
+ +StreamFlow Metadata +=================== + +StreamFlow automatically collects execution metadata: + +======================== ======================================== +Metadata Type Information Collected +======================== ======================================== +**Execution** Start/end times, status, duration +**Workflow** Workflow name, type, configuration +**Steps** Step execution times, status, locations +**Data** Input/output files, data transfers +**Deployments** Deployed environments, resources +**Provenance** Complete execution trace +======================== ======================================== + +**Storage Location:** + +By default: ``${HOME}/.streamflow/workflow.db`` + +Or: ``/.streamflow/workflow.db`` if ``--outdir`` was specified + +Listing Workflows +================= + +List All Workflows +------------------ + +View all executed workflows: + +.. code-block:: bash + + $ streamflow list + + ==================== ====== =========== + NAME TYPE EXECUTIONS + ==================== ====== =========== + data-pipeline cwl 5 + analysis-workflow cwl 3 + test-workflow cwl 12 + ==================== ====== =========== + +**Columns:** + +* **NAME** - Workflow name (from ``--name`` or auto-generated) +* **TYPE** - Workflow type (``cwl``) +* **EXECUTIONS** - Number of times workflow was executed + +List Workflow Executions +------------------------- + +View execution history for a specific workflow: + +.. 
code-block:: bash + + $ streamflow list data-pipeline + + ================================== ================================== =========== + START_TIME END_TIME STATUS + ================================== ================================== =========== + 2024-02-20T09:15:23.456789+00:00 2024-02-20T09:45:12.345678+00:00 COMPLETED + 2024-02-21T14:30:45.678901+00:00 2024-02-21T15:10:23.456789+00:00 COMPLETED + 2024-02-22T08:20:15.234567+00:00 2024-02-22T08:22:30.123456+00:00 FAILED + 2024-02-23T10:45:30.345678+00:00 2024-02-23T11:20:15.234567+00:00 COMPLETED + 2024-02-24T13:10:45.456789+00:00 2024-02-24T13:55:30.345678+00:00 COMPLETED + ================================== ================================== =========== + +**Columns:** + +* **START_TIME** - When execution started (ISO 8601 format) +* **END_TIME** - When execution finished +* **STATUS** - Final status (COMPLETED, FAILED, CANCELLED) + +**Use Cases:** + +* Track workflow execution history +* Identify failed runs +* Compare execution times +* Debugging and auditing + +Execution Status +---------------- + +================ ======================================== +Status Meaning +================ ======================================== +``PENDING`` Workflow queued for execution +``RUNNING`` Currently executing +``COMPLETED`` Finished successfully +``FAILED`` Encountered fatal error +``CANCELLED`` Manually stopped +================ ======================================== + +Generating Reports +================== + +Interactive HTML Report +----------------------- + +Generate an interactive timeline report: + +.. code-block:: bash + :caption: Generate HTML report + + streamflow report data-pipeline + +**Output:** ``data-pipeline.html`` (interactive visualization) + +**Report Contents:** + +* Timeline of step executions +* Resource utilization +* Data transfer operations +* Execution statistics +* Interactive exploration + +**Open in Browser:** + +.. 
code-block:: bash + + # macOS + open data-pipeline.html + + # Linux + xdg-open data-pipeline.html + +Report Formats +-------------- + +StreamFlow supports multiple report formats: + +.. code-block:: bash + :caption: Generate different format reports + + # HTML (default, interactive) + streamflow report workflow-name --format html + + # JSON (machine-readable) + streamflow report workflow-name --format json + + # Text (console-friendly) + streamflow report workflow-name --format text + +**Available Formats:** + +================ ======================================== +Format Use Case +================ ======================================== +``html`` Interactive visualization (default) +``json`` Programmatic analysis +``text`` Console inspection +================ ======================================== + +Report Multiple Executions +-------------------------- + +By default, reports include only the most recent execution. To include all executions: + +.. code-block:: bash + :caption: Report for all executions + + streamflow report workflow-name --all + +**Use Cases:** + +* Track workflow evolution over time +* Compare different runs of the same workflow +* Analyze performance trends + +Multi-Workflow Reports +---------------------- + +Generate combined report from multiple workflows: + +.. code-block:: bash + :caption: Combined report + + streamflow report workflow1,workflow2,workflow3 + +**Output:** Single report comparing all three workflows + +**Use Cases:** + +* Compare different workflow versions +* Analyze performance across experiments +* Aggregate results for reporting + +Report Output Location +---------------------- + +Specify where to save the report: + +.. code-block:: bash + :caption: Save report to specific location + + streamflow report workflow-name --outdir ./reports + +**Default:** Current directory + +Custom Report Name +------------------ + +.. 
code-block:: bash + :caption: Custom report filename + + streamflow report workflow-name \ + --outdir ./reports \ + --name experiment-2024-02-24 + +**Output:** ``./reports/experiment-2024-02-24.html`` + +Report Analysis +=============== + +Understanding HTML Reports +-------------------------- + +The interactive HTML report includes: + +**1. Summary Section** + +* Workflow name and execution time +* Total duration +* Number of steps +* Success/failure status +* Resource summary + +**2. Timeline View** + +* Visual representation of step execution +* Parallel execution visualization +* Data transfer operations +* Idle time identification + +**3. Step Details** + +* Individual step execution times +* Input/output files +* Execution location +* Resource usage +* Error messages (if failed) + +**4. Resource Utilization** + +* CPU usage over time +* Memory consumption +* Network I/O +* Disk I/O + +**5. Performance Metrics** + +* Critical path analysis +* Parallelization efficiency +* Resource utilization percentage +* Bottleneck identification + +JSON Report Structure +--------------------- + +.. code-block:: json + :caption: JSON report structure (example) + + { + "workflow": { + "name": "data-pipeline", + "type": "cwl", + "status": "COMPLETED", + "start_time": "2024-02-24T13:10:45.456789+00:00", + "end_time": "2024-02-24T13:55:30.345678+00:00", + "duration": 2684.888889 + }, + "steps": [ + { + "name": "/preprocess", + "status": "COMPLETED", + "start_time": "2024-02-24T13:11:00.123456+00:00", + "end_time": "2024-02-24T13:15:30.234567+00:00", + "duration": 270.111111, + "location": "kubernetes-cluster/worker-pod-1" + } + ], + "statistics": { + "total_steps": 5, + "completed_steps": 5, + "failed_steps": 0, + "total_duration": 2684.888889 + } + } + +Provenance Archives +=================== + +StreamFlow supports the `Workflow Run RO-Crate `_ provenance format for capturing complete workflow execution provenance. 
+ +Generate Provenance Archive +---------------------------- + +.. code-block:: bash + :caption: Generate provenance archive + + streamflow prov workflow-name + +**Output:** ``workflow-name.crate.zip`` + +**Archive Contents:** + +* Workflow definition files +* Input/output data +* Execution metadata +* Provenance graph +* RO-Crate metadata (JSON-LD) + +Archive All Executions +---------------------- + +Include entire execution history: + +.. code-block:: bash + :caption: Archive all executions + + streamflow prov workflow-name --all + +Custom Archive Name +------------------- + +.. code-block:: bash + :caption: Custom archive name + + streamflow prov workflow-name --name experiment-2024 + +**Output:** ``experiment-2024.crate.zip`` + +Specify Output Directory +------------------------- + +.. code-block:: bash + :caption: Save archive to specific location + + streamflow prov workflow-name --outdir /path/to/archives + +**Default:** Current directory + +Provenance Use Cases +-------------------- + +1. **Reproducibility** + + * Share complete workflow execution + * Enable exact reproduction + * Document computational experiments + +2. **Publication** + + * Supplement research papers + * Meet FAIR data principles + * Provide computational evidence + +3. **Compliance** + + * Audit trail for regulated workflows + * Record-keeping requirements + * Quality assurance + +4. **Debugging** + + * Comprehensive execution trace + * Input/output inspection + * Performance analysis + +Workflow Outputs +================ + +Accessing Results +----------------- + +Workflow outputs are stored in the specified output directory: + +.. 
code-block:: bash + :caption: Check output location + + # Default output directory (current) + ls -la ./ + + # Custom output directory + ls -la /path/to/outdir + +**Typical Output Structure:** + +:: + + outdir/ + ├── result_file_1.txt # Workflow outputs + ├── result_file_2.csv + ├── results/ # Subdirectory outputs + │ ├── analysis_1.png + │ └── analysis_2.png + └── .streamflow/ # StreamFlow metadata + └── workflow.db + +Output Files +------------ + +CWL workflows declare outputs explicitly: + +.. code-block:: yaml + :caption: CWL outputs + + outputs: + results: + type: File + outputSource: analyze/result_file + + plots: + type: Directory + outputSource: visualize/plot_directory + +StreamFlow ensures these outputs are transferred to the output directory. + +Intermediate Files +------------------ + +By default, StreamFlow cleans up intermediate files. To preserve them: + +.. code-block:: bash + :caption: Keep intermediate files (planned feature) + + streamflow run workflow.yml --keep-intermediates + +Database Inspection +=================== + +Direct Database Access +---------------------- + +The StreamFlow database is SQLite format: + +.. code-block:: bash + :caption: Inspect database directly + + sqlite3 ~/.streamflow/workflow.db + +.. code-block:: sql + :caption: Example queries + + -- List all workflows + SELECT name, type FROM workflows; + + -- Get execution times + SELECT name, start_time, end_time, status + FROM executions + WHERE workflow_name = 'my-workflow'; + + -- Step execution details + SELECT step_name, duration, status + FROM steps + WHERE execution_id = 'exec-123'; + +**Database Schema:** + +* ``workflows`` - Workflow definitions +* ``executions`` - Execution instances +* ``steps`` - Individual step executions +* ``transfers`` - Data transfer operations +* ``deployments`` - Deployed environments + +Programmatic Access +------------------- + +For custom analysis, use Python with SQLite: + +.. 
code-block:: python + :caption: Analyze execution data + + import os + import sqlite3 + + import pandas as pd + + # Connect to database + conn = sqlite3.connect(os.path.expanduser('~/.streamflow/workflow.db')) + + # Load execution data + df = pd.read_sql_query(""" + SELECT + e.workflow_name, + s.step_name, + s.duration, + s.location + FROM executions e + JOIN steps s ON e.id = s.execution_id + WHERE e.status = 'COMPLETED' + """, conn) + + # Analyze + print(df.groupby('step_name')['duration'].describe()) + + conn.close() + +Performance Analysis +==================== + +Execution Time Analysis +----------------------- + +Compare execution times across runs: + +.. code-block:: bash + :caption: Get execution times + + streamflow list my-workflow | awk '{print $2, $4}' + +Identify slow steps: + +1. Generate JSON report +2. Parse step durations +3. Identify bottlenecks + +.. code-block:: bash + :caption: Extract step durations + + streamflow report my-workflow --format json | \ + jq '.steps[] | {name: .name, duration: .duration}' | \ + jq -s 'sort_by(.duration) | reverse' + +Resource Utilization +-------------------- + +HTML reports show: + +* CPU utilization over time +* Memory consumption patterns +* I/O bandwidth usage +* Resource allocation efficiency + +Parallelization Efficiency +-------------------------- + +Analyze how well workflow parallelizes: + +**Ideal Parallelization:** Steps execute simultaneously on all available resources + +**Poor Parallelization:** Sequential execution with idle resources + +HTML reports visualize this through timeline view. + +Troubleshooting with Reports +============================= + +Failed Executions +----------------- + +Inspect failed runs: + +.. code-block:: bash + :caption: List failed executions + + streamflow list workflow-name | grep FAILED + +Generate report for the most recent execution (which may include failed runs): + +.. 
code-block:: bash + :caption: Report on recent run + + streamflow report workflow-name + +**Report Shows:** + +* Which step failed +* Error messages +* Input files at failure point +* Execution location +* Resource state + +Performance Issues +------------------ + +If workflows run slowly: + +1. **Generate HTML report** +2. **Check timeline** for: + + * Long-running steps + * Sequential execution (should be parallel) + * Excessive idle time + * Large data transfers + +3. **Optimize** based on findings: + + * Add parallelism (scatter) + * Improve data locality + * Adjust resource allocation + * Use faster deployments + +Best Practices +============== + +1. **Regular Archiving** + + Archive important workflow executions: + + .. code-block:: bash + + # After successful production run + streamflow prov production-workflow \ + --name production-2024-02-24 \ + --outdir /archive/provenance + +2. **Comparative Analysis** + + Compare workflow versions: + + .. code-block:: bash + + streamflow report workflow-v1,workflow-v2,workflow-v3 \ + --format html \ + --outdir ./comparisons + +3. **Automated Reporting** + + Generate reports automatically: + + .. code-block:: bash + + #!/bin/bash + # post-workflow-script.sh + WORKFLOW_NAME=$1 + streamflow report $WORKFLOW_NAME \ + --outdir ./reports/$(date +%Y-%m-%d) + +4. **Database Backups** + + Backup metadata regularly: + + .. code-block:: bash + + # Backup StreamFlow database + cp ~/.streamflow/workflow.db \ + ~/backups/streamflow-$(date +%Y%m%d).db + +5. **Clean Old Data** + + Periodically remove old executions: + + .. code-block:: bash + + # Remove database (careful!) + rm ~/.streamflow/workflow.db + + # Or use database tools to prune old records + +Exporting Results +================= + +Share with Collaborators +------------------------ + +.. 
code-block:: bash + :caption: Package results for sharing + + # Create archive + tar -czf results-package.tar.gz \ + results/ \ + reports/workflow-report.html \ + workflow-name.crate.zip + +Integration with Analysis Tools +-------------------------------- + +Export data for external tools: + +.. code-block:: bash + :caption: Export for analysis + + # JSON for programmatic access + streamflow report workflow-name --format json > results.json + + # Load in Python/R/etc. for analysis + python analyze-results.py results.json + +Next Steps +========== + +After inspecting results: + +* :doc:`troubleshooting` - Resolve issues found in reports +* :doc:`/reference/cli/report` - Complete report options +* :doc:`/reference/cli/prov` - Provenance command reference +* :doc:`advanced-patterns/index` - Optimize workflow performance + +Related Topics +============== + +* :doc:`running-workflows` - Workflow execution +* :doc:`/reference/cli/list` - List command reference +* :doc:`/developer-guide/core-interfaces/persistence` - Database internals +* `RO-Crate Specification `_ - Provenance format diff --git a/docs/source/user-guide/installation.rst b/docs/source/user-guide/installation.rst new file mode 100644 index 000000000..606509992 --- /dev/null +++ b/docs/source/user-guide/installation.rst @@ -0,0 +1,299 @@ +============ +Installation +============ + +.. meta:: + :keywords: StreamFlow, installation, pip, docker, kubernetes, helm, setup + :description: Complete guide for installing StreamFlow via pip, Docker, or Kubernetes with Helm + +Overview +======== + +StreamFlow can be installed in multiple ways depending on your environment and use case. This guide covers all installation methods with verification steps and troubleshooting guidance. + +Installation Methods +==================== + +.. note:: + + **Supported Platforms:** StreamFlow supports Linux and macOS only. Windows is not supported. 
+ +======================== =============================== ============================= +Method Best For Prerequisites +======================== =============================== ============================= +:ref:`install-pip` Most users, development Python 3.10-3.14 +:ref:`install-docker` Containerized environments Docker +:ref:`install-k8s` Production deployments Kubernetes cluster, Helm +======================== =============================== ============================= + +.. _install-pip: + +Method 1: Install with pip +=========================== + +The StreamFlow module is available on `PyPI `_ and can be installed using pip. This is the recommended method for most users. + +Prerequisites +------------- + +* Python 3.10, 3.11, 3.12, 3.13, or 3.14 +* pip (Python package installer) + +Installation Command +-------------------- + +.. code-block:: bash + :caption: Install StreamFlow from PyPI + + pip install streamflow + +For the latest development version from GitHub: + +.. code-block:: bash + :caption: Install from GitHub main branch + + pip install git+https://github.com/alpha-unito/streamflow.git + +Verify Installation +------------------- + +Check that StreamFlow is installed correctly: + +.. code-block:: bash + + $ streamflow version + + StreamFlow version 0.2.0.dev14 + +Check available commands: + +.. code-block:: bash + + $ streamflow --help + + usage: streamflow [-h] {ext,list,plugin,prov,report,run,schema,version} ... 
+ + StreamFlow Command Line + + positional arguments: + {ext,list,plugin,prov,report,run,schema,version} + ext Retrieve information on the available StreamFlow extensions + list List the executed workflows + plugin Retrieve information on the installed StreamFlow plugins + prov Generate a provenance archive for an executed workflow + report Generate a report for an executed workflow + run Execute a workflow + schema Dump StreamFlow JSON Schema and exit + version Only print StreamFlow version and exit + +Run Your First Workflow +----------------------- + +After installation, you can execute workflows using the StreamFlow CLI: + +.. code-block:: bash + :caption: Execute a workflow + + streamflow run /path/to/streamflow.yml + +For a complete first workflow example, see the :doc:`quickstart` guide. + +Upgrading +--------- + +To upgrade to the latest version: + +.. code-block:: bash + :caption: Upgrade StreamFlow + + pip install --upgrade streamflow + +Uninstalling +------------ + +To remove StreamFlow: + +.. code-block:: bash + :caption: Uninstall StreamFlow + + pip uninstall streamflow + +.. _install-docker: + +Method 2: Run with Docker +========================== + +StreamFlow Docker images are available on `Docker Hub `_. This method is ideal for containerized environments or testing StreamFlow without installing Python dependencies. + +Prerequisites +------------- + +* Docker Engine (version 20.10 or later recommended) +* Docker daemon running + +Pull the Image +-------------- + +Download the latest StreamFlow Docker image: + +.. code-block:: bash + :caption: Pull latest StreamFlow image + + docker pull --platform linux/amd64 alphaunito/streamflow:latest + +For a specific development version: + +.. code-block:: bash + :caption: Pull specific development version + + docker pull --platform linux/amd64 alphaunito/streamflow:0.2.0.dev14 + +.. note:: + + **Platform Limitation:** The Docker images are built for ``linux/amd64`` platform only. 
 + + * **Apple Silicon / ARM64:** Use ``docker pull --platform linux/amd64 alphaunito/streamflow:latest`` or install from source with pip (see :ref:`install-pip`) + * **Available tags:** See `Docker Hub tags `_ for all versions (the most recent stable release is tagged ``latest``, development versions use ``0.2.0.devN``) + +Verify Image +------------ + +List downloaded images: + +.. code-block:: bash + + $ docker images | grep streamflow + + alphaunito/streamflow latest abc123def456 2 days ago 500MB + +Run StreamFlow in Container +---------------------------- + +Execute StreamFlow commands in a container: + +.. code-block:: bash + :caption: Run StreamFlow in Docker container + + docker run --rm --platform linux/amd64 alphaunito/streamflow:latest streamflow version + +To run a workflow, mount your workflow directory: + +.. code-block:: bash + :caption: Execute workflow with Docker + + docker run --rm --platform linux/amd64 \ + --mount type=bind,source="$(pwd)"/my-project,target=/streamflow/project \ + --mount type=bind,source="$(pwd)"/results,target=/streamflow/results \ + alphaunito/streamflow:latest \ + streamflow run /streamflow/project/streamflow.yml + +Docker Compose Example +---------------------- + +For persistent setups, use Docker Compose: + +.. code-block:: yaml + :caption: docker-compose.yml + + version: '3.8' + services: + streamflow: + image: alphaunito/streamflow:latest + volumes: + - ./my-project:/streamflow/project + - ./results:/streamflow/results + command: streamflow run /streamflow/project/streamflow.yml + +Run with Docker Compose: + +.. code-block:: bash + :caption: Start with Docker Compose + + docker-compose up + +.. _install-k8s: + +Method 3: Deploy on Kubernetes with Helm +========================================= + +For production deployments, StreamFlow can be deployed on Kubernetes using Helm charts. 
+ +Prerequisites +------------- + +* Kubernetes cluster (1.19 or later) +* kubectl configured to access your cluster +* Helm 3.x installed + +Verify Prerequisites +-------------------- + +.. code-block:: bash + :caption: Check Kubernetes connection + + kubectl cluster-info + +.. code-block:: bash + :caption: Check Helm installation + + helm version + +Using the Helm Chart +--------------------- + +StreamFlow provides a Helm chart template in the ``helm/chart`` directory of the repository. You can use this template to deploy StreamFlow as a Kubernetes Job. + +Clone the repository to access the Helm chart: + +.. code-block:: bash + :caption: Clone StreamFlow repository + + git clone https://github.com/alpha-unito/streamflow.git + cd streamflow/helm/chart + +Install StreamFlow using the local chart: + +.. code-block:: bash + :caption: Install StreamFlow with Helm + + helm install my-streamflow . + +With custom values: + +.. code-block:: bash + :caption: Install with custom configuration + + helm install my-streamflow . -f custom-values.yaml + +Verify Deployment +----------------- + +Check that StreamFlow pods are running: + +.. code-block:: bash + + $ kubectl get pods -l app=streamflow + + NAME READY STATUS RESTARTS AGE + my-streamflow-xxxxx 1/1 Running 0 2m + +Uninstall +--------- + +To remove the StreamFlow deployment: + +.. code-block:: bash + :caption: Uninstall StreamFlow from Kubernetes + + helm uninstall my-streamflow + +Troubleshooting +=============== + +For installation issues, see :doc:`troubleshooting`. + +Next Steps +========== + +* :doc:`quickstart` - Get started in 10 minutes +* :doc:`troubleshooting` - Installation help diff --git a/docs/source/user-guide/installation.rst.backup b/docs/source/user-guide/installation.rst.backup new file mode 100644 index 000000000..a38f95a5d --- /dev/null +++ b/docs/source/user-guide/installation.rst.backup @@ -0,0 +1,506 @@ +============ +Installation +============ + +.. 
meta:: + :keywords: StreamFlow, installation, pip, docker, kubernetes, helm, setup + :description: Complete guide for installing StreamFlow via pip, Docker, or Kubernetes with Helm + +Overview +======== + +StreamFlow can be installed in multiple ways depending on your environment and use case. This guide covers all installation methods with verification steps and troubleshooting guidance. + +Installation Methods +==================== + +.. note:: + + **Supported Platforms:** StreamFlow supports Linux and macOS only. Windows is not supported. + +======================== =============================== ============================= +Method Best For Prerequisites +======================== =============================== ============================= +:ref:`install-pip` Most users, development Python 3.10-3.14 +:ref:`install-docker` Containerized environments Docker +:ref:`install-k8s` Production deployments Kubernetes cluster, Helm +======================== =============================== ============================= + +.. _install-pip: + +Method 1: Install with pip +=========================== + +The StreamFlow module is available on `PyPI `_ and can be installed using pip. This is the recommended method for most users. + +Prerequisites +------------- + +* Python 3.10, 3.11, 3.12, 3.13, or 3.14 +* pip (Python package installer) + +Installation Command +-------------------- + +.. code-block:: bash + :caption: Install StreamFlow from PyPI + + pip install streamflow + +For the latest development version from GitHub: + +.. code-block:: bash + :caption: Install from GitHub main branch + + pip install git+https://github.com/alpha-unito/streamflow.git + +Verify Installation +------------------- + +Check that StreamFlow is installed correctly: + +.. code-block:: bash + + $ streamflow version + + StreamFlow version 0.2.0.dev14 + +Check available commands: + +.. 
code-block:: bash + + $ streamflow --help + + usage: streamflow [-h] {ext,list,plugin,prov,report,run,schema,version} ... + + StreamFlow Command Line + + positional arguments: + {ext,list,plugin,prov,report,run,schema,version} + ext Retrieve information on the available StreamFlow extensions + list List the executed workflows + plugin Retrieve information on the installed StreamFlow plugins + prov Generate a provenance archive for an executed workflow + report Generate a report for an executed workflow + run Execute a workflow + schema Dump StreamFlow JSON Schema and exit + version Only print StreamFlow version and exit + +Run Your First Workflow +----------------------- + +After installation, you can execute workflows using the StreamFlow CLI: + +.. code-block:: bash + :caption: Execute a workflow + + streamflow run /path/to/streamflow.yml + +For a complete first workflow example, see the :doc:`quickstart` guide. + +Upgrading +--------- + +To upgrade to the latest version: + +.. code-block:: bash + :caption: Upgrade StreamFlow + + pip install --upgrade streamflow + +Uninstalling +------------ + +To remove StreamFlow: + +.. code-block:: bash + :caption: Uninstall StreamFlow + + pip uninstall streamflow + +.. _install-docker: + +Method 2: Run with Docker +========================== + +StreamFlow Docker images are available on `Docker Hub `_. This method is ideal for containerized environments or testing StreamFlow without installing Python dependencies. + +Prerequisites +------------- + +* Docker Engine (version 20.10 or later recommended) +* Docker daemon running + +Pull the Image +-------------- + +Download the latest StreamFlow Docker image: + +.. code-block:: bash + :caption: Pull latest StreamFlow image + + docker pull --platform linux/amd64 alphaunito/streamflow:latest + +For a specific development version: + +.. code-block:: bash + :caption: Pull specific development version + + docker pull --platform linux/amd64 alphaunito/streamflow:0.2.0.dev14 + +.. 
note:: + + **Platform Limitation:** The Docker images are built for ``linux/amd64`` platform only. + + * **Apple Silicon / ARM64:** Use ``docker pull --platform linux/amd64 alphaunito/streamflow:latest`` or install from source with :ref:`install-source` + * **Available tags:** See `Docker Hub tags `_ for all versions (stable releases use version tags like ``latest``, development versions use ``0.2.0.devN``) + +Verify Image +------------ + +List downloaded images: + +.. code-block:: bash + + $ docker images | grep streamflow + + alphaunito/streamflow latest abc123def456 2 days ago 500MB + +Run StreamFlow in Container +---------------------------- + +Execute StreamFlow commands in a container: + +.. code-block:: bash + :caption: Run StreamFlow in Docker container + + docker run --rm --platform linux/amd64 alphaunito/streamflow:latest streamflow version + +To run a workflow, mount your workflow directory: + +.. code-block:: bash + :caption: Execute workflow with Docker + + docker run --rm --platform linux/amd64 \ + --mount type=bind,source="$(pwd)"/my-project,target=/streamflow/project \ + --mount type=bind,source="$(pwd)"/results,target=/streamflow/results \ + alphaunito/streamflow:latest \ + streamflow run /streamflow/project/streamflow.yml + +Docker Compose Example +---------------------- + +For persistent setups, use Docker Compose: + +.. code-block:: yaml + :caption: docker-compose.yml + + version: '3.8' + services: + streamflow: + image: alphaunito/streamflow:latest + volumes: + - ./my-project:/streamflow/project + - ./results:/streamflow/results + command: streamflow run /streamflow/project/streamflow.yml + +Run with Docker Compose: + +.. code-block:: bash + :caption: Start with Docker Compose + + docker-compose up + +.. _install-k8s: + +Method 3: Deploy on Kubernetes with Helm +========================================= + +For production deployments, StreamFlow can be deployed on Kubernetes using Helm charts. 
+ +Prerequisites +------------- + +* Kubernetes cluster (1.19 or later) +* kubectl configured to access your cluster +* Helm 3.x installed + +Verify Prerequisites +-------------------- + +.. code-block:: bash + :caption: Check Kubernetes connection + + kubectl cluster-info + +.. code-block:: bash + :caption: Check Helm installation + + helm version + +Using the Helm Chart +--------------------- + +StreamFlow provides a Helm chart template in the ``helm/chart`` directory of the repository. You can use this template to deploy StreamFlow as a Kubernetes Job. + +Clone the repository to access the Helm chart: + +.. code-block:: bash + :caption: Clone StreamFlow repository + + git clone https://github.com/alpha-unito/streamflow.git + cd streamflow/helm/chart + +Install StreamFlow using the local chart: + +.. code-block:: bash + :caption: Install StreamFlow with Helm + + helm install my-streamflow . + +With custom values: + +.. code-block:: bash + :caption: Install with custom configuration + + helm install my-streamflow . -f custom-values.yaml + +Verify Deployment +----------------- + +Check that StreamFlow pods are running: + +.. code-block:: bash + + $ kubectl get pods -l app=streamflow + + NAME READY STATUS RESTARTS AGE + my-streamflow-xxxxx 1/1 Running 0 2m + +Uninstall +--------- + +To remove the StreamFlow deployment: + +.. code-block:: bash + :caption: Uninstall StreamFlow from Kubernetes + + helm uninstall my-streamflow + +Optional Dependencies +===================== + +StreamFlow supports optional features that require additional dependencies. 
+ +Container Runtimes +------------------ + +For container-based workflows, install the appropriate runtime: + +**Docker** + Already covered in :ref:`install-docker` + +**Singularity/Apptainer** + Follow official installation at https://apptainer.org/docs/user/latest/quick_start.html + +**Podman** + Follow official installation at https://podman.io/getting-started/installation + +SSH and Remote Execution +------------------------- + +For SSH-based connectors, ensure OpenSSH client is installed: + +.. code-block:: bash + :caption: Check SSH installation (Linux/macOS) + + ssh -V + +.. code-block:: bash + :caption: Install OpenSSH (Ubuntu/Debian) + + sudo apt-get install openssh-client + +.. code-block:: bash + :caption: Install OpenSSH (macOS with Homebrew) + + brew install openssh + +HPC Schedulers +-------------- + +StreamFlow supports various HPC schedulers. No client installation is typically required as StreamFlow communicates via SSH, but ensure your HPC system has: + +* Slurm, PBS, or other supported scheduler +* SSH access configured +* Appropriate user permissions + +Troubleshooting +=============== + +Python Version Issues +--------------------- + +**Problem:** ``ERROR: Package requires a different Python version`` + +**Solution:** Verify Python version: + +.. code-block:: bash + + python --version + # or + python3 --version + +StreamFlow requires Python 3.10 or later. If your default Python is older: + +.. code-block:: bash + + # Use Python 3.10+ explicitly + pip3.10 install streamflow + # or + python3.10 -m pip install streamflow + +**Problem:** ``Command 'streamflow' not found`` + +**Solution:** The pip installation directory may not be in your PATH: + +.. 
code-block:: bash + + # Find where streamflow was installed + pip show streamflow + + # Add to PATH (add to ~/.bashrc or ~/.zshrc for persistence) + export PATH="$HOME/.local/bin:$PATH" + + # Or use full path + ~/.local/bin/streamflow version + +Docker Issues +------------- + +**Problem:** ``docker: Cannot connect to the Docker daemon`` + +**Solution:** Ensure Docker daemon is running: + +.. code-block:: bash + + # Check Docker status + docker info + + # Start Docker daemon (Linux systemd) + sudo systemctl start docker + + # macOS: Start Docker Desktop application + +**Problem:** Permission denied when running Docker + +**Solution:** Add user to docker group (Linux): + +.. code-block:: bash + + sudo usermod -aG docker $USER + # Log out and back in for changes to take effect + +Kubernetes/Helm Issues +----------------------- + +**Problem:** Helm chart not found in repository + +**Solution:** Clone the StreamFlow repository to access the Helm chart: + +.. code-block:: bash + + git clone https://github.com/alpha-unito/streamflow.git + cd streamflow/helm/chart + +**Problem:** Pods stuck in ``Pending`` state + +**Solution:** Check resource availability and pod events: + +.. code-block:: bash + + kubectl describe pod + kubectl get events --sort-by='.lastTimestamp' + +Dependency Conflicts +-------------------- + +**Problem:** pip reports dependency conflicts during installation + +**Solution:** Use a virtual environment to isolate dependencies: + +.. code-block:: bash + + # Create virtual environment + python3 -m venv streamflow-env + + # Activate virtual environment + source streamflow-env/bin/activate # Linux/macOS + + # Install StreamFlow + pip install streamflow + +Installation Hangs +------------------ + +**Problem:** pip installation hangs or is very slow + +**Solution:** Try using a different PyPI mirror or increase timeout: + +.. 
code-block:: bash + + # Increase timeout + pip install --timeout=300 streamflow + + # Use verbose output to see progress + pip install -v streamflow + +Performance Issues +------------------ + +**Problem:** StreamFlow runs slowly or uses excessive resources + +**Solution:** Check system resources and adjust configuration: + +.. code-block:: bash + + # Check available resources + free -h # Linux + top # Monitor processes + + # Adjust StreamFlow configuration (streamflow.yml) + # Set appropriate resource limits in deployment configuration + +Getting Help +------------ + +If you encounter issues not covered here: + +1. **Check logs:** StreamFlow logs provide detailed error information +2. **Search issues:** Visit https://github.com/alpha-unito/streamflow/issues +3. **Ask questions:** Use GitHub Discussions +4. **Report bugs:** Open a new issue with: + + * StreamFlow version (``streamflow version``) + * Python version (``python --version``) + * Operating system + * Complete error message + * Minimal reproducible example + +Next Steps +========== + +After installing StreamFlow, continue with: + +* :doc:`quickstart` - Run your first workflow in 10 minutes +* :doc:`writing-workflows` - Learn CWL workflow syntax +* :doc:`configuring-deployments` - Set up execution environments +* :ref:`/reference/configuration/index` - Detailed configuration reference + +Related Topics +============== + +* :doc:`/reference/cli/index` - Complete CLI command reference +* :doc:`/reference/configuration/index` - Configuration file schemas +* :doc:`/reference/connectors/index` - Available deployment connectors +* :doc:`troubleshooting` - Common issues and solutions diff --git a/docs/source/user-guide/quickstart.rst b/docs/source/user-guide/quickstart.rst new file mode 100644 index 000000000..d7413c463 --- /dev/null +++ b/docs/source/user-guide/quickstart.rst @@ -0,0 +1,158 @@ +========== +Quickstart +========== + +.. 
meta:: + :description: Get started with StreamFlow in 10 minutes - install, create your first workflow, and run it + :keywords: StreamFlow, quickstart, tutorial, getting started, first workflow + :audience: users + :difficulty: beginner + :reading_time_minutes: 10 + +**Prerequisites:** + +* :doc:`installation` - StreamFlow installed + +**What You'll Learn:** + +* Create a simple CWL workflow +* Configure and run a workflow with StreamFlow +* View workflow results + +Step 1: Create Project Directory +================================= + +.. code-block:: bash + + mkdir streamflow-quickstart + cd streamflow-quickstart + +Step 2: Create CWL Workflow +============================ + +Create ``hello-workflow.cwl``: + +.. literalinclude:: ../../examples/quickstart/hello-workflow.cwl + :language: yaml + :caption: hello-workflow.cwl - CWL Workflow Definition + :linenos: + +Step 3: Create Workflow Inputs +=============================== + +Create ``inputs.yml``: + +.. literalinclude:: ../../examples/quickstart/inputs.yml + :language: yaml + :caption: inputs.yml - Workflow Inputs + :linenos: + +Step 4: Create StreamFlow Configuration +======================================== + +Create ``streamflow.yml``: + +.. literalinclude:: ../../examples/quickstart/streamflow.yml + :language: yaml + :caption: streamflow.yml - StreamFlow Configuration + :linenos: + +Step 5: Run the Workflow +========================= + +.. code-block:: bash + + $ streamflow run streamflow.yml + + 2026-02-24 10:52:43.304 INFO Processing workflow fde2a338-33f7-42c3-80be-8a59cccabf53 + 2026-02-24 10:52:43.322 INFO EXECUTING workflow fde2a338-33f7-42c3-80be-8a59cccabf53 + 2026-02-24 10:52:43.382 INFO EXECUTING step / (job /0) locally + 2026-02-24 10:52:43.770 INFO COMPLETED Step / + 2026-02-24 10:52:43.773 INFO COMPLETED workflow execution + +Step 6: Check Results +====================== + +.. code-block:: bash + + $ cat output.txt + + Hello from StreamFlow! + +Success! 
Your first workflow executed and produced the expected output. + +Using Docker (Optional) +======================== + +To run the same workflow in a Docker container, update ``streamflow.yml``: + +.. code-block:: yaml + :caption: streamflow.yml - With Docker + :emphasize-lines: 10-13,15-18 + + version: v1.0 + + workflows: + hello-workflow: + type: cwl + config: + file: hello-workflow.cwl + settings: inputs.yml + + deployments: + docker-env: + type: docker + config: {} + + bindings: + - step: / + target: + deployment: docker-env + +Run again: + +.. code-block:: bash + + $ streamflow run streamflow.yml + + 2026-02-24 10:55:12.445 INFO EXECUTING step / (job /0) on docker-env + 2026-02-24 10:55:13.102 INFO COMPLETED Step / + +The workflow now runs in a Docker container instead of locally. + +Inspect Workflow Execution +=========================== + +List executed workflows: + +.. code-block:: bash + + $ streamflow list + + NAME STATUS START END + hello-workflow COMPLETED 2026-02-24 10:52:43 2026-02-24 10:52:43 + +View detailed report: + +.. 
code-block:: bash + + $ streamflow report hello-workflow + + Report generated: hello-workflow.html + +Common Issues +============= + +**Workflow not found:** Ensure you're in the directory containing ``streamflow.yml`` + +**Docker errors:** See :doc:`troubleshooting` for Docker setup issues + +**CWL validation errors:** Validate your workflow with ``cwltool --validate hello-workflow.cwl`` + +Next Steps +========== + +* :doc:`writing-workflows` - Learn CWL workflow syntax +* :doc:`configuring-deployments` - Set up remote execution environments +* :doc:`binding-workflows` - Control where workflow steps execute +* :doc:`troubleshooting` - Solutions to common issues diff --git a/docs/source/user-guide/quickstart.rst.backup b/docs/source/user-guide/quickstart.rst.backup new file mode 100644 index 000000000..eced7930c --- /dev/null +++ b/docs/source/user-guide/quickstart.rst.backup @@ -0,0 +1,337 @@ +========== +Quickstart +========== + +.. meta:: + :keywords: StreamFlow, quickstart, tutorial, getting started, first workflow + :description: Run your first StreamFlow workflow in 10 minutes - complete step-by-step tutorial + +Overview +======== + +This quickstart guide will walk you through creating and executing a simple StreamFlow workflow. You'll learn the basics by running a complete working example. + +Prerequisites +============= + +This guide assumes you have: + +* **StreamFlow installed** (see previous section) +* **Python 3.10 or later** +* **Text editor** for creating files +* **Terminal/command line** access + +Verify your installation: + +.. code-block:: bash + + $ streamflow version + + StreamFlow version 0.2.0 + +What We'll Build +================ + +A simple workflow that: + +1. Takes a message as input +2. Runs the ``echo`` command to print the message +3. Saves the output to a file +4. 
Runs locally (no external infrastructure needed) + +This minimal example demonstrates the complete StreamFlow workflow: CWL description + deployment configuration + execution. + +Step 1: Create Project Directory +================================= + +Create a directory for your first StreamFlow project: + +.. code-block:: bash + :caption: Command: Create project directory + + mkdir streamflow-quickstart + cd streamflow-quickstart + +Step 2: Create the CWL Workflow +================================ + +Create a file named ``hello-workflow.cwl`` with the following content: + +.. literalinclude:: ../../examples/quickstart/hello-workflow.cwl + :language: yaml + :caption: hello-workflow.cwl - CWL Workflow Definition + :linenos: + +**What this does:** + +* **Line 1-2:** Specifies CWL version (v1.2) and defines this as a CommandLineTool +* **Line 3:** The command to execute is ``echo`` +* **Line 5-9:** Defines an input parameter called ``message`` of type string +* **Line 10-12:** Defines output that captures stdout to a file +* **Line 13:** Redirects stdout to ``output.txt`` + +This is a minimal CWL workflow that echoes a message to a file. + +Step 3: Create the Workflow Inputs +=================================== + +Create a file named ``inputs.yml`` with the workflow inputs: + +.. literalinclude:: ../../examples/quickstart/inputs.yml + :language: yaml + :caption: inputs.yml - Workflow Inputs + :linenos: + +This file provides the input parameters for our CWL workflow. + +Step 4: Create the StreamFlow Configuration +============================================ + +Create a file named ``streamflow.yml`` with the following content: + +.. 
literalinclude:: ../../examples/quickstart/streamflow.yml + :language: yaml + :caption: streamflow.yml - StreamFlow Configuration + :linenos: + +**What this does:** + +* **Line 1:** Specifies StreamFlow configuration version +* **Line 3-8:** Defines a workflow named ``hello-workflow`` + + * Uses CWL type + * Points to our ``hello-workflow.cwl`` file + * Points to ``inputs.yml`` for input parameters + +No ``deployments`` or ``bindings`` section means StreamFlow will use the default local execution environment. + +Step 5: Run the Workflow +========================= + +Execute the workflow with a single command: + +.. code-block:: bash + + $ streamflow run streamflow.yml + + 2026-02-24 10:52:43.304 INFO Processing workflow fde2a338-33f7-42c3-80be-8a59cccabf53 + 2026-02-24 10:52:43.304 INFO Building workflow execution plan + 2026-02-24 10:52:43.322 INFO COMPLETED building of workflow execution plan + 2026-02-24 10:52:43.322 INFO EXECUTING workflow fde2a338-33f7-42c3-80be-8a59cccabf53 + 2026-02-24 10:52:43.382 INFO EXECUTING step / (job /0) locally + 2026-02-24 10:52:43.770 INFO COMPLETED Step / + 2026-02-24 10:52:43.773 INFO COMPLETED workflow execution + +The workflow executes locally and creates an output file. + +Step 6: Check the Results +========================== + +Verify the workflow output: + +.. code-block:: bash + + $ cat output.txt + + Hello from StreamFlow! + +.. code-block:: text + :caption: Expected Output + + Hello from StreamFlow! + +Success! Your first workflow has executed and produced the expected output. + +Understanding What Happened +============================ + +Let's break down what StreamFlow did: + +1. **Parsed Configuration:** Read ``streamflow.yml`` and understood the workflow structure +2. **Loaded CWL Workflow:** Parsed ``hello-workflow.cwl`` and created an execution plan +3. **Prepared Environment:** Since no deployment was specified, used the local environment +4. 
**Executed Command:** Ran ``echo "Hello from StreamFlow!"`` on your local system +5. **Captured Output:** Saved stdout to ``output.txt`` as specified in the CWL +6. **Completed:** Marked the workflow as successfully completed + +Project Structure +================= + +Your project directory now contains: + +.. code-block:: text + + streamflow-quickstart/ + ├── hello-workflow.cwl # CWL workflow definition + ├── inputs.yml # Workflow inputs + ├── streamflow.yml # StreamFlow configuration + ├── output.txt # Workflow output + └── .streamflow/ # StreamFlow metadata (hidden) + +The ``.streamflow/`` directory contains execution metadata, logs, and provenance information. + +Next Steps: Add a Deployment +============================= + +Let's enhance the workflow by adding a Docker deployment (optional - requires Docker installed): + +Create an updated ``streamflow.yml``: + +.. code-block:: yaml + :caption: streamflow.yml - With Docker Deployment + :linenos: + :emphasize-lines: 10-14,16-19 + + version: v1.0 + + workflows: + hello-workflow: + type: cwl + config: + file: hello-workflow.cwl + settings: inputs.yml + + deployments: + docker-env: + type: docker + config: + image: alpine:latest + + bindings: + - step: / + target: + deployment: docker-env + +Now run again: + +.. code-block:: bash + + $ streamflow run streamflow.yml + +This time, the workflow executes inside an Alpine Docker container instead of locally. + +Inspecting Workflow Execution +============================== + +StreamFlow provides commands to inspect workflow executions: + +**List all workflows:** + +.. code-block:: bash + + $ streamflow list + + Workflow Name Status Start Time Duration + ─────────────────────────────────────────────────────────── + hello-workflow COMPLETED 2024-02-24 10:30:00 2.3s + +**Generate detailed report:** + +.. code-block:: bash + + $ streamflow report hello-workflow + +This produces a detailed execution report with timing, resource usage, and provenance information. 
+ +Common Operations +================= + +**View logs:** + +.. code-block:: bash + + # Logs are in .streamflow directory + ls -la .streamflow/ + +**Clean up:** + +.. code-block:: bash + + # Remove StreamFlow metadata + rm -rf .streamflow/ + + # Remove outputs + rm output.txt + +**Debug mode:** + +.. code-block:: bash + + # Run with detailed logging + streamflow run --debug streamflow.yml + +Troubleshooting +=============== + +**Workflow fails to run:** + * Verify StreamFlow is installed: ``streamflow version`` + * Check CWL syntax is correct + * Ensure file paths in ``streamflow.yml`` are correct + +**Output file not created:** + * Check for error messages in the output + * Verify the CWL stdout specification + * Look in ``.streamflow/`` for execution logs + +**Docker deployment fails:** + * Ensure Docker is installed and running + * Verify you have permission to run Docker + * Check image name is correct + +See :doc:`troubleshooting` for more solutions. + +What You Learned +================ + +In this quickstart, you: + +✓ Created a simple CWL workflow +✓ Configured StreamFlow with ``streamflow.yml`` +✓ Executed a workflow locally +✓ Verified workflow outputs +✓ (Optional) Ran the same workflow in Docker +✓ Inspected workflow execution history + +Key Concepts +============ + +**CWL Workflow (hello-workflow.cwl):** + Describes *what* to execute - the computational task and its inputs/outputs + +**StreamFlow Configuration (streamflow.yml):** + Describes *how* to execute - which workflows to run and where to run them + +**Deployment:** + The execution environment (local, Docker, Kubernetes, HPC, etc.) 
+ +**Binding:** + The association between workflow steps and deployments + +Related Documentation +===================== + +**User Guide:** + - :doc:`writing-workflows` - Learn CWL in detail + - :doc:`configuring-deployments` - Configure execution environments + - :doc:`binding-workflows` - Advanced binding patterns + - :doc:`running-workflows` - All execution options + +**Reference:** + - :doc:`/reference/cli/streamflow-run` - Complete CLI reference + - :doc:`/reference/configuration/streamflow-yml` - Configuration schema + +**Examples:** + - Working examples in ``docs/examples/workflows/`` + - Advanced patterns in ``docs/examples/advanced/`` + +Next Steps +========== + +Now that you've run your first workflow, explore: + +1. **Write More Complex Workflows:** :doc:`writing-workflows` +2. **Use Real Deployments:** :doc:`configuring-deployments` +3. **Bind Steps to Specific Environments:** :doc:`binding-workflows` +4. **Explore Advanced Features:** :doc:`advanced-patterns/index` + +Start building real workflows with StreamFlow! diff --git a/docs/source/user-guide/running-workflows.rst b/docs/source/user-guide/running-workflows.rst new file mode 100644 index 000000000..ddb3b78a1 --- /dev/null +++ b/docs/source/user-guide/running-workflows.rst @@ -0,0 +1,676 @@ +================== +Running Workflows +================== + +.. meta:: + :keywords: StreamFlow, execute, run, workflow, CLI, monitoring, logs + :description: Learn how to execute and monitor StreamFlow workflows + +Overview +======== + +This guide explains how to execute workflows using the StreamFlow CLI, monitor execution progress, and understand execution options. + +Basic Execution +=============== + +Command Syntax +-------------- + +.. code-block:: bash + :caption: Execute a workflow + + streamflow run /path/to/streamflow.yml + +This command: + +1. Parses the StreamFlow configuration file +2. Deploys the specified execution environments +3. Executes the workflow according to bindings +4. 
Collects outputs and metadata +5. Tears down deployments + +Example Execution +----------------- + +.. code-block:: bash + + $ # Navigate to workflow directory + $ cd my-workflow-project + $ + $ # Execute workflow + $ streamflow run streamflow.yml + + 2024-02-24 12:00:00.123 INFO StreamFlow version 0.2.0.dev14 + 2024-02-24 12:00:00.456 INFO Loading workflow from streamflow.yml + 2024-02-24 12:00:01.789 INFO Deploying environment: docker-python + 2024-02-24 12:00:03.012 INFO Starting workflow execution + 2024-02-24 12:00:05.345 INFO Step /preprocess completed + 2024-02-24 12:00:08.678 INFO Step /analyze completed + 2024-02-24 12:00:10.901 INFO Workflow completed successfully + 2024-02-24 12:00:11.234 INFO Undeploying environments + 2024-02-24 12:00:12.567 INFO Results saved to ./results + +Command-Line Options +==================== + +Output Directory +---------------- + +Specify where to store workflow results: + +.. code-block:: bash + :caption: Set output directory + + streamflow run streamflow.yml --outdir /path/to/outputs + +**Default:** Current directory (``.``) + +**Output Contents:** + +:: + + outputs/ + ├── result_file_1.txt # Workflow output files + ├── result_file_2.csv + └── .streamflow/ # StreamFlow metadata + └── workflow.db # Execution database + +Workflow Name +------------- + +Assign a name to the workflow execution: + +.. code-block:: bash + :caption: Name workflow execution + + streamflow run streamflow.yml --name my-experiment-v1 + +**Purpose:** + +* Track multiple executions of the same workflow +* Generate reports for specific executions +* Organize workflow history + +**Default:** Auto-generated unique name + +Colored Logs +------------ + +Enable colored log output: + +.. 
code-block:: bash + :caption: Enable colored logs + + streamflow run streamflow.yml --color + +**Colors by Level:** + +* **ERROR** - Red +* **WARNING** - Yellow +* **INFO** - Green +* **DEBUG** - Blue + +**Use Cases:** + +* Live demos +* Interactive terminal sessions +* Faster log inspection + +Log Level +--------- + +Control logging verbosity: + +.. code-block:: bash + :caption: Set log level + + # Minimal output (only results, warnings, and errors) + streamflow run streamflow.yml --quiet + + # Normal output (default - shows workflow progress) + streamflow run streamflow.yml + + # Detailed output (debug-level diagnostics) + streamflow run streamflow.yml --debug + +**Levels:** + +================ ======================================== +Flag Output +================ ======================================== +``--quiet`` Only results, warnings, and errors +(default) Normal execution information +``--debug`` Detailed debugging information +================ ======================================== + +Complete Example +---------------- + +.. code-block:: bash + :caption: Full command with options + + streamflow run streamflow.yml \ + --outdir ./experiment-results \ + --name genome-pipeline-run-42 \ + --color \ + --debug + +CWL Runner Interface +==================== + +StreamFlow supports the ``cwl-runner`` interface for CWL compatibility: + +Basic Usage +----------- + +.. code-block:: bash + :caption: Use cwl-runner interface + + cwl-runner workflow.cwl inputs.yml + +This executes the CWL workflow directly without a StreamFlow configuration file. + +**Equivalent to:** + +.. code-block:: yaml + :caption: streamflow.yml equivalent + + version: v1.0 + workflows: + workflow: + type: cwl + config: + file: workflow.cwl + settings: inputs.yml + bindings: + - step: / + target: + deployment: local + + deployments: + local: + type: local + +With Deployments +---------------- + +To use custom deployments with ``cwl-runner``, create a minimal StreamFlow file: + +.. 
code-block:: bash + :caption: CWL runner with deployments + + cwl-runner \ + --streamflow-file streamflow-config.yml \ + workflow.cwl \ + inputs.yml + +.. code-block:: yaml + :caption: streamflow-config.yml + + version: v1.0 + deployments: + docker-env: + type: docker + config: + image: python:3.10 + bindings: + - step: / + target: + deployment: docker-env + +Running in Containers +===================== + +Run StreamFlow Docker Image +---------------------------- + +Execute StreamFlow in a container: + +.. code-block:: bash + :caption: Run StreamFlow in Docker + + docker run --rm \ + -v "$(pwd)"/my-project:/streamflow/project \ + -v "$(pwd)"/results:/streamflow/results \ + -v "$(pwd)"/tmp:/tmp/streamflow \ + alphaunito/streamflow \ + streamflow run /streamflow/project/streamflow.yml + +**Volume Mounts:** + +========================== ======================================== +Mount Purpose +========================== ======================================== +``my-project`` Workflow files (streamflow.yml, etc.) +``results`` Workflow outputs +``tmp`` Temporary files +``$HOME/.streamflow`` Metadata database (optional) +========================== ======================================== + +Complete Docker Example +----------------------- + +.. code-block:: bash + :caption: Complete Docker execution + + docker run -d \ + --name streamflow-execution \ + --mount type=bind,source="$(pwd)"/my-project,target=/streamflow/project \ + --mount type=bind,source="$(pwd)"/results,target=/streamflow/results \ + --mount type=bind,source="$(pwd)"/tmp,target=/tmp/streamflow \ + --mount type=bind,source="$HOME"/.streamflow,target=/root/.streamflow \ + alphaunito/streamflow \ + streamflow run /streamflow/project/streamflow.yml \ + --outdir /streamflow/results \ + --name my-workflow \ + --color + +**Monitor logs:** + +.. 
code-block:: bash + + docker logs -f streamflow-execution + +**Limitations:** + +Container-based connectors (Docker, Docker Compose, Singularity) are **not supported** from inside a Docker container due to nested container complexity. + +Running on Kubernetes +===================== + +Deploy as Kubernetes Job +------------------------ + +.. code-block:: yaml + :caption: streamflow-job.yaml - Kubernetes Job + + apiVersion: batch/v1 + kind: Job + metadata: + name: streamflow-workflow + spec: + template: + spec: + containers: + - name: streamflow + image: alphaunito/streamflow:latest + command: + - streamflow + - run + - /workflow/streamflow.yml + volumeMounts: + - name: workflow + mountPath: /workflow + - name: results + mountPath: /results + volumes: + - name: workflow + configMap: + name: streamflow-config + - name: results + persistentVolumeClaim: + claimName: streamflow-results + restartPolicy: Never + +Apply the job: + +.. code-block:: bash + :caption: Deploy StreamFlow job + + kubectl create configmap streamflow-config \ + --from-file=streamflow.yml \ + --from-file=workflow.cwl \ + --from-file=inputs.yml + + kubectl apply -f streamflow-job.yaml + +In-Cluster Execution +-------------------- + +For Helm connector to deploy on the same cluster: + +.. code-block:: yaml + :caption: streamflow.yml - In-cluster config + + deployments: + helm-deployment: + type: helm + config: + inCluster: true # Use ServiceAccount credentials + chart: bitnami/spark + release: spark-cluster + +**Requirements:** + +* Proper RBAC configuration +* ServiceAccount with deployment permissions + +.. 
code-block:: yaml + :caption: rbac.yaml - Required permissions + + apiVersion: v1 + kind: ServiceAccount + metadata: + name: streamflow-sa + --- + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: streamflow-role + rules: + - apiGroups: ["", "apps", "batch"] + resources: ["pods", "deployments", "jobs", "services"] + verbs: ["get", "list", "create", "delete", "watch"] + --- + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: streamflow-rolebinding + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: streamflow-role + subjects: + - kind: ServiceAccount + name: streamflow-sa + +Monitoring Execution +==================== + +Progress Tracking +----------------- + +StreamFlow logs provide real-time progress: + +.. code-block:: text + :caption: Example log output + + 2024-02-24 12:00:00.123 INFO StreamFlow version 0.2.0.dev14 + 2024-02-24 12:00:00.456 INFO Loading workflow from streamflow.yml + 2024-02-24 12:00:01.789 INFO Deploying environment: kubernetes-cluster + 2024-02-24 12:00:05.012 INFO Deployment kubernetes-cluster ready + 2024-02-24 12:00:05.345 INFO Starting workflow execution + 2024-02-24 12:00:06.678 INFO Executing step: /preprocess + 2024-02-24 12:00:15.901 INFO Step /preprocess completed (9.2s) + 2024-02-24 12:00:16.234 INFO Executing step: /analyze (scattered: 10 instances) + 2024-02-24 12:01:45.567 INFO Step /analyze completed (89.3s) + 2024-02-24 12:01:45.890 INFO Executing step: /visualize + 2024-02-24 12:01:52.123 INFO Step /visualize completed (6.2s) + 2024-02-24 12:01:52.456 INFO Workflow completed successfully + 2024-02-24 12:01:52.789 INFO Total execution time: 112.4s + 2024-02-24 12:01:53.012 INFO Undeploying environments + 2024-02-24 12:01:55.345 INFO Results saved to ./results + +Debug Logs +---------- + +Enable debug logging for troubleshooting: + +.. 
code-block:: bash + :caption: Debug logging + + streamflow run streamflow.yml --debug + +Debug logs include: + +* Detailed connector operations +* File transfer information +* Scheduling decisions +* Data management operations + +Log Files +--------- + +Redirect logs to a file: + +.. code-block:: bash + :caption: Save logs to file + + streamflow run streamflow.yml 2>&1 | tee workflow-execution.log + +Or: + +.. code-block:: bash + + streamflow run streamflow.yml > workflow.log 2>&1 + +Execution Lifecycle +=================== + +StreamFlow execution follows these phases: + +1. **Initialization** + + * Parse configuration + * Validate workflow syntax + * Initialize database + +2. **Deployment** + + * Deploy execution environments + * Verify connectivity + * Create services/locations + +3. **Execution** + + * Schedule workflow steps + * Transfer input data + * Execute tasks + * Collect outputs + +4. **Collection** + + * Gather results + * Save metadata + * Generate provenance + +5. **Teardown** + + * Undeploy environments + * Clean temporary files + * Close connections + +Execution States +---------------- + +================ ======================================== +State Description +================ ======================================== +``PENDING`` Workflow queued for execution +``RUNNING`` Workflow currently executing +``COMPLETED`` Workflow finished successfully +``FAILED`` Workflow encountered errors +``CANCELLED`` Workflow manually stopped +================ ======================================== + +Handling Failures +================= + +Automatic Retry +--------------- + +StreamFlow automatically retries failed tasks based on configuration: + +.. code-block:: yaml + :caption: Configure retry behavior (planned feature) + + workflows: + my-workflow: + config: + retry: + maxAttempts: 3 + backoff: exponential + +Checkpointing +------------- + +StreamFlow supports checkpointing for long-running workflows (see recovery features in reference documentation). 
+ +Manual Intervention +------------------- + +If workflow fails: + +1. **Check logs** for error messages +2. **Verify deployments** are accessible +3. **Fix issues** (connectivity, resources, etc.) +4. **Restart workflow** from checkpoint or beginning + +Performance Optimization +======================== + +Parallel Execution +------------------ + +Use scatter for parallel processing: + +.. code-block:: yaml + :caption: CWL scatter for parallelism + + steps: + process: + run: process-tool.cwl + scatter: input_file + in: + input_file: input_files # Array of files + out: [output] + +StreamFlow schedules scattered tasks across available locations. + +Resource Allocation +------------------- + +Specify resource requirements in CWL: + +.. code-block:: yaml + :caption: Resource hints + + hints: + ResourceRequirement: + coresMin: 4 + ramMin: 8192 # MB + +Data Locality +------------- + +Bind steps to deployments where data resides: + +.. code-block:: yaml + :caption: Optimize data locality + + bindings: + - step: /process_large_data + target: + deployment: hpc-storage + - port: /large_dataset + target: + deployment: hpc-storage + workdir: /data + +Best Practices +============== + +1. **Test Locally First** + + .. code-block:: bash + + # Test with local deployment + streamflow run streamflow-local.yml + +2. **Use Descriptive Names** + + .. code-block:: bash + + streamflow run workflow.yml --name experiment-2024-02-24-v3 + +3. **Save Logs** + + .. code-block:: bash + + streamflow run workflow.yml 2>&1 | tee logs/execution-$(date +%Y%m%d-%H%M%S).log + +4. **Monitor Resource Usage** + + Use system monitoring tools alongside StreamFlow execution. + +5. **Organize Outputs** + + .. code-block:: bash + + mkdir -p results/$(date +%Y-%m-%d) + streamflow run workflow.yml --outdir results/$(date +%Y-%m-%d) + +6. **Version Control Workflows** + + Keep ``streamflow.yml`` and CWL files in version control. 
+ +Troubleshooting +=============== + +Workflow Won't Start +-------------------- + +**Problem:** Workflow fails immediately at startup + +**Solution:** + +* Check YAML syntax: ``streamflow schema`` for validation +* Verify all referenced files exist +* Check deployment connectivity +* Review initialization logs + +Stuck at Deployment +------------------- + +**Problem:** Workflow hangs during deployment phase + +**Solution:** + +* Check network connectivity to deployment targets +* Verify credentials (SSH keys, kubeconfig, etc.) +* Check resource availability +* Review connector-specific logs + +Step Execution Fails +-------------------- + +**Problem:** Specific workflow step fails + +**Solution:** + +* Check step-specific logs +* Verify input files are accessible +* Check command/tool is available in execution environment +* Verify resource requirements are met + +Slow Execution +-------------- + +**Problem:** Workflow runs slower than expected + +**Solution:** + +* Check network bandwidth for data transfers +* Verify adequate resources allocated +* Review scheduling decisions in debug logs +* Consider data locality optimizations + +For comprehensive troubleshooting, see :doc:`troubleshooting`. 
+ +Next Steps +========== + +After running workflows: + +* :doc:`inspecting-results` - Analyze workflow execution and results +* :doc:`troubleshooting` - Resolve common issues +* :doc:`/reference/cli/run` - Complete CLI reference +* :doc:`/reference/cli/report` - Generate execution reports + +Related Topics +============== + +* :doc:`binding-workflows` - Configure workflow bindings +* :doc:`/reference/cli/index` - Complete CLI documentation +* :doc:`/developer-guide/core-interfaces/workflow` - Workflow execution internals diff --git a/docs/source/user-guide/troubleshooting.rst b/docs/source/user-guide/troubleshooting.rst new file mode 100644 index 000000000..b7cd17f95 --- /dev/null +++ b/docs/source/user-guide/troubleshooting.rst @@ -0,0 +1,830 @@ +=============== +Troubleshooting +=============== + +.. meta:: + :keywords: StreamFlow, troubleshooting, debugging, errors, problems, solutions + :description: Comprehensive troubleshooting guide for common StreamFlow issues + +Overview +======== + +This guide provides solutions to common problems encountered when using StreamFlow. Issues are organized by category for quick reference. + +Quick Diagnostic Steps +====================== + +When encountering issues: + +1. **Check logs** - Look for error messages +2. **Verify configuration** - Validate YAML syntax +3. **Test connectivity** - Ensure deployments are reachable +4. **Check resources** - Verify adequate CPU/memory/disk +5. **Isolate the problem** - Test components individually + +Installation Issues +=================== + +Python Version Error +-------------------- + +**Problem:** ``ERROR: Package requires a different Python version`` + +**Cause:** StreamFlow requires Python 3.10 or later + +**Solution:** + +.. 
code-block:: bash + + # Check Python version + python --version + + # Install with specific Python version + python3.10 -m pip install streamflow + +Command Not Found +----------------- + +**Problem:** ``command 'streamflow' not found`` + +**Cause:** Installation directory not in PATH + +**Solution:** + +.. code-block:: bash + + # Find installation location + pip show streamflow | grep Location + + # Add to PATH (add to ~/.bashrc or ~/.zshrc) + export PATH="$HOME/.local/bin:$PATH" + + # Or use full path + ~/.local/bin/streamflow version + +Dependency Conflicts +-------------------- + +**Problem:** pip reports dependency conflicts + +**Solution:** + +.. code-block:: bash + + # Use virtual environment + python3 -m venv streamflow-env + source streamflow-env/bin/activate + pip install streamflow + +Configuration Issues +==================== + +YAML Syntax Errors +------------------ + +**Problem:** ``YAML parse error`` or ``Unexpected token`` + +**Solution:** + +* Check indentation (use spaces, not tabs) +* Ensure proper YAML formatting +* Validate with online YAML validator +* Use JSON Schema for validation: + +.. code-block:: bash + + streamflow schema > streamflow-schema.json + # Configure IDE to use schema + +Invalid Deployment Type +----------------------- + +**Problem:** ``Unknown deployment type 'xxx'`` + +**Solution:** + +* Check spelling of connector type +* Verify connector is available: + +.. 
code-block:: bash + + streamflow ext list + +* Ensure required plugin is installed + +Missing Required Fields +----------------------- + +**Problem:** ``Missing required field 'xxx'`` + +**Solution:** + +* Check connector documentation for required fields +* Refer to :doc:`/reference/index` +* Generate schema for auto-completion + +Step Not Found +-------------- + +**Problem:** ``Step '/xxx' not found in workflow`` + +**Solution:** + +* Verify step name matches CWL workflow +* Check for typos in step path +* Use ``/`` for entire workflow +* For nested workflows: ``/subworkflow/step`` + +Deployment Issues +================= + +Connection Timeout +------------------ + +**Problem:** ``Connection timeout`` or ``Connection refused`` + +**Cause:** Network connectivity or service not running + +**Solution:** + +.. code-block:: bash + + # Test connectivity + ping hostname + + # Test SSH + ssh user@hostname + + # Check Docker daemon + docker info + + # Check Kubernetes + kubectl cluster-info + +SSH Authentication Failed +------------------------- + +**Problem:** ``Permission denied (publickey)`` + +**Cause:** SSH key not authorized or incorrect + +**Solution:** + +.. code-block:: bash + + # Check SSH key permissions + chmod 600 ~/.ssh/id_rsa + chmod 644 ~/.ssh/id_rsa.pub + + # Test SSH connection + ssh -i ~/.ssh/id_rsa user@hostname + + # Add public key to remote host + ssh-copy-id -i ~/.ssh/id_rsa.pub user@hostname + + # Verify key in authorized_keys + cat ~/.ssh/authorized_keys # on remote host + +Docker Issues +------------- + +**Problem:** ``Cannot connect to Docker daemon`` + +**Solution:** + +.. code-block:: bash + + # Start Docker daemon + sudo systemctl start docker # Linux + # or start Docker Desktop (macOS) + + # Check Docker status + docker info + + # Add user to docker group (Linux) + sudo usermod -aG docker $USER + # Log out and back in + +**Problem:** ``ImagePullBackOff`` or ``Failed to pull image`` + +**Solution:** + +.. 
code-block:: bash + + # Test image locally + docker pull image:tag + + # Check image name/tag + # Verify registry credentials if private + docker login registry.example.com + + # Check network connectivity + +Kubernetes Issues +----------------- + +**Problem:** ``Unauthorized`` or ``Error loading kubeconfig`` + +**Solution:** + +.. code-block:: bash + + # Verify kubeconfig + kubectl cluster-info + + # Check authentication + kubectl auth can-i get pods + + # Verify namespace exists + kubectl get namespaces + + # Check current context + kubectl config current-context + +**Problem:** Pods stuck in ``Pending`` + +**Solution:** + +.. code-block:: bash + + # Check pod events + kubectl describe pod + + # Check node resources + kubectl describe nodes + + # Check resource quotas + kubectl get resourcequota + + # Reduce resource requests or scale cluster + +Slurm/PBS Issues +---------------- + +**Problem:** Job stays in queue indefinitely + +**Solution:** + +.. code-block:: bash + + # Check job status + squeue # Slurm + qstat # PBS + + # Check job details + scontrol show job # Slurm + qstat -f # PBS + + # Verify partition/queue exists + sinfo # Slurm + qstat -Q # PBS + + # Check account/QoS limits + sacctmgr show assoc # Slurm + +* Reduce resource requests +* Check account limits +* Verify partition is active + +**Problem:** ``Invalid account`` or ``Invalid QOS`` + +**Solution:** + +.. code-block:: bash + + # Check available accounts + sacctmgr show assoc where user=$USER # Slurm + + # Use correct account in configuration + services: + compute: + account: valid-account-name + +Workflow Execution Issues +========================== + +Workflow Won't Start +-------------------- + +**Problem:** Workflow fails immediately at startup + +**Diagnostic Steps:** + +1. Check for configuration errors +2. Verify CWL file syntax +3. Check file paths exist +4. Review initialization logs + +**Solution:** + +.. 
code-block:: bash + + # Validate CWL + cwltool --validate workflow.cwl + + # Check StreamFlow config syntax + streamflow schema > schema.json + # Validate against schema + + # Enable debug logging + streamflow run workflow.yml --debug + +Step Execution Fails +-------------------- + +**Problem:** Specific workflow step fails + +**Diagnostic Steps:** + +1. Check step logs for error messages +2. Verify command/tool is available +3. Check input files are accessible +4. Verify resource requirements + +**Solution:** + +.. code-block:: bash + + # Check if command exists in deployment + # For Docker: + docker run image:tag which command + + # For SSH: + ssh user@host which command + + # Verify input files + # Check file paths in error logs + + # Test command manually + # Run the exact command from logs + +Command Not Found +----------------- + +**Problem:** ``command not found: xxx`` + +**Cause:** Tool not installed in execution environment + +**Solution:** + +* Verify tool in container image +* Install missing tools +* Use correct base image +* Check PATH in execution environment + +.. code-block:: yaml + + # Ensure tool in container + deployments: + docker-env: + type: docker + config: + image: image-with-tool:latest + +File Not Found +-------------- + +**Problem:** ``No such file or directory: /path/to/file`` + +**Cause:** File not accessible in execution location + +**Solution:** + +* Check file exists locally +* Verify file transfer occurred +* Use port binding for remote files: + +.. code-block:: yaml + + bindings: + - port: /input_file + target: + deployment: remote-storage + workdir: /data + +* Check file permissions + +Data Transfer Issues +-------------------- + +**Problem:** Large files transfer slowly or fail + +**Solution:** + +* Use data locality - execute where data resides +* Configure port bindings to avoid transfers +* Check network bandwidth +* Use appropriate transfer protocols + +.. 
code-block:: yaml + + # Avoid transfer by using port binding + bindings: + - step: /process + target: + deployment: hpc-cluster + - port: /large_dataset + target: + deployment: hpc-cluster + workdir: /scratch/data + +Memory Issues +------------- + +**Problem:** ``Out of memory`` or ``Killed`` + +**Solution:** + +* Increase memory limits: + +.. code-block:: yaml + + # CWL hint + hints: + ResourceRequirement: + ramMin: 16384 # MB + + # Kubernetes + services: + workers: + template: + spec: + containers: + - resources: + limits: + memory: "16Gi" + + # Slurm + services: + compute: + mem: 64G + +* Process data in smaller chunks +* Use streaming/incremental processing +* Scale to nodes with more memory + +Disk Space Issues +----------------- + +**Problem:** ``No space left on device`` + +**Solution:** + +.. code-block:: bash + + # Check disk space + df -h + + # Clean temporary files + rm -rf /tmp/streamflow/* + + # Increase disk quota + # Configure larger scratch space + +* Use deployment with more disk +* Clean intermediate files +* Stream data instead of storing + +Container-Specific Issues +========================== + +Permission Denied in Container +------------------------------ + +**Problem:** ``Permission denied`` inside container + +**Cause:** User ID mismatch + +**Solution:** + +.. code-block:: yaml + + deployments: + docker-env: + type: docker + config: + image: myimage:latest + user: "1000:1000" # Match host UID:GID + +Volume Mount Issues +------------------- + +**Problem:** Files not visible in container + +**Solution:** + +.. code-block:: yaml + + deployments: + docker-volumes: + type: docker + config: + image: myimage:latest + volumes: + - /host/path:/container/path:rw + +* Verify host path exists +* Check path permissions +* Use absolute paths + +Performance Issues +================== + +Slow Execution +-------------- + +**Problem:** Workflow runs slower than expected + +**Diagnostic Steps:** + +1. Generate HTML report +2. 
Check timeline for bottlenecks +3. Identify long-running steps +4. Check for sequential execution that should be parallel + +**Solutions:** + +* Add parallelism with scatter: + +.. code-block:: yaml + + steps: + process: + run: tool.cwl + scatter: input_file + in: + input_file: input_files + +* Improve data locality +* Use faster deployment +* Adjust resource allocation +* Optimize step commands + +Excessive Idle Time +------------------- + +**Problem:** Resources sit idle during execution + +**Solution:** + +* Increase parallelism +* Adjust scatter configuration +* Use multiple deployments +* Check scheduling efficiency + +CWL Issues +========== + +Invalid CWL Syntax +------------------ + +**Problem:** ``Invalid CWL`` or ``Parse error`` + +**Solution:** + +.. code-block:: bash + + # Validate CWL + cwltool --validate workflow.cwl + + # Check version compatibility + # StreamFlow supports CWL v1.0, v1.1, v1.2 + +Output Glob No Matches +---------------------- + +**Problem:** ``Output glob pattern matches no files`` + +**Cause:** Command didn't create expected output + +**Solution:** + +* Verify command actually creates file +* Check output filename/pattern +* Check working directory +* Test command manually: + +.. code-block:: bash + + # Run command to see what files it creates + ls -la + +JavaScript Expression Errors +----------------------------- + +**Problem:** ``Invalid JavaScript expression`` + +**Solution:** + +* Add requirement: + +.. code-block:: yaml + + requirements: + InlineJavascriptRequirement: {} + +* Check JavaScript syntax +* Verify variable names (``inputs.*``, ``runtime.*``) + +Secondary Files Missing +----------------------- + +**Problem:** ``Secondary file not found`` + +**Solution:** + +* Ensure secondary files exist +* Check secondaryFiles specification: + +.. 
code-block:: yaml + + inputs: + reference: + type: File + secondaryFiles: + - .fai + - ^.dict # ^ means replace extension + +Debugging Techniques +==================== + +Enable Debug Logging +-------------------- + +.. code-block:: bash + + streamflow run workflow.yml --debug + +Debug logs show: + +* Detailed connector operations +* File transfers +* Command executions +* Scheduling decisions + +Test Locally First +------------------ + +.. code-block:: yaml + + # Test with local deployment + deployments: + local: + type: local + + bindings: + - step: / + target: + deployment: local + +Isolate Problems +---------------- + +1. **Test deployment separately** + +.. code-block:: bash + + # SSH + ssh user@hostname + + # Docker + docker run -it image:tag /bin/bash + + # Kubernetes + kubectl run test --image=image:tag -it -- /bin/bash + +2. **Test CWL workflow separately** + +.. code-block:: bash + + cwltool workflow.cwl inputs.yml + +3. **Test single step** + +.. code-block:: yaml + + bindings: + - step: /problematic_step + target: + deployment: test-deployment + +Manual Command Execution +------------------------ + +Run the exact command from logs manually: + +.. code-block:: bash + + # Copy command from debug logs + # Execute in same environment + ssh user@host 'command from logs' + +Check Intermediate Files +------------------------ + +Inspect intermediate outputs: + +.. 
code-block:: bash + + # Check working directories for intermediate files + ls -la /tmp/streamflow/ + + # Or check .streamflow directory + ls -la .streamflow/ + +Common Error Messages +===================== + +Database Errors +--------------- + +**Error:** ``Database is locked`` + +**Solution:** + +* Close other StreamFlow instances +* Remove lock file if stale +* Use separate databases (``--outdir``) + +**Error:** ``Unable to open database file`` + +**Solution:** + +* Check file permissions +* Verify directory exists +* Check disk space + +Network Errors +-------------- + +**Error:** ``Connection reset by peer`` + +**Solution:** + +* Check network stability +* Verify firewall rules +* Increase timeout settings + +**Error:** ``Name or service not known`` + +**Solution:** + +* Verify hostname is correct +* Check DNS resolution +* Use IP address instead + +Resource Errors +--------------- + +**Error:** ``Insufficient resources`` + +**Solution:** + +* Reduce resource requests +* Scale deployment +* Use different deployment with more resources + +Getting Help +============ + +If problems persist: + +1. **Check Logs** + + * Enable debug logging + * Review full error messages + * Check connector-specific logs + +2. **Search Documentation** + + * :doc:`/reference/index` - Connector-specific guidance + * :doc:`/reference/cli/index` - CLI reference + * CWL issues: https://www.commonwl.org/ + +3. **Search GitHub Issues** + + https://github.com/alpha-unito/streamflow/issues + +4. **Report Bug** + + When reporting issues, include: + + * StreamFlow version (``streamflow version``) + * Python version (``python --version``) + * Operating system + * Complete error message + * Minimal reproducible example + * Debug logs + +5. 
**Ask for Help**
+
+   * GitHub Discussions: https://github.com/alpha-unito/streamflow/discussions
+   * Include context and what you've tried
+
+Related Topics
+==============
+
+* :doc:`running-workflows` - Workflow execution guide
+* :doc:`inspecting-results` - Debugging with reports
+* :doc:`/reference/index` - Connector documentation
+* :doc:`/developer-guide/index` - Architecture and internals
diff --git a/docs/source/user-guide/writing-workflows.rst b/docs/source/user-guide/writing-workflows.rst
new file mode 100644
index 000000000..cd92e8a05
--- /dev/null
+++ b/docs/source/user-guide/writing-workflows.rst
@@ -0,0 +1,642 @@
+=================
+Writing Workflows
+=================
+
+.. meta::
+   :keywords: StreamFlow, CWL, Common Workflow Language, workflow, CommandLineTool, steps
+   :description: Learn how to write CWL workflows for StreamFlow execution
+
+Overview
+========
+
+StreamFlow uses the `Common Workflow Language <https://www.commonwl.org/>`_ (CWL) standard to describe workflows. This guide introduces CWL concepts and shows how to write workflows for StreamFlow execution.
+
+CWL Support in StreamFlow
+==========================
+
+StreamFlow implements CWL conformance for multiple versions:
+
+=========== =================== ===============================
+Version     Conformance Status  Notes
+=========== =================== ===============================
+v1.0        Full conformance    All required and optional features
+v1.1        Full conformance    Including scatter/gather
+v1.2        Full conformance    Including conditional execution
+v1.3        Partial support     Under development
+=========== =================== ===============================
+
+For complete conformance details, see :doc:`/reference/cwl-support/index`.
+
+.. note::
+   StreamFlow does not modify the CWL specification. All standard CWL workflows are compatible with StreamFlow.
+
+CWL Basics
+==========
+
+A CWL workflow consists of:
+
+**Workflow**
+   A directed acyclic graph (DAG) of computational steps. 
+ +**Steps** + Individual computational tasks, each described by a CommandLineTool or sub-workflow. + +**Inputs** + Data and parameters provided to the workflow. + +**Outputs** + Results produced by the workflow. + +**Connections** + Data flow between steps via input/output bindings. + +Minimal Workflow Example +======================== + +Here's the simplest possible CWL workflow: + +.. code-block:: yaml + :caption: hello-world.cwl - Minimal workflow + + cwlVersion: v1.2 + class: Workflow + + inputs: + message: string + + outputs: + output_file: + type: File + outputSource: echo_step/outfile + + steps: + echo_step: + run: + class: CommandLineTool + baseCommand: echo + inputs: + msg: + type: string + inputBinding: + position: 1 + outputs: + outfile: + type: stdout + stdout: output.txt + in: + msg: message + out: [outfile] + +**Usage:** + +.. code-block:: yaml + :caption: hello-inputs.yml + + message: "Hello from StreamFlow!" + +.. code-block:: bash + :caption: Execute the workflow + + streamflow run workflow.yml + +Where ``workflow.yml`` is the StreamFlow configuration file that references ``hello-world.cwl``. + +CommandLineTool Structure +========================== + +A CommandLineTool describes how to execute a single command: + +.. code-block:: yaml + :caption: CommandLineTool components + + class: CommandLineTool + baseCommand: [command, arg1] # Base command to execute + arguments: [--flag, value] # Additional arguments + inputs: # Input parameters + input_name: + type: File | string | int | boolean | ... + inputBinding: + position: 1 # Argument position + prefix: --input # Command-line flag + outputs: # Output specifications + output_name: + type: File | Directory | ... 
+ outputBinding: + glob: "*.txt" # Output file pattern + requirements: # Tool requirements + - class: DockerRequirement + dockerPull: ubuntu:22.04 + hints: # Optional hints + - class: ResourceRequirement + coresMin: 4 + ramMin: 8192 + +Data Types +---------- + +CWL supports various data types: + +================== ======================================== +Type Description +================== ======================================== +``string`` Text string +``int`` Integer number +``long`` Long integer +``float`` Floating-point number +``double`` Double-precision float +``boolean`` True or false +``File`` File reference +``Directory`` Directory reference +``null`` Null value (for optional inputs) +``array`` Array of any type (e.g., ``File[]``) +``record`` Structured data (custom schema) +``enum`` Enumerated values +================== ======================================== + +Optional inputs use a union type: ``[null, string]`` or ``string?`` + +Multi-Step Workflow Example +============================ + +Here's a more realistic workflow with multiple steps: + +.. 
code-block:: yaml + :caption: compile-workflow.cwl - Extract and compile Java source + + cwlVersion: v1.2 + class: Workflow + + inputs: + tarball: File + name_of_file_to_extract: string + + outputs: + compiled_class: + type: File + outputSource: compile/classfile + + steps: + untar: + run: + class: CommandLineTool + baseCommand: [tar, --extract] + inputs: + tarfile: + type: File + inputBinding: + prefix: --file + extractfile: string + outputs: + extracted_file: + type: File + outputBinding: + glob: $(inputs.extractfile) + in: + tarfile: tarball + extractfile: name_of_file_to_extract + out: [extracted_file] + + compile: + run: + class: CommandLineTool + baseCommand: javac + arguments: ["-d", $(runtime.outdir)] + inputs: + src: + type: File + inputBinding: + position: 1 + outputs: + classfile: + type: File + outputBinding: + glob: "*.class" + in: + src: untar/extracted_file + out: [classfile] + +**Key Points:** + +* Steps execute in dependency order (``compile`` waits for ``untar``) +* Data flows from ``untar/extracted_file`` to ``compile/src`` +* Each step has its own CommandLineTool definition + +Workflow Requirements +===================== + +Requirements specify runtime conditions: + +Common Requirements +------------------- + +**DockerRequirement** + Run in a Docker container: + + .. code-block:: yaml + + requirements: + DockerRequirement: + dockerPull: python:3.10 + +**InitialWorkDirRequirement** + Stage files in the working directory: + + .. code-block:: yaml + + requirements: + InitialWorkDirRequirement: + listing: + - $(inputs.input_file) + - entry: $(inputs.config_data) + entryname: config.json + +**ResourceRequirement** + Specify resource needs: + + .. code-block:: yaml + + requirements: + ResourceRequirement: + coresMin: 4 + coresMax: 8 + ramMin: 8192 # MB + ramMax: 16384 + tmpdirMin: 10000 # MB + outdirMin: 10000 + +**ScatterFeatureRequirement** + Enable parallel scatter execution: + + .. 
code-block:: yaml
+
+       requirements:
+         ScatterFeatureRequirement: {}
+
+**SubworkflowFeatureRequirement**
+   Use sub-workflows:
+
+   .. code-block:: yaml
+
+       requirements:
+         SubworkflowFeatureRequirement: {}
+
+For a complete list, see the `CWL specification <https://www.commonwl.org/v1.2/Workflow.html>`_.
+
+Scatter/Gather Pattern
+======================
+
+Process arrays in parallel using scatter:
+
+.. code-block:: yaml
+   :caption: Scatter example - Process multiple files
+
+   cwlVersion: v1.2
+   class: Workflow
+
+   requirements:
+     ScatterFeatureRequirement: {}
+
+   inputs:
+     files: File[]
+
+   outputs:
+     processed:
+       type: File[]
+       outputSource: process/output
+
+   steps:
+     process:
+       run: process-tool.cwl
+       scatter: input_file
+       in:
+         input_file: files
+       out: [output]
+
+StreamFlow will schedule scattered tasks across available locations for parallel execution.
+
+Conditional Execution (CWL v1.2+)
+=================================
+
+Execute steps conditionally:
+
+.. code-block:: yaml
+   :caption: Conditional workflow
+
+   cwlVersion: v1.2
+   class: Workflow
+
+   requirements:
+     InlineJavascriptRequirement: {}
+
+   inputs:
+     run_optional: boolean
+     data: File
+
+   outputs:
+     result:
+       type: File
+       outputSource: process/output
+
+   steps:
+     optional_step:
+       when: $(inputs.run_optional)
+       run: preprocessing.cwl
+       in:
+         run_optional: run_optional
+         input: data
+       out: [preprocessed]
+
+     process:
+       run: main-processing.cwl
+       in:
+         input:
+           source: [optional_step/preprocessed, data]
+           pickValue: first_non_null
+       out: [output]
+
+JavaScript Expressions
+======================
+
+CWL supports JavaScript for dynamic values:
+
+..
code-block:: yaml + :caption: Using JavaScript expressions + + requirements: + InlineJavascriptRequirement: {} + + inputs: + input_file: + type: File + inputBinding: + # Remove .txt extension and add .processed.txt + valueFrom: | + $(inputs.input_file.nameroot + '.processed' + inputs.input_file.nameext) + +**Available variables:** + +* ``inputs.*`` - Input values +* ``self`` - Current value +* ``runtime.*`` - Runtime environment (``cores``, ``ram``, ``outdir``, ``tmpdir``) + +External Tool Definitions +========================== + +Keep CommandLineTool definitions in separate files: + +.. code-block:: yaml + :caption: workflow.cwl - Reference external tools + + cwlVersion: v1.2 + class: Workflow + + steps: + align: + run: tools/bwa-mem.cwl # External tool definition + in: + reference: ref_genome + reads: input_reads + out: [aligned] + + sort: + run: tools/samtools-sort.cwl + in: + input: align/aligned + out: [sorted] + +.. code-block:: yaml + :caption: tools/bwa-mem.cwl - External tool definition + + class: CommandLineTool + baseCommand: [bwa, mem] + inputs: + reference: + type: File + inputBinding: + position: 1 + reads: + type: File + inputBinding: + position: 2 + outputs: + aligned: + type: File + outputBinding: + glob: "aligned.sam" + stdout: aligned.sam + +StreamFlow-Specific Considerations +=================================== + +While StreamFlow follows the CWL standard, consider these points: + +File Transfer +------------- + +StreamFlow automatically transfers files between execution locations. Use appropriate data transfer strategies in your workflow design. See :doc:`/user-guide/advanced-patterns/index` for details. + +Container Translation +--------------------- + +DockerRequirement is automatically translated to the appropriate container runtime (Docker, Singularity, Kubernetes) based on deployment configuration. See :doc:`/reference/cwl-docker-translators/index`. 
+ +Binding Filters +--------------- + +StreamFlow extends CWL with binding filters to control which steps run where. This is configured in the StreamFlow YAML file, not the CWL file. See :doc:`binding-workflows`. + +Resource Hints +-------------- + +ResourceRequirement hints guide StreamFlow's scheduler but don't enforce hard limits unless configured in the deployment. + +Best Practices +============== + +1. **Keep Tools Separate** + + Store CommandLineTool definitions in separate files for reusability: + + :: + + workflows/ + my-workflow.cwl + tools/ + tool1.cwl + tool2.cwl + +2. **Use Descriptive Names** + + Use clear, descriptive names for steps, inputs, and outputs: + + .. code-block:: yaml + + # Good + align_reads: + run: bwa-mem.cwl + + # Avoid + step1: + run: tool.cwl + +3. **Document Inputs** + + Add documentation to inputs: + + .. code-block:: yaml + + inputs: + reference_genome: + type: File + label: "Reference genome FASTA" + doc: "Reference genome in FASTA format for alignment" + +4. **Validate Workflows** + + Use ``cwltool --validate`` to check syntax: + + .. code-block:: bash + + cwltool --validate my-workflow.cwl + +5. **Test Locally First** + + Test workflows with local execution before deploying to remote infrastructure. + +6. **Specify CWL Version** + + Always include ``cwlVersion`` to ensure compatibility. + +7. **Use Type Hints** + + Explicitly type all inputs and outputs for better validation. + +Common Patterns +=============== + +File Pairs (R1/R2 Reads) +------------------------ + +Handle paired-end sequencing reads: + +.. code-block:: yaml + + inputs: + reads: + type: + type: record + fields: + forward: File + reverse: File + +Multiple Output Files +--------------------- + +Capture multiple outputs: + +.. code-block:: yaml + + outputs: + results: + type: File[] + outputBinding: + glob: "result_*.txt" + +Secondary Files +--------------- + +Handle index files and other companions: + +.. 
code-block:: yaml
+
+   inputs:
+     reference:
+       type: File
+       secondaryFiles:
+         - .fai
+         - ^.dict
+
+Troubleshooting
+===============
+
+Common CWL Issues
+-----------------
+
+**Problem:** ``ValueError: Missing required input parameter``
+
+**Solution:** Ensure all required inputs are provided in the inputs file.
+
+**Problem:** ``Output glob pattern matches no files``
+
+**Solution:**
+
+* Check that the command actually creates the expected output file
+* Verify the glob pattern matches the actual filename
+* Check working directory and output directory paths
+
+**Problem:** ``Invalid JavaScript expression``
+
+**Solution:**
+
+* Add ``InlineJavascriptRequirement`` to requirements
+* Check JavaScript syntax
+* Verify variable names (``inputs.*``, ``runtime.*``)
+
+Validation Errors
+-----------------
+
+Use ``cwltool`` for validation:
+
+.. code-block:: bash
+
+   # Validate workflow syntax
+   cwltool --validate workflow.cwl
+
+   # Validate with inputs
+   cwltool --validate workflow.cwl inputs.yml
+
+   # Print detailed validation info
+   cwltool --print-pre workflow.cwl
+
+StreamFlow-Specific Issues
+--------------------------
+
+**Problem:** Workflow works with ``cwltool`` but fails in StreamFlow
+
+**Solution:**
+
+* Check that all files are accessible from the execution location
+* Verify deployment bindings are correct
+* Check container availability on target deployment
+* Review StreamFlow logs for detailed error messages
+
+Next Steps
+==========
+
+After writing your workflow:
+
+* :doc:`configuring-deployments` - Set up execution environments
+* :doc:`binding-workflows` - Bind workflow steps to deployments
+* :doc:`running-workflows` - Execute and monitor workflows
+* :doc:`/user-guide/advanced-patterns/index` - Learn advanced binding patterns
+
+Learning Resources
+==================
+
+**Official CWL Documentation:**
+   * `CWL User Guide <https://www.commonwl.org/user_guide/>`_ - Comprehensive tutorial
+   * `CWL Specification <https://www.commonwl.org/v1.2/>`_ - Complete reference
+   * `CWL Command Line Tool <https://github.com/common-workflow-language/cwltool>`_ - Reference
implementation
+
+**StreamFlow Resources:**
+   * :doc:`/reference/cwl-support/index` - CWL conformance details
+   * :doc:`/reference/cwl-docker-translators/index` - Container translation
+   * `GitHub Examples <https://github.com/alpha-unito/streamflow/tree/master/examples>`_ - Sample workflows
+
+Related Topics
+==============
+
+* :doc:`quickstart` - Simple workflow example
+* :doc:`/reference/configuration/workflow-config` - Workflow configuration schema
+* :doc:`/developer-guide/core-interfaces/workflow` - Workflow interface internals
+* `CWL Conformance Tests <https://github.com/common-workflow-language/cwl-v1.2>`_ - Test suite