From a993604e25263bef26bdb75020d28ab47011c1d5 Mon Sep 17 00:00:00 2001 From: Brendan Barnes Date: Fri, 1 Aug 2025 21:41:40 +0000 Subject: [PATCH 1/6] Reference workflow using argo --- .devcontainer/devcontainer.json | 18 +++++ .devcontainer/setup-argo.sh | 45 ++++++++++++ README.md | 64 ++++++++++++++++ docs/draft/orchestration/orchestration.md | 13 ++++ docs/draft/orchestration/reference.md | 3 + docs/draft/orchestration/schema.md | 3 + docs/draft/orchestration/spec.md | 3 + docs/draft/orchestration/standard.md | 26 +++++++ mkdocs.yml | 1 + reference/orchestration/argo/reference.yaml | 81 +++++++++++++++++++++ 10 files changed, 257 insertions(+) create mode 100644 .devcontainer/devcontainer.json create mode 100755 .devcontainer/setup-argo.sh create mode 100644 docs/draft/orchestration/orchestration.md create mode 100644 docs/draft/orchestration/reference.md create mode 100644 docs/draft/orchestration/schema.md create mode 100644 docs/draft/orchestration/spec.md create mode 100644 docs/draft/orchestration/standard.md create mode 100644 reference/orchestration/argo/reference.yaml diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..a2aa5f3 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,18 @@ +{ + "name": "FFRD Specs devcontainer", + "image": "mcr.microsoft.com/devcontainers/base:bookworm", + "features": { + "ghcr.io/devcontainers/features/docker-outside-of-docker:1": { + "enableNonRootDocker": "true" + }, + "ghcr.io/devcontainers/features/python:1": {} + }, + "postCreateCommand": ".devcontainer/setup-argo.sh", + "customizations": { + "vscode": { + "extensions": [ + "ms-kubernetes-tools.vscode-kubernetes-tools" + ] + } + } +} \ No newline at end of file diff --git a/.devcontainer/setup-argo.sh b/.devcontainer/setup-argo.sh new file mode 100755 index 0000000..4a3f3aa --- /dev/null +++ b/.devcontainer/setup-argo.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -e + +echo "๐Ÿš€ Setting up Argo Workflows 
development environment..." + +echo "📦 Installing kubectl..." +curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" +chmod +x kubectl +sudo mv kubectl /usr/local/bin/ + +echo "📦 Installing Argo CLI..." +curl -sLO https://github.com/argoproj/argo-workflows/releases/download/v3.7.0/argo-linux-amd64.gz +gunzip argo-linux-amd64.gz +chmod +x argo-linux-amd64 +sudo mv argo-linux-amd64 /usr/local/bin/argo + +echo "🧹 Cleaning up any existing k3s-server container..." +docker rm -f k3s-server 2>/dev/null || true + +echo "🔧 Starting k3s Kubernetes cluster..." +docker run -d --name k3s-server --privileged -p 6443:6443 rancher/k3s:latest server --disable=traefik + +echo "⏳ Waiting for k3s to start..." +sleep 10 + +echo "🔧 Configuring kubectl..." +mkdir -p ~/.kube +# Derive the k3s container IP dynamically; do not hardcode the Docker bridge address. +CONTAINER_IP=$(docker inspect k3s-server | grep '"IPAddress"' | head -1 | cut -d'"' -f4) +docker exec k3s-server cat /etc/rancher/k3s/k3s.yaml | sed "s/127.0.0.1/$CONTAINER_IP/g" > ~/.kube/config +sed -i '/certificate-authority-data:/d' ~/.kube/config +sed -i '/server:/a\ insecure-skip-tls-verify: true' ~/.kube/config + +echo "📦 Installing Argo Workflows..." +kubectl create namespace argo || true +kubectl apply -n argo -f https://github.com/argoproj/argo-workflows/releases/download/v3.7.0/install.yaml + +echo "⏳ Waiting for Argo Workflows to be ready..." +kubectl wait --for=condition=available --timeout=300s deployment/argo-server -n argo || echo "⚠️ Argo server may still be starting..." +kubectl wait --for=condition=available --timeout=300s deployment/workflow-controller -n argo || echo "⚠️ Workflow controller may still be starting..." + +echo "🔐 Setting up RBAC for workflows..." +kubectl create rolebinding default-admin --clusterrole=admin --serviceaccount=default:default || true + +echo "✅ Setup complete!"
diff --git a/README.md b/README.md index 2bea558..9b5d244 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,70 @@ FEMA FFRD Specifications. IN PROGRESS. https://fema-ffrd.github.io/specs/ ## Setup + +### Dev Container Setup (Optional) +1. Open this repository in VS Code +2. When prompted, click "Reopen in Container" or use the Command Palette (Ctrl+Shift+P) and select "Dev Containers: Reopen in Container" +3. The container will automatically set up the environment and install dependencies + +#### What Gets Installed + +The setup includes: + +- **Base**: Debian 12 (bookworm) container +- **Docker**: Docker-outside-of-Docker for running k3s +- **kubectl**: Kubernetes CLI +- **argo**: Argo Workflows CLI v3.7.0 +- **k3s**: Lightweight Kubernetes cluster +- **Argo Workflows**: v3.7.0 installed in the cluster + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€-โ” +โ”‚ DevContainer โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ argo โ”‚ โ”‚ kubectl โ”‚ โ”‚ +โ”‚ โ”‚ CLI โ”‚ โ”‚ CLI โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ”‚ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ Docker Host โ”‚ โ”‚ +โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ k3s Container โ”‚โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”โ”‚โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ Argo Workflows โ”‚โ”‚โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜โ”‚โ”‚ โ”‚ +โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜โ”‚ โ”‚ +โ”‚ 
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€-โ”˜ +``` + +#### Useful Commands + +Once setup is complete, you can use these commands: + +```bash +# Validate workflow files +argo lint reference/orchestration/argo/reference.yaml + +# Submit workflow files +argo submit reference/orchestration/argo/reference.yaml + +# Watch the workflow execution +argo submit --watch reference/orchestration/argo/reference.yaml + +# List all workflows +argo list + +# View logs for a specific workflow +argo logs +``` + +#### Useful Links + +- Explore the [reference workflow](./reference/orchestration/argo/reference.yaml) +- Read the [Argo Workflows documentation](https://argo-workflows.readthedocs.io/) + +### Documentation Setup 1. Create a Python virtual environment. ``` $ python -m venv venv-specs diff --git a/docs/draft/orchestration/orchestration.md b/docs/draft/orchestration/orchestration.md new file mode 100644 index 0000000..9a65bf5 --- /dev/null +++ b/docs/draft/orchestration/orchestration.md @@ -0,0 +1,13 @@ +{% include "draft/orchestration/standard.md" %} + +--- + +{% include "draft/orchestration/spec.md" %} + +--- + +{% include "draft/orchestration/schema.md" %} + +--- + +{% include "draft/orchestration/reference.md" %} \ No newline at end of file diff --git a/docs/draft/orchestration/reference.md b/docs/draft/orchestration/reference.md new file mode 100644 index 0000000..96f0dd3 --- /dev/null +++ b/docs/draft/orchestration/reference.md @@ -0,0 +1,3 @@ +## ๐Ÿ“š Reference + +> TODO \ No newline at end of file diff --git a/docs/draft/orchestration/schema.md b/docs/draft/orchestration/schema.md new file mode 100644 index 0000000..b985592 --- /dev/null +++ b/docs/draft/orchestration/schema.md @@ -0,0 +1,3 @@ +## ๐Ÿ—ƒ๏ธ Schema + +> TODO \ No newline at end of file diff --git 
a/docs/draft/orchestration/spec.md b/docs/draft/orchestration/spec.md new file mode 100644 index 0000000..72efe8a --- /dev/null +++ b/docs/draft/orchestration/spec.md @@ -0,0 +1,3 @@ +## ๐Ÿ“ Specification + +> TODO \ No newline at end of file diff --git a/docs/draft/orchestration/standard.md b/docs/draft/orchestration/standard.md new file mode 100644 index 0000000..bf460c3 --- /dev/null +++ b/docs/draft/orchestration/standard.md @@ -0,0 +1,26 @@ +# Workflow Orchestration + +## ๐Ÿ“ Standard + +### Purpose +To provide a standardized, automated, and reproducible method for executing multi-step flood risk modeling pipelines within the FFRD initiative. + +### Scope +This standard applies to all workflows requiring: + +- Sequencing of multiple containerized tools for data preparation, model execution, and post-processing. +- Orchestration of HEC-HMS and HEC-RAS models within larger computational workflows. +- Portable workflow definitions that can run across different orchestration systems. + +### Guidelines +1. **Declarative Workflow Definition**: All workflows must be defined in a declarative, version-controllable format. +2. **Containerized Execution**: All tasks must run in containers using standardized FFRD images. +3. **Shared Storage**: Use persistent volumes for data artifacts shared between workflow steps. +4. **Parallel Execution**: Support parallel execution of independent tasks with dependency management. +6. **DAG Structure**: Define workflows as Directed Acyclic Graphs (DAGs) to ensure clear dependencies and execution order. +5. **Documentation**: Provide clear usage instructions and workflow specifications. + +### Best Practices +- Define explicit resource requirements and constraints for all tasks. +- Implement data validation at workflow boundaries. +- Log all workflow execution steps for traceability. 
\ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index d311487..4103e05 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -35,6 +35,7 @@ nav: - draft/base_image/base_image.md - draft/hms_sim/hms_sim.md - draft/ras_sim/ras_sim.md + - draft/orchestration/orchestration.md - Proposals: - proposals/conformance/conformance.md diff --git a/reference/orchestration/argo/reference.yaml b/reference/orchestration/argo/reference.yaml new file mode 100644 index 0000000..fd7de7a --- /dev/null +++ b/reference/orchestration/argo/reference.yaml @@ -0,0 +1,81 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + generateName: dag-example- +spec: + entrypoint: main + volumeClaimTemplates: # Create a shared volume for the workflow + - metadata: + name: workdir + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 1Gi + templates: + - name: main + dag: + tasks: + - name: generate-number + template: generate-number + - name: process-numbers + dependencies: [generate-number] + template: process-numbers + - name: sum-results + dependencies: [process-numbers] + template: sum-results + + - name: generate-number + container: + image: alpine:3.18 + command: [sh, -c] + args: ["echo 5 > /work/number.txt"] + volumeMounts: + - name: workdir + mountPath: /work + + - name: process-numbers + parallelism: 2 # Run two steps at a time + steps: + - - name: process-number + template: process-number + withItems: # Iterate over this list of numbers + - 1 + - 2 + - 3 + - 4 + arguments: + parameters: + - name: item + value: "{{item}}" # Pass the item from the list to the process-number template + + - name: process-number + inputs: + parameters: + - name: item + container: + image: alpine:3.18 + command: [sh, -c] + args: + - | + num=$(cat /work/number.txt) + result=$((num + {{inputs.parameters.item}})) + echo $result > /work/result-{{inputs.parameters.item}}.txt + volumeMounts: + - name: workdir + mountPath: /work + + - name: sum-results + container: + image: 
alpine:3.18 + command: [sh, -c] + args: + - | + sum=0 + for file in /work/result-*.txt; do + sum=$((sum + $(cat $file))) + done + echo "Total sum: $sum" + volumeMounts: + - name: workdir + mountPath: /work \ No newline at end of file From 3c7f74776d56972442063b2ea0f322421824b510 Mon Sep 17 00:00:00 2001 From: Brendan Barnes Date: Mon, 4 Aug 2025 22:17:22 +0000 Subject: [PATCH 2/6] orchestration docs --- docs/draft/orchestration/orchestration.md | 6 +- docs/draft/orchestration/reference.md | 187 +++++++++++++++++++++- docs/draft/orchestration/schema.md | 3 - docs/draft/orchestration/spec.md | 128 ++++++++++++++- docs/draft/orchestration/standard.md | 81 ++++++++-- 5 files changed, 379 insertions(+), 26 deletions(-) delete mode 100644 docs/draft/orchestration/schema.md diff --git a/docs/draft/orchestration/orchestration.md b/docs/draft/orchestration/orchestration.md index 9a65bf5..17272fa 100644 --- a/docs/draft/orchestration/orchestration.md +++ b/docs/draft/orchestration/orchestration.md @@ -6,8 +6,4 @@ --- -{% include "draft/orchestration/schema.md" %} - ---- - -{% include "draft/orchestration/reference.md" %} \ No newline at end of file +{% include "draft/orchestration/reference.md" %} diff --git a/docs/draft/orchestration/reference.md b/docs/draft/orchestration/reference.md index 96f0dd3..7f55190 100644 --- a/docs/draft/orchestration/reference.md +++ b/docs/draft/orchestration/reference.md @@ -1,3 +1,188 @@ ## ๐Ÿ“š Reference -> TODO \ No newline at end of file +### Argo Workflows Implementation + +This reference implementation demonstrates how Argo Workflows can satisfy the FFRD orchestration requirements. Argo Workflows is provided as one example of a compliant orchestration system, but other systems may be used as long as they meet the specification requirements. 
+ +#### Implementation Overview + +The reference implementation uses Argo Workflows running on Kubernetes to provide: + +- DAG-based workflow execution with explicit task dependencies +- Container execution with shared volume access +- Parallel task execution with parameterization +- Shared volume management for data exchange between tasks +- Basic logging and monitoring capabilities + +#### Example Workflow Structure + +The following example demonstrates a basic FFRD workflow pattern with parallel processing and data collection: + +```yaml +# This is a simplified example showing the orchestration pattern +# Full FFRD workflows would use FFRD-compliant containers and configurations + +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + generateName: dag-example- +spec: + entrypoint: main + volumeClaimTemplates: # Create a shared volume for the workflow + - metadata: + name: workdir + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 1Gi + templates: + - name: main + dag: + tasks: + - name: generate-number + template: generate-number + - name: process-numbers + dependencies: [generate-number] + template: process-numbers + - name: sum-results + dependencies: [process-numbers] + template: sum-results + + - name: generate-number + container: + image: alpine:3.18 + command: [sh, -c] + args: ["echo 5 > /work/number.txt"] + volumeMounts: + - name: workdir + mountPath: /work + + - name: process-numbers + parallelism: 2 # Run two steps at a time + steps: + - - name: process-number + template: process-number + withItems: # Iterate over this list of numbers + - 1 + - 2 + - 3 + - 4 + arguments: + parameters: + - name: item + value: "{{ '{{item}}' }}" # Pass the item from the list to the process-number template + + - name: process-number + inputs: + parameters: + - name: item + container: + image: alpine:3.18 + command: [sh, -c] + args: + - | + num=$(cat /work/number.txt) + result=$((num + {{ '{{inputs.parameters.item}}' }})) + echo $result 
> /work/result-{{ '{{inputs.parameters.item}}' }}.txt + volumeMounts: + - name: workdir + mountPath: /work + + - name: sum-results + container: + image: alpine:3.18 + command: [sh, -c] + args: + - | + sum=0 + for file in /work/result-*.txt; do + sum=$((sum + $(cat $file))) + done + echo "Total sum: $sum" + volumeMounts: + - name: workdir + mountPath: /work +``` + +#### Key Implementation Features + +##### DAG Structure +- Uses Argo's DAG template to define explicit task dependencies (`dependencies: [generate-number]`) +- Demonstrates parallel execution through steps with `withItems` parameterization +- Shows sequential workflow phases (generate โ†’ process โ†’ collect) + +##### Container Execution +- Executes standard containers (Alpine Linux) as a pattern for FFRD containers +- Demonstrates passing command line arguments to containers +- Shows volume mounting for data access across all tasks + +##### Data Sharing +- Uses persistent volume claims (`volumeClaimTemplates`) for shared storage +- Consistent volume mounting (`/work`) across all workflow tasks +- Demonstrates file-based data exchange between workflow steps + +##### Parameterization +- Shows parameter passing with `withItems` for parallel task execution +- Demonstrates template parameter usage with `inputs.parameters.item` +- Illustrates how to iterate over lists to create multiple parallel tasks + +#### Deployment Requirements + +##### Infrastructure +- Kubernetes cluster +- Argo Workflows +- Container runtime (Docker, containerd, or CRI-O) +- Persistent storage provisioner + +##### Configuration +- Argo Workflows controller installation +- RBAC configuration for workflow execution +- Storage class configuration for volume provisioning +- Container registry access credentials + +#### Usage Examples + +##### Validate Workflow +```bash +# Validate the workflow definition +argo lint reference.yaml +``` + +##### Submit Workflow +```bash +# Submit the workflow to Argo +argo submit reference.yaml +``` + +##### 
Monitor Execution +```bash +# List all workflows +argo list + +# Watch workflow execution (use actual workflow name from list) +argo get dag-example-abc123 + +# View workflow logs +argo logs dag-example-abc123 +``` + +##### Access Results +```bash +# View workflow status and results +argo get dag-example-abc123 +``` + +#### Alternative Implementations + +While this reference uses Argo Workflows, other orchestration systems can satisfy FFRD requirements: + +- **Apache Airflow**: Python-based DAG orchestration with extensive integrations +- **Prefect**: Modern workflow orchestration with dynamic DAG generation +- **Kubeflow Pipelines**: ML-focused orchestration with container-native execution +- **Temporal**: Durable execution framework with strong consistency guarantees +- **Custom Solutions**: Purpose-built orchestration systems meeting FFRD specifications + +The key requirement is that any chosen system must satisfy all requirements outlined in the FFRD orchestration specification, regardless of the underlying implementation technology. + +This reference implementation serves as a concrete example of how to satisfy FFRD orchestration requirements. \ No newline at end of file diff --git a/docs/draft/orchestration/schema.md b/docs/draft/orchestration/schema.md deleted file mode 100644 index b985592..0000000 --- a/docs/draft/orchestration/schema.md +++ /dev/null @@ -1,3 +0,0 @@ -## ๐Ÿ—ƒ๏ธ Schema - -> TODO \ No newline at end of file diff --git a/docs/draft/orchestration/spec.md b/docs/draft/orchestration/spec.md index 72efe8a..d7b325d 100644 --- a/docs/draft/orchestration/spec.md +++ b/docs/draft/orchestration/spec.md @@ -1,3 +1,129 @@ ## ๐Ÿ“ Specification -> TODO \ No newline at end of file +### Overview +This specification defines the fundamental requirements for orchestration systems used within the FFRD initiative to execute complex flood risk analysis workflows. 
The system must provide DAG-based workflow execution, FFRD container integration, and essential operational capabilities. This specification does not prescribe specific implementation technologies. + +### Requirements + +#### 1. Workflow Structure + +##### 1.1 Directed Acyclic Graph (DAG) Support +- **Graph Definition**: Workflows MUST be representable as directed acyclic graphs with explicit task dependencies +- **Task Dependencies**: System MUST support expressing dependencies between tasks (e.g., Task B depends on Task A completion) +- **Parallel Execution**: System MUST execute independent tasks concurrently when resources allow +- **Conditional Execution**: System MUST support conditional task execution based on upstream task results or external conditions + +##### 1.2 Workflow Definition +- **Declarative Format**: Workflows MUST be defined in a human-readable, version-controllable format +- **Reproducibility**: Identical workflow definitions MUST produce deterministic execution behavior +- **Parameterization**: System MUST support parameterized workflows for different study areas, configurations, and datasets + +#### 2. 
FFRD Container Integration + +##### 2.1 Container Execution +- **FFRD Base Image**: System MUST execute containers built on FFRD base image specifications +- **HMS Containers**: System MUST execute HEC-HMS containers with appropriate Java runtime requirements +- **RAS Containers**: System MUST execute HEC-RAS containers with computational dependencies +- **Conformance Containers**: System MUST execute validation and conformance testing containers +- **Plugin Containers**: System MUST execute custom FFRD-compliant analysis containers + +##### 2.2 Container Configuration +- **Configuration Files**: System MUST support passing JSON configuration files to containers as specified in FFRD standards +- **Environment Variables**: System MUST support setting required environment variables for FFRD containers +- **Command Line Arguments**: System MUST support passing command line arguments to containers +- **Exit Code Handling**: System MUST properly interpret container exit codes and handle success/failure states + +#### 3. Volume Sharing and Data Management + +##### 3.1 Shared Storage +- **Inter-task Data Sharing**: System MUST provide mechanisms for tasks to share data through persistent storage +- **Volume Persistence**: System MUST support volumes that persist beyond individual task execution +- **Storage Size Configuration**: System MUST allow specification of storage volume sizes (minimum 1GB, configurable up to hundreds of GB) + +##### 3.2 Data Access Patterns +- **Read/Write Access**: System MUST support both read-only and read-write volume access modes +- **Multiple Mount Points**: System MUST support mounting volumes at different paths within containers +- **Data Isolation**: System MUST prevent unauthorized access to data between different workflow executions + +#### 4. 
Resource Allocation + +##### 4.1 Compute Resources +- **CPU Allocation**: System MUST support specifying CPU core requirements per task (minimum 0.1 cores, typical 1-8 cores) +- **Memory Allocation**: System MUST support specifying memory requirements per task (minimum 512MB, typical 1GB-32GB) +- **Resource Enforcement**: System MUST enforce specified resource limits to prevent resource contention + +##### 4.2 Resource Constraints +- **Resource Isolation**: System MUST isolate resources between concurrent tasks +- **Resource Monitoring**: System MUST track actual resource usage against allocated limits +- **Resource Availability**: System MUST queue tasks when insufficient resources are available + +#### 5. Logging and Observability + +##### 5.1 Execution Logging +- **Container Logs**: System MUST capture and store all container stdout/stderr output +- **Workflow Progress**: System MUST provide visibility into workflow execution status and task completion +- **Log Association**: System MUST associate logs with specific workflow runs and individual tasks +- **Log Retention**: System MUST retain logs for completed workflows for a configurable period + +##### 5.2 Monitoring +- **Task Status**: System MUST report status of workflow tasks (pending, running, completed, failed) +- **Workflow History**: System MUST maintain history of workflow executions + +#### 6. 
Error Handling and Recovery + +##### 6.1 Retry Mechanisms +- **Configurable Retry**: System MUST support configurable retry policies for failed tasks +- **Retry Limits**: System MUST support maximum retry attempt limits + +##### 6.2 Failure Handling +- **Failure Isolation**: System MUST prevent individual task failures from stopping independent workflow branches +- **Partial Completion**: System MUST support completing successful workflow branches when other branches fail +- **Failure Reporting**: System MUST clearly report which tasks failed and provide failure details +- **Manual Recovery**: System MUST support manual intervention to recover from failures + +#### 7. Integration Requirements + +##### 7.1 Data Sources +- **S3 Integration**: System MUST support integration with S3-compatible object storage for input/output data +- **File System Access**: System MUST support mounting external file systems for data access +- **Network Access**: System MUST support controlled network access for containers requiring external connectivity + +##### 7.2 Operational Integration +- **Workflow Submission**: System MUST provide mechanisms for submitting and executing workflows + +#### 8. 
Security and Access Control + +##### 8.1 Access Control +- **Authentication**: System MUST provide authentication mechanisms for workflow access +- **Authorization**: System MUST provide authorization controls for workflow execution +- **Credential Management**: System MUST provide secure mechanisms for handling sensitive data and credentials + +### Operational Requirements + +#### Performance Expectations +- **Concurrent Workflows**: System MUST support executing multiple independent workflows simultaneously +- **Multi-node Execution**: System MUST support executing workflows across multiple compute nodes + +#### Reliability Requirements +- **System Availability**: System MUST provide high availability for workflow execution +- **Data Durability**: System MUST ensure durability of workflow outputs and execution logs +- **Recovery**: System MUST support recovery from system failures without losing workflow progress + +#### Compliance Requirements +- **Audit Trail**: System MUST maintain complete audit trails of workflow executions +- **Data Governance**: System MUST support data governance requirements for FFRD data +- **Documentation**: System MUST provide documentation for operational procedures and troubleshooting + +### Example Workflow Scenario + +A typical FFRD workflow might include: + +1. **Data Preparation**: Validate input configuration and download required datasets from S3 +2. **Model Execution**: Run HEC-HMS hydrologic models with specified parameters +3. **Post-Processing**: Process model outputs and generate analysis results +4. **Validation**: Run conformance tests on outputs +5. **Data Upload**: Upload results to designated S3 locations + +The orchestration system must execute these tasks in the correct dependency order, share data between tasks through persistent volumes, allocate appropriate compute resources, handle any task failures with retries, and provide complete logging and monitoring throughout the process. 
+ +This specification provides the essential requirements for FFRD workflow orchestration while allowing flexibility in implementation approach and technology choices. \ No newline at end of file diff --git a/docs/draft/orchestration/standard.md b/docs/draft/orchestration/standard.md index bf460c3..9eabaf0 100644 --- a/docs/draft/orchestration/standard.md +++ b/docs/draft/orchestration/standard.md @@ -3,24 +3,73 @@ ## ๐Ÿ“ Standard ### Purpose -To provide a standardized, automated, and reproducible method for executing multi-step flood risk modeling pipelines within the FFRD initiative. +To establish fundamental requirements for orchestration systems that can execute complex, multi-step flood risk data processing workflows within the FFRD initiative. The orchestration system must support directed acyclic graphs (DAGs), container execution, resource management, observability, and robust error handling to ensure reliable execution of hydrologic and hydraulic modeling workflows. -### Scope -This standard applies to all workflows requiring: +### Scope +This standard applies to all orchestration systems used within the FFRD initiative for: -- Sequencing of multiple containerized tools for data preparation, model execution, and post-processing. -- Orchestration of HEC-HMS and HEC-RAS models within larger computational workflows. -- Portable workflow definitions that can run across different orchestration systems. +- Executing multi-step flood risk analysis workflows +- Coordinating hydrologic and hydraulic model runs (HEC-HMS, HEC-RAS, etc.) +- Managing data processing pipelines for stochastic storm transposition +- Orchestrating conformance testing and validation workflows +- Supporting distributed computing across multiple processing nodes -### Guidelines -1. **Declarative Workflow Definition**: All workflows must be defined in a declarative, version-controllable format. -2. **Containerized Execution**: All tasks must run in containers using standardized FFRD images. 
-3. **Shared Storage**: Use persistent volumes for data artifacts shared between workflow steps. -4. **Parallel Execution**: Support parallel execution of independent tasks with dependency management. -6. **DAG Structure**: Define workflows as Directed Acyclic Graphs (DAGs) to ensure clear dependencies and execution order. -5. **Documentation**: Provide clear usage instructions and workflow specifications. +### Core Requirements + +#### 1. Workflow Structure +- **DAG Support**: Must support directed acyclic graph (DAG) workflow definitions with explicit task dependencies +- **Parallel Execution**: Must enable parallel execution of independent tasks +- **Conditional Logic**: Must support conditional task execution based on upstream task results + +#### 2. Container Integration +- **FFRD Container Compatibility**: Must execute all FFRD-compliant containers (base image, HMS, RAS, conformance, plugin containers) +- **Container Registry Support**: Must support pulling containers from public and private container registries +- **Runtime Configuration**: Must support passing configuration files, environment variables, and command-line arguments to containers +- **Exit Code Handling**: Must properly handle container exit codes and propagate failures appropriately + +#### 3. Resource Management +- **Compute Resources**: Must allow specification of CPU cores, memory limits, and GPU resources per task +- **Storage Allocation**: Must support dynamic and static volume provisioning with configurable storage sizes +- **Resource Constraints**: Must enforce resource limits and prevent resource contention between concurrent tasks + +#### 4. Data Sharing and Persistence +- **Volume Sharing**: Must provide shared storage mechanisms for data exchange between workflow tasks +- **Persistent Volumes**: Must support persistent storage that survives task and workflow completion +- **Data Lifecycle Management**: Must support cleanup of temporary data when workflows complete + +#### 5. 
Observability and Monitoring +- **Execution Logging**: Must capture logs from workflow tasks +- **Progress Tracking**: Must provide visibility into workflow execution status and task completion + +#### 6. Error Handling and Resilience +- **Retry Strategies**: Must support configurable retry policies for failed tasks +- **Failure Isolation**: Must prevent individual task failures from stopping independent workflow branches + +#### 7. Workflow Definition and Versioning +- **Declarative Format**: Must support workflow definitions in a human-readable, declarative format +- **Version Control**: Must enable workflow definitions to be versioned in source control systems +- **Validation**: Must provide validation mechanisms for workflow definitions (e.g., linting) + +#### 8. Security and Access Control +- **Authentication**: Must provide authentication mechanisms for workflow access +- **Authorization**: Must provide authorization controls for workflow execution +- **Secret Management**: Must provide secure mechanisms for handling sensitive data and credentials + +#### 9. Scalability and Performance +- **Multi-node Execution**: Must support executing workflows across multiple compute nodes +- **Concurrent Workflows**: Must support running multiple workflows simultaneously + +#### 10. Integration and Interoperability +- **Workflow Submission**: Must provide mechanisms for submitting and executing workflows ### Best Practices -- Define explicit resource requirements and constraints for all tasks. -- Implement data validation at workflow boundaries. -- Log all workflow execution steps for traceability. 
\ No newline at end of file +- Use immutable workflow definitions to ensure reproducible executions +- Implement comprehensive testing strategies for workflow validation before production deployment +- Design workflows with failure scenarios in mind and include appropriate error handling +- Document workflow dependencies, data requirements, and expected outcomes +- Implement monitoring and alerting for critical workflow execution paths +- Use resource quotas and limits to prevent resource exhaustion +- Follow security best practices for credential management and access control +- Maintain workflow execution history for analysis and troubleshooting +- Implement workflow approval processes for production environments +- Use infrastructure as code practices for orchestration system deployment and configuration From a0a812f94d6c2190d6f27bff54099ddef6346f4f Mon Sep 17 00:00:00 2001 From: Brendan Barnes Date: Mon, 4 Aug 2025 22:22:03 +0000 Subject: [PATCH 3/6] mdformat --- README.md | 6 +++-- docs/draft/orchestration/orchestration.md | 4 ++-- docs/draft/orchestration/reference.md | 14 +++++++++-- docs/draft/orchestration/spec.md | 29 +++++++++++++++++++---- docs/draft/orchestration/standard.md | 15 +++++++++++- 5 files changed, 56 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 782a08b..93eea06 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,10 @@ https://fema-ffrd.github.io/specs/ ## Setup ### Dev Container Setup (Optional) + 1. Open this repository in VS Code -2. When prompted, click "Reopen in Container" or use the Command Palette (Ctrl+Shift+P) and select "Dev Containers: Reopen in Container" -3. The container will automatically set up the environment and install dependencies +1. When prompted, click "Reopen in Container" or use the Command Palette (Ctrl+Shift+P) and select "Dev Containers: Reopen in Container" +1. 
The container will automatically set up the environment and install dependencies #### What Gets Installed @@ -69,6 +70,7 @@ argo logs - Read the [Argo Workflows documentation](https://argo-workflows.readthedocs.io/) ### Documentation Setup + 1. Create a Python virtual environment. ``` diff --git a/docs/draft/orchestration/orchestration.md b/docs/draft/orchestration/orchestration.md index 17272fa..691d6d6 100644 --- a/docs/draft/orchestration/orchestration.md +++ b/docs/draft/orchestration/orchestration.md @@ -1,9 +1,9 @@ {% include "draft/orchestration/standard.md" %} ---- +______________________________________________________________________ {% include "draft/orchestration/spec.md" %} ---- +______________________________________________________________________ {% include "draft/orchestration/reference.md" %} diff --git a/docs/draft/orchestration/reference.md b/docs/draft/orchestration/reference.md index 7f55190..98269e4 100644 --- a/docs/draft/orchestration/reference.md +++ b/docs/draft/orchestration/reference.md @@ -108,21 +108,25 @@ spec: #### Key Implementation Features ##### DAG Structure + - Uses Argo's DAG template to define explicit task dependencies (`dependencies: [generate-number]`) - Demonstrates parallel execution through steps with `withItems` parameterization - Shows sequential workflow phases (generate โ†’ process โ†’ collect) ##### Container Execution + - Executes standard containers (Alpine Linux) as a pattern for FFRD containers - Demonstrates passing command line arguments to containers - Shows volume mounting for data access across all tasks ##### Data Sharing + - Uses persistent volume claims (`volumeClaimTemplates`) for shared storage - Consistent volume mounting (`/work`) across all workflow tasks - Demonstrates file-based data exchange between workflow steps ##### Parameterization + - Shows parameter passing with `withItems` for parallel task execution - Demonstrates template parameter usage with `inputs.parameters.item` - Illustrates how 
to iterate over lists to create multiple parallel tasks @@ -130,12 +134,14 @@ spec: #### Deployment Requirements ##### Infrastructure + - Kubernetes cluster - Argo Workflows - Container runtime (Docker, containerd, or CRI-O) - Persistent storage provisioner ##### Configuration + - Argo Workflows controller installation - RBAC configuration for workflow execution - Storage class configuration for volume provisioning @@ -144,18 +150,21 @@ spec: #### Usage Examples ##### Validate Workflow + ```bash # Validate the workflow definition argo lint reference.yaml ``` ##### Submit Workflow + ```bash # Submit the workflow to Argo argo submit reference.yaml ``` ##### Monitor Execution + ```bash # List all workflows argo list @@ -168,6 +177,7 @@ argo logs dag-example-abc123 ``` ##### Access Results + ```bash # View workflow status and results argo get dag-example-abc123 @@ -178,11 +188,11 @@ argo get dag-example-abc123 While this reference uses Argo Workflows, other orchestration systems can satisfy FFRD requirements: - **Apache Airflow**: Python-based DAG orchestration with extensive integrations -- **Prefect**: Modern workflow orchestration with dynamic DAG generation +- **Prefect**: Modern workflow orchestration with dynamic DAG generation - **Kubeflow Pipelines**: ML-focused orchestration with container-native execution - **Temporal**: Durable execution framework with strong consistency guarantees - **Custom Solutions**: Purpose-built orchestration systems meeting FFRD specifications The key requirement is that any chosen system must satisfy all requirements outlined in the FFRD orchestration specification, regardless of the underlying implementation technology. -This reference implementation serves as a concrete example of how to satisfy FFRD orchestration requirements. \ No newline at end of file +This reference implementation serves as a concrete example of how to satisfy FFRD orchestration requirements. 
diff --git a/docs/draft/orchestration/spec.md b/docs/draft/orchestration/spec.md index d7b325d..0c09c9f 100644 --- a/docs/draft/orchestration/spec.md +++ b/docs/draft/orchestration/spec.md @@ -1,6 +1,7 @@ ## ๐Ÿ“ Specification ### Overview + This specification defines the fundamental requirements for orchestration systems used within the FFRD initiative to execute complex flood risk analysis workflows. The system must provide DAG-based workflow execution, FFRD container integration, and essential operational capabilities. This specification does not prescribe specific implementation technologies. ### Requirements @@ -8,12 +9,14 @@ This specification defines the fundamental requirements for orchestration system #### 1. Workflow Structure ##### 1.1 Directed Acyclic Graph (DAG) Support + - **Graph Definition**: Workflows MUST be representable as directed acyclic graphs with explicit task dependencies - **Task Dependencies**: System MUST support expressing dependencies between tasks (e.g., Task B depends on Task A completion) - **Parallel Execution**: System MUST execute independent tasks concurrently when resources allow - **Conditional Execution**: System MUST support conditional task execution based on upstream task results or external conditions ##### 1.2 Workflow Definition + - **Declarative Format**: Workflows MUST be defined in a human-readable, version-controllable format - **Reproducibility**: Identical workflow definitions MUST produce deterministic execution behavior - **Parameterization**: System MUST support parameterized workflows for different study areas, configurations, and datasets @@ -21,6 +24,7 @@ This specification defines the fundamental requirements for orchestration system #### 2. 
FFRD Container Integration ##### 2.1 Container Execution + - **FFRD Base Image**: System MUST execute containers built on FFRD base image specifications - **HMS Containers**: System MUST execute HEC-HMS containers with appropriate Java runtime requirements - **RAS Containers**: System MUST execute HEC-RAS containers with computational dependencies @@ -28,6 +32,7 @@ This specification defines the fundamental requirements for orchestration system - **Plugin Containers**: System MUST execute custom FFRD-compliant analysis containers ##### 2.2 Container Configuration + - **Configuration Files**: System MUST support passing JSON configuration files to containers as specified in FFRD standards - **Environment Variables**: System MUST support setting required environment variables for FFRD containers - **Command Line Arguments**: System MUST support passing command line arguments to containers @@ -36,11 +41,13 @@ This specification defines the fundamental requirements for orchestration system #### 3. Volume Sharing and Data Management ##### 3.1 Shared Storage + - **Inter-task Data Sharing**: System MUST provide mechanisms for tasks to share data through persistent storage - **Volume Persistence**: System MUST support volumes that persist beyond individual task execution - **Storage Size Configuration**: System MUST allow specification of storage volume sizes (minimum 1GB, configurable up to hundreds of GB) ##### 3.2 Data Access Patterns + - **Read/Write Access**: System MUST support both read-only and read-write volume access modes - **Multiple Mount Points**: System MUST support mounting volumes at different paths within containers - **Data Isolation**: System MUST prevent unauthorized access to data between different workflow executions @@ -48,11 +55,13 @@ This specification defines the fundamental requirements for orchestration system #### 4. 
Resource Allocation ##### 4.1 Compute Resources + - **CPU Allocation**: System MUST support specifying CPU core requirements per task (minimum 0.1 cores, typical 1-8 cores) - **Memory Allocation**: System MUST support specifying memory requirements per task (minimum 512MB, typical 1GB-32GB) - **Resource Enforcement**: System MUST enforce specified resource limits to prevent resource contention ##### 4.2 Resource Constraints + - **Resource Isolation**: System MUST isolate resources between concurrent tasks - **Resource Monitoring**: System MUST track actual resource usage against allocated limits - **Resource Availability**: System MUST queue tasks when insufficient resources are available @@ -60,22 +69,26 @@ This specification defines the fundamental requirements for orchestration system #### 5. Logging and Observability ##### 5.1 Execution Logging + - **Container Logs**: System MUST capture and store all container stdout/stderr output - **Workflow Progress**: System MUST provide visibility into workflow execution status and task completion - **Log Association**: System MUST associate logs with specific workflow runs and individual tasks - **Log Retention**: System MUST retain logs for completed workflows for a configurable period ##### 5.2 Monitoring + - **Task Status**: System MUST report status of workflow tasks (pending, running, completed, failed) - **Workflow History**: System MUST maintain history of workflow executions #### 6. 
Error Handling and Recovery ##### 6.1 Retry Mechanisms + - **Configurable Retry**: System MUST support configurable retry policies for failed tasks - **Retry Limits**: System MUST support maximum retry attempt limits ##### 6.2 Failure Handling + - **Failure Isolation**: System MUST prevent individual task failures from stopping independent workflow branches - **Partial Completion**: System MUST support completing successful workflow branches when other branches fail - **Failure Reporting**: System MUST clearly report which tasks failed and provide failure details @@ -84,16 +97,19 @@ This specification defines the fundamental requirements for orchestration system #### 7. Integration Requirements ##### 7.1 Data Sources + - **S3 Integration**: System MUST support integration with S3-compatible object storage for input/output data - **File System Access**: System MUST support mounting external file systems for data access - **Network Access**: System MUST support controlled network access for containers requiring external connectivity ##### 7.2 Operational Integration + - **Workflow Submission**: System MUST provide mechanisms for submitting and executing workflows #### 8. 
Security and Access Control ##### 8.1 Access Control + - **Authentication**: System MUST provide authentication mechanisms for workflow access - **Authorization**: System MUST provide authorization controls for workflow execution - **Credential Management**: System MUST provide secure mechanisms for handling sensitive data and credentials @@ -101,15 +117,18 @@ This specification defines the fundamental requirements for orchestration system ### Operational Requirements #### Performance Expectations + - **Concurrent Workflows**: System MUST support executing multiple independent workflows simultaneously - **Multi-node Execution**: System MUST support executing workflows across multiple compute nodes #### Reliability Requirements + - **System Availability**: System MUST provide high availability for workflow execution - **Data Durability**: System MUST ensure durability of workflow outputs and execution logs - **Recovery**: System MUST support recovery from system failures without losing workflow progress #### Compliance Requirements + - **Audit Trail**: System MUST maintain complete audit trails of workflow executions - **Data Governance**: System MUST support data governance requirements for FFRD data - **Documentation**: System MUST provide documentation for operational procedures and troubleshooting @@ -119,11 +138,11 @@ This specification defines the fundamental requirements for orchestration system A typical FFRD workflow might include: 1. **Data Preparation**: Validate input configuration and download required datasets from S3 -2. **Model Execution**: Run HEC-HMS hydrologic models with specified parameters -3. **Post-Processing**: Process model outputs and generate analysis results -4. **Validation**: Run conformance tests on outputs -5. **Data Upload**: Upload results to designated S3 locations +1. **Model Execution**: Run HEC-HMS hydrologic models with specified parameters +1. **Post-Processing**: Process model outputs and generate analysis results +1. 
**Validation**: Run conformance tests on outputs +1. **Data Upload**: Upload results to designated S3 locations The orchestration system must execute these tasks in the correct dependency order, share data between tasks through persistent volumes, allocate appropriate compute resources, handle any task failures with retries, and provide complete logging and monitoring throughout the process. -This specification provides the essential requirements for FFRD workflow orchestration while allowing flexibility in implementation approach and technology choices. \ No newline at end of file +This specification provides the essential requirements for FFRD workflow orchestration while allowing flexibility in implementation approach and technology choices. diff --git a/docs/draft/orchestration/standard.md b/docs/draft/orchestration/standard.md index 9eabaf0..cd2abff 100644 --- a/docs/draft/orchestration/standard.md +++ b/docs/draft/orchestration/standard.md @@ -3,9 +3,11 @@ ## ๐Ÿ“ Standard ### Purpose + To establish fundamental requirements for orchestration systems that can execute complex, multi-step flood risk data processing workflows within the FFRD initiative. The orchestration system must support directed acyclic graphs (DAGs), container execution, resource management, observability, and robust error handling to ensure reliable execution of hydrologic and hydraulic modeling workflows. -### Scope +### Scope + This standard applies to all orchestration systems used within the FFRD initiative for: - Executing multi-step flood risk analysis workflows @@ -17,52 +19,63 @@ This standard applies to all orchestration systems used within the FFRD initiati ### Core Requirements #### 1. 
Workflow Structure + - **DAG Support**: Must support directed acyclic graph (DAG) workflow definitions with explicit task dependencies - **Parallel Execution**: Must enable parallel execution of independent tasks - **Conditional Logic**: Must support conditional task execution based on upstream task results #### 2. Container Integration + - **FFRD Container Compatibility**: Must execute all FFRD-compliant containers (base image, HMS, RAS, conformance, plugin containers) - **Container Registry Support**: Must support pulling containers from public and private container registries - **Runtime Configuration**: Must support passing configuration files, environment variables, and command-line arguments to containers - **Exit Code Handling**: Must properly handle container exit codes and propagate failures appropriately #### 3. Resource Management + - **Compute Resources**: Must allow specification of CPU cores, memory limits, and GPU resources per task - **Storage Allocation**: Must support dynamic and static volume provisioning with configurable storage sizes - **Resource Constraints**: Must enforce resource limits and prevent resource contention between concurrent tasks #### 4. Data Sharing and Persistence + - **Volume Sharing**: Must provide shared storage mechanisms for data exchange between workflow tasks - **Persistent Volumes**: Must support persistent storage that survives task and workflow completion - **Data Lifecycle Management**: Must support cleanup of temporary data when workflows complete #### 5. Observability and Monitoring + - **Execution Logging**: Must capture logs from workflow tasks - **Progress Tracking**: Must provide visibility into workflow execution status and task completion #### 6. Error Handling and Resilience + - **Retry Strategies**: Must support configurable retry policies for failed tasks - **Failure Isolation**: Must prevent individual task failures from stopping independent workflow branches #### 7. 
Workflow Definition and Versioning + - **Declarative Format**: Must support workflow definitions in a human-readable, declarative format - **Version Control**: Must enable workflow definitions to be versioned in source control systems - **Validation**: Must provide validation mechanisms for workflow definitions (e.g., linting) #### 8. Security and Access Control + - **Authentication**: Must provide authentication mechanisms for workflow access - **Authorization**: Must provide authorization controls for workflow execution - **Secret Management**: Must provide secure mechanisms for handling sensitive data and credentials #### 9. Scalability and Performance + - **Multi-node Execution**: Must support executing workflows across multiple compute nodes - **Concurrent Workflows**: Must support running multiple workflows simultaneously #### 10. Integration and Interoperability + - **Workflow Submission**: Must provide mechanisms for submitting and executing workflows ### Best Practices + - Use immutable workflow definitions to ensure reproducible executions - Implement comprehensive testing strategies for workflow validation before production deployment - Design workflows with failure scenarios in mind and include appropriate error handling From 714ab7a4061dc40895af18bd5d112b7dda3b4d71 Mon Sep 17 00:00:00 2001 From: Brendan Barnes Date: Mon, 11 Aug 2025 18:31:14 +0000 Subject: [PATCH 4/6] Refactor orchestration documentation --- docs/draft/orchestration/orchestration.md | 2 +- docs/draft/orchestration/reference.md | 28 +--- docs/draft/orchestration/spec.md | 148 ------------------ docs/draft/orchestration/standard.md | 90 +++++------ .../orchestration/technical-capabilities.md | 148 ++++++++++++++++++ 5 files changed, 201 insertions(+), 215 deletions(-) delete mode 100644 docs/draft/orchestration/spec.md create mode 100644 docs/draft/orchestration/technical-capabilities.md diff --git a/docs/draft/orchestration/orchestration.md b/docs/draft/orchestration/orchestration.md 
index 691d6d6..4c8321c 100644 --- a/docs/draft/orchestration/orchestration.md +++ b/docs/draft/orchestration/orchestration.md @@ -2,7 +2,7 @@ ______________________________________________________________________ -{% include "draft/orchestration/spec.md" %} +{% include "draft/orchestration/technical-capabilities.md" %} ______________________________________________________________________ diff --git a/docs/draft/orchestration/reference.md b/docs/draft/orchestration/reference.md index 98269e4..f8766ea 100644 --- a/docs/draft/orchestration/reference.md +++ b/docs/draft/orchestration/reference.md @@ -2,17 +2,17 @@ ### Argo Workflows Implementation -This reference implementation demonstrates how Argo Workflows can satisfy the FFRD orchestration requirements. Argo Workflows is provided as one example of a compliant orchestration system, but other systems may be used as long as they meet the specification requirements. +This reference implementation demonstrates how Argo Workflows could support FFRD orchestration needs. Argo Workflows is presented as one example of an orchestration system that offers relevant capabilities, alongside other potential solutions that could meet similar workflow requirements. 
#### Implementation Overview -The reference implementation uses Argo Workflows running on Kubernetes to provide: +The reference implementation uses Argo Workflows running on Kubernetes to illustrate: -- DAG-based workflow execution with explicit task dependencies -- Container execution with shared volume access -- Parallel task execution with parameterization -- Shared volume management for data exchange between tasks -- Basic logging and monitoring capabilities +- DAG-based workflow execution patterns with explicit task dependencies +- Container execution approaches with shared volume access +- Parallel task execution techniques with parameterization +- Shared volume management strategies for data exchange between tasks +- Logging and monitoring capabilities for workflow observability #### Example Workflow Structure @@ -182,17 +182,3 @@ argo logs dag-example-abc123 # View workflow status and results argo get dag-example-abc123 ``` - -#### Alternative Implementations - -While this reference uses Argo Workflows, other orchestration systems can satisfy FFRD requirements: - -- **Apache Airflow**: Python-based DAG orchestration with extensive integrations -- **Prefect**: Modern workflow orchestration with dynamic DAG generation -- **Kubeflow Pipelines**: ML-focused orchestration with container-native execution -- **Temporal**: Durable execution framework with strong consistency guarantees -- **Custom Solutions**: Purpose-built orchestration systems meeting FFRD specifications - -The key requirement is that any chosen system must satisfy all requirements outlined in the FFRD orchestration specification, regardless of the underlying implementation technology. - -This reference implementation serves as a concrete example of how to satisfy FFRD orchestration requirements. 
diff --git a/docs/draft/orchestration/spec.md b/docs/draft/orchestration/spec.md deleted file mode 100644 index 0c09c9f..0000000 --- a/docs/draft/orchestration/spec.md +++ /dev/null @@ -1,148 +0,0 @@ -## ๐Ÿ“ Specification - -### Overview - -This specification defines the fundamental requirements for orchestration systems used within the FFRD initiative to execute complex flood risk analysis workflows. The system must provide DAG-based workflow execution, FFRD container integration, and essential operational capabilities. This specification does not prescribe specific implementation technologies. - -### Requirements - -#### 1. Workflow Structure - -##### 1.1 Directed Acyclic Graph (DAG) Support - -- **Graph Definition**: Workflows MUST be representable as directed acyclic graphs with explicit task dependencies -- **Task Dependencies**: System MUST support expressing dependencies between tasks (e.g., Task B depends on Task A completion) -- **Parallel Execution**: System MUST execute independent tasks concurrently when resources allow -- **Conditional Execution**: System MUST support conditional task execution based on upstream task results or external conditions - -##### 1.2 Workflow Definition - -- **Declarative Format**: Workflows MUST be defined in a human-readable, version-controllable format -- **Reproducibility**: Identical workflow definitions MUST produce deterministic execution behavior -- **Parameterization**: System MUST support parameterized workflows for different study areas, configurations, and datasets - -#### 2. 
FFRD Container Integration - -##### 2.1 Container Execution - -- **FFRD Base Image**: System MUST execute containers built on FFRD base image specifications -- **HMS Containers**: System MUST execute HEC-HMS containers with appropriate Java runtime requirements -- **RAS Containers**: System MUST execute HEC-RAS containers with computational dependencies -- **Conformance Containers**: System MUST execute validation and conformance testing containers -- **Plugin Containers**: System MUST execute custom FFRD-compliant analysis containers - -##### 2.2 Container Configuration - -- **Configuration Files**: System MUST support passing JSON configuration files to containers as specified in FFRD standards -- **Environment Variables**: System MUST support setting required environment variables for FFRD containers -- **Command Line Arguments**: System MUST support passing command line arguments to containers -- **Exit Code Handling**: System MUST properly interpret container exit codes and handle success/failure states - -#### 3. Volume Sharing and Data Management - -##### 3.1 Shared Storage - -- **Inter-task Data Sharing**: System MUST provide mechanisms for tasks to share data through persistent storage -- **Volume Persistence**: System MUST support volumes that persist beyond individual task execution -- **Storage Size Configuration**: System MUST allow specification of storage volume sizes (minimum 1GB, configurable up to hundreds of GB) - -##### 3.2 Data Access Patterns - -- **Read/Write Access**: System MUST support both read-only and read-write volume access modes -- **Multiple Mount Points**: System MUST support mounting volumes at different paths within containers -- **Data Isolation**: System MUST prevent unauthorized access to data between different workflow executions - -#### 4. 
Resource Allocation - -##### 4.1 Compute Resources - -- **CPU Allocation**: System MUST support specifying CPU core requirements per task (minimum 0.1 cores, typical 1-8 cores) -- **Memory Allocation**: System MUST support specifying memory requirements per task (minimum 512MB, typical 1GB-32GB) -- **Resource Enforcement**: System MUST enforce specified resource limits to prevent resource contention - -##### 4.2 Resource Constraints - -- **Resource Isolation**: System MUST isolate resources between concurrent tasks -- **Resource Monitoring**: System MUST track actual resource usage against allocated limits -- **Resource Availability**: System MUST queue tasks when insufficient resources are available - -#### 5. Logging and Observability - -##### 5.1 Execution Logging - -- **Container Logs**: System MUST capture and store all container stdout/stderr output -- **Workflow Progress**: System MUST provide visibility into workflow execution status and task completion -- **Log Association**: System MUST associate logs with specific workflow runs and individual tasks -- **Log Retention**: System MUST retain logs for completed workflows for a configurable period - -##### 5.2 Monitoring - -- **Task Status**: System MUST report status of workflow tasks (pending, running, completed, failed) -- **Workflow History**: System MUST maintain history of workflow executions - -#### 6. 
Error Handling and Recovery - -##### 6.1 Retry Mechanisms - -- **Configurable Retry**: System MUST support configurable retry policies for failed tasks -- **Retry Limits**: System MUST support maximum retry attempt limits - -##### 6.2 Failure Handling - -- **Failure Isolation**: System MUST prevent individual task failures from stopping independent workflow branches -- **Partial Completion**: System MUST support completing successful workflow branches when other branches fail -- **Failure Reporting**: System MUST clearly report which tasks failed and provide failure details -- **Manual Recovery**: System MUST support manual intervention to recover from failures - -#### 7. Integration Requirements - -##### 7.1 Data Sources - -- **S3 Integration**: System MUST support integration with S3-compatible object storage for input/output data -- **File System Access**: System MUST support mounting external file systems for data access -- **Network Access**: System MUST support controlled network access for containers requiring external connectivity - -##### 7.2 Operational Integration - -- **Workflow Submission**: System MUST provide mechanisms for submitting and executing workflows - -#### 8. 
Security and Access Control - -##### 8.1 Access Control - -- **Authentication**: System MUST provide authentication mechanisms for workflow access -- **Authorization**: System MUST provide authorization controls for workflow execution -- **Credential Management**: System MUST provide secure mechanisms for handling sensitive data and credentials - -### Operational Requirements - -#### Performance Expectations - -- **Concurrent Workflows**: System MUST support executing multiple independent workflows simultaneously -- **Multi-node Execution**: System MUST support executing workflows across multiple compute nodes - -#### Reliability Requirements - -- **System Availability**: System MUST provide high availability for workflow execution -- **Data Durability**: System MUST ensure durability of workflow outputs and execution logs -- **Recovery**: System MUST support recovery from system failures without losing workflow progress - -#### Compliance Requirements - -- **Audit Trail**: System MUST maintain complete audit trails of workflow executions -- **Data Governance**: System MUST support data governance requirements for FFRD data -- **Documentation**: System MUST provide documentation for operational procedures and troubleshooting - -### Example Workflow Scenario - -A typical FFRD workflow might include: - -1. **Data Preparation**: Validate input configuration and download required datasets from S3 -1. **Model Execution**: Run HEC-HMS hydrologic models with specified parameters -1. **Post-Processing**: Process model outputs and generate analysis results -1. **Validation**: Run conformance tests on outputs -1. **Data Upload**: Upload results to designated S3 locations - -The orchestration system must execute these tasks in the correct dependency order, share data between tasks through persistent volumes, allocate appropriate compute resources, handle any task failures with retries, and provide complete logging and monitoring throughout the process. 
- -This specification provides the essential requirements for FFRD workflow orchestration while allowing flexibility in implementation approach and technology choices. diff --git a/docs/draft/orchestration/standard.md b/docs/draft/orchestration/standard.md index cd2abff..562e1eb 100644 --- a/docs/draft/orchestration/standard.md +++ b/docs/draft/orchestration/standard.md @@ -4,85 +4,85 @@ ### Purpose -To establish fundamental requirements for orchestration systems that can execute complex, multi-step flood risk data processing workflows within the FFRD initiative. The orchestration system must support directed acyclic graphs (DAGs), container execution, resource management, observability, and robust error handling to ensure reliable execution of hydrologic and hydraulic modeling workflows. +To document orchestration capabilities and requirements that can support complex, multi-step flood risk data processing workflows within the FFRD initiative. This standard explores how orchestration systems can provide directed acyclic graphs (DAGs), container execution, resource management, observability, and error handling to support reliable execution of hydrologic and hydraulic modeling workflows. ### Scope -This standard applies to all orchestration systems used within the FFRD initiative for: +This standard explores orchestration capabilities relevant to FFRD initiative workflows, including: -- Executing multi-step flood risk analysis workflows -- Coordinating hydrologic and hydraulic model runs (HEC-HMS, HEC-RAS, etc.) -- Managing data processing pipelines for stochastic storm transposition -- Orchestrating conformance testing and validation workflows -- Supporting distributed computing across multiple processing nodes +- Multi-step flood risk analysis workflow patterns +- Coordination approaches for hydrologic and hydraulic model runs (HEC-HMS, HEC-RAS, etc.) 
+- Data processing pipeline management for stochastic storm transposition +- Conformance testing and validation workflow approaches +- Distributed computing patterns across multiple processing nodes -### Core Requirements +### Core Capabilities #### 1. Workflow Structure -- **DAG Support**: Must support directed acyclic graph (DAG) workflow definitions with explicit task dependencies -- **Parallel Execution**: Must enable parallel execution of independent tasks -- **Conditional Logic**: Must support conditional task execution based on upstream task results +- **DAG Support**: Orchestration systems typically provide directed acyclic graph (DAG) workflow definitions with explicit task dependencies +- **Parallel Execution**: Modern systems generally enable parallel execution of independent tasks +- **Conditional Logic**: Advanced orchestration platforms often support conditional task execution based on upstream task results #### 2. Container Integration -- **FFRD Container Compatibility**: Must execute all FFRD-compliant containers (base image, HMS, RAS, conformance, plugin containers) -- **Container Registry Support**: Must support pulling containers from public and private container registries -- **Runtime Configuration**: Must support passing configuration files, environment variables, and command-line arguments to containers -- **Exit Code Handling**: Must properly handle container exit codes and propagate failures appropriately +- **FFRD Container Compatibility**: Orchestration systems can execute FFRD-compliant containers (base image, HMS, RAS, conformance, plugin containers) +- **Container Registry Support**: Most platforms support pulling containers from public and private container registries +- **Runtime Configuration**: Systems typically support passing configuration files, environment variables, and command-line arguments to containers +- **Exit Code Handling**: Well-designed systems handle container exit codes and propagate failures appropriately #### 3. 
Resource Management -- **Compute Resources**: Must allow specification of CPU cores, memory limits, and GPU resources per task -- **Storage Allocation**: Must support dynamic and static volume provisioning with configurable storage sizes -- **Resource Constraints**: Must enforce resource limits and prevent resource contention between concurrent tasks +- **Compute Resources**: Orchestration platforms generally allow specification of CPU cores, memory limits, and GPU resources per task +- **Storage Allocation**: Most systems support dynamic and static volume provisioning with configurable storage sizes +- **Resource Constraints**: Mature platforms enforce resource limits and prevent resource contention between concurrent tasks #### 4. Data Sharing and Persistence -- **Volume Sharing**: Must provide shared storage mechanisms for data exchange between workflow tasks -- **Persistent Volumes**: Must support persistent storage that survives task and workflow completion -- **Data Lifecycle Management**: Must support cleanup of temporary data when workflows complete +- **Volume Sharing**: Orchestration systems typically provide shared storage mechanisms for data exchange between workflow tasks +- **Persistent Volumes**: Most platforms support persistent storage that survives task and workflow completion +- **Data Lifecycle Management**: Advanced systems support cleanup of temporary data when workflows complete #### 5. Observability and Monitoring -- **Execution Logging**: Must capture logs from workflow tasks -- **Progress Tracking**: Must provide visibility into workflow execution status and task completion +- **Execution Logging**: Standard orchestration capabilities include capturing logs from workflow tasks +- **Progress Tracking**: Most systems provide visibility into workflow execution status and task completion #### 6. 
Error Handling and Resilience -- **Retry Strategies**: Must support configurable retry policies for failed tasks -- **Failure Isolation**: Must prevent individual task failures from stopping independent workflow branches +- **Retry Strategies**: Modern orchestration systems support configurable retry policies for failed tasks +- **Failure Isolation**: Well-designed systems prevent individual task failures from stopping independent workflow branches #### 7. Workflow Definition and Versioning -- **Declarative Format**: Must support workflow definitions in a human-readable, declarative format -- **Version Control**: Must enable workflow definitions to be versioned in source control systems -- **Validation**: Must provide validation mechanisms for workflow definitions (e.g., linting) +- **Declarative Format**: Standard orchestration systems support workflow definitions in human-readable, declarative formats +- **Version Control**: Most platforms enable workflow definitions to be versioned in source control systems +- **Validation**: Many systems provide validation mechanisms for workflow definitions (e.g., linting) #### 8. Security and Access Control -- **Authentication**: Must provide authentication mechanisms for workflow access -- **Authorization**: Must provide authorization controls for workflow execution -- **Secret Management**: Must provide secure mechanisms for handling sensitive data and credentials +- **Authentication**: Enterprise orchestration systems typically provide authentication mechanisms for workflow access +- **Authorization**: Most platforms provide authorization controls for workflow execution +- **Secret Management**: Modern systems provide secure mechanisms for handling sensitive data and credentials #### 9. 
Scalability and Performance -- **Multi-node Execution**: Must support executing workflows across multiple compute nodes -- **Concurrent Workflows**: Must support running multiple workflows simultaneously +- **Multi-node Execution**: Scalable orchestration systems support executing workflows across multiple compute nodes +- **Concurrent Workflows**: Most platforms support running multiple workflows simultaneously #### 10. Integration and Interoperability -- **Workflow Submission**: Must provide mechanisms for submitting and executing workflows +- **Workflow Submission**: Standard systems provide various mechanisms for submitting and executing workflows -### Best Practices +### Implementation Considerations -- Use immutable workflow definitions to ensure reproducible executions -- Implement comprehensive testing strategies for workflow validation before production deployment -- Design workflows with failure scenarios in mind and include appropriate error handling -- Document workflow dependencies, data requirements, and expected outcomes -- Implement monitoring and alerting for critical workflow execution paths -- Use resource quotas and limits to prevent resource exhaustion -- Follow security best practices for credential management and access control -- Maintain workflow execution history for analysis and troubleshooting -- Implement workflow approval processes for production environments -- Use infrastructure as code practices for orchestration system deployment and configuration +- Immutable workflow definitions can help ensure reproducible executions +- Comprehensive testing strategies may be valuable for workflow validation before production deployment +- Designing workflows with failure scenarios in mind can improve reliability +- Documenting workflow dependencies, data requirements, and expected outcomes supports operational clarity +- Monitoring and alerting for critical workflow execution paths can improve observability +- Resource quotas and limits may 
help prevent resource exhaustion +- Security best practices for credential management and access control are generally recommended +- Maintaining workflow execution history can support analysis and troubleshooting +- Workflow approval processes may be appropriate for production environments +- Infrastructure as code practices can support consistent orchestration system deployment and configuration diff --git a/docs/draft/orchestration/technical-capabilities.md b/docs/draft/orchestration/technical-capabilities.md new file mode 100644 index 0000000..34e5a72 --- /dev/null +++ b/docs/draft/orchestration/technical-capabilities.md @@ -0,0 +1,148 @@ +## ๐Ÿ“‹ Technical Capabilities + +### Overview + +This section explores technical capabilities commonly found in modern orchestration systems that could support FFRD initiative flood risk analysis workflows. These capabilities include DAG-based workflow execution, container integration patterns, and operational features suitable for complex computational workflows. This exploration examines various implementation patterns and approaches available in contemporary orchestration platforms. + +### Capabilities Framework + +#### 1. 
Workflow Structure + +##### 1.1 Directed Acyclic Graph (DAG) Support + +- **Graph Definition**: Effective orchestration systems represent workflows as directed acyclic graphs with explicit task dependencies +- **Task Dependencies**: Systems should support expressing dependencies between tasks (e.g., Task B depends on Task A completion) +- **Parallel Execution**: Efficient systems execute independent tasks concurrently when resources allow +- **Conditional Execution**: Advanced systems support conditional task execution based on upstream task results or external conditions + +##### 1.2 Workflow Definition + +- **Declarative Format**: Well-designed systems support workflow definitions in human-readable, version-controllable formats +- **Reproducibility**: Reliable systems ensure identical workflow definitions produce deterministic execution behavior +- **Parameterization**: Flexible systems support parameterized workflows for different study areas, configurations, and datasets + +#### 2. FFRD Container Integration + +##### 2.1 Container Execution + +- **FFRD Base Image**: Compatible systems can execute containers built on FFRD base image specifications +- **HMS Containers**: Suitable systems can execute HEC-HMS containers with appropriate Java runtime requirements +- **RAS Containers**: Capable systems can execute HEC-RAS containers with computational dependencies +- **Conformance Containers**: Supporting systems can execute validation and conformance testing containers +- **Plugin Containers**: Extensible systems can execute custom FFRD-compliant analysis containers + +##### 2.2 Container Configuration + +- **Configuration Files**: Effective systems support passing JSON configuration files to containers as specified in FFRD standards +- **Environment Variables**: Compatible systems support setting required environment variables for FFRD containers +- **Command Line Arguments**: Standard systems support passing command line arguments to containers +- **Exit Code 
Handling**: Reliable systems properly interpret container exit codes and handle success/failure states + +#### 3. Volume Sharing and Data Management + +##### 3.1 Shared Storage + +- **Inter-task Data Sharing**: Effective systems provide mechanisms for tasks to share data through persistent storage +- **Volume Persistence**: Robust systems support volumes that persist beyond individual task execution +- **Storage Size Configuration**: Flexible systems allow specification of storage volume sizes (minimum 1GB, configurable up to hundreds of GB) + +##### 3.2 Data Access Patterns + +- **Read/Write Access**: Well-designed systems support both read-only and read-write volume access modes +- **Multiple Mount Points**: Flexible systems support mounting volumes at different paths within containers +- **Data Isolation**: Secure systems prevent unauthorized access to data between different workflow executions + +#### 4. Resource Allocation + +##### 4.1 Compute Resources + +- **CPU Allocation**: Capable systems support specifying CPU core requirements per task (minimum 0.1 cores, typical 1-8 cores) +- **Memory Allocation**: Standard systems support specifying memory requirements per task (minimum 512MB, typical 1GB-32GB) +- **Resource Enforcement**: Reliable systems enforce specified resource limits to prevent resource contention + +##### 4.2 Resource Constraints + +- **Resource Isolation**: Well-architected systems isolate resources between concurrent tasks +- **Resource Monitoring**: Monitoring-capable systems track actual resource usage against allocated limits +- **Resource Availability**: Intelligent systems queue tasks when insufficient resources are available + +#### 5. 
Logging and Observability + +##### 5.1 Execution Logging + +- **Container Logs**: Comprehensive systems capture and store all container stdout/stderr output +- **Workflow Progress**: Transparent systems provide visibility into workflow execution status and task completion +- **Log Association**: Well-organized systems associate logs with specific workflow runs and individual tasks +- **Log Retention**: Configurable systems retain logs for completed workflows for specified periods + +##### 5.2 Monitoring + +- **Task Status**: Monitoring systems report status of workflow tasks (pending, running, completed, failed) +- **Workflow History**: Historical systems maintain records of workflow executions + +#### 6. Error Handling and Recovery + +##### 6.1 Retry Mechanisms + +- **Configurable Retry**: Resilient systems support configurable retry policies for failed tasks +- **Retry Limits**: Safe systems support maximum retry attempt limits + +##### 6.2 Failure Handling + +- **Failure Isolation**: Robust systems prevent individual task failures from stopping independent workflow branches +- **Partial Completion**: Flexible systems support completing successful workflow branches when other branches fail +- **Failure Reporting**: Clear systems report which tasks failed and provide failure details +- **Manual Recovery**: Recoverable systems support manual intervention to recover from failures + +#### 7. Integration Capabilities + +##### 7.1 Data Sources + +- **S3 Integration**: Compatible systems support integration with S3-compatible object storage for input/output data +- **File System Access**: Flexible systems support mounting external file systems for data access +- **Network Access**: Connected systems support controlled network access for containers requiring external connectivity + +##### 7.2 Operational Integration + +- **Workflow Submission**: Operational systems provide mechanisms for submitting and executing workflows + +#### 8. 
Security and Access Control + +##### 8.1 Access Control + +- **Authentication**: Secure systems provide authentication mechanisms for workflow access +- **Authorization**: Controlled systems provide authorization controls for workflow execution +- **Credential Management**: Protected systems provide secure mechanisms for handling sensitive data and credentials + +### Operational Considerations + +#### Performance Expectations + +- **Concurrent Workflows**: Scalable systems support executing multiple independent workflows simultaneously +- **Multi-node Execution**: Distributed systems support executing workflows across multiple compute nodes + +#### Reliability Considerations + +- **System Availability**: Reliable systems provide high availability for workflow execution +- **Data Durability**: Durable systems ensure persistence of workflow outputs and execution logs +- **Recovery**: Resilient systems support recovery from system failures without losing workflow progress + +#### Compliance Considerations + +- **Audit Trail**: Compliant systems maintain complete audit trails of workflow executions +- **Data Governance**: Governance-aware systems support data governance requirements for FFRD data +- **Documentation**: Well-documented systems provide documentation for operational procedures and troubleshooting + +### Example Workflow Scenario + +A typical FFRD workflow might include: + +1. **Data Preparation**: Validate input configuration and download required datasets from S3 +1. **Model Execution**: Run HEC-HMS hydrologic models with specified parameters +1. **Post-Processing**: Process model outputs and generate analysis results +1. **Validation**: Run conformance tests on outputs +1. 
**Data Upload**: Upload results to designated S3 locations + +Orchestration systems supporting such workflows would execute these tasks in the correct dependency order, share data between tasks through persistent volumes, allocate appropriate compute resources, handle any task failures with retries, and provide complete logging and monitoring throughout the process. + +This exploration of technical capabilities demonstrates the range of features available in modern orchestration systems while highlighting the flexibility organizations have in selecting implementation approaches and technology choices that align with their specific needs and constraints. From 6b54becc88d83ae89007e32573952ba72039f81a Mon Sep 17 00:00:00 2001 From: Brendan Barnes Date: Wed, 13 Aug 2025 20:39:43 +0000 Subject: [PATCH 5/6] move Dev Container setup instructions for Argo Workflows --- README.md | 65 --------------------------- docs/draft/orchestration/reference.md | 64 ++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index 93eea06..b24eb15 100644 --- a/README.md +++ b/README.md @@ -6,71 +6,6 @@ https://fema-ffrd.github.io/specs/ ## Setup -### Dev Container Setup (Optional) - -1. Open this repository in VS Code -1. When prompted, click "Reopen in Container" or use the Command Palette (Ctrl+Shift+P) and select "Dev Containers: Reopen in Container" -1. 
The container will automatically set up the environment and install dependencies - -#### What Gets Installed - -The setup includes: - -- **Base**: Debian 12 (bookworm) container -- **Docker**: Docker-outside-of-Docker for running k3s -- **kubectl**: Kubernetes CLI -- **argo**: Argo Workflows CLI v3.7.0 -- **k3s**: Lightweight Kubernetes cluster -- **Argo Workflows**: v3.7.0 installed in the cluster - -``` -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€-โ” -โ”‚ DevContainer โ”‚ -โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ -โ”‚ โ”‚ argo โ”‚ โ”‚ kubectl โ”‚ โ”‚ -โ”‚ โ”‚ CLI โ”‚ โ”‚ CLI โ”‚ โ”‚ -โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ -โ”‚ โ”‚ -โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ -โ”‚ โ”‚ Docker Host โ”‚ โ”‚ -โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”โ”‚ โ”‚ -โ”‚ โ”‚ โ”‚ k3s Container โ”‚โ”‚ โ”‚ -โ”‚ โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”โ”‚โ”‚ โ”‚ -โ”‚ โ”‚ โ”‚ โ”‚ Argo Workflows โ”‚โ”‚โ”‚ โ”‚ -โ”‚ โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜โ”‚โ”‚ โ”‚ -โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜โ”‚ โ”‚ -โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€-โ”˜ -``` - -#### Useful Commands - -Once setup is complete, you can use these commands: - -```bash -# Validate workflow files -argo lint reference/orchestration/argo/reference.yaml - -# Submit workflow files -argo submit reference/orchestration/argo/reference.yaml 
- -# Watch the workflow execution -argo submit --watch reference/orchestration/argo/reference.yaml - -# List all workflows -argo list - -# View logs for a specific workflow -argo logs -``` - -#### Useful Links - -- Explore the [reference workflow](./reference/orchestration/argo/reference.yaml) -- Read the [Argo Workflows documentation](https://argo-workflows.readthedocs.io/) - -### Documentation Setup - 1. Create a Python virtual environment. ``` diff --git a/docs/draft/orchestration/reference.md b/docs/draft/orchestration/reference.md index f8766ea..db195c0 100644 --- a/docs/draft/orchestration/reference.md +++ b/docs/draft/orchestration/reference.md @@ -182,3 +182,67 @@ argo logs dag-example-abc123 # View workflow status and results argo get dag-example-abc123 ``` + +### Dev Container Setup (Optional) + +This section describes how to set up a development environment for running argo workflows locally using Visual Studio Code Dev Containers. This setup includes a lightweight Kubernetes cluster (k3s) with Argo Workflows installed, allowing you to run and test the reference implementation locally. + +1. Open [this](https://github.com/fema-ffrd/specs) repository in VS Code +1. When prompted, click "Reopen in Container" or use the Command Palette (Ctrl+Shift+P) and select "Dev Containers: Reopen in Container" +1. 
The container will automatically set up the environment and install dependencies + +#### What Gets Installed + +The setup includes: + +- **Base**: Debian 12 (bookworm) container +- **Docker**: Docker-outside-of-Docker for running k3s +- **kubectl**: Kubernetes CLI +- **argo**: Argo Workflows CLI v3.7.0 +- **k3s**: Lightweight Kubernetes cluster +- **Argo Workflows**: v3.7.0 installed in the cluster + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€-โ” +โ”‚ DevContainer โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ argo โ”‚ โ”‚ kubectl โ”‚ โ”‚ +โ”‚ โ”‚ CLI โ”‚ โ”‚ CLI โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ”‚ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ Docker Host โ”‚ โ”‚ +โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ k3s Container โ”‚โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”โ”‚โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ Argo Workflows โ”‚โ”‚โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜โ”‚โ”‚ โ”‚ +โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€-โ”˜ +``` + +#### Useful Commands + +Once setup is complete, you can use these commands: + +```bash +# Validate workflow files +argo lint reference/orchestration/argo/reference.yaml + +# Submit workflow files +argo submit reference/orchestration/argo/reference.yaml 
+ +# Watch the workflow execution +argo submit --watch reference/orchestration/argo/reference.yaml + +# List all workflows +argo list + +# View logs for a specific workflow +argo logs +``` + +#### Useful Links + +- Read the [Argo Workflows documentation](https://argo-workflows.readthedocs.io/) From 335b16695f8b99615559bd0936e77a2c37ffd5ff Mon Sep 17 00:00:00 2001 From: Brendan Barnes Date: Wed, 13 Aug 2025 21:21:48 +0000 Subject: [PATCH 6/6] move orchestration docs from drafts to proposals --- docs/draft/orchestration/orchestration.md | 9 --------- docs/proposals/orchestration/orchestration.md | 9 +++++++++ docs/{draft => proposals}/orchestration/reference.md | 0 docs/{draft => proposals}/orchestration/standard.md | 0 .../orchestration/technical-capabilities.md | 0 mkdocs.yml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) delete mode 100644 docs/draft/orchestration/orchestration.md create mode 100644 docs/proposals/orchestration/orchestration.md rename docs/{draft => proposals}/orchestration/reference.md (100%) rename docs/{draft => proposals}/orchestration/standard.md (100%) rename docs/{draft => proposals}/orchestration/technical-capabilities.md (100%) diff --git a/docs/draft/orchestration/orchestration.md b/docs/draft/orchestration/orchestration.md deleted file mode 100644 index 4c8321c..0000000 --- a/docs/draft/orchestration/orchestration.md +++ /dev/null @@ -1,9 +0,0 @@ -{% include "draft/orchestration/standard.md" %} - -______________________________________________________________________ - -{% include "draft/orchestration/technical-capabilities.md" %} - -______________________________________________________________________ - -{% include "draft/orchestration/reference.md" %} diff --git a/docs/proposals/orchestration/orchestration.md b/docs/proposals/orchestration/orchestration.md new file mode 100644 index 0000000..03ff695 --- /dev/null +++ b/docs/proposals/orchestration/orchestration.md @@ -0,0 +1,9 @@ +{% include 
"proposals/orchestration/standard.md" %} + +______________________________________________________________________ + +{% include "proposals/orchestration/technical-capabilities.md" %} + +______________________________________________________________________ + +{% include "proposals/orchestration/reference.md" %} diff --git a/docs/draft/orchestration/reference.md b/docs/proposals/orchestration/reference.md similarity index 100% rename from docs/draft/orchestration/reference.md rename to docs/proposals/orchestration/reference.md diff --git a/docs/draft/orchestration/standard.md b/docs/proposals/orchestration/standard.md similarity index 100% rename from docs/draft/orchestration/standard.md rename to docs/proposals/orchestration/standard.md diff --git a/docs/draft/orchestration/technical-capabilities.md b/docs/proposals/orchestration/technical-capabilities.md similarity index 100% rename from docs/draft/orchestration/technical-capabilities.md rename to docs/proposals/orchestration/technical-capabilities.md diff --git a/mkdocs.yml b/mkdocs.yml index 4103e05..dbd174f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -35,10 +35,10 @@ nav: - draft/base_image/base_image.md - draft/hms_sim/hms_sim.md - draft/ras_sim/ras_sim.md - - draft/orchestration/orchestration.md - Proposals: - proposals/conformance/conformance.md + - proposals/orchestration/orchestration.md - Appendix: - references.md