diff --git a/README.md b/README.md index ef3e39c9..e8116b6b 100644 --- a/README.md +++ b/README.md @@ -42,8 +42,9 @@ cp .env.example .env # Start the stack (Postgres, API, Inngest, dashboard) docker compose up -# Run a benchmark -ergon benchmark run smoke_test +# Define and run an experiment +ergon experiment define smoke_test --worker training-stub --model stub:constant --limit 1 +ergon experiment run ``` ## Configuration diff --git a/ci/wait_for_stack.sh b/ci/wait_for_stack.sh index d7d1a8c6..b7900ac5 100755 --- a/ci/wait_for_stack.sh +++ b/ci/wait_for_stack.sh @@ -26,9 +26,8 @@ check() { # Postgres via docker exec (host may not have pg_isready installed). check "postgres" "docker compose exec -T postgres pg_isready -U ergon > /dev/null 2>&1" check "inngest" "curl -sf http://localhost:8289/v1/events/test > /dev/null 2>&1" -# The api has no / or /healthz route today; any HTTP response (including -# 404) from uvicorn counts as "reachable". ``curl -s`` without ``-f`` -# returns 0 on any HTTP status; ``--connect-timeout 2`` keeps probes snappy. -check "api" "curl -s -o /dev/null --connect-timeout 2 http://localhost:9000/ 2>/dev/null" +# Wait for an application-level route so Uvicorn accepting a socket during +# FastAPI lifespan startup does not race ahead of migrations/plugin setup. +check "api" "curl -sf --connect-timeout 2 http://localhost:9000/health > /dev/null 2>&1" echo "stack up" diff --git a/docker-compose.yml b/docker-compose.yml index 153cc2be..2adb82bf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -24,6 +24,9 @@ # POSTGRES_PASSWORD=ergon \ # TEST_HARNESS_SECRET=real-llm-secret \ # OPENROUTER_API_KEY="$OPENROUTER_API_KEY" \ +# OPENAI_API_KEY="$OPENAI_API_KEY" \ +# EXA_API_KEY="$EXA_API_KEY" \ +# HF_API_KEY="$HF_API_KEY" \ # docker compose up -d # # Observability stack (otel + jaeger) on demand: @@ -88,6 +91,9 @@ services: - OTEL_SERVICE_NAME=ergon-core - E2B_API_KEY=${E2B_API_KEY:-} - OPENROUTER_API_KEY=${OPENROUTER_API_KEY:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - EXA_API_KEY=${EXA_API_KEY:-} + - HF_API_KEY=${HF_API_KEY:-} # Put /app on sys.path so editable source mounts resolve in the API # container while the smoke fixtures live in ergon_core.test_support. - PYTHONPATH=/app diff --git a/docs/architecture/02_runtime_lifecycle.md b/docs/architecture/02_runtime_lifecycle.md index 809ca5d3..a20dd17a 100644 --- a/docs/architecture/02_runtime_lifecycle.md +++ b/docs/architecture/02_runtime_lifecycle.md @@ -156,7 +156,7 @@ A brief index of where runtime functions live. 
The architectural claims above st | Concern | File | | --- | --- | -| Entry + init | `runtime/inngest/benchmark_run_start.py`, `runtime/inngest/start_workflow.py` | +| Entry + init | `runtime/services/experiment_launch_service.py`, `runtime/inngest/start_workflow.py` | | Task orchestration | `runtime/inngest/execute_task.py` | | Task child steps | `runtime/inngest/sandbox_setup.py`, `runtime/inngest/worker_execute.py`, `runtime/inngest/persist_outputs.py` | | Propagation | `runtime/inngest/propagate_execution.py` | diff --git a/docs/architecture/03_providers.md b/docs/architecture/03_providers.md index 7a957547..89d9a90a 100644 --- a/docs/architecture/03_providers.md +++ b/docs/architecture/03_providers.md @@ -8,9 +8,7 @@ The providers layer is Ergon's boundary between runtime code and external execut | Name | Kind | Location | Freeze status | Owner | | --- | --- | --- | --- | --- | -| `_BACKEND_REGISTRY` | module-level dict | `ergon_core/core/providers/generation/model_resolution.py` | Frozen shape; entries grow via registration. | Providers layer. | | `resolve_model_target` | function | `ergon_core/core/providers/generation/model_resolution.py` | Public, frozen signature. Returns `ResolvedModel`. | Providers layer. | -| `register_model_backend` | function | `ergon_core/core/providers/generation/model_resolution.py` | Public, frozen signature. | Providers layer; callers are backend modules executing at import time. | | `BaseSandboxManager` | abstract class + singleton | `ergon_core/core/providers/sandbox/manager.py` | Shape stable; `event_sink` activation path in flux. | Providers layer. | | `DefaultSandboxManager` | concrete class | `ergon_core/core/providers/sandbox/manager.py` | Frozen. | Providers layer. | | `SWEBenchSandboxManager`, `MiniF2FSandboxManager`, `ResearchRubricsSandboxManager` | concrete subclasses | `ergon_builtins/` | Owned per benchmark; singletons. | Benchmark authors. | @@ -19,11 +17,11 @@ The providers layer is Ergon's boundary between runtime code and external execut | `SandboxResourcePublisher` | class | `ergon_core/core/providers/sandbox/resource_publisher.py` | Frozen API; storage backend swappable via `ERGON_BLOB_ROOT`. | Providers layer. | | `TransformersModel` | `pydantic_ai.models.Model` subclass | `ergon_builtins/ergon_builtins/models/transformers_backend.py` | Frozen. | ML team (TRL training loop callers). | -### 2.1 Generation registry +### 2.1 Model target resolution -`_BACKEND_REGISTRY` is a prefix-keyed dispatch table of resolver callables. `resolve_model_target` splits the target on its first colon, dispatches to the resolver, and returns a `ResolvedModel` wrapping either a `pydantic_ai.models.Model` instance or a passthrough string. Unknown prefixes fall through to a passthrough `ResolvedModel` — PydanticAI's own `infer_model` is invoked on use. Backends mutate the registry at import time; the builtins pack registers all four in a single loop at `ergon_builtins/ergon_builtins/registry.py:81`. +`resolve_model_target` is the single dispatch point for model target strings. It splits the target on its first colon and returns a `ResolvedModel` wrapping a concrete `pydantic_ai.models.Model` instance. Unknown prefixes raise immediately instead of falling through to PydanticAI inference. 
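+A minimal sketch of that dispatch shape (`resolve_model_target`, `ResolvedModel`, `OpenAIChatModel`, and `OpenRouterProvider` are the real names; everything else is assumed, not the shipped implementation):
+
+```python
+# Hedged sketch only; the real constructor logic lives in sibling modules
+# under ergon_core/core/providers/generation/.
+from dataclasses import dataclass
+
+from pydantic_ai.models import Model
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openrouter import OpenRouterProvider
+
+
+@dataclass(frozen=True)
+class ResolvedModel:
+    model: Model  # always a concrete instance; no passthrough strings
+
+
+def resolve_model_target(target: str) -> ResolvedModel:
+    prefix, _, rest = target.partition(":")
+    if prefix in ("openai", "anthropic", "google"):
+        # Cloud prefixes are OpenRouter-hosted: "anthropic:claude-sonnet-4.6"
+        # maps to the OpenRouter slug "anthropic/claude-sonnet-4.6";
+        # OpenRouterProvider reads OPENROUTER_API_KEY from the environment.
+        model = OpenAIChatModel(f"{prefix}/{rest}", provider=OpenRouterProvider())
+        return ResolvedModel(model)
+    if prefix in ("vllm", "openai-compatible"):
+        # Endpoint targets build an OpenAI-compatible chat model against a
+        # configured base URL; that constructor logic is elided here.
+        raise NotImplementedError("endpoint targets elided in this sketch")
+    raise ValueError(f"unknown model target prefix: {prefix!r}")
+```
+
+The raise on unknown prefixes is the behavioral change from the old registry fallthrough.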
-The four prefixes registered today are `vllm:*` (local vLLM server via PydanticAI's `OpenAIChatModel`), `openai:*` / `anthropic:*` / `google:*` (passthrough to `infer_model`), and `transformers:*` (custom `TransformersModel` for TRL-trained checkpoints not served over vLLM).
+The supported prefixes are `vllm:*`, `openai-compatible:*`, and the cloud provider prefixes `openai:*` / `anthropic:*` / `google:*`. Cloud provider prefixes always route through OpenRouter via PydanticAI's OpenRouter provider; they never call the direct OpenAI, Anthropic, or Google APIs.

Workers must not construct SDK clients directly (`AsyncOpenAI`, `anthropic.Client`, `genai.Client`). This is an invariant (Section 4), not a coincidence; it currently holds, with enforcement by grep discipline.

@@ -87,7 +85,7 @@ The decentralized shape means `ergon benchmark setup` iterates over whatever sub

    Worker.execute()
      |
      +-> resolve_model_target(self.model)  --> ResolvedModel
-     |     (prefix dispatch; 4 backends + fallthrough to infer_model)
+     |     (explicit prefix dispatch; cloud targets route via OpenRouter)
      |
      +-> ManagerClass()  (singleton; returns cached instance)
      |
      ManagerClass().create(sandbox_key=task_id, run_id=run_id, ...)

@@ -126,7 +124,7 @@ Movement of data across this diagram:

## 4. Invariants

1. **One entry point to LLM resolution.** Every model reference goes through `resolve_model_target`. Enforced by grep discipline and review; no runtime check.
-2. **Backends register at import time.** `register_model_backend` must be called before any caller hits `resolve_model_target`. Enforced by the builtins pack running its registration loop at import, before any worker module imports.
+2. **Cloud provider prefixes use OpenRouter.** `openai:*`, `anthropic:*`, and `google:*` model targets are OpenRouter-hosted. Direct cloud SDK routing is intentionally outside the grammar.
3. **Singleton managers hold authoritative sandbox state.** A subclass's class-level state is the only source of truth for in-process reconnect. Enforced by `__new__` caching the instance and `get_sandbox` reading the class dict. Applies only within a single Python process; cross-process actors must use `terminate_by_sandbox_id` or provision their own sandbox.
4. **Sandbox lifecycle is per-task.** Enforced by `create` accepting `sandbox_key` and by the worker runtime persisting `sandbox_id` on the execution row.
5. **Sandbox lives across evaluator fan-out.** Teardown runs at the end of `check_evaluators`, not at worker completion, not in `finalize_success`. Enforced by the evaluator harness, not by the manager itself.

@@ -146,10 +144,9 @@ Movement of data across this diagram:

### 5.1 Add a new LLM backend

-1. Write a resolver that maps `"myprefix:foo"` to a `pydantic_ai.models.Model` instance wrapped in `ResolvedModel`.
-2. Register it in the builtins-pack registration loop so `register_model_backend` is called at import time.
-3. Ensure the builtins pack is imported before any worker that references `myprefix:*` model ids.
-4. Add an entry to `LLMProvider` and `PROVIDER_KEY_MAP` in `ergon_cli/onboarding/profile.py` so onboarding prompts for the key or server URL.
+1. Add an explicit prefix branch in `resolve_model_target` and keep the constructor logic in a sibling module under `ergon_core/core/providers/generation/`.
+2. Return a concrete `pydantic_ai.models.Model` instance wrapped in `ResolvedModel`.
+3. 
Add an entry to `LLMProvider` and `PROVIDER_KEY_MAP` in `ergon_cli/onboarding/profile.py` so onboarding prompts for the key or server URL. ### 5.2 Add a new sandbox manager diff --git a/docs/architecture/06_builtins.md b/docs/architecture/06_builtins.md index b7c082fe..43073cba 100644 --- a/docs/architecture/06_builtins.md +++ b/docs/architecture/06_builtins.md @@ -52,10 +52,11 @@ runnable — not a catalog of registered implementations. Rubric nesting is not supported and there are no plans to change that. - Third-party users primarily extend at the Criterion layer. -- Model backend registry. - - Concrete LLM backends register via - `register_model_backend(prefix, resolver)` at import time. - - Freeze status: stable API; adding a backend is additive. +- Model target resolution. + - Builtins do not register cloud model backends. Model target strings are + resolved centrally by `resolve_model_target` in `ergon_core`. + - Freeze status: stable API; adding a backend is additive inside the + providers layer. - ReAct toolkit composition. - There is one concrete ReAct worker class — `ReActWorker` (slug `react-v1`, @@ -145,8 +146,8 @@ Benchmark loader → Task instances → Worker - **New worker.** Add under `ergon_builtins/workers/baselines/` if it is cross-benchmark; alongside the benchmark otherwise. The contract is which task schemas it supports. -- **New model backend.** Call `register_model_backend(prefix, resolver)` at - import time; prefer short, stable prefixes. +- **New model backend.** Add an explicit `resolve_model_target` branch in + `ergon_core/core/providers/generation/`; prefer short, stable prefixes. - **New Criterion.** Place in `ergon_builtins/evaluators/criteria/` if reusable, alongside the benchmark if benchmark-specific. This is the layer third-party users most often extend. diff --git a/docs/experiments/rq1-cli-specialism/changelog.md b/docs/experiments/rq1-cli-specialism/changelog.md new file mode 100644 index 00000000..11cb5f91 --- /dev/null +++ b/docs/experiments/rq1-cli-specialism/changelog.md @@ -0,0 +1,297 @@ +# RQ1 CLI Specialism Overnight Changelog + +## Goal + +Use the PR #39 workflow-CLI ResearchRubrics agent to produce rollout-card artifacts that support RQ1: returns remain a useful guardrail, but rollout cards preserve richer delegation and role-specialism behaviour that scalar returns discard. + +## 2026-04-26 23:30 UTC+1 - Preflight + +- Worktree: `/Users/charliemasters/Desktop/synced_vm_002/ergon/.worktrees/feature/finish-agent-workflow-cli` +- Branch: `feature/finish-agent-workflow-cli` +- PR: https://github.com/DeepFlow-research/ergon/pull/39 +- Commit at start: `ae7a0a8 Finish agent workflow CLI task editing` +- PR checks: all current checks passing by `gh pr checks 39`: + - `Integration tests (Python)`: pass + - `Lint + type-check (Frontend)`: pass + - `Lint + type-check (Python)`: pass + - `Unit tests (Python)`: pass + - `smoke [minif2f]`: pass + - `smoke [researchrubrics]`: pass + - `smoke [swebench-verified]`: pass +- Local `.env`: not present in the PR worktree. Real-LLM commands source `/Users/charliemasters/Desktop/synced_vm_002/ergon/.env` without copying it. +- Required keys after sourcing main `.env`: `OPENROUTER_API_KEY`, `EXA_API_KEY`, and `E2B_API_KEY` are set. +- Local services: + - `docker compose ps` in the worktree showed no compose-owned services. + - `http://127.0.0.1:3001/` responded. + - `http://127.0.0.1:9000/` responded with HTTP 404, which still indicates a process is listening; harness fixture treats connection success as stack-up. 
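+The reachability convention above (any HTTP response, even a 404, counts as stack-up) amounts to a probe like this sketch; `httpx` and the hardcoded URL are assumptions, not the harness fixture's actual code:
+
+```python
+import httpx
+
+
+def stack_is_up(url: str = "http://127.0.0.1:9000/") -> bool:
+    # Any HTTP status proves a listener is accepting connections; only a
+    # transport-level failure (refused connection, timeout) means down.
+    try:
+        httpx.get(url, timeout=2.0)
+        return True
+    except httpx.TransportError:
+        return False
+```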
+
+## Run Log
+
+Runs are appended below. Each entry should include command, env knobs, rollout artifact path, run ID, terminal status, score notes, graph/subtask notes, and prompt/config changes.
+
+## 2026-04-26 23:36 UTC+1 - Preflight Smoke Blocker
+
+- Command:
+  - `ERGON_REAL_LLM=1 ERGON_REAL_LLM_BUDGET_USD=50 uv run pytest tests/real_llm/benchmarks/test_smoke_stub.py -v -s --assume-stack-up`
+- Result:
+  - Failed during test collection before any benchmark/model spend.
+- Root cause:
+  - `telemetry.models` imports `ergon_core.api.json_types`, which executes `ergon_core.api.__init__`.
+  - `ergon_core.api.__init__` eagerly imported `RunResourceView` from `api.run_resource`.
+  - `api.run_resource` imports `RunResourceKind` from `telemetry.models` while `telemetry.models` is partially initialized.
+- Fix:
+  - Added `tests/unit/runtime/test_import_boundaries.py` as a regression.
+  - Changed `ergon_core/ergon_core/api/__init__.py` to lazily expose `RunResourceKind` and `RunResourceView` via `__getattr__`.
+- Verification:
+  - `uv run pytest tests/unit/runtime/test_import_boundaries.py -q` -> `1 passed`
+  - `uv run ruff format ergon_core/ergon_core/api/__init__.py tests/unit/runtime/test_import_boundaries.py && uv run ruff check ergon_core/ergon_core/api/__init__.py tests/unit/runtime/test_import_boundaries.py` -> `All checks passed`
+- Commit:
+  - `e23c276 Fix run resource API import boundary`
+
+## 2026-04-26 23:45 UTC+1 - Stack Rebuild
+
+- Rebuilt the shared `ergon` compose project from the PR #39 worktree:
+  - `COMPOSE_PROJECT_NAME=ergon docker compose up -d --build --wait`
+- Reason:
+  - The running stack was built before PR #39, so the API/Inngest runtime might not know `researchrubrics-workflow-cli-react`.
+- Result:
+  - `ergon-api-1`, `ergon-dashboard-1`, `ergon-inngest-dev-1`, and `ergon-postgres-1` are running.
+  - API root returns HTTP 404 but the process is reachable; the real-LLM fixture only requires connection success.
+
+## 2026-04-26 23:47 UTC+1 - Baseline Workflow-CLI Batch 1
+
+- Intent:
+  - Run 5 ResearchRubrics samples with the current PR #39 workflow-CLI prompt.
+- Command:
+  - `ERGON_REAL_LLM=1 ERGON_REAL_LLM_BUDGET_USD=50 ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react ERGON_REAL_LLM_LIMIT=5 uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -v -s --assume-stack-up`
+- Status:
+  - Failed after creating run `2626bae9-b058-4b1b-9803-8e6186468023`.
+- Failure:
+  - Harness endpoint `GET /api/test/read/run/2626bae9-b058-4b1b-9803-8e6186468023/state` returned HTTP 500.
+  - Local DB/API inspection showed `psycopg2.errors.UndefinedColumn: column run_resources.copied_from_resource_id does not exist`.
+- Root cause:
+  - The long-lived local Postgres DB was stamped at Alembic head `0a1b2c3d4e5f` but was missing the effect of the already-existing migration `a2b3c4d5e6f7_add_copied_from_resource_id.py`. This is local schema drift, not a missing migration in the branch.
+- Local repair:
+  - Applied idempotent local DDL:
+    - `ALTER TABLE run_resources ADD COLUMN IF NOT EXISTS copied_from_resource_id UUID NULL`
+    - `CREATE INDEX IF NOT EXISTS ix_run_resources_copied_from_resource_id ON run_resources (copied_from_resource_id)`
+    - Added the FK constraint `fk_run_resources_copied_from_resource_id_run_resources` if absent.
+  - Verification: information schema now reports one `copied_from_resource_id` column.
+- Post-repair canary: + - `ERGON_REAL_LLM=1 ERGON_REAL_LLM_BUDGET_USD=50 uv run pytest tests/real_llm/benchmarks/test_smoke_stub.py -v -s --assume-stack-up` + - Result: `1 passed` in 27.15s. + +## 2026-04-26 23:45 UTC+1 - Baseline Workflow-CLI Batch 1b + +- Intent: + - Retry 5 ResearchRubrics samples after local schema repair. +- Command: + - `ERGON_REAL_LLM=1 ERGON_REAL_LLM_BUDGET_USD=50 ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react ERGON_REAL_LLM_LIMIT=5 uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -v -s --assume-stack-up` +- Status: + - Passed, but not useful for headline RQ1 evidence. +- Rollout: + - Directory: `tests/real_llm/.rollouts/20260426T224530Z-3caf7e5c-e09f-47a8-8afb-58fd2693b761/` + - Run ID: `3caf7e5c-e09f-47a8-8afb-58fd2693b761` + - Wall clock: 235.6s + - Budget: $0.477609 +- Findings: + - The hardcoded `researchrubrics` benchmark loaded only 2 private/default smoke rows: `smoke-001`, `smoke-002`. + - Graph had 2 root nodes, 0 edges, 0 child subtasks, 2 resources, 1 evaluation. + - Worker did call `workflow inspect task-tree` once per task, but did not spawn/coordinate specialist subtasks. + - Evaluator returned score 0.0 because the API container did not have `OPENAI_API_KEY`. +- Fixes after analysis: + - `ResearchRubricsBenchmark._payload_from_row` now accepts vanilla dataset rows with `prompt` when `ablated_prompt` is absent. + - `tests/real_llm/benchmarks/test_researchrubrics.py` now honors `ERGON_REAL_LLM_BENCHMARK`, defaulting to `researchrubrics`. + - `docker-compose.yml` now passes `OPENAI_API_KEY`, `EXA_API_KEY`, and `HF_API_KEY` to the API container alongside the existing E2B/OpenRouter keys. + - Focused tests: `uv run pytest tests/unit/state/test_research_rubrics_benchmark.py -q` -> `10 passed`. + - Vanilla load check: `ResearchRubricsVanillaBenchmark(limit=5)` -> 5 rows loaded. + - Stack rebuilt with exported env; API container verified all provider keys present. + +## 2026-04-27 00:00 UTC+1 - Vanilla 5-Sample Workflow-CLI Batch 1 + +- Intent: + - Run the actual 5-row ScaleAI ResearchRubrics benchmark after enabling vanilla rows and backend evaluator env. +- Command: + - `ERGON_REAL_LLM=1 ERGON_REAL_LLM_BUDGET_USD=50 ERGON_REAL_LLM_BENCHMARK=researchrubrics-vanilla ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react ERGON_REAL_LLM_LIMIT=5 uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -v -s --assume-stack-up` +- Status: + - Run reached terminal `failed`, but pytest timed out waiting for resources/evaluations because all tasks failed before report persistence. +- Rollout: + - Directory: `tests/real_llm/.rollouts/20260426T230154Z-ab57a0df-2a6d-4174-95f5-87185f717707/` + - Run ID: `ab57a0df-2a6d-4174-95f5-87185f717707` + - Row counts: 5 graph nodes, 0 graph edges, 25 mutations, 121 context events, 10 sandbox events, 0 resources, 0 evaluations. +- Findings: + - This was the intended 5 real-row ScaleAI benchmark: five sample IDs were created. + - Behavior was rich but not successful: 116 tool calls total, including 111 `exa_search` and 5 `workflow inspect task-tree`. + - No task called `write_report_draft` or `final_result`; all failed with generic `Worker execution failed`. + - Failure mode appears to be search-budget exhaustion / max-iteration behavior on large vanilla rubrics, not missing provider keys. + - No child subtasks: the workflow tool was available but graph editing was not manager-capable, and the prompt only suggested inspection/resource-copying. 
+- Core/harness fixes: + - `_wait_for_post_terminal_artifacts` now returns for terminal `failed`/`cancelled` runs with no running executions, so failed-before-output rollouts still dump artifacts. + - `_require_keys` now includes `openai_api_key`. + - Broke a context-event import cycle by storing context `turn_logprobs` as open JSON payloads instead of importing `TokenLogprob` from `ergon_core.api.generation`. + - Added import-boundary coverage for context models. + - Tests: `uv run pytest tests/unit/runtime/test_import_boundaries.py tests/unit/state/test_research_rubrics_benchmark.py -q` -> `12 passed`. + +## 2026-04-27 00:04 UTC+1 - Prompt Hillclimb Variant 1 + +- Prompt/tool changes: + - Workflow-CLI ReAct worker now passes `manager_capable=True` to `make_workflow_cli_tool`. + - Prompt asks level-0 tasks to create exactly three specialist child tasks before research: + - source scout + - rubric compliance checker + - synthesis reviewer + - Prompt tells non-root tasks not to create recursive children. + - Prompt caps own work to at most 6 `exa_search` calls before writing `final_output/report.md`. +- Verification: + - `uv run pytest tests/unit/state/test_research_rubrics_workers.py tests/unit/state/test_workflow_cli_tool.py -q` -> `10 passed`. + - API restarted; provider keys still present in container. +- Next run command: + - `ERGON_REAL_LLM=1 ERGON_REAL_LLM_BUDGET_USD=50 ERGON_REAL_LLM_BENCHMARK=researchrubrics-vanilla ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react ERGON_REAL_LLM_LIMIT=5 uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -v -s --assume-stack-up` +- Diagnostic run: + - Run ID: `4d3721d0-aacb-4f04-bea9-9217c0549f9e` + - Stopped pytest manually after confirming it was polluted by the async workflow bridge bug. + - Positive signal: at least one root task attempted the desired `workflow manage add-task` specialist pattern before searching: + - source scout + - rubric compliance checker + - synthesis reviewer + - Bug found: agent-side workflow manage commands called the sync CLI bridge, which used `asyncio.run()` inside an already-running event loop. API log showed `RuntimeWarning: coroutine '_handle_manage' was never awaited`. +- Core fix: + - Added `execute_workflow_command_async(...)` in `ergon_cli.commands.workflow`. + - `execute_workflow_command(...)` now remains a sync wrapper for CLI callers. + - `make_workflow_cli_tool(...)` now awaits the async executor. + - Tests: `uv run pytest tests/unit/cli/test_workflow_cli.py tests/unit/state/test_workflow_cli_tool.py -q` -> `10 passed`. + +## 2026-04-27 00:13 UTC+1 - Prompt Hillclimb Variant 1b + +- Intent: + - Re-run Variant 1 with the fixed async workflow bridge. +- Status: + - Cancelled after diagnostic success and provider failures. +- Diagnostic result: + - Run ID: `9a83787a-dac2-45a1-9d3f-823f65984716` + - Early poll showed 20 graph nodes: 5 roots + 15 level-1 specialist children. + - Each root created source scout, rubric compliance, and synthesis reviewer children. + - This is the desired RQ1 graph-specialism signal. + - However, several roots failed on provider/schema errors (`finish_reason=None`) before reports/evaluations landed; remaining children were pending/blocked. + - Cancelled run via `uv run ergon run cancel 9a83787a-dac2-45a1-9d3f-823f65984716`. + +## 2026-04-27 00:22 UTC+1 - Prompt Hillclimb Variant 1c + +- Intent: + - Keep the specialist-subtask prompt, but switch from OpenRouter Sonnet to direct OpenAI to avoid the `finish_reason=None` OpenRouter/PydanticAI failure. 
+- Command: + - `ERGON_REAL_LLM=1 ERGON_REAL_LLM_BUDGET_USD=50 ERGON_REAL_LLM_MODEL=openai:gpt-4o-mini ERGON_REAL_LLM_BENCHMARK=researchrubrics-vanilla ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react ERGON_REAL_LLM_LIMIT=5 uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -v -s --assume-stack-up` +- Status: + - Cancelled after partial artifact dump. +- Rollout: + - Directory: `tests/real_llm/.rollouts/20260426T232803Z-3b258073-ab38-4a22-ac18-766c27d8aa1e/` + - Run ID: `3b258073-ab38-4a22-ac18-766c27d8aa1e` + - Row counts: 11 graph nodes, 37 mutations, 90 context events, 3 resources, 2 evaluations. +- Findings: + - Direct OpenAI avoided the OpenRouter `finish_reason=None` issue. + - Two root tasks completed and produced evaluations: + - score `0.11382113821138211`, passed `true` + - score `0.014084507042253521`, passed `false` + - Three roots failed before final output; two failed roots created specialist children, which were blocked by parent failure. + - This is a partial "rich behavior vs return" data point: returns are low/partial, but rollout-card structure exposes role-specialist decomposition not captured by scalar return. + +## 2026-04-27 00:29 UTC+1 - Prompt Hillclimb Variant 1d + +- Intent: + - Same specialist prompt, direct OpenAI, stronger model (`openai:gpt-4o`) to improve returns while preserving graph-specialism signal. +- Command: + - `ERGON_REAL_LLM=1 ERGON_REAL_LLM_BUDGET_USD=50 ERGON_REAL_LLM_MODEL=openai:gpt-4o ERGON_REAL_LLM_BENCHMARK=researchrubrics-vanilla ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react ERGON_REAL_LLM_LIMIT=5 uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -v -s --assume-stack-up` +- Status: + - Cancelled after partial artifact dump because dynamic child tasks remained pending. +- Rollout: + - Directory: `tests/real_llm/.rollouts/20260426T233740Z-356b7189-229b-4ef4-849c-f3c87964feb4/` + - Run ID: `356b7189-229b-4ef4-849c-f3c87964feb4` + - Row counts: 20 graph nodes, 43 mutations, 62 context events, 5 resources, 4 evaluations. +- Findings: + - Best evidence so far: 5 roots, 15 specialist children, 4/5 root reports completed, 4 evaluations landed. + - Scores: + - `0.1267605633802817`, passed `true` + - `0.11382113821138211`, passed `true` + - `0.07142857142857142`, passed `false` + - `0.0`, passed `false` + - Dynamic children remained `pending` rather than being scheduled after creation. +- Core fix: + - `WorkflowService.add_task` now emits `task/ready` for the created dynamic node after commit. + - Added an injectable task-ready dispatcher and unit test coverage. + - Tests: `uv run pytest tests/unit/runtime/test_workflow_service.py tests/unit/cli/test_workflow_cli.py tests/unit/state/test_workflow_cli_tool.py -q` -> `22 passed`. + +## 2026-04-27 00:38 UTC+1 - Prompt Hillclimb Variant 1e + +- Intent: + - Same GPT-4o specialist prompt, now with dynamic child scheduling fixed. +- Command: + - `ERGON_REAL_LLM=1 ERGON_REAL_LLM_BUDGET_USD=50 ERGON_REAL_LLM_MODEL=openai:gpt-4o ERGON_REAL_LLM_BENCHMARK=researchrubrics-vanilla ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react ERGON_REAL_LLM_LIMIT=5 uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -v -s --assume-stack-up` +- Status: + - Pytest artifact-dump wrapper passed, but run terminal status is `failed`. 
- Rollout:
+  - Directory: `tests/real_llm/.rollouts/20260426T233920Z-0700b668-a640-49f2-80f9-a5c87bc160a9/`
+  - Run ID: `0700b668-a640-49f2-80f9-a5c87bc160a9`
+  - Row counts: 20 graph nodes, 70 mutations, 257 context events, 5 resources, 5 evaluations, 20 executions.
+  - Run summary: `final_score=0.7134802212615627`, `normalized_score=0.14269604425231255`, `evaluators_count=5`.
+- Findings:
+  - Scheduling fix worked: Inngest logs show dynamic `task/ready` events for child `node_id`s and `task-execute` initialized for those children.
+  - Graph-specialism signal preserved: 5 roots and 15 specialist children.
+  - Returns improved versus prior failed/partial variants: all 5 root tasks completed and all 5 root evaluations landed; 1/5 evaluations passed.
+  - Remaining failure mode: most specialist children started and then failed generically (`Worker execution failed`), causing the overall run to fail even though root reports/evaluations landed.
+  - Root cause for child recursion: the prompt told agents to inspect `task-tree`; child agents can see other level-0 roots in that output, and at least one child incorrectly called `manage add-task`.
+- Prompt fix for next run:
+  - Delegation decision now uses only `workflow("inspect task-workspace --format json")` and `task_workspace.task.level`.
+  - Prompt explicitly says to ignore level-0 tasks shown elsewhere in task-tree.
+  - Non-root specialist children are told not to call `workflow("manage add-task")`, to use at most 2 workflow inspections and 3 `exa_search` calls, and to write `final_output/report.md`.
+- Verification:
+  - Red test first: `uv run pytest tests/unit/state/test_research_rubrics_workers.py::TestResearcherWorker::test_workflow_cli_prompt_uses_current_task_level_for_delegation -q` failed on the missing `task-workspace --format json` instruction.
+  - Green tests: `uv run pytest tests/unit/state/test_research_rubrics_workers.py tests/unit/state/test_workflow_cli_tool.py -q` -> `11 passed`.
+
+## 2026-04-27 00:55 UTC+1 - Prompt Hillclimb Variant 1f
+
+- Intent:
+  - Same GPT-4o specialist prompt, but with delegation keyed to the current task workspace rather than global task-tree rows.
+- Command:
+  - `ERGON_REAL_LLM=1 ERGON_REAL_LLM_BUDGET_USD=50 ERGON_REAL_LLM_MODEL=openai:gpt-4o ERGON_REAL_LLM_BENCHMARK=researchrubrics-vanilla ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react ERGON_REAL_LLM_LIMIT=5 uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -v -s --assume-stack-up`
+- Status:
+  - Pytest artifact-dump wrapper passed, but the run's terminal status is `failed`.
+- Rollout:
+  - Directory: `tests/real_llm/.rollouts/20260426T234424Z-7fc055f5-03c3-4cab-8117-04e844696482/`
+  - Run ID: `7fc055f5-03c3-4cab-8117-04e844696482`
+  - Row counts: 20 graph nodes, 70 mutations, 235 context events, 5 resources, 5 evaluations, 20 executions.
+  - Run summary: `final_score=0.7597894539417135`, `normalized_score=0.1519578907883427`, `evaluators_count=5`.
+- Findings:
+  - Best overnight evidence so far.
+  - Graph-specialism signal: 5 roots created exactly 15 specialist children; `manage add-task` appears exactly 15 times and no recursive child creation was observed.
+  - Return guardrail: all 5 root tasks completed, all 5 root evaluations landed, and the aggregate normalized score improved slightly over 1e (`0.1519578907883427` vs `0.14269604425231255`).
+  - Specialist execution improved but remains noisy: 5/15 children completed, 10/15 failed with generic `Worker execution failed`, so the run-level status is still `failed`.
+  - This supports the RQ1 story: the scalar terminal status is poor, but the rollout card exposes a stable specialist-delegation pattern, role-specific child descriptions, root report completion, and recoverable child-worker behavior.
+- Backend harness endpoint check:
+  - `GET http://127.0.0.1:9000/api/test/read/run/7fc055f5-03c3-4cab-8117-04e844696482/state` returned HTTP 200 with `status=failed`, `graph_nodes=20`, `mutations=70`, `evaluations=5`, `executions=20`, `resource_count=5`, `context_event_count=235`.
+  - The same path on dashboard port `3001` returned 404; the harness route is backend API, not dashboard.
+
+## Morning Handoff Notes
+
+- Best variant: Prompt Hillclimb Variant 1f.
+- Best artifact path: `tests/real_llm/.rollouts/20260426T234424Z-7fc055f5-03c3-4cab-8117-04e844696482/`
+- Candidate RQ1 headline evidence:
+  - Returns/status alone: the run is `failed`, but all 5 root tasks completed and all 5 evaluations landed.
+  - Rollout-card structure: 5 root tasks, exactly 15 specialist child tasks, 70 graph mutations, 235 context events.
+  - Specialism behavior: root tasks consistently decomposed into source-scout, rubric-checker/compliance, and synthesis-reviewer roles.
+  - Cross-community analysis hook: this single rollout card supports post-hoc role-diversity / worker-specialism measurements that are invisible in terminal status or scalar return.
+- Main residual issue:
+  - Dynamic specialist children now schedule and some complete, but child failures still propagate run failure. The next core-code direction would be either (a) making advisory child tasks non-fatal for the parent benchmark return, or (b) hardening child-worker prompting/tooling so specialist children reliably write `final_output/report.md`.
+
+## 2026-04-27 10:05 UTC+1 - Model Resolution Refactor
+
+- Intent:
+  - Make Ergon cloud model targets (`openai:*`, `anthropic:*`, `google:*`) route through OpenRouter instead of direct provider APIs or PydanticAI fallback inference.
+- Changes:
+  - Upgraded `pydantic-ai` from `0.7.2` to `0.8.1`, the latest resolvable version in this environment.
+  - Centralized dispatch in `ergon_core/core/providers/generation/model_resolution.py`.
+  - Added `openrouter.py` for OpenRouter-hosted cloud targets and `openai_compatible.py` for `vllm:` plus `openai-compatible:` endpoint targets.
+  - Removed the builtins model-backend registration path and the old `cloud_passthrough.py` / `vllm_backend.py` modules.
+- Note:
+  - The installed PydanticAI version exposes `OpenRouterProvider` but not `OpenRouterModel`; the implementation uses `OpenAIChatModel(..., provider=OpenRouterProvider(...))`, which gives the desired OpenRouter routing semantics.
+
diff --git a/docs/real-llm-rollout-harness.md b/docs/real-llm-rollout-harness.md
index ad49aead..813f6f95 100644
--- a/docs/real-llm-rollout-harness.md
+++ b/docs/real-llm-rollout-harness.md
@@ -78,10 +78,9 @@ Missing for a real-LLM researchrubrics rollout:
  scorer), wired in `registry_data.py:28`.
- **Sandbox**: `ResearchRubricsSandboxManager` — blank E2B sandbox with workspace dirs provisioned. No template file needed.
-- **CLI**: `ergon benchmark run researchrubrics --worker
-  researchrubrics-researcher --evaluator research-rubric --model
-  <model> --limit 1` composes cleanly via `build_experiment`
-  (`ergon_cli/composition/__init__.py:42`).
+- **CLI**: `ergon experiment define researchrubrics --worker
+  researchrubrics-researcher --evaluator research-rubric --model
+  <model> --limit 1` followed by `ergon experiment run <experiment-id>`.
## Keys come from `settings` @@ -195,7 +194,7 @@ async def test_researchrubrics_rollout( "--evaluator", "research-rubric", "--model", os.environ.get( "ERGON_REAL_LLM_MODEL", - "openrouter:anthropic/claude-sonnet-4.6", + "anthropic:claude-sonnet-4.6", ), "--limit", "1", ], @@ -215,16 +214,13 @@ async def test_researchrubrics_rollout( ## Spike results -**1. OpenRouter model routing — works out of the box.** -`pydantic_ai.models.infer_model("openrouter:anthropic/claude-sonnet-4.6")` -resolves to an `OpenAIModel` backed by -`pydantic_ai.providers.openrouter.OpenRouterProvider`. The only -requirement is `OPENROUTER_API_KEY` in the process env, which -`settings.py:82-83` already exports from `settings.openrouter_api_key`. -`resolve_model_target`'s fallback branch passes `openrouter:*` strings -straight through to pydantic-ai. **No backend registration needed.** An -optional one-line `"openrouter": resolve_cloud` entry in -`MODEL_BACKENDS` is nice-to-have for symmetry, not required. +**1. OpenRouter model routing.** +`resolve_model_target("anthropic:claude-sonnet-4.6")` resolves to a +PydanticAI chat model backed by +`pydantic_ai.providers.openrouter.OpenRouterProvider`. Cloud provider +prefixes (`openai:`, `anthropic:`, `google:`) are OpenRouter-hosted in +Ergon; use `OPENROUTER_API_KEY` in the process env and do not route +through direct provider APIs. **2. Exa inside the sandbox — confirmed not wired.** The plumbing exists but nothing populates it: @@ -342,10 +338,9 @@ Files: `write_manifest`, `_rollout_dir`. 2. `tests/real_llm/benchmarks/test_researchrubrics.py` — the 30-line trigger above. -3. *(optional)* `ergon_builtins/registry_core.py` — one-line - `"openrouter": resolve_cloud` entry in `MODEL_BACKENDS` for - symmetry with the other cloud prefixes. Not required — pydantic-ai - handles the prefix natively via the fallback branch. +3. Model targets resolve centrally in `resolve_model_target`; use + provider-prefixed targets such as `anthropic:claude-sonnet-4.6`. + Cloud provider prefixes route through OpenRouter. Estimated effort: **half a day** on top of the pre-work PR. diff --git a/docs/superpowers/plans/2026-04-26-finish-agent-workflow-cli.md b/docs/superpowers/plans/2026-04-26-finish-agent-workflow-cli.md new file mode 100644 index 00000000..23ad8eef --- /dev/null +++ b/docs/superpowers/plans/2026-04-26-finish-agent-workflow-cli.md @@ -0,0 +1,100 @@ +# Finish Agent Workflow CLI Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Finish `ergon workflow` as an agent-facing CLI for task editing and resource copying in one PR off `main`. + +**Architecture:** Extend the already-merged V1 instead of replacing it. Keep scoped reads and mutation policy in `WorkflowService`, command parsing/rendering in `ergon_cli.commands.workflow`, and model-facing scope injection in `workflow_cli_tool`. All commands stay current-run/current-node scoped unless an injected manager-capable context explicitly permits broader graph edits. + +**Tech Stack:** Python, argparse, SQLModel, existing `WorkflowGraphRepository`, existing run graph tables, pydantic DTOs, pytest. + +--- + +## Current Baseline + +Already merged: + +- `ergon workflow ...` top-level command. +- `WorkflowService` with task/resource inspection and `materialize_resource`. 
+- `workflow(command)` pydantic-ai wrapper with injected run/node/execution/sandbox scope. +- ResearchRubrics workflow worker registration. + +## Implementation Tasks + +### Task 1: Real Task Editing Commands + +**Files:** +- Modify: `ergon_core/ergon_core/core/runtime/services/workflow_dto.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/workflow_service.py` +- Modify: `ergon_cli/ergon_cli/commands/workflow.py` +- Test: `tests/unit/runtime/test_workflow_service.py` +- Test: `tests/unit/cli/test_workflow_cli.py` + +- [ ] Add a `WorkflowMutationRef` DTO with `action`, `dry_run`, `node`, `edge`, `message`, and `suggested_commands`. +- [ ] Add service methods for `add_task`, `add_edge`, `update_task_description`, `restart_task`, and `abandon_task`. +- [ ] Use `WorkflowGraphRepository` for graph writes and mutation logging. +- [ ] Keep `--dry-run` behavior identical to real command validation but without writes. +- [ ] Add CLI parser arguments for task slug, description, worker, source/target, and status fields. +- [ ] Add text and JSON renderers for mutation results. +- [ ] Verify with focused unit tests before moving on. + +### Task 2: Resource Copying Completion + +**Files:** +- Modify: `ergon_core/ergon_core/core/runtime/services/workflow_dto.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/workflow_service.py` +- Modify: `ergon_cli/ergon_cli/commands/workflow.py` +- Test: `tests/unit/runtime/test_workflow_service.py` +- Test: `tests/unit/cli/test_workflow_cli.py` + +- [ ] Add `inspect resource-location`. +- [ ] Add `inspect task-workspace`. +- [ ] Harden `materialize-resource` destination handling: reject absolute paths, `..`, and paths outside `/workspace`. +- [ ] Preserve source resource bytes and row unchanged. +- [ ] Ensure copied resource rows use `RunResourceKind.IMPORT`, `copied_from_resource_id`, and metadata with source resource, source task, and sandbox destination. +- [ ] Add JSON/text outputs for resource location and task workspace. +- [ ] Verify with unit tests and one integration-style sandbox-manager-injected test. + +### Task 3: Agent Wrapper Permissions + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py` +- Test: `tests/unit/state/test_workflow_cli_tool.py` + +- [ ] Add a permission mode to the wrapper: leaf agents can inspect and materialize visible resources; manager-capable agents can use graph edit commands. +- [ ] Reject user-supplied scope/context flags before command execution. +- [ ] Reject multiline commands. +- [ ] Return structured, model-readable failure strings instead of leaking tracebacks. +- [ ] Verify wrapper tests for allowed inspect, allowed materialize, denied graph edit, and allowed manager graph edit. + +### Task 4: Acceptance Coverage + +**Files:** +- Modify: existing smoke fixture workers only as needed. +- Modify: existing E2E assertions only as needed. +- Test: focused unit tests plus existing smoke tests. + +- [ ] Ensure one deterministic no-LLM smoke path calls `workflow("inspect task-tree")`. +- [ ] Ensure one deterministic no-LLM smoke path calls `workflow("inspect resource-list --scope input")`. +- [ ] Ensure one deterministic no-LLM smoke path dry-runs `manage materialize-resource`. +- [ ] Keep real-LLM rollout optional, using `researchrubrics-workflow-cli-react`. +- [ ] Run focused workflow tests, Python unit tests touched by runtime changes, frontend contract generation if schemas change, and CI-fast-compatible checks. 
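+A hedged sketch of the permission gate Task 3 describes (the command grammar, flag names, and `manager_capable` flag are assumptions drawn from this plan and the changelog, not the shipped wrapper):
+
+```python
+import shlex
+
+GRAPH_EDIT_ACTIONS = {"add-task", "add-edge", "update-task-description",
+                      "restart-task", "abandon-task"}
+INJECTED_SCOPE_FLAGS = {"--run-id", "--node-id", "--execution-id", "--sandbox-id"}
+
+
+def check_command(command: str, *, manager_capable: bool) -> str | None:
+    """Return a model-readable error string, or None if the command may run."""
+    if "\n" in command:
+        return "error: multiline commands are not allowed"
+    tokens = shlex.split(command)
+    if any(token in INJECTED_SCOPE_FLAGS for token in tokens):
+        return "error: scope/context flags are injected, not user-supplied"
+    if len(tokens) >= 2 and tokens[0] == "manage" and tokens[1] in GRAPH_EDIT_ACTIONS:
+        if not manager_capable:
+            return "error: graph edits require a manager-capable context"
+    return None
+```
+
+Leaf agents keep inspect and materialize access under this gate; only the manager-capable path unlocks graph edits.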
+ +## Verification Commands + +Run incrementally: + +```bash +uv run pytest tests/unit/runtime/test_workflow_service.py -v +uv run pytest tests/unit/cli/test_workflow_cli.py -v +uv run pytest tests/unit/state/test_workflow_cli_tool.py -v +``` + +Before PR: + +```bash +uv run pytest tests/unit/runtime/test_workflow_service.py tests/unit/cli/test_workflow_cli.py tests/unit/state/test_workflow_cli_tool.py -v +uv run pytest tests/unit/runtime tests/unit/cli tests/unit/state -q +pnpm --dir ergon-dashboard run typecheck +``` + diff --git a/ergon-dashboard/scripts/generate-rest-contracts.mjs b/ergon-dashboard/scripts/generate-rest-contracts.mjs index 558239b9..24745ab7 100644 --- a/ergon-dashboard/scripts/generate-rest-contracts.mjs +++ b/ergon-dashboard/scripts/generate-rest-contracts.mjs @@ -9,7 +9,13 @@ const contractsPath = path.resolve(__dirname, "../src/generated/rest/contracts.t const source = readFileSync(contractsPath, "utf8") .replace('import { makeApi, Zodios, type ZodiosOptions } from "@zodios/core";\n', "") // openapi-zod-client generates z.record(V) but Zod requires z.record(K, V). - .replace(/z\.record\((?!z\.string\(\))/g, "z.record(z.string(), "); + .replace(/z\.record\((?!z\.string\(\))/g, "z.record(z.string(), ") + // Recursive JSON schemas must be lazy or the generated module dereferences + // JsonValue_Input before it has been initialized. + .replace( + /const JsonValue_(Input|Output): z\.ZodType = z\.union\(\[\n([\s\S]*?)\n\]\);/g, + "const JsonValue_$1: z.ZodType = z.lazy(() => z.union([\n$2\n]));", + ); const endpointMarker = "\nconst endpoints = makeApi(["; const markerIndex = source.indexOf(endpointMarker); diff --git a/ergon-dashboard/src/app/api/cohorts/[cohortId]/route.ts b/ergon-dashboard/src/app/api/cohorts/[cohortId]/route.ts index 74f496bf..723236c5 100644 --- a/ergon-dashboard/src/app/api/cohorts/[cohortId]/route.ts +++ b/ergon-dashboard/src/app/api/cohorts/[cohortId]/route.ts @@ -2,7 +2,6 @@ import { NextResponse } from "next/server"; import { config } from "@/lib/config"; import { - parseCohortDetail, parseCohortSummary, parseUpdateCohortRequest, } from "@/lib/contracts/rest"; @@ -30,7 +29,7 @@ export async function GET(_request: Request, context: RouteContext) { const response = await fetchErgonApi(`/cohorts/${cohortId}`); const body = await response.json(); if (response.ok) { - return NextResponse.json(parseCohortDetail(body), { status: response.status }); + return NextResponse.json(body, { status: response.status }); } return NextResponse.json(body, { status: response.status }); } catch (error) { diff --git a/ergon-dashboard/src/app/cohorts/[cohortId]/page.tsx b/ergon-dashboard/src/app/cohorts/[cohortId]/page.tsx index c27dd0ca..4c0b14d7 100644 --- a/ergon-dashboard/src/app/cohorts/[cohortId]/page.tsx +++ b/ergon-dashboard/src/app/cohorts/[cohortId]/page.tsx @@ -1,7 +1,7 @@ -import { CohortDetailView } from "@/components/cohorts/CohortDetailView"; -import { parseCohortDetail } from "@/lib/contracts/rest"; +import { CohortExperimentDetailView } from "@/components/cohorts/CohortExperimentDetailView"; +import { config } from "@/lib/config"; import { fetchErgonApi } from "@/lib/serverApi"; -import type { CohortDetail } from "@/lib/types"; +import { getHarnessCohort } from "@/lib/testing/dashboardHarness"; interface CohortPageProps { params: Promise<{ @@ -11,16 +11,20 @@ interface CohortPageProps { export default async function CohortPage({ params }: CohortPageProps) { const { cohortId } = await params; - let initialDetail: CohortDetail | null = null; + let 
initialDetail = null; try { - const response = await fetchErgonApi(`/cohorts/${cohortId}`); - if (response.ok) { - initialDetail = parseCohortDetail(await response.json()); + if (config.enableTestHarness) { + initialDetail = getHarnessCohort(cohortId); + } else { + const response = await fetchErgonApi(`/cohorts/${cohortId}`); + if (response.ok) { + initialDetail = await response.json(); + } } } catch { initialDetail = null; } - return ; + return ; } diff --git a/ergon-dashboard/src/app/cohorts/[cohortId]/runs/[runId]/page.tsx b/ergon-dashboard/src/app/cohorts/[cohortId]/runs/[runId]/page.tsx index 6bc16e39..86d038e2 100644 --- a/ergon-dashboard/src/app/cohorts/[cohortId]/runs/[runId]/page.tsx +++ b/ergon-dashboard/src/app/cohorts/[cohortId]/runs/[runId]/page.tsx @@ -1,9 +1,9 @@ import { RunWorkspacePage } from "@/components/run/RunWorkspacePage"; import { config } from "@/lib/config"; -import { parseCohortDetail, parseRunSnapshot } from "@/lib/contracts/rest"; +import { parseRunSnapshot } from "@/lib/contracts/rest"; import { fetchErgonApi } from "@/lib/serverApi"; -import { getHarnessCohort, getHarnessRun } from "@/lib/testing/dashboardHarness"; -import type { CohortDetail, SerializedWorkflowRunState } from "@/lib/types"; +import { getHarnessRun } from "@/lib/testing/dashboardHarness"; +import type { SerializedWorkflowRunState } from "@/lib/types"; interface CohortRunPageProps { params: Promise<{ @@ -15,26 +15,18 @@ interface CohortRunPageProps { export default async function CohortRunPage({ params }: CohortRunPageProps) { const { cohortId, runId } = await params; let initialRunState: SerializedWorkflowRunState | null = null; - let initialCohortDetail: CohortDetail | null = null; let ssrError: string | null = null; if (config.enableTestHarness) { initialRunState = getHarnessRun(runId); - initialCohortDetail = getHarnessCohort(cohortId); } else { try { - const [runResponse, cohortResponse] = await Promise.all([ - fetchErgonApi(`/runs/${runId}`), - fetchErgonApi(`/cohorts/${cohortId}`), - ]); + const runResponse = await fetchErgonApi(`/runs/${runId}`); if (runResponse.ok) { initialRunState = parseRunSnapshot(await runResponse.json()); } else { ssrError = `Run API returned ${runResponse.status}`; } - if (cohortResponse.ok) { - initialCohortDetail = parseCohortDetail(await cohortResponse.json()); - } } catch (e) { const msg = e instanceof Error ? e.message : String(e); console.error(`[CohortRunPage] SSR fetch failed for run ${runId}:`, msg); @@ -42,7 +34,6 @@ export default async function CohortRunPage({ params }: CohortRunPageProps) { ? "Stale build — the .next cache is corrupted. Restart the dev server (rm -rf .next && docker compose restart dashboard)." 
: `Server-side data fetch failed: ${msg}`; initialRunState = null; - initialCohortDetail = null; } } @@ -51,7 +42,6 @@ export default async function CohortRunPage({ params }: CohortRunPageProps) { cohortId={cohortId} runId={runId} initialRunState={initialRunState} - initialCohortDetail={initialCohortDetail} ssrError={ssrError} /> ); diff --git a/ergon-dashboard/src/app/cohorts/page.tsx b/ergon-dashboard/src/app/cohorts/page.tsx new file mode 100644 index 00000000..70ec99cf --- /dev/null +++ b/ergon-dashboard/src/app/cohorts/page.tsx @@ -0,0 +1,5 @@ +import { CohortListView } from "@/components/cohorts/CohortListView"; + +export default function CohortsPage() { + return ; +} diff --git a/ergon-dashboard/src/app/experiments/[experimentId]/page.tsx b/ergon-dashboard/src/app/experiments/[experimentId]/page.tsx new file mode 100644 index 00000000..b03eb64d --- /dev/null +++ b/ergon-dashboard/src/app/experiments/[experimentId]/page.tsx @@ -0,0 +1,300 @@ +import Link from "next/link"; +import { notFound } from "next/navigation"; + +import { config } from "@/lib/config"; +import { fetchErgonApi } from "@/lib/serverApi"; +import { getHarnessExperiment } from "@/lib/testing/dashboardHarness"; + +interface ExperimentRunRow { + run_id: string; + workflow_definition_id: string; + benchmark_type: string; + instance_key: string; + status: string; + created_at: string; + started_at: string | null; + completed_at: string | null; + model_target: string | null; + evaluator_slug: string | null; + worker_team: Record; + seed: number | null; + running_time_ms: number | null; + final_score: number | null; + total_tasks: number | null; + total_cost_usd: number | null; + error_message: string | null; +} + +interface ExperimentStatusCounts { + pending: number; + executing: number; + evaluating: number; + completed: number; + failed: number; + cancelled: number; +} + +interface ExperimentAnalytics { + total_runs: number; + status_counts: ExperimentStatusCounts; + average_score: number | null; + average_duration_ms: number | null; + average_tasks: number | null; + total_cost_usd: number | null; + latest_activity_at: string | null; + error_count: number; +} + +interface ExperimentDetail { + experiment: { + experiment_id: string; + cohort_id: string | null; + name: string; + benchmark_type: string; + sample_count: number; + status: string; + default_model_target: string | null; + default_evaluator_slug: string | null; + default_worker_team: Record; + created_at: string; + started_at: string | null; + completed_at: string | null; + run_count: number; + }; + runs: ExperimentRunRow[]; + analytics: ExperimentAnalytics; + sample_selection: Record; + design: Record; + metadata: Record; +} + +interface ExperimentPageProps { + params: Promise<{ experimentId: string }>; +} + +function formatNumber(value: number | null | undefined, fallback = "—") { + if (value === null || value === undefined) return fallback; + return Number.isInteger(value) ? 
value.toString() : value.toFixed(2); +} + +function formatCurrency(value: number | null | undefined) { + if (value === null || value === undefined) return "—"; + return `$${value.toFixed(2)}`; +} + +function formatDuration(ms: number | null | undefined) { + if (ms === null || ms === undefined) return "—"; + if (ms < 1000) return `${ms}ms`; + const seconds = ms / 1000; + if (seconds < 60) return `${seconds.toFixed(1)}s`; + return `${(seconds / 60).toFixed(1)}m`; +} + +function formatDate(value: string | null | undefined) { + if (!value) return "—"; + return new Date(value).toLocaleString(); +} + +function workerTeamLabel(workerTeam: Record) { + const entries = Object.entries(workerTeam); + if (entries.length === 0) return "—"; + return entries.map(([key, value]) => `${key}: ${String(value)}`).join(", "); +} + +function runLink(runId: string, cohortId: string | null) { + if (cohortId) return `/cohorts/${cohortId}/runs/${runId}`; + return `/run/${runId}`; +} + +export default async function ExperimentPage({ params }: ExperimentPageProps) { + const { experimentId } = await params; + let detail: ExperimentDetail | null = null; + if (config.enableTestHarness) { + detail = getHarnessExperiment(experimentId) as ExperimentDetail | null; + if (detail === null) notFound(); + } else { + const response = await fetchErgonApi(`/experiments/${experimentId}`); + if (response.status === 404) notFound(); + if (!response.ok) { + throw new Error(`Failed to load experiment ${experimentId}: ${response.status}`); + } + detail = (await response.json()) as ExperimentDetail; + } + + const experiment = detail.experiment; + const analytics = detail.analytics; + + return ( +
+
+
+ + {experiment.cohort_id ? "Cohort" : "Experiments"} + +

+ {experiment.name} +

+

+ {experiment.benchmark_type} · {experiment.sample_count} samples ·{" "} + {experiment.run_count} runs · latest activity {formatDate(analytics.latest_activity_at)} +

+
+
+ {experiment.status} +
+
+ +
+
+
Model
+
{experiment.default_model_target ?? "—"}
+
+
+
Evaluator
+
{experiment.default_evaluator_slug ?? "—"}
+
+
+
Worker team
+
+ {workerTeamLabel(experiment.default_worker_team)} +
+
+
+
Samples
+
+ {Array.isArray(detail.sample_selection.instance_keys) + ? detail.sample_selection.instance_keys.join(", ") + : experiment.sample_count} +
+
+
+ +
+
+
Score
+
+ {formatNumber(analytics.average_score)} +
+
average completed-run score
+
+
+
Runs
+
+ {analytics.status_counts.completed}/{analytics.total_runs} +
+
+ {analytics.status_counts.failed} failed ·{" "} + {analytics.status_counts.executing + analytics.status_counts.evaluating} active +
+
+
+
Runtime
+
+ {formatDuration(analytics.average_duration_ms)} +
+
+ {formatNumber(analytics.average_tasks)} avg tasks +
+
+
+
Cost
+
+ {formatCurrency(analytics.total_cost_usd)} +
+
+ {analytics.error_count} runs with errors +
+
+
+ +
+
+
+

Run distribution

+

+ Score and runtime by benchmark instance. +

+
+
+
+ {detail.runs.map((run) => ( +
+
+ {run.instance_key} + {run.status} +
+
+ score {formatNumber(run.final_score)} · runtime {formatDuration(run.running_time_ms)} +
+
+ ))} +
+
+ +
+ + + + + + + + + + + + + + + {detail.runs.map((run) => ( + + + + + + + + + + + ))} + {detail.runs.length === 0 ? ( + + + + ) : null} + +
RunSampleStatusDurationScoreTasksModelEvaluator
+ + {run.run_id} + + {run.instance_key} +
{run.status}
+ {run.error_message ?
{run.error_message}
: null} +
+ {formatDuration(run.running_time_ms)} + + {run.final_score === null ? "—" : `Eval ${formatNumber(run.final_score)}`} + {run.total_tasks ?? "—"}{run.model_target ?? "—"}{run.evaluator_slug ?? "—"}
+ This experiment has not launched any runs yet. +
+
+
+ ); +} diff --git a/ergon-dashboard/src/app/experiments/page.tsx b/ergon-dashboard/src/app/experiments/page.tsx new file mode 100644 index 00000000..b4bb4ffb --- /dev/null +++ b/ergon-dashboard/src/app/experiments/page.tsx @@ -0,0 +1,97 @@ +import Link from "next/link"; + +import { fetchErgonApi } from "@/lib/serverApi"; + +interface ExperimentSummary { + experiment_id: string; + cohort_id: string | null; + name: string; + benchmark_type: string; + sample_count: number; + status: string; + default_model_target: string | null; + default_evaluator_slug: string | null; + created_at: string; + run_count: number; +} + +export default async function ExperimentsPage() { + let experiments: ExperimentSummary[] = []; + let error: string | null = null; + + try { + const response = await fetchErgonApi("/experiments?limit=100"); + if (response.ok) { + experiments = (await response.json()) as ExperimentSummary[]; + } else { + error = `API returned ${response.status}`; + } + } catch (err) { + error = err instanceof Error ? err.message : "Failed to load experiments"; + } + + return ( +
+
+

+ Experiment Index +

+

+ Experiments +

+

+ One experiment is a launched design; each row can own multiple workflow runs. +

+
+ + {error ? ( +
+ {error} +
+ ) : null} + +
+ + + + + + + + + + + + + {experiments.map((experiment) => ( + + + + + + + + + ))} + {experiments.length === 0 ? ( + + + + ) : null} + +
NameBenchmarkSamplesRunsStatusModel
+ + {experiment.name} + + {experiment.benchmark_type}{experiment.sample_count}{experiment.run_count}{experiment.status} + {experiment.default_model_target ?? "—"} +
+ No experiments yet. +
+
+
+ ); +} diff --git a/ergon-dashboard/src/components/cohorts/CohortDetailView.tsx b/ergon-dashboard/src/components/cohorts/CohortDetailView.tsx deleted file mode 100644 index c071391d..00000000 --- a/ergon-dashboard/src/components/cohorts/CohortDetailView.tsx +++ /dev/null @@ -1,498 +0,0 @@ -"use client"; - -import Link from "next/link"; -import { useState } from "react"; - -import { useCohortDetail } from "@/hooks/useCohortDetail"; -import { CohortRunRow, CohortSummary, RunLifecycleStatus } from "@/lib/types"; -import { StatusBadge } from "@/components/common/StatusBadge"; -import { getCohortDisplayStatus } from "@/lib/cohortStatus"; -import { CohortDetail } from "@/lib/types"; -import { formatDurationMs } from "@/lib/formatDuration"; - -function formatScore(score: number | null | undefined): string { - if (score == null) return "—"; - return `${(score * 100).toFixed(1)}%`; -} - -function formatCost(value: number | null): string { - if (value == null) return "—"; - return `$${value.toFixed(2)}`; -} - -const startedAtDisplayFormatter = new Intl.DateTimeFormat(undefined, { - dateStyle: "medium", - timeStyle: "short", -}); - -function formatStartedAt(iso: string | null): { text: string; dateTime: string | null } { - if (iso == null || iso === "") return { text: "—", dateTime: null }; - const d = new Date(iso); - if (Number.isNaN(d.getTime())) return { text: "—", dateTime: null }; - return { text: startedAtDisplayFormatter.format(d), dateTime: iso }; -} - -/* ────────────────────────────────────────────────────────── */ -/* Metric Tiles */ -/* ────────────────────────────────────────────────────────── */ - -function MetricTile({ - title, - value, - sub, - children, -}: { - title: string; - value: string; - sub?: string; - children?: React.ReactNode; -}) { - return ( -
-
- {title} -
-
- {value} -
- {sub && ( -
{sub}
- )} - {children} -
- ); -} - -interface CohortDetailStats { - averageCostUsd: number | null; - averageTasks: number | null; - completed: number; - failed: number; - scores: number[]; - totalCostUsd: number | null; - totalRuns: number; -} - -function buildDetailStats(summary: CohortSummary, runs: CohortRunRow[]): CohortDetailStats { - const totalRuns = runs.length || summary.total_runs; - const completed = - runs.length > 0 - ? runs.filter((run) => run.status === "completed").length - : summary.status_counts.completed; - const failed = - runs.length > 0 - ? runs.filter((run) => run.status === "failed").length - : summary.status_counts.failed; - const scores = runs - .map((run) => run.final_score) - .filter((score): score is number => score !== null); - const taskCounts = runs - .map((run) => run.total_tasks) - .filter((count): count is number => count !== null); - const costs = runs - .map((run) => run.total_cost_usd) - .filter((cost): cost is number => cost !== null); - const totalCostUsd = costs.length > 0 ? costs.reduce((sum, cost) => sum + cost, 0) : null; - - return { - averageCostUsd: costs.length > 0 && totalCostUsd !== null ? totalCostUsd / costs.length : null, - averageTasks: - taskCounts.length > 0 - ? taskCounts.reduce((sum, count) => sum + count, 0) / taskCounts.length - : null, - completed, - failed, - scores, - totalCostUsd, - totalRuns, - }; -} - -function ResolutionTile({ stats }: { stats: CohortDetailStats }) { - const total = stats.totalRuns; - const completed = stats.completed; - const pct = total > 0 ? Math.round((completed / total) * 100) : 0; - - return ( - - ); -} - -function RunsPassFailTile({ stats }: { stats: CohortDetailStats }) { - const completed = stats.completed; - const failed = stats.failed; - const total = stats.totalRuns; - const greenPct = total > 0 ? (completed / total) * 100 : 0; - const redPct = total > 0 ? (failed / total) * 100 : 0; - - return ( - -
- {greenPct > 0 && ( -
- )} - {redPct > 0 && ( -
- )} -
- - ); -} - -type DistributionMetric = "score" | "runtime" | "tasks" | "cost"; - -const distributionMetrics: Array<{ key: DistributionMetric; label: string }> = [ - { key: "score", label: "Score" }, - { key: "runtime", label: "Runtime" }, - { key: "tasks", label: "Tasks" }, - { key: "cost", label: "Cost" }, -]; - -function metricValue(run: CohortRunRow, metric: DistributionMetric): number | null { - switch (metric) { - case "score": - return run.final_score; - case "runtime": - return run.running_time_ms; - case "tasks": - return run.total_tasks; - case "cost": - return run.total_cost_usd; - } -} - -function formatMetricValue(metric: DistributionMetric, value: number): string { - switch (metric) { - case "score": - return formatScore(value); - case "runtime": - return formatDurationMs(value); - case "tasks": - return value.toFixed(0); - case "cost": - return formatCost(value); - } -} - -function RunDistribution({ cohortId, runs }: { cohortId: string; runs: CohortRunRow[] }) { - const [selectedMetric, setSelectedMetric] = useState("score"); - const selectedLabel = - distributionMetrics.find((metric) => metric.key === selectedMetric)?.label ?? "Score"; - const points = runs - .map((run, index) => ({ - index, - run, - value: metricValue(run, selectedMetric), - })) - .filter((point): point is { index: number; run: CohortRunRow; value: number } => point.value !== null); - const values = points.map((point) => point.value); - const min = selectedMetric === "score" ? 0 : Math.min(...values); - const max = selectedMetric === "score" ? 1 : Math.max(...values); - - function leftPct(value: number): number { - if (values.length === 0 || min === max) return 50; - return ((value - min) / (max - min)) * 100; - } - - return ( -
-
-
-

- {selectedLabel} distribution -

-

- One dot per run. Use the metric controls to spot slow, costly, or unusually large runs. -

-
-
- {distributionMetrics.map((metric) => ( - - ))} -
-
- - {points.length === 0 ? ( -
- No {selectedLabel.toLowerCase()} values are available yet. -
- ) : ( -
-
-
- {points.map((point) => { - const valueLabel = formatMetricValue(selectedMetric, point.value); - return ( - - - {point.run.run_id} {selectedLabel} {valueLabel} - - - ); - })} -
-
- {formatMetricValue(selectedMetric, min)} - {points.length} run{points.length === 1 ? "" : "s"} - {formatMetricValue(selectedMetric, max)} -
-
- )} -
- ); -} - -/* ────────────────────────────────────────────────────────── */ -/* Run Row */ -/* ────────────────────────────────────────────────────────── */ - -function CohortRunRowCard({ cohortId, run }: { cohortId: string; run: CohortRunRow }) { - const started = formatStartedAt(run.started_at); - - return ( - -
-
- - {run.run_id} - - -
-
- {run.cohort_name} - - {run.run_id.slice(0, 8)}... -
- {run.error_message && ( -
- {run.error_message} -
- )} -
- -
-
Benchmark
-
{run.cohort_name}
-
-
-
Status
-
{run.status}
-
-
-
Started
-
- {started.dateTime ? ( - - ) : ( - started.text - )} -
-
-
-
Runtime
-
- {formatDurationMs(run.running_time_ms)} -
-
-
-
Score
-
- {formatScore(run.final_score)} -
-
- - ); -} - -/* ────────────────────────────────────────────────────────── */ -/* Empty State */ -/* ────────────────────────────────────────────────────────── */ - -function EmptyRunsState() { - return ( -
- -

No runs yet

-

- This cohort has no runs. Launch a benchmark run targeting this cohort to get started. -

- -
- ); -} - -/* ────────────────────────────────────────────────────────── */ -/* Main View */ -/* ────────────────────────────────────────────────────────── */ - -export function CohortDetailView({ - cohortId, - initialDetail = null, -}: { - cohortId: string; - initialDetail?: CohortDetail | null; -}) { - const { detail, isLoading, error } = useCohortDetail(cohortId, initialDetail); - - if (isLoading) { - return ( -
- Loading cohort... -
- ); - } - - if (!detail) { - return ( -
- {error ?? "Cohort not found"} -
- ); - } - - const { summary, runs } = detail; - const stats = buildDetailStats(summary, runs); - - return ( -
-
-
- - Cohorts - -
-
-
-

- {summary.name} -

- -
-

- {summary.description ?? - "Monitor cohort progress, inspect runs, and drill into task-level evidence."} -

-
-
-
-
- -
- {error && ( -
- {error} -
- )} - - {/* 5-tile summary row */} -
- - - - - -
- - - - {/* Runs section */} -
-
-
-

Runs

-

- Select a run to inspect graph topology and task workspace evidence. -

-
-
- {runs.length === 0 ? ( - - ) : ( -
- {runs.map((run) => ( - - ))} -
- )} -
-
-
- ); -} diff --git a/ergon-dashboard/src/components/cohorts/CohortExperimentDetailView.tsx b/ergon-dashboard/src/components/cohorts/CohortExperimentDetailView.tsx new file mode 100644 index 00000000..a26957d2 --- /dev/null +++ b/ergon-dashboard/src/components/cohorts/CohortExperimentDetailView.tsx @@ -0,0 +1,217 @@ +"use client"; + +import Link from "next/link"; + +interface StatusCounts { + pending: number; + executing: number; + evaluating: number; + completed: number; + failed: number; +} + +interface CohortExperimentRow { + experiment_id: string; + name: string; + benchmark_type: string; + sample_count: number; + total_runs: number; + status_counts: StatusCounts; + status: string; + created_at: string; + default_model_target: string | null; + default_evaluator_slug: string | null; + final_score: number | null; + total_cost_usd: number | null; + error_message: string | null; +} + +interface CohortExperimentDetail { + summary: { + cohort_id: string; + name: string; + description: string | null; + created_by: string | null; + created_at: string; + status: string; + total_runs: number; + average_score: number | null; + average_duration_ms: number | null; + }; + experiments: CohortExperimentRow[]; +} + +function formatNumber(value: number | null | undefined, fallback = "—") { + if (value === null || value === undefined) return fallback; + return Number.isInteger(value) ? value.toString() : value.toFixed(2); +} + +function formatCurrency(value: number | null | undefined) { + if (value === null || value === undefined) return "—"; + return `$${value.toFixed(2)}`; +} + +function formatDuration(ms: number | null | undefined) { + if (ms === null || ms === undefined) return "—"; + if (ms < 1000) return `${ms}ms`; + const seconds = ms / 1000; + if (seconds < 60) return `${seconds.toFixed(1)}s`; + return `${(seconds / 60).toFixed(1)}m`; +} + +function statusSummary(counts: StatusCounts) { + return `${counts.completed} done · ${counts.failed} failed · ${ + counts.executing + counts.evaluating + counts.pending + } active`; +} + +function latestExperimentActivity(experiments: CohortExperimentRow[]) { + const latest = experiments + .map((experiment) => Date.parse(experiment.created_at)) + .filter(Number.isFinite) + .sort((a, b) => b - a)[0]; + return latest ? new Date(latest).toLocaleString() : "—"; +} + +function totalExperimentCost(experiments: CohortExperimentRow[]) { + const costs = experiments + .map((experiment) => experiment.total_cost_usd) + .filter((cost): cost is number => cost !== null); + if (costs.length === 0) return null; + return costs.reduce((total, cost) => total + cost, 0); +} + +export function CohortExperimentDetailView({ + detail, +}: { + detail: CohortExperimentDetail | null; +}) { + if (detail === null) { + return ( +
+
+ Cohort not found. +
+
+ ); + } + + const totalCost = totalExperimentCost(detail.experiments); + + return ( +
+
+ + Cohorts + +

+ {detail.summary.name} +

+

+ {detail.summary.description ?? "Project folder"} · created by{" "} + {detail.summary.created_by ?? "unknown"} · latest activity{" "} + {latestExperimentActivity(detail.experiments)} +

+
+ +
+
+
+ Experiments +
+
+ {detail.experiments.length} +
+
+ {detail.summary.total_runs} total runs +
+
+
+
+ Score / runtime +
+
+ {formatNumber(detail.summary.average_score)} +
+
+ avg score · {formatDuration(detail.summary.average_duration_ms)} avg runtime +
+
+
+
Cost
+
+ {formatCurrency(totalCost)} +
+
+ from experiments with persisted cost +
+
+
+ +
+ + + + + + + + + + + + + + + + {detail.experiments.map((experiment) => ( + + + + + + + + + + + + ))} + {detail.experiments.length === 0 ? ( + + + + ) : null} + +
ExperimentBenchmarkSamplesRunsStatusScoreCostEvaluatorModel
+ + {experiment.name} + + {experiment.benchmark_type}{experiment.sample_count}{experiment.total_runs} +
{experiment.status}
+
{statusSummary(experiment.status_counts)}
+ {experiment.error_message ? ( +
{experiment.error_message}
+ ) : null} +
+ {formatNumber(experiment.final_score)} + + {formatCurrency(experiment.total_cost_usd)} + + {experiment.default_evaluator_slug ?? "—"} + + {experiment.default_model_target ?? "—"} +
+ This cohort does not contain any experiments yet. +
+
+
+ ); +} diff --git a/ergon-dashboard/src/components/cohorts/CohortListView.tsx b/ergon-dashboard/src/components/cohorts/CohortListView.tsx index 0dd02222..3431c635 100644 --- a/ergon-dashboard/src/components/cohorts/CohortListView.tsx +++ b/ergon-dashboard/src/components/cohorts/CohortListView.tsx @@ -252,7 +252,7 @@ export function CohortListView() { No cohorts yet

- Start a benchmark run with a compulsory cohort name to create the first cohort.
+ Define an experiment with a cohort name to create the first cohort; cohort names are optional.

) : ( diff --git a/ergon-dashboard/src/components/run/RunWorkspacePage.tsx b/ergon-dashboard/src/components/run/RunWorkspacePage.tsx index 378bda4a..2320e0b5 100644 --- a/ergon-dashboard/src/components/run/RunWorkspacePage.tsx +++ b/ergon-dashboard/src/components/run/RunWorkspacePage.tsx @@ -17,10 +17,9 @@ import { type GraphMutationDto, } from "@/features/graph/contracts/graphMutations"; import { createReplayInitialState, replayToSequence } from "@/features/graph/state/graphMutationReducer"; -import { useCohortDetail } from "@/hooks/useCohortDetail"; import { useRunState } from "@/hooks/useRunState"; import { buildRunEvents } from "@/lib/runEvents"; -import { CohortDetail, RunLifecycleStatus, SerializedWorkflowRunState, TaskStatus } from "@/lib/types"; +import { RunLifecycleStatus, SerializedWorkflowRunState, TaskStatus } from "@/lib/types"; function formatSeconds(value: number | null): string { if (value == null) return "—"; @@ -49,13 +48,11 @@ export function RunWorkspacePage({ runId, cohortId, initialRunState = null, - initialCohortDetail = null, ssrError = null, }: { runId: string; cohortId?: string; initialRunState?: SerializedWorkflowRunState | null; - initialCohortDetail?: CohortDetail | null; ssrError?: string | null; }) { const [selectedTaskId, setSelectedTaskId] = useState(null); @@ -64,7 +61,6 @@ export function RunWorkspacePage({ const [statusFilter, setStatusFilter] = useState(null); const [isStreamOpen, setIsStreamOpen] = useState(false); const { runState, isLoading, error, isSubscribed } = useRunState(runId, initialRunState); - const { detail } = useCohortDetail(cohortId ?? "", initialCohortDetail); // A null snapshot means the graph follows live state; a sequence replays // mutations to that point. @@ -133,11 +129,6 @@ export function RunWorkspacePage({ ); }, [runState, mutations, snapshotSequence]); - const runRow = useMemo(() => { - if (!cohortId || !detail) return null; - return detail.runs.find((run) => run.run_id === runId) ?? null; - }, [cohortId, detail, runId]); - const selectedTask = useMemo(() => { if (!displayState || !selectedTaskId) return null; return displayState.tasks.get(selectedTaskId) ?? null; @@ -265,7 +256,7 @@ export function RunWorkspacePage({ } }, [displayState, selectedTaskId]); - const status = runState?.status ?? runRow?.status ?? "pending"; + const status = runState?.status ?? "pending"; const isInspectorOpen = selectedTaskId !== null; const handleTaskClick = (taskId: string) => { @@ -308,7 +299,7 @@ export function RunWorkspacePage({ >
- Cohorts + Cohorts {cohortId && ( <> @@ -317,7 +308,7 @@ export function RunWorkspacePage({ className="max-w-[180px] truncate hover:text-[var(--ink)]" data-testid="run-breadcrumb-cohort" > - {detail?.summary.name ?? "Cohort"} + Cohort @@ -326,7 +317,7 @@ export function RunWorkspacePage({

- {runState?.name ?? runRow?.run_id ?? "Run"} + {runState?.name ?? runId}

@@ -355,7 +346,7 @@ export function RunWorkspacePage({
Score
- {formatPercent(runState?.finalScore ?? runRow?.final_score ?? null)} + {formatPercent(runState?.finalScore ?? null)}
diff --git a/ergon-dashboard/src/components/shell/Topbar.tsx b/ergon-dashboard/src/components/shell/Topbar.tsx index 01517a1a..94ed2569 100644 --- a/ergon-dashboard/src/components/shell/Topbar.tsx +++ b/ergon-dashboard/src/components/shell/Topbar.tsx @@ -4,7 +4,8 @@ import Link from "next/link"; import { usePathname } from "next/navigation"; const NAV_ITEMS = [ - { label: "Cohorts", href: "/" }, + { label: "Experiments", href: "/experiments" }, + { label: "Cohorts", href: "/cohorts" }, { label: "Runs", href: "/runs" }, { label: "Training", href: "/training" }, { label: "Models", href: "/models" }, @@ -12,9 +13,12 @@ const NAV_ITEMS = [ ] as const; function isActive(href: string, pathname: string): boolean { - if (href === "/") { + if (href === "/cohorts") { return pathname === "/" || pathname.startsWith("/cohorts"); } + if (href === "/experiments") { + return pathname.startsWith("/experiments"); + } if (href === "/runs") { return pathname.startsWith("/run/") || pathname.startsWith("/runs"); } @@ -77,7 +81,7 @@ export function Topbar() { data-testid="topbar-search" > - Search cohorts, runs, tasks… + Search experiments, cohorts, runs, tasks… ⌘K diff --git a/ergon-dashboard/src/generated/rest/contracts.ts b/ergon-dashboard/src/generated/rest/contracts.ts index d013a213..587babcb 100644 --- a/ergon-dashboard/src/generated/rest/contracts.ts +++ b/ergon-dashboard/src/generated/rest/contracts.ts @@ -1,195 +1,464 @@ /* eslint-disable @typescript-eslint/no-empty-object-type */ import { z } from "zod"; -type JsonValue = - | (JsonScalar | Array | {}) - | Array | {}>; +type JsonValue_Input = + | (JsonScalar | Array | {}) + | Array | {}>; type JsonScalar = | (string | number | number | boolean | null) | Array; +type JsonValue_Output = + | (JsonScalar | Array | {}) + | Array | {}>; -const RunTaskDto = z.object({ +const CohortStatusCountsDto = z + .object({ + completed: z.number().int().default(0), + evaluating: z.number().int().default(0), + executing: z.number().int().default(0), + failed: z.number().int().default(0), + pending: z.number().int().default(0), + }) + .partial() + .passthrough(); +const CohortSummaryDto = z + .object({ + average_duration_ms: z.union([z.number(), z.null()]).optional(), + average_score: z.union([z.number(), z.null()]).optional(), + best_score: z.union([z.number(), z.null()]).optional(), + cohort_id: z.string().uuid(), + created_at: z.string().datetime({ offset: true }), + created_by: z.union([z.string(), z.null()]).optional(), + description: z.union([z.string(), z.null()]).optional(), + failure_rate: z.number().optional().default(0), + name: z.string(), + stats_updated_at: z.union([z.string(), z.null()]).optional(), + status: z.string(), + status_counts: CohortStatusCountsDto.optional(), + total_runs: z.number().int().optional().default(0), + worst_score: z.union([z.number(), z.null()]).optional(), + }) + .passthrough(); +const ValidationError = z + .object({ + ctx: z.object({}).partial().passthrough().optional(), + input: z.unknown().optional(), + loc: z.array(z.union([z.string(), z.number()])), + msg: z.string(), + type: z.string(), + }) + .passthrough(); +const HTTPValidationError = z + .object({ detail: z.array(ValidationError) }) + .partial() + .passthrough(); +const CohortExperimentRowDto = z + .object({ + benchmark_type: z.string(), + created_at: z.string().datetime({ offset: true }), + default_evaluator_slug: z.union([z.string(), z.null()]).optional(), + default_model_target: z.union([z.string(), z.null()]).optional(), + error_message: z.union([z.string(), 
z.null()]).optional(), + experiment_id: z.string().uuid(), + final_score: z.union([z.number(), z.null()]).optional(), + name: z.string(), + sample_count: z.number().int(), + status: z.string(), + status_counts: CohortStatusCountsDto.optional(), + total_cost_usd: z.union([z.number(), z.null()]).optional(), + total_runs: z.number().int().optional().default(0), + }) + .passthrough(); +const CohortDetailDto = z + .object({ + experiments: z.array(CohortExperimentRowDto).optional(), + summary: CohortSummaryDto, + }) + .passthrough(); +const ExperimentCohortStatus = z.enum(["active", "archived"]); +const UpdateCohortRequest = z + .object({ status: ExperimentCohortStatus }) + .passthrough(); +const ExperimentSummaryDto = z + .object({ + benchmark_type: z.string(), + cohort_id: z.union([z.string(), z.null()]).optional(), + completed_at: z.union([z.string(), z.null()]).optional(), + created_at: z.string().datetime({ offset: true }), + default_evaluator_slug: z.union([z.string(), z.null()]).optional(), + default_model_target: z.union([z.string(), z.null()]).optional(), + default_worker_team: z.object({}).partial().passthrough().optional(), + experiment_id: z.string().uuid(), + name: z.string(), + run_count: z.number().int().optional().default(0), + sample_count: z.number().int(), + started_at: z.union([z.string(), z.null()]).optional(), + status: z.string(), + }) + .passthrough(); +const JsonScalar = z.union([ + z.string(), + z.number(), + z.number(), + z.boolean(), + z.null(), +]); +const JsonValue_Input: z.ZodType = z.lazy(() => z.union([ + JsonScalar, + z.array(JsonValue_Input), + z.record(z.string(), JsonValue_Input), +])); +const JsonObject_Input = z.record(z.string(), JsonValue_Input); +const ExperimentDefineRequest = z + .object({ + benchmark_slug: z.string(), + cohort_id: z.union([z.string(), z.null()]).optional(), + default_evaluator_slug: z.union([z.string(), z.null()]).optional(), + default_model_target: z.union([z.string(), z.null()]).optional(), + default_worker_team: JsonObject_Input.optional(), + design: JsonObject_Input.optional(), + limit: z.union([z.number(), z.null()]).optional(), + metadata: JsonObject_Input.optional(), + name: z.union([z.string(), z.null()]).optional(), + sample_ids: z.union([z.array(z.string()), z.null()]).optional(), + seed: z.union([z.number(), z.null()]).optional(), + }) + .passthrough(); +const ExperimentDefineResult = z + .object({ + benchmark_type: z.string(), + cohort_id: z.union([z.string(), z.null()]), + experiment_id: z.string().uuid(), + sample_count: z.number().int(), + selected_samples: z.array(z.string()), + }) + .passthrough(); +const ExperimentStatusCountsDto = z + .object({ + cancelled: z.number().int().default(0), + completed: z.number().int().default(0), + evaluating: z.number().int().default(0), + executing: z.number().int().default(0), + failed: z.number().int().default(0), + pending: z.number().int().default(0), + }) + .partial() + .passthrough(); +const ExperimentAnalyticsDto = z + .object({ + average_duration_ms: z.union([z.number(), z.null()]), + average_score: z.union([z.number(), z.null()]), + average_tasks: z.union([z.number(), z.null()]), + error_count: z.number().int().default(0), + latest_activity_at: z.union([z.string(), z.null()]), + status_counts: ExperimentStatusCountsDto, + total_cost_usd: z.union([z.number(), z.null()]), + total_runs: z.number().int().default(0), + }) + .partial() + .passthrough(); +const ExperimentRunRowDto = z + .object({ + benchmark_type: z.string(), + completed_at: z.union([z.string(), 
z.null()]).optional(), + created_at: z.string().datetime({ offset: true }), + error_message: z.union([z.string(), z.null()]).optional(), + evaluator_slug: z.union([z.string(), z.null()]).optional(), + final_score: z.union([z.number(), z.null()]).optional(), + instance_key: z.string(), + model_target: z.union([z.string(), z.null()]).optional(), + run_id: z.string().uuid(), + running_time_ms: z.union([z.number(), z.null()]).optional(), + seed: z.union([z.number(), z.null()]).optional(), + started_at: z.union([z.string(), z.null()]).optional(), + status: z.string(), + total_cost_usd: z.union([z.number(), z.null()]).optional(), + total_tasks: z.union([z.number(), z.null()]).optional(), + worker_team: z.object({}).partial().passthrough().optional(), + workflow_definition_id: z.string().uuid(), + }) + .passthrough(); +const ExperimentDetailDto = z + .object({ + analytics: ExperimentAnalyticsDto.optional(), + design: z.object({}).partial().passthrough().optional(), + experiment: ExperimentSummaryDto, + metadata: z.object({}).partial().passthrough().optional(), + runs: z.array(ExperimentRunRowDto).optional(), + sample_selection: z.object({}).partial().passthrough().optional(), + }) + .passthrough(); +const ExperimentRunRequest = z + .object({ + experiment_id: z.string().uuid(), + timeout_seconds: z.union([z.number(), z.null()]).optional(), + wait: z.boolean().optional().default(true), + }) + .passthrough(); +const run_experiment_experiments__experiment_id__run_post_Body = z.union([ + ExperimentRunRequest, + z.null(), +]); +const ExperimentRunResult = z + .object({ + experiment_id: z.string().uuid(), + run_ids: z.array(z.string().uuid()), + workflow_definition_ids: z.array(z.string().uuid()).optional(), + }) + .passthrough(); +const SubmitRequest = z + .object({ + definition_id: z.string().uuid(), + model_target_override: z.union([z.string(), z.null()]).optional(), + num_episodes: z.number().int().gte(1), + policy_version: z.union([z.number(), z.null()]).optional(), + }) + .passthrough(); +const BatchStatus = z.enum([ + "pending", + "running", + "complete", + "failed", + "cancelled", +]); +const SubmitResponse = z + .object({ + batch_id: z.string().uuid(), + run_ids: z.array(z.string().uuid()), + status: BatchStatus.optional(), + }) + .passthrough(); +const WeightSyncRequest = z + .object({ checkpoint_path: z.string(), model_name: z.string() }) + .passthrough(); +const WeightSyncResponse = z + .object({ success: z.boolean(), vllm_model_loaded: z.string() }) + .passthrough(); +const EpisodeFailure = z + .object({ error: z.string(), run_id: z.string().uuid() }) + .passthrough(); +const Trajectory = z + .object({ + agent_id: z.string(), + completion_ids: z.array(z.number().int()), + env_mask: z.array(z.number().int()), + logprobs: z.array(z.number()), + num_turns: z.number().int(), + prompt_ids: z.array(z.number().int()), + reward: z.number(), + run_id: z.string().uuid(), + }) + .passthrough(); +const PollResponse = z + .object({ + batch_id: z.string().uuid(), + completed: z.number().int().optional().default(0), + failures: z.array(EpisodeFailure).optional(), + status: BatchStatus, + total: z.number().int().optional().default(0), + trajectories: z.array(Trajectory).optional(), + }) + .passthrough(); +const definition_id = z.union([z.string(), z.null()]).optional(); +const TrainingCurvePointDto = z.object({ + benchmarkType: z.union([z.string(), z.null()]).optional(), + createdAt: z.union([z.string(), z.null()]).optional(), + meanScore: z.number(), + runId: z.string(), + step: z.number().int(), +}); 
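+// Illustrative usage (assumed client code, not part of the generated
+// output): responses from the new experiment endpoints can be validated
+// with the schemas above, e.g.
+//   const detail = ExperimentDetailDto.parse(await res.json());
+//   const launched = ExperimentRunResult.parse(await runRes.json());
+// where `res` and `runRes` are hypothetical fetch() responses.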
+const TrainingSessionDto = z.object({ + completedAt: z.union([z.string(), z.null()]).optional(), + experimentDefinitionId: z.string(), + finalLoss: z.union([z.number(), z.null()]).optional(), id: z.string(), - name: z.string(), - description: z.string(), - status: z.string(), - parentId: z.union([z.string(), z.null()]).optional(), - childIds: z.array(z.string()).optional(), - dependsOnIds: z.array(z.string()).optional(), - isLeaf: z.boolean(), - level: z.number().int(), - assignedWorkerId: z.union([z.string(), z.null()]).optional(), - assignedWorkerName: z.union([z.string(), z.null()]).optional(), + modelName: z.string(), + outputDir: z.union([z.string(), z.null()]).optional(), startedAt: z.union([z.string(), z.null()]).optional(), - completedAt: z.union([z.string(), z.null()]).optional(), + status: z.string(), + totalSteps: z.union([z.number(), z.null()]).optional(), }); -const RunResourceDto = z.object({ - id: z.string(), - taskId: z.string(), - taskExecutionId: z.string(), - name: z.string(), - mimeType: z.string(), - filePath: z.string(), - sizeBytes: z.number().int(), - createdAt: z.string().datetime({ offset: true }), +const TrainingMetricDto = z.object({ + completionMeanLength: z.union([z.number(), z.null()]).optional(), + entropy: z.union([z.number(), z.null()]).optional(), + epoch: z.union([z.number(), z.null()]).optional(), + gradNorm: z.union([z.number(), z.null()]).optional(), + learningRate: z.union([z.number(), z.null()]).optional(), + loss: z.union([z.number(), z.null()]).optional(), + rewardMean: z.union([z.number(), z.null()]).optional(), + rewardStd: z.union([z.number(), z.null()]).optional(), + step: z.number().int(), + stepTimeS: z.union([z.number(), z.null()]).optional(), }); -const RunExecutionAttemptDto = z.object({ +const RunContextEventDto = z.object({ + completedAt: z.union([z.string(), z.null()]).optional(), + createdAt: z.string(), + eventType: z.string(), id: z.string(), - taskId: z.string(), - attemptNumber: z.number().int(), - status: z.string(), + payload: z.object({}).partial().passthrough(), + sequence: z.number().int(), startedAt: z.union([z.string(), z.null()]).optional(), - completedAt: z.union([z.string(), z.null()]).optional(), - finalAssistantMessage: z.union([z.string(), z.null()]).optional(), - errorMessage: z.union([z.string(), z.null()]).optional(), - score: z.union([z.number(), z.null()]).optional(), - agentId: z.union([z.string(), z.null()]).optional(), - agentName: z.union([z.string(), z.null()]).optional(), - evaluationDetails: z - .union([z.object({}).partial().passthrough(), z.null()]) - .optional(), - outputResourceIds: z.array(z.string()).optional(), + taskExecutionId: z.string(), + taskNodeId: z.string(), + workerBindingKey: z.string(), }); const RunEvaluationCriterionDto = z.object({ - id: z.string(), - stageNum: z.number().int(), - stageName: z.string(), + criterionDescription: z.string(), criterionNum: z.number().int(), criterionType: z.string(), - criterionDescription: z.string(), - evaluationInput: z.union([z.string(), z.null()]).optional(), - score: z.number(), - maxScore: z.number(), - feedback: z.union([z.string(), z.null()]).optional(), + error: z.union([z.object({}).partial().passthrough(), z.null()]).optional(), evaluatedActionIds: z.array(z.string()).optional(), evaluatedResourceIds: z.array(z.string()).optional(), - error: z.union([z.object({}).partial().passthrough(), z.null()]).optional(), + evaluationInput: z.union([z.string(), z.null()]).optional(), + feedback: z.union([z.string(), z.null()]).optional(), + id: 
z.string(), + maxScore: z.number(), + score: z.number(), + stageName: z.string(), + stageNum: z.number().int(), }); const RunTaskEvaluationDto = z.object({ + createdAt: z.string().datetime({ offset: true }), + criterionResults: z.array(RunEvaluationCriterionDto).optional(), + failedGate: z.union([z.string(), z.null()]).optional(), id: z.string(), - runId: z.string(), - taskId: z.union([z.string(), z.null()]).optional(), - totalScore: z.number(), maxScore: z.number(), normalizedScore: z.number(), + runId: z.string(), stagesEvaluated: z.number().int(), stagesPassed: z.number().int(), - failedGate: z.union([z.string(), z.null()]).optional(), + taskId: z.union([z.string(), z.null()]).optional(), + totalScore: z.number(), +}); +const RunExecutionAttemptDto = z.object({ + agentId: z.union([z.string(), z.null()]).optional(), + agentName: z.union([z.string(), z.null()]).optional(), + attemptNumber: z.number().int(), + completedAt: z.union([z.string(), z.null()]).optional(), + errorMessage: z.union([z.string(), z.null()]).optional(), + evaluationDetails: z + .union([z.object({}).partial().passthrough(), z.null()]) + .optional(), + finalAssistantMessage: z.union([z.string(), z.null()]).optional(), + id: z.string(), + outputResourceIds: z.array(z.string()).optional(), + score: z.union([z.number(), z.null()]).optional(), + startedAt: z.union([z.string(), z.null()]).optional(), + status: z.string(), + taskId: z.string(), +}); +const RunResourceDto = z.object({ createdAt: z.string().datetime({ offset: true }), - criterionResults: z.array(RunEvaluationCriterionDto).optional(), + filePath: z.string(), + id: z.string(), + mimeType: z.string(), + name: z.string(), + sizeBytes: z.number().int(), + taskExecutionId: z.string(), + taskId: z.string(), }); const RunSandboxCommandDto = z.object({ command: z.string(), - stdout: z.union([z.string(), z.null()]).optional(), - stderr: z.union([z.string(), z.null()]).optional(), - exitCode: z.union([z.number(), z.null()]).optional(), durationMs: z.union([z.number(), z.null()]).optional(), + exitCode: z.union([z.number(), z.null()]).optional(), + stderr: z.union([z.string(), z.null()]).optional(), + stdout: z.union([z.string(), z.null()]).optional(), timestamp: z.string().datetime({ offset: true }), }); const RunSandboxDto = z.object({ + closeReason: z.union([z.string(), z.null()]).optional(), + closedAt: z.union([z.string(), z.null()]).optional(), + commands: z.array(RunSandboxCommandDto).optional(), + createdAt: z.string().datetime({ offset: true }), sandboxId: z.string(), + status: z.string(), taskId: z.string(), template: z.union([z.string(), z.null()]).optional(), timeoutMinutes: z.number().int(), - status: z.string(), - createdAt: z.string().datetime({ offset: true }), - closedAt: z.union([z.string(), z.null()]).optional(), - closeReason: z.union([z.string(), z.null()]).optional(), - commands: z.array(RunSandboxCommandDto).optional(), }); -const RunContextEventDto = z.object({ +const RunTaskDto = z.object({ + assignedWorkerId: z.union([z.string(), z.null()]).optional(), + assignedWorkerName: z.union([z.string(), z.null()]).optional(), + childIds: z.array(z.string()).optional(), + completedAt: z.union([z.string(), z.null()]).optional(), + dependsOnIds: z.array(z.string()).optional(), + description: z.string(), id: z.string(), - taskExecutionId: z.string(), - taskNodeId: z.string(), - workerBindingKey: z.string(), - sequence: z.number().int(), - eventType: z.string(), - payload: z.object({}).partial().passthrough(), - createdAt: z.string(), + isLeaf: z.boolean(), + 
level: z.number().int(), + name: z.string(), + parentId: z.union([z.string(), z.null()]).optional(), startedAt: z.union([z.string(), z.null()]).optional(), - completedAt: z.union([z.string(), z.null()]).optional(), + status: z.string(), }); const RunCommunicationMessageDto = z.object({ + content: z.string(), + createdAt: z.string().datetime({ offset: true }), + fromAgentId: z.string(), id: z.string(), - threadId: z.string(), - threadTopic: z.string(), runId: z.string(), - taskId: z.union([z.string(), z.null()]).optional(), + sequenceNum: z.number().int(), taskExecutionId: z.union([z.string(), z.null()]).optional(), - fromAgentId: z.string(), + taskId: z.union([z.string(), z.null()]).optional(), + threadId: z.string(), + threadTopic: z.string(), toAgentId: z.string(), - content: z.string(), - sequenceNum: z.number().int(), - createdAt: z.string().datetime({ offset: true }), }); const RunCommunicationThreadDto = z.object({ + agentAId: z.string(), + agentBId: z.string(), + createdAt: z.string().datetime({ offset: true }), id: z.string(), + messages: z.array(RunCommunicationMessageDto).optional(), runId: z.string(), + summary: z.union([z.string(), z.null()]).optional(), taskId: z.union([z.string(), z.null()]).optional(), topic: z.string(), - summary: z.union([z.string(), z.null()]).optional(), - agentAId: z.string(), - agentBId: z.string(), - createdAt: z.string().datetime({ offset: true }), updatedAt: z.string().datetime({ offset: true }), - messages: z.array(RunCommunicationMessageDto).optional(), }); const RunSnapshotDto = z.object({ - id: z.string(), - experimentId: z.string(), - name: z.string(), - status: z.string(), - tasks: z.record(z.string(), RunTaskDto).optional(), - rootTaskId: z.string().optional().default(""), - resourcesByTask: z.record(z.string(), z.array(RunResourceDto)).optional(), - executionsByTask: z.record(z.string(), z.array(RunExecutionAttemptDto)).optional(), - evaluationsByTask: z.record(z.string(), RunTaskEvaluationDto).optional(), - sandboxesByTask: z.record(z.string(), RunSandboxDto).optional(), - contextEventsByTask: z.record(z.string(), z.array(RunContextEventDto)).optional(), - threads: z.array(RunCommunicationThreadDto).optional(), - startedAt: z.union([z.string(), z.null()]).optional(), + cancelledTasks: z.number().int().optional().default(0), completedAt: z.union([z.string(), z.null()]).optional(), - durationSeconds: z.union([z.number(), z.null()]).optional(), - totalTasks: z.number().int().optional().default(0), - totalLeafTasks: z.number().int().optional().default(0), completedTasks: z.number().int().optional().default(0), + contextEventsByTask: z.record(z.string(), z.array(RunContextEventDto)).optional(), + durationSeconds: z.union([z.number(), z.null()]).optional(), + error: z.union([z.string(), z.null()]).optional(), + evaluationsByTask: z.record(z.string(), RunTaskEvaluationDto).optional(), + executionsByTask: z.record(z.string(), z.array(RunExecutionAttemptDto)).optional(), + experimentId: z.string(), failedTasks: z.number().int().optional().default(0), - runningTasks: z.number().int().optional().default(0), - cancelledTasks: z.number().int().optional().default(0), finalScore: z.union([z.number(), z.null()]).optional(), - error: z.union([z.string(), z.null()]).optional(), -}); -const ValidationError = z - .object({ - loc: z.array(z.union([z.string(), z.number()])), - msg: z.string(), - type: z.string(), - input: z.unknown().optional(), - ctx: z.object({}).partial().passthrough().optional(), - }) - .passthrough(); -const HTTPValidationError = z - 
.object({ detail: z.array(ValidationError) }) - .partial() - .passthrough(); + id: z.string(), + name: z.string(), + resourcesByTask: z.record(z.string(), z.array(RunResourceDto)).optional(), + rootTaskId: z.string().optional().default(""), + runningTasks: z.number().int().optional().default(0), + sandboxesByTask: z.record(z.string(), RunSandboxDto).optional(), + startedAt: z.union([z.string(), z.null()]).optional(), + status: z.string(), + tasks: z.record(z.string(), RunTaskDto).optional(), + threads: z.array(RunCommunicationThreadDto).optional(), + totalLeafTasks: z.number().int().optional().default(0), + totalTasks: z.number().int().optional().default(0), +}); const NodeAddedMutation = z .object({ - mutation_type: z.string().optional().default("node.added"), - task_slug: z.string(), - instance_key: z.string(), + assigned_worker_slug: z.union([z.string(), z.null()]), description: z.string(), + instance_key: z.string(), + mutation_type: z.string().optional().default("node.added"), status: z.string(), - assigned_worker_slug: z.union([z.string(), z.null()]), + task_slug: z.string(), }) .passthrough(); const NodeRemovedMutation = z .object({ - mutation_type: z.string().optional().default("node.removed"), - task_slug: z.string(), - instance_key: z.string(), + assigned_worker_slug: z.union([z.string(), z.null()]), description: z.string(), + instance_key: z.string(), + mutation_type: z.string().optional().default("node.removed"), status: z.string(), - assigned_worker_slug: z.union([z.string(), z.null()]), + task_slug: z.string(), }) .passthrough(); const NodeStatusChangedMutation = z @@ -200,8 +469,8 @@ const NodeStatusChangedMutation = z .passthrough(); const NodeFieldChangedMutation = z .object({ - mutation_type: z.string().optional().default("node.field_changed"), field: z.enum(["description", "assigned_worker_slug"]), + mutation_type: z.string().optional().default("node.field_changed"), value: z.union([z.string(), z.null()]), }) .passthrough(); @@ -209,16 +478,16 @@ const EdgeAddedMutation = z .object({ mutation_type: z.string().optional().default("edge.added"), source_node_id: z.string(), - target_node_id: z.string(), status: z.string(), + target_node_id: z.string(), }) .passthrough(); const EdgeRemovedMutation = z .object({ mutation_type: z.string().optional().default("edge.removed"), source_node_id: z.string(), - target_node_id: z.string(), status: z.string(), + target_node_id: z.string(), }) .passthrough(); const EdgeStatusChangedMutation = z @@ -227,39 +496,42 @@ const EdgeStatusChangedMutation = z status: z.string(), }) .passthrough(); -const JsonScalar = z.union([ - z.string(), - z.number(), - z.number(), - z.boolean(), - z.null(), -]); -const JsonValue: z.ZodType = z.lazy(() => - z.union([JsonScalar, z.array(JsonValue), z.record(z.string(), JsonValue)]) -); -const JsonObject = z.record(z.string(), JsonValue); +const JsonValue_Output: z.ZodType = z.lazy(() => z.union([ + JsonScalar, + z.array(JsonValue_Output), + z.record(z.string(), JsonValue_Output), +])); +const JsonObject_Output = z.record(z.string(), JsonValue_Output); const AnnotationSetMutation = z .object({ mutation_type: z.string().optional().default("annotation.set"), namespace: z.string(), - payload: JsonObject, + payload: JsonObject_Output, }) .passthrough(); const AnnotationDeletedMutation = z .object({ mutation_type: z.string().optional().default("annotation.deleted"), namespace: z.string(), - payload: JsonObject, + payload: JsonObject_Output, }) .passthrough(); const RunGraphMutationDto = z.object({ + actor: 
z.string(), + created_at: z.string(), id: z.string(), - run_id: z.string(), - sequence: z.number().int(), mutation_type: z.string(), - target_type: z.string(), - target_id: z.string(), - actor: z.string(), + new_value: z.discriminatedUnion("mutation_type", [ + NodeAddedMutation, + NodeRemovedMutation, + NodeStatusChangedMutation, + NodeFieldChangedMutation, + EdgeAddedMutation, + EdgeRemovedMutation, + EdgeStatusChangedMutation, + AnnotationSetMutation, + AnnotationDeletedMutation, + ]), old_value: z.union([ z.discriminatedUnion("mutation_type", [ NodeAddedMutation, @@ -274,174 +546,58 @@ const RunGraphMutationDto = z.object({ ]), z.null(), ]), - new_value: z.discriminatedUnion("mutation_type", [ - NodeAddedMutation, - NodeRemovedMutation, - NodeStatusChangedMutation, - NodeFieldChangedMutation, - EdgeAddedMutation, - EdgeRemovedMutation, - EdgeStatusChangedMutation, - AnnotationSetMutation, - AnnotationDeletedMutation, - ]), reason: z.union([z.string(), z.null()]), - created_at: z.string(), -}); -const definition_id = z.union([z.string(), z.null()]).optional(); -const TrainingCurvePointDto = z.object({ - runId: z.string(), - step: z.number().int(), - meanScore: z.number(), - benchmarkType: z.union([z.string(), z.null()]).optional(), - createdAt: z.union([z.string(), z.null()]).optional(), -}); -const TrainingSessionDto = z.object({ - id: z.string(), - experimentDefinitionId: z.string(), - modelName: z.string(), - status: z.string(), - startedAt: z.union([z.string(), z.null()]).optional(), - completedAt: z.union([z.string(), z.null()]).optional(), - outputDir: z.union([z.string(), z.null()]).optional(), - totalSteps: z.union([z.number(), z.null()]).optional(), - finalLoss: z.union([z.number(), z.null()]).optional(), -}); -const TrainingMetricDto = z.object({ - step: z.number().int(), - epoch: z.union([z.number(), z.null()]).optional(), - loss: z.union([z.number(), z.null()]).optional(), - gradNorm: z.union([z.number(), z.null()]).optional(), - learningRate: z.union([z.number(), z.null()]).optional(), - rewardMean: z.union([z.number(), z.null()]).optional(), - rewardStd: z.union([z.number(), z.null()]).optional(), - entropy: z.union([z.number(), z.null()]).optional(), - completionMeanLength: z.union([z.number(), z.null()]).optional(), - stepTimeS: z.union([z.number(), z.null()]).optional(), + run_id: z.string(), + sequence: z.number().int(), + target_id: z.string(), + target_type: z.string(), }); -const CohortStatusCountsDto = z - .object({ - pending: z.number().int().default(0), - executing: z.number().int().default(0), - evaluating: z.number().int().default(0), - completed: z.number().int().default(0), - failed: z.number().int().default(0), - }) - .partial() - .passthrough(); -const CohortSummaryDto = z - .object({ - cohort_id: z.string().uuid(), - name: z.string(), - description: z.union([z.string(), z.null()]).optional(), - created_by: z.union([z.string(), z.null()]).optional(), - created_at: z.string().datetime({ offset: true }), - status: z.string(), - total_runs: z.number().int().optional().default(0), - status_counts: CohortStatusCountsDto.optional(), - average_score: z.union([z.number(), z.null()]).optional(), - best_score: z.union([z.number(), z.null()]).optional(), - worst_score: z.union([z.number(), z.null()]).optional(), - average_duration_ms: z.union([z.number(), z.null()]).optional(), - failure_rate: z.number().optional().default(0), - stats_updated_at: z.union([z.string(), z.null()]).optional(), - }) - .passthrough(); -const CohortRunRowDto = z - .object({ - run_id: 
z.string().uuid(), - definition_id: z.string().uuid(), - cohort_id: z.string().uuid(), - cohort_name: z.string(), - status: z.string(), - created_at: z.string().datetime({ offset: true }), - started_at: z.union([z.string(), z.null()]).optional(), - completed_at: z.union([z.string(), z.null()]).optional(), - running_time_ms: z.union([z.number(), z.null()]).optional(), - final_score: z.union([z.number(), z.null()]).optional(), - total_tasks: z.union([z.number(), z.null()]).optional(), - total_cost_usd: z.union([z.number(), z.null()]).optional(), - error_message: z.union([z.string(), z.null()]).optional(), - }) - .passthrough(); -const CohortDetailDto = z - .object({ - summary: CohortSummaryDto, - runs: z.array(CohortRunRowDto).optional(), - }) - .passthrough(); -const ExperimentCohortStatus = z.enum(["active", "archived"]); -const UpdateCohortRequest = z - .object({ status: ExperimentCohortStatus }) - .passthrough(); -const SubmitRequest = z - .object({ - definition_id: z.string().uuid(), - num_episodes: z.number().int().gte(1), - policy_version: z.union([z.number(), z.null()]).optional(), - model_target_override: z.union([z.string(), z.null()]).optional(), - }) - .passthrough(); -const BatchStatus = z.enum([ - "pending", - "running", - "complete", - "failed", - "cancelled", -]); -const SubmitResponse = z - .object({ - batch_id: z.string().uuid(), - run_ids: z.array(z.string().uuid()), - status: BatchStatus.optional(), - }) - .passthrough(); -const Trajectory = z - .object({ - run_id: z.string().uuid(), - agent_id: z.string(), - prompt_ids: z.array(z.number().int()), - completion_ids: z.array(z.number().int()), - logprobs: z.array(z.number()), - env_mask: z.array(z.number().int()), - reward: z.number(), - num_turns: z.number().int(), - }) - .passthrough(); -const EpisodeFailure = z - .object({ run_id: z.string().uuid(), error: z.string() }) - .passthrough(); -const PollResponse = z - .object({ - batch_id: z.string().uuid(), - status: BatchStatus, - completed: z.number().int().optional().default(0), - total: z.number().int().optional().default(0), - trajectories: z.array(Trajectory).optional(), - failures: z.array(EpisodeFailure).optional(), - }) - .passthrough(); -const WeightSyncRequest = z - .object({ checkpoint_path: z.string(), model_name: z.string() }) - .passthrough(); -const WeightSyncResponse = z - .object({ success: z.boolean(), vllm_model_loaded: z.string() }) - .passthrough(); export const schemas = { - RunTaskDto, - RunResourceDto, - RunExecutionAttemptDto, + CohortStatusCountsDto, + CohortSummaryDto, + ValidationError, + HTTPValidationError, + CohortExperimentRowDto, + CohortDetailDto, + ExperimentCohortStatus, + UpdateCohortRequest, + ExperimentSummaryDto, + JsonScalar, + JsonValue_Input, + JsonObject_Input, + ExperimentDefineRequest, + ExperimentDefineResult, + ExperimentStatusCountsDto, + ExperimentAnalyticsDto, + ExperimentRunRowDto, + ExperimentDetailDto, + ExperimentRunRequest, + run_experiment_experiments__experiment_id__run_post_Body, + ExperimentRunResult, + SubmitRequest, + BatchStatus, + SubmitResponse, + WeightSyncRequest, + WeightSyncResponse, + EpisodeFailure, + Trajectory, + PollResponse, + definition_id, + TrainingCurvePointDto, + TrainingSessionDto, + TrainingMetricDto, + RunContextEventDto, RunEvaluationCriterionDto, RunTaskEvaluationDto, + RunExecutionAttemptDto, + RunResourceDto, RunSandboxCommandDto, RunSandboxDto, - RunContextEventDto, + RunTaskDto, RunCommunicationMessageDto, RunCommunicationThreadDto, RunSnapshotDto, - ValidationError, - 
HTTPValidationError, NodeAddedMutation, NodeRemovedMutation, NodeStatusChangedMutation, @@ -449,28 +605,9 @@ export const schemas = { EdgeAddedMutation, EdgeRemovedMutation, EdgeStatusChangedMutation, - JsonScalar, - JsonValue, - JsonObject, + JsonValue_Output, + JsonObject_Output, AnnotationSetMutation, AnnotationDeletedMutation, RunGraphMutationDto, - definition_id, - TrainingCurvePointDto, - TrainingSessionDto, - TrainingMetricDto, - CohortStatusCountsDto, - CohortSummaryDto, - CohortRunRowDto, - CohortDetailDto, - ExperimentCohortStatus, - UpdateCohortRequest, - SubmitRequest, - BatchStatus, - SubmitResponse, - Trajectory, - EpisodeFailure, - PollResponse, - WeightSyncRequest, - WeightSyncResponse, }; diff --git a/ergon-dashboard/src/generated/rest/openapi.json b/ergon-dashboard/src/generated/rest/openapi.json index 906046ed..52fa4c76 100644 --- a/ergon-dashboard/src/generated/rest/openapi.json +++ b/ergon-dashboard/src/generated/rest/openapi.json @@ -1,1229 +1,2051 @@ { - "openapi": "3.1.0", - "info": { - "title": "Ergon Core", - "description": "Ergon experiment orchestration API", - "version": "0.1.0" - }, - "paths": { - "/runs/{run_id}": { - "get": { - "tags": [ - "runs" - ], - "summary": "Get Run", - "description": "Get a persisted run-detail snapshot suitable for frontend hydration.", - "operationId": "get_run_runs__run_id__get", - "parameters": [ - { - "name": "run_id", - "in": "path", - "required": true, - "schema": { - "type": "string", - "format": "uuid", - "title": "Run Id" - } + "components": { + "schemas": { + "AnnotationDeletedMutation": { + "description": "annotation.deleted \u2014 tombstone.", + "properties": { + "mutation_type": { + "const": "annotation.deleted", + "default": "annotation.deleted", + "title": "Mutation Type", + "type": "string" + }, + "namespace": { + "title": "Namespace", + "type": "string" + }, + "payload": { + "$ref": "#/components/schemas/JsonObject-Output" } + }, + "required": [ + "namespace", + "payload" ], - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RunSnapshotDto" - } - } - } + "title": "AnnotationDeletedMutation", + "type": "object" + }, + "AnnotationSetMutation": { + "description": "annotation.set.", + "properties": { + "mutation_type": { + "const": "annotation.set", + "default": "annotation.set", + "title": "Mutation Type", + "type": "string" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } + "namespace": { + "title": "Namespace", + "type": "string" + }, + "payload": { + "$ref": "#/components/schemas/JsonObject-Output" } - } - } - }, - "/runs/{run_id}/mutations": { - "get": { - "tags": [ - "runs" + }, + "required": [ + "namespace", + "payload" ], - "summary": "Get Mutations", - "description": "Return the append-only mutation log for a run, ordered by sequence.\n\nUsed by the Timeline scrubber to replay DAG state at any point in time.", - "operationId": "get_mutations_runs__run_id__mutations_get", - "parameters": [ - { - "name": "run_id", - "in": "path", - "required": true, - "schema": { - "type": "string", - "format": "uuid", - "title": "Run Id" - } - } + "title": "AnnotationSetMutation", + "type": "object" + }, + "BatchStatus": { + "enum": [ + "pending", + "running", + "complete", + "failed", + "cancelled" ], - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - 
"schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/RunGraphMutationDto" - }, - "title": "Response Get Mutations Runs Run Id Mutations Get" - } - } - } + "title": "BatchStatus", + "type": "string" + }, + "CohortDetailDto": { + "description": "Full payload for a single cohort detail page.", + "properties": { + "experiments": { + "items": { + "$ref": "#/components/schemas/CohortExperimentRowDto" + }, + "title": "Experiments", + "type": "array" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } + "summary": { + "$ref": "#/components/schemas/CohortSummaryDto" } - } - } - }, - "/runs/{run_id}/resources/{resource_id}/content": { - "get": { - "tags": [ - "runs" + }, + "required": [ + "summary" ], - "summary": "Get Resource Content", - "description": "Stream the blob bytes for a RunResource.\n\nUsed by the dashboard's file-viewer modal. Enforces:\n- resource must belong to the named run (no cross-run leaks);\n- resolved path must sit under ``ERGON_BLOB_ROOT`` (traversal guard);\n- size <= ``_RESOURCE_CONTENT_MAX_BYTES`` (413 otherwise).", - "operationId": "get_resource_content_runs__run_id__resources__resource_id__content_get", - "parameters": [ - { - "name": "run_id", - "in": "path", - "required": true, - "schema": { - "type": "string", - "format": "uuid", - "title": "Run Id" - } + "title": "CohortDetailDto", + "type": "object" + }, + "CohortExperimentRowDto": { + "description": "One experiment inside a cohort detail view.", + "properties": { + "benchmark_type": { + "title": "Benchmark Type", + "type": "string" }, - { - "name": "resource_id", - "in": "path", - "required": true, - "schema": { - "type": "string", - "format": "uuid", - "title": "Resource Id" - } - } - ], - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} + "created_at": { + "format": "date-time", + "title": "Created At", + "type": "string" + }, + "default_evaluator_slug": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" } - } + ], + "title": "Default Evaluator Slug" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } + "default_model_target": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" } - } - } - } - } - }, - "/runs/training/curves": { - "get": { - "tags": [ - "runs" - ], - "summary": "Get Training Curves", - "description": "Return score-over-step data for checkpoint evaluations.\n\nReads ``summary_json`` on ``RunRecord`` for checkpoint metadata\n(``checkpoint_step``, ``checkpoint_path``) written by the eval\nwatcher, and aggregates ``RunTaskEvaluation.score`` per run.\n\nFilter by ``definition_id`` or ``cohort_id``.", - "operationId": "get_training_curves_runs_training_curves_get", - "parameters": [ - { - "name": "definition_id", - "in": "query", - "required": false, - "schema": { - "anyOf": [ - { - "type": "string", - "format": "uuid" - }, - { - "type": "null" - } - ], - "title": "Definition Id" - } + ], + "title": "Default Model Target" }, - { - "name": "cohort_id", - "in": "query", - "required": false, - "schema": { - "anyOf": [ - { - "type": "string", - "format": "uuid" - }, - { - "type": "null" - } - ], - "title": "Cohort Id" - } - } - ], - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "type": "array", 
- "items": { - "$ref": "#/components/schemas/TrainingCurvePointDto" - }, - "title": "Response Get Training Curves Runs Training Curves Get" - } + "error_message": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" } - } + ], + "title": "Error Message" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } + "experiment_id": { + "format": "uuid", + "title": "Experiment Id", + "type": "string" + }, + "final_score": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" } - } + ], + "title": "Final Score" + }, + "name": { + "title": "Name", + "type": "string" + }, + "sample_count": { + "title": "Sample Count", + "type": "integer" + }, + "status": { + "title": "Status", + "type": "string" + }, + "status_counts": { + "$ref": "#/components/schemas/CohortStatusCountsDto" + }, + "total_cost_usd": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Total Cost Usd" + }, + "total_runs": { + "default": 0, + "title": "Total Runs", + "type": "integer" } - } - } - }, - "/runs/training/sessions": { - "get": { - "tags": [ - "runs" + }, + "required": [ + "experiment_id", + "name", + "benchmark_type", + "sample_count", + "status", + "created_at" + ], + "title": "CohortExperimentRowDto", + "type": "object" + }, + "CohortStatusCountsDto": { + "description": "Aggregate run counts by lifecycle status.", + "properties": { + "completed": { + "default": 0, + "title": "Completed", + "type": "integer" + }, + "evaluating": { + "default": 0, + "title": "Evaluating", + "type": "integer" + }, + "executing": { + "default": 0, + "title": "Executing", + "type": "integer" + }, + "failed": { + "default": 0, + "title": "Failed", + "type": "integer" + }, + "pending": { + "default": 0, + "title": "Pending", + "type": "integer" + } + }, + "title": "CohortStatusCountsDto", + "type": "object" + }, + "CohortSummaryDto": { + "description": "Summary row for cohort list and live updates.", + "properties": { + "average_duration_ms": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Average Duration Ms" + }, + "average_score": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Average Score" + }, + "best_score": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Best Score" + }, + "cohort_id": { + "format": "uuid", + "title": "Cohort Id", + "type": "string" + }, + "created_at": { + "format": "date-time", + "title": "Created At", + "type": "string" + }, + "created_by": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Created By" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "failure_rate": { + "default": 0.0, + "title": "Failure Rate", + "type": "number" + }, + "name": { + "title": "Name", + "type": "string" + }, + "stats_updated_at": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Stats Updated At" + }, + "status": { + "title": "Status", + "type": "string" + }, + "status_counts": { + "$ref": "#/components/schemas/CohortStatusCountsDto" + }, + "total_runs": { + "default": 0, + "title": "Total Runs", + "type": "integer" + }, + "worst_score": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Worst Score" + } + }, + "required": [ + "cohort_id", + "name", + "created_at", + 
"status" + ], + "title": "CohortSummaryDto", + "type": "object" + }, + "EdgeAddedMutation": { + "description": "edge.added \u2014 full edge snapshot.", + "properties": { + "mutation_type": { + "const": "edge.added", + "default": "edge.added", + "title": "Mutation Type", + "type": "string" + }, + "source_node_id": { + "title": "Source Node Id", + "type": "string" + }, + "status": { + "title": "Status", + "type": "string" + }, + "target_node_id": { + "title": "Target Node Id", + "type": "string" + } + }, + "required": [ + "source_node_id", + "target_node_id", + "status" + ], + "title": "EdgeAddedMutation", + "type": "object" + }, + "EdgeRemovedMutation": { + "description": "edge.removed.", + "properties": { + "mutation_type": { + "const": "edge.removed", + "default": "edge.removed", + "title": "Mutation Type", + "type": "string" + }, + "source_node_id": { + "title": "Source Node Id", + "type": "string" + }, + "status": { + "title": "Status", + "type": "string" + }, + "target_node_id": { + "title": "Target Node Id", + "type": "string" + } + }, + "required": [ + "source_node_id", + "target_node_id", + "status" + ], + "title": "EdgeRemovedMutation", + "type": "object" + }, + "EdgeStatusChangedMutation": { + "description": "edge.status_changed.", + "properties": { + "mutation_type": { + "const": "edge.status_changed", + "default": "edge.status_changed", + "title": "Mutation Type", + "type": "string" + }, + "status": { + "title": "Status", + "type": "string" + } + }, + "required": [ + "status" + ], + "title": "EdgeStatusChangedMutation", + "type": "object" + }, + "EpisodeFailure": { + "description": "An episode that didn't complete successfully.", + "properties": { + "error": { + "title": "Error", + "type": "string" + }, + "run_id": { + "format": "uuid", + "title": "Run Id", + "type": "string" + } + }, + "required": [ + "run_id", + "error" + ], + "title": "EpisodeFailure", + "type": "object" + }, + "ExperimentAnalyticsDto": { + "properties": { + "average_duration_ms": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Average Duration Ms" + }, + "average_score": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Average Score" + }, + "average_tasks": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Average Tasks" + }, + "error_count": { + "default": 0, + "title": "Error Count", + "type": "integer" + }, + "latest_activity_at": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Latest Activity At" + }, + "status_counts": { + "$ref": "#/components/schemas/ExperimentStatusCountsDto" + }, + "total_cost_usd": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Total Cost Usd" + }, + "total_runs": { + "default": 0, + "title": "Total Runs", + "type": "integer" + } + }, + "title": "ExperimentAnalyticsDto", + "type": "object" + }, + "ExperimentCohortStatus": { + "enum": [ + "active", + "archived" + ], + "title": "ExperimentCohortStatus", + "type": "string" + }, + "ExperimentDefineRequest": { + "properties": { + "benchmark_slug": { + "title": "Benchmark Slug", + "type": "string" + }, + "cohort_id": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cohort Id" + }, + "default_evaluator_slug": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Default Evaluator Slug" + }, + "default_model_target": { + "anyOf": [ + { + "type": 
"string" + }, + { + "type": "null" + } + ], + "title": "Default Model Target" + }, + "default_worker_team": { + "$ref": "#/components/schemas/JsonObject-Input" + }, + "design": { + "$ref": "#/components/schemas/JsonObject-Input" + }, + "limit": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Limit" + }, + "metadata": { + "$ref": "#/components/schemas/JsonObject-Input" + }, + "name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Name" + }, + "sample_ids": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Sample Ids" + }, + "seed": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Seed" + } + }, + "required": [ + "benchmark_slug" + ], + "title": "ExperimentDefineRequest", + "type": "object" + }, + "ExperimentDefineResult": { + "properties": { + "benchmark_type": { + "title": "Benchmark Type", + "type": "string" + }, + "cohort_id": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cohort Id" + }, + "experiment_id": { + "format": "uuid", + "title": "Experiment Id", + "type": "string" + }, + "sample_count": { + "title": "Sample Count", + "type": "integer" + }, + "selected_samples": { + "items": { + "type": "string" + }, + "title": "Selected Samples", + "type": "array" + } + }, + "required": [ + "experiment_id", + "cohort_id", + "benchmark_type", + "sample_count", + "selected_samples" ], - "summary": "Get Training Sessions", - "description": "List training sessions, optionally filtered by definition.", - "operationId": "get_training_sessions_runs_training_sessions_get", - "parameters": [ - { - "name": "definition_id", - "in": "query", - "required": false, - "schema": { - "anyOf": [ - { - "type": "string", - "format": "uuid" - }, - { - "type": "null" - } - ], - "title": "Definition Id" - } + "title": "ExperimentDefineResult", + "type": "object" + }, + "ExperimentDetailDto": { + "properties": { + "analytics": { + "$ref": "#/components/schemas/ExperimentAnalyticsDto" + }, + "design": { + "additionalProperties": true, + "title": "Design", + "type": "object" + }, + "experiment": { + "$ref": "#/components/schemas/ExperimentSummaryDto" + }, + "metadata": { + "additionalProperties": true, + "title": "Metadata", + "type": "object" + }, + "runs": { + "items": { + "$ref": "#/components/schemas/ExperimentRunRowDto" + }, + "title": "Runs", + "type": "array" + }, + "sample_selection": { + "additionalProperties": true, + "title": "Sample Selection", + "type": "object" } + }, + "required": [ + "experiment" ], - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TrainingSessionDto" - }, - "title": "Response Get Training Sessions Runs Training Sessions Get" - } - } - } + "title": "ExperimentDetailDto", + "type": "object" + }, + "ExperimentRunRequest": { + "properties": { + "experiment_id": { + "format": "uuid", + "title": "Experiment Id", + "type": "string" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } + "timeout_seconds": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" } - } + ], + "title": "Timeout Seconds" + }, + "wait": { + "default": true, + "title": "Wait", + "type": "boolean" } - } - } - }, - 
"/runs/training/sessions/{session_id}/metrics": { - "get": { - "tags": [ - "runs" + }, + "required": [ + "experiment_id" ], - "summary": "Get Training Metrics", - "description": "Get per-step training metrics for a session.", - "operationId": "get_training_metrics_runs_training_sessions__session_id__metrics_get", - "parameters": [ - { - "name": "session_id", - "in": "path", - "required": true, - "schema": { - "type": "string", + "title": "ExperimentRunRequest", + "type": "object" + }, + "ExperimentRunResult": { + "properties": { + "experiment_id": { + "format": "uuid", + "title": "Experiment Id", + "type": "string" + }, + "run_ids": { + "items": { "format": "uuid", - "title": "Session Id" - } + "type": "string" + }, + "title": "Run Ids", + "type": "array" + }, + "workflow_definition_ids": { + "items": { + "format": "uuid", + "type": "string" + }, + "title": "Workflow Definition Ids", + "type": "array" } + }, + "required": [ + "experiment_id", + "run_ids" ], - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TrainingMetricDto" - }, - "title": "Response Get Training Metrics Runs Training Sessions Session Id Metrics Get" - } + "title": "ExperimentRunResult", + "type": "object" + }, + "ExperimentRunRowDto": { + "properties": { + "benchmark_type": { + "title": "Benchmark Type", + "type": "string" + }, + "completed_at": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" } - } + ], + "title": "Completed At" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } + "created_at": { + "format": "date-time", + "title": "Created At", + "type": "string" + }, + "error_message": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" } - } - } - } - } - }, - "/cohorts": { - "get": { - "tags": [ - "cohorts" - ], - "summary": "List Cohorts", - "description": "List all experiment cohorts.", - "operationId": "list_cohorts_cohorts_get", - "parameters": [ - { - "name": "include_archived", - "in": "query", - "required": false, - "schema": { - "type": "boolean", - "default": false, - "title": "Include Archived" - } - } - ], - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/CohortSummaryDto" - }, - "title": "Response List Cohorts Cohorts Get" - } + ], + "title": "Error Message" + }, + "evaluator_slug": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Evaluator Slug" + }, + "final_score": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Final Score" + }, + "instance_key": { + "title": "Instance Key", + "type": "string" + }, + "model_target": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Model Target" + }, + "run_id": { + "format": "uuid", + "title": "Run Id", + "type": "string" + }, + "running_time_ms": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Running Time Ms" + }, + "seed": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Seed" + }, + "started_at": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Started At" + }, + "status": { + "title": "Status", + 
"type": "string" + }, + "total_cost_usd": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" } - } + ], + "title": "Total Cost Usd" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } + "total_tasks": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" } - } + ], + "title": "Total Tasks" + }, + "worker_team": { + "additionalProperties": true, + "title": "Worker Team", + "type": "object" + }, + "workflow_definition_id": { + "format": "uuid", + "title": "Workflow Definition Id", + "type": "string" } - } - } - }, - "/cohorts/{cohort_id}": { - "get": { - "tags": [ - "cohorts" + }, + "required": [ + "run_id", + "workflow_definition_id", + "benchmark_type", + "instance_key", + "status", + "created_at" ], - "summary": "Get Cohort", - "description": "Get one cohort detail payload.", - "operationId": "get_cohort_cohorts__cohort_id__get", - "parameters": [ - { - "name": "cohort_id", - "in": "path", - "required": true, - "schema": { - "type": "string", - "format": "uuid", - "title": "Cohort Id" - } + "title": "ExperimentRunRowDto", + "type": "object" + }, + "ExperimentStatusCountsDto": { + "properties": { + "cancelled": { + "default": 0, + "title": "Cancelled", + "type": "integer" + }, + "completed": { + "default": 0, + "title": "Completed", + "type": "integer" + }, + "evaluating": { + "default": 0, + "title": "Evaluating", + "type": "integer" + }, + "executing": { + "default": 0, + "title": "Executing", + "type": "integer" + }, + "failed": { + "default": 0, + "title": "Failed", + "type": "integer" + }, + "pending": { + "default": 0, + "title": "Pending", + "type": "integer" } - ], - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CohortDetailDto" - } + }, + "title": "ExperimentStatusCountsDto", + "type": "object" + }, + "ExperimentSummaryDto": { + "properties": { + "benchmark_type": { + "title": "Benchmark Type", + "type": "string" + }, + "cohort_id": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" } - } + ], + "title": "Cohort Id" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } + "completed_at": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" } - } - } - } - }, - "patch": { - "tags": [ - "cohorts" - ], - "summary": "Update Cohort", - "description": "Update one cohort's operator-managed fields.", - "operationId": "update_cohort_cohorts__cohort_id__patch", - "parameters": [ - { - "name": "cohort_id", - "in": "path", - "required": true, - "schema": { - "type": "string", - "format": "uuid", - "title": "Cohort Id" - } - } - ], - "requestBody": { - "required": true, - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/UpdateCohortRequest" + ], + "title": "Completed At" + }, + "created_at": { + "format": "date-time", + "title": "Created At", + "type": "string" + }, + "default_evaluator_slug": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" } - } - } - }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CohortSummaryDto" - } + ], + "title": "Default Evaluator Slug" + }, + "default_model_target": { + "anyOf": [ + { + "type": "string" + }, 
+ { + "type": "null" } - } + ], + "title": "Default Model Target" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } + "default_worker_team": { + "additionalProperties": true, + "title": "Default Worker Team", + "type": "object" + }, + "experiment_id": { + "format": "uuid", + "title": "Experiment Id", + "type": "string" + }, + "name": { + "title": "Name", + "type": "string" + }, + "run_count": { + "default": 0, + "title": "Run Count", + "type": "integer" + }, + "sample_count": { + "title": "Sample Count", + "type": "integer" + }, + "started_at": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" } - } + ], + "title": "Started At" + }, + "status": { + "title": "Status", + "type": "string" } - } - } - }, - "/rollouts/submit": { - "post": { - "tags": [ - "rollouts" + }, + "required": [ + "experiment_id", + "name", + "benchmark_type", + "sample_count", + "status", + "created_at" ], - "summary": "Submit Rollout", - "description": "Start a batch of episodes. Returns immediately with batch_id.", - "operationId": "submit_rollout_rollouts_submit_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SubmitRequest" - } - } + "title": "ExperimentSummaryDto", + "type": "object" + }, + "HTTPValidationError": { + "properties": { + "detail": { + "items": { + "$ref": "#/components/schemas/ValidationError" + }, + "title": "Detail", + "type": "array" + } + }, + "title": "HTTPValidationError", + "type": "object" + }, + "JsonObject-Input": { + "additionalProperties": { + "$ref": "#/components/schemas/JsonValue-Input" + }, + "type": "object" + }, + "JsonObject-Output": { + "additionalProperties": { + "$ref": "#/components/schemas/JsonValue-Output" + }, + "type": "object" + }, + "JsonScalar": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" }, - "required": true - }, - "responses": { - "202": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SubmitResponse" - } - } - } + { + "type": "boolean" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } - } - } - }, - "/rollouts/{batch_id}": { - "get": { - "tags": [ - "rollouts" - ], - "summary": "Poll Rollout", - "description": "Poll batch status. 
Returns trajectories when complete.", - "operationId": "poll_rollout_rollouts__batch_id__get", - "parameters": [ { - "name": "batch_id", - "in": "path", - "required": true, - "schema": { - "type": "string", - "format": "uuid", - "title": "Batch Id" - } + "type": "null" } - ], - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/PollResponse" - } - } - } + ] + }, + "JsonValue-Input": { + "anyOf": [ + { + "$ref": "#/components/schemas/JsonScalar" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } + { + "items": { + "$ref": "#/components/schemas/JsonValue-Input" + }, + "type": "array" + }, + { + "additionalProperties": { + "$ref": "#/components/schemas/JsonValue-Input" + }, + "type": "object" } - } + ] }, - "delete": { - "tags": [ - "rollouts" - ], - "summary": "Cancel Rollout", - "description": "Cancel a pending/running batch.", - "operationId": "cancel_rollout_rollouts__batch_id__delete", - "parameters": [ + "JsonValue-Output": { + "anyOf": [ { - "name": "batch_id", - "in": "path", - "required": true, - "schema": { - "type": "string", - "format": "uuid", - "title": "Batch Id" - } - } - ], - "responses": { - "204": { - "description": "Successful Response" + "$ref": "#/components/schemas/JsonScalar" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } + { + "items": { + "$ref": "#/components/schemas/JsonValue-Output" + }, + "type": "array" + }, + { + "additionalProperties": { + "$ref": "#/components/schemas/JsonValue-Output" + }, + "type": "object" } - } - } - }, - "/rollouts/sync-weights": { - "post": { - "tags": [ - "rollouts" - ], - "summary": "Sync Weights", - "description": "Restart vLLM with a new checkpoint (full-weight RFT).\n\nBlocks until the new vLLM process is healthy.", - "operationId": "sync_weights_rollouts_sync_weights_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/WeightSyncRequest" + ] + }, + "NodeAddedMutation": { + "description": "node.added \u2014 full node snapshot.", + "properties": { + "assigned_worker_slug": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" } - } + ], + "title": "Assigned Worker Slug" }, - "required": true - }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/WeightSyncResponse" - } - } - } + "description": { + "title": "Description", + "type": "string" }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } - } - } - }, - "/api/inngest": { - "get": { - "summary": "Get Api Inngest", - "operationId": "get_api_inngest_api_inngest_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } + "instance_key": { + "title": "Instance Key", + "type": "string" + }, + "mutation_type": { + "const": "node.added", + "default": "node.added", + "title": "Mutation Type", + "type": "string" + }, + "status": { + "title": "Status", + "type": "string" + }, + "task_slug": { + "title": "Task Slug", + "type": "string" } - } + }, + "required": [ + "task_slug", + "instance_key", + 
"description", + "status", + "assigned_worker_slug" + ], + "title": "NodeAddedMutation", + "type": "object" }, - "put": { - "summary": "Put Inngest Api", - "operationId": "put_inngest_api_api_inngest_put", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} + "NodeFieldChangedMutation": { + "description": "node.field_changed.", + "properties": { + "field": { + "enum": [ + "description", + "assigned_worker_slug" + ], + "title": "Field", + "type": "string" + }, + "mutation_type": { + "const": "node.field_changed", + "default": "node.field_changed", + "title": "Mutation Type", + "type": "string" + }, + "value": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" } - } + ], + "title": "Value" } - } + }, + "required": [ + "field", + "value" + ], + "title": "NodeFieldChangedMutation", + "type": "object" }, - "post": { - "summary": "Post Inngest Api", - "operationId": "post_inngest_api_api_inngest_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - } - }, - "components": { - "schemas": { - "AnnotationDeletedMutation": { + "NodeRemovedMutation": { + "description": "node.removed \u2014 node snapshot at removal time.", "properties": { + "assigned_worker_slug": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Assigned Worker Slug" + }, + "description": { + "title": "Description", + "type": "string" + }, + "instance_key": { + "title": "Instance Key", + "type": "string" + }, "mutation_type": { - "type": "string", - "const": "annotation.deleted", + "const": "node.removed", + "default": "node.removed", "title": "Mutation Type", - "default": "annotation.deleted" + "type": "string" }, - "namespace": { - "type": "string", - "title": "Namespace" + "status": { + "title": "Status", + "type": "string" }, - "payload": { - "$ref": "#/components/schemas/JsonObject" + "task_slug": { + "title": "Task Slug", + "type": "string" } }, - "type": "object", "required": [ - "namespace", - "payload" + "task_slug", + "instance_key", + "description", + "status", + "assigned_worker_slug" ], - "title": "AnnotationDeletedMutation", - "description": "annotation.deleted \u2014 tombstone." 
+ "title": "NodeRemovedMutation", + "type": "object" }, - "AnnotationSetMutation": { + "NodeStatusChangedMutation": { + "description": "node.status_changed.", "properties": { "mutation_type": { - "type": "string", - "const": "annotation.set", + "const": "node.status_changed", + "default": "node.status_changed", "title": "Mutation Type", - "default": "annotation.set" + "type": "string" }, - "namespace": { - "type": "string", - "title": "Namespace" + "status": { + "title": "Status", + "type": "string" + } + }, + "required": [ + "status" + ], + "title": "NodeStatusChangedMutation", + "type": "object" + }, + "PollResponse": { + "description": "Ergon \u2192 Trainer: current batch status + trajectories if complete.", + "properties": { + "batch_id": { + "format": "uuid", + "title": "Batch Id", + "type": "string" }, - "payload": { - "$ref": "#/components/schemas/JsonObject" + "completed": { + "default": 0, + "title": "Completed", + "type": "integer" + }, + "failures": { + "items": { + "$ref": "#/components/schemas/EpisodeFailure" + }, + "title": "Failures", + "type": "array" + }, + "status": { + "$ref": "#/components/schemas/BatchStatus" + }, + "total": { + "default": 0, + "title": "Total", + "type": "integer" + }, + "trajectories": { + "items": { + "$ref": "#/components/schemas/Trajectory" + }, + "title": "Trajectories", + "type": "array" } }, - "type": "object", "required": [ - "namespace", - "payload" + "batch_id", + "status" ], - "title": "AnnotationSetMutation", - "description": "annotation.set." + "title": "PollResponse", + "type": "object" }, - "BatchStatus": { - "type": "string", - "enum": [ - "pending", - "running", - "complete", - "failed", - "cancelled" + "RunCommunicationMessageDto": { + "additionalProperties": false, + "properties": { + "content": { + "title": "Content", + "type": "string" + }, + "createdAt": { + "format": "date-time", + "title": "Createdat", + "type": "string" + }, + "fromAgentId": { + "title": "Fromagentid", + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "runId": { + "title": "Runid", + "type": "string" + }, + "sequenceNum": { + "title": "Sequencenum", + "type": "integer" + }, + "taskExecutionId": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Taskexecutionid" + }, + "taskId": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Taskid" + }, + "threadId": { + "title": "Threadid", + "type": "string" + }, + "threadTopic": { + "title": "Threadtopic", + "type": "string" + }, + "toAgentId": { + "title": "Toagentid", + "type": "string" + } + }, + "required": [ + "id", + "threadId", + "threadTopic", + "runId", + "fromAgentId", + "toAgentId", + "content", + "sequenceNum", + "createdAt" ], - "title": "BatchStatus" + "title": "RunCommunicationMessageDto", + "type": "object" }, - "CohortDetailDto": { + "RunCommunicationThreadDto": { + "additionalProperties": false, "properties": { + "agentAId": { + "title": "Agentaid", + "type": "string" + }, + "agentBId": { + "title": "Agentbid", + "type": "string" + }, + "createdAt": { + "format": "date-time", + "title": "Createdat", + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "messages": { + "items": { + "$ref": "#/components/schemas/RunCommunicationMessageDto" + }, + "title": "Messages", + "type": "array" + }, + "runId": { + "title": "Runid", + "type": "string" + }, "summary": { - "$ref": "#/components/schemas/CohortSummaryDto" + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": 
"Summary" + }, + "taskId": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Taskid" + }, + "topic": { + "title": "Topic", + "type": "string" + }, + "updatedAt": { + "format": "date-time", + "title": "Updatedat", + "type": "string" + } + }, + "required": [ + "id", + "runId", + "topic", + "agentAId", + "agentBId", + "createdAt", + "updatedAt" + ], + "title": "RunCommunicationThreadDto", + "type": "object" + }, + "RunContextEventDto": { + "additionalProperties": false, + "properties": { + "completedAt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Completedat" + }, + "createdAt": { + "title": "Createdat", + "type": "string" + }, + "eventType": { + "title": "Eventtype", + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "payload": { + "additionalProperties": true, + "title": "Payload", + "type": "object" + }, + "sequence": { + "title": "Sequence", + "type": "integer" + }, + "startedAt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Startedat" }, - "runs": { - "items": { - "$ref": "#/components/schemas/CohortRunRowDto" - }, - "type": "array", - "title": "Runs" + "taskExecutionId": { + "title": "Taskexecutionid", + "type": "string" + }, + "taskNodeId": { + "title": "Tasknodeid", + "type": "string" + }, + "workerBindingKey": { + "title": "Workerbindingkey", + "type": "string" } }, - "type": "object", "required": [ - "summary" + "id", + "taskExecutionId", + "taskNodeId", + "workerBindingKey", + "sequence", + "eventType", + "payload", + "createdAt" ], - "title": "CohortDetailDto", - "description": "Full payload for a single cohort detail page." + "title": "RunContextEventDto", + "type": "object" }, - "CohortRunRowDto": { + "RunEvaluationCriterionDto": { + "additionalProperties": false, "properties": { - "run_id": { - "type": "string", - "format": "uuid", - "title": "Run Id" + "criterionDescription": { + "title": "Criteriondescription", + "type": "string" }, - "definition_id": { - "type": "string", - "format": "uuid", - "title": "Definition Id" + "criterionNum": { + "title": "Criterionnum", + "type": "integer" }, - "cohort_id": { - "type": "string", - "format": "uuid", - "title": "Cohort Id" + "criterionType": { + "title": "Criteriontype", + "type": "string" }, - "cohort_name": { - "type": "string", - "title": "Cohort Name" + "error": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Error" }, - "status": { - "type": "string", - "title": "Status" + "evaluatedActionIds": { + "items": { + "type": "string" + }, + "title": "Evaluatedactionids", + "type": "array" }, - "created_at": { - "type": "string", - "format": "date-time", - "title": "Created At" + "evaluatedResourceIds": { + "items": { + "type": "string" + }, + "title": "Evaluatedresourceids", + "type": "array" }, - "started_at": { + "evaluationInput": { "anyOf": [ { - "type": "string", - "format": "date-time" + "type": "string" }, { "type": "null" } ], - "title": "Started At" + "title": "Evaluationinput" }, - "completed_at": { + "feedback": { "anyOf": [ { - "type": "string", - "format": "date-time" + "type": "string" }, { "type": "null" } ], - "title": "Completed At" + "title": "Feedback" }, - "running_time_ms": { + "id": { + "title": "Id", + "type": "string" + }, + "maxScore": { + "title": "Maxscore", + "type": "number" + }, + "score": { + "title": "Score", + "type": "number" + }, + "stageName": { + "title": "Stagename", + "type": "string" + 
}, + "stageNum": { + "title": "Stagenum", + "type": "integer" + } + }, + "required": [ + "id", + "stageNum", + "stageName", + "criterionNum", + "criterionType", + "criterionDescription", + "score", + "maxScore" + ], + "title": "RunEvaluationCriterionDto", + "type": "object" + }, + "RunExecutionAttemptDto": { + "additionalProperties": false, + "properties": { + "agentId": { "anyOf": [ { - "type": "integer" + "type": "string" }, { "type": "null" } ], - "title": "Running Time Ms" + "title": "Agentid" }, - "final_score": { + "agentName": { "anyOf": [ { - "type": "number" + "type": "string" }, { "type": "null" } ], - "title": "Final Score" + "title": "Agentname" }, - "total_tasks": { + "attemptNumber": { + "title": "Attemptnumber", + "type": "integer" + }, + "completedAt": { "anyOf": [ { - "type": "integer" + "format": "date-time", + "type": "string" }, { "type": "null" } ], - "title": "Total Tasks" + "title": "Completedat" }, - "total_cost_usd": { + "errorMessage": { "anyOf": [ { - "type": "number" + "type": "string" }, { "type": "null" } ], - "title": "Total Cost Usd" + "title": "Errormessage" }, - "error_message": { + "evaluationDetails": { "anyOf": [ { - "type": "string" + "additionalProperties": true, + "type": "object" }, { "type": "null" } ], - "title": "Error Message" - } - }, - "type": "object", - "required": [ - "run_id", - "definition_id", - "cohort_id", - "cohort_name", - "status", - "created_at" - ], - "title": "CohortRunRowDto", - "description": "One run inside a cohort detail view." - }, - "CohortStatusCountsDto": { - "properties": { - "pending": { - "type": "integer", - "title": "Pending", - "default": 0 - }, - "executing": { - "type": "integer", - "title": "Executing", - "default": 0 - }, - "evaluating": { - "type": "integer", - "title": "Evaluating", - "default": 0 + "title": "Evaluationdetails" }, - "completed": { - "type": "integer", - "title": "Completed", - "default": 0 + "finalAssistantMessage": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Finalassistantmessage" }, - "failed": { - "type": "integer", - "title": "Failed", - "default": 0 - } - }, - "type": "object", - "title": "CohortStatusCountsDto", - "description": "Aggregate run counts by lifecycle status." 
- }, - "CohortSummaryDto": { - "properties": { - "cohort_id": { - "type": "string", - "format": "uuid", - "title": "Cohort Id" + "id": { + "title": "Id", + "type": "string" }, - "name": { - "type": "string", - "title": "Name" + "outputResourceIds": { + "items": { + "type": "string" + }, + "title": "Outputresourceids", + "type": "array" }, - "description": { + "score": { "anyOf": [ { - "type": "string" + "type": "number" }, { "type": "null" } ], - "title": "Description" + "title": "Score" }, - "created_by": { + "startedAt": { "anyOf": [ { + "format": "date-time", "type": "string" }, { "type": "null" } ], - "title": "Created By" - }, - "created_at": { - "type": "string", - "format": "date-time", - "title": "Created At" + "title": "Startedat" }, "status": { - "type": "string", - "title": "Status" + "title": "Status", + "type": "string" }, - "total_runs": { - "type": "integer", - "title": "Total Runs", - "default": 0 + "taskId": { + "title": "Taskid", + "type": "string" + } + }, + "required": [ + "id", + "taskId", + "attemptNumber", + "status" + ], + "title": "RunExecutionAttemptDto", + "type": "object" + }, + "RunGraphMutationDto": { + "additionalProperties": false, + "description": "One entry in the append-only mutation log for a run.\n\nField names are snake_case to match the frontend GraphMutationDtoSchema.\nCamelModel is intentionally not used here \u2014 the frontend contract uses snake_case.", + "properties": { + "actor": { + "title": "Actor", + "type": "string" }, - "status_counts": { - "$ref": "#/components/schemas/CohortStatusCountsDto" + "created_at": { + "title": "Created At", + "type": "string" }, - "average_score": { - "anyOf": [ + "id": { + "title": "Id", + "type": "string" + }, + "mutation_type": { + "title": "Mutation Type", + "type": "string" + }, + "new_value": { + "discriminator": { + "mapping": { + "annotation.deleted": "#/components/schemas/AnnotationDeletedMutation", + "annotation.set": "#/components/schemas/AnnotationSetMutation", + "edge.added": "#/components/schemas/EdgeAddedMutation", + "edge.removed": "#/components/schemas/EdgeRemovedMutation", + "edge.status_changed": "#/components/schemas/EdgeStatusChangedMutation", + "node.added": "#/components/schemas/NodeAddedMutation", + "node.field_changed": "#/components/schemas/NodeFieldChangedMutation", + "node.removed": "#/components/schemas/NodeRemovedMutation", + "node.status_changed": "#/components/schemas/NodeStatusChangedMutation" + }, + "propertyName": "mutation_type" + }, + "oneOf": [ { - "type": "number" + "$ref": "#/components/schemas/NodeAddedMutation" }, { - "type": "null" - } - ], - "title": "Average Score" - }, - "best_score": { - "anyOf": [ + "$ref": "#/components/schemas/NodeRemovedMutation" + }, { - "type": "number" + "$ref": "#/components/schemas/NodeStatusChangedMutation" }, { - "type": "null" - } - ], - "title": "Best Score" - }, - "worst_score": { - "anyOf": [ + "$ref": "#/components/schemas/NodeFieldChangedMutation" + }, { - "type": "number" + "$ref": "#/components/schemas/EdgeAddedMutation" }, { - "type": "null" + "$ref": "#/components/schemas/EdgeRemovedMutation" + }, + { + "$ref": "#/components/schemas/EdgeStatusChangedMutation" + }, + { + "$ref": "#/components/schemas/AnnotationSetMutation" + }, + { + "$ref": "#/components/schemas/AnnotationDeletedMutation" } ], - "title": "Worst Score" + "title": "New Value" }, - "average_duration_ms": { + "old_value": { "anyOf": [ { - "type": "integer" + "discriminator": { + "mapping": { + "annotation.deleted": "#/components/schemas/AnnotationDeletedMutation", + 
"annotation.set": "#/components/schemas/AnnotationSetMutation", + "edge.added": "#/components/schemas/EdgeAddedMutation", + "edge.removed": "#/components/schemas/EdgeRemovedMutation", + "edge.status_changed": "#/components/schemas/EdgeStatusChangedMutation", + "node.added": "#/components/schemas/NodeAddedMutation", + "node.field_changed": "#/components/schemas/NodeFieldChangedMutation", + "node.removed": "#/components/schemas/NodeRemovedMutation", + "node.status_changed": "#/components/schemas/NodeStatusChangedMutation" + }, + "propertyName": "mutation_type" + }, + "oneOf": [ + { + "$ref": "#/components/schemas/NodeAddedMutation" + }, + { + "$ref": "#/components/schemas/NodeRemovedMutation" + }, + { + "$ref": "#/components/schemas/NodeStatusChangedMutation" + }, + { + "$ref": "#/components/schemas/NodeFieldChangedMutation" + }, + { + "$ref": "#/components/schemas/EdgeAddedMutation" + }, + { + "$ref": "#/components/schemas/EdgeRemovedMutation" + }, + { + "$ref": "#/components/schemas/EdgeStatusChangedMutation" + }, + { + "$ref": "#/components/schemas/AnnotationSetMutation" + }, + { + "$ref": "#/components/schemas/AnnotationDeletedMutation" + } + ] }, { "type": "null" } ], - "title": "Average Duration Ms" - }, - "failure_rate": { - "type": "number", - "title": "Failure Rate", - "default": 0.0 + "title": "Old Value" }, - "stats_updated_at": { + "reason": { "anyOf": [ { - "type": "string", - "format": "date-time" + "type": "string" }, { "type": "null" } ], - "title": "Stats Updated At" - } - }, - "type": "object", - "required": [ - "cohort_id", - "name", - "created_at", - "status" - ], - "title": "CohortSummaryDto", - "description": "Summary row for cohort list and live updates." - }, - "EdgeAddedMutation": { - "properties": { - "mutation_type": { - "type": "string", - "const": "edge.added", - "title": "Mutation Type", - "default": "edge.added" - }, - "source_node_id": { - "type": "string", - "title": "Source Node Id" - }, - "target_node_id": { - "type": "string", - "title": "Target Node Id" - }, - "status": { - "type": "string", - "title": "Status" - } - }, - "type": "object", - "required": [ - "source_node_id", - "target_node_id", - "status" - ], - "title": "EdgeAddedMutation", - "description": "edge.added \u2014 full edge snapshot." - }, - "EdgeRemovedMutation": { - "properties": { - "mutation_type": { - "type": "string", - "const": "edge.removed", - "title": "Mutation Type", - "default": "edge.removed" - }, - "source_node_id": { - "type": "string", - "title": "Source Node Id" + "title": "Reason" }, - "target_node_id": { - "type": "string", - "title": "Target Node Id" + "run_id": { + "title": "Run Id", + "type": "string" }, - "status": { - "type": "string", - "title": "Status" - } - }, - "type": "object", - "required": [ - "source_node_id", - "target_node_id", - "status" - ], - "title": "EdgeRemovedMutation", - "description": "edge.removed." - }, - "EdgeStatusChangedMutation": { - "properties": { - "mutation_type": { - "type": "string", - "const": "edge.status_changed", - "title": "Mutation Type", - "default": "edge.status_changed" + "sequence": { + "title": "Sequence", + "type": "integer" }, - "status": { - "type": "string", - "title": "Status" - } - }, - "type": "object", - "required": [ - "status" - ], - "title": "EdgeStatusChangedMutation", - "description": "edge.status_changed." 
- }, - "EpisodeFailure": { - "properties": { - "run_id": { - "type": "string", - "format": "uuid", - "title": "Run Id" + "target_id": { + "title": "Target Id", + "type": "string" }, - "error": { - "type": "string", - "title": "Error" + "target_type": { + "title": "Target Type", + "type": "string" } }, - "type": "object", "required": [ + "id", "run_id", - "error" - ], - "title": "EpisodeFailure", - "description": "An episode that didn't complete successfully." - }, - "ExperimentCohortStatus": { - "type": "string", - "enum": [ - "active", - "archived" + "sequence", + "mutation_type", + "target_type", + "target_id", + "actor", + "old_value", + "new_value", + "reason", + "created_at" ], - "title": "ExperimentCohortStatus" - }, - "HTTPValidationError": { - "properties": { - "detail": { - "items": { - "$ref": "#/components/schemas/ValidationError" - }, - "type": "array", - "title": "Detail" - } - }, - "type": "object", - "title": "HTTPValidationError" - }, - "JsonObject": { - "additionalProperties": { - "$ref": "#/components/schemas/JsonValue" - }, + "title": "RunGraphMutationDto", "type": "object" }, - "JsonScalar": { - "anyOf": [ - { + "RunResourceDto": { + "additionalProperties": false, + "properties": { + "createdAt": { + "format": "date-time", + "title": "Createdat", "type": "string" }, - { - "type": "integer" + "filePath": { + "title": "Filepath", + "type": "string" }, - { - "type": "number" + "id": { + "title": "Id", + "type": "string" }, - { - "type": "boolean" + "mimeType": { + "title": "Mimetype", + "type": "string" }, - { - "type": "null" - } - ] - }, - "JsonValue": { - "anyOf": [ - { - "$ref": "#/components/schemas/JsonScalar" + "name": { + "title": "Name", + "type": "string" }, - { - "items": { - "$ref": "#/components/schemas/JsonValue" - }, - "type": "array" + "sizeBytes": { + "title": "Sizebytes", + "type": "integer" }, - { - "additionalProperties": { - "$ref": "#/components/schemas/JsonValue" - }, - "type": "object" + "taskExecutionId": { + "title": "Taskexecutionid", + "type": "string" + }, + "taskId": { + "title": "Taskid", + "type": "string" } - ] + }, + "required": [ + "id", + "taskId", + "taskExecutionId", + "name", + "mimeType", + "filePath", + "sizeBytes", + "createdAt" + ], + "title": "RunResourceDto", + "type": "object" }, - "NodeAddedMutation": { + "RunSandboxCommandDto": { + "additionalProperties": false, "properties": { - "mutation_type": { - "type": "string", - "const": "node.added", - "title": "Mutation Type", - "default": "node.added" - }, - "task_slug": { - "type": "string", - "title": "Task Slug" + "command": { + "title": "Command", + "type": "string" }, - "instance_key": { - "type": "string", - "title": "Instance Key" + "durationMs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Durationms" }, - "description": { - "type": "string", - "title": "Description" + "exitCode": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Exitcode" }, - "status": { - "type": "string", - "title": "Status" + "stderr": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Stderr" }, - "assigned_worker_slug": { + "stdout": { "anyOf": [ { "type": "string" @@ -1232,81 +2054,72 @@ "type": "null" } ], - "title": "Assigned Worker Slug" + "title": "Stdout" + }, + "timestamp": { + "format": "date-time", + "title": "Timestamp", + "type": "string" } }, - "type": "object", "required": [ - "task_slug", - "instance_key", - "description", - "status", - "assigned_worker_slug" + "command", + 
"timestamp" ], - "title": "NodeAddedMutation", - "description": "node.added \u2014 full node snapshot." + "title": "RunSandboxCommandDto", + "type": "object" }, - "NodeFieldChangedMutation": { + "RunSandboxDto": { + "additionalProperties": false, "properties": { - "mutation_type": { - "type": "string", - "const": "node.field_changed", - "title": "Mutation Type", - "default": "node.field_changed" - }, - "field": { - "type": "string", - "enum": [ - "description", - "assigned_worker_slug" + "closeReason": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } ], - "title": "Field" + "title": "Closereason" }, - "value": { + "closedAt": { "anyOf": [ { + "format": "date-time", "type": "string" }, { "type": "null" } ], - "title": "Value" - } - }, - "type": "object", - "required": [ - "field", - "value" - ], - "title": "NodeFieldChangedMutation", - "description": "node.field_changed." - }, - "NodeRemovedMutation": { - "properties": { - "mutation_type": { - "type": "string", - "const": "node.removed", - "title": "Mutation Type", - "default": "node.removed" + "title": "Closedat" }, - "task_slug": { - "type": "string", - "title": "Task Slug" + "commands": { + "items": { + "$ref": "#/components/schemas/RunSandboxCommandDto" + }, + "title": "Commands", + "type": "array" }, - "instance_key": { - "type": "string", - "title": "Instance Key" + "createdAt": { + "format": "date-time", + "title": "Createdat", + "type": "string" }, - "description": { - "type": "string", - "title": "Description" + "sandboxId": { + "title": "Sandboxid", + "type": "string" }, "status": { - "type": "string", - "title": "Status" + "title": "Status", + "type": "string" }, - "assigned_worker_slug": { + "taskId": { + "title": "Taskid", + "type": "string" + }, + "template": { "anyOf": [ { "type": "string" @@ -1315,113 +2128,70 @@ "type": "null" } ], - "title": "Assigned Worker Slug" - } - }, - "type": "object", - "required": [ - "task_slug", - "instance_key", - "description", - "status", - "assigned_worker_slug" - ], - "title": "NodeRemovedMutation", - "description": "node.removed \u2014 node snapshot at removal time." - }, - "NodeStatusChangedMutation": { - "properties": { - "mutation_type": { - "type": "string", - "const": "node.status_changed", - "title": "Mutation Type", - "default": "node.status_changed" + "title": "Template" }, - "status": { - "type": "string", - "title": "Status" + "timeoutMinutes": { + "title": "Timeoutminutes", + "type": "integer" } }, - "type": "object", "required": [ - "status" + "sandboxId", + "taskId", + "timeoutMinutes", + "status", + "createdAt" ], - "title": "NodeStatusChangedMutation", - "description": "node.status_changed." 
+ "title": "RunSandboxDto", + "type": "object" }, - "PollResponse": { + "RunSnapshotDto": { + "additionalProperties": false, "properties": { - "batch_id": { - "type": "string", - "format": "uuid", - "title": "Batch Id" - }, - "status": { - "$ref": "#/components/schemas/BatchStatus" - }, - "completed": { - "type": "integer", - "title": "Completed", - "default": 0 + "cancelledTasks": { + "default": 0, + "title": "Cancelledtasks", + "type": "integer" }, - "total": { - "type": "integer", - "title": "Total", - "default": 0 + "completedAt": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Completedat" }, - "trajectories": { - "items": { - "$ref": "#/components/schemas/Trajectory" - }, - "type": "array", - "title": "Trajectories" + "completedTasks": { + "default": 0, + "title": "Completedtasks", + "type": "integer" }, - "failures": { - "items": { - "$ref": "#/components/schemas/EpisodeFailure" + "contextEventsByTask": { + "additionalProperties": { + "items": { + "$ref": "#/components/schemas/RunContextEventDto" + }, + "type": "array" }, - "type": "array", - "title": "Failures" - } - }, - "type": "object", - "required": [ - "batch_id", - "status" - ], - "title": "PollResponse", - "description": "Ergon \u2192 Trainer: current batch status + trajectories if complete." - }, - "RunCommunicationMessageDto": { - "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "threadId": { - "type": "string", - "title": "Threadid" - }, - "threadTopic": { - "type": "string", - "title": "Threadtopic" - }, - "runId": { - "type": "string", - "title": "Runid" + "title": "Contexteventsbytask", + "type": "object" }, - "taskId": { + "durationSeconds": { "anyOf": [ { - "type": "string" + "type": "number" }, { "type": "null" } ], - "title": "Taskid" + "title": "Durationseconds" }, - "taskExecutionId": { + "error": { "anyOf": [ { "type": "string" @@ -1430,158 +2200,84 @@ "type": "null" } ], - "title": "Taskexecutionid" - }, - "fromAgentId": { - "type": "string", - "title": "Fromagentid" - }, - "toAgentId": { - "type": "string", - "title": "Toagentid" - }, - "content": { - "type": "string", - "title": "Content" - }, - "sequenceNum": { - "type": "integer", - "title": "Sequencenum" - }, - "createdAt": { - "type": "string", - "format": "date-time", - "title": "Createdat" - } - }, - "additionalProperties": false, - "type": "object", - "required": [ - "id", - "threadId", - "threadTopic", - "runId", - "fromAgentId", - "toAgentId", - "content", - "sequenceNum", - "createdAt" - ], - "title": "RunCommunicationMessageDto" - }, - "RunCommunicationThreadDto": { - "properties": { - "id": { - "type": "string", - "title": "Id" + "title": "Error" }, - "runId": { - "type": "string", - "title": "Runid" + "evaluationsByTask": { + "additionalProperties": { + "$ref": "#/components/schemas/RunTaskEvaluationDto" + }, + "title": "Evaluationsbytask", + "type": "object" }, - "taskId": { - "anyOf": [ - { - "type": "string" + "executionsByTask": { + "additionalProperties": { + "items": { + "$ref": "#/components/schemas/RunExecutionAttemptDto" }, - { - "type": "null" - } - ], - "title": "Taskid" + "type": "array" + }, + "title": "Executionsbytask", + "type": "object" }, - "topic": { - "type": "string", - "title": "Topic" + "experimentId": { + "title": "Experimentid", + "type": "string" }, - "summary": { + "failedTasks": { + "default": 0, + "title": "Failedtasks", + "type": "integer" + }, + "finalScore": { "anyOf": [ { - "type": "string" + "type": "number" }, { "type": "null" } ], 
- "title": "Summary" - }, - "agentAId": { - "type": "string", - "title": "Agentaid" - }, - "agentBId": { - "type": "string", - "title": "Agentbid" - }, - "createdAt": { - "type": "string", - "format": "date-time", - "title": "Createdat" - }, - "updatedAt": { - "type": "string", - "format": "date-time", - "title": "Updatedat" + "title": "Finalscore" }, - "messages": { - "items": { - "$ref": "#/components/schemas/RunCommunicationMessageDto" - }, - "type": "array", - "title": "Messages" - } - }, - "additionalProperties": false, - "type": "object", - "required": [ - "id", - "runId", - "topic", - "agentAId", - "agentBId", - "createdAt", - "updatedAt" - ], - "title": "RunCommunicationThreadDto" - }, - "RunContextEventDto": { - "properties": { "id": { - "type": "string", - "title": "Id" - }, - "taskExecutionId": { - "type": "string", - "title": "Taskexecutionid" - }, - "taskNodeId": { - "type": "string", - "title": "Tasknodeid" + "title": "Id", + "type": "string" }, - "workerBindingKey": { - "type": "string", - "title": "Workerbindingkey" + "name": { + "title": "Name", + "type": "string" }, - "sequence": { - "type": "integer", - "title": "Sequence" + "resourcesByTask": { + "additionalProperties": { + "items": { + "$ref": "#/components/schemas/RunResourceDto" + }, + "type": "array" + }, + "title": "Resourcesbytask", + "type": "object" }, - "eventType": { - "type": "string", - "title": "Eventtype" + "rootTaskId": { + "default": "", + "title": "Roottaskid", + "type": "string" }, - "payload": { - "additionalProperties": true, - "type": "object", - "title": "Payload" + "runningTasks": { + "default": 0, + "title": "Runningtasks", + "type": "integer" }, - "createdAt": { - "type": "string", - "title": "Createdat" + "sandboxesByTask": { + "additionalProperties": { + "$ref": "#/components/schemas/RunSandboxDto" + }, + "title": "Sandboxesbytask", + "type": "object" }, "startedAt": { "anyOf": [ { + "format": "date-time", "type": "string" }, { @@ -1590,59 +2286,48 @@ ], "title": "Startedat" }, - "completedAt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Completedat" + "status": { + "title": "Status", + "type": "string" + }, + "tasks": { + "additionalProperties": { + "$ref": "#/components/schemas/RunTaskDto" + }, + "title": "Tasks", + "type": "object" + }, + "threads": { + "items": { + "$ref": "#/components/schemas/RunCommunicationThreadDto" + }, + "title": "Threads", + "type": "array" + }, + "totalLeafTasks": { + "default": 0, + "title": "Totalleaftasks", + "type": "integer" + }, + "totalTasks": { + "default": 0, + "title": "Totaltasks", + "type": "integer" } }, - "additionalProperties": false, - "type": "object", "required": [ "id", - "taskExecutionId", - "taskNodeId", - "workerBindingKey", - "sequence", - "eventType", - "payload", - "createdAt" + "experimentId", + "name", + "status" ], - "title": "RunContextEventDto" + "title": "RunSnapshotDto", + "type": "object" }, - "RunEvaluationCriterionDto": { + "RunTaskDto": { + "additionalProperties": false, "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "stageNum": { - "type": "integer", - "title": "Stagenum" - }, - "stageName": { - "type": "string", - "title": "Stagename" - }, - "criterionNum": { - "type": "integer", - "title": "Criterionnum" - }, - "criterionType": { - "type": "string", - "title": "Criteriontype" - }, - "criterionDescription": { - "type": "string", - "title": "Criteriondescription" - }, - "evaluationInput": { + "assignedWorkerId": { "anyOf": [ { "type": "string" @@ -1651,17 +2336,9 @@ 
"type": "null" } ], - "title": "Evaluationinput" - }, - "score": { - "type": "number", - "title": "Score" - }, - "maxScore": { - "type": "number", - "title": "Maxscore" + "title": "Assignedworkerid" }, - "feedback": { + "assignedWorkerName": { "anyOf": [ { "type": "string" @@ -1670,103 +2347,109 @@ "type": "null" } ], - "title": "Feedback" - }, - "evaluatedActionIds": { - "items": { - "type": "string" - }, - "type": "array", - "title": "Evaluatedactionids" + "title": "Assignedworkername" }, - "evaluatedResourceIds": { + "childIds": { "items": { "type": "string" }, - "type": "array", - "title": "Evaluatedresourceids" + "title": "Childids", + "type": "array" }, - "error": { + "completedAt": { "anyOf": [ { - "additionalProperties": true, - "type": "object" + "format": "date-time", + "type": "string" }, { "type": "null" } ], - "title": "Error" - } - }, - "additionalProperties": false, - "type": "object", - "required": [ - "id", - "stageNum", - "stageName", - "criterionNum", - "criterionType", - "criterionDescription", - "score", - "maxScore" - ], - "title": "RunEvaluationCriterionDto" - }, - "RunExecutionAttemptDto": { - "properties": { - "id": { - "type": "string", - "title": "Id" + "title": "Completedat" }, - "taskId": { - "type": "string", - "title": "Taskid" + "dependsOnIds": { + "items": { + "type": "string" + }, + "title": "Dependsonids", + "type": "array" }, - "attemptNumber": { - "type": "integer", - "title": "Attemptnumber" + "description": { + "title": "Description", + "type": "string" }, - "status": { - "type": "string", - "title": "Status" + "id": { + "title": "Id", + "type": "string" }, - "startedAt": { - "anyOf": [ - { - "type": "string", - "format": "date-time" - }, - { - "type": "null" - } - ], - "title": "Startedat" + "isLeaf": { + "title": "Isleaf", + "type": "boolean" }, - "completedAt": { + "level": { + "title": "Level", + "type": "integer" + }, + "name": { + "title": "Name", + "type": "string" + }, + "parentId": { "anyOf": [ { - "type": "string", - "format": "date-time" + "type": "string" }, { "type": "null" } ], - "title": "Completedat" + "title": "Parentid" }, - "finalAssistantMessage": { + "startedAt": { "anyOf": [ { + "format": "date-time", "type": "string" }, { "type": "null" } ], - "title": "Finalassistantmessage" + "title": "Startedat" + }, + "status": { + "title": "Status", + "type": "string" + } + }, + "required": [ + "id", + "name", + "description", + "status", + "isLeaf", + "level" + ], + "title": "RunTaskDto", + "type": "object" + }, + "RunTaskEvaluationDto": { + "additionalProperties": false, + "properties": { + "createdAt": { + "format": "date-time", + "title": "Createdat", + "type": "string" + }, + "criterionResults": { + "items": { + "$ref": "#/components/schemas/RunEvaluationCriterionDto" + }, + "title": "Criterionresults", + "type": "array" }, - "errorMessage": { + "failedGate": { "anyOf": [ { "type": "string" @@ -1775,20 +2458,33 @@ "type": "null" } ], - "title": "Errormessage" + "title": "Failedgate" }, - "score": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "title": "Score" + "id": { + "title": "Id", + "type": "string" }, - "agentId": { + "maxScore": { + "title": "Maxscore", + "type": "number" + }, + "normalizedScore": { + "title": "Normalizedscore", + "type": "number" + }, + "runId": { + "title": "Runid", + "type": "string" + }, + "stagesEvaluated": { + "title": "Stagesevaluated", + "type": "integer" + }, + "stagesPassed": { + "title": "Stagespassed", + "type": "integer" + }, + "taskId": { "anyOf": [ { "type": "string" @@ 
-1797,9 +2493,35 @@ "type": "null" } ], - "title": "Agentid" + "title": "Taskid" }, - "agentName": { + "totalScore": { + "title": "Totalscore", + "type": "number" + } + }, + "required": [ + "id", + "runId", + "totalScore", + "maxScore", + "normalizedScore", + "stagesEvaluated", + "stagesPassed", + "createdAt" + ], + "title": "RunTaskEvaluationDto", + "type": "object" + }, + "SubmitRequest": { + "description": "Trainer \u2192 Ergon: start a batch of episodes.", + "properties": { + "definition_id": { + "format": "uuid", + "title": "Definition Id", + "type": "string" + }, + "model_target_override": { "anyOf": [ { "type": "string" @@ -1808,168 +2530,75 @@ "type": "null" } ], - "title": "Agentname" + "title": "Model Target Override" }, - "evaluationDetails": { + "num_episodes": { + "minimum": 1.0, + "title": "Num Episodes", + "type": "integer" + }, + "policy_version": { "anyOf": [ { - "additionalProperties": true, - "type": "object" + "type": "integer" }, { "type": "null" } ], - "title": "Evaluationdetails" + "title": "Policy Version" + } + }, + "required": [ + "definition_id", + "num_episodes" + ], + "title": "SubmitRequest", + "type": "object" + }, + "SubmitResponse": { + "description": "Ergon \u2192 Trainer: batch accepted.", + "properties": { + "batch_id": { + "format": "uuid", + "title": "Batch Id", + "type": "string" }, - "outputResourceIds": { + "run_ids": { "items": { + "format": "uuid", "type": "string" }, - "type": "array", - "title": "Outputresourceids" + "title": "Run Ids", + "type": "array" + }, + "status": { + "$ref": "#/components/schemas/BatchStatus", + "default": "pending" } }, - "additionalProperties": false, - "type": "object", "required": [ - "id", - "taskId", - "attemptNumber", - "status" + "batch_id", + "run_ids" ], - "title": "RunExecutionAttemptDto" + "title": "SubmitResponse", + "type": "object" }, - "RunGraphMutationDto": { + "TrainingCurvePointDto": { + "additionalProperties": false, "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "run_id": { - "type": "string", - "title": "Run Id" - }, - "sequence": { - "type": "integer", - "title": "Sequence" - }, - "mutation_type": { - "type": "string", - "title": "Mutation Type" - }, - "target_type": { - "type": "string", - "title": "Target Type" - }, - "target_id": { - "type": "string", - "title": "Target Id" - }, - "actor": { - "type": "string", - "title": "Actor" - }, - "old_value": { + "benchmarkType": { "anyOf": [ { - "oneOf": [ - { - "$ref": "#/components/schemas/NodeAddedMutation" - }, - { - "$ref": "#/components/schemas/NodeRemovedMutation" - }, - { - "$ref": "#/components/schemas/NodeStatusChangedMutation" - }, - { - "$ref": "#/components/schemas/NodeFieldChangedMutation" - }, - { - "$ref": "#/components/schemas/EdgeAddedMutation" - }, - { - "$ref": "#/components/schemas/EdgeRemovedMutation" - }, - { - "$ref": "#/components/schemas/EdgeStatusChangedMutation" - }, - { - "$ref": "#/components/schemas/AnnotationSetMutation" - }, - { - "$ref": "#/components/schemas/AnnotationDeletedMutation" - } - ], - "discriminator": { - "propertyName": "mutation_type", - "mapping": { - "annotation.deleted": "#/components/schemas/AnnotationDeletedMutation", - "annotation.set": "#/components/schemas/AnnotationSetMutation", - "edge.added": "#/components/schemas/EdgeAddedMutation", - "edge.removed": "#/components/schemas/EdgeRemovedMutation", - "edge.status_changed": "#/components/schemas/EdgeStatusChangedMutation", - "node.added": "#/components/schemas/NodeAddedMutation", - "node.field_changed": 
"#/components/schemas/NodeFieldChangedMutation", - "node.removed": "#/components/schemas/NodeRemovedMutation", - "node.status_changed": "#/components/schemas/NodeStatusChangedMutation" - } - } + "type": "string" }, { "type": "null" } ], - "title": "Old Value" - }, - "new_value": { - "oneOf": [ - { - "$ref": "#/components/schemas/NodeAddedMutation" - }, - { - "$ref": "#/components/schemas/NodeRemovedMutation" - }, - { - "$ref": "#/components/schemas/NodeStatusChangedMutation" - }, - { - "$ref": "#/components/schemas/NodeFieldChangedMutation" - }, - { - "$ref": "#/components/schemas/EdgeAddedMutation" - }, - { - "$ref": "#/components/schemas/EdgeRemovedMutation" - }, - { - "$ref": "#/components/schemas/EdgeStatusChangedMutation" - }, - { - "$ref": "#/components/schemas/AnnotationSetMutation" - }, - { - "$ref": "#/components/schemas/AnnotationDeletedMutation" - } - ], - "title": "New Value", - "discriminator": { - "propertyName": "mutation_type", - "mapping": { - "annotation.deleted": "#/components/schemas/AnnotationDeletedMutation", - "annotation.set": "#/components/schemas/AnnotationSetMutation", - "edge.added": "#/components/schemas/EdgeAddedMutation", - "edge.removed": "#/components/schemas/EdgeRemovedMutation", - "edge.status_changed": "#/components/schemas/EdgeStatusChangedMutation", - "node.added": "#/components/schemas/NodeAddedMutation", - "node.field_changed": "#/components/schemas/NodeFieldChangedMutation", - "node.removed": "#/components/schemas/NodeRemovedMutation", - "node.status_changed": "#/components/schemas/NodeStatusChangedMutation" - } - } + "title": "Benchmarktype" }, - "reason": { + "createdAt": { "anyOf": [ { "type": "string" @@ -1978,1055 +2607,1247 @@ "type": "null" } ], - "title": "Reason" - }, - "created_at": { - "type": "string", - "title": "Created At" - } - }, - "additionalProperties": false, - "type": "object", - "required": [ - "id", - "run_id", - "sequence", - "mutation_type", - "target_type", - "target_id", - "actor", - "old_value", - "new_value", - "reason", - "created_at" - ], - "title": "RunGraphMutationDto", - "description": "One entry in the append-only mutation log for a run.\n\nField names are snake_case to match the frontend GraphMutationDtoSchema.\nCamelModel is intentionally not used here \u2014 the frontend contract uses snake_case." 
- }, - "RunResourceDto": { - "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "taskId": { - "type": "string", - "title": "Taskid" - }, - "taskExecutionId": { - "type": "string", - "title": "Taskexecutionid" - }, - "name": { - "type": "string", - "title": "Name" - }, - "mimeType": { - "type": "string", - "title": "Mimetype" + "title": "Createdat" }, - "filePath": { - "type": "string", - "title": "Filepath" + "meanScore": { + "title": "Meanscore", + "type": "number" }, - "sizeBytes": { - "type": "integer", - "title": "Sizebytes" + "runId": { + "title": "Runid", + "type": "string" }, - "createdAt": { - "type": "string", - "format": "date-time", - "title": "Createdat" + "step": { + "title": "Step", + "type": "integer" } }, - "additionalProperties": false, - "type": "object", "required": [ - "id", - "taskId", - "taskExecutionId", - "name", - "mimeType", - "filePath", - "sizeBytes", - "createdAt" + "runId", + "step", + "meanScore" ], - "title": "RunResourceDto" + "title": "TrainingCurvePointDto", + "type": "object" }, - "RunSandboxCommandDto": { + "TrainingMetricDto": { + "additionalProperties": false, "properties": { - "command": { - "type": "string", - "title": "Command" - }, - "stdout": { + "completionMeanLength": { "anyOf": [ { - "type": "string" + "type": "number" }, { "type": "null" } ], - "title": "Stdout" + "title": "Completionmeanlength" }, - "stderr": { + "entropy": { "anyOf": [ { - "type": "string" + "type": "number" }, { "type": "null" } ], - "title": "Stderr" + "title": "Entropy" }, - "exitCode": { + "epoch": { "anyOf": [ { - "type": "integer" + "type": "number" }, { "type": "null" } ], - "title": "Exitcode" + "title": "Epoch" }, - "durationMs": { + "gradNorm": { "anyOf": [ { - "type": "integer" + "type": "number" }, { "type": "null" } ], - "title": "Durationms" - }, - "timestamp": { - "type": "string", - "format": "date-time", - "title": "Timestamp" - } - }, - "additionalProperties": false, - "type": "object", - "required": [ - "command", - "timestamp" - ], - "title": "RunSandboxCommandDto" - }, - "RunSandboxDto": { - "properties": { - "sandboxId": { - "type": "string", - "title": "Sandboxid" - }, - "taskId": { - "type": "string", - "title": "Taskid" + "title": "Gradnorm" }, - "template": { + "learningRate": { "anyOf": [ { - "type": "string" + "type": "number" }, { "type": "null" } ], - "title": "Template" - }, - "timeoutMinutes": { - "type": "integer", - "title": "Timeoutminutes" - }, - "status": { - "type": "string", - "title": "Status" + "title": "Learningrate" }, - "createdAt": { - "type": "string", - "format": "date-time", - "title": "Createdat" + "loss": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Loss" }, - "closedAt": { + "rewardMean": { "anyOf": [ { - "type": "string", - "format": "date-time" + "type": "number" }, { "type": "null" } ], - "title": "Closedat" + "title": "Rewardmean" }, - "closeReason": { + "rewardStd": { "anyOf": [ { - "type": "string" + "type": "number" }, { "type": "null" } ], - "title": "Closereason" + "title": "Rewardstd" }, - "commands": { - "items": { - "$ref": "#/components/schemas/RunSandboxCommandDto" - }, - "type": "array", - "title": "Commands" + "step": { + "title": "Step", + "type": "integer" + }, + "stepTimeS": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Steptimes" } }, - "additionalProperties": false, - "type": "object", "required": [ - "sandboxId", - "taskId", - "timeoutMinutes", - "status", - "createdAt" + "step" ], - "title": 
"RunSandboxDto" + "title": "TrainingMetricDto", + "type": "object" }, - "RunSnapshotDto": { + "TrainingSessionDto": { + "additionalProperties": false, "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "experimentId": { - "type": "string", - "title": "Experimentid" - }, - "name": { - "type": "string", - "title": "Name" - }, - "status": { - "type": "string", - "title": "Status" - }, - "tasks": { - "additionalProperties": { - "$ref": "#/components/schemas/RunTaskDto" - }, - "type": "object", - "title": "Tasks" - }, - "rootTaskId": { - "type": "string", - "title": "Roottaskid", - "default": "" - }, - "resourcesByTask": { - "additionalProperties": { - "items": { - "$ref": "#/components/schemas/RunResourceDto" - }, - "type": "array" - }, - "type": "object", - "title": "Resourcesbytask" - }, - "executionsByTask": { - "additionalProperties": { - "items": { - "$ref": "#/components/schemas/RunExecutionAttemptDto" + "completedAt": { + "anyOf": [ + { + "type": "string" }, - "type": "array" - }, - "type": "object", - "title": "Executionsbytask" - }, - "evaluationsByTask": { - "additionalProperties": { - "$ref": "#/components/schemas/RunTaskEvaluationDto" - }, - "type": "object", - "title": "Evaluationsbytask" + { + "type": "null" + } + ], + "title": "Completedat" }, - "sandboxesByTask": { - "additionalProperties": { - "$ref": "#/components/schemas/RunSandboxDto" - }, - "type": "object", - "title": "Sandboxesbytask" + "experimentDefinitionId": { + "title": "Experimentdefinitionid", + "type": "string" }, - "contextEventsByTask": { - "additionalProperties": { - "items": { - "$ref": "#/components/schemas/RunContextEventDto" + "finalLoss": { + "anyOf": [ + { + "type": "number" }, - "type": "array" - }, - "type": "object", - "title": "Contexteventsbytask" + { + "type": "null" + } + ], + "title": "Finalloss" }, - "threads": { - "items": { - "$ref": "#/components/schemas/RunCommunicationThreadDto" - }, - "type": "array", - "title": "Threads" + "id": { + "title": "Id", + "type": "string" }, - "startedAt": { + "modelName": { + "title": "Modelname", + "type": "string" + }, + "outputDir": { "anyOf": [ { - "type": "string", - "format": "date-time" + "type": "string" }, { "type": "null" } ], - "title": "Startedat" + "title": "Outputdir" }, - "completedAt": { + "startedAt": { "anyOf": [ { - "type": "string", - "format": "date-time" + "type": "string" }, { "type": "null" } ], - "title": "Completedat" + "title": "Startedat" }, - "durationSeconds": { + "status": { + "title": "Status", + "type": "string" + }, + "totalSteps": { "anyOf": [ { - "type": "number" + "type": "integer" }, { "type": "null" } ], - "title": "Durationseconds" - }, - "totalTasks": { - "type": "integer", - "title": "Totaltasks", - "default": 0 + "title": "Totalsteps" + } + }, + "required": [ + "id", + "experimentDefinitionId", + "modelName", + "status" + ], + "title": "TrainingSessionDto", + "type": "object" + }, + "Trajectory": { + "description": "One agent's extracted trajectory from a completed episode.\n\nMaps 1:1 to AgentTrajectory from extraction.py, plus metadata.", + "properties": { + "agent_id": { + "title": "Agent Id", + "type": "string" }, - "totalLeafTasks": { - "type": "integer", - "title": "Totalleaftasks", - "default": 0 + "completion_ids": { + "items": { + "type": "integer" + }, + "title": "Completion Ids", + "type": "array" }, - "completedTasks": { - "type": "integer", - "title": "Completedtasks", - "default": 0 + "env_mask": { + "items": { + "type": "integer" + }, + "title": "Env Mask", + "type": "array" }, - 
"failedTasks": { - "type": "integer", - "title": "Failedtasks", - "default": 0 + "logprobs": { + "items": { + "type": "number" + }, + "title": "Logprobs", + "type": "array" }, - "runningTasks": { - "type": "integer", - "title": "Runningtasks", - "default": 0 + "num_turns": { + "title": "Num Turns", + "type": "integer" }, - "cancelledTasks": { - "type": "integer", - "title": "Cancelledtasks", - "default": 0 + "prompt_ids": { + "items": { + "type": "integer" + }, + "title": "Prompt Ids", + "type": "array" }, - "finalScore": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "title": "Finalscore" + "reward": { + "title": "Reward", + "type": "number" }, - "error": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Error" + "run_id": { + "format": "uuid", + "title": "Run Id", + "type": "string" + } + }, + "required": [ + "run_id", + "agent_id", + "prompt_ids", + "completion_ids", + "logprobs", + "env_mask", + "reward", + "num_turns" + ], + "title": "Trajectory", + "type": "object" + }, + "UpdateCohortRequest": { + "description": "Mutable cohort fields exposed through the operator API.", + "properties": { + "status": { + "$ref": "#/components/schemas/ExperimentCohortStatus" } }, - "additionalProperties": false, - "type": "object", "required": [ - "id", - "experimentId", - "name", "status" ], - "title": "RunSnapshotDto" + "title": "UpdateCohortRequest", + "type": "object" }, - "RunTaskDto": { + "ValidationError": { "properties": { - "id": { - "type": "string", - "title": "Id" + "ctx": { + "title": "Context", + "type": "object" }, - "name": { - "type": "string", - "title": "Name" + "input": { + "title": "Input" }, - "description": { - "type": "string", - "title": "Description" + "loc": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + }, + "title": "Location", + "type": "array" }, - "status": { - "type": "string", - "title": "Status" + "msg": { + "title": "Message", + "type": "string" }, - "parentId": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Parentid" + "type": { + "title": "Error Type", + "type": "string" + } + }, + "required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError", + "type": "object" + }, + "WeightSyncRequest": { + "description": "Trainer \u2192 Ergon: restart vLLM with updated checkpoint.\n\nFor full-weight RFT: Ergon kills the vLLM process and restarts it\nwith --model pointing to checkpoint_path. 
Blocks until healthy.", + "properties": { + "checkpoint_path": { + "title": "Checkpoint Path", + "type": "string" }, - "childIds": { - "items": { - "type": "string" + "model_name": { + "title": "Model Name", + "type": "string" + } + }, + "required": [ + "checkpoint_path", + "model_name" + ], + "title": "WeightSyncRequest", + "type": "object" + }, + "WeightSyncResponse": { + "description": "Ergon \u2192 Trainer: sync result.", + "properties": { + "success": { + "title": "Success", + "type": "boolean" + }, + "vllm_model_loaded": { + "title": "Vllm Model Loaded", + "type": "string" + } + }, + "required": [ + "success", + "vllm_model_loaded" + ], + "title": "WeightSyncResponse", + "type": "object" + } + } + }, + "info": { + "description": "Ergon experiment orchestration API", + "title": "Ergon Core", + "version": "0.1.0" + }, + "openapi": "3.1.0", + "paths": { + "/api/inngest": { + "get": { + "operationId": "get_api_inngest_api_inngest_get", + "responses": { + "200": { + "content": { + "application/json": { + "schema": {} + } + }, + "description": "Successful Response" + } + }, + "summary": "Get Api Inngest" + }, + "post": { + "operationId": "post_inngest_api_api_inngest_post", + "responses": { + "200": { + "content": { + "application/json": { + "schema": {} + } + }, + "description": "Successful Response" + } + }, + "summary": "Post Inngest Api" + }, + "put": { + "operationId": "put_inngest_api_api_inngest_put", + "responses": { + "200": { + "content": { + "application/json": { + "schema": {} + } + }, + "description": "Successful Response" + } + }, + "summary": "Put Inngest Api" + } + }, + "/cohorts": { + "get": { + "description": "List all experiment cohorts.", + "operationId": "list_cohorts_cohorts_get", + "parameters": [ + { + "in": "query", + "name": "include_archived", + "required": false, + "schema": { + "default": false, + "title": "Include Archived", + "type": "boolean" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "items": { + "$ref": "#/components/schemas/CohortSummaryDto" + }, + "title": "Response List Cohorts Cohorts Get", + "type": "array" + } + } }, - "type": "array", - "title": "Childids" + "description": "Successful Response" }, - "dependsOnIds": { - "items": { + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + }, + "description": "Validation Error" + } + }, + "summary": "List Cohorts", + "tags": [ + "cohorts" + ] + } + }, + "/cohorts/{cohort_id}": { + "get": { + "description": "Get one cohort detail payload.", + "operationId": "get_cohort_cohorts__cohort_id__get", + "parameters": [ + { + "in": "path", + "name": "cohort_id", + "required": true, + "schema": { + "format": "uuid", + "title": "Cohort Id", "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CohortDetailDto" + } + } }, - "type": "array", - "title": "Dependsonids" - }, - "isLeaf": { - "type": "boolean", - "title": "Isleaf" - }, - "level": { - "type": "integer", - "title": "Level" + "description": "Successful Response" }, - "assignedWorkerId": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Assignedworkerid" - }, - "assignedWorkerName": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" + }, + "description": "Validation Error" + } + }, + 
"summary": "Get Cohort", + "tags": [ + "cohorts" + ] + }, + "patch": { + "description": "Update one cohort's operator-managed fields.", + "operationId": "update_cohort_cohorts__cohort_id__patch", + "parameters": [ + { + "in": "path", + "name": "cohort_id", + "required": true, + "schema": { + "format": "uuid", + "title": "Cohort Id", + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/UpdateCohortRequest" } - ], - "title": "Assignedworkername" + } }, - "startedAt": { - "anyOf": [ - { - "type": "string", - "format": "date-time" - }, - { - "type": "null" + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CohortSummaryDto" + } } - ], - "title": "Startedat" + }, + "description": "Successful Response" }, - "completedAt": { - "anyOf": [ - { - "type": "string", - "format": "date-time" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Completedat" + }, + "description": "Validation Error" } }, - "additionalProperties": false, - "type": "object", - "required": [ - "id", - "name", - "description", - "status", - "isLeaf", - "level" + "summary": "Update Cohort", + "tags": [ + "cohorts" + ] + } + }, + "/experiments": { + "get": { + "operationId": "list_experiments_experiments_get", + "parameters": [ + { + "in": "query", + "name": "limit", + "required": false, + "schema": { + "default": 50, + "title": "Limit", + "type": "integer" + } + } ], - "title": "RunTaskDto" - }, - "RunTaskEvaluationDto": { - "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "runId": { - "type": "string", - "title": "Runid" - }, - "taskId": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "items": { + "$ref": "#/components/schemas/ExperimentSummaryDto" + }, + "title": "Response List Experiments Experiments Get", + "type": "array" + } } - ], - "title": "Taskid" - }, - "totalScore": { - "type": "number", - "title": "Totalscore" - }, - "maxScore": { - "type": "number", - "title": "Maxscore" - }, - "normalizedScore": { - "type": "number", - "title": "Normalizedscore" - }, - "stagesEvaluated": { - "type": "integer", - "title": "Stagesevaluated" - }, - "stagesPassed": { - "type": "integer", - "title": "Stagespassed" + }, + "description": "Successful Response" }, - "failedGate": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Failedgate" - }, - "createdAt": { - "type": "string", - "format": "date-time", - "title": "Createdat" - }, - "criterionResults": { - "items": { - "$ref": "#/components/schemas/RunEvaluationCriterionDto" }, - "type": "array", - "title": "Criterionresults" + "description": "Validation Error" } }, - "additionalProperties": false, - "type": "object", - "required": [ - "id", - "runId", - "totalScore", - "maxScore", - "normalizedScore", - "stagesEvaluated", - "stagesPassed", - "createdAt" - ], - "title": "RunTaskEvaluationDto" - }, - "SubmitRequest": { - "properties": { - "definition_id": { - "type": "string", - "format": "uuid", - "title": "Definition Id" - }, - "num_episodes": { - "type": "integer", - "minimum": 1.0, - "title": "Num Episodes" + "summary": "List 
Experiments", + "tags": [ + "experiments" + ] + } + }, + "/experiments/define": { + "post": { + "operationId": "define_experiment_experiments_define_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExperimentDefineRequest" + } + } }, - "policy_version": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" + "required": true + }, + "responses": { + "201": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExperimentDefineResult" + } } - ], - "title": "Policy Version" + }, + "description": "Successful Response" }, - "model_target_override": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Model Target Override" + }, + "description": "Validation Error" + } + }, + "summary": "Define Experiment", + "tags": [ + "experiments" + ] + } + }, + "/experiments/{experiment_id}": { + "get": { + "operationId": "get_experiment_experiments__experiment_id__get", + "parameters": [ + { + "in": "path", + "name": "experiment_id", + "required": true, + "schema": { + "format": "uuid", + "title": "Experiment Id", + "type": "string" + } } - }, - "type": "object", - "required": [ - "definition_id", - "num_episodes" ], - "title": "SubmitRequest", - "description": "Trainer \u2192 Ergon: start a batch of episodes." - }, - "SubmitResponse": { - "properties": { - "batch_id": { - "type": "string", - "format": "uuid", - "title": "Batch Id" - }, - "run_ids": { - "items": { - "type": "string", - "format": "uuid" + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExperimentDetailDto" + } + } }, - "type": "array", - "title": "Run Ids" + "description": "Successful Response" }, - "status": { - "$ref": "#/components/schemas/BatchStatus", - "default": "pending" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + }, + "description": "Validation Error" } }, - "type": "object", - "required": [ - "batch_id", - "run_ids" + "summary": "Get Experiment", + "tags": [ + "experiments" + ] + } + }, + "/experiments/{experiment_id}/run": { + "post": { + "operationId": "run_experiment_experiments__experiment_id__run_post", + "parameters": [ + { + "in": "path", + "name": "experiment_id", + "required": true, + "schema": { + "format": "uuid", + "title": "Experiment Id", + "type": "string" + } + } ], - "title": "SubmitResponse", - "description": "Ergon \u2192 Trainer: batch accepted." 
- }, - "TrainingCurvePointDto": { - "properties": { - "runId": { - "type": "string", - "title": "Runid" - }, - "step": { - "type": "integer", - "title": "Step" - }, - "meanScore": { - "type": "number", - "title": "Meanscore" - }, - "benchmarkType": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" + "requestBody": { + "content": { + "application/json": { + "schema": { + "anyOf": [ + { + "$ref": "#/components/schemas/ExperimentRunRequest" + }, + { + "type": "null" + } + ], + "title": "Request" } - ], - "title": "Benchmarktype" + } + } + }, + "responses": { + "202": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExperimentRunResult" + } + } + }, + "description": "Successful Response" }, - "createdAt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Createdat" + }, + "description": "Validation Error" } }, - "additionalProperties": false, - "type": "object", - "required": [ - "runId", - "step", - "meanScore" - ], - "title": "TrainingCurvePointDto" - }, - "TrainingMetricDto": { - "properties": { - "step": { - "type": "integer", - "title": "Step" - }, - "epoch": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" + "summary": "Run Experiment", + "tags": [ + "experiments" + ] + } + }, + "/rollouts/submit": { + "post": { + "description": "Start a batch of episodes. Returns immediately with batch_id.", + "operationId": "submit_rollout_rollouts_submit_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SubmitRequest" } - ], - "title": "Epoch" + } }, - "loss": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" + "required": true + }, + "responses": { + "202": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SubmitResponse" + } } - ], - "title": "Loss" + }, + "description": "Successful Response" }, - "gradNorm": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Gradnorm" + }, + "description": "Validation Error" + } + }, + "summary": "Submit Rollout", + "tags": [ + "rollouts" + ] + } + }, + "/rollouts/sync-weights": { + "post": { + "description": "Restart vLLM with a new checkpoint (full-weight RFT).\n\nBlocks until the new vLLM process is healthy.", + "operationId": "sync_weights_rollouts_sync_weights_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/WeightSyncRequest" + } + } }, - "learningRate": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/WeightSyncResponse" + } } - ], - "title": "Learningrate" + }, + "description": "Successful Response" }, - "rewardMean": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Rewardmean" + }, + "description": "Validation Error" + } + }, + "summary": "Sync Weights", + "tags": [ + "rollouts" + ] + } + }, + "/rollouts/{batch_id}": { + "delete": { + "description": "Cancel a pending/running batch.", + "operationId": 
"cancel_rollout_rollouts__batch_id__delete", + "parameters": [ + { + "in": "path", + "name": "batch_id", + "required": true, + "schema": { + "format": "uuid", + "title": "Batch Id", + "type": "string" + } + } + ], + "responses": { + "204": { + "description": "Successful Response" }, - "rewardStd": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Rewardstd" + }, + "description": "Validation Error" + } + }, + "summary": "Cancel Rollout", + "tags": [ + "rollouts" + ] + }, + "get": { + "description": "Poll batch status. Returns trajectories when complete.", + "operationId": "poll_rollout_rollouts__batch_id__get", + "parameters": [ + { + "in": "path", + "name": "batch_id", + "required": true, + "schema": { + "format": "uuid", + "title": "Batch Id", + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/PollResponse" + } + } + }, + "description": "Successful Response" }, - "entropy": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Entropy" + }, + "description": "Validation Error" + } + }, + "summary": "Poll Rollout", + "tags": [ + "rollouts" + ] + } + }, + "/runs/training/curves": { + "get": { + "description": "Return score-over-step data for checkpoint evaluations.\n\nReads ``summary_json`` on ``RunRecord`` for checkpoint metadata\n(``checkpoint_step``, ``checkpoint_path``) written by the eval\nwatcher, and aggregates ``RunTaskEvaluation.score`` per run.\n\nFilter by ``definition_id`` or ``cohort_id``.", + "operationId": "get_training_curves_runs_training_curves_get", + "parameters": [ + { + "in": "query", + "name": "definition_id", + "required": false, + "schema": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Definition Id" + } }, - "completionMeanLength": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" + { + "in": "query", + "name": "cohort_id", + "required": false, + "schema": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cohort Id" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "items": { + "$ref": "#/components/schemas/TrainingCurvePointDto" + }, + "title": "Response Get Training Curves Runs Training Curves Get", + "type": "array" + } } - ], - "title": "Completionmeanlength" + }, + "description": "Successful Response" }, - "stepTimeS": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Steptimes" + }, + "description": "Validation Error" } }, - "additionalProperties": false, - "type": "object", - "required": [ - "step" + "summary": "Get Training Curves", + "tags": [ + "runs" + ] + } + }, + "/runs/training/sessions": { + "get": { + "description": "List training sessions, optionally filtered by definition.", + "operationId": "get_training_sessions_runs_training_sessions_get", + "parameters": [ + { + "in": "query", + "name": "definition_id", + "required": false, + "schema": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + 
"title": "Definition Id" + } + } ], - "title": "TrainingMetricDto" - }, - "TrainingSessionDto": { - "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "experimentDefinitionId": { - "type": "string", - "title": "Experimentdefinitionid" - }, - "modelName": { - "type": "string", - "title": "Modelname" - }, - "status": { - "type": "string", - "title": "Status" - }, - "startedAt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Startedat" - }, - "completedAt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "items": { + "$ref": "#/components/schemas/TrainingSessionDto" + }, + "title": "Response Get Training Sessions Runs Training Sessions Get", + "type": "array" + } } - ], - "title": "Completedat" + }, + "description": "Successful Response" }, - "outputDir": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Outputdir" - }, - "totalSteps": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" + }, + "description": "Validation Error" + } + }, + "summary": "Get Training Sessions", + "tags": [ + "runs" + ] + } + }, + "/runs/training/sessions/{session_id}/metrics": { + "get": { + "description": "Get per-step training metrics for a session.", + "operationId": "get_training_metrics_runs_training_sessions__session_id__metrics_get", + "parameters": [ + { + "in": "path", + "name": "session_id", + "required": true, + "schema": { + "format": "uuid", + "title": "Session Id", + "type": "string" + } + } + ], + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "items": { + "$ref": "#/components/schemas/TrainingMetricDto" + }, + "title": "Response Get Training Metrics Runs Training Sessions Session Id Metrics Get", + "type": "array" + } } - ], - "title": "Totalsteps" + }, + "description": "Successful Response" }, - "finalLoss": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } - ], - "title": "Finalloss" + }, + "description": "Validation Error" } }, - "additionalProperties": false, - "type": "object", - "required": [ - "id", - "experimentDefinitionId", - "modelName", - "status" + "summary": "Get Training Metrics", + "tags": [ + "runs" + ] + } + }, + "/runs/{run_id}": { + "get": { + "description": "Get a persisted run-detail snapshot suitable for frontend hydration.", + "operationId": "get_run_runs__run_id__get", + "parameters": [ + { + "in": "path", + "name": "run_id", + "required": true, + "schema": { + "format": "uuid", + "title": "Run Id", + "type": "string" + } + } ], - "title": "TrainingSessionDto" - }, - "Trajectory": { - "properties": { - "run_id": { - "type": "string", - "format": "uuid", - "title": "Run Id" - }, - "agent_id": { - "type": "string", - "title": "Agent Id" - }, - "prompt_ids": { - "items": { - "type": "integer" - }, - "type": "array", - "title": "Prompt Ids" - }, - "completion_ids": { - "items": { - "type": "integer" - }, - "type": "array", - "title": "Completion Ids" - }, - "logprobs": { - "items": { - "type": "number" + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RunSnapshotDto" + } + } }, - "type": "array", - "title": "Logprobs" + 
"description": "Successful Response" }, - "env_mask": { - "items": { - "type": "integer" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } }, - "type": "array", - "title": "Env Mask" - }, - "reward": { - "type": "number", - "title": "Reward" - }, - "num_turns": { - "type": "integer", - "title": "Num Turns" + "description": "Validation Error" } }, - "type": "object", - "required": [ - "run_id", - "agent_id", - "prompt_ids", - "completion_ids", - "logprobs", - "env_mask", - "reward", - "num_turns" - ], - "title": "Trajectory", - "description": "One agent's extracted trajectory from a completed episode.\n\nMaps 1:1 to AgentTrajectory from extraction.py, plus metadata." - }, - "UpdateCohortRequest": { - "properties": { - "status": { - "$ref": "#/components/schemas/ExperimentCohortStatus" + "summary": "Get Run", + "tags": [ + "runs" + ] + } + }, + "/runs/{run_id}/mutations": { + "get": { + "description": "Return the append-only mutation log for a run, ordered by sequence.\n\nUsed by the Timeline scrubber to replay DAG state at any point in time.", + "operationId": "get_mutations_runs__run_id__mutations_get", + "parameters": [ + { + "in": "path", + "name": "run_id", + "required": true, + "schema": { + "format": "uuid", + "title": "Run Id", + "type": "string" + } } - }, - "type": "object", - "required": [ - "status" ], - "title": "UpdateCohortRequest", - "description": "Mutable cohort fields exposed through the operator API." - }, - "ValidationError": { - "properties": { - "loc": { - "items": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "integer" + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "items": { + "$ref": "#/components/schemas/RunGraphMutationDto" + }, + "title": "Response Get Mutations Runs Run Id Mutations Get", + "type": "array" } - ] + } }, - "type": "array", - "title": "Location" - }, - "msg": { - "type": "string", - "title": "Message" - }, - "type": { - "type": "string", - "title": "Error Type" - }, - "input": { - "title": "Input" + "description": "Successful Response" }, - "ctx": { - "type": "object", - "title": "Context" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + }, + "description": "Validation Error" } }, - "type": "object", - "required": [ - "loc", - "msg", - "type" - ], - "title": "ValidationError" - }, - "WeightSyncRequest": { - "properties": { - "checkpoint_path": { - "type": "string", - "title": "Checkpoint Path" + "summary": "Get Mutations", + "tags": [ + "runs" + ] + } + }, + "/runs/{run_id}/resources/{resource_id}/content": { + "get": { + "description": "Stream the blob bytes for a RunResource.\n\nUsed by the dashboard's file-viewer modal. 
Enforces:\n- resource must belong to the named run (no cross-run leaks);\n- resolved path must sit under ``ERGON_BLOB_ROOT`` (traversal guard);\n- size <= ``_RESOURCE_CONTENT_MAX_BYTES`` (413 otherwise).", + "operationId": "get_resource_content_runs__run_id__resources__resource_id__content_get", + "parameters": [ + { + "in": "path", + "name": "run_id", + "required": true, + "schema": { + "format": "uuid", + "title": "Run Id", + "type": "string" + } }, - "model_name": { - "type": "string", - "title": "Model Name" + { + "in": "path", + "name": "resource_id", + "required": true, + "schema": { + "format": "uuid", + "title": "Resource Id", + "type": "string" + } } - }, - "type": "object", - "required": [ - "checkpoint_path", - "model_name" ], - "title": "WeightSyncRequest", - "description": "Trainer \u2192 Ergon: restart vLLM with updated checkpoint.\n\nFor full-weight RFT: Ergon kills the vLLM process and restarts it\nwith --model pointing to checkpoint_path. Blocks until healthy." - }, - "WeightSyncResponse": { - "properties": { - "success": { - "type": "boolean", - "title": "Success" + "responses": { + "200": { + "content": { + "application/json": { + "schema": {} + } + }, + "description": "Successful Response" }, - "vllm_model_loaded": { - "type": "string", - "title": "Vllm Model Loaded" + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + }, + "description": "Validation Error" } }, - "type": "object", - "required": [ - "success", - "vllm_model_loaded" - ], - "title": "WeightSyncResponse", - "description": "Ergon \u2192 Trainer: sync result." + "summary": "Get Resource Content", + "tags": [ + "runs" + ] } } } diff --git a/ergon-dashboard/src/hooks/useCohortDetail.ts b/ergon-dashboard/src/hooks/useCohortDetail.ts deleted file mode 100644 index 953a44c3..00000000 --- a/ergon-dashboard/src/hooks/useCohortDetail.ts +++ /dev/null @@ -1,93 +0,0 @@ -"use client"; - -import { useCallback, useEffect, useMemo, useState } from "react"; - -import { - parseDashboardCohortUpdatedData, - parseRunCompletedSocketData, -} from "@/lib/contracts/events"; -import { parseCohortDetail } from "@/lib/contracts/rest"; -import { CohortDetail } from "@/lib/types"; -import { useSocket } from "@/hooks/useSocket"; - -interface UseCohortDetailResult { - detail: CohortDetail | null; - isLoading: boolean; - error: string | null; -} - -export function useCohortDetail( - cohortId: string, - initialDetail: CohortDetail | null = null, -): UseCohortDetailResult { - const { socket, isConnected } = useSocket(); - const [detail, setDetail] = useState(initialDetail); - const [isLoading, setIsLoading] = useState(initialDetail === null); - const [error, setError] = useState(null); - - const load = useCallback(async () => { - if (!cohortId) { - setDetail(null); - setIsLoading(false); - setError(null); - return; - } - try { - if (initialDetail === null) { - setIsLoading(true); - } - const response = await fetch(`/api/cohorts/${cohortId}`, { cache: "no-store" }); - if (!response.ok) { - throw new Error(`Failed to load cohort (${response.status})`); - } - const data = parseCohortDetail(await response.json()); - setDetail(data); - setError(null); - } catch (err) { - setError(err instanceof Error ? 
err.message : "Failed to load cohort"); - } finally { - setIsLoading(false); - } - }, [cohortId, initialDetail]); - - useEffect(() => { - void load(); - }, [load]); - - useEffect(() => { - if (!socket) return; - - const handleCohortUpdated = (payload: unknown) => { - const data = parseDashboardCohortUpdatedData(payload); - if (data.cohort_id !== cohortId) return; - void load(); - }; - - const handleRunCompleted = (payload: unknown) => { - parseRunCompletedSocketData(payload); - void load(); - }; - - socket.on("cohort:updated", handleCohortUpdated); - socket.on("run:completed", handleRunCompleted); - return () => { - socket.off("cohort:updated", handleCohortUpdated); - socket.off("run:completed", handleRunCompleted); - }; - }, [socket, cohortId, load]); - - useEffect(() => { - if (!isConnected && socket) { - setError("Disconnected from server"); - } - }, [isConnected, socket]); - - return useMemo( - () => ({ - detail, - isLoading, - error, - }), - [detail, error, isLoading], - ); -} diff --git a/ergon-dashboard/src/lib/contracts/rest.ts b/ergon-dashboard/src/lib/contracts/rest.ts index cc596656..5745d834 100644 --- a/ergon-dashboard/src/lib/contracts/rest.ts +++ b/ergon-dashboard/src/lib/contracts/rest.ts @@ -9,6 +9,7 @@ export const TaskStatusSchema = z.string(); export const CohortSummarySchema = schemas.CohortSummaryDto; export const CohortDetailSchema = schemas.CohortDetailDto; +export const ExperimentDetailSchema = schemas.ExperimentDetailDto; export const UpdateCohortRequestSchema = schemas.UpdateCohortRequest; export const RunExecutionAttemptSchema = schemas.RunExecutionAttemptDto; @@ -36,7 +37,9 @@ export type TaskStatusValue = z.infer; type RawCohortSummary = KnownKeys>; type RawCohortDetail = KnownKeys>; -type RawCohortRunRow = KnownKeys[number]>; +type RawCohortExperimentRow = KnownKeys[number]>; +type RawExperimentDetail = KnownKeys>; +type RawExperimentRunRow = KnownKeys[number]>; type RawRunExecutionAttempt = KnownKeys>; type RawRunResource = KnownKeys>; type RawRunSandboxCommand = KnownKeys>; @@ -71,6 +74,10 @@ export interface CohortStatusCounts { failed: number; } +export interface ExperimentStatusCounts extends CohortStatusCounts { + cancelled: number; +} + export interface CohortStatsExtras { benchmark_counts?: Record; latest_run_at?: string | null; @@ -104,29 +111,69 @@ export interface CohortSummary worst_score: number | null; } -export interface CohortRunRow +export interface CohortExperimentRow extends Omit< - RawCohortRunRow, + RawCohortExperimentRow, + | "default_evaluator_slug" + | "default_model_target" + | "error_message" + | "final_score" + | "status_counts" + | "total_cost_usd" + > { + default_evaluator_slug: string | null; + default_model_target: string | null; + error_message: string | null; + final_score: number | null; + status_counts: CohortStatusCounts; + total_cost_usd: number | null; +} + +export interface CohortDetail { + summary: CohortSummary; + experiments: CohortExperimentRow[]; +} + +export interface ExperimentRunRow + extends Omit< + RawExperimentRunRow, | "completed_at" | "error_message" + | "evaluator_slug" | "final_score" + | "model_target" | "running_time_ms" + | "seed" | "started_at" | "total_cost_usd" | "total_tasks" + | "worker_team" > { completed_at: string | null; error_message: string | null; + evaluator_slug: string | null; final_score: number | null; + model_target: string | null; running_time_ms: number | null; + seed: number | null; started_at: string | null; total_cost_usd: number | null; total_tasks: number | null; -} - -export 
interface CohortDetail { - summary: CohortSummary; - runs: CohortRunRow[]; + worker_team: Record; +} + +export interface ExperimentDetail extends Omit { + runs: ExperimentRunRow[]; + analytics: { + total_runs: number; + status_counts: ExperimentStatusCounts; + average_score: number | null; + average_duration_ms: number | null; + average_tasks: number | null; + total_cost_usd: number | null; + latest_activity_at: string | null; + error_count: number; + }; } export interface RunExecutionAttempt @@ -360,15 +407,58 @@ export function parseCohortDetail(input: unknown): CohortDetail { const detail = CohortDetailSchema.parse(input); return { summary: normalizeCohortSummary(detail.summary), + experiments: (detail.experiments ?? []).map((experiment) => ({ + ...experiment, + default_evaluator_slug: experiment.default_evaluator_slug ?? null, + default_model_target: experiment.default_model_target ?? null, + error_message: experiment.error_message ?? null, + final_score: experiment.final_score ?? null, + status_counts: { + pending: experiment.status_counts?.pending ?? 0, + executing: experiment.status_counts?.executing ?? 0, + evaluating: experiment.status_counts?.evaluating ?? 0, + completed: experiment.status_counts?.completed ?? 0, + failed: experiment.status_counts?.failed ?? 0, + }, + total_cost_usd: experiment.total_cost_usd ?? null, + })), + }; +} + +export function parseExperimentDetail(input: unknown): ExperimentDetail { + const detail = ExperimentDetailSchema.parse(input); + return { + ...detail, + analytics: { + total_runs: detail.analytics?.total_runs ?? 0, + average_duration_ms: detail.analytics?.average_duration_ms ?? null, + average_score: detail.analytics?.average_score ?? null, + average_tasks: detail.analytics?.average_tasks ?? null, + error_count: detail.analytics?.error_count ?? 0, + latest_activity_at: detail.analytics?.latest_activity_at ?? null, + status_counts: { + pending: detail.analytics?.status_counts?.pending ?? 0, + executing: detail.analytics?.status_counts?.executing ?? 0, + evaluating: detail.analytics?.status_counts?.evaluating ?? 0, + completed: detail.analytics?.status_counts?.completed ?? 0, + failed: detail.analytics?.status_counts?.failed ?? 0, + cancelled: detail.analytics?.status_counts?.cancelled ?? 0, + }, + total_cost_usd: detail.analytics?.total_cost_usd ?? null, + }, runs: (detail.runs ?? []).map((run) => ({ ...run, completed_at: run.completed_at ?? null, error_message: run.error_message ?? null, + evaluator_slug: run.evaluator_slug ?? null, final_score: run.final_score ?? null, + model_target: run.model_target ?? null, running_time_ms: run.running_time_ms ?? null, + seed: run.seed ?? null, started_at: run.started_at ?? null, total_cost_usd: run.total_cost_usd ?? null, total_tasks: run.total_tasks ?? null, + worker_team: run.worker_team ?? 
{}, })), }; } diff --git a/ergon-dashboard/src/lib/testing/dashboardHarness.ts b/ergon-dashboard/src/lib/testing/dashboardHarness.ts index 16955fcb..ebaed953 100644 --- a/ergon-dashboard/src/lib/testing/dashboardHarness.ts +++ b/ergon-dashboard/src/lib/testing/dashboardHarness.ts @@ -13,6 +13,7 @@ import { CohortDetail, CohortSummary, ContextEventState, + ExperimentDetail, ExperimentCohortStatus, SerializedWorkflowRunState, TaskEvaluationState, @@ -26,6 +27,7 @@ declare global { | { cohorts: CohortSummary[]; cohortDetails: Record; + experimentDetails: Record; mutationsByRun: Record; } | undefined; @@ -34,6 +36,7 @@ declare global { export interface DashboardHarnessSeedPayload { cohorts?: CohortSummary[]; cohortDetails?: Record; + experimentDetails?: Record; runs?: SerializedWorkflowRunState[]; mutations?: Record; } @@ -43,6 +46,7 @@ function getHarnessState() { global.__dashboardHarness = { cohorts: [], cohortDetails: {}, + experimentDetails: {}, mutationsByRun: {}, }; } @@ -61,6 +65,7 @@ export function resetDashboardHarness(): void { const harness = getHarnessState(); harness.cohorts = []; harness.cohortDetails = {}; + harness.experimentDetails = {}; harness.mutationsByRun = {}; } @@ -71,6 +76,7 @@ export function seedDashboardHarness(payload: DashboardHarnessSeedPayload): void const harness = getHarnessState(); harness.cohorts = payload.cohorts ?? []; harness.cohortDetails = payload.cohortDetails ?? {}; + harness.experimentDetails = payload.experimentDetails ?? {}; harness.mutationsByRun = payload.mutations ?? {}; for (const run of payload.runs ?? []) { @@ -88,6 +94,11 @@ export function getHarnessCohort(cohortId: string): CohortDetail | null { return getHarnessState().cohortDetails[cohortId] ?? null; } +export function getHarnessExperiment(experimentId: string): ExperimentDetail | null { + requireHarnessEnabled(); + return getHarnessState().experimentDetails[experimentId] ?? null; +} + export function updateHarnessCohortStatus( cohortId: string, status: ExperimentCohortStatus, @@ -160,37 +171,24 @@ export function emitHarnessRunCompleted(data: { const harness = getHarnessState(); const detail = harness.cohortDetails[data.cohortId]; if (detail) { - const updatedRuns = (detail.runs ?? []).map((run) => - run.run_id === data.runId - ? { - ...run, - status: data.status, - final_score: data.finalScore, - error_message: data.error, - completed_at: new Date().toISOString(), - running_time_ms: data.durationSeconds * 1000, - } - : run, - ); - const completed = updatedRuns.filter((run) => run.status === "completed").length; - const failed = updatedRuns.filter((run) => run.status === "failed").length; - const executing = updatedRuns.filter((run) => run.status === "executing").length; - const pending = updatedRuns.filter((run) => run.status === "pending").length; - const evaluating = updatedRuns.filter((run) => run.status === "evaluating").length; const summary: CohortSummary = { ...detail.summary, - total_runs: updatedRuns.length, + total_runs: detail.summary.total_runs, status_counts: { - pending, - executing, - evaluating, - completed, - failed, + ...detail.summary.status_counts, + completed: + data.status === "completed" + ? detail.summary.status_counts.completed + 1 + : detail.summary.status_counts.completed, + failed: + data.status === "failed" + ? 
detail.summary.status_counts.failed + 1 + : detail.summary.status_counts.failed, }, }; const updatedDetail: CohortDetail = { + ...detail, summary, - runs: updatedRuns, }; harness.cohortDetails[data.cohortId] = updatedDetail; harness.cohorts = harness.cohorts.map((cohort) => diff --git a/ergon-dashboard/src/lib/types.ts b/ergon-dashboard/src/lib/types.ts index 8e7e9e02..a9a2ac20 100644 --- a/ergon-dashboard/src/lib/types.ts +++ b/ergon-dashboard/src/lib/types.ts @@ -4,6 +4,7 @@ export type { ContextEventState }; import type { BenchmarkName as RestBenchmarkName, CohortDetail as RestCohortDetail, + ExperimentDetail as RestExperimentDetail, CohortSummary as RestCohortSummary, ExperimentCohortStatusValue, RunCommunicationMessage as RestRunCommunicationMessage, @@ -91,8 +92,9 @@ export type DashboardEventName = export type DashboardWorkflowStartedData = GeneratedDashboardWorkflowStartedData; export type DashboardWorkflowCompletedData = GeneratedDashboardWorkflowCompletedData; export type CohortSummary = RestCohortSummary; -export type CohortRunRow = NonNullable[number]; +export type CohortExperimentRow = NonNullable[number]; export type CohortDetail = RestCohortDetail; +export type ExperimentDetail = RestExperimentDetail; export type DashboardCohortUpdatedData = GeneratedDashboardCohortUpdatedData; export type DashboardTaskStatusChangedData = GeneratedDashboardTaskStatusChangedData; export type DashboardResourcePublishedData = GeneratedDashboardResourcePublishedData; diff --git a/ergon-dashboard/tests/contracts/contracts.test.ts b/ergon-dashboard/tests/contracts/contracts.test.ts index cfd735da..155c8956 100644 --- a/ergon-dashboard/tests/contracts/contracts.test.ts +++ b/ergon-dashboard/tests/contracts/contracts.test.ts @@ -100,9 +100,11 @@ test("cohort detail parser accepts harness payload", () => { const parsed = parseCohortDetail(cohortDetail); assert.equal(parsed.summary.cohort_id, FIXTURE_IDS.cohortId); - assert.equal((parsed.runs ?? []).length, 3); - assert.equal(parsed.runs[0]?.total_tasks, 10); - assert.equal(parsed.runs[0]?.total_cost_usd, 0.12); + assert.equal((parsed.experiments ?? []).length, 1); + assert.equal(parsed.experiments[0]?.total_runs, 3); + assert.equal(parsed.experiments[0]?.status_counts.completed, 3); + assert.equal(parsed.experiments[0]?.final_score, 1); + assert.equal(parsed.experiments[0]?.total_cost_usd, 0.42); }); test("workflow started event parser validates recursive task trees", () => { diff --git a/ergon-dashboard/tests/e2e/_shared/smoke.ts b/ergon-dashboard/tests/e2e/_shared/smoke.ts index 7f98c562..2640a41b 100644 --- a/ergon-dashboard/tests/e2e/_shared/smoke.ts +++ b/ergon-dashboard/tests/e2e/_shared/smoke.ts @@ -279,17 +279,19 @@ export function defineSmokeSpec(cfg: SmokeSpecConfig): void { }); } - test(`cohort ${cohortKey} index lists all runs`, async ({ page }) => { + test(`cohort ${cohortKey} index lists all experiments`, async ({ page }) => { const cohortRuns = await client.getCohortRuns(cohortKey); - expect(cohortRuns.length).toBe(cohort.length); + expect(cohortRuns.length).toBeGreaterThanOrEqual(cohort.length); const cohortId = await client.getCohortId(cohortKey); await page.goto(`/cohorts/${cohortId}`); - // Dashboard keys cohort-run rows as ``cohort-run-row-`` - // (per CohortDetailView.tsx:36) — prefix match via locator rather + // Dashboard keys cohort experiment rows as ``cohort-experiment-row-``; + // prefix match via locator rather // than exact getByTestId. 
- const rows = page.locator('[data-testid^="cohort-run-row-"]'); - await expect(rows).toHaveCount(cohort.length); + const rows = page.locator('[data-testid^="cohort-experiment-row-"]'); + await expect(async () => { + await expect(rows).toHaveCount(cohortRuns.length); + }).toPass(); // ``cohort-header`` exists but no dedicated env label testid yet — // follow-up for dashboard. Screenshot captures the page state. await expect(page.getByTestId("cohort-header")).toBeVisible(); diff --git a/ergon-dashboard/tests/e2e/cohort.snapshot.spec.ts b/ergon-dashboard/tests/e2e/cohort.snapshot.spec.ts index e8ba4d3c..f19e7c69 100644 --- a/ergon-dashboard/tests/e2e/cohort.snapshot.spec.ts +++ b/ergon-dashboard/tests/e2e/cohort.snapshot.spec.ts @@ -25,7 +25,7 @@ test.afterEach(async () => { }); test("cohort index renders cohort-first snapshot truth", async ({ page }) => { - await page.goto("/"); + await page.goto("/cohorts"); await expect(page.getByTestId("cohort-index-header")).toContainText("Cohorts"); await expect(page.getByTestId(`cohort-row-${FIXTURE_IDS.cohortId}`)).toContainText( @@ -34,28 +34,32 @@ test("cohort index renders cohort-first snapshot truth", async ({ page }) => { await expect(page.getByTestId("cohort-index-list")).toContainText("Runs"); }); -test("cohort detail renders summary and run list", async ({ page }) => { +test("cohort detail renders summary and experiment list", async ({ page }) => { await page.goto(`/cohorts/${FIXTURE_IDS.cohortId}`); await expect(page.getByTestId("cohort-header")).toContainText("minif2f-react-worker-gpt5v3"); - await expect(page.getByTestId("cohort-summary-cards")).toContainText("Runs · pass / fail"); - await expect(page.getByTestId("cohort-summary-cards")).toContainText("3 of 3 runs"); - await expect(page.getByTestId("cohort-summary-cards")).toContainText("Avg tasks"); - await expect(page.getByTestId("cohort-summary-cards")).toContainText("10.0"); + await expect(page.getByTestId("cohort-summary-cards")).toContainText("Experiments"); + await expect(page.getByTestId("cohort-summary-cards")).toContainText("3 total runs"); + await expect(page.getByTestId("cohort-summary-cards")).toContainText("$0.42"); await expect(page.getByRole("button", { name: "Compare" })).toHaveCount(0); await expect(page.getByRole("button", { name: "Re-run failed" })).toHaveCount(0); await expect(page.getByRole("button", { name: "Open in training" })).toHaveCount(0); - await expect(page.getByTestId("cohort-run-distribution")).toBeVisible(); - await expect(page.getByTestId("cohort-run-distribution")).toContainText("Score distribution"); - await expect(page.getByTestId("cohort-distribution-point")).toHaveCount(3); - await page.getByTestId("cohort-distribution-metric-runtime").click(); - await expect(page.getByTestId("cohort-run-distribution")).toContainText("Runtime distribution"); - await expect(page.getByTestId("cohort-distribution-point")).toHaveCount(3); - const runRow = page.getByTestId(`cohort-run-row-${FIXTURE_IDS.runId}`); - await expect(runRow).toContainText("minif2f-react-worker-gpt5v3"); - await expect(runRow).toContainText("Started"); - await expect(runRow.locator("time[datetime]")).toHaveAttribute( - "datetime", - "2026-03-18T12:00:00.000Z", + const experimentRow = page.getByTestId(`cohort-experiment-row-${FIXTURE_IDS.experimentId}`); + await expect(experimentRow).toContainText("minif2f smoke n=3"); + await expect(experimentRow).toContainText("3 done · 0 failed · 0 active"); + await expect(experimentRow).toContainText("lean-evaluator"); +}); + +test("experiment detail renders 
restored run analytics surface", async ({ page }) => { + await page.goto(`/experiments/${FIXTURE_IDS.experimentId}`); + + await expect(page.getByRole("heading", { name: "minif2f smoke n=3" })).toBeVisible(); + await expect(page.getByTestId("experiment-summary-cards")).toContainText("Score"); + await expect(page.getByTestId("experiment-summary-cards")).toContainText("10"); + await expect(page.getByTestId("experiment-run-distribution")).toContainText("algebra_sample"); + await expect(page.getByTestId("experiment-run-distribution")).toContainText("score 1"); + await expect(page.getByRole("link", { name: FIXTURE_IDS.runId })).toHaveAttribute( + "href", + `/cohorts/${FIXTURE_IDS.cohortId}/runs/${FIXTURE_IDS.runId}`, ); }); diff --git a/ergon-dashboard/tests/helpers/dashboardFixtures.ts b/ergon-dashboard/tests/helpers/dashboardFixtures.ts index beaabe38..7ff94eb2 100644 --- a/ergon-dashboard/tests/helpers/dashboardFixtures.ts +++ b/ergon-dashboard/tests/helpers/dashboardFixtures.ts @@ -461,53 +461,88 @@ export function createDashboardSeed(): DashboardHarnessSeedPayload { const detail = { summary, + experiments: [ + { + experiment_id: FIXTURE_IDS.experimentId, + name: "minif2f smoke n=3", + benchmark_type: "minif2f", + sample_count: 3, + total_runs: 3, + status_counts: { + pending: 0, + executing: 0, + evaluating: 0, + completed: 3, + failed: 0, + }, + status: "completed", + created_at: "2026-03-18T11:59:30.000Z", + default_model_target: "openai:gpt-5", + default_evaluator_slug: "lean-evaluator", + final_score: 1, + total_cost_usd: 0.42, + error_message: null, + }, + ], + }; + + const experimentDetail = { + experiment: { + experiment_id: FIXTURE_IDS.experimentId, + cohort_id: FIXTURE_IDS.cohortId, + name: "minif2f smoke n=3", + benchmark_type: "minif2f", + sample_count: 3, + status: "completed", + default_worker_team: { primary: "minif2f-react" }, + default_evaluator_slug: "lean-evaluator", + default_model_target: "openai:gpt-5", + created_at: "2026-03-18T11:59:30.000Z", + started_at: "2026-03-18T12:00:00.000Z", + completed_at: "2026-03-18T12:02:26.000Z", + run_count: 3, + }, runs: [ { run_id: FIXTURE_IDS.runId, - definition_id: FIXTURE_IDS.experimentId, - cohort_id: FIXTURE_IDS.cohortId, - cohort_name: summary.name, + workflow_definition_id: FIXTURE_IDS.experimentId, + benchmark_type: "minif2f", + instance_key: "algebra_sample", status: "completed", created_at: "2026-03-18T11:59:30.000Z", started_at: "2026-03-18T12:00:00.000Z", completed_at: "2026-03-18T12:00:24.000Z", + evaluator_slug: "lean-evaluator", + model_target: "openai:gpt-5", + worker_team: { primary: "minif2f-react" }, + seed: null, running_time_ms: 24_000, final_score: 1, total_tasks: 10, total_cost_usd: 0.12, error_message: null, }, - { - run_id: "22222222-2222-4222-8222-222222222223", - definition_id: FIXTURE_IDS.experimentId, - cohort_id: FIXTURE_IDS.cohortId, - cohort_name: summary.name, - status: "completed", - created_at: "2026-03-18T12:00:30.000Z", - started_at: "2026-03-18T12:01:00.000Z", - completed_at: "2026-03-18T12:01:22.000Z", - running_time_ms: 22_000, - final_score: 1, - total_tasks: 10, - total_cost_usd: 0.14, - error_message: null, - }, - { - run_id: "22222222-2222-4222-8222-222222222224", - definition_id: FIXTURE_IDS.experimentId, - cohort_id: FIXTURE_IDS.cohortId, - cohort_name: summary.name, - status: "completed", - created_at: "2026-03-18T12:01:30.000Z", - started_at: "2026-03-18T12:02:00.000Z", - completed_at: "2026-03-18T12:02:26.000Z", - running_time_ms: 26_000, - final_score: 1, - total_tasks: 10, - 
total_cost_usd: 0.16, - error_message: null, - }, ], + analytics: { + total_runs: 3, + status_counts: { + pending: 0, + executing: 0, + evaluating: 0, + completed: 3, + failed: 0, + cancelled: 0, + }, + average_score: 1, + average_duration_ms: 24_000, + average_tasks: 10, + total_cost_usd: 0.42, + latest_activity_at: "2026-03-18T12:02:26.000Z", + error_count: 0, + }, + sample_selection: { instance_keys: ["algebra_sample", "number_theory_sample", "geometry_sample"] }, + design: {}, + metadata: {}, }; const concurrent = createConcurrentMasSeedOnly(); @@ -517,6 +552,10 @@ export function createDashboardSeed(): DashboardHarnessSeedPayload { [FIXTURE_IDS.cohortId]: detail, ...(concurrent.cohortDetails ?? {}), }, + experimentDetails: { + [FIXTURE_IDS.experimentId]: experimentDetail, + ...(concurrent.experimentDetails ?? {}), + }, runs: [runState, ...(concurrent.runs ?? [])], mutations: concurrent.mutations, }; @@ -569,19 +608,25 @@ function createConcurrentMasSeedOnly(): DashboardHarnessSeedPayload { const detail = { summary, - runs: [ + experiments: [ { - run_id: CONCURRENT_MAS_FIXTURE_IDS.runId, - definition_id: CONCURRENT_MAS_FIXTURE_IDS.experimentId, - cohort_id: CONCURRENT_MAS_FIXTURE_IDS.cohortId, - cohort_name: summary.name, + experiment_id: CONCURRENT_MAS_FIXTURE_IDS.experimentId, + name: "visual debugger n=1", + benchmark_type: "visual_debugger", + sample_count: 1, + total_runs: 1, + status_counts: { + pending: 0, + executing: 1, + evaluating: 0, + completed: 0, + failed: 0, + }, status: "executing", created_at: "2026-04-26T11:59:30.000Z", - started_at: "2026-04-26T12:00:00.000Z", - completed_at: null, - running_time_ms: 30_000, + default_model_target: "fixture", + default_evaluator_slug: null, final_score: null, - total_tasks: null, total_cost_usd: null, error_message: null, }, @@ -593,6 +638,7 @@ function createConcurrentMasSeedOnly(): DashboardHarnessSeedPayload { cohortDetails: { [CONCURRENT_MAS_FIXTURE_IDS.cohortId]: detail, }, + experimentDetails: {}, runs: [concurrentMasFixture.runState as SerializedWorkflowRunState], mutations: { [CONCURRENT_MAS_FIXTURE_IDS.runId]: concurrentMasFixture.mutations, diff --git a/ergon_builtins/AGENTS.md b/ergon_builtins/AGENTS.md index cfb169a3..e8c07e07 100644 --- a/ergon_builtins/AGENTS.md +++ b/ergon_builtins/AGENTS.md @@ -13,13 +13,13 @@ add a new component, update the dicts there *and* this doc. 
 | Goal | Command |
 |---|---|
-| Populate **SANDBOX** panel (stdin/stdout events) with no LLM | `ergon benchmark run researchrubrics-smoke --worker canonical-smoke` |
-| Populate **GENERATIONS** without calling a model | `ergon benchmark run smoke-test --worker training-stub` |
+| Populate **SANDBOX** panel (stdin/stdout events) with no LLM | `ergon experiment define researchrubrics-smoke --worker canonical-smoke --model stub:constant --limit 1 && ergon experiment run <experiment-id>` |
+| Populate **GENERATIONS** without calling a model | `ergon experiment define smoke-test --worker training-stub --model stub:constant --limit 1 && ergon experiment run <experiment-id>` |
 | Populate **EVALUATION** with a passing gate, no LLM | any benchmark + `--evaluator stub-rubric` |
 | Populate **EVALUATION** with varied scores (RL reward-shape test) | any benchmark + `--evaluator varied-stub-rubric` |
-| Test a real ReAct agent end-to-end | `ergon benchmark run swebench-verified --worker swebench-react --model openai:gpt-4o` |
-| Test manager → researcher delegation with a real LLM | `ergon benchmark run researchrubrics-smoke --worker researchrubrics-researcher --model openai:gpt-4o` |
-| Test Lean 4 proof verification | `ergon benchmark run minif2f --worker minif2f-react --model openai:gpt-4o` (needs Lean sandbox) |
+| Test a real ReAct agent end-to-end | `ergon experiment define swebench-verified --worker swebench-react --model openai:gpt-4o --limit 1 && ergon experiment run <experiment-id>` |
+| Test manager → researcher delegation with a real LLM | `ergon experiment define researchrubrics-smoke --worker researchrubrics-researcher --model openai:gpt-4o --limit 1 && ergon experiment run <experiment-id>` |
+| Test Lean 4 proof verification | `ergon experiment define minif2f --worker minif2f-react --model openai:gpt-4o --limit 1 && ergon experiment run <experiment-id>` (needs Lean sandbox) |

---

@@ -108,16 +108,16 @@
 and is instantiated directly by `researchrubrics-researcher`; it is not in

---

-## Model backends (`MODEL_BACKENDS` in registry_core.py)
+## Model targets (`resolve_model_target`)

 | prefix | file | notes |
 |---|---|---|
-| `vllm:` | `models/vllm_backend.py` | Points at a running vLLM server; supports logprobs. |
-| `openai:`, `anthropic:`, `google:` | `models/cloud_passthrough.py` | Passes through to pydantic-ai's provider. No logprobs. |
-| *(no prefix)* | fallthrough | Handed to pydantic-ai's `infer_model` — may pick a default or fail. |
+| `vllm:<endpoint>[#<model>]` | `ergon_core/core/providers/generation/openai_compatible.py` | Points at a running vLLM server; supports logprobs. |
+| `openai-compatible:<endpoint>#<model>` | `ergon_core/core/providers/generation/openai_compatible.py` | Generic OpenAI-compatible endpoints such as Ollama. |
+| `openai:`, `anthropic:`, `google:` | `ergon_core/core/providers/generation/openrouter.py` | Always routed through OpenRouter, not direct cloud APIs. |

-Default when `--model` is omitted: `openai:gpt-4o`
-(`ergon_core/core/providers/generation/model_resolution.py:57`).
+Default when no model target is supplied: `openai:gpt-4o`
+(`ergon_core/core/providers/generation/model_resolution.py`).
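For orientation, here is a minimal Python sketch of the define → run handoff that replaces the old one-shot command, pieced together from the services the new CLI handlers in this diff call. The slugs, the `stub:constant` target, and the blocking `asyncio.run` wrapper are illustrative assumptions, not a documented recipe:

```python
# Hedged sketch: define an experiment, then launch its runs — the same service
# calls the `ergon experiment define` / `ergon experiment run` handlers make.
import asyncio

from ergon_core.core.persistence.shared.db import ensure_db
from ergon_core.core.runtime.services.experiment_definition_service import (
    ExperimentDefinitionService,
)
from ergon_core.core.runtime.services.experiment_launch_service import ExperimentLaunchService
from ergon_core.core.runtime.services.experiment_schemas import (
    ExperimentDefineRequest,
    ExperimentRunRequest,
)

ensure_db()
defined = ExperimentDefinitionService().define_benchmark_experiment(
    ExperimentDefineRequest(
        benchmark_slug="smoke-test",
        limit=1,
        default_model_target="stub:constant",
        default_worker_team={"primary": "training-stub"},
        default_evaluator_slug="stub-rubric",
    )
)
launched = asyncio.run(
    ExperimentLaunchService().run_experiment(
        ExperimentRunRequest(experiment_id=defined.experiment_id)
    )
)
print(f"experiment={defined.experiment_id} runs={launched.run_ids}")
```

On the CLI the handoff is textual: `experiment define` logs `EXPERIMENT_ID=...`, and that UUID is the positional argument `experiment run` expects.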
--- diff --git a/ergon_builtins/ergon_builtins/benchmarks/researchrubrics/benchmark.py b/ergon_builtins/ergon_builtins/benchmarks/researchrubrics/benchmark.py index 6d343710..b9b11107 100644 --- a/ergon_builtins/ergon_builtins/benchmarks/researchrubrics/benchmark.py +++ b/ergon_builtins/ergon_builtins/benchmarks/researchrubrics/benchmark.py @@ -107,10 +107,11 @@ def _payload_from_row( row: Mapping[str, Any], # slopcop: ignore[no-typing-any] ) -> ResearchRubricsTaskPayload: """Convert one raw HuggingFace row into the benchmark payload schema.""" + ablated_prompt = row.get("ablated_prompt") or row["prompt"] return ResearchRubricsTaskPayload( sample_id=row["sample_id"], domain=str(row.get("domain", "")), - ablated_prompt=row["ablated_prompt"], + ablated_prompt=ablated_prompt, rubrics=[ RubricCriterion( criterion=r["criterion"], diff --git a/ergon_builtins/ergon_builtins/models/cloud_passthrough.py b/ergon_builtins/ergon_builtins/models/cloud_passthrough.py deleted file mode 100644 index e7620a1d..00000000 --- a/ergon_builtins/ergon_builtins/models/cloud_passthrough.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Cloud passthrough: resolves ``openai:``, ``anthropic:``, etc. by passing through to PydanticAI.""" - -from ergon_core.core.providers.generation.model_resolution import ResolvedModel - - -def resolve_cloud( - target: str, - *, - model_name: str | None = None, - policy_version: str | None = None, - api_key: str | None = None, -) -> ResolvedModel: - """Pass cloud model targets through to PydanticAI's infer_model.""" - return ResolvedModel(model=target, supports_logprobs=False) diff --git a/ergon_builtins/ergon_builtins/models/vllm_backend.py b/ergon_builtins/ergon_builtins/models/vllm_backend.py deleted file mode 100644 index 0488f5a1..00000000 --- a/ergon_builtins/ergon_builtins/models/vllm_backend.py +++ /dev/null @@ -1,58 +0,0 @@ -"""vLLM backend: resolves ``vllm:http://...`` targets to OpenAI-compatible PydanticAI models.""" - -import json -import logging -import urllib.error -import urllib.request - -from ergon_core.core.providers.generation.model_resolution import ResolvedModel -from pydantic_ai.models.openai import OpenAIModel as OpenAIChatModel -from pydantic_ai.providers.openai import OpenAIProvider - -logger = logging.getLogger(__name__) - - -def resolve_vllm( - target: str, - *, - model_name: str | None = None, - policy_version: str | None = None, - api_key: str | None = None, -) -> ResolvedModel: - """Resolve a ``vllm:http://...`` target to a PydanticAI model.""" - endpoint = target[5:].rstrip("/") - resolved_name = model_name or _discover_model_name(endpoint) - provider = OpenAIProvider( - base_url=f"{endpoint}/v1", - api_key=api_key or "not-needed", - ) - model = OpenAIChatModel(model_name=resolved_name, provider=provider) - logger.info( - "Resolved vLLM model: endpoint=%s model_name=%s policy_version=%s", - endpoint, - resolved_name, - policy_version, - ) - return ResolvedModel(model=model, policy_version=policy_version, supports_logprobs=True) - - -def _discover_model_name(endpoint: str) -> str: - """Query ``/v1/models`` to discover the served model name.""" - url = f"{endpoint}/v1/models" - try: - with urllib.request.urlopen(url, timeout=5) as resp: - body = json.loads(resp.read()) - models = body.get("data", []) - if models: - name = models[0].get("id", "default") - logger.info("Discovered vLLM model name: %s", name) - return name - except ( - urllib.error.HTTPError, - urllib.error.URLError, - TimeoutError, - OSError, - json.JSONDecodeError, - ): - logger.warning("Could not 
discover vLLM model name from %s, using 'default'", url) - return "default" diff --git a/ergon_builtins/ergon_builtins/registry.py b/ergon_builtins/ergon_builtins/registry.py index aa340e2f..f91f9ddb 100644 --- a/ergon_builtins/ergon_builtins/registry.py +++ b/ergon_builtins/ergon_builtins/registry.py @@ -8,10 +8,6 @@ import structlog from ergon_core.api import Benchmark, Evaluator, Worker -from ergon_core.core.providers.generation.model_resolution import ( - ResolvedModel, - register_model_backend, -) from ergon_core.core.providers.sandbox.manager import BaseSandboxManager from ergon_builtins.registry_core import ( @@ -20,9 +16,6 @@ from ergon_builtins.registry_core import ( EVALUATORS as _core_evaluators, ) -from ergon_builtins.registry_core import ( - MODEL_BACKENDS as _core_model_backends, -) from ergon_builtins.registry_core import ( SANDBOX_MANAGERS as _core_sandbox_managers, ) @@ -42,19 +35,6 @@ EVALUATORS: dict[str, type[Evaluator]] = {**_core_evaluators} SANDBOX_MANAGERS: dict[str, type[BaseSandboxManager]] = {**_core_sandbox_managers} -_model_backends: dict[str, Callable[..., ResolvedModel]] = {**_core_model_backends} - -# -- Capability: local-models ---------------------------------------------- - -try: - from ergon_builtins.registry_local_models import ( - MODEL_BACKENDS as _local_model_backends, - ) - - _model_backends.update(_local_model_backends) -except ImportError: - log.info("ergon-builtins[local-models] not installed; local transformers inference unavailable") - # -- Capability: data ------------------------------------------------------ try: @@ -80,11 +60,6 @@ "ergon-builtins[data] not installed; gdpeval and researchrubrics benchmarks unavailable" ) -# -- Register model backends ----------------------------------------------- - -for prefix, resolver in _model_backends.items(): - register_model_backend(prefix, resolver) - # -- Install hints for slugs that require optional capabilities ------------- INSTALL_HINTS: dict[str, str] = { diff --git a/ergon_builtins/ergon_builtins/registry_core.py b/ergon_builtins/ergon_builtins/registry_core.py index 7be86868..67ea3697 100644 --- a/ergon_builtins/ergon_builtins/registry_core.py +++ b/ergon_builtins/ergon_builtins/registry_core.py @@ -10,7 +10,6 @@ from uuid import UUID from ergon_core.api import Benchmark, Evaluator, Worker -from ergon_core.core.providers.generation.model_resolution import ResolvedModel from ergon_core.core.providers.sandbox.manager import BaseSandboxManager from ergon_builtins.benchmarks.gdpeval.rubric import StagedRubric @@ -25,8 +24,6 @@ ) from ergon_builtins.benchmarks.swebench_verified.toolkit import SWEBenchToolkit from ergon_builtins.evaluators.rubrics.swebench_rubric import SWEBenchRubric -from ergon_builtins.models.cloud_passthrough import resolve_cloud -from ergon_builtins.models.vllm_backend import resolve_vllm from ergon_builtins.workers.baselines.react_prompts import ( MINIF2F_SYSTEM_PROMPT, SWEBENCH_SYSTEM_PROMPT, @@ -178,10 +175,3 @@ def _swebench_react( "minif2f": Path(__file__).parent / "benchmarks/minif2f/sandbox", "swebench-verified": Path(__file__).parent / "benchmarks/swebench_verified/sandbox", } - -MODEL_BACKENDS: dict[str, Callable[..., ResolvedModel]] = { - "vllm": resolve_vllm, - "openai": resolve_cloud, - "anthropic": resolve_cloud, - "google": resolve_cloud, -} diff --git a/ergon_builtins/ergon_builtins/registry_local_models.py b/ergon_builtins/ergon_builtins/registry_local_models.py deleted file mode 100644 index e45abd5d..00000000 --- 
a/ergon_builtins/ergon_builtins/registry_local_models.py +++ /dev/null @@ -1,16 +0,0 @@ -"""Components that require the [local-models] capability (torch + transformers). - -Eager, fully-typed imports. This module will fail to import if torch/ -transformers/outlines are not installed — that's by design. The composition -layer in registry.py handles the ImportError gracefully. -""" - -from collections.abc import Callable - -from ergon_core.core.providers.generation.model_resolution import ResolvedModel - -from ergon_builtins.models.transformers_backend import resolve_transformers - -MODEL_BACKENDS: dict[str, Callable[..., ResolvedModel]] = { - "transformers": resolve_transformers, -} diff --git a/ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py b/ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py index 0b540b6c..f15a0984 100644 --- a/ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py +++ b/ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py @@ -1,20 +1,29 @@ from collections.abc import Awaitable, Callable +import shlex from typing import Protocol from uuid import UUID from ergon_cli.commands.workflow import ( WorkflowCommandContext, WorkflowCommandOutput, - execute_workflow_command, + execute_workflow_command_async, ) from ergon_core.api.worker_context import WorkerContext from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.runtime.services.workflow_service import WorkflowService from sqlmodel import Session +_MANAGER_ONLY_ACTIONS = { + "add-task", + "add-edge", + "update-task-description", + "restart-task", + "abandon-task", +} + class WorkflowCommandExecutor(Protocol): - def __call__( + async def __call__( self, command: str, *, @@ -29,9 +38,10 @@ def make_workflow_cli_tool( worker_context: WorkerContext, sandbox_task_key: UUID, benchmark_type: str, - execute_command: WorkflowCommandExecutor = execute_workflow_command, + execute_command: WorkflowCommandExecutor = execute_workflow_command_async, session_factory: Callable[[], Session] = get_session, service_factory: Callable[[], WorkflowService] = WorkflowService, + manager_capable: bool = False, ) -> Callable[[str], Awaitable[str]]: """Build an agent-facing ``workflow(command)`` callable. 
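Before the execution hunk below, a minimal sketch of what the new `_MANAGER_ONLY_ACTIONS` gate does, exercised via the private `_denial_reason` helper this file adds (it is shown only to illustrate the rules, not as a public API; the command strings are illustrative):

```python
# Hedged sketch of the manager gate: graph mutations are refused up front
# unless the tool was built with manager_capable=True; inspection always passes.
from ergon_builtins.tools.workflow_cli_tool import _denial_reason

# Read-only inspection is allowed for every worker.
assert _denial_reason("inspect task-tree", manager_capable=False) is None

# Graph mutations are manager-only.
cmd = "manage add-task --task-slug scout --worker researcher --description x"
assert _denial_reason(cmd, manager_capable=False) is not None
assert _denial_reason(cmd, manager_capable=True) is None

# Multiline input is rejected outright, keeping shlex parsing predictable.
assert _denial_reason("inspect a\ninspect b", manager_capable=False) is not None
```

The research-rubrics react worker passes `manager_capable=True` later in this diff; other call sites keep the default and get the denial string back instead of an exception.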
@@ -44,19 +54,25 @@ async def workflow(command: str) -> str: """Inspect workflow topology/resources or dry-run workflow management commands.""" if worker_context.node_id is None: raise ValueError("workflow tool requires WorkerContext.node_id") + denial = _denial_reason(command, manager_capable=manager_capable) + if denial is not None: + return f"workflow denied: {denial}" - output = execute_command( - command, - context=WorkflowCommandContext( - run_id=worker_context.run_id, - node_id=worker_context.node_id, - execution_id=worker_context.execution_id, - sandbox_task_key=sandbox_task_key, - benchmark_type=benchmark_type, - ), - session_factory=session_factory, - service=service_factory(), - ) + try: + output = await execute_command( + command, + context=WorkflowCommandContext( + run_id=worker_context.run_id, + node_id=worker_context.node_id, + execution_id=worker_context.execution_id, + sandbox_task_key=sandbox_task_key, + benchmark_type=benchmark_type, + ), + session_factory=session_factory, + service=service_factory(), + ) + except Exception as exc: # slopcop: ignore[no-broad-except] + return f"workflow failed: {type(exc).__name__}: {exc}" if output.exit_code != 0: detail = output.stderr or output.stdout return f"workflow exited {output.exit_code}: {detail}".strip() @@ -65,3 +81,16 @@ async def workflow(command: str) -> str: return output.stdout return workflow + + +def _denial_reason(command: str, *, manager_capable: bool) -> str | None: + if "\n" in command or "\r" in command: + return "multiline commands are not allowed" + try: + argv = shlex.split(command) + except ValueError as exc: + return f"could not parse command: {exc}" + if len(argv) >= 3 and argv[0] == "manage" and argv[1] in _MANAGER_ONLY_ACTIONS: + if not manager_capable: + return f"{argv[1]} requires a manager-capable workflow tool" + return None diff --git a/ergon_builtins/ergon_builtins/workers/baselines/training_stub_worker.py b/ergon_builtins/ergon_builtins/workers/baselines/training_stub_worker.py index 459351c8..37ec781e 100644 --- a/ergon_builtins/ergon_builtins/workers/baselines/training_stub_worker.py +++ b/ergon_builtins/ergon_builtins/workers/baselines/training_stub_worker.py @@ -21,11 +21,11 @@ ModelResponsePart, TextPart, ThinkingPart, - TokenLogprob, ToolCallPart, ToolReturnPart, UserPromptPart, ) +from ergon_core.core.providers.generation.types import TokenLogprob class TrainingStubWorker(Worker): diff --git a/ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py b/ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py index 302a9bde..f7d48845 100644 --- a/ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py +++ b/ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py @@ -45,14 +45,39 @@ "- workflow: Inspect current-run task topology and resources\n\n" "Write your final report to 'final_output/report.md' using write_report_draft. " "Include a # Findings section and a ## Sources section with citations.\n\n" + "Hard operating budget: use at most 6 exa_search calls for your own work. " + "After that, write the report from the evidence you have. Prefer targeted " + "queries over broad exploration.\n\n" "Use workflow(command) to inspect this run before " "deciding what context is missing. 
Useful commands include: "
-    "`inspect task-tree`, `inspect resource-list --scope input`, "
+    "`inspect task-workspace --format json`, `inspect task-tree`, "
+    "`inspect resource-list --scope input`, "
     "`inspect resource-list --scope visible --limit 20`, "
+    "`inspect resource-location --resource-id <resource-id>`, "
     "`inspect next-actions`, and "
     "`manage materialize-resource --resource-id <resource-id> --dry-run`. "
     "Use `--format json` when you need stable IDs. Resource copies are snapshots: "
-    "materialized files become resources owned by this task, not edits to the source."
+    "materialized files become resources owned by this task, not edits to the source.\n\n"
+    'First call `workflow("inspect task-workspace --format json")`. Use only '
+    "`task_workspace.task.level` from that response to decide whether the current "
+    "task may delegate. Ignore level-0 tasks shown elsewhere in task-tree. If "
+    "`task_workspace.task.level` is exactly 0, create exactly three specialist "
+    "child tasks before researching: "
+    "(1) a source scout for finding citations, "
+    "(2) a rubric compliance checker for mapping requirements to an outline, and "
+    "(3) a synthesis reviewer for risks, gaps, and counterclaims. "
+    'Use `workflow("manage add-task --task-slug <slug> --worker <worker-slug> '
+    "--description '<role description>'\")` for each child. "
+    "Give each child a role-specific description that includes the original task "
+    "goal and asks for a concise markdown report in `final_output/report.md`. "
+    "Then continue your own report; do not wait for child results unless visible "
+    "resources are already available.\n\n"
+    "If your current `task_workspace.task.level` is not 0, you are already a "
+    "specialist child. You must do only your assigned specialist work; do not call "
+    '`workflow("manage add-task ...")` under any '
+    "circumstances. Do not inspect the workflow repeatedly. Use at most 2 "
+    "workflow inspections and at most 3 exa_search calls, then write your "
+    "specialist markdown report to `final_output/report.md`."
) @@ -121,6 +146,7 @@ async def publisher_sync() -> list[RunResourceView]: worker_context=context, sandbox_task_key=self.task_id, benchmark_type="researchrubrics", + manager_capable=True, ) self.tools = [*rr_toolkit.build_tools(), *graph_toolkit.build_tools(), workflow_tool] diff --git a/ergon_cli/ergon_cli/commands/benchmark.py b/ergon_cli/ergon_cli/commands/benchmark.py index e2420992..5fd5684a 100644 --- a/ergon_cli/ergon_cli/commands/benchmark.py +++ b/ergon_cli/ergon_cli/commands/benchmark.py @@ -1,6 +1,5 @@ -"""Benchmark subcommand: list, run, and setup benchmarks.""" +"""Benchmark subcommand: list and setup benchmarks.""" -import asyncio import json import os import sys @@ -11,23 +10,13 @@ from pathlib import Path from typing import Protocol -import inngest from e2b import Template - -from ergon_cli.composition import build_experiment -from ergon_cli.discovery import list_benchmarks -from ergon_cli.rendering import render_run_result, render_table -from ergon_core.api.handles import ExperimentRunHandle from ergon_core.api.json_types import JsonObject -from ergon_core.core.persistence.shared.db import ensure_db, get_session -from ergon_core.core.persistence.shared.enums import TERMINAL_RUN_STATUSES -from ergon_core.core.persistence.telemetry.models import RunRecord -from ergon_core.core.runtime.events.task_events import WorkflowStartedEvent -from ergon_core.core.runtime.inngest_client import inngest_client -from ergon_core.core.runtime.services.cohort_service import experiment_cohort_service -from ergon_core.core.runtime.services.run_service import create_run from ergon_core.core.settings import settings +from ergon_cli.discovery import list_benchmarks +from ergon_cli.rendering import render_table + class BuildLog(Protocol): def __str__(self) -> str: ... @@ -49,8 +38,6 @@ async def handle_benchmark(args: Namespace) -> int: benchmarks = list_benchmarks() render_table(["Slug", "Name", "Description"], benchmarks) return 0 - elif args.bench_action == "run": - return await run_benchmark(args) elif args.bench_action == "setup": return setup_benchmark(args) else: @@ -177,90 +164,8 @@ def _on_build_logs(log: BuildLog) -> None: # 7. Report print(f"\nSuccess! 
Template ID: {template_id} (build {build_info.build_id}, {build_time}s)")
-    print(f"Now run: `ergon benchmark run {slug} --worker minif2f-react --model <model> --limit 1`")
-    return 0
-
-
-async def run_benchmark(args: Namespace) -> int:
-    ensure_db()
-
-    experiment = build_experiment(
-        benchmark_slug=args.slug,
-        model=args.model,
-        worker_slug=args.worker,
-        evaluator_slug=args.evaluator,
-        workflow=args.workflow,
-        limit=args.limit,
-    )
-    experiment.validate()
-    persisted = experiment.persist()
-    render_run_result(persisted)
-    print(f"\nExperiment persisted: {persisted.definition_id}")
-
-    cohort_name = args.slug if args.cohort is None else args.cohort
-    cohort = experiment_cohort_service.resolve_or_create(
-        name=cohort_name,
-        description=f"Benchmark: {args.slug} | worker: {args.worker} | evaluator: {args.evaluator}",
-        created_by="ergon-cli",
+    print(
+        "Now run: "
+        f"`ergon experiment define {slug} --worker minif2f-react --model <model> --limit 1`"
     )
-    print(f"\nCohort: {cohort.name} (id={cohort.id})")
-
-    print("\nCreating run and dispatching via Inngest...")
-    run_handle = await _create_and_dispatch(persisted, timeout=args.timeout, cohort_id=cohort.id)
-
-    print("\nRun completed:")
-    print(f"  Run ID: {run_handle.run_id}")
-    print(f"  Status: {run_handle.status}")
-    print(f"  Benchmark: {run_handle.benchmark_type}")
-    return 0 if run_handle.status == "completed" else 1
-
-
-async def _create_and_dispatch(persisted, timeout: int = 600, cohort_id=None):
-    run = create_run(persisted, cohort_id=cohort_id)
-    print(f"  Run ID: {run.id}")
-
-    event = WorkflowStartedEvent(
-        run_id=run.id,
-        definition_id=persisted.definition_id,
-    )
-    await inngest_client.send(
-        inngest.Event(
-            name=WorkflowStartedEvent.name,
-            data=event.model_dump(mode="json"),
-        )
-    )
-    print("  WorkflowStartedEvent emitted. 
Polling for completion...") - - start = time.time() - terminal = TERMINAL_RUN_STATUSES - poll_interval = 2.0 - - while True: - elapsed = time.time() - start - if elapsed > timeout: - print(f" TIMEOUT after {timeout}s") - return ExperimentRunHandle( - run_id=run.id, - definition_id=persisted.definition_id, - benchmark_type=persisted.benchmark_type, - status="timeout", - ) - - session = get_session() - try: - current = session.get(RunRecord, run.id) - if current and current.status in terminal: - return ExperimentRunHandle( - run_id=run.id, - definition_id=persisted.definition_id, - benchmark_type=persisted.benchmark_type, - status=current.status, - ) - status = current.status if current else "unknown" - finally: - session.close() - - mins = int(elapsed) // 60 - secs = int(elapsed) % 60 - print(f" [{mins:02d}:{secs:02d}] status={status}") - await asyncio.sleep(poll_interval) + return 0 diff --git a/ergon_cli/ergon_cli/commands/experiment.py b/ergon_cli/ergon_cli/commands/experiment.py index 8534069d..00727339 100644 --- a/ergon_cli/ergon_cli/commands/experiment.py +++ b/ergon_cli/ergon_cli/commands/experiment.py @@ -1,9 +1,145 @@ -"""Experiment command (placeholder for direct experiment composition).""" +"""Experiment lifecycle commands.""" from argparse import Namespace +import logging +from uuid import UUID +from ergon_core.core.persistence.shared.db import ensure_db +from ergon_core.core.runtime.services.cohort_service import experiment_cohort_service +from ergon_core.core.runtime.services.experiment_definition_service import ( + ExperimentDefinitionService, +) +from ergon_core.core.runtime.services.experiment_launch_service import ExperimentLaunchService +from ergon_core.core.runtime.services.experiment_read_service import ExperimentReadService +from ergon_core.core.runtime.services.experiment_schemas import ( + ExperimentDefineRequest, + ExperimentRunRequest, +) -def handle_experiment(args: Namespace) -> int: - print("Direct experiment composition from CLI is not yet supported.") - print("Use Python API: Experiment.from_single_worker(...)") +logger = logging.getLogger(__name__) + + +async def handle_experiment(args: Namespace) -> int: + _ensure_cli_logging() + if args.experiment_action == "define": + return handle_experiment_define(args) + if args.experiment_action == "run": + return await handle_experiment_run(args) + if args.experiment_action == "show": + return handle_experiment_show(args) + if args.experiment_action == "list": + return handle_experiment_list(args) + logger.error("Usage: ergon experiment {define|run|show|list}") + return 1 + + +def handle_experiment_define(args: Namespace) -> int: + _ensure_cli_logging() + ensure_db() + cohort_id = None + if args.cohort: + cohort = experiment_cohort_service.resolve_or_create( + name=args.cohort, + description=f"CLI experiment folder for {args.benchmark_slug}", + created_by="ergon-cli", + ) + cohort_id = cohort.id + + sample_ids = args.sample_id or None + request = ExperimentDefineRequest( + benchmark_slug=args.benchmark_slug, + name=args.name, + cohort_id=cohort_id, + limit=args.limit, + sample_ids=sample_ids, + default_model_target=args.model, + default_worker_team={"primary": args.worker}, + default_evaluator_slug=args.evaluator, + metadata={"workflow": args.workflow, "max_questions": args.max_questions}, + ) + result = ExperimentDefinitionService().define_benchmark_experiment(request) + logger.info("EXPERIMENT_ID=%s", result.experiment_id) + if result.cohort_id is not None: + logger.info("COHORT_ID=%s", result.cohort_id) + 
logger.info("BENCHMARK=%s", result.benchmark_type) + logger.info("SAMPLES=%s", ",".join(result.selected_samples)) return 0 + + +async def handle_experiment_run(args: Namespace) -> int: + _ensure_cli_logging() + ensure_db() + result = await ExperimentLaunchService().run_experiment( + ExperimentRunRequest( + experiment_id=UUID(args.experiment_id), + timeout_seconds=args.timeout, + wait=not args.no_wait, + ) + ) + logger.info("EXPERIMENT_ID=%s", result.experiment_id) + for run_id in result.run_ids: + logger.info("RUN_ID=%s", run_id) + return 0 + + +def handle_experiment_show(args: Namespace) -> int: + _ensure_cli_logging() + detail = ExperimentReadService().get_experiment(UUID(args.experiment_id)) + if detail is None: + logger.error("Experiment not found: %s", args.experiment_id) + return 1 + + experiment = detail.experiment + logger.info("EXPERIMENT_ID=%s", experiment.experiment_id) + if experiment.cohort_id is not None: + logger.info("COHORT_ID=%s", experiment.cohort_id) + logger.info("NAME=%s", experiment.name) + logger.info("BENCHMARK=%s", experiment.benchmark_type) + logger.info("STATUS=%s", experiment.status) + logger.info("SAMPLE_COUNT=%s", experiment.sample_count) + logger.info("RUN_COUNT=%s", experiment.run_count) + if experiment.default_model_target is not None: + logger.info("DEFAULT_MODEL=%s", experiment.default_model_target) + if experiment.default_evaluator_slug is not None: + logger.info("DEFAULT_EVALUATOR=%s", experiment.default_evaluator_slug) + + if detail.sample_selection: + logger.info("SAMPLE_SELECTION=%s", detail.sample_selection) + if detail.runs: + logger.info("RUNS") + for run in detail.runs: + logger.info( + "%s\t%s\t%s\t%s", + run.run_id, + run.instance_key, + run.status, + "" if run.model_target is None else run.model_target, + ) + return 0 + + +def handle_experiment_list(args: Namespace) -> int: + _ensure_cli_logging() + experiments = ExperimentReadService().list_experiments(limit=args.limit) + if not experiments: + logger.info("No experiments found.") + return 0 + + logger.info("EXPERIMENT_ID\tNAME\tBENCHMARK\tSTATUS\tSAMPLES\tRUNS\tMODEL") + for experiment in experiments: + logger.info( + "%s\t%s\t%s\t%s\t%s\t%s\t%s", + experiment.experiment_id, + experiment.name, + experiment.benchmark_type, + experiment.status, + experiment.sample_count, + experiment.run_count, + "" if experiment.default_model_target is None else experiment.default_model_target, + ) + return 0 + + +def _ensure_cli_logging() -> None: + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO, format="%(message)s") diff --git a/ergon_cli/ergon_cli/commands/workflow.py b/ergon_cli/ergon_cli/commands/workflow.py index 27a32d9f..21ec9559 100644 --- a/ergon_cli/ergon_cli/commands/workflow.py +++ b/ergon_cli/ergon_cli/commands/workflow.py @@ -8,6 +8,7 @@ from ergon_core.api.json_types import JsonObject from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.runtime.services.workflow_service import WorkflowService +from ergon_core.core.runtime.services.workflow_dto import WorkflowMutationRef from pydantic import BaseModel from sqlmodel import Session from collections.abc import Callable @@ -59,10 +60,17 @@ def build_workflow_parser() -> argparse.ArgumentParser: resource_content.add_argument("--max-bytes", type=int, default=100_000) resource_content.add_argument("--format", choices=["text", "json"], default="text") + resource_location = inspect_sub.add_parser("resource-location") + resource_location.add_argument("--resource-id", required=True) + 
resource_location.add_argument("--format", choices=["text", "json"], default="text") + task_tree = inspect_sub.add_parser("task-tree") task_tree.add_argument("--format", choices=["text", "json"], default="text") task_tree.add_argument("--parent-node-id", default=None) + task_workspace = inspect_sub.add_parser("task-workspace") + task_workspace.add_argument("--format", choices=["text", "json"], default="text") + dependencies = inspect_sub.add_parser("task-dependencies") dependencies.add_argument( "--direction", choices=["upstream", "downstream", "both"], default="both" @@ -81,8 +89,32 @@ def build_workflow_parser() -> argparse.ArgumentParser: materialize.add_argument("--dry-run", action="store_true") materialize.add_argument("--format", choices=["text", "json"], default="text") - for action in ("add-task", "add-edge", "restart-task", "abandon-task"): + add_task = manage_sub.add_parser("add-task") + add_task.add_argument("--task-slug", required=True) + add_task.add_argument("--description", required=True) + add_task.add_argument("--worker", required=True) + add_task.add_argument("--parent-node-id", default=None) + add_task.add_argument("--dry-run", action="store_true") + add_task.add_argument("--format", choices=["text", "json"], default="text") + add_task.add_argument("--reason", default=None) + + add_edge = manage_sub.add_parser("add-edge") + add_edge.add_argument("--source-task-slug", required=True) + add_edge.add_argument("--target-task-slug", required=True) + add_edge.add_argument("--dry-run", action="store_true") + add_edge.add_argument("--format", choices=["text", "json"], default="text") + add_edge.add_argument("--reason", default=None) + + update_description = manage_sub.add_parser("update-task-description") + update_description.add_argument("--task-slug", required=True) + update_description.add_argument("--description", required=True) + update_description.add_argument("--dry-run", action="store_true") + update_description.add_argument("--format", choices=["text", "json"], default="text") + update_description.add_argument("--reason", default=None) + + for action in ("restart-task", "abandon-task"): parser_for_action = manage_sub.add_parser(action) + parser_for_action.add_argument("--task-slug", required=True) parser_for_action.add_argument("--dry-run", action="store_true") parser_for_action.add_argument("--format", choices=["text", "json"], default="text") parser_for_action.add_argument("--reason", default=None) @@ -96,6 +128,23 @@ def execute_workflow_command( context: WorkflowCommandContext, session_factory: Callable[[], Session], service: WorkflowService, +) -> WorkflowCommandOutput: + return asyncio.run( # slopcop: ignore[no-async-from-sync] -- CLI sync bridge + execute_workflow_command_async( + command, + context=context, + session_factory=session_factory, + service=service, + ) + ) + + +async def execute_workflow_command_async( + command: str, + *, + context: WorkflowCommandContext, + session_factory: Callable[[], Session], + service: WorkflowService, ) -> WorkflowCommandOutput: argv = shlex.split(command) _reject_context_flags(argv) @@ -105,9 +154,7 @@ def execute_workflow_command( if args.group == "inspect": return _handle_inspect(args, context=context, session=session, service=service) if args.group == "manage": - return asyncio.run( # slopcop: ignore[no-async-from-sync] -- CLI/tool sync bridge - _handle_manage(args, context=context, session=session, service=service) - ) + return await _handle_manage(args, context=context, session=session, service=service) finally: 
_close_session(session) raise ValueError(f"unsupported workflow command group: {args.group}") @@ -184,6 +231,22 @@ def _handle_inspect( if args.format == "json": return _format_output({"content": content.decode(errors="replace")}, [], "json") return WorkflowCommandOutput(stdout=content.decode(errors="replace")) + if args.action == "resource-location": + location = service.get_resource_location( + session, + run_id=context.run_id, + resource_id=UUID(args.resource_id), + ) + return _format_output( + {"resource_location": _dump(location)}, + text_lines=[ + f"resource {location.resource.name}", + f"producer={location.producer_task_slug or '-'}", + f"local={location.local_file_path}", + f"default_sandbox_path={location.default_sandbox_path}", + ], + output_format=args.format, + ) if args.action == "task-tree": parent = UUID(args.parent_node_id) if args.parent_node_id else None tasks = service.list_tasks(session, run_id=context.run_id, parent_node_id=parent) @@ -195,6 +258,28 @@ def _handle_inspect( ], output_format=args.format, ) + if args.action == "task-workspace": + workspace = service.get_task_workspace( + session, + run_id=context.run_id, + node_id=context.node_id, + ) + lines = [ + f"task {workspace.task.task_slug} status={workspace.task.status}", + ] + if workspace.latest_execution is not None: + lines.append( + "execution " + f"{workspace.latest_execution.execution_id} " + f"status={workspace.latest_execution.status}" + ) + lines.extend(f"own: {resource.name}" for resource in workspace.own_resources) + lines.extend(f"input: {resource.name}" for resource in workspace.input_resources) + return _format_output( + {"task_workspace": _dump(workspace)}, + text_lines=lines, + output_format=args.format, + ) if args.action == "task-dependencies": deps = service.list_dependencies( session, @@ -249,18 +334,52 @@ async def _handle_manage( text_lines=[f"{result.source_resource_id} -> {result.sandbox_path}"], output_format=args.format, ) - try: - dry_run = args.dry_run - except AttributeError: - dry_run = False - if dry_run: - payload: JsonObject = { - "action": args.action, - "dry_run": True, - "message": "Graph lifecycle command validated; no changes applied.", - } - return _format_output(payload, [str(payload["message"])], args.format) - raise ValueError(f"{args.action} requires --dry-run in workflow CLI v1") + if args.action == "add-task": + result = await service.add_task( + session, + run_id=context.run_id, + parent_node_id=UUID(args.parent_node_id) if args.parent_node_id else context.node_id, + task_slug=args.task_slug, + description=args.description, + assigned_worker_slug=args.worker, + dry_run=args.dry_run, + ) + return _mutation_output(result, args.format) + if args.action == "add-edge": + result = await service.add_edge( + session, + run_id=context.run_id, + source_task_slug=args.source_task_slug, + target_task_slug=args.target_task_slug, + dry_run=args.dry_run, + ) + return _mutation_output(result, args.format) + if args.action == "update-task-description": + result = await service.update_task_description( + session, + run_id=context.run_id, + task_slug=args.task_slug, + description=args.description, + dry_run=args.dry_run, + ) + return _mutation_output(result, args.format) + if args.action == "restart-task": + result = await service.restart_task( + session, + run_id=context.run_id, + task_slug=args.task_slug, + dry_run=args.dry_run, + ) + return _mutation_output(result, args.format) + if args.action == "abandon-task": + result = await service.abandon_task( + session, + 
run_id=context.run_id, + task_slug=args.task_slug, + dry_run=args.dry_run, + ) + return _mutation_output(result, args.format) + raise ValueError(f"unsupported manage action: {args.action}") def _format_output( @@ -273,6 +392,11 @@ def _format_output( return WorkflowCommandOutput(stdout="\n".join(text_lines)) +def _mutation_output(result: WorkflowMutationRef, output_format: str) -> WorkflowCommandOutput: + payload: JsonObject = {"mutation": _dump(result)} + return _format_output(payload, [result.message], output_format) + + def _dump(value: BaseModel | JsonObject) -> JsonObject: if isinstance(value, BaseModel): return cast(JsonObject, value.model_dump(mode="json")) diff --git a/ergon_cli/ergon_cli/main.py b/ergon_cli/ergon_cli/main.py index 19d6905e..21bc74d0 100644 --- a/ergon_cli/ergon_cli/main.py +++ b/ergon_cli/ergon_cli/main.py @@ -8,11 +8,12 @@ from ergon_cli.commands.doctor import handle_doctor from ergon_cli.commands.eval import handle_eval from ergon_cli.commands.evaluator import handle_evaluator +from ergon_cli.commands.experiment import handle_experiment from ergon_cli.commands.onboard import handle_onboard from ergon_cli.commands.run import handle_run from ergon_cli.commands.train import handle_train -from ergon_cli.commands.workflow import handle_workflow from ergon_cli.commands.worker import handle_worker +from ergon_cli.commands.workflow import handle_workflow def build_parser() -> argparse.ArgumentParser: @@ -30,35 +31,42 @@ def build_parser() -> argparse.ArgumentParser: "--force", action="store_true", help="Rebuild even if the template already exists" ) - run_parser = bench_sub.add_parser("run", help="Run a benchmark") - run_parser.add_argument("slug", help="Benchmark slug") - run_parser.add_argument("--model", default="openai:gpt-4o", help="Model to use") - run_parser.add_argument("--worker", default="training-stub", help="Worker slug") - run_parser.add_argument("--evaluator", default="stub-rubric", help="Evaluator slug") - run_parser.add_argument("--workflow", default="single", help="Workflow variant") - run_parser.add_argument( - "--limit", - type=int, + experiment = sub.add_parser("experiment", help="Experiment lifecycle") + experiment_sub = experiment.add_subparsers(dest="experiment_action") + experiment_define = experiment_sub.add_parser("define", help="Define an experiment") + experiment_define.add_argument("benchmark_slug", help="Benchmark slug") + sample_group = experiment_define.add_mutually_exclusive_group(required=True) + sample_group.add_argument("--limit", type=int, default=None, help="Number of samples") + sample_group.add_argument( + "--sample-id", + action="append", default=None, - help="Number of samples/tasks to run (benchmark-specific)", - ) - run_parser.add_argument( - "--timeout", - type=int, - default=600, - help="Timeout in seconds per run", - ) - run_parser.add_argument( + help="Specific benchmark sample id; can be repeated", + ) + experiment_define.add_argument("--name", default=None, help="Experiment name") + experiment_define.add_argument("--cohort", default=None, help="Optional cohort/project folder") + experiment_define.add_argument("--worker", required=True, help="Primary worker slug") + experiment_define.add_argument("--model", required=True, help="Primary model target") + experiment_define.add_argument("--evaluator", default=None, help="Optional evaluator slug") + experiment_define.add_argument("--workflow", default="single", help="Workflow variant") + experiment_define.add_argument( "--max-questions", type=int, default=10, help="Max 
questions workers can ask", ) - run_parser.add_argument( - "--cohort", - default=None, - help="Cohort name to group this run under (auto-generated from slug if omitted)", + experiment_run = experiment_sub.add_parser("run", help="Run a defined experiment") + experiment_run.add_argument("experiment_id", help="Experiment UUID") + experiment_run.add_argument("--timeout", type=int, default=600, help="Timeout seconds") + experiment_run.add_argument( + "--no-wait", + action="store_true", + help="Do not wait for terminal runs", ) + experiment_show = experiment_sub.add_parser("show", help="Show experiment detail") + experiment_show.add_argument("experiment_id", help="Experiment UUID") + experiment_list = experiment_sub.add_parser("list", help="List experiments") + experiment_list.add_argument("--limit", type=int, default=50, help="Number of experiments") run = sub.add_parser("run", help="Run management") run_sub = run.add_subparsers(dest="run_action") @@ -183,24 +191,25 @@ async def _main(argv: list[str] | None = None) -> int: parser = build_parser() args = parser.parse_args(argv) - if args.command == "benchmark": - return await handle_benchmark(args) - elif args.command == "run": - return handle_run(args) - elif args.command == "worker": - return handle_worker(args) - elif args.command == "workflow": - return await handle_workflow(args) - elif args.command == "evaluator": - return handle_evaluator(args) - elif args.command == "eval": - return await handle_eval(args) - elif args.command == "train": - return handle_train(args) - elif args.command == "onboard": - return handle_onboard(args) - elif args.command == "doctor": - return handle_doctor(args) + async_handlers = { + "benchmark": handle_benchmark, + "experiment": handle_experiment, + "workflow": handle_workflow, + "eval": handle_eval, + } + sync_handlers = { + "run": handle_run, + "worker": handle_worker, + "evaluator": handle_evaluator, + "train": handle_train, + "onboard": handle_onboard, + "doctor": handle_doctor, + } + + if args.command in async_handlers: + return await async_handlers[args.command](args) + if args.command in sync_handlers: + return sync_handlers[args.command](args) else: parser.print_help() return 0 diff --git a/ergon_core/ergon_core/api/__init__.py b/ergon_core/ergon_core/api/__init__.py index fe6ad7f0..4a11ad94 100644 --- a/ergon_core/ergon_core/api/__init__.py +++ b/ergon_core/ergon_core/api/__init__.py @@ -1,5 +1,7 @@ """Object-first Ergon public API surface.""" +from typing import TYPE_CHECKING + from ergon_core.api.benchmark import Benchmark from ergon_core.api.benchmark_deps import BenchmarkDeps from ergon_core.api.criterion import Criterion @@ -8,15 +10,17 @@ from ergon_core.api.evaluation_context import EvaluationContext from ergon_core.api.evaluator import Evaluator, Rubric from ergon_core.api.experiment import Experiment -from ergon_core.api.handles import ExperimentRunHandle, PersistedExperimentDefinition +from ergon_core.api.handles import PersistedExperimentDefinition from ergon_core.api.results import CriterionResult, TaskEvaluationResult, WorkerOutput -from ergon_core.api.run_resource import RunResourceKind, RunResourceView from ergon_core.api.task_types import BenchmarkTask, EmptyTaskPayload from ergon_core.api.types import Tool from ergon_core.api.worker import Worker from ergon_core.api.worker_context import WorkerContext from ergon_core.api.worker_spec import WorkerSpec +if TYPE_CHECKING: + from ergon_core.api.run_resource import RunResourceKind, RunResourceView + __all__ = [ "Benchmark", 
"BenchmarkDeps", @@ -30,7 +34,6 @@ "EvaluationContext", "Evaluator", "Experiment", - "ExperimentRunHandle", "EmptyTaskPayload", "PersistedExperimentDefinition", "Rubric", @@ -44,3 +47,18 @@ "WorkerOutput", "WorkerSpec", ] + + +def __getattr__( + name: str, +) -> object: # slopcop: ignore[no-typing-any] -- module hook returns lazy public exports + if name in {"RunResourceKind", "RunResourceView"}: + from ergon_core.api.run_resource import ( # slopcop: ignore[guarded-function-import] -- reason: avoid import cycle between api package exports and run_resource + RunResourceKind, + RunResourceView, + ) + + globals()["RunResourceKind"] = RunResourceKind + globals()["RunResourceView"] = RunResourceView + return globals()[name] + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/ergon_core/ergon_core/api/experiment.py b/ergon_core/ergon_core/api/experiment.py index 4cdfa21c..21ab84ec 100644 --- a/ergon_core/ergon_core/api/experiment.py +++ b/ergon_core/ergon_core/api/experiment.py @@ -5,14 +5,14 @@ from ergon_core.api.benchmark import Benchmark from ergon_core.api.evaluator import Evaluator -from ergon_core.api.handles import ExperimentRunHandle, PersistedExperimentDefinition +from ergon_core.api.handles import PersistedExperimentDefinition from ergon_core.api.worker_spec import WorkerSpec class Experiment: """Composition root binding a benchmark, worker specs, evaluators, and assignments. - This is the main object users build and hand to ``persist()`` / ``run()``. + This is the main object users build and hand to ``persist()``. reason: RFC 2026-04-22 §1 — workers are described here as ``WorkerSpec`` (config-time descriptor), not as live ``Worker`` instances. The @@ -81,11 +81,12 @@ def validate(self) -> None: for evaluator in self.evaluators.values(): evaluator.validate() - required_slots = set(self.benchmark.evaluator_requirements()) - missing_slots = required_slots - set(self.evaluators) - if missing_slots: - missing = ", ".join(sorted(missing_slots)) - raise ValueError(f"Missing required evaluator bindings: {missing}") + if self.evaluators: + required_slots = set(self.benchmark.evaluator_requirements()) + missing_slots = required_slots - set(self.evaluators) + if missing_slots: + missing = ", ".join(sorted(missing_slots)) + raise ValueError(f"Missing required evaluator bindings: {missing}") instances = self.benchmark.build_instances() all_task_slugs_by_instance: dict[str, set[str]] = {} @@ -161,21 +162,6 @@ def persist(self) -> PersistedExperimentDefinition: self._persisted = persisted return persisted - # ------------------------------------------------------------------ - # Execution - # ------------------------------------------------------------------ - - async def run(self) -> ExperimentRunHandle: - """Ensure a persisted definition exists, create a run, and dispatch execution.""" - # Deferred: api/ should not depend on core/ at module level (same as persist). 
- from ergon_core.core.runtime.services.run_service import create_experiment_run - - if self._persisted is None: - self.persist() - if self._persisted is None: - raise RuntimeError("persist() did not produce a persisted definition") - return await create_experiment_run(self._persisted) - # ------------------------------------------------------------------ # Helpers diff --git a/ergon_core/ergon_core/api/generation.py b/ergon_core/ergon_core/api/generation.py index 449f24ed..c54f2618 100644 --- a/ergon_core/ergon_core/api/generation.py +++ b/ergon_core/ergon_core/api/generation.py @@ -15,19 +15,10 @@ from typing import Annotated, Any, Literal from ergon_core.api.json_types import JsonObject +from ergon_core.core.providers.generation.types import TokenLogprob from pydantic import BaseModel, Field -class TokenLogprob(BaseModel): - """Per-token log probability from the serving backend.""" - - model_config = {"frozen": True} - - token: str - logprob: float - top_logprobs: list[JsonObject] = Field(default_factory=list) - - # --------------------------------------------------------------------------- # Request parts (ModelRequest input — what went INTO the model) # --------------------------------------------------------------------------- diff --git a/ergon_core/ergon_core/api/handles.py b/ergon_core/ergon_core/api/handles.py index b1fa8b25..ff57042f 100644 --- a/ergon_core/ergon_core/api/handles.py +++ b/ergon_core/ergon_core/api/handles.py @@ -25,22 +25,3 @@ class PersistedExperimentDefinition(BaseModel): task_count: int = 0 created_at: datetime = Field(default_factory=utcnow) metadata: dict[str, Any] = Field(default_factory=dict) # slopcop: ignore[no-typing-any] - - -class ExperimentRunHandle(BaseModel): - """Rich handle returned by Experiment.run(). - - Carries enough information for inspection, logging, and downstream use - without requiring a database round-trip. 
- """ - - model_config = {"frozen": True} - - run_id: UUID - definition_id: UUID - benchmark_type: str - status: str - worker_bindings: dict[str, str] = Field(default_factory=dict) - created_at: datetime = Field(default_factory=utcnow) - started_at: datetime | None = None - metadata: dict[str, Any] = Field(default_factory=dict) # slopcop: ignore[no-typing-any] diff --git a/ergon_core/ergon_core/core/api/app.py b/ergon_core/ergon_core/core/api/app.py index c251d1d4..d01aef4e 100644 --- a/ergon_core/ergon_core/core/api/app.py +++ b/ergon_core/ergon_core/core/api/app.py @@ -21,6 +21,7 @@ import inngest.fast_api from ergon_core.core.api.cohorts import router as cohorts_router +from ergon_core.core.api.experiments import router as experiments_router from ergon_core.core.api.rollouts import init_service as init_rollout_service from ergon_core.core.api.rollouts import router as rollouts_router from ergon_core.core.api.runs import router as runs_router @@ -36,9 +37,8 @@ from ergon_core.core.providers.sandbox.manager import DefaultSandboxManager from ergon_core.core.rl.rollout_service import RolloutService from ergon_core.core.runtime.inngest_client import inngest_client -from ergon_core.core.settings import settings from ergon_core.core.runtime.inngest_registry import ALL_FUNCTIONS -from ergon_core.core.settings import Settings +from ergon_core.core.settings import Settings, settings from fastapi import FastAPI logger = logging.getLogger(__name__) @@ -87,9 +87,16 @@ async def lifespan(app: FastAPI): ) app.include_router(runs_router) +app.include_router(experiments_router) app.include_router(cohorts_router) app.include_router(rollouts_router) + +@app.get("/health", include_in_schema=False) +def health() -> dict[str, str]: + return {"status": "ok"} + + # Test-only harness: mounted in CI + local-e2e only. 
if settings.enable_test_harness: app.include_router(_test_harness_router) diff --git a/ergon_core/ergon_core/core/api/experiments.py b/ergon_core/ergon_core/core/api/experiments.py new file mode 100644 index 00000000..5e664746 --- /dev/null +++ b/ergon_core/ergon_core/core/api/experiments.py @@ -0,0 +1,48 @@ +"""Experiment lifecycle API routes.""" + +from uuid import UUID + +from ergon_core.core.runtime.services.experiment_definition_service import ( + ExperimentDefinitionService, +) +from ergon_core.core.runtime.services.experiment_launch_service import ExperimentLaunchService +from ergon_core.core.runtime.services.experiment_read_service import ( + ExperimentDetailDto, + ExperimentReadService, + ExperimentSummaryDto, +) +from ergon_core.core.runtime.services.experiment_schemas import ( + ExperimentDefineRequest, + ExperimentDefineResult, + ExperimentRunRequest, + ExperimentRunResult, +) +from fastapi import APIRouter, HTTPException + +router = APIRouter(prefix="/experiments", tags=["experiments"]) + + +@router.get("", response_model=list[ExperimentSummaryDto]) +def list_experiments(limit: int = 50) -> list[ExperimentSummaryDto]: + return ExperimentReadService().list_experiments(limit=limit) + + +@router.get("/{experiment_id}", response_model=ExperimentDetailDto) +def get_experiment(experiment_id: UUID) -> ExperimentDetailDto: + detail = ExperimentReadService().get_experiment(experiment_id) + if detail is None: + raise HTTPException(status_code=404, detail=f"Experiment {experiment_id} not found") + return detail + + +@router.post("/define", response_model=ExperimentDefineResult, status_code=201) +def define_experiment(request: ExperimentDefineRequest) -> ExperimentDefineResult: + return ExperimentDefinitionService().define_benchmark_experiment(request) + + +@router.post("/{experiment_id}/run", response_model=ExperimentRunResult, status_code=202) +async def run_experiment(experiment_id: UUID, request: ExperimentRunRequest | None = None): + launch_request = request or ExperimentRunRequest(experiment_id=experiment_id) + if launch_request.experiment_id != experiment_id: + raise HTTPException(status_code=400, detail="experiment_id mismatch") + return await ExperimentLaunchService().run_experiment(launch_request) diff --git a/ergon_core/ergon_core/core/api/test_harness.py b/ergon_core/ergon_core/core/api/test_harness.py index c51f7f57..d2949d35 100644 --- a/ergon_core/ergon_core/core/api/test_harness.py +++ b/ergon_core/ergon_core/core/api/test_harness.py @@ -19,26 +19,30 @@ from typing import Annotated from uuid import UUID -import inngest -from ergon_cli.composition import build_experiment +from ergon_core.core.persistence.context.models import RunContextEvent from ergon_core.core.persistence.graph.models import RunGraphMutation, RunGraphNode from ergon_core.core.persistence.shared.db import get_engine from ergon_core.core.persistence.shared.enums import RunStatus -from ergon_core.core.persistence.context.models import RunContextEvent from ergon_core.core.persistence.telemetry.models import ( ExperimentCohort, + ExperimentRecord, RunRecord, RunResource, RunTaskEvaluation, RunTaskExecution, Thread, ) -from ergon_core.core.runtime.events.task_events import WorkflowStartedEvent -from ergon_core.core.runtime.inngest_client import inngest_client from ergon_core.core.runtime.services.cohort_service import experiment_cohort_service -from ergon_core.core.runtime.services.run_service import create_run +from ergon_core.core.runtime.services.experiment_definition_service import ( + 
ExperimentDefinitionService, +) +from ergon_core.core.runtime.services.experiment_launch_service import ExperimentLaunchService +from ergon_core.core.runtime.services.experiment_schemas import ( + ExperimentDefineRequest, + ExperimentRunRequest, +) from fastapi import APIRouter, Depends, Header, HTTPException, status -from pydantic import BaseModel +from pydantic import BaseModel, Field from sqlmodel import Session, asc, select router = APIRouter(prefix="/api/test", tags=["test-harness"]) @@ -276,7 +280,9 @@ def read_cohort_runs( if cohort is None: return [] runs = list( - session.exec(select(RunRecord).where(RunRecord.cohort_id == cohort.id)).all(), + session.exec( + select(RunRecord).join(ExperimentRecord).where(ExperimentRecord.cohort_id == cohort.id) + ).all(), ) return [TestCohortRunDto(run_id=r.id, status=r.status) for r in runs] @@ -299,14 +305,18 @@ def read_cohort_runs( # ``experiment_definition_id`` passes validation and the secret gate (which # runs inside the handler body, after FastAPI's validation phase) can surface # 401/500 without 422 noise. ``experiment_definition_id`` is required because -# ``RunRecord.experiment_definition_id`` is a NOT NULL FK to -# ``experiment_definitions.id`` — no synthetic definition is created here. +# ``RunRecord.workflow_definition_id`` is a NOT NULL FK to +# ``experiment_definitions.id``; callers pass that existing definition id. # ``ResetRequest.cohort_prefix`` has no default: reset is destructive, so # callers must always specify what to nuke. class SeedRunRequest(BaseModel): - experiment_definition_id: UUID + workflow_definition_id: UUID + experiment_id: UUID | None = None + benchmark_type: str = "test-harness" + instance_key: str = "seeded" + worker_team: dict = Field(default_factory=lambda: {"primary": "test-harness-worker"}) cohort: str = "_test_" status: str = "completed" task_slugs: list[str] = [] @@ -332,8 +342,34 @@ def seed_run( detail=f"unknown run status: {body.status!r}", ) from exc with Session(get_engine()) as s: + cohort = experiment_cohort_service.resolve_or_create( + name=body.cohort, + description="test harness seeded cohort", + created_by="test-harness", + ) + experiment_kwargs = {} + if body.experiment_id is not None: + experiment_kwargs["id"] = body.experiment_id + experiment = ExperimentRecord( + **experiment_kwargs, + cohort_id=cohort.id, + name=f"Seeded {body.cohort}", + benchmark_type=body.benchmark_type, + sample_count=1, + sample_selection_json={"instance_keys": [body.instance_key]}, + default_worker_team_json=body.worker_team, + design_json={}, + metadata_json={"_test_seeded": True, "_test_cohort": body.cohort}, + status="seeded", + ) + s.add(experiment) + s.flush() run = RunRecord( - experiment_definition_id=body.experiment_definition_id, + experiment_id=experiment.id, + workflow_definition_id=body.workflow_definition_id, + benchmark_type=body.benchmark_type, + instance_key=body.instance_key, + worker_team_json=body.worker_team, status=run_status, summary_json={ "_test_seeded": True, @@ -364,6 +400,12 @@ def reset_test_rows( tag = meta.get("_test_cohort") if isinstance(tag, str) and tag.startswith(body.cohort_prefix): s.delete(r) + experiments = list(s.exec(select(ExperimentRecord)).all()) + for experiment in experiments: + meta = {} if experiment.metadata_json is None else experiment.metadata_json + tag = meta.get("_test_cohort") + if isinstance(tag, str) and tag.startswith(body.cohort_prefix): + s.delete(experiment) s.commit() return None @@ -404,10 +446,9 @@ class SubmitCohortResponse(BaseModel): async def 
submit_cohort(body: SubmitCohortRequest) -> SubmitCohortResponse: """Build + persist + dispatch N runs under one cohort. - Per-slot flow mirrors the CLI's ``ergon benchmark run``: - ``build_experiment`` → ``validate`` → ``persist`` → ``create_run`` - → send ``WorkflowStartedEvent``. Slots submit sequentially — - typical N ≤ 3, so the parallel-gather savings are negligible. + Per-slot flow mirrors the CLI's ``ergon experiment define`` followed by + ``ergon experiment run``. Slots submit sequentially — typical N ≤ 3, + so the parallel-gather savings are negligible. """ cohort = experiment_cohort_service.resolve_or_create( name=body.cohort_key, @@ -417,25 +458,20 @@ async def submit_cohort(body: SubmitCohortRequest) -> SubmitCohortResponse: run_ids: list[UUID] = [] for slot in body.slots: - experiment = build_experiment( - benchmark_slug=body.benchmark_slug, - model=body.model, - worker_slug=slot.worker_slug, - evaluator_slug=slot.evaluator_slug, - limit=body.limit, - ) - experiment.validate() - persisted = experiment.persist() - run = create_run(persisted, cohort_id=cohort.id) - await inngest_client.send( - inngest.Event( - name=WorkflowStartedEvent.name, - data=WorkflowStartedEvent( - run_id=run.id, - definition_id=persisted.definition_id, - ).model_dump(mode="json"), + defined = ExperimentDefinitionService().define_benchmark_experiment( + ExperimentDefineRequest( + benchmark_slug=body.benchmark_slug, + cohort_id=cohort.id, + limit=body.limit, + default_model_target=body.model, + default_worker_team={"primary": slot.worker_slug}, + default_evaluator_slug=slot.evaluator_slug, + metadata={"source": "test-harness"}, ) ) - run_ids.append(run.id) + launched = await ExperimentLaunchService().run_experiment( + ExperimentRunRequest(experiment_id=defined.experiment_id) + ) + run_ids.extend(launched.run_ids) return SubmitCohortResponse(run_ids=run_ids, cohort_id=cohort.id) diff --git a/ergon_core/ergon_core/core/persistence/context/event_payloads.py b/ergon_core/ergon_core/core/persistence/context/event_payloads.py index db19d6b1..b2f58bd7 100644 --- a/ergon_core/ergon_core/core/persistence/context/event_payloads.py +++ b/ergon_core/ergon_core/core/persistence/context/event_payloads.py @@ -7,7 +7,7 @@ from typing import Annotated, Any, Literal -from ergon_core.api.generation import TokenLogprob +from ergon_core.core.providers.generation.types import TokenLogprob from pydantic import BaseModel, Field # Exported type alias — use everywhere event_type is stored as a string field. 
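The module that now owns `TokenLogprob` is not itself part of this diff. Assuming `ergon_core/core/providers/generation/types.py` carries the class over unchanged, its shape, per the definition deleted from `ergon_core/api/generation.py` earlier in this diff, would be:

```python
# Hedged sketch of the relocated TokenLogprob; the providers-layer types module
# is not shown in this diff, so this mirrors the deleted api/generation.py class.
from ergon_core.api.json_types import JsonObject
from pydantic import BaseModel, Field


class TokenLogprob(BaseModel):
    """Per-token log probability from the serving backend."""

    model_config = {"frozen": True}

    token: str
    logprob: float
    top_logprobs: list[JsonObject] = Field(default_factory=list)
```

A single definition in the providers layer lets `api/generation.py`, these event payloads, and the training stub worker all import the same class without the public api package owning generation internals.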
@@ -49,7 +49,7 @@ class ToolCallPayload(BaseModel): args: dict[str, Any] # slopcop: ignore[no-typing-any] turn_id: str # links events from the same generation call turn_token_ids: list[int] | None = None # None if another event in this turn holds them - turn_logprobs: list[TokenLogprob] | None = None # None if another event in this turn holds them + turn_logprobs: list[TokenLogprob] | None = None class ToolResultPayload(BaseModel): diff --git a/ergon_core/ergon_core/core/persistence/queries.py b/ergon_core/ergon_core/core/persistence/queries.py index 7cd99f73..23171a35 100644 --- a/ergon_core/ergon_core/core/persistence/queries.py +++ b/ergon_core/ergon_core/core/persistence/queries.py @@ -20,10 +20,11 @@ ExperimentDefinitionTaskEvaluator, ExperimentDefinitionWorker, ) -from ergon_core.core.persistence.graph.models import RunGraphEdge, RunGraphNode +from ergon_core.core.persistence.graph.models import RunGraphNode from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.shared.enums import RunStatus, TaskExecutionStatus from ergon_core.core.persistence.telemetry.models import ( + ExperimentRecord, RunRecord, RunResource, RunTaskExecution, @@ -93,7 +94,7 @@ def list_by_definition(self, definition_id: UUID) -> list[RunRecord]: with get_session() as session: stmt = ( select(RunRecord) - .where(RunRecord.experiment_definition_id == definition_id) + .where(RunRecord.workflow_definition_id == definition_id) .order_by(desc(RunRecord.created_at)) ) return list(session.exec(stmt).all()) @@ -113,7 +114,10 @@ def get_cohort_id(self, run_id: UUID) -> UUID | None: run = session.get(RunRecord, run_id) if run is None: return None - return run.cohort_id + experiment = session.get(ExperimentRecord, run.experiment_id) + if experiment is None: + return None + return experiment.cohort_id # --------------------------------------------------------------------------- diff --git a/ergon_core/ergon_core/core/persistence/telemetry/models.py b/ergon_core/ergon_core/core/persistence/telemetry/models.py index 89062405..2928fdcf 100644 --- a/ergon_core/ergon_core/core/persistence/telemetry/models.py +++ b/ergon_core/ergon_core/core/persistence/telemetry/models.py @@ -9,6 +9,7 @@ from typing import TYPE_CHECKING from uuid import UUID, uuid4 +import sqlalchemy as sa from ergon_core.api.json_types import JsonObject from ergon_core.core.persistence.shared.enums import ( RunStatus, @@ -17,7 +18,6 @@ ) from ergon_core.core.utils import utcnow as _utcnow from pydantic import model_validator -import sqlalchemy as sa from sqlalchemy import JSON, Column, DateTime from sqlmodel import Field, SQLModel @@ -42,6 +42,68 @@ class ExperimentCohortStatus(StrEnum): ARCHIVED = "archived" +# --------------------------------------------------------------------------- +# ExperimentRecord +# --------------------------------------------------------------------------- + + +class ExperimentRecord(SQLModel, table=True): + """One launched experiment definition and sample selection.""" + + __tablename__ = "experiments" + + id: UUID = Field(default_factory=uuid4, primary_key=True) + cohort_id: UUID | None = Field( + default=None, + foreign_key="experiment_cohorts.id", + index=True, + ) + name: str = Field(index=True) + benchmark_type: str = Field(index=True) + sample_count: int + sample_selection_json: dict = Field(default_factory=dict, sa_column=Column(JSON)) + default_worker_team_json: dict = Field(default_factory=dict, sa_column=Column(JSON)) + default_evaluator_slug: str | None = Field(default=None, 
index=True) + default_model_target: str | None = None + design_json: dict = Field(default_factory=dict, sa_column=Column(JSON)) + seed: int | None = None + metadata_json: dict = Field(default_factory=dict, sa_column=Column(JSON)) + status: str = Field(default="defined", index=True) + created_at: datetime = Field(default_factory=_utcnow, sa_type=TZDateTime) + started_at: datetime | None = Field(default=None, sa_type=TZDateTime) + completed_at: datetime | None = Field(default=None, sa_type=TZDateTime) + + def parsed_sample_selection(self) -> JsonObject: + return self.__class__._parse_json_object( + self.sample_selection_json, "sample_selection_json" + ) + + def parsed_default_worker_team(self) -> JsonObject: + return self.__class__._parse_json_object( + self.default_worker_team_json, "default_worker_team_json" + ) + + def parsed_design(self) -> JsonObject: + return self.__class__._parse_json_object(self.design_json, "design_json") + + def parsed_metadata(self) -> JsonObject: + return self.__class__._parse_json_object(self.metadata_json, "metadata_json") + + @classmethod + def _parse_json_object(cls, data: dict, field_name: str) -> JsonObject: + if not isinstance(data, dict): + raise ValueError(f"{field_name} must be a dict, got {type(data).__name__}") + return data + + @model_validator(mode="after") + def _validate_fields(self) -> "ExperimentRecord": + self.__class__._parse_json_object(self.sample_selection_json, "sample_selection_json") + self.__class__._parse_json_object(self.default_worker_team_json, "default_worker_team_json") + self.__class__._parse_json_object(self.design_json, "design_json") + self.__class__._parse_json_object(self.metadata_json, "metadata_json") + return self + + # --------------------------------------------------------------------------- # RunRecord # --------------------------------------------------------------------------- @@ -51,15 +113,19 @@ class RunRecord(SQLModel, table=True): __tablename__ = "runs" id: UUID = Field(default_factory=uuid4, primary_key=True) - experiment_definition_id: UUID = Field( + experiment_id: UUID = Field(foreign_key="experiments.id", index=True) + workflow_definition_id: UUID = Field( foreign_key="experiment_definitions.id", index=True, ) - cohort_id: UUID | None = Field( - default=None, - foreign_key="experiment_cohorts.id", - index=True, - ) + benchmark_type: str = Field(index=True) + instance_key: str = Field(index=True) + sample_id: str | None = Field(default=None, index=True) + worker_team_json: dict = Field(default_factory=dict, sa_column=Column(JSON)) + evaluator_slug: str | None = Field(default=None, index=True) + model_target: str | None = None + assignment_json: dict = Field(default_factory=dict, sa_column=Column(JSON)) + seed: int | None = None status: RunStatus = Field(index=True) error_message: str | None = None created_at: datetime = Field(default_factory=_utcnow, sa_type=TZDateTime) @@ -67,20 +133,26 @@ class RunRecord(SQLModel, table=True): completed_at: datetime | None = Field(default=None, sa_type=TZDateTime) summary_json: dict = Field(default_factory=dict, sa_column=Column(JSON)) - # -- JSON accessor: summary_json -- + def parsed_worker_team(self) -> JsonObject: + return self.__class__._parse_json_object(self.worker_team_json, "worker_team_json") + + def parsed_assignment(self) -> JsonObject: + return self.__class__._parse_json_object(self.assignment_json, "assignment_json") def parsed_summary(self) -> JsonObject: - return self.__class__._parse_summary(self.summary_json) + return 
self.__class__._parse_json_object(self.summary_json, "summary_json") @classmethod - def _parse_summary(cls, data: dict) -> JsonObject: + def _parse_json_object(cls, data: dict, field_name: str) -> JsonObject: if not isinstance(data, dict): - raise ValueError(f"summary_json must be a dict, got {type(data).__name__}") + raise ValueError(f"{field_name} must be a dict, got {type(data).__name__}") return data @model_validator(mode="after") def _validate_fields(self) -> "RunRecord": - self.__class__._parse_summary(self.summary_json) + self.__class__._parse_json_object(self.worker_team_json, "worker_team_json") + self.__class__._parse_json_object(self.assignment_json, "assignment_json") + self.__class__._parse_json_object(self.summary_json, "summary_json") try: RunStatus(self.status) except ValueError: diff --git a/ergon_core/ergon_core/core/providers/generation/__init__.py b/ergon_core/ergon_core/core/providers/generation/__init__.py index 585bef15..5a166577 100644 --- a/ergon_core/ergon_core/core/providers/generation/__init__.py +++ b/ergon_core/ergon_core/core/providers/generation/__init__.py @@ -2,8 +2,7 @@ from ergon_core.core.providers.generation.model_resolution import ( ResolvedModel, - register_model_backend, resolve_model_target, ) -__all__ = ["ResolvedModel", "register_model_backend", "resolve_model_target"] +__all__ = ["ResolvedModel", "resolve_model_target"] diff --git a/ergon_core/ergon_core/core/providers/generation/model_resolution.py b/ergon_core/ergon_core/core/providers/generation/model_resolution.py index 93f312df..c3f254f9 100644 --- a/ergon_core/ergon_core/core/providers/generation/model_resolution.py +++ b/ergon_core/ergon_core/core/providers/generation/model_resolution.py @@ -1,20 +1,8 @@ -"""Prefix-based model target resolution. - -Dispatches ``model_target`` strings to the appropriate backend based on -their prefix (``vllm:``, ``transformers:``, ``openai:``, etc.). - -Concrete backend implementations live in ``ergon_builtins.models``. -This module owns the contract (``ResolvedModel``) and the dispatch logic. -""" - -import logging -from typing import Callable +"""Prefix-based model target resolution.""" import pydantic_ai.models from pydantic import BaseModel -logger = logging.getLogger(__name__) - class ResolvedModel(BaseModel): """A resolved model target with backend metadata. @@ -32,16 +20,6 @@ class ResolvedModel(BaseModel): supports_logprobs: bool = False -# Backend resolver registry: prefix -> callable -# Populated by ergon_builtins.registry at import time. -_BACKEND_REGISTRY: dict[str, Callable[..., ResolvedModel]] = {} - - -def register_model_backend(prefix: str, resolver: Callable[..., ResolvedModel]) -> None: - """Register a model backend resolver for a given prefix.""" - _BACKEND_REGISTRY[prefix] = resolver - - def resolve_model_target( model_target: str | None, *, @@ -51,20 +29,48 @@ def resolve_model_target( ) -> ResolvedModel: """Resolve a ``model_target`` string to a PydanticAI-compatible model. - Dispatches by prefix to registered backends. Unrecognised prefixes - are passed through to PydanticAI's ``infer_model``. + Cloud provider targets (``openai:*``, ``anthropic:*``, ``google:*``) + intentionally resolve to OpenRouter-hosted models. Direct cloud provider + API access is not part of Ergon's model-target grammar. 
""" + target = model_target or "openai:gpt-4o" + prefix = target.split(":", 1)[0] if ":" in target else "" + + if prefix == "vllm": + from ergon_core.core.providers.generation.openai_compatible import ( # slopcop: ignore[guarded-function-import] -- reason: avoid import cycle; provider modules import ResolvedModel + resolve_vllm, + ) - prefix = target.split(":")[0] if ":" in target else "" + return resolve_vllm( + target, model_name=model_name, policy_version=policy_version, api_key=api_key + ) + + if prefix == "openai-compatible": + from ergon_core.core.providers.generation.openai_compatible import ( # slopcop: ignore[guarded-function-import] -- reason: avoid import cycle; provider modules import ResolvedModel + resolve_openai_compatible, + ) + + return resolve_openai_compatible( + target, model_name=model_name, policy_version=policy_version, api_key=api_key + ) + + if prefix in {"openai", "anthropic", "google"}: + from ergon_core.core.providers.generation.openrouter import ( # slopcop: ignore[guarded-function-import] -- reason: avoid import cycle; provider modules import ResolvedModel + resolve_cloud_via_openrouter, + ) + + return resolve_cloud_via_openrouter( + target, model_name=model_name, policy_version=policy_version, api_key=api_key + ) + + if prefix == "openrouter": + from ergon_core.core.providers.generation.openrouter import ( # slopcop: ignore[guarded-function-import] -- reason: avoid import cycle; provider modules import ResolvedModel + resolve_openrouter_alias, + ) - resolver = _BACKEND_REGISTRY.get(prefix) - if resolver is not None: - return resolver( - target, - model_name=model_name, - policy_version=policy_version, - api_key=api_key, + return resolve_openrouter_alias( + target, model_name=model_name, policy_version=policy_version, api_key=api_key ) - return ResolvedModel(model=target, supports_logprobs=False) + raise ValueError(f"Unsupported model target: {target!r}") diff --git a/ergon_core/ergon_core/core/providers/generation/openai_compatible.py b/ergon_core/ergon_core/core/providers/generation/openai_compatible.py new file mode 100644 index 00000000..8a4b8727 --- /dev/null +++ b/ergon_core/ergon_core/core/providers/generation/openai_compatible.py @@ -0,0 +1,100 @@ +"""OpenAI-compatible endpoint resolution for local and custom model servers.""" + +import json +import logging +import urllib.error +import urllib.request + +from pydantic_ai.models.openai import OpenAIChatModel +from pydantic_ai.providers.openai import OpenAIProvider + +from ergon_core.core.providers.generation.model_resolution import ResolvedModel + +logger = logging.getLogger(__name__) + + +def resolve_openai_compatible( + target: str, + *, + model_name: str | None = None, + policy_version: str | None = None, + api_key: str | None = None, +) -> ResolvedModel: + """Resolve ``openai-compatible:#`` targets.""" + + base_url, parsed_model_name = _split_endpoint_target( + target, + prefix="openai-compatible:", + require_model_name=True, + ) + resolved_name = model_name or parsed_model_name + if resolved_name is None: + raise ValueError("openai-compatible target requires a model name") + provider = OpenAIProvider(base_url=base_url.rstrip("/"), api_key=api_key or "not-needed") + model = OpenAIChatModel(model_name=resolved_name, provider=provider) + return ResolvedModel(model=model, policy_version=policy_version, supports_logprobs=False) + + +def resolve_vllm( + target: str, + *, + model_name: str | None = None, + policy_version: str | None = None, + api_key: str | None = None, +) -> ResolvedModel: + """Resolve 
``vllm:<endpoint>[#<model_name>]`` targets.""" + + endpoint, parsed_model_name = _split_endpoint_target( + target, + prefix="vllm:", + require_model_name=False, + ) + endpoint = endpoint.rstrip("/") + resolved_name = model_name or parsed_model_name or _discover_model_name(endpoint) + provider = OpenAIProvider(base_url=f"{endpoint}/v1", api_key=api_key or "not-needed") + model = OpenAIChatModel(model_name=resolved_name, provider=provider) + logger.info( + "Resolved vLLM model: endpoint=%s model_name=%s policy_version=%s", + endpoint, + resolved_name, + policy_version, + ) + return ResolvedModel(model=model, policy_version=policy_version, supports_logprobs=True) + + +def _split_endpoint_target( + target: str, + *, + prefix: str, + require_model_name: bool, +) -> tuple[str, str | None]: + body = target.removeprefix(prefix) + endpoint, separator, model_name = body.partition("#") + if not endpoint: + raise ValueError(f"{prefix} target requires a base URL") + if require_model_name and not (separator and model_name): + raise ValueError(f"{prefix}<base_url>#<model_name> target requires a model name") + return endpoint, model_name or None + + +def _discover_model_name(endpoint: str) -> str: + """Query ``/v1/models`` to discover the served model name.""" + + url = f"{endpoint}/v1/models" + try: + with urllib.request.urlopen(url, timeout=5) as resp: + body = json.loads(resp.read()) + models = body.get("data", []) + if models: + name = models[0].get("id", "default") + logger.info("Discovered vLLM model name: %s", name) + return name + except ( + urllib.error.HTTPError, + urllib.error.URLError, + TimeoutError, + OSError, + json.JSONDecodeError, + ): + logger.warning("Could not discover vLLM model name from %s, using 'default'", url) + return "default" diff --git a/ergon_core/ergon_core/core/providers/generation/openrouter.py b/ergon_core/ergon_core/core/providers/generation/openrouter.py new file mode 100644 index 00000000..34f8946f --- /dev/null +++ b/ergon_core/ergon_core/core/providers/generation/openrouter.py @@ -0,0 +1,55 @@ +"""OpenRouter-hosted cloud model resolution.""" + +from pydantic_ai.models.openai import OpenAIChatModel +from pydantic_ai.providers.openrouter import OpenRouterProvider + +from ergon_core.core.providers.generation.model_resolution import ResolvedModel +from ergon_core.core.settings import settings + +CLOUD_PROVIDER_PREFIXES = frozenset({"openai", "anthropic", "google"}) + + +def resolve_cloud_via_openrouter( + target: str, + *, + model_name: str | None = None, + policy_version: str | None = None, + api_key: str | None = None, +) -> ResolvedModel: + """Resolve ``openai:*``, ``anthropic:*``, and ``google:*`` through OpenRouter.""" + + provider_prefix, separator, provider_model_name = target.partition(":") + if not separator or not provider_model_name: + raise ValueError(f"Unsupported model target: {target!r}") + if provider_prefix not in CLOUD_PROVIDER_PREFIXES: + raise ValueError(f"Unsupported cloud provider target: {target!r}") + + openrouter_model_name = model_name or f"{provider_prefix}/{provider_model_name}" + provider = _openrouter_provider(api_key) + model = OpenAIChatModel(model_name=openrouter_model_name, provider=provider) + return ResolvedModel(model=model, policy_version=policy_version, supports_logprobs=False) + + +def resolve_openrouter_alias( + target: str, + *, + model_name: str | None = None, + policy_version: str | None = None, + api_key: str | None = None, +) -> ResolvedModel: + """Resolve legacy ``openrouter:<provider>/<model>`` targets through OpenRouter.""" + + provider_model_name =
target.removeprefix("openrouter:") + if not provider_model_name: + raise ValueError("openrouter:<provider>/<model> target requires a model name") + + provider = _openrouter_provider(api_key) + model = OpenAIChatModel(model_name=model_name or provider_model_name, provider=provider) + return ResolvedModel(model=model, policy_version=policy_version, supports_logprobs=False) + + +def _openrouter_provider(api_key: str | None) -> OpenRouterProvider: + resolved_api_key = api_key or settings.openrouter_api_key + if resolved_api_key: + return OpenRouterProvider(api_key=resolved_api_key) + return OpenRouterProvider() diff --git a/ergon_core/ergon_core/core/providers/generation/pydantic_ai_format.py b/ergon_core/ergon_core/core/providers/generation/pydantic_ai_format.py index 243dc2b7..744440b6 100644 --- a/ergon_core/ergon_core/core/providers/generation/pydantic_ai_format.py +++ b/ergon_core/ergon_core/core/providers/generation/pydantic_ai_format.py @@ -15,8 +15,8 @@ rather than re-implementing the parsing. """ -from ergon_core.api.generation import TokenLogprob from ergon_core.api.json_types import JsonObject +from ergon_core.core.providers.generation.types import TokenLogprob def extract_logprobs( diff --git a/ergon_core/ergon_core/core/providers/generation/types.py b/ergon_core/ergon_core/core/providers/generation/types.py new file mode 100644 index 00000000..cf206095 --- /dev/null +++ b/ergon_core/ergon_core/core/providers/generation/types.py @@ -0,0 +1,17 @@ +"""Shared generation provider value types.""" + +from pydantic import BaseModel, Field + +type JsonScalar = str | int | float | bool | None +type JsonValue = JsonScalar | list[JsonValue] | dict[str, JsonValue] +type JsonObject = dict[str, JsonValue] + + +class TokenLogprob(BaseModel): + """Per-token log probability from the serving backend.""" + + model_config = {"frozen": True} + + token: str + logprob: float + top_logprobs: list[JsonObject] = Field(default_factory=list) diff --git a/ergon_core/ergon_core/core/rl/eval_runner.py b/ergon_core/ergon_core/core/rl/eval_runner.py index 748a682a..198baf86 100644 --- a/ergon_core/ergon_core/core/rl/eval_runner.py +++ b/ergon_core/ergon_core/core/rl/eval_runner.py @@ -1,7 +1,8 @@ """Eval watcher: score checkpoints on Ergon benchmarks. Watches a checkpoint directory, detects new checkpoints, runs -``ergon benchmark run`` against each, and optionally reports results. +``ergon experiment define`` + ``ergon experiment run`` against each, +and optionally reports results. The watcher runs on CPU. For vLLM-based evaluation, use ``--on-checkpoint`` to spawn a SkyPilot GPU job per checkpoint. @@ -9,12 +10,17 @@ import asyncio import logging +import re import shlex import subprocess from ergon_core.core.rl.checkpoint import CheckpointInfo, discover_checkpoints logger = logging.getLogger(__name__) +_UUID_RE = re.compile( + r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", + re.IGNORECASE, +) async def watch_and_evaluate( @@ -97,44 +103,70 @@ async def _run_local_eval( model_base: str, eval_limit: int | None, ) -> int: - """Run benchmark evaluation locally via the CLI. Returns exit code. + """Run checkpoint evaluation locally via the experiment CLI. Returns exit code. Uses the checkpoint path as the vLLM model target so each checkpoint is actually evaluated (not just the base model).
""" + if eval_limit is None: + raise ValueError("--eval-limit is required for local checkpoint evaluation") + model_target = f"vllm:{ckpt.path}" - cmd = [ + define_cmd = [ "ergon", - "benchmark", - "run", - "--benchmark", + "experiment", + "define", benchmark_type, + "--worker", + "training-stub", "--evaluator", evaluator_type, "--model", model_target, + "--limit", + str(eval_limit), ] - if eval_limit: - cmd.extend(["--limit", str(eval_limit)]) - - logger.info("Running local eval for step %d: %s", ckpt.step, " ".join(cmd)) + logger.info("Defining local eval experiment for step %d: %s", ckpt.step, " ".join(define_cmd)) env = dict(__import__("os").environ) env["ERGON_CHECKPOINT_STEP"] = str(ckpt.step) env["ERGON_CHECKPOINT_PATH"] = ckpt.path try: - proc = await asyncio.create_subprocess_exec( - *cmd, + define_proc = await asyncio.create_subprocess_exec( + *define_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, env=env, ) - _stdout, stderr = await proc.communicate() + stdout, stderr = await define_proc.communicate() + + exit_code = 0 if define_proc.returncode is None else define_proc.returncode + output = stdout.decode() + stderr.decode() + if exit_code != 0: + logger.warning( + "Eval experiment definition failed for step %d (exit %d): %s", + ckpt.step, + exit_code, + output[:500], + ) + return exit_code + + experiment_id = _parse_uuid_line("EXPERIMENT_ID=", output) + run_cmd = ["ergon", "experiment", "run", experiment_id] + logger.info("Running local eval for step %d: %s", ckpt.step, " ".join(run_cmd)) - exit_code = 0 if proc.returncode is None else proc.returncode + run_proc = await asyncio.create_subprocess_exec( + *run_cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + env=env, + ) + _stdout, stderr = await run_proc.communicate() + + exit_code = 0 if run_proc.returncode is None else run_proc.returncode if exit_code == 0: logger.info("Eval complete for step %d", ckpt.step) else: @@ -173,3 +205,13 @@ async def evaluate_checkpoint( model_base=model_base, eval_limit=eval_limit, ) + + +def _parse_uuid_line(prefix: str, output: str) -> str: + for line in output.splitlines(): + if not line.startswith(prefix): + continue + match = _UUID_RE.search(line) + if match is not None: + return match.group(0) + raise RuntimeError(f"missing {prefix} line in CLI output:\n{output}") diff --git a/ergon_core/ergon_core/core/rl/rollout_service.py b/ergon_core/ergon_core/core/rl/rollout_service.py index 4162d4ea..4cba6a42 100644 --- a/ergon_core/ergon_core/core/rl/rollout_service.py +++ b/ergon_core/ergon_core/core/rl/rollout_service.py @@ -13,12 +13,14 @@ import inngest from ergon_core.core.persistence.context.models import RunContextEvent +from ergon_core.core.persistence.definitions.models import ExperimentDefinition from ergon_core.core.persistence.shared.enums import ( TERMINAL_RUN_STATUSES, RunStatus, ) from ergon_core.core.persistence.shared.ids import new_id from ergon_core.core.persistence.telemetry.models import ( + ExperimentRecord, RolloutBatch, RolloutBatchRun, RunRecord, @@ -82,6 +84,26 @@ def submit(self, request: SubmitRequest) -> SubmitResponse: run_ids: list[UUID] = [] with self._session_factory() as session: + definition = session.get(ExperimentDefinition, request.definition_id) + benchmark_type = definition.benchmark_type if definition else "rl-rollout" + experiment = ExperimentRecord( + name=f"RL rollout batch {batch_id}", + benchmark_type=benchmark_type, + sample_count=request.num_episodes, + sample_selection_json={ + "instance_keys": 
[f"episode-{index}" for index in range(request.num_episodes)] + }, + default_worker_team_json={"primary": "rl-rollout"}, + default_model_target=request.model_target_override, + design_json={}, + metadata_json={ + "source": "rollout_service", + "batch_id": str(batch_id), + "definition_id": str(request.definition_id), + }, + status="running", + ) + session.add(experiment) session.add( RolloutBatch( id=batch_id, @@ -90,12 +112,17 @@ def submit(self, request: SubmitRequest) -> SubmitResponse: ) ) - for _ in range(request.num_episodes): + for index in range(request.num_episodes): run_id = new_id() session.add( RunRecord( id=run_id, - experiment_definition_id=request.definition_id, + experiment_id=experiment.id, + workflow_definition_id=request.definition_id, + benchmark_type=benchmark_type, + instance_key=f"episode-{index}", + worker_team_json={"primary": "rl-rollout"}, + model_target=request.model_target_override, status=RunStatus.PENDING, ) ) diff --git a/ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py b/ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py deleted file mode 100644 index f517de7a..00000000 --- a/ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py +++ /dev/null @@ -1,109 +0,0 @@ -"""Inngest function triggered by benchmark/run-request from CLI. - -Reconstructs the full Experiment object graph server-side, persists it, -creates a RunRecord, and emits workflow/started to kick off execution. -""" - -import logging -from typing import ClassVar - -import inngest -from ergon_builtins.registry import BENCHMARKS, EVALUATORS, WORKERS -from ergon_core.api.experiment import Experiment -from ergon_core.api.worker_spec import WorkerSpec -from ergon_core.core.runtime.errors import RegistryLookupError -from ergon_core.core.runtime.events.base import InngestEventContract -from ergon_core.core.runtime.events.task_events import WorkflowStartedEvent -from ergon_core.core.runtime.inngest_client import inngest_client -from ergon_core.core.runtime.services.inngest_function_results import ( - BenchmarkRunStartResult, -) -from ergon_core.core.runtime.services.run_service import create_run - -logger = logging.getLogger(__name__) - - -class BenchmarkRunRequest(InngestEventContract): - """CLI sends this to request a full benchmark run.""" - - name: ClassVar[str] = "benchmark/run-request" - - benchmark_slug: str - model: str - worker_slug: str - evaluator_slug: str - cohort_name: str = "" # slopcop: ignore[no-str-empty-default] - - -@inngest_client.create_function( - fn_id="benchmark-run-start", - trigger=inngest.TriggerEvent(event="benchmark/run-request"), - retries=1, - output_type=BenchmarkRunStartResult, -) -async def benchmark_run_start_fn(ctx: inngest.Context) -> BenchmarkRunStartResult: - """Initialise a benchmark run from a CLI request. - - Steps - ----- - 1. Parse event payload into ``BenchmarkRunRequest``. - 2. Look up benchmark / worker / evaluator classes in the builtins registry. - 3. Build an ``Experiment``, persist it, create a ``RunRecord``. - 4. Emit ``workflow/started`` so the orchestration pipeline takes over. 
- """ - payload = BenchmarkRunRequest.model_validate(ctx.event.data) - logger.info( - "benchmark-run-start: slug=%s model=%s worker=%s evaluator=%s", - payload.benchmark_slug, - payload.model, - payload.worker_slug, - payload.evaluator_slug, - ) - - benchmark_cls = BENCHMARKS.get(payload.benchmark_slug) - if benchmark_cls is None: - raise RegistryLookupError("benchmark", payload.benchmark_slug) - - # reason: RFC 2026-04-22 §1 — config-time composition uses ``WorkerSpec``; - # the registry is only hit here to validate the slug so we fail fast with - # a ``RegistryLookupError`` instead of a late ``KeyError`` at execute. - if payload.worker_slug not in WORKERS: - raise RegistryLookupError("worker", payload.worker_slug) - - evaluator_cls = EVALUATORS.get(payload.evaluator_slug) - if evaluator_cls is None: - raise RegistryLookupError("evaluator", payload.evaluator_slug) - - benchmark = benchmark_cls() - worker_spec = WorkerSpec( - worker_slug=payload.worker_slug, - name="worker", - model=payload.model, - ) - evaluator = evaluator_cls(name="evaluator") - - experiment = Experiment.from_single_worker( - benchmark=benchmark, - worker=worker_spec, - evaluators={"default": evaluator}, - ) - persisted = experiment.persist() - - run = create_run(persisted) - - event = WorkflowStartedEvent( - run_id=run.id, - definition_id=persisted.definition_id, - ) - await inngest_client.send( - inngest.Event( - name=WorkflowStartedEvent.name, - data=event.model_dump(mode="json"), - ) - ) - - return BenchmarkRunStartResult( - run_id=run.id, - definition_id=persisted.definition_id, - benchmark=payload.benchmark_slug, - ) diff --git a/ergon_core/ergon_core/core/runtime/inngest/complete_workflow.py b/ergon_core/ergon_core/core/runtime/inngest/complete_workflow.py index 20fc5393..48db73f8 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/complete_workflow.py +++ b/ergon_core/ergon_core/core/runtime/inngest/complete_workflow.py @@ -7,7 +7,7 @@ from ergon_core.core.dashboard import emit_cohort_updated_for_run from ergon_core.core.dashboard.emitter import dashboard_emitter from ergon_core.core.persistence.shared.db import get_session -from ergon_core.core.persistence.telemetry.models import RunRecord +from ergon_core.core.persistence.telemetry.models import ExperimentRecord, RunRecord from ergon_core.core.runtime.events.infrastructure_events import RunCleanupEvent from ergon_core.core.runtime.events.task_events import WorkflowCompletedEvent from ergon_core.core.runtime.inngest_client import RUN_CANCEL, inngest_client @@ -99,6 +99,7 @@ async def complete_workflow_fn(ctx: inngest.Context) -> WorkflowCompleteResult: with get_session() as session: run = session.get(RunRecord, payload.run_id) + experiment = session.get(ExperimentRecord, run.experiment_id) if run else None if run and run.started_at and run.completed_at: sink.emit_span( CompletedSpan( @@ -109,7 +110,9 @@ async def complete_workflow_fn(ctx: inngest.Context) -> WorkflowCompleteResult: attributes={ "run_id": str(payload.run_id), "definition_id": str(payload.definition_id), - "cohort_id": str(run.cohort_id) if run.cohort_id else "", + "cohort_id": str(experiment.cohort_id) + if experiment and experiment.cohort_id + else "", "status": run.status, "final_score": finalized.final_score, "normalized_score": finalized.normalized_score, diff --git a/ergon_core/ergon_core/core/runtime/inngest/fail_workflow.py b/ergon_core/ergon_core/core/runtime/inngest/fail_workflow.py index 4dda0b84..eb4921de 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/fail_workflow.py +++ 
b/ergon_core/ergon_core/core/runtime/inngest/fail_workflow.py @@ -6,7 +6,7 @@ import inngest from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.shared.enums import RunStatus -from ergon_core.core.persistence.telemetry.models import RunRecord +from ergon_core.core.persistence.telemetry.models import ExperimentRecord, RunRecord from ergon_core.core.runtime.errors import DataIntegrityError from ergon_core.core.runtime.events.infrastructure_events import RunCleanupEvent from ergon_core.core.runtime.events.task_events import WorkflowFailedEvent @@ -82,6 +82,7 @@ async def fail_workflow_fn(ctx: inngest.Context) -> WorkflowFailedResult: with get_session() as session: run = session.get(RunRecord, payload.run_id) + experiment = session.get(ExperimentRecord, run.experiment_id) if run else None if run and run.started_at and run.completed_at: sink.emit_span( CompletedSpan( @@ -94,7 +95,9 @@ async def fail_workflow_fn(ctx: inngest.Context) -> WorkflowFailedResult: attributes={ "run_id": str(payload.run_id), "definition_id": str(payload.definition_id), - "cohort_id": str(run.cohort_id) if run.cohort_id else "", + "cohort_id": str(experiment.cohort_id) + if experiment and experiment.cohort_id + else "", "status": run.status, "error": truncate_text(payload.error), }, diff --git a/ergon_core/ergon_core/core/runtime/inngest/start_workflow.py b/ergon_core/ergon_core/core/runtime/inngest/start_workflow.py index da57ea8d..078597a2 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/start_workflow.py +++ b/ergon_core/ergon_core/core/runtime/inngest/start_workflow.py @@ -163,6 +163,7 @@ async def start_workflow_fn(ctx: inngest.Context) -> WorkflowStartResult: run_id=payload.run_id, definition_id=payload.definition_id, task_id=td.task_id, + node_id=td.node_id, ).model_dump(mode="json"), ) for td in initialized.initial_ready_tasks diff --git a/ergon_core/ergon_core/core/runtime/inngest_registry.py b/ergon_core/ergon_core/core/runtime/inngest_registry.py index a3033010..0859303b 100644 --- a/ergon_core/ergon_core/core/runtime/inngest_registry.py +++ b/ergon_core/ergon_core/core/runtime/inngest_registry.py @@ -3,7 +3,6 @@ Pass ALL_FUNCTIONS to inngest.serve() or the framework integration. 
""" -from ergon_core.core.runtime.inngest.benchmark_run_start import benchmark_run_start_fn from ergon_core.core.runtime.inngest.cancel_orphan_subtasks import ( block_descendants_on_failed_fn, cancel_orphans_on_cancelled_fn, @@ -25,8 +24,6 @@ from ergon_core.core.runtime.inngest.worker_execute import worker_execute_fn ALL_FUNCTIONS = [ - # Benchmark entry point - benchmark_run_start_fn, # Task orchestration start_workflow_fn, execute_task_fn, diff --git a/ergon_core/ergon_core/core/runtime/services/cohort_schemas.py b/ergon_core/ergon_core/core/runtime/services/cohort_schemas.py index b0a0d8da..238549ad 100644 --- a/ergon_core/ergon_core/core/runtime/services/cohort_schemas.py +++ b/ergon_core/ergon_core/core/runtime/services/cohort_schemas.py @@ -37,20 +37,20 @@ class CohortSummaryDto(BaseModel): stats_updated_at: datetime | None = None -class CohortRunRowDto(BaseModel): - """One run inside a cohort detail view.""" +class CohortExperimentRowDto(BaseModel): + """One experiment inside a cohort detail view.""" - run_id: UUID - definition_id: UUID - cohort_id: UUID - cohort_name: str + experiment_id: UUID + name: str + benchmark_type: str + sample_count: int + total_runs: int = 0 + status_counts: CohortStatusCountsDto = Field(default_factory=CohortStatusCountsDto) status: str created_at: datetime - started_at: datetime | None = None - completed_at: datetime | None = None - running_time_ms: int | None = None + default_model_target: str | None = None + default_evaluator_slug: str | None = None final_score: float | None = None - total_tasks: int | None = None total_cost_usd: float | None = None error_message: str | None = None @@ -59,7 +59,7 @@ class CohortDetailDto(BaseModel): """Full payload for a single cohort detail page.""" summary: CohortSummaryDto - runs: list[CohortRunRowDto] = Field(default_factory=list) + experiments: list[CohortExperimentRowDto] = Field(default_factory=list) class UpdateCohortRequest(BaseModel): diff --git a/ergon_core/ergon_core/core/runtime/services/cohort_service.py b/ergon_core/ergon_core/core/runtime/services/cohort_service.py index 5cb73e3d..d4122b1f 100644 --- a/ergon_core/ergon_core/core/runtime/services/cohort_service.py +++ b/ergon_core/ergon_core/core/runtime/services/cohort_service.py @@ -7,18 +7,18 @@ ExperimentCohort, ExperimentCohortStats, ExperimentCohortStatus, + ExperimentRecord, RunRecord, ) -from ergon_core.core.persistence.graph.models import RunGraphNode from ergon_core.core.runtime.services.cohort_schemas import ( CohortDetailDto, - CohortRunRowDto, + CohortExperimentRowDto, CohortStatusCountsDto, CohortSummaryDto, UpdateCohortRequest, ) from ergon_core.core.utils import utcnow -from sqlmodel import func, select +from sqlmodel import select class ExperimentCohortService: @@ -66,7 +66,7 @@ def list_summaries(self, *, include_archived: bool = False) -> list[CohortSummar return results def get_detail(self, cohort_id: UUID) -> CohortDetailDto | None: - """Get a cohort detail DTO with all current run rows.""" + """Get a cohort detail DTO with all experiments in the project folder.""" with get_session() as session: cohort = session.get(ExperimentCohort, cohort_id) if cohort is None: @@ -77,26 +77,23 @@ def get_detail(self, cohort_id: UUID) -> CohortDetailDto | None: ).first() summary = self._build_summary(cohort, stats) - runs = list( - session.exec(select(RunRecord).where(RunRecord.cohort_id == cohort_id)).all() + experiments = list( + session.exec( + select(ExperimentRecord).where(ExperimentRecord.cohort_id == cohort_id) + ).all() ) - task_counts = 
( - { - run_id: count - for run_id, count in session.exec( - select(RunGraphNode.run_id, func.count(RunGraphNode.id)) - .where(RunGraphNode.run_id.in_([run.id for run in runs])) - .group_by(RunGraphNode.run_id) - ).all() - } - if runs - else {} - ) - run_rows = [ - self._build_run_row(cohort, run, int(task_counts.get(run.id, 0)) or None) - for run in runs + experiment_rows = [ + self._build_experiment_row( + experiment, + list( + session.exec( + select(RunRecord).where(RunRecord.experiment_id == experiment.id) + ).all() + ), + ) + for experiment in experiments ] - return CohortDetailDto(summary=summary, runs=run_rows) + return CohortDetailDto(summary=summary, experiments=experiment_rows) def get_summary(self, cohort_id: UUID) -> CohortSummaryDto | None: """Get a single cohort summary DTO.""" @@ -159,42 +156,75 @@ def _build_summary( ) @staticmethod - def _build_run_row( - cohort: ExperimentCohort, - run: RunRecord, - total_tasks: int | None = None, - ) -> CohortRunRowDto: - running_time_ms: int | None = None - if run.started_at is not None: - end_time = run.completed_at or utcnow() - running_time_ms = max(int((end_time - run.started_at).total_seconds() * 1000), 0) - + def _build_experiment_row( + experiment: ExperimentRecord, + runs: list[RunRecord], + ) -> CohortExperimentRowDto: score: float | None = None - summary = run.parsed_summary() - if summary: + total_cost_usd: float | None = None + for run in runs: + summary = run.parsed_summary() raw_score = summary.get("normalized_score") if raw_score is None: raw_score = summary.get("final_score") - score = float(raw_score) if isinstance(raw_score, int | float) else None - total_cost_usd = summary.get("total_cost_usd") if summary else None - - return CohortRunRowDto( - run_id=run.id, - definition_id=run.experiment_definition_id, - cohort_id=cohort.id, - cohort_name=cohort.name, - status=run.status, - created_at=run.created_at, - started_at=run.started_at, - completed_at=run.completed_at, - running_time_ms=running_time_ms, + if isinstance(raw_score, int | float): + score = float(raw_score) + raw_cost = summary.get("total_cost_usd") + if isinstance(raw_cost, int | float): + total_cost_usd = (total_cost_usd or 0.0) + float(raw_cost) + + status_counts = CohortStatusCountsDto() + for run in runs: + _increment_status_count(status_counts, str(run.status)) + + return CohortExperimentRowDto( + experiment_id=experiment.id, + name=experiment.name, + benchmark_type=experiment.benchmark_type, + sample_count=experiment.sample_count, + total_runs=len(runs), + status_counts=status_counts, + status=_experiment_row_status(experiment.status, status_counts, len(runs)), + created_at=experiment.created_at, + default_model_target=experiment.default_model_target, + default_evaluator_slug=experiment.default_evaluator_slug, final_score=score, - total_tasks=total_tasks, - total_cost_usd=( - float(total_cost_usd) if isinstance(total_cost_usd, int | float) else None - ), - error_message=run.error_message, + total_cost_usd=total_cost_usd, + error_message=None, ) +def _increment_status_count(counts: CohortStatusCountsDto, status: str) -> None: + match status: + case "pending": + counts.pending += 1 + case "executing": + counts.executing += 1 + case "evaluating": + counts.evaluating += 1 + case "completed": + counts.completed += 1 + case "failed": + counts.failed += 1 + + +def _experiment_row_status( + experiment_status: str, + counts: CohortStatusCountsDto, + total_runs: int, +) -> str: + if total_runs == 0: + return experiment_status + active_runs = counts.pending + 
counts.executing + counts.evaluating + if active_runs > 0: + return experiment_status + if counts.failed == total_runs: + return "failed" + if counts.completed == total_runs: + return "completed" + if counts.failed > 0 and counts.completed > 0: + return "completed_with_failures" + return experiment_status + + experiment_cohort_service = ExperimentCohortService() diff --git a/ergon_core/ergon_core/core/runtime/services/cohort_stats_service.py b/ergon_core/ergon_core/core/runtime/services/cohort_stats_service.py index 25feb7bb..4f1598a8 100644 --- a/ergon_core/ergon_core/core/runtime/services/cohort_stats_service.py +++ b/ergon_core/ergon_core/core/runtime/services/cohort_stats_service.py @@ -7,6 +7,7 @@ from ergon_core.core.persistence.shared.enums import RunStatus from ergon_core.core.persistence.telemetry.models import ( ExperimentCohortStats, + ExperimentRecord, RunRecord, ) from ergon_core.core.utils import utcnow @@ -20,7 +21,11 @@ def recompute(self, cohort_id: UUID) -> None: """Recompute and persist aggregate stats for one cohort.""" with get_session() as session: runs = list( - session.exec(select(RunRecord).where(RunRecord.cohort_id == cohort_id)).all() + session.exec( + select(RunRecord) + .join(ExperimentRecord) + .where(ExperimentRecord.cohort_id == cohort_id) + ).all() ) status_counts = Counter(run.status for run in runs) diff --git a/ergon_core/ergon_core/core/runtime/services/experiment_definition_service.py b/ergon_core/ergon_core/core/runtime/services/experiment_definition_service.py new file mode 100644 index 00000000..66f24b96 --- /dev/null +++ b/ergon_core/ergon_core/core/runtime/services/experiment_definition_service.py @@ -0,0 +1,96 @@ +"""Experiment definition service.""" + +from collections.abc import Callable, Mapping, Sequence +from inspect import Parameter, signature + +from ergon_core.api.benchmark import Benchmark +from ergon_core.api.task_types import BenchmarkTask +from ergon_core.core.persistence.shared.db import get_session +from ergon_core.core.persistence.telemetry.models import ExperimentRecord +from ergon_core.core.runtime.services.experiment_schemas import ( + ExperimentDefineRequest, + ExperimentDefineResult, +) +from ergon_core.core.utils import utcnow +from pydantic import BaseModel + + +class ExperimentDefinitionService: + """Create experiment records without launching runs.""" + + def __init__(self, *, benchmarks: Mapping[str, Callable[..., Benchmark]] | None = None) -> None: + self._benchmarks = benchmarks + + def define_benchmark_experiment( + self, request: ExperimentDefineRequest + ) -> ExperimentDefineResult: + benchmark_cls = self._benchmark_cls(request.benchmark_slug) + benchmark = _construct_benchmark(benchmark_cls, limit=request.limit) + instances = benchmark.build_instances() + selected_samples = _select_samples(instances, request) + name = request.name or _generated_name(request.benchmark_slug, len(selected_samples)) + + experiment = ExperimentRecord( + cohort_id=request.cohort_id, + name=name, + benchmark_type=benchmark.type_slug, + sample_count=len(selected_samples), + sample_selection_json={"instance_keys": selected_samples}, + default_worker_team_json=request.default_worker_team, + default_evaluator_slug=request.default_evaluator_slug, + default_model_target=request.default_model_target, + design_json=request.design, + seed=request.seed, + metadata_json={ + **request.metadata, + "benchmark_slug": request.benchmark_slug, + }, + status="defined", + ) + with get_session() as session: + session.add(experiment) + session.commit() + 
session.refresh(experiment) + + return ExperimentDefineResult( + experiment_id=experiment.id, + cohort_id=experiment.cohort_id, + benchmark_type=experiment.benchmark_type, + sample_count=experiment.sample_count, + selected_samples=selected_samples, + ) + + def _benchmark_cls(self, benchmark_slug: str) -> Callable[..., Benchmark]: + if self._benchmarks is None: + from ergon_builtins.registry import ( # slopcop: ignore[guarded-function-import] -- reason: optional plugin registry; load only when defining benchmark experiments + BENCHMARKS, + ) + + self._benchmarks = BENCHMARKS + return self._benchmarks[benchmark_slug] + + +def _construct_benchmark(cls: Callable[..., Benchmark], *, limit: int | None) -> Benchmark: + parameters = signature(cls).parameters + accepts_limit = "limit" in parameters or any( + parameter.kind is Parameter.VAR_KEYWORD for parameter in parameters.values() + ) + if limit is not None and accepts_limit: + return cls(limit=limit) + return cls() + + +def _select_samples( + instances: Mapping[str, Sequence[BenchmarkTask[BaseModel]]], + request: ExperimentDefineRequest, +) -> list[str]: + if request.sample_ids is not None: + missing = [sample_id for sample_id in request.sample_ids if sample_id not in instances] + if missing: + raise ValueError(f"Unknown benchmark sample ids: {missing}") + return list(request.sample_ids) + return list(instances.keys()) + + +def _generated_name(benchmark_slug: str, sample_count: int) -> str: + return f"{benchmark_slug} n={sample_count} {utcnow().strftime('%Y-%m-%d %H:%M:%S')}" diff --git a/ergon_core/ergon_core/core/runtime/services/experiment_launch_service.py b/ergon_core/ergon_core/core/runtime/services/experiment_launch_service.py new file mode 100644 index 00000000..97eda74b --- /dev/null +++ b/ergon_core/ergon_core/core/runtime/services/experiment_launch_service.py @@ -0,0 +1,211 @@ +"""Experiment launch service.""" + +from collections.abc import Awaitable, Callable, Mapping, Sequence +from uuid import UUID + +import inngest +from ergon_core.api.benchmark import Benchmark +from ergon_core.api.evaluator import Evaluator +from ergon_core.api.experiment import Experiment +from ergon_core.api.handles import PersistedExperimentDefinition +from ergon_core.api.json_types import JsonObject +from ergon_core.api.task_types import BenchmarkTask +from ergon_core.api.worker_spec import WorkerSpec +from ergon_core.core.persistence.shared.db import get_session +from ergon_core.core.persistence.telemetry.models import ExperimentRecord +from ergon_core.core.runtime.events.task_events import WorkflowStartedEvent +from ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.services.experiment_schemas import ( + ExperimentRunRequest, + ExperimentRunResult, + RunAssignment, +) +from ergon_core.core.runtime.services.run_service import create_run +from pydantic import BaseModel + +WorkflowDefinitionFactory = Callable[ + [ExperimentRecord, RunAssignment], + PersistedExperimentDefinition, +] +WorkflowStartedEmitter = Callable[[UUID, UUID], Awaitable[None]] + + +class ExperimentLaunchService: + """Materialize runs for a previously defined experiment.""" + + def __init__( + self, + *, + workflow_definition_factory: WorkflowDefinitionFactory | None = None, + emit_workflow_started: WorkflowStartedEmitter | None = None, + ) -> None: + self._workflow_definition_factory = ( + workflow_definition_factory or _persist_single_sample_workflow_definition + ) + self._emit_workflow_started = emit_workflow_started or _emit_workflow_started + + 
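+    # Launch flow: load the experiment and flip it to "running" inside one
+    # session; then, with the session closed, persist one workflow definition
+    # per assignment, create the matching RunRecord, and emit workflow/started
+    # so the orchestration pipeline takes over.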
async def run_experiment(self, request: ExperimentRunRequest) -> ExperimentRunResult: + with get_session() as session: + experiment = session.get(ExperimentRecord, request.experiment_id) + if experiment is None: + raise ValueError(f"Experiment {request.experiment_id} not found") + assignments = _assign_runs(experiment) + experiment.status = "running" + session.add(experiment) + session.commit() + session.refresh(experiment) + + run_ids: list[UUID] = [] + workflow_definition_ids: list[UUID] = [] + for assignment in assignments: + definition = self._workflow_definition_factory(experiment, assignment) + run = create_run( + definition, + experiment_id=experiment.id, + workflow_definition_id=definition.definition_id, + instance_key=assignment.instance_key, + worker_team_json=assignment.worker_team, + evaluator_slug=assignment.evaluator_slug, + model_target=assignment.model_target, + assignment_json=assignment.metadata, + seed=assignment.seed, + ) + await self._emit_workflow_started(run.id, definition.definition_id) + run_ids.append(run.id) + workflow_definition_ids.append(definition.definition_id) + + return ExperimentRunResult( + experiment_id=experiment.id, + run_ids=run_ids, + workflow_definition_ids=workflow_definition_ids, + ) + + +def _assign_runs(experiment: ExperimentRecord) -> list[RunAssignment]: + sample_selection = experiment.parsed_sample_selection() + instance_keys = sample_selection.get("instance_keys") + if not isinstance(instance_keys, list) or not all( + isinstance(instance_key, str) for instance_key in instance_keys + ): + raise ValueError("Experiment sample_selection_json must include string instance_keys") + + return [ + RunAssignment( + instance_key=instance_key, + sample_id=instance_key, + worker_team=experiment.parsed_default_worker_team(), + evaluator_slug=experiment.default_evaluator_slug, + model_target=experiment.default_model_target, + arm_key="default", + seed=experiment.seed, + metadata={"arm_key": "default"}, + ) + for instance_key in instance_keys + ] + + +def _persist_single_sample_workflow_definition( + experiment: ExperimentRecord, + assignment: RunAssignment, +) -> PersistedExperimentDefinition: + benchmark_slug = _metadata_str(experiment, "benchmark_slug") or experiment.benchmark_type + benchmark = _single_sample_benchmark(benchmark_slug, assignment.instance_key) + worker_slug = _primary_worker_slug(assignment.worker_team) + worker = WorkerSpec( + worker_slug=worker_slug, + name="primary", + model=assignment.model_target or "openai:gpt-4o", + ) + evaluators = _evaluator_bindings(assignment.evaluator_slug) + workflow = Experiment.from_single_worker( + benchmark=benchmark, + worker=worker, + evaluators=evaluators, + ) + return workflow.persist() + + +def _metadata_str(experiment: ExperimentRecord, key: str) -> str | None: + value = experiment.parsed_metadata().get(key) + return value if isinstance(value, str) else None + + +def _primary_worker_slug(worker_team: JsonObject) -> str: + value = worker_team.get("primary") + if not isinstance(value, str) or not value: + raise ValueError("Run assignment worker_team requires a string 'primary' worker slug") + return value + + +def _evaluator_bindings(evaluator_slug: str | None) -> dict[str, Evaluator]: + if evaluator_slug is None: + return {} + from ergon_builtins.registry import ( # slopcop: ignore[guarded-function-import] -- reason: optional plugin registry; load only when launching experiment runs + EVALUATORS, + ) + + evaluator_cls = EVALUATORS[evaluator_slug] + return {"default": evaluator_cls(name="evaluator")} 
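+
+# Single-sample wrapping: each run re-executes the registered benchmark
+# narrowed to one instance_key. _single_sample_benchmark validates the key
+# against build_instances() and wraps the source in a dynamically created
+# _SingleSampleBenchmark subclass that carries the source's type_slug and
+# payload model but exposes only that one sample, so every persisted
+# workflow definition stays single-sample.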
+ + +def _single_sample_benchmark(benchmark_slug: str, instance_key: str) -> Benchmark: + from ergon_builtins.registry import ( # slopcop: ignore[guarded-function-import] -- reason: optional plugin registry; load only when launching experiment runs + BENCHMARKS, + ) + + source = BENCHMARKS[benchmark_slug]() + instances = source.build_instances() + if instance_key not in instances: + raise ValueError( + f"Experiment sample {instance_key!r} not found in benchmark {benchmark_slug!r}" + ) + wrapper_cls = _single_sample_benchmark_cls(source) + return wrapper_cls(source, instance_key, instances[instance_key]) + + +class _SingleSampleBenchmark(Benchmark): + type_slug = "single-sample-wrapper" + + def __init__( + self, + source: Benchmark, + instance_key: str, + tasks: Sequence[BenchmarkTask[BaseModel]], + ) -> None: + super().__init__( + name=source.name, + description=source.description, + metadata=source.metadata, + ) + self._source = source + self._instance_key = instance_key + self._tasks = list(tasks) + + def build_instances(self) -> Mapping[str, Sequence[BenchmarkTask[BaseModel]]]: + return {self._instance_key: self._tasks} + + def evaluator_requirements(self) -> Sequence[str]: + return self._source.evaluator_requirements() + + +def _single_sample_benchmark_cls(source: Benchmark) -> type[_SingleSampleBenchmark]: + return type( + f"SingleSample{source.type_slug.replace('-', '_').title()}Benchmark", + (_SingleSampleBenchmark,), + { + "type_slug": source.type_slug, + "task_payload_model": source.task_payload_model, + "required_packages": source.required_packages, + "install_hint": source.install_hint, + }, + ) + + +async def _emit_workflow_started(run_id: UUID, definition_id: UUID) -> None: + event = WorkflowStartedEvent(run_id=run_id, definition_id=definition_id) + await inngest_client.send( + inngest.Event( + name=WorkflowStartedEvent.name, + data=event.model_dump(mode="json"), + ) + ) diff --git a/ergon_core/ergon_core/core/runtime/services/experiment_read_service.py b/ergon_core/ergon_core/core/runtime/services/experiment_read_service.py new file mode 100644 index 00000000..abde1dc6 --- /dev/null +++ b/ergon_core/ergon_core/core/runtime/services/experiment_read_service.py @@ -0,0 +1,251 @@ +"""Read service for experiment API views.""" + +from datetime import datetime +from uuid import UUID + +from ergon_core.core.persistence.graph.models import RunGraphNode +from ergon_core.core.persistence.shared.db import get_session +from ergon_core.core.persistence.telemetry.models import ExperimentRecord, RunRecord +from pydantic import BaseModel, Field +from sqlmodel import select + + +class ExperimentStatusCountsDto(BaseModel): + pending: int = 0 + executing: int = 0 + evaluating: int = 0 + completed: int = 0 + failed: int = 0 + cancelled: int = 0 + + +class ExperimentSummaryDto(BaseModel): + experiment_id: UUID + cohort_id: UUID | None = None + name: str + benchmark_type: str + sample_count: int + status: str + default_worker_team: dict = Field(default_factory=dict) + default_evaluator_slug: str | None = None + default_model_target: str | None = None + created_at: datetime + started_at: datetime | None = None + completed_at: datetime | None = None + run_count: int = 0 + + +class ExperimentRunRowDto(BaseModel): + run_id: UUID + workflow_definition_id: UUID + benchmark_type: str + instance_key: str + status: str + created_at: datetime + started_at: datetime | None = None + completed_at: datetime | None = None + evaluator_slug: str | None = None + model_target: str | None = None + worker_team: dict = 
Field(default_factory=dict) + seed: int | None = None + running_time_ms: int | None = None + final_score: float | None = None + total_tasks: int | None = None + total_cost_usd: float | None = None + error_message: str | None = None + + +class ExperimentAnalyticsDto(BaseModel): + total_runs: int = 0 + status_counts: ExperimentStatusCountsDto = Field(default_factory=ExperimentStatusCountsDto) + average_score: float | None = None + average_duration_ms: int | None = None + average_tasks: float | None = None + total_cost_usd: float | None = None + latest_activity_at: datetime | None = None + error_count: int = 0 + + +class ExperimentDetailDto(BaseModel): + experiment: ExperimentSummaryDto + runs: list[ExperimentRunRowDto] = Field(default_factory=list) + analytics: ExperimentAnalyticsDto = Field(default_factory=ExperimentAnalyticsDto) + sample_selection: dict = Field(default_factory=dict) + design: dict = Field(default_factory=dict) + metadata: dict = Field(default_factory=dict) + + +class ExperimentReadService: + def list_experiments(self, *, limit: int = 50) -> list[ExperimentSummaryDto]: + with get_session() as session: + experiments = list( + session.exec( + select(ExperimentRecord) + .order_by(ExperimentRecord.created_at.desc()) + .limit(limit) + ).all() + ) + return [_summary(session, experiment) for experiment in experiments] + + def get_experiment(self, experiment_id: UUID) -> ExperimentDetailDto | None: + with get_session() as session: + experiment = session.get(ExperimentRecord, experiment_id) + if experiment is None: + return None + runs = list( + session.exec( + select(RunRecord).where(RunRecord.experiment_id == experiment.id) + ).all() + ) + task_counts = _task_counts_by_run(session, [run.id for run in runs]) + run_rows = [_run_row(run, total_tasks=task_counts.get(run.id)) for run in runs] + return ExperimentDetailDto( + experiment=_summary(session, experiment, runs=runs), + runs=run_rows, + analytics=_analytics(run_rows), + sample_selection=experiment.parsed_sample_selection(), + design=experiment.parsed_design(), + metadata=experiment.parsed_metadata(), + ) + + +def _summary( + session, + experiment: ExperimentRecord, + *, + runs: list[RunRecord] | None = None, +) -> ExperimentSummaryDto: + run_count = len(runs) if runs is not None else _run_count(session, experiment.id) + return ExperimentSummaryDto( + experiment_id=experiment.id, + cohort_id=experiment.cohort_id, + name=experiment.name, + benchmark_type=experiment.benchmark_type, + sample_count=experiment.sample_count, + status=experiment.status, + default_worker_team=experiment.parsed_default_worker_team(), + default_evaluator_slug=experiment.default_evaluator_slug, + default_model_target=experiment.default_model_target, + created_at=experiment.created_at, + started_at=experiment.started_at, + completed_at=experiment.completed_at, + run_count=run_count, + ) + + +def _run_count(session, experiment_id: UUID) -> int: + return len( + list(session.exec(select(RunRecord.id).where(RunRecord.experiment_id == experiment_id))) + ) + + +def _run_row(run: RunRecord, *, total_tasks: int | None = None) -> ExperimentRunRowDto: + summary = run.parsed_summary() + return ExperimentRunRowDto( + run_id=run.id, + workflow_definition_id=run.workflow_definition_id, + benchmark_type=run.benchmark_type, + instance_key=run.instance_key, + status=run.status, + created_at=run.created_at, + started_at=run.started_at, + completed_at=run.completed_at, + evaluator_slug=run.evaluator_slug, + model_target=run.model_target, + 
+def _analytics(rows: list[ExperimentRunRowDto]) -> ExperimentAnalyticsDto:
+    status_counts = ExperimentStatusCountsDto()
+    scores: list[float] = []
+    durations: list[int] = []
+    task_counts: list[int] = []
+    total_cost_usd: float | None = None
+    latest_activity_at: datetime | None = None
+    error_count = 0
+
+    for row in rows:
+        _increment_status_count(status_counts, row.status)
+        if row.final_score is not None:
+            scores.append(row.final_score)
+        if row.running_time_ms is not None:
+            durations.append(row.running_time_ms)
+        if row.total_tasks is not None:
+            task_counts.append(row.total_tasks)
+        if row.total_cost_usd is not None:
+            total_cost_usd = (total_cost_usd or 0.0) + row.total_cost_usd
+        if row.error_message:
+            error_count += 1
+        activity_at = row.completed_at or row.started_at or row.created_at
+        if latest_activity_at is None or activity_at > latest_activity_at:
+            latest_activity_at = activity_at
+
+    return ExperimentAnalyticsDto(
+        total_runs=len(rows),
+        status_counts=status_counts,
+        average_score=_average(scores),
+        average_duration_ms=round(_average(durations)) if durations else None,
+        average_tasks=_average(task_counts),
+        total_cost_usd=total_cost_usd,
+        latest_activity_at=latest_activity_at,
+        error_count=error_count,
+    )
+
+
+def _increment_status_count(counts: ExperimentStatusCountsDto, status: str) -> None:
+    match status:
+        case "pending":
+            counts.pending += 1
+        case "executing":
+            counts.executing += 1
+        case "evaluating":
+            counts.evaluating += 1
+        case "completed":
+            counts.completed += 1
+        case "failed":
+            counts.failed += 1
+        case "cancelled":
+            counts.cancelled += 1
+
+
+def _average(values: list[float] | list[int]) -> float | None:
+    if not values:
+        return None
+    return sum(values) / len(values)
+
+
+def _duration_ms(run: RunRecord) -> int | None:
+    if run.started_at is None or run.completed_at is None:
+        return None
+    return round((run.completed_at - run.started_at).total_seconds() * 1000)
+
+
+def _summary_number(summary: dict, key: str) -> float | None:
+    value = summary.get(key)
+    if isinstance(value, int | float):
+        return float(value)
+    return None
+
+
+def _summary_text(summary: dict, key: str) -> str | None:
+    value = summary.get(key)
+    if isinstance(value, str) and value:
+        return value
+    return None
diff --git a/ergon_core/ergon_core/core/runtime/services/experiment_schemas.py b/ergon_core/ergon_core/core/runtime/services/experiment_schemas.py
new file mode 100644
index 00000000..e6ac631f
--- /dev/null
+++ b/ergon_core/ergon_core/core/runtime/services/experiment_schemas.py
@@ -0,0 +1,75 @@
+"""DTOs for experiment definition and launch services."""
+
+from typing import Self
+from uuid import UUID
+
+from ergon_core.api.json_types import JsonObject
+from pydantic import BaseModel, Field, model_validator
+
+
+class ExperimentDefineRequest(BaseModel):
+    benchmark_slug: str
+    name: str | None = None
+    cohort_id: UUID | None = None
+    limit: int | None = None
+    sample_ids: list[str] | None = None
+    default_model_target: str | None = None
+    default_worker_team: JsonObject = Field(default_factory=dict)
+    default_evaluator_slug: str | None = None
+    design: JsonObject = Field(default_factory=dict)
+    seed: int | None = None
+    metadata: JsonObject = Field(default_factory=dict)
+
+    @model_validator(mode="after")
+    def validate_define_request(self) -> Self:
+        if (self.limit is None) == (self.sample_ids is None):
+            raise ValueError("Provide exactly one of limit or sample_ids")
+        if self.limit is not None and self.limit < 1:
+            raise ValueError("limit must be >= 1")
+
+        if self.design.get("arms"):
+            raise ValueError("design.arms is not supported until multi-arm launch semantics exist")
+
+        has_default_assignment = bool(self.default_worker_team) and bool(self.default_model_target)
+        if not has_default_assignment:
+            raise ValueError(
+                "Experiment definition requires default_worker_team + default_model_target"
+            )
+        return self
+
+
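+# Illustrative only -- a minimal request that passes the validator above,
+# mirroring the README smoke invocation (worker-team shape assumed):
+#
+#   ExperimentDefineRequest(
+#       benchmark_slug="smoke_test",
+#       limit=1,
+#       default_worker_team={"primary": "training-stub"},
+#       default_model_target="stub:constant",
+#   )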
+class ExperimentDefineResult(BaseModel):
+    experiment_id: UUID
+    cohort_id: UUID | None
+    benchmark_type: str
+    sample_count: int
+    selected_samples: list[str]
+
+
+class ExperimentRunRequest(BaseModel):
+    experiment_id: UUID
+    timeout_seconds: int | None = None
+    wait: bool = True
+
+
+class ExperimentRunResult(BaseModel):
+    experiment_id: UUID
+    run_ids: list[UUID]
+    workflow_definition_ids: list[UUID] = Field(default_factory=list)
+
+
+class RunAssignment(BaseModel):
+    instance_key: str
+    sample_id: str | None = None
+    worker_team: JsonObject
+    evaluator_slug: str | None = None
+    model_target: str | None = None
+    arm_key: str | None = None
+    seed: int | None = None
+    metadata: JsonObject = Field(default_factory=dict)
+
+    @model_validator(mode="after")
+    def validate_assignment(self) -> Self:
+        if not self.worker_team:
+            raise ValueError("Run assignment requires a worker team")
+        return self
diff --git a/ergon_core/ergon_core/core/runtime/services/graph_repository.py b/ergon_core/ergon_core/core/runtime/services/graph_repository.py
index 01ce6591..0efffd3d 100644
--- a/ergon_core/ergon_core/core/runtime/services/graph_repository.py
+++ b/ergon_core/ergon_core/core/runtime/services/graph_repository.py
@@ -22,6 +22,7 @@
     ExperimentDefinitionTask,
     ExperimentDefinitionTaskAssignment,
     ExperimentDefinitionTaskDependency,
+    ExperimentDefinitionWorker,
 )
 from ergon_core.core.persistence.graph.models import (
     RunGraphAnnotation,
@@ -134,9 +135,16 @@ def initialize_from_definition(
             ExperimentDefinitionTaskAssignment.experiment_definition_id == definition_id,
         )
         assignments = list(session.exec(assignments_stmt).all())
+
+        workers_stmt = select(ExperimentDefinitionWorker).where(
+            ExperimentDefinitionWorker.experiment_definition_id == definition_id,
+        )
+        worker_type_by_binding = {
+            worker.binding_key: worker.worker_type for worker in session.exec(workers_stmt).all()
+        }
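+        # Graph nodes now carry the worker *type* slug rather than the binding
+        # key, so downstream execution can resolve workers from the plugin
+        # registry without re-reading the definition's binding table.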
model_config = {"frozen": True} - - run_id: UUID - definition_id: UUID - benchmark: str = "" # slopcop: ignore[no-str-empty-default] - - class RunCleanupResult(BaseModel): model_config = {"frozen": True} diff --git a/ergon_core/ergon_core/core/runtime/services/run_read_service.py b/ergon_core/ergon_core/core/runtime/services/run_read_service.py index f8a2b811..cafd1b40 100644 --- a/ergon_core/ergon_core/core/runtime/services/run_read_service.py +++ b/ergon_core/ergon_core/core/runtime/services/run_read_service.py @@ -21,6 +21,7 @@ from ergon_core.core.persistence.graph.models import RunGraphEdge, RunGraphMutation, RunGraphNode from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.telemetry.models import ( + ExperimentRecord, RunRecord, RunResource, RunTaskEvaluation, @@ -56,11 +57,11 @@ def build_run_snapshot(self, run_id: UUID) -> RunSnapshotDto | None: if run is None: return None - definition = session.get(ExperimentDefinition, run.experiment_definition_id) + definition = session.get(ExperimentDefinition, run.workflow_definition_id) if definition is None: return None - def_id = run.experiment_definition_id + def_id = run.workflow_definition_id nodes = list( session.exec(select(RunGraphNode).where(RunGraphNode.run_id == run_id)).all() ) @@ -144,7 +145,7 @@ def build_run_snapshot(self, run_id: UUID) -> RunSnapshotDto | None: return RunSnapshotDto( id=run_id_str, - experiment_id=str(def_id), + experiment_id=str(run.experiment_id), name=run_name, status=run.status, tasks=task_map, @@ -244,9 +245,9 @@ def list_training_curves( with get_session() as session: stmt = select(RunRecord) if definition_id: - stmt = stmt.where(RunRecord.experiment_definition_id == definition_id) + stmt = stmt.where(RunRecord.workflow_definition_id == definition_id) if cohort_id: - stmt = stmt.where(RunRecord.cohort_id == cohort_id) + stmt = stmt.join(ExperimentRecord).where(ExperimentRecord.cohort_id == cohort_id) stmt = stmt.order_by(RunRecord.created_at) runs = list(session.exec(stmt).all()) diff --git a/ergon_core/ergon_core/core/runtime/services/run_service.py b/ergon_core/ergon_core/core/runtime/services/run_service.py index ee7e947c..fb4943ff 100644 --- a/ergon_core/ergon_core/core/runtime/services/run_service.py +++ b/ergon_core/ergon_core/core/runtime/services/run_service.py @@ -1,11 +1,10 @@ """Run creation, dispatch, and cancellation via Inngest.""" -import asyncio import logging from uuid import UUID import inngest -from ergon_core.api.handles import ExperimentRunHandle, PersistedExperimentDefinition +from ergon_core.api.handles import PersistedExperimentDefinition from ergon_core.api.json_types import JsonObject from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.shared.enums import TERMINAL_RUN_STATUSES, RunStatus @@ -14,16 +13,12 @@ RunCancelledEvent, RunCleanupEvent, ) -from ergon_core.core.runtime.events.task_events import WorkflowStartedEvent from ergon_core.core.runtime.inngest_client import inngest_client from ergon_core.core.settings import settings from ergon_core.core.utils import utcnow logger = logging.getLogger(__name__) -_POLL_INTERVAL_S = 1.0 -_DEFAULT_TIMEOUT_S = 600.0 - def _checkpoint_metadata() -> JsonObject: """Checkpoint context for ``RunRecord.summary_json`` (eval watcher / checkpoint subprocess). 
@@ -39,14 +34,29 @@ def _checkpoint_metadata() -> JsonObject:
     }
 
 
-def create_run(
+def create_run(  # slopcop: ignore[max-function-params] -- service boundary mirrors RunRecord provenance fields
     definition: PersistedExperimentDefinition,
-    cohort_id: UUID | None = None,
+    *,
+    experiment_id: UUID,
+    workflow_definition_id: UUID,
+    instance_key: str,
+    worker_team_json: JsonObject,
+    evaluator_slug: str | None = None,
+    model_target: str | None = None,
+    assignment_json: JsonObject | None = None,
+    seed: int | None = None,
 ) -> RunRecord:
     with get_session() as session:
         run = RunRecord(
-            experiment_definition_id=definition.definition_id,
-            cohort_id=cohort_id,
+            experiment_id=experiment_id,
+            workflow_definition_id=workflow_definition_id,
+            benchmark_type=definition.benchmark_type,
+            instance_key=instance_key,
+            worker_team_json=worker_team_json,
+            evaluator_slug=evaluator_slug,
+            model_target=model_target,
+            assignment_json=assignment_json or {},
+            seed=seed,
             status=RunStatus.PENDING,
             created_at=utcnow(),
             summary_json=_checkpoint_metadata(),
@@ -57,54 +67,6 @@ def create_run(
     return run
 
-
-async def create_experiment_run(
-    definition: PersistedExperimentDefinition,
-    timeout_s: float = _DEFAULT_TIMEOUT_S,
-) -> ExperimentRunHandle:
-    run = create_run(definition)
-
-    event = WorkflowStartedEvent(
-        run_id=run.id,
-        definition_id=definition.definition_id,
-    )
-    await inngest_client.send(
-        inngest.Event(
-            name=WorkflowStartedEvent.name,
-            data=event.model_dump(mode="json"),
-        )
-    )
-
-    logger.info("Dispatched workflow/started for run %s", run.id)
-
-    elapsed = 0.0
-    final_status = RunStatus.PENDING
-    while elapsed < timeout_s:
-        await asyncio.sleep(_POLL_INTERVAL_S)
-        elapsed += _POLL_INTERVAL_S
-
-        with get_session() as session:
-            current = session.get(RunRecord, run.id)
-            if current is None:
-                raise RuntimeError(f"RunRecord {run.id} vanished during polling")
-            final_status = current.status
-            if final_status in TERMINAL_RUN_STATUSES:
-                break
-
-    if final_status not in TERMINAL_RUN_STATUSES:
-        logger.warning("Run %s did not reach terminal state within %ss", run.id, timeout_s)
-
-    return ExperimentRunHandle(
-        run_id=run.id,
-        definition_id=definition.definition_id,
-        benchmark_type=definition.benchmark_type,
-        status=final_status,
-        worker_bindings=definition.worker_bindings,
-        created_at=run.created_at,
-        started_at=run.started_at,
-        metadata=definition.metadata,
-    )
-
-
 def cancel_run(run_id: UUID) -> RunRecord:
     """Cancel a run: mark CANCELLED in PG, kill Inngest functions, trigger cleanup."""
     with get_session() as session:
diff --git a/ergon_core/ergon_core/core/runtime/services/task_execution_service.py b/ergon_core/ergon_core/core/runtime/services/task_execution_service.py
index acc13892..0dfada45 100644
--- a/ergon_core/ergon_core/core/runtime/services/task_execution_service.py
+++ b/ergon_core/ergon_core/core/runtime/services/task_execution_service.py
@@ -13,7 +13,7 @@
 from ergon_core.core.persistence.graph.models import RunGraphNode
 from ergon_core.core.persistence.shared.db import get_session
 from ergon_core.core.persistence.shared.enums import TaskExecutionStatus
-from ergon_core.core.persistence.telemetry.models import RunTaskExecution
+from ergon_core.core.persistence.telemetry.models import RunRecord, RunTaskExecution
 from ergon_core.core.runtime.errors.inngest_errors import ConfigurationError
 from ergon_core.core.runtime.execution.propagation import (
     mark_task_failed,
@@ -106,29 +106,63 @@ async def _prepare_graph_native(
                 task_id=command.task_id,
             )
 
-            worker_row = session.exec(
-                select(ExperimentDefinitionWorker).where(
-                    ExperimentDefinitionWorker.experiment_definition_id == command.definition_id,
-                    ExperimentDefinitionWorker.binding_key == assigned_worker_slug,
-                )
-            ).first()
-            if worker_row is None:
-                raise ConfigurationError(
-                    f"No ExperimentDefinitionWorker with binding_key="
-                    f"'{assigned_worker_slug}' for definition {command.definition_id}",
-                    run_id=command.run_id,
-                    task_id=command.task_id,
-                )
-
             definition = require_not_none(
                 session.get(ExperimentDefinition, command.definition_id),
                 f"Definition {command.definition_id} not found",
             )
 
+            definition_worker_id: UUID | None
+            worker_type: str
+            model_target: str
+
+            if node.definition_task_id is None:
+                from ergon_builtins.registry import (  # slopcop: ignore[guarded-function-import] -- reason: dynamic graph tasks resolve test/plugin workers at execution time
+                    WORKERS,
+                )
+
+                if assigned_worker_slug not in WORKERS:
+                    raise ConfigurationError(
+                        f"Unknown worker slug '{assigned_worker_slug}' for dynamic graph task",
+                        run_id=command.run_id,
+                        task_id=command.task_id,
+                    )
+                run = require_not_none(
+                    session.get(RunRecord, command.run_id),
+                    f"RunRecord {command.run_id} not found",
+                )
+                definition_worker_id = None
+                worker_type = assigned_worker_slug
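+                # Dynamic nodes have no definition worker row, so the run-level
+                # model target applies; "openai:gpt-4o" is the hard-coded default.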
+                model_target = run.model_target or "openai:gpt-4o"
+            else:
+                assignment = require_not_none(
+                    session.exec(
+                        select(ExperimentDefinitionTaskAssignment).where(
+                            ExperimentDefinitionTaskAssignment.experiment_definition_id
+                            == command.definition_id,
+                            ExperimentDefinitionTaskAssignment.task_id == node.definition_task_id,
+                        )
+                    ).first(),
+                    f"Definition task {node.definition_task_id} has no worker assignment",
+                )
+                worker_row = require_not_none(
+                    session.exec(
+                        select(ExperimentDefinitionWorker).where(
+                            ExperimentDefinitionWorker.experiment_definition_id
+                            == command.definition_id,
+                            ExperimentDefinitionWorker.binding_key == assignment.worker_binding_key,
+                        )
+                    ).first(),
+                    f"No ExperimentDefinitionWorker with binding_key="
+                    f"'{assignment.worker_binding_key}' for definition {command.definition_id}",
+                )
+                definition_worker_id = worker_row.id
+                worker_type = worker_row.worker_type
+                model_target = worker_row.model_target
 
             execution = RunTaskExecution(
                 run_id=command.run_id,
                 node_id=node_id,
-                definition_worker_id=worker_row.id,
+                definition_task_id=node.definition_task_id,
+                definition_worker_id=definition_worker_id,
                 attempt_number=self._next_attempt_number(session, command.run_id, node_id),
                 status=TaskExecutionStatus.RUNNING,
                 started_at=utcnow(),
@@ -154,7 +188,7 @@ async def _prepare_graph_native(
                 task_slug=node.task_slug,
                 new_status=TaskExecutionStatus.RUNNING,
                 old_status=None,
-                worker_id=worker_row.id,
+                worker_id=definition_worker_id,
                 worker_name=assigned_worker_slug,
             )
 
@@ -171,8 +205,8 @@ async def _prepare_graph_native(
                 task_description=node.description,
                 benchmark_type=definition.benchmark_type,
                 assigned_worker_slug=assigned_worker_slug,
-                worker_type=worker_row.worker_type,
-                model_target=worker_row.model_target,
+                worker_type=worker_type,
+                model_target=model_target,
                 execution_id=execution.id,
             )
 
diff --git a/ergon_core/ergon_core/core/runtime/services/task_management_service.py b/ergon_core/ergon_core/core/runtime/services/task_management_service.py
index f952250a..ed50f78c 100644
--- a/ergon_core/ergon_core/core/runtime/services/task_management_service.py
+++ b/ergon_core/ergon_core/core/runtime/services/task_management_service.py
@@ -123,6 +123,12 @@ async def add_subtask(
         dependency edges (source=dep, target=new_node).
""" task_slug = command.task_slug + from ergon_builtins.registry import ( # slopcop: ignore[guarded-function-import] -- reason: dynamic task creation validates plugin worker slugs only when manager tools run + WORKERS, + ) + + if command.assigned_worker_slug not in WORKERS: + raise ValueError(f"Unknown worker slug: {command.assigned_worker_slug!r}") parent = self._graph_repo.get_node( session, run_id=command.run_id, node_id=command.parent_node_id @@ -248,6 +254,13 @@ async def plan_subtasks( root tasks (those with no depends_on). """ self._validate_plan(command.subtasks) + from ergon_builtins.registry import ( # slopcop: ignore[guarded-function-import] -- reason: dynamic task creation validates plugin worker slugs only when manager tools run + WORKERS, + ) + + for spec in command.subtasks: + if spec.assigned_worker_slug not in WORKERS: + raise ValueError(f"Unknown worker slug: {spec.assigned_worker_slug!r}") parent = self._graph_repo.get_node( session, run_id=command.run_id, node_id=command.parent_node_id @@ -668,7 +681,7 @@ def _check_no_cycles(subtasks: list[SubtaskSpec]) -> None: raise CycleDetectedError(remaining) def _resolve_definition_id(self, session: Session, run_id: UUID) -> UUID: - """Read experiment_definition_id from RunRecord. + """Read workflow_definition_id from RunRecord. Every run references exactly one definition, so a missing RunRecord is an invariant violation — callers must always create the RunRecord @@ -678,7 +691,7 @@ def _resolve_definition_id(self, session: Session, run_id: UUID) -> UUID: run = session.exec(select(RunRecord).where(RunRecord.id == run_id)).first() if run is None: raise RunRecordMissingError(run_id) - return run.experiment_definition_id + return run.workflow_definition_id async def _dispatch_task_ready( self, diff --git a/ergon_core/ergon_core/core/runtime/services/workflow_dto.py b/ergon_core/ergon_core/core/runtime/services/workflow_dto.py index d5b45aaa..33dd2ac3 100644 --- a/ergon_core/ergon_core/core/runtime/services/workflow_dto.py +++ b/ergon_core/ergon_core/core/runtime/services/workflow_dto.py @@ -13,6 +13,7 @@ class WorkflowTaskRef(BaseModel): level: int parent_node_id: UUID | None = None assigned_worker_slug: str | None = None + description: str | None = None class WorkflowExecutionRef(BaseModel): @@ -82,3 +83,32 @@ class WorkflowMaterializedResourceRef(BaseModel): sandbox_path: str dry_run: bool = False source_mutated: bool = False + + +class WorkflowMutationRef(BaseModel): + model_config = {"frozen": True} + + action: str + dry_run: bool + node: WorkflowTaskRef | None = None + edge: WorkflowDependencyRef | None = None + message: str + suggested_commands: list[str] = Field(default_factory=list) + + +class WorkflowResourceLocationRef(BaseModel): + model_config = {"frozen": True} + + resource: WorkflowResourceRef + producer_task_slug: str | None = None + local_file_path: str + default_sandbox_path: str + + +class WorkflowTaskWorkspaceRef(BaseModel): + model_config = {"frozen": True} + + task: WorkflowTaskRef + latest_execution: WorkflowExecutionRef | None = None + own_resources: list[WorkflowResourceRef] = Field(default_factory=list) + input_resources: list[WorkflowResourceRef] = Field(default_factory=list) diff --git a/ergon_core/ergon_core/core/runtime/services/workflow_initialization_service.py b/ergon_core/ergon_core/core/runtime/services/workflow_initialization_service.py index 28f09284..13f3247b 100644 --- a/ergon_core/ergon_core/core/runtime/services/workflow_initialization_service.py +++ 
+++ b/ergon_core/ergon_core/core/runtime/services/workflow_initialization_service.py
@@ -38,15 +38,6 @@ async def initialize(self, command: InitializeWorkflowCommand) -> InitializedWor
         )
         all_tasks = list(session.exec(tasks_stmt).all())
 
-        task_descriptors = [
-            TaskDescriptor(
-                task_id=t.id,
-                task_slug=t.task_slug,
-                parent_task_id=t.parent_task_id,
-            )
-            for t in all_tasks
-        ]
-
         graph_repo = WorkflowGraphRepository()
         graph_repo.initialize_from_definition(
             session,
@@ -60,6 +51,15 @@ async def initialize(self, command: InitializeWorkflowCommand) -> InitializedWor
         session.commit()
 
         graph_lookup = GraphNodeLookup(session, command.run_id)
+        task_descriptors = [
+            TaskDescriptor(
+                task_id=t.id,
+                task_slug=t.task_slug,
+                parent_task_id=t.parent_task_id,
+                node_id=graph_lookup.node_id(t.id),
+            )
+            for t in all_tasks
+        ]
 
         run_record = require_not_none(
             session.get(RunRecord, command.run_id),
diff --git a/ergon_core/ergon_core/core/runtime/services/workflow_service.py b/ergon_core/ergon_core/core/runtime/services/workflow_service.py
index a9aaff6b..790fb708 100644
--- a/ergon_core/ergon_core/core/runtime/services/workflow_service.py
+++ b/ergon_core/ergon_core/core/runtime/services/workflow_service.py
@@ -1,23 +1,33 @@
-from collections.abc import Callable
+from collections.abc import Awaitable, Callable
 from pathlib import PurePosixPath
 from typing import Literal
-from uuid import UUID
+from uuid import UUID, uuid4
 
+import inngest
 from ergon_core.core.persistence.graph.models import RunGraphEdge, RunGraphNode
 from ergon_core.core.persistence.shared.enums import TaskExecutionStatus
 from ergon_core.core.persistence.telemetry.models import (
+    RunRecord,
     RunResource,
     RunResourceKind,
     RunTaskExecution,
 )
 from ergon_core.core.providers.sandbox.manager import BaseSandboxManager, DefaultSandboxManager
+from ergon_core.core.runtime.events.task_events import TaskReadyEvent
+from ergon_core.core.runtime.inngest_client import inngest_client
+from ergon_core.core.runtime.services.graph_dto import GraphEdgeDto, GraphNodeDto, MutationMeta
+from ergon_core.core.runtime.services.graph_repository import WorkflowGraphRepository
 from ergon_core.core.runtime.services.workflow_dto import (
     WorkflowBlockerRef,
     WorkflowDependencyRef,
+    WorkflowExecutionRef,
     WorkflowMaterializedResourceRef,
+    WorkflowMutationRef,
     WorkflowNextActionRef,
+    WorkflowResourceLocationRef,
     WorkflowResourceRef,
     WorkflowTaskRef,
+    WorkflowTaskWorkspaceRef,
 )
 from sqlmodel import Session, col, select
 
@@ -37,8 +47,12 @@ def __init__(
         self,
         *,
         sandbox_manager_factory: Callable[[str], BaseSandboxManager] | None = None,
+        graph_repository: WorkflowGraphRepository | None = None,
+        task_ready_dispatcher: Callable[[UUID, UUID, UUID], Awaitable[None]] | None = None,
     ) -> None:
         self._sandbox_manager_factory = sandbox_manager_factory or self._sandbox_manager_for
+        self._graph_repo = graph_repository or WorkflowGraphRepository()
+        self._task_ready_dispatcher = task_ready_dispatcher or self._dispatch_task_ready
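+        # Both collaborators default to the real implementations; tests can
+        # inject fakes to exercise graph mutations without a live Inngest broker.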
 
     def list_tasks(
         self,
@@ -156,6 +170,60 @@ def read_resource_bytes(
         with open(resource.file_path, "rb") as handle:
             return handle.read(max_bytes)
 
+    def get_resource_location(
+        self,
+        session: Session,
+        *,
+        run_id: UUID,
+        resource_id: UUID,
+    ) -> WorkflowResourceLocationRef:
+        resource = self._resource_in_run(session, run_id=run_id, resource_id=resource_id)
+        producer = self._producer_node_for_resource(session, resource)
+        copied_name = self._copy_name(resource.name)
+        default_path = self._sandbox_destination(
+            destination=None,
+            producer_slug=producer.task_slug if producer is not None else "unknown",
+            copied_name=copied_name,
+        )
+        return WorkflowResourceLocationRef(
+            resource=self._resource_ref(session, resource),
+            producer_task_slug=producer.task_slug if producer is not None else None,
+            local_file_path=resource.file_path,
+            default_sandbox_path=default_path,
+        )
+
+    def get_task_workspace(
+        self,
+        session: Session,
+        *,
+        run_id: UUID,
+        node_id: UUID,
+    ) -> WorkflowTaskWorkspaceRef:
+        node = self._resolve_node(session, run_id=run_id, node_id=node_id, task_slug=None)
+        latest = self.get_latest_execution(session, node_id=node_id)
+        own_resources: list[WorkflowResourceRef] = []
+        if latest is not None:
+            own_rows = list(
+                session.exec(
+                    select(RunResource)
+                    .where(RunResource.run_id == run_id)
+                    .where(RunResource.task_execution_id == latest.id),
+                ).all(),
+            )
+            own_rows.sort(key=lambda resource: (resource.created_at, resource.id), reverse=True)
+            own_resources = [self._resource_ref(session, resource) for resource in own_rows]
+        return WorkflowTaskWorkspaceRef(
+            task=self._task_ref(node),
+            latest_execution=self._execution_ref(latest) if latest is not None else None,
+            own_resources=own_resources,
+            input_resources=self.list_resources(
+                session,
+                run_id=run_id,
+                node_id=node_id,
+                scope="input",
+            ),
+        )
+
     def get_task_blockers(
         self,
         session: Session,
@@ -203,6 +271,221 @@ def get_next_actions(
             )
         ]
 
+    async def add_task(
+        self,
+        session: Session,
+        *,
+        run_id: UUID,
+        parent_node_id: UUID,
+        task_slug: str,
+        description: str,
+        assigned_worker_slug: str,
+        dry_run: bool,
+    ) -> WorkflowMutationRef:
+        from ergon_builtins.registry import (  # slopcop: ignore[guarded-function-import] -- reason: workflow mutation validates plugin worker slugs only when CLI tools run
+            WORKERS,
+        )
+
+        if assigned_worker_slug not in WORKERS:
+            raise ValueError(f"Unknown worker slug: {assigned_worker_slug!r}")
+        parent = self._resolve_node(
+            session,
+            run_id=run_id,
+            node_id=parent_node_id,
+            task_slug=None,
+        )
+        node_ref = WorkflowTaskRef(
+            node_id=uuid4(),
+            task_slug=task_slug,
+            status=TaskExecutionStatus.PENDING.value,
+            level=parent.level + 1,
+            parent_node_id=parent.id,
+            assigned_worker_slug=assigned_worker_slug,
+            description=description,
+        )
+        if dry_run:
+            return WorkflowMutationRef(
+                action="add-task",
+                dry_run=True,
+                node=node_ref,
+                message=f"Would add task {task_slug}",
+            )
+
+        created = await self._graph_repo.add_node(
+            session,
+            run_id,
+            task_slug=task_slug,
+            instance_key=parent.instance_key,
+            description=description,
+            status=TaskExecutionStatus.PENDING.value,
+            assigned_worker_slug=assigned_worker_slug,
+            parent_node_id=parent.id,
+            level=parent.level + 1,
+            meta=self._meta("add-task"),
+        )
+        session.commit()
+        definition_id = self._resolve_definition_id(session, run_id)
+        await self._task_ready_dispatcher(run_id, definition_id, created.id)
+        return WorkflowMutationRef(
+            action="add-task",
+            dry_run=False,
+            node=self._task_ref_from_graph(created),
+            message=f"Added task {task_slug}",
+        )
+
+    async def add_edge(
+        self,
+        session: Session,
+        *,
+        run_id: UUID,
+        source_task_slug: str,
+        target_task_slug: str,
+        dry_run: bool,
+    ) -> WorkflowMutationRef:
+        source = self._resolve_node(
+            session,
+            run_id=run_id,
+            node_id=None,
+            task_slug=source_task_slug,
+        )
+        target = self._resolve_node(
+            session,
+            run_id=run_id,
+            node_id=None,
+            task_slug=target_task_slug,
+        )
+        edge_ref = WorkflowDependencyRef(
+            edge_id=uuid4(),
+            edge_status="pending",
+            source=self._task_ref(source),
+            target=self._task_ref(target),
+        )
+        if dry_run:
+            return WorkflowMutationRef(
+                action="add-edge",
+                dry_run=True,
+                edge=edge_ref,
+                message=f"Would add dependency {source_task_slug} -> {target_task_slug}",
+            )
+
+        created = await self._graph_repo.add_edge(
+            session,
+            run_id,
+            source_node_id=source.id,
+            target_node_id=target.id,
+            status="pending",
+            meta=self._meta("add-edge"),
+        )
+        session.commit()
+        return WorkflowMutationRef(
+            action="add-edge",
+            dry_run=False,
+            edge=self._dependency_ref_from_graph(session, run_id, created),
+            message=f"Added dependency {source_task_slug} -> {target_task_slug}",
+        )
+
+    async def update_task_description(
+        self,
+        session: Session,
+        *,
+        run_id: UUID,
+        task_slug: str,
+        description: str,
+        dry_run: bool,
+    ) -> WorkflowMutationRef:
+        node = self._resolve_node(session, run_id=run_id, node_id=None, task_slug=task_slug)
+        if dry_run:
+            return WorkflowMutationRef(
+                action="update-task-description",
+                dry_run=True,
+                node=self._task_ref(node).model_copy(update={"description": description}),
+                message=f"Would update description for {task_slug}",
+            )
+
+        updated = await self._graph_repo.update_node_field(
+            session,
+            run_id=run_id,
+            node_id=node.id,
+            field="description",
+            value=description,
+            meta=self._meta("update-task-description"),
+        )
+        session.commit()
+        return WorkflowMutationRef(
+            action="update-task-description",
+            dry_run=False,
+            node=self._task_ref_from_graph(updated),
+            message=f"Updated description for {task_slug}",
+        )
+
+    async def restart_task(
+        self,
+        session: Session,
+        *,
+        run_id: UUID,
+        task_slug: str,
+        dry_run: bool,
+    ) -> WorkflowMutationRef:
+        return await self._set_task_status(
+            session,
+            run_id=run_id,
+            task_slug=task_slug,
+            action="restart-task",
+            status=TaskExecutionStatus.PENDING.value,
+            dry_run=dry_run,
+        )
+
+    async def abandon_task(
+        self,
+        session: Session,
+        *,
+        run_id: UUID,
+        task_slug: str,
+        dry_run: bool,
+    ) -> WorkflowMutationRef:
+        return await self._set_task_status(
+            session,
+            run_id=run_id,
+            task_slug=task_slug,
+            action="abandon-task",
+            status=TaskExecutionStatus.CANCELLED.value,
+            dry_run=dry_run,
+        )
+
+    async def _set_task_status(
+        self,
+        session: Session,
+        *,
+        run_id: UUID,
+        task_slug: str,
+        action: str,
+        status: str,
+        dry_run: bool,
+    ) -> WorkflowMutationRef:
+        node = self._resolve_node(session, run_id=run_id, node_id=None, task_slug=task_slug)
+        if dry_run:
+            return WorkflowMutationRef(
+                action=action,
+                dry_run=True,
+                node=self._task_ref(node).model_copy(update={"status": status}),
+                message=f"Would set {task_slug} to {status}",
+            )
+        await self._graph_repo.update_node_status(
+            session,
+            run_id=run_id,
+            node_id=node.id,
+            new_status=status,
+            meta=self._meta(action),
+        )
+        session.commit()
+        refreshed = self._resolve_node(session, run_id=run_id, node_id=None, task_slug=task_slug)
+        return WorkflowMutationRef(
+            action=action,
+            dry_run=False,
+            node=self._task_ref(refreshed),
+            message=f"Set {task_slug} to {status}",
+        )
+
     async def materialize_resource(  # slopcop: ignore[max-function-params] -- mirrors CLI scope fields
         self,
         session: Session,
@@ -278,6 +561,57 @@ def _task_ref(node: RunGraphNode) -> WorkflowTaskRef:
             level=node.level,
             parent_node_id=node.parent_node_id,
             assigned_worker_slug=node.assigned_worker_slug,
+            description=node.description,
+        )
+
+    @staticmethod
+    def _task_ref_from_graph(node: GraphNodeDto) -> WorkflowTaskRef:
+        return WorkflowTaskRef(
+            node_id=node.id,
+            task_slug=node.task_slug,
+            status=node.status,
+            level=node.level,
+            parent_node_id=node.parent_node_id,
+            assigned_worker_slug=node.assigned_worker_slug,
+            description=node.description,
+        )
+
+    def _dependency_ref_from_graph(
+        self,
+        session: Session,
+        run_id: UUID,
+        edge: GraphEdgeDto,
+    ) -> WorkflowDependencyRef:
+        nodes = self._nodes_by_id(session, run_id)
+        return WorkflowDependencyRef(
+            edge_id=edge.id,
+            edge_status=edge.status,
+            source=self._task_ref(nodes[edge.source_node_id]),
+            target=self._task_ref(nodes[edge.target_node_id]),
+        )
+
+    @staticmethod
+    def _meta(action: str) -> MutationMeta:
+        return MutationMeta(actor="workflow-cli", reason=action)
+
+    def _resolve_definition_id(self, session: Session, run_id: UUID) -> UUID:
+        run = session.get(RunRecord, run_id)
+        if run is None:
+            raise ValueError(f"run {run_id} not found")
+        return run.workflow_definition_id
+
+    async def _dispatch_task_ready(self, run_id: UUID, definition_id: UUID, node_id: UUID) -> None:
+        event = TaskReadyEvent(
+            run_id=run_id,
+            definition_id=definition_id,
+            task_id=None,
+            node_id=node_id,
+        )
+        await inngest_client.send(
+            inngest.Event(
+                name=TaskReadyEvent.name,
+                data=event.model_dump(mode="json"),
+            )
         )
 
     def _resource_ref(self, session: Session, resource: RunResource) -> WorkflowResourceRef:
@@ -298,6 +632,15 @@ def _resource_ref(self, session: Session, resource: RunResource) -> WorkflowResourceRef:
             created_at=resource.created_at,
         )
 
+    @staticmethod
+    def _execution_ref(execution: RunTaskExecution) -> WorkflowExecutionRef:
+        return WorkflowExecutionRef(
+            execution_id=execution.id,
+            status=execution.status,
+            attempt_number=execution.attempt_number,
+            final_assistant_message=execution.final_assistant_message,
+        )
+
     @staticmethod
     def _resource_in_run(session: Session, *, run_id: UUID, resource_id: UUID) -> RunResource:
         resource = session.get(RunResource, resource_id)
diff --git a/ergon_core/migrations/versions/b1c2d3e4f5a6_add_experiment_records.py b/ergon_core/migrations/versions/b1c2d3e4f5a6_add_experiment_records.py
new file mode 100644
index 00000000..43f4e6f4
--- /dev/null
+++ b/ergon_core/migrations/versions/b1c2d3e4f5a6_add_experiment_records.py
@@ -0,0 +1,256 @@
+"""add experiment records
+
+Revision ID: b1c2d3e4f5a6
+Revises: 0a1b2c3d4e5f
+Create Date: 2026-04-27 11:35:00.000000
+"""
+
+import json
+from typing import Sequence, Union
+from uuid import uuid4
+
+import sqlalchemy as sa
+import sqlmodel
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "b1c2d3e4f5a6"
+down_revision: Union[str, None] = "0a1b2c3d4e5f"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    op.create_table(
+        "experiments",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("cohort_id", sa.Uuid(), nullable=True),
+        sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
+        sa.Column("benchmark_type", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
+        sa.Column("sample_count", sa.Integer(), nullable=False),
+        sa.Column("sample_selection_json", sa.JSON(), nullable=False),
+        sa.Column("default_worker_team_json", sa.JSON(), nullable=False),
+        sa.Column("default_evaluator_slug", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
+        sa.Column("default_model_target", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
+        sa.Column("design_json", sa.JSON(), nullable=False),
+        sa.Column("seed", sa.Integer(), nullable=True),
+        sa.Column("metadata_json", sa.JSON(), nullable=False),
+        sa.Column("status", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("started_at", sa.DateTime(timezone=True), nullable=True),
+        sa.Column("completed_at", sa.DateTime(timezone=True), nullable=True),
+        sa.ForeignKeyConstraint(["cohort_id"], ["experiment_cohorts.id"]),
+        sa.PrimaryKeyConstraint("id"),
+    )
+    op.create_index(op.f("ix_experiments_benchmark_type"), "experiments", ["benchmark_type"])
+    op.create_index(op.f("ix_experiments_cohort_id"), "experiments", ["cohort_id"])
+    op.create_index(op.f("ix_experiments_name"), "experiments", ["name"])
+    op.create_index(op.f("ix_experiments_status"), "experiments", ["status"])
+
+    op.add_column("runs", sa.Column("experiment_id", sa.Uuid(), nullable=True))
+    op.add_column("runs", sa.Column("workflow_definition_id", sa.Uuid(), nullable=True))
+    op.add_column(
+        "runs", sa.Column("benchmark_type", sqlmodel.sql.sqltypes.AutoString(), nullable=True)
+    )
+    op.add_column(
+        "runs", sa.Column("instance_key", sqlmodel.sql.sqltypes.AutoString(), nullable=True)
+    )
+    op.add_column("runs", sa.Column("sample_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True))
+    op.add_column(
+        "runs", sa.Column("worker_team_json", sa.JSON(), nullable=False, server_default="{}")
+    )
+    op.add_column(
+        "runs", sa.Column("evaluator_slug", sqlmodel.sql.sqltypes.AutoString(), nullable=True)
+    )
+    op.add_column(
+        "runs", sa.Column("model_target", sqlmodel.sql.sqltypes.AutoString(), nullable=True)
+    )
+    op.add_column(
+        "runs", sa.Column("assignment_json", sa.JSON(), nullable=False, server_default="{}")
+    )
+    op.add_column("runs", sa.Column("seed", sa.Integer(), nullable=True))
+    op.create_foreign_key("fk_runs_experiment_id", "runs", "experiments", ["experiment_id"], ["id"])
+    op.create_foreign_key(
+        "fk_runs_workflow_definition_id",
+        "runs",
+        "experiment_definitions",
+        ["workflow_definition_id"],
+        ["id"],
+    )
+
+    _migrate_existing_runs()
+
+    op.alter_column("runs", "experiment_id", nullable=False)
+    op.alter_column("runs", "workflow_definition_id", nullable=False)
+    op.alter_column("runs", "benchmark_type", nullable=False)
+    op.alter_column("runs", "instance_key", nullable=False)
+    op.create_index(op.f("ix_runs_experiment_id"), "runs", ["experiment_id"])
+    op.create_index(op.f("ix_runs_workflow_definition_id"), "runs", ["workflow_definition_id"])
+    op.create_index(op.f("ix_runs_benchmark_type"), "runs", ["benchmark_type"])
op.create_index(op.f("ix_runs_instance_key"), "runs", ["instance_key"]) + op.create_index(op.f("ix_runs_sample_id"), "runs", ["sample_id"]) + op.create_index(op.f("ix_runs_evaluator_slug"), "runs", ["evaluator_slug"]) + + op.drop_index(op.f("ix_runs_cohort_id"), table_name="runs") + op.drop_index(op.f("ix_runs_experiment_definition_id"), table_name="runs") + op.drop_column("runs", "cohort_id") + op.drop_column("runs", "experiment_definition_id") + + +def downgrade() -> None: + op.add_column("runs", sa.Column("experiment_definition_id", sa.Uuid(), nullable=True)) + op.add_column("runs", sa.Column("cohort_id", sa.Uuid(), nullable=True)) + op.create_foreign_key( + "fk_runs_experiment_definition_id", + "runs", + "experiment_definitions", + ["experiment_definition_id"], + ["id"], + ) + op.create_foreign_key("fk_runs_cohort_id", "runs", "experiment_cohorts", ["cohort_id"], ["id"]) + + connection = op.get_bind() + connection.execute( + sa.text( + """ + UPDATE runs + SET + experiment_definition_id = workflow_definition_id, + cohort_id = experiments.cohort_id + FROM experiments + WHERE runs.experiment_id = experiments.id + """ + ) + ) + + op.alter_column("runs", "experiment_definition_id", nullable=False) + op.create_index(op.f("ix_runs_experiment_definition_id"), "runs", ["experiment_definition_id"]) + op.create_index(op.f("ix_runs_cohort_id"), "runs", ["cohort_id"]) + + op.drop_index(op.f("ix_runs_evaluator_slug"), table_name="runs") + op.drop_index(op.f("ix_runs_sample_id"), table_name="runs") + op.drop_index(op.f("ix_runs_instance_key"), table_name="runs") + op.drop_index(op.f("ix_runs_benchmark_type"), table_name="runs") + op.drop_index(op.f("ix_runs_workflow_definition_id"), table_name="runs") + op.drop_index(op.f("ix_runs_experiment_id"), table_name="runs") + op.drop_constraint("fk_runs_workflow_definition_id", "runs", type_="foreignkey") + op.drop_constraint("fk_runs_experiment_id", "runs", type_="foreignkey") + op.drop_column("runs", "seed") + op.drop_column("runs", "assignment_json") + op.drop_column("runs", "model_target") + op.drop_column("runs", "evaluator_slug") + op.drop_column("runs", "worker_team_json") + op.drop_column("runs", "sample_id") + op.drop_column("runs", "instance_key") + op.drop_column("runs", "benchmark_type") + op.drop_column("runs", "workflow_definition_id") + op.drop_column("runs", "experiment_id") + + op.drop_index(op.f("ix_experiments_status"), table_name="experiments") + op.drop_index(op.f("ix_experiments_name"), table_name="experiments") + op.drop_index(op.f("ix_experiments_cohort_id"), table_name="experiments") + op.drop_index(op.f("ix_experiments_benchmark_type"), table_name="experiments") + op.drop_table("experiments") + + +def _migrate_existing_runs() -> None: + connection = op.get_bind() + rows = connection.execute( + sa.text( + """ + SELECT + runs.id AS run_id, + runs.experiment_definition_id AS definition_id, + runs.cohort_id AS cohort_id, + runs.status AS run_status, + runs.created_at AS created_at, + runs.started_at AS started_at, + runs.completed_at AS completed_at, + experiment_definitions.benchmark_type AS benchmark_type + FROM runs + JOIN experiment_definitions + ON experiment_definitions.id = runs.experiment_definition_id + """ + ) + ).mappings() + + for row in rows: + experiment_id = uuid4() + instance_key = ( + _first_instance_key(connection, row["definition_id"]) or f"migrated-{row['run_id']}" + ) + metadata = { + "migrated_from_legacy_run": True, + "source_run_id": str(row["run_id"]), + "source_experiment_definition_id": 
str(row["definition_id"]), + } + connection.execute( + sa.text( + """ + INSERT INTO experiments ( + id, cohort_id, name, benchmark_type, sample_count, + sample_selection_json, default_worker_team_json, + design_json, metadata_json, status, created_at, started_at, completed_at + ) + VALUES ( + :id, :cohort_id, :name, :benchmark_type, :sample_count, + CAST(:sample_selection_json AS JSON), CAST(:default_worker_team_json AS JSON), + CAST(:design_json AS JSON), CAST(:metadata_json AS JSON), + :status, :created_at, :started_at, :completed_at + ) + """ + ), + { + "id": experiment_id, + "cohort_id": row["cohort_id"], + "name": f"Migrated experiment for run {row['run_id']}", + "benchmark_type": row["benchmark_type"], + "sample_count": 1, + "sample_selection_json": json.dumps({"instance_keys": [instance_key]}), + "default_worker_team_json": json.dumps({}), + "design_json": json.dumps({}), + "metadata_json": json.dumps(metadata), + "status": row["run_status"], + "created_at": row["created_at"], + "started_at": row["started_at"], + "completed_at": row["completed_at"], + }, + ) + connection.execute( + sa.text( + """ + UPDATE runs + SET + experiment_id = :experiment_id, + workflow_definition_id = :workflow_definition_id, + benchmark_type = :benchmark_type, + instance_key = :instance_key + WHERE id = :run_id + """ + ), + { + "experiment_id": experiment_id, + "workflow_definition_id": row["definition_id"], + "benchmark_type": row["benchmark_type"], + "instance_key": instance_key, + "run_id": row["run_id"], + }, + ) + + +def _first_instance_key(connection, definition_id) -> str | None: + row = connection.execute( + sa.text( + """ + SELECT instance_key + FROM experiment_definition_instances + WHERE experiment_definition_id = :definition_id + ORDER BY created_at ASC + LIMIT 1 + """ + ), + {"definition_id": definition_id}, + ).first() + if row is None: + return None + return row[0] diff --git a/ergon_core/pyproject.toml b/ergon_core/pyproject.toml index 19b4002a..107b52fd 100644 --- a/ergon_core/pyproject.toml +++ b/ergon_core/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "uvicorn>=0.24.0", "e2b-code-interpreter", "openai", - "pydantic-ai", + "pydantic-ai>=0.8.1", "litellm", "opentelemetry-api", "opentelemetry-sdk", diff --git a/ergon_infra/ergon_infra/templates/eval-only.yaml b/ergon_infra/ergon_infra/templates/eval-only.yaml index d85a67ee..0c9a1374 100644 --- a/ergon_infra/ergon_infra/templates/eval-only.yaml +++ b/ergon_infra/ergon_infra/templates/eval-only.yaml @@ -39,7 +39,9 @@ run: | sleep 1 done - ergon benchmark run "$BENCHMARK" \ + EXPERIMENT_ID="$(ergon experiment define "$BENCHMARK" \ --model "vllm:http://localhost:8000" \ + --worker "training-stub" \ --evaluator "$EVALUATOR" \ - --limit "$EVAL_LIMIT" + --limit "$EVAL_LIMIT" 2>&1 | sed -n 's/^EXPERIMENT_ID=//p')" + ergon experiment run "$EXPERIMENT_ID" diff --git a/ergon_infra/ergon_infra/training/trl_runner.py b/ergon_infra/ergon_infra/training/trl_runner.py index 248f36f6..13b9c512 100644 --- a/ergon_infra/ergon_infra/training/trl_runner.py +++ b/ergon_infra/ergon_infra/training/trl_runner.py @@ -49,7 +49,8 @@ def run_trl_training(config: TrainingConfig) -> int: definition_id = config.definition_id if not definition_id: raise ValueError( - "--definition-id is required. Create one via: ergon benchmark run --limit 1" + "--definition-id is required. 
Create one via: " + "ergon experiment define --limit 1 --worker --model " ) tokenizer = AutoTokenizer.from_pretrained(config.model) diff --git a/tests/e2e/_asserts.py b/tests/e2e/_asserts.py index 09409faf..447ca45a 100644 --- a/tests/e2e/_asserts.py +++ b/tests/e2e/_asserts.py @@ -356,10 +356,7 @@ def _assert_cohort_membership(cohort_key: str, run_ids: list[UUID]) -> None: rows = r.json() returned = {UUID(row["run_id"]) for row in rows} expected = set(run_ids) - assert returned == expected, ( - f"cohort membership mismatch: only returned={returned - expected}, " - f"only expected={expected - returned}" - ) + assert expected <= returned, f"cohort missing expected run ids: {expected - returned}" # ============================================================================= diff --git a/tests/e2e/_submit.py b/tests/e2e/_submit.py index 1ffd3122..65023de9 100644 --- a/tests/e2e/_submit.py +++ b/tests/e2e/_submit.py @@ -18,6 +18,7 @@ from __future__ import annotations import os +from datetime import datetime, timezone from uuid import UUID import httpx @@ -26,6 +27,15 @@ _DEFAULT_API = "http://127.0.0.1:9000" +def smoke_cohort_key(env: str) -> str: + """Return a shared QA cohort key when provided, otherwise an env-scoped one.""" + override = os.environ.get("E2E_COHORT_KEY") + if override is not None and override: + return override + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + return f"ci-smoke-{env}-{timestamp}" + + def _api_base() -> str: return os.environ.get("ERGON_API_BASE_URL", _DEFAULT_API) @@ -67,4 +77,4 @@ async def submit_cohort( return [UUID(rid) for rid in body["run_ids"]] -__all__ = ["submit_cohort"] +__all__ = ["smoke_cohort_key", "submit_cohort"] diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 68bfd310..48b99481 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -5,6 +5,7 @@ """ import os +import re import socket import subprocess from urllib.parse import urlparse @@ -14,6 +15,11 @@ from ergon_core.core.settings import settings from sqlmodel import Session +_UUID_RE = re.compile( + r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", + re.IGNORECASE, +) + # NOTE: smoke fixture registration now lives exclusively inside the api # container via ``ERGON_STARTUP_PLUGINS``. 
 # Host-side pytest is a black-box client (``_submit.py`` → HTTP) and
@@ -91,29 +97,57 @@ def run_benchmark(
     *,
     worker: str,
     evaluator: str,
+    model: str = "stub:constant",
     limit: int = 1,
     cohort: str = "ci",
     timeout: int = 120,
 ) -> subprocess.CompletedProcess:
-    """Run a benchmark via the ergon CLI and return the process result."""
-    cmd = [
+    """Define and run an experiment via the ergon CLI."""
+    define_cmd = [
         "ergon",
-        "benchmark",
-        "run",
+        "experiment",
+        "define",
         slug,
         "--worker",
         worker,
+        "--model",
+        model,
         "--evaluator",
         evaluator,
         "--limit",
         str(limit),
         "--cohort",
         cohort,
-        "--timeout",
-        str(timeout),
     ]
     env = {**os.environ, "PYTHONUNBUFFERED": "1"}
-    return subprocess.run(cmd, capture_output=True, text=True, env=env, timeout=timeout + 30)
+    define = subprocess.run(
+        define_cmd,
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=timeout + 30,
+    )
+    if define.returncode != 0:
+        return define
+
+    experiment_id = _parse_uuid_line("EXPERIMENT_ID=", define.stdout + define.stderr)
+    return subprocess.run(
+        ["ergon", "experiment", "run", experiment_id, "--timeout", str(timeout)],
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=timeout + 30,
+    )
+
+
+def _parse_uuid_line(prefix: str, output: str) -> str:
+    for line in output.splitlines():
+        if not line.startswith(prefix):
+            continue
+        match = _UUID_RE.search(line)
+        if match is not None:
+            return match.group(0)
+    raise AssertionError(f"missing {prefix} line in CLI output:\n{output}")
 
 
 @pytest.fixture(scope="session")
diff --git a/tests/e2e/test_minif2f_smoke.py b/tests/e2e/test_minif2f_smoke.py
index ddb17578..b2b90ea1 100644
--- a/tests/e2e/test_minif2f_smoke.py
+++ b/tests/e2e/test_minif2f_smoke.py
@@ -7,7 +7,6 @@
 import os
 import pathlib
 import subprocess
-from datetime import datetime, timezone
 
 import pytest
 
@@ -23,7 +22,7 @@
     _assert_temporal_ordering,
     wait_for_terminal_status,
 )
-from tests.e2e._submit import submit_cohort
+from tests.e2e._submit import smoke_cohort_key, submit_cohort
 
 ENV = "minif2f"
 WORKER = f"{ENV}-sadpath-smoke-worker"
@@ -36,7 +35,7 @@
 @pytest.mark.e2e
 @pytest.mark.asyncio
 async def test_smoke_cohort(tmp_path: pathlib.Path) -> None:
-    cohort_key = f"ci-smoke-{ENV}-{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%S')}"
+    cohort_key = smoke_cohort_key(ENV)
 
     run_ids = await submit_cohort(
         benchmark_slug=ENV,
diff --git a/tests/e2e/test_researchrubrics_smoke.py b/tests/e2e/test_researchrubrics_smoke.py
index d032e801..a2d0d3bc 100644
--- a/tests/e2e/test_researchrubrics_smoke.py
+++ b/tests/e2e/test_researchrubrics_smoke.py
@@ -19,7 +19,6 @@
 import os
 import pathlib
 import subprocess
-from datetime import datetime, timezone
 
 import pytest
 
@@ -35,7 +34,7 @@
     _assert_temporal_ordering,
     wait_for_terminal_status,
 )
-from tests.e2e._submit import submit_cohort
+from tests.e2e._submit import smoke_cohort_key, submit_cohort
 
 ENV = "researchrubrics"
 WORKER = f"{ENV}-sadpath-smoke-worker"
@@ -49,7 +48,7 @@
 @pytest.mark.e2e
 @pytest.mark.asyncio
 async def test_smoke_cohort(tmp_path: pathlib.Path) -> None:
-    cohort_key = f"ci-smoke-{ENV}-{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%S')}"
+    cohort_key = smoke_cohort_key(ENV)
 
     run_ids = await submit_cohort(
         benchmark_slug=ENV,
diff --git a/tests/e2e/test_swebench_smoke.py b/tests/e2e/test_swebench_smoke.py
index 9889d5f9..00befe4f 100644
--- a/tests/e2e/test_swebench_smoke.py
+++ b/tests/e2e/test_swebench_smoke.py
@@ -7,7 +7,6 @@
 import os
 import pathlib
 import subprocess
-from datetime import datetime, timezone
 
 import pytest
 
@@ -23,7 +22,7 @@
     _assert_temporal_ordering,
     wait_for_terminal_status,
 )
-from tests.e2e._submit import submit_cohort
+from tests.e2e._submit import smoke_cohort_key, submit_cohort
 
 # Benchmark slug is 'swebench-verified' (matches BENCHMARKS registry);
 # worker + criterion slugs use 'swebench' (shorter). The per-env
@@ -41,7 +40,7 @@
 @pytest.mark.e2e
 @pytest.mark.asyncio
 async def test_smoke_cohort(tmp_path: pathlib.Path) -> None:
-    cohort_key = f"ci-smoke-{ENV}-{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%S')}"
+    cohort_key = smoke_cohort_key(ENV)
 
     run_ids = await submit_cohort(
         benchmark_slug=ENV,
diff --git a/tests/integration/minif2f/test_verification_integration.py b/tests/integration/minif2f/test_verification_integration.py
index 2438cb27..e0824168 100644
--- a/tests/integration/minif2f/test_verification_integration.py
+++ b/tests/integration/minif2f/test_verification_integration.py
@@ -12,6 +12,7 @@
 
 import json
 import os
+from pathlib import Path
 from uuid import uuid4
 
 import pytest
@@ -24,12 +25,24 @@
 from ergon_core.api.evaluation_context import EvaluationContext
 from ergon_core.api.results import WorkerOutput
 from ergon_core.api.task_types import BenchmarkTask, EmptyTaskPayload
+from ergon_core.core.persistence.definitions.models import ExperimentDefinition
+from ergon_core.core.persistence.graph.models import RunGraphNode
+from ergon_core.core.persistence.shared.db import get_session
+from ergon_core.core.persistence.shared.enums import RunStatus, TaskExecutionStatus
+from ergon_core.core.persistence.telemetry.models import (
+    ExperimentRecord,
+    RunRecord,
+    RunTaskExecution,
+)
 from ergon_core.core.providers.sandbox.manager import BaseSandboxManager
+from ergon_core.core.providers.sandbox.resource_publisher import SandboxResourcePublisher
 from ergon_core.core.runtime.evaluation.criterion_runtime import (
     DefaultCriterionRuntime,
 )
 from ergon_core.core.runtime.evaluation.evaluation_schemas import CriterionContext
 
+_FIXTURE_PROOF = Path(__file__).parents[2] / "fixtures" / "minif2f" / "known_good_proof.lean"
+
 
 def _require_setup() -> None:
     if not os.environ.get("E2B_API_KEY"):
@@ -85,15 +98,91 @@ async def _setup_runtime(
     return DefaultCriterionRuntime(context=ctx, sandbox_manager=sandbox_manager)
 
 
+async def _publish_fixture_proof(
+    runtime: DefaultCriterionRuntime,
+    sandbox_manager: MiniF2FSandboxManager,
+    *,
+    run_id,
+    task_execution_id,
+    blob_root: Path,
+) -> None:
+    await runtime.run_command("mkdir -p /workspace/final_output")
+    await runtime.write_file(
+        "/workspace/final_output/final_solution.lean",
+        _FIXTURE_PROOF.read_bytes(),
+    )
+    sandbox = sandbox_manager.get_sandbox(run_id)
+    assert sandbox is not None
+    publisher = SandboxResourcePublisher(
+        sandbox=sandbox,
+        run_id=run_id,
+        task_execution_id=task_execution_id,
+        blob_root=blob_root,
+    )
+    await publisher.sync()
+
+
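+# Evaluation now resolves run/execution rows from Postgres, so the fixture
+# seeds the full provenance chain (experiment -> run -> node -> execution).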
worker_team_json={"primary": "minif2f-react"}, + status=RunStatus.EXECUTING, + ) + session.add(run) + session.flush() + node = RunGraphNode( + run_id=run_id, + instance_key="default", + task_slug="mathd_algebra_176", + description=_make_task().description, + status="completed", + assigned_worker_slug="minif2f-react", + level=0, + ) + session.add(node) + session.flush() + execution = RunTaskExecution( + id=task_execution_id, + run_id=run_id, + node_id=node.id, + status=TaskExecutionStatus.COMPLETED, + final_assistant_message="fixture proof written", + ) + session.add(execution) + session.commit() + + # --------------------------------------------------------------------------- @pytest.mark.asyncio @pytest.mark.timeout(600) # template pull + first mathlib import can be slow -async def test_fixture_proof_verifies_to_score_1() -> None: +async def test_fixture_proof_verifies_to_score_1(tmp_path: Path) -> None: _require_setup() run_id = uuid4() + task_execution_id = uuid4() + _seed_run_record(run_id, task_execution_id) mgr = MiniF2FSandboxManager() runtime = await _setup_runtime(mgr, run_id) @@ -102,10 +191,17 @@ async def test_fixture_proof_verifies_to_score_1() -> None: output="", success=True, ) + await _publish_fixture_proof( + runtime, + mgr, + run_id=run_id, + task_execution_id=task_execution_id, + blob_root=tmp_path / "blob", + ) eval_ctx = EvaluationContext( run_id=run_id, task_id=uuid4(), - execution_id=uuid4(), + execution_id=task_execution_id, task=_make_task(), worker_result=worker_output, sandbox_id=mgr.get_sandbox(run_id).sandbox_id, # type: ignore[union-attr] diff --git a/tests/integration/propagation/_helpers.py b/tests/integration/propagation/_helpers.py index 9c6f50c5..722c60eb 100644 --- a/tests/integration/propagation/_helpers.py +++ b/tests/integration/propagation/_helpers.py @@ -3,14 +3,11 @@ import time from uuid import UUID -from sqlmodel import Session, select - from ergon_core.core.persistence.definitions.models import ExperimentDefinition from ergon_core.core.persistence.graph.models import RunGraphEdge, RunGraphMutation, RunGraphNode -from ergon_core.core.persistence.graph.status_conventions import TERMINAL_STATUSES -from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.shared.enums import RunStatus -from ergon_core.core.persistence.telemetry.models import RunRecord +from ergon_core.core.persistence.telemetry.models import ExperimentRecord, RunRecord +from sqlmodel import Session, select def poll_until(condition, *, timeout: float = 30, interval: float = 0.5) -> None: @@ -84,8 +81,24 @@ def make_experiment_definition(session: Session) -> ExperimentDefinition: def make_run(session: Session, definition_id: UUID) -> RunRecord: """Create a minimal RunRecord row for test scaffolding.""" + experiment = ExperimentRecord( + name="ci propagation experiment", + benchmark_type="ci-propagation-test", + sample_count=1, + sample_selection_json={"instance_keys": ["test"]}, + default_worker_team_json={"primary": "test-worker"}, + design_json={}, + metadata_json={}, + status="running", + ) + session.add(experiment) + session.flush() run = RunRecord( - experiment_definition_id=definition_id, + experiment_id=experiment.id, + workflow_definition_id=definition_id, + benchmark_type="ci-propagation-test", + instance_key="test", + worker_team_json={"primary": "test-worker"}, status=RunStatus.EXECUTING, ) session.add(run) diff --git a/tests/integration/smokes/test_smoke_harness.py b/tests/integration/smokes/test_smoke_harness.py index 
9b11d06b..aeabcbb7 100644 --- a/tests/integration/smokes/test_smoke_harness.py +++ b/tests/integration/smokes/test_smoke_harness.py @@ -15,11 +15,10 @@ import httpx import pytest -from sqlalchemy import text -from sqlalchemy.exc import OperationalError - from ergon_core.core.persistence.definitions.models import ExperimentDefinition from ergon_core.core.persistence.shared.db import get_engine, get_session +from sqlalchemy import text +from sqlalchemy.exc import OperationalError pytestmark = pytest.mark.integration @@ -114,7 +113,7 @@ def test_seed_then_read_then_reset_roundtrip() -> None: seed_resp = client.post( f"{API}/api/test/write/run/seed", json={ - "experiment_definition_id": str(defn_id), + "workflow_definition_id": str(defn_id), "cohort": _COHORT, "status": "completed", }, diff --git a/tests/integration/swebench_verified/conftest.py b/tests/integration/swebench_verified/conftest.py index 188effbc..4363b697 100644 --- a/tests/integration/swebench_verified/conftest.py +++ b/tests/integration/swebench_verified/conftest.py @@ -5,8 +5,6 @@ from uuid import UUID, uuid4 import pytest -from sqlmodel import select - from ergon_core.core.persistence.definitions.models import ( ExperimentDefinition, ExperimentDefinitionInstance, @@ -14,7 +12,12 @@ ) from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.shared.enums import RunStatus, TaskExecutionStatus -from ergon_core.core.persistence.telemetry.models import RunRecord, RunTaskExecution +from ergon_core.core.persistence.telemetry.models import ( + ExperimentRecord, + RunRecord, + RunTaskExecution, +) +from sqlmodel import select _MINIMAL_SWEBENCH_PAYLOAD: dict[str, object] = { "instance_id": "django__django-1", @@ -69,8 +72,26 @@ def swebench_execution() -> tuple[UUID, UUID]: session.flush() session.refresh(task) + experiment = ExperimentRecord( + name="swebench verified fixture", + benchmark_type="swebench-verified", + sample_count=1, + sample_selection_json={"instance_keys": ["django__django-1"]}, + default_worker_team_json={"primary": "swebench-verified"}, + design_json={}, + metadata_json={}, + status="running", + ) + session.add(experiment) + session.flush() + session.refresh(experiment) + run = RunRecord( - experiment_definition_id=defn.id, + experiment_id=experiment.id, + workflow_definition_id=defn.id, + benchmark_type="swebench-verified", + instance_key="django__django-1", + worker_team_json={"primary": "swebench-verified"}, status=RunStatus.EXECUTING, ) session.add(run) diff --git a/tests/real_llm/benchmarks/test_researchrubrics.py b/tests/real_llm/benchmarks/test_researchrubrics.py index f8a842b1..744c5c07 100644 --- a/tests/real_llm/benchmarks/test_researchrubrics.py +++ b/tests/real_llm/benchmarks/test_researchrubrics.py @@ -1,7 +1,7 @@ """Real-LLM rollout harness for the ``researchrubrics`` benchmark. This test is a **trigger**, not an assertion suite. 
It runs a real -``ergon benchmark run researchrubrics`` end-to-end against a real LLM +``ergon experiment define`` + ``ergon experiment run`` end-to-end against a real LLM (Sonnet 4.6 via OpenRouter by default) and dumps an exhaustive rollout artifact — every persistence table, dashboard screenshots, and a stitched ``report.md`` — to @@ -24,23 +24,25 @@ """ import os +import re import subprocess import time from datetime import datetime, timezone from uuid import UUID import pytest -from sqlmodel import select - -from ergon_core.core.persistence.shared.db import ensure_db, get_session +from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.telemetry.models import ( RunRecord, RunResource, RunTaskEvaluation, + RunTaskExecution, ) from ergon_core.core.providers.generation.openrouter_budget import OpenRouterBudget from ergon_core.core.settings import settings +from sqlmodel import select +from tests.real_llm.rollout import _fingerprint as fingerprint from tests.real_llm.rollout import ( capture_dashboard, dump_rollout, @@ -48,13 +50,12 @@ write_manifest, write_report, ) -from tests.real_llm.rollout import _fingerprint as fingerprint pytestmark = [pytest.mark.real_llm, pytest.mark.asyncio] -# Default to Sonnet 4.6 via OpenRouter. Override with ERGON_REAL_LLM_MODEL -# to roll out against a different model without editing the test. -_DEFAULT_MODEL = "openrouter:anthropic/claude-sonnet-4.6" +# Cloud provider prefixes resolve through OpenRouter. Override with +# ERGON_REAL_LLM_MODEL to roll out against a different model. +_DEFAULT_MODEL = "anthropic:claude-sonnet-4.6" # Wall-clock caps. Real-LLM + real-sandbox rollouts are slow; keep # these generous enough to absorb E2B startup + Exa retries but bounded @@ -75,30 +76,12 @@ def _require_keys() -> None: ) -def _latest_run_id_since(since: datetime) -> UUID: - """Return the most recent RunRecord.id created at or after ``since``.""" - ensure_db() - with get_session() as session: - stmt = ( - select(RunRecord) - .where(RunRecord.created_at >= since) - .order_by(RunRecord.created_at.desc()) - .limit(1) - ) - row = session.exec(stmt).first() - if row is None: - raise RuntimeError( - "no RunRecord created since the harness started — " - "did the CLI subprocess actually dispatch a run?" - ) - return row.id - - def _wait_for_post_terminal_artifacts(run_id: UUID) -> None: """Let async resource/evaluation rows land before dumping artifacts.""" deadline = time.monotonic() + _POST_TERMINAL_ARTIFACT_TIMEOUT_SECONDS while time.monotonic() < deadline: with get_session() as session: + run = session.get(RunRecord, run_id) resources = len( list(session.exec(select(RunResource).where(RunResource.run_id == run_id)).all()) ) @@ -109,8 +92,24 @@ def _wait_for_post_terminal_artifacts(run_id: UUID) -> None: ).all() ) ) + executions = list( + session.exec( + select(RunTaskExecution).where(RunTaskExecution.run_id == run_id) + ).all() + ) if resources > 0 and evaluations > 0: return + run_status = str(getattr(run.status, "value", run.status)).lower() if run else "" + running_executions = { + "pending", + "running", + "executing", + } + if run_status in {"failed", "cancelled"} and not any( + str(getattr(execution.status, "value", execution.status)).lower() in running_executions + for execution in executions + ): + return time.sleep(2) @@ -128,7 +127,7 @@ async def test_researchrubrics_rollout( state inside the time budget. 
""" model = os.environ.get("ERGON_REAL_LLM_MODEL", _DEFAULT_MODEL) - benchmark = "researchrubrics" + benchmark = os.environ.get("ERGON_REAL_LLM_BENCHMARK", "researchrubrics") worker = os.environ.get("ERGON_REAL_LLM_WORKER", "researchrubrics-researcher") evaluator = "research-rubric" limit = os.environ.get("ERGON_REAL_LLM_LIMIT", "1") @@ -138,13 +137,13 @@ async def test_researchrubrics_rollout( ) started_at = datetime.now(timezone.utc) - cli_proc = subprocess.run( + define_proc = subprocess.run( [ "uv", "run", "ergon", - "benchmark", - "run", + "experiment", + "define", benchmark, "--worker", worker, @@ -160,20 +159,48 @@ async def test_researchrubrics_rollout( text=True, check=False, ) + experiment_id = _parse_single_uuid("EXPERIMENT_ID", define_proc.stdout) - run_id = _latest_run_id_since(started_at) - terminal_state = harness_client.wait_for_terminal( - run_id, - timeout_s=_HARNESS_POLL_TIMEOUT_SECONDS, + run_proc = subprocess.run( + [ + "uv", + "run", + "ergon", + "experiment", + "run", + str(experiment_id), + ], + timeout=_CLI_TIMEOUT_SECONDS, + capture_output=True, + text=True, + check=False, ) - _wait_for_post_terminal_artifacts(run_id) + run_ids = _parse_uuid_lines("RUN_ID", run_proc.stdout) + if not run_ids: + raise RuntimeError( + f"experiment run produced no RUN_ID lines:\n{run_proc.stdout}\n{run_proc.stderr}" + ) + + terminal_states = [ + harness_client.wait_for_terminal( + run_id, + timeout_s=_HARNESS_POLL_TIMEOUT_SECONDS, + ) + for run_id in run_ids + ] + for run_id in run_ids: + _wait_for_post_terminal_artifacts(run_id) + run_id = run_ids[0] + terminal_state = terminal_states[0] out_dir = rollout_dir(run_id) # Persist CLI stdout/stderr up front so a crashed DB dump still # leaves breadcrumbs for the reviewing agent. - (out_dir / "cli_stdout.txt").write_text(cli_proc.stdout or "") - (out_dir / "cli_stderr.txt").write_text(cli_proc.stderr or "") + (out_dir / "cli_define_stdout.txt").write_text(define_proc.stdout or "") + (out_dir / "cli_define_stderr.txt").write_text(define_proc.stderr or "") + (out_dir / "cli_run_stdout.txt").write_text(run_proc.stdout or "") + (out_dir / "cli_run_stderr.txt").write_text(run_proc.stderr or "") table_counts = dump_rollout(run_id, out_dir) screenshots = await capture_dashboard(run_id, playwright_context, out_dir) @@ -190,7 +217,7 @@ async def test_researchrubrics_rollout( worker=worker, evaluator=evaluator, model=model, - cli_returncode=cli_proc.returncode, + cli_returncode=run_proc.returncode, terminal_state=terminal_state, started_at=started_at, finished_at=finished_at, @@ -219,3 +246,15 @@ async def test_researchrubrics_rollout( f"run {run_id} did not reach a terminal status within " f"{_HARNESS_POLL_TIMEOUT_SECONDS}s — see {out_dir}" ) + + +def _parse_single_uuid(name: str, output: str) -> UUID: + values = _parse_uuid_lines(name, output) + if len(values) != 1: + raise RuntimeError(f"expected exactly one {name}=... 
line, got {len(values)}:\n{output}") + return values[0] + + +def _parse_uuid_lines(name: str, output: str) -> list[UUID]: + pattern = re.compile(rf"^{re.escape(name)}=([0-9a-fA-F-]{{36}})$", re.MULTILINE) + return [UUID(match.group(1)) for match in pattern.finditer(output or "")] diff --git a/tests/real_llm/benchmarks/test_smoke_stub.py b/tests/real_llm/benchmarks/test_smoke_stub.py index 097fb557..2164012d 100644 --- a/tests/real_llm/benchmarks/test_smoke_stub.py +++ b/tests/real_llm/benchmarks/test_smoke_stub.py @@ -3,39 +3,34 @@ Validates: - docker stack up (or --assume-stack-up), stack fixture did not skip - - `ergon benchmark run` CLI path works + - `ergon experiment define` and `ergon experiment run` CLI paths work - /api/test/read/run/{id}/state returns a terminal state - Postgres row exists with the right relationships - Playwright can find the cohort in the dashboard """ import os +import re import subprocess -from datetime import datetime, timezone import pytest -from sqlmodel import select - -from ergon_core.core.persistence.shared.db import ensure_db, get_session -from ergon_core.core.persistence.telemetry.models import RunRecord pytestmark = [pytest.mark.real_llm, pytest.mark.asyncio] +_UUID_RE = re.compile( + r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", + re.IGNORECASE, +) + -def _latest_run_id_since(since: datetime) -> str: - """Query the most recent RunRecord created at or after `since`.""" - ensure_db() - with get_session() as session: - stmt = ( - select(RunRecord) - .where(RunRecord.created_at >= since) - .order_by(RunRecord.created_at.desc()) - .limit(1) - ) - row = session.exec(stmt).first() - if row is None: - raise RuntimeError("no RunRecord found after canary CLI invocation") - return str(row.id) +def _parse_uuid_line(prefix: str, output: str) -> str: + for line in output.splitlines(): + if not line.startswith(prefix): + continue + match = _UUID_RE.search(line) + if match is not None: + return match.group(0) + raise AssertionError(f"missing {prefix} line in CLI output:\n{output}") async def test_harness_canary_smoke_stub( @@ -43,9 +38,6 @@ async def test_harness_canary_smoke_stub( harness_client, playwright_context, ) -> None: - # Timestamp the boundary so we can filter for a run created *after* this point. 
- before = datetime.now(timezone.utc) - env = { **os.environ, "ENABLE_TEST_HARNESS": "1", @@ -55,18 +47,18 @@ async def test_harness_canary_smoke_stub( "postgresql://ergon:ergon_dev@127.0.0.1:5433/ergon", ), } - result = subprocess.run( + define = subprocess.run( [ "uv", "run", "ergon", - "benchmark", - "run", + "experiment", + "define", "researchrubrics", - "--model", - "stub:constant", "--worker", "researchrubrics-smoke-worker", + "--model", + "stub:constant", "--evaluator", "researchrubrics-smoke-criterion", "--limit", @@ -77,11 +69,22 @@ async def test_harness_canary_smoke_stub( timeout=180, env=env, ) - assert result.returncode == 0, ( - f"CLI failed (rc={result.returncode}):\nstdout: {result.stdout}\nstderr: {result.stderr}" + assert define.returncode == 0, ( + f"CLI failed (rc={define.returncode}):\nstdout: {define.stdout}\nstderr: {define.stderr}" ) + experiment_id = _parse_uuid_line("EXPERIMENT_ID=", define.stdout + define.stderr) - run_id = _latest_run_id_since(before) + run = subprocess.run( + ["uv", "run", "ergon", "experiment", "run", experiment_id], + capture_output=True, + text=True, + timeout=180, + env=env, + ) + assert run.returncode == 0, ( + f"CLI failed (rc={run.returncode}):\nstdout: {run.stdout}\nstderr: {run.stderr}" + ) + run_id = _parse_uuid_line("RUN_ID=", run.stdout + run.stderr) # Poll the harness until terminal. state = harness_client.wait_for_terminal(run_id, timeout_s=120) diff --git a/tests/real_llm/rollout.py b/tests/real_llm/rollout.py index 14b07dc7..90509b32 100644 --- a/tests/real_llm/rollout.py +++ b/tests/real_llm/rollout.py @@ -1,7 +1,7 @@ """Rollout artifact dump for the real-LLM tier. A rollout harness (not a TDD tier): its job is to trigger a real-LLM -benchmark run and then dump an exhaustive snapshot of what happened into +experiment run and then dump an exhaustive snapshot of what happened into a per-run directory so a future agent session (or a human) can read the artifacts and reason about whether the agent succeeded, and what to tweak in either the model or the simulator to iterate. 
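
For orientation between these harness changes: the tests above stop querying for the most recent `RunRecord` and instead parse machine-readable ID lines out of the CLI's combined stdout/stderr. Below is a minimal, self-contained sketch of that handoff, under the assumptions the tests encode — `ergon experiment define` emits one `EXPERIMENT_ID=<uuid>` line and `ergon experiment run` emits one `RUN_ID=<uuid>` line per launched run. The flag set and the helper names (`_ids`, `define_and_run`) are illustrative, not part of the CLI.

```python
# Sketch only: mirrors the stdout-parsing pattern in the real-LLM harnesses,
# assuming the EXPERIMENT_ID=/RUN_ID= output contract shown in the diff above.
import re
import subprocess
from uuid import UUID

_ID_LINE = re.compile(
    r"^(?P<name>[A-Z_]+)=(?P<uuid>[0-9a-fA-F-]{36})$",
    re.MULTILINE,
)


def _ids(name: str, *streams: str) -> list[UUID]:
    """Collect every ``<name>=<uuid>`` line from the given CLI output streams."""
    combined = "\n".join(stream or "" for stream in streams)
    return [
        UUID(match.group("uuid"))
        for match in _ID_LINE.finditer(combined)
        if match.group("name") == name
    ]


def define_and_run(benchmark: str, worker: str, model: str) -> list[UUID]:
    """Define an experiment, launch it, and return the run IDs it reports."""
    define = subprocess.run(
        ["uv", "run", "ergon", "experiment", "define", benchmark,
         "--worker", worker, "--model", model, "--limit", "1"],
        capture_output=True, text=True, check=True,
    )
    # Exactly one experiment is expected; unpacking fails loudly otherwise.
    (experiment_id,) = _ids("EXPERIMENT_ID", define.stdout, define.stderr)

    run = subprocess.run(
        ["uv", "run", "ergon", "experiment", "run", str(experiment_id)],
        capture_output=True, text=True, check=True,
    )
    # One RUN_ID per selected sample; callers poll each to a terminal state.
    return _ids("RUN_ID", run.stdout, run.stderr)
```

Compared with the removed "latest `RunRecord` since a timestamp" lookup, this handoff cannot race a concurrently created run, which is why both the canary and the rollout harness adopt it.
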
diff --git a/tests/unit/cli/test_experiment_cli.py b/tests/unit/cli/test_experiment_cli.py new file mode 100644 index 00000000..585c486e --- /dev/null +++ b/tests/unit/cli/test_experiment_cli.py @@ -0,0 +1,173 @@ +import logging +from argparse import Namespace +from uuid import uuid4 + +import pytest +from ergon_cli.commands import experiment as experiment_cmd +from ergon_cli.main import build_parser +from ergon_core.core.runtime.services.experiment_read_service import ( + ExperimentDetailDto, + ExperimentRunRowDto, + ExperimentSummaryDto, +) +from ergon_core.core.runtime.services.experiment_schemas import ( + ExperimentDefineResult, + ExperimentRunResult, +) + + +def _summary(**overrides) -> ExperimentSummaryDto: + data = { + "experiment_id": uuid4(), + "cohort_id": None, + "name": "ci experiment", + "benchmark_type": "ci-benchmark", + "sample_count": 2, + "status": "defined", + "created_at": "2026-04-27T12:00:00Z", + "run_count": 0, + } + data.update(overrides) + return ExperimentSummaryDto.model_validate(data) + + +def test_experiment_subcommands_are_registered_in_main_parser() -> None: + parser = build_parser() + + define_args = parser.parse_args( + [ + "experiment", + "define", + "ci-benchmark", + "--limit", + "1", + "--worker", + "test-worker", + "--model", + "stub:constant", + ] + ) + run_args = parser.parse_args(["experiment", "run", str(uuid4())]) + show_args = parser.parse_args(["experiment", "show", str(uuid4())]) + list_args = parser.parse_args(["experiment", "list", "--limit", "3"]) + + assert define_args.experiment_action == "define" + assert run_args.experiment_action == "run" + assert show_args.experiment_action == "show" + assert list_args.experiment_action == "list" + assert list_args.limit == 3 + + +def test_benchmark_run_is_not_registered_as_launch_command() -> None: + parser = build_parser() + + with pytest.raises(SystemExit): + parser.parse_args(["benchmark", "run", "ci-benchmark"]) + + +def test_experiment_list_logs_rows_without_printing(monkeypatch, caplog, capsys): + class FakeReadService: + def list_experiments(self, *, limit: int): + assert limit == 3 + return [_summary(name="alpha"), _summary(name="beta", status="running", run_count=2)] + + monkeypatch.setattr(experiment_cmd, "ExperimentReadService", FakeReadService) + caplog.set_level(logging.INFO, logger=experiment_cmd.__name__) + + rc = experiment_cmd.handle_experiment_list(Namespace(limit=3)) + + assert rc == 0 + assert capsys.readouterr().out == "" + assert "alpha" in caplog.text + assert "beta" in caplog.text + assert "running" in caplog.text + + +def test_experiment_show_logs_detail_without_printing(monkeypatch, caplog, capsys): + run_id = uuid4() + + class FakeReadService: + def get_experiment(self, experiment_id): + return ExperimentDetailDto( + experiment=_summary(experiment_id=experiment_id), + runs=[ + ExperimentRunRowDto( + run_id=run_id, + workflow_definition_id=uuid4(), + benchmark_type="ci-benchmark", + instance_key="sample-a", + status="completed", + created_at="2026-04-27T12:00:00Z", + ) + ], + sample_selection={"instance_keys": ["sample-a"]}, + ) + + experiment_id = uuid4() + monkeypatch.setattr(experiment_cmd, "ExperimentReadService", FakeReadService) + caplog.set_level(logging.INFO, logger=experiment_cmd.__name__) + + rc = experiment_cmd.handle_experiment_show(Namespace(experiment_id=str(experiment_id))) + + assert rc == 0 + assert capsys.readouterr().out == "" + assert str(experiment_id) in caplog.text + assert str(run_id) in caplog.text + assert "sample-a" in caplog.text + + 
+@pytest.mark.asyncio +async def test_experiment_define_and_run_log_machine_ids_without_printing( + monkeypatch, caplog, capsys +): + experiment_id = uuid4() + run_id = uuid4() + + class FakeDefinitionService: + def define_benchmark_experiment(self, request): + return ExperimentDefineResult( + experiment_id=experiment_id, + cohort_id=None, + benchmark_type=request.benchmark_slug, + sample_count=1, + selected_samples=["sample-a"], + ) + + class FakeLaunchService: + async def run_experiment(self, request): + return ExperimentRunResult( + experiment_id=request.experiment_id, + run_ids=[run_id], + workflow_definition_ids=[uuid4()], + ) + + monkeypatch.setattr(experiment_cmd, "ensure_db", lambda: None) + monkeypatch.setattr(experiment_cmd, "ExperimentDefinitionService", FakeDefinitionService) + monkeypatch.setattr(experiment_cmd, "ExperimentLaunchService", FakeLaunchService) + caplog.set_level(logging.INFO, logger=experiment_cmd.__name__) + + define_rc = experiment_cmd.handle_experiment_define( + Namespace( + benchmark_slug="ci-benchmark", + cohort=None, + sample_id=None, + limit=1, + name=None, + model="openai:gpt-4o", + worker="test-worker", + evaluator=None, + workflow="single", + max_questions=10, + ) + ) + + assert define_rc == 0 + + run_rc = await experiment_cmd.handle_experiment_run( + Namespace(experiment_id=str(experiment_id), timeout=60, no_wait=False) + ) + + assert run_rc == 0 + assert capsys.readouterr().out == "" + assert f"EXPERIMENT_ID={experiment_id}" in caplog.text + assert f"RUN_ID={run_id}" in caplog.text diff --git a/tests/unit/cli/test_workflow_cli.py b/tests/unit/cli/test_workflow_cli.py index 4c587413..1c6cf652 100644 --- a/tests/unit/cli/test_workflow_cli.py +++ b/tests/unit/cli/test_workflow_cli.py @@ -1,11 +1,18 @@ import json -from dataclasses import dataclass from datetime import UTC, datetime from uuid import uuid4 import pytest from ergon_cli.commands.workflow import WorkflowCommandContext, execute_workflow_command -from ergon_core.core.runtime.services.workflow_dto import WorkflowResourceRef +from ergon_core.core.runtime.services.workflow_dto import ( + WorkflowExecutionRef, + WorkflowMutationRef, + WorkflowResourceLocationRef, + WorkflowResourceRef, + WorkflowTaskRef, + WorkflowTaskWorkspaceRef, +) +from pydantic import BaseModel class _Session: @@ -13,12 +20,14 @@ def close(self) -> None: pass -@dataclass -class _Service: - resource: WorkflowResourceRef +class _Service(BaseModel): + model_config = {"arbitrary_types_allowed": True} + + resource: WorkflowResourceRef | None def list_resources(self, session, *, run_id, node_id, scope, kind=None, max_depth=3, limit=50): assert isinstance(session, _Session) + assert self.resource is not None assert run_id == self.resource.run_id assert node_id == self.resource.node_id assert scope == "visible" @@ -56,7 +65,7 @@ def test_resource_list_json_uses_injected_context() -> None: benchmark_type="researchrubrics", ), session_factory=_Session, - service=_Service(resource), + service=_Service(resource=resource), ) payload = json.loads(output.stdout) @@ -66,6 +75,188 @@ def test_resource_list_json_uses_injected_context() -> None: assert payload["resources"][0]["task_slug"] == "research" +def test_manage_add_task_json_plumbs_cli_arguments_to_service() -> None: + expected_run_id = uuid4() + expected_parent_node_id = uuid4() + created_node_id = uuid4() + + class Service: + async def add_task( + self, + session, + *, + run_id, + parent_node_id, + task_slug, + description, + assigned_worker_slug, + dry_run, + ): + assert 
isinstance(session, _Session) + assert run_id == expected_run_id + assert parent_node_id == expected_parent_node_id + assert task_slug == "new_leaf" + assert description == "New leaf" + assert assigned_worker_slug == "researchrubrics-researcher" + assert dry_run is True + return WorkflowMutationRef( + action="add-task", + dry_run=True, + node=WorkflowTaskRef( + node_id=created_node_id, + task_slug="new_leaf", + status="pending", + level=2, + parent_node_id=expected_parent_node_id, + assigned_worker_slug="researchrubrics-researcher", + description="New leaf", + ), + message="Would add task new_leaf", + ) + + output = execute_workflow_command( + "manage add-task --task-slug new_leaf --description 'New leaf' " + "--worker researchrubrics-researcher " + f"--parent-node-id {expected_parent_node_id} --dry-run --format json", + context=WorkflowCommandContext( + run_id=expected_run_id, + node_id=expected_parent_node_id, + execution_id=uuid4(), + sandbox_task_key=uuid4(), + benchmark_type="researchrubrics", + ), + session_factory=_Session, + service=Service(), + ) + + payload = json.loads(output.stdout) + assert payload["mutation"]["action"] == "add-task" + assert payload["mutation"]["node"]["task_slug"] == "new_leaf" + assert payload["mutation"]["dry_run"] is True + + +def test_resource_location_json_uses_injected_run_scope() -> None: + run_id = uuid4() + node_id = uuid4() + resource_id = uuid4() + resource = WorkflowResourceRef( + resource_id=resource_id, + run_id=run_id, + task_execution_id=uuid4(), + node_id=node_id, + task_slug="producer", + kind="report", + name="paper.txt", + mime_type="text/plain", + size_bytes=12, + file_path="/tmp/paper.txt", + content_hash="sha256:abc", + copied_from_resource_id=None, + created_at=datetime(2026, 4, 26, tzinfo=UTC), + ) + + class Service: + def get_resource_location(self, session, *, run_id, resource_id): + assert isinstance(session, _Session) + assert run_id == resource.run_id + assert resource_id == resource.resource_id + return WorkflowResourceLocationRef( + resource=resource, + producer_task_slug="producer", + local_file_path="/tmp/paper.txt", + default_sandbox_path="/workspace/imported/producer/paper (copy).txt", + ) + + output = execute_workflow_command( + f"inspect resource-location --resource-id {resource_id} --format json", + context=WorkflowCommandContext( + run_id=run_id, + node_id=node_id, + execution_id=uuid4(), + sandbox_task_key=uuid4(), + benchmark_type="researchrubrics", + ), + session_factory=_Session, + service=Service(), + ) + + payload = json.loads(output.stdout) + assert payload["resource_location"]["producer_task_slug"] == "producer" + assert payload["resource_location"]["default_sandbox_path"].startswith("/workspace/imported") + + +def test_task_workspace_text_lists_own_and_input_resources() -> None: + run_id = uuid4() + node_id = uuid4() + execution_id = uuid4() + + class Service: + def get_task_workspace(self, session, *, run_id, node_id): + assert isinstance(session, _Session) + return WorkflowTaskWorkspaceRef( + task=WorkflowTaskRef( + node_id=node_id, + task_slug="current", + status="running", + level=1, + description="Current", + ), + latest_execution=WorkflowExecutionRef( + execution_id=execution_id, + status="running", + attempt_number=1, + final_assistant_message=None, + ), + own_resources=[ + WorkflowResourceRef( + resource_id=uuid4(), + run_id=run_id, + task_execution_id=execution_id, + node_id=node_id, + task_slug="current", + kind="report", + name="own.txt", + mime_type="text/plain", + size_bytes=3, + 
file_path="/tmp/own.txt", + created_at=datetime(2026, 4, 26, tzinfo=UTC), + ) + ], + input_resources=[ + WorkflowResourceRef( + resource_id=uuid4(), + run_id=run_id, + task_execution_id=uuid4(), + node_id=uuid4(), + task_slug="upstream", + kind="report", + name="input.txt", + mime_type="text/plain", + size_bytes=5, + file_path="/tmp/input.txt", + created_at=datetime(2026, 4, 26, tzinfo=UTC), + ) + ], + ) + + output = execute_workflow_command( + "inspect task-workspace", + context=WorkflowCommandContext( + run_id=run_id, + node_id=node_id, + execution_id=execution_id, + sandbox_task_key=uuid4(), + benchmark_type="researchrubrics", + ), + session_factory=_Session, + service=Service(), + ) + + assert "task current status=running" in output.stdout + assert "own: own.txt" in output.stdout + assert "input: input.txt" in output.stdout + + def test_agent_command_rejects_user_supplied_context_flags() -> None: with pytest.raises(ValueError, match="scope/context flags are injected"): execute_workflow_command( diff --git a/tests/unit/providers/test_model_resolution.py b/tests/unit/providers/test_model_resolution.py new file mode 100644 index 00000000..89f0445a --- /dev/null +++ b/tests/unit/providers/test_model_resolution.py @@ -0,0 +1,45 @@ +import pytest + +from ergon_core.core.providers.generation.model_resolution import resolve_model_target + + +def test_cloud_provider_targets_resolve_to_openrouter_provider() -> None: + from pydantic_ai.models.openai import OpenAIChatModel + from pydantic_ai.providers.openrouter import OpenRouterProvider + + resolved = resolve_model_target("openai:gpt-4o", api_key="test-openrouter-key") + + assert isinstance(resolved.model, OpenAIChatModel) + assert isinstance(resolved.model._provider, OpenRouterProvider) + assert resolved.model.model_name == "openai/gpt-4o" + assert resolved.model.system == "openrouter" + assert resolved.supports_logprobs is False + + +def test_anthropic_target_resolves_to_openrouter_namespace() -> None: + from pydantic_ai.models.openai import OpenAIChatModel + + resolved = resolve_model_target("anthropic:claude-sonnet-4.6", api_key="test-openrouter-key") + + assert isinstance(resolved.model, OpenAIChatModel) + assert resolved.model.model_name == "anthropic/claude-sonnet-4.6" + + +def test_vllm_endpoint_target_resolves_to_openai_compatible_model() -> None: + from pydantic_ai.models.openai import OpenAIChatModel + + resolved = resolve_model_target("vllm:http://localhost:8000#served-model") + + assert isinstance(resolved.model, OpenAIChatModel) + assert resolved.model.model_name == "served-model" + assert resolved.supports_logprobs is True + + +def test_openai_compatible_target_requires_model_name() -> None: + with pytest.raises(ValueError, match="model name"): + resolve_model_target("openai-compatible:http://localhost:11434/v1") + + +def test_unknown_model_target_prefix_is_rejected() -> None: + with pytest.raises(ValueError, match="Unsupported model target"): + resolve_model_target("mystery:model") diff --git a/tests/unit/runtime/test_cohort_service.py b/tests/unit/runtime/test_cohort_service.py new file mode 100644 index 00000000..27eb8508 --- /dev/null +++ b/tests/unit/runtime/test_cohort_service.py @@ -0,0 +1,59 @@ +from datetime import UTC, datetime +from uuid import uuid4 + +from ergon_core.core.persistence.shared.enums import RunStatus +from ergon_core.core.persistence.telemetry.models import ExperimentRecord, RunRecord +from ergon_core.core.runtime.services.cohort_service import ExperimentCohortService + + +def _experiment(status: str = 
"running") -> ExperimentRecord: + return ExperimentRecord( + id=uuid4(), + name="ci experiment", + benchmark_type="ci-benchmark", + sample_count=2, + sample_selection_json={"instance_keys": ["a", "b"]}, + default_worker_team_json={"primary": "ci-worker"}, + default_evaluator_slug=None, + default_model_target="openai:gpt-4o", + design_json={}, + metadata_json={}, + status=status, + created_at=datetime(2026, 4, 27, tzinfo=UTC), + ) + + +def _run(experiment_id, status: RunStatus) -> RunRecord: + return RunRecord( + id=uuid4(), + experiment_id=experiment_id, + workflow_definition_id=uuid4(), + benchmark_type="ci-benchmark", + instance_key=str(status), + worker_team_json={"primary": "ci-worker"}, + evaluator_slug=None, + model_target="openai:gpt-4o", + status=status, + summary_json={}, + ) + + +def test_experiment_row_status_reflects_terminal_run_outcomes() -> None: + experiment = _experiment(status="running") + + failed_row = ExperimentCohortService._build_experiment_row( + experiment, + [_run(experiment.id, RunStatus.FAILED), _run(experiment.id, RunStatus.FAILED)], + ) + completed_row = ExperimentCohortService._build_experiment_row( + experiment, + [_run(experiment.id, RunStatus.COMPLETED), _run(experiment.id, RunStatus.COMPLETED)], + ) + mixed_row = ExperimentCohortService._build_experiment_row( + experiment, + [_run(experiment.id, RunStatus.COMPLETED), _run(experiment.id, RunStatus.FAILED)], + ) + + assert failed_row.status == "failed" + assert completed_row.status == "completed" + assert mixed_row.status == "completed_with_failures" diff --git a/tests/unit/runtime/test_experiment_definition_service.py b/tests/unit/runtime/test_experiment_definition_service.py new file mode 100644 index 00000000..e5de6342 --- /dev/null +++ b/tests/unit/runtime/test_experiment_definition_service.py @@ -0,0 +1,90 @@ +from collections.abc import Mapping, Sequence + +from ergon_core.api.benchmark import Benchmark +from ergon_core.api.task_types import BenchmarkTask +from ergon_core.core.persistence.telemetry.models import ExperimentRecord, RunRecord +from ergon_core.core.runtime.services import experiment_definition_service as service_module +from ergon_core.core.runtime.services.experiment_definition_service import ( + ExperimentDefinitionService, +) +from ergon_core.core.runtime.services.experiment_schemas import ExperimentDefineRequest +from pydantic import BaseModel + + +class _Payload(BaseModel): + value: int + + +class _Benchmark(Benchmark): + type_slug = "ci-benchmark" + task_payload_model = _Payload + + def __init__(self, *, limit: int | None = None) -> None: + super().__init__() + self.limit = limit + + def build_instances(self) -> Mapping[str, Sequence[BenchmarkTask[BaseModel]]]: + selected = ["sample-a", "sample-b", "sample-c"][: self.limit] + return { + key: [ + BenchmarkTask( + instance_key=key, + task_slug=f"{key}-root", + description=f"Task for {key}", + payload=_Payload(value=index), + ) + ] + for index, key in enumerate(selected) + } + + +class _FakeSession: + def __init__(self) -> None: + self.added = [] + + def __enter__(self) -> "_FakeSession": + return self + + def __exit__(self, *args) -> None: + return None + + def add(self, row) -> None: + self.added.append(row) + + def commit(self) -> None: + return None + + def refresh(self, row) -> None: + return None + + +def test_define_benchmark_experiment_creates_experiment_record_without_runs(monkeypatch): + session = _FakeSession() + monkeypatch.setattr(service_module, "get_session", lambda: session) + service = 
ExperimentDefinitionService(benchmarks={"ci-benchmark": _Benchmark}) + + result = service.define_benchmark_experiment( + ExperimentDefineRequest( + benchmark_slug="ci-benchmark", + limit=2, + default_model_target="openai:gpt-4o", + default_worker_team={"primary": "test-worker"}, + default_evaluator_slug=None, + ) + ) + + assert result.benchmark_type == "ci-benchmark" + assert result.sample_count == 2 + assert result.selected_samples == ["sample-a", "sample-b"] + assert len(session.added) == 1 + assert isinstance(session.added[0], ExperimentRecord) + assert not any(isinstance(row, RunRecord) for row in session.added) + + experiment = session.added[0] + assert experiment.name.startswith("ci-benchmark n=2") + assert experiment.cohort_id is None + assert experiment.sample_selection_json == {"instance_keys": ["sample-a", "sample-b"]} + assert experiment.default_worker_team_json == {"primary": "test-worker"} + assert experiment.default_model_target == "openai:gpt-4o" + assert experiment.default_evaluator_slug is None + assert experiment.status == "defined" diff --git a/tests/unit/runtime/test_experiment_launch_service.py b/tests/unit/runtime/test_experiment_launch_service.py new file mode 100644 index 00000000..e901d270 --- /dev/null +++ b/tests/unit/runtime/test_experiment_launch_service.py @@ -0,0 +1,97 @@ +from uuid import uuid4 + +import pytest +from ergon_core.api.handles import PersistedExperimentDefinition +from ergon_core.core.persistence.shared.enums import RunStatus +from ergon_core.core.persistence.telemetry.models import ExperimentRecord, RunRecord +from ergon_core.core.runtime.services import experiment_launch_service as service_module +from ergon_core.core.runtime.services.experiment_launch_service import ExperimentLaunchService +from ergon_core.core.runtime.services.experiment_schemas import ExperimentRunRequest, RunAssignment + + +class _FakeSession: + def __init__(self, experiment: ExperimentRecord) -> None: + self.experiment = experiment + + def __enter__(self) -> "_FakeSession": + return self + + def __exit__(self, *args) -> None: + return None + + def get(self, cls, row_id): + if cls is ExperimentRecord and row_id == self.experiment.id: + return self.experiment + return None + + def add(self, row) -> None: + return None + + def commit(self) -> None: + return None + + def refresh(self, row) -> None: + return None + + +@pytest.mark.asyncio +async def test_run_experiment_creates_one_run_per_selected_sample(monkeypatch): + experiment = ExperimentRecord( + id=uuid4(), + name="ci experiment", + benchmark_type="ci-benchmark", + sample_count=2, + sample_selection_json={"instance_keys": ["sample-a", "sample-b"]}, + default_worker_team_json={"primary": "test-worker"}, + default_evaluator_slug=None, + default_model_target="openai:gpt-4o", + design_json={}, + metadata_json={}, + status="defined", + ) + created_runs: list[RunRecord] = [] + emitted: list[tuple] = [] + + def workflow_factory( + experiment_record: ExperimentRecord, + assignment: RunAssignment, + ) -> PersistedExperimentDefinition: + return PersistedExperimentDefinition( + definition_id=uuid4(), + benchmark_type=experiment_record.benchmark_type, + worker_bindings=assignment.worker_team, + ) + + def fake_create_run(definition, **kwargs): + run = RunRecord( + id=uuid4(), + status=RunStatus.PENDING, + benchmark_type=definition.benchmark_type, + **kwargs, + ) + created_runs.append(run) + return run + + async def fake_emit(run_id, definition_id): + emitted.append((run_id, definition_id)) + + monkeypatch.setattr(service_module, 
"get_session", lambda: _FakeSession(experiment)) + monkeypatch.setattr(service_module, "create_run", fake_create_run) + + service = ExperimentLaunchService( + workflow_definition_factory=workflow_factory, + emit_workflow_started=fake_emit, + ) + + result = await service.run_experiment(ExperimentRunRequest(experiment_id=experiment.id)) + + assert result.experiment_id == experiment.id + assert result.run_ids == [run.id for run in created_runs] + assert len(result.workflow_definition_ids) == 2 + assert [run.instance_key for run in created_runs] == ["sample-a", "sample-b"] + assert {run.experiment_id for run in created_runs} == {experiment.id} + assert [run.worker_team_json for run in created_runs] == [ + {"primary": "test-worker"}, + {"primary": "test-worker"}, + ] + assert len(emitted) == 2 diff --git a/tests/unit/runtime/test_experiment_read_service.py b/tests/unit/runtime/test_experiment_read_service.py new file mode 100644 index 00000000..1ad15178 --- /dev/null +++ b/tests/unit/runtime/test_experiment_read_service.py @@ -0,0 +1,111 @@ +from datetime import UTC, datetime, timedelta +from uuid import uuid4 + +import pytest +from ergon_core.core.persistence.definitions.models import ExperimentDefinitionTask +from ergon_core.core.persistence.graph.models import RunGraphNode +from ergon_core.core.persistence.shared.enums import RunStatus +from ergon_core.core.persistence.telemetry.models import ExperimentRecord, RunRecord +from ergon_core.core.runtime.services import experiment_read_service as module +from ergon_core.core.runtime.services.experiment_read_service import ExperimentReadService +from sqlalchemy.pool import StaticPool +from sqlmodel import Session, SQLModel, create_engine + + +@pytest.fixture() +def session_factory(): + _ = ExperimentRecord + _ = ExperimentDefinitionTask + _ = RunGraphNode + engine = create_engine( + "sqlite://", + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + SQLModel.metadata.create_all(engine) + + def _get_session() -> Session: + return Session(engine) + + return _get_session + + +def test_experiment_detail_aggregates_run_analytics(monkeypatch, session_factory) -> None: + now = datetime(2026, 4, 27, 12, 0, tzinfo=UTC) + experiment_id = uuid4() + definition_id = uuid4() + run_a_id = uuid4() + run_b_id = uuid4() + run_c_id = uuid4() + + with session_factory() as session: + session.add( + ExperimentRecord( + id=experiment_id, + name="ci experiment", + benchmark_type="ci-benchmark", + sample_count=3, + sample_selection_json={"instance_keys": ["a", "b", "c"]}, + default_worker_team_json={"primary": "ci-worker"}, + default_evaluator_slug="ci-evaluator", + default_model_target="openai:gpt-4o", + design_json={}, + metadata_json={}, + status="running", + created_at=now, + ) + ) + for run_id, instance_key, status, started, completed, score, cost in [ + (run_a_id, "a", RunStatus.COMPLETED, now, now + timedelta(seconds=10), 1.0, 0.2), + (run_b_id, "b", RunStatus.FAILED, now, now + timedelta(seconds=20), 0.0, 0.3), + (run_c_id, "c", RunStatus.EXECUTING, now, None, None, None), + ]: + session.add( + RunRecord( + id=run_id, + experiment_id=experiment_id, + workflow_definition_id=definition_id, + benchmark_type="ci-benchmark", + instance_key=instance_key, + worker_team_json={"primary": "ci-worker"}, + evaluator_slug="ci-evaluator", + model_target="openai:gpt-4o", + status=status, + started_at=started, + completed_at=completed, + summary_json=( + {"final_score": score, "total_cost_usd": cost} + if score is not None and cost is not None + else {} + ), + 
) + ) + for index in range(2): + session.add( + RunGraphNode( + run_id=run_id, + instance_key=instance_key, + task_slug=f"{instance_key}-{index}", + description="Task", + status="completed", + assigned_worker_slug="ci-worker", + level=index, + ) + ) + session.commit() + + monkeypatch.setattr(module, "get_session", session_factory) + + detail = ExperimentReadService().get_experiment(experiment_id) + + assert detail is not None + assert detail.analytics.total_runs == 3 + assert detail.analytics.status_counts.completed == 1 + assert detail.analytics.status_counts.failed == 1 + assert detail.analytics.status_counts.executing == 1 + assert detail.analytics.average_score == 0.5 + assert detail.analytics.average_duration_ms == 15_000 + assert detail.analytics.average_tasks == 2.0 + assert detail.analytics.total_cost_usd == 0.5 + assert detail.runs[0].running_time_ms == 10_000 + assert detail.runs[0].total_tasks == 2 diff --git a/tests/unit/runtime/test_experiment_schemas.py b/tests/unit/runtime/test_experiment_schemas.py new file mode 100644 index 00000000..30aa4c9a --- /dev/null +++ b/tests/unit/runtime/test_experiment_schemas.py @@ -0,0 +1,78 @@ +from uuid import uuid4 + +import pytest +from ergon_core.core.runtime.services.experiment_schemas import ( + ExperimentDefineRequest, + ExperimentRunRequest, +) +from pydantic import ValidationError + + +def test_define_request_accepts_optional_name_cohort_and_evaluator() -> None: + request = ExperimentDefineRequest( + benchmark_slug="researchrubrics", + limit=5, + default_model_target="anthropic:claude-sonnet-4.6", + default_worker_team={"primary": "researchrubrics-workflow-cli-react"}, + ) + + assert request.name is None + assert request.cohort_id is None + assert request.default_evaluator_slug is None + + +@pytest.mark.parametrize( + "payload", + [ + { + "benchmark_slug": "researchrubrics", + "default_model_target": "anthropic:claude-sonnet-4.6", + "default_worker_team": {"primary": "worker"}, + }, + { + "benchmark_slug": "researchrubrics", + "limit": 5, + "sample_ids": ["a"], + "default_model_target": "anthropic:claude-sonnet-4.6", + "default_worker_team": {"primary": "worker"}, + }, + ], +) +def test_define_request_requires_exactly_one_sample_selector(payload) -> None: + with pytest.raises(ValidationError, match="exactly one"): + ExperimentDefineRequest.model_validate(payload) + + +def test_define_request_requires_assignment_defaults_without_arms() -> None: + with pytest.raises(ValidationError, match="default_worker_team"): + ExperimentDefineRequest( + benchmark_slug="researchrubrics", + limit=5, + ) + + +def test_define_request_rejects_design_arms_until_launch_support_exists() -> None: + with pytest.raises(ValidationError, match="design\\.arms"): + ExperimentDefineRequest( + benchmark_slug="researchrubrics", + sample_ids=["a"], + default_model_target="anthropic:claude-sonnet-4.6", + default_worker_team={"primary": "worker"}, + design={ + "arms": { + "baseline": { + "worker_team": {"primary": "worker"}, + "model_target": "anthropic:claude-sonnet-4.6", + }, + }, + }, + ) + + +def test_run_request_identifies_defined_experiment() -> None: + experiment_id = uuid4() + + request = ExperimentRunRequest(experiment_id=experiment_id) + + assert request.experiment_id == experiment_id + assert request.wait is True diff --git a/tests/unit/runtime/test_graph_worker_identity.py b/tests/unit/runtime/test_graph_worker_identity.py new file mode 100644 index 00000000..e4564d3a --- /dev/null +++ b/tests/unit/runtime/test_graph_worker_identity.py @@ -0,0 +1,292 @@ 
+from uuid import UUID, uuid4 + +import pytest +from ergon_core.core.persistence.definitions.models import ( + ExperimentDefinition, + ExperimentDefinitionInstance, + ExperimentDefinitionTask, + ExperimentDefinitionTaskAssignment, + ExperimentDefinitionWorker, +) +from ergon_core.core.persistence.graph.models import RunGraphNode +from ergon_core.core.persistence.shared.enums import RunStatus, TaskExecutionStatus +from ergon_core.core.persistence.telemetry.models import ( + ExperimentRecord, + RunRecord, + RunTaskExecution, +) +from ergon_core.core.runtime.services import task_execution_service as task_execution_module +from ergon_core.core.runtime.services.graph_dto import MutationMeta +from ergon_core.core.runtime.services.graph_repository import WorkflowGraphRepository +from ergon_core.core.runtime.services.orchestration_dto import ( + InitializeWorkflowCommand, + PrepareTaskExecutionCommand, +) +from ergon_core.core.runtime.services.task_management_dto import AddSubtaskCommand +from ergon_core.core.runtime.services.task_management_service import TaskManagementService +from ergon_core.core.runtime.services.task_execution_service import TaskExecutionService +from ergon_core.core.runtime.services.workflow_initialization_service import ( + WorkflowInitializationService, +) +from pydantic import BaseModel +from sqlalchemy.pool import StaticPool +from sqlmodel import Session, SQLModel, create_engine, select + + +class _Payload(BaseModel): + pass + + +def _session() -> Session: + _ = ExperimentRecord + engine = create_engine( + "sqlite://", + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + SQLModel.metadata.create_all(engine) + return Session(engine) + + +def _definition_with_worker( + session: Session, + *, + worker_type: str = "minif2f-react", + benchmark_type: str = "minif2f", +) -> UUID: + definition_id = uuid4() + instance_id = uuid4() + task_id = uuid4() + session.add_all( + [ + ExperimentDefinition( + id=definition_id, + benchmark_type=benchmark_type, + metadata_json={}, + ), + ExperimentDefinitionInstance( + id=instance_id, + experiment_definition_id=definition_id, + instance_key="sample-1", + ), + ExperimentDefinitionTask( + id=task_id, + experiment_definition_id=definition_id, + instance_id=instance_id, + task_slug="root", + description="Root task", + task_payload_json={}, + ), + ExperimentDefinitionWorker( + experiment_definition_id=definition_id, + binding_key="primary", + worker_type=worker_type, + model_target="stub:constant", + snapshot_json={}, + ), + ExperimentDefinitionTaskAssignment( + experiment_definition_id=definition_id, + task_id=task_id, + worker_binding_key="primary", + ), + ] + ) + session.commit() + return definition_id + + +def _run( + session: Session, + *, + definition_id: UUID, + run_id: UUID | None = None, + model_target: str = "stub:constant", +) -> UUID: + experiment_id = uuid4() + resolved_run_id = run_id or uuid4() + session.add( + ExperimentRecord( + id=experiment_id, + name="worker identity", + benchmark_type="minif2f", + sample_count=1, + sample_selection_json={"instance_keys": ["sample-1"]}, + default_worker_team_json={"primary": "minif2f-react"}, + default_model_target=model_target, + design_json={}, + metadata_json={}, + status="running", + ) + ) + session.add( + RunRecord( + id=resolved_run_id, + experiment_id=experiment_id, + workflow_definition_id=definition_id, + benchmark_type="minif2f", + instance_key="sample-1", + worker_team_json={"primary": "minif2f-react"}, + model_target=model_target, + status=RunStatus.EXECUTING, + 
) + ) + session.commit() + return resolved_run_id + + +def test_graph_initialization_writes_concrete_worker_slug_from_definition_binding() -> None: + session = _session() + definition_id = _definition_with_worker(session, worker_type="minif2f-react") + run_id = _run(session, definition_id=definition_id) + + WorkflowGraphRepository().initialize_from_definition( + session, + run_id, + definition_id, + initial_node_status=TaskExecutionStatus.PENDING, + initial_edge_status="pending", + task_payload_model=_Payload, + meta=MutationMeta(actor="test"), + ) + + node = session.exec(select(RunGraphNode).where(RunGraphNode.run_id == run_id)).one() + assert node.assigned_worker_slug == "minif2f-react" + + +@pytest.mark.asyncio +async def test_workflow_initialization_returns_node_ids_for_initial_ready_static_tasks( + monkeypatch: pytest.MonkeyPatch, +) -> None: + session = _session() + benchmark_type = "ci-worker-identity" + definition_id = _definition_with_worker( + session, + worker_type="minif2f-react", + benchmark_type=benchmark_type, + ) + run_id = _run(session, definition_id=definition_id) + + class _Benchmark: + task_payload_model = _Payload + + monkeypatch.setitem( + __import__( + "ergon_core.core.runtime.services.workflow_initialization_service", + fromlist=["BENCHMARKS"], + ).BENCHMARKS, + benchmark_type, + _Benchmark, + ) + monkeypatch.setattr( + "ergon_core.core.runtime.services.workflow_initialization_service.get_session", + lambda: _session_context(session), + ) + + initialized = await WorkflowInitializationService().initialize( + InitializeWorkflowCommand(run_id=run_id, definition_id=definition_id) + ) + + assert len(initialized.initial_ready_tasks) == 1 + ready_task = initialized.initial_ready_tasks[0] + node = session.exec( + select(RunGraphNode).where(RunGraphNode.definition_task_id == ready_task.task_id) + ).one() + assert ready_task.node_id == node.id + assert node.assigned_worker_slug == "minif2f-react" + + +@pytest.mark.asyncio +async def test_dynamic_prepare_uses_node_worker_slug_and_run_model_without_definition_binding( + monkeypatch: pytest.MonkeyPatch, +) -> None: + session = _session() + definition_id = _definition_with_worker(session, worker_type="minif2f-react") + run_id = _run(session, definition_id=definition_id, model_target="stub:constant") + node = RunGraphNode( + run_id=run_id, + instance_key="sample-1", + task_slug="dynamic-leaf", + description="Dynamic specialist task", + status=TaskExecutionStatus.PENDING, + assigned_worker_slug="swebench-react", + parent_node_id=None, + level=1, + ) + session.add(node) + session.commit() + + monkeypatch.setattr(task_execution_module, "get_session", lambda: _session_context(session)) + + prepared = await TaskExecutionService().prepare( + PrepareTaskExecutionCommand( + run_id=run_id, + definition_id=definition_id, + task_id=None, + node_id=node.id, + ) + ) + + execution = session.exec( + select(RunTaskExecution).where(RunTaskExecution.id == prepared.execution_id) + ).one() + dynamic_worker = session.exec( + select(ExperimentDefinitionWorker).where( + ExperimentDefinitionWorker.experiment_definition_id == definition_id, + ExperimentDefinitionWorker.binding_key == "swebench-react", + ) + ).first() + + assert prepared.assigned_worker_slug == "swebench-react" + assert prepared.worker_type == "swebench-react" + assert prepared.model_target == "stub:constant" + assert execution.definition_worker_id is None + assert dynamic_worker is None + + +@pytest.mark.asyncio +async def 
test_add_subtask_rejects_unknown_worker_slug_before_creating_node() -> None: + session = _session() + definition_id = _definition_with_worker(session, worker_type="minif2f-react") + run_id = _run(session, definition_id=definition_id) + parent = RunGraphNode( + run_id=run_id, + instance_key="sample-1", + task_slug="parent", + description="Parent task", + status=TaskExecutionStatus.RUNNING, + assigned_worker_slug="minif2f-react", + level=0, + ) + session.add(parent) + session.commit() + + with pytest.raises(ValueError, match="Unknown worker slug"): + await TaskManagementService().add_subtask( + session, + AddSubtaskCommand( + run_id=run_id, + parent_node_id=parent.id, + task_slug="bad-worker", + description="Should not be inserted", + assigned_worker_slug="not-a-real-worker", + ), + ) + + inserted = session.exec( + select(RunGraphNode).where( + RunGraphNode.run_id == run_id, + RunGraphNode.task_slug == "bad-worker", + ) + ).first() + assert inserted is None + + +class _session_context: + def __init__(self, session: Session) -> None: + self._session = session + + def __enter__(self) -> Session: + return self._session + + def __exit__(self, *args) -> None: + return None diff --git a/tests/unit/runtime/test_import_boundaries.py b/tests/unit/runtime/test_import_boundaries.py new file mode 100644 index 00000000..edf3245b --- /dev/null +++ b/tests/unit/runtime/test_import_boundaries.py @@ -0,0 +1,25 @@ +def test_telemetry_models_import_before_run_resource_api() -> None: + from ergon_core.core.persistence.telemetry.models import RunResource + + from ergon_core.api.run_resource import RunResourceView + + assert RunResource.__tablename__ == "run_resources" + assert RunResourceView.__name__ == "RunResourceView" + + +def test_context_models_import_without_worker_cycle() -> None: + from ergon_core.core.persistence.context.models import RunContextEvent + + assert RunContextEvent.__tablename__ == "run_context_events" + + +def test_context_event_payloads_use_shared_logprob_type_without_api_cycle() -> None: + from typing import get_args + + from ergon_core.core.persistence.context.event_payloads import ToolCallPayload + from ergon_core.core.providers.generation.types import TokenLogprob + + annotation_args = get_args(ToolCallPayload.model_fields["turn_logprobs"].annotation) + list_annotation = next(arg for arg in annotation_args if get_args(arg)) + + assert get_args(list_annotation) == (TokenLogprob,) diff --git a/tests/unit/runtime/test_run_service.py b/tests/unit/runtime/test_run_service.py new file mode 100644 index 00000000..67e6b941 --- /dev/null +++ b/tests/unit/runtime/test_run_service.py @@ -0,0 +1,63 @@ +from uuid import uuid4 + +from ergon_core.api.handles import PersistedExperimentDefinition +from ergon_core.core.persistence.shared.enums import RunStatus +from ergon_core.core.runtime.services import run_service + + +class _FakeSession: + def __init__(self) -> None: + self.added = [] + + def __enter__(self) -> "_FakeSession": + return self + + def __exit__(self, *args) -> None: + return None + + def add(self, row) -> None: + self.added.append(row) + + def commit(self) -> None: + return None + + def refresh(self, row) -> None: + return None + + +def test_create_run_requires_experiment_identity_and_records_workflow_assignment(monkeypatch): + session = _FakeSession() + experiment_id = uuid4() + workflow_definition_id = uuid4() + definition = PersistedExperimentDefinition( + definition_id=workflow_definition_id, + benchmark_type="ci-benchmark", + worker_bindings={"primary": "test-worker"}, + 
evaluator_bindings={"primary": "test-evaluator"}, + ) + + monkeypatch.setattr(run_service, "get_session", lambda: session) + + run = run_service.create_run( + definition, + experiment_id=experiment_id, + workflow_definition_id=workflow_definition_id, + instance_key="sample-1", + worker_team_json={"primary": "test-worker"}, + evaluator_slug="test-evaluator", + model_target="openai:gpt-4o", + assignment_json={"arm_key": "default"}, + seed=123, + ) + + assert session.added == [run] + assert run.experiment_id == experiment_id + assert run.workflow_definition_id == workflow_definition_id + assert run.benchmark_type == "ci-benchmark" + assert run.instance_key == "sample-1" + assert run.worker_team_json == {"primary": "test-worker"} + assert run.evaluator_slug == "test-evaluator" + assert run.model_target == "openai:gpt-4o" + assert run.assignment_json == {"arm_key": "default"} + assert run.seed == 123 + assert run.status == RunStatus.PENDING diff --git a/tests/unit/runtime/test_workflow_service.py b/tests/unit/runtime/test_workflow_service.py index 69e5a73a..de9dd75e 100644 --- a/tests/unit/runtime/test_workflow_service.py +++ b/tests/unit/runtime/test_workflow_service.py @@ -31,6 +31,7 @@ def _node( *, run_id: UUID, slug: str, + description: str | None = None, status: str = "completed", parent_node_id: UUID | None = None, level: int = 0, @@ -39,7 +40,7 @@ def _node( run_id=run_id, instance_key="instance", task_slug=slug, - description=f"Task {slug}", + description=description or f"Task {slug}", status=status, assigned_worker_slug="worker", parent_node_id=parent_node_id, @@ -47,6 +48,15 @@ def _node( ) +def _edge(*, run_id: UUID, source_node_id: UUID, target_node_id: UUID) -> RunGraphEdge: + return RunGraphEdge( + run_id=run_id, + source_node_id=source_node_id, + target_node_id=target_node_id, + status="satisfied", + ) + + def _execution( *, run_id: UUID, @@ -85,10 +95,15 @@ def _resource( def _run(session: Session) -> UUID: run_id = uuid4() + workflow_definition_id = uuid4() session.add( RunRecord( id=run_id, - experiment_definition_id=uuid4(), + experiment_id=uuid4(), + workflow_definition_id=workflow_definition_id, + benchmark_type="ci-workflow-service", + instance_key="sample-1", + worker_team_json={"primary": "test-worker"}, status=RunStatus.EXECUTING, ) ) @@ -301,3 +316,302 @@ async def test_materialize_resource_dry_run_keeps_copy_name_for_explicit_destina assert result.sandbox_path == "/workspace/selected/paper (copy).pdf" assert result.copied_resource_id is None + + +def test_resource_location_describes_producer_and_workspace_destination(tmp_path: Path) -> None: + session = _session() + run_id = _run(session) + producer = _node(run_id=run_id, slug="producer") + session.add(producer) + session.flush() + producer_exec = _execution(run_id=run_id, node_id=producer.id) + session.add(producer_exec) + session.flush() + source = _resource( + run_id=run_id, + execution_id=producer_exec.id, + name="paper.pdf", + path=tmp_path / "paper.pdf", + content=b"paper", + ) + session.add(source) + session.commit() + + location = WorkflowService().get_resource_location( + session, + run_id=run_id, + resource_id=source.id, + ) + + assert location.resource.resource_id == source.id + assert location.producer_task_slug == "producer" + assert location.default_sandbox_path == "/workspace/imported/producer/paper (copy).pdf" + assert location.local_file_path == source.file_path + + +def test_task_workspace_reports_latest_execution_and_resources(tmp_path: Path) -> None: + session = _session() + run_id = _run(session) 
+ current = _node(run_id=run_id, slug="current", status="running") + upstream = _node(run_id=run_id, slug="upstream") + session.add_all([current, upstream]) + session.flush() + current_exec = _execution( + run_id=run_id, + node_id=current.id, + status=TaskExecutionStatus.RUNNING, + ) + upstream_exec = _execution(run_id=run_id, node_id=upstream.id) + session.add_all([current_exec, upstream_exec]) + session.flush() + session.add(_edge(run_id=run_id, source_node_id=upstream.id, target_node_id=current.id)) + session.add_all( + [ + _resource( + run_id=run_id, + execution_id=current_exec.id, + name="own.txt", + path=tmp_path / "own.txt", + content=b"own", + ), + _resource( + run_id=run_id, + execution_id=upstream_exec.id, + name="input.txt", + path=tmp_path / "input.txt", + content=b"input", + ), + ] + ) + session.commit() + + workspace = WorkflowService().get_task_workspace( + session, + run_id=run_id, + node_id=current.id, + ) + + assert workspace.task.task_slug == "current" + assert workspace.latest_execution is not None + assert workspace.latest_execution.execution_id == current_exec.id + assert [resource.name for resource in workspace.own_resources] == ["own.txt"] + assert [resource.name for resource in workspace.input_resources] == ["input.txt"] + + +@pytest.mark.asyncio +async def test_materialize_resource_rejects_parent_directory_destination( + tmp_path: Path, +) -> None: + session = _session() + run_id = _run(session) + producer = _node(run_id=run_id, slug="producer") + consumer = _node(run_id=run_id, slug="consumer") + session.add_all([producer, consumer]) + session.flush() + producer_exec = _execution(run_id=run_id, node_id=producer.id) + consumer_exec = _execution( + run_id=run_id, + node_id=consumer.id, + status=TaskExecutionStatus.RUNNING, + ) + session.add_all([producer_exec, consumer_exec]) + session.flush() + source = _resource( + run_id=run_id, + execution_id=producer_exec.id, + name="paper.pdf", + path=tmp_path / "paper.pdf", + content=b"paper", + ) + session.add(source) + session.commit() + + with pytest.raises(ValueError, match="destination must stay inside /workspace"): + await WorkflowService().materialize_resource( + session, + run_id=run_id, + current_node_id=consumer.id, + current_execution_id=consumer_exec.id, + sandbox_task_key=consumer.id, + benchmark_type="test", + resource_id=source.id, + destination="../escape/paper.pdf", + dry_run=True, + ) + + +@pytest.mark.asyncio +async def test_add_task_dry_run_does_not_write_node() -> None: + session = _session() + run_id = _run(session) + parent = _node(run_id=run_id, slug="parent", level=1) + session.add(parent) + session.commit() + + result = await WorkflowService().add_task( + session, + run_id=run_id, + parent_node_id=parent.id, + task_slug="child", + description="Child task", + assigned_worker_slug="minif2f-react", + dry_run=True, + ) + + nodes = session.exec(select(RunGraphNode).where(RunGraphNode.run_id == run_id)).all() + assert len(nodes) == 1 + assert result.action == "add-task" + assert result.dry_run is True + assert result.node is not None + assert result.node.task_slug == "child" + assert result.node.parent_node_id == parent.id + assert result.node.level == 2 + + +@pytest.mark.asyncio +async def test_add_task_writes_node_and_mutation() -> None: + session = _session() + run_id = _run(session) + parent = _node(run_id=run_id, slug="parent", level=1) + session.add(parent) + session.commit() + dispatched = [] + + async def dispatch_task_ready(run_id, definition_id, node_id): + dispatched.append((run_id, 
definition_id, node_id)) + + result = await WorkflowService(task_ready_dispatcher=dispatch_task_ready).add_task( + session, + run_id=run_id, + parent_node_id=parent.id, + task_slug="child", + description="Child task", + assigned_worker_slug="minif2f-react", + dry_run=False, + ) + + assert result.dry_run is False + assert result.node is not None + child = session.get(RunGraphNode, result.node.node_id) + assert child is not None + assert child.task_slug == "child" + assert child.description == "Child task" + assert child.parent_node_id == parent.id + assert child.level == 2 + assert child.status == TaskExecutionStatus.PENDING.value + run = session.get(RunRecord, run_id) + assert run is not None + assert dispatched == [(run_id, run.workflow_definition_id, child.id)] + + +@pytest.mark.asyncio +async def test_add_task_rejects_unknown_worker_slug_before_creating_node() -> None: + session = _session() + run_id = _run(session) + parent = _node(run_id=run_id, slug="parent", status="running") + session.add(parent) + session.commit() + + async def dispatch_task_ready(run_id: UUID, definition_id: UUID, node_id: UUID) -> None: + raise AssertionError("invalid worker should not dispatch") + + with pytest.raises(ValueError, match="Unknown worker slug"): + await WorkflowService(task_ready_dispatcher=dispatch_task_ready).add_task( + session, + run_id=run_id, + parent_node_id=parent.id, + task_slug="bad-worker", + description="Should not be inserted", + assigned_worker_slug="not-a-real-worker", + dry_run=False, + ) + + inserted = session.exec( + select(RunGraphNode).where( + RunGraphNode.run_id == run_id, + RunGraphNode.task_slug == "bad-worker", + ) + ).first() + assert inserted is None + + +@pytest.mark.asyncio +async def test_add_edge_writes_dependency_between_slugs() -> None: + session = _session() + run_id = _run(session) + source = _node(run_id=run_id, slug="source") + target = _node(run_id=run_id, slug="target") + session.add_all([source, target]) + session.commit() + + result = await WorkflowService().add_edge( + session, + run_id=run_id, + source_task_slug="source", + target_task_slug="target", + dry_run=False, + ) + + assert result.action == "add-edge" + assert result.edge is not None + edge = session.get(RunGraphEdge, result.edge.edge_id) + assert edge is not None + assert edge.source_node_id == source.id + assert edge.target_node_id == target.id + assert edge.status == "pending" + + +@pytest.mark.asyncio +async def test_update_task_description_changes_only_description() -> None: + session = _session() + run_id = _run(session) + node = _node(run_id=run_id, slug="target", description="Old") + session.add(node) + session.commit() + + result = await WorkflowService().update_task_description( + session, + run_id=run_id, + task_slug="target", + description="New description", + dry_run=False, + ) + + refreshed = session.get(RunGraphNode, node.id) + assert refreshed is not None + assert refreshed.description == "New description" + assert refreshed.task_slug == "target" + assert result.node is not None + assert result.node.description == "New description" + + +@pytest.mark.asyncio +async def test_restart_and_abandon_task_update_node_status() -> None: + session = _session() + run_id = _run(session) + failed = _node(run_id=run_id, slug="failed", status="failed") + running = _node(run_id=run_id, slug="running", status="running") + session.add_all([failed, running]) + session.commit() + + restarted = await WorkflowService().restart_task( + session, + run_id=run_id, + task_slug="failed", + dry_run=False, + ) + 
abandoned = await WorkflowService().abandon_task( + session, + run_id=run_id, + task_slug="running", + dry_run=False, + ) + + failed_row = session.get(RunGraphNode, failed.id) + running_row = session.get(RunGraphNode, running.id) + assert failed_row is not None + assert running_row is not None + assert failed_row.status == TaskExecutionStatus.PENDING.value + assert running_row.status == TaskExecutionStatus.CANCELLED.value + assert restarted.action == "restart-task" + assert abandoned.action == "abandon-task" diff --git a/tests/unit/state/test_research_rubrics_benchmark.py b/tests/unit/state/test_research_rubrics_benchmark.py index d56502c3..72e8370c 100644 --- a/tests/unit/state/test_research_rubrics_benchmark.py +++ b/tests/unit/state/test_research_rubrics_benchmark.py @@ -82,6 +82,44 @@ def __getitem__(self, idx): ) ] + def test_load_rows_accepts_vanilla_prompt_field(self, monkeypatch: pytest.MonkeyPatch): + class FakeTrainDataset: + def __len__(self): + return 1 + + def __getitem__(self, idx): + assert idx == 0 + return { + "sample_id": "vanilla-sample", + "domain": "planning", + "prompt": "Plan a day in Washington DC.", + "rubrics": [ + { + "criterion": "Includes a timed itinerary.", + "axis": "quality", + "weight": 5.0, + }, + ], + } + + monkeypatch.setattr( + "ergon_builtins.benchmarks.researchrubrics.benchmark.load_dataset", + lambda *args, **kwargs: {"train": FakeTrainDataset()}, + ) + + rows = ResearchRubricsBenchmark(dataset_name="ScaleAI/researchrubrics")._load_rows() + + assert rows == [ + ResearchRubricsTaskPayload( + sample_id="vanilla-sample", + domain="planning", + ablated_prompt="Plan a day in Washington DC.", + rubrics=[ + {"criterion": "Includes a timed itinerary.", "axis": "quality", "weight": 5.0} + ], + ) + ] + class TestResearchRubricsRubric: """Verify task-payload-driven rubric construction.""" diff --git a/tests/unit/state/test_research_rubrics_workers.py b/tests/unit/state/test_research_rubrics_workers.py index 65e31f7d..85351ee1 100644 --- a/tests/unit/state/test_research_rubrics_workers.py +++ b/tests/unit/state/test_research_rubrics_workers.py @@ -14,6 +14,7 @@ ResearchRubricsResearcherWorker, ) from ergon_builtins.workers.research_rubrics.workflow_cli_react_worker import ( + _WORKFLOW_PROMPT, ResearchRubricsWorkflowCliReActWorker, ) from ergon_builtins.benchmarks.researchrubrics.toolkit_types import ( @@ -157,6 +158,12 @@ async def test_workflow_cli_worker_adds_workflow_tool(self): assert worker.type_slug == "researchrubrics-workflow-cli-react" assert "workflow" in tool_names + def test_workflow_cli_prompt_uses_current_task_level_for_delegation(self): + assert "inspect task-workspace --format json" in _WORKFLOW_PROMPT + assert "task_workspace.task.level is exactly 0" in _WORKFLOW_PROMPT + assert "Ignore level-0 tasks shown elsewhere in task-tree" in _WORKFLOW_PROMPT + assert 'do not call `workflow("manage add-task' in _WORKFLOW_PROMPT + @pytest.mark.asyncio async def test_report_write_uses_manager_public_file_api(self): task_id = uuid4() diff --git a/tests/unit/state/test_type_invariants.py b/tests/unit/state/test_type_invariants.py index e0061dab..2e23ecae 100644 --- a/tests/unit/state/test_type_invariants.py +++ b/tests/unit/state/test_type_invariants.py @@ -7,10 +7,9 @@ is empty as a result; see the inline comment for details. 
""" -import pytest -from pydantic import ValidationError from uuid import uuid4 +import pytest from ergon_core.core.persistence.graph.models import ( RunGraphAnnotation, RunGraphMutation, @@ -23,13 +22,14 @@ from ergon_core.core.persistence.telemetry.models import ( ExperimentCohort, ExperimentCohortStatus, + ExperimentRecord, RolloutBatch, RunRecord, RunResource, RunTaskExecution, TrainingSession, ) - +from pydantic import ValidationError # --------------------------------------------------------------------------- # Happy path — field accepts valid value and stores it @@ -40,7 +40,14 @@ "build_fn,field,expected", [ ( - lambda: RunRecord(experiment_definition_id=uuid4(), status=RunStatus.PENDING), + lambda: RunRecord( + experiment_id=uuid4(), + workflow_definition_id=uuid4(), + benchmark_type="ci-test", + instance_key="sample-1", + worker_team_json={"primary": "test-worker"}, + status=RunStatus.PENDING, + ), "status", RunStatus.PENDING, ), @@ -136,6 +143,49 @@ def test_task_execution_rejects_missing_static_or_dynamic_identity(): ) +def test_experiment_record_accepts_optional_cohort_and_required_name(): + experiment = ExperimentRecord.model_validate( + { + "name": "ci experiment", + "benchmark_type": "ci-benchmark", + "sample_count": 1, + "sample_selection_json": {"instance_keys": ["sample-1"]}, + "default_worker_team_json": {"primary": "test-worker"}, + "design_json": {}, + "metadata_json": {}, + } + ) + + assert experiment.cohort_id is None + assert experiment.name == "ci experiment" + assert experiment.parsed_sample_selection() == {"instance_keys": ["sample-1"]} + assert experiment.parsed_default_worker_team() == {"primary": "test-worker"} + + +def test_run_record_uses_experiment_and_workflow_definition_identity(): + experiment_id = uuid4() + workflow_definition_id = uuid4() + + run = RunRecord.model_validate( + { + "experiment_id": str(experiment_id), + "workflow_definition_id": str(workflow_definition_id), + "benchmark_type": "ci-benchmark", + "instance_key": "sample-1", + "worker_team_json": {"primary": "test-worker"}, + "status": "pending", + } + ) + + assert run.experiment_id == experiment_id + assert run.workflow_definition_id == workflow_definition_id + assert run.benchmark_type == "ci-benchmark" + assert run.instance_key == "sample-1" + assert run.parsed_worker_team() == {"primary": "test-worker"} + assert not hasattr(run, "experiment_definition_id") + assert not hasattr(run, "cohort_id") + + def test_enum_value_matches_string(): assert RunStatus.PENDING == "pending" assert RunStatus.COMPLETED == "completed" @@ -155,7 +205,14 @@ def test_enum_value_matches_string(): [ ( RunRecord, - {"experiment_definition_id": str(uuid4()), "status": "pending"}, + { + "experiment_id": str(uuid4()), + "workflow_definition_id": str(uuid4()), + "benchmark_type": "ci-test", + "instance_key": "sample-1", + "worker_team_json": {"primary": "test-worker"}, + "status": "pending", + }, "status", "not-a-status", ), diff --git a/tests/unit/state/test_workflow_cli_tool.py b/tests/unit/state/test_workflow_cli_tool.py index a52f7a6b..85883e76 100644 --- a/tests/unit/state/test_workflow_cli_tool.py +++ b/tests/unit/state/test_workflow_cli_tool.py @@ -7,16 +7,17 @@ @pytest.mark.asyncio async def test_workflow_tool_injects_worker_context() -> None: + task_key = uuid4() context = WorkerContext( run_id=uuid4(), - task_id=uuid4(), + task_id=task_key, execution_id=uuid4(), sandbox_id="sandbox", node_id=uuid4(), ) seen = {} - def execute(command, *, context, session_factory, service): + async def execute(command, *, 
context, session_factory, service): seen["command"] = command seen["context"] = context @@ -29,7 +30,7 @@ class Output: workflow = make_workflow_cli_tool( worker_context=context, - sandbox_task_key=context.task_id, + sandbox_task_key=task_key, benchmark_type="researchrubrics", execute_command=execute, ) @@ -39,21 +40,22 @@ class Output: assert seen["context"].run_id == context.run_id assert seen["context"].node_id == context.node_id assert seen["context"].execution_id == context.execution_id - assert seen["context"].sandbox_task_key == context.task_id + assert seen["context"].sandbox_task_key == task_key assert seen["context"].benchmark_type == "researchrubrics" @pytest.mark.asyncio async def test_workflow_tool_reports_nonzero_exit() -> None: + task_key = uuid4() context = WorkerContext( run_id=uuid4(), - task_id=uuid4(), + task_id=task_key, execution_id=uuid4(), sandbox_id="sandbox", node_id=uuid4(), ) - def execute(command, *, context, session_factory, service): + async def execute(command, *, context, session_factory, service): class Output: stdout = "" stderr = "bad command" @@ -63,9 +65,97 @@ class Output: workflow = make_workflow_cli_tool( worker_context=context, - sandbox_task_key=context.task_id, + sandbox_task_key=task_key, benchmark_type="researchrubrics", execute_command=execute, ) assert await workflow("inspect nope") == "workflow exited 2: bad command" + + +@pytest.mark.asyncio +async def test_leaf_workflow_tool_rejects_graph_edit_commands() -> None: + task_key = uuid4() + context = WorkerContext( + run_id=uuid4(), + task_id=task_key, + execution_id=uuid4(), + sandbox_id="sandbox", + node_id=uuid4(), + ) + + async def execute(command, *, context, session_factory, service): + raise AssertionError("denied commands must not reach executor") + + workflow = make_workflow_cli_tool( + worker_context=context, + sandbox_task_key=task_key, + benchmark_type="researchrubrics", + execute_command=execute, + ) + + result = await workflow("manage add-task --task-slug child --description Child --worker worker") + + assert result.startswith("workflow denied:") + assert "manager-capable" in result + + +@pytest.mark.asyncio +async def test_manager_workflow_tool_allows_graph_edit_commands() -> None: + task_key = uuid4() + context = WorkerContext( + run_id=uuid4(), + task_id=task_key, + execution_id=uuid4(), + sandbox_id="sandbox", + node_id=uuid4(), + ) + seen = {} + + async def execute(command, *, context, session_factory, service): + seen["command"] = command + + class Output: + stdout = "ok" + stderr = "" + exit_code = 0 + + return Output() + + workflow = make_workflow_cli_tool( + worker_context=context, + sandbox_task_key=task_key, + benchmark_type="researchrubrics", + execute_command=execute, + manager_capable=True, + ) + + assert await workflow("manage restart-task --task-slug child --dry-run") == "ok" + assert seen["command"] == "manage restart-task --task-slug child --dry-run" + + +@pytest.mark.asyncio +async def test_workflow_tool_rejects_multiline_commands() -> None: + task_key = uuid4() + context = WorkerContext( + run_id=uuid4(), + task_id=task_key, + execution_id=uuid4(), + sandbox_id="sandbox", + node_id=uuid4(), + ) + + async def execute(command, *, context, session_factory, service): + raise AssertionError("multiline commands must not reach executor") + + workflow = make_workflow_cli_tool( + worker_context=context, + sandbox_task_key=task_key, + benchmark_type="researchrubrics", + execute_command=execute, + manager_capable=True, + ) + + assert await workflow("inspect 
task-tree\ninspect next-actions") == ( + "workflow denied: multiline commands are not allowed" + ) diff --git a/tests/unit/test_app_mounts_harness_conditionally.py b/tests/unit/test_app_mounts_harness_conditionally.py index dd7ce8f4..ac9ac39a 100644 --- a/tests/unit/test_app_mounts_harness_conditionally.py +++ b/tests/unit/test_app_mounts_harness_conditionally.py @@ -38,3 +38,13 @@ def test_harness_mounted_when_env_set(monkeypatch: pytest.MonkeyPatch) -> None: # if Postgres is unreachable from the unit-test env. Either proves the route # is mounted. assert resp.status_code in (404, 500) + + +def test_health_route_is_available_without_test_harness(monkeypatch: pytest.MonkeyPatch) -> None: + app = _reload_app_with(monkeypatch, None) + client = TestClient(app) + + resp = client.get("/health") + + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} diff --git a/tests/unit/test_test_harness.py b/tests/unit/test_test_harness.py index 3f34038d..6b041e5e 100644 --- a/tests/unit/test_test_harness.py +++ b/tests/unit/test_test_harness.py @@ -4,12 +4,11 @@ from uuid import uuid4 import pytest -from fastapi import FastAPI -from fastapi.testclient import TestClient - from ergon_core.core.api import test_harness from ergon_core.core.api.startup_plugins import run_startup_plugins from ergon_core.core.api.test_harness import get_session_dep, router +from fastapi import FastAPI +from fastapi.testclient import TestClient class _NullSession: @@ -88,7 +87,7 @@ def test_seed_requires_secret_header(monkeypatch: pytest.MonkeyPatch) -> None: client = TestClient(app) resp = client.post( "/api/test/write/run/seed", - json={"experiment_definition_id": "00000000-0000-0000-0000-000000000001"}, + json={"workflow_definition_id": "00000000-0000-0000-0000-000000000001"}, ) assert resp.status_code == 401 @@ -98,7 +97,7 @@ def test_seed_returns_500_when_secret_env_missing(monkeypatch: pytest.MonkeyPatc client = TestClient(app) resp = client.post( "/api/test/write/run/seed", - json={"experiment_definition_id": "00000000-0000-0000-0000-000000000001"}, + json={"workflow_definition_id": "00000000-0000-0000-0000-000000000001"}, headers={"X-Test-Secret": "anything"}, ) assert resp.status_code == 500 diff --git a/uv.lock b/uv.lock index d7bfcd87..9f248d29 100644 --- a/uv.lock +++ b/uv.lock @@ -1128,7 +1128,7 @@ requires-dist = [ { name = "outlines", marker = "extra == 'dev'" }, { name = "psycopg2-binary", specifier = ">=2.9.9" }, { name = "pydantic", specifier = ">=2.5.0" }, - { name = "pydantic-ai" }, + { name = "pydantic-ai", specifier = ">=0.8.1" }, { name = "pydantic-settings", specifier = ">=2.1.0" }, { name = "sqlmodel", specifier = ">=0.0.14" }, { name = "structlog", specifier = ">=23.2.0" }, @@ -1489,6 +1489,19 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "genai-prices" +version = "0.0.57" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/be/30/11f3d683cf3b1d9612475ad8bfffe3423ce9f50fc617733109033e73a038/genai_prices-0.0.57.tar.gz", hash = "sha256:6e101e9c53975557ceffa237b0995787d81fe75aac12410f2898504188bcad89", size = 66555, upload-time = "2026-04-21T13:42:52.554Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/fe/d0095040c120d97cb63d055224ecd4e913dc5655315c203c8e83bf13aa86/genai_prices-0.0.57-py3-none-any.whl", hash = "sha256:14e50fb69cdc5a06ddb2a6df5a7fe06741b9e44304ce3f1728f56abdf1856cca", size = 69654, upload-time = 
"2026-04-21T13:42:51.236Z" }, +] + [[package]] name = "genson" version = "1.3.0" @@ -2635,14 +2648,14 @@ wheels = [ [[package]] name = "nexus-rpc" -version = "1.4.0" +version = "1.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/35/d5/cd1ffb202b76ebc1b33c1332a3416e55a39929006982adc2b1eb069aaa9b/nexus_rpc-1.4.0.tar.gz", hash = "sha256:3b8b373d4865671789cc43623e3dc0bcbf192562e40e13727e17f1c149050fba", size = 82367, upload-time = "2026-02-25T22:01:34.053Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ef/66/540687556bd28cf1ec370cc6881456203dfddb9dab047b8979c6865b5984/nexus_rpc-1.1.0.tar.gz", hash = "sha256:d65ad6a2f54f14e53ebe39ee30555eaeb894102437125733fb13034a04a44553", size = 77383, upload-time = "2025-07-07T19:03:58.368Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/11/52/6327a5f4fda01207205038a106a99848a41c83e933cd23ea2cab3d2ebc6c/nexus_rpc-1.4.0-py3-none-any.whl", hash = "sha256:14c953d3519113f8ccec533a9efdb6b10c28afef75d11cdd6d422640c40b3a49", size = 29645, upload-time = "2026-02-25T22:01:33.122Z" }, + { url = "https://files.pythonhosted.org/packages/bf/2f/9e9d0dcaa4c6ffa22b7aa31069a8a264c753ff8027b36af602cce038c92f/nexus_rpc-1.1.0-py3-none-any.whl", hash = "sha256:d1b007af2aba186a27e736f8eaae39c03aed05b488084ff6c3d1785c9ba2ad38", size = 27743, upload-time = "2025-07-07T19:03:57.556Z" }, ] [[package]] @@ -3312,17 +3325,16 @@ wheels = [ [[package]] name = "protobuf" -version = "6.33.6" +version = "5.29.6" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/66/70/e908e9c5e52ef7c3a6c7902c9dfbb34c7e29c25d2f81ade3856445fd5c94/protobuf-6.33.6.tar.gz", hash = "sha256:a6768d25248312c297558af96a9f9c929e8c4cee0659cb07e780731095f38135", size = 444531, upload-time = "2026-03-18T19:05:00.988Z" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/57/394a763c103e0edf87f0938dafcd918d53b4c011dfc5c8ae80f3b0452dbb/protobuf-5.29.6.tar.gz", hash = "sha256:da9ee6a5424b6b30fd5e45c5ea663aef540ca95f9ad99d1e887e819cdf9b8723", size = 425623, upload-time = "2026-02-04T22:54:40.584Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fc/9f/2f509339e89cfa6f6a4c4ff50438db9ca488dec341f7e454adad60150b00/protobuf-6.33.6-cp310-abi3-win32.whl", hash = "sha256:7d29d9b65f8afef196f8334e80d6bc1d5d4adedb449971fefd3723824e6e77d3", size = 425739, upload-time = "2026-03-18T19:04:48.373Z" }, - { url = "https://files.pythonhosted.org/packages/76/5d/683efcd4798e0030c1bab27374fd13a89f7c2515fb1f3123efdfaa5eab57/protobuf-6.33.6-cp310-abi3-win_amd64.whl", hash = "sha256:0cd27b587afca21b7cfa59a74dcbd48a50f0a6400cfb59391340ad729d91d326", size = 437089, upload-time = "2026-03-18T19:04:50.381Z" }, - { url = "https://files.pythonhosted.org/packages/5c/01/a3c3ed5cd186f39e7880f8303cc51385a198a81469d53d0fdecf1f64d929/protobuf-6.33.6-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:9720e6961b251bde64edfdab7d500725a2af5280f3f4c87e57c0208376aa8c3a", size = 427737, upload-time = "2026-03-18T19:04:51.866Z" }, - { url = "https://files.pythonhosted.org/packages/ee/90/b3c01fdec7d2f627b3a6884243ba328c1217ed2d978def5c12dc50d328a3/protobuf-6.33.6-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e2afbae9b8e1825e3529f88d514754e094278bb95eadc0e199751cdd9a2e82a2", size = 324610, upload-time = "2026-03-18T19:04:53.096Z" }, - { url = 
"https://files.pythonhosted.org/packages/9b/ca/25afc144934014700c52e05103c2421997482d561f3101ff352e1292fb81/protobuf-6.33.6-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:c96c37eec15086b79762ed265d59ab204dabc53056e3443e702d2681f4b39ce3", size = 339381, upload-time = "2026-03-18T19:04:54.616Z" }, - { url = "https://files.pythonhosted.org/packages/16/92/d1e32e3e0d894fe00b15ce28ad4944ab692713f2e7f0a99787405e43533a/protobuf-6.33.6-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:e9db7e292e0ab79dd108d7f1a94fe31601ce1ee3f7b79e0692043423020b0593", size = 323436, upload-time = "2026-03-18T19:04:55.768Z" }, - { url = "https://files.pythonhosted.org/packages/c4/72/02445137af02769918a93807b2b7890047c32bfb9f90371cbc12688819eb/protobuf-6.33.6-py3-none-any.whl", hash = "sha256:77179e006c476e69bf8e8ce866640091ec42e1beb80b213c3900006ecfba6901", size = 170656, upload-time = "2026-03-18T19:04:59.826Z" }, + { url = "https://files.pythonhosted.org/packages/d4/88/9ee58ff7863c479d6f8346686d4636dd4c415b0cbeed7a6a7d0617639c2a/protobuf-5.29.6-cp310-abi3-win32.whl", hash = "sha256:62e8a3114992c7c647bce37dcc93647575fc52d50e48de30c6fcb28a6a291eb1", size = 423357, upload-time = "2026-02-04T22:54:25.805Z" }, + { url = "https://files.pythonhosted.org/packages/1c/66/2dc736a4d576847134fb6d80bd995c569b13cdc7b815d669050bf0ce2d2c/protobuf-5.29.6-cp310-abi3-win_amd64.whl", hash = "sha256:7e6ad413275be172f67fdee0f43484b6de5a904cc1c3ea9804cb6fe2ff366eda", size = 435175, upload-time = "2026-02-04T22:54:28.592Z" }, + { url = "https://files.pythonhosted.org/packages/06/db/49b05966fd208ae3f44dcd33837b6243b4915c57561d730a43f881f24dea/protobuf-5.29.6-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:b5a169e664b4057183a34bdc424540e86eea47560f3c123a0d64de4e137f9269", size = 418619, upload-time = "2026-02-04T22:54:30.266Z" }, + { url = "https://files.pythonhosted.org/packages/b7/d7/48cbf6b0c3c39761e47a99cb483405f0fde2be22cf00d71ef316ce52b458/protobuf-5.29.6-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:a8866b2cff111f0f863c1b3b9e7572dc7eaea23a7fae27f6fc613304046483e6", size = 320284, upload-time = "2026-02-04T22:54:31.782Z" }, + { url = "https://files.pythonhosted.org/packages/e3/dd/cadd6ec43069247d91f6345fa7a0d2858bef6af366dbd7ba8f05d2c77d3b/protobuf-5.29.6-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:e3387f44798ac1106af0233c04fb8abf543772ff241169946f698b3a9a3d3ab9", size = 320478, upload-time = "2026-02-04T22:54:32.909Z" }, + { url = "https://files.pythonhosted.org/packages/5a/cb/e3065b447186cb70aa65acc70c86baf482d82bf75625bf5a2c4f6919c6a3/protobuf-5.29.6-py3-none-any.whl", hash = "sha256:6b9edb641441b2da9fa8f428760fc136a49cf97a52076010cf22a2ff73438a86", size = 173126, upload-time = "2026-02-04T22:54:39.462Z" }, ] [[package]] @@ -3605,22 +3617,23 @@ email = [ [[package]] name = "pydantic-ai" -version = "0.7.2" +version = "0.8.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic-ai-slim", extra = ["ag-ui", "anthropic", "bedrock", "cli", "cohere", "evals", "google", "groq", "huggingface", "mcp", "mistral", "openai", "retries", "temporal", "vertexai"] }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6f/d0/ca0dbea87aa677192fa4b663532bd37ae8273e883c55b661b786dbb52731/pydantic_ai-0.7.2.tar.gz", hash = "sha256:d215c323741d47ff13c6b48aa75aedfb8b6b5f9da553af709675c3078a4be4fc", size = 43763306, upload-time = "2025-08-14T22:59:58.912Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/56/d7/fcc18ce80008e888404a3615f973aa3f39b98384d61b03621144c9f4c2d4/pydantic_ai-0.8.1.tar.gz", hash = "sha256:05974382082ee4f3706909d06bdfcc5e95f39e29230cc4d00e47429080099844", size = 43772581, upload-time = "2025-08-29T14:46:23.201Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a3/77/402a278b9694cdfaeb5bf0ed4e0fee447de624aa67126ddcce8d98dc6062/pydantic_ai-0.7.2-py3-none-any.whl", hash = "sha256:a6e5d0994aa87385a05fdfdad7fda1fd14576f623635e4000883c4c7856eba13", size = 10188, upload-time = "2025-08-14T22:59:50.653Z" }, + { url = "https://files.pythonhosted.org/packages/f9/04/802b8cf834dffcda8baabb3b76c549243694a83346c3f54e47a3a4d519fb/pydantic_ai-0.8.1-py3-none-any.whl", hash = "sha256:5fa923097132aa69b4d6a310b462dc091009c7b87705edf4443d37b887d5ef9a", size = 10188, upload-time = "2025-08-29T14:46:11.137Z" }, ] [[package]] name = "pydantic-ai-slim" -version = "0.7.2" +version = "0.8.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "eval-type-backport" }, + { name = "genai-prices" }, { name = "griffe" }, { name = "httpx" }, { name = "opentelemetry-api" }, @@ -3628,9 +3641,9 @@ dependencies = [ { name = "pydantic-graph" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/96/39/87500c5e038296fe1becf62ac24f7e62dd5a1fb7fe63a9e29c58a2898b1a/pydantic_ai_slim-0.7.2.tar.gz", hash = "sha256:636ca32c8928048ba1173963aab6b7eb33b71174bbc371ad3f2096fee4c48dfe", size = 211787, upload-time = "2025-08-14T23:00:02.67Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/91/08137459b3745900501b3bd11852ced6c81b7ce6e628696d75b09bb786c5/pydantic_ai_slim-0.8.1.tar.gz", hash = "sha256:12ef3dcbe5e1dad195d5e256746ef960f6e59aeddda1a55bdd553ee375ff53ae", size = 218906, upload-time = "2025-08-29T14:46:27.517Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ea/93/fc3723a7cde4a8edb2d060fb8abeba22270ae61984796ab653fdd05baca0/pydantic_ai_slim-0.7.2-py3-none-any.whl", hash = "sha256:f5749d63bf4c2deac45371874df30d1d76a1572ce9467f6505926ecb835da583", size = 289755, upload-time = "2025-08-14T22:59:53.346Z" }, + { url = "https://files.pythonhosted.org/packages/11/ce/8dbadd04f578d02a9825a46e931005743fe223736296f30b55846c084fab/pydantic_ai_slim-0.8.1-py3-none-any.whl", hash = "sha256:fc7edc141b21fe42bc54a2d92c1127f8a75160c5e57a168dba154d3f4adb963f", size = 297821, upload-time = "2025-08-29T14:46:14.647Z" }, ] [package.optional-dependencies] @@ -3647,6 +3660,7 @@ bedrock = [ cli = [ { name = "argcomplete" }, { name = "prompt-toolkit" }, + { name = "pyperclip" }, { name = "rich" }, ] cohere = [ @@ -3739,7 +3753,7 @@ wheels = [ [[package]] name = "pydantic-evals" -version = "0.7.2" +version = "0.8.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -3749,9 +3763,9 @@ dependencies = [ { name = "pyyaml" }, { name = "rich" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/32/b7/005b1b23b96abf2bce880a4c10496c00f8ebd67690f6888e576269059f54/pydantic_evals-0.7.2.tar.gz", hash = "sha256:0cf7adee67b8a12ea0b41e5162c7256ae0f6a237acb1eea161a74ed6cf61615a", size = 44086, upload-time = "2025-08-14T23:00:03.606Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/9d/460a1f2c9f5f263e9d8e9661acbd654ccc81ad3373ea43048d914091a817/pydantic_evals-0.8.1.tar.gz", hash = "sha256:c398a623c31c19ce70e346ad75654fcb1517c3f6a821461f64fe5cbbe0813023", size = 43933, upload-time = "2025-08-29T14:46:28.903Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/7c/6f/3b844991fc1223f9c3b201f222397b0d115e236389bd90ced406ebc478ea/pydantic_evals-0.7.2-py3-none-any.whl", hash = "sha256:c7497d89659c35fbcaefbeb6f457ae09d62e36e161c4b25a462808178b7cfa92", size = 52753, upload-time = "2025-08-14T22:59:55.018Z" }, + { url = "https://files.pythonhosted.org/packages/6f/f9/1d21c4687167c4fa76fd3b1ed47f9bc2d38fd94cbacd9aa3f19e82e59830/pydantic_evals-0.8.1-py3-none-any.whl", hash = "sha256:6c76333b1d79632f619eb58a24ac656e9f402c47c75ad750ba0230d7f5514344", size = 52602, upload-time = "2025-08-29T14:46:16.602Z" }, ] [[package]] @@ -3774,7 +3788,7 @@ pycountry = [ [[package]] name = "pydantic-graph" -version = "0.7.2" +version = "0.8.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "httpx" }, @@ -3782,9 +3796,9 @@ dependencies = [ { name = "pydantic" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/cf/a9/8a918b4dc2cd55775d854e076823fa9b60a390e4fbec5283916346556754/pydantic_graph-0.7.2.tar.gz", hash = "sha256:f90e4ec6f02b899bf6f88cc026dafa119ea5041ab4c62ba81497717c003a946e", size = 21804, upload-time = "2025-08-14T23:00:04.834Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/97/b35b7cb82d9f1bb6d5c6d21bba54f6196a3a5f593373f3a9c163a3821fd7/pydantic_graph-0.8.1.tar.gz", hash = "sha256:c61675a05c74f661d4ff38d04b74bd652c1e0959467801986f2f85dc7585410d", size = 21675, upload-time = "2025-08-29T14:46:29.839Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/12/d7/639c69dda9e4b4cf376c9f45e5eae96721f2dc2f2dc618fb63142876dce4/pydantic_graph-0.7.2-py3-none-any.whl", hash = "sha256:b6189500a465ce1bce4bbc65ac5871149af8e0f81a15d54540d3dfc0cc9b2502", size = 27392, upload-time = "2025-08-14T22:59:56.564Z" }, + { url = "https://files.pythonhosted.org/packages/3d/e3/5908643b049bb2384d143885725cbeb0f53707d418357d4d1ac8d2c82629/pydantic_graph-0.8.1-py3-none-any.whl", hash = "sha256:f1dd5db0fe22f4e3323c04c65e2f0013846decc312b3efc3196666764556b765", size = 27239, upload-time = "2025-08-29T14:46:18.317Z" }, ] [[package]] @@ -3871,6 +3885,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/29/7d/5945b5af29534641820d3bd7b00962abbbdfee84ec7e19f0d5b3175f9a31/pynacl-1.6.2-cp38-abi3-win_arm64.whl", hash = "sha256:834a43af110f743a754448463e8fd61259cd4ab5bbedcf70f9dabad1d28a394c", size = 184801, upload-time = "2026-01-01T17:32:36.309Z" }, ] +[[package]] +name = "pyperclip" +version = "1.11.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/52/d87eba7cb129b81563019d1679026e7a112ef76855d6159d24754dbd2a51/pyperclip-1.11.0.tar.gz", hash = "sha256:244035963e4428530d9e3a6101a1ef97209c6825edab1567beac148ccc1db1b6", size = 12185, upload-time = "2025-09-26T14:40:37.245Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/80/fc9d01d5ed37ba4c42ca2b55b4339ae6e200b456be3a1aaddf4a9fa99b8c/pyperclip-1.11.0-py3-none-any.whl", hash = "sha256:299403e9ff44581cb9ba2ffeed69c7aa96a008622ad0c46cb575ca75b5b84273", size = 11063, upload-time = "2025-09-26T14:40:36.069Z" }, +] + [[package]] name = "pytest" version = "9.0.3" @@ -4935,7 +4958,7 @@ wheels = [ [[package]] name = "temporalio" -version = "1.25.0" +version = "1.16.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "nexus-rpc" }, @@ -4943,13 +4966,12 @@ dependencies = [ { name = "types-protobuf" }, { name = "typing-extensions" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/de/9c/3782bab0bf11a40b550147c19a5d1a476c17405391751982408902d9f138/temporalio-1.25.0.tar.gz", hash = "sha256:a3bbec1dcc904f674402cfa4faae480fda490b1c53ea5440c1f1996c562016fb", size = 2152534, upload-time = "2026-04-08T18:53:55.388Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/32/375ab75d0ebb468cf9c8abbc450a03d3a8c66401fc320b338bd8c00d36b4/temporalio-1.16.0.tar.gz", hash = "sha256:dd926f3e30626fd4edf5e0ce596b75ecb5bbe0e4a0281e545ac91b5577967c91", size = 1733873, upload-time = "2025-08-21T22:12:50.879Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/19/e3/5676dd10d1164b6d6ca8752314054097b89c5da931e936af402a7b15236c/temporalio-1.25.0-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:6dc1bc8e1773b1a833d86a7ede2dd90ef4e031ced5b748b59e7f09a5bf9b327d", size = 13943906, upload-time = "2026-04-08T18:53:30.022Z" }, - { url = "https://files.pythonhosted.org/packages/89/50/7cbf7f845973be986ec165348f72f7a409750842a04d554965a39be5cb4f/temporalio-1.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:3c8fdcf79ea5ae8ae2cf6f48072e4a86c3e0f4778f6a8a066c6ff1d336587db4", size = 13298719, upload-time = "2026-04-08T18:53:35.95Z" }, - { url = "https://files.pythonhosted.org/packages/d2/31/d474bab8535552add6ed289911bf1ffae5d7071823ece1069842190fcaed/temporalio-1.25.0-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:141f37aaafd7d090ba5c8776e4e9bc60df1fbc64b9f50c8f00e905a436588ddc", size = 13555435, upload-time = "2026-04-08T18:53:41.36Z" }, - { url = "https://files.pythonhosted.org/packages/2a/c8/e7dc053d6107bf2a037a3c9fe7b86639a25dcb888bde0e1ca366901ee47f/temporalio-1.25.0-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff7ca5bb80264976477d4dc7a839b3d22af8577ae92306526a061481db49bf92", size = 14052050, upload-time = "2026-04-08T18:53:46.44Z" }, - { url = "https://files.pythonhosted.org/packages/08/70/9340ed3a578321cbc153041d34834bb1ec3f1f3e3d9cded47cd1b7c3e403/temporalio-1.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:9411534279a2e64847231b6059c214bff4d57cfd1532bd09f333d0b1603daa7f", size = 14299684, upload-time = "2026-04-08T18:53:52.482Z" }, + { url = "https://files.pythonhosted.org/packages/e0/36/12bb7234c83ddca4b8b032c8f1a9e07a03067c6ed6d2ddb39c770a4c87c6/temporalio-1.16.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:547c0853310350d3e5b5b9c806246cbf2feb523f685b05bf14ec1b0ece8a7bb6", size = 12540769, upload-time = "2025-08-21T22:11:24.551Z" }, + { url = "https://files.pythonhosted.org/packages/3c/16/a7d402435b8f994979abfeffd3f5ffcaaeada467ac16438e61c51c9f7abe/temporalio-1.16.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b05bb0d06025645aed6f936615311a6774eb8dc66280f32a810aac2283e1258", size = 12968631, upload-time = "2025-08-21T22:11:48.375Z" }, + { url = "https://files.pythonhosted.org/packages/11/6f/16663eef877b61faa5fd917b3a63497416ec4319195af75f6169a1594479/temporalio-1.16.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a08aed4e0f6c2b6bfc779b714e91dfe8c8491a0ddb4c4370627bb07f9bddcfd", size = 13164612, upload-time = "2025-08-21T22:12:16.366Z" }, + { url = "https://files.pythonhosted.org/packages/af/0e/8c6704ca7033aa09dc084f285d70481d758972cc341adc3c84d5f82f7b01/temporalio-1.16.0-cp39-abi3-win_amd64.whl", hash = "sha256:7c190362b0d7254f1f93fb71456063e7b299ac85a89f6227758af82c6a5aa65b", size = 13177058, upload-time = "2025-08-21T22:12:44.239Z" }, ] [[package]]