diff --git a/.github/workflows/e2e-benchmarks.yml b/.github/workflows/e2e-benchmarks.yml index 20aa7350..abf33190 100644 --- a/.github/workflows/e2e-benchmarks.yml +++ b/.github/workflows/e2e-benchmarks.yml @@ -35,7 +35,7 @@ jobs: env: SMOKE_ENV: ${{ matrix.env }} ENABLE_TEST_HARNESS: "1" - ERGON_STARTUP_PLUGINS: "ergon_core.test_support.smoke_fixtures:register_smoke_fixtures" + ERGON_STARTUP_PLUGINS: "ergon_builtins.registry:register_builtins,tests.fixtures.smoke_components:register_smoke_fixtures" TEST_HARNESS_SECRET: ${{ secrets.TEST_HARNESS_SECRET || 'ci-test-harness' }} E2B_API_KEY: ${{ secrets.E2B_API_KEY }} GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} @@ -74,7 +74,7 @@ jobs: # Unified compose reads these as overrides (see docker-compose.yml). POSTGRES_PASSWORD: ci_test ENABLE_TEST_HARNESS: "1" - ERGON_STARTUP_PLUGINS: "ergon_core.test_support.smoke_fixtures:register_smoke_fixtures" + ERGON_STARTUP_PLUGINS: "ergon_builtins.registry:register_builtins,tests.fixtures.smoke_components:register_smoke_fixtures" run: docker compose up -d --build --wait timeout-minutes: 5 diff --git a/.gitignore b/.gitignore index 6e6134c7..7b081c5f 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ build/ # Environment .env +.logfire/ # Databases *.db diff --git a/Dockerfile b/Dockerfile index 2b481776..e9fb0b61 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,4 +37,4 @@ RUN cd ergon_cli && uv pip install --system -e "." EXPOSE 9000 -CMD ["uvicorn", "ergon_core.core.api.app:app", "--host", "0.0.0.0", "--port", "9000"] +CMD ["uvicorn", "ergon_core.core.rest_api.app:app", "--host", "0.0.0.0", "--port", "9000"] diff --git a/docker-compose.yml b/docker-compose.yml index 2adb82bf..6fdbdb7c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -84,7 +84,7 @@ services: - INNGEST_API_BASE_URL=http://inngest-dev:8288 - ERGON_API_BASE_URL=http://api:9000 - ENABLE_TEST_HARNESS=${ENABLE_TEST_HARNESS:-1} - - ERGON_STARTUP_PLUGINS=${ERGON_STARTUP_PLUGINS-ergon_core.test_support.smoke_fixtures:register_smoke_fixtures} + - ERGON_STARTUP_PLUGINS=${ERGON_STARTUP_PLUGINS-ergon_builtins.registry:register_builtins,tests.fixtures.smoke_components:register_smoke_fixtures} - TEST_HARNESS_SECRET=${TEST_HARNESS_SECRET:-local-dev} - ERGON_BLOB_ROOT=/tmp/ergon-blob - OTEL_TRACES_ENABLED=false @@ -120,7 +120,7 @@ services: postgres: condition: service_healthy command: > - uvicorn ergon_core.core.api.app:app + uvicorn ergon_core.core.rest_api.app:app --host 0.0.0.0 --port 9000 --reload --reload-dir /app/ergon_core --reload-dir /app/ergon_builtins diff --git a/docs/architecture/03_providers.md b/docs/architecture/03_providers.md index 89d9a90a..7b1d900e 100644 --- a/docs/architecture/03_providers.md +++ b/docs/architecture/03_providers.md @@ -2,26 +2,28 @@ ## 1. Purpose -The providers layer is Ergon's boundary between runtime code and external execution substrates. It owns four concerns: resolving `model_id` strings to `pydantic_ai.models.Model` instances, provisioning and tearing down E2B sandboxes via per-benchmark manager subclasses, surfacing sandbox state transitions as dashboard events, and publishing worker outputs as content-addressed blobs that evaluators can re-read. Everything that crosses the process boundary (LLM API, container runtime, blob storage) is routed through this layer so the runtime, workers, and evaluators stay substrate-agnostic. +The provider-style boundaries are Ergon's adapters between runtime code and external execution substrates. 
Model resolution lives in the generation registry; sandbox infrastructure now lives under `ergon_core.core.sandbox` because it owns lifecycle, instrumentation, event emission, and artifact publishing rather than merely adapting a third-party provider.

## 2. Core abstractions

| Name | Kind | Location | Freeze status | Owner |
| --- | --- | --- | --- | --- |
+| `_BACKEND_REGISTRY` | module-level dict | `ergon_core/core/providers/generation/model_resolution.py` | Frozen shape; entries grow via registration. | Providers layer. |
| `resolve_model_target` | function | `ergon_core/core/providers/generation/model_resolution.py` | Public, frozen signature. Returns `ResolvedModel`. | Providers layer. |
-| `BaseSandboxManager` | abstract class + singleton | `ergon_core/core/providers/sandbox/manager.py` | Shape stable; `event_sink` activation path in flux. | Providers layer. |
-| `DefaultSandboxManager` | concrete class | `ergon_core/core/providers/sandbox/manager.py` | Frozen. | Providers layer. |
+| `register_model_backend` | function | `ergon_core/core/providers/generation/model_resolution.py` | Public, frozen signature. | Providers layer; callers are backend modules executing at import time. |
+| `BaseSandboxManager` | abstract class + singleton | `ergon_core/core/sandbox/manager.py` | Shape stable; `event_sink` activation path in flux. | Sandbox domain. |
+| `DefaultSandboxManager` | concrete class | `ergon_core/core/sandbox/manager.py` | Frozen. | Sandbox domain. |
| `SWEBenchSandboxManager`, `MiniF2FSandboxManager`, `ResearchRubricsSandboxManager` | concrete subclasses | `ergon_builtins/` | Owned per benchmark; singletons. | Benchmark authors. |
-| `SandboxEventSink` | `typing.Protocol` | `ergon_core/core/providers/sandbox/event_sink.py` | Frozen protocol; activation path in flux. | Providers layer. |
-| `NoopSandboxEventSink`, `DashboardEmitterSandboxEventSink` | implementations | `ergon_core/core/providers/sandbox/event_sink.py` | Frozen. | Providers layer. |
-| `SandboxResourcePublisher` | class | `ergon_core/core/providers/sandbox/resource_publisher.py` | Frozen API; storage backend swappable via `ERGON_BLOB_ROOT`. | Providers layer. |
+| `SandboxEventSink` | `typing.Protocol` | `ergon_core/core/sandbox/event_sink.py` | Frozen protocol; activation path in flux. | Sandbox domain. |
+| `NoopSandboxEventSink`, `DashboardEmitterSandboxEventSink` | implementations | `ergon_core/core/sandbox/event_sink.py` | Frozen. | Sandbox domain. |
+| `SandboxResourcePublisher` | class | `ergon_core/core/sandbox/resource_publisher.py` | Frozen API; storage backend swappable via `ERGON_BLOB_ROOT`. | Sandbox domain. |
| `TransformersModel` | `pydantic_ai.models.Model` subclass | `ergon_builtins/ergon_builtins/models/transformers_backend.py` | Frozen. | ML team (TRL training loop callers). |

-### 2.1 Model target resolution
+### 2.1 Generation registry

-`resolve_model_target` is the single dispatch point for model target strings. It splits the target on its first colon and returns a `ResolvedModel` wrapping a concrete `pydantic_ai.models.Model` instance. Unknown prefixes raise immediately instead of falling through to PydanticAI inference.
+`_BACKEND_REGISTRY` is a prefix-keyed dispatch table of resolver callables. `resolve_model_target` splits the target on its first colon, dispatches to the resolver, and returns a `ResolvedModel` wrapping either a `pydantic_ai.models.Model` instance or a passthrough string. 
Unknown prefixes fall through to a passthrough `ResolvedModel` — PydanticAI's own `infer_model` is invoked on use. Backends mutate the registry at import time; the builtins pack registers all four in a single loop at `ergon_builtins/ergon_builtins/registry.py:81`. -The supported prefixes are `vllm:[#]`, `openai-compatible:#`, and cloud provider prefixes `openai:*` / `anthropic:*` / `google:*`. Cloud provider prefixes always route through OpenRouter via PydanticAI's OpenRouter provider; they do not call direct OpenAI, Anthropic, or Google APIs. +The four prefixes registered today are `vllm:*` (local vLLM server via PydanticAI's `OpenAIChatModel`), `openai:*` / `anthropic:*` / `google:*` (passthrough to `infer_model`), and `transformers:*` (custom `TransformersModel` for TRL-trained checkpoints not served over vLLM). Workers are expected to hold no hardcoded SDK client constructions (`AsyncOpenAI`, `anthropic.Client`, `genai.Client`). This is an invariant (Section 4), not a coincidence, and is currently honored — enforcement is grep discipline. @@ -85,7 +87,7 @@ The decentralized shape means `ergon benchmark setup` iterates over whatever sub Worker.execute() | +-> resolve_model_target(self.model) --> ResolvedModel - | (explicit prefix dispatch; cloud targets route via OpenRouter) + | (prefix dispatch; 4 backends + fallthrough to infer_model) | +-> ManagerClass() (singleton; returns cached instance) | ManagerClass().create(sandbox_key=task_id, run_id=run_id, ...) @@ -124,7 +126,7 @@ Movement of data across this diagram: ## 4. Invariants 1. **One entry point to LLM resolution.** Every model reference goes through `resolve_model_target`. Enforced by grep discipline and review; no runtime check. -2. **Cloud provider prefixes use OpenRouter.** `openai:*`, `anthropic:*`, and `google:*` model targets are OpenRouter-hosted targets. Direct cloud SDK model routing is intentionally outside the grammar. +2. **Backends register at import time.** `register_model_backend` must be called before any caller hits `resolve_model_target`. Enforced by the builtins pack running its registration loop at import, before any worker module imports. 3. **Singleton managers hold authoritative sandbox state.** A subclass's class-level state is the only source of truth for in-process reconnect. Enforced by `__new__` caching the instance and `get_sandbox` reading the class dict. Applies only within a single Python process; cross-process actors must use `terminate_by_sandbox_id` or provision their own sandbox. 4. **Sandbox lifecycle is per-task.** Enforced by `create` accepting `sandbox_key` and by the worker runtime persisting `sandbox_id` on the execution row. 5. **Sandbox lives across evaluator fan-out.** Teardown runs at the end of `check_evaluators`, not at worker completion, not in `finalize_success`. Enforced by the evaluator harness, not by the manager itself. @@ -144,9 +146,10 @@ Movement of data across this diagram: ### 5.1 Add a new LLM backend -1. Add an explicit prefix branch in `resolve_model_target` and keep the constructor logic in a sibling module under `ergon_core/core/providers/generation/`. -2. Return a concrete `pydantic_ai.models.Model` instance wrapped in `ResolvedModel`. -3. Add an entry to `LLMProvider` and `PROVIDER_KEY_MAP` in `ergon_cli/onboarding/profile.py` so onboarding prompts for the key or server URL. +1. Write a resolver that maps `"myprefix:foo"` to a `pydantic_ai.models.Model` instance wrapped in `ResolvedModel`. +2. 
Register it in the builtins-pack registration loop so `register_model_backend` is called at import time. +3. Ensure the builtins pack is imported before any worker that references `myprefix:*` model ids. +4. Add an entry to `LLMProvider` and `PROVIDER_KEY_MAP` in `ergon_cli/onboarding/profile.py` so onboarding prompts for the key or server URL. ### 5.2 Add a new sandbox manager diff --git a/docs/architecture/07_testing.md b/docs/architecture/07_testing.md index 292d5212..69ceb37e 100644 --- a/docs/architecture/07_testing.md +++ b/docs/architecture/07_testing.md @@ -25,17 +25,17 @@ Path-based, not marker-based. The local gate and the CI workflow both dispatch b Every PR runs three benchmark legs in parallel via `.github/workflows/e2e-benchmarks.yml`: -| Leg | Slot 1 | Slot 2 | Slot 3 | -|---|---|---|---| -| `researchrubrics` | happy | happy | **sad** — `l_2` forced FAIL | -| `minif2f` | happy | happy | happy | -| `swebench-verified` | happy | happy | happy | +| Leg | Slot 1 | Slot 2 | +|---|---|---| +| `researchrubrics` | happy | **sad** — `l_2` forced FAIL | +| `minif2f` | happy | **sad** — `l_2` forced FAIL | +| `swebench-verified` | happy | **sad** — `l_2` forced FAIL | -**9 top-level runs per PR; 80 leaf sandbox acquisitions** (8 happy × 9 leaves + 1 sad × 8 leaves — `l_3` never provisioned because its dependency failed). +**6 top-level runs per PR; 57 dynamic child sandbox acquisitions** (3 happy × 11 child tasks + 3 sad × 8 child tasks — `l_3` never provisions on sad runs because its dependency failed). -### 3.1 Immutable 9-leaf DAG +### 3.1 Smoke DAG -Every smoke run — happy or sad — spawns exactly this graph: +Every smoke run starts with the same 9 direct children: ``` Diamond (4): Line (3): Singletons (2): @@ -46,9 +46,18 @@ d_left d_right d_join ``` -Topology is enforced by `tests/e2e/_fixtures/smoke_base/worker_base.py::SmokeWorkerBase.execute` being decorated `@typing.final`. Subclasses supply the leaf slug via `leaf_slug` and (optionally) override `_spec_for(slug, deps, desc)` to route specific slugs elsewhere — the sad-path subclass uses this to route `l_2` to a failing leaf. They cannot change the DAG itself. +Happy-path runs route top-level `l_2` to `{env}-smoke-recursive-worker`, which plans a nested two-node line under `l_2`: -The single source of truth for topology is [`tests/e2e/_fixtures/smoke_base/constants.py`](../../tests/e2e/_fixtures/smoke_base/constants.py): +```text +l_2 +└─ l_2_a → l_2_b +``` + +Top-level `l_3` depends on `l_2`, so the smoke proves dependency propagation waits for a non-leaf dynamic task before releasing downstream work. Sad-path runs route `l_2` to the failing leaf instead, so `l_3` remains blocked. + +Topology is enforced by `ergon_core/test_support/smoke_fixtures/smoke_base/worker_base.py::SmokeWorkerBase.execute` being decorated `@typing.final`. Subclasses supply the leaf slug via `leaf_slug` and override `_spec_for(slug, deps, desc)` only to route specific slugs elsewhere. They cannot change the direct-child DAG itself. + +The single source of truth for the direct-child topology is [`ergon_core/test_support/smoke_fixtures/smoke_base/constants.py`](../../ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/constants.py): ```python EXPECTED_SUBTASK_SLUGS = ( @@ -60,29 +69,32 @@ EXPECTED_SUBTASK_SLUGS = ( ### 3.2 Fixture residency — test-only, out of `ergon_builtins` -`ergon_builtins/` contains only production baselines (ReActWorker, TrainingStubWorker). 
All smoke workers, leaves, and criteria live under [`tests/e2e/_fixtures/`](../../tests/e2e/_fixtures/) and register into the process-level `WORKERS` / `EVALUATORS` dicts via an import side-effect in `tests/e2e/_fixtures/__init__.py`, which `tests/e2e/conftest.py` imports at session start. +`ergon_builtins/` contains only production baselines (ReActWorker, TrainingStubWorker). All smoke workers, leaves, and criteria live under [`tests/fixtures/smoke_components/`](../../tests/fixtures/smoke_components/) and register into the process-level core component registry through `register_smoke_fixtures()`. -11 registry rows total — none production: +19 registry rows total — none production: | Slug | Kind | |---|---| | `{env}-smoke-worker` × 3 | Worker (parent) — inherits `SmokeWorkerBase` | | `{env}-smoke-leaf` × 3 | Worker (leaf) — inherits `BaseSmokeLeafWorker` | -| `researchrubrics-sadpath-smoke-worker` | Worker (sad-path parent) | -| `researchrubrics-smoke-leaf-failing` | Worker (sad-path failing leaf) | +| `{env}-smoke-recursive-worker` × 3 | Worker (nested `l_2` parent) — inherits `RecursiveSmokeWorkerBase` | +| `{env}-sadpath-smoke-worker` × 3 | Worker (sad-path parent) | +| `{env}-smoke-leaf-failing` × 3 | Worker (sad-path failing leaf) | | `{env}-smoke-criterion` × 3 | Criterion — inherits `SmokeCriterionBase` | +| `smoke-post-root-timing-criterion` | Criterion — second root evaluator used for timing assertions | where `{env} ∈ {researchrubrics, minif2f, swebench}`. ### 3.3 Turn persistence - Parent `SmokeWorkerBase.execute` yields **3** `GenerationTurn`s (planning → planned → awaiting) so incremental turn persistence is exercised on every run. +- Happy-path recursive `l_2` yields **3** `GenerationTurn`s. - Each leaf `BaseSmokeLeafWorker.execute` yields **2** turns (attaching → done). -- Total per happy run: **1 × 3 + 9 × 2 = 21** `GenerationTurn` rows; driver asserts on this. +- Total per happy run: **3 + 3 + 10 × 2 = 26** `GenerationTurn` rows; driver asserts on this. ### 3.4 Inter-agent messaging -Each happy-path leaf calls `CommunicationService.save_message` once on the `smoke-completion` thread (first production caller of that service). 9 `ThreadMessage` rows per happy run, sequence_num 1..9 per thread. Sad-path `l_2` raises before reaching this call — 8 messages on a sad run, with `l_2` missing. +Each happy-path leaf calls `CommunicationService.save_message` once on the `smoke-completion` thread (first production caller of that service). The recursive `l_2` worker also sends one completion message after nested children finish. Happy runs emit 11 `ThreadMessage` rows (`9` direct slugs + `l_2_a`, `l_2_b`), sequence_num 1..11 per thread. Sad-path `l_2` raises before reaching this call and `l_3` blocks — 7 messages on a sad run, with `l_2` and `l_3` missing. 
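+
+A minimal sketch of the counting behind these assertions (`PARENT_TURN_COUNT` and `LEAF_TURN_COUNT` come from the driver's constants; the remaining names are illustrative, not real symbols):
+
+```python
+PARENT_TURN_COUNT = 3      # root parent: planning -> planned -> awaiting
+RECURSIVE_TURN_COUNT = 3   # happy-path l_2 recursive parent
+LEAF_TURN_COUNT = 2        # every leaf: attaching -> done
+
+direct_leaves = 8          # 9 direct children minus the recursive l_2 parent
+nested_leaves = 2          # l_2_a and l_2_b
+
+# GenerationTurn rows per happy run.
+turns = PARENT_TURN_COUNT + RECURSIVE_TURN_COUNT + (direct_leaves + nested_leaves) * LEAF_TURN_COUNT
+assert turns == 26
+
+# ThreadMessage rows per happy run: one per direct slug (l_2's message comes
+# from the recursive worker) plus one per nested leaf.
+assert 9 + nested_leaves == 11
+```
+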
### 3.5 Sandbox-side checks
@@ -98,14 +110,14 @@ For each run in a cohort, the pytest driver asserts:

| Channel | What it checks |
|---|---|
-| `RunGraphNode` | 10 nodes (1 root + 9 leaves); all COMPLETED (happy) or cascade pattern (sad); `sorted(slugs) == EXPECTED_SUBTASK_SLUGS` |
-| `RunGraphEdge` | 6 expected dependency edges (diamond + line) |
-| `RunResource` | ≥ 18 rows (9 outputs + 9 probes); all with non-empty `content_hash` |
-| `GenerationTurn` | Exactly 21 rows per happy run (derived from `PARENT_TURN_COUNT + 9 × LEAF_TURN_COUNT`) |
-| `ThreadMessage` (topic `smoke-completion`) | 9 messages per happy run / 8 per sad; `sequence_num` strictly 1..N |
+| `RunGraphNode` | Happy: 12 nodes (1 root + 9 direct children + 2 nested children), all COMPLETED; sad: cascade pattern with `l_2` FAILED and `l_3` BLOCKED |
+| `RunGraphEdge` | Expected dependency edges (diamond, top-level line, nested `l_2_a → l_2_b`) |
+| `RunResource` | Happy: 20 rows (10 outputs + 10 probes); all with non-empty `content_hash` |
+| `GenerationTurn` | Exactly 26 rows per happy run |
+| `ThreadMessage` (topic `smoke-completion`) | 11 messages per happy run / 7 per sad; `sequence_num` strictly 1..N |
| Blob store round-trip | Re-read of one probe JSON is byte-stable + parses |
| Temporal ordering | `RunTaskExecution.started_at` of children ≥ `completed_at` of parents |
-| `RunTaskEvaluation` | Exactly 1 row; score 1.0 (happy) / 0.0 (sad); failed slug named in sad feedback |
+| `RunTaskEvaluation` | Happy: 2 root rows, both score 1.0 and created after root execution completion; sad: no successful final score |

Sad-path adds: partial artifact persisted (partial_*.md exists as RunResource), pre-failure WAL entry present, `l_3` status BLOCKED/CANCELLED per RFC `static-sibling-failure-semantics`.

@@ -153,7 +165,7 @@ Required `data-testid` attributes: `run-status`, `task-node-{slug}` (one per `EX
-3. **Test stubs live in `tests/e2e/_fixtures/`, not `ergon_builtins/`.** Production registry (`ergon_builtins/registry_core.py`) contains only production baselines. Exception: `training_stub_worker.py` — it's a real RL-trajectory baseline, not test scaffolding; operators invoke it via CLI.
+3. **Test stubs live in `tests/fixtures/smoke_components/` (shared bases in `ergon_core/test_support/smoke_fixtures/`), not `ergon_builtins/`.** Production registry (`ergon_builtins/registry_core.py`) contains only production baselines. Exception: `training_stub_worker.py` — it's a real RL-trajectory baseline, not test scaffolding; operators invoke it via CLI.
4. **Criteria reconnect via the CriterionRuntime DI container, never via `AsyncSandbox.connect` directly.** Enforced by code inspection; the anti-pattern previously fixed by `bugs/fixed/2026-04-18-swebench-criterion-spawns-sandbox.md`.
5. **Sandbox outlives the task until all criteria finish.** RFC `sandbox-lifetime-covers-criteria`. Smoke is the living regression test for this.
-6. **Cohort parallelism exercised on every PR.** 3-run cohorts prove concurrent workflow submission and cohort aggregation at the scale smoke uses.
+6. **Cohort parallelism exercised on every PR.** 2-run happy/sad cohorts prove concurrent workflow submission and cohort aggregation at the scale smoke uses.
7. **Partial work persists on FAILED leaves.** Sad-path `AlwaysFailSubworker` writes a file + runs a probe command, then raises. Driver asserts the partial artifact and pre-failure WAL entry survive.

## 9. 
Budget @@ -161,10 +173,10 @@ Required `data-testid` attributes: `run-status`, `task-node-{slug}` (one per `EX | Measure | Value | |---|---| | Per matrix leg | 10-min job timeout; 5-min pytest timeout | -| Leaf-subtask sandbox acquisitions per leg | 26 or 27 (researchrubrics has 26 because the sad slot skips `l_3`) | -| Leaf-subtask sandbox acquisitions per PR | 80 across 3 sandbox images | +| Dynamic child sandbox acquisitions per leg | 19 (1 happy × 11 child tasks + 1 sad × 8 child tasks) | +| Dynamic child sandbox acquisitions per PR | 57 across 3 sandbox images | | Parent-task sandbox per run | 1 (used by parent worker + attached to by the criterion). Not additional at evaluation time. | -| Parallel workflow runs per PR | 9 (3 legs × 3-run cohort) | +| Parallel workflow runs per PR | 6 (3 legs × 2-run cohort) | | Warm wall-clock per leg | 1–3 min (post-Docker cache) | | Cold wall-clock per leg | up to 5 min | diff --git a/docs/architecture/cross_cutting/artifacts.md b/docs/architecture/cross_cutting/artifacts.md index bc6b5fe9..04506b02 100644 --- a/docs/architecture/cross_cutting/artifacts.md +++ b/docs/architecture/cross_cutting/artifacts.md @@ -15,7 +15,7 @@ produces computed artifacts through `CriterionRuntime.run_command(...)`. | Type | Location | Freeze | Owner | |------|----------|--------|-------| -| `SandboxResourcePublisher` | `ergon_core/core/providers/sandbox/resource_publisher.py` | Stable | Sandbox provider | +| `SandboxResourcePublisher` | `ergon_core/core/sandbox/resource_publisher.py` | Stable | Sandbox domain | | `RunResource` | ORM row; table `run_resources` | Stable wire shape | Persistence layer | | `dashboard/resource.published` | Inngest event | Stable | Dashboard lane | | `CriterionRuntime.read_resource(name)` | Proposed per RFC | Pending | Evaluator layer | diff --git a/docs/dead-code-audit-2026-04-25.md b/docs/dead-code-audit-2026-04-25.md index 8fa1c212..8551b54a 100644 --- a/docs/dead-code-audit-2026-04-25.md +++ b/docs/dead-code-audit-2026-04-25.md @@ -130,7 +130,7 @@ alternative control flow, not just unused helpers. | Area | File | Symbol / module | Current evidence | Decision | Why | Risk | Follow-up test/check | | --- | --- | --- | --- | --- | --- | --- | --- | | Core utils | `core/utils.py` | `get_mime_type` | No repo-wide caller. | Delete | Small unused helper. | Low | Search after deletion. | -| OpenRouter budget | `core/providers/generation/openrouter_budget.py` | `OpenRouterBudget` | Mostly referenced from tests/fixtures/benchmarks rather than active production modules. | Keep | Useful for real-LLM test budget gating. Not dead in the test harness context. | Low | None. | +| OpenRouter budget | `tests/real_llm/openrouter_budget.py` | `OpenRouterBudget` | Referenced from real-LLM fixtures/benchmarks rather than active production modules. | Keep test-local | Useful for real-LLM test budget gating. Not part of core runtime. | Low | None. | | Dashboard emitter | `core/dashboard/emitter.py` | `_RunContextEvent` import | Vulture flags unused import. | Delete | Straight unused import cleanup. | Low | Run lint/type check. | | RL extraction | `core/rl/extraction.py` | `add_special_tokens` parameter on `Tokenizer.encode()` protocol | Vulture flags it, but it is part of a `Protocol` signature matching common tokenizer APIs. Callers intentionally use bare `tokenizer.encode(...)`. | Keep | Static-analysis false positive. The parameter documents compatibility with tokenizer implementations such as Hugging Face tokenizers. 
| Low | If vulture noise matters, suppress/allowlist instead of deleting the protocol parameter. | diff --git a/docs/experiments/rq1-cli-specialism/changelog.md b/docs/experiments/rq1-cli-specialism/changelog.md index 11cb5f91..2f56b398 100644 --- a/docs/experiments/rq1-cli-specialism/changelog.md +++ b/docs/experiments/rq1-cli-specialism/changelog.md @@ -293,5 +293,3 @@ Runs append below. Each entry should include command, env knobs, rollout artifac - Removed the builtins model-backend registration path and the old `cloud_passthrough.py` / `vllm_backend.py` modules. - Note: - The installed PydanticAI version exposes `OpenRouterProvider` but not `OpenRouterModel`; the implementation uses `OpenAIChatModel(..., provider=OpenRouterProvider(...))`, which gives the desired OpenRouter routing semantics. - - diff --git a/docs/real-llm-rollout-harness.md b/docs/real-llm-rollout-harness.md index 813f6f95..902c4706 100644 --- a/docs/real-llm-rollout-harness.md +++ b/docs/real-llm-rollout-harness.md @@ -45,7 +45,7 @@ Shipped (PR 1): flag, session fixtures wired. - `fixtures/stack.py` — docker-compose up/wait/down against the unified `docker-compose.yml`. -- `fixtures/openrouter_budget.py` + `ergon_core/.../openrouter_budget.py` +- `openrouter_budget.py` + `fixtures/openrouter_budget.py` — live spend check against `/api/v1/auth/key`. - `fixtures/harness_client.py` — polls `/api/test/read/run/{id}/state` for terminal status. diff --git a/docs/rfcs/active/2026-04-17-cleanup-cancelled-task-release-sandbox.md b/docs/rfcs/active/2026-04-17-cleanup-cancelled-task-release-sandbox.md index 22b7426d..a207dbe5 100644 --- a/docs/rfcs/active/2026-04-17-cleanup-cancelled-task-release-sandbox.md +++ b/docs/rfcs/active/2026-04-17-cleanup-cancelled-task-release-sandbox.md @@ -51,7 +51,7 @@ to a `type[BaseSandboxManager]` (not an instance). The cleanup function would need to resolve the class and call the static method `BaseSandboxManager.terminate_by_sandbox_id(sandbox_id)`. `terminate_by_sandbox_id` is a `@staticmethod` at -`ergon_core/ergon_core/core/providers/sandbox/manager.py:472-490` that calls +`ergon_core/ergon_core/core/sandbox/manager.py:472-490` that calls `AsyncSandbox.kill(sandbox_id=..., api_key=...)` directly via E2B, so no instance is needed. However, `cleanup_cancelled_task_fn` currently has no import path to `SANDBOX_MANAGERS`. 
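+
+A minimal sketch of that call shape (the wrapper name and the boolean return
+are assumptions; the static method and import path are as described above):
+
+```python
+from ergon_core.core.sandbox.manager import BaseSandboxManager
+
+
+async def _release_sandbox(sandbox_id: str | None) -> bool:
+    """Terminate by sandbox_id alone; no manager instance is required."""
+    if not sandbox_id:
+        return False
+    # Staticmethod: delegates to AsyncSandbox.kill(sandbox_id=..., api_key=...).
+    return await BaseSandboxManager.terminate_by_sandbox_id(sandbox_id)
+```
+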
@@ -278,7 +278,7 @@ import logging import inngest from ergon_builtins.registry import SANDBOX_MANAGERS -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager from ergon_core.core.runtime.events.task_events import TaskCancelledEvent from ergon_core.core.runtime.inngest_client import RUN_CANCEL, inngest_client from ergon_core.core.runtime.services.task_cleanup_dto import CleanupResult @@ -712,13 +712,13 @@ class TestReleaseSandboxStep: async def test_releases_sandbox_when_fields_present(self) -> None: """terminate_by_sandbox_id called exactly once for valid payload.""" with patch( - "ergon_core.core.providers.sandbox.manager.BaseSandboxManager" + "ergon_core.core.sandbox.manager.BaseSandboxManager" ".terminate_by_sandbox_id", new_callable=AsyncMock, return_value=True, ) as mock_terminate: from ergon_builtins.registry import SANDBOX_MANAGERS - from ergon_core.core.providers.sandbox.manager import BaseSandboxManager + from ergon_core.core.sandbox.manager import BaseSandboxManager # Any known slug from SANDBOX_MANAGERS slug = next(iter(SANDBOX_MANAGERS)) diff --git a/docs/rfcs/active/2026-04-17-sandbox-lifetime-covers-criteria.md b/docs/rfcs/active/2026-04-17-sandbox-lifetime-covers-criteria.md index 5b6c0bda..e54a8f9e 100644 --- a/docs/rfcs/active/2026-04-17-sandbox-lifetime-covers-criteria.md +++ b/docs/rfcs/active/2026-04-17-sandbox-lifetime-covers-criteria.md @@ -14,7 +14,7 @@ superseded_by: null ### Current state `BaseSandboxManager.create()` at -`ergon_core/ergon_core/core/providers/sandbox/manager.py:226` accepts a single +`ergon_core/ergon_core/core/sandbox/manager.py:226` accepts a single `timeout_minutes: int = 30` parameter. Every call site passes a literal or relies on the default: @@ -145,7 +145,7 @@ reconnect path; `CriterionRuntime.ensure_sandbox()` will call it once RFC **Change 3 — Define `SandboxExpiredError`.** New exception class at -`ergon_core/ergon_core/core/providers/sandbox/errors.py`. Subclasses the base +`ergon_core/ergon_core/core/sandbox/errors.py`. Subclasses the base `Exception` (not `ErgonNonRetriableError` — sandbox expiry is not a definition-level error; it is a transient infrastructure condition). Criteria that catch it should surface a `"sandbox-expired"` evaluation outcome rather @@ -225,7 +225,7 @@ SandboxSetupRequest (payload) ## Type / interface definitions ```python -# ergon_core/ergon_core/core/providers/sandbox/errors.py +# ergon_core/ergon_core/core/sandbox/errors.py """Sandbox-specific exception types.""" @@ -260,7 +260,7 @@ class SandboxExpiredError(SandboxError): ### `errors.py` (new file) ```python -# ergon_core/ergon_core/core/providers/sandbox/errors.py +# ergon_core/ergon_core/core/sandbox/errors.py """Sandbox-specific exception types.""" @@ -291,7 +291,7 @@ class SandboxExpiredError(SandboxError): ### `reconnect` method (added to `BaseSandboxManager`) ```python -# Added to: ergon_core/ergon_core/core/providers/sandbox/manager.py +# Added to: ergon_core/ergon_core/core/sandbox/manager.py # Location: after get_sandbox() at line 394, before get_sandbox_path() async def reconnect(self, sandbox_id: str) -> "AsyncSandbox": @@ -308,7 +308,7 @@ async def reconnect(self, sandbox_id: str) -> "AsyncSandbox": This method does NOT register the sandbox in class-level state; callers should not assume it shows up in _sandboxes. 
""" - from ergon_core.core.providers.sandbox.errors import SandboxExpiredError + from ergon_core.core.sandbox.errors import SandboxExpiredError if AsyncSandbox is None: raise RuntimeError( @@ -331,7 +331,7 @@ async def reconnect(self, sandbox_id: str) -> "AsyncSandbox": ### Updated `create()` signature — `BaseSandboxManager` ```python -# ergon_core/ergon_core/core/providers/sandbox/manager.py +# ergon_core/ergon_core/core/sandbox/manager.py # Replace lines 226-295 (existing create method) async def create( @@ -423,7 +423,7 @@ async def create( ### Updated `DefaultSandboxManager.create()` override ```python -# ergon_core/ergon_core/core/providers/sandbox/manager.py +# ergon_core/ergon_core/core/sandbox/manager.py # Replace lines 503-526 (existing DefaultSandboxManager.create override) async def create( @@ -457,21 +457,21 @@ async def create( ### Updated `__init__.py` (sandbox package) ```python -# ergon_core/ergon_core/core/providers/sandbox/__init__.py +# ergon_core/ergon_core/core/sandbox/__init__.py # Add SandboxExpiredError, SandboxError to exports """Sandbox management: provisioning, file I/O, lifecycle.""" -from ergon_core.core.providers.sandbox.errors import ( +from ergon_core.core.sandbox.errors import ( SandboxError, SandboxExpiredError, ) -from ergon_core.core.providers.sandbox.event_sink import ( +from ergon_core.core.sandbox.event_sink import ( DashboardEmitterSandboxEventSink, NoopSandboxEventSink, SandboxEventSink, ) -from ergon_core.core.providers.sandbox.manager import ( +from ergon_core.core.sandbox.manager import ( BaseSandboxManager, DefaultSandboxManager, DownloadedFile, @@ -495,7 +495,7 @@ __all__ = [ ## Exact diffs for modified files -### `ergon_core/ergon_core/core/providers/sandbox/manager.py` +### `ergon_core/ergon_core/core/sandbox/manager.py` ```diff @@ -226,13 +226,16 @@ class BaseSandboxManager(ABC): @@ -559,7 +559,7 @@ __all__ = [ + sandbox is not found or has already timed out. Idempotent. + Does NOT register in class-level _sandboxes state. + """ -+ from ergon_core.core.providers.sandbox.errors import SandboxExpiredError ++ from ergon_core.core.sandbox.errors import SandboxExpiredError + + if AsyncSandbox is None: + raise RuntimeError( @@ -640,22 +640,22 @@ __all__ = [ Note: `reset_timeout` call changes from 30 to 40 to match the new provisioned total. The signature of `reset_timeout` at `manager.py:407` is unchanged (still accepts `timeout_minutes`). -### `ergon_core/ergon_core/core/providers/sandbox/__init__.py` +### `ergon_core/ergon_core/core/sandbox/__init__.py` ```diff @@ -1,6 +1,11 @@ """Sandbox management: provisioning, file I/O, lifecycle.""" -+from ergon_core.core.providers.sandbox.errors import ( ++from ergon_core.core.sandbox.errors import ( + SandboxError, + SandboxExpiredError, +) - from ergon_core.core.providers.sandbox.event_sink import ( + from ergon_core.core.sandbox.event_sink import ( DashboardEmitterSandboxEventSink, NoopSandboxEventSink, SandboxEventSink, ) - from ergon_core.core.providers.sandbox.manager import ( + from ergon_core.core.sandbox.manager import ( BaseSandboxManager, DefaultSandboxManager, DownloadedFile, @@ -683,7 +683,7 @@ New file, no new package. 
The errors module sits alongside the existing sandbox package files: ``` -ergon_core/ergon_core/core/providers/sandbox/ +ergon_core/ergon_core/core/sandbox/ ├── __init__.py MODIFY (add SandboxError, SandboxExpiredError exports) ├── errors.py ADD (SandboxError, SandboxExpiredError) ├── event_sink.py no change @@ -700,15 +700,15 @@ ergon_core/ergon_core/core/providers/sandbox/ | Step | Phase | What | Files touched | |------|-------|------|---------------| -| 1 | PR 1 | Create `errors.py` with `SandboxError` and `SandboxExpiredError` | ADD `ergon_core/ergon_core/core/providers/sandbox/errors.py` | -| 2 | PR 1 | Add `errors` imports to sandbox `__init__.py` | MODIFY `ergon_core/ergon_core/core/providers/sandbox/__init__.py` | -| 3 | PR 1 | Update `BaseSandboxManager.create()` signature: `timeout_minutes` → `task_timeout_minutes + max_criterion_timeout_minutes`; update WAL entry log | MODIFY `ergon_core/ergon_core/core/providers/sandbox/manager.py` | -| 4 | PR 1 | Update `DefaultSandboxManager.create()` override with same signature change | MODIFY `ergon_core/ergon_core/core/providers/sandbox/manager.py` | +| 1 | PR 1 | Create `errors.py` with `SandboxError` and `SandboxExpiredError` | ADD `ergon_core/ergon_core/core/sandbox/errors.py` | +| 2 | PR 1 | Add `errors` imports to sandbox `__init__.py` | MODIFY `ergon_core/ergon_core/core/sandbox/__init__.py` | +| 3 | PR 1 | Update `BaseSandboxManager.create()` signature: `timeout_minutes` → `task_timeout_minutes + max_criterion_timeout_minutes`; update WAL entry log | MODIFY `ergon_core/ergon_core/core/sandbox/manager.py` | +| 4 | PR 1 | Update `DefaultSandboxManager.create()` override with same signature change | MODIFY `ergon_core/ergon_core/core/sandbox/manager.py` | | 5 | PR 1 | Migrate `sandbox_setup.py` call site: `timeout_minutes=30` → `task_timeout_minutes=30` | MODIFY `ergon_core/ergon_core/core/runtime/inngest/sandbox_setup.py` | | 6 | PR 1 | Migrate `criterion_runtime.py` call sites: same rename; `reset_timeout` 30 → 40 | MODIFY `ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py` | | 7 | PR 1 | Migrate test call sites: `timeout_minutes=5` → `task_timeout_minutes=5` in `tests/swebench_verified/test_sandbox_manager.py` and `tests/minif2f/test_sandbox_manager.py` | MODIFY 2 test files | | 8 | PR 1 | Unit tests: `create()` passes correct total timeout to E2B; `task_timeout + max_criterion_timeout` arithmetic | ADD `tests/unit/test_sandbox_timeout.py` | -| 9 | PR 2 | Add `BaseSandboxManager.reconnect(sandbox_id)` method | MODIFY `ergon_core/ergon_core/core/providers/sandbox/manager.py` | +| 9 | PR 2 | Add `BaseSandboxManager.reconnect(sandbox_id)` method | MODIFY `ergon_core/ergon_core/core/sandbox/manager.py` | | 10 | PR 2 | Unit tests for `reconnect`: successful connect, E2B-not-found raises `SandboxExpiredError`, non-expired E2B error re-raises | ADD to `tests/unit/test_sandbox_reconnect.py` | | 11 | PR 2 | Canary e2e test: deliberately-slow criterion (sleep > task_timeout) still finds sandbox reachable | ADD `tests/e2e/test_sandbox_criterion_timeout_canary.py` | | 12 | PR 2 | (Deferred — depends on `2026-04-17-criterion-runtime-di-container`) Migrate `DefaultCriterionRuntime.ensure_sandbox()` to use `reconnect` when `get_sandbox` returns `None`, handling `SandboxExpiredError` | MODIFY `criterion_runtime.py` | @@ -724,7 +724,7 @@ Steps 1–8 land as PR 1 ("sandbox-lifetime/split-timeout"). 
Steps 9–11 land a | File | Purpose | |------|---------| -| `ergon_core/ergon_core/core/providers/sandbox/errors.py` | `SandboxError` base class; `SandboxExpiredError` raised by `reconnect()` on expired sandbox | +| `ergon_core/ergon_core/core/sandbox/errors.py` | `SandboxError` base class; `SandboxExpiredError` raised by `reconnect()` on expired sandbox | | `tests/unit/test_sandbox_timeout.py` | Unit tests: `create()` arithmetic, `task_timeout + max_criterion_timeout` passed to E2B | | `tests/unit/test_sandbox_reconnect.py` | Unit tests: `reconnect()` success, not-found raises `SandboxExpiredError`, other errors re-raise | | `tests/e2e/test_sandbox_criterion_timeout_canary.py` | E2e canary: slow criterion still reaches sandbox when timeout is correctly provisioned | @@ -733,8 +733,8 @@ Steps 1–8 land as PR 1 ("sandbox-lifetime/split-timeout"). Steps 9–11 land a | File | Changes | |------|---------| -| `ergon_core/ergon_core/core/providers/sandbox/manager.py` | Split `timeout_minutes` into `task_timeout_minutes + max_criterion_timeout_minutes` in `BaseSandboxManager.create()` and `DefaultSandboxManager.create()`; add `reconnect()` method | -| `ergon_core/ergon_core/core/providers/sandbox/__init__.py` | Export `SandboxError`, `SandboxExpiredError` | +| `ergon_core/ergon_core/core/sandbox/manager.py` | Split `timeout_minutes` into `task_timeout_minutes + max_criterion_timeout_minutes` in `BaseSandboxManager.create()` and `DefaultSandboxManager.create()`; add `reconnect()` method | +| `ergon_core/ergon_core/core/sandbox/__init__.py` | Export `SandboxError`, `SandboxExpiredError` | | `ergon_core/ergon_core/core/runtime/inngest/sandbox_setup.py` | Rename `timeout_minutes=30` → `task_timeout_minutes=30` at line 106 | | `ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py` | Rename `timeout_minutes=30` → `task_timeout_minutes=30` at line 59; `reset_timeout(..., timeout_minutes=30)` → `timeout_minutes=40` at line 63 | | `tests/swebench_verified/test_sandbox_manager.py` | Rename `timeout_minutes=5` → `task_timeout_minutes=5`; update assertion `call_kwargs["timeout"] == 5 * 60` → `== (5 + 10) * 60` | @@ -758,7 +758,7 @@ from uuid import uuid4 import pytest -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager, DefaultSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager, DefaultSandboxManager @pytest.fixture(autouse=True) @@ -792,11 +792,11 @@ async def test_create_passes_total_timeout_to_e2b(monkeypatch: pytest.MonkeyPatc fake_sandbox.sandbox_id = "sbx-test" fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -819,11 +819,11 @@ async def test_create_default_max_criterion_timeout(monkeypatch: pytest.MonkeyPa fake_sandbox.sandbox_id = "sbx-default" fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -841,11 +841,11 @@ async def test_create_zero_criterion_timeout(monkeypatch: pytest.MonkeyPatch) -> 
fake_sandbox.sandbox_id = "sbx-zero" fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -875,8 +875,8 @@ from uuid import uuid4 import pytest -from ergon_core.core.providers.sandbox.errors import SandboxExpiredError -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.errors import SandboxExpiredError +from ergon_core.core.sandbox.manager import BaseSandboxManager @pytest.fixture(autouse=True) @@ -902,11 +902,11 @@ async def test_reconnect_returns_sandbox_on_success(monkeypatch: pytest.MonkeyPa fake_sandbox = MagicMock() fake_connect = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(connect=fake_connect), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -924,11 +924,11 @@ async def test_reconnect_raises_sandbox_expired_on_not_found( """reconnect() raises SandboxExpiredError when E2B returns 'not found'.""" fake_connect = AsyncMock(side_effect=Exception("sandbox not found (404)")) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(connect=fake_connect), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -945,11 +945,11 @@ async def test_reconnect_reraises_non_expiry_errors(monkeypatch: pytest.MonkeyPa """reconnect() re-raises unexpected E2B errors unchanged.""" fake_connect = AsyncMock(side_effect=ConnectionError("network blip")) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(connect=fake_connect), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -978,7 +978,7 @@ import asyncio import pytest from uuid import uuid4 -from ergon_core.core.providers.sandbox.manager import DefaultSandboxManager, BaseSandboxManager +from ergon_core.core.sandbox.manager import DefaultSandboxManager, BaseSandboxManager @pytest.fixture(autouse=True) diff --git a/docs/rfcs/active/2026-04-18-sandbox-manager-key-cleanup.md b/docs/rfcs/active/2026-04-18-sandbox-manager-key-cleanup.md index d694a25e..e0452646 100644 --- a/docs/rfcs/active/2026-04-18-sandbox-manager-key-cleanup.md +++ b/docs/rfcs/active/2026-04-18-sandbox-manager-key-cleanup.md @@ -25,7 +25,7 @@ reduces the diff size for that RFC. ## Problem `BaseSandboxManager.create()` -(`ergon_core/ergon_core/core/providers/sandbox/manager.py:226-233`) takes three +(`ergon_core/ergon_core/core/sandbox/manager.py:226-233`) takes three conceptual task-keys as positional/keyword arguments: ```python @@ -177,7 +177,7 @@ production cases — which is exactly what `task_id` is after the rename. 
## Full implementation -### Modified file: `ergon_core/ergon_core/core/providers/sandbox/manager.py` +### Modified file: `ergon_core/ergon_core/core/sandbox/manager.py` #### 1. Remove `_display_task_ids` class attribute @@ -575,7 +575,7 @@ None. | File | Changes | |---|---| -| `ergon_core/ergon_core/core/providers/sandbox/manager.py` | Delete `_display_task_ids` attr (line 70); delete `_get_display_task_id()` (lines 96-97); rename `sandbox_key`→`task_id` + remove `display_task_id` in `BaseSandboxManager.create()` (lines 226-295); rename `sandbox_key`→`task_id` + rename `task_id`→`override_task_id` in `_emit_wal_entry()` (lines 99-131); simplify `terminate()` (lines 429-469); rename + remove `display_task_id` in `DefaultSandboxManager.create()` (lines 503-526) | +| `ergon_core/ergon_core/core/sandbox/manager.py` | Delete `_display_task_ids` attr (line 70); delete `_get_display_task_id()` (lines 96-97); rename `sandbox_key`→`task_id` + remove `display_task_id` in `BaseSandboxManager.create()` (lines 226-295); rename `sandbox_key`→`task_id` + rename `task_id`→`override_task_id` in `_emit_wal_entry()` (lines 99-131); simplify `terminate()` (lines 429-469); rename + remove `display_task_id` in `DefaultSandboxManager.create()` (lines 503-526) | | `ergon_core/ergon_core/core/runtime/inngest/sandbox_setup.py` | Drop `display_task_id=task_id` kwarg at line 108 | | `ergon_builtins/ergon_builtins/benchmarks/swebench_verified/criterion.py` | Rename `sandbox_key=` → `task_id=` at line 74 | | `tests/minif2f/test_sandbox_manager.py` | Remove `BaseSandboxManager._display_task_ids = {}` at line 30; rename `sandbox_key=` → `task_id=` at lines 121, 172, 206 | diff --git a/docs/rfcs/active/2026-04-18-sandbox-manager-process-state.md b/docs/rfcs/active/2026-04-18-sandbox-manager-process-state.md index 4e047349..0d82db05 100644 --- a/docs/rfcs/active/2026-04-18-sandbox-manager-process-state.md +++ b/docs/rfcs/active/2026-04-18-sandbox-manager-process-state.md @@ -14,7 +14,7 @@ superseded_by: null ## 1. Problem `BaseSandboxManager` -(`ergon_core/ergon_core/core/providers/sandbox/manager.py`) is wired as a +(`ergon_core/ergon_core/core/sandbox/manager.py`) is wired as a singleton-per-subclass via `__new__` at `manager.py:78-81`: ```python @@ -71,7 +71,7 @@ The same pattern appears in: - `ergon_builtins/ergon_builtins/benchmarks/swebench_verified/criterion.py:72` `ResearchRubricsSandboxManager` (in -`ergon_core/ergon_core/core/providers/sandbox/research_rubrics_manager.py`) also +`ergon_builtins/ergon_builtins/benchmarks/researchrubrics/sandbox_manager.py`) also calls `self._sandboxes[task_id]` directly at `research_rubrics_manager.py:105` in `publisher_for()`, relying on the class-level dict. @@ -237,7 +237,7 @@ DefaultCriterionRuntime.ensure_sandbox() (any process) ### 4.1 Updated `BaseSandboxManager.__init__` ```python -# ergon_core/ergon_core/core/providers/sandbox/manager.py +# ergon_core/ergon_core/core/sandbox/manager.py class BaseSandboxManager(ABC): """Abstract base class for E2B sandbox lifecycle management. @@ -267,7 +267,7 @@ class BaseSandboxManager(ABC): ### 4.2 `reconnect` method signature ```python -# ergon_core/ergon_core/core/providers/sandbox/manager.py +# ergon_core/ergon_core/core/sandbox/manager.py async def reconnect(self, sandbox_id: str) -> "AsyncSandbox": """Rehydrate a running sandbox by its E2B sandbox_id. @@ -538,7 +538,7 @@ Behavior unchanged. Stage 1 is a pure refactor. 
| File | Changes | |---|---| -| `ergon/ergon_core/ergon_core/core/providers/sandbox/manager.py` | Stage 1: move six dicts to `__init__`, fix `_event_sink` init; Stage 2: remove `__new__` + `_instance`, add `reconnect()` | +| `ergon/ergon_core/ergon_core/core/sandbox/manager.py` | Stage 1: move six dicts to `__init__`, fix `_event_sink` init; Stage 2: remove `__new__` + `_instance`, add `reconnect()` | | `ergon/ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py` | Stage 3: update `ensure_sandbox()` to use `reconnect()` on cross-process miss | | `ergon/ergon_core/ergon_core/core/runtime/evaluation/evaluation_schemas.py` | Stage 3: add `sandbox_id: str \| None = None` to `CriterionContext` if absent | | `ergon/ergon_builtins/ergon_builtins/workers/baselines/minif2f_react_worker.py` | Stage 3: replace `manager.get_sandbox(context.task_id)` with `reconnect` or DI | @@ -567,7 +567,7 @@ from uuid import uuid4 import pytest -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager class _MinimalManager(BaseSandboxManager): @@ -610,12 +610,12 @@ class TestInstanceIsolation: ) def test_event_sink_initialized_in_init(self) -> None: - from ergon_core.core.providers.sandbox.event_sink import NoopSandboxEventSink + from ergon_core.core.sandbox.event_sink import NoopSandboxEventSink m = _MinimalManager() assert isinstance(m._event_sink, NoopSandboxEventSink) def test_custom_event_sink_set_without_stomp(self) -> None: - from ergon_core.core.providers.sandbox.event_sink import NoopSandboxEventSink + from ergon_core.core.sandbox.event_sink import NoopSandboxEventSink sink_a = NoopSandboxEventSink() sink_b = NoopSandboxEventSink() m1 = _MinimalManager(event_sink=sink_a) @@ -648,7 +648,7 @@ class TestReconnect: @pytest.mark.asyncio async def test_reconnect_calls_connect(self, monkeypatch: pytest.MonkeyPatch) -> None: - from ergon_core.core.providers.sandbox import manager as mgr_module + from ergon_core.core.sandbox import manager as mgr_module fake_sandbox = MagicMock() fake_connect = AsyncMock(return_value=fake_sandbox) @@ -667,7 +667,7 @@ class TestReconnect: async def test_reconnect_raises_when_e2b_not_installed( self, monkeypatch: pytest.MonkeyPatch ) -> None: - from ergon_core.core.providers.sandbox import manager as mgr_module + from ergon_core.core.sandbox import manager as mgr_module monkeypatch.setattr(mgr_module, "AsyncSandbox", None) @@ -680,7 +680,7 @@ class TestReconnect: self, monkeypatch: pytest.MonkeyPatch ) -> None: """reconnect() must not populate self._sandboxes (stateless by design).""" - from ergon_core.core.providers.sandbox import manager as mgr_module + from ergon_core.core.sandbox import manager as mgr_module fake_sandbox = MagicMock() fake_connect = AsyncMock(return_value=fake_sandbox) diff --git a/docs/rfcs/active/2026-04-21-real-llm-debug-harness.md b/docs/rfcs/active/2026-04-21-real-llm-debug-harness.md index be0bcb98..89b7924a 100644 --- a/docs/rfcs/active/2026-04-21-real-llm-debug-harness.md +++ b/docs/rfcs/active/2026-04-21-real-llm-debug-harness.md @@ -89,7 +89,7 @@ tests/real_llm/ └── results_writer.py # per-run .results.md + PR body emission ergon_builtins/ergon_builtins/tools/benchmark_toolkit_composer.py # NEW -ergon_core/ergon_core/core/providers/generation/openrouter_budget.py # NEW +tests/real_llm/openrouter_budget.py # NEW docker-compose.real-llm.yml # NEW ``` @@ -136,7 +136,7 @@ in `ergon_cli/composition/__init__.py` wires this into ### OpenRouter budget gate ```python -# 
ergon_core/core/providers/generation/openrouter_budget.py +# tests/real_llm/openrouter_budget.py class OpenRouterBudget: def __init__(self, limit_usd: float) -> None: diff --git a/docs/rfcs/active/architecture-refactor-audit/01-dependency-inversion.md b/docs/rfcs/active/architecture-refactor-audit/01-dependency-inversion.md new file mode 100644 index 00000000..7b0ccfce --- /dev/null +++ b/docs/rfcs/active/architecture-refactor-audit/01-dependency-inversion.md @@ -0,0 +1,418 @@ +--- +status: active +opened: 2026-04-27 +author: GPT-5.5 +architecture_refs: + - docs/architecture/01_public_api.md + - docs/architecture/03_providers.md + - docs/architecture/06_builtins.md +supersedes: [] +superseded_by: null +--- + +# RFC: Dependency Inversion And Package Boundaries + +## Problem + +The declared package graph says `ergon_core` is the reusable runtime and public +API, `ergon_builtins` supplies default implementations, `ergon_cli` adapts user +commands, and `ergon_infra` handles training/provisioning helpers. The source +graph is messier. Core runtime code imports the builtins registry, builtins +tooling imports CLI command modules, and test harness paths pull CLI +composition back into core. + +These dependencies work in the workspace, but they blur ownership. A reader +cannot easily tell which package owns composition, which APIs are stable, or +how to add a new benchmark/worker without coupling to the current default +registry. + +## Current findings + +### Core runtime imports builtins registry + +Runtime paths resolve slugs by importing `ergon_builtins.registry` directly. +This appears in Inngest handlers and services such as worker execution, +benchmark-run startup, evaluator dispatch, sandbox setup, output persistence, +and workflow initialization. The practical result is that core is not only a +runtime contract package; it also knows about the default plugin bundle. + +### Builtins registry reaches into core internals + +`ergon_builtins.registry` implements public `ergon_core.api` contracts, but it +also imports provider internals for model backend registration and sandbox +manager types. Some of this may be unavoidable today, but it should be named as +an extension boundary rather than an incidental import path. + +### Builtins tooling imports CLI command code + +`ergon_builtins.tools.workflow_cli_tool` imports `WorkflowCommandContext`, +`WorkflowCommandOutput`, and `execute_workflow_command` from +`ergon_cli.commands.workflow`. That makes an agent-facing builtin tool depend +on the CLI command layer instead of a shared application/service API. + +### Core test harness imports CLI composition + +`ergon_core.core.api.test_harness` imports `ergon_cli.composition` when the +test harness is enabled. The flag keeps this out of production by default, but +the import direction is still surprising for a core package. + +### CLI composition contains example-specific branches + +`ergon_cli.composition.build_experiment` performs registry lookup and then +branches for smoke workers and `researchrubrics-workflow-cli-react`. Those +branches may encode real composition needs, but they live in the generic CLI +composition path rather than behind benchmark/worker-owned composition hooks. 
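+
+A compressed sketch of the inversion these findings motivate (the helper
+functions are illustrative; `WorkerResolver` is one of the protocol names
+proposed under "Standards proposed" below):
+
+```python
+# Today: runtime code binds to the default bundle at import time.
+from ergon_builtins.registry import WORKERS  # core -> builtins dependency
+
+def resolve_worker_today(slug: str):
+    return WORKERS[slug]
+
+# Target: runtime code depends on a core-owned protocol; startup injects the
+# builtins-backed registry, and tests can inject a fake.
+from typing import Protocol
+
+class WorkerResolver(Protocol):
+    def get_worker(self, slug: str): ...
+
+def resolve_worker_target(slug: str, registry: WorkerResolver):
+    return registry.get_worker(slug)
+```
+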
+ +## Target shape + +The target dependency direction should be: + +```text +ergon_core.api <- implemented by builtins and custom packages +ergon_core.runtime <- depends on injected registries/services, not builtins +ergon_builtins <- default implementation bundle +ergon_cli <- adapter that wires a registry bundle into core services +ergon_infra <- training/provisioning adapter over public/core services +ergon-dashboard <- frontend over HTTP/event contracts +``` + +Core may define protocols and service interfaces. Builtins may implement them. +CLI and application startup may choose the default builtins registry. Runtime +code should receive a resolver or registry interface rather than importing the +default bundle. + +## Standards proposed + +- Public contracts belong under `ergon_core.api` or a deliberately named core + interface module. +- A package should not import an adapter layer that is higher-level than + itself. In particular, builtins should not import `ergon_cli.commands.*`. +- Runtime services should depend on protocols such as `WorkerResolver`, + `BenchmarkResolver`, `EvaluatorResolver`, `SandboxManagerResolver`, or one + combined `RuntimeRegistry`. +- Example-specific composition should be owned by the benchmark/worker bundle + that requires it, or represented as data on the public API. +- Test-only composition should enter through explicit startup/plugin hooks, not + direct core-to-cli imports. + +## Candidate fixes + +Each candidate below should be treated as a small implementation plan, not an +idea bucket. A follow-up implementation plan may split these into separate PRs, +but each candidate already names the files, steps, tests, and acceptance gate +expected before the work is considered real. + +### DI-1: Add a runtime registry protocol in core + +**Issue fixed:** Core runtime code cannot express "I need a worker/benchmark/evaluator +resolver" without importing the concrete builtins registry, so dependency +direction is encoded as an implementation detail instead of a contract. + +Create a small protocol owned by core that contains the lookup methods runtime +code actually needs: + +- `get_worker(slug)` +- `get_benchmark(slug)` +- `get_evaluator(slug)` +- `get_sandbox_manager(slug)` +- optional install-hint lookup for user-facing errors + +Candidate location: `ergon_core.api.registry` if this becomes public extension +surface, or `ergon_core.core.runtime.registry` if it stays internal. The first +implementation can be an adapter around `ergon_builtins.registry`, preserving +all current slug names and optional-extra behavior. + +Files: + +- Create: `ergon_core/ergon_core/api/registry.py` or + `ergon_core/ergon_core/core/runtime/registry.py`. +- Create: `ergon_builtins/ergon_builtins/runtime_registry.py`. +- Modify: `ergon_builtins/ergon_builtins/registry.py` only if the adapter needs + a stable export. +- Test: `tests/unit/runtime/test_runtime_registry_contract.py`. + +Sketch: + +```python +from typing import Protocol + +class RuntimeRegistry(Protocol): + def get_worker(self, slug: str): ... + def get_benchmark(self, slug: str): ... + def get_evaluator(self, slug: str): ... + def get_sandbox_manager(self, slug: str): ... + def install_hint_for(self, slug: str) -> str | None: ... +``` + +Steps: + +- [ ] Add the protocol and a typed missing-slug error or document that `KeyError` + remains the compatibility behavior. +- [ ] Add a builtins-backed adapter over the existing registry dictionaries. 
+- [ ] Preserve model backend registration side effects at builtins registry + import time. +- [ ] Add a fake in-memory registry for tests that should not import builtins. +- [ ] Keep existing public imports of `ergon_builtins.registry` working. + +Verification: + +- Unit tests for successful and missing slug lookup. +- Characterization test that CLI defaults still resolve the same worker, + benchmark, evaluator, and sandbox manager classes. +- `python -c "from ergon_builtins.registry import WORKERS, BENCHMARKS"` still + succeeds in the workspace environment. + +Acceptance gate: + +- [ ] Registry contract tests pass for both the fake registry and builtins + adapter. +- [ ] No runtime behavior changes: current benchmark, worker, evaluator, and + sandbox slugs resolve to the same objects. +- [ ] Architecture docs mention where registry protocols live. + +### DI-2: Stop importing `ergon_builtins.registry` from core runtime modules + +**Issue fixed:** `ergon_core` is declared as the reusable runtime package, but +runtime modules currently depend on the default builtins bundle at import time. +That makes builtins a hidden runtime prerequisite and prevents fake/custom +registries from being injected cleanly. + +Replace direct registry imports in core runtime paths with an injected resolver +or application-level registry object. Initial target modules include: + +- `core/runtime/inngest/benchmark_run_start.py` +- `core/runtime/inngest/worker_execute.py` +- `core/runtime/inngest/evaluate_task_run.py` +- `core/runtime/inngest/sandbox_setup.py` +- `core/runtime/inngest/persist_outputs.py` +- `core/runtime/services/workflow_initialization_service.py` +- `core/api/app.py` + +The first pass can use a default registry provider at process startup so +behavior stays identical while import direction improves. + +Files: + +- Modify: `ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py`. +- Modify: `ergon_core/ergon_core/core/runtime/inngest/worker_execute.py`. +- Modify: `ergon_core/ergon_core/core/runtime/inngest/evaluate_task_run.py`. +- Modify: `ergon_core/ergon_core/core/runtime/inngest/sandbox_setup.py`. +- Modify: `ergon_core/ergon_core/core/runtime/inngest/persist_outputs.py`. +- Modify: + `ergon_core/ergon_core/core/runtime/services/workflow_initialization_service.py`. +- Modify: `ergon_core/ergon_core/core/api/app.py`. +- Test: `tests/unit/architecture/test_package_boundaries.py`. + +Steps: + +- [ ] Add a process-level registry provider or dependency accessor in core. +- [ ] Configure the builtins-backed registry from CLI/API startup. +- [ ] Convert each runtime module from `from ergon_builtins.registry import ...` + to the registry accessor. +- [ ] Keep error messages for unknown slugs at least as clear as today. +- [ ] Remove any import-time builtins dependency from core runtime modules. + +Verification: + +- Architecture test that `ergon_core.core.runtime` does not import + `ergon_builtins`. +- Existing benchmark/run tests continue to pass without slug changes. +- `rg "ergon_builtins.registry" ergon_core/ergon_core/core/runtime` returns no + matches. + +Acceptance gate: + +- [ ] Direct runtime imports of `ergon_builtins.registry` are gone. +- [ ] Unknown-slug behavior is characterized and preserved or deliberately + improved in a documented way. +- [ ] CLI/API startup still wires the default builtins registry. 
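+
+A minimal sketch of the process-level accessor DI-2's first step names (module
+path and function names are assumptions, not settled API; `RuntimeRegistry` is
+the DI-1 protocol):
+
+```python
+# e.g. ergon_core/ergon_core/core/runtime/registry_provider.py
+from ergon_core.api.registry import RuntimeRegistry  # DI-1 candidate location
+
+_registry: RuntimeRegistry | None = None
+
+def set_runtime_registry(registry: RuntimeRegistry) -> None:
+    """Called once from CLI/API startup with the builtins-backed adapter."""
+    global _registry
+    _registry = registry
+
+def get_runtime_registry() -> RuntimeRegistry:
+    """What runtime modules call instead of importing ergon_builtins.registry."""
+    if _registry is None:
+        raise RuntimeError("runtime registry was not configured at startup")
+    return _registry
+```
+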
+ +### DI-3: Move workflow command execution out of the CLI command module + +**Issue fixed:** Builtin agent tools reuse workflow behavior by importing +`ergon_cli.commands.workflow`, which makes a non-CLI package depend on CLI +command parsing/rendering code. + +Extract the command parsing/execution core from `ergon_cli.commands.workflow` +into a shared service module that has no CLI rendering dependency. The CLI +command should parse argv and render output; builtin tools should call the same +shared executor directly. + +Candidate owner: `ergon_core.core.runtime.services.workflow_command_service` if +the command surface is runtime-owned, or `ergon_cli.workflow_application` if it +is intentionally an application-layer adapter. The key rule is that +`ergon_builtins` should not import `ergon_cli.commands.*`. + +Verification: + +- Existing `tests/unit/cli/test_workflow_cli.py` still validates CLI behavior. +- New builtin-tool test imports the shared executor without importing the CLI + command module. +- Architecture test blocks `ergon_builtins -> ergon_cli.commands`. + +Files: + +- Create: + `ergon_core/ergon_core/core/runtime/services/workflow_command_service.py` + or a similarly named shared application module. +- Modify: `ergon_cli/ergon_cli/commands/workflow.py`. +- Modify: `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py`. +- Test: `tests/unit/cli/test_workflow_cli.py`. +- Test: `tests/unit/state/test_workflow_cli_tool.py` or equivalent builtin + tool test. + +Steps: + +- [ ] Identify the current command parser/executor/renderer responsibilities in + `ergon_cli.commands.workflow`. +- [ ] Move parser and executor into the shared module without changing command + strings. +- [ ] Leave stdout/stderr formatting and argparse integration in CLI. +- [ ] Update the builtin workflow tool to call the shared executor. +- [ ] Add an import-boundary test that prevents future builtin imports from + `ergon_cli.commands`. + +Acceptance gate: + +- [ ] CLI workflow tests pass with unchanged expected output. +- [ ] Builtin workflow tool tests pass without importing CLI command modules. +- [ ] `rg "ergon_cli.commands" ergon_builtins/ergon_builtins/tools` returns no + matches, except an explicit migration allowlist if needed. + +### DI-4: Replace special-case CLI experiment branches with composition descriptors + +**Issue fixed:** Generic CLI experiment composition contains hard-coded +knowledge of specific worker families, so every new example with special +bindings risks adding another `if worker_slug == ...` branch. + +Move the smoke-worker and `researchrubrics-workflow-cli-react` branch knowledge +out of generic `build_experiment`. Candidate shape: + +- Workers or benchmarks may expose an optional composition descriptor. +- The descriptor declares extra worker bindings, evaluator bindings, and static + assignment strategy. +- `build_experiment` applies descriptors generically after registry lookup. + +This keeps current behavior while making future examples add data rather than a +new `if worker_slug == ...` branch. + +Verification: + +- Characterization tests for smoke worker composition. +- Characterization tests for research-rubrics workflow composition. +- A test that a synthetic descriptor can add an extra worker binding without + editing `ergon_cli.composition`. + +Files: + +- Modify: `ergon_cli/ergon_cli/composition/__init__.py`. +- Add: a composition descriptor type under `ergon_core.api` or + `ergon_cli.composition`. 
+- Modify smoke fixture registration under
+  `ergon_core/ergon_core/test_support/smoke_fixtures/`.
+- Modify research-rubrics worker/benchmark registration under
+  `ergon_builtins/ergon_builtins/workers/research_rubrics/` or
+  `ergon_builtins/ergon_builtins/registry_data.py`.
+- Test: `tests/unit/cli/test_build_experiment_composition.py`.
+
+Current branches to eliminate from generic composition:
+
+- `_is_smoke_worker(worker_slug)`.
+- `worker_slug == "researchrubrics-workflow-cli-react"`.
+- suffix parsing for `-smoke-worker` and `-sadpath-smoke-worker`.
+- direct imports of smoke timing criteria from generic CLI composition.
+
+Sketch (defaulting every field makes the empty descriptor the no-op default
+named in the steps; `WorkerSpec` and `Evaluator` are assumed to be the
+project's existing worker/evaluator types):
+
+```python
+from pydantic import BaseModel, Field
+
+class ExperimentCompositionDescriptor(BaseModel):
+    extra_workers: dict[str, WorkerSpec] = Field(default_factory=dict)
+    extra_evaluators: dict[str, Evaluator] = Field(default_factory=dict)
+    static_assignments: dict[str, list[str]] = Field(default_factory=dict)
+```
+
+Steps:
+
+- [ ] Add the descriptor type and a no-op default descriptor.
+- [ ] Teach `build_experiment` to ask the selected worker/benchmark registry
+  entry for a descriptor.
+- [ ] Move smoke leaf/recursive/failing-leaf bindings into smoke fixture-owned
+  descriptor code.
+- [ ] Move research-rubrics manager/researcher bindings into
+  research-rubrics-owned descriptor code.
+- [ ] Add an architecture test that blocks new hard-coded worker slug branches
+  in `ergon_cli.composition`.
+
+Acceptance gate:
+
+- [ ] No generic composition branch checks a concrete worker slug.
+- [ ] Existing smoke and research-rubrics composition behavior is unchanged.
+- [ ] A synthetic descriptor test proves new special composition can be added
+  without editing `build_experiment`.
+
+### DI-5: Route smoke/test harness composition through startup plugins
+
+**Issue fixed:** Test harness and smoke-fixture setup rely on direct imports
+that blur production startup, CLI composition, and test-support registration.
+
+Replace direct core-to-CLI composition imports in test-harness paths with the
+same registry/composition extension point used by production startup. Smoke
+fixtures can still be opt-in, but the opt-in should register providers through
+a plugin hook rather than teaching core about CLI composition.
+
+Verification:
+
+- Test harness remains disabled by default.
+- With `ENABLE_TEST_HARNESS=1`, smoke fixtures still register and run.
+- Architecture test documents the only allowed test-support imports.
+
+Files:
+
+- Modify: `ergon_core/ergon_core/core/api/test_harness.py`.
+- Modify: `ergon_core/ergon_core/core/api/app.py`.
+- Modify or use existing startup plugin settings in
+  `ergon_core/ergon_core/core/settings.py`.
+- Test: `tests/unit/architecture/test_smoke_fixture_package_boundary.py`.
+- Test: harness tests that currently exercise `ENABLE_TEST_HARNESS`.
+
+Steps:
+
+- [ ] Inventory current `ENABLE_TEST_HARNESS` and `ENABLE_SMOKE_FIXTURES`
+  behavior.
+- [ ] Define the plugin hook that can register smoke fixtures or experiment
+  builders.
+- [ ] Move test-harness composition to the plugin path.
+- [ ] Preserve disabled-by-default behavior.
+- [ ] Add an architecture allowlist for the few remaining test-support imports,
+  if any.
+
+Acceptance gate:
+
+- [ ] Test harness smoke behavior still works under explicit opt-in.
+- [ ] Core app startup no longer needs to know smoke fixture implementation
+  modules by name.
+- [ ] Architecture tests fail if new production runtime modules import
+  `ergon_core.test_support`.
+
+## Migration / risk
+
+The risk is not algorithmic behavior; it is import-time behavior. 
The current +registry performs eager optional-capability imports and model backend +registration. Moving this behind protocols must preserve: + +- Existing CLI defaults and slug names. +- Optional extras behavior and install hints. +- Model backend registration side effects. +- Test harness smoke fixture behavior under explicit flags. + +The first implementation step should be characterization tests around registry +resolution and CLI experiment construction before import paths are changed. + +## Open questions + +- Should the registry protocol live in `ergon_core.api`, `ergon_core.core`, or + a new package such as `ergon_runtime_contracts`? +- Should CLI remain the primary composition root, or should FastAPI startup and + CLI share a new composition module? +- Do existing consumers import `ergon_builtins.registry` directly, and if so do + those imports need compatibility wrappers? diff --git a/docs/rfcs/active/architecture-refactor-audit/02-test-brittleness-and-gaps.md b/docs/rfcs/active/architecture-refactor-audit/02-test-brittleness-and-gaps.md new file mode 100644 index 00000000..1525b9b6 --- /dev/null +++ b/docs/rfcs/active/architecture-refactor-audit/02-test-brittleness-and-gaps.md @@ -0,0 +1,441 @@ +--- +status: active +opened: 2026-04-27 +author: GPT-5.5 +architecture_refs: + - docs/architecture/07_testing.md + - docs/architecture/02_runtime_lifecycle.md + - docs/architecture/04_persistence.md +supersedes: [] +superseded_by: null +--- + +# RFC: Test Brittleness And Confidence Gaps + +## Problem + +Behavior-preserving refactors need trustworthy tests. Ergon already has useful +unit, integration, e2e, state, and real-LLM tiers, but the test surface has +grown alongside the code. Some tests encode current implementation details, +some test-support concepts leak toward runtime code, and some important +package-boundary expectations are not yet expressed as contracts. + +The goal is to make tests better at preserving behavior while reducing their +ability to freeze accidental architecture. + +## Current findings + +### Test support has explicit gates, but the boundary is fragile + +Smoke fixtures and test harness paths are mostly gated behind environment +flags such as `ENABLE_TEST_HARNESS` and `ENABLE_SMOKE_FIXTURES`. This is +useful, but it means import discipline matters. A small number of direct +imports can turn test-only composition into runtime coupling. + +### Existing architecture tests are valuable but narrow + +There are tests that assert smoke fixtures do not move into old production +paths. That pattern should expand: import-boundary rules should cover core to +builtins, builtins to CLI, and core to CLI exceptions. + +### State tests exercise behavior but may mix concerns + +The `tests/unit/state` tier appears to group workflow/tool/research-rubric +state behavior rather than a dedicated state package. These tests are useful, +but they should make clear whether they are verifying public behavior, database +state transitions, or current helper implementation. + +### Real-LLM and e2e tests are opt-in + +Opt-in real-LLM rollout tests and dashboard/e2e tests are valuable for catching +integration failures, but they are not always part of the fast feedback loop. +The refactor program needs a smaller characterization layer for behavior that +must not change during architecture cleanup. + +### Fixtures can hide missing contracts + +When tests rely on broad fixtures or sentinel identities, they can keep passing +even though production composition boundaries are unclear. 
Refactors should
+prefer explicit fake providers and public-contract setup over reaching into
+runtime internals.
+
+## Target shape
+
+The test suite should have a clear contract for each tier:
+
+- **Architecture tests** enforce import direction, package ownership, and
+  allowed exceptions.
+- **Unit tests** verify pure behavior and service logic without requiring the
+  default builtins registry unless that is the unit under test.
+- **State/integration tests** verify persisted runtime transitions through
+  public service boundaries.
+- **E2E tests** verify deployed surfaces and dashboard/API hydration.
+- **Real-LLM tests** verify representative model-facing workflows and artifact
+  health, gated by explicit credentials.
+
+Each behavior-preserving refactor should start by identifying which tier locks
+the behavior being preserved.
+
+## Standards proposed
+
+- Add architecture tests for dependency direction and allowed import
+  exceptions. Exceptions should be named and justified in one place.
+- Prefer fake implementations of public protocols over sentinel strings that
+  runtime code must recognize.
+- Keep smoke fixtures and real-LLM harnesses under test-support or tests, with
+  explicit opt-in registration.
+- Avoid tests that assert line-by-line implementation detail unless the detail
+  is itself a contract.
+- For every major refactor, add or identify characterization tests before
+  moving code.
+- Keep slow/e2e/real-LLM tests useful but non-blocking for local refactor
+  loops; provide smaller contract tests for behavior that must always pass.
+
+## Candidate fixes
+
+Each candidate below should include enough detail for an implementation plan to
+be written without rediscovering the audit. Tests are themselves part of the
+architecture here: they define what future refactors are not allowed to break.
+
+### TB-1: Add import-boundary architecture tests
+
+**Issue fixed:** Package-boundary rules are currently mostly social
+conventions, so new reverse imports or ad hoc slug branches can land without a
+fast test failure.
+
+Create tests that parse imports and enforce the intended package graph. Start
+with warnings/allowlists for current known violations, then tighten the rules
+as dependency-inversion fixes land.
+
+Initial rules:
+
+- `ergon_core.core.runtime` should not import `ergon_builtins`.
+- `ergon_core` should not import `ergon_cli` except explicitly allowed
+  test-harness paths.
+- `ergon_builtins` should not import `ergon_cli.commands`.
+- Production runtime modules should not import `ergon_core.test_support` or
+  `tests.*`.
+
+Candidate location: `tests/unit/architecture/test_package_boundaries.py`.
+
+Suggested helper shape:
+
+```python
+from pathlib import Path
+
+def assert_no_imports(package_root: Path, forbidden: str, *, allowlist: set[str]) -> None:
+    # scan_python_imports is the AST-based scanner described in the steps
+    # below; it returns the set of violating import edges it finds.
+    offenders = scan_python_imports(package_root, forbidden)
+    unexpected = offenders - allowlist
+    assert unexpected == set()
+```
+
+Initial allowlist should include only named, reviewed exceptions. Avoid broad
+directory-level exceptions unless the whole directory is intentionally an
+adapter or test-support surface.
+
+Steps:
+
+- [ ] Implement a small AST-based import scanner, not a regex-only test.
+- [ ] Add rules for core-to-builtins, core-to-cli, builtins-to-cli, and
+  production-to-test-support.
+- [ ] Encode current known violations as explicit allowlist entries with a
+  linked candidate fix ID.
+- [ ] Add a second test that fails on new concrete worker/benchmark slug
+  branches in generic composition modules. 
+- [ ] Document how to update the allowlist when a refactor removes a violation. + +Verification: + +- Test fails with a clear list of violating import edges. +- Current exceptions are named in one allowlist with comments. + +Acceptance gate: + +- [ ] Architecture test passes with only reviewed exceptions. +- [ ] Adding `from ergon_cli.commands...` to a builtin tool fails the test. +- [ ] Adding `worker_slug == "some-example"` to generic composition fails or is + caught by the branch-pattern test. + +### TB-2: Add CLI benchmark-run characterization tests + +**Issue fixed:** The benchmark-run path combines DB setup, experiment +composition, persistence, cohort creation, run creation, event dispatch, and +polling. Refactoring it without characterization tests risks changing behavior +while only moving imports around. + +Before changing composition or registry resolution, lock down the current +observable `ergon benchmark run` setup path without requiring a live Inngest +run: + +- `ensure_db()` is called before persistence. +- `build_experiment()` receives CLI args unchanged. +- `experiment.validate()` runs before `experiment.persist()`. +- cohort resolution uses the explicit cohort or benchmark slug. +- `create_run()` receives the persisted definition. +- `WorkflowStartedEvent` carries the run ID and definition ID. +- polling reads `RunRecord` until a terminal status. + +Candidate location: `tests/unit/cli/test_benchmark_run_flow.py`. + +Suggested cases: + +- `benchmark run` persists before dispatching. +- explicit `--cohort` is used when present. +- default cohort name falls back to benchmark slug. +- timeout returns a timeout handle without pretending the run completed. +- terminal failed/cancelled status exits non-zero. + +Test approach: + +- Monkeypatch `ensure_db`, `build_experiment`, `experiment_cohort_service`, + `create_run`, `inngest_client.send`, and `get_session`. +- Use a fake session whose `get(RunRecord, run.id)` returns a sequence of + statuses. +- Avoid real Postgres, real Inngest, and real builtins imports unless the test + is explicitly about registry wiring. + +Verification: + +- Tests use fakes/mocks at service boundaries, not real Postgres or real + Inngest. +- Refactors of composition/import paths keep this test green. + +Acceptance gate: + +- [ ] A future rewrite of `run_benchmark` can move code around but cannot skip + validate, persist, run creation, event dispatch, or terminal polling. +- [ ] The test names describe user-visible behavior, not private helper calls. + +### TB-3: Add registry protocol contract tests + +**Issue fixed:** Once registry lookup becomes injectable, there is no shared +contract proving that the builtins adapter and test fakes behave the same way. + +Once a registry/resolver protocol exists, test it independently from CLI and +runtime orchestration: + +- known worker/benchmark/evaluator slugs resolve; +- unknown slugs produce a typed error or clear `KeyError`; +- optional install hints remain available; +- model backend registration side effects still happen exactly once. + +Candidate location: `tests/unit/runtime/test_runtime_registry_contract.py` or +`tests/unit/api/test_registry_contract.py`, depending on ownership. + +Verification: + +- Same contract runs against the builtins-backed registry adapter and a small + fake registry used by tests. + +Files: + +- Test: `tests/unit/runtime/test_runtime_registry_contract.py`. +- Fixture/helper: a fake registry implementation near the test or under + `ergon_core.test_support`. 
+- Optional test: `tests/unit/architecture/test_registry_imports.py`. + +Steps: + +- [ ] Write the contract tests against a fixture parameter named `registry`. +- [ ] Run the same tests against the builtins adapter and fake registry. +- [ ] Assert missing-slug behavior explicitly. +- [ ] Assert install hints do not require importing data-heavy optional extras. +- [ ] Assert model backend registration remains idempotent. + +Acceptance gate: + +- [ ] Runtime services can be tested with fake registries. +- [ ] Builtins adapter passes the same contract as the fake implementation. +- [ ] Contract tests fail if a registry lookup imports CLI code. + +### TB-4: Reclassify `tests/unit/state` by contract type + +**Issue fixed:** The `state` test tier mixes workflow commands, persisted +runtime transitions, worker/tool behavior, benchmark composition, and fixture +behavior under one vague label. + +Add comments, module names, or a README that explains what the "state" tier +means. Then split or rename tests where the current grouping hides intent. + +Suggested categories: + +- workflow command behavior; +- persisted graph/task state transitions; +- worker/tool state interaction; +- research-rubrics benchmark/worker composition; +- fixture-only behavior. + +Verification: + +- A reader can tell why each state test exists without knowing the historical + branch that introduced it. +- No test loses coverage during renaming or movement. + +Files: + +- Add: `tests/unit/state/README.md` or rename/split tests into clearer + directories. +- Review: + `tests/unit/state/test_research_rubrics_workers.py`. +- Review: + `tests/unit/state/test_research_rubrics_benchmark.py`. +- Review workflow/tool state tests in the same directory. + +Steps: + +- [ ] Inventory each state test file and classify it as workflow command, + persisted graph/task transition, worker/tool behavior, benchmark + composition, or fixture behavior. +- [ ] Rename files only when the existing name hides the contract. +- [ ] Move fixture-only behavior under a fixture/test-support category if it is + not testing runtime state. +- [ ] Add README language that "state" is a test tier, not a production domain + package. + +Acceptance gate: + +- [ ] Every file in `tests/unit/state` has an obvious contract category. +- [ ] No test import path changes require production code changes. + +### TB-5: Add fast artifact-health tests for real-LLM assumptions + +**Issue fixed:** Some real-LLM artifact assumptions are only checked in opt-in +credentialed paths, so artifact schema or parser regressions can slip past the +fast local suite. + +The real-LLM artifact-health harness is opt-in, but some assumptions should be +validated without credentials: + +- rollout artifact directories are named and shaped consistently; +- required metadata fields are present; +- failed/incomplete runs produce diagnosable artifacts; +- fixture artifacts exercise the same reader/parser used by real runs. + +Candidate location: extend +`tests/unit/runtime/test_real_llm_rollout_artifact_health.py` or split a helper +contract test nearby. + +Verification: + +- Fast tests run without `ERGON_REAL_LLM`. +- Real-LLM tests remain opt-in but rely on the same artifact validation helper. + +Files: + +- Review/extend: + `tests/unit/runtime/test_real_llm_rollout_artifact_health.py`. +- Review: + `tests/real_llm/artifact_health.py`. +- Review: + `tests/real_llm/rollout.py`. 
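+
+One possible shape for the shared validation helper (a sketch only; the
+helper name, report fields, and required metadata keys are assumptions, not
+current code):
+
+```python
+from dataclasses import dataclass, field
+
+# Illustrative field names; the real list comes from the artifact schema.
+REQUIRED_METADATA_FIELDS = ("worker_slug", "status")
+
+@dataclass
+class ArtifactHealthReport:
+    missing_fields: list[str] = field(default_factory=list)
+
+    @property
+    def ok(self) -> bool:
+        return not self.missing_fields
+
+def check_rollout_metadata(metadata: dict[str, object]) -> ArtifactHealthReport:
+    """One validator shared by fast unit tests and the credentialed tier."""
+    report = ArtifactHealthReport()
+    for name in REQUIRED_METADATA_FIELDS:
+        if name not in metadata:
+            report.missing_fields.append(name)
+    return report
+```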
+ +Required cases: + +- artifact directory with complete healthy rollout passes; +- missing required metadata fails with actionable error; +- partial failed rollout still produces enough diagnostic fields; +- worker slug extraction handles both snake_case and camelCase shapes; +- fixture artifact parser is the same parser used by real-LLM checks. + +Acceptance gate: + +- [ ] Unit artifact-health tests pass without network credentials. +- [ ] Real-LLM path delegates to the same validation helper. +- [ ] Failure messages name the missing artifact or field. + +### TB-6: Replace sentinel-aware runtime tests with fake provider tests + +**Issue fixed:** Tests that rely on stub sandbox IDs or sentinel parsing +encourage production runtime code to understand test/provider implementation +details. + +Where runtime tests currently require stub or sentinel sandbox identities, +introduce fake provider implementations that satisfy public provider protocols. +The runtime should observe provider behavior, not parse provider-specific +sentinel strings. + +Verification: + +- Tests still cover skipped, failed, cancelled, and cleanup paths. +- Production runtime modules no longer need helpers such as + `is_stub_sandbox_id`. + +Files: + +- Review tests touching sandbox cleanup, cancellation, skipped tasks, and + propagation. +- Add fake provider helpers under `ergon_core/ergon_core/test_support/` only if + they are reusable across test tiers. +- Pair with code cleanup in `core/sandbox/manager.py` only after + characterization tests exist. + +Steps: + +- [ ] Inventory tests that assert or construct stub sandbox IDs. +- [ ] Define fake provider behavior in terms of public provider methods: + create, reconnect, terminate, publish resources. +- [ ] Replace tests that expect sentinel parsing with tests that assert provider + method calls and runtime state transitions. +- [ ] Add an architecture test blocking runtime imports of + `is_stub_sandbox_id`. + +Acceptance gate: + +- [ ] Runtime behavior for skipped/failed/cancelled cleanup is still covered. +- [ ] Runtime code no longer branches on provider-specific sentinel strings. +- [ ] Test fakes live under test support, not production provider modules. + +## Phase gates for the test stream + +### Phase T1 — Boundary tests first + +Scope: + +- `tests/unit/architecture/test_package_boundaries.py`. +- Allowlist current violations with links to `DI-*` / `CQ-*`. + +Acceptance: + +- [ ] Boundary tests pass and fail when a deliberate forbidden import is added + locally. + +### Phase T2 — Characterization before refactor + +Scope: + +- CLI benchmark-run characterization. +- Registry contract tests. +- Artifact-health fast contracts. + +Acceptance: + +- [ ] Refactor candidates have tests that describe the behavior they preserve. +- [ ] No new test requires real Postgres, real Inngest, or real LLM credentials. + +### Phase T3 — Ratchet allowlists down + +Scope: + +- After dependency-inversion and code-quality refactors land, remove resolved + allowlist entries. + +Acceptance: + +- [ ] Import-boundary allowlist shrinks over time. +- [ ] New exceptions require an RFC or explicit architecture-doc note. + +## Migration / risk + +The main risk is over-constraining architecture too early. The first pass +should allow existing known exceptions with comments, then ratchet them down as +refactors land. + +The second risk is test churn without confidence gain. 
New tests should be +written around observable behavior and import contracts, not around temporary +helper names introduced during the refactor. + +## Open questions + +- Should architecture tests live under `tests/unit/architecture`, or should + there be a dedicated `tests/architecture` tier? +- Which tests should be required before accepting dependency-inversion work? +- Should real-LLM artifact-health checks define a small golden contract that + can run without external model credentials? diff --git a/docs/rfcs/active/architecture-refactor-audit/03-code-quality.md b/docs/rfcs/active/architecture-refactor-audit/03-code-quality.md new file mode 100644 index 00000000..864c88a4 --- /dev/null +++ b/docs/rfcs/active/architecture-refactor-audit/03-code-quality.md @@ -0,0 +1,642 @@ +--- +status: active +opened: 2026-04-27 +author: GPT-5.5 +architecture_refs: + - docs/architecture/README.md + - docs/architecture/01_public_api.md + - docs/architecture/02_runtime_lifecycle.md + - docs/architecture/06_builtins.md +supersedes: [] +superseded_by: null +--- + +# RFC: Code Quality, Duplication, And Complexity + +## Problem + +Fast iteration has left parts of Ergon with high-complexity functions, +branch-heavy example paths, duplicated orchestration logic, and names that no +longer communicate precise ownership. The project already uses Ruff, ty, +slopcop, xenon, and radon-related tooling, but current configuration mostly +documents pre-existing debt rather than defining a refactor target. + +This audit defines the code-quality lens for behavior-preserving cleanup. + +## Current findings + +### Known complexity debt is already listed + +The root `pyproject.toml` has explicit complexity ignores for files such as +experiment persistence, experiment validation, RL rollout/extraction, MiniF2F +loading, file evidence collection, transformer message formatting, and scripts. +Those comments are useful because they identify areas where orchestration has +grown large enough to need ownership review. + +### Generic paths contain example-specific branches + +`ergon_cli.composition.build_experiment` has special branches for smoke workers +and `researchrubrics-workflow-cli-react`. These branches preserve necessary +behavior today, but the pattern does not scale. Generic composition code should +not need to know every benchmark or worker family that requires extra bindings. + +### Tool and workflow code can duplicate service behavior + +CLI command modules, builtin tools, and runtime services all touch workflow +semantics. Without a shared application service boundary, the same concept can +be parsed, validated, or executed in multiple places. + +### Names sometimes encode historical implementation + +Names such as "stub" can mean test double, development default, or lightweight +implementation depending on context. Ambiguous names make it harder to enforce +production/test boundaries and public/private API rules. + +### Deep nesting often reflects missing concepts + +When functions perform lookup, construction, validation, persistence, event +dispatch, and rendering in one flow, nesting and branch count increase. The +answer is not mechanical extraction; it is naming the concepts that already +exist and moving them to the owner that can enforce their invariants. + +## Target shape + +Code quality should be judged against architecture, not only metrics: + +- A module should have one clear owner and one reason to change. +- Public APIs should describe stable concepts, not current storage or CLI + mechanics. 
+- Composition should be declarative where possible and isolated where it must + branch. +- Runtime orchestration should read as a sequence of named domain operations. +- Tests should cover behavior before complexity-reducing rewrites. + +## Standards proposed + +- Treat new high-complexity ignores as design review triggers, not routine + lint suppressions. +- Prefer small domain objects or command/result types when a function is + passing many loosely related parameters across package boundaries. +- Keep branch-heavy compatibility paths local to adapters or composition + modules, not inside core runtime services. +- Deduplicate only after confirming the duplicated code represents the same + concept. Similar code in different domains may deserve different names. +- Rename "stub", "smoke", and "test" concepts when they are production + defaults or examples rather than test doubles. +- Use architecture docs to record anti-patterns and accepted exceptions so + refactors do not rely on tribal memory. + +## Candidate fixes + +Each candidate below should be concrete enough to become a scoped PR or a +section in an implementation plan. The intent is not generic "clean code"; the +intent is to find where the project encoded missing domain concepts as +duplicated services, private helpers, slug branches, or lint suppressions. + +### CQ-1: Create a complexity ledger from current ignores + +**Issue fixed:** Complexity suppressions are documented inline in +`pyproject.toml`, but there is no owner, smell classification, priority, or +exit criterion for paying the debt down. + +Turn the existing `pyproject.toml` complexity-ignore comments into an explicit +ledger that ranks each offender by risk, ownership, and likely refactor path. + +Initial entries should include: + +- `ExperimentPersistenceService.persist_definition` +- `Experiment.validate` +- RL rollout/extraction helpers +- MiniF2F problem loading +- file evidence collection +- transformer message formatting +- standalone scripts ignored for CLI/script reasons + +Candidate output: a section in this RFC, or a separate +`complexity-ledger.md` in this folder if the list gets long. + +Verification: + +- Every current C901 ignore has an owner, reason, and intended disposition: + keep, split, move, rename, or delete. +- New C901 ignores require adding an entry to the ledger. + +Ledger fields: + +```markdown +| Item | File | Current reason | Domain owner | Smell | Candidate fix | Gate | +|---|---|---|---|---|---|---| +``` + +Smell taxonomy: + +- orchestration doing persistence work; +- validation rules hidden in one large method; +- example-specific branch in generic path; +- private helper cluster that wants a domain object; +- duplicate service responsibility; +- optional dependency/test fallback mixed into production flow. + +Steps: + +- [ ] Convert each current C901 ignore into a ledger row. +- [ ] Run `rg "^def _|^ def _|class .*Service" ergon_core/ergon_core/core/runtime/services` + and add obvious private-helper clusters to the ledger even if not C901. +- [ ] Rank rows by "blocks dependency inversion", "blocks test confidence", + and "local cleanup only". +- [ ] Add a policy that any new C901 ignore must cite a ledger row or RFC. + +Acceptance gate: + +- [ ] The ledger exists and covers every current complexity ignore. 
+- [ ] The ledger includes at least the large service/private-helper clusters in + `task_management_service.py`, `workflow_service.py`, + `graph_repository.py`, `task_execution_service.py`, and + `experiment_persistence_service.py`. + +### CQ-2: Split experiment composition into generic pipeline plus descriptors + +**Issue fixed:** Generic experiment composition currently knows about concrete +worker families and fixture behavior, which turns every special example into a +potential new branch in shared CLI code. + +Refactor `ergon_cli.composition.build_experiment` so the generic path performs +only these steps: + +1. load registry; +2. construct benchmark/evaluator; +3. ask the selected benchmark/worker for any composition descriptor; +4. build the `Experiment` from descriptors and defaults. + +Current smoke and research-rubrics branches become descriptor providers. This +preserves behavior but removes the pattern where each special worker adds a new +generic CLI branch. + +Verification: + +- Existing smoke and research-rubrics composition tests pass. +- A new fake descriptor test proves a worker can request extra bindings without + changing `build_experiment`. + +Files: + +- Modify: `ergon_cli/ergon_cli/composition/__init__.py`. +- Add descriptor type where selected by `DI-4`. +- Modify smoke fixture registration and research-rubrics registration to + provide descriptors. +- Test: `tests/unit/cli/test_build_experiment_composition.py`. + +Implementation steps: + +- [ ] Write tests that fail on current hard-coded branches being required for + smoke and research-rubrics composition. +- [ ] Add descriptor support with a no-op default. +- [ ] Move smoke branch logic into smoke-owned descriptor provider. +- [ ] Move research-rubrics branch logic into research-rubrics-owned descriptor + provider. +- [ ] Delete `_is_smoke_worker`, `_build_smoke_experiment`, and + `_build_researchrubrics_workflow_experiment` from generic composition + once descriptors cover them. +- [ ] Add an architecture test that blocks new `if worker_slug ==` branches in + generic composition code. + +Acceptance gate: + +- [ ] `ergon_cli.composition` no longer contains concrete worker slug checks. +- [ ] Existing smoke and research-rubrics unit tests pass. +- [ ] New descriptor test demonstrates extension without modifying CLI + composition. + +### CQ-3: Split workflow command execution from CLI rendering + +**Issue fixed:** Workflow parsing, execution, and CLI rendering are coupled +together, causing non-CLI callers to import CLI command modules and making +workflow behavior harder to test independently. + +Separate workflow command concerns into three layers: + +- parser: command string/argv to typed command; +- executor: typed command plus context/session/service to result; +- renderer: result to CLI stdout/stderr text. + +The CLI owns rendering. Builtin agent tools call parser/executor and format +tool-friendly strings. Runtime services own state changes. + +Verification: + +- CLI tests assert the same stdout/stderr behavior. +- Builtin workflow tool tests no longer import `ergon_cli.commands.workflow`. +- Parser/executor tests cover invalid commands, missing context, dry-run paths, + and successful resource/topology operations. + +Files: + +- Add shared parser/executor module selected by `DI-3`. +- Modify: `ergon_cli/ergon_cli/commands/workflow.py`. +- Modify: `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py`. +- Test: `tests/unit/cli/test_workflow_cli.py`. 
+- Test: builtin workflow tool test under `tests/unit/state` or a clearer
+  renamed location.
+
+Acceptance gate:
+
+- [ ] CLI rendering remains byte-for-byte compatible where tests already assert
+  output.
+- [ ] Builtin tools no longer import CLI command modules.
+- [ ] Shared executor accepts typed context rather than raw argparse namespace.
+
+### CQ-4: Audit and rename ambiguous "stub" concepts
+
+**Issue fixed:** The word "stub" is used across test doubles, development
+defaults, smoke fixtures, and lightweight implementations, making it unclear
+which code is production behavior and which code is test support.
+
+Classify every "stub" usage into one of four buckets:
+
+- test double;
+- smoke fixture;
+- development default;
+- lightweight production implementation.
+
+Then rename where the current name lies about ownership. For example, a
+production default should not be named like a test double, while a test fake
+should live under test support and use fake/test naming consistently.
+
+Verification:
+
+- `rg "stub|smoke|test_harness|test_support"` has a reviewed allowlist for
+  production packages.
+- User-facing CLI defaults do not imply test-only implementations unless they
+  really are test-only.
+
+Files:
+
+- Review: `ergon_core/ergon_core/core/sandbox/manager.py`.
+- Review: `ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py`.
+- Review: `ergon_core/ergon_core/core/rl/eval_runner.py`.
+- Review: `ergon_core/ergon_core/test_support/smoke_fixtures/`.
+- Review: user-facing CLI defaults in `ergon_cli/ergon_cli/main.py`.
+
+Steps:
+
+- [ ] Produce a `stub-smoke-test-naming` section in the complexity ledger or a
+  small adjacent audit file.
+- [ ] Rename test doubles to `Fake*` or `Test*` and move them under
+  test-support when possible.
+- [ ] Rename lightweight production defaults to names that describe their
+  behavior, not their historical test role.
+- [ ] Make production request contracts require explicit worker/evaluator
+  choices where defaulting to a stub hides behavior.
+- [ ] Add tests for any compatibility aliases that must remain.
+
+Acceptance gate:
+
+- [ ] Production runtime modules do not branch on "stub" identity.
+- [ ] User-facing docs/defaults no longer imply that test doubles are production
+  defaults.
+
+### CQ-5: Refactor `persist_definition` behind smaller persistence writers
+
+**Issue fixed:** Experiment definition persistence is concentrated in one
+high-complexity method, so table-writing mechanics and experiment invariants
+are hard to review independently.
+
+`ExperimentPersistenceService.persist_definition` is allowed to be complex
+today because it writes a full experiment graph. Keep the transaction boundary,
+but split the implementation into named private writer methods or helper
+objects:
+
+- definition row writer;
+- worker/evaluator writer;
+- instance/task/dependency writer;
+- assignment writer;
+- task-evaluator link writer.
+
+The goal is not to change schema or behavior; it is to make persistence
+invariants reviewable in smaller units.
+
+Verification:
+
+- Existing persistence tests pass.
+- Add a focused test for multi-worker assignments if one does not already
+  cover the branch that motivated CLI special cases.
+- Transaction rollback behavior remains unchanged.
+
+Files:
+
+- Modify:
+  `ergon_core/ergon_core/core/runtime/services/experiment_persistence_service.py`. 
+- Potential new helpers under + `ergon_core/ergon_core/core/persistence/definitions/` if the extracted code + is persistence-model-specific rather than runtime-service-specific. +- Test existing experiment persistence tests, plus add focused tests if missing. + +Implementation steps: + +- [ ] Add characterization tests for single-worker, multi-worker, dependency, + assignment, and evaluator-link persistence. +- [ ] Extract private writer methods without changing transaction boundaries. +- [ ] Name each writer by domain concept, not table name only. +- [ ] Keep `Experiment.persist()` public behavior unchanged. +- [ ] Remove or reduce the C901 ignore only if the extracted shape makes that + honest. + +Acceptance gate: + +- [ ] The service reads as orchestration over named writer steps. +- [ ] Rollback behavior remains a single transaction. +- [ ] Multi-worker assignment behavior is covered by tests. + +### CQ-6: Refactor `Experiment.validate` into rule objects or named validators + +**Issue fixed:** Experiment validation rules are concentrated in one +high-complexity public method, which makes it hard to tell which invariant +failed and hard to add tests for individual rule families. + +Split validation by invariant category while preserving the public +`Experiment.validate()` entrypoint: + +- task uniqueness and dependency validity; +- worker assignment validity; +- evaluator requirement coverage; +- multi-worker/subtask binding validity. + +This makes future public API changes easier to reason about without changing +the caller contract. + +Verification: + +- Existing validation tests pass. +- Each validator has at least one direct test for its failure mode. +- Error messages stay at least as actionable as current messages. + +Files: + +- Modify: `ergon_core/ergon_core/api/experiment.py`. +- Potential create: `ergon_core/ergon_core/api/experiment_validation.py`. +- Test: existing experiment API tests or new + `tests/unit/api/test_experiment_validation.py`. + +Implementation steps: + +- [ ] Snapshot current validation failure messages for representative invalid + experiments. +- [ ] Extract validators for task graph, assignments, evaluator coverage, and + worker bindings. +- [ ] Keep `Experiment.validate()` as the single public entrypoint. +- [ ] Avoid introducing a new public validation framework unless tests show it + pays for itself. + +Acceptance gate: + +- [ ] Public caller behavior is unchanged. +- [ ] Validation rules are testable independently. +- [ ] The original C901 ignore can be removed or justified with a smaller + remaining scope. + +### CQ-7: Establish a "no new branch-if example path" rule + +**Issue fixed:** The codebase has no enforceable guardrail preventing new +example-specific slug checks from being added to generic composition or runtime +paths. + +Add code review guidance and, where possible, tests that reject new generic +composition branches keyed to a specific benchmark or worker slug. The standard +should be: if an example needs special composition, it must declare that need +through a descriptor/hook owned by the example package. + +Verification: + +- Architecture or lint-style test detects new `if worker_slug ==` branches in + generic composition modules, with an allowlist during migration. +- Architecture docs record the accepted extension point. + +Files: + +- Test: `tests/unit/architecture/test_no_ad_hoc_slug_branching.py`. +- Update: `docs/architecture/06_builtins.md` after descriptor/composition + extension point is accepted. 
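+
+A minimal detection sketch for that test, assuming an AST walk in the spirit
+of TB-1's import scanner (the helper name and heuristic are illustrative):
+
+```python
+import ast
+from pathlib import Path
+
+def find_slug_comparisons(path: Path) -> list[str]:
+    """Flag `<name>_slug == "literal"` comparisons in a composition module."""
+    offenders: list[str] = []
+    tree = ast.parse(path.read_text())
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.Compare):
+            continue
+        left = node.left
+        if (
+            isinstance(left, ast.Name)
+            and left.id.endswith("_slug")
+            and any(
+                isinstance(c, ast.Constant) and isinstance(c.value, str)
+                for c in node.comparators
+            )
+        ):
+            offenders.append(f"{path.name}:{node.lineno}")
+    return offenders
+```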
+
+Rules to enforce:
+
+- No concrete benchmark/worker/evaluator slug comparisons in generic CLI
+  composition.
+- No suffix parsing for a worker family in generic composition.
+- No test-support imports from generic composition unless behind an approved
+  plugin/harness boundary.
+- Slug checks are allowed inside the package that owns the slug family.
+
+Suggested test inputs:
+
+- Scan `ergon_cli/ergon_cli/composition`.
+- Scan generic runtime services after registry injection is introduced.
+- Allowlist current branches only until `CQ-2` lands.
+
+Acceptance gate:
+
+- [ ] Adding a new concrete slug branch to generic composition fails tests.
+- [ ] Approved extension point is documented.
+
+### CQ-8: Add module ownership headers only where boundaries are unclear
+
+**Issue fixed:** Some modules repeatedly attract code from neighboring domains
+because their ownership boundary is implicit and only understood by recent
+contributors.
+
+For modules that repeatedly attract misplaced code, add a short top-level
+docstring stating what the module owns and what does not belong there. Good
+targets are composition, workflow command execution, registry adapters, and
+test-support bootstrap modules.
+
+Verification:
+
+- Headers are short and enforceable, not narrative.
+- Any new ownership statement points to the relevant architecture doc or RFC.
+
+Candidate modules:
+
+- `ergon_cli/ergon_cli/composition/__init__.py`.
+- Shared workflow command executor introduced by `CQ-3`.
+- Registry protocol/adapter modules introduced by `DI-1`.
+- Smoke fixture bootstrap modules.
+- Runtime services that remain broad after the DDD audit.
+
+Acceptance gate:
+
+- [ ] Header says what belongs and what does not belong.
+- [ ] Header does not duplicate implementation details.
+- [ ] Reviewers can use it to reject misplaced future code.
+
+### CQ-9: Audit runtime services using DDD-style boundaries
+
+**Issue fixed:** The runtime services folder contains many service-shaped
+modules, but it is not clear which are true domain/application services and
+which are duplicated lifecycle fragments or repositories wearing service names.
+
+Some of these modules may be right-sized; others may be procedural clusters
+that hide duplicate domain concepts. Audit the folder using domain-driven
+ownership questions before moving code:
+
+- What aggregate or lifecycle does this service own?
+- What invariant does it enforce?
+- What repositories/providers does it depend on?
+- Which other services duplicate the same decision?
+- Which private helpers are really domain policies? 
+ +Initial service map to audit: + +```text +ergon_core/ergon_core/core/runtime/services/ + task_management_service.py + task_execution_service.py + workflow_service.py + workflow_initialization_service.py + workflow_finalization_service.py + graph_repository.py + task_cleanup_service.py + task_propagation_service.py + subtask_cancellation_service.py + subtask_blocking_service.py + task_inspection_service.py + experiment_persistence_service.py + evaluator_dispatch_service.py + evaluation_persistence_service.py + rubric_evaluation_service.py + run_service.py + run_read_service.py + cohort_service.py + cohort_stats_service.py + communication_service.py +``` + +Likely duplicate/overlap questions: + +- Do `task_management_service`, `subtask_cancellation_service`, + `subtask_blocking_service`, `task_cleanup_service`, and + `task_propagation_service` encode one task-lifecycle domain or genuinely + separate use cases? +- Does `workflow_service` duplicate graph/resource lookup logic that belongs in + a graph/resource application service? +- Is `graph_repository` both persistence repository and mutation-domain + service? +- Are evaluation dispatch, rubric evaluation, and evaluation persistence cleanly + separated by responsibility? + +Deliverable: + +- Add `04-runtime-service-domain-audit.md` to this RFC folder, or add a + detailed section here if the audit stays short. + +Acceptance gate: + +- [ ] Every service module has a one-sentence responsibility statement. +- [ ] Duplicate responsibilities are listed with candidate merge/split actions. +- [ ] No code moves happen until characterization tests cover the affected + lifecycle. + +### CQ-10: Audit private helpers as design-smell signals + +**Issue fixed:** Large clusters of private helpers can hide missing domain +policies, query objects, DTO mappers, or misplaced responsibilities, but today +they are not audited as architecture signals. + +Private `_` functions are not inherently bad, but clusters of private helpers +often mean the code is compensating for a missing domain object, policy, or +repository. Audit helpers before extracting them mechanically. + +Initial findings to inspect: + +- `task_management_service.py` has validation, invalidation, edge reset, + execution lookup, and dispatch helpers. +- `workflow_service.py` has sandbox manager lookup, task/resource references, + node scope resolution, descendant traversal, producer lookup, and copy + destination helpers. +- `graph_repository.py` has row lookup, sequence allocation, mutation logging, + cycle checks, DTO conversion, and snapshot helpers. +- `task_execution_service.py` has graph-native preparation, definition + preparation, attempt numbering, and status emission. + +Classification: + +- **Keep private helper:** local readability helper with no independent + invariant. +- **Promote to domain policy:** helper encodes a rule that needs tests and a + name. +- **Move to repository/query:** helper is mostly persistence lookup. +- **Move to DTO/mapper:** helper converts persistence rows to transport/domain + objects. +- **Delete after boundary change:** helper exists only because current package + layering is wrong. + +Acceptance gate: + +- [ ] Helper audit identifies at least five helpers to promote/move/delete. +- [ ] Each promoted helper gets a direct test or is covered by an existing + characterization test. +- [ ] No helper is extracted merely to reduce line count without a better name + or owner. 
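+
+As an illustration of the "promote to domain policy" bucket (every name here
+is hypothetical, not a current helper):
+
+```python
+from dataclasses import dataclass
+
+@dataclass
+class TaskRow:  # stand-in for the persisted task record
+    status: str
+    locked: bool
+
+class TaskCancellationPolicy:
+    """Names the invariant a former private helper enforced implicitly."""
+
+    _CANCELLABLE_STATUSES = frozenset({"pending", "running"})
+
+    def can_cancel(self, task: TaskRow) -> bool:
+        return task.status in self._CANCELLABLE_STATUSES and not task.locked
+```
+
+The win is the name and the direct test it enables, not the extraction itself.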
+ +## Phase gates for the code-quality stream + +### Phase Q1 — Audit before movement + +Scope: + +- Complexity ledger. +- Runtime service domain audit. +- Private-helper audit. +- Ad hoc branch architecture tests with current allowlist. + +Acceptance: + +- [ ] Audits identify concrete files and candidate actions. +- [ ] Tests prevent new ad hoc slug branches. +- [ ] No production behavior changes. + +### Phase Q2 — Composition and workflow cleanup + +Scope: + +- Descriptor-based experiment composition. +- Workflow parser/executor/renderer split. + +Acceptance: + +- [ ] Generic composition has no concrete example slug branches. +- [ ] Builtin tools no longer import CLI command modules. +- [ ] Characterization tests pass. + +### Phase Q3 — Service/domain refactors + +Scope: + +- One lifecycle cluster at a time, chosen from the service domain audit. +- Start with the cluster that blocks dependency inversion or test clarity most. + +Acceptance: + +- [ ] Behavior is locked by characterization tests before moving code. +- [ ] Each extracted domain policy has a named owner and test. +- [ ] Complexity ignores shrink or have updated ledger justification. + +## Migration / risk + +The main risk is aesthetic refactoring that changes behavior or creates more +abstractions without reducing coupling. Refactors should be small enough to +review and should preserve public behavior unless a separate RFC says +otherwise. + +The second risk is over-indexing on cyclomatic complexity. Some orchestration +is inherently sequential and readable. A lower branch count is only a win if +the resulting names clarify invariants and failure modes. + +## Open questions + +- Which complexity metric should become a hard CI gate after the first cleanup + pass: Ruff C901, xenon rank, radon score, or a smaller custom import/size + check? +- Should `ergon_cli.composition` remain one module after descriptors are + introduced, or should it become a package with separate composition owners? +- Which naming changes are worth compatibility wrappers, and which can be + changed directly because they are branch-local implementation details? diff --git a/docs/rfcs/active/architecture-refactor-audit/README.md b/docs/rfcs/active/architecture-refactor-audit/README.md new file mode 100644 index 00000000..6b7eb5c3 --- /dev/null +++ b/docs/rfcs/active/architecture-refactor-audit/README.md @@ -0,0 +1,142 @@ +--- +status: active +opened: 2026-04-27 +author: GPT-5.5 +architecture_refs: + - docs/architecture/README.md + - docs/architecture/01_public_api.md + - docs/architecture/02_runtime_lifecycle.md + - docs/architecture/04_persistence.md + - docs/architecture/06_builtins.md + - docs/architecture/07_testing.md +supersedes: [] +superseded_by: null +--- + +# RFC: Architecture Refactor Audit + +## Problem + +Ergon has moved quickly enough that useful behavior now lives beside accidental +structure: direct package coupling, special-case composition branches, +duplicated setup logic, test-support leakage, and high-complexity orchestration +code. The immediate goal is not to redesign product behavior. It is to make the +existing behavior easier to understand, test, extend, and preserve. + +This RFC folder starts an audit-driven refactor program. It separates the work +into three lenses so each proposal can stay concrete: + +- [`01-dependency-inversion.md`](01-dependency-inversion.md) covers package + boundaries, public API shape, registry resolution, and cross-package imports. 
+- [`02-test-brittleness-and-gaps.md`](02-test-brittleness-and-gaps.md) covers + brittle tests, fixture boundaries, missing contract tests, and real-LLM/e2e + confidence gaps. +- [`03-code-quality.md`](03-code-quality.md) covers duplication, branch-heavy + example paths, excessive nesting, cyclomatic complexity, naming drift, and + file ownership. + +## Refactor rule + +Behavior stays the same unless a follow-up RFC explicitly changes it. The +program should first extract boundaries, name concepts, move code to better +owners, and add characterization tests around risky flows. Any behavioral +change discovered during cleanup should be split into a separate bug or RFC. + +## Target architecture principles + +1. **Core owns contracts, not default implementations.** `ergon_core` should + expose stable interfaces and runtime services; concrete benchmark, worker, + evaluator, model, and sandbox registrations should be injected through an + explicit composition boundary. +2. **Builtins are plugins, not runtime prerequisites.** `ergon_builtins` should + implement public contracts and provide a default registry bundle without + requiring core runtime imports to know about that bundle. +3. **CLI is an adapter.** `ergon_cli` should parse user input and call shared + application services. Agent tools and core runtime code should not depend on + CLI command modules. +4. **Tests are consumers of public contracts.** Test support may provide + fixtures, fake providers, and smoke registrations, but core code should not + branch on test identities or sentinel values. +5. **Complexity should be paid down near ownership boundaries.** Large + orchestration functions should be split by responsibility only when the + split clarifies invariants or makes behavior easier to test. + +## Proposal + +Adopt this RFC folder as the tracking document for an architecture audit. Each +child document should collect concrete findings, define the target shape, and +list candidate refactors in dependency order. Accepted follow-up RFCs and +implementation plans can then pull from these findings without turning this +folder into a single mega-plan. + +The initial work should prioritize: + +1. Dependency inversion and composition boundaries, because package coupling + makes every later cleanup harder. +2. Test brittleness and missing contract coverage, because behavior-preserving + refactors need confidence. +3. Code quality and complexity cleanup, because it benefits most after the + owning modules and contracts are clearer. + +## Invariants affected + +This audit does not change runtime invariants by itself. It may produce +follow-up RFCs that update: + +- `docs/architecture/01_public_api.md` if public API ownership changes. +- `docs/architecture/02_runtime_lifecycle.md` if runtime composition or task + orchestration boundaries change. +- `docs/architecture/06_builtins.md` if registry/plugin semantics change. +- `docs/architecture/07_testing.md` if test tier responsibilities change. + +## Migration + +No code migration is proposed in this folder directly. Migration guidance lives +inside each child audit document and should be converted into implementation +plans only after the target architecture is accepted. + +Before implementation, each refactor should have: + +- A characterization test or existing test reference for the behavior being + preserved. +- A clear package-boundary statement: what module owns the new abstraction and + which packages may import it. 
+- A rollback path if the refactor uncovers behavior that differs from the docs. + +## Alternatives considered + +### One giant architecture RFC + +This would be easy to create, but it would encourage broad, vague findings and +make acceptance difficult. Dependency inversion, tests, and code quality have +different audiences and different risk profiles. + +### Three unrelated top-level RFCs + +This would make each stream independently acceptable, but it would hide the +shared refactor goal. The folder keeps the audit cohesive while preserving +focused documents. + +### Immediate code cleanup without an audit + +This risks preserving the current accidental architecture under new names. +Because the goal is behavior-preserving refactor, the first deliverable should +be shared understanding and standards. + +## Open questions + +- Which package boundary should own registry resolution: core, a new + composition package, or the CLI/application layer? +- How much backward compatibility is required for current import paths inside + the repo? +- Should complexity thresholds become CI-enforced once the first cleanup pass + lands, or should they remain advisory until the major offenders are reduced? + +## On acceptance + +When this RFC folder is accepted: + +- Move the folder or accepted child docs under `docs/rfcs/accepted/`. +- Link the first implementation plan in `docs/superpowers/plans/`. +- Update affected architecture docs with any new import-boundary or testing + invariants. diff --git a/docs/rfcs/active/final-worker-output-source-of-truth.md b/docs/rfcs/active/final-worker-output-source-of-truth.md new file mode 100644 index 00000000..09495a56 --- /dev/null +++ b/docs/rfcs/active/final-worker-output-source-of-truth.md @@ -0,0 +1,177 @@ +# Final Worker Output Source of Truth + +_Sketch for treating `WorkerOutput` as the semantic final answer, rather than inferring it from context transcript events._ + +--- + +## Problem + +`ReActWorker.get_output()` currently reconstructs the worker's final output by reading persisted `RunContextEvent` rows and taking the last `assistant_text`, with a fallback that searches for a `final_result` tool call. That works, but it conflates three different concepts: + +- `assistant_text`: model text emitted during a generation turn +- `tool_call(final_result)`: PydanticAI's structured-output protocol +- `WorkerOutput`: the worker's final semantic result for the task execution + +The final answer should not be inferred from transcript shape. It should be the explicit output returned by the worker and persisted by the runtime. + +## Current State + +The codebase already has most of the right destination: + +- `WorkerOutput(output=..., success=..., metadata=...)` is the worker API's semantic final result. +- `worker_execute_fn()` receives the worker's `WorkerOutput` after `worker.get_output(worker_context)`. +- `WorkerExecuteResult.final_assistant_message` carries that value from `worker-execute` back to `task-execute`. +- `execute_task_fn()` passes `worker_result.final_assistant_message` into `FinalizeTaskExecutionCommand`. +- `TaskExecutionService.finalize_success()` persists it to `RunTaskExecution.final_assistant_message`. +- `RunTaskExecution` also has `output_json` for structured execution output metadata. + +So the persistence model already has a first-class execution-level field for the final assistant message. The weak part is upstream: `ReActWorker.get_output()` still computes that value by re-reading the context-event transcript. 
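+
+For contrast, the inference pattern being replaced looks roughly like this (a
+paraphrase of the behavior described above; the event type and its fields are
+schematic, not the actual implementation):
+
+```python
+from dataclasses import dataclass, field
+
+@dataclass
+class TranscriptEvent:  # schematic stand-in for persisted RunContextEvent rows
+    assistant_text: str | None = None
+    tool_name: str | None = None
+    tool_args: dict = field(default_factory=dict)
+
+def infer_final_output(events: list[TranscriptEvent]) -> str:
+    # Last assistant_text wins...
+    for event in reversed(events):
+        if event.assistant_text:
+            return event.assistant_text
+    # ...with a fallback that searches for a final_result tool call.
+    for event in reversed(events):
+        if event.tool_name == "final_result":
+            return str(event.tool_args.get("response", ""))
+    return ""
+```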
+ +## Desired Shape + +The runtime should treat final worker output as execution-level data, not as another transcript event. + +```text +worker.execute() yields GenerationTurn events + | + v +ContextEventRepository persists transcript evidence + | + v +worker.get_output() returns WorkerOutput + | + v +TaskExecutionService.finalize_success() persists execution result + | + v +RunTaskExecution.final_assistant_message / output_json are the source of truth +``` + +In this model: + +- `RunContextEvent` remains the append-only transcript log. +- `RunTaskExecution.final_assistant_message` is the final human-readable answer. +- `RunTaskExecution.output_json` can hold structured metadata from `WorkerOutput.metadata`. +- Rollout-card export reads both: context events for the trace, task execution fields for final execution outputs. + +## Proposed Contract + +`WorkerOutput` should be the only object that defines a worker's final semantic output. + +```python +class WorkerOutput(BaseModel): + output: str + success: bool = True + metadata: dict[str, Any] = Field(default_factory=dict) +``` + +The runtime should persist it as: + +```text +RunTaskExecution.final_assistant_message = WorkerOutput.output +RunTaskExecution.output_json = { + "worker_output": { + "success": WorkerOutput.success, + "metadata": WorkerOutput.metadata, + }, + "resource_ids": [...] +} +``` + +If we want the full `WorkerOutput` object available in exports, use `output_json["worker_output"]` rather than adding a new `RunContextEvent` type. + +## ReActWorker Implication + +`ReActWorker` should stop deriving output by querying `ContextEventRepository`. + +Instead, it should capture the structured final result while running the PydanticAI agent. The worker already configures: + +```python +agent: Agent[None, _AgentOutput] = Agent( + model=resolved.model, + instructions=self.system_prompt or None, + tools=self.tools, + output_type=_AgentOutput, +) +``` + +The final `_AgentOutput.final_assistant_message` should be stored on the worker instance during `execute()`, then returned directly from `get_output()`. + +Conceptually: + +```python +class ReActWorker(Worker): + def __init__(...): + ... + self._final_output: _AgentOutput | None = None + self._turn_count = 0 + + async def _run_agent(...): + async with agent.iter(...) as run: + ... + self._final_output = run.result.output + + def get_output(self, context: WorkerContext) -> WorkerOutput: + if self._final_output is None: + return WorkerOutput(output="", success=False) + return WorkerOutput( + output=self._final_output.final_assistant_message, + success=True, + metadata={ + "reasoning": self._final_output.reasoning, + "turn_count": self._turn_count, + }, + ) +``` + +The exact PydanticAI result access may differ, but the ownership is the important part: the worker returns the structured final result it received from the agent, rather than reconstructing it from persisted context events. + +## Why Not `final_agent_message` Context Events? + +A new context event type would make the transcript easier to query, but it blurs the abstraction boundary. + +`RunContextEvent` should answer: "What happened during the model/tool interaction?" + +`RunTaskExecution` should answer: "What did this worker execution finally produce?" + +The final output belongs to the second question. Mirroring it into a rollout-card export is useful; storing it as another transcript event is optional and should not be the source of truth. + +## Implementation Sketch + +1. 
Keep `ContextEventRepository` unchanged as the transcript serializer. +2. Update `WorkerExecuteResult` only if needed to carry `WorkerOutput.metadata`. +3. Update `FinalizeTaskExecutionCommand` to carry `worker_output_metadata` or a full `worker_output_json`. +4. Update `TaskExecutionService.finalize_success()` to persist: + - `final_assistant_message` + - `output_json["worker_output"]` + - existing `resource_ids` if present +5. Update `ReActWorker` to capture its PydanticAI structured result during execution. +6. Replace `ReActWorker._base_output()` with a simple read of the captured structured output. +7. Remove `_latest_final_result_message()` if no other worker needs it. +8. Update rollout-card export to include task execution final outputs from `RunTaskExecution`, not by scanning `RunContextEvent`. + +## Migration / Compatibility + +Existing completed runs may only have context events, so readers should remain tolerant: + +- Prefer `RunTaskExecution.final_assistant_message`. +- If absent, optionally fall back to the old transcript inference for legacy runs. +- Do not use the fallback in new execution paths. + +This preserves old data while making new runs explicit. + +## Tests + +Add focused tests for: + +- `ReActWorker.get_output()` returns the captured structured `_AgentOutput`, not the last `assistant_text`. +- A run with intermediate `assistant_text` plus final structured output persists the structured final output. +- `TaskExecutionService.finalize_success()` writes `final_assistant_message` and `output_json["worker_output"]`. +- Context event replay still reconstructs transcript messages without needing final-output semantics. +- Legacy read helpers fall back to transcript inference only when `RunTaskExecution.final_assistant_message` is missing. + +## Open Questions + +1. Should `WorkerExecuteResult` carry the full `WorkerOutput.metadata`, or should `worker_execute_fn()` persist it directly before returning? +2. Should `RunTaskExecution.output_json` store the full `WorkerOutput` shape, or only `metadata` plus resource references? +3. Should rollout-card export call this field `worker_output`, `execution_output`, or `final_worker_output`? diff --git a/docs/superpowers/plans/2026-04-26-finish-agent-workflow-cli.md b/docs/superpowers/plans/2026-04-26-finish-agent-workflow-cli.md index 23ad8eef..ffb11efd 100644 --- a/docs/superpowers/plans/2026-04-26-finish-agent-workflow-cli.md +++ b/docs/superpowers/plans/2026-04-26-finish-agent-workflow-cli.md @@ -97,4 +97,3 @@ uv run pytest tests/unit/runtime/test_workflow_service.py tests/unit/cli/test_wo uv run pytest tests/unit/runtime tests/unit/cli tests/unit/state -q pnpm --dir ergon-dashboard run typecheck ``` - diff --git a/docs/superpowers/plans/2026-04-27-frontend-evaluation-visibility.md b/docs/superpowers/plans/2026-04-27-frontend-evaluation-visibility.md new file mode 100644 index 00000000..76e79b86 --- /dev/null +++ b/docs/superpowers/plans/2026-04-27-frontend-evaluation-visibility.md @@ -0,0 +1,1390 @@ +# Frontend Evaluation Visibility Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add the evaluation feature set from the design brief to the dashboard: cohort rubric status pips, graph node rubric cues, skipped/error states, rubric metadata, richer evaluation drawer details, container roll-ups, and an evaluation lens. 
+ +**Implementation note:** The first implementation keeps the original API strategy: additive backend fields, frontend-derived run/container roll-ups, backend-owned cohort summaries, and stable `data-testid` coverage for cohort pips, graph rubric glyphs, the evaluation lens toggle, and criterion status details. + +**Architecture:** Keep the backend read model additive and make the frontend own presentation-specific selectors in a new `features/evaluations` domain. Enrich existing `GET /runs/{run_id}` and `GET /cohorts/{cohort_id}` payloads rather than introducing a new fetch path for the first implementation. Keep E2E assertions anchored to stable `data-testid` attributes and the backend harness DTO. + +**Tech Stack:** FastAPI, Pydantic DTOs, SQLModel persistence, Next.js App Router, React, TypeScript, Zod, React Flow, Playwright, pytest. + +--- + +## RFC + +### Problem + +The backend now produces enough evaluation data to validate task-level correctness, but the dashboard still treats evaluation as a narrow workspace tab. The design brief expects evaluation to be visible across the debugging loop: + +- Cohort rows show per-run rubric status pips and failure/skipped state at a glance. +- Graph nodes show which tasks have attached rubrics without requiring a click. +- Container nodes summarize evaluation status for their descendant tasks. +- The evaluation tab explains score composition, weights, skipped criteria, evaluator errors, input, feedback, and timing. +- Operators can switch the DAG into an evaluation lens that highlights evaluation-bearing tasks and dims unrelated work. + +### Non-Goals + +- Do not change evaluation execution semantics. +- Do not add interactive re-evaluation controls. +- Do not introduce a new standalone evaluation API service. +- Do not persist new relational tables unless the additive summary JSON fields prove insufficient. + +### Source Of Truth + +Use persisted `RunTaskEvaluation` rows and their typed `summary_json` as the source of truth. The frontend should not infer evaluation status from task status alone. It may derive roll-ups from evaluation rows and task parent/child relationships. + +### Nullability And Defaults Policy + +Avoid silent defaults at contract boundaries. If a field is owned by the backend and is required for rendering, make it required in the DTO and populate it explicitly in the builder. Use `None`/`null` only for genuinely absent data such as optional model reasoning, optional feedback, optional evaluation input, or optional error detail. In frontend derived state, represent "there is no evaluation evidence" as `null`, not as an all-zero roll-up object with a `"none"` sentinel. + +### API Strategy + +Use existing endpoints with additive fields: + +- `GET /runs/{run_id}` returns the enriched `RunSnapshotDto`. +- `GET /cohorts/{cohort_id}` returns enriched `CohortRunRowDto` rows with lightweight rubric status summaries. +- `GET /api/test/read/run/{run_id}/state` returns the expanded smoke harness fields used by Playwright. + +No existing response field should be removed or renamed. + +### Evaluation Status Semantics + +Use one canonical status vocabulary everywhere: + +```python +EvalCriterionStatus = Literal["passed", "failed", "errored", "skipped"] +RubricStatusSummaryStatus = Literal["passing", "failing", "errored", "skipped", "mixed", "none"] +``` + +Criterion status rules: + +- `errored`: `error` is non-null. 
+- `skipped`: criterion was part of the evaluator spec but did not execute because a prior gate failed or the attached task never reached the required lifecycle point. +- `passed`: criterion executed and `passed` is true. +- `failed`: criterion executed and `passed` is false. + +Roll-up status rules: + +- `none`: no evaluation rows or criteria. +- `errored`: at least one errored criterion. +- `failing`: at least one failed criterion and no errors. +- `mixed`: passed plus skipped criteria with no failed or errored criteria. +- `skipped`: all known criteria skipped. +- `passing`: all known criteria passed. + +### Backend Contract Additions + +Do not add parallel DTOs for data the run snapshot already exposes. The codebase already has: + +- `RunEvaluationCriterionDto` +- `RunTaskEvaluationDto` +- `RunSnapshotDto.evaluations_by_task` +- `CohortRunRowDto` + +The implementation should extend those existing DTOs in place. Graph glyphs, task roll-ups, container roll-ups, and run-level detail roll-ups should be derived in frontend selectors from `RunSnapshotDto.evaluations_by_task`. + +The only new backend DTO shape needed for the first implementation is a lightweight cohort-row rubric status summary, because the cohort page should show pips without fetching every run snapshot. The backend should own this summary, including counts and aggregate status. Keep the implementation direct: one compact builder over persisted `EvaluationSummary` rows, not a chain of helper functions or a second generic roll-up subsystem. + +Extend `ergon_core/ergon_core/core/api/schemas.py`: + +```python +from typing import Literal + +EvalCriterionStatus = Literal["passed", "failed", "errored", "skipped"] +``` + +Add fields to the existing `RunEvaluationCriterionDto` class: + +```python +class RunEvaluationCriterionDto(CamelModel): + # existing fields stay unchanged + criterion_name: str + status: EvalCriterionStatus + passed: bool + weight: float + contribution: float + model_reasoning: str | None = None + skipped_reason: str | None = None +``` + +Add fields to the existing `RunTaskEvaluationDto` class: + +```python +class RunTaskEvaluationDto(CamelModel): + # existing fields stay unchanged + evaluator_name: str + aggregation_rule: str +``` + +Add one lightweight DTO in `ergon_core/ergon_core/core/runtime/services/cohort_schemas.py`: + +```python +class CohortRubricStatusSummaryDto(BaseModel): + status: RubricStatusSummaryStatus + total_criteria: int + passed: int + failed: int + errored: int + skipped: int + criterion_statuses: list[str] + evaluator_names: list[str] + + +class CohortRunRowDto(BaseModel): + # existing fields stay unchanged + rubric_status_summary: CohortRubricStatusSummaryDto +``` + +### Frontend Contract Additions + +The generated REST contracts feed `ergon-dashboard/src/lib/contracts/rest.ts`. After regenerating contracts, normalize only fields that are genuinely optional on the backend contract. Do not use frontend defaults to hide missing required fields such as criterion `status`, criterion `weight`, evaluator name, aggregation rule, or cohort `rubric_status_summary`. 
+
+Add frontend-only derived roll-up types in `ergon-dashboard/src/features/evaluations/contracts.ts`; do not mirror them as run-snapshot backend DTOs:
+
+```ts
+export type EvalCriterionStatus = "passed" | "failed" | "errored" | "skipped";
+export type EvalRollupStatus = "passing" | "failing" | "errored" | "skipped" | "mixed";
+export type RubricStatusSummaryStatus = EvalRollupStatus | "none";
+
+export interface EvaluationRollup {
+  status: EvalRollupStatus;
+  totalCriteria: number;
+  passed: number;
+  failed: number;
+  errored: number;
+  skipped: number;
+  normalizedScore: number | null;
+  maxScore: number | null;
+  evaluatorNames: string[];
+  attachedTaskIds: string[];
+  criterionStatuses: EvalCriterionStatus[];
+}
+```
+
+Extend existing normalized REST types in `ergon-dashboard/src/lib/contracts/rest.ts`:
+
+```ts
+export interface RunEvaluationCriterion {
+  id: string;
+  stageNum: number;
+  stageName: string;
+  criterionNum: number;
+  criterionType: string;
+  criterionDescription: string;
+  criterionName: string;
+  status: EvalCriterionStatus;
+  passed: boolean;
+  weight: number;
+  contribution: number;
+  evaluationInput: string | null;
+  score: number;
+  maxScore: number;
+  feedback: string | null;
+  modelReasoning: string | null;
+  skippedReason: string | null;
+  evaluatedActionIds: string[];
+  evaluatedResourceIds: string[];
+  error: Record<string, unknown> | null;
+}
+```
+
+### Frontend Domain Boundary
+
+Create a focused evaluation domain:
+
+```text
+ergon-dashboard/src/features/evaluations/
+  contracts.ts
+  status.ts
+  selectors.ts
+  selectors.test.ts
+  components/
+    CriterionStatusPip.tsx
+    RubricStatusStrip.tsx
+    EvaluationNodeGlyph.tsx
+    EvaluationRollupBadge.tsx
+    EvaluationLensToggle.tsx
+    EvaluationCriterionCard.tsx
+    EvaluationMetadataSummary.tsx
+```
+
+Responsibilities:
+
+- `contracts.ts`: frontend-only types if the generated REST types are too broad for component props.
+- `status.ts`: colors, labels, icons, and ordering for evaluation statuses.
+- `selectors.ts`: pure roll-up helpers for run, task, container descendants, and cohort rows.
+- `components/*`: small visual components with stable `data-testid` attributes.
+
+### UX Contract
+
+Use these stable test IDs:
+
+- `cohort-eval-strip-{run_id}`
+- `cohort-eval-pip-{run_id}-{index}`
+- `graph-eval-glyph-{task_id}`
+- `graph-eval-rollup-{task_id}`
+- `graph-eval-lens-toggle`
+- `workspace-evaluation-metadata`
+- `workspace-evaluation-criterion-{criterion_id}`
+- `workspace-evaluation-criterion-status-{criterion_id}`
+- `workspace-evaluation-input-{criterion_id}`
+- `workspace-evaluation-reasoning-{criterion_id}`
+
+### Acceptance Criteria
+
+- Cohort run rows render a rubric status strip for runs with evaluations and an empty state for runs without evaluations.
+- Graph task nodes with attached evaluations render a subtle diamond glyph using text or CSS, with an accessible label.
+- Expanded graph containers render a roll-up badge computed from descendant task evaluations.
+- Evaluation lens dims non-evaluated tasks and highlights tasks with direct or descendant evaluation evidence.
+- Evaluation panel shows aggregation rule, weights, score contribution, status, input, feedback, model reasoning, skipped reasons, and error details.
+- Existing smoke specs assert happy-path passing pips, sad-path failed/skipped/errored visibility, graph glyphs, and the evaluation drawer.
+
+---
+
+## File Structure
+
+### Backend Files
+
+- Modify `ergon_core/ergon_core/core/api/schemas.py`: extend existing evaluation DTO fields only.
+- Modify `ergon_core/ergon_core/core/persistence/telemetry/evaluation_summary.py`: persist criterion `status`, optional `model_reasoning`, and optional `skipped_reason` in `summary_json`. +- Modify `ergon_core/ergon_core/core/runtime/services/evaluation_persistence_service.py`: build criterion status, contribution, and model reasoning from `CriterionResult.metadata`. +- Modify `ergon_core/ergon_core/core/api/runs.py`: pass enriched criterion fields through existing `evaluations_by_task`. +- Modify `ergon_core/ergon_core/core/runtime/services/run_read_service.py`: keep using existing `evaluations_by_task`; no new run-snapshot roll-up fields. +- Modify `ergon_core/ergon_core/core/runtime/services/cohort_schemas.py`: add `rubric_status_summary` to cohort run rows. +- Modify `ergon_core/ergon_core/core/runtime/services/cohort_service.py`: query run evaluations and attach a backend-owned rubric status summary. +- Modify `ergon_core/ergon_core/core/api/test_harness.py`: expose criterion statuses and a lightweight run rubric status summary to Playwright smoke tests. +- Test `tests/unit/runtime/test_evaluation_summary_contracts.py`: assert enriched summary fields. +- Test `tests/unit/runtime/test_cohort_rubric_status_summary.py`: assert cohort row rubric status summary. + +### Frontend Files + +- Regenerate `ergon-dashboard/src/generated/rest/contracts.ts` after backend schema updates. +- Modify `ergon-dashboard/src/lib/contracts/rest.ts`: normalize additive evaluation fields. +- Modify `ergon-dashboard/src/lib/types.ts`: export enriched evaluation aliases only. +- Modify `ergon-dashboard/src/lib/runState.ts`: deserialize enriched existing evaluations only. +- Create `ergon-dashboard/src/features/evaluations/status.ts`: central status display mapping. +- Create `ergon-dashboard/src/features/evaluations/selectors.ts`: pure derived state helpers. +- Test `ergon-dashboard/src/features/evaluations/selectors.test.ts`: assert direct and container roll-ups. +- Create `ergon-dashboard/src/features/evaluations/components/CriterionStatusPip.tsx`. +- Create `ergon-dashboard/src/features/evaluations/components/RubricStatusStrip.tsx`. +- Create `ergon-dashboard/src/features/evaluations/components/EvaluationNodeGlyph.tsx`. +- Create `ergon-dashboard/src/features/evaluations/components/EvaluationRollupBadge.tsx`. +- Create `ergon-dashboard/src/features/evaluations/components/EvaluationLensToggle.tsx`. +- Create `ergon-dashboard/src/features/evaluations/components/EvaluationCriterionCard.tsx`. +- Create `ergon-dashboard/src/features/evaluations/components/EvaluationMetadataSummary.tsx`. +- Modify `ergon-dashboard/src/components/cohorts/CohortDetailView.tsx`: render cohort run rubric status strips. +- Modify `ergon-dashboard/src/components/dag/TaskNode.tsx`: pass evaluation roll-up props. +- Modify `ergon-dashboard/src/features/graph/components/LeafNode.tsx`: render glyph and roll-up badge. +- Modify `ergon-dashboard/src/features/graph/components/ContainerNode.tsx`: render container roll-up badge. +- Modify `ergon-dashboard/src/components/dag/DAGCanvas.tsx`: add evaluation lens toggle and graph dimming behavior. +- Modify `ergon-dashboard/src/components/panels/EvaluationPanel.tsx`: render richer metadata and criterion cards. +- Modify `ergon-dashboard/tests/helpers/backendHarnessClient.ts`: expand backend harness DTO. +- Modify `ergon-dashboard/tests/e2e/_shared/smoke.ts`: assert the visible evaluation features. 
+
+---
+
+## Implementation Tasks
+
+### Task 1: Backend Evaluation Read Contract
+
+**Files:**
+- Modify: `ergon_core/ergon_core/core/persistence/telemetry/evaluation_summary.py`
+- Modify: `ergon_core/ergon_core/core/api/schemas.py`
+- Modify: `ergon_core/ergon_core/core/runtime/services/evaluation_persistence_service.py`
+- Test: `tests/unit/runtime/test_evaluation_summary_contracts.py`
+
+- [ ] **Step 1: Write failing summary contract tests**
+
+Add tests that prove the persistence DTO carries status, weights, contribution, and optional reasoning:
+
+```python
+def test_build_evaluation_summary_includes_status_weight_and_contribution() -> None:
+    result = _service_result(
+        criterion_score=0.5,
+        criterion_weight=2.0,
+        passed=False,
+        metadata={"model_reasoning": "missing supporting artifact"},
+    )
+
+    summary = build_evaluation_summary(result, evaluation_input="task evidence")
+
+    entry = summary.criterion_results[0]
+    assert entry.status == "failed"
+    assert entry.weight == 2.0
+    assert entry.contribution == 0.5
+    assert entry.model_reasoning == "missing supporting artifact"
+    assert entry.skipped_reason is None
+
+
+def test_dashboard_evaluation_dto_includes_criterion_status_fields() -> None:
+    summary = EvaluationSummary(
+        evaluator_name="post-root",
+        max_score=1.0,
+        normalized_score=1.0,
+        stages_evaluated=1,
+        stages_passed=1,
+        criterion_results=[
+            CriterionResultEntry(
+                criterion_name="timing",
+                criterion_type="smoke-post-root-timing-criterion",
+                criterion_description="post root timing",
+                stage_num=1,
+                stage_name="post-root",
+                criterion_num=1,
+                status="passed",
+                score=1.0,
+                max_score=1.0,
+                passed=True,
+                weight=1.0,
+                contribution=1.0,
+            )
+        ],
+    )
+
+    dto = build_dashboard_evaluation_dto(
+        evaluation_id=UUID("00000000-0000-0000-0000-000000000001"),
+        run_id=UUID("00000000-0000-0000-0000-000000000002"),
+        task_id=UUID("00000000-0000-0000-0000-000000000003"),
+        total_score=1.0,
+        created_at=datetime(2026, 4, 27, tzinfo=UTC),
+        summary=summary,
+    )
+
+    criterion = dto.criterion_results[0]
+    assert criterion.status == "passed"
+    assert criterion.passed is True
+    assert criterion.weight == 1.0
+    assert criterion.contribution == 1.0
+    assert dto.evaluator_name == "post-root"
+    assert dto.aggregation_rule == "weighted_sum"
+```
+
+- [ ] **Step 2: Run tests and verify failure**
+
+Run: `pytest tests/unit/runtime/test_evaluation_summary_contracts.py -q`
+
+Expected: failure mentioning missing fields such as `status`, `contribution`, or `evaluator_name`.
+
+- [ ] **Step 3: Add typed persistence fields**
+
+In `evaluation_summary.py`, extend `CriterionResultEntry`:
+
+```python
+class CriterionResultEntry(BaseModel):
+    """One criterion result as stored in the evaluation summary."""
+
+    criterion_name: str
+    criterion_type: str
+    stage_num: int
+    stage_name: str
+    criterion_num: int
+    status: Literal["passed", "failed", "errored", "skipped"]
+    score: float
+    max_score: float
+    passed: bool
+    weight: float
+    contribution: float
+    criterion_description: str
+    feedback: str | None = None
+    model_reasoning: str | None = None
+    skipped_reason: str | None = None
+    evaluation_input: str | None = None
+    evaluated_action_ids: list[str] = Field(default_factory=list)
+    evaluated_resource_ids: list[str] = Field(default_factory=list)
+    error: dict | None = None
+```
+
+- [ ] **Step 4: Add DTO fields**
+
+In `schemas.py`, update `RunEvaluationCriterionDto` and `RunTaskEvaluationDto` with the RFC contract fields.
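+
+Concretely, the additions repeat the RFC contract from earlier in this plan:
+
+```python
+class RunEvaluationCriterionDto(CamelModel):
+    # existing fields stay unchanged
+    criterion_name: str
+    status: EvalCriterionStatus
+    passed: bool
+    weight: float
+    contribution: float
+    model_reasoning: str | None = None
+    skipped_reason: str | None = None
+
+
+class RunTaskEvaluationDto(CamelModel):
+    # existing fields stay unchanged
+    evaluator_name: str
+    aggregation_rule: str
+```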
+
+- [ ] **Step 5: Build status and metadata in persistence**
+
+In `evaluation_persistence_service.py`, add a helper:
+
+```python
+def _criterion_status(*, passed: bool, error: dict | None) -> str:
+    if error is not None:
+        return "errored"
+    return "passed" if passed else "failed"
+```
+
+Then populate the entry, passing the criterion's error payload through so the `errored` status can actually surface (this assumes the criterion result exposes an optional error dict; adjust to the real field name during implementation):
+
+```python
+metadata = cr.metadata
+model_reasoning = metadata.get("model_reasoning")
+entries.append(
+    CriterionResultEntry(
+        criterion_name=cr.name,
+        criterion_type=spec.criterion.type_slug,
+        criterion_description=spec.criterion.name,
+        stage_num=spec.stage_idx,
+        stage_name=spec.stage_name,
+        criterion_num=spec.criterion_idx,
+        status=_criterion_status(passed=cr.passed, error=cr.error),
+        score=cr.score,
+        max_score=spec.max_score,
+        passed=cr.passed,
+        weight=cr.weight,
+        contribution=cr.score,
+        feedback=cr.feedback,
+        model_reasoning=model_reasoning if isinstance(model_reasoning, str) else None,
+        evaluation_input=evaluation_input,
+        error=cr.error,
+    )
+)
+```
+
+- [ ] **Step 6: Run tests and verify pass**
+
+Run: `pytest tests/unit/runtime/test_evaluation_summary_contracts.py -q`
+
+Expected: all tests pass.
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add ergon_core/ergon_core/core/persistence/telemetry/evaluation_summary.py ergon_core/ergon_core/core/api/schemas.py ergon_core/ergon_core/core/runtime/services/evaluation_persistence_service.py tests/unit/runtime/test_evaluation_summary_contracts.py
+git commit -m "feat: enrich evaluation read contract"
+```
+
+### Task 2: Backend Cohort Rubric Status Summary
+
+**Files:**
+- Modify: `ergon_core/ergon_core/core/api/runs.py`
+- Modify: `ergon_core/ergon_core/core/runtime/services/run_read_service.py`
+- Modify: `ergon_core/ergon_core/core/runtime/services/cohort_schemas.py`
+- Modify: `ergon_core/ergon_core/core/runtime/services/cohort_service.py`
+- Modify: `ergon_core/ergon_core/core/api/test_harness.py`
+- Test: `tests/unit/runtime/test_cohort_rubric_status_summary.py`
+
+- [ ] **Step 1: Write failing cohort rubric summary tests**
+
+Create `tests/unit/runtime/test_cohort_rubric_status_summary.py`:
+
+```python
+def test_cohort_run_row_includes_rubric_status_summary(session: Session) -> None:
+    cohort, run, node = _persist_run_with_one_failed_evaluation(session)
+
+    detail = experiment_cohort_service.get_detail(cohort.id)
+
+    assert detail is not None
+    row = detail.runs[0]
+    assert row.rubric_status_summary.status == "failing"
+    assert row.rubric_status_summary.total_criteria == 1
+    assert row.rubric_status_summary.failed == 1
+    assert row.rubric_status_summary.criterion_statuses == ["failed"]
+```
+
+- [ ] **Step 2: Run tests and verify failure**
+
+Run:
+
+```bash
+pytest tests/unit/runtime/test_cohort_rubric_status_summary.py -q
+```
+
+Expected: missing `rubric_status_summary` field or summary builder.
+
+- [ ] **Step 3: Implement one compact rubric summary builder**
+
+Add one private helper in `cohort_service.py`.
Use `Counter` so the code says what it is doing without a separate status helper: + +```python +from collections import Counter + + +def _rubric_status_summary( + summaries: list[EvaluationSummary], +) -> CohortRubricStatusSummaryDto: + statuses = [ + criterion.status + for summary in summaries + for criterion in summary.criterion_results + ] + counts = Counter(statuses) + + if not statuses: + status = "none" + elif counts["errored"]: + status = "errored" + elif counts["failed"]: + status = "failing" + elif counts["passed"] and counts["skipped"]: + status = "mixed" + elif counts["skipped"] == len(statuses): + status = "skipped" + else: + status = "passing" + + return CohortRubricStatusSummaryDto( + status=status, + total_criteria=len(statuses), + passed=counts["passed"], + failed=counts["failed"], + errored=counts["errored"], + skipped=counts["skipped"], + criterion_statuses=statuses, + evaluator_names=sorted({summary.evaluator_name for summary in summaries}), + ) +``` + +- [ ] **Step 4: Attach cohort row rubric summary** + +In `cohort_service.py`, query `RunTaskEvaluation` for cohort runs, group by `run_id`, convert `summary_json` to `EvaluationSummary`, and pass `rubric_status_summary` into `_build_run_row`. + +- [ ] **Step 5: Expand test harness state** + +In `test_harness.py`, add these fields to the run state JSON: + +```json +{ + "rubric_status_summary": { + "status": "passing", + "total_criteria": 2, + "passed": 2, + "failed": 0, + "errored": 0, + "skipped": 0 + }, + "evaluations": [ + { + "task_id": "node-uuid", + "task_slug": "d_root", + "score": 1.0, + "reason": "root timing marker criterion ran", + "criterion_statuses": ["passed"], + "evaluator_name": "post-root" + } + ] +} +``` + +- [ ] **Step 6: Run backend tests** + +Run: + +```bash +pytest tests/unit/runtime/test_evaluation_summary_contracts.py tests/unit/runtime/test_cohort_rubric_status_summary.py -q +``` + +Expected: all selected tests pass. + +- [ ] **Step 7: Commit** + +```bash +git add ergon_core/ergon_core/core/api/runs.py ergon_core/ergon_core/core/runtime/services/run_read_service.py ergon_core/ergon_core/core/runtime/services/cohort_schemas.py ergon_core/ergon_core/core/runtime/services/cohort_service.py ergon_core/ergon_core/core/api/test_harness.py tests/unit/runtime/test_cohort_rubric_status_summary.py +git commit -m "feat: expose cohort rubric status summary" +``` + +### Task 3: Frontend Contracts And Evaluation Selectors + +**Files:** +- Modify: `ergon-dashboard/src/generated/rest/contracts.ts` +- Modify: `ergon-dashboard/src/lib/contracts/rest.ts` +- Modify: `ergon-dashboard/src/lib/types.ts` +- Modify: `ergon-dashboard/src/lib/runState.ts` +- Create: `ergon-dashboard/src/features/evaluations/contracts.ts` +- Create: `ergon-dashboard/src/features/evaluations/status.ts` +- Create: `ergon-dashboard/src/features/evaluations/selectors.ts` +- Test: `ergon-dashboard/src/features/evaluations/selectors.test.ts` + +- [ ] **Step 1: Regenerate REST contracts** + +Run the repository's existing OpenAPI generation command. If the command is not documented, inspect `package.json` scripts and use the local script rather than hand-editing generated files. + +Expected: `src/generated/rest/contracts.ts` includes the new evaluation fields. 
+ +- [ ] **Step 2: Write selector tests** + +Create `selectors.test.ts`: + +```ts +import { describe, expect, it } from "vitest"; +import { buildContainerEvaluationRollup, isEvaluationBearingTask } from "./selectors"; +import type { EvaluationRollup } from "./contracts"; +import type { TaskState, WorkflowRunState } from "@/lib/types"; + +function evaluation(status: "passed" | "failed" | "errored" | "skipped") { + return { + id: `evaluation-${status}`, + evaluatorName: "default", + totalScore: status === "passed" ? 1 : 0, + maxScore: 1, + normalizedScore: status === "passed" ? 1 : 0, + criterionResults: [{ id: `criterion-${status}`, status, score: status === "passed" ? 1 : 0, maxScore: 1 }], + }; +} + +it("detects tasks with direct evaluation evidence", () => { + const task = { id: "a", childIds: [] } as TaskState; + const state = { + evaluationsByTask: new Map([["a", evaluation("passed")]]), + } as unknown as WorkflowRunState; + + expect(isEvaluationBearingTask(state, task)).toBe(true); +}); + +it("rolls descendant evaluation failures up to a container", () => { + const state = { + tasks: new Map([ + ["root", { id: "root", childIds: ["a", "b"] }], + ["a", { id: "a", childIds: [] }], + ["b", { id: "b", childIds: [] }], + ]), + evaluationsByTask: new Map([ + ["a", evaluation("passed")], + ["b", evaluation("failed")], + ]), + } as unknown as WorkflowRunState; + + expect(buildContainerEvaluationRollup(state, "root").status).toBe("failing"); +}); +``` + +- [ ] **Step 3: Run selector tests and verify failure** + +Run: `cd ergon-dashboard && npm test -- features/evaluations/selectors.test.ts` + +Expected: failure because files/types are missing. + +- [ ] **Step 4: Add frontend evaluation contracts and status mapping** + +Create `contracts.ts`: + +```ts +export type EvalCriterionStatus = "passed" | "failed" | "errored" | "skipped"; +export type EvalRollupStatus = "passing" | "failing" | "errored" | "skipped" | "mixed"; +export type RubricStatusSummaryStatus = EvalRollupStatus | "none"; + +export interface EvaluationRollup { + status: EvalRollupStatus; + totalCriteria: number; + passed: number; + failed: number; + errored: number; + skipped: number; + normalizedScore: number | null; + maxScore: number | null; + evaluatorNames: string[]; + attachedTaskIds: string[]; + criterionStatuses: EvalCriterionStatus[]; +} +``` + +Create `status.ts`: + +```ts +import type { EvalCriterionStatus, EvalRollupStatus } from "./contracts"; + +export const EVALUATION_STATUS_LABEL: Record = { + passing: "Passing", + failing: "Failing", + errored: "Errored", + skipped: "Skipped", + mixed: "Mixed", +}; + +export const CRITERION_STATUS_LABEL: Record = { + passed: "Passed", + failed: "Failed", + errored: "Errored", + skipped: "Skipped", +}; + +export function evaluationStatusTone(status: EvalRollupStatus): string { + switch (status) { + case "passing": + return "oklch(0.70 0.13 155)"; + case "failing": + return "oklch(0.68 0.18 22)"; + case "errored": + return "oklch(0.62 0.18 35)"; + case "skipped": + return "oklch(0.65 0.03 250)"; + case "mixed": + return "oklch(0.72 0.12 85)"; + } +} +``` + +- [ ] **Step 5: Add frontend selectors** + +Create `selectors.ts`: + +```ts +import type { TaskEvaluationState, TaskState, WorkflowRunState } from "@/lib/types"; +import type { EvalRollupStatus, EvaluationRollup } from "./contracts"; + +export function isEvaluationBearingTask(state: WorkflowRunState, task: TaskState): boolean { + return buildContainerEvaluationRollup(state, task.id) !== null; +} + +function combineStatus(statuses: 
EvalRollupStatus[]): EvalRollupStatus {
+  if (statuses.includes("errored")) return "errored";
+  if (statuses.includes("failing")) return "failing";
+  if (statuses.includes("mixed")) return "mixed";
+  if (statuses.includes("skipped") && statuses.includes("passing")) return "mixed";
+  if (statuses.every((status) => status === "skipped")) return "skipped";
+  if (statuses.every((status) => status === "passing")) return "passing";
+  return "mixed";
+}
+
+function evaluationToRollup(evaluation: TaskEvaluationState | undefined): EvaluationRollup | null {
+  if (!evaluation) return null;
+  const statuses = evaluation.criterionResults.map((criterion) => criterion.status);
+  if (statuses.length === 0) return null;
+  const passed = statuses.filter((status) => status === "passed").length;
+  const failed = statuses.filter((status) => status === "failed").length;
+  const errored = statuses.filter((status) => status === "errored").length;
+  const skipped = statuses.filter((status) => status === "skipped").length;
+  return {
+    status: combineStatus(
+      statuses.map((status) =>
+        status === "passed" ? "passing" : status === "failed" ? "failing" : status === "errored" ? "errored" : "skipped",
+      ),
+    ),
+    totalCriteria: statuses.length,
+    passed,
+    failed,
+    errored,
+    skipped,
+    normalizedScore: evaluation.normalizedScore,
+    maxScore: evaluation.maxScore,
+    evaluatorNames: [evaluation.evaluatorName],
+    attachedTaskIds: evaluation.taskId ? [evaluation.taskId] : [],
+    criterionStatuses: statuses,
+  };
+}
+
+export function buildContainerEvaluationRollup(state: WorkflowRunState, taskId: string): EvaluationRollup | null {
+  const task = state.tasks.get(taskId);
+  if (!task) return null;
+
+  const direct = evaluationToRollup(state.evaluationsByTask.get(taskId));
+  const childRollups = task.childIds.map((childId) => buildContainerEvaluationRollup(state, childId));
+  const rollups = [direct, ...childRollups].filter(
+    (rollup): rollup is EvaluationRollup => rollup !== null,
+  );
+
+  if (rollups.length === 0) return null;
+
+  const totalCriteria = rollups.reduce((sum, rollup) => sum + rollup.totalCriteria, 0);
+  // Scores are nullable on EvaluationRollup; treat a missing score as zero
+  // weight so a score-less child cannot poison the container roll-up.
+  const maxScore = rollups.reduce((sum, rollup) => sum + (rollup.maxScore ?? 0), 0);
+  const weightedScore = rollups.reduce(
+    (sum, rollup) => sum + (rollup.normalizedScore ?? 0) * (rollup.maxScore ?? 0),
+    0,
+  );
+
+  return {
+    status: combineStatus(rollups.map((rollup) => rollup.status)),
+    totalCriteria,
+    passed: rollups.reduce((sum, rollup) => sum + rollup.passed, 0),
+    failed: rollups.reduce((sum, rollup) => sum + rollup.failed, 0),
+    errored: rollups.reduce((sum, rollup) => sum + rollup.errored, 0),
+    skipped: rollups.reduce((sum, rollup) => sum + rollup.skipped, 0),
+    // Guard the division: a container whose descendants carry no max score
+    // has no meaningful normalized score, which the contract models as null.
+    normalizedScore: maxScore > 0 ? weightedScore / maxScore : null,
+    maxScore,
+    evaluatorNames: Array.from(new Set(rollups.flatMap((rollup) => rollup.evaluatorNames))).sort(),
+    attachedTaskIds: Array.from(new Set(rollups.flatMap((rollup) => rollup.attachedTaskIds))).sort(),
+    criterionStatuses: rollups.flatMap((rollup) => rollup.criterionStatuses),
+  };
+}
+```
+
+- [ ] **Step 6: Normalize contracts and run state**
+
+In `rest.ts`, require the enriched existing evaluation fields (`criterionName`, `status`, `passed`, `weight`, `contribution`, `evaluatorName`, `aggregationRule`) to be present after contract generation. Normalize only genuinely nullable fields (`modelReasoning`, `skippedReason`, `feedback`, `evaluationInput`, `error`) to `null`.
In `runState.ts`, continue deserializing `evaluationsByTask`; do not add `taskEvaluationRollups` or `runEvaluationRollup` to `WorkflowRunState`.
+
+- [ ] **Step 7: Run frontend tests**
+
+Run: `cd ergon-dashboard && npm test -- features/evaluations/selectors.test.ts`
+
+Expected: tests pass.
+
+- [ ] **Step 8: Commit**
+
+```bash
+git add ergon-dashboard/src/generated/rest/contracts.ts ergon-dashboard/src/lib/contracts/rest.ts ergon-dashboard/src/lib/types.ts ergon-dashboard/src/lib/runState.ts ergon-dashboard/src/features/evaluations/contracts.ts ergon-dashboard/src/features/evaluations/status.ts ergon-dashboard/src/features/evaluations/selectors.ts ergon-dashboard/src/features/evaluations/selectors.test.ts
+git commit -m "feat: add frontend evaluation state domain"
+```
+
+### Task 4: Cohort Rubric Status Strips
+
+**Files:**
+- Create: `ergon-dashboard/src/features/evaluations/components/CriterionStatusPip.tsx`
+- Create: `ergon-dashboard/src/features/evaluations/components/RubricStatusStrip.tsx`
+- Modify: `ergon-dashboard/src/components/cohorts/CohortDetailView.tsx`
+- Test: `ergon-dashboard/tests/e2e/_shared/smoke.ts`
+
+- [ ] **Step 1: Add Playwright assertion first**
+
+In the cohort index test in `smoke.ts`, assert every run row has a strip:
+
+```ts
+for (const { run_id } of cohort) {
+  await expect(page.getByTestId(`cohort-eval-strip-${run_id}`)).toBeVisible();
+  await expect(page.locator(`[data-testid^="cohort-eval-pip-${run_id}-"]`).first()).toBeVisible();
+}
+```
+
+- [ ] **Step 2: Run Playwright smoke locally against an existing smoke stack**
+
+Run the narrow Playwright command used by the current E2E workflow for one benchmark.
+
+Expected: failure because the rubric status strip test IDs do not exist.
+
+- [ ] **Step 3: Create `CriterionStatusPip`**
+
+```tsx
+import type { EvalCriterionStatus } from "@/features/evaluations/contracts";
+import { CRITERION_STATUS_LABEL, evaluationStatusTone } from "@/features/evaluations/status";
+
+const rollupStatusByCriterion: Record<EvalCriterionStatus, Parameters<typeof evaluationStatusTone>[0]> = {
+  passed: "passing",
+  failed: "failing",
+  errored: "errored",
+  skipped: "skipped",
+};
+
+export function CriterionStatusPip({
+  status,
+  testId,
+}: {
+  status: EvalCriterionStatus;
+  testId?: string;
+}) {
+  return (
+    <span
+      data-testid={testId}
+      aria-label={CRITERION_STATUS_LABEL[status]}
+      title={CRITERION_STATUS_LABEL[status]}
+      style={{
+        display: "inline-block",
+        width: 8,
+        height: 8,
+        borderRadius: 9999,
+        backgroundColor: evaluationStatusTone(rollupStatusByCriterion[status]),
+      }}
+    />
+  );
+}
+```
+
+- [ ] **Step 4: Create `RubricStatusStrip`**
+
+```tsx
+import type { EvalCriterionStatus } from "@/features/evaluations/contracts";
+import type { CohortRunRow } from "@/lib/types";
+import { CriterionStatusPip } from "./CriterionStatusPip";
+
+export function RubricStatusStrip({
+  runId,
+  summary,
+}: {
+  runId: string;
+  summary: CohortRunRow["rubric_status_summary"];
+}) {
+  const statuses = summary.criterion_statuses;
+
+  return (
+    <div data-testid={`cohort-eval-strip-${runId}`}>
+      <span>Rubric</span>
+      {statuses.length === 0 ? (
+        <span>No criteria</span>
+      ) : (
+        <div>
+          {statuses.map((status, index) => (
+            <CriterionStatusPip
+              key={`${runId}-${index}`}
+              status={status as EvalCriterionStatus}
+              testId={`cohort-eval-pip-${runId}-${index}`}
+            />
+          ))}
+        </div>
+      )}
+    </div>
+  );
+}
+```
+
+- [ ] **Step 5: Render strip in cohort rows**
+
+In `CohortRunRowCard`, render:
+
+```tsx
+<RubricStatusStrip runId={run.run_id} summary={run.rubric_status_summary} />
+```
+
+Place it under the cohort/run ID metadata so it is visible without widening the grid.
+
+- [ ] **Step 6: Run frontend and E2E checks**
+
+Run:
+
+```bash
+cd ergon-dashboard && npm test -- features/evaluations/selectors.test.ts
+```
+
+Then run the narrow Playwright smoke command.
+
+Expected: selector tests pass and Playwright sees cohort rubric status strips.
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add ergon-dashboard/src/features/evaluations/components/CriterionStatusPip.tsx ergon-dashboard/src/features/evaluations/components/RubricStatusStrip.tsx ergon-dashboard/src/components/cohorts/CohortDetailView.tsx ergon-dashboard/tests/e2e/_shared/smoke.ts
+git commit -m "feat: show cohort rubric status"
+```
+
+### Task 5: Graph Glyphs, Container Roll-Ups, And Evaluation Lens
+
+**Files:**
+- Create: `ergon-dashboard/src/features/evaluations/components/EvaluationNodeGlyph.tsx`
+- Create: `ergon-dashboard/src/features/evaluations/components/EvaluationRollupBadge.tsx`
+- Create: `ergon-dashboard/src/features/evaluations/components/EvaluationLensToggle.tsx`
+- Modify: `ergon-dashboard/src/components/dag/TaskNode.tsx`
+- Modify: `ergon-dashboard/src/features/graph/components/LeafNode.tsx`
+- Modify: `ergon-dashboard/src/features/graph/components/ContainerNode.tsx`
+- Modify: `ergon-dashboard/src/components/dag/DAGCanvas.tsx`
+- Test: `ergon-dashboard/tests/e2e/_shared/smoke.ts`
+
+- [ ] **Step 1: Add Playwright graph assertions first**
+
+In `assertRunWorkspace`, after selecting an evaluated task:
+
+```ts
+if (evaluatedTaskIds.has(selected.id)) {
+  await expect(page.getByTestId(`graph-eval-glyph-${selected.id}`)).toBeVisible();
+}
+await expect(page.getByTestId("graph-eval-lens-toggle")).toBeVisible();
+await page.getByTestId("graph-eval-lens-toggle").click();
+await expect(page.getByTestId("graph-canvas")).toHaveAttribute("data-eval-lens", "on");
+```
+
+- [ ] **Step 2: Run Playwright and verify failure**
+
+Expected: missing glyph/toggle test IDs.
+
+- [ ] **Step 3: Create graph evaluation components**
+
+`EvaluationNodeGlyph.tsx`:
+
+```tsx
+import type { EvaluationRollup } from "@/features/evaluations/contracts";
+import { EVALUATION_STATUS_LABEL, evaluationStatusTone } from "@/features/evaluations/status";
+
+export function EvaluationNodeGlyph({
+  taskId,
+  rollup,
+}: {
+  taskId: string;
+  rollup: EvaluationRollup;
+}) {
+  return (
+    <span
+      data-testid={`graph-eval-glyph-${taskId}`}
+      role="img"
+      aria-label={`Evaluation ${EVALUATION_STATUS_LABEL[rollup.status]}`}
+      style={{ color: evaluationStatusTone(rollup.status) }}
+    >
+      ◇
+    </span>
+  );
+}
+```
+
+`EvaluationRollupBadge.tsx`:
+
+```tsx
+import type { EvaluationRollup } from "@/features/evaluations/contracts";
+import { EVALUATION_STATUS_LABEL, evaluationStatusTone } from "@/features/evaluations/status";
+
+export function EvaluationRollupBadge({
+  taskId,
+  rollup,
+}: {
+  taskId: string;
+  rollup: EvaluationRollup;
+}) {
+  return (
+    <span
+      data-testid={`graph-eval-rollup-${taskId}`}
+      style={{ color: evaluationStatusTone(rollup.status) }}
+    >
+      {EVALUATION_STATUS_LABEL[rollup.status]} · {rollup.totalCriteria}
+    </span>
+  );
+}
+```
+
+`EvaluationLensToggle.tsx`:
+
+```tsx
+export function EvaluationLensToggle({
+  enabled,
+  onToggle,
+}: {
+  enabled: boolean;
+  onToggle: () => void;
+}) {
+  return (
+    <button
+      type="button"
+      data-testid="graph-eval-lens-toggle"
+      aria-pressed={enabled}
+      onClick={onToggle}
+    >
+      Eval lens
+    </button>
+  );
+}
+```
+
+- [ ] **Step 4: Pass roll-ups through React Flow node data**
+
+Extend `TaskNodeData`:
+
+```ts
+evaluationRollup?: EvaluationRollup;
+evalLensEnabled?: boolean;
+```
+
+When building React Flow nodes in `DAGCanvas.tsx`, set:
+
+```ts
+const evaluationRollup = buildContainerEvaluationRollup(runState, task.id);
+const evalBearing = evaluationRollup !== null;
+data: {
+  task,
+  evaluationRollup,
+  evalLensEnabled,
+  dimmed: evalLensEnabled ? !evalBearing : isSearchDimmed,
+}
+```
+
+- [ ] **Step 5: Render glyphs and roll-ups in nodes**
+
+In `LeafNode.tsx`, render `EvaluationNodeGlyph` near the title for direct task evaluations and `EvaluationRollupBadge` if there are multiple criteria.
+
+In `ContainerNode.tsx`, render `EvaluationRollupBadge` in the header row next to the child count.
+
+- [ ] **Step 6: Add lens toggle to DAG controls**
+
+In `DAGCanvas.tsx`, keep:
+
+```ts
+const [evalLensEnabled, setEvalLensEnabled] = useState(false);
+```
+
+Render `EvaluationLensToggle` in the floating control card area and set:
+
+```tsx
+<div data-testid="graph-canvas" data-eval-lens={evalLensEnabled ? "on" : "off"}>
+```
+
+- [ ] **Step 7: Run focused frontend tests and Playwright**
+
+Run:
+
+```bash
+cd ergon-dashboard && npm test -- features/evaluations/selectors.test.ts
+```
+
+Run the narrow Playwright smoke command.
+
+Expected: graph glyph and lens assertions pass.
+
+- [ ] **Step 8: Commit**
+
+```bash
+git add ergon-dashboard/src/features/evaluations/components/EvaluationNodeGlyph.tsx ergon-dashboard/src/features/evaluations/components/EvaluationRollupBadge.tsx ergon-dashboard/src/features/evaluations/components/EvaluationLensToggle.tsx ergon-dashboard/src/components/dag/TaskNode.tsx ergon-dashboard/src/features/graph/components/LeafNode.tsx ergon-dashboard/src/features/graph/components/ContainerNode.tsx ergon-dashboard/src/components/dag/DAGCanvas.tsx ergon-dashboard/tests/e2e/_shared/smoke.ts
+git commit -m "feat: add evaluation graph lens"
+```
+
+### Task 6: Rich Evaluation Workspace Panel
+
+**Files:**
+- Create: `ergon-dashboard/src/features/evaluations/components/EvaluationCriterionCard.tsx`
+- Create: `ergon-dashboard/src/features/evaluations/components/EvaluationMetadataSummary.tsx`
+- Modify: `ergon-dashboard/src/components/panels/EvaluationPanel.tsx`
+- Test: `ergon-dashboard/tests/e2e/_shared/smoke.ts`
+
+- [ ] **Step 1: Add Playwright drawer assertions first**
+
+In `assertRunWorkspace`, inside the evaluation tab branch for evaluated tasks:
+
+```ts
+await expect(page.getByTestId("workspace-evaluation-metadata")).toBeVisible();
+await expect(page.locator('[data-testid^="workspace-evaluation-criterion-"]').first()).toBeVisible();
+await expect(page.locator('[data-testid^="workspace-evaluation-criterion-status-"]').first()).toBeVisible();
+await expect(page.locator('[data-testid^="workspace-evaluation-input-"]').first()).toBeVisible();
+```
+
+- [ ] **Step 2: Run Playwright and verify failure**
+
+Expected: metadata and criterion card test IDs missing.
+
+- [ ] **Step 3: Create `EvaluationMetadataSummary`**
+
+```tsx
+import type { TaskEvaluationState } from "@/lib/types";
+
+export function EvaluationMetadataSummary({ evaluation }: { evaluation: TaskEvaluationState }) {
+  return (
+    <div data-testid="workspace-evaluation-metadata">
+      <div>
+        <div>Evaluator</div>
+        <div>{evaluation.evaluatorName}</div>
+      </div>
+      <div>
+        <div>Aggregation</div>
+        <div>{evaluation.aggregationRule}</div>
+      </div>
+      <div>
+        <div>Score</div>
+        <div>
+          {evaluation.totalScore.toFixed(2)} / {evaluation.maxScore.toFixed(2)}
+        </div>
+      </div>
+      <div>
+        <div>Stages</div>
+        <div>
+          {evaluation.stagesPassed} / {evaluation.stagesEvaluated} passed
+        </div>
+      </div>
+    </div>
+  );
+}
+```
+
+- [ ] **Step 4: Create `EvaluationCriterionCard`**
+
+```tsx
+import type { EvaluationCriterionState } from "@/lib/types";
+import { CRITERION_STATUS_LABEL, evaluationStatusTone } from "@/features/evaluations/status";
+
+export function EvaluationCriterionCard({ criterion }: { criterion: EvaluationCriterionState }) {
+  const tone = evaluationStatusTone(
+    criterion.status === "passed"
+      ? "passing"
+      : criterion.status === "failed"
+        ? "failing"
+        : criterion.status === "errored"
+          ? "errored"
+          : "skipped",
+  );
+
+  return (
+    <div data-testid={`workspace-evaluation-criterion-${criterion.id}`}>
+      <div>
+        <div>
+          <div>{criterion.criterionDescription}</div>
+          <div>
+            {criterion.stageName} · weight {criterion.weight.toFixed(2)} · contribution {criterion.contribution.toFixed(2)}
+          </div>
+        </div>
+        <span
+          data-testid={`workspace-evaluation-criterion-status-${criterion.id}`}
+          style={{ color: tone }}
+        >
+          {CRITERION_STATUS_LABEL[criterion.status]}
+        </span>
+      </div>
+
+      {criterion.evaluationInput && (
+        <div data-testid={`workspace-evaluation-input-${criterion.id}`}>
+          {criterion.evaluationInput}
+        </div>
+      )}
+
+      {criterion.feedback && <div>{criterion.feedback}</div>}
+
+      {criterion.modelReasoning && (
+        <div data-testid={`workspace-evaluation-reasoning-${criterion.id}`}>
+          {criterion.modelReasoning}
+        </div>
+      )}
+
+      {criterion.skippedReason && <div>{criterion.skippedReason}</div>}
+
+      {criterion.error && (
+        <pre>
+          {JSON.stringify(criterion.error, null, 2)}
+        </pre>
+      )}
+    </div>
+  );
+}
+```
+
+- [ ] **Step 5: Replace the current criterion map in `EvaluationPanel`**
+
+Keep existing empty state behavior, but render:
+
+```tsx
+<EvaluationMetadataSummary evaluation={evaluation} />
+<div>
+  {evaluation.criterionResults.map((criterion) => (
+    <EvaluationCriterionCard key={criterion.id} criterion={criterion} />
+  ))}
+</div>
+``` + +- [ ] **Step 6: Run frontend and E2E checks** + +Run: + +```bash +cd ergon-dashboard && npm test -- features/evaluations/selectors.test.ts +``` + +Run the narrow Playwright smoke command. + +Expected: evaluation workspace assertions pass. + +- [ ] **Step 7: Commit** + +```bash +git add ergon-dashboard/src/features/evaluations/components/EvaluationCriterionCard.tsx ergon-dashboard/src/features/evaluations/components/EvaluationMetadataSummary.tsx ergon-dashboard/src/components/panels/EvaluationPanel.tsx ergon-dashboard/tests/e2e/_shared/smoke.ts +git commit -m "feat: enrich evaluation workspace panel" +``` + +### Task 7: End-To-End Hardening + +**Files:** +- Modify: `ergon-dashboard/tests/helpers/backendHarnessClient.ts` +- Modify: `ergon-dashboard/tests/e2e/_shared/smoke.ts` +- Modify: `tests/e2e/_asserts.py` +- Modify: `docs/architecture/07_testing.md` + +- [ ] **Step 1: Expand backend harness TypeScript DTO** + +In `backendHarnessClient.ts`, add: + +```ts +export interface BackendEvaluationRollup { + status: "passing" | "failing" | "errored" | "skipped" | "mixed" | "none" | string; + total_criteria: number; + passed: number; + failed: number; + errored: number; + skipped: number; +} +``` + +Extend `BackendRunState`: + +```ts +rubric_status_summary: BackendEvaluationRollup; +evaluations: { + task_id: string; + task_slug: string | null; + score: number; + reason: string; + evaluator_name: string | null; + criterion_statuses: string[]; +}[]; +``` + +- [ ] **Step 2: Add backend E2E assertions** + +In `tests/e2e/_asserts.py`, assert happy runs expose: + +```python +assert len(root_evaluations) == 2 +assert {ev.parsed_summary().evaluator_name for ev in root_evaluations} >= {"default", "post-root"} +assert all( + cr.status == "passed" + for ev in root_evaluations + for cr in ev.parsed_summary().criterion_results +) +``` + +For sad runs, assert failed or skipped criterion state is exposed when a criterion does not pass. + +- [ ] **Step 3: Add UI assertions for each feature** + +In `smoke.ts`, assert: + +```ts +expect(state.rubric_status_summary.total_criteria).toBeGreaterThan(0); +await expect(page.getByTestId("graph-eval-lens-toggle")).toBeVisible(); +await expect(page.locator('[data-testid^="workspace-evaluation-criterion-"]').first()).toBeVisible(); +``` + +For happy runs: + +```ts +expect(state.rubric_status_summary.status).toBe("passing"); +``` + +For sad runs: + +```ts +expect(["failing", "errored", "mixed", "skipped"]).toContain(state.rubric_status_summary.status); +``` + +- [ ] **Step 4: Update testing docs** + +In `docs/architecture/07_testing.md`, add the frontend evaluation visibility surface to the E2E assertion table: + +```text +Evaluation visibility | Cohort pips, graph glyphs, container roll-ups, eval lens, workspace criterion cards | Playwright + backend harness DTO +``` + +- [ ] **Step 5: Run focused checks** + +Run: + +```bash +pytest tests/unit/runtime/test_evaluation_summary_contracts.py tests/unit/runtime/test_cohort_rubric_status_summary.py -q +cd ergon-dashboard && npm test -- features/evaluations/selectors.test.ts +``` + +Run the benchmark E2E smoke workflow locally for one benchmark if the stack is already available. + +Expected: unit and frontend tests pass; Playwright passes for the exercised benchmark. 
+ +- [ ] **Step 6: Commit** + +```bash +git add ergon-dashboard/tests/helpers/backendHarnessClient.ts ergon-dashboard/tests/e2e/_shared/smoke.ts tests/e2e/_asserts.py docs/architecture/07_testing.md +git commit -m "test: cover evaluation visibility e2e" +``` + +--- + +## Rollout Notes + +1. Backend changes are additive and can ship before frontend rendering. +2. Generated REST contracts must be refreshed after backend DTO changes and before frontend contract normalization. +3. Cohort roll-ups intentionally stay lightweight to avoid loading full run snapshots for every row. +4. The evaluation lens is local UI state; it should not change the URL in the first implementation. +5. If skipped criteria require semantics not available in `summary_json`, extend `CriterionExecutor` to emit explicit skipped results in a later follow-up rather than inferring skipped state from missing rows. + +## Verification Matrix + +- Backend unit: `pytest tests/unit/runtime/test_evaluation_summary_contracts.py -q` +- Backend unit: `pytest tests/unit/runtime/test_cohort_rubric_status_summary.py -q` +- Frontend unit: `cd ergon-dashboard && npm test -- features/evaluations/selectors.test.ts` +- E2E: run the existing canonical smoke command for at least one happy/sad cohort. +- Lints: use `ReadLints` for edited files after each frontend and backend slice. + +## Self-Review + +- Spec coverage: cohort pips are covered in Task 4; graph glyphs, container roll-ups, and eval lens are covered in Task 5; richer drawer metadata and criterion detail are covered in Task 6; backend schemas/endpoints are covered in Tasks 1 and 2; E2E coverage is covered in Task 7. +- Placeholder scan: the plan contains concrete fields, commands, file paths, test IDs, and code shapes. Follow-up notes are explicitly scoped to future semantics rather than missing implementation steps. +- Type consistency: `EvalCriterionStatus`, `EvalRollupStatus`, `CohortRubricStatusSummaryDto`, and frontend-only `EvaluationRollup` names are used consistently across backend, frontend contracts, selectors, and components. diff --git a/docs/superpowers/plans/2026-04-27-react-worker-context-capture.md b/docs/superpowers/plans/2026-04-27-react-worker-context-capture.md new file mode 100644 index 00000000..6890dc1b --- /dev/null +++ b/docs/superpowers/plans/2026-04-27-react-worker-context-capture.md @@ -0,0 +1,1116 @@ +# ReAct Worker Context Capture Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make real LLM ReAct workers persist the full model-context transcript, including thinking blocks and tool observations, into `run_context_events`. + +**Architecture:** Keep `RunContextEvent` as the canonical durable context log. Move PydanticAI-specific transcript parsing out of `ReActWorker` into `ergon_builtins.common.llm_context`, add a small capture-settings helper there for provider-specific thinking/logprob settings, and keep runtime persistence framework-neutral by consuming `GenerationTurn`. + +**Tech Stack:** Python, PydanticAI, SQLModel, `GenerationTurn`, `ContextEventRepository`, pytest. + +--- + +## Scope + +This plan covers backend capture only: + +- Turn on reasoning/thinking capture for real ReAct workers where PydanticAI/provider support exists. +- Extract PydanticAI message history into `GenerationTurn` via a reusable utility. 
+- Persist `GenerationTurn.tool_results` as `tool_result` rows. +- Add unit tests around transcript extraction, model settings, and context event persistence. + +This plan does not redesign the workspace Actions UI. After this lands, the existing Actions tab should automatically receive richer `contextEventsByTask` data for new runs. + +--- + +## File Map + +The extraction and capture-settings code belongs in `ergon_builtins`, not `ergon_core`, because it depends on concrete worker/framework behavior. `ergon_core` should keep only stable contracts and persistence: `GenerationTurn` in, `RunContextEvent` out. + +Within `ergon_builtins`, this should live in `common/llm_context/`: shared code for built-in LLM workers that is not specific to MiniF2F, SWE-Bench, ResearchRubrics, or any one benchmark. Keep this domain narrow: + +- `capture_settings.py` decides what provider settings to pass when we want transcript capture. +- `adapters/base.py` defines the common transcript adapter interface in both directions. +- `adapters/pydantic_ai.py` adapts PydanticAI message history into Ergon's framework-neutral `GenerationTurn`, reconstructs PydanticAI messages from `RunContextEvent` rows, and owns PydanticAI response-metadata parsing such as logprobs. +- Benchmark toolkits, prompts, sandbox code, and worker output policy stay where they are. + +If this refactor also consolidates model resolution out of core, keep that under `ergon_builtins.models`, not under `llm_context`. Model resolution is about selecting a concrete model backend; `llm_context` is about transcript capture/replay. + +```text +ergon_builtins/ + ergon_builtins/ + common/ # add: shared builtins utility package + __init__.py # add + llm/ + structured_judge.py # optional move: core structured_judge helper if moving model resolution + llm_context/ # add: shared LLM context-capture domain for built-in workers + __init__.py # add + capture_settings.py # add: provider-specific thinking/logprob model_settings + adapters/ # add: framework transcript adapters + __init__.py # add + base.py # add: TranscriptAdapter protocol/base interface + pydantic_ai.py # add: PydanticAI <-> GenerationTurn/RunContextEvent adapter + langgraph.py # do not add yet: reserved for future framework adapter + openai_sdk.py # do not add yet: reserved for future direct-SDK adapter + prompts.py # do not add: benchmark prompts stay under workers/benchmarks + tools.py # do not add: benchmark toolkits stay under tools/ or benchmarks/ + workers/ + baselines/ + react_worker.py # modify: call shared capture/extraction helpers + # remove: _build_turns + # remove: _to_turn + # remove: _extract_request_parts + # remove: _extract_response_parts + # remove: _extract_tool_results + # remove: _make_json_safe + # remove: transcript-only imports for dataclasses, + # PydanticAI request/response parts, + # Ergon part classes, extract_logprobs, + # and LOGPROB_SETTINGS + react_prompts.py # leave alone: benchmark/system prompt definitions + research_rubrics/ + researcher_worker.py # leave alone unless it later adopts PydanticAI transcript capture + workflow_cli_react_worker.py # leave alone unless it later adopts PydanticAI transcript capture + models/ + resolution.py # optional move: ResolvedModel/register/resolve from core + openrouter_backend.py # leave alone: model resolution backend already exists + vllm_backend.py # leave alone: model resolution backend already exists + cloud_passthrough.py # leave alone: passthrough backend behavior unchanged + tools/ # leave alone: tool definitions 
are not transcript extraction + +ergon_core/ + ergon_core/ + api/ + generation.py # existing contract: GenerationTurn stays framework-neutral + core/ + rl/ + __init__.py # modify: remove PydanticAI-specific LOGPROB_SETTINGS if unused + providers/ + generation/ + model_resolution.py # optional remove: move to ergon_builtins.models.resolution + structured_judge.py # optional remove: move to ergon_builtins.common.llm.structured_judge + capture_settings.py # do not add here + adapters/ # do not add framework adapters here + pydantic_ai_format.py # remove or stop using: behavior moves to PydanticAI adapter + persistence/ + context/ + repository.py # modify: persist tool_result events from turn.tool_results + models.py # existing table model: RunContextEvent + event_payloads.py # existing payload union: tool_result/thinking/etc. + assembly.py # remove: PydanticAI-specific resume assembly moves to adapter + +tests/ + unit/ + builtins/ + common/ + test_capture_settings.py # add: provider settings contract + test_transcript_adapters.py # add: base interface + PydanticAI adapter contract + providers/ + test_capture_settings.py # do not add here + test_transcript_adapters.py # do not add here + persistence/ + test_context_event_repository.py # add: tool_results -> tool_result rows + state/ + test_generation_turn_build.py # modify: import new transcript adapter + test_context_assembly.py # remove or move assertions into test_transcript_adapters.py + workers/ + test_react_worker_contract.py # modify: ReActWorker no longer owns parser helpers +``` + +Import direction: + +- `ergon_builtins.common.llm_context.*` may import `ergon_core.api.generation` and, if moved, `ergon_builtins.models.resolution.ResolvedModel`. +- `ergon_builtins.workers.baselines.react_worker` may import `ergon_builtins.common.llm_context.*`. +- `ergon_core` must not import `ergon_builtins`. + +Additional core consolidation in scope: + +- Move `ergon_core/ergon_core/core/persistence/context/assembly.py` into `PydanticAITranscriptAdapter` because it imports `pydantic_ai.messages` directly. +- Move `ergon_core/ergon_core/core/providers/generation/pydantic_ai_format.py` behavior into `adapters/pydantic_ai.py` or a private sibling under `ergon_builtins.common.llm_context.adapters`; it is only useful for PydanticAI response dumps. +- Move `LOGPROB_SETTINGS` out of `ergon_core.core.rl.__init__` if no RL code imports it after this refactor; it is currently a PydanticAI model-settings constant, not an RL-domain primitive. +- Optional but coherent: move `ergon_core/ergon_core/core/providers/generation/model_resolution.py` to `ergon_builtins/ergon_builtins/models/resolution.py`. It imports PydanticAI and is populated by builtins model backends. +- Optional but coherent: move `ergon_core/ergon_core/core/providers/generation/structured_judge.py` to `ergon_builtins/ergon_builtins/common/llm/structured_judge.py`. It constructs a PydanticAI `Agent` and is currently used by builtins evaluator/benchmark code. +- Do not move `ergon_core/api/generation.py`, `event_payloads.py`, `models.py`, or `repository.py`; those are the framework-neutral core domain. +- Do not move model backends (`openrouter_backend.py`, `vllm_backend.py`, `cloud_passthrough.py`) in this refactor; they already live in `ergon_builtins.models`. + +--- + +## Provider Settings Contract + +Use one settings helper instead of scattering provider checks through workers. 
+ +Expected behavior: + +- `vllm:*` keeps existing logprob settings: + +```python +{"openai_logprobs": True, "openai_top_logprobs": 1} +``` + +- `anthropic:*` asks Anthropic for thinking blocks: + +```python +{"anthropic_thinking": {"type": "enabled", "budget_tokens": 1024}} +``` + +- `openrouter:*` asks OpenRouter to include reasoning: + +```python +{"openrouter_reasoning": {"enabled": True, "exclude": False}} +``` + +- `google:*` asks Gemini to include thoughts: + +```python +{"gemini_thinking_config": {"include_thoughts": True}} +``` + +- Unknown providers return `None`; provider-specific capture behavior must be added explicitly with tests. + +If provider settings conflict with a model/output mode at runtime, the implementation should fail loudly in tests first. Do not silently suppress thinking capture unless a targeted fallback is added with a test. + +--- + +## Task 1: Add Capture Settings Helper + +**Files:** + +- Create: `ergon_builtins/ergon_builtins/common/__init__.py` +- Create: `ergon_builtins/ergon_builtins/common/llm_context/__init__.py` +- Create: `ergon_builtins/ergon_builtins/common/llm_context/capture_settings.py` +- Create: `ergon_builtins/ergon_builtins/common/llm_context/adapters/__init__.py` +- Create: `ergon_builtins/ergon_builtins/common/llm_context/adapters/base.py` +- Test: `tests/unit/builtins/common/test_capture_settings.py` + +- [ ] **Step 1: Write tests for provider-specific model settings** + +Create `tests/unit/builtins/common/test_capture_settings.py`: + +```python +from ergon_builtins.common.llm_context.capture_settings import build_capture_model_settings +from ergon_core.core.providers.generation.model_resolution import ResolvedModel + + +def _resolved(*, supports_logprobs: bool = False) -> ResolvedModel: + return ResolvedModel(model="dummy", supports_logprobs=supports_logprobs) + + +def test_vllm_enables_logprobs() -> None: + assert build_capture_model_settings("vllm:http://localhost:8000", _resolved(supports_logprobs=True)) == { + "openai_logprobs": True, + "openai_top_logprobs": 1, + } + + +def test_anthropic_enables_thinking() -> None: + assert build_capture_model_settings("anthropic:claude-sonnet-4", _resolved()) == { + "anthropic_thinking": {"type": "enabled", "budget_tokens": 1024}, + } + + +def test_openrouter_includes_reasoning() -> None: + assert build_capture_model_settings("openrouter:anthropic/claude-sonnet-4.6", _resolved()) == { + "openrouter_reasoning": {"enabled": True, "exclude": False}, + } + + +def test_google_includes_thoughts() -> None: + assert build_capture_model_settings("google:gemini-2.5-pro", _resolved()) == { + "gemini_thinking_config": {"include_thoughts": True}, + } + + +def test_unknown_provider_without_capture_returns_none() -> None: + assert build_capture_model_settings("openai:gpt-4o", _resolved()) is None +``` + +- [ ] **Step 2: Run the focused test and verify it fails** + +Run: + +```bash +pytest tests/unit/builtins/common/test_capture_settings.py -q +``` + +Expected: FAIL because `capture_settings.py` does not exist. 
+ +- [ ] **Step 3: Implement `capture_settings.py`** + +Create `ergon_builtins/ergon_builtins/common/__init__.py`: + +```python +"""Shared utilities for built-in Ergon workers.""" +``` + +Create `ergon_builtins/ergon_builtins/common/llm_context/__init__.py`: + +```python +"""Helpers for capturing LLM context from built-in worker frameworks.""" +``` + +Create `ergon_builtins/ergon_builtins/common/llm_context/adapters/__init__.py`: + +```python +"""Framework adapters for LLM transcript extraction and replay assembly.""" +``` + +Create `ergon_builtins/ergon_builtins/common/llm_context/adapters/base.py`: + +```python +"""Base interface for framework transcript adapters.""" + +from typing import Protocol, TypeVar + +from ergon_core.api.generation import GenerationTurn +from ergon_core.core.persistence.context.models import RunContextEvent + +TranscriptT = TypeVar("TranscriptT") +ReplayT = TypeVar("ReplayT") + + +class TranscriptAdapter(Protocol[TranscriptT, ReplayT]): + """Convert between framework-native transcripts and Ergon context events.""" + + def build_turns(self, transcript: TranscriptT) -> list[GenerationTurn]: + """Return ordered turns extracted from a complete transcript.""" + ... + + def assemble_replay(self, events: list[RunContextEvent]) -> ReplayT: + """Return framework-native replay context from ordered context events.""" + ... +``` + +Create `ergon_builtins/ergon_builtins/common/llm_context/capture_settings.py`: + +```python +"""Provider-specific settings for capturing model context events. + +Workers call this once before running an agent. The returned dictionary is +passed to PydanticAI as model_settings. +""" + +from ergon_core.api.json_types import JsonObject +from ergon_core.core.providers.generation.model_resolution import ResolvedModel +_ANTHROPIC_THINKING_BUDGET_TOKENS = 1024 +_OPENAI_COMPAT_LOGPROB_SETTINGS: JsonObject = { + "openai_logprobs": True, + "openai_top_logprobs": 1, +} + + +def _prefix(model_target: str | None) -> str: + target = model_target or "" + return target.split(":", 1)[0] if ":" in target else "" + + +def build_capture_model_settings( + model_target: str | None, + resolved_model: ResolvedModel, +) -> JsonObject | None: + """Return PydanticAI model_settings for transcript capture.""" + prefix = _prefix(model_target) + + if prefix == "vllm" and resolved_model.supports_logprobs: + return dict(_OPENAI_COMPAT_LOGPROB_SETTINGS) + + if prefix == "anthropic": + return { + "anthropic_thinking": { + "type": "enabled", + "budget_tokens": _ANTHROPIC_THINKING_BUDGET_TOKENS, + } + } + + if prefix == "openrouter": + return { + "openrouter_reasoning": { + "enabled": True, + "exclude": False, + } + } + + if prefix == "google": + return { + "gemini_thinking_config": { + "include_thoughts": True, + } + } + + return None +``` + +- [ ] **Step 4: Run the focused test and verify it passes** + +Run: + +```bash +pytest tests/unit/builtins/common/test_capture_settings.py -q +``` + +Expected: PASS. 
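+
+One case the tests above leave implicit: a `vllm:` target whose resolved model does not report logprob support falls through to `None`. If that behavior should be pinned, a small extra test (same file and `_resolved` helper as Step 1) would do it:
+
+```python
+def test_vllm_without_logprob_support_returns_none() -> None:
+    assert build_capture_model_settings("vllm:http://localhost:8000", _resolved()) is None
+```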
+ +--- + +## Task 2: Extract PydanticAI Transcript Conversion + +**Files:** + +- Create: `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py` +- Test: `tests/unit/builtins/common/test_transcript_adapters.py` +- Modify: `tests/unit/state/test_generation_turn_build.py` + +- [ ] **Step 1: Write tests for transcript extraction** + +Create `tests/unit/builtins/common/test_transcript_adapters.py`: + +```python +from ergon_core.api.generation import ( + GenerationTurn, + TextPart as ErgonTextPart, + ThinkingPart as ErgonThinkingPart, + ToolCallPart as ErgonToolCallPart, + ToolReturnPart as ErgonToolReturnPart, + UserPromptPart as ErgonUserPromptPart, +) +from ergon_builtins.common.llm_context.adapters.base import TranscriptAdapter +from ergon_builtins.common.llm_context.adapters.pydantic_ai import ( + PydanticAITranscriptAdapter, +) +from pydantic_ai.messages import ( + ModelRequest, + ModelResponse, + TextPart, + ThinkingPart, + ToolCallPart, + ToolReturnPart, + UserPromptPart, +) + + +def test_text_and_thinking_are_response_parts() -> None: + adapter: TranscriptAdapter[list[ModelRequest | ModelResponse], list[ModelRequest | ModelResponse]] = ( + PydanticAITranscriptAdapter() + ) + turns = adapter.build_turns( + [ + ModelRequest(parts=[UserPromptPart(content="hard question")]), + ModelResponse( + parts=[ + ThinkingPart(content="let me reason"), + TextPart(content="answer"), + ] + ), + ] + ) + + assert len(turns) == 1 + turn = turns[0] + assert isinstance(turn, GenerationTurn) + assert any(isinstance(part, ErgonUserPromptPart) for part in turn.messages_in) + assert any(isinstance(part, ErgonThinkingPart) for part in turn.response_parts) + assert any(isinstance(part, ErgonTextPart) for part in turn.response_parts) + + +def test_tool_return_is_attached_to_generating_turn() -> None: + adapter = PydanticAITranscriptAdapter() + turns = adapter.build_turns( + [ + ModelRequest(parts=[UserPromptPart(content="search")]), + ModelResponse( + parts=[ + ToolCallPart( + tool_name="search", + tool_call_id="call-1", + args={"query": "ergon"}, + ) + ] + ), + ModelRequest( + parts=[ + ToolReturnPart( + tool_name="search", + tool_call_id="call-1", + content={"result": "found"}, + ) + ] + ), + ModelResponse(parts=[TextPart(content="done")]), + ] + ) + + assert len(turns) == 2 + first = turns[0] + assert any(isinstance(part, ErgonToolCallPart) for part in first.response_parts) + assert len(first.tool_results) == 1 + result = first.tool_results[0] + assert isinstance(result, ErgonToolReturnPart) + assert result.tool_call_id == "call-1" + assert result.tool_name == "search" + assert result.content == '{"result": "found"}' +``` + +- [ ] **Step 2: Run the focused test and verify it fails** + +Run: + +```bash +pytest tests/unit/builtins/common/test_transcript_adapters.py -q +``` + +Expected: FAIL because `adapters/pydantic_ai.py` does not exist. 
+ +- [ ] **Step 3: Implement the transcript utility** + +Create `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py` by moving the existing parsing helpers out of `react_worker.py`: + +```python +"""PydanticAI transcript adapter.""" + +import dataclasses # slopcop: ignore[no-dataclass] +import json +from typing import Any + +from ergon_core.api.generation import ( + GenerationTurn, + SystemPromptPart, + TextPart, + ThinkingPart, + TokenLogprob, + ToolCallPart, + ToolReturnPart, + UserPromptPart, +) +from ergon_core.core.persistence.context.event_payloads import ( + AssistantTextPayload, + SystemPromptPayload, + ThinkingPayload, + ToolCallPayload, + ToolResultPayload, + UserMessagePayload, +) +from ergon_core.core.persistence.context.models import RunContextEvent +from ergon_builtins.common.llm_context.adapters.base import TranscriptAdapter +from pydantic_ai.messages import ModelMessage, ModelRequest, ModelResponse +from pydantic_ai.messages import SystemPromptPart as PydanticSystemPromptPart +from pydantic_ai.messages import TextPart as PydanticTextPart +from pydantic_ai.messages import ThinkingPart as PydanticThinkingPart +from pydantic_ai.messages import ToolCallPart as PydanticToolCallPart +from pydantic_ai.messages import ToolReturnPart as PydanticToolReturnPart +from pydantic_ai.messages import UserPromptPart as PydanticUserPromptPart + + +class PydanticAITranscriptAdapter(TranscriptAdapter[list[ModelMessage], list[ModelMessage]]): + """Convert complete PydanticAI message history into Ergon turns.""" + + def build_turns(self, transcript: list[ModelMessage]) -> list[GenerationTurn]: + """Build turns from a complete PydanticAI message list. + + The full message history is required because tool returns appear in the + request after the response that created the tool call. 
+        """
+        turns: list[GenerationTurn] = []
+        pending_response: ModelResponse | None = None
+        pending_request_in: ModelRequest | None = None
+
+        for message in transcript:
+            if isinstance(message, ModelRequest):
+                if pending_response is not None:
+                    turns.append(
+                        _to_turn(
+                            pending_request_in,
+                            pending_response,
+                            tool_result_request=message,
+                        )
+                    )
+                    pending_response = None
+                pending_request_in = message
+            elif isinstance(message, ModelResponse):
+                pending_response = message
+
+        if pending_response is not None:
+            turns.append(_to_turn(pending_request_in, pending_response, tool_result_request=None))
+
+        return turns
+
+    def assemble_replay(self, events: list[RunContextEvent]) -> list[ModelMessage]:
+        """Reconstruct PydanticAI messages from ordered context events."""
+        messages: list[ModelMessage] = []
+        current_request_parts: list[Any] = []
+        current_response_parts: list[Any] = []
+
+        for event in events:
+            if event.event_type in ("system_prompt", "user_message"):
+                if current_response_parts:
+                    # A new request-side event closes any open response.
+                    messages.append(ModelResponse(parts=current_response_parts))
+                    current_response_parts = []
+                current_request_parts.append(_to_pydantic_request_part(event))
+            elif event.event_type in ("thinking", "assistant_text", "tool_call"):
+                if current_request_parts:
+                    messages.append(ModelRequest(parts=current_request_parts))
+                    current_request_parts = []
+                current_response_parts.append(_to_pydantic_response_part(event))
+            elif event.event_type == "tool_result":
+                if current_response_parts:
+                    messages.append(ModelResponse(parts=current_response_parts))
+                    current_response_parts = []
+                current_request_parts.append(_to_pydantic_request_part(event))
+
+        if current_response_parts:
+            messages.append(ModelResponse(parts=current_response_parts))
+        if current_request_parts:
+            # Keep trailing request parts (for example tool results with no
+            # following response) so a resumed run can continue from them.
+            messages.append(ModelRequest(parts=current_request_parts))
+
+        return messages
+
+
+def _to_turn(
+    request_in: ModelRequest | None,
+    response: ModelResponse,
+    tool_result_request: ModelRequest | None,
+) -> GenerationTurn:
+    raw_resp = _make_json_safe(dataclasses.asdict(response))
+    return GenerationTurn(
+        messages_in=_extract_request_parts(request_in) if request_in else [],
+        response_parts=_extract_response_parts(response),
+        tool_results=_extract_tool_results(tool_result_request) if tool_result_request else [],
+        turn_logprobs=extract_logprobs(raw_resp),
+    )
+
+
+def extract_logprobs(raw: dict[str, Any]) -> list[TokenLogprob] | None:
+    """Extract per-token logprobs from a PydanticAI response dump."""
+    details = raw.get("provider_details")
+    if not isinstance(details, dict):
+        return None
+    raw_logprobs = details.get("logprobs")
+    if not isinstance(raw_logprobs, list) or not raw_logprobs:
+        return None
+    return [
+        TokenLogprob(
+            token=entry["token"],
+            logprob=entry["logprob"],
+            top_logprobs=entry.get("top_logprobs", []),
+        )
+        for entry in raw_logprobs
+        if isinstance(entry, dict) and "token" in entry and "logprob" in entry
+    ]
+
+
+def _to_pydantic_response_part(event: RunContextEvent) -> Any:  # slopcop: ignore[no-typing-any]
+    parsed = event.parsed_payload()
+    if event.event_type == "thinking":
+        if not isinstance(parsed, ThinkingPayload):
+            raise ValueError(f"Expected ThinkingPayload for thinking event, got {type(parsed)}")
+        return PydanticThinkingPart(content=parsed.text)
+    if event.event_type == "assistant_text":
+        if not isinstance(parsed, AssistantTextPayload):
+            raise ValueError(f"Expected AssistantTextPayload for assistant_text event, got {type(parsed)}")
+        return PydanticTextPart(content=parsed.text)
+    if event.event_type == "tool_call":
+        if not isinstance(parsed, ToolCallPayload):
+            raise ValueError(f"Expected ToolCallPayload for tool_call event, got {type(parsed)}")
+        return PydanticToolCallPart(
+            tool_name=parsed.tool_name,
+            tool_call_id=parsed.tool_call_id,
+            args=parsed.args,
+        )
+    raise ValueError(f"Unexpected response event_type: {event.event_type!r}")
+
+
+def _to_pydantic_request_part(event: RunContextEvent) -> Any:  # slopcop: ignore[no-typing-any]
+    parsed = event.parsed_payload()
+    if event.event_type == "system_prompt":
+        if not isinstance(parsed, SystemPromptPayload):
+            raise ValueError(f"Expected SystemPromptPayload for system_prompt event, got {type(parsed)}")
+        return PydanticSystemPromptPart(content=parsed.text)
+    if event.event_type == "user_message":
+        if not isinstance(parsed, UserMessagePayload):
+            raise ValueError(f"Expected UserMessagePayload for user_message event, got {type(parsed)}")
+        return PydanticUserPromptPart(content=parsed.text)
+    if event.event_type == "tool_result":
+        if not isinstance(parsed, ToolResultPayload):
+            raise ValueError(f"Expected ToolResultPayload for tool_result event, got {type(parsed)}")
+        return PydanticToolReturnPart(
+            tool_call_id=parsed.tool_call_id,
+            tool_name=parsed.tool_name,
+            content=str(parsed.result),
+        )
+    raise ValueError(f"Unexpected request event_type: {event.event_type!r}")
+
+
+def _extract_request_parts(request: ModelRequest) -> list[Any]:  # slopcop: ignore[no-typing-any]
+    parts: list[Any] = []  # slopcop: ignore[no-typing-any]
+    for part in request.parts:
+        if isinstance(part, PydanticSystemPromptPart):
+            parts.append(SystemPromptPart(content=part.content))
+        elif isinstance(part, PydanticUserPromptPart) and isinstance(part.content, str):
+            parts.append(UserPromptPart(content=part.content))
+    return parts
+
+
+def _extract_response_parts(response: ModelResponse) -> list[Any]:  # slopcop: ignore[no-typing-any]
+    parts: list[Any] = []  # slopcop: ignore[no-typing-any]
+    for part in response.parts:
+        if isinstance(part, PydanticTextPart):
+            parts.append(TextPart(content=part.content))
+        elif isinstance(part, PydanticToolCallPart):
+            parts.append(
+                ToolCallPart(
+                    tool_name=part.tool_name,
+                    tool_call_id=part.tool_call_id,
+                    args=part.args_as_dict(),
+                )
+            )
+        elif isinstance(part, PydanticThinkingPart):
+            parts.append(ThinkingPart(content=part.content))
+    return parts
+
+
+def _extract_tool_results(request: ModelRequest) -> list[ToolReturnPart]:
+    results: list[ToolReturnPart] = []
+    for part in request.parts:
+        if isinstance(part, PydanticToolReturnPart):
+            content = part.content
+            serialized = content if isinstance(content, str) else json.dumps(content, default=str)
+            results.append(
+                ToolReturnPart(
+                    tool_call_id=part.tool_call_id,
+                    tool_name=part.tool_name,
+                    content=serialized,
+                )
+            )
+    return results
+
+
+def _make_json_safe(obj: Any) -> Any:  # slopcop: ignore[no-typing-any]
+    from datetime import datetime
+
+    if isinstance(obj, dict):
+        return {k: _make_json_safe(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_make_json_safe(v) for v in obj]
+    if isinstance(obj, datetime):
+        return obj.isoformat()
+    if isinstance(obj, bytes):
+        return obj.decode("utf-8", errors="replace")
+    return obj
+```
+
+- [ ] **Step 4: Run the focused test and verify it passes**
+
+Run:
+
+```bash
+pytest tests/unit/builtins/common/test_transcript_adapters.py -q
+```
+
+Expected: PASS.
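+
+As a quick orientation for the shape `extract_logprobs` expects, the dictionary below is illustrative; it mirrors the parsing above, not an official PydanticAI contract:
+
+```python
+from ergon_builtins.common.llm_context.adapters.pydantic_ai import extract_logprobs
+
+raw = {
+    "provider_details": {
+        "logprobs": [
+            {"token": "Hello", "logprob": -0.01, "top_logprobs": []},
+            {"token": "!", "logprob": -0.5},  # top_logprobs falls back to []
+        ]
+    }
+}
+tokens = extract_logprobs(raw)
+assert tokens is not None and [t.token for t in tokens] == ["Hello", "!"]
+```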
+ +- [ ] **Step 5: Update old generation-turn tests to import the new utility** + +Modify `tests/unit/state/test_generation_turn_build.py`: + +```python +from ergon_builtins.common.llm_context.adapters.pydantic_ai import PydanticAITranscriptAdapter + + +def _build_turns(messages): + return PydanticAITranscriptAdapter().build_turns(messages) +``` + +Remove the old import from `ergon_builtins.workers.baselines.react_worker`. + +- [ ] **Step 6: Run the old and new transcript tests together** + +Run: + +```bash +pytest tests/unit/state/test_generation_turn_build.py tests/unit/builtins/common/test_transcript_adapters.py -q +``` + +Expected: PASS. + +--- + +## Task 3: Simplify `ReActWorker` + +**Files:** + +- Modify: `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py` +- Test: `tests/unit/workers/test_react_worker_contract.py` + +- [ ] **Step 1: Add a contract test that transcript helpers no longer live in `react_worker.py`** + +Modify `tests/unit/workers/test_react_worker_contract.py`: + +```python +def test_pydantic_ai_transcript_adapter_lives_outside_worker() -> None: + import ergon_builtins.workers.baselines.react_worker as react_worker + + assert not hasattr(react_worker, "_build_turns") + assert not hasattr(react_worker, "_extract_request_parts") + assert not hasattr(react_worker, "_extract_response_parts") + assert not hasattr(react_worker, "_extract_tool_results") +``` + +- [ ] **Step 2: Run the contract test and verify it fails** + +Run: + +```bash +pytest tests/unit/workers/test_react_worker_contract.py -q +``` + +Expected: FAIL because helper functions still exist in `react_worker.py`. + +- [ ] **Step 3: Update `ReActWorker` imports** + +In `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py`, remove imports that are only used by transcript parsing: + +```python +import dataclasses # remove +from ergon_core.api.generation import SystemPromptPart, TextPart, ThinkingPart, ToolCallPart, ToolReturnPart, UserPromptPart # remove +from ergon_core.api.json_types import JsonObject # remove if only used for model_settings type +from ergon_core.core.providers.generation.pydantic_ai_format import extract_logprobs # remove +from ergon_core.core.rl import LOGPROB_SETTINGS # remove +from pydantic_ai.messages import ModelRequest, ModelResponse # remove +from pydantic_ai.messages import SystemPromptPart as PydanticSystemPromptPart # remove +from pydantic_ai.messages import TextPart as PydanticTextPart # remove +from pydantic_ai.messages import ThinkingPart as PydanticThinkingPart # remove +from pydantic_ai.messages import ToolCallPart as PydanticToolCallPart # remove +from pydantic_ai.messages import ToolReturnPart as PydanticToolReturnPart # remove +from pydantic_ai.messages import UserPromptPart as PydanticUserPromptPart # remove +``` + +Add: + +```python +from ergon_builtins.common.llm_context.capture_settings import build_capture_model_settings +from ergon_builtins.common.llm_context.adapters.pydantic_ai import PydanticAITranscriptAdapter +``` + +- [ ] **Step 4: Update model settings and transcript extraction** + +Replace: + +```python +model_settings: JsonObject | None = None +if resolved.supports_logprobs and self.model and self.model.startswith("vllm:"): + model_settings = LOGPROB_SETTINGS +``` + +with: + +```python +model_settings = build_capture_model_settings(self.model, resolved) +``` + +Replace: + +```python +turns = _build_turns(run.ctx.state.message_history) +``` + +with: + +```python +turns = 
PydanticAITranscriptAdapter().build_turns(run.ctx.state.message_history) +``` + +- [ ] **Step 5: Delete transcript helper functions from `react_worker.py`** + +Delete the helper block that starts at: + +```python +# --------------------------------------------------------------------------- +# PydanticAI message → GenerationTurn +# --------------------------------------------------------------------------- +``` + +Keep `_format_task` and `_latest_final_result_message` in `react_worker.py` because they are worker behavior, not PydanticAI transcript parsing. + +- [ ] **Step 6: Run contract and worker tests** + +Run: + +```bash +pytest tests/unit/workers/test_react_worker_contract.py tests/unit/state/test_generation_turn_build.py tests/unit/builtins/common/test_capture_settings.py tests/unit/builtins/common/test_transcript_adapters.py -q +``` + +Expected: PASS. + +--- + +## Task 4: Persist `GenerationTurn.tool_results` + +**Files:** + +- Modify: `ergon_core/ergon_core/core/persistence/context/repository.py` +- Test: `tests/unit/persistence/test_context_event_repository.py` + +- [ ] **Step 1: Write a failing repository test** + +Create `tests/unit/persistence/test_context_event_repository.py`: + +```python +from uuid import UUID + +import pytest +from ergon_core.api.generation import GenerationTurn, ToolCallPart, ToolReturnPart, UserPromptPart +from ergon_core.core.persistence.context.repository import ContextEventRepository +from ergon_core.core.persistence.telemetry.models import RunRecord, RunTaskExecution +from ergon_core.core.persistence.shared.ids import new_id +from sqlmodel import Session + + +@pytest.mark.asyncio +async def test_persist_turn_records_tool_results_from_tool_results(session: Session) -> None: + run_id = new_id() + execution_id = new_id() + + session.add(RunRecord(id=run_id, experiment_id=UUID(int=1), name="test", status="running")) + session.add( + RunTaskExecution( + id=execution_id, + run_id=run_id, + definition_task_id=UUID(int=2), + node_id=UUID(int=3), + attempt_number=1, + status="running", + ) + ) + session.commit() + + repo = ContextEventRepository() + events = await repo.persist_turn( + session, + run_id=run_id, + execution_id=execution_id, + worker_binding_key="worker", + turn=GenerationTurn( + messages_in=[UserPromptPart(content="search")], + response_parts=[ + ToolCallPart(tool_name="search", tool_call_id="call-1", args={"query": "ergon"}) + ], + tool_results=[ + ToolReturnPart(tool_name="search", tool_call_id="call-1", content="found") + ], + ), + ) + + assert [event.event_type for event in events] == ["user_message", "tool_call", "tool_result"] + tool_result = events[-1].parsed_payload() + assert tool_result.event_type == "tool_result" + assert tool_result.tool_name == "search" + assert tool_result.tool_call_id == "call-1" + assert tool_result.result == "found" +``` + +If the project uses a differently named DB fixture than `session`, adapt only the fixture name and setup rows to the existing test harness. Keep the assertion shape unchanged. + +- [ ] **Step 2: Run the focused repository test and verify it fails** + +Run: + +```bash +pytest tests/unit/persistence/test_context_event_repository.py -q +``` + +Expected: FAIL because `_events_from_tool_results` currently scans `turn.messages_in`, not `turn.tool_results`. 
+
+- [ ] **Step 3: Update `_events_from_tool_results`**
+
+In `ergon_core/ergon_core/core/persistence/context/repository.py`, replace the loop source:
+
+```python
+for part in turn.messages_in:
+```
+
+with a helper that prefers `turn.tool_results` while still accepting legacy `ToolReturnPart`s carried in `turn.messages_in` by old or custom workers, deduplicated by `tool_call_id` so a part present in both lists is persisted once:
+
+```python
+seen_tool_call_ids = {part.tool_call_id for part in turn.tool_results}
+tool_result_parts = [
+    *turn.tool_results,
+    *(
+        part
+        for part in turn.messages_in
+        if isinstance(part, ToolReturnPart) and part.tool_call_id not in seen_tool_call_ids
+    ),
+]
+for part in tool_result_parts:
+```
+
+Update the docstring to:
+
+```python
+"""Produce tool_result events from GenerationTurn tool observations."""
+```
+
+- [ ] **Step 4: Run the focused repository test and verify it passes**
+
+Run:
+
+```bash
+pytest tests/unit/persistence/test_context_event_repository.py -q
+```
+
+Expected: PASS.
+
+---
+
+## Task 5: Add End-to-End Unit Coverage for ReAct Capture Shape
+
+**Files:**
+
+- Modify or create: `tests/unit/builtins/common/test_transcript_adapters.py`
+- Modify or create: `tests/unit/persistence/test_context_event_repository.py`
+
+- [ ] **Step 1: Add a combined transcript-to-event regression**
+
+Add a test that builds PydanticAI messages, converts them to `GenerationTurn`, persists the first turn, and asserts event types:
+
+```python
+@pytest.mark.asyncio
+async def test_pydantic_ai_tool_observation_becomes_context_event(session: Session) -> None:
+    from ergon_builtins.common.llm_context.adapters.pydantic_ai import PydanticAITranscriptAdapter
+
+    turns = PydanticAITranscriptAdapter().build_turns(
+        [
+            ModelRequest(parts=[UserPromptPart(content="search")]),
+            ModelResponse(
+                parts=[
+                    ToolCallPart(
+                        tool_name="search",
+                        tool_call_id="call-1",
+                        args={"query": "ergon"},
+                    )
+                ]
+            ),
+            ModelRequest(
+                parts=[
+                    ToolReturnPart(
+                        tool_name="search",
+                        tool_call_id="call-1",
+                        content="found",
+                    )
+                ]
+            ),
+        ]
+    )
+
+    events = await repo.persist_turn(
+        session,
+        run_id=run_id,
+        execution_id=execution_id,
+        worker_binding_key="worker",
+        turn=turns[0],
+    )
+
+    assert [event.event_type for event in events] == ["user_message", "tool_call", "tool_result"]
+```
+
+Use the same DB setup helper from Task 4. This test is intentionally redundant: it protects the integration boundary where the current bug occurred.
+
+- [ ] **Step 2: Add a thinking regression**
+
+Add a test with `ThinkingPart(content="let me think")` in the PydanticAI response, then persist the resulting turn and assert a `thinking` context event appears before `assistant_text`:
+
+```python
+assert [event.event_type for event in events] == ["user_message", "thinking", "assistant_text"]
+```
+
+- [ ] **Step 3: Run the combined tests**
+
+Run:
+
+```bash
+pytest tests/unit/builtins/common/test_transcript_adapters.py tests/unit/persistence/test_context_event_repository.py -q
+```
+
+Expected: PASS.
+
+---
+
+## Task 6: Verification Against a Real LLM Smoke Run
+
+**Files:**
+
+- No code changes required.
+- Optional inspection command only.
+
+- [ ] **Step 1: Run a small real LLM benchmark using a reasoning-capable model**
+
+Use the repo's existing real-LLM harness or CLI with a cheap one-task run. Prefer a model target already used by the repo, such as:
+
+```bash
+pytest tests/real_llm/benchmarks/test_researchrubrics.py -q
+```
+
+If the real-LLM test is intentionally skipped because credentials or budget are unavailable, record that skip in the implementation summary.
+ +- [ ] **Step 2: Inspect the run snapshot for richer context events** + +For a known run id, inspect event counts: + +```bash +RUN_ID= python - <<'PY' +import json, urllib.request +import os + +run_id = os.environ["RUN_ID"] +with urllib.request.urlopen(f"http://127.0.0.1:3002/api/runs/{run_id}", timeout=5) as r: + data = json.load(r) + +counts = {} +for events in (data.get("contextEventsByTask") or {}).values(): + for event in events: + counts[event.get("eventType")] = counts.get(event.get("eventType"), 0) + 1 + +print(counts) +PY +``` + +Expected for a tool-using run: `tool_result` count is non-zero. Expected for a provider/model that returns thinking: `thinking` count is non-zero. + +Do not fail the implementation if `thinking` is zero for a provider that does not return thoughts despite the request. Do fail if tool-using ReAct runs still have zero `tool_result` events. + +--- + +## Task 7: Final Test Pass + +**Files:** + +- No code changes unless tests reveal a regression. + +- [ ] **Step 1: Run focused backend tests** + +Run: + +```bash +pytest tests/unit/builtins/common/test_capture_settings.py tests/unit/builtins/common/test_transcript_adapters.py tests/unit/persistence/test_context_event_repository.py tests/unit/state/test_generation_turn_build.py tests/unit/workers/test_react_worker_contract.py -q +``` + +Expected: PASS. + +- [ ] **Step 2: Run lints for edited Python files** + +Run the repo's standard Python lint/type command if available. If the repo does not expose a single lint command, at minimum run: + +```bash +python -m compileall ergon_builtins/ergon_builtins/common ergon_builtins/ergon_builtins/workers/baselines ergon_core/ergon_core/core/persistence/context +``` + +Expected: PASS. + +- [ ] **Step 3: Record implementation notes** + +In the implementation summary, include: + +- Whether `tool_result` is now persisted from `GenerationTurn.tool_results`. +- Which provider settings were added for thinking/reasoning. +- Whether real-LLM verification produced `thinking` events or only verified `tool_result`. +- Any provider-specific caveat, especially Anthropic thinking plus structured output behavior. + +--- + +## Acceptance Criteria + +- ReAct worker no longer owns PydanticAI message parsing internals. +- PydanticAI transcript extraction is reusable by other PydanticAI-based workers. +- Real ReAct workers pass capture-oriented model settings when the provider supports thinking/reasoning/logprobs. +- `ContextEventRepository.persist_turn` writes `tool_result` rows from `GenerationTurn.tool_results`. +- A tool-using ReAct run can be inspected through `GET /api/runs/{run_id}` and shows non-zero `tool_result` events. +- Thinking blocks are persisted as `thinking` events when the provider returns PydanticAI `ThinkingPart` objects. + diff --git a/docs/superpowers/plans/2026-04-27-react-worker-failure-context-capture.md b/docs/superpowers/plans/2026-04-27-react-worker-failure-context-capture.md new file mode 100644 index 00000000..e730dc6f --- /dev/null +++ b/docs/superpowers/plans/2026-04-27-react-worker-failure-context-capture.md @@ -0,0 +1,650 @@ +# ReAct Worker Failure Context Capture Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Preserve partial PydanticAI ReAct transcript history when `agent.iter(...)` raises before `ReActWorker._run_agent()` reaches its normal post-run transcript extraction. + +**Architecture:** Keep runtime persistence ownership in `worker_execute_fn()`: workers yield `GenerationTurn`, runtime persists `RunContextEvent`. Add an incremental/cursor-based extraction API to `PydanticAITranscriptAdapter` so `ReActWorker` can yield completed turns during normal iteration and flush any remaining partial turn in an exception path before re-raising. This keeps failure semantics intact while eliminating the current zero-context failure gap for failed ReAct/CLI child workers. + +**Tech Stack:** Python, PydanticAI `Agent.iter`, `GenerationTurn`, `PydanticAITranscriptAdapter`, `ContextEventRepository`, pytest. + +--- + +## Root Cause + +Current `ReActWorker._run_agent()` only converts PydanticAI messages into `GenerationTurn`s after the `agent.iter(...)` context exits normally: + +```python +async with agent.iter(...) as run: + async for _node in run: + ... + +turns = PydanticAITranscriptAdapter().build_turns(run.ctx.state.message_history) +for turn in turns: + yield turn +``` + +If PydanticAI raises inside `async for _node in run`, control jumps out of `_run_agent()` before `build_turns(...)` runs. Then `worker_execute_fn()` catches the exception before it has received any turns to persist. That explains executions with an error stack but `0` `RunContextEvent` rows. + +The ResearchRubrics workflow CLI worker is affected because it subclasses `ReActWorker`: + +```python +async for turn in super().execute(task, context=context): + yield turn +``` + +Successful CLI runs use the shared adapter; failed CLI runs can still lose partial transcript history. + +--- + +## Desired Behavior + +- Successful ReAct runs keep capturing the same full transcript as today. +- Failed ReAct runs yield/persist every turn that can be reconstructed from `run.ctx.state.message_history` before re-raising the original exception. +- Runtime failure semantics do not change: `worker_execute_fn()` still returns the failure result and task status remains failed. +- Workers do not call `ContextEventRepository` directly. +- No duplicate context events are emitted when incremental extraction is called multiple times. +- Partial trailing responses can be flushed on final success or failure, but not emitted prematurely while a tool call may still receive a following `ToolReturnPart`. + +--- + +## File Map + +```text +ergon_builtins/ + ergon_builtins/ + common/ + llm_context/ + adapters/ + pydantic_ai.py # modify: replace post-run-only turn extraction with cursor API + workers/ + baselines/ + react_worker.py # modify: yield incremental turns and flush on exception + +tests/ + unit/ + builtins/ + common/ + test_transcript_adapters.py # modify: cursor extraction + trailing flush tests + workers/ + test_react_worker_contract.py # modify or add tests for failure transcript yield/re-raise +``` + +Do not modify `worker_execute_fn()` for this fix unless tests prove it cannot persist turns yielded immediately before an async generator raises. The existing `async for turn in worker.execute(...)` loop already persists each yielded turn before requesting the next one. + +--- + +## Closure And Removals + +This is not an additive second serialization path. 
Close the old behavior explicitly: + +- Remove `ReActWorker._run_agent()`'s post-run-only extraction pattern: + +```python +turns = PydanticAITranscriptAdapter().build_turns(run.ctx.state.message_history) +for turn in turns: + yield turn +``` + +Replace it with cursor extraction during the loop plus final/failure flush. + +- Do not add a new repository or direct DB writer for failure capture. `ContextEventRepository` remains the only `GenerationTurn` -> `RunContextEvent` serializer, and it remains called by `worker_execute_fn()`. +- Do not restore the old core PydanticAI serializers removed in the previous refactor: `ergon_core/core/persistence/context/assembly.py` and `ergon_core/core/providers/generation/pydantic_ai_format.py`. +- Do not add any new `ergon_core` PydanticAI transcript code. All PydanticAI transcript extraction/replay stays in `ergon_builtins.common.llm_context.adapters.pydantic_ai`. +- Treat the cursor API as the runtime extraction surface. If a batch `build_turns(...)` helper remains for tests or protocol compatibility, implement it as a wrapper around the same cursor extraction logic, not as a second independent serializer. +- Update tests that assert the worker no longer owns parser helpers so they also assert `ReActWorker` does not call a post-run-only extraction helper directly. + +There is no separate old "turn serialization repository" to delete after the previous refactor. The durable serialization repository is still `ContextEventRepository`, and that should stay. The old thing to remove here is the worker's post-run-only transcript extraction path, because it is the failure gap. + +--- + +## Design + +Use a small cursor object in the PydanticAI adapter: + +```python +from pydantic import BaseModel + + +class TranscriptTurnCursor(BaseModel): + model_config = {"validate_assignment": True} + + emitted_turn_count: int = 0 +``` + +Make cursor extraction the runtime API: + +```python +class PydanticAITranscriptAdapter(...): + def build_new_turns( + self, + transcript: list[ModelMessage], + cursor: TranscriptTurnCursor, + *, + flush_pending: bool = False, + ) -> list[GenerationTurn]: + turns = _build_turns_from_transcript(transcript, flush_pending=flush_pending) + new_turns = turns[cursor.emitted_turn_count :] + cursor.emitted_turn_count = len(turns) + return new_turns +``` + +If `build_turns(...)` remains public because `TranscriptAdapter` currently declares it, it should delegate to the same internal implementation used by `build_new_turns(...)`. Do not keep two independent conversion implementations. + +Change current trailing-response behavior in `build_turns()` so it is explicit: + +```python +if pending_response is not None and flush_pending: + turns.append(_to_turn(pending_request_in, pending_response, tool_result_request=None)) +``` + +`flush_pending=False` is important during the live `agent.iter(...)` loop. It prevents emitting a tool-call response before the following `ModelRequest` has a chance to include the `ToolReturnPart`. On final success or failure, use `flush_pending=True` so partial model output is not lost. + +Update `ReActWorker._run_agent()`: + +```python +adapter = PydanticAITranscriptAdapter() +cursor = TranscriptTurnCursor() +run = None + +try: + async with agent.iter(...) as active_run: + run = active_run + async for _node in run: + node_count += 1 + + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=False, + ): + yield turn + + if node_count >= self.max_iterations: + logger.warning(...) 
+ break +except Exception: + if run is not None: + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=True, + ): + yield turn + raise + +if run is not None: + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=True, + ): + yield turn +``` + +This is extraction-as-iterator in practice: the cursor marks what has already been yielded, and `build_new_turns(...)` can be called repeatedly as message history grows. + +Do not swallow exceptions. The final `raise` is required so `worker_execute_fn()` still records failure. + +--- + +## Task 1: Adapter Cursor API + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py` +- Modify: `tests/unit/builtins/common/test_transcript_adapters.py` + +- [ ] **Step 1: Write failing test for no premature trailing response** + +Add to `tests/unit/builtins/common/test_transcript_adapters.py`: + +```python +from ergon_builtins.common.llm_context.adapters.pydantic_ai import TranscriptTurnCursor + + +def test_incremental_extraction_does_not_emit_pending_tool_call_response() -> None: + adapter = PydanticAITranscriptAdapter() + cursor = TranscriptTurnCursor() + transcript = [ + ModelRequest(parts=[UserPromptPart(content="search")]), + ModelResponse( + parts=[ + ToolCallPart( + tool_name="search", + tool_call_id="call-1", + args={"query": "ergon"}, + ) + ] + ), + ] + + assert adapter.build_new_turns(transcript, cursor, flush_pending=False) == [] + + flushed = adapter.build_new_turns(transcript, cursor, flush_pending=True) + assert len(flushed) == 1 + assert any(isinstance(part, ErgonToolCallPart) for part in flushed[0].response_parts) +``` + +- [ ] **Step 2: Write failing test for no duplicate new turns** + +Add: + +```python +def test_incremental_extraction_tracks_emitted_turns() -> None: + adapter = PydanticAITranscriptAdapter() + cursor = TranscriptTurnCursor() + transcript = [ + ModelRequest(parts=[UserPromptPart(content="search")]), + ModelResponse( + parts=[ + ToolCallPart( + tool_name="search", + tool_call_id="call-1", + args={"query": "ergon"}, + ) + ] + ), + ModelRequest( + parts=[ + ToolReturnPart( + tool_name="search", + tool_call_id="call-1", + content={"result": "found"}, + ) + ] + ), + ] + + first = adapter.build_new_turns(transcript, cursor, flush_pending=False) + second = adapter.build_new_turns(transcript, cursor, flush_pending=False) + + assert len(first) == 1 + assert second == [] +``` + +- [ ] **Step 3: Run red tests** + +Run: + +```bash +uv run pytest tests/unit/builtins/common/test_transcript_adapters.py -q +``` + +Expected: FAIL because `TranscriptTurnCursor` and `build_new_turns()` do not exist. + +- [ ] **Step 4: Replace batch extraction internals with cursor-backed extraction** + +In `pydantic_ai.py`, add: + +```python +from pydantic import BaseModel + + +class TranscriptTurnCursor(BaseModel): + model_config = {"validate_assignment": True} + + emitted_turn_count: int = 0 +``` + +Move the existing `build_turns(...)` body into a private helper that takes `flush_pending`: + +```python +def _build_turns_from_transcript( + transcript: list[ModelMessage], + *, + flush_pending: bool, +) -> list[GenerationTurn]: + ... 
+``` + +Keep `build_turns(...)` only as compatibility with the existing `TranscriptAdapter` protocol and any batch tests: + +```python +def build_turns( + self, + transcript: list[ModelMessage], + *, + flush_pending: bool = True, +) -> list[GenerationTurn]: + return _build_turns_from_transcript(transcript, flush_pending=flush_pending) +``` + +Do not call `build_turns(...)` from `ReActWorker`. Runtime extraction should use the cursor API only. + +Change trailing append: + +```python +if pending_response is not None: + turns.append(_to_turn(pending_request_in, pending_response, tool_result_request=None)) +``` + +to: + +```python +if pending_response is not None and flush_pending: + turns.append(_to_turn(pending_request_in, pending_response, tool_result_request=None)) +``` + +Add: + +```python +def build_new_turns( + self, + transcript: list[ModelMessage], + cursor: TranscriptTurnCursor, + *, + flush_pending: bool = False, +) -> list[GenerationTurn]: + turns = _build_turns_from_transcript(transcript, flush_pending=flush_pending) + new_turns = turns[cursor.emitted_turn_count :] + cursor.emitted_turn_count = len(turns) + return new_turns +``` + +After this change, there is one conversion implementation: `_build_turns_from_transcript(...)`. `build_turns(...)` and `build_new_turns(...)` are wrappers with different calling semantics. + +- [ ] **Step 5: Run green tests** + +Run: + +```bash +uv run pytest tests/unit/builtins/common/test_transcript_adapters.py -q +``` + +Expected: PASS. + +--- + +## Task 2: ReActWorker Failure Flush + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py` +- Modify: `tests/unit/workers/test_react_worker_contract.py` + +- [ ] **Step 1: Write failing test for partial yield then re-raise** + +Add a fake `Agent` to `tests/unit/workers/test_react_worker_contract.py`: + +```python +from pydantic_ai.messages import ModelRequest, ModelResponse, TextPart, UserPromptPart + + +class _FakeRunState: + def __init__(self): + self.message_history = [ + ModelRequest(parts=[UserPromptPart(content="question")]), + ModelResponse(parts=[TextPart(content="partial answer")]), + ] + + +class _FakeRunContext: + def __init__(self): + self.state = _FakeRunState() + + +class _FailingAgentRun: + def __init__(self): + self.ctx = _FakeRunContext() + + def __aiter__(self): + return self + + async def __anext__(self): + raise RuntimeError("tool validation failed") + + +class _FailingAgentIter: + async def __aenter__(self): + return _FailingAgentRun() + + async def __aexit__(self, exc_type, exc, tb): + return False + + +class _FailingAgent: + def __init__(self, **kwargs): + pass + + def iter(self, *args, **kwargs): + return _FailingAgentIter() +``` + +Then add: + +```python +@pytest.mark.asyncio +async def test_react_worker_yields_partial_turn_before_reraising_agent_iter_failure(monkeypatch) -> None: + import ergon_builtins.workers.baselines.react_worker as react_worker + + monkeypatch.setattr(react_worker, "Agent", _FailingAgent) + monkeypatch.setattr( + react_worker, + "resolve_model_target", + lambda model: type( + "Resolved", + (), + {"model": "stub:constant", "capture_model_settings": None}, + )(), + ) + + worker = ReActWorker( + name="unit", + model=None, + task_id=UUID(int=1), + sandbox_id="test-sandbox", + tools=[], + system_prompt=None, + max_iterations=10, + ) + task = _minimal_task() + + turns = [] + with pytest.raises(RuntimeError, match="tool validation failed"): + async for turn in worker.execute(task, context=_minimal_context()): + turns.append(turn) + 
+ assert len(turns) == 1 + assert any(part.content == "partial answer" for part in turns[0].response_parts) +``` + +Add small local helpers if this test file does not already have task/context fixtures: + +```python +from ergon_core.api.task_types import BenchmarkTask, EmptyTaskPayload +from ergon_core.api.worker_context import WorkerContext + + +def _minimal_task() -> BenchmarkTask: + return BenchmarkTask( + task_id=UUID(int=2), + task_slug="unit-task", + description="Unit task", + task_payload=EmptyTaskPayload(), + ) + + +def _minimal_context() -> WorkerContext: + return WorkerContext( + run_id=UUID(int=3), + definition_id=UUID(int=4), + task_id=UUID(int=2), + execution_id=UUID(int=5), + sandbox_id="test-sandbox", + node_id=UUID(int=6), + ) +``` + +- [ ] **Step 2: Run red test** + +Run: + +```bash +uv run pytest tests/unit/workers/test_react_worker_contract.py::test_react_worker_yields_partial_turn_before_reraising_agent_iter_failure -q +``` + +Expected: FAIL because `_run_agent()` currently re-raises before yielding the partial transcript. + +- [ ] **Step 3: Implement failure flush in `_run_agent()`** + +Modify `ReActWorker._run_agent()`: + +```python +adapter = PydanticAITranscriptAdapter() +cursor = TranscriptTurnCursor() +run = None + +try: + async with agent.iter( + task_prompt, + model_settings=resolved.capture_model_settings, + message_history=self._seed_messages, + ) as active_run: + run = active_run + async for _node in run: + node_count += 1 + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=False, + ): + yield turn + if node_count >= self.max_iterations: + logger.warning(...) + break +except Exception: + if run is not None: + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=True, + ): + yield turn + raise + +if run is not None: + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=True, + ): + yield turn +``` + +Keep the existing warning text for `max_iterations`. + +- [ ] **Step 4: Run worker test** + +Run: + +```bash +uv run pytest tests/unit/workers/test_react_worker_contract.py -q +``` + +Expected: PASS. + +--- + +## Task 3: Runtime Persistence Regression + +**Files:** +- Modify: `tests/unit/runtime/test_failure_error_json.py` or add `tests/unit/runtime/test_worker_execute_partial_failure_context.py` + +- [ ] **Step 1: Add runtime-level regression if feasible** + +Add a unit test around `worker_execute_fn()` with a fake registered worker whose `execute()` yields one `GenerationTurn` and then raises. Assert that `ContextEventRepository.persist_turn()` is called before the failure result is returned. + +If existing `worker_execute_fn()` setup makes this too fixture-heavy, keep the worker-level test from Task 2 as the required regression and add a short comment in the test explaining why it is sufficient: + +```python +# worker_execute_fn persists each yielded turn before requesting the next item +# from the async generator, so this test covers the failure-capture contract at +# the worker boundary without rebuilding Inngest context fixtures. +``` + +- [ ] **Step 2: Run focused runtime/worker tests** + +Run: + +```bash +uv run pytest tests/unit/workers/test_react_worker_contract.py tests/unit/persistence/test_context_event_repository.py -q +``` + +Expected: PASS. + +--- + +## Task 4: Verification + +**Files:** +- No production edits. 
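+
+Before running the suite, one property is worth keeping in mind because Task 3 and the worker fix both rely on it: an `async for` consumer receives, and can persist, each yielded item before the generator resumes and raises. A self-contained illustration (plain asyncio, no Ergon imports):
+
+```python
+import asyncio
+
+
+async def failing_gen():
+    yield "turn-1"
+    raise RuntimeError("boom")
+
+
+async def main() -> None:
+    persisted = []
+    try:
+        async for turn in failing_gen():
+            persisted.append(turn)  # persistence happens before the next __anext__
+    except RuntimeError:
+        pass
+    assert persisted == ["turn-1"]
+
+
+asyncio.run(main())
+```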
+ +- [ ] **Step 1: Run affected capture suite** + +Run: + +```bash +uv run pytest \ + tests/unit/builtins/common/test_transcript_adapters.py \ + tests/unit/persistence/test_context_event_repository.py \ + tests/unit/workers/test_react_worker_contract.py \ + tests/unit/state/test_generation_turn_build.py \ + tests/unit/state/test_context_assembly.py \ + -q +``` + +Expected: PASS. + +- [ ] **Step 2: Run lint/compile** + +Run: + +```bash +uv run ruff check \ + ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py \ + ergon_builtins/ergon_builtins/workers/baselines/react_worker.py \ + tests/unit/builtins/common/test_transcript_adapters.py \ + tests/unit/workers/test_react_worker_contract.py +uv run slopcop \ + ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py \ + ergon_builtins/ergon_builtins/workers/baselines/react_worker.py +uv run python -m compileall -q \ + ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py \ + ergon_builtins/ergon_builtins/workers/baselines/react_worker.py +``` + +Expected: PASS. + +- [ ] **Step 3: Optional real-run validation** + +Trigger a ReAct/CLI worker failure after the PydanticAI run has started, then inspect: + +```bash +RUN_ID= python - <<'PY' +from uuid import UUID +from sqlmodel import select +from ergon_core.core.persistence.shared.db import get_session +from ergon_core.core.persistence.context.models import RunContextEvent + +run_id = UUID(__import__("os").environ["RUN_ID"]) +with get_session() as session: + rows = session.exec( + select(RunContextEvent) + .where(RunContextEvent.run_id == run_id) + .order_by(RunContextEvent.task_execution_id, RunContextEvent.sequence) + ).all() + for row in rows: + print(row.task_execution_id, row.sequence, row.event_type) +PY +``` + +Expected: the failed child execution has at least the partial model request/response/tool-call events that existed before the exception. + +--- + +## Self-Review + +- Spec coverage: The plan addresses the observed gap where `agent.iter(...)` raises before post-run extraction, including CLI workers through `ReActWorker` inheritance. +- Iterator question: The plan proposes cursor-based incremental extraction from growing `message_history`, which is the appropriate iterator shape for PydanticAI histories. +- Persistence boundary: The plan keeps `ContextEventRepository` in the runtime path and does not make workers write directly to the DB. +- Failure semantics: The original exception is re-raised after partial turns are yielded. +- Known limitation: If `agent.iter(...)` fails during `__aenter__` before a `run` object exists, there is no PydanticAI `message_history` to flush. That case should still produce normal task failure metadata, but cannot produce transcript events. diff --git a/docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md b/docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md new file mode 100644 index 00000000..c611f731 --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md @@ -0,0 +1,810 @@ +# Agent Tool Budget Harness Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Add a simple, reusable tool-budget harness that prevents agent rollouts from looping indefinitely by counting `workflow` tool calls separately from all other tool calls and returning explicit budget-exhausted messages when either limit is reached. + +**Architecture:** Use Pydantic AI dependency injection. `ReActWorker` passes an optional deps object into `Agent.iter(...)`; tools that participate in the budget accept `RunContext[AgentToolBudgetDeps]` and call `ctx.deps.tool_budget.check(...)` before doing work. The budget system is generic and benchmark-agnostic: it knows only `workflow` vs `other`, not ResearchRubrics, Exa, or rubric-specific concepts. Reference: [Pydantic AI dependencies](https://pydantic.dev/docs/ai/core-concepts/dependencies/). + +**Tech Stack:** Python 3.13, pydantic-ai `RunContext`, Ergon `ReActWorker`, existing tool callables, pytest smoke checks, real-LLM rollout artifacts, Logfire. + +--- + +## Design + +The harness should enforce two counters per agent execution: + +```python +workflow_tool_calls <= max_workflow_tool_calls +other_tool_calls <= max_other_tool_calls +``` + +Initial defaults: + +```python +AgentToolBudgetPolicy( + max_workflow_tool_calls=12, + max_other_tool_calls=12, + warning_at_remaining=3, +) +``` + +The budget does not decide which benchmark is running and does not know about Exa. It only sees: + +- `workflow` calls: the workflow CLI tool. +- `other` calls: context-gathering and workspace-inspection tools other than `workflow`. +- `finalization` calls: tools that produce final output artifacts, such as report writing. These count for observability but are not blocked, because the budget should push the agent into finalization rather than prevent it. + +When a limit is reached, the tool returns a normal structured tool result: + +```python +AgentToolBudgetExhaustedResult( + status="TOOL_BUDGET_EXHAUSTED", + reason="workflow tool budget reached", + message="Stop calling workflow. Use currently visible context/resources and produce the best possible final output.", + budget_state={...}, +) +``` + +or: + +```python +AgentToolBudgetExhaustedResult( + status="TOOL_BUDGET_EXHAUSTED", + reason="non-workflow tool budget reached", + message="Stop calling tools. Produce the final answer from the context already gathered.", + budget_state={...}, +) +``` + +This is intentionally not a Python exception. The model gets a final chance to converge. The outer `max_iterations` guard still raises a real error if the agent keeps looping after exhausted tool responses. + +## Package Placement + +- Generic budget state: `ergon_builtins/ergon_builtins/workers/baselines/tool_budget.py` +- Base agent execution hook: `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py` +- Budgeted workflow command tool: `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py` +- Budgeted non-workflow tools for this rollout: `ergon_builtins/ergon_builtins/tools/research_rubrics_toolkit.py` and `ergon_builtins/ergon_builtins/tools/graph_toolkit.py` +- Worker-specific budget policy wiring: `ergon_builtins/ergon_builtins/workers/research_rubrics/` +- Rollout diagnostics: `tests/real_llm/` + +## Added Files + +```text +ergon_builtins/ + ergon_builtins/ + workers/ + baselines/ + tool_budget.py +``` + +`tool_budget.py` owns the generic Pydantic models for budget policy, mutable per-execution budget state, deps passed into pydantic-ai, and helper logic for attaching warning text to tool results. 
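+
+For orientation, here is a minimal sketch of the pydantic-ai dependency wiring this package enables, using the Task 1 type names; the model string and the commented `iter` call are placeholders, and the exact `ReActWorker` hook is specified under the edit responsibilities below:
+
+```python
+from pydantic_ai import Agent
+
+from ergon_builtins.workers.baselines.tool_budget import (
+    AgentToolBudgetDeps,
+    AgentToolBudgetState,
+)
+
+# One mutable budget state per agent execution, injected via deps.
+deps = AgentToolBudgetDeps(tool_budget=AgentToolBudgetState())
+agent = Agent("openai:gpt-4o", deps_type=AgentToolBudgetDeps)
+
+# async with agent.iter("prompt", deps=deps) as run:
+#     ...  # budgeted tools read ctx.deps.tool_budget on each call
+```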
+ +## Edited Files + +```text +ergon_builtins/ + ergon_builtins/ + tools/ + graph_toolkit.py + research_rubrics_toolkit.py + workflow_cli_tool.py + workers/ + baselines/ + react_worker.py + research_rubrics/ + researcher_worker.py + workflow_cli_react_worker.py + +tests/ + real_llm/ + artifact_health.py + rollout.py +``` + +Edit responsibilities: + +- `react_worker.py`: add an optional deps hook, pass deps into `Agent.iter(...)`, and raise when `max_iterations` is hit. +- `workflow_cli_tool.py`: edit the existing workflow tool function path to support a ctx-taking budgeted mode for `workflow` calls. +- `research_rubrics_toolkit.py`: convert participating tools to ctx-taking functions and count context-gathering tools as `other`, while allowing report-writing as `finalization`. +- `graph_toolkit.py`: convert graph/resource tools to ctx-taking functions and count them as `other`. +- `researcher_worker.py`: provide generic budget deps to `ReActWorker` and steer the prompt toward quick convergence. +- `workflow_cli_react_worker.py`: provide generic budget deps, use budgeted workflow tool mode, and steer the prompt toward deliberate workflow use and subagent coordination. +- `artifact_health.py`: derive `workflow_tool_calls`, `other_tool_calls`, `budget_exhausted`, and `missing_final_report` from existing rollout artifacts. +- `rollout.py`: include those derived counters in `report.md`. + +## Deleted Files + +```text +(none) +``` + +## Optional Later Files + +If other benchmarks start showing the same loop behavior, apply the same `RunContext[AgentToolBudgetDeps]` pattern to their toolkits: + +```text +ergon_builtins/ + ergon_builtins/ + benchmarks/ + gdpeval/ + toolkit.py + minif2f/ + toolkit.py + swebench_verified/ + toolkit.py +``` + +--- + +## Task 1: Add Generic Tool Budget State + +**Files:** +- Create: `ergon_builtins/ergon_builtins/workers/baselines/tool_budget.py` + +- [ ] **Step 1: Create generic budget types** + +Create `tool_budget.py`: + +```python +from __future__ import annotations + +from typing import Any, Literal + +from pydantic import BaseModel, Field + +ToolBudgetKind = Literal["workflow", "other", "finalization"] +ToolBudgetExhaustedStatus = Literal["TOOL_BUDGET_EXHAUSTED"] + + +class AgentToolBudgetExhaustedResult(BaseModel): + status: ToolBudgetExhaustedStatus = "TOOL_BUDGET_EXHAUSTED" + reason: str + message: str + budget_state: dict[str, Any] # slopcop: ignore[no-typing-any] + + +class AgentToolBudgetPolicy(BaseModel): + model_config = {"frozen": True} + + max_workflow_tool_calls: int = 12 + max_other_tool_calls: int = 12 + warning_at_remaining: int = 3 + + +class AgentToolBudgetDecision(BaseModel): + model_config = {"frozen": True} + + allowed: bool + warning: str | None = None + exhausted: AgentToolBudgetExhaustedResult | None = None + + +class AgentToolBudgetState(BaseModel): + policy: AgentToolBudgetPolicy = Field(default_factory=AgentToolBudgetPolicy) + workflow_tool_calls: int = 0 + other_tool_calls: int = 0 + finalization_tool_calls: int = 0 + calls_by_tool: dict[str, int] = Field(default_factory=dict) + + def check(self, tool_name: str, kind: ToolBudgetKind) -> AgentToolBudgetDecision: + self.calls_by_tool[tool_name] = self.calls_by_tool.get(tool_name, 0) + 1 + + if kind == "workflow": + self.workflow_tool_calls += 1 + if self.workflow_tool_calls > self.policy.max_workflow_tool_calls: + return AgentToolBudgetDecision( + allowed=False, + exhausted=self.exhausted_result("workflow tool budget reached"), + ) + remaining = self.policy.max_workflow_tool_calls - 
self.workflow_tool_calls + elif kind == "finalization": + self.finalization_tool_calls += 1 + return AgentToolBudgetDecision(allowed=True) + else: + self.other_tool_calls += 1 + if self.other_tool_calls > self.policy.max_other_tool_calls: + return AgentToolBudgetDecision( + allowed=False, + exhausted=self.exhausted_result("non-workflow tool budget reached"), + ) + remaining = self.policy.max_other_tool_calls - self.other_tool_calls + + if remaining <= self.policy.warning_at_remaining: + return AgentToolBudgetDecision( + allowed=True, + warning=( + f"TOOL_BUDGET_WARNING: {remaining} {kind} tool calls remain. " + "Converge now using the context already gathered." + ), + ) + return AgentToolBudgetDecision(allowed=True) + + def snapshot(self) -> dict[str, Any]: # slopcop: ignore[no-typing-any] + return { + "workflow_tool_calls": self.workflow_tool_calls, + "max_workflow_tool_calls": self.policy.max_workflow_tool_calls, + "other_tool_calls": self.other_tool_calls, + "max_other_tool_calls": self.policy.max_other_tool_calls, + "finalization_tool_calls": self.finalization_tool_calls, + "calls_by_tool": dict(sorted(self.calls_by_tool.items())), + } + + def exhausted_result(self, reason: str) -> AgentToolBudgetExhaustedResult: + return AgentToolBudgetExhaustedResult( + reason=reason, + message=( + "Stop calling tools in this category. Use the context/resources already " + "available and produce the best possible final output. If the output is " + "incomplete, state what context or resource was missing." + ), + budget_state=self.snapshot(), + ) + + +class AgentToolBudgetDeps(BaseModel): + tool_budget: AgentToolBudgetState + + +def with_budget_warning(result: Any, warning: str | None) -> Any: # slopcop: ignore[no-typing-any] + if warning is None: + return result + if isinstance(result, str): + return f"{result}\n\n{warning}" + if isinstance(result, dict): + updated = dict(result) + updated["tool_budget_warning"] = warning + return updated + return result +``` + +- [ ] **Step 2: Run import smoke check** + +Run: + +```bash +uv run python - <<'PY' +from ergon_builtins.workers.baselines.tool_budget import ( + AgentToolBudgetDeps, + AgentToolBudgetPolicy, + AgentToolBudgetState, +) + +state = AgentToolBudgetState( + policy=AgentToolBudgetPolicy(max_workflow_tool_calls=1, max_other_tool_calls=2), +) +deps = AgentToolBudgetDeps(tool_budget=state) +print(deps.tool_budget.check("workflow", "workflow").allowed) +print(deps.tool_budget.check("workflow", "workflow").allowed) +print(deps.tool_budget.snapshot()) +PY +``` + +Expected: first line `True`, second line `False`, then a snapshot dictionary. 
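+
+A similar one-off check exercises the warning path and `with_budget_warning`; the tiny limits here are illustrative, not the shipped defaults:
+
+```python
+from ergon_builtins.workers.baselines.tool_budget import (
+    AgentToolBudgetPolicy,
+    AgentToolBudgetState,
+    with_budget_warning,
+)
+
+state = AgentToolBudgetState(
+    policy=AgentToolBudgetPolicy(max_other_tool_calls=2, warning_at_remaining=1),
+)
+first = state.check("search", "other")   # 1 call remaining -> allowed, with warning
+print(first.allowed, first.warning)
+print(with_budget_warning("raw result", first.warning))  # warning appended to str results
+state.check("search", "other")           # 0 remaining -> still allowed, with warning
+third = state.check("search", "other")   # over budget -> exhausted result, no exception
+print(third.allowed, third.exhausted is not None)
+```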
+ +--- + +## Task 2: Pass Deps Through ReActWorker + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py` + +- [ ] **Step 1: Add a deps hook** + +Add to `ReActWorker`: + +```python + def build_agent_deps(self, context: WorkerContext) -> Any | None: # slopcop: ignore[no-typing-any] + return None +``` + +- [ ] **Step 2: Pass context into `_run_agent`** + +Change: + +```python +async for turn in self._run_agent(task): +``` + +to: + +```python +async for turn in self._run_agent(task, context): +``` + +Change `_run_agent` signature: + +```python + async def _run_agent( + self, + task: BenchmarkTask, + context: WorkerContext, + ) -> AsyncGenerator[GenerationTurn, None]: +``` + +- [ ] **Step 3: Pass deps to pydantic-ai** + +Before `Agent(...)`: + +```python + agent_deps = self.build_agent_deps(context) + deps_type = type(agent_deps) if agent_deps is not None else None +``` + +Change the agent construction to include: + +```python + deps_type=deps_type, +``` + +Change `agent.iter(...)` to include: + +```python + deps=agent_deps, +``` + +- [ ] **Step 4: Make max-iteration exhaustion visible** + +Replace the current `break` on `max_iterations` with: + +```python + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=True, + ): + yield turn + raise RuntimeError( + f"ReActWorker exceeded max_iterations={self.max_iterations}" + ) +``` + +- [ ] **Step 5: Run existing focused tests** + +Run: + +```bash +uv run pytest tests/unit/workers/test_react_worker_contract.py tests/unit/builtins/common/test_transcript_adapters.py -q +``` + +Expected: PASS. + +--- + +## Task 3: Budget the Workflow Tool + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py` +- Existing test: `tests/unit/state/test_workflow_cli_tool.py` + +- [ ] **Step 1: Add ctx-aware mode** + +Import: + +```python +from pydantic_ai import RunContext +from ergon_builtins.workers.baselines.tool_budget import ( + AgentToolBudgetDeps, + AgentToolBudgetExhaustedResult, + with_budget_warning, +) +``` + +Add parameter to `make_workflow_cli_tool`: + +```python + budgeted: bool = False, +``` + +Edit the existing function body directly. Do not add a separate wrapper around workflow execution. Because pydantic-ai needs a clear callable signature, use two function definitions inside `make_workflow_cli_tool`: one ctx-taking definition for `budgeted=True`, and the existing no-ctx definition for `budgeted=False`. 
+ +```python + if budgeted: + async def workflow( + ctx: RunContext[AgentToolBudgetDeps], + command: str, + ) -> str | AgentToolBudgetExhaustedResult: + decision = ctx.deps.tool_budget.check("workflow", "workflow") + if not decision.allowed: + assert decision.exhausted is not None + return decision.exhausted + + if worker_context.node_id is None: + raise ValueError("workflow tool requires WorkerContext.node_id") + + output = await asyncio.to_thread( + execute_command, + command, + context=WorkflowCommandContext( + run_id=worker_context.run_id, + node_id=worker_context.node_id, + execution_id=worker_context.execution_id, + sandbox_task_key=sandbox_task_key, + benchmark_type=benchmark_type, + ), + session_factory=session_factory, + service=service_factory(), + ) + if output.exit_code != 0: + detail = output.stderr or output.stdout + result = f"workflow exited {output.exit_code}: {detail}".strip() + elif output.stderr: + result = f"{output.stdout}\n\nstderr:\n{output.stderr}".strip() + else: + result = output.stdout + return with_budget_warning(result, decision.warning) + + return workflow +``` + +Keep the existing no-ctx `workflow(command: str)` function as the `budgeted=False` branch: + +```python + async def workflow(command: str) -> str: + if worker_context.node_id is None: + raise ValueError("workflow tool requires WorkerContext.node_id") + + output = await asyncio.to_thread( + execute_command, + command, + context=WorkflowCommandContext( + run_id=worker_context.run_id, + node_id=worker_context.node_id, + execution_id=worker_context.execution_id, + sandbox_task_key=sandbox_task_key, + benchmark_type=benchmark_type, + ), + session_factory=session_factory, + service=service_factory(), + ) + if output.exit_code != 0: + detail = output.stderr or output.stdout + return f"workflow exited {output.exit_code}: {detail}".strip() + if output.stderr: + return f"{output.stdout}\n\nstderr:\n{output.stderr}".strip() + return output.stdout + + return workflow +``` + +- [ ] **Step 2: Preserve existing behavior** + +Run: + +```bash +uv run pytest tests/unit/state/test_workflow_cli_tool.py -q +``` + +Expected: PASS. Existing tests use `budgeted=False`. + +--- + +## Task 4: Budget Other Tools Used by This Harness + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/tools/research_rubrics_toolkit.py` +- Modify: `ergon_builtins/ergon_builtins/tools/graph_toolkit.py` + +- [ ] **Step 1: Convert ResearchRubrics tools to ctx-taking functions** + +In `research_rubrics_toolkit.py`, import: + +```python +from pydantic_ai import RunContext +from ergon_builtins.workers.baselines.tool_budget import ( + AgentToolBudgetDeps, + AgentToolBudgetExhaustedResult, + with_budget_warning, +) +``` + +For each tool function, add `ctx` as the first arg: + +```python +ctx: RunContext[AgentToolBudgetDeps], +``` + +At the top of each context-gathering tool: + +```python +decision = ctx.deps.tool_budget.check("", "other") +if not decision.allowed: + assert decision.exhausted is not None + return decision.exhausted +``` + +For final-output tools such as `write_report_draft` and `edit_report_draft`, use: + +```python +decision = ctx.deps.tool_budget.check("", "finalization") +``` + +Do not block finalization tools after `other` is exhausted. The budget exists to force convergence into these tools. + +Use the actual function/tool name for each function so `calls_by_tool` remains useful in artifacts. 
+
+After the existing result `resp` is produced, cast through the tool's declared return union (shown here with `SearchResponse`, matching the annotation example below):
+
+```python
+return cast(SearchResponse | AgentToolBudgetExhaustedResult, with_budget_warning(resp, decision.warning))
+```
+
+For response types that are Pydantic models, returning `AgentToolBudgetExhaustedResult` on exhaustion is acceptable because the tool result is serialized back to the model. Keep type annotations broad enough, for example:
+
+```python
+) -> SearchResponse | AgentToolBudgetExhaustedResult:
+```
+
+Change each `Tool(..., takes_ctx=False)` to:
+
+```python
+Tool(function=..., takes_ctx=True)
+```
+
+- [ ] **Step 2: Convert graph/resource tools to ctx-taking functions**
+
+In `graph_toolkit.py`, apply the same pattern:
+
+```python
+decision = ctx.deps.tool_budget.check("list_child_resources", "other")
+if not decision.allowed:
+    assert decision.exhausted is not None
+    return decision.exhausted
+```
+
+Update all graph tools to `takes_ctx=True`.
+
+- [ ] **Step 3: Run import smoke checks**
+
+Run:
+
+```bash
+uv run python - <<'PY'
+from ergon_builtins.tools.research_rubrics_toolkit import ResearchRubricsToolkit
+from ergon_builtins.tools.graph_toolkit import ResearchGraphToolkit
+print(ResearchRubricsToolkit)
+print(ResearchGraphToolkit)
+PY
+```
+
+Expected: imports cleanly.
+
+---
+
+## Task 5: Wire Budget Deps Into Current ResearchRubrics Workers
+
+**Files:**
+- Modify: `ergon_builtins/ergon_builtins/workers/research_rubrics/researcher_worker.py`
+- Modify: `ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py`
+
+- [ ] **Step 1: Add policy imports**
+
+In both workers:
+
+```python
+from ergon_builtins.workers.baselines.tool_budget import (
+    AgentToolBudgetDeps,
+    AgentToolBudgetPolicy,
+    AgentToolBudgetState,
+)
+```
+
+- [ ] **Step 2: Add a shared policy**
+
+Use the same generic policy in both files:
+
+```python
+_TOOL_BUDGET_POLICY = AgentToolBudgetPolicy(
+    max_workflow_tool_calls=12,
+    max_other_tool_calls=12,
+    warning_at_remaining=3,
+)
+```
+
+- [ ] **Step 3: Create deps per execution**
+
+In each `execute(...)`, before calling `super().execute(...)` (Pydantic models take keyword arguments, so construct with the field names from Task 1):
+
+```python
+self._agent_deps = AgentToolBudgetDeps(
+    tool_budget=AgentToolBudgetState(policy=_TOOL_BUDGET_POLICY),
+)
+```
+
+Add method:
+
+```python
+def build_agent_deps(self, context: WorkerContext) -> AgentToolBudgetDeps:
+    return self._agent_deps
+```
+
+These worker instances are currently execution-scoped. If that changes later, move deps creation into a base-class execution context instead of storing on `self`.
+
+- [ ] **Step 4: Use budgeted workflow tool in manager**
+
+In `workflow_cli_react_worker.py`, change:
+
+```python
+workflow_tool = make_workflow_cli_tool(...)
+```
+
+to:
+
+```python
+workflow_tool = make_workflow_cli_tool(..., budgeted=True)
+```
+
+- [ ] **Step 5: Tighten prompts, but keep them generic**
+
+Researcher prompt:
+
+```text
+You have a limited non-workflow tool budget. Gather enough context, then stop using tools and write final_output/report.md. If any tool returns TOOL_BUDGET_WARNING or TOOL_BUDGET_EXHAUSTED, immediately produce the best possible final report from the context already gathered.
+```
+
+Manager prompt:
+
+```text
+For multi-step work, divide and conquer with focused subagents to manage context. Workflow calls are limited, so inspect deliberately, create focused children, avoid duplicate research, and converge after child resources are visible. If any tool returns TOOL_BUDGET_WARNING or TOOL_BUDGET_EXHAUSTED, stop polling/searching and produce the best possible final output from current context/resources.
+``` + +- [ ] **Step 6: Run focused worker import** + +Run: + +```bash +uv run python - <<'PY' +from ergon_builtins.workers.research_rubrics.researcher_worker import ResearchRubricsResearcherWorker +from ergon_builtins.workers.research_rubrics.workflow_cli_react_worker import ResearchRubricsWorkflowCliReActWorker +print(ResearchRubricsResearcherWorker.type_slug) +print(ResearchRubricsWorkflowCliReActWorker.type_slug) +PY +``` + +Expected: prints both type slugs. + +--- + +## Task 6: Add Lightweight Rollout Reporting + +**Files:** +- Modify: `tests/real_llm/artifact_health.py` +- Modify: `tests/real_llm/rollout.py` + +- [ ] **Step 1: Count budget signals from existing events** + +In `artifact_health.py`, derive: + +```python +workflow_tool_calls +other_tool_calls +budget_exhausted +missing_final_report +``` + +Implementation rule: + +- If `tool_name == "workflow"`, increment `workflow_tool_calls`. +- Else if event type is `tool_call`, increment `other_tool_calls`. +- If any event payload has `status == "TOOL_BUDGET_EXHAUSTED"`, set `budget_exhausted=True`. +- If no resource path is `final_output/report.md`, set `missing_final_report=True`. + +- [ ] **Step 2: Show counters in rollout report** + +In `rollout.py`, add lines: + +```python +f"- workflow tool calls: {health.workflow_tool_calls}", +f"- other tool calls: {health.other_tool_calls}", +f"- budget exhausted: {health.budget_exhausted}", +f"- missing final report: {health.missing_final_report}", +``` + +- [ ] **Step 3: Run collection smoke** + +Run: + +```bash +uv run pytest tests/real_llm -q --collect-only +``` + +Expected: collection succeeds. + +--- + +## Task 7: Verify With One Real Sample + +**Files:** +- No new source files. + +- [ ] **Step 1: Run focused checks** + +Run: + +```bash +uv run pytest \ + tests/unit/state/test_workflow_cli_tool.py \ + tests/unit/workers/test_react_worker_contract.py \ + tests/unit/builtins/common/test_transcript_adapters.py \ + -q +``` + +Expected: PASS. 
+ +- [ ] **Step 2: Run lint on changed files** + +Run: + +```bash +uv run ruff check \ + ergon_builtins/ergon_builtins/workers/baselines/tool_budget.py \ + ergon_builtins/ergon_builtins/workers/baselines/react_worker.py \ + ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py \ + ergon_builtins/ergon_builtins/tools/research_rubrics_toolkit.py \ + ergon_builtins/ergon_builtins/tools/graph_toolkit.py \ + ergon_builtins/ergon_builtins/workers/research_rubrics/researcher_worker.py \ + ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py \ + tests/real_llm/artifact_health.py \ + tests/real_llm/rollout.py +``` + +Expected: `All checks passed!` + +- [ ] **Step 3: Rebuild and run one sample** + +Run: + +```bash +POSTGRES_PASSWORD=ergon_dev \ +TEST_HARNESS_SECRET=real-llm-secret \ +ENABLE_TEST_HARNESS=1 \ +ENABLE_SMOKE_FIXTURES=0 \ +ERGON_STARTUP_PLUGINS= \ +ERGON_LOGFIRE_PYDANTIC_AI=1 \ +ERGON_LOGFIRE_SERVICE_NAME=ergon-builtins \ +ERGON_LOGFIRE_ENVIRONMENT=real-llm \ +docker compose build api +``` + +Then: + +```bash +POSTGRES_PASSWORD=ergon_dev \ +TEST_HARNESS_SECRET=real-llm-secret \ +ENABLE_TEST_HARNESS=1 \ +ENABLE_SMOKE_FIXTURES=0 \ +ERGON_STARTUP_PLUGINS= \ +ERGON_LOGFIRE_PYDANTIC_AI=1 \ +ERGON_LOGFIRE_SERVICE_NAME=ergon-builtins \ +ERGON_LOGFIRE_ENVIRONMENT=real-llm \ +docker compose up -d --no-build --force-recreate --wait api +``` + +Then: + +```bash +ERGON_REAL_LLM=1 \ +ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react \ +ERGON_REAL_LLM_LIMIT=1 \ +ERGON_REAL_LLM_BUDGET_USD=5 \ +TEST_HARNESS_SECRET=real-llm-secret \ +ENABLE_TEST_HARNESS=1 \ +ENABLE_SMOKE_FIXTURES=0 \ +uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -q -s --assume-stack-up +``` + +Expected improvement: + +- no silent runaway loop. +- report shows `workflow tool calls <= 12`, or budget exhaustion is visible. +- report shows `other tool calls <= 12`, or budget exhaustion is visible. +- if the run fails, it fails with persisted transcript/error context that explains whether the budget was exhausted. + +--- + +## Notes + +- This is intentionally simpler than per-tool caps. No Exa-specific budget, no rubric-specific budget, no child-poll-specific budget. +- This still supports better prompt steering, but prompt steering is advisory. The two counters are enforcement. +- We should not add broad unit tests for every tool. Existing workflow tests, import smoke checks, lint, and the one-sample real rollout are enough for this change. +- Do not commit unless explicitly asked. diff --git a/docs/superpowers/plans/2026-04-28-context-part-chunk-stream.md b/docs/superpowers/plans/2026-04-28-context-part-chunk-stream.md new file mode 100644 index 00000000..d4f00e7a --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-context-part-chunk-stream.md @@ -0,0 +1,1359 @@ +# Context Part Chunk Stream Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace the parallel `GenerationTurn` and context-event payload model with one canonical context-part stream emitted by workers and enriched by core before persistence. + +**Architecture:** Define a single discriminated `ContextPart` union for things that appear in an LLM context/action stream: system prompts, user messages, assistant text, tool calls, tool results, and thinking. 
Workers yield `ContextPartChunk` values containing a `part` plus optional token metadata; core normalizes and enriches those chunks into persisted `RunContextEvent` rows with sequence, turn id, timestamps, worker key, and run/execution ids. Keep database rows flat enough for SQLModel/JSONB, but make API, dashboard, replay, and RL consumers use typed chunk/log schemas instead of duplicate payload unions. This is a clean-break migration: old `*Payload`, `GenerationTurn`, request/response part aliases, and old discriminator names must be gone by the final task. + +**Tech Stack:** Python 3.13, Pydantic v2 discriminated unions, SQLModel JSON columns, pytest, existing Ergon worker/runtime/persistence packages. + +--- + +## Source Of Truth + +The canonical worker-facing stream type should live in `ergon_core.core.generation` or a renamed module such as `ergon_core.core.context_stream`. To avoid a large import churn in the first slice, start in `ergon_core.core.generation`. + +Use these names: + +```python +ContextPart +ContextPartChunk +ContextPartChunkLog +WorkerYield +``` + +`ContextPart` is the only union for LLM context/action parts. + +`ContextPartChunk` is the de facto worker generator type. + +`ContextPartChunkLog` is the core-enriched durable event shape. It is not the database ORM model; it is the typed payload/envelope used when projecting a stored `RunContextEvent`. + +`RunContextEvent` remains the SQLModel row with JSON storage and relational ids. + +--- + +## Change Tree + +```text +ergon/ + ergon_core/ + ergon_core/ + core/ + generation.py # modify: canonical ContextPart/ContextPartChunk/ContextPartChunkLog + api/ + schemas.py # modify: typed REST context event payloads + runs.py # modify: project parsed chunk logs + dashboard/ + event_contracts.py # modify: dashboard context event payload uses chunk log + emitter.py # modify: emit parsed chunk logs + persistence/ + context/ + event_payloads.py # modify/delete duplicate payload union; no final old aliases + models.py # modify: validate JSON as ContextPartChunkLog + repository.py # modify: add persist_chunk enrichment; later delete persist_turn + rl/ + extraction.py # modify: consume chunk-log parts + runtime/ + services/ + task_execution_service.py # modify: persist worker chunks instead of turns + test_support/ + smoke_fixtures/ + smoke_base/ + leaf_base.py # modify: yield ContextPartChunk + recursive.py # modify: yield ContextPartChunk + worker_base.py # modify: yield ContextPartChunk + tests/ + unit/ + architecture/ + test_core_schema_sources.py # modify: guard single context part union + test_model_field_descriptions.py # modify: check chunk-log field descriptions + builtins/ + common/ + test_transcript_adapters.py # modify: assert chunk extraction/replay + dashboard/ + test_event_contract_types.py # modify: assert typed chunk-log dashboard payload + persistence/ + test_context_event_repository.py # modify: persist_chunk tests + state/ + test_context_part_stream.py # add: canonical part/chunk serialization tests + test_context_assembly.py # modify: replay from ContextPartChunkLog + test_generation_turn_build.py # modify/delete after GenerationTurn compatibility removal + workers/ + test_react_worker_contract.py # modify: worker yields chunks + ergon_builtins/ + ergon_builtins/ + common/ + llm_context/ + adapters/ + pydantic_ai.py # modify: build_chunks/build_new_chunks and replay chunk logs + workers/ + baselines/ + react_worker.py # modify: inspect ContextPartChunkLog.part + training_stub_worker.py # modify: yield 
ContextPartChunk + research_rubrics/ + researcher_worker.py # modify if still yielding GenerationTurn + workflow_cli_react_worker.py # modify if still yielding GenerationTurn +``` + +--- + +## File Structure + +**Modify:** +- `ergon_core/ergon_core/core/generation.py` — replace request/response-specific part model as the canonical context stream model while preserving temporary aliases during migration. +- `ergon_core/ergon_core/core/persistence/context/event_payloads.py` — replace the duplicate payload union with canonical context-event type exports only; do not keep old payload aliases in the final state. +- `ergon_core/ergon_core/core/persistence/context/models.py` — validate stored JSON as `ContextPartChunkLog` or the log payload shape. +- `ergon_core/ergon_core/core/persistence/context/repository.py` — replace `persist_turn()` decomposition with `persist_chunk()` enrichment; keep a temporary `persist_turn()` adapter if needed for staged migration. +- `ergon_core/ergon_core/core/api/schemas.py` — type REST context-event DTOs with `ContextPartChunkLog` instead of `dict[str, Any]`. +- `ergon_core/ergon_core/core/api/runs.py` — project stored context events through typed log validation. +- `ergon_core/ergon_core/core/dashboard/event_contracts.py` — use the same typed log schema as REST for context events. +- `ergon_core/ergon_core/core/dashboard/emitter.py` — emit typed enriched context logs. +- `ergon_core/ergon_core/core/rl/extraction.py` — read `event.part` instead of payload-specific classes. +- `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py` — convert PydanticAI messages into `ContextPartChunk` streams and replay logs back into PydanticAI messages. +- `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py` — consume the new typed context stream. +- `ergon_builtins/ergon_builtins/workers/baselines/training_stub_worker.py` — yield chunks instead of `GenerationTurn`. +- `ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/*.py` — yield chunks in smoke workers. + +**Tests:** +- `tests/unit/state/test_context_part_stream.py` — new focused tests for canonical union and chunk serialization. +- `tests/unit/persistence/test_context_event_repository.py` — rewrite around `persist_chunk()`. +- `tests/unit/builtins/common/test_transcript_adapters.py` — update PydanticAI adapter tests to assert chunk/log behavior. +- `tests/unit/state/test_context_assembly.py` — update replay tests around `ContextPartChunkLog`. +- `tests/unit/architecture/test_core_schema_sources.py` — add architecture guard against reintroducing duplicate context payload unions. +- Existing focused tests: `tests/unit/state/test_generation_turn_build.py`, `tests/unit/workers/test_react_worker_contract.py`, `tests/unit/dashboard/test_event_contract_types.py`, `tests/unit/architecture/test_model_field_descriptions.py`. 
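+
+Before the task breakdown, the intended data flow in one sketch (field names match the tasks below; the enrichment step is schematic, since core fills sequence and turn metadata during persistence):
+
+```python
+# Worker emits the minimal chunk; it knows nothing about runs or sequencing.
+chunk = ContextPartChunk(part=AssistantTextPart(content="answer"))
+
+# Core enriches the same part into the durable log shape before persistence.
+log = ContextPartChunkLog(
+    part=chunk.part,
+    token_ids=chunk.token_ids,
+    logprobs=chunk.logprobs,
+    sequence=0,
+    worker_binding_key="worker-a",
+    turn_id="turn-1",
+)
+
+# The SQLModel row stores the log as JSON; consumers re-validate it on read.
+payload = log.model_dump(mode="json")
+```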
+ +--- + +### Task 1: Introduce Canonical Context Parts + +**Files:** +- Modify: `ergon_core/ergon_core/core/generation.py` +- Create: `tests/unit/state/test_context_part_stream.py` + +- [ ] **Step 1: Write failing tests for the canonical part union** + +Create `tests/unit/state/test_context_part_stream.py` with: + +```python +from pydantic import TypeAdapter + +from ergon_core.core.generation import ( + AssistantTextPart, + ContextPart, + ContextPartChunk, + ContextPartChunkLog, + SystemPromptPart, + ThinkingPart, + TokenLogprob, + ToolCallPart, + ToolResultPart, + UserMessagePart, +) + + +def test_context_part_discriminates_all_part_kinds() -> None: + adapter = TypeAdapter(ContextPart) + + cases = [ + SystemPromptPart(content="sys"), + UserMessagePart(content="hi"), + AssistantTextPart(content="hello"), + ToolCallPart(tool_call_id="call-1", tool_name="search", args={"q": "x"}), + ToolResultPart(tool_call_id="call-1", tool_name="search", content="ok"), + ThinkingPart(content="reasoning"), + ] + + for part in cases: + dumped = part.model_dump(mode="json") + parsed = adapter.validate_python(dumped) + assert parsed == part + + +def test_context_part_chunk_wraps_part_with_optional_token_metadata() -> None: + chunk = ContextPartChunk( + part=AssistantTextPart(content="answer"), + token_ids=[1, 2], + logprobs=[TokenLogprob(token="answer", logprob=-0.1)], + ) + + dumped = chunk.model_dump(mode="json") + + assert dumped["part"]["part_kind"] == "assistant_text" + assert dumped["token_ids"] == [1, 2] + assert dumped["logprobs"][0]["token"] == "answer" + + +def test_context_part_chunk_log_adds_core_enrichment() -> None: + log = ContextPartChunkLog( + part=ThinkingPart(content="hmm"), + sequence=7, + worker_binding_key="researcher", + turn_id="turn-1", + token_ids=None, + logprobs=None, + ) + + dumped = log.model_dump(mode="json") + + assert dumped["part"]["part_kind"] == "thinking" + assert dumped["sequence"] == 7 + assert dumped["worker_binding_key"] == "researcher" + assert dumped["turn_id"] == "turn-1" +``` + +- [ ] **Step 2: Run the failing tests** + +Run: + +```bash +pytest tests/unit/state/test_context_part_stream.py -v +``` + +Expected: FAIL because `AssistantTextPart`, `UserMessagePart`, `ToolResultPart`, `ContextPartChunk`, and `ContextPartChunkLog` do not exist yet. + +- [ ] **Step 3: Implement canonical context stream types** + +Modify `ergon_core/ergon_core/core/generation.py` to define the canonical names. This task may keep request/response subset aliases only if needed to keep the next migration task small; those aliases must be deleted in Task 7 before the plan is complete. + +```python +"""Core model context-stream types. + +These types are used by worker APIs, transcript adapters, persistence, replay, +and RL extraction. Keep them in core so persistence can import them without +loading ``ergon_core.api``. 
+""" + +from datetime import datetime +from typing import Annotated, Any, Literal + +from ergon_core.core.json_types import JsonObject +from pydantic import BaseModel, Field + + +class TokenLogprob(BaseModel): + """Per-token log probability from the serving backend.""" + + model_config = {"frozen": True} + + token: str + logprob: float + top_logprobs: list[JsonObject] = Field(default_factory=list) + + +class SystemPromptPart(BaseModel): + model_config = {"frozen": True} + part_kind: Literal["system_prompt"] = "system_prompt" + content: str + + +class UserMessagePart(BaseModel): + model_config = {"frozen": True} + part_kind: Literal["user_message"] = "user_message" + content: str + + +class AssistantTextPart(BaseModel): + model_config = {"frozen": True} + part_kind: Literal["assistant_text"] = "assistant_text" + content: str + + +class ToolCallPart(BaseModel): + model_config = {"frozen": True} + part_kind: Literal["tool_call"] = "tool_call" + tool_name: str + tool_call_id: str + args: dict[str, Any] # slopcop: ignore[no-typing-any] + + +class ToolResultPart(BaseModel): + model_config = {"frozen": True} + part_kind: Literal["tool_result"] = "tool_result" + tool_call_id: str + tool_name: str + content: str + is_error: bool = False + + +class ThinkingPart(BaseModel): + model_config = {"frozen": True} + part_kind: Literal["thinking"] = "thinking" + content: str + + +ContextPart = Annotated[ + SystemPromptPart + | UserMessagePart + | AssistantTextPart + | ToolCallPart + | ToolResultPart + | ThinkingPart, + Field(discriminator="part_kind"), +] + + +class ContextPartChunk(BaseModel): + """One worker-emitted context/action stream item. + + Core adds run/execution/sequence/timing metadata before persistence. + """ + + model_config = {"frozen": True} + + part: ContextPart + token_ids: list[int] | None = None + logprobs: list[TokenLogprob] | None = None + + +class ContextPartChunkLog(ContextPartChunk): + """Core-enriched context stream item suitable for API/dashboard projection.""" + + sequence: int + worker_binding_key: str + turn_id: str | None = None + started_at: datetime | None = None + completed_at: datetime | None = None + policy_version: str | None = None + + +WorkerYield = ContextPartChunk + +# Temporary migration-only aliases. Task 7 must remove these before completion. +UserPromptPart = UserMessagePart +TextPart = AssistantTextPart +ToolReturnPart = ToolResultPart + +ModelRequestPart = Annotated[ + SystemPromptPart | UserMessagePart | ToolResultPart, + Field(discriminator="part_kind"), +] +ModelResponsePart = Annotated[ + AssistantTextPart | ToolCallPart | ThinkingPart, + Field(discriminator="part_kind"), +] + + +class GenerationTurn(BaseModel): + """Deprecated: use ContextPartChunk streams instead.""" + + model_config = {"frozen": True} + + messages_in: list[ModelRequestPart] = Field(default_factory=list) + response_parts: list[ModelResponsePart] = Field(default_factory=list) + tool_results: list[ToolResultPart] = Field(default_factory=list) + turn_token_ids: list[int] | None = None + turn_logprobs: list[TokenLogprob] | None = None + policy_version: str | None = None + started_at: datetime | None = None + completed_at: datetime | None = None +``` + +- [ ] **Step 4: Run the focused tests** + +Run: + +```bash +pytest tests/unit/state/test_context_part_stream.py -v +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Run generation-related tests to expose compatibility fallout** + +Run: + +```bash +pytest tests/unit/state/test_generation_turn_build.py tests/unit/builtins/common/test_transcript_adapters.py -v +``` + +Expected: likely FAIL because existing tests assert old discriminator values such as `tool-call` and old constructor names such as `ToolReturnPart`. + +--- + +### Task 2: Replace Payload Union With Enriched Chunk Log + +**Files:** +- Modify: `ergon_core/ergon_core/core/persistence/context/event_payloads.py` +- Modify: `ergon_core/ergon_core/core/persistence/context/models.py` +- Modify: `tests/unit/architecture/test_model_field_descriptions.py` + +- [ ] **Step 1: Write failing compatibility tests for typed log payload validation** + +Update or add tests that assert the context event row validates its JSON as `ContextPartChunkLog`: + +```python +from ergon_core.core.generation import AssistantTextPart, ContextPartChunkLog +from ergon_core.core.persistence.context.models import RunContextEvent + + +def test_run_context_event_parsed_payload_is_context_part_chunk_log() -> None: + log = ContextPartChunkLog( + part=AssistantTextPart(content="hello"), + sequence=3, + worker_binding_key="worker-a", + turn_id="turn-1", + ) + event = RunContextEvent( + run_id="00000000-0000-0000-0000-000000000001", + task_execution_id="00000000-0000-0000-0000-000000000002", + worker_binding_key="worker-a", + sequence=3, + event_type="assistant_text", + payload=log.model_dump(mode="json"), + ) + + parsed = event.parsed_payload() + + assert isinstance(parsed, ContextPartChunkLog) + assert parsed.part == AssistantTextPart(content="hello") +``` + +If UUID strings are not accepted by SQLModel in this test, use `uuid.UUID(...)` values instead. + +- [ ] **Step 2: Run the failing test** + +Run: + +```bash +pytest tests/unit/persistence/test_context_event_repository.py::test_run_context_event_parsed_payload_is_context_part_chunk_log -v +``` + +Expected: FAIL until `RunContextEvent.parsed_payload()` validates the new log shape. + +- [ ] **Step 3: Collapse `event_payloads.py` into canonical exports** + +Modify `ergon_core/ergon_core/core/persistence/context/event_payloads.py` so the canonical payload is `ContextPartChunkLog`. Do not define `SystemPromptPayload`, `UserMessagePayload`, `AssistantTextPayload`, `ToolCallPayload`, `ToolResultPayload`, or `ThinkingPayload`; callers must migrate to `ContextPartChunkLog.part` and the canonical part classes. + +```python +"""Typed context event payload exports. + +The canonical context payload is an enriched ContextPartChunkLog. Event-specific +payload classes were removed in favor of ContextPartChunkLog.part. +""" + +from typing import Literal + +from ergon_core.core.generation import ( + ContextPart, + ContextPartChunk, + ContextPartChunkLog, +) + +ContextEventType = Literal[ + "system_prompt", + "user_message", + "assistant_text", + "tool_call", + "tool_result", + "thinking", +] + +ContextEventPayload = ContextPartChunkLog +``` + +- [ ] **Step 4: Update `RunContextEvent` validation** + +Modify `ergon_core/ergon_core/core/persistence/context/models.py`: + +```python +from ergon_core.core.generation import ContextPartChunkLog +from pydantic import TypeAdapter + +_PAYLOAD_ADAPTER: TypeAdapter[ContextPartChunkLog] = TypeAdapter(ContextPartChunkLog) + + +class RunContextEvent(SQLModel, table=True): + ... 
+ + def parsed_payload(self) -> ContextPartChunkLog: + return _PAYLOAD_ADAPTER.validate_python(self.payload) +``` + +Keep `event_type: str` and `payload: dict[str, Any]` on the SQLModel row because the database stores JSON and indexes `event_type`. + +- [ ] **Step 5: Replace field-description architecture tests** + +Update `tests/unit/architecture/test_model_field_descriptions.py` to check descriptions on `ContextPartChunkLog` if the project requires descriptions for public fields. Do not keep tests against the old payload classes once they are aliases. + +- [ ] **Step 6: Run focused tests** + +Run: + +```bash +pytest tests/unit/persistence/test_context_event_repository.py tests/unit/architecture/test_model_field_descriptions.py -v +``` + +Expected: repository tests still fail until Task 3 replaces `persist_turn()` behavior. + +--- + +### Task 3: Persist Worker Chunks With Core Enrichment + +**Files:** +- Modify: `ergon_core/ergon_core/core/persistence/context/repository.py` +- Modify: `tests/unit/persistence/test_context_event_repository.py` + +- [ ] **Step 1: Write repository tests for `persist_chunk()`** + +Replace turn-oriented tests with chunk-oriented tests: + +```python +from uuid import uuid4 + +from ergon_core.core.generation import ( + AssistantTextPart, + ContextPartChunk, + ThinkingPart, + ToolCallPart, + ToolResultPart, + UserMessagePart, +) + + +async def test_persist_chunk_records_prompt_and_model_output_in_order(session): + repo = ContextEventRepository() + run_id = uuid4() + execution_id = uuid4() + + await repo.persist_chunk( + session, + run_id=run_id, + execution_id=execution_id, + worker_binding_key="worker-a", + chunk=ContextPartChunk(part=UserMessagePart(content="question")), + ) + await repo.persist_chunk( + session, + run_id=run_id, + execution_id=execution_id, + worker_binding_key="worker-a", + chunk=ContextPartChunk(part=ThinkingPart(content="think")), + ) + await repo.persist_chunk( + session, + run_id=run_id, + execution_id=execution_id, + worker_binding_key="worker-a", + chunk=ContextPartChunk(part=AssistantTextPart(content="answer")), + ) + + events = repo.get_for_execution(session, execution_id) + + assert [event.sequence for event in events] == [0, 1, 2] + assert [event.event_type for event in events] == [ + "user_message", + "thinking", + "assistant_text", + ] + assert events[1].parsed_payload().turn_id == events[2].parsed_payload().turn_id + + +async def test_persist_chunk_tool_result_closes_current_turn(session): + repo = ContextEventRepository() + run_id = uuid4() + execution_id = uuid4() + + await repo.persist_chunk( + session, + run_id=run_id, + execution_id=execution_id, + worker_binding_key="worker-a", + chunk=ContextPartChunk( + part=ToolCallPart(tool_call_id="call-1", tool_name="search", args={"q": "x"}) + ), + ) + await repo.persist_chunk( + session, + run_id=run_id, + execution_id=execution_id, + worker_binding_key="worker-a", + chunk=ContextPartChunk( + part=ToolResultPart(tool_call_id="call-1", tool_name="search", content="ok") + ), + ) + + events = repo.get_for_execution(session, execution_id) + + assert [event.event_type for event in events] == ["tool_call", "tool_result"] + assert events[0].parsed_payload().turn_id is not None + assert events[1].parsed_payload().turn_id is None +``` + +Adjust fixture names to match the existing `test_context_event_repository.py` session fixture. 
+ +- [ ] **Step 2: Run repository tests to verify failure** + +Run: + +```bash +pytest tests/unit/persistence/test_context_event_repository.py -v +``` + +Expected: FAIL because `persist_chunk()` does not exist. + +- [ ] **Step 3: Implement event type derivation** + +In `ergon_core/ergon_core/core/persistence/context/repository.py`, add: + +```python +from ergon_core.core.generation import ( + AssistantTextPart, + ContextPartChunk, + ContextPartChunkLog, + SystemPromptPart, + ThinkingPart, + ToolCallPart, + ToolResultPart, + UserMessagePart, +) + + +def _event_type_for_part(part: ContextPart) -> str: + return part.part_kind +``` + +If type checkers object to `ContextPart` as an `Annotated` alias in the helper signature, use the explicit union type or accept `object` and narrow via `isinstance`. + +- [ ] **Step 4: Implement turn-id state machine** + +Add private state to the repository: + +```python +def __init__(self) -> None: + self._listeners: list[Callable[[RunContextEvent], Awaitable[None]]] = [] + self._sequence_counters: dict[UUID, int] = {} + self._active_turn_ids: dict[UUID, str] = {} +``` + +Add helpers: + +```python +def _turn_id_for_chunk(self, execution_id: UUID, chunk: ContextPartChunk) -> str | None: + part = chunk.part + if isinstance(part, (AssistantTextPart, ThinkingPart, ToolCallPart)): + turn_id = self._active_turn_ids.get(execution_id) + if turn_id is None: + turn_id = str(uuid4()) + self._active_turn_ids[execution_id] = turn_id + return turn_id + if isinstance(part, ToolResultPart): + self._active_turn_ids.pop(execution_id, None) + return None + if isinstance(part, (SystemPromptPart, UserMessagePart)): + return None + return None +``` + +This deliberately associates `thinking`, `assistant_text`, and `tool_call` chunks emitted contiguously with the same model-output turn. A following `tool_result` closes the active turn. + +- [ ] **Step 5: Implement `persist_chunk()`** + +Add: + +```python +async def persist_chunk( + self, + session: Session, + *, + run_id: UUID, + execution_id: UUID, + worker_binding_key: str, + chunk: ContextPartChunk, +) -> RunContextEvent: + seq = self._next_sequence(execution_id) + turn_id = self._turn_id_for_chunk(execution_id, chunk) + event_type = chunk.part.part_kind + now = datetime.now(UTC) + payload = ContextPartChunkLog( + part=chunk.part, + token_ids=chunk.token_ids, + logprobs=chunk.logprobs, + sequence=seq, + worker_binding_key=worker_binding_key, + turn_id=turn_id, + started_at=now, + completed_at=now, + ) + event = self._make_event( + run_id, + execution_id, + worker_binding_key, + seq, + payload, + started_at=payload.started_at, + completed_at=payload.completed_at, + policy_version=payload.policy_version, + ) + self._sequence_counters[execution_id] = seq + 1 + + session.add(event) + session.commit() + + for listener in self._listeners: + try: + await listener(event) + except Exception: # slopcop: ignore[no-broad-except] + logger.warning("Context event listener failed", exc_info=True) + + return event +``` + +Update `_make_event()` to accept `payload: ContextPartChunkLog` and store `payload.model_dump(mode="json")`. 
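+
+For reference, a sketch of the two helpers `persist_chunk()` relies on. The row field names follow the tests earlier in this plan (`task_execution_id`, `event_type`, `payload`); adapt the keyword set to the real `_make_event()` signature:
+
+```python
+def _next_sequence(self, execution_id: UUID) -> int:
+    # persist_chunk() advances the counter after a successful insert.
+    return self._sequence_counters.get(execution_id, 0)
+
+
+def _make_event(
+    self,
+    run_id: UUID,
+    execution_id: UUID,
+    worker_binding_key: str,
+    sequence: int,
+    payload: ContextPartChunkLog,
+    *,
+    started_at: datetime | None,
+    completed_at: datetime | None,
+    policy_version: str | None,
+) -> RunContextEvent:
+    return RunContextEvent(
+        run_id=run_id,
+        task_execution_id=execution_id,
+        worker_binding_key=worker_binding_key,
+        sequence=sequence,
+        event_type=payload.part.part_kind,
+        payload=payload.model_dump(mode="json"),
+        started_at=started_at,
+        completed_at=completed_at,
+        # policy_version stays inside the payload; add a column only if queries need it.
+    )
+```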
+ +- [ ] **Step 6: Keep a temporary `persist_turn()` adapter** + +During migration only, keep `persist_turn()` by decomposing old `GenerationTurn` into chunks: + +```python +async def persist_turn(..., turn: GenerationTurn) -> list[RunContextEvent]: + events: list[RunContextEvent] = [] + for part in turn.messages_in: + events.append(await self.persist_chunk(..., chunk=ContextPartChunk(part=part))) + for part in turn.response_parts: + events.append( + await self.persist_chunk( + ..., + chunk=ContextPartChunk( + part=part, + token_ids=turn.turn_token_ids, + logprobs=turn.turn_logprobs, + ), + ) + ) + for part in turn.tool_results: + events.append(await self.persist_chunk(..., chunk=ContextPartChunk(part=part))) + return events +``` + +This keeps old workers running while the execution service migrates to chunks. + +- [ ] **Step 7: Run persistence tests** + +Run: + +```bash +pytest tests/unit/persistence/test_context_event_repository.py -v +``` + +Expected: PASS after updating any old assertions to inspect `event.parsed_payload().part`. + +--- + +### Task 4: Migrate PydanticAI Adapter To Chunk Streams + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py` +- Modify: `tests/unit/builtins/common/test_transcript_adapters.py` +- Modify: `tests/unit/state/test_generation_turn_build.py` +- Modify: `tests/unit/state/test_context_assembly.py` + +- [ ] **Step 1: Write adapter tests for chunk extraction** + +Update `tests/unit/builtins/common/test_transcript_adapters.py` so PydanticAI transcript extraction returns chunks: + +```python +def test_text_and_thinking_are_context_part_chunks() -> None: + adapter = PydanticAITranscriptAdapter() + + chunks = adapter.build_chunks( + [ + ModelRequest(parts=[UserPromptPart(content="hard question")]), + ModelResponse( + parts=[ + ThinkingPart(content="let me reason"), + TextPart(content="answer"), + ] + ), + ] + ) + + assert [chunk.part.part_kind for chunk in chunks] == [ + "user_message", + "thinking", + "assistant_text", + ] +``` + +Add a tool-call/tool-result test: + +```python +def test_tool_call_and_return_become_context_part_chunks() -> None: + adapter = PydanticAITranscriptAdapter() + + chunks = adapter.build_chunks( + [ + ModelRequest(parts=[UserPromptPart(content="search")]), + ModelResponse( + parts=[ + ToolCallPart( + tool_name="search", + tool_call_id="call-1", + args={"query": "ergon"}, + ) + ] + ), + ModelRequest( + parts=[ + ToolReturnPart( + tool_name="search", + tool_call_id="call-1", + content={"result": "found"}, + ) + ] + ), + ] + ) + + assert [chunk.part.part_kind for chunk in chunks] == [ + "user_message", + "tool_call", + "tool_result", + ] +``` + +- [ ] **Step 2: Run adapter tests to verify failure** + +Run: + +```bash +pytest tests/unit/builtins/common/test_transcript_adapters.py -v +``` + +Expected: FAIL because `build_chunks()` does not exist. 
+ +- [ ] **Step 3: Implement `build_chunks()` and `build_new_chunks()`** + +In `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py`, add methods parallel to the existing turn methods: + +```python +def build_chunks( + self, + transcript: list[ModelMessage], + *, + flush_pending: bool = True, +) -> list[ContextPartChunk]: + return _build_chunks_from_transcript(transcript, flush_pending=flush_pending) + + +def build_new_chunks( + self, + transcript: list[ModelMessage], + cursor: TranscriptTurnCursor, + *, + flush_pending: bool = False, +) -> list[ContextPartChunk]: + chunks = _build_chunks_from_transcript(transcript, flush_pending=flush_pending) + new_chunks = chunks[cursor.emitted_turn_count :] + cursor.emitted_turn_count = len(chunks) + return new_chunks +``` + +Rename `TranscriptTurnCursor.emitted_turn_count` to `emitted_chunk_count` only if the migration can update all callers in one task. Otherwise leave the field name temporarily and add a follow-up cleanup task. + +- [ ] **Step 4: Implement PydanticAI part conversion** + +Replace old `_extract_request_parts`, `_extract_response_parts`, and `_extract_tool_results` internals with chunk builders: + +```python +def _chunks_from_request(request: ModelRequest) -> list[ContextPartChunk]: + chunks: list[ContextPartChunk] = [] + for part in request.parts: + if isinstance(part, PydanticSystemPromptPart): + chunks.append(ContextPartChunk(part=SystemPromptPart(content=part.content))) + elif isinstance(part, PydanticUserPromptPart) and isinstance(part.content, str): + chunks.append(ContextPartChunk(part=UserMessagePart(content=part.content))) + elif isinstance(part, PydanticToolReturnPart): + chunks.append( + ContextPartChunk( + part=ToolResultPart( + tool_call_id=part.tool_call_id, + tool_name=part.tool_name, + content=_serialize_tool_content(part.content), + ) + ) + ) + return chunks + + +def _chunks_from_response(response: ModelResponse) -> list[ContextPartChunk]: + logprobs = extract_logprobs(response) + chunks: list[ContextPartChunk] = [] + for part in response.parts: + if isinstance(part, PydanticTextPart): + chunks.append( + ContextPartChunk(part=AssistantTextPart(content=part.content), logprobs=logprobs) + ) + logprobs = None + elif isinstance(part, PydanticToolCallPart): + chunks.append( + ContextPartChunk( + part=ToolCallPart( + tool_name=part.tool_name, + tool_call_id=part.tool_call_id, + args=part.args_as_dict(), + ), + logprobs=logprobs, + ) + ) + logprobs = None + elif isinstance(part, PydanticThinkingPart): + chunks.append( + ContextPartChunk(part=ThinkingPart(content=part.content), logprobs=logprobs) + ) + logprobs = None + return chunks +``` + +Only attach turn-level logprobs to the first model-output chunk. This matches the current persisted behavior where sibling events omit the shared token stream after the first model-output event. + +- [ ] **Step 5: Implement replay from chunk logs** + +Update `assemble_replay()` to consume `RunContextEvent.parsed_payload()` as `ContextPartChunkLog`, then switch on `log.part`. 
+ +```python +payload = event.parsed_payload() +part = payload.part +``` + +Map: +- `SystemPromptPart` -> `PydanticSystemPromptPart` +- `UserMessagePart` -> `PydanticUserPromptPart` +- `ToolResultPart` -> `PydanticToolReturnPart` +- `ThinkingPart` -> `PydanticThinkingPart` +- `AssistantTextPart` -> `PydanticTextPart` +- `ToolCallPart` -> `PydanticToolCallPart` + +- [ ] **Step 6: Keep old adapter methods as wrappers** + +Keep `build_turns()` and `build_new_turns()` temporarily by grouping chunks into a deprecated `GenerationTurn` only if old callers still exist at this point. Add comments marking them as migration-only. Task 7 must delete these wrappers; the final codebase must not expose the old turn API. + +- [ ] **Step 7: Run adapter and replay tests** + +Run: + +```bash +pytest tests/unit/builtins/common/test_transcript_adapters.py tests/unit/state/test_context_assembly.py tests/unit/state/test_generation_turn_build.py -v +``` + +Expected: PASS after old tests are rewritten or any migration-only wrappers are correct. These wrappers are not allowed to remain after Task 7. + +--- + +### Task 5: Migrate Worker Interface And Execution Persistence + +**Files:** +- Modify: `ergon_core/ergon_core/core/runtime/services/task_execution_service.py` +- Modify: `ergon_core/ergon_core/api/results.py` +- Modify: worker base API files that type `execute()` return values. +- Modify: `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py` +- Modify: `ergon_builtins/ergon_builtins/workers/baselines/training_stub_worker.py` +- Modify: smoke fixture workers under `ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/` +- Modify: `tests/unit/workers/test_react_worker_contract.py` +- Modify: `tests/unit/state/test_research_rubrics_workers.py` + +- [ ] **Step 1: Find all `AsyncGenerator[GenerationTurn` callers** + +Run: + +```bash +rg "AsyncGenerator\\[GenerationTurn|GenerationTurn" ergon_core ergon_builtins tests -n +``` + +Expected: a finite list including builtins workers, smoke fixtures, test support, and execution persistence. + +- [ ] **Step 2: Update worker API type hints** + +Replace worker `execute()` signatures from: + +```python +) -> AsyncGenerator[GenerationTurn, None]: +``` + +to: + +```python +) -> AsyncGenerator[ContextPartChunk, None]: +``` + +Import `ContextPartChunk` from `ergon_core.core.generation`. + +- [ ] **Step 3: Update task execution persistence loop** + +In `task_execution_service.py`, replace the turn persistence call: + +```python +async for turn in worker.execute(task, context=context): + await context_event_repository.persist_turn( + session, + run_id=run_id, + execution_id=execution.id, + worker_binding_key=worker_binding_key, + turn=turn, + ) +``` + +with: + +```python +async for chunk in worker.execute(task, context=context): + await context_event_repository.persist_chunk( + session, + run_id=run_id, + execution_id=execution.id, + worker_binding_key=worker_binding_key, + chunk=chunk, + ) +``` + +Keep exact local variable names consistent with the existing file. 
+ +- [ ] **Step 4: Update simple text-yielding workers** + +For smoke workers that currently yield: + +```python +yield GenerationTurn(response_parts=[TextPart(content="...")]) +``` + +replace with: + +```python +yield ContextPartChunk(part=AssistantTextPart(content="...")) +``` + +For user prompt setup chunks, emit: + +```python +yield ContextPartChunk(part=UserMessagePart(content="...")) +``` + +Only emit prompt chunks if the worker previously included them in `messages_in`; do not invent additional prompt events. + +- [ ] **Step 5: Update `training_stub_worker.py`** + +Replace synthetic `GenerationTurn` creation with chunk lists: + +```python +chunks: list[ContextPartChunk] = [] +chunks.append(ContextPartChunk(part=UserMessagePart(content=f"Task: Synthetic task {task_slug}"))) +chunks.append( + ContextPartChunk( + part=ToolCallPart( + tool_name="stub_tool", + tool_call_id=f"call_{i}", + args={"turn": i, "task": task_slug}, + ), + logprobs=logprobs, + ) +) +chunks.append( + ContextPartChunk( + part=ToolResultPart( + tool_call_id=f"call_{i}", + tool_name="stub_tool", + content=f"Tool result for turn {i} of {task_slug}", + ) + ) +) +``` + +For final assistant output: + +```python +ContextPartChunk( + part=AssistantTextPart(content=f"Synthetic response turn {i}"), + logprobs=logprobs, +) +``` + +- [ ] **Step 6: Update `react_worker.py`** + +Where the worker previously handled `GenerationTurn` outputs or inspected payload classes, switch to chunk/log parts: + +```python +payload = event.parsed_payload() +part = payload.part +if isinstance(part, AssistantTextPart): + ... +``` + +For final assistant message extraction, replace `AssistantTextPayload` checks with `AssistantTextPart`. + +- [ ] **Step 7: Run worker contract tests** + +Run: + +```bash +pytest tests/unit/workers/test_react_worker_contract.py tests/unit/state/test_research_rubrics_workers.py -v +``` + +Expected: PASS after signatures and assertions are migrated. + +--- + +### Task 6: Update REST, Dashboard, And RL Consumers + +**Files:** +- Modify: `ergon_core/ergon_core/core/api/schemas.py` +- Modify: `ergon_core/ergon_core/core/api/runs.py` +- Modify: `ergon_core/ergon_core/core/dashboard/event_contracts.py` +- Modify: `ergon_core/ergon_core/core/dashboard/emitter.py` +- Modify: `ergon_core/ergon_core/core/rl/extraction.py` +- Modify: dashboard generated contracts if this repo checks them in. +- Modify: `tests/unit/dashboard/test_event_contract_types.py` + +- [ ] **Step 1: Type REST context event DTOs with chunk logs** + +Modify `RunContextEventDto`: + +```python +from ergon_core.core.generation import ContextPartChunkLog +from ergon_core.core.persistence.context.event_payloads import ContextEventType + + +class RunContextEventDto(CamelModel): + id: str + task_execution_id: str + task_node_id: str + worker_binding_key: str + sequence: int + event_type: ContextEventType + payload: ContextPartChunkLog + created_at: str + started_at: str | None = None + completed_at: str | None = None +``` + +- [ ] **Step 2: Project typed payloads in REST snapshots** + +In `_context_events_by_task()`, change: + +```python +payload=event.payload, +``` + +to: + +```python +payload=event.parsed_payload(), +``` + +Keep `event_type=cast(ContextEventType, event.event_type)` if type checking requires it. + +- [ ] **Step 3: Type dashboard event contracts with the same payload** + +In `event_contracts.py`, ensure: + +```python +payload: ContextPartChunkLog +``` + +instead of the old `ContextEventPayload` union alias if that alias is still confusing. 
+ +- [ ] **Step 4: Update dashboard emitter payload validation** + +In `emitter.py`, validate as: + +```python +payload=event.parsed_payload() +``` + +instead of constructing a separate TypeAdapter in the emitter. + +- [ ] **Step 5: Update RL extraction** + +Change event handling from payload-class checks to part-class checks: + +```python +payload = event.parsed_payload() +part = payload.part + +if isinstance(part, (SystemPromptPart, UserMessagePart)): + ... +elif isinstance(part, (AssistantTextPart, ToolCallPart, ThinkingPart)): + token_ids = _get_token_ids(payload, tokenizer) +elif isinstance(part, ToolResultPart): + result_tokens = tokenizer.encode(str(part.content)) +``` + +Update `_get_token_ids()` to accept `ContextPartChunkLog` and inspect `payload.part`. + +- [ ] **Step 6: Run REST/dashboard/RL tests** + +Run: + +```bash +pytest tests/unit/dashboard/test_event_contract_types.py tests/unit/state/test_context_assembly.py tests/unit/persistence/test_context_event_repository.py -v +``` + +Expected: PASS after DTOs and consumers use `ContextPartChunkLog`. + +--- + +### Task 7: Add Architecture Guards And Remove Deprecated Turn API + +**Files:** +- Modify: `tests/unit/architecture/test_core_schema_sources.py` +- Modify: `ergon_core/ergon_core/core/generation.py` +- Modify: any remaining files found by `rg`. + +- [ ] **Step 1: Add architecture guard against duplicate context payload unions** + +Add to `tests/unit/architecture/test_core_schema_sources.py`: + +```python +from pathlib import Path + + +def test_context_stream_has_single_discriminated_part_union() -> None: + root = Path(__file__).resolve().parents[3] + generation = root / "ergon_core" / "ergon_core" / "core" / "generation.py" + event_payloads = ( + root + / "ergon_core" + / "ergon_core" + / "core" + / "persistence" + / "context" + / "event_payloads.py" + ) + + generation_text = generation.read_text() + event_payloads_text = event_payloads.read_text() + + assert "ContextPart = Annotated[" in generation_text + assert "SystemPromptPayload |" not in event_payloads_text + assert "AssistantTextPayload |" not in event_payloads_text + assert "ToolCallPayload |" not in event_payloads_text +``` + +- [ ] **Step 2: Run the architecture test** + +Run: + +```bash +pytest tests/unit/architecture/test_core_schema_sources.py -v +``` + +Expected: PASS only after `event_payloads.py` no longer owns a duplicate payload union. + +- [ ] **Step 3: Remove deprecated `GenerationTurn` compatibility** + +Run: + +```bash +rg "GenerationTurn|ModelRequestPart|ModelResponsePart|ToolReturnPart|TextPart|UserPromptPart" ergon_core ergon_builtins tests -n +``` + +Remove remaining old names where possible. Keep `TextPart` only when it refers to `pydantic_ai.messages.TextPart`, and alias it as `PydanticTextPart` in imports to avoid confusion. + +- [ ] **Step 4: Delete compatibility aliases** + +From `generation.py`, remove: + +```python +UserPromptPart = UserMessagePart +TextPart = AssistantTextPart +ToolReturnPart = ToolResultPart +ModelRequestPart = ... +ModelResponsePart = ... +class GenerationTurn(...) +``` + +Only do this once `rg` confirms no production caller depends on those names. + +- [ ] **Step 5: Verify no old payload classes or aliases exist in `event_payloads.py`** + +Run: + +```bash +rg "SystemPromptPayload|UserMessagePayload|AssistantTextPayload|ToolCallPayload|ToolResultPayload|ThinkingPayload" ergon_core ergon_builtins tests -n +``` + +Expected: no production matches. 
Test matches should be migrated to `ContextPartChunkLog` and canonical part classes. + +Confirm `event_payloads.py` does not define or export: + +```python +SystemPromptPayload +UserMessagePayload +AssistantTextPayload +ToolCallPayload +ToolResultPayload +ThinkingPayload +``` + +Keep: + +```python +ContextEventType +ContextEventPayload = ContextPartChunkLog +``` + +or rename `ContextEventPayload` to `ContextPartChunkLog` everywhere if the alias is no longer useful. + +- [ ] **Step 6: Run full focused suite** + +Run: + +```bash +pytest \ + tests/unit/state/test_context_part_stream.py \ + tests/unit/persistence/test_context_event_repository.py \ + tests/unit/builtins/common/test_transcript_adapters.py \ + tests/unit/state/test_context_assembly.py \ + tests/unit/workers/test_react_worker_contract.py \ + tests/unit/dashboard/test_event_contract_types.py \ + tests/unit/architecture/test_core_schema_sources.py \ + -v +``` + +Expected: PASS. + +- [ ] **Step 7: Run broader unit smoke** + +Run: + +```bash +pytest tests/unit -q +``` + +Expected: PASS, or only unrelated pre-existing failures. Investigate any failures mentioning context events, generation turns, workers, dashboard contracts, replay, or RL extraction. + +--- + +## Migration Notes + +This is a schema/API clean break. Do not preserve backwards compatibility with the old schemas in the final state. + +Temporary adapters are allowed only inside intermediate tasks to make the migration reviewable: +- `GenerationTurn` can exist only until worker execution is moved to chunks. +- request/response subset aliases can exist only until all worker and adapter callers move to `ContextPartChunk`. +- old `*Payload` event classes should not be reintroduced as aliases; migrate those callers directly to `ContextPartChunkLog.part`. + +After Task 7, the only canonical stream type should be `ContextPart`, the worker generator type should be `ContextPartChunk`, and the enriched log type should be `ContextPartChunkLog`. + +Do not add a second new union in `event_payloads.py`. Do not leave compatibility exports for the old payload classes. Either outcome recreates the drift this plan is removing. + +--- + +## Self-Review + +**Spec coverage:** The plan implements the requested model: `ContextPart` as the single discriminated union, `ContextPartChunk` as the worker generator type, and `ContextPartChunkLog` as the core-enriched persistence/API shape. + +**Placeholder scan:** No steps rely on `TBD`, unspecified tests, or unnamed files. Commands and expected outcomes are included for each task. + +**Type consistency:** The plan consistently uses `content` for text-bearing parts, `part_kind` for the part discriminator, `token_ids`/`logprobs` for worker-provided token metadata, and `sequence`/`worker_binding_key`/`turn_id` for core-enriched log metadata. + +--- + +## Execution Handoff + +Plan complete and saved to `docs/superpowers/plans/2026-04-28-context-part-chunk-stream.md`. Two execution options: + +**1. Subagent-Driven (recommended)** - dispatch a fresh subagent per task, review between tasks, fast iteration. + +**2. Inline Execution** - execute tasks in this session using executing-plans, batch execution with checkpoints. + +Which approach? 
diff --git a/docs/superpowers/plans/2026-04-28-core-hybrid-domain-layout-implementation.md b/docs/superpowers/plans/2026-04-28-core-hybrid-domain-layout-implementation.md new file mode 100644 index 00000000..a26fa51b --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-core-hybrid-domain-layout-implementation.md @@ -0,0 +1,1259 @@ +# Core Hybrid Domain Layout Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Move `ergon_core.core` to the approved hybrid layout: thin `rest_api`, product use cases under `application`, pure objects under `domain`, adapters under `infrastructure`, SQL rows under `persistence`, and `rl` kept as a separate bounded context. + +**Architecture:** This is a mechanical package migration with architecture guards. Each slice moves one cluster, bulk-renames imports, runs focused tests, and preserves behavior. A temporary exact-folder-structure test is added first and deleted at the end after durable architecture tests cover the important constraints. + +**Tech Stack:** Python, pytest, ruff, SQLModel, FastAPI, Inngest, Pydantic. + +**Commit Policy:** Do not create git commits unless the user explicitly asks. Treat each task's test run as the checkpoint. + +--- + +## Target Clusters + +The implementation follows `docs/superpowers/plans/2026-04-28-core-hybrid-domain-layout.md`. + +```text +core/ + rest_api/ + application/ + experiments/ + workflows/ + graph/ + tasks/ + evaluation/ + read_models/ + communication/ + context/ + jobs/ + resources/ + events/ + domain/ + experiments/ + generation/ + persistence/ + infrastructure/ + inngest/ + handlers/ + sandbox/ + dashboard/ + tracing/ + dependencies.py + rl/ + shared/ +``` + +## Task 1: Add Temporary Exact Layout Guard + +**Files:** +- Create: `tests/unit/architecture/test_core_hybrid_layout_temporary.py` +- Modify: none +- Test: `tests/unit/architecture/test_core_hybrid_layout_temporary.py` + +- [ ] **Step 1: Add the temporary failing test** + +Create `tests/unit/architecture/test_core_hybrid_layout_temporary.py`: + +```python +"""Temporary guard for the core hybrid layout migration. + +Delete this file in the final migration task. It intentionally asserts the +exact file tree so each migration slice has a visible end state. 
+""" + +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[3] +CORE = ROOT / "ergon_core/ergon_core/core" + +EXPECTED_FILES = { + "__init__.py", + "rest_api/__init__.py", + "rest_api/app.py", + "rest_api/cohorts.py", + "rest_api/experiments.py", + "rest_api/rollouts.py", + "rest_api/runs.py", + "rest_api/test_harness.py", + "application/__init__.py", + "application/experiments/__init__.py", + "application/experiments/service.py", + "application/experiments/models.py", + "application/experiments/repository.py", + "application/experiments/definition_writer.py", + "application/experiments/launch.py", + "application/workflows/__init__.py", + "application/workflows/service.py", + "application/workflows/orchestration.py", + "application/workflows/runs.py", + "application/workflows/models.py", + "application/workflows/errors.py", + "application/graph/__init__.py", + "application/graph/repository.py", + "application/graph/propagation.py", + "application/graph/traversal.py", + "application/graph/lookup.py", + "application/graph/models.py", + "application/graph/errors.py", + "application/tasks/__init__.py", + "application/tasks/service.py", + "application/tasks/execution.py", + "application/tasks/management.py", + "application/tasks/inspection.py", + "application/tasks/cleanup.py", + "application/tasks/repository.py", + "application/tasks/models.py", + "application/tasks/errors.py", + "application/evaluation/__init__.py", + "application/evaluation/service.py", + "application/evaluation/executors.py", + "application/evaluation/inngest_executor.py", + "application/evaluation/criterion_runtime.py", + "application/evaluation/scoring.py", + "application/evaluation/protocols.py", + "application/evaluation/models.py", + "application/evaluation/errors.py", + "application/read_models/__init__.py", + "application/read_models/runs.py", + "application/read_models/run_snapshot.py", + "application/read_models/experiments.py", + "application/read_models/cohorts.py", + "application/read_models/resources.py", + "application/read_models/models.py", + "application/read_models/errors.py", + "application/communication/__init__.py", + "application/communication/service.py", + "application/communication/models.py", + "application/communication/errors.py", + "application/context/__init__.py", + "application/context/events.py", + "application/context/output_extraction.py", + "application/jobs/__init__.py", + "application/jobs/cancel_orphan_subtasks.py", + "application/jobs/check_evaluators.py", + "application/jobs/cleanup_cancelled_task.py", + "application/jobs/complete_workflow.py", + "application/jobs/evaluate_task_run.py", + "application/jobs/execute_task.py", + "application/jobs/fail_workflow.py", + "application/jobs/persist_outputs.py", + "application/jobs/propagate_execution.py", + "application/jobs/run_cleanup.py", + "application/jobs/sandbox_setup.py", + "application/jobs/start_workflow.py", + "application/jobs/worker_execute.py", + "application/jobs/models.py", + "application/resources/__init__.py", + "application/resources/repository.py", + "application/resources/models.py", + "application/events/__init__.py", + "application/events/base.py", + "application/events/task_events.py", + "application/events/infrastructure_events.py", + "domain/__init__.py", + "domain/experiments/__init__.py", + "domain/experiments/experiment.py", + "domain/experiments/handles.py", + "domain/experiments/worker_spec.py", + "domain/experiments/validation.py", + "domain/generation/__init__.py", + 
"domain/generation/context_parts.py", + "persistence/shared/__init__.py", + "persistence/shared/db.py", + "persistence/shared/enums.py", + "persistence/shared/ids.py", + "persistence/shared/types.py", + "persistence/definitions/__init__.py", + "persistence/definitions/models.py", + "persistence/telemetry/__init__.py", + "persistence/telemetry/models.py", + "persistence/telemetry/repositories.py", + "persistence/telemetry/evaluation_summary.py", + "persistence/graph/__init__.py", + "persistence/graph/models.py", + "persistence/graph/status_conventions.py", + "persistence/context/__init__.py", + "persistence/context/models.py", + "persistence/context/event_payloads.py", + "persistence/saved_specs/__init__.py", + "persistence/saved_specs/models.py", + "infrastructure/__init__.py", + "infrastructure/inngest/__init__.py", + "infrastructure/inngest/client.py", + "infrastructure/inngest/registry.py", + "infrastructure/inngest/contracts.py", + "infrastructure/inngest/errors.py", + "infrastructure/inngest/handlers/__init__.py", + "infrastructure/inngest/handlers/cancel_orphan_subtasks.py", + "infrastructure/inngest/handlers/check_evaluators.py", + "infrastructure/inngest/handlers/cleanup_cancelled_task.py", + "infrastructure/inngest/handlers/complete_workflow.py", + "infrastructure/inngest/handlers/evaluate_task_run.py", + "infrastructure/inngest/handlers/execute_task.py", + "infrastructure/inngest/handlers/fail_workflow.py", + "infrastructure/inngest/handlers/persist_outputs.py", + "infrastructure/inngest/handlers/propagate_execution.py", + "infrastructure/inngest/handlers/run_cleanup.py", + "infrastructure/inngest/handlers/sandbox_setup.py", + "infrastructure/inngest/handlers/start_workflow.py", + "infrastructure/inngest/handlers/worker_execute.py", + "infrastructure/sandbox/__init__.py", + "infrastructure/sandbox/manager.py", + "infrastructure/sandbox/lifecycle.py", + "infrastructure/sandbox/resource_publisher.py", + "infrastructure/sandbox/instrumentation.py", + "infrastructure/sandbox/event_sink.py", + "infrastructure/sandbox/errors.py", + "infrastructure/sandbox/utils.py", + "infrastructure/dashboard/__init__.py", + "infrastructure/dashboard/emitter.py", + "infrastructure/dashboard/provider.py", + "infrastructure/dashboard/event_contracts.py", + "infrastructure/tracing/__init__.py", + "infrastructure/tracing/attributes.py", + "infrastructure/tracing/contexts.py", + "infrastructure/tracing/ids.py", + "infrastructure/tracing/noop.py", + "infrastructure/tracing/otel.py", + "infrastructure/tracing/sinks.py", + "infrastructure/tracing/types.py", + "infrastructure/dependencies.py", + "rl/__init__.py", + "rl/rollout_service.py", + "rl/eval_runner.py", + "rl/extraction.py", + "rl/rewards.py", + "rl/checkpoint.py", + "rl/rollout_types.py", + "rl/vllm_manager.py", + "shared/__init__.py", + "shared/json_types.py", + "shared/settings.py", + "shared/utils.py", +} + +REMOVED_DIRS = { + "api", + "definitions", + "composition", + "runtime", + "sandbox", + "dashboard", +} + +REMOVED_ROOT_FILES = { + "generation.py", + "json_types.py", + "settings.py", + "utils.py", +} + + +def test_core_has_exact_target_layout_during_migration() -> None: + actual_files = { + str(path.relative_to(CORE)) + for path in CORE.rglob("*.py") + if "__pycache__" not in path.parts + } + missing = sorted(EXPECTED_FILES - actual_files) + unexpected = sorted(actual_files - EXPECTED_FILES) + + assert missing == [] + assert unexpected == [] + + +def test_old_core_roots_are_removed_during_migration() -> None: + restored_dirs = sorted(name 
for name in REMOVED_DIRS if (CORE / name).exists()) + restored_files = sorted(name for name in REMOVED_ROOT_FILES if (CORE / name).exists()) + + assert restored_dirs == [] + assert restored_files == [] +``` + +- [ ] **Step 2: Run the temporary test and confirm it fails** + +Run: + +```bash +uv run pytest tests/unit/architecture/test_core_hybrid_layout_temporary.py -q +``` + +Expected: FAIL because the target directories do not exist yet and old roots still exist. + +## Task 2: Rename HTTP Layer To `core/rest_api` + +**Files:** +- Move: `ergon_core/ergon_core/core/api/*` -> `ergon_core/ergon_core/core/rest_api/*` +- Modify: imports in `ergon_core/ergon_core/core/rest_api/*.py` +- Modify: imports across `ergon_core`, `ergon_cli`, `ergon_builtins`, and `tests` +- Test: `tests/unit/architecture/test_public_api_boundaries.py` +- Test: `tests/unit/architecture/test_core_schema_sources.py` + +- [ ] **Step 1: Move the package** + +Move files: + +```bash +mkdir -p ergon_core/ergon_core/core/rest_api +mv ergon_core/ergon_core/core/api/*.py ergon_core/ergon_core/core/rest_api/ +rmdir ergon_core/ergon_core/core/api +``` + +- [ ] **Step 2: Bulk update imports** + +Replace every `ergon_core.core.api` import with `ergon_core.core.rest_api`. + +Run: + +```bash +python - <<'PY' +from pathlib import Path + +for root in [Path("ergon_core"), Path("ergon_cli"), Path("ergon_builtins"), Path("tests")]: + for path in root.rglob("*.py"): + text = path.read_text() + new = text.replace("ergon_core.core.api", "ergon_core.core.rest_api") + if new != text: + path.write_text(new) +PY +``` + +- [ ] **Step 3: Add a durable architecture guard** + +In `tests/unit/architecture/test_public_api_boundaries.py`, add: + +```python +def test_internal_http_api_is_named_rest_api_not_core_api() -> None: + core_root = ROOT / "ergon_core" / "ergon_core" / "core" + + assert not (core_root / "api").exists() + assert (core_root / "rest_api").exists() +``` + +- [ ] **Step 4: Run focused tests** + +Run: + +```bash +uv run pytest tests/unit/architecture/test_public_api_boundaries.py tests/unit/architecture/test_core_schema_sources.py -q +``` + +Expected: PASS for durable architecture tests. The temporary exact-layout test still fails until the full migration finishes. 
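+
+Optionally, a spot-check in the same heredoc style as Step 2 can confirm no stale references survived the rename. This is illustrative, not a required plan step; a plain containment check is sound here because `ergon_core.core.api` is not a substring of `ergon_core.core.rest_api`:
+
+```bash
+python - <<'PY'
+from pathlib import Path
+
+stale: list[str] = []
+for root in [Path("ergon_core"), Path("ergon_cli"), Path("ergon_builtins"), Path("tests")]:
+    for path in root.rglob("*.py"):
+        # Any remaining old-path reference means the bulk rename missed a file.
+        if "ergon_core.core.api" in path.read_text():
+            stale.append(str(path))
+
+print("\n".join(stale) or "no stale ergon_core.core.api references")
+PY
+```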
+ +## Task 3: Move Shared Primitives And Pure Domain Objects + +**Files:** +- Move: `core/json_types.py` -> `core/shared/json_types.py` +- Move: `core/settings.py` -> `core/shared/settings.py` +- Move: `core/utils.py` -> `core/shared/utils.py` +- Move: `core/generation.py` -> `core/domain/generation/context_parts.py` +- Move: `core/composition/*` -> `core/domain/experiments/*` +- Create: `core/shared/__init__.py` +- Create: `core/domain/__init__.py` +- Create: `core/domain/generation/__init__.py` +- Modify: imports across source and tests +- Test: `tests/unit/architecture/test_public_api_boundaries.py` +- Test: `tests/unit/architecture/test_core_schema_sources.py` + +- [ ] **Step 1: Move shared files** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/shared +mv ergon_core/ergon_core/core/json_types.py ergon_core/ergon_core/core/shared/json_types.py +mv ergon_core/ergon_core/core/settings.py ergon_core/ergon_core/core/shared/settings.py +mv ergon_core/ergon_core/core/utils.py ergon_core/ergon_core/core/shared/utils.py +touch ergon_core/ergon_core/core/shared/__init__.py +``` + +- [ ] **Step 2: Move generation primitives** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/domain/generation +mv ergon_core/ergon_core/core/generation.py ergon_core/ergon_core/core/domain/generation/context_parts.py +touch ergon_core/ergon_core/core/domain/__init__.py +touch ergon_core/ergon_core/core/domain/generation/__init__.py +``` + +- [ ] **Step 3: Move experiment composition domain** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/domain/experiments +mv ergon_core/ergon_core/core/composition/*.py ergon_core/ergon_core/core/domain/experiments/ +rmdir ergon_core/ergon_core/core/composition +``` + +- [ ] **Step 4: Bulk update imports** + +Run: + +```bash +python - <<'PY' +from pathlib import Path + +replacements = { + "ergon_core.core.json_types": "ergon_core.core.shared.json_types", + "ergon_core.core.settings": "ergon_core.core.shared.settings", + "ergon_core.core.utils": "ergon_core.core.shared.utils", + "ergon_core.core.generation": "ergon_core.core.domain.generation.context_parts", + "ergon_core.core.composition": "ergon_core.core.domain.experiments", +} + +for root in [Path("ergon_core"), Path("ergon_cli"), Path("ergon_builtins"), Path("tests")]: + for path in root.rglob("*.py"): + text = path.read_text() + new = text + for old, replacement in replacements.items(): + new = new.replace(old, replacement) + if new != text: + path.write_text(new) +PY +``` + +- [ ] **Step 5: Restore domain exports** + +Ensure `ergon_core/ergon_core/core/domain/experiments/__init__.py` exports the same names previously exported by `core/composition/__init__.py`: + +```python +from ergon_core.core.domain.experiments.experiment import Experiment +from ergon_core.core.domain.experiments.handles import DefinitionHandle +from ergon_core.core.domain.experiments.worker_spec import WorkerSpec + +__all__ = ["DefinitionHandle", "Experiment", "WorkerSpec"] +``` + +- [ ] **Step 6: Run focused tests** + +Run: + +```bash +uv run pytest tests/unit/architecture/test_public_api_boundaries.py tests/unit/architecture/test_core_schema_sources.py tests/unit/api/test_public_api_imports.py -q +``` + +Expected: PASS. 
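+
+To make the effect of the Step 4 rename concrete, here is a before/after view of a hypothetical call site; `ContextPartChunk` is the primitive named elsewhere in these plans, the caller itself is illustrative:
+
+```python
+# Before Task 3, a caller imports generation primitives from the root module:
+from ergon_core.core.generation import ContextPartChunk
+
+# After the bulk rename, the same name resolves against the moved module:
+from ergon_core.core.domain.generation.context_parts import ContextPartChunk
+```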
+ +## Task 4: Move Experiment Application Cluster + +**Files:** +- Move: `core/definitions/service.py` -> `core/application/experiments/service.py` +- Move: `core/definitions/schemas.py` -> `core/application/experiments/models.py` +- Move: `core/definitions/repository.py` -> `core/application/experiments/repository.py` +- Move: `core/definitions/persistence.py` -> `core/application/experiments/definition_writer.py` +- Move: `core/runtime/workflows/launch.py` -> `core/application/experiments/launch.py` +- Create: `core/application/__init__.py` +- Create: `core/application/experiments/__init__.py` +- Delete: `core/definitions/` +- Test: `tests/unit/runtime/test_experiment_definition_service.py` +- Test: `tests/unit/runtime/test_experiment_launch_service.py` +- Test: `tests/unit/cli/test_experiment_cli.py` + +- [ ] **Step 1: Move files** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/application/experiments +mv ergon_core/ergon_core/core/definitions/service.py ergon_core/ergon_core/core/application/experiments/service.py +mv ergon_core/ergon_core/core/definitions/schemas.py ergon_core/ergon_core/core/application/experiments/models.py +mv ergon_core/ergon_core/core/definitions/repository.py ergon_core/ergon_core/core/application/experiments/repository.py +mv ergon_core/ergon_core/core/definitions/persistence.py ergon_core/ergon_core/core/application/experiments/definition_writer.py +mv ergon_core/ergon_core/core/runtime/workflows/launch.py ergon_core/ergon_core/core/application/experiments/launch.py +touch ergon_core/ergon_core/core/application/__init__.py +touch ergon_core/ergon_core/core/application/experiments/__init__.py +rm ergon_core/ergon_core/core/definitions/__init__.py +rmdir ergon_core/ergon_core/core/definitions +``` + +- [ ] **Step 2: Bulk update imports** + +Run: + +```bash +python - <<'PY' +from pathlib import Path + +replacements = { + "ergon_core.core.definitions.service": "ergon_core.core.application.experiments.service", + "ergon_core.core.definitions.schemas": "ergon_core.core.application.experiments.models", + "ergon_core.core.definitions.repository": "ergon_core.core.application.experiments.repository", + "ergon_core.core.definitions.persistence": "ergon_core.core.application.experiments.definition_writer", + "ergon_core.core.runtime.workflows.launch": "ergon_core.core.application.experiments.launch", +} + +for root in [Path("ergon_core"), Path("ergon_cli"), Path("ergon_builtins"), Path("tests")]: + for path in root.rglob("*.py"): + text = path.read_text() + new = text + for old, replacement in replacements.items(): + new = new.replace(old, replacement) + if new != text: + path.write_text(new) +PY +``` + +- [ ] **Step 3: Ensure experiment package exports the front door** + +Set `ergon_core/ergon_core/core/application/experiments/__init__.py` to: + +```python +from ergon_core.core.application.experiments.service import ExperimentService + +__all__ = ["ExperimentService"] +``` + +- [ ] **Step 4: Run focused tests** + +Run: + +```bash +uv run pytest tests/unit/runtime/test_experiment_definition_service.py tests/unit/runtime/test_experiment_launch_service.py tests/unit/cli/test_experiment_cli.py -q +``` + +Expected: PASS. 
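+
+With the front door in place, new call sites can import from the package root rather than the submodule. A sketch, assuming `ExperimentService` is the class defined in the moved `service.py` per Step 3:
+
+```python
+# Package front door (preferred for new call sites):
+from ergon_core.core.application.experiments import ExperimentService
+
+# Equivalent deep import that the bulk rename produces for existing callers:
+from ergon_core.core.application.experiments.service import ExperimentService
+```
+
+Keeping the deep module path out of new call sites means later splits of `service.py` do not ripple through callers.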
+ +## Task 5: Move Workflow, Graph, Task, And Evaluation Application Clusters + +**Files:** +- Move: `core/runtime/workflows/{service,orchestration,runs,models,errors}.py` -> `core/application/workflows/` +- Move: `core/runtime/graph/{repository,propagation,traversal,lookup,dto,errors}.py` -> `core/application/graph/` +- Rename: `core/application/graph/dto.py` -> `core/application/graph/models.py` +- Move: `core/runtime/tasks/*` -> `core/application/tasks/` +- Rename: `core/application/tasks/management.py` remains `management.py` +- Create: `core/application/tasks/service.py` if needed as a package front door +- Move: `core/runtime/evaluation/*` -> `core/application/evaluation/` +- Modify: imports across source and tests +- Test: runtime workflow/task/evaluation tests + +- [ ] **Step 1: Move workflows** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/application/workflows +mv ergon_core/ergon_core/core/runtime/workflows/service.py ergon_core/ergon_core/core/application/workflows/service.py +mv ergon_core/ergon_core/core/runtime/workflows/orchestration.py ergon_core/ergon_core/core/application/workflows/orchestration.py +mv ergon_core/ergon_core/core/runtime/workflows/runs.py ergon_core/ergon_core/core/application/workflows/runs.py +mv ergon_core/ergon_core/core/runtime/workflows/models.py ergon_core/ergon_core/core/application/workflows/models.py +mv ergon_core/ergon_core/core/runtime/workflows/errors.py ergon_core/ergon_core/core/application/workflows/errors.py +touch ergon_core/ergon_core/core/application/workflows/__init__.py +rm -f ergon_core/ergon_core/core/runtime/workflows/__init__.py +rmdir ergon_core/ergon_core/core/runtime/workflows +``` + +- [ ] **Step 2: Move graph** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/application/graph +mv ergon_core/ergon_core/core/runtime/graph/repository.py ergon_core/ergon_core/core/application/graph/repository.py +mv ergon_core/ergon_core/core/runtime/graph/propagation.py ergon_core/ergon_core/core/application/graph/propagation.py +mv ergon_core/ergon_core/core/runtime/graph/traversal.py ergon_core/ergon_core/core/application/graph/traversal.py +mv ergon_core/ergon_core/core/runtime/graph/lookup.py ergon_core/ergon_core/core/application/graph/lookup.py +mv ergon_core/ergon_core/core/runtime/graph/dto.py ergon_core/ergon_core/core/application/graph/models.py +mv ergon_core/ergon_core/core/runtime/graph/errors.py ergon_core/ergon_core/core/application/graph/errors.py +touch ergon_core/ergon_core/core/application/graph/__init__.py +rm -f ergon_core/ergon_core/core/runtime/graph/__init__.py +rmdir ergon_core/ergon_core/core/runtime/graph +``` + +- [ ] **Step 3: Move tasks** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/application/tasks +mv ergon_core/ergon_core/core/runtime/tasks/*.py ergon_core/ergon_core/core/application/tasks/ +touch ergon_core/ergon_core/core/application/tasks/service.py +rmdir ergon_core/ergon_core/core/runtime/tasks +``` + +Set `ergon_core/ergon_core/core/application/tasks/service.py` to: + +```python +"""Task application package front door. + +Task lifecycle behavior currently lives in focused modules: +`execution`, `management`, `inspection`, and `cleanup`. 
+""" + +from ergon_core.core.application.tasks.execution import TaskExecutionService +from ergon_core.core.application.tasks.management import TaskManagementService + +__all__ = ["TaskExecutionService", "TaskManagementService"] +``` + +- [ ] **Step 4: Move evaluation** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/application/evaluation +mv ergon_core/ergon_core/core/runtime/evaluation/*.py ergon_core/ergon_core/core/application/evaluation/ +touch ergon_core/ergon_core/core/application/evaluation/__init__.py +rmdir ergon_core/ergon_core/core/runtime/evaluation +``` + +- [ ] **Step 5: Bulk update imports** + +Run: + +```bash +python - <<'PY' +from pathlib import Path + +replacements = { + "ergon_core.core.runtime.workflows": "ergon_core.core.application.workflows", + "ergon_core.core.runtime.graph.dto": "ergon_core.core.application.graph.models", + "ergon_core.core.runtime.graph": "ergon_core.core.application.graph", + "ergon_core.core.runtime.tasks": "ergon_core.core.application.tasks", + "ergon_core.core.runtime.evaluation": "ergon_core.core.application.evaluation", +} + +for root in [Path("ergon_core"), Path("ergon_cli"), Path("ergon_builtins"), Path("tests")]: + for path in root.rglob("*.py"): + text = path.read_text() + new = text + for old, replacement in replacements.items(): + new = new.replace(old, replacement) + if new != text: + path.write_text(new) +PY +``` + +- [ ] **Step 6: Run focused tests** + +Run: + +```bash +uv run pytest tests/unit/runtime/test_workflow_service.py tests/unit/runtime/test_graph_mutation_contracts.py tests/unit/runtime/test_graph_worker_identity.py tests/unit/runtime/test_task_execution_repository.py tests/unit/runtime/test_inngest_criterion_executor.py tests/unit/runtime/test_dynamic_task_evaluation_mapping.py -q +``` + +Expected: PASS. 
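+
+One subtlety worth noting: the Step 5 script relies on dict insertion order, so the more specific `runtime.graph.dto` key rewrites before the generic `runtime.graph` prefix can mangle it. A self-contained illustration (the imported name is hypothetical):
+
+```python
+replacements = {
+    "ergon_core.core.runtime.graph.dto": "ergon_core.core.application.graph.models",
+    "ergon_core.core.runtime.graph": "ergon_core.core.application.graph",
+}
+
+text = "from ergon_core.core.runtime.graph.dto import GraphNodeDto"
+for old, new in replacements.items():
+    text = text.replace(old, new)
+
+# The dto-specific rule fires first, so the generic rule has nothing left to rewrite.
+assert text == "from ergon_core.core.application.graph.models import GraphNodeDto"
+```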
+ +## Task 6: Move Read Models, Communication, Context, And Resources + +**Files:** +- Move: `core/runtime/read_models/{runs,run_snapshot,experiments,cohorts,resources,errors}.py` -> `core/application/read_models/` +- Split: communication DTOs from `read_models/models.py` -> `core/application/communication/models.py` +- Move: `core/runtime/read_models/communication.py` -> `core/application/communication/service.py` +- Move: remaining read model DTOs -> `core/application/read_models/models.py` +- Move: `core/runtime/context_events.py` -> `core/application/context/events.py` +- Move: `core/runtime/output_extraction.py` -> `core/application/context/output_extraction.py` +- Split: `core/runtime/resources.py` -> `core/application/resources/models.py` and `core/application/resources/repository.py` +- Test: dashboard/read-model/context/resource tests + +- [ ] **Step 1: Move read models** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/application/read_models +mv ergon_core/ergon_core/core/runtime/read_models/runs.py ergon_core/ergon_core/core/application/read_models/runs.py +mv ergon_core/ergon_core/core/runtime/read_models/run_snapshot.py ergon_core/ergon_core/core/application/read_models/run_snapshot.py +mv ergon_core/ergon_core/core/runtime/read_models/experiments.py ergon_core/ergon_core/core/application/read_models/experiments.py +mv ergon_core/ergon_core/core/runtime/read_models/cohorts.py ergon_core/ergon_core/core/application/read_models/cohorts.py +mv ergon_core/ergon_core/core/runtime/read_models/resources.py ergon_core/ergon_core/core/application/read_models/resources.py +mv ergon_core/ergon_core/core/runtime/read_models/errors.py ergon_core/ergon_core/core/application/read_models/errors.py +mv ergon_core/ergon_core/core/runtime/read_models/models.py ergon_core/ergon_core/core/application/read_models/models.py +touch ergon_core/ergon_core/core/application/read_models/__init__.py +``` + +- [ ] **Step 2: Move communication domain** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/application/communication +mv ergon_core/ergon_core/core/runtime/read_models/communication.py ergon_core/ergon_core/core/application/communication/service.py +touch ergon_core/ergon_core/core/application/communication/__init__.py +touch ergon_core/ergon_core/core/application/communication/errors.py +touch ergon_core/ergon_core/core/application/communication/models.py +rm ergon_core/ergon_core/core/runtime/read_models/__init__.py +rmdir ergon_core/ergon_core/core/runtime/read_models +``` + +Move `RunCommunicationMessageDto` and `RunCommunicationThreadDto` from `application/read_models/models.py` into `application/communication/models.py`, then update imports to read from `ergon_core.core.application.communication.models`. + +- [ ] **Step 3: Move context domain** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/application/context +mv ergon_core/ergon_core/core/runtime/context_events.py ergon_core/ergon_core/core/application/context/events.py +mv ergon_core/ergon_core/core/runtime/output_extraction.py ergon_core/ergon_core/core/application/context/output_extraction.py +touch ergon_core/ergon_core/core/application/context/__init__.py +``` + +- [ ] **Step 4: Split resources module** + +Create `ergon_core/ergon_core/core/application/resources/models.py` with `RunResourceView`. + +Create `ergon_core/ergon_core/core/application/resources/repository.py` with `RunResourceRepository`. + +Delete `ergon_core/ergon_core/core/runtime/resources.py`. 
+ +Use this package initializer: + +```python +from ergon_core.core.application.resources.models import RunResourceView +from ergon_core.core.application.resources.repository import RunResourceRepository + +__all__ = ["RunResourceRepository", "RunResourceView"] +``` + +- [ ] **Step 5: Bulk update imports** + +Run: + +```bash +python - <<'PY' +from pathlib import Path + +replacements = { + "ergon_core.core.runtime.read_models.communication": "ergon_core.core.application.communication.service", + "ergon_core.core.runtime.read_models": "ergon_core.core.application.read_models", + "ergon_core.core.runtime.context_events": "ergon_core.core.application.context.events", + "ergon_core.core.runtime.output_extraction": "ergon_core.core.application.context.output_extraction", + "ergon_core.core.runtime.resources": "ergon_core.core.application.resources", +} + +for root in [Path("ergon_core"), Path("ergon_cli"), Path("ergon_builtins"), Path("tests")]: + for path in root.rglob("*.py"): + text = path.read_text() + new = text + for old, replacement in replacements.items(): + new = new.replace(old, replacement) + if new != text: + path.write_text(new) +PY +``` + +- [ ] **Step 6: Run focused tests** + +Run: + +```bash +uv run pytest tests/unit/dashboard/test_communication_threads.py tests/unit/runtime/test_communication_service.py tests/unit/persistence/test_context_event_repository.py tests/unit/runtime/test_persist_outputs_resources.py tests/unit/runtime/test_experiment_read_service.py tests/unit/runtime/test_cohort_service.py -q +``` + +Expected: PASS. + +## Task 7: Split Inngest Handlers Into Application Jobs And Infrastructure Adapters + +**Files:** +- Move semantic logic: `core/runtime/inngest/{handler files}.py` -> `core/application/jobs/{handler files}.py` +- Create: `core/application/jobs/models.py` +- Create thin adapters: `core/infrastructure/inngest/handlers/{handler files}.py` +- Move: `runtime/inngest/client.py` -> `infrastructure/inngest/client.py` +- Move: `runtime/inngest/registry.py` -> `infrastructure/inngest/registry.py` +- Move: `runtime/inngest/contracts.py` -> `infrastructure/inngest/contracts.py` +- Move: `runtime/inngest/errors.py` -> `infrastructure/inngest/errors.py` +- Test: Inngest/runtime unit tests and import registry tests + +- [ ] **Step 1: Move infrastructure primitives** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/infrastructure/inngest/handlers +mv ergon_core/ergon_core/core/runtime/inngest/client.py ergon_core/ergon_core/core/infrastructure/inngest/client.py +mv ergon_core/ergon_core/core/runtime/inngest/registry.py ergon_core/ergon_core/core/infrastructure/inngest/registry.py +mv ergon_core/ergon_core/core/runtime/inngest/contracts.py ergon_core/ergon_core/core/infrastructure/inngest/contracts.py +mv ergon_core/ergon_core/core/runtime/inngest/errors.py ergon_core/ergon_core/core/infrastructure/inngest/errors.py +touch ergon_core/ergon_core/core/infrastructure/__init__.py +touch ergon_core/ergon_core/core/infrastructure/inngest/__init__.py +touch ergon_core/ergon_core/core/infrastructure/inngest/handlers/__init__.py +``` + +- [ ] **Step 2: Move handler semantics into jobs** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/application/jobs +for name in cancel_orphan_subtasks check_evaluators cleanup_cancelled_task complete_workflow evaluate_task_run execute_task fail_workflow persist_outputs propagate_execution run_cleanup sandbox_setup start_workflow worker_execute; do + mv "ergon_core/ergon_core/core/runtime/inngest/${name}.py" 
"ergon_core/ergon_core/core/application/jobs/${name}.py" +done +touch ergon_core/ergon_core/core/application/jobs/__init__.py +rm ergon_core/ergon_core/core/runtime/inngest/__init__.py 2>/dev/null || true +rmdir ergon_core/ergon_core/core/runtime/inngest +``` + +- [ ] **Step 3: Add thin adapters** + +For each moved job, remove the Inngest decorator from the application job file and expose an async `run__job(...)` function that contains the semantic behavior. The infrastructure handler owns the `@inngest_client.create_function(...)` decorator and delegates to the application job. + +For `worker_execute`, transform `core/application/jobs/worker_execute.py` so it starts like this: + +```python +"""Application job for worker execution.""" + +import logging +import traceback +from datetime import UTC, datetime + +from ergon_core.api.benchmark import EmptyTaskPayload, Task +from ergon_core.api.worker import WorkerContext +from ergon_core.core.application.context.events import ContextEventService +from ergon_core.core.application.experiments.repository import DefinitionRepository +from ergon_core.core.application.jobs.models import WorkerExecuteJobRequest +from ergon_core.core.application.jobs.models import WorkerExecuteJobResult +from ergon_core.core.domain.generation.context_parts import ContextPartChunk +from ergon_core.core.infrastructure.dashboard.provider import get_dashboard_emitter +from ergon_core.core.persistence.shared.db import get_session +from ergon_core.core.infrastructure.inngest.errors import RegistryLookupError +from ergon_core.core.infrastructure.tracing import ( + CompletedSpan, + get_trace_sink, + worker_execute_context, +) +from pydantic import BaseModel + +logger = logging.getLogger(__name__) + + +async def run_worker_execute_job(payload: WorkerExecuteJobRequest) -> WorkerExecuteJobResult: + from ergon_builtins.registry import BENCHMARKS, WORKERS + + # Move the current body of worker_execute_fn here, replacing ctx.event.data + # with the typed payload argument. +``` + +Create `core/application/jobs/models.py` for job request/result aliases imported from Inngest contracts during the first migration: + +```python +"""Application job contracts. + +These mirror external Inngest event contracts during the migration so job logic +can be called independently of Inngest decorators. 
+"""
+
+from ergon_core.core.infrastructure.inngest.contracts import (
+    CleanupCancelledTaskRequest,
+    CleanupCancelledTaskResult,
+    CompleteWorkflowRequest,
+    CompleteWorkflowResult,
+    EvaluateTaskRequest,
+    EvaluateTaskResult,
+    ExecuteTaskRequest,
+    ExecuteTaskResult,
+    PropagateExecutionRequest,
+    PropagateExecutionResult,
+    SandboxSetupRequest,
+    SandboxSetupResult,
+    StartWorkflowRequest,
+    StartWorkflowResult,
+    WorkerExecuteRequest as WorkerExecuteJobRequest,
+    WorkerExecuteResult as WorkerExecuteJobResult,
+)
+
+__all__ = [
+    "CleanupCancelledTaskRequest",
+    "CleanupCancelledTaskResult",
+    "CompleteWorkflowRequest",
+    "CompleteWorkflowResult",
+    "EvaluateTaskRequest",
+    "EvaluateTaskResult",
+    "ExecuteTaskRequest",
+    "ExecuteTaskResult",
+    "PropagateExecutionRequest",
+    "PropagateExecutionResult",
+    "SandboxSetupRequest",
+    "SandboxSetupResult",
+    "StartWorkflowRequest",
+    "StartWorkflowResult",
+    "WorkerExecuteJobRequest",
+    "WorkerExecuteJobResult",
+]
+```
+
+Create `core/infrastructure/inngest/handlers/worker_execute.py` as the thin adapter:
+
+```python
+"""Inngest adapter for worker execution."""
+
+import inngest
+
+from ergon_core.core.application.jobs.worker_execute import run_worker_execute_job
+from ergon_core.core.infrastructure.inngest.client import inngest_client
+from ergon_core.core.infrastructure.inngest.contracts import (
+    WorkerExecuteRequest,
+    WorkerExecuteResult,
+)
+
+
+@inngest_client.create_function(
+    fn_id="worker-execute",
+    trigger=inngest.TriggerEvent(event="task/worker-execute"),
+    retries=0,
+    output_type=WorkerExecuteResult,
+)
+async def worker_execute_fn(ctx: inngest.Context) -> WorkerExecuteResult:
+    return await run_worker_execute_job(WorkerExecuteRequest.model_validate(ctx.event.data))
+
+__all__ = ["worker_execute_fn"]
+```
+
+Use the same pattern for every handler: `application/jobs/<name>.py` exports `run_<name>_job`, and `infrastructure/inngest/handlers/<name>.py` owns the decorator and event parsing. Preserve the existing `fn_id`, trigger event, retry policy, and output type from the original handler.
+
+- [ ] **Step 4: Update registry imports**
+
+In `core/infrastructure/inngest/registry.py`, import handler modules from `ergon_core.core.infrastructure.inngest.handlers`.
+
+If the registry currently imports function objects from handler modules, keep the same object names and only change module paths.
+
+- [ ] **Step 5: Bulk update imports**
+
+Run:
+
+```bash
+python - <<'PY'
+from pathlib import Path
+
+replacements = {
+    "ergon_core.core.runtime.inngest.client": "ergon_core.core.infrastructure.inngest.client",
+    "ergon_core.core.runtime.inngest.registry": "ergon_core.core.infrastructure.inngest.registry",
+    "ergon_core.core.runtime.inngest.contracts": "ergon_core.core.infrastructure.inngest.contracts",
+    "ergon_core.core.runtime.inngest.errors": "ergon_core.core.infrastructure.inngest.errors",
+    "ergon_core.core.runtime.inngest.": "ergon_core.core.application.jobs.",
+}
+
+for root in [Path("ergon_core"), Path("ergon_cli"), Path("ergon_builtins"), Path("tests")]:
+    for path in root.rglob("*.py"):
+        text = path.read_text()
+        new = text
+        for old, replacement in replacements.items():
+            new = new.replace(old, replacement)
+        if new != text:
+            path.write_text(new)
+PY
+```
+
+After the script, inspect `core/infrastructure/inngest/registry.py` and adapter files. Registry imports should point to `infrastructure.inngest.handlers`, not `application.jobs`.
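+
+That inspection can be scripted; a minimal sketch, assuming the registry file sits at the path used throughout this task:
+
+```python
+from pathlib import Path
+
+registry = Path("ergon_core/ergon_core/core/infrastructure/inngest/registry.py").read_text()
+
+# Adapters, not application jobs, are what the registry should wire up.
+assert "ergon_core.core.infrastructure.inngest.handlers" in registry
+assert "ergon_core.core.application.jobs" not in registry
+```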
+ +- [ ] **Step 6: Run focused tests** + +Run: + +```bash +uv run pytest tests/unit/runtime/test_child_function_payloads.py tests/unit/runtime/test_inngest_criterion_executor.py tests/unit/runtime/test_import_boundaries.py tests/unit/registry/test_react_factories.py -q +``` + +Expected: PASS. + +## Task 8: Move Infrastructure Packages + +**Files:** +- Move: `core/sandbox/*` -> `core/infrastructure/sandbox/*` +- Move: `core/dashboard/*` -> `core/infrastructure/dashboard/*` +- Move: `core/runtime/tracing/*` -> `core/infrastructure/tracing/*` +- Move: `core/runtime/dependencies.py` -> `core/infrastructure/dependencies.py` +- Modify: imports across source and tests +- Test: dashboard, sandbox, tracing, dependency tests + +- [ ] **Step 1: Move sandbox** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/infrastructure/sandbox +mv ergon_core/ergon_core/core/sandbox/*.py ergon_core/ergon_core/core/infrastructure/sandbox/ +rmdir ergon_core/ergon_core/core/sandbox +``` + +- [ ] **Step 2: Move dashboard** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/infrastructure/dashboard +mv ergon_core/ergon_core/core/dashboard/*.py ergon_core/ergon_core/core/infrastructure/dashboard/ +rmdir ergon_core/ergon_core/core/dashboard +``` + +- [ ] **Step 3: Move tracing and dependency probe** + +Run: + +```bash +mkdir -p ergon_core/ergon_core/core/infrastructure/tracing +mv ergon_core/ergon_core/core/runtime/tracing/*.py ergon_core/ergon_core/core/infrastructure/tracing/ +rmdir ergon_core/ergon_core/core/runtime/tracing +mv ergon_core/ergon_core/core/runtime/dependencies.py ergon_core/ergon_core/core/infrastructure/dependencies.py +``` + +- [ ] **Step 4: Bulk update imports** + +Run: + +```bash +python - <<'PY' +from pathlib import Path + +replacements = { + "ergon_core.core.sandbox": "ergon_core.core.infrastructure.sandbox", + "ergon_core.core.dashboard": "ergon_core.core.infrastructure.dashboard", + "ergon_core.core.runtime.tracing": "ergon_core.core.infrastructure.tracing", + "ergon_core.core.runtime.dependencies": "ergon_core.core.infrastructure.dependencies", +} + +for root in [Path("ergon_core"), Path("ergon_cli"), Path("ergon_builtins"), Path("tests")]: + for path in root.rglob("*.py"): + text = path.read_text() + new = text + for old, replacement in replacements.items(): + new = new.replace(old, replacement) + if new != text: + path.write_text(new) +PY +``` + +- [ ] **Step 5: Run focused tests** + +Run: + +```bash +uv run pytest tests/unit/dashboard/test_event_contract_types.py tests/unit/runtime/test_sandbox_setup_explicit_slug.py tests/unit/benchmarks/test_swebench_sandbox_manager.py tests/unit/state/test_benchmark_contract.py -q +``` + +Expected: PASS. 
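+
+Because Step 3 moves `runtime/tracing/*.py` wholesale, the package `__init__.py` and its re-exports travel with it, which is what lets the Task 7 job snippet import tracing names from the package root. A before/after view (symbol names taken from that snippet):
+
+```python
+# Before Task 8:
+from ergon_core.core.runtime.tracing import CompletedSpan, get_trace_sink
+
+# After the move and the bulk rename:
+from ergon_core.core.infrastructure.tracing import CompletedSpan, get_trace_sink
+```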
+ +## Task 9: Move Application Events, Remove Runtime Root, And Add Durable Import Direction Guards + +**Files:** +- Move: `ergon_core/ergon_core/core/runtime/events/*` -> `ergon_core/ergon_core/core/application/events/*` +- Delete: `ergon_core/ergon_core/core/runtime/` +- Modify: `tests/unit/architecture/test_core_schema_sources.py` +- Test: architecture suite + +- [ ] **Step 1: Delete empty runtime root** + +First move the remaining semantic event contracts out of runtime: + +```bash +mkdir -p ergon_core/ergon_core/core/application/events +mv ergon_core/ergon_core/core/runtime/events/*.py ergon_core/ergon_core/core/application/events/ +rmdir ergon_core/ergon_core/core/runtime/events +``` + +Then update imports: + +```bash +python - <<'PY' +from pathlib import Path + +for root in [Path("ergon_core"), Path("ergon_cli"), Path("ergon_builtins"), Path("tests")]: + for path in root.rglob("*.py"): + text = path.read_text() + new = text.replace( + "ergon_core.core.runtime.events", + "ergon_core.core.application.events", + ) + if new != text: + path.write_text(new) +PY +``` + +Now delete the empty runtime root: + +Run: + +```bash +rmdir ergon_core/ergon_core/core/runtime +``` + +Expected: command succeeds because all runtime subpackages and files have moved. + +- [ ] **Step 2: Add durable root guard** + +Append to `tests/unit/architecture/test_core_schema_sources.py`: + +```python +def test_core_uses_hybrid_domain_layout_roots() -> None: + core = ROOT / "ergon_core/ergon_core/core" + + expected_dirs = { + "application", + "domain", + "infrastructure", + "persistence", + "rest_api", + "rl", + "shared", + } + actual_dirs = {path.name for path in core.iterdir() if path.is_dir() and path.name != "__pycache__"} + + assert expected_dirs <= actual_dirs + assert "runtime" not in actual_dirs + assert "api" not in actual_dirs + assert "definitions" not in actual_dirs + assert "composition" not in actual_dirs + assert "sandbox" not in actual_dirs + assert "dashboard" not in actual_dirs +``` + +- [ ] **Step 3: Add import direction guard** + +Append to `tests/unit/architecture/test_core_schema_sources.py`: + +```python +def test_core_hybrid_layout_import_directions() -> None: + forbidden_imports = { + "domain": ( + "ergon_core.core.application", + "ergon_core.core.persistence", + "ergon_core.core.infrastructure", + "ergon_core.core.rest_api", + ), + "persistence": ( + "ergon_core.core.application", + "ergon_core.core.infrastructure", + "ergon_core.core.rest_api", + ), + "application": ( + "ergon_core.core.rest_api", + "ergon_core.core.infrastructure.inngest.handlers", + ), + } + + offenders: list[str] = [] + for root_name, snippets in forbidden_imports.items(): + root = ROOT / "ergon_core/ergon_core/core" / root_name + for path in root.rglob("*.py"): + text = path.read_text() + for snippet in snippets: + if snippet in text: + offenders.append(f"{path.relative_to(ROOT)} imports {snippet}") + + assert offenders == [] +``` + +- [ ] **Step 4: Add job adapter split guard** + +Append to `tests/unit/architecture/test_core_schema_sources.py`: + +```python +def test_application_jobs_do_not_own_inngest_decorators() -> None: + jobs_root = ROOT / "ergon_core/ergon_core/core/application/jobs" + offenders: list[str] = [] + + for path in jobs_root.rglob("*.py"): + text = path.read_text() + if "@inngest_client.create_function" in text or "import inngest" in text: + offenders.append(str(path.relative_to(ROOT))) + if "ergon_core.core.infrastructure.inngest.handlers" in text: + offenders.append(str(path.relative_to(ROOT))) + + 
assert offenders == [] +``` + +- [ ] **Step 5: Run architecture tests** + +Run: + +```bash +uv run pytest tests/unit/architecture -q +``` + +Expected: PASS except the temporary exact-layout test may still fail if additional unexpected files exist. If it fails, inspect the exact `unexpected` list and decide whether the target doc should include those files or the files should move/delete. + +## Task 10: Finalize Exact Layout, Delete Temporary Test + +**Files:** +- Delete: `tests/unit/architecture/test_core_hybrid_layout_temporary.py` +- Modify: `docs/superpowers/plans/2026-04-28-core-hybrid-domain-layout.md` if any final file names changed during implementation +- Test: architecture suite and focused regression suite + +- [ ] **Step 1: Run temporary exact-layout test one last time** + +Run: + +```bash +uv run pytest tests/unit/architecture/test_core_hybrid_layout_temporary.py -q +``` + +Expected: PASS. This proves the temporary exact target was achieved before deleting the brittle guard. + +- [ ] **Step 2: Delete the temporary test** + +Run: + +```bash +rm tests/unit/architecture/test_core_hybrid_layout_temporary.py +``` + +- [ ] **Step 3: Run architecture and focused regression tests** + +Run: + +```bash +uv run pytest tests/unit/architecture tests/unit/runtime/test_workflow_service.py tests/unit/runtime/test_task_execution_repository.py tests/unit/runtime/test_inngest_criterion_executor.py tests/unit/dashboard/test_communication_threads.py tests/unit/cli/test_experiment_cli.py tests/unit/benchmarks/test_swebench_sandbox_manager.py -q +``` + +Expected: PASS. + +- [ ] **Step 4: Run ruff on moved source and tests** + +Run: + +```bash +uv run ruff check ergon_core ergon_cli ergon_builtins tests/unit/architecture +``` + +Expected: PASS. + +## Task 11: Broad Verification + +**Files:** +- Modify: none unless tests reveal missed imports +- Test: broad unit/integration suite as time permits + +- [ ] **Step 1: Search for stale paths** + +Run: + +```bash +rg "ergon_core\\.core\\.(runtime|api|definitions|composition|sandbox|dashboard)|core/runtime|core/api|core/definitions|core/composition|core/sandbox|core/dashboard" ergon_core ergon_cli ergon_builtins tests docs/superpowers/plans/2026-04-28-core-hybrid-domain-layout.md +``` + +Expected: no stale code imports. Documentation may mention old paths only in current-to-target move maps. + +- [ ] **Step 2: Run broad unit tests** + +Run: + +```bash +uv run pytest tests/unit -q +``` + +Expected: PASS, or failures only from known environment import-resolution issues. Fix any migration-related import failures. + +- [ ] **Step 3: Run targeted integration tests** + +Run: + +```bash +uv run pytest tests/integration/propagation tests/integration/restart tests/integration/smokes -q +``` + +Expected: PASS, or failures clearly unrelated to package movement. + +## Self-Review Checklist + +- Every moved package has a target path in the plan. +- The temporary exact folder test is added first and deleted in the final cleanup. +- `core/rl` remains top-level. +- `core/rest_api` is distinct from public `ergon_core.api`. +- Inngest semantic jobs land in `application/jobs`; adapters land in `infrastructure/inngest/handlers`. +- No compatibility aliases are required by the plan. +- No git commits are required by the plan. 
diff --git a/docs/superpowers/plans/2026-04-28-core-hybrid-domain-layout.md b/docs/superpowers/plans/2026-04-28-core-hybrid-domain-layout.md new file mode 100644 index 00000000..685b2316 --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-core-hybrid-domain-layout.md @@ -0,0 +1,584 @@ +# Core Hybrid Domain Layout + +This documents the implemented hybrid layout for `ergon_core.core`: hard +technical layers stay visible (`rest_api`, `persistence`, `infrastructure`), +while product/application concepts live in explicit clusters under +`core/application`. + +The goal is not "everything is domain-first". The goal is that a new contributor +can answer three questions quickly: + +1. Where do use cases live? +2. Where do SQL/storage rows live? +3. Where do transport/infrastructure adapters live? + +## Implemented Top-Level Shape + +```text +ergon_core/ergon_core/core/ + __init__.py + + rest_api/ + # FastAPI / HTTP transport only. + # Named rest_api to avoid confusion with the public authoring API + # under ergon_core.api. + # Should import application services and read models, not own domain logic. + __init__.py + app.py + cohorts.py + experiments.py + rollouts.py + runs.py + test_harness.py + + application/ + # Product use cases and domain-aware repositories. + # This replaces the current "runtime as second root" feeling. + + experiments/ + # Define experiments, persist authored definitions, launch experiment runs. + # Implemented from: + # - core/definitions/service.py + # - core/definitions/persistence.py + # - core/definitions/repository.py + # - core/definitions/schemas.py + # - runtime/workflows/launch.py + __init__.py + service.py + models.py + repository.py + definition_writer.py + launch.py + + workflows/ + # Run/workflow lifecycle after a definition exists. + # Implemented from: + # - runtime/workflows/service.py + # - runtime/workflows/orchestration.py + # - runtime/workflows/runs.py + # - runtime/workflows/models.py + # - runtime/workflows/errors.py + service.py + orchestration.py + runs.py + models.py + errors.py + + graph/ + # Runtime graph mutations, traversal, lookup, and propagation. + # Implemented from: + # - runtime/graph/* + repository.py + propagation.py + traversal.py + lookup.py + models.py + errors.py + + tasks/ + # Task execution lifecycle and task execution repository. + # Implemented from: + # - runtime/tasks/* + __init__.py + service.py + execution.py + management.py + inspection.py + cleanup.py + repository.py + models.py + errors.py + + evaluation/ + # Evaluation dispatch, criterion runtime, scoring, persistence use cases. + # Implemented from: + # - runtime/evaluation/* + service.py + executors.py + inngest_executor.py + criterion_runtime.py + scoring.py + protocols.py + models.py + errors.py + + read_models/ + # Query-side DTO assembly for UI/API surfaces. + # Implemented from: + # - runtime/read_models/runs.py + # - runtime/read_models/run_snapshot.py + # - runtime/read_models/experiments.py + # - runtime/read_models/cohorts.py + # - runtime/read_models/resources.py + # - runtime/read_models/models.py + # - runtime/read_models/errors.py + __init__.py + runs.py + run_snapshot.py + experiments.py + cohorts.py + resources.py + models.py + errors.py + + communication/ + # Agent-to-agent communication is its own product domain. + # Do not fold this into run read models. 
+ # Implemented from: + # - runtime/read_models/communication.py + # - relevant communication DTOs currently in runtime/read_models/models.py + __init__.py + service.py + models.py + errors.py + + context/ + # Worker context event stream and output extraction. + # Implemented from: + # - runtime/context_events.py + # - runtime/output_extraction.py + __init__.py + events.py + output_extraction.py + + jobs/ + # Core semantic workflows currently implemented inside Inngest handlers. + # These are background job use cases. Inngest should call them, not own + # their branching, persistence, and orchestration rules. + # Implemented from: + # - runtime/inngest/{handler files}.py, after extracting adapter details. + cancel_orphan_subtasks.py + check_evaluators.py + cleanup_cancelled_task.py + complete_workflow.py + evaluate_task_run.py + execute_task.py + fail_workflow.py + persist_outputs.py + propagate_execution.py + run_cleanup.py + sandbox_setup.py + start_workflow.py + worker_execute.py + models.py + + resources/ + # Run resource append/query use cases that are not just API presentation. + # Implemented from: + # - runtime/resources.py + # - sandbox/resource_publisher.py may depend on repository here + __init__.py + repository.py + models.py + + events/ + # Product/application event contracts used by jobs, adapters, and + # dashboard emission. The adapter layer may send these through Inngest, + # but it should not own their semantic schemas. + # Implemented from: + # - runtime/events/* + __init__.py + base.py + task_events.py + infrastructure_events.py + + domain/ + # Pure-ish domain objects that should not know about DB sessions, + # Inngest, FastAPI, or dashboard emission. + + experiments/ + # Authoring/composition objects. + # Implemented from: + # - core/composition/* + __init__.py + experiment.py + handles.py + worker_spec.py + validation.py + + generation/ + # Context stream and generation transcript primitives. + # Implemented from: + # - core/generation.py + context_parts.py + + persistence/ + # SQLModel rows, DB/session helpers, and storage-only repositories. + # Should not own product workflows or read-model assembly. + + shared/ + db.py + enums.py + ids.py + types.py + + definitions/ + models.py + + telemetry/ + models.py + repositories.py + evaluation_summary.py + + graph/ + models.py + status_conventions.py + + context/ + models.py + event_payloads.py + + saved_specs/ + models.py + + infrastructure/ + # External adapters and operational plumbing. + # Infrastructure calls application services; application should not import + # concrete infrastructure except through deliberate adapter seams. + + inngest/ + # Inngest client, contracts, registry, and thin function adapters. + # Implemented from: + # - runtime/inngest/client.py + # - runtime/inngest/registry.py + # - runtime/inngest/contracts.py + # - runtime/inngest/errors.py + # - runtime/inngest/{handler files}.py after semantic logic moves to + # application/jobs. + client.py + registry.py + contracts.py + errors.py + + handlers/ + cancel_orphan_subtasks.py + check_evaluators.py + cleanup_cancelled_task.py + complete_workflow.py + evaluate_task_run.py + execute_task.py + fail_workflow.py + persist_outputs.py + propagate_execution.py + run_cleanup.py + sandbox_setup.py + start_workflow.py + worker_execute.py + + sandbox/ + # E2B/local sandbox managers and sandbox instrumentation. 
+ # Implemented from: + # - core/sandbox/* + __init__.py + manager.py + lifecycle.py + resource_publisher.py + instrumentation.py + event_sink.py + errors.py + utils.py + + dashboard/ + # Dashboard event emission/integration. + # Implemented from: + # - core/dashboard/* + __init__.py + emitter.py + provider.py + event_contracts.py + + tracing/ + # Tracing/OpenTelemetry adapters and sinks. + # Implemented from: + # - runtime/tracing/* + __init__.py + attributes.py + contexts.py + ids.py + noop.py + otel.py + sinks.py + types.py + + dependencies.py + + rl/ + # Keep as a separate bounded context for now. + # Rollouts, rewards, extraction, checkpointing, and vLLM management cut + # across product use cases and are closer to training/research machinery + # than ordinary application services. + __init__.py + rollout_service.py + eval_runner.py + extraction.py + rewards.py + checkpoint.py + rollout_types.py + vllm_manager.py + + shared/ + # Small cross-cutting primitives. Keep this boring and sparse. + json_types.py + settings.py + utils.py +``` + +## Clusters And Ownership Rules + +### `core/application` + +Application packages own use cases. They can import: + +- `core/domain` +- `core/persistence` +- `core/shared` + +They should not import: + +- `core/rest_api` +- Inngest function modules +- FastAPI router modules + +`application` is where the former `runtime` domains landed. The important rename +is conceptual: the old `runtime` package mixed use cases, adapters, and +operational helpers, while `application` now means "use cases over the persisted +product model." + +### `core/domain` + +Domain packages own objects that should be understandable without infrastructure: + +- experiment composition +- worker specs +- definition handles +- context/generation primitives + +These modules should not create DB sessions, emit dashboard events, or know about +Inngest. They may validate invariants and expose plain objects. + +### `core/persistence` + +Persistence owns rows and storage helpers. It should not own product decisions. + +Examples that should stay here: + +- SQLModel row classes +- session helpers +- enum/storage types +- storage-only repositories + +Examples that should not live here: + +- query-bag application workflows +- evaluation summary refresh orchestration +- context event sequencing logic +- run snapshot assembly + +### `core/infrastructure` + +Infrastructure owns adapters: + +- Inngest client, registry, contracts, and thin function adapters +- sandbox manager/resource publisher +- dashboard emitter +- tracing adapters +- package dependency probes + +Infrastructure modules can call application services. They should not become +the canonical home for business rules. Inngest handlers are split so core +semantic logic lives in `application/jobs`, while the Inngest-decorated shell +remains under `infrastructure/inngest/handlers`. + +### `core/rest_api` + +`core/rest_api` is the HTTP layer. The explicit name keeps it visually separate +from `ergon_core.api`, which is the public authoring/types API for builtins, +CLI, and students. It should be thin: + +- validate/deserialize transport requests +- call application services/read models +- map missing resources to HTTP errors + +It should not define reusable domain DTOs just because the frontend consumes +them. Those belong in `application/read_models` or the relevant application +domain. 
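+
+A minimal sketch of that thinness; the route and the read-model helper are hypothetical, only the package layout comes from this document:
+
+```python
+from fastapi import APIRouter, HTTPException
+
+# Hypothetical helper; the real module is application/read_models/run_snapshot.py.
+from ergon_core.core.application.read_models.run_snapshot import load_run_snapshot
+
+router = APIRouter()
+
+
+@router.get("/runs/{run_id}")
+async def get_run(run_id: str):
+    # Deserialize transport input, call the application layer, map misses to HTTP.
+    snapshot = await load_run_snapshot(run_id)
+    if snapshot is None:
+        raise HTTPException(status_code=404, detail="run not found")
+    return snapshot
+```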
+ +## Implemented Move Map + +```text +core/definitions/service.py + -> core/application/experiments/service.py + +core/definitions/schemas.py + -> core/application/experiments/models.py + +core/definitions/repository.py + -> core/application/experiments/repository.py + +core/definitions/persistence.py + -> core/application/experiments/definition_writer.py + +core/composition/* + -> core/domain/experiments/* + +core/runtime/workflows/* + -> core/application/workflows/* + except runtime/workflows/launch.py + -> core/application/experiments/launch.py + +core/runtime/graph/* + -> core/application/graph/* + +core/runtime/tasks/* + -> core/application/tasks/* + +core/runtime/evaluation/* + -> core/application/evaluation/* + +core/runtime/read_models/runs.py +core/runtime/read_models/run_snapshot.py +core/runtime/read_models/experiments.py +core/runtime/read_models/cohorts.py +core/runtime/read_models/resources.py +core/runtime/read_models/errors.py +core/runtime/read_models/models.py + -> core/application/read_models/* + +core/runtime/read_models/communication.py + -> core/application/communication/service.py + +communication DTOs from core/runtime/read_models/models.py + -> core/application/communication/models.py + +core/runtime/context_events.py + -> core/application/context/events.py + +core/runtime/output_extraction.py + -> core/application/context/output_extraction.py + +core/runtime/resources.py + -> core/application/resources/models.py + -> core/application/resources/repository.py + +core/runtime/events/* + -> core/application/events/* + +core/rl/* + -> core/rl/* + # Keep in place for now as a separate bounded context. + +core/runtime/inngest/client.py +core/runtime/inngest/registry.py +core/runtime/inngest/contracts.py +core/runtime/inngest/errors.py + -> core/infrastructure/inngest/* + +core/runtime/inngest/{handler files}.py + -> core/application/jobs/{handler files}.py + -> core/infrastructure/inngest/handlers/{handler files}.py + # Split each handler: semantic background job into application/jobs, + # Inngest decorator/event adapter into infrastructure/inngest/handlers. + +core/sandbox/* + -> core/infrastructure/sandbox/* + +core/dashboard/* + -> core/infrastructure/dashboard/* + +core/runtime/tracing/* + -> core/infrastructure/tracing/* + +core/runtime/dependencies.py + -> core/infrastructure/dependencies.py + +core/generation.py + -> core/domain/generation/context_parts.py + +core/json_types.py +core/settings.py +core/utils.py + -> core/shared/* +``` + +## Deleted Legacy Paths + +```text +core/runtime/ + # Deleted after all subpackages moved. + +core/definitions/ + # Deleted after experiment lifecycle files moved to application/experiments. + +core/composition/ + # Deleted after pure domain objects moved to domain/experiments. + +core/sandbox/ +core/dashboard/ + # Deleted after infrastructure moved. + +core/generation.py +core/json_types.py +core/settings.py +core/utils.py + # Deleted after shared/domain moves. 
+``` + +## Import Direction Guardrails + +```text +api -> application -> domain +api -> application -> persistence +api -> shared + +infrastructure -> application +infrastructure -> domain +infrastructure -> persistence +infrastructure -> shared + +application -> domain +application -> persistence +application -> shared + +persistence -> shared +persistence -> domain/generation only if row payload parsing requires typed context parts + +domain -> shared +``` + +Forbidden directions: + +```text +domain -> application +domain -> persistence +domain -> infrastructure +domain -> rest_api + +persistence -> application +persistence -> infrastructure +persistence -> rest_api + +application -> rest_api +application -> infrastructure/inngest/handlers +``` + +## Resolved Decisions + +1. This intentionally keeps `communication` separate from run read models. It is + a product domain for agents communicating with each other. +2. `read_models` stays as a query-side application cluster instead of being + split into every domain. That reduces churn while keeping REST + routers thin. +3. `application/jobs` keeps the core semantics of externally-triggered + background workflows visible. `infrastructure/inngest/handlers` should be + thin wrappers around those use cases. +4. `persistence` remains a visible top-level layer because hiding SQL rows + inside product domains would make storage contracts harder to + audit. +5. Old-path compatibility aliases are intentionally avoided. Bulk import renames + keep the finalized package structure explicit. +6. `domain/generation/context_parts.py` remains the name for generation context + primitives. +7. Dashboard emission stays under `infrastructure/dashboard`, while product + event contracts live under `application/events`. +8. `core/rl` remains its own bounded context instead of being renamed to + `core/learning`. diff --git a/docs/superpowers/plans/2026-04-28-core-schema-deduplication.md b/docs/superpowers/plans/2026-04-28-core-schema-deduplication.md new file mode 100644 index 00000000..db086f5d --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-core-schema-deduplication.md @@ -0,0 +1,1178 @@ +# Core Schema Deduplication Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make core workflow statuses, evaluation statuses, graph mutation payloads, event causes, and projection schemas have one clear source of truth per domain. + +**Architecture:** Keep persisted table schemas in `core/persistence/*`, graph lifecycle conventions in `core/persistence/graph/status_conventions.py`, typed graph mutation payloads in `core/runtime/services/graph_dto.py`, evaluation summary status in `core/persistence/telemetry/evaluation_summary.py`, and transport-specific projections in `core/api/schemas.py` and `core/dashboard/event_contracts.py`. REST and dashboard layers may project canonical DTOs, but must not redefine domain meaning. + +**Tech Stack:** Python 3.13, Pydantic v2, SQLModel, pytest, ty-compatible type aliases, existing Ergon core runtime/persistence packages. + +--- + +## Source Of Truth Decisions + +| Concept | Source of truth | Consumers should import from | Cleanup rule | +|---|---|---|---| +| Run row lifecycle | `ergon_core.core.persistence.shared.enums.RunStatus` | `core.persistence.shared.enums` | Only use for `RunRecord.status` and run-level orchestration. 
| +| Task execution row lifecycle | `ergon_core.core.persistence.shared.enums.TaskExecutionStatus` | `core.persistence.shared.enums` | Only use for `RunTaskExecution.status`; do not use it as the graph-node status type. | +| Graph node lifecycle | `ergon_core.core.persistence.graph.status_conventions.NodeStatus` and constants | `core.persistence.graph.status_conventions` | Use for `RunGraphNode.status`, propagation, subtask inspection, dashboard task-node status, and graph DTO status annotations. | +| Graph edge lifecycle | `ergon_core.core.persistence.graph.status_conventions.EdgeStatus` and constants | `core.persistence.graph.status_conventions` | Use for `RunGraphEdge.status` and edge mutation/status changes. | +| Graph target and mutation names | `GraphTargetType`, `MutationType` in `core/persistence/graph/models.py` | `core.persistence.graph.models` | Keep because these are persisted mutation-log contract names. | +| Graph mutation payload body | `GraphMutationValue` union in `core/runtime/services/graph_dto.py` | `core.runtime.services.graph_dto` | REST and dashboard events import this union; no separate payload definitions. | +| Evaluation criterion status | `EvalCriterionStatus` in `core/persistence/telemetry/evaluation_summary.py` | `core.persistence.telemetry.evaluation_summary` | REST evaluation DTOs import this alias. | +| Cancel cause | `CancelCause` in `core/runtime/events/task_events.py` | `core.runtime.events.task_events` | Services that accept cancel causes import the shared alias or narrower named aliases from the same module. | +| Context event payloads | `ContextEventType`, `ContextEventPayload` in `core/persistence/context/event_payloads.py` | `core.persistence.context.event_payloads` | REST/dashboard context event snapshots should use the canonical type where practical. | +| Generation transcript parts | `core/generation.py` | `core.generation` | Keep separate from context event payloads; add adapter tests for the mapping instead of merging naming schemes. | + +--- + +## DTO Collapse Targets + +The cleanup should collapse duplicate DTOs when two classes carry the same domain payload with only superficial transport differences. Keep separate models only when the shape is genuinely different at the boundary. + +| Current duplication | Collapse target | Keep separate? | Why | +|---|---|---|---| +| `GraphMutationDto`, `RunGraphMutationDto`, `DashboardGraphMutationEvent` repeat mutation identity/body fields | Add canonical `GraphMutationRecordDto` in `core/runtime/services/graph_dto.py`; REST returns it, dashboard event embeds it or is a thin envelope around it | Keep dashboard event envelope only | Mutation body and metadata are one concept; REST/dashboard differ only by transport envelope and timestamp naming. | +| `RunContextEventDto` and `DashboardContextEventEvent` repeat context-event fields, but REST is untyped | Add canonical `ContextEventDto` near `core/persistence/context/event_payloads.py` or `core/runtime/services/context_dto.py`; both REST and dashboard use `ContextEventType` + `ContextEventPayload` | Keep event envelope name only | Same persisted event snapshot should not have typed dashboard payload and untyped REST payload. 
| +| `WorkflowTaskRef` mostly duplicates a subset of `GraphNodeDto` | Prefer `GraphNodeDto` directly where the full node snapshot is acceptable; otherwise create one canonical `GraphTaskRef` in `graph_dto.py` and use it across workflow DTOs | Maybe | CLI/tool responses may intentionally omit fields, but the current separate class adds another status/name surface. | +| `RunTaskDto` and `TaskTreeNode` both represent UI task nodes but one is map-oriented and one is recursive | Extract a shared `TaskNodeSnapshot` payload if frontend compatibility allows; keep `RunSnapshotDto.tasks: dict[str, ...]` and `DashboardWorkflowStartedEvent.task_tree` as containers | Yes, containers differ | Map vs tree is a real transport difference; the task-node payload fields should not drift. | +| `TestGraphNodeDto` and `TestGraphMutationDto` are Playwright-only projections | Leave separate but derive from canonical DTO conversion helpers where possible | Yes | Test harness is intentionally narrow/additive-only, but should not define new domain semantics. | + +Rule: collapse the payload, not necessarily the envelope. For example, `DashboardGraphMutationEvent` can remain an event contract, but it should carry the same canonical mutation record/payload as REST and repository code. + +--- + +## File Structure + +**Modify:** +- `ergon_core/ergon_core/core/persistence/graph/status_conventions.py` — canonical graph status aliases, terminal/settled helpers, and small predicates. +- `ergon_core/ergon_core/core/runtime/execution/propagation.py` — use graph status constants consistently and align failure docs/results with `BLOCKED` behavior. +- `ergon_core/ergon_core/core/runtime/services/task_propagation_service.py` — remove stale cancellation wording and stop exposing unused invalidated targets from normal propagation if tests confirm it is dead. +- `ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py` — remove dead `TaskCancelledEvent` emission from propagation if `invalidated_targets` is removed. +- `ergon_core/ergon_core/core/runtime/services/orchestration_dto.py` — simplify `PropagationResult` around actual ready/block/terminal outcomes. +- `ergon_core/ergon_core/core/runtime/services/task_inspection_dto.py` — use `NodeStatus` directly instead of duplicating or aliasing `SubtaskStatus`. +- `ergon_core/ergon_core/core/persistence/telemetry/evaluation_summary.py` — keep `EvalCriterionStatus` canonical. +- `ergon_core/ergon_core/core/api/schemas.py` — import `EvalCriterionStatus`, remove duplicate mutation/context payload bodies, and keep REST projection thin. +- `ergon_core/ergon_core/core/runtime/services/graph_dto.py` — make `GraphMutationValue` the only typed mutation payload body and make edge mutation IDs consistent with graph DTO ID types. +- `ergon_core/ergon_core/core/dashboard/event_contracts.py` — keep event envelopes but reuse canonical graph mutation/context event DTO payloads. +- `ergon_core/ergon_core/core/runtime/events/task_events.py` — keep `CancelCause` canonical and add subset aliases if services need narrower inputs. +- `ergon_core/ergon_core/core/runtime/services/subtask_cancellation_service.py` — import shared cancel-cause aliases instead of duplicating string literals. +- `ergon_core/ergon_core/core/runtime/services/subtask_blocking_service.py` — share graph skip predicates from `status_conventions.py`. + +**Add or modify tests:** +- `tests/unit/architecture/test_core_schema_sources.py` — architecture guard for duplicate literals and forbidden imports. 
+- `tests/unit/runtime/test_propagation_contracts.py` or existing propagation tests — assert failure propagation blocks downstream nodes and does not emit cancellation targets. +- `tests/unit/runtime/test_graph_mutation_contracts.py` or existing graph repository tests — assert REST/dashboard mutation payloads accept the same `GraphMutationValue` body. +- Existing focused tests: `tests/unit/runtime/test_workflow_service.py`, `tests/unit/runtime/test_dynamic_task_evaluation_mapping.py`, `tests/unit/dashboard/test_event_contract_types.py`, `tests/unit/architecture/test_model_field_descriptions.py`. + +--- + +### Task 1: Guard Canonical Status Ownership + +**Files:** +- Modify: `tests/unit/architecture/test_core_schema_sources.py` +- Modify: `ergon_core/ergon_core/core/persistence/graph/status_conventions.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/task_inspection_dto.py` + +- [ ] **Step 1: Write architecture tests that fail on duplicated graph status literals** + +Create `tests/unit/architecture/test_core_schema_sources.py` with this first test: + +```python +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[3] + + +def test_graph_status_literals_are_defined_only_in_status_conventions() -> None: + offenders: list[str] = [] + duplicate_snippets = ( + 'Literal["pending", "ready", "running", "completed", "failed", "cancelled", "blocked"]', + 'Literal["pending", "ready", "running", "completed", "failed", "blocked", "cancelled"]', + 'Literal["pending", "satisfied", "invalidated"]', + ) + allowed = { + ROOT / "ergon_core/ergon_core/core/persistence/graph/status_conventions.py", + } + + for path in (ROOT / "ergon_core/ergon_core/core").rglob("*.py"): + if path in allowed: + continue + text = path.read_text() + for snippet in duplicate_snippets: + if snippet in text: + offenders.append(f"{path.relative_to(ROOT)} duplicates {snippet}") + + assert offenders == [] +``` + +- [ ] **Step 2: Run the new test and verify it fails** + +Run: `uv run pytest tests/unit/architecture/test_core_schema_sources.py::test_graph_status_literals_are_defined_only_in_status_conventions -v` + +Expected: FAIL because `task_inspection_dto.py` duplicates the node status `Literal`. + +- [ ] **Step 3: Add canonical helpers to `status_conventions.py`** + +Update `ergon_core/ergon_core/core/persistence/graph/status_conventions.py`: + +```python +NodeStatus = Literal["pending", "ready", "running", "completed", "failed", "cancelled", "blocked"] + +NON_AUTONOMOUS_STATUSES = TERMINAL_STATUSES | frozenset({BLOCKED}) + + +def is_terminal_node_status(status: str) -> bool: + return status in TERMINAL_STATUSES + + +def is_blockable_node_status(status: str) -> bool: + return status != RUNNING and status not in TERMINAL_STATUSES +``` + +Keep `EdgeStatus` in the same file. Do not move graph statuses to `shared/enums.py`; graph status intentionally remains string-backed because `RunGraphNode.status` is free-form at the database layer. + +- [ ] **Step 4: Replace `SubtaskStatus` with `NodeStatus` at the field boundary** + +Update `ergon_core/ergon_core/core/runtime/services/task_inspection_dto.py`: + +```python +from ergon_core.core.persistence.graph.status_conventions import NodeStatus +from ergon_core.core.persistence.shared.types import NodeId +from pydantic import BaseModel +``` + +Change the model field from: + +```python +status: SubtaskStatus +``` + +to: + +```python +status: NodeStatus +``` + +Delete the `SubtaskStatus` name entirely. 
If any downstream call site imports `SubtaskStatus`, update that call site to import `NodeStatus` from `status_conventions.py` instead. The goal is one concept name for graph-node lifecycle state. + +- [ ] **Step 5: Run focused tests** + +Run: `uv run pytest tests/unit/architecture/test_core_schema_sources.py tests/unit/state/test_subtask_lifecycle_toolkit.py tests/unit/runtime/test_workflow_service.py -v` + +Expected: PASS. + +- [ ] **Step 6: Commit** + +```bash +git add tests/unit/architecture/test_core_schema_sources.py ergon_core/ergon_core/core/persistence/graph/status_conventions.py ergon_core/ergon_core/core/runtime/services/task_inspection_dto.py +git commit -m "Consolidate graph status conventions" +``` + +--- + +### Task 2: Separate Graph Status From Task Execution Status In Propagation + +**Files:** +- Modify: `tests/unit/runtime/test_propagation_contracts.py` +- Modify: `ergon_core/ergon_core/core/runtime/execution/propagation.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/task_propagation_service.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/task_execution_service.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/workflow_initialization_service.py` + +- [ ] **Step 1: Write tests for graph-node status constants at every graph write boundary** + +Add `tests/unit/runtime/test_propagation_contracts.py`: + +```python +from ergon_core.core.persistence.graph import status_conventions as graph_status +from ergon_core.core.runtime.execution import propagation +from ergon_core.core.runtime.services import task_execution_service, task_propagation_service +from ergon_core.core.runtime.services import workflow_initialization_service + + +def _source(module: object) -> str: + loader = getattr(module, "__loader__") + source = loader.get_source(module.__name__) + assert source is not None + return source + + +def test_graph_writers_do_not_use_task_execution_status_for_node_status() -> None: + modules = [ + propagation, + task_execution_service, + task_propagation_service, + workflow_initialization_service, + ] + forbidden_snippets = ( + "new_status=TaskExecutionStatus.", + "initial_node_status=TaskExecutionStatus.", + ) + + offenders = [ + f"{module.__name__}: {snippet}" + for module in modules + for snippet in forbidden_snippets + if snippet in _source(module) + ] + + assert offenders == [] + assert graph_status.READY == "ready" +``` + +This is an architecture test. It is intentionally string-based because the cleanup goal is import-boundary clarity. + +- [ ] **Step 2: Run the test and verify it fails** + +Run: `uv run pytest tests/unit/runtime/test_propagation_contracts.py::test_graph_writers_do_not_use_task_execution_status_for_node_status -v` + +Expected: FAIL because `propagation.py`, `task_propagation_service.py`, `task_execution_service.py`, and `workflow_initialization_service.py` currently use `TaskExecutionStatus` values while writing graph-node status. + +- [ ] **Step 3: Update propagation imports** + +In `ergon_core/ergon_core/core/runtime/execution/propagation.py`, replace direct status imports with a module alias: + +```python +from ergon_core.core.persistence.graph import status_conventions as graph_status +``` + +Remove `TaskExecutionStatus` from `propagation.py` if it becomes unused. This module operates on `RunGraphNode` / `RunGraphEdge`, so all graph-node writes and graph-node comparisons must use `graph_status.*`. 
+ +- [ ] **Step 4: Update graph node writes** + +Change graph-node status writes: + +```python +new_status=graph_status.PENDING +new_status=graph_status.RUNNING +new_status=graph_status.FAILED +new_status=graph_status.BLOCKED +``` + +Change comparisons: + +```python +is_success = terminal_status == graph_status.COMPLETED +if target_node.status == graph_status.RUNNING: +if target_node.status in graph_status.TERMINAL_STATUSES: +is_pending = status == graph_status.PENDING +is_reactivatable_cancelled = status == graph_status.CANCELLED and is_managed_subtask +if all(n is not None and n.status == graph_status.COMPLETED for n in source_nodes): +``` + +- [ ] **Step 5: Update service calls into propagation** + +In `task_propagation_service.py`, call `on_task_completed_or_failed` with graph status constants: + +```python +from ergon_core.core.persistence.graph import status_conventions as graph_status +``` + +Use: + +```python +new_status=graph_status.COMPLETED +terminal_status=graph_status.COMPLETED +new_status=graph_status.FAILED +terminal_status=graph_status.FAILED +new_status=graph_status.PENDING +``` + +- [ ] **Step 6: Update task execution graph writes without changing execution-row writes** + +In `task_execution_service.py`, keep `TaskExecutionStatus` for `RunTaskExecution.status` assignments: + +```python +execution = RunTaskExecution( + ... + status=TaskExecutionStatus.RUNNING, +) +execution.status = TaskExecutionStatus.COMPLETED +execution.status = TaskExecutionStatus.FAILED +``` + +But change graph-node updates and dashboard node-status emissions to graph status constants: + +```python +from ergon_core.core.persistence.graph import status_conventions as graph_status + +await self._graph_repo.update_node_status( + ..., + new_status=graph_status.RUNNING, + ... +) + +await _emit_task_status( + ..., + new_status=graph_status.RUNNING, + ... +) +``` + +For finalization events that are explicitly reporting task-node lifecycle state, use: + +```python +new_status=graph_status.COMPLETED +old_status=graph_status.RUNNING +new_status=graph_status.FAILED +``` + +The rule is: `TaskExecutionStatus` belongs to `RunTaskExecution.status`; `graph_status` belongs to `RunGraphNode.status` and dashboard task-node status payloads. + +- [ ] **Step 7: Update workflow initialization graph seeding** + +In `workflow_initialization_service.py`, keep `RunStatus.EXECUTING` for `RunRecord.status`, but change graph initialization inputs: + +```python +from ergon_core.core.persistence.graph import status_conventions as graph_status + +graph_repo.initialize_from_definition( + ..., + initial_node_status=graph_status.PENDING, + initial_edge_status=graph_status.EDGE_PENDING, + ... +) +``` + +- [ ] **Step 8: Run focused tests** + +Run: `uv run pytest tests/unit/runtime/test_propagation_contracts.py tests/unit/runtime/test_workflow_service.py tests/unit/runtime/test_dynamic_task_evaluation_mapping.py tests/unit/runtime/test_failure_error_json.py tests/unit/runtime/test_worker_execute_factory_call.py tests/unit/runtime/test_smoke_topology_drift.py -v` + +Expected: PASS. 
+ +- [ ] **Step 9: Commit** + +```bash +git add tests/unit/runtime/test_propagation_contracts.py ergon_core/ergon_core/core/runtime/execution/propagation.py ergon_core/ergon_core/core/runtime/services/task_propagation_service.py ergon_core/ergon_core/core/runtime/services/task_execution_service.py ergon_core/ergon_core/core/runtime/services/workflow_initialization_service.py +git commit -m "Use graph status conventions in propagation" +``` + +--- + +### Task 3: Align Failure Propagation Contract With BLOCKED Behavior + +**Files:** +- Modify: `tests/unit/runtime/test_propagation_contracts.py` +- Modify: `ergon_core/ergon_core/core/runtime/execution/propagation.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/orchestration_dto.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/task_propagation_service.py` +- Modify: `ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py` + +- [ ] **Step 1: Add a contract test for no cancellation targets from propagation** + +Extend `tests/unit/runtime/test_propagation_contracts.py`: + +```python +from ergon_core.core.runtime.services.orchestration_dto import PropagationResult + + +def test_propagation_result_does_not_expose_invalidated_targets() -> None: + assert "invalidated_targets" not in PropagationResult.model_fields +``` + +- [ ] **Step 2: Run the test and verify it fails** + +Run: `uv run pytest tests/unit/runtime/test_propagation_contracts.py::test_propagation_result_does_not_expose_invalidated_targets -v` + +Expected: FAIL because `PropagationResult` currently has `invalidated_targets`. + +- [ ] **Step 3: Simplify `PropagationResult`** + +In `orchestration_dto.py`, remove the field: + +```python +invalidated_targets: list[UUID] = Field(default_factory=list) +``` + +Keep: + +```python +ready_tasks: list[TaskDescriptor] = Field(default_factory=list) +workflow_terminal_state: WorkflowTerminalState = WorkflowTerminalState.NONE +``` + +- [ ] **Step 4: Update `on_task_completed_or_failed` return type and docs** + +In `propagation.py`, change: + +```python +) -> tuple[list[UUID], list[UUID]]: +``` + +to: + +```python +) -> list[UUID]: +``` + +Update the docstring to say: + +```python +"""Handle a node reaching COMPLETED, FAILED, or CANCELLED. + +Returns newly ready node IDs. + +- COMPLETED: outgoing edges become SATISFIED; targets with all dependencies + satisfied transition to PENDING for scheduling. +- FAILED / CANCELLED: outgoing edges become INVALIDATED; reachable successors + transition to BLOCKED unless they are RUNNING or terminal. +""" +``` + +Remove the local `invalidated: list[UUID] = []` and return only `newly_ready`. + +- [ ] **Step 5: Update `TaskPropagationService`** + +Change: + +```python +newly_ready_node_ids, invalidated_node_ids = await on_task_completed_or_failed(...) +``` + +to: + +```python +newly_ready_node_ids = await on_task_completed_or_failed(...) +``` + +Remove `invalidated_targets=invalidated_node_ids` from returned `PropagationResult`. + +For failure propagation, change: + +```python +_ready, invalidated_node_ids = await on_task_completed_or_failed(...) +``` + +to: + +```python +await on_task_completed_or_failed(...) +``` + +Update docstrings to say failure blocks downstream graph nodes, not cancels them. + +- [ ] **Step 6: Remove dead cancellation emission from `propagate_execution.py`** + +Remove the import: + +```python +TaskCancelledEvent, +``` + +Remove the loop: + +```python +for inv_node_id in propagation.invalidated_targets: + events.append(...) 
+``` + +Keep `TaskCancelledEvent` in `task_events.py`; it is still used by manager/operator cancellation flows. + +- [ ] **Step 7: Run focused tests** + +Run: `uv run pytest tests/unit/runtime/test_propagation_contracts.py tests/unit/runtime/test_smoke_topology_drift.py tests/unit/runtime/test_dynamic_task_evaluation_mapping.py tests/unit/runtime/test_failed_task_sandbox_cleanup.py -v` + +Expected: PASS. + +- [ ] **Step 8: Commit** + +```bash +git add tests/unit/runtime/test_propagation_contracts.py ergon_core/ergon_core/core/runtime/execution/propagation.py ergon_core/ergon_core/core/runtime/services/orchestration_dto.py ergon_core/ergon_core/core/runtime/services/task_propagation_service.py ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py +git commit -m "Align propagation contract with blocked successors" +``` + +--- + +### Task 4: Consolidate Evaluation Criterion Status + +**Files:** +- Modify: `tests/unit/architecture/test_core_schema_sources.py` +- Modify: `ergon_core/ergon_core/core/api/schemas.py` +- Confirm: `ergon_core/ergon_core/core/persistence/telemetry/evaluation_summary.py` + +- [ ] **Step 1: Add architecture test for duplicate evaluation status literals** + +Add to `tests/unit/architecture/test_core_schema_sources.py`: + +```python +def test_eval_criterion_status_literal_is_defined_only_in_evaluation_summary() -> None: + offenders: list[str] = [] + snippet = 'EvalCriterionStatus = Literal["passed", "failed", "errored", "skipped"]' + allowed = { + ROOT / "ergon_core/ergon_core/core/persistence/telemetry/evaluation_summary.py", + } + + for path in (ROOT / "ergon_core/ergon_core/core").rglob("*.py"): + if path in allowed: + continue + if snippet in path.read_text(): + offenders.append(str(path.relative_to(ROOT))) + + assert offenders == [] +``` + +- [ ] **Step 2: Run the test and verify it fails** + +Run: `uv run pytest tests/unit/architecture/test_core_schema_sources.py::test_eval_criterion_status_literal_is_defined_only_in_evaluation_summary -v` + +Expected: FAIL because `core/api/schemas.py` currently defines the same alias. + +- [ ] **Step 3: Import canonical alias in REST schemas** + +In `core/api/schemas.py`, replace: + +```python +from typing import Any, Literal +EvalCriterionStatus = Literal["passed", "failed", "errored", "skipped"] +``` + +with: + +```python +from typing import Any +from ergon_core.core.persistence.telemetry.evaluation_summary import EvalCriterionStatus +``` + +- [ ] **Step 4: Run focused tests** + +Run: `uv run pytest tests/unit/architecture/test_core_schema_sources.py tests/unit/runtime/test_evaluation_summary_contracts.py tests/unit/runtime/test_dynamic_task_evaluation_mapping.py -v` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add tests/unit/architecture/test_core_schema_sources.py ergon_core/ergon_core/core/api/schemas.py +git commit -m "Use canonical evaluation criterion status" +``` + +--- + +### Task 5: Collapse Graph Mutation DTOs Onto One Canonical Record + +**Files:** +- Modify: `tests/unit/runtime/test_graph_mutation_contracts.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/graph_dto.py` +- Modify: `ergon_core/ergon_core/core/api/schemas.py` +- Modify: `ergon_core/ergon_core/core/dashboard/event_contracts.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/graph_repository.py` +- Modify: `ergon_core/ergon_core/core/dashboard/emitter.py` + +- [ ] **Step 1: Write mutation contract tests** + +Create `tests/unit/runtime/test_graph_mutation_contracts.py`: + +```python +from uuid import uuid4 + +from ergon_core.core.dashboard.event_contracts import DashboardGraphMutationEvent +from ergon_core.core.runtime.services.graph_dto import ( + EdgeAddedMutation, + GraphMutationRecordDto, + GraphMutationValue, +) +from pydantic import TypeAdapter + + +def test_rest_and_dashboard_mutations_share_graph_mutation_record_payloads() -> None: + run_id = uuid4() + mutation_id = uuid4() + edge_id = uuid4() + source_id = uuid4() + target_id = uuid4() + + payload = EdgeAddedMutation( + source_node_id=source_id, + target_node_id=target_id, + status="pending", + ) + + TypeAdapter(GraphMutationValue).validate_python(payload.model_dump(mode="json")) + + record = GraphMutationRecordDto( + id=mutation_id, + run_id=run_id, + sequence=1, + mutation_type="edge.added", + target_type="edge", + target_id=edge_id, + actor="test", + old_value=None, + new_value=payload, + reason=None, + created_at="2026-04-28T00:00:00Z", + ) + dashboard = DashboardGraphMutationEvent( + mutation=record, + ) + + assert dashboard.mutation == record + assert record.new_value == payload +``` + +- [ ] **Step 2: Run the test and verify it fails** + +Run: `uv run pytest tests/unit/runtime/test_graph_mutation_contracts.py::test_rest_and_dashboard_mutations_share_graph_mutation_record_payloads -v` + +Expected: FAIL because `GraphMutationRecordDto` does not exist yet and `DashboardGraphMutationEvent` currently duplicates mutation fields instead of wrapping one canonical record. + +- [ ] **Step 3: Make edge mutation IDs consistent with graph DTO IDs** + +In `graph_dto.py`, change: + +```python +source_node_id: str +target_node_id: str +``` + +to: + +```python +source_node_id: NodeId +target_node_id: NodeId +``` + +for both `EdgeAddedMutation` and `EdgeRemovedMutation`. + +If JSON serialization needs strings, keep conversion at the API/dashboard serialization boundary with `model_dump(mode="json")`; do not weaken the canonical payload type. 
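+
+A small self-contained check of why this works: Pydantic v2 keeps UUID fields
+as UUIDs in Python-mode dumps and stringifies them in JSON mode, so the
+canonical payload never needs string-typed duplicate fields. The model here is
+illustrative, not an Ergon type:
+
+```python
+# Sketch: UUID-typed canonical payloads; the wire shape is derived at dump time.
+from uuid import UUID, uuid4
+
+from pydantic import BaseModel
+
+
+class ExampleEdgePayload(BaseModel):
+    source_node_id: UUID
+    target_node_id: UUID
+
+
+payload = ExampleEdgePayload(source_node_id=uuid4(), target_node_id=uuid4())
+assert isinstance(payload.model_dump()["source_node_id"], UUID)
+assert isinstance(payload.model_dump(mode="json")["source_node_id"], str)
+```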
+ +- [ ] **Step 4: Add canonical mutation record DTO** + +In `graph_dto.py`, add: + +```python +from datetime import datetime + + +class GraphMutationRecordDto(BaseModel): + """Append-only graph mutation record with a typed mutation payload.""" + + model_config = {"frozen": True} + + id: UUID + run_id: RunId + sequence: int + mutation_type: MutationType + target_type: GraphTargetType + target_id: UUID + actor: str + old_value: GraphMutationValue | None + new_value: GraphMutationValue + reason: str | None + created_at: datetime +``` + +- [ ] **Step 5: Replace REST mutation DTO with canonical record** + +In `core/api/schemas.py`, remove `RunGraphMutationDto` and import: + +```python +from ergon_core.core.runtime.services.graph_dto import GraphMutationRecordDto +``` + +Update `core/api/runs.py` and `run_read_service.py` so `/runs/{run_id}/mutations` returns `list[GraphMutationRecordDto]`. Keep JSON stringification at FastAPI/Pydantic serialization, not in a second REST DTO. + +- [ ] **Step 6: Collapse dashboard event to a thin envelope** + +In `event_contracts.py`, replace duplicated mutation fields with: + +```python +from ergon_core.core.runtime.services.graph_dto import GraphMutationRecordDto + + +class DashboardGraphMutationEvent(InngestEventContract): + name: ClassVar[str] = "dashboard/graph.mutation" + + mutation: GraphMutationRecordDto +``` + +If frontend contract compatibility requires top-level fields for one release, stop and ask before adding a compatibility shim; the requested direction is to reduce duplicate DTOs. + +- [ ] **Step 7: Update repository/emitter conversion code** + +Search for mutation construction: + +```bash +rg "EdgeAddedMutation|EdgeRemovedMutation|GraphMutationValue|DashboardGraphMutationEvent|RunGraphMutationDto|GraphMutationRecordDto" ergon_core/ergon_core/core tests -n +``` + +Update `_to_mutation_dto` / mutation read paths to produce `GraphMutationRecordDto`. Update `dashboard/emitter.py` to construct `DashboardGraphMutationEvent(mutation=record)` instead of copying fields. Update call sites to pass UUID/`NodeId` values into `EdgeAddedMutation` / `EdgeRemovedMutation`. Use `model_dump(mode="json")` only when writing JSON columns or sending wire payloads. + +- [ ] **Step 8: Run focused mutation/dashboard tests** + +Run: `uv run pytest tests/unit/runtime/test_graph_mutation_contracts.py tests/unit/dashboard/test_event_contract_types.py tests/unit/architecture/test_model_field_descriptions.py -v` + +Expected: PASS. 
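+
+For orientation while executing Step 7, the intended emitter-side result is a
+wrap, not a field copy. A sketch, with `row` and the surrounding emission call
+left abstract because their real signatures live in the touched modules:
+
+```python
+# Sketch: the repository read path yields the canonical record; the dashboard
+# event is only an envelope around it.
+record = _to_mutation_dto(row)  # now returns GraphMutationRecordDto
+event = DashboardGraphMutationEvent(mutation=record)
+```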
+ +- [ ] **Step 9: Commit** + +```bash +git add tests/unit/runtime/test_graph_mutation_contracts.py ergon_core/ergon_core/core/runtime/services/graph_dto.py ergon_core/ergon_core/core/api/schemas.py ergon_core/ergon_core/core/dashboard/event_contracts.py ergon_core/ergon_core/core/runtime/services/graph_repository.py ergon_core/ergon_core/core/dashboard/emitter.py +git commit -m "Unify graph mutation payload contracts" +``` + +--- + +### Task 6: Collapse Task Node Projections Where Shapes Are Accidental + +**Files:** +- Modify: `tests/unit/architecture/test_core_schema_sources.py` +- Modify: `ergon_core/ergon_core/core/api/schemas.py` +- Modify: `ergon_core/ergon_core/core/api/runs.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/graph_dto.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/workflow_dto.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/workflow_service.py` +- Modify: `ergon_core/ergon_core/core/dashboard/event_contracts.py` +- Modify: `ergon_core/ergon_core/core/runtime/inngest/start_workflow.py` + +- [ ] **Step 1: Add tests for task-node DTO collapse** + +Add to `tests/unit/architecture/test_core_schema_sources.py`: + +```python +def test_run_task_dto_does_not_label_worker_slug_as_name() -> None: + path = ROOT / "ergon_core/ergon_core/core/api/schemas.py" + text = path.read_text() + assert "assigned_worker_name" not in text + assert "assigned_worker_slug" in text + + +def test_workflow_task_ref_does_not_duplicate_graph_task_ref() -> None: + path = ROOT / "ergon_core/ergon_core/core/runtime/services/workflow_dto.py" + assert "class WorkflowTaskRef" not in path.read_text() +``` + +- [ ] **Step 2: Run the test and verify it fails** + +Run: `uv run pytest tests/unit/architecture/test_core_schema_sources.py::test_run_task_dto_does_not_label_worker_slug_as_name tests/unit/architecture/test_core_schema_sources.py::test_workflow_task_ref_does_not_duplicate_graph_task_ref -v` + +Expected: FAIL because `RunTaskDto` currently has `assigned_worker_name` and `workflow_dto.py` currently defines `WorkflowTaskRef`. + +- [ ] **Step 3: Rename REST task field to match its actual value** + +In `core/api/schemas.py`, change: + +```python +assigned_worker_name: str | None = None +``` + +to: + +```python +assigned_worker_slug: str | None = None +``` + +In `core/api/runs.py`, change the `_build_task_map` assignment from `assigned_worker_name=...` to `assigned_worker_slug=...`. + +- [ ] **Step 4: Introduce one canonical lightweight graph task ref** + +In `graph_dto.py`, add: + +```python +class GraphTaskRef(BaseModel): + """Lightweight task-node reference for workflow/tool projections.""" + + model_config = {"frozen": True} + + node_id: NodeId + task_slug: str + status: NodeStatus + level: int + parent_node_id: NodeId | None = None + assigned_worker_slug: str | None = None +``` + +Import `NodeStatus` from `status_conventions.py`. + +- [ ] **Step 5: Replace `WorkflowTaskRef` with `GraphTaskRef`** + +In `workflow_dto.py`, remove `WorkflowTaskRef` and import: + +```python +from ergon_core.core.runtime.services.graph_dto import GraphTaskRef +``` + +Update fields: + +```python +source: GraphTaskRef +target: GraphTaskRef +task: GraphTaskRef +task: GraphTaskRef | None = None +``` + +In `workflow_service.py`, update `_task_ref` to return `GraphTaskRef`. 
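+
+A sketch of the updated `_task_ref`, assuming `RunGraphNode` exposes these
+columns under these attribute names (verify against the actual row model
+before copying):
+
+```python
+# Sketch: a single conversion point from the persisted node row to the
+# canonical lightweight ref.
+def _task_ref(node: RunGraphNode) -> GraphTaskRef:
+    return GraphTaskRef(
+        node_id=node.id,
+        task_slug=node.task_slug,
+        status=node.status,
+        level=node.level,
+        parent_node_id=node.parent_node_id,
+        assigned_worker_slug=node.assigned_worker_slug,
+    )
+```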
+ +- [ ] **Step 6: Keep map-vs-tree containers, but share task-node semantics** + +Add or update comments near `RunTaskDto`: + +```python +class RunTaskDto(CamelModel): + """REST projection of RunGraphNode for run detail pages. + + This is not the canonical graph schema; graph semantics live in + runtime/services/graph_dto.py and persistence/graph/status_conventions.py. + """ +``` + +Keep `RunSnapshotDto.tasks: dict[str, RunTaskDto]` and `DashboardWorkflowStartedEvent.task_tree: TaskTreeNode` because map and tree containers are genuinely different. But align their field names and statuses with `GraphTaskRef`: `assigned_worker_slug` means slug, `status` is `NodeStatus`, and dependency/child fields are container-specific additions rather than new task-node semantics. + +- [ ] **Step 7: Run focused API/dashboard/workflow tests** + +Run: `uv run pytest tests/unit/architecture/test_core_schema_sources.py tests/unit/cli/test_workflow_cli.py tests/unit/dashboard/test_event_contract_types.py tests/unit/state/test_workflow_cli_tool.py -v` + +Expected: PASS. If frontend TypeScript expects `assignedWorkerName`, update that in a separate frontend-compatible task rather than sneaking it into this backend cleanup. + +- [ ] **Step 8: Commit** + +```bash +git add tests/unit/architecture/test_core_schema_sources.py ergon_core/ergon_core/core/api/schemas.py ergon_core/ergon_core/core/api/runs.py ergon_core/ergon_core/core/runtime/services/graph_dto.py ergon_core/ergon_core/core/runtime/services/workflow_dto.py ergon_core/ergon_core/core/runtime/services/workflow_service.py ergon_core/ergon_core/core/dashboard/event_contracts.py ergon_core/ergon_core/core/runtime/inngest/start_workflow.py +git commit -m "Collapse duplicate task node projections" +``` + +--- + +### Task 7: Reuse CancelCause Instead Of Local Literal Subsets + +**Files:** +- Modify: `tests/unit/architecture/test_core_schema_sources.py` +- Modify: `ergon_core/ergon_core/core/runtime/events/task_events.py` +- Modify: `ergon_core/ergon_core/core/runtime/services/subtask_cancellation_service.py` +- Modify: any caller that accepts the same literal subset. + +- [ ] **Step 1: Add architecture test for local cancel-cause literals** + +Add to `tests/unit/architecture/test_core_schema_sources.py`: + +```python +def test_cancel_cause_literals_live_in_task_events() -> None: + offenders: list[str] = [] + snippets = ( + 'Literal["parent_terminal", "dep_invalidated"]', + 'Literal["dep_invalidated", "parent_terminal"]', + ) + allowed = { + ROOT / "ergon_core/ergon_core/core/runtime/events/task_events.py", + } + + for path in (ROOT / "ergon_core/ergon_core/core").rglob("*.py"): + if path in allowed: + continue + text = path.read_text() + for snippet in snippets: + if snippet in text: + offenders.append(f"{path.relative_to(ROOT)} duplicates cancel cause subset") + + assert offenders == [] +``` + +- [ ] **Step 2: Run the test and verify it fails** + +Run: `uv run pytest tests/unit/architecture/test_core_schema_sources.py::test_cancel_cause_literals_live_in_task_events -v` + +Expected: FAIL if `subtask_cancellation_service.py` still defines a local subset literal. + +- [ ] **Step 3: Add named subset aliases in `task_events.py`** + +In `task_events.py`, below `CancelCause`, add: + +```python +PropagationCancelCause = Literal["parent_terminal", "dep_invalidated"] +``` + +This keeps narrower service typing but centralizes the strings. 
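+
+The subset alias stays compatible with the wider alias at type-check time: a
+`Literal` with fewer members is assignable wherever the full union is
+accepted. A self-contained sketch (the real `CancelCause` members may differ):
+
+```python
+# Sketch: narrow cancel-cause aliases remain assignable to the shared alias.
+from typing import Literal
+
+CancelCause = Literal["operator_requested", "parent_terminal", "dep_invalidated"]
+PropagationCancelCause = Literal["parent_terminal", "dep_invalidated"]
+
+
+def record_cancellation(cause: CancelCause) -> None: ...
+
+
+def cancel_subtask(cause: PropagationCancelCause) -> None:
+    record_cancellation(cause)  # accepted: every member is a CancelCause
+```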
+ +- [ ] **Step 4: Import the subset alias in services** + +In `subtask_cancellation_service.py`, replace the local `Literal[...]` import/annotation with: + +```python +from ergon_core.core.runtime.events.task_events import PropagationCancelCause +``` + +Use: + +```python +cause: PropagationCancelCause +``` + +- [ ] **Step 5: Run focused cancellation tests** + +Run: `uv run pytest tests/unit/runtime/test_failed_task_sandbox_cleanup.py tests/unit/runtime/test_dynamic_task_evaluation_mapping.py tests/unit/state/test_subtask_lifecycle_toolkit.py tests/unit/architecture/test_core_schema_sources.py -v` + +Expected: PASS. + +- [ ] **Step 6: Commit** + +```bash +git add tests/unit/architecture/test_core_schema_sources.py ergon_core/ergon_core/core/runtime/events/task_events.py ergon_core/ergon_core/core/runtime/services/subtask_cancellation_service.py +git commit -m "Centralize task cancellation causes" +``` + +--- + +### Task 8: Collapse Context Event Snapshot DTOs Onto Typed Payloads + +**Files:** +- Modify: `tests/unit/runtime/test_context_event_contracts.py` +- Modify: `ergon_core/ergon_core/core/api/schemas.py` +- Modify: `ergon_core/ergon_core/core/api/runs.py` +- Modify: `ergon_core/ergon_core/core/dashboard/event_contracts.py` +- Modify: `ergon_core/ergon_core/core/dashboard/emitter.py` + +- [ ] **Step 1: Write a context event DTO sharing test** + +Create `tests/unit/runtime/test_context_event_contracts.py`: + +```python +from uuid import uuid4 + +from ergon_core.core.api.schemas import RunContextEventDto +from ergon_core.core.dashboard.event_contracts import DashboardContextEventEvent +from ergon_core.core.persistence.context.event_payloads import AssistantTextPayload + + +def test_rest_and_dashboard_context_events_share_typed_payload_shape() -> None: + payload = AssistantTextPayload(text="hello") + common = { + "id": uuid4(), + "run_id": uuid4(), + "task_execution_id": uuid4(), + "task_node_id": uuid4(), + "worker_binding_key": "worker", + "sequence": 1, + "event_type": "assistant_text", + "payload": payload, + "created_at": "2026-04-28T00:00:00Z", + "started_at": None, + "completed_at": None, + } + + rest = RunContextEventDto.model_validate(common) + dashboard = DashboardContextEventEvent.model_validate(common) + + assert rest.payload == dashboard.payload + assert rest.event_type == dashboard.event_type +``` + +- [ ] **Step 2: Run the test and verify it fails** + +Run: `uv run pytest tests/unit/runtime/test_context_event_contracts.py::test_rest_and_dashboard_context_events_share_typed_payload_shape -v` + +Expected: FAIL because `RunContextEventDto` currently uses `event_type: str` and `payload: dict[str, Any]`, while dashboard uses `ContextEventType` and `ContextEventPayload`. + +- [ ] **Step 3: Type REST context event DTO with canonical event payloads** + +In `core/api/schemas.py`, import: + +```python +from ergon_core.core.persistence.context.event_payloads import ( + ContextEventPayload, + ContextEventType, +) +``` + +Update: + +```python +event_type: ContextEventType +payload: ContextEventPayload +``` + +- [ ] **Step 4: Update REST context event construction** + +In `core/api/runs.py`, when building `RunContextEventDto`, validate payload with the canonical discriminated payload type. If rows already store dict payloads, use the same validation path as dashboard emitter uses rather than passing raw dicts through REST. 
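+
+One way to share the dashboard validation path is a module-level `TypeAdapter`
+over the canonical discriminated union. A sketch; check whether the emitter
+already exposes an equivalent helper before adding a second one:
+
+```python
+# Sketch: validate persisted dict payloads into the canonical typed union
+# before they cross the REST boundary.
+from typing import Any
+
+from pydantic import TypeAdapter
+
+from ergon_core.core.persistence.context.event_payloads import ContextEventPayload
+
+_PAYLOAD_ADAPTER = TypeAdapter(ContextEventPayload)
+
+
+def _typed_payload(raw: dict[str, Any]) -> ContextEventPayload:
+    return _PAYLOAD_ADAPTER.validate_python(raw)
+```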
+ +- [ ] **Step 5: Decide whether to fully collapse class names** + +If `RunContextEventDto` and `DashboardContextEventEvent` now have the same fields except event `name`, move the common fields into a shared model: + +```python +class ContextEventDto(CamelModel or BaseModel): + ... +``` + +Use that model directly in REST and embed it in the dashboard event envelope. If camelCase REST output makes a shared class awkward, keep the two envelope classes but require both to use `ContextEventType` and `ContextEventPayload`. + +- [ ] **Step 6: Run focused tests** + +Run: `uv run pytest tests/unit/runtime/test_context_event_contracts.py tests/unit/dashboard/test_event_contract_types.py tests/unit/architecture/test_model_field_descriptions.py -v` + +Expected: PASS. + +- [ ] **Step 7: Commit** + +```bash +git add tests/unit/runtime/test_context_event_contracts.py ergon_core/ergon_core/core/api/schemas.py ergon_core/ergon_core/core/api/runs.py ergon_core/ergon_core/core/dashboard/event_contracts.py ergon_core/ergon_core/core/dashboard/emitter.py +git commit -m "Share typed context event payload schemas" +``` + +--- + +### Task 9: Add Mapping Guard Between Generation Parts And Context Events + +**Files:** +- Modify: `tests/unit/builtins/common/test_transcript_adapters.py` +- Modify: `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py` only if the test reveals unmapped kinds. + +- [ ] **Step 1: Add explicit adapter coverage for vocabulary mapping** + +In `tests/unit/builtins/common/test_transcript_adapters.py`, add a test that documents the intended split between `core.generation` kebab-case `part_kind` and context event snake-case `event_type`: + +```python +from ergon_core.core.generation import TextPart, ThinkingPart, ToolCallPart, ToolReturnPart +from ergon_core.core.persistence.context.event_payloads import ContextEventType + + +def test_generation_part_kinds_have_context_event_counterparts() -> None: + assert TextPart(content="x").part_kind == "text" + assert ThinkingPart(content="x").part_kind == "thinking" + assert ToolCallPart(tool_name="t", tool_call_id="1", args={}).part_kind == "tool-call" + assert ToolReturnPart(tool_call_id="1", tool_name="t", content="ok").part_kind == "tool-return" + + assert "assistant_text" in ContextEventType.__args__ + assert "thinking" in ContextEventType.__args__ + assert "tool_call" in ContextEventType.__args__ + assert "tool_result" in ContextEventType.__args__ +``` + +- [ ] **Step 2: Run the test** + +Run: `uv run pytest tests/unit/builtins/common/test_transcript_adapters.py::test_generation_part_kinds_have_context_event_counterparts -v` + +Expected: PASS if the current split is intentional and covered; FAIL if any expected context event value has drifted. + +- [ ] **Step 3: Fix adapter mapping only if the test fails** + +If the test fails because context event values changed, update `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py` to map the actual canonical context event types. Do not merge generation parts and context events into one model family. + +- [ ] **Step 4: Run focused adapter tests** + +Run: `uv run pytest tests/unit/builtins/common/test_transcript_adapters.py tests/unit/persistence/test_context_event_repository.py -v` + +Expected: PASS. 
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add tests/unit/builtins/common/test_transcript_adapters.py ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py
+git commit -m "Guard generation to context event mapping"
+```
+
+---
+
+### Task 10: Final Architecture Sweep
+
+**Files:**
+- Modify: `tests/unit/architecture/test_core_schema_sources.py`
+- Modify: `docs/superpowers/plans/2026-04-28-core-schema-deduplication.md` only if implementation reveals a necessary correction.
+
+- [ ] **Step 1: Add a broad forbidden-duplication guard**
+
+Add to `tests/unit/architecture/test_core_schema_sources.py`:
+
+```python
+def test_core_schema_source_imports_are_directional() -> None:
+    forbidden_pairs = {
+        "ergon_core.core.api.schemas": (
+            "EvalCriterionStatus = Literal",
+            "GraphMutationValue =",
+        ),
+        "ergon_core.core.dashboard.event_contracts": (
+            "GraphMutationValue =",
+            "CancelCause = Literal",
+        ),
+    }
+
+    offenders: list[str] = []
+    for module_path, snippets in forbidden_pairs.items():
+        # The package is nested one level down: ergon_core/ergon_core/...
+        path = ROOT / "ergon_core" / (module_path.replace(".", "/") + ".py")
+        text = path.read_text()
+        for snippet in snippets:
+            if snippet in text:
+                offenders.append(f"{path.relative_to(ROOT)} contains local source {snippet!r}")
+
+    assert offenders == []
+```
+
+- [ ] **Step 2: Run the full architecture test set**
+
+Run: `uv run pytest tests/unit/architecture -v`
+
+Expected: PASS.
+
+- [ ] **Step 3: Run focused runtime/schema tests**
+
+Run:
+
+```bash
+uv run pytest \
+  tests/unit/runtime/test_workflow_service.py \
+  tests/unit/runtime/test_dynamic_task_evaluation_mapping.py \
+  tests/unit/runtime/test_evaluation_summary_contracts.py \
+  tests/unit/dashboard/test_event_contract_types.py \
+  tests/unit/builtins/common/test_transcript_adapters.py \
+  tests/unit/architecture/test_model_field_descriptions.py \
+  -v
+```
+
+Expected: PASS.
+
+- [ ] **Step 4: Search for remaining duplicate literals**
+
+Run:
+
+```bash
+rg 'Literal\["pending", "ready", "running", "completed", "failed", "cancelled", "blocked"\]|EvalCriterionStatus = Literal|invalidated_targets|assigned_worker_name|Literal\["parent_terminal", "dep_invalidated"\]' ergon_core tests
+```
+
+Expected: the only files with matches are:
+
+```text
+ergon_core/ergon_core/core/persistence/graph/status_conventions.py
+ergon_core/ergon_core/core/persistence/telemetry/evaluation_summary.py
+tests/unit/architecture/test_core_schema_sources.py
+```
+
+If other production files appear, either import the canonical alias or explain in a code comment why the duplicate-looking concept is distinct.
+
+- [ ] **Step 5: Run lints for touched files**
+
+Use Cursor lints for:
+
+```text
+ergon_core/ergon_core/core/persistence/graph/status_conventions.py
+ergon_core/ergon_core/core/runtime/execution/propagation.py
+ergon_core/ergon_core/core/runtime/services
+ergon_core/ergon_core/core/api/schemas.py
+ergon_core/ergon_core/core/dashboard/event_contracts.py
+tests/unit/architecture/test_core_schema_sources.py
+```
+
+Expected: no new diagnostics in touched files.
+
+- [ ] **Step 6: Commit final guard changes**
+
+```bash
+git add tests/unit/architecture/test_core_schema_sources.py
+git commit -m "Guard core schema source ownership"
+```
+
+---
+
+## Execution Notes
+
+- Do not collapse legitimate transport envelopes into one giant schema. Do collapse duplicated payload bodies: `WorkflowTaskRef` should disappear in favor of `GraphTaskRef`; REST/dashboard task containers can remain map/tree envelopes only if their field semantics align with the canonical graph task ref.
+- Do remove duplicate domain definitions. If two modules need the same literal values, one imports from the source-of-truth module. +- Keep table models free-form where the database intentionally allows extension, but make runtime conventions explicit through aliases and constants. +- Keep REST/dashboard serialization at the boundary. Canonical Python DTOs can use UUID/NewType fields; wire models can stringify with `model_dump(mode="json")`. +- Avoid compatibility facades. If a module owns a concept, import it directly from that module. + +## Self-Review + +- Spec coverage: high-priority graph status duplication, evaluation status duplication, stale propagation contract, graph mutation DTO collapse, task-node DTO collapse, context-event DTO typing, cancel-cause duplication, and generation/context event vocabulary mapping are each covered by a task. +- Placeholder scan: no task contains unresolved placeholder markers or an unspecified "add tests" instruction; every task names files and commands. +- Type consistency: graph status aliases live in `status_conventions.py`, evaluation status in `evaluation_summary.py`, mutation payload body in `graph_dto.py`, and cancel-cause aliases in `task_events.py` throughout the plan. diff --git a/docs/superpowers/plans/2026-04-28-ergon-builtins-rebuild-structure.md b/docs/superpowers/plans/2026-04-28-ergon-builtins-rebuild-structure.md new file mode 100644 index 00000000..167b0b20 --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-ergon-builtins-rebuild-structure.md @@ -0,0 +1,709 @@ +# Ergon Built-ins Rebuild Structure + +This document lays out the target shape for `ergon_builtins` after the Ergon core public API cleanup. It assumes the core authoring API from `2026-04-28-public-api-target-structure.md`: + +- `Benchmark`, `Task`, `BenchmarkRequirements` +- `Worker`, `WorkerContext`, `WorkerOutput` +- `Criterion`, `CriterionContext`, `CriterionOutcome`, `ScoreScale` +- `Rubric`, `TaskEvaluationResult` +- advanced `Evaluator` only when a fixed `Rubric` is not expressive enough + +The key design rule is that built-ins should be normal public API consumers. The CLI and runtime should discover built-ins through typed registries and service facades, not by importing benchmark internals or rebuilding object graphs by hand. + +## Goals + +- Keep benchmark authoring code small, public-API-first, and easy to copy for external benchmark authors. +- Keep sandbox, dataset loading, and optional dependency code inside benchmark-owned packages. +- Keep the registry as the stable integration boundary for CLI discovery, experiment definition, run launch, and Inngest execution. +- Keep benchmark slugs separate from runtime choices: the CLI must pass worker, evaluator, sandbox, model, and extras/dependency intent explicitly for now. +- Avoid compatibility aliases for renamed public concepts during the coordinated rebuild. + +## Runtime Integration Model + +```mermaid +flowchart TD + accTitle: Builtins Runtime Flow + accDescr: Built-in benchmark, worker, and evaluator slugs flow from the registry through CLI services, persisted definitions, run records, and Inngest execution. + + registry["ergon_builtins.registry
slugs and factories"] + cli["CLI commands
define, run, list"] + facades["core runtime services
experiment, cohort, run"] + experiment["ExperimentRecord
selected samples and explicit choices"] + definition["Workflow definition
task graph and type slugs"] + run["RunRecord
instance key, worker team, evaluator slug"] + inngest["Inngest runtime
worker and evaluator execution"] + + registry --> cli + cli --> facades + facades --> experiment + facades --> definition + facades --> run + definition --> inngest + run --> inngest + registry --> inngest +``` + +The CLI path should be slug-driven: + +1. Validate the explicit `benchmark_slug`, `worker_slug`, `evaluator_slug`, and `sandbox_slug` against `ergon_builtins.registry`. +2. Ask a core service facade to define or launch the experiment. +3. Persist only durable identifiers and slugs in `ExperimentRecord`, workflow definitions, and `RunRecord`. +4. Rehydrate live workers, criteria, rubrics, and sandbox managers from registries at runtime. + +## Proposed Package Tree + +```text +ergon_builtins/ + ergon_builtins/ + __init__.py + + registry.py + # merged public discovery surface + # imports registry_core and optional registries + + registry_core.py + # always-importable built-ins with no [data] dependency + # exports BENCHMARKS, WORKERS, EVALUATORS, SANDBOX_MANAGERS, + # SANDBOX_TEMPLATES, MODEL_BACKENDS + + registry_data.py + # HuggingFace/pandas/datasets-dependent built-ins + # same export names as registry_core + + registry_local_models.py + # optional local model backends + + shared/ + __init__.py + criteria/ + code_check.py + file_check.py + llm_judge.py + sandbox_file_check.py + workers/ + react_worker.py + training_stub_worker.py + react_prompts.py + models/ + cloud_passthrough.py + openrouter_backend.py + openrouter_responses_backend.py + resolution.py + vllm_backend.py + tools/ + # reusable public worker tools only + observability/ + # event/transcript adapters used by shared workers + + benchmarks/ + minif2f/ + __init__.py + benchmark.py + task_schemas.py + worker_factory.py + prompts.py + toolkit.py + criteria.py + rubric.py + sandbox_manager.py + sandbox/ + + swebench_verified/ + __init__.py + benchmark.py + task_schemas.py + worker_factory.py + prompts.py + toolkit.py + criterion.py + rubric.py + sandbox_manager.py + sandbox_manager_support.py + sandbox/ + + gdpeval/ + __init__.py + benchmark.py + task_schemas.py + loader.py + worker_factory.py + criteria.py + rubric.py + sandbox.py + + researchrubrics/ + __init__.py + benchmark.py + vanilla.py + task_schemas.py + worker_factory.py + researcher_worker.py + workflow_cli_react_worker.py + criteria.py + judge_criterion.py + rubric.py + sandbox_manager.py +``` + +### Package Boundary Rules + +- Benchmark packages own their task payload schemas, dataset loaders, sandbox/toolkit wiring, benchmark-specific criteria, and default rubric. +- `shared/` contains reusable primitives that do not know about one benchmark's payload schema. +- Registered worker factories live next to the benchmark when they bind benchmark-specific tools or sandbox setup. +- Generic worker classes live in `shared/workers/`; benchmark packages wrap them with factories. +- Optional data dependencies stay in `registry_data.py` and data-only benchmark packages. Importing `registry_core.py` must not require `datasets`, pandas, `swebench`, or HuggingFace extras. +- CLI code should import only `ergon_builtins.registry` and core service facades. 
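+
+The `registry.py` merge can stay mechanical. A sketch of the intended shape,
+assuming each optional registry module exports the same dict names as
+`registry_core` (only two of the export names are shown):
+
+```python
+# Sketch: registry.py merges always-importable built-ins with optional ones
+# without making the [data] extra mandatory at import time.
+from ergon_builtins import registry_core
+
+BENCHMARKS = dict(registry_core.BENCHMARKS)
+WORKERS = dict(registry_core.WORKERS)
+
+try:
+    from ergon_builtins import registry_data
+except ImportError:
+    pass  # [data] extra not installed; data-dependent built-ins stay hidden
+else:
+    BENCHMARKS.update(registry_data.BENCHMARKS)
+    WORKERS.update(registry_data.WORKERS)
+```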
+
+## Registry Contract
+
+The registry should continue to expose dictionaries keyed by stable slugs:
+
+```python
+BENCHMARKS: dict[str, type[Benchmark]]
+WORKERS: dict[str, WorkerFactory]
+EVALUATORS: dict[str, type[Evaluator]]
+SANDBOX_MANAGERS: dict[str, type[BaseSandboxManager]]
+SANDBOX_TEMPLATES: dict[str, Path]
+MODEL_BACKENDS: dict[str, Callable[..., ResolvedModel]]
+```
+
+`WorkerFactory` should remain a callable shape that the runtime can use after sandbox setup:
+
+```python
+WorkerFactory = Callable[..., Worker]
+```
+
+Every registered worker factory must accept:
+
+```text
+name: str
+model: str | None
+task_id: UUID
+sandbox_id: str
+```
+
+The registry should not provide benchmark-level default profiles in this phase. Explicit beats implicit while the package structure is still moving: callers must specify the worker, evaluator, sandbox, model, and dependency extras they intend to use.
+
+This gives the CLI enough information to validate explicit requests for:
+
+- `ergon benchmark list`
+- `ergon worker list`
+- `ergon evaluator list`
+- `ergon experiment define <benchmark_slug>`
+- `ergon experiment run <experiment_id>`
+- `ergon benchmark run <benchmark_slug>`
+- onboarding/setup messages for explicitly requested extras, E2B, HuggingFace, or API keys
+
+## Public API Usage Rules
+
+Built-ins should use root imports for ordinary authoring:
+
+```python
+from ergon_core.api import Benchmark, BenchmarkRequirements, Task
+from ergon_core.api import Worker, WorkerContext, WorkerOutput
+from ergon_core.api import Criterion, CriterionContext, CriterionOutcome
+from ergon_core.api import Rubric, TaskEvaluationResult
+```
+
+Use advanced imports only where the benchmark needs dynamic criteria:
+
+```python
+from ergon_core.api.rubric import Evaluator
+```
+
+Core composition types stay out of benchmark authoring files:
+
+- no `Experiment` imports in benchmark packages
+- no `WorkerSpec` imports in benchmark packages
+- no run/cohort/definition handles in benchmark packages
+- no direct DB/session imports in workers, criteria, or rubrics
+
+## Benchmark Implementation Pattern
+
+Each benchmark package should follow the same high-level shape:
+
+```text
+benchmark.py
+  Benchmark subclass
+  type_slug
+  task_payload_model
+  onboarding_deps / BenchmarkRequirements
+  build_instances() -> Mapping[str, Sequence[Task[Payload]]]
+  evaluator_requirements()
+
+task_schemas.py
+  Pydantic payload models
+  dataset row conversion helpers when lightweight
+
+worker_factory.py
+  factories that bind shared workers to benchmark-specific tools/sandboxes
+
+criteria.py / criterion.py
+  benchmark-specific Criterion implementations and builders
+
+rubric.py
+  Rubric or Evaluator subclass registered under a stable evaluator slug
+
+sandbox_manager.py / sandbox.py
+  benchmark-specific sandbox lifecycle and setup
+```
+
+`Task` construction should consistently set:
+
+- `task_slug`: stable dataset sample identifier
+- `instance_key`: selected instance key used by experiment/run services
+- `description`: worker-facing problem statement
+- `evaluator_binding_keys`: usually `("default",)` unless the benchmark has multiple evaluator bindings
+- `task_payload`: typed payload model containing all evaluator-only ground truth
+
+## MiniF2F
+
+### Folder
+
+```text
+benchmarks/minif2f/
+  benchmark.py
+  task_schemas.py
+  worker_factory.py
+  prompts.py
+  toolkit.py
+  criteria.py
+  rubric.py
+  sandbox_manager.py
+  sandbox/
+```
+
+### Benchmark
+
+`MiniF2FBenchmark` should remain a public `Benchmark` implementation:
+
+- `type_slug = "minif2f"`
+- 
`task_payload_model = MiniF2FTaskPayload` +- `onboarding_deps = BenchmarkRequirements(e2b=True)` +- `build_instances()` downloads or reads MiniF2F-v2c and returns one `Task` per theorem. +- `description` should include the informal statement, Lean header, and formal theorem. + +The payload should carry: + +- `name` +- `informal_statement` +- `formal_statement` +- `header` + +Ground truth proof, if available later, belongs in the payload or metadata for evaluation only, not in the worker prompt. + +### Worker + +The recommended first worker pairing is `minif2f-react`, implemented as a benchmark-owned factory around the shared ReAct worker: + +- resolve the live sandbox by `task_id` +- build `MiniF2FToolkit` +- bind Lean tools such as write file, check file, and verify proof +- pass a MiniF2F-specific system prompt +- return a `WorkerOutput` whose final answer includes the proof file path or proof text + +The factory belongs in `benchmarks/minif2f/worker_factory.py` because it knows about Lean, the sandbox manager, and the MiniF2F toolkit. + +### Criteria And Rubric + +`ProofVerificationCriterion` should use `CriterionContext` public capabilities rather than importing a concrete runtime protocol from public files. + +`MiniF2FRubric` should be a fixed `Rubric` with one proof-verification criterion: + +- score `1.0` when Lean verifies the final proof +- score partial credit for syntactically valid but incomplete proof attempts +- score `0.0` for missing or invalid proof artifacts +- return `TaskEvaluationResult` with normalized score and proof metadata + +### Required CLI Pairing + +```text +benchmark_slug: minif2f +worker_slug: minif2f-react +evaluator_slug: minif2f-rubric +sandbox_slug: minif2f +extras: none +model: explicit CLI value, e.g. openai:gpt-4o +``` + +## SWE-Bench Verified + +### Folder + +```text +benchmarks/swebench_verified/ + benchmark.py + task_schemas.py + worker_factory.py + prompts.py + toolkit.py + criterion.py + rubric.py + sandbox_manager.py + sandbox_manager_support.py + sandbox/ +``` + +### Benchmark + +`SweBenchVerifiedBenchmark` should remain the benchmark loader for `princeton-nlp/SWE-bench_Verified`: + +- `type_slug = "swebench-verified"` +- `task_payload_model = SWEBenchTaskPayload` +- `onboarding_deps = BenchmarkRequirements(e2b=True, extras=("ergon-builtins[data]",))` +- `build_instances()` returns one `Task` per SWE-Bench instance. +- the worker-facing `description` should include issue context and repo instructions, not the gold test patch. + +The payload should carry all evaluator-only data: + +- `instance_id` +- repo and base commit identifiers +- problem statement +- test patch +- FAIL_TO_PASS / PASS_TO_PASS metadata needed by the harness + +### Worker + +The recommended first worker pairing is `swebench-react`, implemented as a benchmark-owned factory around the shared ReAct worker: + +- resolve the live sandbox by `task_id` +- build `SWEBenchToolkit` +- expose shell/file/git tools scoped to `/workspace/repo` +- pass a SWE-Bench-specific system prompt +- return patch-oriented output or rely on sandbox diff extraction during evaluation + +The worker should not run the official evaluator. Its job is to modify the repo in the sandbox. 
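+
+A minimal sketch of the factory described above, assuming the shared ReAct worker and toolkit module names from the package tree; the constructor arguments are illustrative assumptions, not a frozen contract:
+
+```python
+# benchmarks/swebench_verified/worker_factory.py (sketch)
+from uuid import UUID
+
+from ergon_core.api import Worker
+
+from ergon_builtins.benchmarks.swebench_verified.toolkit import SWEBenchToolkit
+from ergon_builtins.shared.workers.react_worker import ReActWorker
+
+SYSTEM_PROMPT = "Fix the reported issue by editing the repository under /workspace/repo."
+
+
+def swebench_react(name: str, model: str | None, task_id: UUID, sandbox_id: str) -> Worker:
+    # Sandbox setup has already happened; task_id and sandbox_id identify the
+    # live sandbox, and the toolkit scopes shell/file/git tools to the repo.
+    toolkit = SWEBenchToolkit(task_id=task_id, sandbox_id=sandbox_id, workdir="/workspace/repo")
+    return ReActWorker(
+        name=name,
+        model=model,
+        system_prompt=SYSTEM_PROMPT,
+        tools=toolkit.tools(),
+    )
+```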
+ +### Criteria And Rubric + +`SWEBenchTestCriterion` should remain the atomic evaluation unit: + +- extract the agent patch from the sandbox through `CriterionContext` capabilities +- apply the gold test patch +- apply the agent patch +- run the official eval script +- parse the SWE-Bench harness report +- return `CriterionOutcome` with score `1.0` only when the instance is resolved + +`SWEBenchRubric` should live in `benchmarks/swebench_verified/rubric.py`, not in a detached global rubrics folder, because it is benchmark-specific and wraps `SWEBenchTestCriterion`. + +### Required CLI Pairing + +```text +benchmark_slug: swebench-verified +worker_slug: swebench-react +evaluator_slug: swebench-rubric +sandbox_slug: swebench-verified +extras: ergon-builtins[data] +model: explicit CLI value, e.g. openai:gpt-4o +``` + +## GDPEval + +### Folder + +```text +benchmarks/gdpeval/ + benchmark.py + task_schemas.py + loader.py + worker_factory.py + criteria.py + rubric.py + sandbox.py +``` + +### Benchmark + +`GDPEvalBenchmark` should stay in the `[data]` registry: + +- `type_slug = "gdpeval"` +- `task_payload_model = GDPTaskConfig` +- `onboarding_deps = BenchmarkRequirements(e2b=True, extras=("ergon-builtins[data]",))` +- `build_instances()` loads task IDs and reference files from HuggingFace. +- each `Task.description` should be the document-processing instruction extracted from the dataset. + +The payload should carry: + +- `task_id` +- `workflow_type` +- `reference_files` +- any expected output manifest or rubric category references needed by evaluation + +### Worker + +GDPEval should have an explicit recommended worker pairing instead of depending on a generic ReAct slug that has no benchmark tools. The worker can be implemented in either of two ways: + +- `gdpeval-react`: benchmark-owned factory around shared ReAct, with document/file tools and sandbox workspace instructions. +- `gdpeval-workflow-cli-react`: if GDP tasks are meant to exercise the workflow CLI and produce office artifacts through the sandbox. + +The recommended first target is `gdpeval-react` because it keeps the benchmark in the same authoring pattern as MiniF2F and SWE-Bench. + +### Criteria And Rubric + +`StagedRubric` is an advanced evaluator-like rubric because it supports sequential gates and stage-specific failure actions. It should be registered under one stable slug: + +```text +gdpeval-staged-rubric +``` + +If the CLI keeps the shorter compatibility slug during the rebuild, it should be temporary and removed in the coordinated built-ins rename. + +GDPEval criteria should be generated from explicit stage definitions: + +- format/file existence gates +- reference-file consistency checks +- LLM judge criteria for qualitative document quality +- optional code or spreadsheet checks for generated artifacts + +Each criterion should emit structured evidence for auditability: + +- files checked +- sandbox command IDs +- judge prompt messages +- parsed outputs +- failure reason + +### Required CLI Pairing + +```text +benchmark_slug: gdpeval +worker_slug: gdpeval-react +evaluator_slug: gdpeval-staged-rubric +sandbox_slug: gdpeval +extras: ergon-builtins[data] +model: explicit CLI value, e.g. 
openai:gpt-4o +``` + +## ResearchRubrics + +### Folder + +```text +benchmarks/researchrubrics/ + benchmark.py + vanilla.py + task_schemas.py + worker_factory.py + researcher_worker.py + workflow_cli_react_worker.py + criteria.py + judge_criterion.py + rubric.py + sandbox_manager.py +``` + +### Benchmark + +`ResearchRubricsBenchmark` and `ResearchRubricsVanillaBenchmark` should remain `[data]` benchmarks: + +- `type_slug = "researchrubrics"` and `type_slug = "researchrubrics-vanilla"` +- `task_payload_model = ResearchRubricsTaskPayload` +- `onboarding_deps = BenchmarkRequirements(extras=("ergon-builtins[data]",), optional_keys=("EXA_API_KEY",))` +- `build_instances()` returns one `Task` per dataset sample. +- `description` should be the research prompt. + +The payload should carry: + +- `sample_id` +- `domain` +- `prompt` +- list of weighted rubric criteria + +### Workers + +ResearchRubrics should keep two registered worker choices because they exercise different research-agent paths: + +```text +researchrubrics-researcher +researchrubrics-workflow-cli-react +``` + +`researchrubrics-researcher` should be the recommended first worker pairing: + +- accepts the research prompt +- uses model-backed research behavior +- writes final report artifacts through `WorkerContext` or public resource capabilities +- returns `WorkerOutput` with report summary and final artifact references + +`researchrubrics-workflow-cli-react` should remain an advanced/experimental worker: + +- uses the workflow CLI path inside the sandbox +- is useful for testing tool orchestration and dashboard traces +- should not be the default unless the CLI explicitly requests it + +### Criteria And Rubric + +`ResearchRubricsRubric` should remain an advanced dynamic evaluator or a `Rubric` that overrides `criteria_for(task)`, because its criteria come from each task payload. + +The task-specific path should: + +1. read `ResearchRubricsTaskPayload.rubrics` +2. build one `ResearchRubricsJudgeCriterion` per rubric criterion +3. evaluate the final report against each weighted criterion +4. aggregate positive and negative weights into normalized `TaskEvaluationResult` + +Judge criteria should use `CriterionEvidence` to preserve: + +- judge prompt +- report excerpt or artifact reference +- rubric criterion text +- axis and weight +- model output + +### Required CLI Pairings + +```text +benchmark_slug: researchrubrics +worker_slug: researchrubrics-researcher +evaluator_slug: researchrubrics-rubric +sandbox_slug: researchrubrics +extras: ergon-builtins[data] +model: explicit CLI value, e.g. openai:gpt-4o +``` + +```text +benchmark_slug: researchrubrics-vanilla +worker_slug: researchrubrics-researcher +evaluator_slug: researchrubrics-rubric +sandbox_slug: researchrubrics-vanilla +extras: ergon-builtins[data] +model: explicit CLI value, e.g. openai:gpt-4o +``` + +## CLI Requirements + +The CLI should not know benchmark internals. It should consume registry metadata and call core service facades. + +### Discovery + +`ergon benchmark list` should display: + +- slug +- description +- available registered workers +- available registered evaluators +- sandbox requirement +- data extra requirement + +`ergon worker list` and `ergon evaluator list` should continue to read `WORKERS` and `EVALUATORS`. + +### Experiment Define + +`ergon experiment define ` should: + +1. require explicit `--worker`, `--evaluator`, `--sandbox`, `--model`, and `--extras` or equivalent request fields +2. validate those explicit slugs against the registries +3. 
instantiate the benchmark by slug
+4. call `build_instances()`
+5. select samples by `--limit`, `--sample`, or future selection flags
+6. persist an `ExperimentRecord` with benchmark slug, selected instance keys, explicit worker team JSON, evaluator slug, sandbox slug, model target, extras/dependency intent, and cohort metadata
+
+It should not instantiate workers or criteria at define time.
+
+### Experiment Run
+
+`ergon experiment run <experiment_id>` should:
+
+1. read the persisted experiment
+2. create one run assignment per selected task or instance
+3. build a single-sample workflow definition through core composition
+4. persist the workflow definition with benchmark, worker, and evaluator slugs
+5. create `RunRecord` rows linked to experiment/cohort/definition
+6. emit workflow start events
+
+Workers, criteria, and sandbox managers are instantiated by runtime services from slugs after run creation.
+
+### Benchmark Run
+
+`ergon benchmark run <benchmark_slug>` should become a convenience wrapper around define plus run. It should not keep its own separate composition path long term.
+
+The rebuild should remove drift between:
+
+- `ergon_cli.commands.benchmark.run_benchmark`
+- `ergon_cli.composition.build_experiment`
+- `ExperimentDefinitionService`
+- `ExperimentLaunchService`
+
+The preferred end state is:
+
+```text
+benchmark run
+  -> experiment facade define
+  -> experiment facade run
+  -> run facade status/output
+```
+
+## Migration Order
+
+### Phase 1: Explicit Registry Contract
+
+- Keep registries explicit: no benchmark profiles or default pairing layer in this phase.
+- Ensure `BENCHMARKS`, `WORKERS`, `EVALUATORS`, `SANDBOX_MANAGERS`, and `SANDBOX_TEMPLATES` are complete and typed.
+- Update CLI list commands to display registered components without implying defaults.
+- Add tests that every documented CLI pairing references registered benchmark, worker, evaluator, and sandbox slugs.
+
+### Phase 2: Public API Imports
+
+- Replace old built-ins imports:
+  - `BenchmarkTask` -> `Task`
+  - `BenchmarkDeps` -> `BenchmarkRequirements`
+  - `EvaluationContext` -> `CriterionContext`
+  - `CriterionResult` -> `CriterionOutcome`
+  - `CriterionScoreSpec` -> `ScoreScale`
+  - `CriterionObservation` -> `CriterionEvidence`
+  - `CriterionObservationMessage` -> `EvidenceMessage`
+- Move SWE-Bench rubric beside the SWE-Bench benchmark.
+- Move generic evaluator helpers under `shared/criteria` only if they are truly benchmark-independent.
+
+### Phase 3: Benchmark-Owned Worker Factories
+
+- Move `_minif2f_react` into `benchmarks/minif2f/worker_factory.py`.
+- Move `_swebench_react` into `benchmarks/swebench_verified/worker_factory.py`.
+- Add `gdpeval-react` factory.
+- Keep ResearchRubrics workers in the benchmark package or re-export them from benchmark-owned `worker_factory.py`.
+- Keep generic `ReActWorker` in `shared/workers/react_worker.py`.
+
+### Phase 4: CLI Facade Alignment
+
+- Make `benchmark run` call the same core service facade path as `experiment define` plus `experiment run`.
+- Remove direct CLI composition of `Experiment` objects.
+- Ensure `create_run` call sites use the current `RunRecord` contract: experiment ID, workflow definition ID, instance key, worker team JSON, evaluator slug, and model target.
+
+### Phase 5: Runtime And Evaluation Contracts
+
+- Update Inngest worker execution to construct `Task` from the registered benchmark payload model.
+- Update evaluation execution to use `CriterionContext` public capability methods.
+- Ensure sandbox setup happens before benchmark-owned worker factories are invoked. +- Ensure criteria never import persistence sessions or concrete runtime protocols through public API modules. + +## Testing Plan + +Core contract tests: + +- every `BENCHMARKS` key has a matching `Benchmark.type_slug` +- every documented required CLI pairing has registered benchmark, worker, evaluator, and sandbox slugs +- every benchmark exposes `task_payload_model` and `BenchmarkRequirements` +- every benchmark's `build_instances(limit=1)` returns at least one `Task` with a valid payload when optional dependencies are available + +Benchmark-specific tests: + +- MiniF2F proof criterion handles verified, syntactically valid incomplete, and invalid proof outputs. +- SWE-Bench criterion handles empty patch, patch extraction failure, git apply failure, unresolved report, and resolved report. +- GDPEval staged rubric handles required gate failure, continue, zero-category, and normalized score bounds. +- ResearchRubrics dynamic criteria build one judge criterion per payload rubric and aggregate negative weights correctly. + +CLI/service tests: + +- `benchmark list` shows registered benchmarks without default worker/evaluator metadata. +- `experiment define` stores slugs and selected sample keys, not live worker/evaluator objects. +- `experiment run` creates one workflow definition and run per selected sample. +- `benchmark run` uses the same facade path as define plus run. +- run records persist worker team JSON, evaluator slug, model target, instance key, experiment ID, and workflow definition ID. + +## Open Decisions + +1. Whether `Evaluator` stays root-public or is imported only from `ergon_core.api.rubric`. +2. Whether `gdpeval-react` should be the recommended GDP worker or GDP should use the workflow CLI worker. +3. Whether `researchrubrics-rubric` is the only final slug, removing `research-rubric`. +4. Whether `benchmark run` should remain as a public CLI command after it becomes a wrapper around experiment services. diff --git a/docs/superpowers/plans/2026-04-28-ergon-cli-refactor-structure.md b/docs/superpowers/plans/2026-04-28-ergon-cli-refactor-structure.md new file mode 100644 index 00000000..566b72f6 --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-ergon-cli-refactor-structure.md @@ -0,0 +1,772 @@ +# Ergon CLI Refactor Structure + +This document specifies the target CLI structure after the Ergon core public API and `ergon_builtins` package refactors. It is a sibling to: + +- `2026-04-28-public-api-target-structure.md` +- `2026-04-28-ergon-builtins-rebuild-structure.md` + +The CLI should become the operator-facing shell over core service facades. It should not assemble low-level graph objects by hand, import benchmark internals, or maintain a second experiment launch path that can drift from the API and runtime. + +## Goals + +- Make `ergon experiment define` and `ergon experiment run` the canonical local lifecycle commands. +- Make API routes, CLI commands, and eval automation call the same core services with the same DTOs. +- Make `benchmark run`, if kept, a thin wrapper over define plus run. +- Use `ergon_builtins.registry` for discovery and validation, but require explicit worker/evaluator/sandbox/model/extras choices in benchmark requests for now. +- Remove stale direct composition paths from the CLI. +- Keep operational commands such as `benchmark setup`, `workflow`, `run list`, `run cancel`, `doctor`, `onboard`, and `train` clearly separated from experiment definition and launch. 
+- Ensure CLI output remains machine-readable enough for tests, shell scripts, and eval automation. + +## Current Shape + +```text +ergon_cli/ + ergon_cli/ + main.py + # top-level argparse parser and dispatch + + commands/ + benchmark.py + # list, setup, stale run path + experiment.py + # define, run, show, list + run.py + # list, cancel + worker.py + # list + evaluator.py + # list + workflow.py + # sandbox/workflow helper commands + eval.py + # checkpoint eval watcher + train.py + # local RL training + onboard.py + doctor.py + + composition/ + __init__.py + # stale direct Experiment composition helper + + discovery/ + __init__.py + # list BENCHMARKS/WORKERS/EVALUATORS + + rendering/ + __init__.py +``` + +The current parser registers: + +- `benchmark list` +- `benchmark setup` +- `experiment define` +- `experiment run` +- `experiment show` +- `experiment list` +- `run list` +- `run cancel` +- `worker list` +- `evaluator list` +- `workflow ...` +- `eval watch` +- `eval checkpoint` +- `onboard` +- `doctor` +- `train local` + +There is handler code for `benchmark run`, but `main.py` does not register a `benchmark run` subparser. This is intentional in at least one current unit test, but conflicts with dead handler code, old setup messages, and real-LLM tests that still invoke `ergon benchmark run`. + +## Target Command Model + +```mermaid +flowchart TD + accTitle: CLI Command Ownership + accDescr: The CLI command tree routes experiment lifecycle commands through core service facades, while setup, workflow, training, and diagnostics remain separate operational surfaces. + + cli["ergon CLI"] + discovery["discovery commands
benchmark/worker/evaluator list"] + setup["benchmark setup
sandbox template build"] + experiment["experiment lifecycle
define/run/show/list"] + run["run operations
list/cancel/show later"] + workflow["workflow helper
inside sandbox/task context"] + eval["eval watcher
checkpoint scoring"] + train["train local
RL training"] + doctor["doctor/onboard"] + + cli --> discovery + cli --> setup + cli --> experiment + cli --> run + cli --> workflow + cli --> eval + cli --> train + cli --> doctor + + experiment --> services["core runtime service facades"] + eval --> experiment + discovery --> registry["ergon_builtins.registry"] + setup --> sandbox_templates["SANDBOX_TEMPLATES"] +``` + +### Canonical Lifecycle Commands + +These commands define the supported local experiment lifecycle: + +```text +ergon experiment define [selection] --worker ... --evaluator ... --sandbox ... --model ... --extras ... +ergon experiment run [runtime options] +ergon experiment show +ergon experiment list +``` + +The HTTP API should remain parallel to this command set: + +```text +POST /api/experiments/define +POST /api/experiments/{id}/run +GET /api/experiments/{id} +GET /api/experiments +``` + +The CLI and HTTP API should use the same service layer: + +- `ExperimentDefinitionService.define_benchmark_experiment` +- `ExperimentLaunchService.run_experiment` +- `ExperimentReadService.get_experiment` +- `ExperimentReadService.list_experiments` +- `ExperimentCohortService.resolve_or_create` +- run read/cancel services + +### Wrapper Commands + +`ergon benchmark run` has two acceptable end states: + +1. Preferred: reintroduce it as a convenience wrapper over `experiment define` plus `experiment run`. +2. Strict: delete the handler and update all docs/tests to use `ergon experiment define` plus `ergon experiment run`. + +The preferred end state is to keep it as a wrapper because it is useful for demos and real-LLM canaries: + +```text +ergon benchmark run minif2f --limit 1 + +equivalent to: + ergon experiment define minif2f --limit 1 --worker minif2f-react --model openai:gpt-4o --evaluator minif2f-rubric --sandbox minif2f --extras none + ergon experiment run +``` + +The wrapper must not call `ergon_cli.composition.build_experiment` or create `RunRecord` rows itself. + +### Operational Commands + +These commands should stay outside the experiment lifecycle: + +- `ergon benchmark setup `: build/register E2B sandbox templates. +- `ergon workflow ...`: task-local workflow/resource helper used inside workers and sandboxes. +- `ergon run list`: operator telemetry over recent runs. +- `ergon run cancel `: cancellation and cleanup request. +- `ergon eval watch` and `ergon eval checkpoint`: checkpoint evaluation automation. +- `ergon train local`: local training integration. +- `ergon doctor` and `ergon onboard`: environment setup and diagnostics. + +## Target Package Tree + +```text +ergon_cli/ + ergon_cli/ + main.py + # argparse only; no business logic + + commands/ + benchmark.py + # list, setup, wrapper run only + experiment.py + # define, run, show, list through facade helpers + run.py + # list, cancel through run services + worker.py + evaluator.py + workflow.py + eval.py + train.py + onboard.py + doctor.py + + services/ + experiment_cli_facade.py + # CLI-specific orchestration over core service DTOs + # parse args -> requests -> logging/rendering + benchmark_cli_facade.py + # benchmark list/setup/wrapper helpers + run_cli_facade.py + # list/cancel/show helpers + + discovery/ + __init__.py + # registry reads only + + rendering/ + __init__.py + # tables, key=value output, errors + + parsing/ + __init__.py + # optional shared parser helper functions if main.py grows too large +``` + +`ergon_cli.composition` should be removed once `benchmark run` and smoke-only composition paths are replaced by service facade calls or test harness APIs. 
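+
+With that split in place, command modules reduce to argument plumbing. A sketch of the intended handler shape, assuming a `define_experiment` helper in the proposed `experiment_cli_facade` module; the helper and result field names are illustrative:
+
+```python
+# commands/experiment.py (sketch): the handler parses nothing itself and
+# renders only the stable key=value contract.
+import argparse
+
+from ergon_cli.services.experiment_cli_facade import define_experiment
+
+
+def handle_experiment_define(args: argparse.Namespace) -> int:
+    # The facade maps argparse values onto an ExperimentDefineRequest and
+    # calls the core definition service.
+    result = define_experiment(args)
+    print(f"EXPERIMENT_ID={result.experiment_id}")
+    print(f"BENCHMARK={result.benchmark_slug}")
+    print(f"SAMPLES={result.sample_count}")
+    return 0
+```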
+ +## Service Boundary + +The CLI may import: + +```python +from ergon_builtins.registry import ( + BENCHMARKS, + WORKERS, + EVALUATORS, + SANDBOX_MANAGERS, + SANDBOX_TEMPLATES, +) + +from ergon_core.core.runtime.services.experiment_definition_service import ExperimentDefinitionService +from ergon_core.core.runtime.services.experiment_launch_service import ExperimentLaunchService +from ergon_core.core.runtime.services.experiment_read_service import ExperimentReadService +from ergon_core.core.runtime.services.experiment_schemas import ExperimentDefineRequest, ExperimentRunRequest +from ergon_core.core.runtime.services.cohort_service import experiment_cohort_service +from ergon_core.core.runtime.services.run_service import cancel_run +``` + +The CLI should not import: + +- `ergon_core.core.composition.Experiment` except inside a temporary migration shim. +- `ergon_core.core.composition.WorkerSpec` except inside core services. +- benchmark package internals such as `ergon_builtins.benchmarks.minif2f.*`. +- concrete criterion classes. +- persistence model classes for command logic, except through a temporary run-list shim. +- Inngest event classes for experiment launch, except through core services. + +## Discovery Commands + +### `ergon benchmark list` + +Use `BENCHMARKS` plus the related worker/evaluator/sandbox registries for validation. Do not show or infer benchmark defaults in this phase. + +Target columns: + +```text +Slug +Name +Description +Requires Data Extra +Known Sandboxes +``` + +Rules: + +- Include all registered benchmark slugs. +- Do not display default workers or evaluators. +- Show dependency hints only when they come from `BenchmarkRequirements` or explicit registry metadata. +- A contract test should fail if CLI code starts deriving hidden worker/evaluator/sandbox defaults. + +### `ergon worker list` + +Use `WORKERS`. + +Target columns: + +```text +Slug +Name +Kind +Description +``` + +`Kind` can initially be inferred: + +- `class` +- `factory` + +Long term, worker metadata can move into an explicit descriptor object if the registry grows. + +### `ergon evaluator list` + +Use `EVALUATORS`. + +Target columns: + +```text +Slug +Name +Kind +Description +``` + +`Kind` can be: + +- `rubric` +- `evaluator` + +If `Evaluator` remains advanced public API, list it as an advanced evaluator, not a beginner rubric. + +## Experiment Define + +### Command + +```text +ergon experiment define + (--limit N | --sample-id SAMPLE_ID ...) + [--name NAME] + [--cohort COHORT_NAME] + --worker WORKER_SLUG + --model MODEL_TARGET + --evaluator EVALUATOR_SLUG + --sandbox SANDBOX_SLUG + --extras EXTRAS_SPEC + [--workflow single] + [--max-questions N] +``` + +The CLI should keep these choices compulsory while the package structure is stabilizing. A benchmark slug alone is not enough information to define an experiment. + +### Data Flow + +```mermaid +sequenceDiagram + accTitle: Experiment Define Flow + accDescr: The CLI validates explicit registry slugs, builds a request DTO, and delegates experiment definition to core services. 
+ + participant User + participant CLI + participant Registry + participant Cohorts + participant DefinitionService + participant DB + + User->>CLI: ergon experiment define minif2f --limit 1 + CLI->>Registry: validate explicit benchmark/worker/evaluator/sandbox slugs + CLI->>Cohorts: resolve_or_create when --cohort is set + CLI->>DefinitionService: define_benchmark_experiment(request) + DefinitionService->>Registry: instantiate benchmark by slug + DefinitionService->>DefinitionService: build_instances and select samples + DefinitionService->>DB: persist ExperimentRecord + DefinitionService-->>CLI: ExperimentDefineResult + CLI-->>User: key=value identifiers +``` + +### Request Mapping + +```python +ExperimentDefineRequest( + benchmark_slug=args.benchmark_slug, + name=args.name, + cohort_id=cohort_id, + limit=args.limit, + sample_ids=args.sample_id or None, + default_model_target=args.model, + default_worker_team={"primary": args.worker}, + default_evaluator_slug=args.evaluator, + metadata={ + "workflow": args.workflow, + "max_questions": args.max_questions, + "sandbox_slug": args.sandbox, + "extras": args.extras, + "cli_command": "experiment define", + }, +) +``` + +### Output Contract + +The command should print stable key/value lines: + +```text +EXPERIMENT_ID= +COHORT_ID= # only when known +BENCHMARK= +SAMPLES= +DEFAULT_WORKER= +DEFAULT_EVALUATOR= +DEFAULT_MODEL= +``` + +Tests and automation should parse these lines rather than human prose. + +## Experiment Run + +### Command + +```text +ergon experiment run + [--timeout SECONDS] + [--no-wait] +``` + +### Required Core Behavior + +`ExperimentLaunchService.run_experiment` should own: + +1. read `ExperimentRecord` +2. create one `RunAssignment` per selected sample +3. construct a single-sample benchmark wrapper +4. instantiate evaluator binding from `EVALUATORS` +5. call `Experiment.from_single_worker(...)` +6. persist workflow definition through `ExperimentPersistenceService` +7. create `RunRecord` with: + - `experiment_id` + - `workflow_definition_id` + - `instance_key` + - `worker_team_json` + - `evaluator_slug` + - `model_target` + - optional assignment/seed metadata +8. emit `WorkflowStartedEvent` + +The CLI should not implement any of those steps directly. + +### Wait Semantics + +The current schema includes `timeout_seconds` and `wait`, but the launch service does not fully use them. The target semantics: + +- `wait=True`: return after all created runs reach terminal status or timeout. +- `wait=False`: return immediately after workflow start events are emitted. +- `timeout_seconds`: maximum wait time when `wait=True`. +- Timeout should not cancel the run by default; it should return a non-zero CLI code only for the waiting command. + +The result DTO should carry enough status for output: + +```text +EXPERIMENT_ID= +RUN_ID= +RUN_STATUS= # when wait=True and known +``` + +If multiple runs are launched, print one `RUN_ID=` and `RUN_STATUS=` pair per run, or a tabular block after the stable key/value lines. + +## Experiment Show/List + +`experiment show` should read `ExperimentReadService.get_experiment`. + +Output should include: + +```text +EXPERIMENT_ID= +COHORT_ID= +NAME= +BENCHMARK= +STATUS= +SAMPLE_COUNT= +RUN_COUNT= +DEFAULT_WORKER= +DEFAULT_EVALUATOR= +DEFAULT_MODEL= +SAMPLE_SELECTION= +``` + +If runs exist, print: + +```text +RUNS +\t\t\t +``` + +`experiment list` should remain a summary table. It should not instantiate benchmarks or workers. 
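+
+The consumer side of the key/value contracts above is deliberately trivial. A sketch of a parsing helper that tests and automation might share; the helper itself is an assumption, not an existing module:
+
+```python
+# Parse stable uppercase KEY=VALUE lines, ignoring prose and tables.
+def parse_stable_output(stdout: str) -> dict[str, list[str]]:
+    values: dict[str, list[str]] = {}
+    for line in stdout.splitlines():
+        key, sep, value = line.partition("=")
+        key = key.strip()
+        if sep and key and key == key.upper() and " " not in key:
+            # Repeated keys accumulate, e.g. one RUN_ID per launched run.
+            values.setdefault(key, []).append(value.strip())
+    return values
+
+
+assert parse_stable_output("EXPERIMENT_ID=abc\nRUN_ID=r1\nRUN_ID=r2")["RUN_ID"] == ["r1", "r2"]
+```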
+
+## Benchmark Setup
+
+`ergon benchmark setup <benchmark_slug>` remains separate from experiment lifecycle.
+
+It should:
+
+1. read `SANDBOX_TEMPLATES`
+2. validate `E2B_API_KEY`
+3. load the benchmark template spec
+4. build the E2B template
+5. write `~/.ergon/sandbox_templates.json` or `ERGON_CONFIG_DIR/sandbox_templates.json`
+6. print a follow-up command using the canonical lifecycle
+
+The success message should not suggest stale `benchmark run` syntax unless `benchmark run` is formally kept.
+
+Preferred success message:
+
+```text
+Success! Template ID: <template_id>
+Next:
+  ergon experiment define <benchmark_slug> --limit 1
+  ergon experiment run <experiment_id>
+```
+
+If `benchmark run` is kept:
+
+```text
+Or:
+  ergon benchmark run <benchmark_slug> --limit 1
+```
+
+## Benchmark Run Wrapper
+
+If kept, `benchmark run` should be registered in `main.py` and call a wrapper function that does exactly:
+
+1. require the same explicit worker/evaluator/sandbox/model/extras arguments as `experiment define`
+2. validate those explicit choices against registries
+3. call the same define facade as `experiment define`
+4. call the same run facade as `experiment run`
+5. print the same stable key/value output
+
+Target command:
+
+```text
+ergon benchmark run <benchmark_slug>
+  [--limit N | --sample-id SAMPLE_ID ...]
+  [--name NAME]
+  [--cohort COHORT_NAME]
+  --worker WORKER_SLUG
+  --model MODEL_TARGET
+  --evaluator EVALUATOR_SLUG
+  --sandbox SANDBOX_SLUG
+  --extras EXTRAS_SPEC
+  [--workflow single]
+  [--timeout SECONDS]
+  [--no-wait]
+```
+
+The handler should not call:
+
+- `build_experiment`
+- `Experiment.persist`
+- `create_run` directly
+- `inngest_client.send` directly
+
+## Run Commands
+
+### `ergon run list`
+
+The current CLI queries `RunRecord` directly. Target state:
+
+- add a read method in core, either in `RunReadService` or a small `RunListService`
+- support `--limit`
+- support `--status`
+- optionally support `--experiment-id` and `--cohort-id` later
+
+Output columns:
+
+```text
+RUN_ID
+STATUS
+EXPERIMENT_ID
+WORKFLOW_DEFINITION_ID
+INSTANCE_KEY
+MODEL
+CREATED_AT
+UPDATED_AT
+```
+
+### `ergon run cancel <run_id>`
+
+Keep routed through `run_service.cancel_run`.
+
+Target behavior:
+
+- return `0` if cancellation request is accepted
+- return non-zero if run is missing or already terminal and cannot be cancelled
+- print stable key/value output:
+
+```text
+RUN_ID=<run_id>
+STATUS=cancelled
+```
+
+## Workflow Command
+
+`ergon workflow` is an internal worker/sandbox helper surface, not an operator experiment lifecycle surface.
+
+It may continue to call `WorkflowService` directly because it is already scoped by:
+
+- `--run-id`
+- `--node-id`
+- `--execution-id`
+- `--sandbox-task-key`
+- `--benchmark-type`
+
+Refactor rules:
+
+- keep it isolated from benchmark definition and launch code
+- do not make it import benchmark package internals
+- keep `--benchmark-type` as a slug used by sandbox materialization
+- add tests that workflow parser changes do not affect experiment parser behavior
+
+## Eval Commands
+
+`ergon eval watch` and `ergon eval checkpoint` should use the canonical experiment lifecycle for local evaluation.
+ +Current target: + +```text +eval checkpoint + -> evaluate_checkpoint + -> local eval path + -> ergon experiment define + -> ergon experiment run + -> read run/evaluation results +``` + +Required cleanup: + +- make `--eval-limit` required for local eval if `_run_local_eval` requires it, or provide a safe default +- ensure subprocess calls use `experiment define/run`, not `benchmark run` +- ensure output parsing relies on stable `EXPERIMENT_ID=` and `RUN_ID=` lines + +## Train Command + +`ergon train local` belongs to training infrastructure and should remain separate from CLI experiment lifecycle. + +It may accept: + +- `--benchmark` +- `--evaluator` +- `--definition-id` +- model/training parameters + +The refactor should not change training semantics unless import paths break. + +## Doctor And Onboard + +`doctor` and `onboard` should use explicit CLI request fields plus benchmark requirements to report missing dependencies. + +Examples: + +- benchmark requires `[data]` +- benchmark requires E2B +- benchmark recommends `EXA_API_KEY` +- benchmark requires sandbox template setup +- model backend requires environment keys + +They should not instantiate benchmark datasets just to list requirements. + +## Migration Plan + +### Phase 1: Parser And Command Contract + +- Decide final `benchmark run` behavior. +- If keeping it, register the parser and implement it as a wrapper. +- If removing it, delete handler code and update tests/docs/real-LLM canaries. +- Update `benchmark setup` success messaging. +- Add parser tests for all command surfaces. + +### Phase 2: Explicit Registry Validation + +- Update `discovery.list_benchmarks()` to display registered benchmarks without implying default pairings. +- Keep `--worker`, `--model`, `--evaluator`, `--sandbox`, and `--extras` required for `experiment define` and `benchmark run`. +- Add validation errors for missing or unknown explicit choices: + - unknown benchmark slug + - unknown worker slug + - unknown evaluator slug + - unknown sandbox slug + - missing model target + - missing extras/dependency intent + +### Phase 3: CLI Facade Extraction + +- Create `ergon_cli/services/experiment_cli_facade.py`. +- Move argument-to-DTO mapping out of command handlers. +- Keep `commands/experiment.py` thin. +- Add `benchmark_cli_facade.py` for list/setup/wrapper run. +- Add `run_cli_facade.py` for list/cancel once run read service exists. + +### Phase 4: Delete Direct Composition Path + +- Remove `ergon_cli.composition.build_experiment` from production CLI flows. +- Move any smoke-only composition behavior into core test harness or test support. +- Ensure no production CLI command imports `Experiment`, `WorkerSpec`, or Inngest events for launch. + +### Phase 5: Wait/Poll Semantics + +- Implement service-level `wait` and `timeout_seconds`, or remove those fields from CLI/schema. +- Prefer implementing them because e2e and demos need blocking behavior. +- Add tests for: + - `--no-wait` returns after dispatch + - timeout returns non-zero without canceling runs + - completed runs return status lines + +### Phase 6: Run Read Service + +- Add a service method for listing recent runs. +- Route `run list` through it. +- Keep `run cancel` through `cancel_run`. +- Add tests for status filtering. 
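+
+Phases 1 and 2 hinge on parse-time failures staying observable. A sketch of one such test, assuming `main.py` exposes a `build_parser()` helper so tests can exercise parsing without dispatch (the helper name is an assumption):
+
+```python
+import pytest
+
+from ergon_cli.main import build_parser
+
+
+def test_experiment_define_requires_explicit_choices() -> None:
+    parser = build_parser()
+    with pytest.raises(SystemExit):
+        # Missing --worker/--model/--evaluator/--sandbox/--extras must fail
+        # at parse time, before any registry or service call.
+        parser.parse_args(["experiment", "define", "minif2f", "--limit", "1"])
+```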
+ +## Test Plan + +### Unit Tests + +Parser tests: + +- `benchmark list` parses +- `benchmark setup ` parses +- `benchmark run ` parses if kept, fails if removed +- `experiment define` parses only with explicit worker/model/evaluator/sandbox/extras +- `experiment run --no-wait` parses +- `run list --status failed` parses +- `eval checkpoint --eval-limit 1` parses + +Facade tests: + +- define facade builds `ExperimentDefineRequest` with explicit CLI choices +- define facade rejects missing explicit worker/evaluator/sandbox/model/extras +- define facade resolves cohort only when `--cohort` is provided +- run facade builds `ExperimentRunRequest` +- benchmark wrapper calls define then run facades +- benchmark wrapper does not import or call direct composition helpers + +Discovery tests: + +- benchmark list does not imply default worker/evaluator pairings +- worker list includes factory entries +- evaluator list includes rubric/evaluator entries +- discovery does not expose hidden benchmark defaults + +### Integration Tests + +Service/CLI integration tests should cover: + +- `experiment define` persists `ExperimentRecord` with slugs and sample selection +- `experiment run` creates `RunRecord` rows with required foreign keys and assignment JSON +- `benchmark run` wrapper produces the same database shape as define plus run +- `run list` reads persisted runs through service +- `run cancel` emits cancellation and cleanup events + +### E2E Tests + +E2E should keep using: + +```text +ergon experiment define +ergon experiment run +``` + +unless `benchmark run` is explicitly retained as a wrapper, in which case one small canary can prove the wrapper path. + +The full e2e matrix is specified in `2026-04-28-ergon-e2e-refactor-test-plan.md`. + +## Known Drifts To Resolve + +1. `benchmark run` exists in `commands/benchmark.py` but is not registered in `main.py`. +2. `commands/benchmark.py::_create_and_dispatch` calls `create_run` with an old signature. +3. `experiment run --timeout` and `--no-wait` are represented in DTOs but not fully honored by the launch service. +4. `ergon_cli.composition` imports stale public API modules for `Experiment` and `WorkerSpec`. +5. `run list` queries persistence directly instead of using a core read service. +6. `eval checkpoint` can reach local eval without an `eval_limit` even though the local eval helper requires one. +7. `benchmark setup` still prints stale `benchmark run` guidance. + +## Final CLI Contract + +The refactor is complete when: + +- all experiment lifecycle commands go through core service facades +- all discovery commands read registries +- no production CLI command constructs `Experiment` directly +- no production CLI command creates `RunRecord` directly for launch +- `benchmark run` is either a tested wrapper or fully removed +- API, CLI, e2e, and eval automation agree on the same define/run semantics +- stable key/value CLI output is covered by tests diff --git a/docs/superpowers/plans/2026-04-28-ergon-e2e-refactor-test-plan.md b/docs/superpowers/plans/2026-04-28-ergon-e2e-refactor-test-plan.md new file mode 100644 index 00000000..6052b909 --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-ergon-e2e-refactor-test-plan.md @@ -0,0 +1,831 @@ +# Ergon E2E Refactor Test Plan + +This document specifies the test strategy that should accompany the Ergon core, built-ins, and CLI refactor. 
It is a sibling to: + +- `2026-04-28-public-api-target-structure.md` +- `2026-04-28-ergon-builtins-rebuild-structure.md` +- `2026-04-28-ergon-cli-refactor-structure.md` + +The purpose is to keep the refactor self-consistent: public API objects, built-in registry slugs, CLI commands, runtime rehydration, smoke fixtures, e2e harnesses, and dashboard assertions should all prove the same contract. + +## Goals + +- Preserve the existing four-tier testing model: unit, integration, e2e smoke, real-LLM. +- Keep production built-ins separate from test-only smoke fixtures. +- Use e2e smoke to prove cross-process behavior, not pure benchmark logic. +- Ensure CLI define/run behavior is covered by unit and integration tests before e2e uses it. +- Ensure every production benchmark family has contract tests for registry shape and explicit CLI pairing documentation. +- Ensure runtime execution can rehydrate workers, rubrics, criteria, task payloads, and sandbox managers from persisted slugs. +- Keep dashboard and harness checks aligned with run/cohort semantics. + +## Testing Tier Model + +The source of truth should remain path-based: + +```text +tests/unit/ + pure logic, models, validators, registry shape, parser behavior + +tests/integration/ + real Postgres and real Inngest dev server; service, persistence, API boundaries + +tests/e2e/ + full stack, test harness, real E2B, dashboard, Playwright + +tests/real_llm/ + opt-in or nightly; real model calls and budget-gated canaries +``` + +Markers are developer ergonomics, not the canonical tier definition. If `pyproject.toml` marker descriptions conflict with `docs/architecture/07_testing.md`, update the marker descriptions to match the path-based model. + +## High-Level Coverage Map + +```mermaid +flowchart TD + accTitle: Refactor Coverage Map + accDescr: Each test tier proves a different part of the Ergon refactor, from public API contracts through built-in registry shape and CLI service flow to full-stack dashboard behavior. + + public_api["Public API contracts"] + builtins["Built-ins registries
benchmarks/workers/evaluators"] + cli["CLI facades
define/run/list"] + services["Core runtime services
experiments/runs/cohorts"] + runtime["Inngest runtime
worker/evaluator rehydration"] + smoke["E2E smoke fixtures
happy/sad cohorts"] + dashboard["Dashboard and harness"] + + unit["Unit tests"] + integration["Integration tests"] + e2e["E2E smoke tests"] + real_llm["Real-LLM tests"] + + unit --> public_api + unit --> builtins + unit --> cli + integration --> services + integration --> runtime + e2e --> smoke + e2e --> dashboard + real_llm --> builtins + real_llm --> runtime +``` + +## Fixture Residency Rules + +## Stable E2E Boundary After Core Layout Refactor + +Core behavior is stable, but private repository and persistence modules may move. +E2E code should use only: + +- HTTP endpoints under `/api/test/*` +- `ergon_core.test_support` +- public core API objects from `ergon_core.api` +- application read-model facades, not private repository methods + +The existing smoke behavior assertions remain valid: + +- happy runs complete the 12-node graph +- sad runs fail `l_2` and block `l_3` +- happy runs produce 20 task resources and 26 context events +- happy root produces two score-1.0 evaluations +- sad runs produce one partial artifact and seven completion messages + +### Production Built-ins + +Production benchmark code belongs under: + +```text +ergon_builtins/ergon_builtins/ +``` + +Production built-ins include: + +- benchmark loaders +- production task payload schemas +- production worker factories +- production criteria and rubrics +- production sandbox managers +- production registry entries +- shared production worker/model/tool helpers + +Production built-ins must not import: + +- `ergon_core.test_support` +- `tests` +- smoke fixture workers +- smoke fixture criteria +- smoke benchmark loaders + +### Core Test Support + +Canonical smoke fixtures belong under: + +```text +ergon_core/ergon_core/test_support/smoke_fixtures/ +``` + +This package owns: + +- smoke benchmark replacements for `researchrubrics`, `minif2f`, and `swebench-verified` +- smoke workers +- smoke leaf workers +- recursive smoke workers +- sad-path workers +- smoke criteria and smoke rubrics +- `SmokeSandboxManager` +- registry mutation hook `register_smoke_fixtures()` + +Smoke fixtures register into `ergon_builtins.registry` only when explicitly enabled by: + +- `ERGON_STARTUP_PLUGINS=ergon_core.test_support.smoke_fixtures:register_smoke_fixtures` +- `ENABLE_TEST_HARNESS=1` +- `ENABLE_SMOKE_FIXTURES=1` for any remaining host-side transitional paths + +### Tests + +Test drivers and assertions belong under: + +```text +tests/ +``` + +They own: + +- unit parser tests +- registry and explicit pairing contract tests +- integration service tests +- e2e cohort submission +- e2e harness polling +- dashboard Playwright orchestration +- real-LLM canaries + +Tests can import `ergon_core.test_support` in unit/integration contexts. Black-box e2e client code should not register fixtures in the host process; fixture registration should happen inside the API process through startup plugins. 
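+
+A minimal sketch of the registration hook under the rules above; the `ENABLE_TEST_HARNESS` gate and `MiniF2FSmokeBenchmark` come from this plan, while the worker slug and symbol are illustrative:
+
+```python
+# ergon_core/test_support/smoke_fixtures/__init__.py (sketch)
+import os
+
+
+def register_smoke_fixtures() -> None:
+    from ergon_builtins import registry
+
+    from .benchmarks import MiniF2FSmokeBenchmark
+    from .workers.minif2f_smoke import minif2f_smoke_worker  # illustrative symbol
+
+    # setdefault keeps repeated startup-plugin invocation idempotent.
+    registry.WORKERS.setdefault("minif2f-smoke", minif2f_smoke_worker)
+
+    if os.environ.get("ENABLE_TEST_HARNESS") == "1":
+        # Reuse the production slug so e2e can materialize root tasks without
+        # HuggingFace data or LLM access.
+        registry.BENCHMARKS["minif2f"] = MiniF2FSmokeBenchmark
+```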
+ +## Current Smoke Fixture Shape + +```text +ergon_core/ergon_core/test_support/ + __init__.py + # register_smoke_fixtures public hook + + smoke_fixtures/ + __init__.py + # mutates WORKERS, EVALUATORS, and optionally BENCHMARKS/SANDBOX_MANAGERS + + benchmarks.py + # ResearchRubricsSmokeBenchmark + # MiniF2FSmokeBenchmark + # SweBenchSmokeBenchmark + + sandbox.py + # SmokeSandboxManager + + criteria/ + minif2f_smoke.py + researchrubrics_smoke.py + swebench_smoke.py + smoke_rubrics.py + timing.py + + smoke_base/ + worker_base.py + leaf_base.py + recursive.py + sadpath.py + criterion_base.py + subworker.py + constants.py + + workers/ + minif2f_smoke.py + researchrubrics_smoke.py + researchrubrics_smoke_sadpath.py + swebench_smoke.py +``` + +The smoke benchmarks deliberately reuse production benchmark slugs: + +```text +researchrubrics +minif2f +swebench-verified +``` + +They replace production benchmark loaders only when `ENABLE_TEST_HARNESS=1`, so e2e does not need HuggingFace, production data, or LLM access to materialize root tasks. + +## Canonical Smoke Program + +Every PR should continue to run three e2e legs: + +```text +researchrubrics +minif2f +swebench-verified +``` + +Each leg submits a cohort with: + +- one happy-path run +- one sad-path run + +The topology should stay identical across benchmark slugs: + +```text +Diamond: + d_root + / \ + d_left d_right + \ / + d_join + +Line: + l_1 -> l_2 -> l_3 + +Singletons: + s_a + s_b +``` + +Happy-path `l_2` routes to a recursive worker with nested children: + +```text +l_2 +└─ l_2_a -> l_2_b +``` + +Sad-path `l_2` routes to a failing leaf. `l_3` must remain blocked or cancelled according to the static-sibling failure semantics decision. + +## E2E Submission Flow + +```mermaid +sequenceDiagram + accTitle: Smoke E2E Flow + accDescr: E2E tests submit benchmark cohorts through the HTTP test harness, then assert run state through API and dashboard surfaces. + + participant Pytest + participant Harness as API Test Harness + participant Registry + participant Services + participant Inngest + participant Dashboard + + Pytest->>Harness: POST /api/test/write/cohort + Harness->>Registry: resolve smoke benchmark/worker/evaluator slugs + Harness->>Services: define/persist/dispatch runs + Services->>Inngest: WorkflowStartedEvent + Inngest->>Registry: rehydrate smoke workers/evaluators + Pytest->>Harness: poll /api/test/read/cohort/{key}/runs + Pytest->>Harness: read /api/test/read/run/{id}/state + Pytest->>Dashboard: Playwright assertions by cohort/run +``` + +The black-box e2e tests should not: + +- import production internals +- call `build_experiment` +- call `create_run` +- send Inngest events directly +- register smoke fixtures in the host pytest process + +The API process owns fixture registration through `ERGON_STARTUP_PLUGINS`. + +## CLI Coverage Flow + +CLI tests should be split by tier: + +```text +unit: + parser and facade DTO mapping + +integration: + experiment define/run persistence and dispatch semantics + +e2e: + one small black-box CLI canary only if needed +``` + +The canonical e2e smoke path should use the HTTP test harness, not the CLI, because it is primarily proving cross-process runtime, sandbox, dashboard, and cohort behavior. CLI define/run gets its own unit and integration coverage. + +If `benchmark run` is kept as a wrapper, add exactly one CLI e2e canary proving wrapper wiring. Do not duplicate the full smoke matrix through both HTTP harness and CLI. 
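+
+For the harness path, the submission helper stays black-box. A sketch using the write endpoint from the flow above; the JSON body shape is an illustrative assumption, since the real write contract belongs to the harness DTOs:
+
+```python
+# tests/e2e/_submit.py (sketch)
+import os
+
+import httpx
+
+
+def submit_smoke_cohort(base_url: str, cohort_key: str, benchmark_slug: str) -> list[str]:
+    payload = {
+        "cohort_key": cohort_key,
+        "benchmark_slug": benchmark_slug,
+        "slots": [
+            {"path": "happy"},  # happy smoke worker plus smoke rubric
+            {"path": "sad"},    # sad-path smoke worker plus smoke rubric
+        ],
+    }
+    response = httpx.post(
+        f"{base_url}/api/test/write/cohort",
+        json=payload,
+        headers={"X-Test-Secret": os.environ["TEST_HARNESS_SECRET"]},
+        timeout=30.0,
+    )
+    response.raise_for_status()
+    return [run["run_id"] for run in response.json()["runs"]]
+```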
+ +## Unit Test Plan + +### Public API Contract Tests + +Add or update tests under: + +```text +tests/unit/architecture/ +tests/unit/api/ +``` + +Required assertions: + +- `ergon_core.api` exports beginner public symbols: + - `Benchmark` + - `Task` + - `EmptyTaskPayload` + - `BenchmarkRequirements` + - `Worker` + - `WorkerContext` + - `WorkerOutput` + - `Criterion` + - `CriterionContext` + - `CriterionOutcome` + - `ScoreScale` + - `CriterionEvidence` + - `EvidenceMessage` + - `Rubric` + - `TaskEvaluationResult` + - `CriterionCheckError` +- moved core composition types are not root-public authoring concepts: + - `Experiment` + - `WorkerSpec` + - `DefinitionHandle` +- public API modules do not import DB/session modules. +- public worker code does not import context event repositories for default output extraction. + +### Built-ins Registry And Pairing Tests + +Add or update tests under: + +```text +tests/unit/registry/ +tests/unit/builtins/ +tests/unit/benchmarks/ +tests/unit/state/ +``` + +Required assertions: + +- every `BENCHMARKS` key equals the benchmark class `type_slug` +- every benchmark exposes `task_payload_model` +- every benchmark exposes `BenchmarkRequirements` +- every documented CLI pairing references registered benchmark, worker, evaluator, and sandbox slugs +- no production code derives hidden worker/evaluator/sandbox defaults from a benchmark slug +- importing `registry_core.py` does not require `[data]` dependencies +- importing `registry_data.py` is allowed to require optional data extras +- production registries do not include smoke worker slugs +- smoke fixture registration is idempotent +- smoke fixture registration only overrides benchmark loaders when `ENABLE_TEST_HARNESS=1` + +### CLI Unit Tests + +Add or update tests under: + +```text +tests/unit/cli/ +``` + +Required assertions: + +- parser registers all canonical commands +- parser outcome for `benchmark run` matches the decision in the CLI spec +- `experiment define` requires explicit `--worker`, `--model`, `--evaluator`, `--sandbox`, and `--extras` +- missing explicit worker/model/evaluator/sandbox/extras values fail before service calls +- define facade builds `ExperimentDefineRequest` +- run facade builds `ExperimentRunRequest` +- benchmark wrapper calls define plus run facades if kept +- `benchmark setup` success guidance uses canonical commands +- discovery output does not imply hidden benchmark defaults +- `run list` delegates to run read service after that service exists +- `eval checkpoint` handles missing or default `--eval-limit` consistently + +### Smoke Fixture Unit Tests + +Keep and extend tests under: + +```text +tests/unit/smoke_base/ +``` + +Required assertions: + +- topology constants remain the single source of truth +- `SmokeWorkerBase.execute` remains final +- every environment has: + - happy parent worker + - leaf worker + - recursive worker + - sad-path parent + - failing leaf + - smoke rubric +- all smoke workers accept the current public `Worker` constructor contract +- smoke criteria use the public `CriterionContext` capability surface +- smoke benchmark payload schemas match production payload shape enough for runtime serialization +- e2e driver pairs exist for every smoke environment + +### Architecture Boundary Tests + +Keep and extend: + +```text +tests/unit/architecture/test_no_test_logic_in_core.py +tests/unit/architecture/test_smoke_fixture_package_boundary.py +``` + +Target assertions: + +- production core does not import `ergon_core.test_support` except explicit test 
harness/plugin loading points +- `ergon_builtins` does not import `ergon_core.test_support` +- `ergon_builtins` does not import `tests` +- `ergon_cli` production commands do not import smoke fixture modules +- API startup plugin loader may import configured plugins dynamically +- `/api/test/*` is mounted only when `ENABLE_TEST_HARNESS=1` + +## Integration Test Plan + +Integration tests use real Postgres and real Inngest dev server. They should not require real LLM calls. + +### Experiment Services + +Add or update tests under: + +```text +tests/integration/ +tests/unit/runtime/ +``` + +Required scenarios: + +1. Define experiment from a smoke benchmark slug. +2. Persist selected sample keys and explicit worker/evaluator/sandbox/model/extras choices. +3. Run experiment and create one `RunRecord` per selected sample. +4. Persist workflow definition with benchmark, worker, and evaluator slugs. +5. Emit `WorkflowStartedEvent` for each run. +6. Support `wait=False` path. +7. Support timeout path without deleting or cancelling the run. + +### Runtime Rehydration + +Required scenarios: + +- worker execution rehydrates worker factory from `WORKERS` +- worker execution validates task payload through registered benchmark payload model +- evaluator execution rehydrates evaluator from `EVALUATORS` +- criteria run against `CriterionContext`, not direct concrete runtime imports in public modules +- sandbox manager is resolved from `SANDBOX_MANAGERS` +- sandbox setup completes before benchmark-owned worker factories are invoked +- failed worker path persists partial artifacts and marks downstream dependencies correctly + +### Sandbox Integration + +Keep benchmark-specific sandbox manager tests: + +```text +tests/integration/minif2f/test_sandbox_manager.py +tests/integration/researchrubrics/test_sandbox_manager.py +tests/integration/swebench_verified/test_sandbox_manager.py +tests/integration/sandbox/test_required_env_keys.py +``` + +Refactor expectations: + +- these tests should import benchmark sandbox managers from final package locations +- they should not depend on CLI composition helpers +- they should be skipped or marked clearly when E2B credentials are absent, according to current integration policy + +### Evaluator Integration + +Keep and align: + +```text +tests/integration/minif2f/test_verification_integration.py +tests/integration/swebench_verified/test_criterion.py +tests/integration/swebench_verified/test_rubric.py +``` + +Required updates: + +- import renamed public result/context classes +- assert `CriterionOutcome` evidence fields where appropriate +- avoid old `EvaluationContext` naming +- ensure SWE-Bench criterion patch extraction uses `CriterionContext` capabilities + +## E2E Smoke Test Plan + +### Python E2E Layout + +Target layout: + +```text +tests/e2e/ + conftest.py + # infra preflight, shared DB session, optional CLI helper + + _submit.py + # black-box cohort submission through /api/test/write/cohort + + _asserts.py + # run graph, resources, evaluation, communication, sandbox assertions + + _read_contracts.py + # DTO helpers for /api/test/read endpoints + + test_researchrubrics_smoke.py + test_minif2f_smoke.py + test_swebench_smoke.py +``` + +Each `test__smoke.py` should: + +1. build a cohort key +2. submit two slots: + - happy smoke worker plus smoke rubric + - sad-path smoke worker plus smoke rubric +3. wait for terminal statuses +4. assert happy run graph/resources/evaluations/messages +5. assert sad run partial artifacts and blocked/cancelled downstream node +6. 
run the dashboard Playwright smoke spec for that environment + +### Required Per-Run Assertions + +Happy run assertions: + +- root node completed +- expected direct child nodes exist +- nested `l_2_a` and `l_2_b` exist +- dependency edges match canonical topology +- all expected leaf/dynamic nodes completed +- `GenerationTurn` count matches expected topology +- communication thread messages exist in order +- run resources include outputs and probe artifacts +- blob store round-trip works +- root evaluations exist +- evaluation timestamps are after root execution completion +- sandbox health probe succeeded + +Sad run assertions: + +- root node reaches failed or terminal failed-equivalent state +- `l_2` failed +- `l_3` blocked or cancelled until the failure semantics RFC pins final status +- partial artifact from failing leaf exists +- pre-failure sandbox WAL entry exists when WAL persistence exists +- no successful final evaluation score is recorded +- unaffected branches completed as expected + +### Dashboard Assertions + +Dashboard e2e specs under: + +```text +ergon-dashboard/tests/e2e/ +``` + +should assert: + +- cohort page renders both happy and sad runs +- run status is visible +- graph canvas renders +- each expected task node appears by `data-testid` +- environment label appears +- failed/blocked node states are visible on sad path +- evaluation panel shows root evaluation where expected +- resources/artifacts are visible where expected + +Backend harness DTOs should remain the source of truth for data-rich assertions; Playwright should assert that the UI represents the same state. + +## Real-LLM Test Plan + +Real-LLM tests are opt-in and should not block ordinary local development. + +Target directory: + +```text +tests/real_llm/ + benchmarks/ + test_researchrubrics.py + test_minif2f.py # optional future canary + test_swebench.py # optional future canary + test_smoke_stub.py + fixtures/ + stack.py + harness_client.py + playwright_client.py + openrouter_budget.py +``` + +Required canaries: + +- one no-LLM stub model canary proving CLI wrapper behavior if `benchmark run` is kept +- one ResearchRubrics real model run proving report generation and LLM judge path + +Optional canaries: + +- MiniF2F real model proof attempt +- SWE-Bench real model patch attempt + +Real-LLM tests should use strict budgets and explicit environment gates: + +- `ERGON_REAL_LLM=1` +- OpenRouter/OpenAI/Anthropic keys as required +- stack readiness fixtures + +## Test Harness Contract + +The `/api/test/*` harness should remain test-only. + +Mounting rules: + +- enabled only when `ENABLE_TEST_HARNESS=1` +- write endpoints require `X-Test-Secret` or configured secret behavior +- read endpoints are safe for Playwright and pytest polling in test environments + +Required endpoints: + +```text +POST /api/test/write/cohort +GET /api/test/read/cohort/{cohort_key}/runs +GET /api/test/read/run/{run_id}/state +``` + +The write endpoint should use the same core services as production experiment launch. It may use smoke fixture registry entries, but it should not keep a separate run creation path that bypasses service invariants. 
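+
+As a sketch of how a host-side helper such as `tests/e2e/_submit.py` might drive this contract. Only the endpoint paths and the `X-Test-Secret` header come from the contract above; the base URL, payload fields, response shape, and polling cadence are assumptions, not pinned harness DTOs:
+
+```python
+# Hypothetical sketch; payload/response field names are assumptions.
+import time
+
+import httpx
+
+BASE_URL = "http://localhost:9000"  # assumed local API port
+
+
+def submit_cohort(cohort_key: str, slots: list[dict], secret: str) -> None:
+    # Write endpoint: goes through the same core launch services as production.
+    response = httpx.post(
+        f"{BASE_URL}/api/test/write/cohort",
+        json={"cohort_key": cohort_key, "slots": slots},
+        headers={"X-Test-Secret": secret},
+        timeout=30.0,
+    )
+    response.raise_for_status()
+
+
+def wait_for_terminal_runs(cohort_key: str, timeout_s: float = 300.0) -> list[dict]:
+    # Read endpoint: safe for pytest polling until every run is terminal.
+    deadline = time.monotonic() + timeout_s
+    while time.monotonic() < deadline:
+        runs = httpx.get(
+            f"{BASE_URL}/api/test/read/cohort/{cohort_key}/runs", timeout=30.0
+        ).json()
+        if runs and all(r.get("status") in {"completed", "failed"} for r in runs):
+            return runs
+        time.sleep(2.0)
+    raise TimeoutError(f"cohort {cohort_key} did not reach terminal statuses")
+```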
+ +## Coverage Matrix + +| Area | Unit | Integration | E2E Smoke | Real-LLM | +|---|---|---|---|---| +| Public API exports | required | no | no | no | +| Public API import boundaries | required | no | no | no | +| Built-ins registry and explicit pairing shape | required | optional | indirect | optional | +| Benchmark `build_instances` contract | required with stubs | data-dependent paths | smoke replacements | real datasets optional | +| CLI parser/facade mapping | required | optional | one canary only | optional | +| Experiment define/run services | fast mocked unit plus contract tests | required | indirect through harness | indirect | +| Run creation schema | required | required | indirect | indirect | +| Inngest worker rehydration | required | required | required | required for canaries | +| Evaluator/criterion rehydration | required | required | required | required for judge canaries | +| Sandbox manager setup | unit stubs | required per benchmark | required smoke path | optional | +| Dashboard event contracts | required | optional | required | optional | +| Cohort happy/sad behavior | unit topology | service-level partial | required | optional | +| LLM generation quality | no | no | no | required | + +## Migration Plan + +### Phase 1: Freeze Test Boundaries + +- Update this plan and `docs/architecture/07_testing.md` if necessary. +- Align `pyproject.toml` marker descriptions with the path-based tier model. +- Add boundary tests proving production built-ins do not import smoke/test modules. +- Add tests proving smoke fixtures register only through explicit hooks. + +### Phase 2: Public API Rename Tests + +- Update unit tests to use final public names: + - `Task` + - `BenchmarkRequirements` + - `CriterionContext` + - `CriterionOutcome` + - `ScoreScale` + - `CriterionEvidence` + - `EvidenceMessage` +- Keep no compatibility alias tests unless the product decision changes. + +### Phase 3: Built-ins Registry And Pairing Tests + +- Add explicit pairing contract tests for: + - `minif2f` + - `swebench-verified` + - `gdpeval` + - `researchrubrics` + - `researchrubrics-vanilla` +- Add optional dependency import tests for `registry_core.py` versus `registry_data.py`. + +### Phase 4: CLI Contract Tests + +- Update parser tests around the final `benchmark run` decision. +- Add facade tests for define/run DTO mapping. +- Add integration tests for `experiment define` and `experiment run`. +- Update real-LLM tests to use canonical CLI commands or the wrapper if retained. + +### Phase 5: Runtime Rehydration Tests + +- Update Inngest worker execution tests for final `Task` payload paths. +- Update evaluator execution tests for final `CriterionContext` and `CriterionOutcome`. +- Add regression tests for sandbox setup before worker factory invocation. +- Add tests for persisted slugs matching registry keys. + +### Phase 6: E2E Harness Alignment + +- Ensure `/api/test/write/cohort` calls the same core launch service path as CLI/API. +- Ensure e2e host process does not register fixtures. +- Ensure API process registers fixtures by startup plugin. +- Ensure smoke benchmark replacements override production benchmark loaders only when `ENABLE_TEST_HARNESS=1`. +- Keep Playwright specs aligned with expected smoke topology constants. + +### Phase 7: Dashboard And Artifact Assertions + +- Turn soft-skipped sandbox WAL assertions into hard assertions once WAL persistence exists. +- Keep screenshots on failure. 
+- Verify dashboard `data-testid` attributes remain stable: + - `run-status` + - `task-node-{slug}` + - `graph-canvas` + - `cohort-run-row` + - `cohort-env-label` + +## Required Test Files To Update Or Add + +### Unit + +```text +tests/unit/architecture/test_public_api_shape.py +tests/unit/architecture/test_no_test_logic_in_core.py +tests/unit/architecture/test_smoke_fixture_package_boundary.py +tests/unit/registry/test_builtin_pairings.py +tests/unit/registry/test_react_factories.py +tests/unit/cli/test_experiment_cli.py +tests/unit/cli/test_benchmark_setup.py +tests/unit/cli/test_eval_cli_required_fields.py +tests/unit/smoke_base/test_smoke_fixture_registration.py +tests/unit/smoke_base/test_e2e_smoke_driver_pairs.py +``` + +### Integration + +```text +tests/integration/smokes/test_smoke_harness.py +tests/integration/minif2f/test_verification_integration.py +tests/integration/minif2f/test_sandbox_manager.py +tests/integration/researchrubrics/test_sandbox_manager.py +tests/integration/swebench_verified/test_criterion.py +tests/integration/swebench_verified/test_rubric.py +tests/integration/swebench_verified/test_sandbox_manager.py +tests/integration/sandbox/test_required_env_keys.py +``` + +Add, if missing: + +```text +tests/integration/cli/test_experiment_define_run.py +tests/integration/runtime/test_registry_rehydration.py +tests/integration/runtime/test_experiment_launch_service_wait.py +``` + +### E2E + +```text +tests/e2e/conftest.py +tests/e2e/_submit.py +tests/e2e/_asserts.py +tests/e2e/_read_contracts.py +tests/e2e/test_researchrubrics_smoke.py +tests/e2e/test_minif2f_smoke.py +tests/e2e/test_swebench_smoke.py +``` + +### Dashboard + +```text +ergon-dashboard/tests/e2e/_shared/smoke.ts +ergon-dashboard/tests/e2e/researchrubrics.smoke.spec.ts +ergon-dashboard/tests/e2e/minif2f.smoke.spec.ts +ergon-dashboard/tests/e2e/swebench-verified.smoke.spec.ts +ergon-dashboard/tests/helpers/backendHarnessClient.ts +``` + +### Real-LLM + +```text +tests/real_llm/benchmarks/test_researchrubrics.py +tests/real_llm/benchmarks/test_smoke_stub.py +tests/real_llm/fixtures/stack.py +tests/real_llm/fixtures/harness_client.py +tests/real_llm/fixtures/openrouter_budget.py +``` + +## Acceptance Criteria + +The refactor is test-complete when: + +- unit tests prove public API exports and import boundaries +- unit tests prove built-ins registry and explicit pairing consistency +- unit tests prove CLI parser/facade behavior +- integration tests prove experiment define/run services persist the expected records +- integration tests prove runtime worker/evaluator rehydration from slugs +- e2e tests pass for `researchrubrics`, `minif2f`, and `swebench-verified` +- e2e host process remains a black-box client +- smoke fixtures stay out of production built-ins +- real-LLM tests are updated to the final CLI contract +- dashboard Playwright specs still render and assert cohort/run state + +## 2026-04-29 Finish Plan Update + +The current execution plan for completing the built-ins, CLI, and e2e refactor is: + +```text +docs/superpowers/plans/2026-04-29-finish-builtins-cli-e2e-refactor.md +``` + +That plan supersedes this document's older migration checklist where the two disagree. In particular: + +- `benchmark run` is retained as an explicit `experiment define` plus `experiment run` wrapper. +- E2E smoke submissions must pass explicit `worker`, `evaluator`, `sandbox`, `model`, and `extras` choices through the test harness. 
+- E2E host-side tests may use `ergon_core.test_support`, public API modules, the HTTP `/api/test/*` endpoints, and stable application read models, but not private core repository or persistence internals.
+- The existing smoke runtime assertions remain hard assertions: happy runs still expect 12 tasks, 10 leaves, 20 resources, 26 context events, 2 root evaluations, and 11 completion messages; sad runs still expect `l_2` failed, `l_3` blocked, one partial artifact, and 7 completion messages.
+- Any persistence-level data still needed for e2e assertions should be exposed through `ergon_core.test_support` helpers rather than imported directly by `tests/e2e`.
+
+## Open Decisions
+
+1. Whether e2e should include one CLI subprocess canary in addition to HTTP harness submission.
+2. Whether sandbox command WAL persistence lands during this refactor or remains a follow-up.
+3. Whether `tests/integration/swebench_verified/test_smoke_e2e.py` should be renamed because it is not a full e2e test.
diff --git a/docs/superpowers/plans/2026-04-28-evaluation-resource-context-and-scoring.md b/docs/superpowers/plans/2026-04-28-evaluation-resource-context-and-scoring.md
new file mode 100644
index 00000000..59306462
--- /dev/null
+++ b/docs/superpowers/plans/2026-04-28-evaluation-resource-context-and-scoring.md
@@ -0,0 +1,908 @@
+# Evaluation Resource Context and Scoring Patch Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Make evaluator criteria fetch their own task-scoped resources, judge final artifacts rather than assistant summaries, and preserve evaluator-normalized scores without double-normalizing.
+
+**Architecture:** Core remains benchmark-agnostic: it exposes task-scoped resource access through `CriterionRuntime`. Benchmark criteria in `ergon_builtins` decide which resources to read, how to separate final outputs from scratch files, and what to show verifiers or LLM judges. Evaluation persistence assumes all evaluators return normalized scalar task scores.
+
+**Tech Stack:** Python, Pydantic models, SQLModel, Ergon `CriterionRuntime`, ResearchRubrics LLM judge, real-LLM rollout artifacts.
+
+---
+
+## Code Change Map
+
+- Modify: `ergon_core/ergon_core/api/criterion_runtime.py`
+  - Add optional `task_execution_id` to `list_resources`.
+  - Add `read_resource_by_id` so criteria can read exact SQL rows after listing.
+
+- Modify: `ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py`
+  - Implement optional task-execution scoping for `list_resources`.
+  - Implement `read_resource_by_id`.
+  - Keep core generic: no final-vs-scratch classification here.
+
+- Modify: `ergon_builtins/ergon_builtins/benchmarks/researchrubrics/judge_criterion.py`
+  - Fetch resources from `context.runtime`.
+  - Classify ResearchRubrics final outputs vs scratch files locally.
+  - Build the judge prompt from resource content plus final assistant message.
+  - Record `evaluated_resource_ids` and `evaluation_input`.
+
+- Modify: `ergon_core/ergon_core/core/runtime/services/evaluation_persistence_service.py`
+  - Stop re-normalizing `TaskEvaluationResult.score`.
+  - Store `summary.normalized_score = result.score`.
+
+- Modify: `ergon_builtins/ergon_builtins/benchmarks/researchrubrics/rubric.py`
+  - Keep the existing ResearchRubrics formula, but clarify metadata with normalized score semantics.
+
+- Modify: `tests/real_llm/artifact_health.py`
+  - Detect missing final output via task-scoped resource rows and final-output provenance, not durable blob `file_path`.
+
+- Tests:
+  - `tests/unit/state/test_criterion_runtime_di.py`
+  - `tests/unit/state/test_research_rubrics_benchmark.py`
+  - `tests/unit/runtime/test_evaluation_summary_contracts.py`
+  - `tests/unit/runtime/test_real_llm_rollout_artifact_health.py`
+
+---
+
+## Task 1: Extend Core Runtime Resource Access
+
+**Files:**
+- Modify: `ergon_core/ergon_core/api/criterion_runtime.py`
+- Modify: `ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py`
+- Test: `tests/unit/state/test_criterion_runtime_di.py`
+
+### Rationale
+
+Criteria should own context selection. Core should only provide generic resource primitives:
+
+- list resources for the evaluated task execution by default;
+- optionally list resources for an explicit task execution id;
+- read exact resources by id to avoid name collisions.
+
+Core must not know about ResearchRubrics final reports, scratchpads, or judge prompt layout.
+
+### Patch: Public Protocol
+
+In `ergon_core/ergon_core/api/criterion_runtime.py`, add `UUID` under `TYPE_CHECKING` or as a normal import. Since Protocol signatures need the type at runtime and postponed annotations are not enabled in this file, use a normal import:
+
+```python
+from uuid import UUID
+```
+
+Change the resource methods:
+
+```python
+# ── resource I/O ──────────────────────────────────────────────────
+async def read_resource(self, name: str) -> bytes: ...
+async def read_resource_by_id(self, resource_id: UUID) -> bytes: ...
+async def list_resources(
+    self,
+    task_execution_id: UUID | None = None,
+) -> "list[RunResourceView]": ...
+async def get_all_files_for_task(self) -> "dict[str, bytes]":
+    """Return ``{name: bytes}`` for every resource produced by this task.
+
+    Scoped to the runtime's evaluator-bound task execution. On duplicate
+    ``name``s, the newest ``created_at`` wins. Not size-capped.
+    """
+    ...
+```
+
+### Patch: Concrete Runtime
+
+In `ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py`, keep the existing SQLModel imports:
+
+```python
+from sqlmodel import Session, desc, select
+```
+
+Add exact-id reading after `read_resource`:
+
+```python
+async def read_resource_by_id(self, resource_id: UUID) -> bytes:
+    """Read one worker-published blob by its RunResource primary key."""
+    with get_session() as session:
+        row = session.get(RunResource, resource_id)
+
+    if row is None or row.run_id != self._run_id:
+        raise ResourceNotFoundError(
+            f"No run_resource {resource_id!s} for run {self._run_id}"
+        )
+
+    result = Path(row.file_path).read_bytes()
+    logger.info(
+        "criterion read_resource_by_id run_id=%s resource_id=%s size_bytes=%d",
+        self._run_id,
+        resource_id,
+        len(result),
+    )
+    return result
+```
+
+Replace `list_resources` with task-aware behavior:
+
+```python
+async def list_resources(
+    self,
+    task_execution_id: UUID | None = None,
+) -> list[RunResourceView]:
+    """Return resource DTOs for this run, newest first.
+
+    Defaults to this runtime's evaluated task execution. Passing
+    ``task_execution_id`` lets a benchmark criterion inspect a related task
+    explicitly without core knowing benchmark semantics.
+ """ + effective_execution_id = ( + task_execution_id if task_execution_id is not None else self._task_id + ) + with get_session() as session: + stmt = select(RunResource).where(RunResource.run_id == self._run_id) + if effective_execution_id is not None: + stmt = stmt.where(RunResource.task_execution_id == effective_execution_id) + stmt = stmt.order_by(desc(RunResource.created_at)) + rows = list(session.exec(stmt).all()) + return [RunResourceView.from_row(r) for r in rows] +``` + +### Tests + +In `tests/unit/state/test_criterion_runtime_di.py`, update the protocol test expected method set: + +```python +expected = { + "ensure_sandbox", + "upload_files", + "write_file", + "run_command", + "execute_code", + "cleanup", + "read_resource", + "read_resource_by_id", + "list_resources", + "get_all_files_for_task", + "db_read_session", + "event_sink", +} +``` + +Add tests: + +```python +@pytest.mark.asyncio +async def test_list_resources_defaults_to_runtime_task_execution() -> None: + task_execution_id = uuid4() + runtime = _make_runtime(task_id=task_execution_id) + + mock_row = MagicMock() + mock_session = MagicMock() + mock_session.__enter__ = MagicMock(return_value=mock_session) + mock_session.__exit__ = MagicMock(return_value=False) + mock_session.exec.return_value.all.return_value = [mock_row] + + with ( + patch( + "ergon_core.core.runtime.evaluation.criterion_runtime.get_session", + return_value=mock_session, + ), + patch.object(RunResourceView, "from_row", return_value=MagicMock()) as mock_from_row, + ): + result = await runtime.list_resources() + + assert len(result) == 1 + mock_from_row.assert_called_once_with(mock_row) + # Keep this assertion broad: SQLModel statements are hard to compare, but + # this ensures a DB query was issued through the runtime path. + mock_session.exec.assert_called_once() +``` + +```python +@pytest.mark.asyncio +async def test_read_resource_by_id_reads_exact_blob(tmp_path: Path) -> None: + blob = tmp_path / "abc" + blob.write_bytes(b"exact-resource") + + run_id = uuid4() + resource_id = uuid4() + row = MagicMock() + row.id = resource_id + row.run_id = run_id + row.file_path = str(blob) + + runtime = _make_runtime(run_id=run_id) + + mock_session = MagicMock() + mock_session.__enter__ = MagicMock(return_value=mock_session) + mock_session.__exit__ = MagicMock(return_value=False) + mock_session.get.return_value = row + + with patch( + "ergon_core.core.runtime.evaluation.criterion_runtime.get_session", + return_value=mock_session, + ): + result = await runtime.read_resource_by_id(resource_id) + + assert result == b"exact-resource" +``` + +Run: + +```bash +uv run pytest tests/unit/state/test_criterion_runtime_di.py -q +``` + +Expected: all tests pass. + +--- + +## Task 2: Make ResearchRubrics Criterion Fetch and Package Its Own Evidence + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/benchmarks/researchrubrics/judge_criterion.py` +- Test: `tests/unit/state/test_research_rubrics_benchmark.py` + +### Rationale + +ResearchRubrics should judge the actual task artifacts, not the final assistant summary. The built-in criterion should use the generic runtime to fetch resources, then apply ResearchRubrics-specific evidence policy: + +- final outputs first; +- scratch/intermediate resources second; +- final assistant message as status/context only. 
+ +### Patch + +Add imports: + +```python +from uuid import UUID + +from ergon_core.api.run_resource import RunResourceView +``` + +Add constants and a small local evidence type: + +```python +_MAX_RESOURCE_CHARS = 30_000 +_FINAL_OUTPUT_PREFIX = "/workspace/final_output/" + + +class _ResourceEvidence(BaseModel): + model_config = {"frozen": True, "arbitrary_types_allowed": True} + + resource: RunResourceView + content: str + + @property + def resource_id(self) -> str: + return str(self.resource.id) +``` + +Change `evaluate`: + +```python +async def evaluate(self, context: EvaluationContext) -> CriterionResult: + final_outputs, scratch_outputs = await _load_researchrubrics_evidence(context) + user_prompt = _build_user_prompt( + context, + final_outputs=final_outputs, + scratch_outputs=scratch_outputs, + ) + verdict = await call_structured_judge( + messages=[ + JudgeMessage(role="system", content=self.system_prompt), + JudgeMessage(role="user", content=user_prompt), + ], + response_type=ResearchRubricsVerdict, + model=self.model, + ) + evaluated_resource_ids = [ + evidence.resource_id for evidence in [*final_outputs, *scratch_outputs] + ] + return CriterionResult( + name=self.name, + score=self.max_score if verdict.passed else 0.0, + passed=verdict.passed, + weight=self.weight, + feedback=verdict.reasoning, + evaluation_input=_summarize_evaluation_input( + final_outputs=final_outputs, + scratch_outputs=scratch_outputs, + final_assistant_message=context.worker_result.output, + ), + evaluated_resource_ids=evaluated_resource_ids, + metadata={ + "primary_evidence_resource_ids": [e.resource_id for e in final_outputs], + "scratch_evidence_resource_ids": [e.resource_id for e in scratch_outputs], + }, + ) +``` + +Add evidence loading helpers: + +```python +async def _load_researchrubrics_evidence( + context: EvaluationContext, +) -> tuple[list[_ResourceEvidence], list[_ResourceEvidence]]: + if context.runtime is None: + return [], [] + + resources = await context.runtime.list_resources() + final_resources = [resource for resource in resources if _is_final_output_resource(resource)] + scratch_resources = [resource for resource in resources if resource not in final_resources] + + final_outputs = await _read_text_resources(context, final_resources) + scratch_outputs = await _read_text_resources(context, scratch_resources) + return final_outputs, scratch_outputs +``` + +```python +async def _read_text_resources( + context: EvaluationContext, + resources: list[RunResourceView], +) -> list[_ResourceEvidence]: + if context.runtime is None: + return [] + + evidence: list[_ResourceEvidence] = [] + for resource in resources: + if not _is_text_like(resource): + continue + content_bytes = await context.runtime.read_resource_by_id(resource.id) + content = content_bytes.decode("utf-8", errors="replace") + if len(content) > _MAX_RESOURCE_CHARS: + content = content[:_MAX_RESOURCE_CHARS] + "\n\n[truncated]" + evidence.append(_ResourceEvidence(resource=resource, content=content)) + return evidence +``` + +```python +def _is_text_like(resource: RunResourceView) -> bool: + return ( + resource.mime_type.startswith("text/") + or resource.mime_type in {"application/json", "application/x-ndjson"} + or resource.name.endswith((".md", ".txt", ".json", ".jsonl", ".csv")) + ) +``` + +```python +def _is_final_output_resource(resource: RunResourceView) -> bool: + origin = resource.metadata.get("sandbox_origin") + return isinstance(origin, str) and origin.startswith(_FINAL_OUTPUT_PREFIX) +``` + +Replace `_build_user_prompt`: + 
+```python +def _build_user_prompt( + context: EvaluationContext, + *, + final_outputs: list[_ResourceEvidence], + scratch_outputs: list[_ResourceEvidence], +) -> str: + return "\n\n".join( + [ + f"Original research request:\n{context.task.description}", + _format_resource_section( + "Final output resources (primary answer to judge)", + final_outputs, + empty="No final output resources were published.", + ), + _format_resource_section( + "Scratch/intermediate resources (supporting context; do not treat as final answer)", + scratch_outputs, + empty="No scratch resources were published.", + ), + ( + "Final assistant message (execution summary/status, not the primary answer):\n" + f"{context.worker_result.output}" + ), + ] + ) +``` + +Add format helpers: + +```python +def _format_resource_section( + title: str, + resources: list[_ResourceEvidence], + *, + empty: str, +) -> str: + if not resources: + return f"{title}:\n{empty}" + blocks = [f"{title}:"] + for evidence in resources: + resource = evidence.resource + origin = resource.metadata.get("sandbox_origin") + blocks.append( + "\n".join( + [ + f"--- resource_id={resource.id} name={resource.name} kind={resource.kind}", + f"mime_type={resource.mime_type} sandbox_origin={origin}", + evidence.content, + ] + ) + ) + return "\n\n".join(blocks) +``` + +```python +def _summarize_evaluation_input( + *, + final_outputs: list[_ResourceEvidence], + scratch_outputs: list[_ResourceEvidence], + final_assistant_message: str, +) -> str: + return "\n".join( + [ + "Evidence used by ResearchRubrics judge:", + "final_outputs=" + + ", ".join(f"{e.resource.name}:{e.resource.id}" for e in final_outputs), + "scratch_outputs=" + + ", ".join(f"{e.resource.name}:{e.resource.id}" for e in scratch_outputs), + "final_assistant_message=" + + final_assistant_message[:1000], + ] + ) +``` + +### Tests + +In `tests/unit/state/test_research_rubrics_benchmark.py`, add a fake runtime and direct unit test for the criterion. 
+ +```python +class _Runtime: + def __init__(self, resources, blobs): + self._resources = resources + self._blobs = blobs + + async def list_resources(self, task_execution_id=None): + return self._resources + + async def read_resource_by_id(self, resource_id): + return self._blobs[resource_id] +``` + +Patch `call_structured_judge` and assert: + +```python +@pytest.mark.asyncio +async def test_researchrubrics_judge_uses_final_resource_content(monkeypatch): + from uuid import uuid4 + from ergon_core.api.evaluation_context import EvaluationContext + from ergon_core.api.results import WorkerOutput + from ergon_core.api.run_resource import RunResourceKind, RunResourceView + from ergon_builtins.benchmarks.researchrubrics.judge_criterion import ( + ResearchRubricsJudgeCriterion, + ResearchRubricsVerdict, + ) + + report_id = uuid4() + scratch_id = uuid4() + run_id = uuid4() + execution_id = uuid4() + report = RunResourceView( + id=report_id, + run_id=run_id, + task_execution_id=execution_id, + kind=RunResourceKind.REPORT, + name="report.md", + mime_type="text/markdown", + file_path="/tmp/blob/report", + size_bytes=12, + content_hash="abc", + error=None, + metadata={"sandbox_origin": "/workspace/final_output/report.md"}, + ) + scratch = RunResourceView( + id=scratch_id, + run_id=run_id, + task_execution_id=execution_id, + kind=RunResourceKind.NOTE, + name="notes.md", + mime_type="text/markdown", + file_path="/tmp/blob/notes", + size_bytes=5, + content_hash="def", + error=None, + metadata={"sandbox_origin": "/workspace/scratch/notes.md"}, + ) + captured = {} + + async def fake_judge(*, messages, response_type, model): + captured["prompt"] = messages[1].content + return ResearchRubricsVerdict(reasoning="report satisfies criterion", passed=True) + + monkeypatch.setattr( + "ergon_builtins.benchmarks.researchrubrics.judge_criterion.call_structured_judge", + fake_judge, + ) + + criterion = ResearchRubricsJudgeCriterion( + name="criterion_0", + rubric=RubricCriterion(criterion="Includes sources.", axis="Explicit", weight=2.0), + ) + task = BenchmarkTask( + task_slug="sample", + instance_key="default", + description="Write a report.", + ) + context = EvaluationContext( + run_id=run_id, + task_id=uuid4(), + execution_id=execution_id, + task=task, + worker_result=WorkerOutput(output="Wrote report.md"), + runtime=_Runtime( + [report, scratch], + { + report_id: b"# Findings\nFinal report text", + scratch_id: b"draft notes", + }, + ), + ) + + result = await criterion.evaluate(context) + + assert result.passed is True + assert str(report_id) in result.evaluated_resource_ids + assert str(scratch_id) in result.evaluated_resource_ids + assert "Final output resources" in captured["prompt"] + assert "Final report text" in captured["prompt"] + assert "Scratch/intermediate resources" in captured["prompt"] + assert "draft notes" in captured["prompt"] +``` + +Run: + +```bash +uv run pytest tests/unit/state/test_research_rubrics_benchmark.py -q +``` + +Expected: all tests pass. + +--- + +## Task 3: Align Rollout Artifact Health With Task-Scoped Final Outputs + +**Files:** +- Modify: `tests/real_llm/artifact_health.py` +- Test: `tests/unit/runtime/test_real_llm_rollout_artifact_health.py` + +### Rationale + +Health analysis works on dumped JSONL, not live SQL. 
It should mirror the same policy: + +- group resources by `task_execution_id`; +- a completed task has a final output if at least one resource has `metadata_json.sandbox_origin` under `/workspace/final_output/`; +- do not compare durable blob `file_path` to logical sandbox paths. + +### Patch + +In `tests/real_llm/artifact_health.py`, add helpers near `_tool_budget_signals`: + +```python +_FINAL_OUTPUT_PREFIX = "/workspace/final_output/" + + +def _resource_metadata(resource: dict[str, Any]) -> dict[str, Any]: # slopcop: ignore[no-typing-any] + metadata = resource.get("metadata_json") or resource.get("metadata") or {} + if isinstance(metadata, str): + return json.loads(metadata) + return metadata if isinstance(metadata, dict) else {} + + +def _is_final_output_resource(resource: dict[str, Any]) -> bool: # slopcop: ignore[no-typing-any] + origin = _resource_metadata(resource).get("sandbox_origin") + return isinstance(origin, str) and origin.startswith(_FINAL_OUTPUT_PREFIX) +``` + +Replace current `missing_final_report` calculation: + +```python +completed_execution_ids = { + str(execution.get("id")) + for execution in executions + if execution.get("status") == "completed" and execution.get("id") is not None +} +final_output_execution_ids = { + str(resource.get("task_execution_id")) + for resource in resources + if resource.get("task_execution_id") is not None and _is_final_output_resource(resource) +} +missing_final_report = bool(completed_execution_ids - final_output_execution_ids) +``` + +This field name can stay `missing_final_report` for now to avoid dashboard churn, but the semantics become “completed task is missing a final-output resource.” + +### Tests + +In `tests/unit/runtime/test_real_llm_rollout_artifact_health.py`, update `_write_minimal_rollout` to optionally write final-output metadata: + +```python +def _write_minimal_rollout( + root: Path, + *, + task_count: int = 1, + evaluation_rows: list[dict] | None = None, + resource_rows: list[dict] | None = None, +) -> None: + ... + execution_ids = [str(uuid4()) for _ in range(task_count)] + ... + _write_jsonl( + db / "run_task_executions.jsonl", + [ + { + "id": execution_ids[idx], + "task_slug": f"task-{idx}", + "status": "completed", + } + for idx in range(task_count) + ], + ) + ... 
+ _write_jsonl( + db / "run_resources.jsonl", + resource_rows + if resource_rows is not None + else [ + { + "id": str(uuid4()), + "task_execution_id": execution_ids[0], + "name": "report.md", + "metadata_json": {"sandbox_origin": "/workspace/final_output/report.md"}, + } + ], + ) +``` + +Add: + +```python +def test_artifact_health_detects_final_output_by_task_resource_metadata(tmp_path: Path) -> None: + execution_id = str(uuid4()) + _write_minimal_rollout( + tmp_path, + task_count=1, + evaluation_rows=[ + { + "id": str(uuid4()), + "score": 0.75, + "summary_json": { + "evaluator_name": "research-rubric", + "normalized_score": 0.75, + "criterion_results": [ + { + "criterion_name": "criterion_0", + "criterion_type": "researchrubrics-llm-judge", + "score": 1.0, + "max_score": 1.0, + "passed": True, + "weight": 1.0, + "status": "passed", + "criterion_description": "Includes citations.", + "feedback": "The report cited source material.", + } + ], + }, + } + ], + resource_rows=[ + { + "id": str(uuid4()), + "task_execution_id": execution_id, + "name": "report.md", + "file_path": "/tmp/ergon-blob/abc", + "metadata_json": {"sandbox_origin": "/workspace/final_output/report.md"}, + } + ], + ) +``` + +If `_write_minimal_rollout` generates execution ids internally, return them from the helper or pass explicit ids. Keep the test focused: final-output detection must use `metadata_json.sandbox_origin`, not durable `file_path`. + +Run: + +```bash +uv run pytest tests/unit/runtime/test_real_llm_rollout_artifact_health.py tests/real_llm/test_artifact_health.py -q +``` + +Expected: all tests pass. + +--- + +## Task 4: Preserve Evaluator-Normalized Scores + +**Files:** +- Modify: `ergon_core/ergon_core/core/runtime/services/evaluation_persistence_service.py` +- Modify: `ergon_builtins/ergon_builtins/benchmarks/researchrubrics/rubric.py` +- Test: `tests/unit/runtime/test_evaluation_summary_contracts.py` +- Test: `tests/unit/state/test_research_rubrics_benchmark.py` + +### Rationale + +New standard: all evaluators return normalized scalar scores in `TaskEvaluationResult.score`. Persistence must record, not reinterpret, that score. + +Current bug: + +```python +total_score = result.score +normalized = total_score / max_score_total if max_score_total > 0 else 0.0 +``` + +For ResearchRubrics, `result.score` is already normalized, so this divides twice. 
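+
+A worked illustration with the same numbers the tests below use (an evaluator-normalized score of `0.5` against a rubric whose criterion max scores sum to `2.0`):
+
+```python
+result_score = 0.5       # evaluator output, already normalized to [0, 1]
+max_score_total = 2.0    # rubric display metadata only
+
+old_normalized = result_score / max_score_total  # 0.25, divided a second time
+new_normalized = result_score                    # 0.5, preserved as-is
+```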
+
+### Patch: Persistence
+
+In `build_evaluation_summary`, replace:
+
+```python
+total_score = result.score
+normalized = total_score / max_score_total if max_score_total > 0 else 0.0
+```
+
+with:
+
+```python
+normalized = result.score
+```
+
+Keep `max_score_total` as rubric display metadata:
+
+```python
+return EvaluationSummary(
+    evaluator_name=result.evaluator_name,
+    max_score=max_score_total,
+    normalized_score=normalized,
+    stages_evaluated=len(stage_names),
+    stages_passed=stages_passed,
+    criterion_results=entries,
+)
+```
+
+### Patch: ResearchRubrics Metadata
+
+In `ergon_builtins/ergon_builtins/benchmarks/researchrubrics/rubric.py`, keep the formula and add explicit score metadata:
+
+```python
+return TaskEvaluationResult(
+    task_slug=task.task_slug,
+    score=normalized_score,
+    passed=total_score > 0,
+    evaluator_name=self.name,
+    criterion_results=results,
+    metadata={
+        "score_scale": "normalized_0_1",
+        "raw_score": total_score,
+        "max_possible": max_possible,
+        "min_possible": min_possible,
+    },
+)
+```
+
+### Tests
+
+In `tests/unit/runtime/test_evaluation_summary_contracts.py`, add:
+
+```python
+def test_build_evaluation_summary_preserves_evaluator_normalized_score() -> None:
+    summary = build_evaluation_summary(
+        _service_result(
+            feedback="criterion ran",
+            criterion_score=0.5,
+            criterion_weight=2.0,
+            passed=True,
+        ),
+        evaluation_input=None,
+    )
+
+    assert summary.normalized_score == 0.5
+    assert summary.max_score == 2.0
+```
+
+To make this test prove the no-double-normalization contract, change the helper's `CriterionSpec` for this test case from `max_score=1.0` to `max_score=2.0`, so `max_score_total` becomes `2.0`; the `summary.max_score == 2.0` assertion above reflects that change. With the old implementation, `summary.normalized_score` would be `0.25`; with the new contract, it remains `0.5`.
+
+In `tests/unit/state/test_research_rubrics_benchmark.py`, update expected metadata:
+
+```python
+assert result.metadata == {
+    "score_scale": "normalized_0_1",
+    "raw_score": 2.0,
+    "max_possible": 2.0,
+    "min_possible": -1.0,
+}
+```
+
+Run:
+
+```bash
+uv run pytest tests/unit/runtime/test_evaluation_summary_contracts.py tests/unit/state/test_research_rubrics_benchmark.py -q
+```
+
+Expected: all tests pass.
+
+---
+
+## Task 5: Verify With One Real Rollout
+
+**Files:**
+- No new code files.
+
+### Commands
+
+Run focused checks:
+
+```bash
+uv run pytest \
+  tests/unit/state/test_criterion_runtime_di.py \
+  tests/unit/state/test_research_rubrics_benchmark.py \
+  tests/unit/runtime/test_evaluation_summary_contracts.py \
+  tests/unit/runtime/test_real_llm_rollout_artifact_health.py \
+  tests/real_llm/test_artifact_health.py \
+  -q
+```
+
+Expected: all tests pass.
+
+Run lint/compile for touched files:
+
+```bash
+uv run ruff check \
+  ergon_core/ergon_core/api/criterion_runtime.py \
+  ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py \
+  ergon_core/ergon_core/core/runtime/services/evaluation_persistence_service.py \
+  ergon_builtins/ergon_builtins/benchmarks/researchrubrics/judge_criterion.py \
+  ergon_builtins/ergon_builtins/benchmarks/researchrubrics/rubric.py \
+  tests/real_llm/artifact_health.py \
+  tests/unit/state/test_criterion_runtime_di.py \
+  tests/unit/state/test_research_rubrics_benchmark.py \
+  tests/unit/runtime/test_evaluation_summary_contracts.py \
+  tests/unit/runtime/test_real_llm_rollout_artifact_health.py
+```
+
+Expected: `All checks passed!`
+
+Run compile:
+
+```bash
+uv run python -m compileall -q \
+  ergon_core/ergon_core/api/criterion_runtime.py \
+  ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py \
+  ergon_core/ergon_core/core/runtime/services/evaluation_persistence_service.py \
+  ergon_builtins/ergon_builtins/benchmarks/researchrubrics/judge_criterion.py \
+  ergon_builtins/ergon_builtins/benchmarks/researchrubrics/rubric.py \
+  tests/real_llm/artifact_health.py
+```
+
+Expected: exit code `0`.
+
+After rebuild, rerun one real sample:
+
+```bash
+ERGON_REAL_LLM=1 \
+ERGON_REAL_LLM_MODEL=openrouter:anthropic/claude-opus-4.7 \
+ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react \
+ERGON_REAL_LLM_LIMIT=1 \
+ERGON_REAL_LLM_BUDGET_USD=25 \
+TEST_HARNESS_SECRET=real-llm-secret \
+uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py --assume-stack-up -vv -s
+```
+
+Expected rollout properties:
+
+- terminal status is `completed`;
+- artifact health reports `missing_final_report: False`;
+- `summary.normalized_score` matches `RunTaskEvaluation.score`;
+- criterion `evaluated_resource_ids` contains the report resource id;
+- judge feedback references details from the full final report, not just the final assistant summary.
+
+---
+
+## Non-Goals
+
+- Do not put final-vs-scratch classification in `ergon_core`.
+- Do not include full agent conversation in ResearchRubrics judge prompts by default.
+- Do not introduce a new persisted table for evidence bundles.
+- Do not preserve compatibility with double-normalized summary scores; new runs should use the normalized score invariant.
diff --git a/docs/superpowers/plans/2026-04-28-mas-rebase-regression-recovery.md b/docs/superpowers/plans/2026-04-28-mas-rebase-regression-recovery.md
new file mode 100644
index 00000000..f5475b3a
--- /dev/null
+++ b/docs/superpowers/plans/2026-04-28-mas-rebase-regression-recovery.md
@@ -0,0 +1,386 @@
+# MAS Rebase Regression Recovery Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Recover changes lost or blurred during the `feature/mas-main-rebase` merge, without undoing intentional main-branch experiment-run work.
+
+**Architecture:** Treat this as a rebase audit and repair plan. Definite regressions get direct test-first fixes; the older object-first `ExperimentRunHandle` / `Experiment.run()` API is intentionally retired in favor of the newer experiment definition and launch services.
+
+**Tech Stack:** Python 3.13, Pydantic, SQLModel, pytest, uv, Ergon core/runtime/API packages.
+ +--- + +## Audit Summary + +The rebase worktree is clean at `feature/mas-main-rebase` with `HEAD` at `ab28db3` (`Merge main into MAS debugger branch`). The broad cleanup survived, but two regressions need action. + +### Preserved Work + +- Public API thinning survived: removed `ergon_core.api.generation`, `json_types`, `run_resource`, `criterion_runtime`, `dependencies`, and `types`. +- Runtime homes survived: `core/runtime/resources.py`, `core/runtime/dependencies.py`, and `core/runtime/evaluation/protocols.py`. +- Context schema consolidation survived: `ContextPart`, `ContextPartChunk`, and `ContextPartChunkLog` are the core stream/log schemas; old `GenerationTurn` and old `*Payload` context-event classes are gone from core. +- File moves survived: Inngest client/registry under `core/runtime/inngest/`, sandbox under `core/sandbox/`, ResearchRubrics sandbox manager under builtins, OpenRouter budget under `tests/real_llm`, and tracing split into `core/runtime/tracing/`. +- `error_payload.py`, `build_error_json`, `RuntimeErrorPayload`, and `_worker_execute_result_from_exception` remain removed. + +### Definite Regression + +`_worker_execute_result_from_output()` has reappeared in `ergon_core/ergon_core/core/runtime/inngest/worker_execute.py`, along with `tests/unit/runtime/test_worker_execute_output_failure.py`. + +Today's intended state was: + +- No private adapter helper for `WorkerOutput -> WorkerExecuteResult`. +- Success result construction inlined at the only callsite. +- No helper-level test importing `_worker_execute_result_from_output`. + +### Intentional Retirement + +`ExperimentRunHandle` and `Experiment.run()` existed on `safety/mas-before-main-rebase`, but are absent in `feature/mas-main-rebase`. + +Current state: + +- `ergon_core/ergon_core/api/handles.py` defines only `PersistedExperimentDefinition`. +- `ergon_core/ergon_core/api/__init__.py` exports only `PersistedExperimentDefinition`, not `ExperimentRunHandle`. +- `ergon_core/ergon_core/api/experiment.py` exposes `persist()` but no `run()`. +- Main added experiment launch/read services under `core/runtime/services/experiment_*`, and that newer model is the one we want to keep. + +Decision: do **not** restore `ExperimentRunHandle` or `Experiment.run()`. Treat the older object-run API as retired. The fix is to remove stale handle/run wording and add tests that prevent the old single-run handle from returning to `ergon_core.api`. + +--- + +## Files To Touch + +### Definite Helper Regression + +- Modify: `ergon_core/ergon_core/core/runtime/inngest/worker_execute.py` +- Delete: `tests/unit/runtime/test_worker_execute_output_failure.py` +- Modify or add guard: `tests/unit/runtime/test_import_boundaries.py` or `tests/unit/architecture/test_public_api_boundaries.py` + +### Experiment Handle Retirement + +- Modify: `ergon_core/ergon_core/api/handles.py` docstring +- Modify/add API boundary test confirming no `ExperimentRunHandle` / no `Experiment.run` +- Update docs that still describe `run()` as part of the object-first authoring API. 
+ +--- + +## Task 1: Lock In The Helper Removal Regression + +**Files:** +- Modify: `tests/unit/runtime/test_import_boundaries.py` +- Modify: `ergon_core/ergon_core/core/runtime/inngest/worker_execute.py` +- Delete: `tests/unit/runtime/test_worker_execute_output_failure.py` + +- [ ] **Step 1: Add a failing guard for deleted worker helper adapters** + +Add this test to `tests/unit/runtime/test_import_boundaries.py`: + +```python +def test_worker_execute_does_not_expose_result_adapter_helpers() -> None: + import ergon_core.core.runtime.inngest.worker_execute as worker_execute + + assert not hasattr(worker_execute, "_worker_execute_result_from_output") + assert not hasattr(worker_execute, "_worker_execute_result_from_exception") +``` + +- [ ] **Step 2: Run the guard and verify it fails before the fix** + +Run: + +```bash +uv run pytest tests/unit/runtime/test_import_boundaries.py::test_worker_execute_does_not_expose_result_adapter_helpers -q +``` + +Expected before fix: + +```text +FAILED ... assert not hasattr(worker_execute, "_worker_execute_result_from_output") +``` + +- [ ] **Step 3: Inline the success result construction** + +In `ergon_core/ergon_core/core/runtime/inngest/worker_execute.py`, remove: + +```python +def _worker_execute_result_from_output(output: WorkerOutput) -> WorkerExecuteResult: + return WorkerExecuteResult( + success=output.success, + final_assistant_message=output.output, + error=None if output.success else output.output, + ) +``` + +Then replace: + +```python +return _worker_execute_result_from_output(output) +``` + +with: + +```python +return WorkerExecuteResult( + success=output.success, + final_assistant_message=output.output, + error=None if output.success else output.output, +) +``` + +Also remove the now-unused import: + +```python +from ergon_core.api.results import WorkerOutput +``` + +- [ ] **Step 4: Delete helper-specific test** + +Delete: + +```text +tests/unit/runtime/test_worker_execute_output_failure.py +``` + +This test asserts a private helper mapping and should not survive once the helper is gone. The behavior is still covered by `worker_execute_fn` return construction and `WorkerExecuteResult` model validation. + +- [ ] **Step 5: Run focused verification** + +Run: + +```bash +uv run pytest tests/unit/runtime/test_import_boundaries.py tests/unit/runtime/test_failure_error_json.py -q +uv run ruff check ergon_core/ergon_core/core/runtime/inngest/worker_execute.py tests/unit/runtime/test_import_boundaries.py +``` + +Expected: + +```text +passed +All checks passed! +``` + +--- + +## Task 2: Lock In The New Experiment Launch Model + +**Files:** +- Inspect: `ergon_core/ergon_core/api/experiment.py` +- Inspect: `ergon_core/ergon_core/api/handles.py` +- Inspect: `ergon_core/ergon_core/core/runtime/services/run_service.py` +- Inspect: `ergon_core/ergon_core/core/runtime/services/experiment_launch_service.py` +- Inspect: `ergon_cli/ergon_cli/commands/benchmark.py` + +- [ ] **Step 1: Confirm current execution entry points** + +Run: + +```bash +rg "class ExperimentRunHandle|async def run\\(|create_experiment_run|launch" \ + ergon_core/ergon_core/api \ + ergon_core/ergon_core/core/runtime/services \ + ergon_cli/ergon_cli/commands \ + tests -n +``` + +Expected current signal: + +- `ExperimentRunHandle` appears only as a CLI-local class in `ergon_cli/ergon_cli/commands/benchmark.py`. +- `Experiment` has `persist()` but no `run()`. +- Main-branch experiment services own launch/read behavior. 
+ +Step 1 confirms that the newer model is active: + +- `ExperimentRecord` stores the experiment campaign/sample selection. +- `ExperimentLaunchService.run_experiment()` expands one `ExperimentRecord` into many `RunRecord`s. +- `ExperimentRunResult` returns `run_ids: list[UUID]`, not a single `run_id`. +- `ergon_core.api.Experiment` remains a workflow-definition composition object with `persist()` only. + +- [ ] **Step 2: Write a guard for the retired object-run API** + +Add tests to `tests/unit/api/test_public_api_imports.py`: + +```python +def test_object_first_experiment_run_api_is_retired() -> None: + public_api = importlib.import_module("ergon_core.api") + + assert not hasattr(public_api, "ExperimentRunHandle") + assert not hasattr(public_api.Experiment, "run") +``` + +- [ ] **Step 3: Clean stale handle wording** + +Update `ergon_core/ergon_core/api/handles.py` docstring from: + +```python +"""Public lifecycle handle types returned by persist() and run().""" +``` + +to: + +```python +"""Public lifecycle handle types returned by Experiment.persist().""" +``` + +- [ ] **Step 4: Run focused API verification** + +Run: + +```bash +uv run pytest tests/unit/api/test_public_api_imports.py -q +``` + +Expected: + +```text +passed +``` + +--- + +## Task 3: Add A Rebase Recovery Guard For Historical Regressions + +**Files:** +- Modify: `tests/unit/architecture/test_public_api_boundaries.py` +- Modify: `tests/unit/runtime/test_import_boundaries.py` + +- [ ] **Step 1: Guard deleted API facade modules by module spec** + +Add to `tests/unit/architecture/test_public_api_boundaries.py`: + +```python +import importlib.util + + +def test_removed_api_facade_modules_do_not_exist() -> None: + removed_modules = ( + "ergon_core.api.generation", + "ergon_core.api.json_types", + "ergon_core.api.run_resource", + "ergon_core.api.criterion_runtime", + "ergon_core.api.dependencies", + "ergon_core.api.types", + ) + + for module_name in removed_modules: + assert importlib.util.find_spec(module_name) is None +``` + +- [ ] **Step 2: Guard worker private adapter helpers** + +Use the helper guard from Task 1. + +- [ ] **Step 3: Run architecture guards** + +Run: + +```bash +uv run pytest tests/unit/architecture/test_public_api_boundaries.py tests/unit/runtime/test_import_boundaries.py -q +``` + +Expected: + +```text +passed +``` + +--- + +## Task 4: Final Verification + +**Files:** +- All touched files from Tasks 1-3. +- Verify: `tests/integration/smokes/test_smoke_harness.py` +- Verify: `tests/e2e/` + +- [ ] **Step 1: Run focused test group** + +Run: + +```bash +uv run pytest \ + tests/unit/api/test_public_api_imports.py \ + tests/unit/architecture/test_public_api_boundaries.py \ + tests/unit/runtime/test_import_boundaries.py \ + tests/unit/runtime/test_failure_error_json.py \ + -q +``` + +Expected: + +```text +passed +``` + +- [ ] **Step 2: Run targeted lint** + +Run: + +```bash +uv run ruff check \ + ergon_core/ergon_core/core/runtime/inngest/worker_execute.py \ + ergon_core/ergon_core/api/handles.py \ + ergon_core/ergon_core/api/__init__.py \ + ergon_core/ergon_core/api/experiment.py \ + tests/unit/api/test_public_api_imports.py \ + tests/unit/architecture/test_public_api_boundaries.py \ + tests/unit/runtime/test_import_boundaries.py +``` + +Expected: + +```text +All checks passed! 
+``` + +- [ ] **Step 3: Run local integration/e2e acceptance for the newer cohort -> experiment -> run model** + +Use this as the main system-level confidence metric for the rebase: + +> A local checkout can define an experiment through the newer cohort/experiment model, launch runs for selected samples, drive those runs through the runtime, persist graph/evaluation/resource outputs, and pass the e2e smoke path without relying on retired `Experiment.run()` / `ExperimentRunHandle`. + +Run the local smoke/e2e set used by this branch: + +```bash +uv run pytest tests/integration/smokes/test_smoke_harness.py -q +uv run pytest tests/e2e -q +``` + +Expected: + +```text +passed +``` + +If the e2e suite requires local services, start the normal local stack first, then rerun the same commands. A failure here is a blocker unless it is a documented environment prerequisite rather than a model/API regression. + +- [ ] **Step 4: Check git diff for scope** + +Run: + +```bash +git diff --stat +git diff --name-status +``` + +Expected changed files should be limited to: + +- `docs/superpowers/plans/2026-04-28-mas-rebase-regression-recovery.md` +- `ergon_core/ergon_core/core/runtime/inngest/worker_execute.py` +- `tests/unit/runtime/test_import_boundaries.py` +- `tests/unit/runtime/test_worker_execute_output_failure.py` deleted +- plus the accept-main guard/docstring files from Task 2. + +--- + +## Non-Goals + +- Do not reintroduce `ergon_core.api.generation`, `json_types`, `run_resource`, `criterion_runtime`, `dependencies`, or `types`. +- Do not reintroduce `error_payload.py`, `build_error_json`, or `RuntimeErrorPayload`. +- Do not undo main's experiment-run domain model or revive `ExperimentRunHandle` / `Experiment.run()`. +- Do not edit historical docs/RFCs unless they are actively misleading for the current public API. + +## Completion Criteria + +- `_worker_execute_result_from_output` and `_worker_execute_result_from_exception` are absent. +- `test_worker_execute_output_failure.py` is deleted or rewritten to avoid private helper imports. +- Public API state around `ExperimentRunHandle` is explicit and tested as intentionally absent. +- Local smoke/e2e tests pass through the newer `cohort -> experiment -> run` model without using the retired object-run API. +- Focused pytest and ruff checks pass. diff --git a/docs/superpowers/plans/2026-04-28-public-api-audit-and-ergonomics.md b/docs/superpowers/plans/2026-04-28-public-api-audit-and-ergonomics.md new file mode 100644 index 00000000..6c7de628 --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-public-api-audit-and-ergonomics.md @@ -0,0 +1,1556 @@ +# Public API Audit And Ergonomics Working Doc + +This is a working document for deciding what belongs in `ergon_core.api`, what should move inward to `ergon_core.core`, and what concepts can be merged so the API is easier for students and benchmark authors to use. + +The goal is not to make the public API artificially tiny. The goal is to make it honest. A public symbol should either be: + +- something a benchmark author uses to describe work, +- something a worker author uses to solve work, +- something an evaluator author uses to score work, +- or a deliberately documented advanced extension point. + +Everything else should probably be core, CLI, dashboard, persistence, or runtime plumbing. 
+ +## Current Public API Root + +`ergon_core.api.__all__` currently exports: + +```python +Benchmark +BenchmarkDeps +BenchmarkTask +Criterion +CriterionResult +CriteriaCheckError +DependencyError +EvaluationContext +Evaluator +Experiment +EmptyTaskPayload +PersistedExperimentDefinition +Rubric +TaskEvaluationResult +Worker +WorkerContext +WorkerOutput +WorkerSpec +``` + +Submodule-only public-ish symbols currently used or plausibly imported: + +```python +CriterionScoreSpec +CriterionObservation +CriterionObservationMessage +``` + +Important existing boundary tests: + +- `tests/unit/api/test_public_api_imports.py` already asserts that runtime/tooling concepts like `RunResourceView`, `CriterionRuntime`, `CommandResult`, `SandboxResult`, and `Tool` are not exposed at the root. +- `tests/unit/architecture/test_public_api_boundaries.py` already protects against restoring deleted facade modules like `api.generation`, `api.json_types`, `api.run_resource`, `api.criterion_runtime`, `api.dependencies`, and `api.types`. + +That means the codebase already wants `ergon_core.api` to stay authoring-scoped. The current issue is that some exported authoring-looking objects still pull runtime/persistence concepts through the side door. + +## Current Mental Model + +The current public API effectively asks users to understand this: + +```text +Benchmark -> BenchmarkTask -> Experiment -> WorkerSpec -> persisted definition -> run +Worker -> WorkerContext -> streamed core generation chunks -> WorkerOutput +Criterion -> EvaluationContext -> core CriterionRuntime -> CriterionResult +Evaluator/Rubric -> TaskEvaluationResult +``` + +The student-facing model we probably want is closer to: + +```text +Benchmark -> Task +Worker solves Task +Criterion checks WorkerOutput +Rubric combines Criteria +Core handles experiments, runs, cohorts, persistence, dispatch, and dashboards +``` + +## Usage Map At A Glance + +### CLI + +The built-in CLI imports only a small part of `ergon_core.api` directly: + +- `ergon_cli/ergon_cli/composition/__init__.py` + - imports `Experiment` + - imports `WorkerSpec` +- `ergon_cli/ergon_cli/onboarding/profile.py` + - imports `BenchmarkDeps` + +The CLI otherwise reaches straight into `ergon_core.core` for: + +- DB setup and sessions, +- telemetry models such as `RunRecord`, +- `create_run`, +- cohort resolution, +- Inngest event dispatch, +- experiment define/launch/read services, +- workflow services, +- runtime settings. + +This is a useful signal. `ergon_core.api` is not really the CLI API today. The CLI already operates at the application/runtime layer. + +### Built-ins + +`ergon_builtins` uses the public API heavily as an extension-authoring kit: + +- Benchmarks subclass `Benchmark` and create `BenchmarkTask`. +- Workers subclass `Worker` and receive `WorkerContext`. +- Criteria subclass `Criterion`, receive `EvaluationContext`, and return `CriterionResult`. +- Rubrics subclass `Rubric` and return `TaskEvaluationResult`. +- Registries type their maps as `Benchmark`, `Evaluator`, and `Worker`. +- Onboarding metadata uses `BenchmarkDeps`. + +This is the strongest argument that `Benchmark`, `BenchmarkTask`, `Worker`, `WorkerContext`, `WorkerOutput`, `Criterion`, `CriterionResult`, `CriterionScoreSpec`, `Rubric`, and `TaskEvaluationResult` should remain public or have very deliberate replacements. 
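+
+For orientation, a minimal sketch of that authoring kit as a built-in might use it. The class names come from this audit; the exact `build_instances()` signature and payload wiring are assumptions drawn from the symbol review below, not a verified example:
+
+```python
+from ergon_core.api import Benchmark, BenchmarkTask, EmptyTaskPayload
+
+
+class ToyBenchmark(Benchmark):
+    # `type_slug` identifies the benchmark in registries and persisted slugs.
+    type_slug = "toy"
+    # `task_payload_model` validates structured task data; empty here.
+    task_payload_model = EmptyTaskPayload
+
+    def build_instances(self):
+        # Mapping of instance key -> tasks, per the contract reviewed below.
+        return {
+            "default": [
+                BenchmarkTask(
+                    task_slug="hello",
+                    instance_key="default",
+                    description="Produce a short greeting.",
+                )
+            ]
+        }
+```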
+ +### Core Runtime + +Core runtime imports public API types in several places: + +- `core/runtime/inngest/worker_execute.py` + - uses `BenchmarkTask`, `EmptyTaskPayload`, `WorkerContext` +- `core/runtime/evaluation/inngest_executor.py` + - uses `Criterion`, `EvaluationContext`, `CriterionResult`, `WorkerOutput`, `BenchmarkTask` +- `core/runtime/evaluation/evaluation_schemas.py` + - uses `Criterion` +- `core/runtime/services/rubric_evaluation_service.py` + - uses `Evaluator`, `CriterionResult`, `TaskEvaluationResult`, `BenchmarkTask` +- `core/runtime/services/experiment_persistence_service.py` + - uses `Rubric`, `PersistedExperimentDefinition`, and type-checks `Experiment` +- `core/runtime/services/experiment_launch_service.py` + - uses `Benchmark`, `Evaluator`, `Experiment`, `PersistedExperimentDefinition`, `BenchmarkTask`, `WorkerSpec` +- `core/runtime/services/experiment_definition_service.py` + - uses `Benchmark`, `BenchmarkTask` +- `core/runtime/services/run_service.py` + - uses `PersistedExperimentDefinition` + +Some of that is fine: core runtime naturally consumes public authoring objects. But the reverse direction is more concerning: public API modules also import core runtime/persistence modules. + +### Tests + +Tests use almost every current public type: + +- API contract tests cover imports and public API boundary behavior. +- Runtime tests instantiate criteria, rubrics, contexts, tasks, and result models. +- Built-in benchmark tests instantiate `Benchmark`, `BenchmarkTask`, `BenchmarkDeps`, `EvaluationContext`, `WorkerOutput`, and result models. +- Worker tests use `WorkerContext`, `BenchmarkTask`, and `EmptyTaskPayload`. +- Runtime service tests use `PersistedExperimentDefinition`. + +This means simplification should be staged. Move internal users first, leave compatibility imports where useful, then adjust tests around the intended boundary. + +## Public File Inventory + +```text +ergon_core/ergon_core/api/ + __init__.py + exports the object-first public surface + + benchmark.py + Benchmark base class + currently also validates required packages via core runtime dependencies + + benchmark_deps.py + BenchmarkDeps onboarding metadata + + task_types.py + EmptyTaskPayload + BenchmarkTask + + worker.py + Worker base class + currently imports core generation chunk types + currently reads persisted context events to build default output + + worker_context.py + WorkerContext execution identity model + + worker_spec.py + WorkerSpec config-time registry descriptor + imports ergon_builtins registry during validation + + criterion.py + Criterion base class + currently validates required packages via core runtime dependencies + + evaluation_context.py + EvaluationContext for criteria + currently exposes core CriterionRuntime protocol as a field + + evaluator.py + Evaluator base class + Rubric concrete class + currently validates required packages via core runtime dependencies + + results.py + WorkerOutput + CriterionScoreSpec + CriterionObservationMessage + CriterionObservation + CriterionResult + TaskEvaluationResult + currently imports core JsonObject + + experiment.py + Experiment composition root + validates object graph + persists through core ExperimentPersistenceService + + handles.py + PersistedExperimentDefinition handle returned by Experiment.persist() + imports core utcnow helper + + errors.py + DependencyError + CriteriaCheckError +``` + +## Symbol By Symbol Review + +### `Benchmark` + +Current role: + +- Public base class for benchmark authors. 
+- Owns `type_slug`, `task_payload_model`, `build_instances()`, `evaluator_requirements()`, `parse_task_payload()`, and dependency validation. + +Where used: + +- Built-in benchmarks: MiniF2F, SWE-Bench Verified, ResearchRubrics, GDPEval. +- Core experiment definition and launch services. +- Registries type benchmark constructors. +- Tests for benchmark contracts and runtime services. + +Keep in public API? + +- Yes. + +Concerns: + +- The name is good for benchmark authors. +- `build_instances()` returning `Mapping[str, Sequence[BenchmarkTask]]` introduces "instance" as an extra concept. That may be necessary for benchmark datasets, but it is one more noun. +- `evaluator_requirements()` exposes evaluator slot binding to benchmark authors. +- `validate()` imports `core.runtime.dependencies.check_packages`. + +Possible cleanup: + +- Keep `Benchmark` public. +- Consider making `evaluator_requirements()` advanced or replacing it with a simpler `default_evaluator_slots = ("default",)` class var. +- Decide whether benchmark authors should declare dependency metadata as: + - `required_packages` plus `install_hint`, + - `onboarding_deps`, + - or one consolidated `requirements` object. +- Move dependency validation implementation inward so `api.benchmark` does not import core runtime. + +Decision question: + +- Should a student writing a benchmark need to know about evaluator binding keys, or should benchmarks just produce tasks and let the experiment/CLI layer attach rubrics? + +### `BenchmarkTask` And `EmptyTaskPayload` + +Current role: + +- `BenchmarkTask` is the public task object passed to workers and criteria. +- `EmptyTaskPayload` is the default Pydantic payload when a benchmark has no structured task data. + +Where used: + +- All built-in benchmarks create `BenchmarkTask`. +- Built-in workers consume `BenchmarkTask`. +- Built-in criteria and rubrics receive task objects. +- Core runtime reconstructs `BenchmarkTask` from persisted task rows. +- Many tests instantiate it directly. + +Keep in public API? + +- Yes. + +Concerns: + +- The name `BenchmarkTask` is precise but slightly more formal than necessary for students. +- It contains `instance_key`, `parent_task_slug`, `dependency_task_slugs`, and `evaluator_binding_keys`, which are runtime/workflow concepts mixed into the authoring task model. + +Possible cleanup: + +- Keep `BenchmarkTask` for compatibility. +- Consider a friendlier alias: + +```python +Task = BenchmarkTask +``` + +- Longer term, split: + - public `Task`: slug, description, payload, + - advanced/internal `WorkflowTaskSpec`: parent/dependencies/evaluator bindings/instance key. + +Decision question: + +- Are task dependencies and evaluator bindings part of the beginner benchmark-authoring story, or are they an advanced workflow story? + +### `Worker` + +Current role: + +- Public base class for workers. +- Authors implement `execute(task, context=...)`. +- `execute()` yields `ContextPartChunk` objects. +- Default `get_output()` reads context events from the database and extracts the last assistant text. + +Where used: + +- Built-in ReAct worker and training stub worker subclass it. +- Smoke fixtures subclass it. +- Registries type worker constructors. +- Core runtime instantiates workers in `worker_execute.py`. +- Tests assert worker contracts. + +Keep in public API? + +- Yes, but slim it down. 
+ +Concerns: + +- `api.worker` imports: + - `core.generation.AssistantTextPart` + - `core.generation.ContextPartChunk` + - `core.persistence.context.repository.ContextEventRepository` + - `core.persistence.shared.db.get_session` + - `core.runtime.dependencies.check_packages` +- That means the public base class knows persistence and generation internals. +- Students writing a worker must understand streaming chunks, not just "return an answer". + +Possible cleanup: + +- Keep `Worker` public. +- Move DB-backed default output extraction to core runtime, probably near `worker_execute.py`. +- Decide whether beginner workers can implement a simpler method: + +```python +async def run(self, task: Task, context: WorkerContext) -> WorkerOutput: + ... +``` + +while advanced workers implement streaming: + +```python +async def execute(self, task: Task, *, context: WorkerContext) -> AsyncGenerator[ContextPartChunk, None]: + ... +``` + +- If streaming remains public, either: + - intentionally export the chunk type as an advanced public type, + - or define a small public event/chunk model that core adapts into context events. + +Decision question: + +- Should the student-facing worker API be "return a WorkerOutput" first, with streaming as advanced, or should all workers remain streaming-first? + +### `WorkerContext` + +Current role: + +- Public model passed to `Worker.execute()`. +- Contains `run_id`, `definition_id`, `task_id`, `execution_id`, `sandbox_id`, `node_id`, and metadata. + +Where used: + +- Built-in workers. +- Built-in tools such as workflow CLI tooling. +- Core runtime worker execution. +- Tests. + +Keep in public API? + +- Yes, but possibly with fewer fields. + +Concerns: + +- `definition_id` and `node_id` are graph/runtime concepts. +- `task_id` is nullable for dynamic subtasks, while `execution_id` is always present. That distinction is important to core but awkward to explain to students. + +Possible cleanup: + +- Public `WorkerContext` could expose: + - `run_id` + - `task_id` or `execution_id` + - `sandbox_id` + - `metadata` +- Internal `CoreWorkerContext` could add: + - `definition_id` + - `node_id` + - static-vs-dynamic task identity. + +Decision question: + +- Which IDs do worker authors actually need in normal code? If most only need `sandbox_id` and maybe `execution_id`, hide the rest. + +### `WorkerOutput` + +Current role: + +- Public result model for worker completion. +- Contains `output`, `success`, and metadata. + +Where used: + +- Built-in workers return it. +- Criteria receive it through `EvaluationContext`. +- Core evaluation executor wraps agent reasoning into it. +- Tests instantiate it. + +Keep in public API? + +- Yes. + +Concerns: + +- Field name `output` is generic but probably fine. +- `success` is useful but can overlap with runtime execution status. + +Possible cleanup: + +- Keep as-is unless we introduce a simpler non-streaming worker API. +- If worker runtime status and worker semantic success diverge, document that `success` means "worker produced a usable answer", not "the process did not crash". + +Decision question: + +- Do we want `WorkerOutput.output` to stay a single string, or should structured outputs become first-class? + +### `Criterion` + +Current role: + +- Public base class for atomic evaluation units. +- Authors implement `evaluate(context) -> CriterionResult`. + +Where used: + +- Built-in criteria for SWE-Bench, MiniF2F, ResearchRubrics, generic code checks, LLM judge, sandbox file check. +- Smoke fixtures. +- Core evaluation executor. 
+- Core evaluation schemas store `Criterion` in `CriterionSpec`. +- Tests. + +Keep in public API? + +- Yes. + +Concerns: + +- `Criterion.evaluate()` depends on `EvaluationContext`, which currently exposes core runtime capability plumbing. +- `validate()` imports core dependency checking. + +Possible cleanup: + +- Keep `Criterion` public. +- Simplify the context it receives. +- Move dependency checking inward or expose it as a small public helper independent of `core`. + +Decision question: + +- Should criteria own sandbox/resource access directly through context helper methods, or should they receive a separate capability object? + +### `EvaluationContext` + +Current role: + +- Public context passed to `Criterion.evaluate()`. +- Contains run/task/execution IDs, `BenchmarkTask`, `WorkerOutput`, sandbox ID, metadata, and optional runtime capability. + +Where used: + +- Built-in criteria. +- Smoke criteria. +- Core Inngest criterion executor. +- Tests for runtime injection and criterion contracts. + +Keep in public API? + +- Probably yes short-term, but redesign it. + +Concerns: + +- It imports `core.runtime.evaluation.protocols.CriterionRuntime`. +- The public field `runtime` means criterion authors can see an internal protocol rather than a stable student-facing capability. +- It duplicates some identity with `WorkerContext`. + +Possible cleanup: + +- Keep the name `EvaluationContext` if we want stability. +- Change the implementation so it owns public helper methods: + +```python +await context.execute_code("pytest -q") +await context.read_resource("answer.txt") +await context.read_resource_by_id(resource_id) +``` + +- Store the internal runtime in a private field, not as a public typed protocol. +- Or rename to `CriterionContext` if we want "criterion evaluates with criterion context" instead of a broader evaluation context. + +Decision question: + +- Is `EvaluationContext` the right public name, or is `CriterionContext` easier for students? + +### `CriterionScoreSpec` + +Current role: + +- Public-ish score range model for criteria. +- Not exported from `ergon_core.api.__all__`, but imported from `ergon_core.api.results` by tests and built-ins. + +Where used: + +- Criteria constructors. +- MiniF2F proof verification. +- Code check and LLM judge criteria. +- Runtime tests. + +Keep in public API? + +- Yes, if criteria remain configurable with score ranges. + +Concerns: + +- It is public by usage but not top-level exported. +- If top-level exports are the documented API, this mismatch is confusing. + +Possible cleanup: + +- Either export it at the root: + +```python +from ergon_core.api import CriterionScoreSpec +``` + +- Or document `ergon_core.api.results.CriterionScoreSpec` as advanced. + +Decision question: + +- Do we want all common authoring types available from `ergon_core.api`, or do we want submodules for less common result/config types? + +### `CriterionResult` + +Current role: + +- Public result of a single criterion. +- Includes score, pass/fail, weight, feedback, evidence IDs, observations, errors, and metadata. + +Where used: + +- Built-in criteria return it. +- Rubrics aggregate it. +- Core evaluation executor returns it from each criterion step. +- Evaluation persistence converts it into persisted summaries. +- Tests. + +Keep in public API? + +- Yes. + +Concerns: + +- It is fairly large for students. +- It overlaps with internal `CriterionResultEntry` in `core.persistence.telemetry.evaluation_summary`. + +Possible cleanup: + +- Keep public `CriterionResult`. 
+- Keep persisted `CriterionResultEntry` internal. +- Centralize conversion in a core adapter so authors only learn `CriterionResult`. +- Consider helper constructors: + +```python +CriterionResult.pass_(slug="...", score=1.0, feedback="...") +CriterionResult.fail(slug="...", feedback="...") +``` + +Decision question: + +- Should we add helper constructors to reduce boilerplate in student-written criteria? + +### `CriterionObservation` And `CriterionObservationMessage` + +Current role: + +- Structured observation models nested inside `CriterionResult`. +- Capture prompt messages, evidence resource/action IDs, model details, and output. + +Where used: + +- ResearchRubrics judge criterion and LLM judge criterion. +- Evaluation summary persistence imports `CriterionObservation`. +- Tests likely inspect summary contracts. + +Keep in public API? + +- Keep in `results.py`, but maybe not root export. + +Concerns: + +- This is useful for advanced LLM-as-judge and audit trails. +- It may be too detailed for the beginner path. +- It imports or depends on JSON object typing from core through `results.py`. + +Possible cleanup: + +- Keep as advanced result detail. +- Move JSON type alias local to public API or use `dict[str, object]` style. + +Decision question: + +- Do students need to produce structured observations, or is this mainly for built-in LLM judges and dashboard evidence? + +### `Rubric` + +Current role: + +- Public concrete evaluator with a fixed list of criteria. +- Aggregates criterion scores with weighted average. + +Where used: + +- Built-in rubrics. +- Smoke rubrics. +- Core persistence checks whether an evaluator is a `Rubric` to snapshot criteria names. +- Core runtime service evaluates via `Evaluator` interface. +- Tests. + +Keep in public API? + +- Yes. + +Concerns: + +- It subclasses `Evaluator`, so users see both `Evaluator` and `Rubric`. +- Public `Rubric` is simple, but `RubricEvaluationService` in core has a similar name and is a runtime runner. +- Built-ins like GDPEval subclass `Rubric` but implement staged gating, which stretches the fixed-list weighted-average base concept. + +Possible cleanup: + +- Make `Rubric` the primary student-facing evaluation concept. +- Consider an explicit `WeightedRubric` name if we add multiple rubric types. +- Rename core `RubricEvaluationService` to `TaskEvaluationService` or `EvaluationRunner` to avoid confusing public rubric with internal service. + +Decision question: + +- Is `Rubric` always "a thing with criteria", or should `Evaluator` be the primary abstraction and `Rubric` just one implementation? + +### `Evaluator` + +Current role: + +- Public ABC for objects that select criteria for a task and aggregate criterion results. +- `Rubric` subclasses it. + +Where used: + +- Built-in registry typing. +- Core evaluation service accepts `Evaluator`. +- Core launch service builds evaluator bindings. +- Custom built-in rubrics inherit through `Rubric`. + +Keep in public API? + +- Maybe. + +Concerns: + +- It is a powerful extension point, but it adds another noun for students. +- Most authors probably need `Rubric`, not arbitrary dynamic evaluators. +- ResearchRubrics does need task-specific criteria via `criteria_for(task)`, which is an evaluator behavior. + +Possible cleanup: + +- Keep `Evaluator` for advanced users. +- Do not feature it in beginner docs. +- Potentially move it to `ergon_core.api.advanced` while `Rubric` stays root-exported. +- Or keep it root-exported because registries and dynamic task-specific rubrics already rely on it. 
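+
+For reference, the "dynamic" case the last bullet defends looks roughly like the sketch below. `criteria_for` is the method named in the ResearchRubrics note above; the constructor and the payload field are illustrative assumptions, and other `Evaluator` hooks (aggregation) are omitted:
+
+```python
+from ergon_core.api import BenchmarkTask, Criterion, Evaluator
+
+
+class DomainAwareEvaluator(Evaluator):
+    """Selects criteria per task instead of evaluating a fixed list."""
+
+    def __init__(
+        self,
+        math_criteria: list[Criterion],
+        default_criteria: list[Criterion],
+    ) -> None:
+        self._math = math_criteria
+        self._default = default_criteria
+
+    def criteria_for(self, task: BenchmarkTask) -> list[Criterion]:
+        # Hypothetical payload field, shown only to make task-specific
+        # criterion selection concrete.
+        if getattr(task.payload, "domain", None) == "math":
+            return self._math
+        return self._default
+```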
+ +Decision question: + +- Do we want external users to write custom dynamic evaluators, or only criteria and rubrics? + +### `TaskEvaluationResult` + +Current role: + +- Public aggregated result for one task after criteria run. + +Where used: + +- Rubrics return it. +- Core runtime persists it. +- Tests. + +Keep in public API? + +- Yes if custom rubrics/evaluators remain public. + +Concerns: + +- It overlaps with `EvaluationSummary`, which is internal persisted/dashboard state. + +Possible cleanup: + +- Keep public. +- Make `EvaluationSummary` clearly internal. +- Add adapter for persistence. + +Decision question: + +- Should rubric authors directly construct `TaskEvaluationResult`, or should Rubric have simpler aggregation hooks? + +### `Experiment` + +Current role: + +- Public composition root binding a benchmark, worker specs, evaluator bindings, assignments, and metadata. +- Validates the object graph. +- Persists itself by lazy-importing `ExperimentPersistenceService` from core. + +Where used: + +- CLI composition builds `Experiment`. +- Core launch service builds a temporary single-sample `Experiment`. +- Core persistence service type-checks it. +- Tests cover launch/persistence behavior. + +Keep in public API? + +- Open question. + +Argument to keep: + +- It is a natural word for users: "I want to run an experiment." +- It provides one object that composes benchmark, workers, and evaluators. +- CLI composition already uses it. + +Argument to move or de-emphasize: + +- It is not an authoring primitive like `Benchmark`, `Worker`, or `Criterion`. +- It exposes binding keys, assignments, evaluator maps, and worker specs. +- `persist()` makes public API depend on core persistence. +- There are already core concepts called `ExperimentRecord` and `ExperimentDefinition`, so the word "Experiment" is overloaded. + +Possible cleanup: + +- Short-term: keep exported for compatibility. +- Medium-term: remove `persist()` from the public object. Use a core service: + +```python +definition = experiment_service.persist(experiment) +``` + +- Long-term: decide whether public users should build `Experiment` directly or use a simpler CLI/app facade: + +```python +ergon.define( + benchmark="minif2f", + worker="react", + rubric="minif2f", + model="openai:gpt-4o", +) +``` + +Decision question: + +- Is `Experiment` a public user composition object, or an internal runtime definition draft? + +My current leaning: + +- Keep `Experiment` public short-term, but make it pure composition with no persistence method. +- If the beginner docs do not need it, do not root-feature it. + +### `WorkerSpec` + +Current role: + +- Config-time descriptor for worker binding. +- Contains `worker_slug`, `name`, and `model`. +- Validates worker slug against `ergon_builtins.registry.WORKERS`. + +Where used: + +- CLI composition. +- Core launch service. +- Experiment composition and persistence. +- Tests. + +Keep in public API? + +- Probably not as a beginner concept. + +Concerns: + +- It is registry/config plumbing. +- It imports builtins registry during validation. +- It exists because live `Worker` requires runtime IDs and cannot be used at config time. + +Possible cleanup: + +- Move to core composition. +- Keep compatibility import for now. 
+- Replace public construction with simpler facade args:
+
+```python
+worker="researchrubrics-workflow-cli-react"
+model="openai:gpt-4o"
+```
+
+Decision question:
+
+- Do external users need to build multi-worker assignment graphs manually, or can that be an advanced/core composition feature?
+
+### `PersistedExperimentDefinition`
+
+Current role:
+
+- Handle returned by `Experiment.persist()`.
+- Contains `definition_id`, benchmark type, worker/evaluator bindings, counts, created timestamp, and metadata.
+
+Where used:
+
+- CLI benchmark command renders it and uses it to create a run.
+- Core run service takes it.
+- Core launch service returns it from the workflow definition factory.
+- Runtime tests instantiate it.
+
+Keep in public API?
+
+- Probably not as a student authoring API.
+
+Concerns:
+
+- It is a persistence/launch handle, not an authoring concept.
+- Its name overlaps with core `ExperimentDefinition` table rows.
+
+Possible cleanup:
+
+- Move to core composition or core service DTOs.
+- Consider rename:
+  - `WorkflowDefinitionHandle`
+  - `DefinitionHandle`
+  - `PersistedDefinition`
+- Keep compatibility import until CLI/core imports are migrated.
+
+Decision question:
+
+- Should users ever see persisted definition handles directly, or should they see run IDs/status objects from CLI/app services?
+
+### `BenchmarkDeps`
+
+Current role:
+
+- Onboarding requirements for a benchmark: E2B, extras, optional keys.
+
+Where used:
+
+- Built-in benchmark class vars.
+- CLI onboarding profile.
+- Benchmark contract tests.
+
+Keep in public API?
+
+- Maybe, but simplify or rehome.
+
+Concerns:
+
+- It conceptually overlaps with `required_packages` and `install_hint`.
+- It is not about defining benchmark tasks. It is about onboarding/install/config.
+- The `Benchmark` docstring says subclasses must set `onboarding_deps`, but `Benchmark` itself does not define/enforce that class var.
+
+Possible cleanup:
+
+- Merge into a single public metadata object:
+
+```python
+requirements = BenchmarkRequirements(
+    packages=("datasets", "huggingface_hub"),
+    extras=("ergon-builtins[data]",),
+    env_keys=("HF_API_KEY",),
+    e2b=True,
+)
+```
+
+- Or keep `BenchmarkDeps` but move to `ergon_core.api.onboarding`.
+
+Decision question:
+
+- Should install/runtime dependencies and onboarding prompts be one concept or two?
+
+### `DependencyError`
+
+Current role:
+
+- Raised when required packages are missing.
+
+Where used:
+
+- Public ABC validation methods.
+- Tests may catch or assert dependency behavior.
+
+Keep in public API?
+
+- Maybe.
+
+Concerns:
+
+- If dependency validation moves inward, public users may not need this exception.
+- But users might want to catch it around benchmark validation.
+
+Possible cleanup:
+
+- Keep if public `.validate()` methods stay.
+- Move if validation becomes core launch-time behavior.
+
+Decision question:
+
+- Is dependency validation part of authoring, or only part of launching/running?
+
+### `CriteriaCheckError`
+
+Current role:
+
+- Domain-level exception that criteria can raise from helpers and catch inside `evaluate()` to return a failed `CriterionResult`.
+
+Where used:
+
+- Smoke fixture criteria.
+- Built-in criterion tests.
+
+Keep in public API?
+
+- Yes.
+
+Concerns:
+
+- The name uses plural "Criteria" even though a single criterion raises it.
+
+Possible cleanup:
+
+- Keep for compatibility.
+- Consider alias: + +```python +CriterionCheckError = CriteriaCheckError +``` + +Decision question: + +- Is the plural name worth correcting with an alias, or not worth the churn? + +## Boundary Problems To Fix + +### Public API Imports Core Persistence + +Worst offender: + +```text +api/worker.py + imports core.persistence.context.repository.ContextEventRepository + imports core.persistence.shared.db.get_session +``` + +Why it matters: + +- A worker author importing `Worker` should not load DB/persistence concerns. +- It creates import-cycle risk. +- It makes the public base class responsible for runtime storage. + +Likely fix: + +- Move default output extraction to core. +- Let worker runtime call a core helper after `execute()` finishes. + +### Public API Imports Core Runtime Protocols + +Offender: + +```text +api/evaluation_context.py + imports core.runtime.evaluation.protocols.CriterionRuntime +``` + +Why it matters: + +- Criteria see an internal runtime protocol as a public field. +- It makes the public context harder to document. + +Likely fix: + +- Make runtime private inside context. +- Expose public methods on context. + +### Public API Imports Builtins Registry + +Offender: + +```text +api/worker_spec.py + validate_spec() imports ergon_builtins.registry.WORKERS +``` + +Why it matters: + +- `ergon_core.api` should not know about built-ins. +- Registry validation is runtime/composition behavior. + +Likely fix: + +- Move `WorkerSpec` to core composition. +- Or inject registry validator from core/CLI. + +### Public API Imports Core Generation Types + +Offender: + +```text +api/worker.py + execute() yields core.generation.ContextPartChunk +``` + +Why it matters: + +- Streaming workers are tightly coupled to Ergon's internal transcript/event model. +- If that is intended, it should be explicitly a public advanced type. + +Likely fix: + +- Decide whether to publicize a stable streaming event type. +- Or add a simpler `run()` API and keep streaming advanced. + +## Consolidation Areas + +### Experiment / Definition / Run / Cohort + +Current nouns: + +```text +Experiment +ExperimentRecord +ExperimentDefinition +PersistedExperimentDefinition +RunRecord +ExperimentCohort +ExperimentCohortStats +``` + +Possible clean story: + +```text +Public: + Benchmark + Worker + Rubric + +Application/CLI: + ExperimentSpec or RunSpec + RunHandle + +Core persistence: + ExperimentRecord + ExperimentDefinition + RunRecord + ExperimentCohort +``` + +Open design choice: + +- If users think in experiments, keep `Experiment` public, but make it a pure spec. +- If students mostly write benchmarks/workers/rubrics, hide experiment composition behind CLI commands or a service facade. + +### Evaluator / Rubric / Evaluation Service + +Current nouns: + +```text +Evaluator +Rubric +RubricEvaluationService +TaskEvaluationResult +EvaluationSummary +CriterionResultEntry +``` + +Possible clean story: + +```text +Public: + Criterion + CriterionResult + Rubric + TaskEvaluationResult + +Advanced public: + Evaluator + +Core: + EvaluationRunner + EvaluationSummary + CriterionResultEntry +``` + +Open design choice: + +- Keep `Evaluator` root-exported if dynamic task-specific evaluators are important. +- Otherwise feature `Rubric` and let custom evaluators live in an advanced namespace. 
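+
+Whichever way that choice goes, the behavior that makes `Rubric` the natural public concept is small. A minimal sketch of the weighted-average aggregation described in the symbol review above, assuming the `score` and `weight` fields listed for `CriterionResult` and an illustrative helper name:
+
+```python
+from ergon_core.api import CriterionResult
+
+
+def weighted_score(results: list[CriterionResult]) -> float:
+    """Weighted average over criterion results, as the public Rubric is described."""
+    total_weight = sum(result.weight for result in results)
+    if total_weight == 0:
+        return 0.0
+    return sum(result.score * result.weight for result in results) / total_weight
+```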
+ +### Task / Instance / Workflow Graph + +Current nouns: + +```text +BenchmarkTask +instance_key +parent_task_slug +dependency_task_slugs +evaluator_binding_keys +ExperimentDefinitionTask +RunTaskExecution +RunGraphNode +``` + +Possible clean story: + +```text +Public beginner: + Task(slug, description, payload) + +Public advanced: + WorkflowTask(parent, dependencies, evaluator_slots) + +Core: + ExperimentDefinitionTask + RunTaskExecution + RunGraphNode +``` + +Open design choice: + +- Do benchmark authors commonly need dependency graphs? +- If yes, keep the fields but document them as advanced. +- If no, split simple task authoring from graph authoring. + +## Ergonomic API Options + +### Option A: Minimal Authoring Root + +Root exports: + +```python +from ergon_core.api import ( + Benchmark, + BenchmarkTask, + EmptyTaskPayload, + Worker, + WorkerContext, + WorkerOutput, + Criterion, + CriterionResult, + CriterionScoreSpec, + Rubric, + TaskEvaluationResult, + CriteriaCheckError, +) +``` + +Advanced imports: + +```python +from ergon_core.api.advanced import Evaluator, Experiment, WorkerSpec +``` + +Pros: + +- Cleanest beginner story. +- Easy to document. +- Makes runtime/composition concepts visibly advanced. + +Cons: + +- More migration churn. +- Built-in registry typing and core services need import updates. +- Existing code that imports `Experiment` from public API needs shims. + +### Option B: Keep Object-First API, But Purify It + +Root exports still include: + +```python +Experiment +WorkerSpec +Evaluator +``` + +But: + +- `Experiment.persist()` moves to a service. +- `WorkerSpec.validate_spec()` moves to core composition. +- `Worker.get_output()` no longer reads DB from public base class. +- `EvaluationContext.runtime` becomes private helper-backed capability. + +Pros: + +- Less disruptive. +- Preserves object-first feel. +- Keeps `Experiment` available for users who naturally want to compose runs in Python. + +Cons: + +- Beginner docs still need to explain more nouns. +- The top-level API remains larger. +- Harder to communicate what is "normal" vs "advanced". + +### Option C: Two Layer Public API + +Root beginner API: + +```python +Benchmark +Task +Worker +WorkerOutput +Criterion +CriterionResult +Rubric +``` + +Explicit composition API: + +```python +from ergon_core.composition import Experiment, WorkerSpec, persist_experiment +``` + +or: + +```python +from ergon_core.app import define_experiment, run_benchmark +``` + +Pros: + +- Honest separation without hiding useful power. +- CLI and notebook users get a supported high-level entrypoint. +- Students can start with authoring and only learn composition when needed. + +Cons: + +- Requires new package/module naming decisions. +- Need to avoid having too many "public APIs". + +My current recommendation: + +- Option C, implemented gradually. +- Keep compatibility re-exports during migration. +- Document `ergon_core.api` as authoring. +- Add a separate high-level app/composition facade for running things. 
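+
+Under Option C, a notebook session might look like the sketch below. Every name here is a proposal from this section rather than an existing API (`ergon_core.app` does not exist yet), so treat it as a statement of intended ergonomics:
+
+```python
+from ergon_core.app import define_experiment, run_benchmark
+
+# Compose and persist in one call; the slugs and model string mirror the
+# facade example in the Experiment review above.
+definition = define_experiment(
+    benchmark="minif2f",
+    worker="react",
+    rubric="minif2f",
+    model="openai:gpt-4o",
+)
+
+# Launch through the same facade instead of low-level services.
+# The shape of the returned handle is illustrative.
+run = run_benchmark(definition)
+print(run.status)
+```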
+ +## Proposed Beginner Docs Shape + +### Writing A Benchmark + +```python +from ergon_core.api import Benchmark, BenchmarkTask + +class MyBenchmark(Benchmark): + type_slug = "my-benchmark" + + def build_instances(self): + return { + "default": [ + BenchmarkTask( + task_slug="task-1", + instance_key="default", + description="Solve this problem.", + ) + ] + } +``` + +Possible future version: + +```python +from ergon_core.api import Benchmark, Task + +class MyBenchmark(Benchmark): + type_slug = "my-benchmark" + + def tasks(self): + yield Task("task-1", "Solve this problem.") +``` + +### Writing A Worker + +Current-ish: + +```python +from ergon_core.api import Worker, WorkerContext, BenchmarkTask + +class MyWorker(Worker): + type_slug = "my-worker" + + async def execute(self, task: BenchmarkTask, *, context: WorkerContext): + ... +``` + +Possible future beginner version: + +```python +from ergon_core.api import Worker, WorkerOutput + +class MyWorker(Worker): + type_slug = "my-worker" + + async def run(self, task, context): + return WorkerOutput(output="answer") +``` + +### Writing A Criterion + +Current-ish: + +```python +from ergon_core.api import Criterion, CriterionResult, EvaluationContext + +class MyCriterion(Criterion): + type_slug = "my-criterion" + + async def evaluate(self, context: EvaluationContext): + return CriterionResult( + slug=self.slug, + name=self.slug, + score=1.0, + passed=True, + ) +``` + +Possible helper version: + +```python +return CriterionResult.pass_(self.slug, score=1.0) +``` + +### Writing A Rubric + +```python +from ergon_core.api import Rubric + +rubric = Rubric( + name="default", + criteria=[MyCriterion(slug="correctness")], +) +``` + +## Decisions To Make Together + +### Public Root Exports + +Suggested categories: + +```text +Definitely root public: + Benchmark + BenchmarkTask or Task + EmptyTaskPayload + Worker + WorkerContext + WorkerOutput + Criterion + CriterionResult + CriterionScoreSpec + Rubric + TaskEvaluationResult + CriteriaCheckError + +Maybe root public: + EvaluationContext + Evaluator + BenchmarkDeps + DependencyError + Experiment + +Probably not root public long-term: + WorkerSpec + PersistedExperimentDefinition +``` + +### Concept Names + +Questions: + +- Keep `BenchmarkTask`, or alias it as `Task`? +- Keep `EvaluationContext`, or rename to `CriterionContext`? +- Keep `Evaluator` visible, or make `Rubric` the main public evaluation abstraction? +- Keep `Experiment`, or move composition to a separate facade? +- Rename `PersistedExperimentDefinition` to `WorkflowDefinitionHandle`? +- Rename `RubricEvaluationService` to `EvaluationRunner` or `TaskEvaluationService`? +- Add `CriterionCheckError` alias for `CriteriaCheckError`? + +### Simplicity Targets + +A clean beginner author should not need to know: + +- Inngest, +- database sessions, +- context event persistence, +- run graph node IDs, +- experiment definition row IDs, +- cohort tables, +- telemetry models, +- evaluator binding keys, +- worker binding keys, +- registry validation internals. + +They may need to know: + +- how to create tasks, +- how a worker receives a task, +- how to return an output, +- how criteria inspect the output, +- how a rubric combines criteria. + +## Recommended Refactor Sequence + +### Phase 1: Document And Test The Boundary + +Add tests that encode: + +- `ergon_core.api.worker` must not import DB/session/persistence modules. +- `ergon_core.api.evaluation_context` must not import core runtime protocols directly. +- root exports are intentionally categorized. 
+- submodule-only public symbols like `CriterionScoreSpec` are either root-exported or documented. + +### Phase 2: Remove Runtime Leakage From Public Worker + +Move from: + +```text +api/worker.py + ContextEventRepository + get_session + AssistantTextPart +``` + +To: + +```text +core/runtime/output_extraction.py + default_worker_output(context) +``` + +Then `worker_execute.py` owns the runtime behavior. + +### Phase 3: Hide Criterion Runtime Behind Public Context Methods + +Move from: + +```text +EvaluationContext.runtime: CriterionRuntime | None +``` + +To: + +```text +EvaluationContext.execute_code(...) +EvaluationContext.read_resource(...) +EvaluationContext.read_resource_by_id(...) +``` + +Internal runtime remains in `core.runtime.evaluation`. + +### Phase 4: Move Composition Plumbing + +Move: + +```text +api/experiment.py -> core/runtime/composition/experiment.py +api/worker_spec.py -> core/runtime/composition/worker_spec.py +api/handles.py -> core/runtime/composition/handles.py +``` + +Keep compatibility shims temporarily: + +```text +api/experiment.py +api/worker_spec.py +api/handles.py +``` + +But update core and CLI imports to the new home first. + +### Phase 5: Add A CLI/Application Facade + +Create something like: + +```text +core/runtime/services/benchmark_run_facade.py +``` + +It owns: + +- build benchmark from slug, +- attach worker/model/rubric, +- persist definition, +- resolve/create cohort, +- create run, +- emit workflow started event, +- poll run status. + +Then `ergon_cli` becomes mostly command parsing and rendering. + +### Phase 6: Consolidate Evaluation Naming + +Decide: + +- root `Rubric` only, or root `Evaluator` too? +- rename internal `RubricEvaluationService`? +- add public helper constructors for result models? +- centralize `CriterionResult` to `EvaluationSummary` conversion. + +## Proposed End State + +```text +ergon_core.api + The authoring kit. + Used by benchmarks, workers, criteria, rubrics, and students. + +ergon_core.core.runtime.composition + Internal composition layer. + Used by CLI and core services to bind benchmarks, workers, rubrics, assignments. + +ergon_core.core.runtime.services + Application services. + Used by API routers and CLI facade. + +ergon_core.core.persistence + SQLModel rows and repositories. + Not imported by public API. + +ergon_cli + Command parsing and display. + Calls a small core facade, not many low-level services. +``` + +## Working Recommendation + +If we want the cleanest ergonomics for students: + +1. Keep the root public API focused on authoring. +2. Keep `Experiment` available for now, but do not teach it first. +3. Move `WorkerSpec` and `PersistedExperimentDefinition` out of the public root over time. +4. Make `Rubric` the public evaluation concept; keep `Evaluator` advanced. +5. Add helper methods/constructors so basic workers and criteria are short to write. +6. Build a separate run/composition facade for CLI and notebook users. + +The practical next conversation should decide three things: + +1. Is `Experiment` a public composition object or a core definition draft? +2. Is worker authoring streaming-first or output-first? +3. Is `Evaluator` a first-class public concept or an advanced escape hatch behind `Rubric`? 
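+
+Finally, as a starting point for the Phase 1 boundary tests above, the forbidden-import check can be a short `ast` walk over the public modules. A sketch, assuming the repo-relative path and forbidden prefixes named in this document (the real assertions belong in the existing architecture test modules):
+
+```python
+import ast
+from pathlib import Path
+
+# Assumed path and prefixes; adjust to the boundary being encoded.
+API_WORKER = Path("ergon_core/ergon_core/api/worker.py")
+FORBIDDEN_PREFIXES = (
+    "ergon_core.core.persistence",
+    "ergon_core.core.generation",
+    "ergon_core.core.runtime.evaluation.protocols",
+)
+
+
+def imported_modules(path: Path) -> set[str]:
+    """Collect every module name the file imports (absolute imports only)."""
+    names: set[str] = set()
+    for node in ast.walk(ast.parse(path.read_text())):
+        if isinstance(node, ast.Import):
+            names.update(alias.name for alias in node.names)
+        elif isinstance(node, ast.ImportFrom) and node.module:
+            names.add(node.module)
+    return names
+
+
+def test_api_worker_does_not_import_runtime_internals():
+    offenders = sorted(
+        name
+        for name in imported_modules(API_WORKER)
+        if name.startswith(FORBIDDEN_PREFIXES)
+    )
+    assert offenders == []
+```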
diff --git a/docs/superpowers/plans/2026-04-28-public-api-folder-plan.md b/docs/superpowers/plans/2026-04-28-public-api-folder-plan.md new file mode 100644 index 00000000..1b7bcb47 --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-public-api-folder-plan.md @@ -0,0 +1,413 @@ +# Public API Folder Refactor Plan + +Goal: make `ergon_core.api` small enough for students to understand while moving runtime, persistence, dashboard, cohort, run, and registry plumbing into `ergon_core.core`. + +The public API should be an authoring kit: define benchmarks, tasks, workers, criteria, rubrics, and simple result objects. It should not expose database sessions, persistence handles, Inngest dispatch, cohort management, run lifecycle, or internal evaluation summaries. + +## Proposed Folder Shape + +```text +ergon_core/ + ergon_core/ + api/ + __init__.py + # keep : only the student-facing authoring exports + # export: Benchmark, BenchmarkTask, EmptyTaskPayload + # export: Worker, WorkerContext, WorkerOutput + # export: Criterion, CriterionResult, CriterionScoreSpec + # export: Rubric, TaskEvaluationResult + # export: CriteriaCheckError + # stop exporting: Experiment, WorkerSpec, PersistedExperimentDefinition + # consider hiding: Evaluator, EvaluationContext, BenchmarkDeps, DependencyError + + benchmark.py + # keep : Benchmark as the public dataset/task generator base class + # keep : type_slug, task_payload_model, build_instances() + # keep : parse_task_payload() + # simplify : evaluator_requirements() should become optional/advanced + # move : dependency package checking to core/runtime/dependencies.py adapter + # merge : onboarding_deps and required_packages into one simpler authoring metadata story + + task_types.py + # keep : BenchmarkTask and EmptyTaskPayload + # consider rename later : BenchmarkTask -> Task or TaskSpec + # keep public because benchmarks, workers, and criteria all share it + # do not expose: ExperimentDefinitionTask persistence model here + + worker.py + # keep : Worker ABC and execute(task, context=...) 
+ # keep : optional from_buffer() only if resumption remains an author-facing extension point + # move : default DB-backed get_output() implementation to core/runtime/output_extraction.py + # move : ContextEventRepository/get_session imports out of public API + # move : AssistantTextPart/ContextPartChunk dependency behind a smaller public streaming type or an advanced namespace + # simplify : base Worker should not know how context events are persisted + + worker_context.py + # keep : WorkerContext as the minimal execution context passed to Worker.execute() + # simplify : expose only run_id, task_id, execution_id, sandbox_id, metadata if possible + # move inward : definition_id and node_id if only runtime/delegation needs them + # consider : a separate internal CoreWorkerContext for graph/runtime identity + + results.py + # keep : WorkerOutput + # keep : CriterionScoreSpec + # keep : CriterionResult + # keep : TaskEvaluationResult + # keep or move advanced : CriterionObservation and CriterionObservationMessage + # move : JsonObject import from core into a public local alias/type + # merge : align CriterionResult fields with core EvaluationSummary conversion in one adapter + + criterion.py + # keep : Criterion ABC + # keep : evaluate(context) -> CriterionResult + # move : dependency package checking to core validation helper + # simplify : criterion authors should not need to import core runtime protocols + + evaluation_context.py + # keep temporarily : EvaluationContext for compatibility + # replace with : CriterionContext or EvaluationContext with public helper methods + # move : CriterionRuntime Protocol import to core/runtime/evaluation/protocols.py only + # hide : sandbox manager/runtime internals behind context.execute_code(), context.read_resource(), etc. 
+ # eventual delete : if Criterion can receive a simpler public CriterionContext + + evaluator.py + # keep : Rubric as the common public evaluation concept + # consider advanced : Evaluator ABC moves to api/advanced/evaluator.py or core/runtime/evaluation + # merge : default weighted aggregation remains Rubric + # move : dynamic evaluator orchestration details to core/runtime/services/rubric_evaluation_service.py + # clarify : Rubric = author-facing grouping of criteria; evaluator service = internal runner + + errors.py + # keep : CriteriaCheckError + # consider move : DependencyError to core/runtime/dependencies.py unless public callers catch it + + benchmark_deps.py + # merge : into Benchmark metadata or move to api/onboarding.py + # keep temporarily : compatibility for ergon_cli/onboarding/profile.py and built-in benchmark declarations + # eventual delete : once onboarding reads a simpler Benchmark.onboarding field + + experiment.py + # move to core/runtime/composition/experiment.py or core/runtime/services/experiment_composition.py + # reason : binds benchmark + worker specs + evaluators + assignments for persistence + # reason : persist() calls core ExperimentPersistenceService + # public replacement : a simple CLI/application facade, not a student authoring primitive + # eventual delete from top-level api + + worker_spec.py + # move to core/runtime/composition/worker_spec.py + # reason : config-time descriptor for registry lookup, not worker authoring + # reason : validate_spec() imports ergon_builtins.registry.WORKERS + # public replacement : CLI accepts worker_slug/model and core builds WorkerSpec internally + # eventual delete from top-level api + + handles.py + # move to core/runtime/services/experiment_handles.py or core/runtime/composition/handles.py + # reason : PersistedExperimentDefinition is a persistence/run launch handle + # public replacement : CLI-facing RunHandle/DefinitionHandle returned by core facade + # eventual delete from top-level api +``` + +```text +ergon_core/ + ergon_core/ + core/ + runtime/ + composition/ + __init__.py + # create : internal composition exports for CLI/core + + experiment.py + # move from api/experiment.py + # keep : Experiment composition root if core still needs object-first persistence + # change : persist() should become service-owned, not a method on Experiment + + worker_spec.py + # move from api/worker_spec.py + # keep : WorkerSpec registry descriptor + # keep : validate_spec() registry lookup here, away from public API + + handles.py + # move from api/handles.py + # keep : PersistedExperimentDefinition or rename to WorkflowDefinitionHandle + + output_extraction.py + # create : default worker output extraction from context events + # move from api/worker.py : ContextEventRepository/get_session/AssistantTextPart logic + # used by : core/runtime/inngest/worker_execute.py + + dependencies.py + # keep : check_packages() + # add : validate_component_dependencies(component_type, slug, packages, install_hint) + # public ABCs call this only through small wrappers, or core validates before launch + + evaluation/ + protocols.py + # keep : CriterionRuntime internal protocol + # no public api imports should depend on this directly + + context.py + # create or rename : internal TaskEvaluationContext/CriterionContext live here + # owns : sandbox/runtime details for criterion execution + + adapters.py + # create : convert public CriterionResult into persisted EvaluationSummary entries + # merge logic currently split between public results and 
persistence summary models + + evaluation_schemas.py + # keep : internal CriterionSpec, TaskEvaluationContext, CriterionContext + # maybe rename : criterion_specs.py if it remains evaluation-engine only + + services/ + public_api_facade.py + # create : CLI/application facade for common operations + # owns : define benchmark experiment, persist definition, create cohort/run, dispatch, poll + # goal : CLI should import one core facade instead of many core services/models + + experiment_persistence_service.py + # keep : writes Experiment/BenchmarkTask object graph to immutable definition rows + # adjust imports : read Experiment and WorkerSpec from core/runtime/composition + + experiment_definition_service.py + # keep : create ExperimentRecord sample selections + # clarify name : this creates experiment records, not immutable workflow definitions + # possible rename later : benchmark_experiment_service.py + + experiment_launch_service.py + # keep : materializes runs for defined ExperimentRecord rows + # adjust imports : use core composition types, not public api Experiment/WorkerSpec + + rubric_evaluation_service.py + # keep : internal service runner + # clarify : not the same concept as public Rubric + # maybe rename : task_evaluation_service.py + + evaluation_persistence_service.py + # keep : persistence of evaluation summaries + # move conversion from public-ish result shapes into runtime/evaluation/adapters.py + + cohort_service.py + # keep : cohorts are operator/runtime grouping, not student API + # expose via facade only for CLI/dashboard + + run_service.py + # keep : runs are runtime telemetry/lifecycle, not student API + # expose via facade only for CLI/dashboard +``` + +```text +ergon_cli/ + ergon_cli/ + composition/ + __init__.py + # delete or shrink substantially + # current : imports public Experiment + WorkerSpec + # move : build_experiment() logic to core/runtime/composition or services/public_api_facade.py + # replacement : CLI passes slugs/options to core facade + + commands/ + benchmark.py + # keep : command parsing and rendering only + # move inward : create_run, WorkflowStartedEvent, inngest_client, RunRecord polling + # replace with : public_api_facade.run_benchmark(...) 
+ # keep : setup benchmark E2B template logic unless moved to onboarding service + + experiment.py + # keep : command parsing/rendering + # replace multiple core service imports with one facade import + + run.py + # keep : command parsing/rendering + # replace direct RunRecord/run_service access with one run facade + + workflow.py + # keep : command parsing/rendering + # replace direct workflow_service/db access with facade if possible + + onboarding/ + profile.py + # keep : onboarding profile behavior + # change later : read Benchmark.onboarding metadata instead of BenchmarkDeps directly +``` + +```text +ergon_builtins/ + ergon_builtins/ + benchmarks/ + */benchmark.py + # keep public imports : Benchmark, BenchmarkTask, EmptyTaskPayload + # update : BenchmarkDeps if moved/merged + # no direct dependency on core persistence or run concepts + + */rubric.py + # keep public imports : Rubric, CriterionResult, TaskEvaluationResult, BenchmarkTask + # if Evaluator moves advanced/internal, custom rubrics should still subclass Rubric + + */criterion.py + # keep public imports : Criterion, CriterionResult, CriterionScoreSpec + # update : EvaluationContext -> simpler CriterionContext if introduced + + workers/ + */*.py + # keep public imports : Worker, WorkerContext, WorkerOutput, BenchmarkTask + # update : streaming chunk type if ContextPartChunk is hidden or rehomed + + registry.py + # keep : plugin registry for built-ins + # core composition validates WorkerSpec/Benchmark/Evaluator slugs against this + # public API should not import this registry directly +``` + +## Concept Merges And Renames + +### Experiment Concepts + +Current concepts: + +- `api.Experiment`: object graph for benchmark + workers + evaluators + assignments. +- `core.persistence.telemetry.ExperimentRecord`: cohort/sample-selection record. +- `core.persistence.definitions.ExperimentDefinition`: immutable workflow definition rows. + +Plan: + +- Keep `ExperimentDefinition` as a core persistence name. +- Consider renaming `ExperimentRecord` service language to `BenchmarkExperiment` or `ExperimentPlan` later, because it is not the immutable workflow definition. +- Move public `Experiment` into core composition, or rename it `WorkflowDefinitionDraft` if it remains object-first. +- Do not ask students to learn all three names. + +### Worker Concepts + +Current concepts: + +- `Worker`: execution-ready authoring base class. +- `WorkerSpec`: config-time registry descriptor. +- `ExperimentDefinitionWorker`: persisted worker binding row. + +Plan: + +- Keep `Worker` public. +- Move `WorkerSpec` into core composition. +- Keep `ExperimentDefinitionWorker` internal. +- CLI should accept `worker_slug` and `model`; core creates `WorkerSpec`. + +### Evaluation Concepts + +Current concepts: + +- `Criterion`: atomic authoring unit. +- `Rubric`: fixed-list `Evaluator` with aggregation. +- `Evaluator`: abstract dynamic evaluator. +- `RubricEvaluationService`: runtime service that executes criteria and aggregates. +- `CriterionResultEntry` / `EvaluationSummary`: persisted dashboard schema. + +Plan: + +- Keep `Criterion` and `Rubric` public. +- Keep `Evaluator` advanced or internal unless third-party dynamic evaluators are required. +- Rename or document `RubricEvaluationService` as internal task evaluation runner. +- Keep `EvaluationSummary` internal. +- Add one adapter that maps `CriterionResult`/`TaskEvaluationResult` to persisted summary rows. 
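+
+That adapter is the single place where the two shapes should meet. A hedged sketch, using the public field names from the audit and an assumed `CriterionResultEntry` constructor (not verified against the persistence model):
+
+```python
+from ergon_core.api import CriterionResult
+from ergon_core.core.persistence.telemetry.evaluation_summary import (
+    CriterionResultEntry,
+)
+
+
+def to_summary_entry(result: CriterionResult) -> CriterionResultEntry:
+    """Map one public criterion result onto its persisted summary row."""
+    return CriterionResultEntry(
+        slug=result.slug,
+        score=result.score,
+        passed=result.passed,
+        weight=result.weight,
+        feedback=result.feedback,
+    )
+```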
+ +### Task Concepts + +Current concepts: + +- `BenchmarkTask`: author-facing task object generated by a benchmark. +- `ExperimentDefinitionTask`: persisted definition row. +- `RunTaskExecution`: runtime execution telemetry row. + +Plan: + +- Keep `BenchmarkTask` public for now. +- Consider future alias `Task = BenchmarkTask` for student docs. +- Keep persistence/runtime task rows internal. +- Core adapters convert public task specs into definition rows. + +### Cohort And Run Concepts + +Current concepts: + +- Cohorts and runs are not in `ergon_core.api`, but CLI imports core services/models directly. +- `ExperimentCohort`, `ExperimentCohortStats`, `RunRecord`, `RunTaskExecution`, `RunTaskEvaluation` are operator/runtime concepts. + +Plan: + +- Keep cohorts and runs out of the student authoring API. +- Add a CLI/application facade so built-in CLI can use cohorts/runs without importing persistence models, Inngest events, or low-level services. +- Dashboard/API routers can still use detailed core services and DTOs. + +## Compatibility Strategy + +1. Add architecture tests for the intended boundary before moving code. +2. Keep compatibility re-exports for one refactor window: + - `ergon_core.api.experiment.Experiment` + - `ergon_core.api.worker_spec.WorkerSpec` + - `ergon_core.api.handles.PersistedExperimentDefinition` + - `ergon_core.api.benchmark_deps.BenchmarkDeps` +3. Update `ergon_cli` and `ergon_core.core` imports first so internal code no longer depends on public API for internal composition. +4. Update `ergon_builtins` imports only after the public authoring surface is stable. +5. Remove compatibility shims once tests and docs no longer reference moved symbols. + +## Suggested Implementation Order + +```text +phase_1_boundary_tests/ + tests/unit/architecture/test_public_api_boundaries.py + # add forbidden import checks for api -> core.persistence, core.runtime.evaluation.protocols, core.generation + # add explicit expected top-level public exports + +phase_2_worker_runtime_split/ + ergon_core/ergon_core/api/worker.py + # keep Worker ABC only + # remove DB/context event imports + + ergon_core/ergon_core/core/runtime/output_extraction.py + # create default output extraction helper + + ergon_core/ergon_core/core/runtime/inngest/worker_execute.py + # use output_extraction helper after worker.execute() + +phase_3_composition_move/ + ergon_core/ergon_core/core/runtime/composition/ + # create experiment.py, worker_spec.py, handles.py + + ergon_core/ergon_core/api/ + # leave temporary import shims for Experiment, WorkerSpec, PersistedExperimentDefinition + + ergon_cli/ergon_cli/composition/__init__.py + # migrate logic or shrink to facade call + +phase_4_cli_facade/ + ergon_core/ergon_core/core/runtime/services/public_api_facade.py + # create stable CLI-facing functions/classes + + ergon_cli/ergon_cli/commands/*.py + # replace direct core service/model/event imports where practical + +phase_5_evaluation_simplification/ + ergon_core/ergon_core/api/evaluation_context.py + # replace raw runtime protocol exposure with public context methods + + ergon_core/ergon_core/core/runtime/evaluation/adapters.py + # centralize result-to-summary conversion + + ergon_core/ergon_core/api/evaluator.py + # make Rubric primary; move Evaluator to advanced/internal if desired + +phase_6_cleanup/ + ergon_core/ergon_core/api/__init__.py + # remove moved concepts from top-level exports + + docs/ + # update student-facing examples to import only the authoring kit +``` + +## Desired Final Student-Facing Mental Model + 
+```text +I define a Benchmark. +The Benchmark returns Tasks. +A Worker solves each Task. +A Criterion checks the output. +A Rubric combines Criteria into a score. +Ergon core handles experiments, definitions, cohorts, runs, persistence, dispatch, and dashboards. +``` diff --git a/docs/superpowers/plans/2026-04-28-runtime-services-layout-audit.md b/docs/superpowers/plans/2026-04-28-runtime-services-layout-audit.md new file mode 100644 index 00000000..acefd6cd --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-runtime-services-layout-audit.md @@ -0,0 +1,665 @@ +# Runtime Services Layout Audit + +Date: 2026-04-28 + +Scope: `ergon_core/ergon_core/core/runtime/services` in the current core/public API refactor branch. + +This note is an investigation artifact for a later fix/refactor plan. It does not propose a final migration sequence yet. The goal is to identify where `runtime/services` has become a dumping ground, where service shapes are inconsistent, and where logic appears duplicated or split across weak domain boundaries. + +Post-refactor update: this audit has been refreshed after the public API nesting refactor and the first core service moves: + +- `Experiment` and `WorkerSpec` now live under `core/composition`. +- The beginner-facing public API is now nested under `api/benchmark`, `api/worker`, `api/criterion`, and `api/rubric`. +- `experiment_validation_service.py` now owns experiment object-graph validation. +- `workflow_propagation_service.py` now owns the former `runtime/execution/propagation.py` graph propagation helpers. + +Most of the original duplication findings still stand. The new public API shape mainly changes the target boundaries: authoring concepts should stay in `ergon_core.api`, composition/definition concepts should sit near `core/composition` and definition services, and graph/task/workflow lifecycle behavior should stop accumulating in a single flat `runtime/services` package. + +## Executive Summary + +`runtime/services` is doing too many jobs in one flat namespace: + +- Domain orchestration services (`TaskExecutionService`, `WorkflowInitializationService`, `WorkflowFinalizationService`). +- Graph mutation and graph read helpers (`WorkflowGraphRepository`, `GraphNodeLookup`, graph DTOs). +- Agent/tool-facing subtask services (`TaskManagementService`, `TaskInspectionService`). +- API/dashboard read models (`RunReadService`, `WorkflowService`). +- Persistence helpers (`ExperimentPersistenceService`, `EvaluationPersistenceService`). +- Product areas that are not obviously part of runtime orchestration (`CommunicationService`, cohort services). +- Transport contracts for Inngest and API surfaces (`*_dto.py`, `*_schemas.py`, `child_function_payloads.py`, `inngest_function_results.py`). + +The resulting issue is not just file count. The same concepts are implemented with different local conventions: request/response models may be named DTOs, schemas, payloads, or function results; DB access may use explicit sessions, `with get_session()`, or ad hoc repository instances; graph traversal and latest-execution lookup logic are repeated with inconsistent ordering rules. 
+ +## Current File Groups + +### Graph And Graph Mutation + +- `graph_repository.py` +- `graph_lookup.py` +- `graph_dto.py` +- `workflow_propagation_service.py` +- `task_management_service.py` +- `task_inspection_service.py` +- `task_management_dto.py` +- `task_inspection_dto.py` +- `subtask_cancellation_service.py` +- `subtask_cancellation_dto.py` +- `subtask_blocking_service.py` + +This is the densest cluster. It covers graph mutation, graph traversal, task/subtask management, inspection, cancellation, blocking, propagation, and graph DTOs. Moving propagation into services made the domain boundary clearer: the old `runtime/execution` package was not really a separate layer; propagation belongs with graph lifecycle policy. + +### Experiment Definition And Composition + +- `experiment_validation_service.py` +- `experiment_persistence_service.py` +- `experiment_definition_service.py` +- `experiment_launch_service.py` +- `experiment_schemas.py` +- `experiment_read_service.py` + +This group is now more visible because `Experiment` moved out of the public API and into `core/composition`. These files are not all the same kind of service: + +- `experiment_validation_service.py` validates the in-memory composition object graph. +- `experiment_persistence_service.py` materializes immutable definition rows from composition objects. +- `experiment_definition_service.py` defines experiments from registered benchmark/worker/evaluator slugs. +- `experiment_launch_service.py` bridges persisted definitions into runtime orchestration. +- `experiment_read_service.py` and `experiment_schemas.py` are application/API read models. + +The current flat package hides that sequence. A later refactor should make the pipeline explicit: composition -> definition persistence -> launch -> read model. + +### Workflow And Run Lifecycle + +- `run_service.py` +- `workflow_initialization_service.py` +- `workflow_finalization_service.py` +- `workflow_service.py` +- `workflow_dto.py` +- `orchestration_dto.py` +- `run_snapshot_read_model.py` + +This group mixes run lifecycle orchestration with workflow navigation/resource materialization. `workflow_service.py` is read-heavy and tool/API-facing, while `workflow_initialization_service.py` and `workflow_finalization_service.py` are engine lifecycle services. `run_snapshot_read_model.py` is already a move in the right direction because it names read-model shaping separately from orchestration. + +### Task Execution And Propagation + +- `task_execution_service.py` +- `task_propagation_service.py` +- `workflow_propagation_service.py` +- `task_cleanup_service.py` +- `task_cleanup_dto.py` + +This group owns execution row creation/finalization, graph status updates for task execution, propagation after completion/failure, and cleanup of cancelled task executions. `workflow_propagation_service.py` is deliberately listed in both graph and task groups because it is the clearest split point: some functions are graph lifecycle primitives, while `TaskPropagationService` is an orchestration wrapper that turns those transitions into schedulable work. + +### Evaluation + +- `rubric_evaluation_service.py` +- `evaluator_dispatch_service.py` +- `evaluation_persistence_service.py` +- `evaluation_dto.py` + +This group mixes evaluator preparation, rubric execution, persistence, and dashboard DTO shaping. 
+ +### API Read Models And Product Features + +- `run_read_service.py` +- `communication_service.py` +- `communication_schemas.py` +- `cohort_service.py` +- `cohort_stats_service.py` +- `cohort_schemas.py` + +These are valid application services, but they are not the same kind of service as runtime orchestration. Their presence in the same flat package makes ownership harder to read. + +### Transport Contracts + +- `child_function_payloads.py` +- `inngest_function_results.py` +- plus the various `*_dto.py` and `*_schemas.py` files + +These are request/response contracts, not services. They currently sit beside service implementations without a consistent folder or naming convention. + +## Standardization Gaps + +### No Common Service Module Shape + +The desired structure is roughly: + +- request/response models +- DB schema types +- `repository.py` or service implementation +- `errors.py` for custom domain/service exceptions +- optional `utils.py` + +The current structure is flat and inconsistent: + +- Some service request/response models live in `*_dto.py`. +- Some live in `*_schemas.py`. +- Inngest request models live in `child_function_payloads.py`. +- Inngest outputs live in `inngest_function_results.py`. +- Some service-specific helper models live in the same service file. +- Persistence-facing repositories live partly in `core/persistence` and partly in `runtime/services`. +- Custom exceptions live mostly in broad runtime error modules, not beside the service/domain that raises them. + +This makes it difficult to infer whether a file is a domain service, transport contract, read model, or persistence adapter. + +### Public API Boundary Is Cleaner, But Core Still Needs Adapters + +The public API refactor has reduced the authoring surface to nested packages: + +- `api/benchmark`: `Benchmark`, `Task`, `EmptyTaskPayload`, `BenchmarkRequirements` +- `api/worker`: `Worker`, `WorkerContext`, `WorkerOutput` +- `api/criterion`: `Criterion`, `CriterionContext`, `CriterionOutcome`, `ScoreScale`, evidence types +- `api/rubric`: `Rubric`, `TaskEvaluationResult`, and advanced `Evaluator` + +That is a useful constraint for the services refactor. Runtime services should consume public authoring objects at the boundary where user-authored concepts enter core, but they should not treat `ergon_core.api` as the place for operational concepts like runs, cohorts, graph nodes, or persisted definition handles. + +Current service imports are mostly consistent with that direction: + +- `experiment_validation_service.py`, `experiment_definition_service.py`, `experiment_launch_service.py`, and `rubric_evaluation_service.py` legitimately consume authoring concepts such as `Benchmark`, `Task`, `Evaluator`, `Rubric`, and criterion outcomes. +- `run_read_service.py`, `run_snapshot_read_model.py`, `communication_service.py`, and `evaluation_persistence_service.py` still import API-layer DTOs from `core/api/schemas.py`. Those are not beginner-facing authoring API objects, but the import direction is still awkward: runtime read-model code depends upward on API schemas. + +The revised target should be: public authoring API in `ergon_core.api`; internal composition in `core/composition`; runtime read models in a runtime/application read-model package; HTTP/API routers adapt those read models to wire schemas. + +### Error Types Are Not Domain-Local + +Some custom errors already exist under `core/runtime/errors`, for example graph, delegation, and Inngest-specific error modules. 
That is better than raising generic `ValueError` everywhere, but it still leaves service packages without local ownership of their failure modes. + +The target convention should be: each runtime domain package owns an `errors.py` file for exceptions that are part of that domain contract. For example: + +- `runtime/graph/errors.py` for graph structural and mutation errors. +- `runtime/tasks/errors.py` for task execution, task management, cleanup, cancellation, and inspection failures. +- `runtime/workflows/errors.py` for workflow initialization/finalization/lifecycle failures. +- `runtime/evaluation/errors.py` for evaluator dispatch, rubric evaluation, and evaluation persistence failures. +- `runtime/inngest/errors.py` for Inngest wrapper/contract/non-retryable errors. + +This does not mean every exception class needs to move immediately. The refactor plan should move errors opportunistically with the package they belong to, and should prefer explicit custom exceptions over generic `ValueError`, `RuntimeError`, or assertion-style checks at service boundaries. + +### Repository Naming Is Ambiguous + +`WorkflowGraphRepository` is in `runtime/services/graph_repository.py`, while persistence repositories live in: + +- `core/persistence/context/repository.py` +- `core/persistence/telemetry/repositories.py` + +This is understandable because `WorkflowGraphRepository` owns runtime graph mutation semantics and audit-log writes, not just raw CRUD. Still, the package shape blurs whether repositories are persistence infrastructure or runtime domain services. + +### Session Ownership Varies + +Patterns include: + +- Methods accepting an explicit `Session`. +- Services opening `with get_session() as session`. +- Services using `session = get_session()` with manual `finally: session.close()`. +- Repository classes receiving a session from callers. + +Examples: + +- `TaskManagementService`, `SubtaskCancellationService`, and `WorkflowService` accept caller-owned sessions. +- `RunReadService`, `RunService`, `WorkflowInitializationService`, and `WorkflowFinalizationService` open sessions internally. +- `EvaluationPersistenceService` manually opens and closes sessions instead of using `with get_session()`. + +This makes transaction boundaries harder to reason about and complicates any future service package convention. + +## Concrete Duplication Findings + +### P1: Duplicate Latest Execution Lookup + +Two files define the same helper: + +- `task_management_service.py` +- `subtask_cancellation_service.py` + +Both query `RunTaskExecution.id` by `node_id`, ordered by `RunTaskExecution.started_at.desc()`, and use it to populate `TaskCancelledEvent.execution_id`. + +Related methods in other services define "latest execution" differently: + +- `WorkflowService.get_latest_execution` orders by `attempt_number DESC`, then `started_at DESC`. +- `TaskInspectionService._latest_output` and `_latest_error` order only by `started_at DESC`. + +This is a real semantic duplication. There should be one canonical helper for "latest execution for node", with a clearly documented ordering rule. + +### P1: Duplicate Containment Subtree Traversal + +The same parent-child BFS pattern appears in: + +- `task_management_service.py` via `_count_non_terminal_descendants`. +- `subtask_cancellation_service.py` via `cancel_orphans`. +- `subtask_blocking_service.py` via `block_pending_descendants`. + +All query `RunGraphNode` children by `run_id` and `parent_node_id`, then apply a different policy: + +- Count non-terminal descendants. 
+- Cancel non-terminal descendants. +- Block non-terminal, non-running descendants. + +This should become a shared graph traversal primitive, with the policy supplied by the caller or by domain-specific cascade services. + +### P1: Scattered Graph Status Transitions + +Graph node and edge status writes appear across: + +- `task_execution_service.py` +- `task_propagation_service.py` +- `workflow_propagation_service.py` +- `task_management_service.py` +- `subtask_cancellation_service.py` +- `subtask_blocking_service.py` +- `workflow_initialization_service.py` +- `graph_repository.py` + +`WorkflowGraphRepository` intentionally does not validate transitions; it only records mutations and enforces structural invariants. That boundary is reasonable, but the transition policy above it is distributed across many services. + +The refactor plan should decide whether there is a single graph lifecycle domain service, or at least a small set of named transition operations such as: + +- start node execution +- complete node execution +- fail node execution +- reset node for restart +- cancel subtree +- block subtree +- satisfy dependency edge + +### P2: Duplicated Graph Mapping / Read Loading + +`GraphNodeLookup` batch-loads mappings from definition task IDs and edges to run graph IDs. + +`RunReadService.build_run_snapshot` builds similar maps inline: + +- `execution_task_map` +- `defn_to_node` +- task maps and context-event maps through API helper functions + +`WorkflowService` also builds node maps through `_nodes_by_id` and tree/resource scopes through local queries. + +These are not identical consumers, but the primitives overlap: load run graph, map definition IDs to node IDs, map executions to nodes, and traverse parent/child relationships. + +### P2: Evaluation Score Semantics Drift + +`WorkflowFinalizationService` computes: + +- `final_score = sum(scores)` +- `normalized_score = final_score / len(scores)` + +`RunReadService.build_run_snapshot` computes: + +- `final_score = sum(scores) / len(scores)` + +`TelemetryRepository.refresh_run_evaluation_summary` also updates summary fields from evaluation rows. + +`cohort_service.py` and `cohort_stats_service.py` then read `normalized_score` and `final_score` from summary JSON. This should be centralized because downstream consumers depend on the meaning of these fields. + +### P2: Read Model Shaping Depends On API Helpers + +`RunReadService` imports DTOs from `ergon_core.core.api.schemas` and imports `ergon_core.core.api.runs` helper functions inside `build_run_snapshot`. + +That means a runtime service depends upward on API helpers. This is likely a layering smell. `run_snapshot_read_model.py` is a partial correction because it moves snapshot shaping into a named runtime read model, but it still imports DTO classes from `core/api/schemas.py`. The pure DTO helper functions and run snapshot DTOs should either move into a runtime/read-model package, or the API should own the service and not call it "runtime". + +The new public API nesting makes this more important. `ergon_core.api` should mean authoring API, not operational wire schemas. Runtime read models should not be coupled to the benchmark/worker/criterion authoring package or to HTTP schema modules. 
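+
+Stepping back to the P1 containment-traversal finding: the shared primitive it motivates only needs to own the BFS over `parent_node_id` children; counting, cancelling, and blocking stay with the callers. A sketch under that assumption, with the `RunGraphNode` query layer abstracted into a `children_of` callable:
+
+```python
+from collections import deque
+from collections.abc import Callable, Iterable
+
+
+def walk_containment_subtree(
+    root_node_id: str,
+    children_of: Callable[[str], Iterable[str]],
+) -> list[str]:
+    """Breadth-first descendants of a node; deliberately policy-free."""
+    seen: set[str] = set()
+    queue: deque[str] = deque([root_node_id])
+    descendants: list[str] = []
+    while queue:
+        node_id = queue.popleft()
+        for child_id in children_of(node_id):
+            if child_id not in seen:
+                seen.add(child_id)
+                descendants.append(child_id)
+                queue.append(child_id)
+    return descendants
+
+
+# Callers keep their policies, e.g.:
+#   non_terminal = [n for n in walk_containment_subtree(root, children) if not terminal(n)]
+```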
+ +### P3: Repeated Graph Repository Construction + +`WorkflowGraphRepository()` is constructed in many places: + +- `task_execution_service.py` +- `task_propagation_service.py` +- `workflow_initialization_service.py` +- `task_management_service.py` +- `subtask_cancellation_service.py` +- `subtask_blocking_service.py` + +The repository is mostly stateless, but it has mutation listeners. `TaskManagementService` registers `dashboard_emitter.graph_mutation`; other construction sites do not. If listeners are meant to be consistently applied, construction should be standardized. If not, the listener behavior should be explicit at call sites or separated from repository construction. + +### P3: DTO Naming And Boundaries Are Mixed + +Current naming patterns include: + +- `graph_dto.py` +- `workflow_dto.py` +- `task_management_dto.py` +- `task_inspection_dto.py` +- `evaluation_dto.py` +- `cohort_schemas.py` +- `communication_schemas.py` +- `child_function_payloads.py` +- `inngest_function_results.py` + +The differences may have history, but they do not communicate ownership. A student/user reading the package cannot easily tell whether "schema", "DTO", "payload", and "result" are meaningful distinctions. + +### P3: Task Reference Shapes Overlap + +The following are related but split: + +- `GraphTaskRef` in `graph_dto.py` +- `TaskDescriptor` in `orchestration_dto.py` +- `SubtaskInfo` in `task_inspection_dto.py` +- `WorkflowDependencyRef.source` / `target` in `workflow_dto.py` +- `AddSubtaskResult`, `CancelTaskResult`, and `RestartTaskResult` in `task_management_dto.py` + +Some separation is legitimate, but the shared task identity payload should be explicit. The current split risks reintroducing separate names/status fields for the same runtime graph node. + +## Boundary Assessment + +### Persistence Layer Boundary + +Keep `core/persistence` as storage infrastructure, not as a home for domain behavior. + +These belong in `core/persistence`: + +- SQLModel table definitions in `core/persistence`. +- Shared DB session creation in `core/persistence/shared/db.py`. +- Shared persisted enums and types in `core/persistence/shared`. +- Thin append/read/write helpers that do not encode runtime policy. + +These should move out of `core/persistence`, or should not be added there: + +- Domain repositories that encode graph/task/workflow/evaluation semantics. +- "Latest execution" selection rules. +- Graph lifecycle transition rules. +- Evaluation score aggregation semantics. +- Experiment-definition materialization from authored composition objects. + +In other words, `core/persistence` answers "what rows exist and how do we store them?" Domain packages answer "what does it mean to add a graph node, complete a task, select an attempt, or persist an authored experiment definition?" + +Candidate to split or dissolve: + +- `core/persistence/queries.py` + +It currently contains domain-shaped query objects (`DefinitionsQueries`, `TaskExecutionsQueries`, child-execution lookup, status lookup). Those should be redistributed over time into definition, task, graph, and read-model packages. + +Candidate to reframe: + +- `experiment_persistence_service.py` + +It writes immutable experiment definition tables, but the important behavior is not raw SQL persistence; it is materializing an authored `Experiment` into a persisted definition graph. That makes it a definition/composition domain operation that imports persistence table models, not a persistence-layer module. 
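+
+A sketch of that framing, with every name illustrative (including the table module path): the definition domain owns the verb "materialize", while persistence supplies only the row type and the session.
+
+```python
+from sqlmodel import Session
+
+# Hypothetical table module; the real path and row shape will differ.
+from ergon_core.core.persistence.definitions.models import ExperimentDefinitionRow
+
+
+class ExperimentDefinitionMaterializer:
+    """Definition-domain operation built on top of persistence tables."""
+
+    def materialize(self, session: Session, experiment) -> ExperimentDefinitionRow:
+        # The domain decides what materialization means: immutability,
+        # slug extraction, graph shape. Persistence just stores rows.
+        row = ExperimentDefinitionRow(name=experiment.name)  # attribute assumed
+        session.add(row)
+        return row
+```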
+ +### Things That Belong Near Composition + +`Experiment` and `WorkerSpec` are now under `core/composition`, which gives the services refactor a better boundary than the original audit had. Composition owns the in-memory definition before it becomes persisted runtime state. + +Candidate to move or reframe: + +- `experiment_validation_service.py` + +It validates `Experiment`, benchmark task graph structure, evaluator bindings, and worker assignments. That is composition/definition validation, not runtime DAG execution. It can live under `runtime/services` temporarily, but the target should probably be `core/composition/validation.py` or `core/composition/services/validation.py` unless we decide all composition use cases belong under a broader `core/application` layer later. + +Related files that should be considered together: + +- `core/composition/experiment.py` +- `core/composition/worker_spec.py` +- `core/composition/handles.py` +- `runtime/services/experiment_validation_service.py` +- `runtime/services/experiment_persistence_service.py` +- `runtime/services/experiment_definition_service.py` + +### Things That Belong In Runtime Domain Packages + +These are runtime domain behavior, not raw persistence: + +- Graph mutation repository and mutation DTOs. +- Task execution lifecycle. +- Propagation and graph lifecycle transitions. +- Agent/tool-facing task management and inspection. +- Inngest command/result contracts. + +Candidate runtime packages: + +- `runtime/graph` +- `runtime/tasks` +- `runtime/workflows` +- `runtime/evaluation` +- `runtime/read_models` +- `runtime/inngest/contracts` + +The exact package names can wait for the refactor plan, but the target should be domain packages rather than one `services` bucket. `workflow_propagation_service.py` should be treated as a graph lifecycle module during that migration, not as a generic workflow service. + +### Things Inngest Should Own + +The Inngest function implementations already live under `core/runtime/inngest`, but two Inngest-owned modules currently sit at the top of `core/runtime`: + +- `inngest_client.py` +- `inngest_registry.py` + +These should move under `runtime/inngest` with the function modules. The Inngest package should own: + +- the client singleton and shared cancellation configuration +- the function registry / function list passed to `serve()` +- function modules +- child-function request contracts and function result contracts, unless those contracts are better colocated with the specific function module +- Inngest-specific errors + +This would make `runtime/inngest` the runtime boundary for event orchestration instead of spreading its setup across `runtime` and `runtime/services`. + +### Things That Are Product/Application Services + +These may belong outside the runtime kernel, or in separate runtime subdomains: + +- `communication_service.py` +- `cohort_service.py` +- `cohort_stats_service.py` +- `run_read_service.py` + +They are valid application concerns, but colocating them with graph mutation and task execution weakens the meaning of `services`. + +## Suggested Target Shape + +This is a sketch, not a final implementation plan. + +```text +core/runtime/ + # imports table/session infrastructure from core/persistence, + # but owns domain-specific persistence operations. 
+ + composition_services/ # optional; may instead live under core/composition + validation.py # ExperimentValidationService or pure validation functions + + graph/ + models.py # runtime DTOs for graph snapshots and mutation records + repository.py # WorkflowGraphRepository; domain-aware graph writes over persistence graph tables + errors.py # graph structural and mutation errors + traversal.py # subtree and dependency traversal primitives + lookup.py # GraphNodeLookup or successor + lifecycle.py # named graph status transitions, if introduced + propagation.py # former workflow_propagation_service graph edge/node propagation helpers + + tasks/ + models.py # task execution commands/results, task refs + errors.py # task execution/management/cancellation errors + repository.py # latest execution / attempt selection over RunTaskExecution rows + execution.py # TaskExecutionService + management.py # agent-initiated subtask operations + inspection.py # read-only subtask snapshots + cleanup.py # per-execution cleanup + cascades.py # cancellation/blocking/downstream invalidation + + workflows/ + models.py # workflow lifecycle commands/results + errors.py + initialization.py + finalization.py + service.py # workflow navigation/resource materialization, if kept here + launch.py # ExperimentLaunchService if launch remains runtime-facing + + evaluation/ + models.py + errors.py + dispatch.py + rubric.py + persistence.py + scoring.py # shared score aggregation semantics + + read_models/ + errors.py + run_snapshot.py # RunReadService and pure DTO shaping helpers + experiments.py # ExperimentReadService + cohorts.py # cohort read/detail/stats DTO shaping + + definitions/ + models.py # define/persist commands/results if kept out of persistence + definition.py # ExperimentDefinitionService + persistence.py # ExperimentPersistenceService; materializes composition objects into definition rows + + inngest/ + client.py # Inngest singleton and cancellation config + registry.py # ALL_FUNCTIONS / serve() function list + contracts.py # child payloads and function results, or per-event modules + errors.py # Inngest/non-retryable/contract wrapper errors + functions/ # optional if we want one subdirectory below package root +``` + +The key convention is that each domain package should make its file roles obvious: + +- `models.py` for request/response/domain DTOs. +- `repository.py` only where the module owns persisted mutation/read-write behavior. +- `errors.py` for exceptions that are part of that service/domain contract. +- `service.py` or named service files for use-case orchestration. +- `utils.py` or more specific helper modules only for reusable pure helpers. + +For Inngest specifically, avoid a separate top-level `runtime/inngest_client.py` or `runtime/inngest_registry.py`; the `runtime/inngest` package should own those pieces directly. + +## High-Value Refactor Candidates + +### 0. Keep The New Public API Boundary Out Of Runtime Read Models + +The public API is now an authoring API. Do not move run/cohort/graph/read-model concepts into `ergon_core.api` to make service imports easier. + +Immediate cleanup direction: + +- Leave `Benchmark`, `Task`, `Worker`, `Criterion`, `Rubric`, and their result/context objects in the nested public API packages. +- Keep `Experiment`, `WorkerSpec`, and definition handles in `core/composition`. +- Move operational DTO shaping out of `core/api/schemas.py` and into runtime/application read models before doing large package moves. 
+ +This is mostly a boundary rule for the plan, but it prevents the services refactor from undoing the public API simplification. + +### 1. Extract Graph Traversal Primitives + +Create a small module for containment traversal by `parent_node_id`. + +Initial consumers: + +- `task_management_service._count_non_terminal_descendants` +- `subtask_cancellation_service.cancel_orphans` +- `subtask_blocking_service.block_pending_descendants` +- `workflow_service._descendant_ids` + +This is the clearest low-risk cleanup because the duplicated query shape is visible and bounded. + +### 2. Centralize Latest Execution Selection + +Create one helper or repository method for "latest execution for node". + +It should define ordering once, probably: + +1. `attempt_number DESC` +2. `started_at DESC` + +Then update: + +- `WorkflowService.get_latest_execution` +- `TaskInspectionService._latest_output` +- `TaskInspectionService._latest_error` +- `task_management_service._latest_execution_id` +- `subtask_cancellation_service._latest_execution_id` + +### 3. Centralize Evaluation Score Aggregation + +Create one score aggregation helper that returns a named object: + +- `final_score` +- `normalized_score` +- `evaluators_count` + +Then update: + +- `WorkflowFinalizationService` +- `TelemetryRepository.refresh_run_evaluation_summary` +- `RunReadService.build_run_snapshot` +- cohort summary readers if their semantics need adjustment + +### 4. Split DTO/Schema Contracts From Service Implementations + +Normalize naming inside any new package: + +- Use `models.py` for request/response DTOs within runtime domain packages. +- Reserve `schemas.py` for API wire schemas only if the codebase keeps that distinction. +- Avoid mixing Inngest contracts with service DTOs unless the package name makes that explicit. + +### 5. Move API Snapshot Helpers Out Of API Layer + +`RunReadService` should not need to import `ergon_core.core.api.runs` helper functions. Move pure task/resource/evaluation snapshot builders to a runtime read-model module, or move `RunReadService` behind the API layer. + +### 6. Decide Whether `WorkflowGraphRepository` Is A Repository Or Domain Service + +Keep it in runtime, but move it to `runtime/graph/repository.py` and make clear that it is a domain repository for graph mutations, not a generic persistence repository. + +The repository writes audit mutations and encodes structural invariants, not just SQL CRUD. It should import `core/persistence/graph/models.py` table classes, but the operation names and invariants belong to the graph domain. + +Use this as the general persistence rule for the refactor: + +- Table definitions and session setup stay under `core/persistence`. +- Domain-specific repositories live with their domain package. +- Generic query bags such as `core/persistence/queries.py` should shrink or dissolve as their methods move to domain packages. + +### 7. Move Experiment Validation Toward Composition + +`experiment_validation_service.py` is useful as a first extraction, but it should not make `runtime/services` the permanent home for composition validation. + +Candidate target: + +- `core/composition/validation.py` + +The target file can expose either `ExperimentValidationService` or pure validation functions. The important boundary is that this logic validates authored/composed definitions before persistence; it does not participate in live runtime execution. + +### 8. 
Move Inngest Ownership Into The Inngest Package + +Move or plan to move: + +- `runtime/inngest_client.py` to `runtime/inngest/client.py` +- `runtime/inngest_registry.py` to `runtime/inngest/registry.py` +- `services/child_function_payloads.py` to `runtime/inngest/contracts.py` or per-function contract modules +- `services/inngest_function_results.py` to `runtime/inngest/contracts.py` or per-function result modules +- `runtime/errors/inngest_errors.py` to `runtime/inngest/errors.py` + +This should be mostly import churn, but the plan should include architecture tests so Inngest setup does not drift back into `runtime/services`. + +### 9. Add Domain-Local Error Modules + +As packages are split, add `errors.py` to each domain package. The first pass can be mechanical: + +- graph errors follow `WorkflowGraphRepository` +- delegation/task errors follow task management and inspection +- Inngest errors follow the Inngest client and functions +- evaluation-specific contract violations move with evaluation services if they are not broadly runtime-level + +The plan should not require inventing custom errors for every possible branch in one pass. It should require that new service boundary failures use domain-specific exception types, and that moved services do not keep reaching into a shared dumping-ground error module when a local `errors.py` is clearer. + +## Questions For The Refactor Plan + +1. Should `services` disappear entirely in favor of domain packages, or should it remain only for files not yet moved during direct bulk renames? +2. Should request/response models live in `models.py` beside each domain package, or in separate `contracts.py` files when they are consumed by Inngest/API boundaries? +3. Should `WorkflowGraphRepository` emit/listen to dashboard mutations directly, or should dashboard emission sit above the repository? +4. Should read-model services be considered runtime services, API services, or their own `runtime/read_models` layer? +5. Which `core/persistence/queries.py` methods should dissolve into definition/task/graph/read-model domain repositories first? +6. Should each package expose its domain errors from `__init__.py`, or should callers import directly from `package.errors` to avoid new barrel behavior? +7. Should Inngest contracts be centralized in one `runtime/inngest/contracts.py`, or colocated with each function module? +8. Should `experiment_validation_service.py` move into `core/composition`, or should all experiment definition use cases live under a new definition/application package? +9. Should `workflow_propagation_service.py` become `runtime/graph/propagation.py`, or should propagation be split between graph lifecycle primitives and task orchestration? +10. Should operational DTOs currently in `core/api/schemas.py` move before or after the services package split? +11. Should the first domain repository extraction be `runtime/tasks/repository.py` for latest execution/attempt selection, since that duplication is already concrete? + +## Recommended Next Step + +Write a refactor plan that starts with mechanical, low-risk extractions before package moves. Revised order after the public API and service moves: + +1. Lock the boundary rule in tests: public `ergon_core.api` remains authoring-only; runtime/read-model services do not import beginner-facing API modules except at authoring/evaluation adapter boundaries. +2. 
Lock the persistence rule in tests or architecture notes: `core/persistence` owns tables/session/storage infrastructure; domain repositories live with runtime/composition/definition packages. +3. Extract shared latest-execution and attempt-selection logic into a task-domain repository/helper. +4. Extract graph containment traversal helper. +5. Move `workflow_propagation_service.py` behind a graph lifecycle module or package, preserving the current import behavior through direct bulk updates rather than aliasing. +6. Extract evaluation score aggregation helper. +7. Move pure run snapshot helper functions and operational DTO shaping out of `core.api.runs` / `core.api.schemas`. +8. Move `experiment_validation_service.py` toward `core/composition` and keep `experiment_persistence_service.py` in a definition/composition domain package rather than under raw persistence. +9. Move Inngest client, registry, contracts, results, and errors under `runtime/inngest`. +10. Introduce domain package structure with one package at a time, starting with `runtime/graph`. +11. Dissolve `core/persistence/queries.py` incrementally as each domain repository takes over its methods. +12. Add `errors.py` to each package as services move, and replace generic service-boundary exceptions where the domain already has a clear failure type. +13. Move/rename services only after tests prove the helpers preserve behavior. + +This order reduces risk because it fixes semantic duplication before large import churn. diff --git a/docs/superpowers/plans/2026-04-29-core-component-registry-refactor.md b/docs/superpowers/plans/2026-04-29-core-component-registry-refactor.md new file mode 100644 index 00000000..de57361b --- /dev/null +++ b/docs/superpowers/plans/2026-04-29-core-component-registry-refactor.md @@ -0,0 +1,1229 @@ +# Core Component Registry Refactor Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Move component registration ownership into `ergon_core` public API so core never imports `ergon_builtins`, builtins/tests explicitly register components, and experiment definition/runtime have a clear slug-to-component mental model. + +**Architecture:** Add a Pydantic-based `ComponentRegistry` and process-global `registry` under `ergon_core.api.registry`. Builtins, optional builtins capabilities, and tests contribute components through explicit registration functions. Core application/runtime code resolves persisted slugs through the core registry only. + +**Tech Stack:** Python, Pydantic models, pytest, Inngest job handlers, FastAPI startup, existing Ergon public APIs. + +--- + +## Mental Model To Preserve + +The final model should be easy to explain to students: + +1. Components are Python classes/functions: `Benchmark`, `Worker`, `Evaluator`/`Rubric`, `BaseSandboxManager`. +2. Registration says which component slugs are available in this process. +3. Experiment authoring passes concrete objects/specs into `Experiment`. +4. Persistence stores only stable identities: benchmark slug, worker slug, evaluator slug, sandbox slug, model target. +5. Runtime jobs turn those stored slugs back into Python classes/functions via `ergon_core.api.registry.registry`. + +The registry is not the main experiment authoring API. It is the catalog that validates slugs and rehydrates persisted definitions across process boundaries. 
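+
+A compressed version of that round trip, using a slug and the `require_worker` helper defined in Task 1 below (assuming, for illustration, a worker factory that takes no arguments):
+
+```python
+from ergon_core.api.registry import registry
+
+# Authoring passed a concrete worker object in; persistence kept only its slug.
+stored_worker_slug = "minif2f-react"
+
+# A runtime job later rehydrates the component from the stored slug.
+worker_factory = registry.require_worker(stored_worker_slug)
+worker = worker_factory()
+```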
+ +## File Structure + +- Create `ergon_core/ergon_core/api/registry.py` + - Defines `WorkerFactory`, `ComponentRegistry`, `registry`, duplicate handling, `require_*` lookup helpers, and reset/snapshot helpers for tests. +- Modify `ergon_core/ergon_core/api/__init__.py` + - Re-export `ComponentRegistry`, `WorkerFactory`, and `registry`. +- Modify `ergon_builtins/ergon_builtins/registry_core.py` + - Replace exported dict ownership with `register_core_builtins(target=registry)`. +- Modify `ergon_builtins/ergon_builtins/registry_data.py` + - Replace exported dict ownership with `register_data_builtins(target=registry)`. +- Modify `ergon_builtins/ergon_builtins/registry_local_models.py` + - Replace exported dict ownership with `register_local_model_builtins(target=registry)` or a returned model backend mapping, depending on model backend constraints. +- Modify `ergon_builtins/ergon_builtins/registry.py` + - Becomes explicit composition function `register_builtins(target=registry)`. + - Optional: keep backwards-compatible module attributes temporarily only if necessary for existing tests, but core must not use them. +- Modify core runtime imports in: + - `ergon_core/ergon_core/core/application/jobs/worker_execute.py` + - `ergon_core/ergon_core/core/application/jobs/evaluate_task_run.py` + - `ergon_core/ergon_core/core/application/jobs/persist_outputs.py` + - `ergon_core/ergon_core/core/application/jobs/sandbox_setup.py` + - `ergon_core/ergon_core/core/application/experiments/launch.py` + - `ergon_core/ergon_core/core/application/experiments/service.py` + - `ergon_core/ergon_core/core/application/workflows/service.py` + - `ergon_core/ergon_core/core/application/tasks/management.py` + - `ergon_core/ergon_core/core/domain/experiments/worker_spec.py` + - `ergon_core/ergon_core/core/rest_api/app.py` +- Move test-only smoke fixture component definitions from: + - `ergon_core/ergon_core/test_support/smoke_fixtures/**` + - into `tests/e2e/fixtures/smoke_components/**` or `tests/fixtures/smoke_components/**`. +- Modify E2E/test startup: + - `tests/e2e/conftest.py` + - current startup plugin module(s) referenced by `ERGON_STARTUP_PLUGINS` + - tests currently importing `ergon_core.test_support.smoke_fixtures` +- Modify unit tests: + - `tests/unit/registry/test_builtin_pairings.py` + - add `tests/unit/registry/test_component_registry.py` + - add/adjust core tests that assert no `ergon_core` file imports `ergon_builtins.registry`. 
+ +--- + +### Task 1: Add Core Public Component Registry + +**Files:** +- Create: `ergon_core/ergon_core/api/registry.py` +- Modify: `ergon_core/ergon_core/api/__init__.py` +- Test: `tests/unit/registry/test_component_registry.py` + +- [ ] **Step 1: Write failing registry unit tests** + +Create `tests/unit/registry/test_component_registry.py`: + +```python +import pytest + +from ergon_core.api import Benchmark, Rubric, Worker +from ergon_core.api.registry import ComponentRegistry +from ergon_core.core.infrastructure.sandbox.manager import BaseSandboxManager + + +class ExampleWorker(Worker): + type_slug = "example-worker" + + +class ReplacementWorker(Worker): + type_slug = "example-worker" + + +class ExampleBenchmark(Benchmark): + type_slug = "example-benchmark" + + +class ExampleRubric(Rubric): + type_slug = "example-rubric" + + +class ExampleSandboxManager(BaseSandboxManager): + pass + + +def test_registers_components_by_explicit_or_type_slug() -> None: + registry = ComponentRegistry() + + registry.register_worker(ExampleWorker.type_slug, ExampleWorker) + registry.register_benchmark(ExampleBenchmark) + registry.register_evaluator(ExampleRubric) + registry.register_sandbox_manager("example-benchmark", ExampleSandboxManager) + + assert registry.require_worker("example-worker") is ExampleWorker + assert registry.require_benchmark("example-benchmark") is ExampleBenchmark + assert registry.require_evaluator("example-rubric") is ExampleRubric + assert registry.sandbox_managers["example-benchmark"] is ExampleSandboxManager + + +def test_duplicate_slug_rejects_different_object() -> None: + registry = ComponentRegistry() + registry.register_worker("example-worker", ExampleWorker) + + with pytest.raises(ValueError, match="Duplicate worker slug 'example-worker'"): + registry.register_worker("example-worker", ReplacementWorker) + + +def test_duplicate_slug_allows_idempotent_registration() -> None: + registry = ComponentRegistry() + registry.register_worker("example-worker", ExampleWorker) + registry.register_worker("example-worker", ExampleWorker) + + assert registry.require_worker("example-worker") is ExampleWorker + + +def test_unknown_slug_error_lists_registered_values() -> None: + registry = ComponentRegistry() + registry.register_worker("example-worker", ExampleWorker) + + with pytest.raises( + ValueError, + match="Unknown worker slug 'missing-worker'; registered workers: example-worker", + ): + registry.require_worker("missing-worker") +``` + +- [ ] **Step 2: Run failing registry tests** + +Run: + +```bash +pytest tests/unit/registry/test_component_registry.py -q +``` + +Expected: FAIL because `ergon_core.api.registry` does not exist. + +- [ ] **Step 3: Implement `ergon_core.api.registry`** + +Create `ergon_core/ergon_core/api/registry.py`: + +```python +"""Public process-level component registry. + +The registry maps stable slugs stored in experiment definitions back to the +Python classes/factories needed by runtime jobs. Packages such as +``ergon_builtins`` and test fixtures contribute components explicitly during +startup; ``ergon_core`` never imports those packages to discover components. 
+""" + +from collections.abc import Callable, Mapping +from typing import TypeVar + +from ergon_core.api.benchmark import Benchmark +from ergon_core.api.rubric import Evaluator +from ergon_core.api.worker import Worker +from ergon_core.core.infrastructure.sandbox.manager import BaseSandboxManager +from pydantic import BaseModel, ConfigDict, Field + +WorkerFactory = Callable[..., Worker] +T = TypeVar("T") + + +class ComponentRegistry(BaseModel): + """Catalog of component types available in the current Python process.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + workers: dict[str, WorkerFactory] = Field(default_factory=dict) + benchmarks: dict[str, type[Benchmark]] = Field(default_factory=dict) + evaluators: dict[str, type[Evaluator]] = Field(default_factory=dict) + sandbox_managers: dict[str, type[BaseSandboxManager]] = Field(default_factory=dict) + + def register_worker(self, slug: str, factory: WorkerFactory) -> None: + self._register(self.workers, "worker", slug, factory) + + def register_benchmark(self, benchmark_cls: type[Benchmark], slug: str | None = None) -> None: + self._register(self.benchmarks, "benchmark", slug or benchmark_cls.type_slug, benchmark_cls) + + def register_evaluator(self, evaluator_cls: type[Evaluator], slug: str | None = None) -> None: + self._register(self.evaluators, "evaluator", slug or evaluator_cls.type_slug, evaluator_cls) + + def register_sandbox_manager( + self, + slug: str, + manager_cls: type[BaseSandboxManager], + ) -> None: + self._register(self.sandbox_managers, "sandbox manager", slug, manager_cls) + + def require_worker(self, slug: str) -> WorkerFactory: + return self._require(self.workers, "worker", slug) + + def require_benchmark(self, slug: str) -> type[Benchmark]: + return self._require(self.benchmarks, "benchmark", slug) + + def require_evaluator(self, slug: str) -> type[Evaluator]: + return self._require(self.evaluators, "evaluator", slug) + + def _register(self, target: dict[str, T], kind: str, slug: str, value: T) -> None: + existing = target.get(slug) + if existing is not None and existing is not value: + raise ValueError(f"Duplicate {kind} slug {slug!r}") + target[slug] = value + + def _require(self, target: Mapping[str, T], kind: str, slug: str) -> T: + try: + return target[slug] + except KeyError: + known = ", ".join(sorted(target)) or "" + raise ValueError( + f"Unknown {kind} slug {slug!r}; registered {kind}s: {known}" + ) from None + + +registry = ComponentRegistry() +``` + +- [ ] **Step 4: Re-export the registry from public API** + +Modify `ergon_core/ergon_core/api/__init__.py`: + +```python +"""Beginner-facing Ergon authoring API surface.""" + +from ergon_core.api.benchmark import Benchmark, BenchmarkRequirements, EmptyTaskPayload, Task +from ergon_core.api.criterion import ( + Criterion, + CriterionContext, + CriterionEvidence, + CriterionOutcome, + EvidenceMessage, + ScoreScale, +) +from ergon_core.api.errors import CriterionCheckError +from ergon_core.api.registry import ComponentRegistry, WorkerFactory, registry +from ergon_core.api.rubric import Rubric, TaskEvaluationResult +from ergon_core.api.worker import Worker, WorkerContext, WorkerOutput, WorkerStreamItem + +__all__ = [ + "Benchmark", + "BenchmarkRequirements", + "ComponentRegistry", + "Criterion", + "CriterionCheckError", + "CriterionContext", + "CriterionEvidence", + "CriterionOutcome", + "EmptyTaskPayload", + "EvidenceMessage", + "Rubric", + "ScoreScale", + "Task", + "TaskEvaluationResult", + "Worker", + "WorkerContext", + "WorkerFactory", + 
"WorkerOutput", + "WorkerStreamItem", + "registry", +] +``` + +- [ ] **Step 5: Run registry tests** + +Run: + +```bash +pytest tests/unit/registry/test_component_registry.py -q +``` + +Expected: PASS. + +--- + +### Task 2: Convert Builtins Registry To Explicit Registration + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/registry_core.py` +- Modify: `ergon_builtins/ergon_builtins/registry_data.py` +- Modify: `ergon_builtins/ergon_builtins/registry_local_models.py` +- Modify: `ergon_builtins/ergon_builtins/registry.py` +- Test: `tests/unit/registry/test_builtin_pairings.py` + +- [ ] **Step 1: Update builtin pairing tests to register into a fresh registry** + +Modify `tests/unit/registry/test_builtin_pairings.py` so tests no longer import dicts from `ergon_builtins.registry_core` or `ergon_builtins.registry`. Use a fresh `ComponentRegistry`: + +```python +"""Documented built-in benchmark pairings are explicit and registered.""" + +import pytest + +from ergon_core.api.registry import ComponentRegistry + + +CORE_PAIRINGS = [ + { + "benchmark": "minif2f", + "worker": "minif2f-react", + "evaluator": "minif2f-rubric", + "sandbox": "minif2f", + "extras": ("none",), + }, + { + "benchmark": "swebench-verified", + "worker": "swebench-react", + "evaluator": "swebench-rubric", + "sandbox": "swebench-verified", + "extras": ("ergon-builtins[data]",), + }, +] + +DATA_PAIRINGS = [ + { + "benchmark": "gdpeval", + "worker": "gdpeval-react", + "evaluator": "gdpeval-staged-rubric", + "sandbox": "gdpeval", + "extras": ("ergon-builtins[data]",), + }, + { + "benchmark": "researchrubrics", + "worker": "researchrubrics-researcher", + "evaluator": "researchrubrics-rubric", + "sandbox": "researchrubrics", + "extras": ("ergon-builtins[data]",), + }, + { + "benchmark": "researchrubrics-vanilla", + "worker": "researchrubrics-researcher", + "evaluator": "researchrubrics-rubric", + "sandbox": "researchrubrics-vanilla", + "extras": ("ergon-builtins[data]",), + }, +] + + +@pytest.mark.parametrize("pairing", CORE_PAIRINGS) +def test_core_pairings_reference_registered_slugs(pairing: dict[str, object]) -> None: + from ergon_builtins.registry_core import register_core_builtins + + registry = ComponentRegistry() + register_core_builtins(registry) + + _assert_pairing(pairing, registry) + + +@pytest.mark.parametrize("pairing", DATA_PAIRINGS) +def test_data_pairings_reference_registered_slugs(pairing: dict[str, object]) -> None: + pytest.importorskip("datasets", reason="ergon-builtins[data] not installed") + from ergon_builtins.registry import register_builtins + + registry = ComponentRegistry() + register_builtins(registry) + + _assert_pairing(pairing, registry) + + +def _assert_pairing(pairing: dict[str, object], registry: ComponentRegistry) -> None: + benchmark = pairing["benchmark"] + worker = pairing["worker"] + evaluator = pairing["evaluator"] + sandbox = pairing["sandbox"] + extras = pairing["extras"] + + assert benchmark in registry.benchmarks + assert worker in registry.workers + assert evaluator in registry.evaluators + assert sandbox in registry.sandbox_managers + assert isinstance(extras, tuple) + assert extras +``` + +- [ ] **Step 2: Run updated builtin pairing tests** + +Run: + +```bash +pytest tests/unit/registry/test_builtin_pairings.py -q +``` + +Expected: FAIL because the `register_*` functions do not exist. 
+ +- [ ] **Step 3: Replace `registry_core.py` dicts with `register_core_builtins`** + +Modify `ergon_builtins/ergon_builtins/registry_core.py` to keep imports but replace exported dicts with: + +```python +from ergon_core.api.registry import ComponentRegistry, registry + + +def register_core_builtins(target: ComponentRegistry = registry) -> None: + """Register builtins that have no optional dependency extras.""" + + target.register_worker("training-stub", TrainingStubWorker) + target.register_worker("minif2f-react", minif2f_react) + target.register_worker("swebench-react", swebench_react) + + target.register_benchmark(MiniF2FBenchmark) + target.register_benchmark(SweBenchVerifiedBenchmark) + + target.register_evaluator(StagedRubric) + target.register_evaluator(StagedRubric, slug="gdpeval-staged-rubric") + target.register_evaluator(MiniF2FRubric) + target.register_evaluator(SWEBenchRubric) + + target.register_sandbox_manager("gdpeval", GDPEvalSandboxManager) + target.register_sandbox_manager("minif2f", MiniF2FSandboxManager) + target.register_sandbox_manager("swebench-verified", SWEBenchSandboxManager) +``` + +Do not remove `SANDBOX_TEMPLATES` yet unless all uses are known. Leave it as a plain exported mapping: + +```python +SANDBOX_TEMPLATES: dict[str, Path] = { + "minif2f": Path(__file__).parent / "benchmarks/minif2f/sandbox", + "swebench-verified": Path(__file__).parent / "benchmarks/swebench_verified/sandbox", +} +``` + +- [ ] **Step 4: Replace `registry_data.py` dicts with `register_data_builtins`** + +Modify `ergon_builtins/ergon_builtins/registry_data.py`: + +```python +from ergon_core.api.registry import ComponentRegistry, registry + + +def register_data_builtins(target: ComponentRegistry = registry) -> None: + """Register builtins that require the [data] optional dependency group.""" + + target.register_benchmark(GDPEvalBenchmark) + target.register_benchmark(ResearchRubricsBenchmark) + target.register_benchmark(ResearchRubricsVanillaBenchmark) + + target.register_evaluator(ResearchRubricsRubric, slug="research-rubric") + target.register_evaluator(ResearchRubricsRubric) + + target.register_worker("gdpeval-react", gdpeval_react) + target.register_worker(ResearchRubricsResearcherWorker.type_slug, ResearchRubricsResearcherWorker) + target.register_worker( + ResearchRubricsWorkflowCliReActWorker.type_slug, + ResearchRubricsWorkflowCliReActWorker, + ) + + target.register_sandbox_manager("researchrubrics", ResearchRubricsSandboxManager) + target.register_sandbox_manager("researchrubrics-vanilla", ResearchRubricsSandboxManager) +``` + +If `GDPEvalBenchmark` requires a sandbox manager but the current data registry does not register one, decide during implementation whether to add: + +```python +target.register_sandbox_manager("gdpeval", GDPEvalSandboxManager) +``` + +only if `GDPEvalSandboxManager` can be imported from the data module without creating an optional dependency problem. Otherwise keep the current core registration for `"gdpeval"`. 
+
+- [ ] **Step 5: Convert top-level `ergon_builtins.registry` to an explicit registration function**
+
+Modify `ergon_builtins/ergon_builtins/registry.py`:
+
+```python
+"""Register built-in Ergon components into the core public registry."""
+
+import structlog
+
+from ergon_core.api.registry import ComponentRegistry, registry
+from ergon_builtins.registry_core import register_core_builtins
+
+log = structlog.get_logger()
+
+
+def register_builtins(target: ComponentRegistry = registry) -> None:
+    """Register builtins available in the current environment.
+
+    This is intentionally explicit: importing ``ergon_core`` does not import
+    builtins, and importing builtins does not mutate core unless startup calls
+    this function.
+    """
+
+    register_core_builtins(target)
+    _register_local_model_builtins()
+    _register_data_builtins(target)
+
+
+def _register_local_model_builtins() -> None:
+    try:
+        from ergon_builtins.registry_local_models import register_local_model_builtins
+    except ImportError:
+        log.info("ergon-builtins[local-models] not installed; local transformers inference unavailable")
+        return
+
+    register_local_model_builtins()
+
+
+def _register_data_builtins(target: ComponentRegistry) -> None:
+    try:
+        from ergon_builtins.registry_data import register_data_builtins
+    except ImportError:
+        log.info(
+            "ergon-builtins[data] not installed; gdpeval and researchrubrics benchmarks unavailable"
+        )
+        return
+
+    register_data_builtins(target)
+
+
+INSTALL_HINTS: dict[str, str] = {
+    "transformers": "pip install 'ergon-builtins[local-models]'",
+    "gdpeval": "pip install 'ergon-builtins[data]'",
+    "researchrubrics": "pip install 'ergon-builtins[data]'",
+    "research-rubric": "pip install 'ergon-builtins[data]'",
+}
+```
+
+- [ ] **Step 6: Convert local model registry**
+
+Modify `ergon_builtins/ergon_builtins/registry_local_models.py`:
+
+```python
+"""Components that require the [local-models] capability."""
+
+from ergon_builtins.models.resolution import register_model_backend
+from ergon_builtins.models.transformers_backend import resolve_transformers
+
+
+def register_local_model_builtins() -> None:
+    register_model_backend("transformers", resolve_transformers)
+```
+
+Keep core model backends registered where they are today. If `registry_core.py` currently owns `"vllm"`, `"openai"`, `"anthropic"`, `"google"`, `"openrouter"`, and `"openai-responses"`, move that into a helper in `ergon_builtins.registry_core` called by `register_core_builtins()`:
+
+```python
+def _register_core_model_backends() -> None:
+    register_model_backend("vllm", resolve_vllm)
+    register_model_backend("openai", resolve_cloud)
+    register_model_backend("anthropic", resolve_cloud)
+    register_model_backend("google", resolve_cloud)
+    register_model_backend("openrouter", resolve_openrouter)
+    register_model_backend("openai-responses", resolve_openrouter_responses)
+```
+
+- [ ] **Step 7: Run builtin registry tests**
+
+Run:
+
+```bash
+pytest tests/unit/registry/test_builtin_pairings.py tests/unit/registry/test_component_registry.py -q
+```
+
+Expected: PASS.
+
+---
+
+### Task 3: Add Startup Registration For Runtime Processes
+
+**Files:**
+- Modify: runtime startup location that is imported by CLI/API before defining/running experiments.
+- Likely modify: `ergon_core/ergon_core/core/rest_api/app.py`
+- Search and modify: CLI entrypoints under `ergon_cli/**`
+- Test: existing CLI/API tests that define experiments.
+ +- [ ] **Step 1: Locate CLI and startup entrypoints** + +Run: + +```bash +rg "experiment define|ERGON_STARTUP_PLUGINS|startup_plugins|register_builtins|def main|typer|click" ergon_cli ergon_core tests -n +``` + +Expected: identify the CLI initialization path and FastAPI lifespan path. + +- [ ] **Step 2: Add explicit builtin registration during API startup** + +In `ergon_core/ergon_core/core/rest_api/app.py`, import only the core registry at module or function scope. In the lifespan before sandbox event sink wiring, call builtins registration as a startup plugin decision: + +```python +from ergon_core.api.registry import registry + + +def _register_default_components() -> None: + from ergon_builtins.registry import register_builtins + + register_builtins(registry) +``` + +Then call `_register_default_components()` early in `lifespan`, before runtime services need sandbox managers. + +Important: this is acceptable at app startup because the application chooses to install builtins. Core library modules still must not import `ergon_builtins.registry`. + +- [ ] **Step 3: Update sandbox event sink wiring to use core registry** + +Replace: + +```python +from ergon_builtins.registry import SANDBOX_MANAGERS +... +for manager_cls in SANDBOX_MANAGERS.values(): + manager_cls.set_event_sink(sink) +logger.info("sandbox event sink wired on %d manager subclass(es)", 1 + len(SANDBOX_MANAGERS)) +``` + +with: + +```python +from ergon_core.api.registry import registry +... +for manager_cls in registry.sandbox_managers.values(): + manager_cls.set_event_sink(sink) +logger.info( + "sandbox event sink wired on %d manager subclass(es)", + 1 + len(registry.sandbox_managers), +) +``` + +- [ ] **Step 4: Add explicit builtin registration during CLI startup** + +In the CLI root entrypoint, add a small registration helper and call it before commands that define or run experiments: + +```python +from ergon_core.api.registry import registry + + +def register_default_components() -> None: + from ergon_builtins.registry import register_builtins + + register_builtins(registry) +``` + +Do not scatter this call through individual commands if there is a central CLI startup hook. If no central hook exists, call it at the top of experiment define/run command handlers and note the duplication for later cleanup. + +- [ ] **Step 5: Run fast CLI/API tests affected by startup** + +Run the narrowest available tests after locating them: + +```bash +pytest tests/unit tests/integration -q -k "experiment or registry or cli" +``` + +Expected: PASS or unrelated pre-existing failures documented before continuing. 
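+
+One property Task 3 relies on implicitly: because `_register` treats re-registration of the identical object as a no-op (Task 1), the API lifespan and CLI startup can both call `register_builtins()` in the same process without tripping the duplicate-slug check. A quick self-contained illustration:
+
+```python
+from ergon_core.api import Worker
+from ergon_core.api.registry import ComponentRegistry
+
+
+class DemoWorker(Worker):
+    type_slug = "demo-worker"
+
+
+target = ComponentRegistry()
+target.register_worker(DemoWorker.type_slug, DemoWorker)
+# Same slug, same object: idempotent rather than a duplicate error.
+target.register_worker(DemoWorker.type_slug, DemoWorker)
+assert target.require_worker("demo-worker") is DemoWorker
+```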
+ +--- + +### Task 4: Replace Core Imports Of Builtins Registry + +**Files:** +- Modify listed core files containing `from ergon_builtins.registry import ...` +- Test: add import-boundary test under `tests/unit/registry/test_core_registry_boundary.py` + +- [ ] **Step 1: Add boundary test that core does not import builtins registry** + +Create `tests/unit/registry/test_core_registry_boundary.py`: + +```python +from pathlib import Path + + +def test_ergon_core_does_not_import_builtins_registry() -> None: + root = Path("ergon_core/ergon_core") + offenders: list[str] = [] + + for path in root.rglob("*.py"): + text = path.read_text() + if "ergon_builtins.registry" in text: + offenders.append(str(path)) + + assert offenders == [] +``` + +- [ ] **Step 2: Run boundary test and verify it fails** + +Run: + +```bash +pytest tests/unit/registry/test_core_registry_boundary.py -q +``` + +Expected: FAIL listing the current core files that import `ergon_builtins.registry`. + +- [ ] **Step 3: Update worker execution lookup** + +Modify `ergon_core/ergon_core/core/application/jobs/worker_execute.py`: + +```python +from ergon_core.api.registry import registry +``` + +Inside `run_worker_execute_job`, remove: + +```python +from ergon_builtins.registry import BENCHMARKS, WORKERS +``` + +Replace worker lookup: + +```python +worker_cls = registry.workers.get(payload.worker_type) +``` + +Replace benchmark lookup: + +```python +benchmark_cls = registry.benchmarks.get(payload.benchmark_type) +``` + +Keep existing `RegistryLookupError` behavior for workers by checking `None` as today. + +- [ ] **Step 4: Update evaluation job lookup** + +Modify `ergon_core/ergon_core/core/application/jobs/evaluate_task_run.py`: + +```python +from ergon_core.api.registry import registry +``` + +Remove the builtins import inside `run_evaluate_task_run_job`. Replace: + +```python +evaluator_cls = EVALUATORS.get(evaluator_type) +manager_cls = SANDBOX_MANAGERS.get(benchmark_type, DefaultSandboxManager) +benchmark_cls = BENCHMARKS.get(benchmark_type) if benchmark_type is not None else None +``` + +with: + +```python +evaluator_cls = registry.evaluators.get(evaluator_type) +manager_cls = ( + registry.sandbox_managers.get(benchmark_type, DefaultSandboxManager) + if benchmark_type is not None + else DefaultSandboxManager +) +benchmark_cls = registry.benchmarks.get(benchmark_type) if benchmark_type is not None else None +``` + +- [ ] **Step 5: Update sandbox and output jobs** + +Modify `ergon_core/ergon_core/core/application/jobs/persist_outputs.py` and `ergon_core/ergon_core/core/application/jobs/sandbox_setup.py`: + +```python +from ergon_core.api.registry import registry +``` + +Replace: + +```python +manager_cls = SANDBOX_MANAGERS.get(..., DefaultSandboxManager) +``` + +with: + +```python +manager_cls = registry.sandbox_managers.get(..., DefaultSandboxManager) +``` + +- [ ] **Step 6: Update experiment launch and define services** + +Modify `ergon_core/ergon_core/core/application/experiments/launch.py`: + +```python +from ergon_core.api.registry import registry +``` + +Replace evaluator and benchmark lookups with: + +```python +evaluator_cls = registry.require_evaluator(evaluator_slug) +source = registry.require_benchmark(benchmark_slug)() +``` + +Modify `ergon_core/ergon_core/core/application/experiments/service.py` so `_benchmark_cls` caches `registry.benchmarks`, not builtins dicts: + +```python +from ergon_core.api.registry import registry +... 
+if self._benchmarks is None: + self._benchmarks = registry.benchmarks +return self._benchmarks[benchmark_slug] +``` + +- [ ] **Step 7: Update workflow/task mutation validation** + +Modify `ergon_core/ergon_core/core/application/workflows/service.py`, `ergon_core/ergon_core/core/application/tasks/management.py`, and `ergon_core/ergon_core/core/domain/experiments/worker_spec.py`: + +```python +from ergon_core.api.registry import registry +``` + +Replace membership checks: + +```python +if slug not in WORKERS: +``` + +with: + +```python +if slug not in registry.workers: +``` + +For error messages listing known workers, use: + +```python +known = ", ".join(sorted(registry.workers)) +``` + +- [ ] **Step 8: Run boundary and affected unit tests** + +Run: + +```bash +pytest tests/unit/registry/test_core_registry_boundary.py tests/unit/registry/test_component_registry.py tests/unit/registry/test_builtin_pairings.py -q +``` + +Expected: PASS. + +--- + +### Task 5: Move Smoke Test Helpers Out Of Core + +**Files:** +- Move from: `ergon_core/ergon_core/test_support/smoke_fixtures/**` +- Move to: `tests/fixtures/smoke_components/**` +- Modify: `tests/e2e/conftest.py` +- Modify: startup plugin referenced by E2E environment +- Test: E2E smoke tests and import-boundary tests. + +- [ ] **Step 1: Add a test proving smoke fixtures do not live under core** + +Create or extend `tests/unit/registry/test_core_registry_boundary.py`: + +```python +def test_core_package_has_no_smoke_fixture_registration_package() -> None: + assert not Path("ergon_core/ergon_core/test_support/smoke_fixtures").exists() +``` + +Expected initially: FAIL. + +- [ ] **Step 2: Create tests fixture package** + +Create: + +```text +tests/fixtures/smoke_components/ +tests/fixtures/smoke_components/__init__.py +tests/fixtures/smoke_components/benchmarks.py +tests/fixtures/smoke_components/sandbox.py +tests/fixtures/smoke_components/criteria/ +tests/fixtures/smoke_components/workers/ +``` + +Move files from `ergon_core/ergon_core/test_support/smoke_fixtures/**` into the new package, preserving internal folder shape where possible. 
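+
+For orientation while moving the module: the startup-plugin hook is assumed here to resolve comma-separated `module:function` entries from `ERGON_STARTUP_PLUGINS`, roughly like this sketch (illustrative; the real loader lives in core startup code and may differ):
+
+```python
+import importlib
+import os
+
+
+def load_startup_plugins() -> None:
+    # Assumes comma-separated "pkg.module:function" entries.
+    spec = os.environ.get("ERGON_STARTUP_PLUGINS", "")
+    for entry in (part.strip() for part in spec.split(",")):
+        if not entry:
+            continue
+        module_name, _, func_name = entry.partition(":")
+        hook = getattr(importlib.import_module(module_name), func_name)
+        hook()
+```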
+ +- [ ] **Step 3: Update imports in moved files** + +Search: + +```bash +rg "ergon_core\\.test_support\\.smoke_fixtures|test_support\\.smoke_fixtures" tests/fixtures/smoke_components tests ergon_core -n +``` + +Replace imports such as: + +```python +from ergon_core.test_support.smoke_fixtures.workers.swebench_smoke import SweBenchSmokeWorker +``` + +with: + +```python +from tests.fixtures.smoke_components.workers.swebench_smoke import SweBenchSmokeWorker +``` + +- [ ] **Step 4: Replace smoke registration function** + +In `tests/fixtures/smoke_components/__init__.py`, define: + +```python +"""Test-only smoke component registration.""" + +import os + +from ergon_core.api.registry import ComponentRegistry, registry +from tests.fixtures.smoke_components.benchmarks import ( + MiniF2FSmokeBenchmark, + ResearchRubricsSmokeBenchmark, + SweBenchSmokeBenchmark, +) +from tests.fixtures.smoke_components.criteria.smoke_rubrics import ( + MiniF2FSmokeRubric, + ResearchRubricsSmokeRubric, + SweBenchSmokeRubric, +) +from tests.fixtures.smoke_components.criteria.timing import SmokePostRootTimingRubric +from tests.fixtures.smoke_components.sandbox import SmokeSandboxManager +from tests.fixtures.smoke_components.workers.minif2f_smoke import ( + MiniF2FFailingLeafWorker, + MiniF2FRecursiveSmokeWorker, + MiniF2FSadPathSmokeWorker, + MiniF2FSmokeLeafWorker, + MiniF2FSmokeWorker, +) +from tests.fixtures.smoke_components.workers.researchrubrics_smoke import ( + ResearchRubricsFailingLeafWorker, + ResearchRubricsRecursiveSmokeWorker, + ResearchRubricsSadPathSmokeWorker, + ResearchRubricsSmokeLeafWorker, + ResearchRubricsSmokeWorker, +) +from tests.fixtures.smoke_components.workers.swebench_smoke import ( + SweBenchFailingLeafWorker, + SweBenchRecursiveSmokeWorker, + SweBenchSadPathSmokeWorker, + SweBenchSmokeLeafWorker, + SweBenchSmokeWorker, +) + + +def register_smoke_components(target: ComponentRegistry = registry) -> None: + """Register test-only smoke components into the supplied registry.""" + + if os.environ.get("ENABLE_TEST_HARNESS") == "1": + target.register_benchmark(ResearchRubricsSmokeBenchmark) + target.register_benchmark(MiniF2FSmokeBenchmark) + target.register_benchmark(SweBenchSmokeBenchmark) + target.register_sandbox_manager(ResearchRubricsSmokeBenchmark.type_slug, SmokeSandboxManager) + target.register_sandbox_manager(MiniF2FSmokeBenchmark.type_slug, SmokeSandboxManager) + target.register_sandbox_manager(SweBenchSmokeBenchmark.type_slug, SmokeSandboxManager) + + target.register_worker(ResearchRubricsSmokeWorker.type_slug, ResearchRubricsSmokeWorker) + target.register_worker(ResearchRubricsSmokeLeafWorker.type_slug, ResearchRubricsSmokeLeafWorker) + target.register_worker( + ResearchRubricsRecursiveSmokeWorker.type_slug, + ResearchRubricsRecursiveSmokeWorker, + ) + target.register_evaluator(ResearchRubricsSmokeRubric) + target.register_evaluator(SmokePostRootTimingRubric) + target.register_worker(ResearchRubricsSadPathSmokeWorker.type_slug, ResearchRubricsSadPathSmokeWorker) + target.register_worker(ResearchRubricsFailingLeafWorker.type_slug, ResearchRubricsFailingLeafWorker) + + target.register_worker(MiniF2FSmokeWorker.type_slug, MiniF2FSmokeWorker) + target.register_worker(MiniF2FSmokeLeafWorker.type_slug, MiniF2FSmokeLeafWorker) + target.register_worker(MiniF2FRecursiveSmokeWorker.type_slug, MiniF2FRecursiveSmokeWorker) + target.register_worker(MiniF2FSadPathSmokeWorker.type_slug, MiniF2FSadPathSmokeWorker) + target.register_worker(MiniF2FFailingLeafWorker.type_slug, MiniF2FFailingLeafWorker) 
+ target.register_evaluator(MiniF2FSmokeRubric) + + target.register_worker(SweBenchSmokeWorker.type_slug, SweBenchSmokeWorker) + target.register_worker(SweBenchSmokeLeafWorker.type_slug, SweBenchSmokeLeafWorker) + target.register_worker(SweBenchRecursiveSmokeWorker.type_slug, SweBenchRecursiveSmokeWorker) + target.register_worker(SweBenchSadPathSmokeWorker.type_slug, SweBenchSadPathSmokeWorker) + target.register_worker(SweBenchFailingLeafWorker.type_slug, SweBenchFailingLeafWorker) + target.register_evaluator(SweBenchSmokeRubric) +``` + +- [ ] **Step 5: Update E2E startup plugin** + +Locate the startup plugin currently importing `ergon_core.test_support.smoke_fixtures`. Replace it with: + +```python +from tests.fixtures.smoke_components import register_smoke_components + + +def register() -> None: + register_smoke_components() +``` + +If the startup plugin loader expects a different function name, preserve that function name and call `register_smoke_components()` inside it. + +- [ ] **Step 6: Remove old core smoke fixture package** + +Delete `ergon_core/ergon_core/test_support/smoke_fixtures/**` only after all imports have been updated. + +- [ ] **Step 7: Run smoke fixture import and boundary tests** + +Run: + +```bash +pytest tests/unit/registry/test_core_registry_boundary.py -q +pytest tests/e2e/test_swebench_smoke.py --collect-only -q +``` + +Expected: PASS. + +--- + +### Task 6: Update E2E And Integration Tests To Use Explicit Registry Setup + +**Files:** +- Modify: `tests/e2e/conftest.py` +- Modify: E2E startup plugin module(s) +- Modify: tests currently using `ergon_builtins.registry` dict mutation +- Test: E2E smoke suite. + +- [ ] **Step 1: Search for remaining dict mutation against old registries** + +Run: + +```bash +rg "BENCHMARKS|WORKERS|EVALUATORS|SANDBOX_MANAGERS|ergon_builtins\\.registry|register_smoke_fixtures|smoke_fixtures" tests ergon_core ergon_builtins -n +``` + +Expected: remaining references are either in `ergon_builtins` registration implementation, tests asserting pairings via `ComponentRegistry`, or places to update. + +- [ ] **Step 2: Update tests that temporarily patch registries** + +Replace code like: + +```python +from ergon_builtins.registry import BENCHMARKS, SANDBOX_MANAGERS + +original_benchmarks = {slug: BENCHMARKS[slug] for slug in slugs} +BENCHMARKS[slug] = SmokeBenchmark +``` + +with fresh registry injection if the code under test accepts a registry, or explicit registration into global `registry` if the code under test is runtime-like: + +```python +from ergon_core.api.registry import registry + +registry.register_benchmark(SmokeBenchmark) +registry.register_sandbox_manager(SmokeBenchmark.type_slug, SmokeSandboxManager) +``` + +If a test mutates global `registry`, restore state in `finally`: + +```python +original_benchmarks = dict(registry.benchmarks) +original_sandbox_managers = dict(registry.sandbox_managers) +try: + registry.register_benchmark(SmokeBenchmark) + registry.register_sandbox_manager(SmokeBenchmark.type_slug, SmokeSandboxManager) + ... +finally: + registry.benchmarks.clear() + registry.benchmarks.update(original_benchmarks) + registry.sandbox_managers.clear() + registry.sandbox_managers.update(original_sandbox_managers) +``` + +- [ ] **Step 3: Keep host-side E2E black-box behavior** + +`tests/e2e/conftest.py` currently documents that smoke fixture registration lives in the API container via `ERGON_STARTUP_PLUGINS`. Keep that mental model. 
Update the note to reference `tests.fixtures.smoke_components.register_smoke_components`, not `ergon_core.test_support`. + +- [ ] **Step 4: Run E2E smoke collect and selected tests** + +Run: + +```bash +pytest tests/e2e/test_swebench_smoke.py --collect-only -q +``` + +Then, if the E2E stack is running: + +```bash +pytest tests/e2e/test_swebench_smoke.py -q +``` + +Expected: collect passes. Runtime E2E passes when required infrastructure is available. + +--- + +### Task 7: Improve Experiment Validation Error Messages + +**Files:** +- Modify: `ergon_core/ergon_core/core/domain/experiments/worker_spec.py` +- Modify: `ergon_core/ergon_core/core/domain/experiments/validation.py` +- Test: existing or new experiment validation unit tests. + +- [ ] **Step 1: Add tests for clear missing component errors** + +Create or update `tests/unit/experiments/test_experiment_validation.py` with tests covering: + +```python +import pytest + +from ergon_core.core.domain.experiments import WorkerSpec + + +def test_worker_spec_unknown_worker_lists_registered_workers() -> None: + spec = WorkerSpec(worker_slug="missing-worker", name="primary", model="stub:constant") + + with pytest.raises(ValueError, match="Unknown worker slug 'missing-worker'"): + spec.validate_spec() +``` + +If the registry is process-global and other tests register workers, isolate this test by snapshotting/restoring `registry.workers`. + +- [ ] **Step 2: Update `WorkerSpec.validate_spec`** + +Use `ergon_core.api.registry.registry`: + +```python +from ergon_core.api.registry import registry + + +def validate_spec(self) -> None: + """Check that ``worker_slug`` refers to a known registry entry.""" + if self.worker_slug not in registry.workers: + known = ", ".join(sorted(registry.workers)) or "" + raise ValueError( + f"Unknown worker slug {self.worker_slug!r}; registered workers: {known}" + ) + if not self.name: + raise ValueError("WorkerSpec.name must be a non-empty string") + if not self.model: + raise ValueError("WorkerSpec.model must be a non-empty string") +``` + +- [ ] **Step 3: Add benchmark pairing metadata only if needed** + +Do not add a large new abstraction in this refactor unless tests show a concrete gap. If student-facing validation needs “benchmark X expects worker Y,” add a small optional method to benchmark classes later: + +```python +def recommended_worker_slugs(self) -> tuple[str, ...]: + return () +``` + +For this plan, keep pairing validation in tests and docs unless an existing runtime path requires it. + +- [ ] **Step 4: Run experiment validation tests** + +Run: + +```bash +pytest tests/unit -q -k "validation or WorkerSpec or registry" +``` + +Expected: PASS. + +--- + +### Task 8: Final Search, Lint, And Regression Verification + +**Files:** +- No planned source files beyond cleanup. + +- [ ] **Step 1: Verify no core imports of builtins registry remain** + +Run: + +```bash +rg "ergon_builtins\\.registry" ergon_core/ergon_core -n +``` + +Expected: no matches. + +- [ ] **Step 2: Verify old smoke fixture location is gone** + +Run: + +```bash +test ! -d ergon_core/ergon_core/test_support/smoke_fixtures +``` + +Expected: exit code 0. + +- [ ] **Step 3: Verify remaining registry references are intentional** + +Run: + +```bash +rg "BENCHMARKS|WORKERS|EVALUATORS|SANDBOX_MANAGERS" ergon_core ergon_builtins tests -n +``` + +Expected: no core runtime imports from `ergon_builtins.registry`; remaining uppercase dict names should either be deleted or constrained to docs/backwards compatibility tests. 
+ +- [ ] **Step 4: Run focused tests** + +Run: + +```bash +pytest tests/unit/registry -q +pytest tests/unit -q -k "experiment or workflow or task or sandbox or registry" +``` + +Expected: PASS. + +- [ ] **Step 5: Run E2E collect** + +Run: + +```bash +pytest tests/e2e --collect-only -q +``` + +Expected: PASS. + +- [ ] **Step 6: Run full available test suite** + +Run: + +```bash +pytest tests/unit -q +``` + +Expected: PASS. If E2E infrastructure is available, also run: + +```bash +pytest tests/e2e -q +``` + +Expected: PASS or documented infrastructure failures unrelated to this refactor. + +--- + +## Self-Review + +- Spec coverage: The plan covers core registry creation, builtins update, removal of `BENCHMARKS`/`WORKERS`/`EVALUATORS`/`SANDBOX_MANAGERS` imports from core, moving smoke test helpers out of core, and updating integration/E2E registration flow. +- Placeholder scan: No unfinished placeholder markers remain. The only conditional areas are explicitly bounded implementation checks where the current codebase must be searched first, such as CLI entrypoint location and optional data dependency import constraints. +- Type consistency: `ComponentRegistry`, `WorkerFactory`, `registry`, and `register_*` function names are used consistently across tasks. diff --git a/docs/superpowers/plans/2026-04-29-finish-builtins-cli-e2e-refactor.md b/docs/superpowers/plans/2026-04-29-finish-builtins-cli-e2e-refactor.md new file mode 100644 index 00000000..78a06932 --- /dev/null +++ b/docs/superpowers/plans/2026-04-29-finish-builtins-cli-e2e-refactor.md @@ -0,0 +1,841 @@ +# Finish Built-ins, CLI, And E2E Refactor Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Finish the Ergon built-ins, CLI, and e2e refactor after the core public API and test-support facade have stabilized, while avoiding private core internals that may continue moving. + +**Architecture:** Treat `ergon_core.api`, core service/facade DTOs, `ergon_core.test_support`, HTTP `/api/test/*`, and application read models as the stable boundary. Production built-ins own benchmark-specific workers/rubrics/sandboxes; CLI commands validate explicit slugs and call core facades; e2e tests assert black-box runtime behavior and use test-support constants rather than private repository methods. + +**Tech Stack:** Python, pytest, FastAPI test harness endpoints, Playwright, Inngest, E2B, `ergon_core.test_support`, `ergon_builtins.registry`, `ergon_cli`. + +--- + +## Current Working Assumptions + +- Core runtime behavior is stable: the canonical smoke topology, resource counts, task states, communication threads, and evaluation outcomes are still expected to match existing e2e assertions. +- Core internal layout has changed substantially. Tests should not import private repository modules or persistence models unless there is no stable public/test-support read helper yet. +- `ergon_core.test_support` is stable and may be imported by unit/integration/e2e host-side test code. +- The API process, not the host e2e process, should register smoke fixtures via startup plugin/environment. +- Built-ins and CLI work may proceed as long as it stays on public API/service boundaries and avoids core repository implementation files. 
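+
+For orientation, `ERGON_STARTUP_PLUGINS` is assumed to be a comma-separated list of `module:function` references that the API process imports and calls at boot. The loader below is an illustrative sketch of that contract, not the actual core implementation:
+
+```python
+import importlib
+import os
+
+
+def load_startup_plugins() -> None:
+    """Import and invoke each ``module:function`` ref from the environment."""
+    spec = os.environ.get("ERGON_STARTUP_PLUGINS", "")
+    for ref in filter(None, (part.strip() for part in spec.split(","))):
+        module_path, _, func_name = ref.partition(":")
+        module = importlib.import_module(module_path)
+        getattr(module, func_name)()
+```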
+ +## E2E Behavior That Should Remain True + +These expected values are derived from stable smoke fixture constants and should remain hard assertions unless `ergon_core.test_support.smoke_fixtures` changes intentionally. + +```text +Happy path: +- 12 total tasks: 1 root + 9 direct subtasks + 2 nested subtasks +- 10 leaf tasks +- direct level-1 slugs match EXPECTED_SUBTASK_SLUGS +- nested level-2 slugs match NESTED_LINE_SLUGS +- l_2 is non-leaf; l_2_a and l_2_b are children of l_2 +- all nodes complete +- 20 task artifact resources: 10 benchmark artifacts + 10 probe_*.json +- no worker_output resources; final assistant messages stay on executions +- 26 context events: parent 3 + recursive 3 + 10 leaves x 2 +- 2 root evaluations, both score 1.0, created after root execution completion +- final score is 1.0 +- one smoke-completion thread with 11 ordered messages + +Sad path: +- l_2 fails +- l_3 is blocked, never starts, and has no execution attempts +- root does not complete +- independent leaves complete +- exactly one partial_*.md artifact persists from l_2 +- at least one pre-failure partial wc WAL/probe entry exists +- smoke-completion thread has 7 messages +- l_2 and l_3 do not send completion messages +- final score is None or 0.0 +``` + +Benchmark-specific artifact assertions should also remain: + +```text +MiniF2F: +- 10 proof_*.lean resources +- each proof contains "theorem smoke_trivial" and ":=" + +SWE-Bench: +- 10 patch_*.py resources +- each patch parses as Python and defines add() + +ResearchRubrics: +- report/probe artifacts and dashboard-visible resource panels match the shared smoke assertions +``` + +## File Responsibility Map + +Built-ins: + +- `ergon_builtins/ergon_builtins/registry.py`: merged public registry surface. +- `ergon_builtins/ergon_builtins/registry_core.py`: always-importable benchmarks/workers/evaluators/sandboxes/model backends. +- `ergon_builtins/ergon_builtins/registry_data.py`: `[data]` benchmark registrations. +- `ergon_builtins/ergon_builtins/benchmarks/*/worker_factory.py`: benchmark-owned worker factories or benchmark-owned re-export surfaces. +- `ergon_builtins/ergon_builtins/shared/`: generic worker, criteria, model, prompt import surfaces. + +CLI: + +- `ergon_cli/ergon_cli/main.py`: parser contract only. +- `ergon_cli/ergon_cli/commands/experiment.py`: thin command handler for `experiment define/run/show/list`. +- `ergon_cli/ergon_cli/commands/benchmark.py`: `list`, `setup`, and `run` wrapper behavior. +- `ergon_cli/ergon_cli/discovery/__init__.py`: registry list helpers. +- Future target: `ergon_cli/ergon_cli/services/*_facade.py` if command handlers remain too stateful. + +E2E: + +- `tests/e2e/_submit.py`: black-box cohort submission client for `/api/test/write/cohort`. +- `tests/e2e/_read_contracts.py`: stable read-model wrapper for run snapshots. +- `tests/e2e/_asserts.py`: behavior assertions; should import test-support constants and stable read helpers. +- `tests/e2e/test_{researchrubrics,minif2f,swebench}_smoke.py`: per-benchmark e2e drivers. +- `ergon-dashboard/tests/e2e/*.smoke.spec.ts`: dashboard assertions. 
+ +Stable core/test-support surfaces: + +- `ergon_core.api` +- `ergon_core.test_support` +- `ergon_core.core.application.read_models.*`, if accepted as the application-level read facade +- `/api/test/*` HTTP endpoints + +Private core surfaces to avoid in new e2e code: + +- `ergon_core.core.persistence.*` models and queries +- `ergon_core.core.runtime.tasks.repository` +- `ergon_core.core.runtime.evaluation.persistence` +- Inngest child payload modules +- repository method names or table-specific access patterns + +## Task 1: Freeze And Document The Stable E2E Boundary + +**Files:** +- Modify: `docs/superpowers/plans/2026-04-28-ergon-e2e-refactor-test-plan.md` +- Test: `tests/unit/architecture/test_public_api_boundaries.py` + +- [ ] **Step 1: Add a “stable e2e boundary” section to the e2e plan** + +Add this section near the existing `Fixture Residency Rules` section: + +```markdown +## Stable E2E Boundary After Core Layout Refactor + +Core behavior is stable, but private repository and persistence modules may move. +E2E code should use only: + +- HTTP endpoints under `/api/test/*` +- `ergon_core.test_support` +- public core API objects from `ergon_core.api` +- application read-model facades, not private repository methods + +The existing smoke behavior assertions remain valid: + +- happy runs complete the 12-node graph +- sad runs fail `l_2` and block `l_3` +- happy runs produce 20 task resources and 26 context events +- happy root produces two score-1.0 evaluations +- sad runs produce one partial artifact and seven completion messages +``` + +- [ ] **Step 2: Add or update a boundary test** + +Add/extend a test in `tests/unit/architecture/test_public_api_boundaries.py`: + +```python +from pathlib import Path + + +def test_e2e_tests_do_not_import_private_core_repositories() -> None: + e2e_dir = Path("tests/e2e") + forbidden = ( + "ergon_core.core.persistence.", + "ergon_core.core.runtime.tasks.repository", + "ergon_core.core.runtime.evaluation.persistence", + "ergon_core.core.runtime.inngest.", + ) + offenders: list[tuple[str, str]] = [] + for path in e2e_dir.rglob("*.py"): + text = path.read_text() + for needle in forbidden: + if needle in text: + offenders.append((str(path), needle)) + assert not offenders +``` + +- [ ] **Step 3: Run the boundary test and confirm failure before cleanup** + +Run: + +```bash +uv run pytest tests/unit/architecture/test_public_api_boundaries.py::test_e2e_tests_do_not_import_private_core_repositories -q +``` + +Expected before cleanup: fail with current `tests/e2e/_asserts.py` private persistence imports. 
+ +## Task 2: Update E2E Submission To Explicit Runtime Choices + +**Files:** +- Modify: `tests/e2e/_submit.py` +- Modify: `tests/e2e/test_researchrubrics_smoke.py` +- Modify: `tests/e2e/test_minif2f_smoke.py` +- Modify: `tests/e2e/test_swebench_smoke.py` +- Test: `tests/unit/smoke_base/test_e2e_smoke_driver_pairs.py` + +- [ ] **Step 1: Add a unit test for explicit e2e submission payloads** + +Create or update `tests/unit/smoke_base/test_e2e_smoke_driver_pairs.py`: + +```python +from tests.e2e._submit import build_cohort_payload + + +def test_build_cohort_payload_includes_explicit_runtime_choices() -> None: + payload = build_cohort_payload( + benchmark_slug="minif2f", + slots=[("minif2f-smoke-worker", "minif2f-smoke-criterion")], + cohort_key="ci-smoke-minif2f", + sandbox_slug="minif2f", + dependency_extras=("none",), + model="openai:gpt-4o", + ) + + assert payload["benchmark_slug"] == "minif2f" + assert payload["sandbox_slug"] == "minif2f" + assert payload["dependency_extras"] == ["none"] + assert payload["model"] == "openai:gpt-4o" + assert payload["slots"] == [ + { + "worker_slug": "minif2f-smoke-worker", + "evaluator_slug": "minif2f-smoke-criterion", + } + ] +``` + +- [ ] **Step 2: Implement `build_cohort_payload()`** + +In `tests/e2e/_submit.py`, add: + +```python +def build_cohort_payload( + *, + benchmark_slug: str, + slots: list[tuple[str, str]], + cohort_key: str, + sandbox_slug: str, + dependency_extras: tuple[str, ...], + model: str = "openai:gpt-4o", +) -> dict: + return { + "benchmark_slug": benchmark_slug, + "slots": [ + {"worker_slug": worker, "evaluator_slug": evaluator} + for worker, evaluator in slots + ], + "cohort_key": cohort_key, + "sandbox_slug": sandbox_slug, + "dependency_extras": list(dependency_extras), + "model": model, + } +``` + +- [ ] **Step 3: Route `submit_cohort()` through the payload builder** + +Change `submit_cohort()` signature to accept explicit fields: + +```python +async def submit_cohort( + *, + benchmark_slug: str, + slots: list[tuple[str, str]], + cohort_key: str, + sandbox_slug: str, + dependency_extras: tuple[str, ...], + model: str = "openai:gpt-4o", + timeout: int = 300, +) -> list[UUID]: + payload = build_cohort_payload( + benchmark_slug=benchmark_slug, + slots=slots, + cohort_key=cohort_key, + sandbox_slug=sandbox_slug, + dependency_extras=dependency_extras, + model=model, + ) + async with httpx.AsyncClient(base_url=_api_base(), timeout=30.0) as client: + response = await client.post("/api/test/write/cohort", json=payload) + ... 
+``` + +- [ ] **Step 4: Update each e2e driver call** + +For `tests/e2e/test_minif2f_smoke.py`: + +```python +run_ids = await submit_cohort( + benchmark_slug=ENV, + slots=[(worker, criterion) for _, worker, criterion in smoke_slots], + cohort_key=cohort_key, + sandbox_slug=ENV, + dependency_extras=("none",), + timeout=PER_RUN_TIMEOUT, +) +``` + +For `tests/e2e/test_swebench_smoke.py`: + +```python +run_ids = await submit_cohort( + benchmark_slug=ENV, + slots=[(worker, criterion) for _, worker, criterion in smoke_slots], + cohort_key=cohort_key, + sandbox_slug=ENV, + dependency_extras=("none",), + timeout=PER_RUN_TIMEOUT, +) +``` + +For `tests/e2e/test_researchrubrics_smoke.py`: + +```python +run_ids = await submit_cohort( + benchmark_slug=ENV, + slots=[(worker, criterion) for _, worker, criterion in smoke_slots], + cohort_key=cohort_key, + sandbox_slug=ENV, + dependency_extras=("none",), + timeout=PER_RUN_TIMEOUT, +) +``` + +Smoke fixtures replace production benchmark loaders, so e2e smoke should use `("none",)` unless the API harness explicitly requires package extras to test onboarding messaging. + +- [ ] **Step 5: Run unit payload test** + +Run: + +```bash +uv run pytest tests/unit/smoke_base/test_e2e_smoke_driver_pairs.py -q +``` + +Expected: pass. + +## Task 3: Replace Private E2E Reads With Test-Support Or Application Read Models + +**Files:** +- Modify: `tests/e2e/_asserts.py` +- Modify: `tests/e2e/_read_contracts.py` +- Optional create: `ergon_core/ergon_core/test_support/e2e_read_helpers.py` +- Test: `tests/unit/smoke_base/test_e2e_read_helpers.py` + +- [ ] **Step 1: Inventory direct private imports in `_asserts.py`** + +Search: + +```bash +rg "ergon_core.core.persistence|sqlmodel|select\\(" tests/e2e/_asserts.py +``` + +Expected current private access areas: + +- graph node rows for temporal ordering +- `RunResource` rows for blob/artifact assertions +- `RunTaskEvaluation` rows for evaluation timestamp assertions +- sandbox WAL/event rows + +- [ ] **Step 2: Keep `require_run_snapshot()` as the primary read path** + +`tests/e2e/_read_contracts.py` may keep: + +```python +from ergon_core.core.application.read_models.models import RunSnapshotDto +from ergon_core.core.application.read_models.runs import RunReadService +``` + +Do not import private repository classes in e2e drivers. If `RunReadService` moves, fix this wrapper only. 
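+
+For orientation, a minimal shape for the wrapper; the `RunReadService` accessor name is an assumption, so adapt it to the real read facade:
+
+```python
+from uuid import UUID
+
+from ergon_core.core.application.read_models.models import RunSnapshotDto
+from ergon_core.core.application.read_models.runs import RunReadService
+
+
+def require_run_snapshot(run_id: UUID) -> RunSnapshotDto:
+    # Hypothetical accessor name; the point is that e2e code touches only
+    # this wrapper, never the repositories behind the read service.
+    snapshot = RunReadService().get_run_snapshot(run_id)
+    if snapshot is None:
+        raise AssertionError(f"run snapshot missing for run {run_id}")
+    return snapshot
+```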
+ +- [ ] **Step 3: Add test-support helpers only for data not exposed in snapshots** + +If WAL/resource byte paths/evaluation timestamps are not exposed through `RunSnapshotDto`, create `ergon_core/ergon_core/test_support/e2e_read_helpers.py`: + +```python +"""Stable test-support reads for e2e assertions.""" + +from pathlib import Path +from uuid import UUID + +from ergon_core.core.persistence.shared.db import get_session +from ergon_core.core.persistence.telemetry.models import ( + RunResource, + RunTaskEvaluation, + RunTaskExecution, + SandboxCommandWalEntry, + SandboxEvent, +) +from sqlmodel import select + + +def list_run_resources(run_id: UUID) -> list[RunResource]: + with get_session() as session: + return list(session.exec(select(RunResource).where(RunResource.run_id == run_id)).all()) + + +def read_resource_bytes(resource: RunResource) -> bytes: + return Path(resource.file_path).read_bytes() + + +def list_sandbox_command_wal(run_id: UUID) -> list[SandboxCommandWalEntry]: + with get_session() as session: + return list( + session.exec( + select(SandboxCommandWalEntry).where(SandboxCommandWalEntry.run_id == run_id), + ).all() + ) + + +def list_sandbox_events(run_id: UUID) -> list[SandboxEvent]: + with get_session() as session: + return list(session.exec(select(SandboxEvent).where(SandboxEvent.run_id == run_id)).all()) + + +def list_root_evaluation_rows(run_id: UUID) -> tuple[RunTaskExecution | None, list[RunTaskEvaluation]]: + # Implementation may use the current core layout internally. + # E2E tests should import this function, not the private models directly. + ... +``` + +If the core agent has already created stable equivalents under `ergon_core.test_support`, use those instead of adding this file. + +- [ ] **Step 4: Move `_asserts.py` imports to stable helper functions** + +Change `tests/e2e/_asserts.py` so private persistence imports are replaced by: + +```python +from ergon_core.test_support.e2e_read_helpers import ( + list_root_evaluation_rows, + list_run_resources, + list_sandbox_command_wal, + list_sandbox_events, + read_resource_bytes, +) +``` + +Keep these direct test-support imports: + +```python +from ergon_core.test_support.smoke_fixtures.smoke_base.constants import EXPECTED_SUBTASK_SLUGS +from ergon_core.test_support.smoke_fixtures.smoke_base.leaf_base import BaseSmokeLeafWorker +from ergon_core.test_support.smoke_fixtures.smoke_base.recursive import ( + NESTED_LINE_SLUGS, + RecursiveSmokeWorkerBase, +) +from ergon_core.test_support.smoke_fixtures.smoke_base.worker_base import SmokeWorkerBase +``` + +- [ ] **Step 5: Re-run the boundary test** + +Run: + +```bash +uv run pytest tests/unit/architecture/test_public_api_boundaries.py::test_e2e_tests_do_not_import_private_core_repositories -q +``` + +Expected after cleanup: pass. 
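+
+As a usage illustration, a happy-path assertion block written only against the stable surfaces (counts come from the smoke behavior contract above; `run_id` is a completed happy run):
+
+```python
+from ergon_core.test_support.e2e_read_helpers import list_run_resources, read_resource_bytes
+from tests.e2e._read_contracts import require_run_snapshot
+
+snapshot = require_run_snapshot(run_id)
+assert snapshot.total_tasks == 12
+assert snapshot.total_leaf_tasks == 10
+
+resources = list_run_resources(run_id)
+assert len(resources) == 20
+
+# 10 probe_*.json blobs sit alongside the 10 benchmark artifacts.
+probes = [read_resource_bytes(r) for r in resources if "probe_" in r.file_path]
+assert len(probes) == 10
+```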
+ +## Task 4: Finish Built-ins Registry And Factory Contracts + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/registry_core.py` +- Modify: `ergon_builtins/ergon_builtins/registry_data.py` +- Modify/create: `ergon_builtins/ergon_builtins/benchmarks/gdpeval/worker_factory.py` +- Modify/create: `ergon_builtins/ergon_builtins/benchmarks/researchrubrics/worker_factory.py` +- Modify: `tests/unit/registry/test_builtin_pairings.py` +- Modify: `tests/unit/registry/test_react_factories.py` + +- [ ] **Step 1: Verify explicit pairing table** + +`tests/unit/registry/test_builtin_pairings.py` must contain registered pairings: + +```python +PAIRINGS = [ + ("minif2f", "minif2f-react", "minif2f-rubric", "minif2f", ("none",)), + ("swebench-verified", "swebench-react", "swebench-rubric", "swebench-verified", ("none",)), + ("gdpeval", "gdpeval-react", "gdpeval-staged-rubric", "gdpeval", ("ergon-builtins[data]",)), + ("researchrubrics", "researchrubrics-researcher", "researchrubrics-rubric", "researchrubrics", ("ergon-builtins[data]",)), + ("researchrubrics-vanilla", "researchrubrics-researcher", "researchrubrics-rubric", "researchrubrics-vanilla", ("ergon-builtins[data]",)), +] +``` + +Use `("none",)` for e2e smoke replacement submissions, but keep production pairing documentation accurate for production data benchmarks. + +- [ ] **Step 2: Register final evaluator slugs** + +`registry_core.py` should expose both during migration: + +```python +EVALUATORS = { + "staged-rubric": StagedRubric, + "gdpeval-staged-rubric": StagedRubric, + ... +} +``` + +`registry_data.py` should expose: + +```python +EVALUATORS = { + "research-rubric": ResearchRubricsRubric, + "researchrubrics-rubric": ResearchRubricsRubric, +} +``` + +- [ ] **Step 3: Keep benchmark-owned worker factory surfaces** + +Required files: + +```text +ergon_builtins/ergon_builtins/benchmarks/minif2f/worker_factory.py +ergon_builtins/ergon_builtins/benchmarks/swebench_verified/worker_factory.py +ergon_builtins/ergon_builtins/benchmarks/gdpeval/worker_factory.py +ergon_builtins/ergon_builtins/benchmarks/researchrubrics/worker_factory.py +``` + +`researchrubrics/worker_factory.py` may re-export existing worker classes until a later physical move. + +- [ ] **Step 4: Run registry tests** + +Run: + +```bash +uv run pytest tests/unit/registry/test_builtin_pairings.py tests/unit/registry/test_react_factories.py -q +``` + +Expected: pass. + +## Task 5: Finish CLI Contract And Wrapper Behavior + +**Files:** +- Modify: `ergon_cli/ergon_cli/main.py` +- Modify: `ergon_cli/ergon_cli/commands/experiment.py` +- Modify: `ergon_cli/ergon_cli/commands/benchmark.py` +- Modify: `tests/unit/cli/test_experiment_cli.py` +- Modify: `tests/unit/cli/test_benchmark_setup.py` + +- [ ] **Step 1: Keep explicit define args required** + +Parser requirements: + +```text +ergon experiment define + --worker + --model + --evaluator + --sandbox + --extras +``` + +Test with: + +```bash +uv run pytest tests/unit/cli/test_experiment_cli.py::test_experiment_define_requires_explicit_runtime_choices -q +``` + +- [ ] **Step 2: Keep `benchmark run` as define-plus-run wrapper** + +`benchmark run` should parse the same explicit fields: + +```text +ergon benchmark run + --limit 1 + --worker + --model + --evaluator + --sandbox + --extras +``` + +If `ExperimentLaunchService.wait/timeout_seconds` is not implemented, do not expose `--timeout` or `--no-wait` on `benchmark run`. The wrapper should submit and print run IDs, not pretend to block. 
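+
+A minimal argparse sketch of the explicit-flag contract from Steps 1-2; the subparser wiring and names beyond the flags are assumptions about `ergon_cli/ergon_cli/main.py`:
+
+```python
+import argparse
+
+parser = argparse.ArgumentParser(prog="ergon")
+subparsers = parser.add_subparsers(dest="command", required=True)
+
+experiment = subparsers.add_parser("experiment")
+experiment_sub = experiment.add_subparsers(dest="subcommand", required=True)
+
+# Every runtime choice is explicit; nothing defaults from benchmark profiles.
+define = experiment_sub.add_parser("define")
+for flag in ("--worker", "--model", "--evaluator", "--sandbox", "--extras"):
+    define.add_argument(flag, required=True)
+```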
+ +- [ ] **Step 3: Keep `benchmark setup` success hint explicit** + +Expected hint shape: + +```text +ergon benchmark run --limit 1 --worker --model --evaluator --sandbox --extras none +``` + +Regression test: + +```python +def test_setup_success_hint_uses_explicit_runtime_choices(...): + rc = setup_benchmark(_make_args()) + out = capsys.readouterr().out + assert "--worker" in out + assert "--evaluator" in out + assert "--sandbox" in out + assert "--extras" in out +``` + +- [ ] **Step 4: Run CLI tests** + +Run: + +```bash +uv run pytest tests/unit/cli/test_experiment_cli.py tests/unit/cli/test_benchmark_setup.py -q +``` + +Expected: pass. + +## Task 6: Align `/api/test/write/cohort` With Explicit Test Harness Contract + +**Files:** +- Modify: `ergon_core/ergon_core/core/api/test_harness.py` or the current stable test harness module if moved +- Modify: `tests/integration/smokes/test_smoke_harness.py` +- Modify: `tests/e2e/_submit.py` + +- [ ] **Step 1: Ensure request DTO accepts explicit sandbox/extras** + +The stable test harness write request should accept: + +```python +class SubmitCohortRequest(BaseModel): + benchmark_slug: str + slots: list[CohortSlotRequest] + cohort_key: str + sandbox_slug: str | None = None + dependency_extras: tuple[str, ...] = ("none",) + model: str = "openai:gpt-4o" + limit: int = 1 +``` + +- [ ] **Step 2: Ensure the harness uses the same define/run service path** + +The handler should pass: + +```python +ExperimentDefineRequest( + benchmark_slug=body.benchmark_slug, + cohort_id=cohort.id, + limit=body.limit, + default_model_target=body.model, + default_worker_team={"primary": slot.worker_slug}, + default_evaluator_slug=slot.evaluator_slug, + sandbox_slug=body.sandbox_slug or body.benchmark_slug, + dependency_extras=body.dependency_extras, + metadata={"source": "test-harness"}, +) +``` + +If the core facade DTO names differ after the core refactor, adapt to the stable facade shape rather than private repositories. + +- [ ] **Step 3: Add integration assertion** + +In `tests/integration/smokes/test_smoke_harness.py`, assert the write endpoint accepts a payload with `sandbox_slug` and `dependency_extras` and returns run IDs. + +- [ ] **Step 4: Run smoke harness integration test** + +Run: + +```bash +uv run pytest tests/integration/smokes/test_smoke_harness.py -q +``` + +Expected: pass if stack dependencies for integration are available; otherwise skip should be environment-gated. 
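+
+For reference, the slot shape `SubmitCohortRequest.slots` implies, matching the e2e payload builder from Task 2; a sketch rather than the canonical DTO:
+
+```python
+from pydantic import BaseModel
+
+
+class CohortSlotRequest(BaseModel):
+    worker_slug: str
+    evaluator_slug: str
+```
+
+Keeping this shape identical to the `{"worker_slug": ..., "evaluator_slug": ...}` dicts emitted by `build_cohort_payload()` means the harness contract and the e2e client cannot drift independently.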
+ +## Task 7: Preserve E2E Runtime Assertions While Updating Access Paths + +**Files:** +- Modify: `tests/e2e/_asserts.py` +- Modify: `tests/e2e/test_researchrubrics_smoke.py` +- Modify: `tests/e2e/test_minif2f_smoke.py` +- Modify: `tests/e2e/test_swebench_smoke.py` +- Modify: `ergon-dashboard/tests/e2e/*.smoke.spec.ts` + +- [ ] **Step 1: Keep the behavioral assertions hard** + +Do not weaken these assertions: + +```python +assert snapshot.total_tasks == 12 +assert snapshot.total_leaf_tasks == 10 +assert len(probes) == 10 +assert len(resources) == 20 +assert event_count == 26 +assert len(evaluations) == 2 +assert scores == [1.0, 1.0] +assert len(msgs) == 11 +``` + +Sad path: + +```python +assert by_slug["l_2"].status == FAILED +assert by_slug["l_3"].status == BLOCKED +assert by_slug["l_3"].started_at is None +assert len(msgs) == 7 +``` + +- [ ] **Step 2: Update imports only** + +Replace any private core imports with: + +```python +from tests.e2e._read_contracts import require_run_snapshot +from ergon_core.test_support.smoke_fixtures.smoke_base.constants import EXPECTED_SUBTASK_SLUGS +``` + +And, where direct DB access is still needed: + +```python +from ergon_core.test_support.e2e_read_helpers import ... +``` + +- [ ] **Step 3: Keep dashboard assertions aligned** + +Playwright specs should assert visible behavior: + +```text +- run status is completed/failed as appropriate +- all expected task nodes appear +- failed l_2 and blocked l_3 are visible on sad path +- resource/evaluation panels render when expected +``` + +Do not assert private API response shapes unless the dashboard API marks them public/stable. + +## Task 8: Run The Non-E2E Verification Gate + +**Files:** +- No code changes unless tests fail. + +- [ ] **Step 1: Run focused unit/integration tests** + +Run: + +```bash +uv run pytest \ + tests/unit/registry/test_react_factories.py \ + tests/unit/registry/test_builtin_pairings.py \ + tests/unit/cli/test_experiment_cli.py \ + tests/unit/cli/test_benchmark_setup.py \ + tests/unit/smoke_base/test_e2e_smoke_driver_pairs.py \ + tests/unit/architecture/test_public_api_boundaries.py \ + tests/integration/smokes/test_smoke_harness.py \ + -q +``` + +Expected: pass or environment-gated integration skip. Any import failure from `tests/e2e` is a blocker. + +- [ ] **Step 2: Run e2e collection without executing live stack** + +Run: + +```bash +uv run pytest tests/e2e --collect-only -q +``` + +Expected: collection succeeds. This catches stale import paths without needing the stack. + +- [ ] **Step 3: Run lint diagnostics on touched test/docs paths** + +Use IDE lints for: + +```text +tests/e2e/ +tests/unit/registry/ +tests/unit/cli/ +tests/unit/smoke_base/ +docs/superpowers/plans/ +``` + +Expected: no new code-specific diagnostics. Environment import-resolution warnings are non-blocking only if pytest confirms imports. + +## Task 9: Full E2E Execution Gate + +**Files:** +- No code changes unless runtime evidence fails. 
+ +- [ ] **Step 1: Verify stack env** + +Required environment: + +```text +ENABLE_TEST_HARNESS=1 +ENABLE_SMOKE_FIXTURES=1 +ERGON_STARTUP_PLUGINS=ergon_core.test_support.smoke_fixtures:register_smoke_fixtures +ERGON_API_BASE_URL=http://127.0.0.1:9000 +TEST_HARNESS_SECRET= +E2B_API_KEY= +``` + +- [ ] **Step 2: Run one smoke leg first** + +Run: + +```bash +uv run pytest tests/e2e/test_minif2f_smoke.py -q -s +``` + +Expected: + +- one happy run reaches `completed` +- one sad run reaches `failed` +- all hard assertions pass +- Playwright spec completes or captures failure screenshots + +- [ ] **Step 3: Run all smoke legs** + +Run: + +```bash +uv run pytest tests/e2e -q -s +``` + +Expected: + +- ResearchRubrics, MiniF2F, and SWE-Bench each submit happy/sad cohorts +- happy runs pass graph/resource/turn/evaluation/dashboard assertions +- sad runs pass blocked/failure/partial-artifact assertions + +## Task 10: Review And Handoff To Real-LLM Canaries + +**Files:** +- Modify only if review finds issues. + +- [ ] **Step 1: Request code review** + +Send reviewer scope: + +```text +Review built-ins, CLI, and e2e refactor completion. +Check that: +- no benchmark profiles/default pairings remain +- CLI requires explicit worker/model/evaluator/sandbox/extras +- e2e uses HTTP/test-support/read-model boundaries +- runtime behavior assertions remain hard +- no private core repository imports remain in e2e tests +``` + +- [ ] **Step 2: Fix Critical and Important review findings** + +Follow review feedback with tests for each fix. + +- [ ] **Step 3: Decide real-LLM canary timing** + +Only after e2e smoke is green, run or schedule: + +```bash +ERGON_REAL_LLM=1 uv run pytest tests/real_llm -q -s +``` + +If real-LLM tests still use stale CLI paths, update them to the same explicit runtime choice contract before running. + +## Completion Criteria + +- `tests/e2e --collect-only` succeeds without private core import failures. +- `tests/unit/architecture/test_public_api_boundaries.py` confirms e2e tests do not import private core repository/runtime internals. +- `tests/unit/registry/test_builtin_pairings.py` covers all documented production benchmark pairings. +- CLI parser tests prove explicit arguments are required. +- `/api/test/write/cohort` accepts explicit sandbox/extras and uses the same define/run facade path. +- Full e2e smoke suite preserves existing behavior assertions: + - 12 tasks, 10 leaves, 20 resources, 26 turns, 2 root evaluations on happy path + - `l_2` failed, `l_3` blocked, 7 completion messages on sad path +- Code review has no unresolved Critical or Important findings. + diff --git a/docs/superpowers/plans/2026-04-29-persistent-component-catalog-and-test-layout.md b/docs/superpowers/plans/2026-04-29-persistent-component-catalog-and-test-layout.md new file mode 100644 index 00000000..a72f7a5c --- /dev/null +++ b/docs/superpowers/plans/2026-04-29-persistent-component-catalog-and-test-layout.md @@ -0,0 +1,1784 @@ +# Persistent Component Catalog And Test Layout Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make component registration understandable across processes by splitting tests by package ownership, persisting component slug-to-import references in Postgres, and deleting test/fixture env-var switches. 
+ +**Architecture:** First reorganize tests so package boundaries are visible and cross-process E2E stays black-box. Then add a trusted `component_catalog` table in `ergon_core` that stores component kind, slug, module, qualname, and metadata. Finally, update the Pydantic registry to publish/load catalog rows, make runtime jobs resolve components through the catalog-backed registry, and remove `ENABLE_TEST_HARNESS`, `TEST_HARNESS_SECRET`, `ERGON_STARTUP_PLUGINS`, `ENABLE_SMOKE_FIXTURES`, and `ERGON_SKIP_INFRA_CHECK`. + +**Tech Stack:** Python 3.13, SQLModel, Alembic, Pydantic v2, pytest, FastAPI, argparse CLI, existing uv/pnpm scripts. + +--- + +## Service Design Constraint + +Use one catalog boundary: `ComponentCatalogService`. Do not implement both a service and repository for the catalog. The service owns the contract for publishing refs, requiring refs, and loading import refs; keep the API small so it does not become a second registry. + +## Mental Model + +The final system should be explainable as: + +1. Packages define components in Python code. +2. Packages publish component references into Postgres as trusted catalog rows. +3. Experiment definitions store stable slugs. +4. API/Inngest/CLI resolve slugs through the shared catalog, import the Python reference, and instantiate the component. +5. Tests are package-owned; only black-box E2E crosses process boundaries. + +The Pydantic registry remains useful as an authoring and publishing helper, but runtime resolution should read from Postgres every time. These lookups are not hot enough to justify an in-memory process-local cache, and always reading the catalog keeps cross-process behavior easier to reason about. + +## ID Model + +Use one worker-facing task identity: + +```python +Task.task_id == RunGraphNode.id +``` + +`RunGraphNode.id` is the runtime task id. It exists for every executable task in a run, including dynamically spawned subtasks. This is the only task id worker authors should see. + +Use explicit names for internal/template identity: + +```python +definition_id # ExperimentDefinition.id, the static experiment template +node_id # RunGraphNode.id, the runtime task identity +execution_id # RunTaskExecution.id, one attempt to execute a node +``` + +Do not pass `definition_task_id` through public `Task` or runtime event/job payloads. Keep it only as an optional persisted relationship on rows such as `RunGraphNode` / `RunTaskExecution` when the application layer needs static-template joins. If runtime needs definition data, resolve it from `node_id` through the persisted graph/run links (`RunGraphNode.run_id` -> `RunRecord.workflow_definition_id` -> `ExperimentDefinition`) or use the already available run/definition context in the application layer. + +## File Structure + +- Create package-owned test roots: + - `ergon_core/tests/` + - `ergon_builtins/tests/` + - `ergon_cli/tests/` + - optionally `ergon_infra/tests/` +- Keep cross-package black-box tests at: + - `tests/e2e/` + - `tests/real_llm/` + - `tests/fixtures/` only for fixtures intentionally shared by black-box tests. 
+
+- Create component catalog files:
+  - `ergon_core/ergon_core/core/persistence/components/models.py`
+  - `ergon_core/ergon_core/core/application/components/catalog.py`
+  - `ergon_core/migrations/versions/_add_component_catalog.py`
+- Modify registry/bootstrap files:
+  - `ergon_core/ergon_core/api/benchmark/task.py`
+  - `ergon_core/ergon_core/api/worker/context.py`
+  - `ergon_core/ergon_core/api/worker/worker.py`
+  - `ergon_core/ergon_core/api/worker/__init__.py`
+  - `ergon_core/ergon_core/api/registry.py`
+  - `ergon_builtins/ergon_builtins/registry.py`
+  - `ergon_builtins/ergon_builtins/registry_core.py`
+  - `ergon_builtins/ergon_builtins/registry_data.py`
+  - `tests/fixtures/smoke_components/__init__.py`
+- Modify runtime resolution files:
+  - `ergon_core/ergon_core/core/application/events/task_events.py`
+  - `ergon_core/ergon_core/core/application/jobs/models.py`
+  - `ergon_core/ergon_core/core/application/jobs/worker_execute.py`
+  - `ergon_core/ergon_core/core/application/jobs/execute_task.py`
+  - `ergon_core/ergon_core/core/application/workflows/orchestration.py`
+  - `ergon_core/ergon_core/core/application/jobs/evaluate_task_run.py`
+  - `ergon_core/ergon_core/core/application/jobs/sandbox_setup.py`
+  - `ergon_core/ergon_core/core/application/jobs/persist_outputs.py`
+  - `ergon_core/ergon_core/core/application/experiments/service.py`
+  - `ergon_core/ergon_core/core/application/experiments/launch.py`
+  - `ergon_core/ergon_core/core/application/workflows/service.py`
+  - `ergon_core/ergon_core/core/application/tasks/management.py`
+  - `ergon_core/ergon_core/core/domain/experiments/worker_spec.py`
+- Modify harness/env-var files:
+  - `ergon_core/ergon_core/core/shared/settings.py`
+  - `ergon_core/ergon_core/core/rest_api/app.py`
+  - `ergon_core/ergon_core/core/rest_api/test_harness.py`
+  - `docker-compose.yml`
+  - `.github/workflows/e2e-benchmarks.yml`
+  - `.github/workflows/ci-fast.yml`
+  - `package.json`
+  - `scripts/smoke_local_up.sh`
+  - `scripts/smoke_local_run.sh`
+  - `tests/e2e/conftest.py`
+  - `tests/integration/conftest.py`
+  - dashboard test harness clients/routes that reference `TEST_HARNESS_SECRET`.
+
+---
+
+### Task 1: Create Package-Owned Test Layout Guardrails
+
+**Files:**
+- Create: `tests/unit/architecture/test_package_test_layout.py`
+- Modify later: `package.json`
+
+- [ ] **Step 1: Write architecture test for target test layout**
+
+Create `tests/unit/architecture/test_package_test_layout.py`. Root `tests/unit` may legitimately keep repo-wide architecture guardrails, including this test (see Task 2 Step 2 and Task 4 Step 3), so allow it at the top level but pin its contents:
+
+```python
+from pathlib import Path
+
+
+def test_package_owned_test_roots_exist() -> None:
+    assert Path("ergon_core/tests").is_dir()
+    assert Path("ergon_builtins/tests").is_dir()
+    assert Path("ergon_cli/tests").is_dir()
+
+
+def test_root_tests_are_black_box_or_shared_only() -> None:
+    allowed = {
+        "__init__.py",
+        "__pycache__",
+        "conftest.py",
+        "e2e",
+        "fixtures",
+        "integration",
+        "real_llm",
+        "unit",
+    }
+    root_entries = {path.name for path in Path("tests").iterdir()}
+    assert root_entries <= allowed
+
+    # Root tests/unit may keep only repo-wide architecture guardrails.
+    unit_root = Path("tests/unit")
+    if unit_root.is_dir():
+        unit_entries = {path.name for path in unit_root.iterdir()}
+        assert unit_entries <= {"architecture", "__init__.py", "__pycache__"}
+```
+
+- [ ] **Step 2: Run the architecture test and verify it fails**
+
+Run:
+
+```bash
+uv run pytest tests/unit/architecture/test_package_test_layout.py -q
+```
+
+Expected: FAIL because package-owned test roots do not exist and `tests/unit` still contains package-owned tests.
+ +- [ ] **Step 3: Create package-owned test directories** + +Create: + +```text +ergon_core/tests/unit/ +ergon_core/tests/integration/ +ergon_builtins/tests/unit/ +ergon_builtins/tests/integration/ +ergon_cli/tests/unit/ +ergon_cli/tests/integration/ +``` + +Add empty `__init__.py` files only if import/package semantics require them. Prefer no `__init__.py` for pytest discovery unless an existing pattern depends on package imports. + +- [ ] **Step 4: Update `package.json` scripts to include both old and new roots** + +Modify backend test scripts temporarily so moved tests can be discovered while migration is incremental: + +```json +"test:be:unit": "uv run pytest ergon_core/tests/unit ergon_builtins/tests/unit ergon_cli/tests/unit tests/unit -q -n auto --durations=20", +"test:be:coverage": "uv run pytest ergon_core/tests/unit ergon_builtins/tests/unit ergon_cli/tests/unit tests/unit tests/integration --cov=ergon_core --cov=ergon_builtins --cov-report=term-missing --cov-report=xml:coverage.xml" +``` + +- [ ] **Step 5: Run package layout test** + +Run: + +```bash +uv run pytest tests/unit/architecture/test_package_test_layout.py -q +``` + +Expected: still FAIL until tests are moved in Tasks 2-4. + +--- + +### Task 2: Move Core-Owned Unit Tests To `ergon_core/tests` + +**Files:** +- Move tests from `tests/unit/api`, `tests/unit/runtime`, `tests/unit/sandbox`, selected `tests/unit/architecture`, selected `tests/unit/state`, and core app tests into `ergon_core/tests/unit`. +- Modify imports only where they reference moved fixture paths. + +- [ ] **Step 1: Move clearly core-owned directories** + +Move: + +```text +tests/unit/api/ -> ergon_core/tests/unit/api/ +tests/unit/runtime/ -> ergon_core/tests/unit/runtime/ +tests/unit/sandbox/ -> ergon_core/tests/unit/sandbox/ +tests/unit/persistence/ -> ergon_core/tests/unit/persistence/ +tests/unit/dashboard/ -> ergon_core/tests/unit/dashboard/ +``` + +Move standalone core app tests: + +```text +tests/unit/test_app_mounts_harness_conditionally.py -> ergon_core/tests/unit/test_app_mounts_harness_conditionally.py +tests/unit/test_dashboard_emitter_wiring.py -> ergon_core/tests/unit/test_dashboard_emitter_wiring.py +tests/unit/test_rollouts_di.py -> ergon_core/tests/unit/test_rollouts_di.py +tests/unit/test_test_harness.py -> ergon_core/tests/unit/test_test_harness.py +tests/unit/test_swebench_criterion_no_sandbox.py -> ergon_core/tests/unit/test_swebench_criterion_no_sandbox.py +``` + +- [ ] **Step 2: Move registry/core architecture tests** + +Move: + +```text +tests/unit/registry/ -> ergon_core/tests/unit/registry/ +tests/unit/architecture/test_api_runs_boundary.py -> ergon_core/tests/unit/architecture/test_api_runs_boundary.py +tests/unit/architecture/test_core_schema_sources.py -> ergon_core/tests/unit/architecture/test_core_schema_sources.py +tests/unit/architecture/test_model_field_descriptions.py -> ergon_core/tests/unit/architecture/test_model_field_descriptions.py +tests/unit/architecture/test_no_test_logic_in_core.py -> ergon_core/tests/unit/architecture/test_no_test_logic_in_core.py +tests/unit/architecture/test_persistence_boundaries.py -> ergon_core/tests/unit/architecture/test_persistence_boundaries.py +tests/unit/architecture/test_public_api_boundaries.py -> ergon_core/tests/unit/architecture/test_public_api_boundaries.py +tests/unit/architecture/test_public_api_target_structure.py -> ergon_core/tests/unit/architecture/test_public_api_target_structure.py +tests/unit/architecture/test_smoke_fixture_package_boundary.py -> 
ergon_core/tests/unit/architecture/test_smoke_fixture_package_boundary.py +``` + +Leave `tests/unit/architecture/test_package_test_layout.py` at root until the migration is complete because it governs the whole repo. + +- [ ] **Step 3: Run moved core tests** + +Run: + +```bash +uv run pytest ergon_core/tests/unit -q +``` + +Expected: PASS or failures that reveal imports still pointing at old `tests/unit/...` paths. + +- [ ] **Step 4: Fix import paths revealed by failures** + +For each failure, update imports to either: + +```python +from tests.fixtures... +``` + +for intentionally shared black-box fixtures, or local package test helpers under: + +```python +from ergon_core.tests... +``` + +Do not import `ergon_builtins` in core unit tests unless the test is explicitly an integration/boundary test that names that dependency. + +- [ ] **Step 5: Run old and new unit suites** + +Run: + +```bash +uv run pytest ergon_core/tests/unit tests/unit -q +``` + +Expected: PASS, with fewer tests left under `tests/unit`. + +--- + +### Task 3: Move Builtins-Owned Tests To `ergon_builtins/tests` + +**Files:** +- Move benchmark, worker, builtins state, smoke component tests that assert builtins behavior. + +- [ ] **Step 1: Move builtins benchmark/worker tests** + +Move: + +```text +tests/unit/benchmarks/ -> ergon_builtins/tests/unit/benchmarks/ +tests/unit/builtins/ -> ergon_builtins/tests/unit/builtins/ +tests/unit/workers/ -> ergon_builtins/tests/unit/workers/ +tests/unit/state/test_benchmark_contract.py -> ergon_builtins/tests/unit/state/test_benchmark_contract.py +tests/unit/state/test_gdpeval_benchmark.py -> ergon_builtins/tests/unit/state/test_gdpeval_benchmark.py +tests/unit/state/test_research_rubrics_benchmark.py -> ergon_builtins/tests/unit/state/test_research_rubrics_benchmark.py +tests/unit/state/test_research_rubrics_workers.py -> ergon_builtins/tests/unit/state/test_research_rubrics_workers.py +tests/unit/state/test_llm_judge_runtime_injection.py -> ergon_builtins/tests/unit/state/test_llm_judge_runtime_injection.py +tests/unit/state/test_criteria_do_not_spawn_sandboxes.py -> ergon_builtins/tests/unit/state/test_criteria_do_not_spawn_sandboxes.py +``` + +- [ ] **Step 2: Move smoke component unit tests** + +Move: + +```text +tests/unit/smoke_base/ -> ergon_builtins/tests/unit/smoke_base/ +``` + +Rationale: the fixture source remains at `tests/fixtures/smoke_components` because E2E consumes it as shared black-box fixture code, but unit tests for that fixture behavior should not live in root `tests/unit`. + +- [ ] **Step 3: Run builtins tests** + +Run: + +```bash +uv run pytest ergon_builtins/tests/unit -q +``` + +Expected: PASS or import failures from moved helper paths. + +- [ ] **Step 4: Fix moved builtins imports** + +Update any relative references from old root locations. Keep production imports from `ergon_builtins.*` unchanged. + +- [ ] **Step 5: Run package test subset** + +Run: + +```bash +uv run pytest ergon_builtins/tests/unit ergon_core/tests/unit tests/unit -q +``` + +Expected: PASS. + +--- + +### Task 4: Move CLI-Owned Tests To `ergon_cli/tests` + +**Files:** +- Move CLI unit tests and CLI-specific state tests. 
+
+- [ ] **Step 1: Move CLI tests**
+
+Move:
+
+```text
+tests/unit/cli/ -> ergon_cli/tests/unit/cli/
+tests/unit/state/test_onboard_profile.py -> ergon_cli/tests/unit/state/test_onboard_profile.py
+tests/unit/state/test_env_writer.py -> ergon_cli/tests/unit/state/test_env_writer.py
+tests/unit/state/test_openrouter_model_resolution.py -> ergon_cli/tests/unit/state/test_openrouter_model_resolution.py
+tests/unit/state/test_subtask_lifecycle_toolkit.py -> ergon_cli/tests/unit/state/test_subtask_lifecycle_toolkit.py
+tests/unit/state/test_workflow_cli_tool.py -> ergon_cli/tests/unit/state/test_workflow_cli_tool.py
+```
+
+- [ ] **Step 2: Run CLI tests**
+
+Run:
+
+```bash
+uv run pytest ergon_cli/tests/unit -q
+```
+
+Expected: PASS or import failures that identify old paths.
+
+- [ ] **Step 3: Update `package.json` to remove old unit root once empty**
+
+After Tasks 2-4, if `tests/unit` contains only architecture migration tests or is empty, update scripts:
+
+```json
+"test:be:unit": "uv run pytest ergon_core/tests/unit ergon_builtins/tests/unit ergon_cli/tests/unit -q -n auto --durations=20"
+```
+
+If a small root `tests/unit` remains for repo-wide architecture tests, include it explicitly:
+
+```json
+"test:be:unit": "uv run pytest ergon_core/tests/unit ergon_builtins/tests/unit ergon_cli/tests/unit tests/unit -q -n auto --durations=20"
+```
+
+- [ ] **Step 4: Run package layout guardrail**
+
+Run:
+
+```bash
+uv run pytest tests/unit/architecture/test_package_test_layout.py -q
+```
+
+Expected: PASS.
+
+---
+
+### Task 5: Add Component Catalog Persistence Model And Migration
+
+**Files:**
+- Create: `ergon_core/ergon_core/core/persistence/components/models.py`
+- Modify: `ergon_core/migrations/env.py`
+- Create: `ergon_core/migrations/versions/_add_component_catalog.py`
+- Test: `ergon_core/tests/unit/registry/test_component_catalog_model.py`
+
+- [ ] **Step 1: Write catalog model tests**
+
+Create `ergon_core/tests/unit/registry/test_component_catalog_model.py`. Note that SQLModel `table=True` models skip Pydantic validation in `__init__`, so the invalid-kind test must go through `model_validate` to exercise the validator:
+
+```python
+import pytest
+
+from ergon_core.core.persistence.components.models import ComponentCatalogEntry
+
+
+def test_component_catalog_entry_round_trips_metadata() -> None:
+    entry = ComponentCatalogEntry(
+        kind="worker",
+        slug="training-stub",
+        module="ergon_builtins.shared.workers.training_stub_worker",
+        qualname="TrainingStubWorker",
+        package="ergon-builtins",
+        metadata_json={"description": "offline worker"},
+    )
+
+    assert entry.parsed_metadata() == {"description": "offline worker"}
+
+
+def test_component_catalog_entry_rejects_invalid_kind() -> None:
+    # pydantic.ValidationError subclasses ValueError, so this match works.
+    with pytest.raises(ValueError, match="kind must be one of"):
+        ComponentCatalogEntry.model_validate(
+            {
+                "kind": "not-a-kind",
+                "slug": "bad",
+                "module": "pkg.mod",
+                "qualname": "Thing",
+            }
+        )
+```
+
+- [ ] **Step 2: Run catalog model tests and verify they fail**
+
+Run:
+
+```bash
+uv run pytest ergon_core/tests/unit/registry/test_component_catalog_model.py -q
+```
+
+Expected: FAIL because the model module does not exist.
+ +- [ ] **Step 3: Implement SQLModel catalog entry** + +Create `ergon_core/ergon_core/core/persistence/components/models.py`: + +```python +"""Persistent component catalog shared across CLI/API/Inngest processes.""" + +from datetime import datetime +from uuid import UUID, uuid4 + +from ergon_core.core.shared.json_types import JsonObject +from ergon_core.core.shared.utils import utcnow as _utcnow +from pydantic import model_validator +from sqlalchemy import JSON, Column, DateTime, UniqueConstraint +from sqlmodel import Field, SQLModel + +TZDateTime = DateTime(timezone=True) +COMPONENT_KINDS = {"worker", "benchmark", "evaluator", "sandbox_manager"} + + +class ComponentCatalogEntry(SQLModel, table=True): + __tablename__ = "component_catalog" + __table_args__ = (UniqueConstraint("kind", "slug", name="uq_component_catalog_kind_slug"),) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + kind: str = Field(index=True) + slug: str = Field(index=True) + module: str + qualname: str + package: str | None = Field(default=None, index=True) + version: str | None = None + metadata_json: dict = Field(default_factory=dict, sa_column=Column(JSON)) + created_at: datetime = Field(default_factory=_utcnow, sa_type=TZDateTime) + updated_at: datetime = Field(default_factory=_utcnow, sa_type=TZDateTime) + + def parsed_metadata(self) -> JsonObject: + return self.__class__._parse_metadata(self.metadata_json) + + @classmethod + def _parse_metadata(cls, data: dict) -> JsonObject: + if not isinstance(data, dict): + raise ValueError(f"metadata_json must be a dict, got {type(data).__name__}") + return data + + @model_validator(mode="after") + def _validate_entry(self) -> "ComponentCatalogEntry": + if self.kind not in COMPONENT_KINDS: + allowed = ", ".join(sorted(COMPONENT_KINDS)) + raise ValueError(f"kind must be one of: {allowed}") + if not self.slug: + raise ValueError("slug must be non-empty") + if not self.module: + raise ValueError("module must be non-empty") + if not self.qualname: + raise ValueError("qualname must be non-empty") + self.__class__._parse_metadata(self.metadata_json) + return self +``` + +- [ ] **Step 4: Import component models in Alembic env** + +Modify `ergon_core/migrations/env.py`: + +```python +import ergon_core.core.persistence.components.models +``` + +Add it beside the other persistence model imports. 
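+
+The import exists only for its side effect of registering the table on `SQLModel.metadata`, so mark it as such; a sketch, with the neighboring import illustrative:
+
+```python
+# migrations/env.py
+import ergon_core.core.persistence.components.models  # noqa: F401  # registers component_catalog
+import ergon_core.core.persistence.telemetry.models  # noqa: F401
+```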
+ +- [ ] **Step 5: Add Alembic migration** + +Create a migration file under `ergon_core/migrations/versions/` with a new revision id: + +```python +"""add component catalog + +Revision ID: d1e2f3a4b5c6 +Revises: c2d3e4f5a6b7 +Create Date: 2026-04-29 +""" + +from collections.abc import Sequence + +import sqlalchemy as sa +from alembic import op + +revision: str = "d1e2f3a4b5c6" +down_revision: str | None = "c2d3e4f5a6b7" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + op.create_table( + "component_catalog", + sa.Column("id", sa.Uuid(), nullable=False), + sa.Column("kind", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("module", sa.String(), nullable=False), + sa.Column("qualname", sa.String(), nullable=False), + sa.Column("package", sa.String(), nullable=True), + sa.Column("version", sa.String(), nullable=True), + sa.Column("metadata_json", sa.JSON(), nullable=True), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("kind", "slug", name="uq_component_catalog_kind_slug"), + ) + op.create_index("ix_component_catalog_kind", "component_catalog", ["kind"], unique=False) + op.create_index("ix_component_catalog_slug", "component_catalog", ["slug"], unique=False) + op.create_index("ix_component_catalog_package", "component_catalog", ["package"], unique=False) + + +def downgrade() -> None: + op.drop_index("ix_component_catalog_package", table_name="component_catalog") + op.drop_index("ix_component_catalog_slug", table_name="component_catalog") + op.drop_index("ix_component_catalog_kind", table_name="component_catalog") + op.drop_table("component_catalog") +``` + +Before choosing `down_revision`, inspect the current migration head with: + +```bash +uv run alembic -c ergon_core/alembic.ini heads +``` + +Use the actual head instead of the placeholder if different. + +- [ ] **Step 6: Run catalog model tests** + +Run: + +```bash +uv run pytest ergon_core/tests/unit/registry/test_component_catalog_model.py -q +``` + +Expected: PASS. 
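+
+Optionally, exercise the downgrade path from Step 5 as well before moving on (assumes a disposable local/dev database):
+
+```bash
+uv run alembic -c ergon_core/alembic.ini upgrade head
+uv run alembic -c ergon_core/alembic.ini downgrade -1
+uv run alembic -c ergon_core/alembic.ini upgrade head
+```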
+ +--- + +### Task 6: Add Component Catalog Service And Import Reference Loader + +**Files:** +- Create: `ergon_core/ergon_core/core/application/components/__init__.py` +- Create: `ergon_core/ergon_core/core/application/components/catalog.py` +- Test: `ergon_core/tests/unit/registry/test_component_catalog_service.py` + +- [ ] **Step 1: Write catalog service tests** + +Create `ergon_core/tests/unit/registry/test_component_catalog_service.py`: + +```python +import pytest +from sqlalchemy.pool import StaticPool +from sqlmodel import Session, SQLModel, create_engine + +from ergon_core.core.application.components.catalog import ( + ComponentCatalogService, + ComponentRef, + import_component_ref, +) +from ergon_core.core.persistence.components.models import ComponentCatalogEntry + + +def _session() -> Session: + engine = create_engine( + "sqlite://", + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + SQLModel.metadata.create_all(engine) + return Session(engine) + + +def test_upsert_and_require_component_ref() -> None: + session = _session() + service = ComponentCatalogService() + + service.upsert( + session, + ComponentRef( + kind="worker", + slug="training-stub", + module="ergon_builtins.shared.workers.training_stub_worker", + qualname="TrainingStubWorker", + package="ergon-builtins", + metadata={"install_hint": "none"}, + ), + ) + session.commit() + + ref = service.require(session, kind="worker", slug="training-stub") + assert ref.module == "ergon_builtins.shared.workers.training_stub_worker" + assert ref.qualname == "TrainingStubWorker" + assert ref.metadata == {"install_hint": "none"} + + +def test_upsert_updates_existing_ref() -> None: + session = _session() + service = ComponentCatalogService() + + service.upsert(session, ComponentRef(kind="worker", slug="x", module="old", qualname="Thing")) + service.upsert(session, ComponentRef(kind="worker", slug="x", module="new", qualname="Other")) + session.commit() + + rows = session.query(ComponentCatalogEntry).all() + assert len(rows) == 1 + assert service.require(session, kind="worker", slug="x").module == "new" + + +def test_import_component_ref_imports_module_qualname() -> None: + ref = ComponentRef( + kind="worker", + slug="component-ref", + module="ergon_core.core.application.components.catalog", + qualname="ComponentRef", + ) + + assert import_component_ref(ref) is ComponentRef + + +def test_require_unknown_component_lists_kind_and_slug() -> None: + session = _session() + + with pytest.raises(ValueError, match="Unknown worker component slug 'missing'"): + ComponentCatalogService().require(session, kind="worker", slug="missing") +``` + +- [ ] **Step 2: Run catalog service tests and verify they fail** + +Run: + +```bash +uv run pytest ergon_core/tests/unit/registry/test_component_catalog_service.py -q +``` + +Expected: FAIL because `ComponentCatalogService` does not exist. 
+ +- [ ] **Step 3: Implement component catalog service** + +Create the package marker: + +```python +"""Component catalog application services.""" +``` + +Create `ergon_core/ergon_core/core/application/components/catalog.py`: + +```python +"""Application service for trusted component catalog references.""" + +from importlib import import_module +from typing import Any + +from ergon_core.core.persistence.components.models import ComponentCatalogEntry +from ergon_core.core.shared.json_types import JsonObject +from ergon_core.core.shared.utils import utcnow +from pydantic import BaseModel, ConfigDict, Field +from sqlmodel import Session, select + + +class ComponentRef(BaseModel): + model_config = ConfigDict(frozen=True) + + kind: str + slug: str + module: str + qualname: str + package: str | None = None + version: str | None = None + metadata: JsonObject = Field(default_factory=dict) + + +class ComponentCatalogService: + def upsert(self, session: Session, ref: ComponentRef) -> ComponentCatalogEntry: + existing = session.exec( + select(ComponentCatalogEntry).where( + ComponentCatalogEntry.kind == ref.kind, + ComponentCatalogEntry.slug == ref.slug, + ) + ).one_or_none() + + row = existing or ComponentCatalogEntry( + kind=ref.kind, + slug=ref.slug, + module=ref.module, + qualname=ref.qualname, + ) + row.module = ref.module + row.qualname = ref.qualname + row.package = ref.package + row.version = ref.version + row.metadata_json = dict(ref.metadata) + row.updated_at = utcnow() + session.add(row) + return row + + def require(self, session: Session, *, kind: str, slug: str) -> ComponentRef: + row = session.exec( + select(ComponentCatalogEntry).where( + ComponentCatalogEntry.kind == kind, + ComponentCatalogEntry.slug == slug, + ) + ).one_or_none() + if row is None: + raise ValueError(f"Unknown {kind} component slug {slug!r}") + return _row_to_ref(row) + + def load_ref(self, ref: ComponentRef) -> Any: # slopcop: ignore[no-typing-any] + return import_component_ref(ref) + + +def import_component_ref(ref: ComponentRef) -> Any: # slopcop: ignore[no-typing-any] + target: Any = import_module(ref.module) # slopcop: ignore[no-typing-any] + for part in ref.qualname.split("."): + target = getattr(target, part) + return target + + +def _row_to_ref(row: ComponentCatalogEntry) -> ComponentRef: + return ComponentRef( + kind=row.kind, + slug=row.slug, + module=row.module, + qualname=row.qualname, + package=row.package, + version=row.version, + metadata=row.parsed_metadata(), + ) +``` + +- [ ] **Step 4: Run catalog service tests** + +Run: + +```bash +uv run pytest ergon_core/tests/unit/registry/test_component_catalog_service.py -q +``` + +Expected: PASS. 
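+
+To make the cross-process story concrete: a second process (for example an Inngest job) can resolve a component with no prior builtins import. A sketch, assuming the `get_session` helper path used later in this plan:
+
+```python
+from ergon_core.core.application.components.catalog import ComponentCatalogService
+from ergon_core.core.persistence.shared.db import get_session
+
+service = ComponentCatalogService()
+with get_session() as session:
+    ref = service.require(session, kind="worker", slug="training-stub")
+    worker_cls = service.load_ref(ref)  # the import happens here, on demand
+```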
+ +--- + +### Task 7: Move Execution Identity Out Of Worker Construction + +**Files:** +- Modify: `ergon_core/ergon_core/api/benchmark/task.py` +- Modify: `ergon_core/ergon_core/api/worker/context.py` +- Modify: `ergon_core/ergon_core/api/worker/worker.py` +- Modify: `ergon_core/ergon_core/core/application/events/task_events.py` +- Modify: `ergon_core/ergon_core/core/application/jobs/models.py` +- Modify: `ergon_core/ergon_core/core/application/workflows/orchestration.py` +- Modify: `ergon_core/ergon_core/core/application/jobs/execute_task.py` +- Modify worker subclasses/factories that still require `task_id` or `sandbox_id` +- Test: `ergon_core/tests/unit/api/test_worker_contract.py` + +- [ ] **Step 1: Write worker construction contract tests** + +Create `ergon_core/tests/unit/api/test_worker_contract.py`: + +```python +from collections.abc import AsyncGenerator +from uuid import uuid4 + +from ergon_core.api.benchmark import Task +from ergon_core.api.worker import Worker, WorkerContext, WorkerOutput +from ergon_core.api.worker.worker import WorkerStreamItem + + +class ContractSmokeWorker(Worker): + type_slug = "contract-smoke-worker" + + async def execute( + self, + task: Task, + *, + context: WorkerContext, + ) -> AsyncGenerator[WorkerStreamItem, None]: + yield WorkerOutput(output="ok", success=True) + + +def test_worker_constructor_has_only_authoring_configuration() -> None: + worker = ContractSmokeWorker(name="primary", model="stub:constant") + + assert isinstance(worker, ContractSmokeWorker) + assert worker.name == "primary" + assert worker.model == "stub:constant" + + +def test_task_carries_non_null_runtime_task_identity() -> None: + node_id = uuid4() + + task = Task( + task_id=node_id, + task_slug="root", + instance_key="default", + description="Run root task", + ) + + assert task.task_id == node_id +``` + +- [ ] **Step 2: Run worker contract tests and verify they fail** + +Run: + +```bash +uv run pytest ergon_core/tests/unit/api/test_worker_contract.py -q +``` + +Expected: FAIL because `Task.task_id` does not exist yet and `Worker.__init__` still requires `task_id` and `sandbox_id`. + +- [ ] **Step 3: Add non-null task identity to `Task`** + +Modify `ergon_core/ergon_core/api/benchmark/task.py`: + +```python +from uuid import UUID + +class Task(BaseModel, Generic[PayloadT]): + task_id: UUID + task_slug: str + instance_key: str + description: str +``` + +`Task.task_id` is the worker-facing runtime task identity. It must always be `RunGraphNode.id`, not `ExperimentDefinitionTask.id`. Static definition tasks and dynamic subtasks both have a `RunGraphNode`, so worker authors get one non-null task id for every execution. + +Remove the old nullable event/request `task_id` from runtime payloads. Runtime events/jobs should carry `node_id` as the task identity: + +```python +node_id: UUID # RunGraphNode.id; runtime task identity +``` + +Then remove the nullable worker-facing `task_id` from `WorkerContext`. The worker-facing contract should be: + +```python +task.task_id # non-null RunGraphNode.id +context.sandbox_id # non-null sandbox identity +``` + +If helper tools need a sandbox/task key, pass `task.task_id` to those helpers explicitly when building them. Do not use `WorkerContext.task_id` as a second, nullable source of truth. 
+ +- [ ] **Step 3b: Remove nullable task identity from runtime payloads** + +Remove internal event and job fields that currently use nullable `task_id` for `ExperimentDefinitionTask.id`: + +```python +class TaskReadyEvent(InngestEventContract): + run_id: UUID + definition_id: UUID + node_id: UUID +``` + +Apply the same shape to: + +- `TaskStartedEvent` +- `TaskCompletedEvent` +- `TaskFailedEvent` +- `PrepareTaskExecutionCommand` +- `WorkerExecuteRequest` +- `EvaluateTaskRunRequest` + +Keep `PreparedTaskExecution.node_id` as the canonical runtime task identity. Keep `RunGraphNode.definition_task_id` and `RunTaskExecution.definition_task_id` only as persisted relationships for static-template joins. If a service needs the static definition task row, it should load `RunGraphNode` by `node_id` and follow `RunGraphNode.definition_task_id`; do not carry that id through event payloads or public `Task`. + +- [ ] **Step 4: Simplify `Worker.__init__`** + +Modify `ergon_core/ergon_core/api/worker/worker.py`: + +```python +def __init__( + self, + *, + name: str, + model: str | None, + metadata: Mapping[str, Any] | None = None, # slopcop: ignore[no-typing-any] +) -> None: + self.name = name + self.model = model + self.metadata: dict[str, Any] = dict(metadata or {}) # slopcop: ignore[no-typing-any] +``` + +Do not keep `self.task_id` or `self.sandbox_id` on `Worker`. Workers should use `task.task_id` and `context.sandbox_id` inside `execute(...)`. + +- [ ] **Step 5: Refactor builtin worker factories into Worker subclasses** + +Replace factory functions such as `minif2f_react(...)` and `swebench_react(...)` with importable `Worker` subclasses. Those classes should build sandbox-bound tools inside `execute(...)`, using the runtime objects they already receive: + +```python +async def execute(self, task: Task, *, context: WorkerContext) -> AsyncGenerator[WorkerStreamItem, None]: + sandbox = MiniF2FSandboxManager().reconnect(context.sandbox_id) + toolkit = MiniF2FToolkit(...) + delegate = ReActWorker( + name=self.name, + model=self.model, + tools=list(toolkit.get_tools()), + system_prompt=MINIF2F_SYSTEM_PROMPT, + max_iterations=30, + ) + async for item in delegate.execute(task, context=context): + yield item +``` + +If a sandbox manager currently only looks up sandboxes by definition task id, add a public lookup/reconnect path by `sandbox_id`. Do not force worker construction to know about sandbox registry keys. + +- [ ] **Step 6: Run worker contract tests** + +Run: + +```bash +uv run pytest ergon_core/tests/unit/api/test_worker_contract.py -q +``` + +Expected: PASS. 
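+
+Step 5 assumes a reconnect-by-`sandbox_id` lookup that managers may not expose yet. A minimal sketch of the shape such a path could take (class, method, and attribute names here are illustrative assumptions, not the real manager internals):
+
+```python
+from typing import Any
+
+
+class SandboxReconnectIndex:
+    """Hypothetical per-process index from sandbox_id to live sandbox handles."""
+
+    def __init__(self) -> None:
+        self._by_sandbox_id: dict[str, Any] = {}
+
+    def track(self, sandbox_id: str, sandbox: Any) -> None:
+        self._by_sandbox_id[sandbox_id] = sandbox
+
+    def reconnect(self, sandbox_id: str) -> Any:
+        sandbox = self._by_sandbox_id.get(sandbox_id)
+        if sandbox is None:
+            raise KeyError(f"no live sandbox for sandbox_id {sandbox_id!r}")
+        return sandbox
+```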
+ +--- + +### Task 8: Update Pydantic Registry To Produce And Publish Component Refs + +**Files:** +- Modify: `ergon_core/ergon_core/api/registry.py` +- Test: `ergon_core/tests/unit/registry/test_component_registry.py` + +- [ ] **Step 1: Add tests for ref generation and deregistration** + +Extend `ergon_core/tests/unit/registry/test_component_registry.py`: + +```python +def test_registry_records_import_refs_for_registered_components() -> None: + registry = ComponentRegistry(catalog_service=ComponentCatalogService()) + + registry.register_worker(ExampleWorker.type_slug, ExampleWorker) + ref = registry.component_refs[("worker", "example-worker")] + + assert ref.kind == "worker" + assert ref.slug == "example-worker" + assert ref.module == __name__ + assert ref.qualname == "ExampleWorker" + + +def test_registry_deregister_removes_component_and_ref() -> None: + registry = ComponentRegistry(catalog_service=ComponentCatalogService()) + registry.register_worker("example-worker", ExampleWorker) + + registry.deregister("worker", "example-worker") + + assert "example-worker" not in registry.workers + assert ("worker", "example-worker") not in registry.component_refs +``` + +- [ ] **Step 2: Run registry tests and verify they fail** + +Run: + +```bash +uv run pytest ergon_core/tests/unit/registry/test_component_registry.py -q +``` + +Expected: FAIL because `component_refs` and `deregister` do not exist. + +- [ ] **Step 3: Add `ComponentRef` tracking to `ComponentRegistry`** + +Modify `ergon_core/ergon_core/api/registry.py`: + +```python +from ergon_core.core.application.components.catalog import ComponentCatalogService, ComponentRef +from sqlmodel import Session +``` + +Add field: + +```python +catalog_service: ComponentCatalogService +component_refs: dict[tuple[str, str], ComponentRef] = Field(default_factory=dict) +``` + +Update register methods to call a private helper after `_register`: + +```python +self._remember_ref("worker", slug, worker_cls) +``` + +Implement: + +```python +def deregister(self, kind: str, slug: str) -> None: + mapping = self._mapping_for(kind) + mapping.pop(slug, None) + self.component_refs.pop((kind, slug), None) + +def publish(self, session: Session) -> None: + for ref in self.component_refs.values(): + self.catalog_service.upsert(session, ref) + +def _remember_ref(self, kind: str, slug: str, value: object) -> None: + self.component_refs[(kind, slug)] = ComponentRef( + kind=kind, + slug=slug, + module=value.__module__, + qualname=value.__qualname__, + ) +``` + +For worker classes, `__qualname__` is sufficient if the class is module-level. If a value lacks `__module__` or `__qualname__`, raise `ValueError` with a clear message. Do not preserve the old `WorkerFactory` public alias; workers should be registered as importable `Worker` subclasses and constructed by the catalog with only authoring configuration (`name`, `model`, metadata). + +Construct the global authoring registry with an explicit service dependency: + +```python +registry = ComponentRegistry(catalog_service=ComponentCatalogService()) +``` + +Do not use nullable service parameters or ad hoc fallback construction such as `service or ComponentCatalogService()`. Tests that need isolation should pass their own `ComponentCatalogService()` when constructing a fresh `ComponentRegistry`. + +- [ ] **Step 4: Run registry tests** + +Run: + +```bash +uv run pytest ergon_core/tests/unit/registry/test_component_registry.py -q +``` + +Expected: PASS. 
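+
+The `deregister` snippet above references a `_mapping_for` helper that is not spelled out; one possible shape (a sketch: the attribute names assume the per-kind mappings this plan already refers to as `registry.workers`, `registry.benchmarks`, `registry.evaluators`, and `registry.sandbox_managers`):
+
+```python
+def _mapping_for(self, kind: str) -> dict[str, object]:
+    # Assumed per-kind mapping fields; adjust to the registry's real names.
+    mappings: dict[str, dict[str, object]] = {
+        "worker": self.workers,
+        "benchmark": self.benchmarks,
+        "evaluator": self.evaluators,
+        "sandbox_manager": self.sandbox_managers,
+    }
+    try:
+        return mappings[kind]
+    except KeyError:
+        raise ValueError(f"unknown component kind {kind!r}") from None
+```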
+
+---
+
+### Task 9: Register Builtins And Smoke Components Into The Catalog
+
+**Files:**
+- Modify: `ergon_builtins/ergon_builtins/registry.py`
+- Modify: `tests/fixtures/smoke_components/__init__.py`
+- Test: `ergon_builtins/tests/unit/registry/test_builtin_pairings.py` or moved equivalent.
+
+- [ ] **Step 1: Add tests that builtins can publish refs into a DB session**
+
+Create or extend builtins registry tests:
+
+```python
+from sqlalchemy.pool import StaticPool
+from sqlmodel import Session, SQLModel, create_engine
+
+from ergon_core.api.registry import ComponentRegistry
+from ergon_core.core.application.components.catalog import ComponentCatalogService
+
+
+def _session() -> Session:
+    engine = create_engine(
+        "sqlite://",
+        connect_args={"check_same_thread": False},
+        poolclass=StaticPool,
+    )
+    SQLModel.metadata.create_all(engine)
+    return Session(engine)
+
+
+def test_register_builtins_can_publish_component_refs() -> None:
+    from ergon_builtins.registry import register_builtins
+
+    service = ComponentCatalogService()
+    registry = ComponentRegistry(catalog_service=service)
+    register_builtins(registry)
+    session = _session()
+
+    registry.publish(session)
+    session.commit()
+
+    ref = service.require(session, kind="worker", slug="training-stub")
+    assert ref.module.endswith("training_stub_worker")
+    assert ref.qualname == "TrainingStubWorker"
+```
+
+- [ ] **Step 2: Run publishing test and verify it fails if refs are incomplete**
+
+Run:
+
+```bash
+uv run pytest ergon_builtins/tests/unit/registry -q
+```
+
+Expected: PASS if Task 8 is complete; otherwise FAIL on missing refs.
+
+- [ ] **Step 3: Keep publishing explicit and outside registration functions**
+
+Keep registration functions focused on filling the in-process authoring registry:
+
+```python
+def register_builtins(target: ComponentRegistry = registry) -> None:
+    register_core_builtins(target)
+    _register_local_model_builtins()
+    _register_data_builtins(target)
+```
+
+Do not make builtins import DB/session code. Keep publishing as an explicit caller responsibility:
+
+```python
+register_builtins(registry)
+with get_session() as session:
+    registry.publish(session)
+    session.commit()
+```
+
+This keeps the builtins package independent of persistence.
+
+- [ ] **Step 4: Run builtins registry tests**
+
+Run:
+
+```bash
+uv run pytest ergon_builtins/tests/unit/registry -q
+```
+
+Expected: PASS.
+
+- [ ] **Step 5: Remove legacy builtins registry dict snapshots**
+
+After publishing tests pass, delete legacy dict snapshot exports from `ergon_builtins/ergon_builtins/registry.py`. The top-level builtins registry module should expose registration functions and install hints only, not old process-local maps.
+
+Remove exports named:
+
+```python
+BENCHMARKS
+WORKERS
+EVALUATORS
+SANDBOX_MANAGERS
+MODEL_BACKENDS
+```
+
+Keep sub-registry implementation details in `registry_core.py` and `registry_data.py` only as inputs to `register_core_builtins()` and `register_data_builtins()`. Update tests/callers that imported top-level dict snapshots to use either `ComponentRegistry` in authoring tests or `ComponentCatalogService` in runtime/catalog tests.
+
+- [ ] **Step 6: Convert worker factory functions to Worker subclasses**
+
+Before publishing worker refs into the catalog, ensure every registered worker slug points at an importable `Worker` subclass. If any existing builtins are module-level factory functions that return workers, replace them with small `Worker` subclasses or move their construction logic into the subclass initializer.
+ +This keeps the public mental model simple: + +```python +register_worker("training-stub", TrainingStubWorker) +worker = catalog.build_worker(session, slug="training-stub", name="primary", model="stub:constant") +``` + +There should be no public `Callable[..., Worker]` / `WorkerFactory` API after this migration. + +--- + +### Task 10: Add Catalog-Only Runtime Loading + +**Files:** +- Modify: `ergon_core/ergon_core/core/application/components/catalog.py` +- Modify runtime files listed in file structure. +- Test: core runtime registry tests. + +- [ ] **Step 1: Add test for catalog-backed runtime loading** + +Create `ergon_core/tests/unit/registry/test_catalog_backed_registry_resolution.py`: + +```python +from collections.abc import AsyncGenerator +from sqlalchemy.pool import StaticPool +from sqlmodel import Session, SQLModel, create_engine + +from ergon_core.api.benchmark import Task +from ergon_core.api.worker import Worker, WorkerContext, WorkerOutput +from ergon_core.api.worker.worker import WorkerStreamItem +from ergon_core.core.application.components.catalog import ComponentCatalogService, ComponentRef + + +class CatalogSmokeWorker(Worker): + type_slug = "catalog-smoke-worker" + + async def execute( + self, + task: Task, + *, + context: WorkerContext, + ) -> AsyncGenerator[WorkerStreamItem, None]: + yield WorkerOutput(output="ok", success=True) + + +def _session() -> Session: + engine = create_engine( + "sqlite://", + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + SQLModel.metadata.create_all(engine) + return Session(engine) + + +def test_build_worker_imports_worker_class_without_local_registration() -> None: + session = _session() + service = ComponentCatalogService() + service.upsert( + session, + ComponentRef( + kind="worker", + slug=CatalogSmokeWorker.type_slug, + module=__name__, + qualname="CatalogSmokeWorker", + ), + ) + session.commit() + + loaded = service.build_worker( + session, + slug=CatalogSmokeWorker.type_slug, + name="primary", + model="stub:constant", + ) + + assert isinstance(loaded, CatalogSmokeWorker) + assert loaded.name == "primary" +``` + +This test proves the catalog imports the persisted worker class and returns a real `Worker` without requiring process-local registry state or execution-only constructor arguments. + +- [ ] **Step 2: Run test and verify it fails** + +Run: + +```bash +uv run pytest ergon_core/tests/unit/registry/test_catalog_backed_registry_resolution.py -q +``` + +Expected: FAIL because `build_worker` does not exist yet. + +- [ ] **Step 3: Add catalog loading without registry caching** + +Do not extend `ComponentRegistry.require_*` into a cache-loading runtime API. Keep `ComponentRegistry` focused on in-process authoring, validation of explicitly registered objects, and publishing refs into the catalog. + +Add one generic loading helper to `ComponentCatalogService` for non-worker component types: + +```python +def load_ref(self, ref: ComponentRef) -> object: + return import_component_ref(ref) +``` + +Runtime code should call catalog resolution directly and not populate `registry.workers`, `registry.benchmarks`, `registry.evaluators`, or `registry.sandbox_managers`. + +- [ ] **Step 4: Add typed catalog loading helpers** + +Add typed helpers on `ComponentCatalogService` because they make runtime call sites easier to read. Workers should produce a real `Worker`, not a factory/constructor object. 
+ +```python +def build_worker( + self, + session: Session, + *, + slug: str, + name: str, + model: str | None, +) -> Worker: + ref = self.require(session, kind="worker", slug=slug) + worker_cls = self.load_ref(ref) + if not isinstance(worker_cls, type) or not issubclass(worker_cls, Worker): + raise TypeError( + f"Worker component {slug!r} resolved to {worker_cls!r}, expected a Worker subclass" + ) + return worker_cls( + name=name, + model=model, + metadata=ref.metadata, + ) + +def resolve_benchmark(self, session: Session, slug: str) -> type[Benchmark]: + return self.load_ref(self.require(session, kind="benchmark", slug=slug)) + +def resolve_evaluator(self, session: Session, slug: str) -> type[Evaluator]: + return self.load_ref(self.require(session, kind="evaluator", slug=slug)) + +def resolve_sandbox_manager(self, session: Session, slug: str) -> type[BaseSandboxManager]: + return self.load_ref(self.require(session, kind="sandbox_manager", slug=slug)) +``` + +These helpers must still read from Postgres and import the component on each call; do not populate `registry.workers`, `registry.benchmarks`, `registry.evaluators`, or `registry.sandbox_managers`. + +- [ ] **Step 5: Run catalog-backed registry tests** + +Run: + +```bash +uv run pytest ergon_core/tests/unit/registry -q +``` + +Expected: PASS. + +--- + +### Task 11: Publish Catalog Rows During CLI/API/Test Bootstrap + +**Files:** +- Modify: `ergon_cli/ergon_cli/main.py` +- Modify: `ergon_core/ergon_core/core/rest_api/app.py` +- Modify: test setup files. + +- [ ] **Step 1: Replace env-var plugin startup with explicit bootstrap helper** + +Create a function in a non-core module, for example `ergon_cli/ergon_cli/bootstrap.py`: + +```python +"""Process bootstrap for local CLI/API components.""" + +from ergon_builtins.registry import register_builtins +from ergon_core.api.registry import registry +from ergon_core.core.persistence.shared.db import get_session + + +def register_and_publish_builtins() -> None: + register_builtins(registry) + with get_session() as session: + registry.publish(session) + session.commit() +``` + +- [ ] **Step 2: Call bootstrap from CLI startup** + +Modify `ergon_cli/ergon_cli/main.py`: + +```python +from ergon_cli.bootstrap import register_and_publish_builtins +``` + +Call it before command handlers run. If commands like `doctor` should not require DB, skip publishing for those commands by calling it only in experiment/benchmark/eval/workflow handlers. + +- [ ] **Step 3: Add API startup bootstrap without env plugins** + +Do not import tests from core app. For local Docker, choose one explicit bootstrap: + +Option A, if `app.py` is local/dev-only: + +```python +from ergon_builtins.registry import register_builtins +from ergon_core.api.registry import registry + +register_builtins(registry) +with get_session() as session: + registry.publish(session) + session.commit() +``` + +Option B, if strict core independence is still desired: + +Create `ergon_cli/ergon_cli/api_app.py` or a top-level `ergon_app/local_api.py` that imports core `app`, registers/publishes builtins, registers/publishes smoke fixtures, and is the uvicorn target used by docker compose. + +Recommendation: use Option B to avoid recreating core-to-builtins coupling. 
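+
+A minimal sketch of the Option B module (using the `ergon_app/local_api.py` spelling; import paths reuse names already introduced in this plan, and smoke-fixture registration, covered in Step 4 below, would be layered on the same way):
+
+```python
+"""Local/dev uvicorn target: core app composed with builtins."""
+
+from ergon_builtins.registry import register_builtins
+from ergon_core.api.registry import registry
+from ergon_core.core.persistence.shared.db import get_session
+from ergon_core.core.rest_api.app import app as app  # re-exported uvicorn target
+
+register_builtins(registry)
+with get_session() as session:
+    registry.publish(session)
+    session.commit()
+```
+
+Docker compose would then point uvicorn at `ergon_app.local_api:app` instead of the core app module.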
+ +- [ ] **Step 4: Add smoke publishing in test bootstrap** + +For E2E/local Docker, explicit Python bootstrap should call: + +```python +from tests.fixtures.smoke_components import register_smoke_components + +register_smoke_components(registry) +with get_session() as session: + registry.publish(session) + session.commit() +``` + +Host-side pytest can still call this for in-process tests, but E2E must publish inside the API/Inngest process or before the stack starts against the shared DB. + +- [ ] **Step 5: Run CLI/API bootstrap tests** + +Run: + +```bash +uv run pytest ergon_cli/tests/unit ergon_core/tests/unit/test_app_mounts_harness_conditionally.py -q +``` + +Expected: PASS after tests are updated for no `ENABLE_TEST_HARNESS`. + +--- + +### Task 12: Update Runtime Jobs To Resolve Through Catalog When Needed + +**Files:** +- Modify runtime files listed in file structure. +- Test: existing runtime job tests plus new catalog-backed tests. + +- [ ] **Step 1: Update worker execute job** + +In `worker_execute.py`, when resolving worker and benchmark: + +```python +with get_session() as session: + worker = catalog.build_worker( + session, + slug=payload.worker_type, + name=payload.assigned_worker_slug, + model=payload.model_target, + ) +``` + +Build the `Task` with the runtime graph node identity. Do not derive this from the nullable static definition task id: + +```python +if payload.node_id is None: + raise ContractViolationError("worker-execute requires node_id") + +task = Task( + task_id=payload.node_id, + task_slug=payload.task_slug, + instance_key=instance_key, + description=payload.task_description, + task_payload=task_payload or EmptyTaskPayload(), +) +``` + +Build `WorkerContext` without duplicating task identity: + +```python +worker_context = WorkerContext( + run_id=payload.run_id, + definition_id=payload.definition_id, + execution_id=payload.execution_id, + sandbox_id=payload.sandbox_id, +) +``` + +`WorkerExecuteRequest` should carry only the runtime task id: + +```python +node_id: UUID # runtime task id, always present +``` + +If worker execution needs static task payload or instance data, resolve it from the persisted graph node: + +```python +node = session.get(RunGraphNode, payload.node_id) +if node is None: + raise ContractViolationError(f"RunGraphNode {payload.node_id} not found") + +if node.definition_task_id is not None: + task_row, instance_row = DefinitionRepository().task_with_instance( + session, + node.definition_task_id, + ) + task_payload = task_row.task_payload_as(benchmark_cls.task_payload_model) + instance_key = instance_row.instance_key +else: + task_payload = None + instance_key = str(payload.node_id) +``` + +Avoid opening duplicate sessions if the function already opens a session for task rows. Reuse the existing session where practical. + +- [ ] **Step 2: Update evaluate task job** + +Use: + +```python +evaluator_cls = catalog.resolve_evaluator(session, evaluator_type) +benchmark_cls = catalog.resolve_benchmark(session, benchmark_type) +manager_cls = catalog.resolve_sandbox_manager(session, benchmark_type) +``` + +Do not keep the previous `DefaultSandboxManager` fallback for known benchmark/sandbox slugs. If a persisted benchmark or sandbox slug has no catalog entry, raise immediately; that means definition-time validation or catalog publishing failed. + +- [ ] **Step 3: Update sandbox setup and persist outputs** + +Use catalog resolution where a sandbox slug is explicit. Do not fall back to `DefaultSandboxManager` for unknown explicit slugs. 
The purpose of definition-time validation is to prevent unknown slugs from being persisted; if one still reaches runtime, fail loudly with the missing slug and registry/catalog context.
+
+```python
+manager_cls = catalog.resolve_sandbox_manager(session, slug)
+```
+
+- [ ] **Step 4: Update experiment service and launch**
+
+Resolve benchmark/evaluator via the catalog-backed `resolve_benchmark` / `resolve_evaluator` helpers, using the DB session already used in the service.
+
+- [ ] **Step 5: Update workflow/task validation**
+
+Replace `slug in registry.workers` checks with catalog-backed existence checks:
+
+```python
+catalog.require(session, kind="worker", slug=slug)
+```
+
+This is the point where cross-process correctness improves: validation no longer depends on the current process having imported builtins first.
+
+- [ ] **Step 6: Run runtime tests**
+
+Run:
+
+```bash
+uv run pytest ergon_core/tests/unit/runtime ergon_core/tests/unit/registry -q
+```
+
+Expected: PASS.
+
+---
+
+### Task 13: Delete `ERGON_STARTUP_PLUGINS` And `ENABLE_SMOKE_FIXTURES`
+
+**Files:**
+- Modify: `ergon_core/ergon_core/core/shared/settings.py`
+- Modify: `ergon_core/ergon_core/core/rest_api/app.py`
+- Modify: `ergon_cli/ergon_cli/composition/__init__.py`
+- Modify: `docker-compose.yml`, `.github/workflows/e2e-benchmarks.yml`, scripts/docs/tests.
+
+- [ ] **Step 1: Add grep-based env-var deletion test**
+
+Create `tests/unit/architecture/test_retired_env_vars.py`. Note the self-exclusion: this file necessarily spells the retired names, and it lives under a scanned root, so it must skip itself or it can never pass:
+
+```python
+from pathlib import Path
+
+
+RETIRED = {
+    "ERGON_STARTUP_PLUGINS",
+    "ENABLE_SMOKE_FIXTURES",
+}
+
+
+def test_retired_plugin_and_smoke_env_vars_are_not_used_in_code() -> None:
+    offenders: list[str] = []
+    this_file = Path(__file__).resolve()
+    roots = [Path("ergon_core"), Path("ergon_cli"), Path("ergon_builtins"), Path("tests"), Path("scripts")]
+    for root in roots:
+        for path in root.rglob("*"):
+            if path.resolve() == this_file:
+                # This test names the retired vars; never flag itself.
+                continue
+            if path.is_file() and path.suffix in {".py", ".sh", ".ts", ".tsx", ".yml", ".yaml", ".json"}:
+                text = path.read_text(errors="ignore")
+                if any(name in text for name in RETIRED):
+                    offenders.append(str(path))
+    assert offenders == []
+```
+
+- [ ] **Step 2: Run env-var deletion test and verify it fails**
+
+Run:
+
+```bash
+uv run pytest tests/unit/architecture/test_retired_env_vars.py -q
+```
+
+Expected: FAIL listing current usage.
+
+- [ ] **Step 3: Remove startup plugin settings and loader**
+
+Delete from `Settings`:
+
+```python
+startup_plugin_specs
+startup_plugins
+```
+
+Delete `_run_startup_plugins` from `app.py`.
+
+- [ ] **Step 4: Remove `ENABLE_SMOKE_FIXTURES` fallback**
+
+In `ergon_cli/ergon_cli/composition/__init__.py`, delete:
+
+```python
+os.environ.get("ENABLE_SMOKE_FIXTURES", ...)
+```
+
+Smoke registration should happen through explicit test/bootstrap code, not inside generic CLI composition.
+
+- [ ] **Step 5: Remove env vars from compose/workflows/scripts**
+
+Delete `ERGON_STARTUP_PLUGINS` and `ENABLE_SMOKE_FIXTURES` from:
+
+```text
+docker-compose.yml
+.github/workflows/e2e-benchmarks.yml
+scripts/smoke_local_up.sh
+tests/real_llm/benchmarks/test_smoke_stub.py
+```
+
+- [ ] **Step 6: Run deletion test**
+
+Run:
+
+```bash
+uv run pytest tests/unit/architecture/test_retired_env_vars.py -q
+```
+
+Expected: PASS.
+
+---
+
+### Task 14: Delete `ENABLE_TEST_HARNESS` And `TEST_HARNESS_SECRET`
+
+**Files:**
+- Modify: `ergon_core/ergon_core/core/shared/settings.py`
+- Modify: `ergon_core/ergon_core/core/rest_api/app.py`
+- Modify: `ergon_core/ergon_core/core/rest_api/test_harness.py`
+- Modify dashboard test clients/routes referencing `TEST_HARNESS_SECRET`.
+- Modify compose/workflows/package scripts/docs. + +- [ ] **Step 1: Extend retired env-var test** + +Add to `RETIRED`: + +```python +"ENABLE_TEST_HARNESS", +"TEST_HARNESS_SECRET", +``` + +Run: + +```bash +uv run pytest tests/unit/architecture/test_retired_env_vars.py -q +``` + +Expected: FAIL listing all remaining uses. + +- [ ] **Step 2: Always mount test harness under a danger-prefixed route** + +Change test harness router: + +```python +router = APIRouter(prefix="/api/__danger__/test-harness", tags=["danger-test-harness"]) +``` + +Update all clients from `/api/test/...` to `/api/__danger__/test-harness/...`. + +- [ ] **Step 3: Remove secret requirement from write endpoints** + +Delete `_require_secret` from `test_harness.py`. + +Remove `x_test_secret` parameters and `_require_secret(x_test_secret)` calls from: + +```python +seed_run +reset_test_rows +``` + +Decide whether `submit_cohort` should remain write-but-unguarded; with the danger-prefixed route, it should also be under the same unauthenticated local harness policy. + +- [ ] **Step 4: Remove conditional mount** + +In `app.py`, replace: + +```python +if settings.enable_test_harness: + app.include_router(_test_harness_router) +``` + +with: + +```python +app.include_router(_test_harness_router) +``` + +Delete `enable_test_harness` from `Settings`. + +- [ ] **Step 5: Update dashboard and Python clients** + +Update: + +```text +ergon-dashboard/tests/helpers/backendHarnessClient.ts +ergon-dashboard/src/app/api/test/dashboard/seed/route.ts +ergon-dashboard/src/lib/config.ts +tests/e2e/_asserts.py +tests/e2e/test_*_smoke.py +tests/integration/smokes/test_smoke_harness.py +package.json +scripts/smoke_local_run.sh +``` + +Remove `X-Test-Secret` headers and env lookups. Update URL paths to danger-prefixed harness routes. + +- [ ] **Step 6: Update tests for always-mounted harness** + +Replace `test_app_mounts_harness_conditionally.py` with a test named: + +```python +def test_app_mounts_danger_test_harness_routes() -> None: + routes = {route.path for route in app.routes} + assert "/api/__danger__/test-harness/read/run/{run_id}/state" in routes +``` + +- [ ] **Step 7: Run retired env-var test** + +Run: + +```bash +uv run pytest tests/unit/architecture/test_retired_env_vars.py -q +``` + +Expected: PASS. + +--- + +### Task 15: Verification + +**Files:** +- No planned source files beyond fixes revealed by tests. + +- [ ] **Step 1: Verify retired env vars are gone** + +Run: + +```bash +rg "ENABLE_TEST_HARNESS|TEST_HARNESS_SECRET|ERGON_STARTUP_PLUGINS|ENABLE_SMOKE_FIXTURES|ERGON_SKIP_INFRA_CHECK" ergon_core ergon_builtins ergon_cli tests scripts docker-compose.yml .github package.json ergon-dashboard -n +``` + +Expected: no matches, except historical docs if the team chooses not to update old planning documents. The architecture test should search code/config, not historical plans. + +- [ ] **Step 2: Verify component catalog migration imports** + +Run: + +```bash +uv run alembic -c ergon_core/alembic.ini upgrade head +``` + +Expected: migration succeeds on a local/dev DB. + +- [ ] **Step 3: Run package-owned unit tests** + +Run: + +```bash +uv run pytest ergon_core/tests/unit ergon_builtins/tests/unit ergon_cli/tests/unit tests/unit -q +``` + +Expected: PASS. + +- [ ] **Step 4: Run backend unit script** + +Run: + +```bash +pnpm run test:be:unit +``` + +Expected: PASS. + +- [ ] **Step 5: Run E2E collection** + +Run: + +```bash +uv run pytest tests/e2e --collect-only -q +``` + +Expected: PASS. 
+ +- [ ] **Step 6: Run lint on changed Python paths** + +Run: + +```bash +uv run ruff check ergon_core ergon_builtins ergon_cli tests scripts +``` + +Expected: PASS. + +--- + +## Self-Review + +- Spec coverage: The plan covers package-owned test layout, PG component catalog schema, catalog service, registry publishing/loading, runtime refactor, and deletion of all five env vars named in the discussion. +- Placeholder scan: The plan contains no placeholder instructions. The migration revision id must be chosen from the actual Alembic head during execution, and the plan explicitly instructs how to do that. +- Type consistency: The same names are used throughout: `ComponentCatalogEntry`, `ComponentCatalogService`, `ComponentRef`, `component_catalog`, `registry.publish`, and catalog-backed `require_*` methods. diff --git a/ergon-dashboard/package.json b/ergon-dashboard/package.json index a55e16bb..5d88cdd8 100644 --- a/ergon-dashboard/package.json +++ b/ergon-dashboard/package.json @@ -28,6 +28,7 @@ "react": "^18", "react-dom": "^18", "react-markdown": "^10.1.0", + "react-resizable-panels": "^4.10.0", "recharts": "^3.8.1", "remark-gfm": "^4.0.1", "socket.io": "^4.8.3", diff --git a/ergon-dashboard/pnpm-lock.yaml b/ergon-dashboard/pnpm-lock.yaml index fd126578..ee2b0094 100644 --- a/ergon-dashboard/pnpm-lock.yaml +++ b/ergon-dashboard/pnpm-lock.yaml @@ -35,6 +35,9 @@ importers: react-markdown: specifier: ^10.1.0 version: 10.1.0(@types/react@18.3.27)(react@18.3.1) + react-resizable-panels: + specifier: ^4.10.0 + version: 4.10.0(react-dom@18.3.1(react@18.3.1))(react@18.3.1) recharts: specifier: ^3.8.1 version: 3.8.1(@types/react@18.3.27)(react-dom@18.3.1(react@18.3.1))(react-is@16.13.1)(react@18.3.1)(redux@5.0.1) @@ -3096,6 +3099,12 @@ packages: redux: optional: true + react-resizable-panels@4.10.0: + resolution: {integrity: sha512-frjewRQt7TCv/vCH1pJfjZ7RxAhr5pKuqVQtVgzFq/vherxBFOWyC3xMbryx5Ti2wylViGUFc93Etg4rB3E0UA==} + peerDependencies: + react: ^18.0.0 || ^19.0.0 + react-dom: ^18.0.0 || ^19.0.0 + react@18.3.1: resolution: {integrity: sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==} engines: {node: '>=0.10.0'} @@ -7311,6 +7320,11 @@ snapshots: '@types/react': 18.3.27 redux: 5.0.1 + react-resizable-panels@4.10.0(react-dom@18.3.1(react@18.3.1))(react@18.3.1): + dependencies: + react: 18.3.1 + react-dom: 18.3.1(react@18.3.1) + react@18.3.1: dependencies: loose-envify: 1.4.0 diff --git a/ergon-dashboard/scripts/generate-rest-contracts.mjs b/ergon-dashboard/scripts/generate-rest-contracts.mjs index 24745ab7..04ffa7e6 100644 --- a/ergon-dashboard/scripts/generate-rest-contracts.mjs +++ b/ergon-dashboard/scripts/generate-rest-contracts.mjs @@ -10,6 +10,16 @@ const source = readFileSync(contractsPath, "utf8") .replace('import { makeApi, Zodios, type ZodiosOptions } from "@zodios/core";\n', "") // openapi-zod-client generates z.record(V) but Zod requires z.record(K, V). .replace(/z\.record\((?!z\.string\(\))/g, "z.record(z.string(), ") + // Preserve literal discriminators for generated context-event payload unions. + .replace( + /event_type: z\.string\(\)\.optional\(\)\.default\("([^"]+)"\)/g, + 'event_type: z.literal("$1").default("$1")', + ) + // Preserve literal discriminators for generated context-part unions. 
+ .replace( + /part_kind: z\.string\(\)\.optional\(\)\.default\("([^"]+)"\)/g, + 'part_kind: z.literal("$1").default("$1")', + ) // Recursive JSON schemas must be lazy or the generated module dereferences // JsonValue_Input before it has been initialized. .replace( diff --git a/ergon-dashboard/src/components/dag/DAGCanvas.tsx b/ergon-dashboard/src/components/dag/DAGCanvas.tsx index ef653852..a2212144 100644 --- a/ergon-dashboard/src/components/dag/DAGCanvas.tsx +++ b/ergon-dashboard/src/components/dag/DAGCanvas.tsx @@ -28,6 +28,7 @@ import "@xyflow/react/dist/style.css"; import { TaskStatus, type WorkflowRunState } from "@/lib/types"; import { nodeTypes, type TaskNodeType } from "./TaskNode"; import { GraphDependencyEdge } from "./edges/GraphDependencyEdge"; +import { buildContainerEvaluationRollup } from "@/features/evaluation/selectors"; import { GraphExpansionProvider } from "@/features/graph/hooks/useGraphExpansion"; import { computeHierarchicalLayout, calculateExpandedContainers } from "@/features/graph/layout/hierarchicalLayout"; import { DEFAULT_EXPANDED_DEPTH } from "@/features/graph/layout/layoutTypes"; @@ -175,6 +176,36 @@ function SearchCard({ ); } +function EvaluationLensCard({ + active, + count, + onToggle, +}: { + active: boolean; + count: number; + onToggle: () => void; +}) { + return ( + + ); +} + const LEGEND_ITEMS: { status: string; label: string; cssVar: string }[] = [ { status: "completed", label: "completed", cssVar: "var(--status-completed)" }, { status: "running", label: "running", cssVar: "var(--status-running)" }, @@ -220,6 +251,7 @@ function DAGCanvasInner({ const [edges, setEdges, onEdgesChange] = useEdgesState([]); const [containerDims, setContainerDims] = useState>(new Map()); const [prevTaskIds, setPrevTaskIds] = useState>(new Set()); + const [evaluationLensActive, setEvaluationLensActive] = useState(false); const { fitView: rfFitView } = useReactFlow(); const fitViewTimer = useRef | null>(null); @@ -273,7 +305,7 @@ function DAGCanvasInner({ if ( task.name.toLowerCase().includes(searchLower) || task.description?.toLowerCase().includes(searchLower) || - task.assignedWorkerName?.toLowerCase().includes(searchLower) + task.assignedWorkerSlug?.toLowerCase().includes(searchLower) ) { count++; } @@ -281,6 +313,20 @@ function DAGCanvasInner({ return count; }, [searchQuery, runState?.tasks]); + const evaluationRollups = useMemo(() => { + const rollups = new Map>(); + if (!runState?.tasks) return rollups; + for (const taskId of runState.tasks.keys()) { + rollups.set(taskId, buildContainerEvaluationRollup(runState, taskId)); + } + return rollups; + }, [runState]); + + const evaluationBearingCount = useMemo( + () => Array.from(evaluationRollups.values()).filter((rollup) => rollup !== null).length, + [evaluationRollups], + ); + useEffect(() => { if (!runState?.tasks || runState.tasks.size === 0) return; @@ -293,6 +339,8 @@ function DAGCanvasInner({ "LR", newNodeIds, highlightedTaskIds, + evaluationRollups, + evaluationLensActive, ); setNodes(result.nodes as TaskNodeType[]); @@ -311,6 +359,8 @@ function DAGCanvasInner({ selectedTaskId, newNodeIds, highlightedTaskIds, + evaluationRollups, + evaluationLensActive, setNodes, setEdges, rfFitView, @@ -486,6 +536,11 @@ function DAGCanvasInner({ onSearchChange={handleSearchChange} matchCount={matchCount} /> + setEvaluationLensActive((active) => !active)} + />
{/* Floating controls — bottom-left */} diff --git a/ergon-dashboard/src/components/dag/TaskNode.tsx b/ergon-dashboard/src/components/dag/TaskNode.tsx index dfe589c8..1be4c2f1 100644 --- a/ergon-dashboard/src/components/dag/TaskNode.tsx +++ b/ergon-dashboard/src/components/dag/TaskNode.tsx @@ -10,6 +10,7 @@ import { memo } from "react"; import { type Node, type NodeProps } from "@xyflow/react"; import type { TaskState } from "@/lib/types"; +import type { EvaluationRollup } from "@/features/evaluation/contracts"; import { useGraphExpansion } from "@/features/graph/hooks/useGraphExpansion"; import { getNodeVariant } from "@/features/graph/layout/layoutTypes"; import { ContainerNode } from "@/features/graph/components/ContainerNode"; @@ -27,6 +28,8 @@ export type TaskNodeData = { maxGraphDepth?: number; /** Dagre rank direction used for this layout pass (drives handle positions). */ graphLayoutDirection?: "TB" | "LR"; + evaluationRollup?: EvaluationRollup | null; + evaluationLensActive?: boolean; }; export type TaskNodeType = Node; @@ -41,6 +44,8 @@ function TaskNodeComponent({ data }: NodeProps) { isNew = false, maxGraphDepth, graphLayoutDirection = "LR", + evaluationRollup = null, + evaluationLensActive = false, } = data; const { expandedContainers, toggleExpand, containerDimensions } = useGraphExpansion(); @@ -71,6 +76,8 @@ function TaskNodeComponent({ data }: NodeProps) { containerHeight={dims?.height ?? 100} layoutDirection={graphLayoutDirection} maxGraphDepth={maxGraphDepth} + evaluationRollup={evaluationRollup} + evaluationLensActive={evaluationLensActive} /> ); @@ -89,6 +96,8 @@ function TaskNodeComponent({ data }: NodeProps) { highlighted={highlighted} layoutDirection={graphLayoutDirection} maxGraphDepth={maxGraphDepth} + evaluationRollup={evaluationRollup} + evaluationLensActive={evaluationLensActive} /> ); diff --git a/ergon-dashboard/src/components/panels/EvaluationPanel.tsx b/ergon-dashboard/src/components/panels/EvaluationPanel.tsx index 66f90111..4e7fe400 100644 --- a/ergon-dashboard/src/components/panels/EvaluationPanel.tsx +++ b/ergon-dashboard/src/components/panels/EvaluationPanel.tsx @@ -6,6 +6,21 @@ function formatPercent(score: number): string { return `${(score * 100).toFixed(1)}%`; } +function statusBadgeClass(status: string): string { + switch (status) { + case "passed": + return "bg-emerald-50 text-emerald-700 ring-emerald-200"; + case "failed": + return "bg-rose-50 text-rose-700 ring-rose-200"; + case "errored": + return "bg-amber-50 text-amber-700 ring-amber-200"; + case "skipped": + return "bg-slate-100 text-slate-600 ring-slate-200"; + default: + return "bg-gray-100 text-gray-700 ring-gray-200"; + } +} + function EvaluationCriteriaEmpty({ detail }: { detail: string }) { return (
+
+
Evaluator
+
+ {evaluation.evaluatorName} +
+
+
+
Aggregation
+
+ {evaluation.aggregationRule} +
+
Normalized
@@ -72,22 +99,78 @@ export function EvaluationPanel({ >
-
- {criterion.stageName}: {criterion.criterionDescription} +
+ + {criterion.status} + +
+ {criterion.stageName}: {criterion.criterionDescription} +
- {criterion.criterionType} + {criterion.criterionName} · {criterion.criterionType} · weight {criterion.weight}
{criterion.score} / {criterion.maxScore} +
+ contribution {criterion.contribution} +
+ {criterion.modelReasoning ? ( +
+
+ Reasoning +
+

{criterion.modelReasoning}

+
+ ) : null} + {criterion.skippedReason ? ( +
+ Skipped: {criterion.skippedReason} +
+ ) : null} + {criterion.error ? ( +
+                  {JSON.stringify(criterion.error, null, 2)}
+                
+ ) : null} {criterion.feedback ? (

{criterion.feedback}

) : null} + {criterion.evaluationInput ? ( +
+ + Evaluation input + +
+                    {criterion.evaluationInput}
+                  
+
+ ) : null} + {(criterion.evaluatedActionIds.length > 0 || criterion.evaluatedResourceIds.length > 0) && ( +
+ {criterion.evaluatedActionIds.map((id) => ( + + action {id} + + ))} + {criterion.evaluatedResourceIds.map((id) => ( + + resource {id} + + ))} +
+ )}
))}
diff --git a/ergon-dashboard/src/components/run/RunWorkspacePage.tsx b/ergon-dashboard/src/components/run/RunWorkspacePage.tsx index 2320e0b5..bd6bba93 100644 --- a/ergon-dashboard/src/components/run/RunWorkspacePage.tsx +++ b/ergon-dashboard/src/components/run/RunWorkspacePage.tsx @@ -2,6 +2,7 @@ import Link from "next/link"; import { useEffect, useMemo, useRef, useState } from "react"; +import { Group, Panel, Separator, type Layout } from "react-resizable-panels"; import { DAGCanvas } from "@/components/dag/DAGCanvas"; import { StatusBadge } from "@/components/common/StatusBadge"; @@ -21,6 +22,42 @@ import { useRunState } from "@/hooks/useRunState"; import { buildRunEvents } from "@/lib/runEvents"; import { RunLifecycleStatus, SerializedWorkflowRunState, TaskStatus } from "@/lib/types"; +const VERTICAL_LAYOUT_STORAGE_KEY = "ergon-run-debugger-vertical-layout:v1"; +const HORIZONTAL_LAYOUT_STORAGE_KEY = "ergon-run-debugger-horizontal-layout:v1"; +const DEFAULT_VERTICAL_LAYOUT: Layout = { "graph-workspace": 62, timeline: 38 }; +const DEFAULT_HORIZONTAL_LAYOUT: Layout = { graph: 58, workspace: 42 }; + +function loadPanelLayout(storageKey: string, fallback: Layout): Layout { + if (typeof window === "undefined") return fallback; + + try { + const raw = window.localStorage.getItem(storageKey); + if (!raw) return fallback; + const parsed = JSON.parse(raw) as Layout; + return Object.fromEntries( + Object.entries(fallback).map(([id, defaultSize]) => { + const size = parsed[id]; + return [id, Number.isFinite(size) ? size : defaultSize]; + }), + ); + } catch { + return fallback; + } +} + +function savePanelLayout(storageKey: string, layout: Layout): void { + try { + window.localStorage.setItem(storageKey, JSON.stringify(layout)); + } catch { + // Ignore storage failures; resizing should still work for the session. + } +} + +function panelPercent(layout: Layout, id: string, fallback: number): string { + const size = layout[id]; + return `${Number.isFinite(size) ? size : fallback}%`; +} + function formatSeconds(value: number | null): string { if (value == null) return "—"; if (value < 60) return `${value.toFixed(1)}s`; @@ -60,6 +97,13 @@ export function RunWorkspacePage({ const [selectionNotice, setSelectionNotice] = useState(null); const [statusFilter, setStatusFilter] = useState(null); const [isStreamOpen, setIsStreamOpen] = useState(false); + const [verticalLayout, setVerticalLayout] = useState(() => + loadPanelLayout(VERTICAL_LAYOUT_STORAGE_KEY, DEFAULT_VERTICAL_LAYOUT), + ); + const [horizontalLayout, setHorizontalLayout] = useState(() => + loadPanelLayout(HORIZONTAL_LAYOUT_STORAGE_KEY, DEFAULT_HORIZONTAL_LAYOUT), + ); + const [hasLoadedPanelLayouts, setHasLoadedPanelLayouts] = useState(false); const { runState, isLoading, error, isSubscribed } = useRunState(runId, initialRunState); // A null snapshot means the graph follows live state; a sequence replays @@ -76,6 +120,12 @@ export function RunWorkspacePage({ selectedActivityIdRef.current = selectedActivityId; }, [selectedActivityId]); + useEffect(() => { + setVerticalLayout(loadPanelLayout(VERTICAL_LAYOUT_STORAGE_KEY, DEFAULT_VERTICAL_LAYOUT)); + setHorizontalLayout(loadPanelLayout(HORIZONTAL_LAYOUT_STORAGE_KEY, DEFAULT_HORIZONTAL_LAYOUT)); + setHasLoadedPanelLayouts(true); + }, []); + // Fetch mutations once per run load so snapshot selection is always ready. useEffect(() => { let cancelled = false; @@ -403,8 +453,7 @@ export function RunWorkspacePage({
)} -
+
{selectionNotice && (
)} -
0 ? 300 : 0, - paddingRight: isInspectorOpen ? 476 : 0, + 0 ? "with-timeline" : "without-timeline" + }`} + orientation="vertical" + defaultLayout={activities.length > 0 ? verticalLayout : { "graph-workspace": 100 }} + onLayoutChange={(layout) => { + if (activities.length > 0) { + setVerticalLayout(layout); + savePanelLayout(VERTICAL_LAYOUT_STORAGE_KEY, layout); + } }} + className="size-full" > - -
- - {activities.length > 0 && ( -
- -
- )} - - {isStreamOpen && events.length > 0 && ( -
- { - setSelectionNotice(null); - setSelectedTaskId(id); - }} - onSequenceClick={(seq) => { - requestedSequenceRef.current = seq; - handleSequenceChange(seq); - }} - /> -
- )} - - {isInspectorOpen ? ( -
0 + ? panelPercent(verticalLayout, "graph-workspace", 62) + : "100%" + } + minSize="28%" > - setSelectedTaskId(null)} - onJumpToSequence={(seq) => { - requestedSequenceRef.current = seq; - handleSequenceChange(seq); + { + if (isInspectorOpen) { + setHorizontalLayout(layout); + savePanelLayout(HORIZONTAL_LAYOUT_STORAGE_KEY, layout); + } }} - selectedTime={selectedTimelineTime} - selectedSequence={snapshotSequence} - selectedActivity={selectedActivity} - /> -
- ) : ( -
-
-
- Task inspection -
-

- Click node → workspace drawer -

-

State, outputs, turns, and evals appear scoped to the selected sequence.

- {selectedTask && ( -
- Ready to inspect {selectedTask.name}. -
+ className="size-full" + > + +
+ + + {isStreamOpen && events.length > 0 && ( +
+ { + setSelectionNotice(null); + setSelectedTaskId(id); + }} + onSequenceClick={(seq) => { + requestedSequenceRef.current = seq; + handleSequenceChange(seq); + }} + /> +
+ )} + + {!isInspectorOpen && ( +
+
+
+ Task inspection +
+

+ Click node → workspace drawer +

+

State, outputs, turns, and evals appear scoped to the selected sequence.

+ {selectedTask && ( +
+ Ready to inspect {selectedTask.name}. +
+ )} +
+
+ )} +
+
+ + {isInspectorOpen && ( + <> + +
+ + +
+ setSelectedTaskId(null)} + onJumpToSequence={(seq) => { + requestedSequenceRef.current = seq; + handleSequenceChange(seq); + }} + selectedTime={selectedTimelineTime} + selectedSequence={snapshotSequence} + selectedActivity={selectedActivity} + /> +
+
+ )} -
-
- )} + + + + {activities.length > 0 && ( + <> + +
+ + +
+ +
+
+ + )} +
); diff --git a/ergon-dashboard/src/components/workspace/TaskWorkspace.tsx b/ergon-dashboard/src/components/workspace/TaskWorkspace.tsx index 139e637c..2595b504 100644 --- a/ergon-dashboard/src/components/workspace/TaskWorkspace.tsx +++ b/ergon-dashboard/src/components/workspace/TaskWorkspace.tsx @@ -282,7 +282,7 @@ export function TaskWorkspace({ )}
- Worker: {task.assignedWorkerName ?? "—"} + Worker: {task.assignedWorkerSlug ?? "—"} Level: {task.level} Leaf task: {task.isLeaf ? "yes" : "no"} Attempts: {filteredEvidence.executions.length || 0} diff --git a/ergon-dashboard/src/features/activity/buildRunActivities.test.ts b/ergon-dashboard/src/features/activity/buildRunActivities.test.ts index 9183939a..456b42ee 100644 --- a/ergon-dashboard/src/features/activity/buildRunActivities.test.ts +++ b/ergon-dashboard/src/features/activity/buildRunActivities.test.ts @@ -54,6 +54,7 @@ test("buildRunActivities surfaces semantic activity kinds without creating actor runState.contextEventsByTask.set(noisyTaskId, [ { id: "context-noisy", + runId: runState.id, taskExecutionId: "execution-noisy", taskNodeId: noisyTaskId, workerBindingKey: "worker-1", diff --git a/ergon-dashboard/src/features/evaluation/contracts.ts b/ergon-dashboard/src/features/evaluation/contracts.ts new file mode 100644 index 00000000..628de16d --- /dev/null +++ b/ergon-dashboard/src/features/evaluation/contracts.ts @@ -0,0 +1,19 @@ +export type EvalCriterionStatus = "passed" | "failed" | "errored" | "skipped"; + +export type EvalRollupStatus = "passing" | "failing" | "errored" | "skipped" | "mixed"; + +export type RubricStatusSummaryStatus = EvalRollupStatus | "none"; + +export interface EvaluationRollup { + status: EvalRollupStatus; + totalCriteria: number; + passed: number; + failed: number; + errored: number; + skipped: number; + normalizedScore: number; + maxScore: number; + evaluatorNames: string[]; + attachedTaskIds: string[]; + criterionStatuses: EvalCriterionStatus[]; +} diff --git a/ergon-dashboard/src/features/evaluation/selectors.test.ts b/ergon-dashboard/src/features/evaluation/selectors.test.ts new file mode 100644 index 00000000..f75e65c0 --- /dev/null +++ b/ergon-dashboard/src/features/evaluation/selectors.test.ts @@ -0,0 +1,151 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import type { TaskEvaluationState, TaskState, WorkflowRunState } from "@/lib/types"; +import { TaskStatus } from "@/lib/types"; +import { + buildContainerEvaluationRollup, + combineEvaluationStatuses, + evaluationToRollup, + isEvaluationBearingTask, +} from "./selectors"; + +function task(id: string, childIds: string[] = []): TaskState { + return { + id, + name: id, + description: id, + status: TaskStatus.COMPLETED, + parentId: null, + childIds, + dependsOnIds: [], + isLeaf: childIds.length === 0, + level: 0, + assignedWorkerId: null, + assignedWorkerSlug: null, + startedAt: null, + completedAt: null, + history: [], + lastTrigger: null, + }; +} + +function evaluation(taskId: string, statuses: Array<"passed" | "failed" | "errored" | "skipped">): TaskEvaluationState { + return { + id: `evaluation-${taskId}`, + runId: "run-1", + taskId, + evaluatorName: "rubric", + aggregationRule: "weighted_sum", + totalScore: statuses.filter((status) => status === "passed").length, + maxScore: statuses.length, + normalizedScore: statuses.length > 0 ? statuses.filter((status) => status === "passed").length / statuses.length : 0, + stagesEvaluated: 1, + stagesPassed: statuses.every((status) => status === "passed") ? 
diff --git a/ergon-dashboard/src/features/evaluation/selectors.test.ts b/ergon-dashboard/src/features/evaluation/selectors.test.ts
new file mode 100644
index 00000000..f75e65c0
--- /dev/null
+++ b/ergon-dashboard/src/features/evaluation/selectors.test.ts
@@ -0,0 +1,151 @@
+import assert from "node:assert/strict";
+import test from "node:test";
+
+import type { TaskEvaluationState, TaskState, WorkflowRunState } from "@/lib/types";
+import { TaskStatus } from "@/lib/types";
+import {
+  buildContainerEvaluationRollup,
+  combineEvaluationStatuses,
+  evaluationToRollup,
+  isEvaluationBearingTask,
+} from "./selectors";
+
+function task(id: string, childIds: string[] = []): TaskState {
+  return {
+    id,
+    name: id,
+    description: id,
+    status: TaskStatus.COMPLETED,
+    parentId: null,
+    childIds,
+    dependsOnIds: [],
+    isLeaf: childIds.length === 0,
+    level: 0,
+    assignedWorkerId: null,
+    assignedWorkerSlug: null,
+    startedAt: null,
+    completedAt: null,
+    history: [],
+    lastTrigger: null,
+  };
+}
+
+function evaluation(taskId: string, statuses: Array<"passed" | "failed" | "errored" | "skipped">): TaskEvaluationState {
+  return {
+    id: `evaluation-${taskId}`,
+    runId: "run-1",
+    taskId,
+    evaluatorName: "rubric",
+    aggregationRule: "weighted_sum",
+    totalScore: statuses.filter((status) => status === "passed").length,
+    maxScore: statuses.length,
+    normalizedScore: statuses.length > 0 ? statuses.filter((status) => status === "passed").length / statuses.length : 0,
+    stagesEvaluated: 1,
+    stagesPassed: statuses.every((status) => status === "passed") ? 1 : 0,
+    failedGate: null,
+    createdAt: "2026-04-27T12:00:00.000Z",
+    criterionResults: statuses.map((status, index) => ({
+      id: `${taskId}-${index}`,
+      stageNum: 0,
+      stageName: "default",
+      criterionNum: index,
+      criterionSlug: `${status}_criterion`,
+      criterionType: "fixture",
+      criterionDescription: `${status} criterion`,
+      criterionName: `${status} criterion`,
+      status,
+      passed: status === "passed",
+      weight: 1,
+      contribution: status === "passed" ? 1 : 0,
+      score: status === "passed" ? 1 : 0,
+      maxScore: 1,
+      feedback: null,
+      modelReasoning: null,
+      skippedReason: null,
+      evaluationInput: null,
+      error: status === "errored" ? { kind: "fixture" } : null,
+      evaluatedActionIds: [],
+      evaluatedResourceIds: [],
+    })),
+  };
+}
+
+function state(evaluationsByTask: Map<string, TaskEvaluationState>): WorkflowRunState {
+  return {
+    id: "run-1",
+    experimentId: "experiment-1",
+    name: "run",
+    status: "completed",
+    tasks: new Map([
+      ["root", task("root", ["child-a", "child-b"])],
+      ["child-a", task("child-a")],
+      ["child-b", task("child-b")],
+    ]),
+    rootTaskId: "root",
+    resourcesByTask: new Map(),
+    executionsByTask: new Map(),
+    evaluationsByTask,
+    sandboxesByTask: new Map(),
+    threads: [],
+    contextEventsByTask: new Map(),
+    startedAt: "2026-04-27T12:00:00.000Z",
+    completedAt: null,
+    durationSeconds: null,
+    totalTasks: 3,
+    totalLeafTasks: 2,
+    completedTasks: 3,
+    failedTasks: 0,
+    runningTasks: 0,
+    cancelledTasks: 0,
+    finalScore: null,
+    error: null,
+    edges: new Map(),
+    annotationsByTarget: new Map(),
+    unhandledMutations: [],
+  };
+}
+
+test("evaluationToRollup returns null when there are no criteria", () => {
+  assert.equal(evaluationToRollup(evaluation("child-a", [])), null);
+});
+
+test("evaluationToRollup preserves explicit failed, skipped, and errored states", () => {
+  const rollup = evaluationToRollup(evaluation("child-a", ["passed", "failed", "skipped"]));
+
+  assert.equal(rollup?.status, "failing");
+  assert.equal(rollup?.passed, 1);
+  assert.equal(rollup?.failed, 1);
+  assert.equal(rollup?.skipped, 1);
+  assert.deepEqual(rollup?.criterionStatuses, ["passed", "failed", "skipped"]);
+
+  assert.equal(evaluationToRollup(evaluation("child-a", ["errored"]))?.status, "errored");
+});
+
+test("container rollup aggregates descendants and returns null for no evidence", () => {
+  const empty = state(new Map());
+  assert.equal(buildContainerEvaluationRollup(empty, "root"), null);
+  assert.equal(isEvaluationBearingTask(empty, "root"), false);
+
+  const populated = state(
+    new Map([
+      ["child-a", evaluation("child-a", ["passed", "skipped"])],
+      ["child-b", evaluation("child-b", ["passed"])],
+    ]),
+  );
+
+  const rollup = buildContainerEvaluationRollup(populated, "root");
+
+  assert.equal(rollup?.status, "mixed");
+  assert.equal(rollup?.totalCriteria, 3);
+  assert.equal(rollup?.passed, 2);
+  assert.equal(rollup?.skipped, 1);
+  assert.deepEqual(rollup?.attachedTaskIds, ["child-a", "child-b"]);
+  assert.equal(isEvaluationBearingTask(populated, "root"), true);
+});
+
+test("combineEvaluationStatuses prioritizes errored then failing before mixed", () => {
+  assert.equal(combineEvaluationStatuses(["passing", "errored", "failing"]), "errored");
+  assert.equal(combineEvaluationStatuses(["passing", "failing", "mixed"]), "failing");
+  assert.equal(combineEvaluationStatuses(["passing", "skipped"]), "mixed");
+  assert.equal(combineEvaluationStatuses(["skipped", "skipped"]), "skipped");
+});
diff --git a/ergon-dashboard/src/features/evaluation/selectors.ts b/ergon-dashboard/src/features/evaluation/selectors.ts
new file mode 100644
index 00000000..818abc91
--- /dev/null
+++ b/ergon-dashboard/src/features/evaluation/selectors.ts
@@ -0,0 +1,84 @@
+import type { TaskEvaluationState, WorkflowRunState } from "@/lib/types";
+import type { EvalCriterionStatus, EvalRollupStatus, EvaluationRollup } from "./contracts";
+
+function criterionStatusToRollupStatus(status: EvalCriterionStatus): EvalRollupStatus {
+  if (status === "passed") return "passing";
+  if (status === "failed") return "failing";
+  return status;
+}
+
+export function combineEvaluationStatuses(statuses: EvalRollupStatus[]): EvalRollupStatus {
+  if (statuses.includes("errored")) return "errored";
+  if (statuses.includes("failing")) return "failing";
+  if (statuses.includes("mixed")) return "mixed";
+  if (statuses.includes("skipped") && statuses.includes("passing")) return "mixed";
+  if (statuses.every((status) => status === "skipped")) return "skipped";
+  return "passing";
+}
+
+export function evaluationToRollup(evaluation: TaskEvaluationState | undefined): EvaluationRollup | null {
+  if (!evaluation || evaluation.criterionResults.length === 0) return null;
+
+  const criterionStatuses = evaluation.criterionResults.map(
+    (criterion) => criterion.status as EvalCriterionStatus,
+  );
+  const passed = criterionStatuses.filter((status) => status === "passed").length;
+  const failed = criterionStatuses.filter((status) => status === "failed").length;
+  const errored = criterionStatuses.filter((status) => status === "errored").length;
+  const skipped = criterionStatuses.filter((status) => status === "skipped").length;
+
+  return {
+    status: combineEvaluationStatuses(criterionStatuses.map(criterionStatusToRollupStatus)),
+    totalCriteria: criterionStatuses.length,
+    passed,
+    failed,
+    errored,
+    skipped,
+    normalizedScore: evaluation.normalizedScore,
+    maxScore: evaluation.maxScore,
+    evaluatorNames: [evaluation.evaluatorName],
+    attachedTaskIds: evaluation.taskId ? [evaluation.taskId] : [],
+    criterionStatuses,
+  };
+}
+
+export function buildContainerEvaluationRollup(
+  state: WorkflowRunState,
+  taskId: string,
+): EvaluationRollup | null {
+  const task = state.tasks.get(taskId);
+  if (!task) return null;
+
+  const direct = evaluationToRollup(state.evaluationsByTask.get(taskId));
+  const childRollups = task.childIds.map((childId) => buildContainerEvaluationRollup(state, childId));
+  const rollups = [direct, ...childRollups].filter(
+    (rollup): rollup is EvaluationRollup => rollup !== null,
+  );
+
+  if (rollups.length === 0) return null;
+
+  const totalCriteria = rollups.reduce((sum, rollup) => sum + rollup.totalCriteria, 0);
+  const maxScore = rollups.reduce((sum, rollup) => sum + rollup.maxScore, 0);
+  const weightedScore = rollups.reduce(
+    (sum, rollup) => sum + rollup.normalizedScore * rollup.maxScore,
+    0,
+  );
+
+  return {
+    status: combineEvaluationStatuses(rollups.map((rollup) => rollup.status)),
+    totalCriteria,
+    passed: rollups.reduce((sum, rollup) => sum + rollup.passed, 0),
+    failed: rollups.reduce((sum, rollup) => sum + rollup.failed, 0),
+    errored: rollups.reduce((sum, rollup) => sum + rollup.errored, 0),
+    skipped: rollups.reduce((sum, rollup) => sum + rollup.skipped, 0),
+    normalizedScore: maxScore > 0 ? weightedScore / maxScore : 0,
+    maxScore,
+    evaluatorNames: Array.from(new Set(rollups.flatMap((rollup) => rollup.evaluatorNames))).sort(),
+    attachedTaskIds: Array.from(new Set(rollups.flatMap((rollup) => rollup.attachedTaskIds))).sort(),
+    criterionStatuses: rollups.flatMap((rollup) => rollup.criterionStatuses),
+  };
+}
+
+export function isEvaluationBearingTask(state: WorkflowRunState, taskId: string): boolean {
+  return buildContainerEvaluationRollup(state, taskId) !== null;
+}
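As a usage sketch, a dashboard panel could derive a container badge from these selectors. `containerBadge` is a hypothetical call site; only the two imported selectors come from this diff:

```ts
import {
  buildContainerEvaluationRollup,
  isEvaluationBearingTask,
} from "@/features/evaluation/selectors";
import type { WorkflowRunState } from "@/lib/types";

// Hypothetical call site: return a compact label for a container node,
// or null when the subtree carries no evaluation evidence at all.
function containerBadge(run: WorkflowRunState, taskId: string): string | null {
  if (!isEvaluationBearingTask(run, taskId)) return null;
  const rollup = buildContainerEvaluationRollup(run, taskId)!;
  return `${rollup.passed}/${rollup.totalCriteria} passed (${rollup.status})`;
}
```

Note that `isEvaluationBearingTask` recomputes the recursive rollup, so a real call site would likely call `buildContainerEvaluationRollup` once and branch on null instead.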
diff --git a/ergon-dashboard/src/features/graph/components/ContainerNode.tsx b/ergon-dashboard/src/features/graph/components/ContainerNode.tsx
index e0c6b5f4..db9ce0e3 100644
--- a/ergon-dashboard/src/features/graph/components/ContainerNode.tsx
+++ b/ergon-dashboard/src/features/graph/components/ContainerNode.tsx
@@ -3,6 +3,7 @@ import { memo } from "react";
 import { Handle, Position } from "@xyflow/react";
 
 import type { TaskState, TaskStatus } from "@/lib/types";
+import type { EvaluationRollup } from "@/features/evaluation/contracts";
 
 interface ContainerNodeProps {
   task: TaskState;
@@ -16,6 +17,8 @@ interface ContainerNodeProps {
   containerHeight: number;
   layoutDirection?: "TB" | "LR";
   maxGraphDepth?: number;
+  evaluationRollup?: EvaluationRollup | null;
+  evaluationLensActive?: boolean;
 }
 
 function ContainerNodeComponent(props: ContainerNodeProps) {
@@ -30,6 +33,7 @@ function ContainerNodeComponent(props: ContainerNodeProps) {
     containerWidth,
     containerHeight,
     layoutDirection = "LR",
+    evaluationRollup = null,
  } = props;
   const handleClick = (e: React.MouseEvent) => {
     e.stopPropagation();
@@ -98,6 +102,15 @@ function ContainerNodeComponent(props: ContainerNodeProps) {
       >
         {task.childIds.length} subtask{task.childIds.length !== 1 ? "s" : ""}
+      {evaluationRollup && (
+          R
+      )}
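To close the loop, the graph layer has to thread the rollup into `ContainerNodeProps`. A rough sketch of that wiring, under the assumption that node data is assembled from a `WorkflowRunState` (the actual graph builder is outside this diff):

```ts
import { buildContainerEvaluationRollup } from "@/features/evaluation/selectors";
import type { WorkflowRunState } from "@/lib/types";

// Hypothetical props-assembly step: attach the subtree rollup so
// ContainerNode can render its evaluation badge when a rollup exists.
function containerNodeEvaluationProps(run: WorkflowRunState, taskId: string) {
  return {
    evaluationRollup: buildContainerEvaluationRollup(run, taskId),
    evaluationLensActive: true, // e.g. driven by a dashboard lens toggle
  };
}
```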