From 187019e36a829ccd60abb1911dbcee574ddde3fb Mon Sep 17 00:00:00 2001 From: Charlie Masters <69640669+cm2435@users.noreply.github.com> Date: Tue, 28 Apr 2026 16:45:09 +0100 Subject: [PATCH 1/5] docs: capture runtime cleanup plans Record the real-LLM debugging, context stream, and runtime layout cleanup notes that explain the branch direction. Made-with: Cursor --- docs/architecture/03_providers.md | 12 +- docs/architecture/cross_cutting/artifacts.md | 2 +- ...-cleanup-cancelled-task-release-sandbox.md | 8 +- ...-04-17-sandbox-lifetime-covers-criteria.md | 86 +- .../2026-04-18-sandbox-manager-key-cleanup.md | 6 +- ...026-04-18-sandbox-manager-process-state.md | 22 +- .../02-test-brittleness-and-gaps.md | 2 +- .../03-code-quality.md | 2 +- .../final-worker-output-source-of-truth.md | 177 +++ ...27-react-worker-failure-context-capture.md | 650 ++++++++ .../2026-04-28-agent-tool-budget-harness.md | 811 ++++++++++ .../2026-04-28-context-part-chunk-stream.md | 1359 +++++++++++++++++ ...evaluation-resource-context-and-scoring.md | 909 +++++++++++ ergon_builtins/AGENTS.md | 6 +- 14 files changed, 3978 insertions(+), 74 deletions(-) create mode 100644 docs/rfcs/active/final-worker-output-source-of-truth.md create mode 100644 docs/superpowers/plans/2026-04-27-react-worker-failure-context-capture.md create mode 100644 docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md create mode 100644 docs/superpowers/plans/2026-04-28-context-part-chunk-stream.md create mode 100644 docs/superpowers/plans/2026-04-28-evaluation-resource-context-and-scoring.md diff --git a/docs/architecture/03_providers.md b/docs/architecture/03_providers.md index 7a957547..7b1d900e 100644 --- a/docs/architecture/03_providers.md +++ b/docs/architecture/03_providers.md @@ -2,7 +2,7 @@ ## 1. Purpose -The providers layer is Ergon's boundary between runtime code and external execution substrates. 
It owns four concerns: resolving `model_id` strings to `pydantic_ai.models.Model` instances, provisioning and tearing down E2B sandboxes via per-benchmark manager subclasses, surfacing sandbox state transitions as dashboard events, and publishing worker outputs as content-addressed blobs that evaluators can re-read. Everything that crosses the process boundary (LLM API, container runtime, blob storage) is routed through this layer so the runtime, workers, and evaluators stay substrate-agnostic. +The provider-style boundaries are Ergon's adapters between runtime code and external execution substrates. Model resolution lives in the generation registry, while sandbox infrastructure now lives under `ergon_core.core.sandbox` because it owns lifecycle, instrumentation, event emission, and artifact publishing rather than just a third-party provider adapter. ## 2. Core abstractions @@ -11,12 +11,12 @@ The providers layer is Ergon's boundary between runtime code and external execut | `_BACKEND_REGISTRY` | module-level dict | `ergon_core/core/providers/generation/model_resolution.py` | Frozen shape; entries grow via registration. | Providers layer. | | `resolve_model_target` | function | `ergon_core/core/providers/generation/model_resolution.py` | Public, frozen signature. Returns `ResolvedModel`. | Providers layer. | | `register_model_backend` | function | `ergon_core/core/providers/generation/model_resolution.py` | Public, frozen signature. | Providers layer; callers are backend modules executing at import time. | -| `BaseSandboxManager` | abstract class + singleton | `ergon_core/core/providers/sandbox/manager.py` | Shape stable; `event_sink` activation path in flux. | Providers layer. | -| `DefaultSandboxManager` | concrete class | `ergon_core/core/providers/sandbox/manager.py` | Frozen. | Providers layer. | +| `BaseSandboxManager` | abstract class + singleton | `ergon_core/core/sandbox/manager.py` | Shape stable; `event_sink` activation path in flux. | Sandbox domain. 
| +| `DefaultSandboxManager` | concrete class | `ergon_core/core/sandbox/manager.py` | Frozen. | Sandbox domain. | | `SWEBenchSandboxManager`, `MiniF2FSandboxManager`, `ResearchRubricsSandboxManager` | concrete subclasses | `ergon_builtins/` | Owned per benchmark; singletons. | Benchmark authors. | -| `SandboxEventSink` | `typing.Protocol` | `ergon_core/core/providers/sandbox/event_sink.py` | Frozen protocol; activation path in flux. | Providers layer. | -| `NoopSandboxEventSink`, `DashboardEmitterSandboxEventSink` | implementations | `ergon_core/core/providers/sandbox/event_sink.py` | Frozen. | Providers layer. | -| `SandboxResourcePublisher` | class | `ergon_core/core/providers/sandbox/resource_publisher.py` | Frozen API; storage backend swappable via `ERGON_BLOB_ROOT`. | Providers layer. | +| `SandboxEventSink` | `typing.Protocol` | `ergon_core/core/sandbox/event_sink.py` | Frozen protocol; activation path in flux. | Sandbox domain. | +| `NoopSandboxEventSink`, `DashboardEmitterSandboxEventSink` | implementations | `ergon_core/core/sandbox/event_sink.py` | Frozen. | Sandbox domain. | +| `SandboxResourcePublisher` | class | `ergon_core/core/sandbox/resource_publisher.py` | Frozen API; storage backend swappable via `ERGON_BLOB_ROOT`. | Sandbox domain. | | `TransformersModel` | `pydantic_ai.models.Model` subclass | `ergon_builtins/ergon_builtins/models/transformers_backend.py` | Frozen. | ML team (TRL training loop callers). | ### 2.1 Generation registry diff --git a/docs/architecture/cross_cutting/artifacts.md b/docs/architecture/cross_cutting/artifacts.md index bc6b5fe9..04506b02 100644 --- a/docs/architecture/cross_cutting/artifacts.md +++ b/docs/architecture/cross_cutting/artifacts.md @@ -15,7 +15,7 @@ produces computed artifacts through `CriterionRuntime.run_command(...)`. 
| Type | Location | Freeze | Owner | |------|----------|--------|-------| -| `SandboxResourcePublisher` | `ergon_core/core/providers/sandbox/resource_publisher.py` | Stable | Sandbox provider | +| `SandboxResourcePublisher` | `ergon_core/core/sandbox/resource_publisher.py` | Stable | Sandbox domain | | `RunResource` | ORM row; table `run_resources` | Stable wire shape | Persistence layer | | `dashboard/resource.published` | Inngest event | Stable | Dashboard lane | | `CriterionRuntime.read_resource(name)` | Proposed per RFC | Pending | Evaluator layer | diff --git a/docs/rfcs/active/2026-04-17-cleanup-cancelled-task-release-sandbox.md b/docs/rfcs/active/2026-04-17-cleanup-cancelled-task-release-sandbox.md index 22b7426d..a207dbe5 100644 --- a/docs/rfcs/active/2026-04-17-cleanup-cancelled-task-release-sandbox.md +++ b/docs/rfcs/active/2026-04-17-cleanup-cancelled-task-release-sandbox.md @@ -51,7 +51,7 @@ to a `type[BaseSandboxManager]` (not an instance). The cleanup function would need to resolve the class and call the static method `BaseSandboxManager.terminate_by_sandbox_id(sandbox_id)`. `terminate_by_sandbox_id` is a `@staticmethod` at -`ergon_core/ergon_core/core/providers/sandbox/manager.py:472-490` that calls +`ergon_core/ergon_core/core/sandbox/manager.py:472-490` that calls `AsyncSandbox.kill(sandbox_id=..., api_key=...)` directly via E2B, so no instance is needed. However, `cleanup_cancelled_task_fn` currently has no import path to `SANDBOX_MANAGERS`. 
@@ -278,7 +278,7 @@ import logging import inngest from ergon_builtins.registry import SANDBOX_MANAGERS -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager from ergon_core.core.runtime.events.task_events import TaskCancelledEvent from ergon_core.core.runtime.inngest_client import RUN_CANCEL, inngest_client from ergon_core.core.runtime.services.task_cleanup_dto import CleanupResult @@ -712,13 +712,13 @@ class TestReleaseSandboxStep: async def test_releases_sandbox_when_fields_present(self) -> None: """terminate_by_sandbox_id called exactly once for valid payload.""" with patch( - "ergon_core.core.providers.sandbox.manager.BaseSandboxManager" + "ergon_core.core.sandbox.manager.BaseSandboxManager" ".terminate_by_sandbox_id", new_callable=AsyncMock, return_value=True, ) as mock_terminate: from ergon_builtins.registry import SANDBOX_MANAGERS - from ergon_core.core.providers.sandbox.manager import BaseSandboxManager + from ergon_core.core.sandbox.manager import BaseSandboxManager # Any known slug from SANDBOX_MANAGERS slug = next(iter(SANDBOX_MANAGERS)) diff --git a/docs/rfcs/active/2026-04-17-sandbox-lifetime-covers-criteria.md b/docs/rfcs/active/2026-04-17-sandbox-lifetime-covers-criteria.md index 5b6c0bda..e54a8f9e 100644 --- a/docs/rfcs/active/2026-04-17-sandbox-lifetime-covers-criteria.md +++ b/docs/rfcs/active/2026-04-17-sandbox-lifetime-covers-criteria.md @@ -14,7 +14,7 @@ superseded_by: null ### Current state `BaseSandboxManager.create()` at -`ergon_core/ergon_core/core/providers/sandbox/manager.py:226` accepts a single +`ergon_core/ergon_core/core/sandbox/manager.py:226` accepts a single `timeout_minutes: int = 30` parameter. 
Every call site passes a literal or relies on the default: @@ -145,7 +145,7 @@ reconnect path; `CriterionRuntime.ensure_sandbox()` will call it once RFC **Change 3 — Define `SandboxExpiredError`.** New exception class at -`ergon_core/ergon_core/core/providers/sandbox/errors.py`. Subclasses the base +`ergon_core/ergon_core/core/sandbox/errors.py`. Subclasses the base `Exception` (not `ErgonNonRetriableError` — sandbox expiry is not a definition-level error; it is a transient infrastructure condition). Criteria that catch it should surface a `"sandbox-expired"` evaluation outcome rather @@ -225,7 +225,7 @@ SandboxSetupRequest (payload) ## Type / interface definitions ```python -# ergon_core/ergon_core/core/providers/sandbox/errors.py +# ergon_core/ergon_core/core/sandbox/errors.py """Sandbox-specific exception types.""" @@ -260,7 +260,7 @@ class SandboxExpiredError(SandboxError): ### `errors.py` (new file) ```python -# ergon_core/ergon_core/core/providers/sandbox/errors.py +# ergon_core/ergon_core/core/sandbox/errors.py """Sandbox-specific exception types.""" @@ -291,7 +291,7 @@ class SandboxExpiredError(SandboxError): ### `reconnect` method (added to `BaseSandboxManager`) ```python -# Added to: ergon_core/ergon_core/core/providers/sandbox/manager.py +# Added to: ergon_core/ergon_core/core/sandbox/manager.py # Location: after get_sandbox() at line 394, before get_sandbox_path() async def reconnect(self, sandbox_id: str) -> "AsyncSandbox": @@ -308,7 +308,7 @@ async def reconnect(self, sandbox_id: str) -> "AsyncSandbox": This method does NOT register the sandbox in class-level state; callers should not assume it shows up in _sandboxes. 
""" - from ergon_core.core.providers.sandbox.errors import SandboxExpiredError + from ergon_core.core.sandbox.errors import SandboxExpiredError if AsyncSandbox is None: raise RuntimeError( @@ -331,7 +331,7 @@ async def reconnect(self, sandbox_id: str) -> "AsyncSandbox": ### Updated `create()` signature — `BaseSandboxManager` ```python -# ergon_core/ergon_core/core/providers/sandbox/manager.py +# ergon_core/ergon_core/core/sandbox/manager.py # Replace lines 226-295 (existing create method) async def create( @@ -423,7 +423,7 @@ async def create( ### Updated `DefaultSandboxManager.create()` override ```python -# ergon_core/ergon_core/core/providers/sandbox/manager.py +# ergon_core/ergon_core/core/sandbox/manager.py # Replace lines 503-526 (existing DefaultSandboxManager.create override) async def create( @@ -457,21 +457,21 @@ async def create( ### Updated `__init__.py` (sandbox package) ```python -# ergon_core/ergon_core/core/providers/sandbox/__init__.py +# ergon_core/ergon_core/core/sandbox/__init__.py # Add SandboxExpiredError, SandboxError to exports """Sandbox management: provisioning, file I/O, lifecycle.""" -from ergon_core.core.providers.sandbox.errors import ( +from ergon_core.core.sandbox.errors import ( SandboxError, SandboxExpiredError, ) -from ergon_core.core.providers.sandbox.event_sink import ( +from ergon_core.core.sandbox.event_sink import ( DashboardEmitterSandboxEventSink, NoopSandboxEventSink, SandboxEventSink, ) -from ergon_core.core.providers.sandbox.manager import ( +from ergon_core.core.sandbox.manager import ( BaseSandboxManager, DefaultSandboxManager, DownloadedFile, @@ -495,7 +495,7 @@ __all__ = [ ## Exact diffs for modified files -### `ergon_core/ergon_core/core/providers/sandbox/manager.py` +### `ergon_core/ergon_core/core/sandbox/manager.py` ```diff @@ -226,13 +226,16 @@ class BaseSandboxManager(ABC): @@ -559,7 +559,7 @@ __all__ = [ + sandbox is not found or has already timed out. Idempotent. 
+ Does NOT register in class-level _sandboxes state. + """ -+ from ergon_core.core.providers.sandbox.errors import SandboxExpiredError ++ from ergon_core.core.sandbox.errors import SandboxExpiredError + + if AsyncSandbox is None: + raise RuntimeError( @@ -640,22 +640,22 @@ __all__ = [ Note: `reset_timeout` call changes from 30 to 40 to match the new provisioned total. The signature of `reset_timeout` at `manager.py:407` is unchanged (still accepts `timeout_minutes`). -### `ergon_core/ergon_core/core/providers/sandbox/__init__.py` +### `ergon_core/ergon_core/core/sandbox/__init__.py` ```diff @@ -1,6 +1,11 @@ """Sandbox management: provisioning, file I/O, lifecycle.""" -+from ergon_core.core.providers.sandbox.errors import ( ++from ergon_core.core.sandbox.errors import ( + SandboxError, + SandboxExpiredError, +) - from ergon_core.core.providers.sandbox.event_sink import ( + from ergon_core.core.sandbox.event_sink import ( DashboardEmitterSandboxEventSink, NoopSandboxEventSink, SandboxEventSink, ) - from ergon_core.core.providers.sandbox.manager import ( + from ergon_core.core.sandbox.manager import ( BaseSandboxManager, DefaultSandboxManager, DownloadedFile, @@ -683,7 +683,7 @@ New file, no new package. 
The errors module sits alongside the existing sandbox package files: ``` -ergon_core/ergon_core/core/providers/sandbox/ +ergon_core/ergon_core/core/sandbox/ ├── __init__.py MODIFY (add SandboxError, SandboxExpiredError exports) ├── errors.py ADD (SandboxError, SandboxExpiredError) ├── event_sink.py no change @@ -700,15 +700,15 @@ ergon_core/ergon_core/core/providers/sandbox/ | Step | Phase | What | Files touched | |------|-------|------|---------------| -| 1 | PR 1 | Create `errors.py` with `SandboxError` and `SandboxExpiredError` | ADD `ergon_core/ergon_core/core/providers/sandbox/errors.py` | -| 2 | PR 1 | Add `errors` imports to sandbox `__init__.py` | MODIFY `ergon_core/ergon_core/core/providers/sandbox/__init__.py` | -| 3 | PR 1 | Update `BaseSandboxManager.create()` signature: `timeout_minutes` → `task_timeout_minutes + max_criterion_timeout_minutes`; update WAL entry log | MODIFY `ergon_core/ergon_core/core/providers/sandbox/manager.py` | -| 4 | PR 1 | Update `DefaultSandboxManager.create()` override with same signature change | MODIFY `ergon_core/ergon_core/core/providers/sandbox/manager.py` | +| 1 | PR 1 | Create `errors.py` with `SandboxError` and `SandboxExpiredError` | ADD `ergon_core/ergon_core/core/sandbox/errors.py` | +| 2 | PR 1 | Add `errors` imports to sandbox `__init__.py` | MODIFY `ergon_core/ergon_core/core/sandbox/__init__.py` | +| 3 | PR 1 | Update `BaseSandboxManager.create()` signature: `timeout_minutes` → `task_timeout_minutes + max_criterion_timeout_minutes`; update WAL entry log | MODIFY `ergon_core/ergon_core/core/sandbox/manager.py` | +| 4 | PR 1 | Update `DefaultSandboxManager.create()` override with same signature change | MODIFY `ergon_core/ergon_core/core/sandbox/manager.py` | | 5 | PR 1 | Migrate `sandbox_setup.py` call site: `timeout_minutes=30` → `task_timeout_minutes=30` | MODIFY `ergon_core/ergon_core/core/runtime/inngest/sandbox_setup.py` | | 6 | PR 1 | Migrate `criterion_runtime.py` call sites: same rename; `reset_timeout` 
30 → 40 | MODIFY `ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py` | | 7 | PR 1 | Migrate test call sites: `timeout_minutes=5` → `task_timeout_minutes=5` in `tests/swebench_verified/test_sandbox_manager.py` and `tests/minif2f/test_sandbox_manager.py` | MODIFY 2 test files | | 8 | PR 1 | Unit tests: `create()` passes correct total timeout to E2B; `task_timeout + max_criterion_timeout` arithmetic | ADD `tests/unit/test_sandbox_timeout.py` | -| 9 | PR 2 | Add `BaseSandboxManager.reconnect(sandbox_id)` method | MODIFY `ergon_core/ergon_core/core/providers/sandbox/manager.py` | +| 9 | PR 2 | Add `BaseSandboxManager.reconnect(sandbox_id)` method | MODIFY `ergon_core/ergon_core/core/sandbox/manager.py` | | 10 | PR 2 | Unit tests for `reconnect`: successful connect, E2B-not-found raises `SandboxExpiredError`, non-expired E2B error re-raises | ADD to `tests/unit/test_sandbox_reconnect.py` | | 11 | PR 2 | Canary e2e test: deliberately-slow criterion (sleep > task_timeout) still finds sandbox reachable | ADD `tests/e2e/test_sandbox_criterion_timeout_canary.py` | | 12 | PR 2 | (Deferred — depends on `2026-04-17-criterion-runtime-di-container`) Migrate `DefaultCriterionRuntime.ensure_sandbox()` to use `reconnect` when `get_sandbox` returns `None`, handling `SandboxExpiredError` | MODIFY `criterion_runtime.py` | @@ -724,7 +724,7 @@ Steps 1–8 land as PR 1 ("sandbox-lifetime/split-timeout"). 
Steps 9–11 land a | File | Purpose | |------|---------| -| `ergon_core/ergon_core/core/providers/sandbox/errors.py` | `SandboxError` base class; `SandboxExpiredError` raised by `reconnect()` on expired sandbox | +| `ergon_core/ergon_core/core/sandbox/errors.py` | `SandboxError` base class; `SandboxExpiredError` raised by `reconnect()` on expired sandbox | | `tests/unit/test_sandbox_timeout.py` | Unit tests: `create()` arithmetic, `task_timeout + max_criterion_timeout` passed to E2B | | `tests/unit/test_sandbox_reconnect.py` | Unit tests: `reconnect()` success, not-found raises `SandboxExpiredError`, other errors re-raise | | `tests/e2e/test_sandbox_criterion_timeout_canary.py` | E2e canary: slow criterion still reaches sandbox when timeout is correctly provisioned | @@ -733,8 +733,8 @@ Steps 1–8 land as PR 1 ("sandbox-lifetime/split-timeout"). Steps 9–11 land a | File | Changes | |------|---------| -| `ergon_core/ergon_core/core/providers/sandbox/manager.py` | Split `timeout_minutes` into `task_timeout_minutes + max_criterion_timeout_minutes` in `BaseSandboxManager.create()` and `DefaultSandboxManager.create()`; add `reconnect()` method | -| `ergon_core/ergon_core/core/providers/sandbox/__init__.py` | Export `SandboxError`, `SandboxExpiredError` | +| `ergon_core/ergon_core/core/sandbox/manager.py` | Split `timeout_minutes` into `task_timeout_minutes + max_criterion_timeout_minutes` in `BaseSandboxManager.create()` and `DefaultSandboxManager.create()`; add `reconnect()` method | +| `ergon_core/ergon_core/core/sandbox/__init__.py` | Export `SandboxError`, `SandboxExpiredError` | | `ergon_core/ergon_core/core/runtime/inngest/sandbox_setup.py` | Rename `timeout_minutes=30` → `task_timeout_minutes=30` at line 106 | | `ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py` | Rename `timeout_minutes=30` → `task_timeout_minutes=30` at line 59; `reset_timeout(..., timeout_minutes=30)` → `timeout_minutes=40` at line 63 | | 
`tests/swebench_verified/test_sandbox_manager.py` | Rename `timeout_minutes=5` → `task_timeout_minutes=5`; update assertion `call_kwargs["timeout"] == 5 * 60` → `== (5 + 10) * 60` | @@ -758,7 +758,7 @@ from uuid import uuid4 import pytest -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager, DefaultSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager, DefaultSandboxManager @pytest.fixture(autouse=True) @@ -792,11 +792,11 @@ async def test_create_passes_total_timeout_to_e2b(monkeypatch: pytest.MonkeyPatc fake_sandbox.sandbox_id = "sbx-test" fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -819,11 +819,11 @@ async def test_create_default_max_criterion_timeout(monkeypatch: pytest.MonkeyPa fake_sandbox.sandbox_id = "sbx-default" fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -841,11 +841,11 @@ async def test_create_zero_criterion_timeout(monkeypatch: pytest.MonkeyPatch) -> fake_sandbox.sandbox_id = "sbx-zero" fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", 
"test-key", ) @@ -875,8 +875,8 @@ from uuid import uuid4 import pytest -from ergon_core.core.providers.sandbox.errors import SandboxExpiredError -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.errors import SandboxExpiredError +from ergon_core.core.sandbox.manager import BaseSandboxManager @pytest.fixture(autouse=True) @@ -902,11 +902,11 @@ async def test_reconnect_returns_sandbox_on_success(monkeypatch: pytest.MonkeyPa fake_sandbox = MagicMock() fake_connect = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(connect=fake_connect), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -924,11 +924,11 @@ async def test_reconnect_raises_sandbox_expired_on_not_found( """reconnect() raises SandboxExpiredError when E2B returns 'not found'.""" fake_connect = AsyncMock(side_effect=Exception("sandbox not found (404)")) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(connect=fake_connect), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -945,11 +945,11 @@ async def test_reconnect_reraises_non_expiry_errors(monkeypatch: pytest.MonkeyPa """reconnect() re-raises unexpected E2B errors unchanged.""" fake_connect = AsyncMock(side_effect=ConnectionError("network blip")) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(connect=fake_connect), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", 
"test-key", ) @@ -978,7 +978,7 @@ import asyncio import pytest from uuid import uuid4 -from ergon_core.core.providers.sandbox.manager import DefaultSandboxManager, BaseSandboxManager +from ergon_core.core.sandbox.manager import DefaultSandboxManager, BaseSandboxManager @pytest.fixture(autouse=True) diff --git a/docs/rfcs/active/2026-04-18-sandbox-manager-key-cleanup.md b/docs/rfcs/active/2026-04-18-sandbox-manager-key-cleanup.md index d694a25e..e0452646 100644 --- a/docs/rfcs/active/2026-04-18-sandbox-manager-key-cleanup.md +++ b/docs/rfcs/active/2026-04-18-sandbox-manager-key-cleanup.md @@ -25,7 +25,7 @@ reduces the diff size for that RFC. ## Problem `BaseSandboxManager.create()` -(`ergon_core/ergon_core/core/providers/sandbox/manager.py:226-233`) takes three +(`ergon_core/ergon_core/core/sandbox/manager.py:226-233`) takes three conceptual task-keys as positional/keyword arguments: ```python @@ -177,7 +177,7 @@ production cases — which is exactly what `task_id` is after the rename. ## Full implementation -### Modified file: `ergon_core/ergon_core/core/providers/sandbox/manager.py` +### Modified file: `ergon_core/ergon_core/core/sandbox/manager.py` #### 1. Remove `_display_task_ids` class attribute @@ -575,7 +575,7 @@ None. 
| File | Changes | |---|---| -| `ergon_core/ergon_core/core/providers/sandbox/manager.py` | Delete `_display_task_ids` attr (line 70); delete `_get_display_task_id()` (lines 96-97); rename `sandbox_key`→`task_id` + remove `display_task_id` in `BaseSandboxManager.create()` (lines 226-295); rename `sandbox_key`→`task_id` + rename `task_id`→`override_task_id` in `_emit_wal_entry()` (lines 99-131); simplify `terminate()` (lines 429-469); rename + remove `display_task_id` in `DefaultSandboxManager.create()` (lines 503-526) | +| `ergon_core/ergon_core/core/sandbox/manager.py` | Delete `_display_task_ids` attr (line 70); delete `_get_display_task_id()` (lines 96-97); rename `sandbox_key`→`task_id` + remove `display_task_id` in `BaseSandboxManager.create()` (lines 226-295); rename `sandbox_key`→`task_id` + rename `task_id`→`override_task_id` in `_emit_wal_entry()` (lines 99-131); simplify `terminate()` (lines 429-469); rename + remove `display_task_id` in `DefaultSandboxManager.create()` (lines 503-526) | | `ergon_core/ergon_core/core/runtime/inngest/sandbox_setup.py` | Drop `display_task_id=task_id` kwarg at line 108 | | `ergon_builtins/ergon_builtins/benchmarks/swebench_verified/criterion.py` | Rename `sandbox_key=` → `task_id=` at line 74 | | `tests/minif2f/test_sandbox_manager.py` | Remove `BaseSandboxManager._display_task_ids = {}` at line 30; rename `sandbox_key=` → `task_id=` at lines 121, 172, 206 | diff --git a/docs/rfcs/active/2026-04-18-sandbox-manager-process-state.md b/docs/rfcs/active/2026-04-18-sandbox-manager-process-state.md index 4e047349..0d82db05 100644 --- a/docs/rfcs/active/2026-04-18-sandbox-manager-process-state.md +++ b/docs/rfcs/active/2026-04-18-sandbox-manager-process-state.md @@ -14,7 +14,7 @@ superseded_by: null ## 1. 
Problem `BaseSandboxManager` -(`ergon_core/ergon_core/core/providers/sandbox/manager.py`) is wired as a +(`ergon_core/ergon_core/core/sandbox/manager.py`) is wired as a singleton-per-subclass via `__new__` at `manager.py:78-81`: ```python @@ -71,7 +71,7 @@ The same pattern appears in: - `ergon_builtins/ergon_builtins/benchmarks/swebench_verified/criterion.py:72` `ResearchRubricsSandboxManager` (in -`ergon_core/ergon_core/core/providers/sandbox/research_rubrics_manager.py`) also +`ergon_builtins/ergon_builtins/benchmarks/researchrubrics/sandbox_manager.py`) also calls `self._sandboxes[task_id]` directly at `research_rubrics_manager.py:105` in `publisher_for()`, relying on the class-level dict. @@ -237,7 +237,7 @@ DefaultCriterionRuntime.ensure_sandbox() (any process) ### 4.1 Updated `BaseSandboxManager.__init__` ```python -# ergon_core/ergon_core/core/providers/sandbox/manager.py +# ergon_core/ergon_core/core/sandbox/manager.py class BaseSandboxManager(ABC): """Abstract base class for E2B sandbox lifecycle management. @@ -267,7 +267,7 @@ class BaseSandboxManager(ABC): ### 4.2 `reconnect` method signature ```python -# ergon_core/ergon_core/core/providers/sandbox/manager.py +# ergon_core/ergon_core/core/sandbox/manager.py async def reconnect(self, sandbox_id: str) -> "AsyncSandbox": """Rehydrate a running sandbox by its E2B sandbox_id. @@ -538,7 +538,7 @@ Behavior unchanged. Stage 1 is a pure refactor. 
| File | Changes | |---|---| -| `ergon/ergon_core/ergon_core/core/providers/sandbox/manager.py` | Stage 1: move six dicts to `__init__`, fix `_event_sink` init; Stage 2: remove `__new__` + `_instance`, add `reconnect()` | +| `ergon/ergon_core/ergon_core/core/sandbox/manager.py` | Stage 1: move six dicts to `__init__`, fix `_event_sink` init; Stage 2: remove `__new__` + `_instance`, add `reconnect()` | | `ergon/ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py` | Stage 3: update `ensure_sandbox()` to use `reconnect()` on cross-process miss | | `ergon/ergon_core/ergon_core/core/runtime/evaluation/evaluation_schemas.py` | Stage 3: add `sandbox_id: str \| None = None` to `CriterionContext` if absent | | `ergon/ergon_builtins/ergon_builtins/workers/baselines/minif2f_react_worker.py` | Stage 3: replace `manager.get_sandbox(context.task_id)` with `reconnect` or DI | @@ -567,7 +567,7 @@ from uuid import uuid4 import pytest -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager class _MinimalManager(BaseSandboxManager): @@ -610,12 +610,12 @@ class TestInstanceIsolation: ) def test_event_sink_initialized_in_init(self) -> None: - from ergon_core.core.providers.sandbox.event_sink import NoopSandboxEventSink + from ergon_core.core.sandbox.event_sink import NoopSandboxEventSink m = _MinimalManager() assert isinstance(m._event_sink, NoopSandboxEventSink) def test_custom_event_sink_set_without_stomp(self) -> None: - from ergon_core.core.providers.sandbox.event_sink import NoopSandboxEventSink + from ergon_core.core.sandbox.event_sink import NoopSandboxEventSink sink_a = NoopSandboxEventSink() sink_b = NoopSandboxEventSink() m1 = _MinimalManager(event_sink=sink_a) @@ -648,7 +648,7 @@ class TestReconnect: @pytest.mark.asyncio async def test_reconnect_calls_connect(self, monkeypatch: pytest.MonkeyPatch) -> None: - from ergon_core.core.providers.sandbox import manager as mgr_module + 
from ergon_core.core.sandbox import manager as mgr_module fake_sandbox = MagicMock() fake_connect = AsyncMock(return_value=fake_sandbox) @@ -667,7 +667,7 @@ class TestReconnect: async def test_reconnect_raises_when_e2b_not_installed( self, monkeypatch: pytest.MonkeyPatch ) -> None: - from ergon_core.core.providers.sandbox import manager as mgr_module + from ergon_core.core.sandbox import manager as mgr_module monkeypatch.setattr(mgr_module, "AsyncSandbox", None) @@ -680,7 +680,7 @@ class TestReconnect: self, monkeypatch: pytest.MonkeyPatch ) -> None: """reconnect() must not populate self._sandboxes (stateless by design).""" - from ergon_core.core.providers.sandbox import manager as mgr_module + from ergon_core.core.sandbox import manager as mgr_module fake_sandbox = MagicMock() fake_connect = AsyncMock(return_value=fake_sandbox) diff --git a/docs/rfcs/active/architecture-refactor-audit/02-test-brittleness-and-gaps.md b/docs/rfcs/active/architecture-refactor-audit/02-test-brittleness-and-gaps.md index 6344e914..1525b9b6 100644 --- a/docs/rfcs/active/architecture-refactor-audit/02-test-brittleness-and-gaps.md +++ b/docs/rfcs/active/architecture-refactor-audit/02-test-brittleness-and-gaps.md @@ -364,7 +364,7 @@ Files: propagation. - Add fake provider helpers under `ergon_core/ergon_core/test_support/` only if they are reusable across test tiers. -- Pair with code cleanup in `core/providers/sandbox/manager.py` only after +- Pair with code cleanup in `core/sandbox/manager.py` only after characterization tests exist. Steps: diff --git a/docs/rfcs/active/architecture-refactor-audit/03-code-quality.md b/docs/rfcs/active/architecture-refactor-audit/03-code-quality.md index d541ef28..864c88a4 100644 --- a/docs/rfcs/active/architecture-refactor-audit/03-code-quality.md +++ b/docs/rfcs/active/architecture-refactor-audit/03-code-quality.md @@ -271,7 +271,7 @@ Verification: Files: -- Review: `ergon_core/ergon_core/core/providers/sandbox/manager.py`. 
+- Review: `ergon_core/ergon_core/core/sandbox/manager.py`. - Review: `ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py`. - Review: `ergon_core/ergon_core/core/rl/eval_runner.py`. - Review: `ergon_core/ergon_core/test_support/smoke_fixtures/`. diff --git a/docs/rfcs/active/final-worker-output-source-of-truth.md b/docs/rfcs/active/final-worker-output-source-of-truth.md new file mode 100644 index 00000000..09495a56 --- /dev/null +++ b/docs/rfcs/active/final-worker-output-source-of-truth.md @@ -0,0 +1,177 @@ +# Final Worker Output Source of Truth + +_Sketch for treating `WorkerOutput` as the semantic final answer, rather than inferring it from context transcript events._ + +--- + +## Problem + +`ReActWorker.get_output()` currently reconstructs the worker's final output by reading persisted `RunContextEvent` rows and taking the last `assistant_text`, with a fallback that searches for a `final_result` tool call. That works, but it conflates three different concepts: + +- `assistant_text`: model text emitted during a generation turn +- `tool_call(final_result)`: PydanticAI's structured-output protocol +- `WorkerOutput`: the worker's final semantic result for the task execution + +The final answer should not be inferred from transcript shape. It should be the explicit output returned by the worker and persisted by the runtime. + +## Current State + +The codebase already has most of the right destination: + +- `WorkerOutput(output=..., success=..., metadata=...)` is the worker API's semantic final result. +- `worker_execute_fn()` receives the worker's `WorkerOutput` after `worker.get_output(worker_context)`. +- `WorkerExecuteResult.final_assistant_message` carries that value from `worker-execute` back to `task-execute`. +- `execute_task_fn()` passes `worker_result.final_assistant_message` into `FinalizeTaskExecutionCommand`. +- `TaskExecutionService.finalize_success()` persists it to `RunTaskExecution.final_assistant_message`. 
+- `RunTaskExecution` also has `output_json` for structured execution output metadata. + +So the persistence model already has a first-class execution-level field for the final assistant message. The weak part is upstream: `ReActWorker.get_output()` still computes that value by re-reading the context-event transcript. + +## Desired Shape + +The runtime should treat final worker output as execution-level data, not as another transcript event. + +```text +worker.execute() yields GenerationTurn events + | + v +ContextEventRepository persists transcript evidence + | + v +worker.get_output() returns WorkerOutput + | + v +TaskExecutionService.finalize_success() persists execution result + | + v +RunTaskExecution.final_assistant_message / output_json are the source of truth +``` + +In this model: + +- `RunContextEvent` remains the append-only transcript log. +- `RunTaskExecution.final_assistant_message` is the final human-readable answer. +- `RunTaskExecution.output_json` can hold structured metadata from `WorkerOutput.metadata`. +- Rollout-card export reads both: context events for the trace, task execution fields for final execution outputs. + +## Proposed Contract + +`WorkerOutput` should be the only object that defines a worker's final semantic output. + +```python +class WorkerOutput(BaseModel): + output: str + success: bool = True + metadata: dict[str, Any] = Field(default_factory=dict) +``` + +The runtime should persist it as: + +```text +RunTaskExecution.final_assistant_message = WorkerOutput.output +RunTaskExecution.output_json = { + "worker_output": { + "success": WorkerOutput.success, + "metadata": WorkerOutput.metadata, + }, + "resource_ids": [...] +} +``` + +If we want the full `WorkerOutput` object available in exports, use `output_json["worker_output"]` rather than adding a new `RunContextEvent` type. + +## ReActWorker Implication + +`ReActWorker` should stop deriving output by querying `ContextEventRepository`. 
+ +Instead, it should capture the structured final result while running the PydanticAI agent. The worker already configures: + +```python +agent: Agent[None, _AgentOutput] = Agent( + model=resolved.model, + instructions=self.system_prompt or None, + tools=self.tools, + output_type=_AgentOutput, +) +``` + +The final `_AgentOutput.final_assistant_message` should be stored on the worker instance during `execute()`, then returned directly from `get_output()`. + +Conceptually: + +```python +class ReActWorker(Worker): + def __init__(...): + ... + self._final_output: _AgentOutput | None = None + self._turn_count = 0 + + async def _run_agent(...): + async with agent.iter(...) as run: + ... + self._final_output = run.result.output + + def get_output(self, context: WorkerContext) -> WorkerOutput: + if self._final_output is None: + return WorkerOutput(output="", success=False) + return WorkerOutput( + output=self._final_output.final_assistant_message, + success=True, + metadata={ + "reasoning": self._final_output.reasoning, + "turn_count": self._turn_count, + }, + ) +``` + +The exact PydanticAI result access may differ, but the ownership is the important part: the worker returns the structured final result it received from the agent, rather than reconstructing it from persisted context events. + +## Why Not `final_agent_message` Context Events? + +A new context event type would make the transcript easier to query, but it blurs the abstraction boundary. + +`RunContextEvent` should answer: "What happened during the model/tool interaction?" + +`RunTaskExecution` should answer: "What did this worker execution finally produce?" + +The final output belongs to the second question. Mirroring it into a rollout-card export is useful; storing it as another transcript event is optional and should not be the source of truth. + +## Implementation Sketch + +1. Keep `ContextEventRepository` unchanged as the transcript serializer. +2. 
Update `WorkerExecuteResult` only if needed to carry `WorkerOutput.metadata`. +3. Update `FinalizeTaskExecutionCommand` to carry `worker_output_metadata` or a full `worker_output_json`. +4. Update `TaskExecutionService.finalize_success()` to persist: + - `final_assistant_message` + - `output_json["worker_output"]` + - existing `resource_ids` if present +5. Update `ReActWorker` to capture its PydanticAI structured result during execution. +6. Replace `ReActWorker._base_output()` with a simple read of the captured structured output. +7. Remove `_latest_final_result_message()` if no other worker needs it. +8. Update rollout-card export to include task execution final outputs from `RunTaskExecution`, not by scanning `RunContextEvent`. + +## Migration / Compatibility + +Existing completed runs may only have context events, so readers should remain tolerant: + +- Prefer `RunTaskExecution.final_assistant_message`. +- If absent, optionally fall back to the old transcript inference for legacy runs. +- Do not use the fallback in new execution paths. + +This preserves old data while making new runs explicit. + +## Tests + +Add focused tests for: + +- `ReActWorker.get_output()` returns the captured structured `_AgentOutput`, not the last `assistant_text`. +- A run with intermediate `assistant_text` plus final structured output persists the structured final output. +- `TaskExecutionService.finalize_success()` writes `final_assistant_message` and `output_json["worker_output"]`. +- Context event replay still reconstructs transcript messages without needing final-output semantics. +- Legacy read helpers fall back to transcript inference only when `RunTaskExecution.final_assistant_message` is missing. + +## Open Questions + +1. Should `WorkerExecuteResult` carry the full `WorkerOutput.metadata`, or should `worker_execute_fn()` persist it directly before returning? +2. 
Should `RunTaskExecution.output_json` store the full `WorkerOutput` shape, or only `metadata` plus resource references? +3. Should rollout-card export call this field `worker_output`, `execution_output`, or `final_worker_output`? diff --git a/docs/superpowers/plans/2026-04-27-react-worker-failure-context-capture.md b/docs/superpowers/plans/2026-04-27-react-worker-failure-context-capture.md new file mode 100644 index 00000000..e730dc6f --- /dev/null +++ b/docs/superpowers/plans/2026-04-27-react-worker-failure-context-capture.md @@ -0,0 +1,650 @@ +# ReAct Worker Failure Context Capture Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Preserve partial PydanticAI ReAct transcript history when `agent.iter(...)` raises before `ReActWorker._run_agent()` reaches its normal post-run transcript extraction. + +**Architecture:** Keep runtime persistence ownership in `worker_execute_fn()`: workers yield `GenerationTurn`, runtime persists `RunContextEvent`. Add an incremental/cursor-based extraction API to `PydanticAITranscriptAdapter` so `ReActWorker` can yield completed turns during normal iteration and flush any remaining partial turn in an exception path before re-raising. This keeps failure semantics intact while eliminating the current zero-context failure gap for failed ReAct/CLI child workers. + +**Tech Stack:** Python, PydanticAI `Agent.iter`, `GenerationTurn`, `PydanticAITranscriptAdapter`, `ContextEventRepository`, pytest. + +--- + +## Root Cause + +Current `ReActWorker._run_agent()` only converts PydanticAI messages into `GenerationTurn`s after the `agent.iter(...)` context exits normally: + +```python +async with agent.iter(...) as run: + async for _node in run: + ... 
+ +turns = PydanticAITranscriptAdapter().build_turns(run.ctx.state.message_history) +for turn in turns: + yield turn +``` + +If PydanticAI raises inside `async for _node in run`, control jumps out of `_run_agent()` before `build_turns(...)` runs. Then `worker_execute_fn()` catches the exception before it has received any turns to persist. That explains executions with an error stack but `0` `RunContextEvent` rows. + +The ResearchRubrics workflow CLI worker is affected because it subclasses `ReActWorker`: + +```python +async for turn in super().execute(task, context=context): + yield turn +``` + +Successful CLI runs use the shared adapter; failed CLI runs can still lose partial transcript history. + +--- + +## Desired Behavior + +- Successful ReAct runs keep capturing the same full transcript as today. +- Failed ReAct runs yield/persist every turn that can be reconstructed from `run.ctx.state.message_history` before re-raising the original exception. +- Runtime failure semantics do not change: `worker_execute_fn()` still returns the failure result and task status remains failed. +- Workers do not call `ContextEventRepository` directly. +- No duplicate context events are emitted when incremental extraction is called multiple times. +- Partial trailing responses can be flushed on final success or failure, but not emitted prematurely while a tool call may still receive a following `ToolReturnPart`. 
+ +--- + +## File Map + +```text +ergon_builtins/ + ergon_builtins/ + common/ + llm_context/ + adapters/ + pydantic_ai.py # modify: replace post-run-only turn extraction with cursor API + workers/ + baselines/ + react_worker.py # modify: yield incremental turns and flush on exception + +tests/ + unit/ + builtins/ + common/ + test_transcript_adapters.py # modify: cursor extraction + trailing flush tests + workers/ + test_react_worker_contract.py # modify or add tests for failure transcript yield/re-raise +``` + +Do not modify `worker_execute_fn()` for this fix unless tests prove it cannot persist turns yielded immediately before an async generator raises. The existing `async for turn in worker.execute(...)` loop already persists each yielded turn before requesting the next one. + +--- + +## Closure And Removals + +This is not an additive second serialization path. Close the old behavior explicitly: + +- Remove `ReActWorker._run_agent()`'s post-run-only extraction pattern: + +```python +turns = PydanticAITranscriptAdapter().build_turns(run.ctx.state.message_history) +for turn in turns: + yield turn +``` + +Replace it with cursor extraction during the loop plus final/failure flush. + +- Do not add a new repository or direct DB writer for failure capture. `ContextEventRepository` remains the only `GenerationTurn` -> `RunContextEvent` serializer, and it remains called by `worker_execute_fn()`. +- Do not restore the old core PydanticAI serializers removed in the previous refactor: `ergon_core/core/persistence/context/assembly.py` and `ergon_core/core/providers/generation/pydantic_ai_format.py`. +- Do not add any new `ergon_core` PydanticAI transcript code. All PydanticAI transcript extraction/replay stays in `ergon_builtins.common.llm_context.adapters.pydantic_ai`. +- Treat the cursor API as the runtime extraction surface. 
If a batch `build_turns(...)` helper remains for tests or protocol compatibility, implement it as a wrapper around the same cursor extraction logic, not as a second independent serializer. +- Update tests that assert the worker no longer owns parser helpers so they also assert `ReActWorker` does not call a post-run-only extraction helper directly. + +There is no separate old "turn serialization repository" to delete after the previous refactor. The durable serialization repository is still `ContextEventRepository`, and that should stay. The old thing to remove here is the worker's post-run-only transcript extraction path, because it is the failure gap. + +--- + +## Design + +Use a small cursor object in the PydanticAI adapter: + +```python +from pydantic import BaseModel + + +class TranscriptTurnCursor(BaseModel): + model_config = {"validate_assignment": True} + + emitted_turn_count: int = 0 +``` + +Make cursor extraction the runtime API: + +```python +class PydanticAITranscriptAdapter(...): + def build_new_turns( + self, + transcript: list[ModelMessage], + cursor: TranscriptTurnCursor, + *, + flush_pending: bool = False, + ) -> list[GenerationTurn]: + turns = _build_turns_from_transcript(transcript, flush_pending=flush_pending) + new_turns = turns[cursor.emitted_turn_count :] + cursor.emitted_turn_count = len(turns) + return new_turns +``` + +If `build_turns(...)` remains public because `TranscriptAdapter` currently declares it, it should delegate to the same internal implementation used by `build_new_turns(...)`. Do not keep two independent conversion implementations. + +Change current trailing-response behavior in `build_turns()` so it is explicit: + +```python +if pending_response is not None and flush_pending: + turns.append(_to_turn(pending_request_in, pending_response, tool_result_request=None)) +``` + +`flush_pending=False` is important during the live `agent.iter(...)` loop. 
It prevents emitting a tool-call response before the following `ModelRequest` has a chance to include the `ToolReturnPart`. On final success or failure, use `flush_pending=True` so partial model output is not lost. + +Update `ReActWorker._run_agent()`: + +```python +adapter = PydanticAITranscriptAdapter() +cursor = TranscriptTurnCursor() +run = None + +try: + async with agent.iter(...) as active_run: + run = active_run + async for _node in run: + node_count += 1 + + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=False, + ): + yield turn + + if node_count >= self.max_iterations: + logger.warning(...) + break +except Exception: + if run is not None: + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=True, + ): + yield turn + raise + +if run is not None: + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=True, + ): + yield turn +``` + +This is extraction-as-iterator in practice: the cursor marks what has already been yielded, and `build_new_turns(...)` can be called repeatedly as message history grows. + +Do not swallow exceptions. The final `raise` is required so `worker_execute_fn()` still records failure. 
+ +--- + +## Task 1: Adapter Cursor API + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py` +- Modify: `tests/unit/builtins/common/test_transcript_adapters.py` + +- [ ] **Step 1: Write failing test for no premature trailing response** + +Add to `tests/unit/builtins/common/test_transcript_adapters.py`: + +```python +from ergon_builtins.common.llm_context.adapters.pydantic_ai import TranscriptTurnCursor + + +def test_incremental_extraction_does_not_emit_pending_tool_call_response() -> None: + adapter = PydanticAITranscriptAdapter() + cursor = TranscriptTurnCursor() + transcript = [ + ModelRequest(parts=[UserPromptPart(content="search")]), + ModelResponse( + parts=[ + ToolCallPart( + tool_name="search", + tool_call_id="call-1", + args={"query": "ergon"}, + ) + ] + ), + ] + + assert adapter.build_new_turns(transcript, cursor, flush_pending=False) == [] + + flushed = adapter.build_new_turns(transcript, cursor, flush_pending=True) + assert len(flushed) == 1 + assert any(isinstance(part, ErgonToolCallPart) for part in flushed[0].response_parts) +``` + +- [ ] **Step 2: Write failing test for no duplicate new turns** + +Add: + +```python +def test_incremental_extraction_tracks_emitted_turns() -> None: + adapter = PydanticAITranscriptAdapter() + cursor = TranscriptTurnCursor() + transcript = [ + ModelRequest(parts=[UserPromptPart(content="search")]), + ModelResponse( + parts=[ + ToolCallPart( + tool_name="search", + tool_call_id="call-1", + args={"query": "ergon"}, + ) + ] + ), + ModelRequest( + parts=[ + ToolReturnPart( + tool_name="search", + tool_call_id="call-1", + content={"result": "found"}, + ) + ] + ), + ] + + first = adapter.build_new_turns(transcript, cursor, flush_pending=False) + second = adapter.build_new_turns(transcript, cursor, flush_pending=False) + + assert len(first) == 1 + assert second == [] +``` + +- [ ] **Step 3: Run red tests** + +Run: + +```bash +uv run pytest 
tests/unit/builtins/common/test_transcript_adapters.py -q +``` + +Expected: FAIL because `TranscriptTurnCursor` and `build_new_turns()` do not exist. + +- [ ] **Step 4: Replace batch extraction internals with cursor-backed extraction** + +In `pydantic_ai.py`, add: + +```python +from pydantic import BaseModel + + +class TranscriptTurnCursor(BaseModel): + model_config = {"validate_assignment": True} + + emitted_turn_count: int = 0 +``` + +Move the existing `build_turns(...)` body into a private helper that takes `flush_pending`: + +```python +def _build_turns_from_transcript( + transcript: list[ModelMessage], + *, + flush_pending: bool, +) -> list[GenerationTurn]: + ... +``` + +Keep `build_turns(...)` only as compatibility with the existing `TranscriptAdapter` protocol and any batch tests: + +```python +def build_turns( + self, + transcript: list[ModelMessage], + *, + flush_pending: bool = True, +) -> list[GenerationTurn]: + return _build_turns_from_transcript(transcript, flush_pending=flush_pending) +``` + +Do not call `build_turns(...)` from `ReActWorker`. Runtime extraction should use the cursor API only. + +Change trailing append: + +```python +if pending_response is not None: + turns.append(_to_turn(pending_request_in, pending_response, tool_result_request=None)) +``` + +to: + +```python +if pending_response is not None and flush_pending: + turns.append(_to_turn(pending_request_in, pending_response, tool_result_request=None)) +``` + +Add: + +```python +def build_new_turns( + self, + transcript: list[ModelMessage], + cursor: TranscriptTurnCursor, + *, + flush_pending: bool = False, +) -> list[GenerationTurn]: + turns = _build_turns_from_transcript(transcript, flush_pending=flush_pending) + new_turns = turns[cursor.emitted_turn_count :] + cursor.emitted_turn_count = len(turns) + return new_turns +``` + +After this change, there is one conversion implementation: `_build_turns_from_transcript(...)`. 
`build_turns(...)` and `build_new_turns(...)` are wrappers with different calling semantics. + +- [ ] **Step 5: Run green tests** + +Run: + +```bash +uv run pytest tests/unit/builtins/common/test_transcript_adapters.py -q +``` + +Expected: PASS. + +--- + +## Task 2: ReActWorker Failure Flush + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py` +- Modify: `tests/unit/workers/test_react_worker_contract.py` + +- [ ] **Step 1: Write failing test for partial yield then re-raise** + +Add a fake `Agent` to `tests/unit/workers/test_react_worker_contract.py`: + +```python +from pydantic_ai.messages import ModelRequest, ModelResponse, TextPart, UserPromptPart + + +class _FakeRunState: + def __init__(self): + self.message_history = [ + ModelRequest(parts=[UserPromptPart(content="question")]), + ModelResponse(parts=[TextPart(content="partial answer")]), + ] + + +class _FakeRunContext: + def __init__(self): + self.state = _FakeRunState() + + +class _FailingAgentRun: + def __init__(self): + self.ctx = _FakeRunContext() + + def __aiter__(self): + return self + + async def __anext__(self): + raise RuntimeError("tool validation failed") + + +class _FailingAgentIter: + async def __aenter__(self): + return _FailingAgentRun() + + async def __aexit__(self, exc_type, exc, tb): + return False + + +class _FailingAgent: + def __init__(self, **kwargs): + pass + + def iter(self, *args, **kwargs): + return _FailingAgentIter() +``` + +Then add: + +```python +@pytest.mark.asyncio +async def test_react_worker_yields_partial_turn_before_reraising_agent_iter_failure(monkeypatch) -> None: + import ergon_builtins.workers.baselines.react_worker as react_worker + + monkeypatch.setattr(react_worker, "Agent", _FailingAgent) + monkeypatch.setattr( + react_worker, + "resolve_model_target", + lambda model: type( + "Resolved", + (), + {"model": "stub:constant", "capture_model_settings": None}, + )(), + ) + + worker = ReActWorker( + name="unit", + model=None, + 
task_id=UUID(int=1), + sandbox_id="test-sandbox", + tools=[], + system_prompt=None, + max_iterations=10, + ) + task = _minimal_task() + + turns = [] + with pytest.raises(RuntimeError, match="tool validation failed"): + async for turn in worker.execute(task, context=_minimal_context()): + turns.append(turn) + + assert len(turns) == 1 + assert any(part.content == "partial answer" for part in turns[0].response_parts) +``` + +Add small local helpers if this test file does not already have task/context fixtures: + +```python +from ergon_core.api.task_types import BenchmarkTask, EmptyTaskPayload +from ergon_core.api.worker_context import WorkerContext + + +def _minimal_task() -> BenchmarkTask: + return BenchmarkTask( + task_id=UUID(int=2), + task_slug="unit-task", + description="Unit task", + task_payload=EmptyTaskPayload(), + ) + + +def _minimal_context() -> WorkerContext: + return WorkerContext( + run_id=UUID(int=3), + definition_id=UUID(int=4), + task_id=UUID(int=2), + execution_id=UUID(int=5), + sandbox_id="test-sandbox", + node_id=UUID(int=6), + ) +``` + +- [ ] **Step 2: Run red test** + +Run: + +```bash +uv run pytest tests/unit/workers/test_react_worker_contract.py::test_react_worker_yields_partial_turn_before_reraising_agent_iter_failure -q +``` + +Expected: FAIL because `_run_agent()` currently re-raises before yielding the partial transcript. + +- [ ] **Step 3: Implement failure flush in `_run_agent()`** + +Modify `ReActWorker._run_agent()`: + +```python +adapter = PydanticAITranscriptAdapter() +cursor = TranscriptTurnCursor() +run = None + +try: + async with agent.iter( + task_prompt, + model_settings=resolved.capture_model_settings, + message_history=self._seed_messages, + ) as active_run: + run = active_run + async for _node in run: + node_count += 1 + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=False, + ): + yield turn + if node_count >= self.max_iterations: + logger.warning(...) 
+ break +except Exception: + if run is not None: + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=True, + ): + yield turn + raise + +if run is not None: + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=True, + ): + yield turn +``` + +Keep the existing warning text for `max_iterations`. + +- [ ] **Step 4: Run worker test** + +Run: + +```bash +uv run pytest tests/unit/workers/test_react_worker_contract.py -q +``` + +Expected: PASS. + +--- + +## Task 3: Runtime Persistence Regression + +**Files:** +- Modify: `tests/unit/runtime/test_failure_error_json.py` or add `tests/unit/runtime/test_worker_execute_partial_failure_context.py` + +- [ ] **Step 1: Add runtime-level regression if feasible** + +Add a unit test around `worker_execute_fn()` with a fake registered worker whose `execute()` yields one `GenerationTurn` and then raises. Assert that `ContextEventRepository.persist_turn()` is called before the failure result is returned. + +If existing `worker_execute_fn()` setup makes this too fixture-heavy, keep the worker-level test from Task 2 as the required regression and add a short comment in the test explaining why it is sufficient: + +```python +# worker_execute_fn persists each yielded turn before requesting the next item +# from the async generator, so this test covers the failure-capture contract at +# the worker boundary without rebuilding Inngest context fixtures. +``` + +- [ ] **Step 2: Run focused runtime/worker tests** + +Run: + +```bash +uv run pytest tests/unit/workers/test_react_worker_contract.py tests/unit/persistence/test_context_event_repository.py -q +``` + +Expected: PASS. + +--- + +## Task 4: Verification + +**Files:** +- No production edits. 
+ +- [ ] **Step 1: Run affected capture suite** + +Run: + +```bash +uv run pytest \ + tests/unit/builtins/common/test_transcript_adapters.py \ + tests/unit/persistence/test_context_event_repository.py \ + tests/unit/workers/test_react_worker_contract.py \ + tests/unit/state/test_generation_turn_build.py \ + tests/unit/state/test_context_assembly.py \ + -q +``` + +Expected: PASS. + +- [ ] **Step 2: Run lint/compile** + +Run: + +```bash +uv run ruff check \ + ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py \ + ergon_builtins/ergon_builtins/workers/baselines/react_worker.py \ + tests/unit/builtins/common/test_transcript_adapters.py \ + tests/unit/workers/test_react_worker_contract.py +uv run slopcop \ + ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py \ + ergon_builtins/ergon_builtins/workers/baselines/react_worker.py +uv run python -m compileall -q \ + ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py \ + ergon_builtins/ergon_builtins/workers/baselines/react_worker.py +``` + +Expected: PASS. + +- [ ] **Step 3: Optional real-run validation** + +Trigger a ReAct/CLI worker failure after the PydanticAI run has started, then inspect: + +```bash +RUN_ID= python - <<'PY' +from uuid import UUID +from sqlmodel import select +from ergon_core.core.persistence.shared.db import get_session +from ergon_core.core.persistence.context.models import RunContextEvent + +run_id = UUID(__import__("os").environ["RUN_ID"]) +with get_session() as session: + rows = session.exec( + select(RunContextEvent) + .where(RunContextEvent.run_id == run_id) + .order_by(RunContextEvent.task_execution_id, RunContextEvent.sequence) + ).all() + for row in rows: + print(row.task_execution_id, row.sequence, row.event_type) +PY +``` + +Expected: the failed child execution has at least the partial model request/response/tool-call events that existed before the exception. 
+ +--- + +## Self-Review + +- Spec coverage: The plan addresses the observed gap where `agent.iter(...)` raises before post-run extraction, including CLI workers through `ReActWorker` inheritance. +- Iterator question: The plan proposes cursor-based incremental extraction from growing `message_history`, which is the appropriate iterator shape for PydanticAI histories. +- Persistence boundary: The plan keeps `ContextEventRepository` in the runtime path and does not make workers write directly to the DB. +- Failure semantics: The original exception is re-raised after partial turns are yielded. +- Known limitation: If `agent.iter(...)` fails during `__aenter__` before a `run` object exists, there is no PydanticAI `message_history` to flush. That case should still produce normal task failure metadata, but cannot produce transcript events. diff --git a/docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md b/docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md new file mode 100644 index 00000000..7af71aa7 --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md @@ -0,0 +1,811 @@ +# Agent Tool Budget Harness Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a simple, reusable tool-budget harness that prevents agent rollouts from looping indefinitely by counting `workflow` tool calls separately from all other tool calls and returning explicit budget-exhausted messages when either limit is reached. + +**Architecture:** Use Pydantic AI dependency injection. `ReActWorker` passes an optional deps object into `Agent.iter(...)`; tools that participate in the budget accept `RunContext[AgentToolBudgetDeps]` and call `ctx.deps.tool_budget.check(...)` before doing work. 
The budget system is generic and benchmark-agnostic: it knows only `workflow` vs `other`, not ResearchRubrics, Exa, or rubric-specific concepts. Reference: [Pydantic AI dependencies](https://ai.pydantic.dev/dependencies/).
+
+**Tech Stack:** Python 3.13, pydantic-ai `RunContext`, Ergon `ReActWorker`, existing tool callables, pytest smoke checks, real-LLM rollout artifacts, Logfire.
+
+---
+
+## Design
+
+The harness should enforce two counters per agent execution:
+
+```python
+workflow_tool_calls <= max_workflow_tool_calls
+other_tool_calls <= max_other_tool_calls
+```
+
+Initial defaults:
+
+```python
+AgentToolBudgetPolicy(
+    max_workflow_tool_calls=12,
+    max_other_tool_calls=12,
+    warning_at_remaining=3,
+)
+```
+
+The budget does not decide which benchmark is running and does not know about Exa. It only sees:
+
+- `workflow` calls: the workflow CLI tool.
+- `other` calls: context-gathering and workspace-inspection tools other than `workflow`.
+- `finalization` calls: tools that produce final output artifacts, such as report writing. These count for observability but are not blocked, because the budget should push the agent into finalization rather than prevent it.
+
+When a limit is reached, the tool returns a normal structured tool result:
+
+```python
+AgentToolBudgetExhaustedResult(
+    status="TOOL_BUDGET_EXHAUSTED",
+    reason="workflow tool budget reached",
+    message="Stop calling workflow. Use currently visible context/resources and produce the best possible final output.",
+    budget_state={...},
+)
+```
+
+or:
+
+```python
+AgentToolBudgetExhaustedResult(
+    status="TOOL_BUDGET_EXHAUSTED",
+    reason="non-workflow tool budget reached",
+    message="Stop calling tools. Produce the final answer from the context already gathered.",
+    budget_state={...},
+)
+```
+
+This is intentionally not a Python exception. The model gets a final chance to converge.
The outer `max_iterations` guard still raises a real error if the agent keeps looping after exhausted tool responses. + +## Package Placement + +- Generic budget state: `ergon_builtins/ergon_builtins/workers/baselines/tool_budget.py` +- Base agent execution hook: `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py` +- Budgeted workflow command tool: `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py` +- Budgeted non-workflow tools for this rollout: `ergon_builtins/ergon_builtins/tools/research_rubrics_toolkit.py` and `ergon_builtins/ergon_builtins/tools/graph_toolkit.py` +- Worker-specific budget policy wiring: `ergon_builtins/ergon_builtins/workers/research_rubrics/` +- Rollout diagnostics: `tests/real_llm/` + +## Added Files + +```text +ergon_builtins/ + ergon_builtins/ + workers/ + baselines/ + tool_budget.py +``` + +`tool_budget.py` owns the generic Pydantic models for budget policy, mutable per-execution budget state, deps passed into pydantic-ai, and helper logic for attaching warning text to tool results. + +## Edited Files + +```text +ergon_builtins/ + ergon_builtins/ + tools/ + graph_toolkit.py + research_rubrics_toolkit.py + workflow_cli_tool.py + workers/ + baselines/ + react_worker.py + research_rubrics/ + researcher_worker.py + workflow_cli_react_worker.py + +tests/ + real_llm/ + artifact_health.py + rollout.py +``` + +Edit responsibilities: + +- `react_worker.py`: add an optional deps hook, pass deps into `Agent.iter(...)`, and raise when `max_iterations` is hit. +- `workflow_cli_tool.py`: edit the existing workflow tool function path to support a ctx-taking budgeted mode for `workflow` calls. +- `research_rubrics_toolkit.py`: convert participating tools to ctx-taking functions and count context-gathering tools as `other`, while allowing report-writing as `finalization`. +- `graph_toolkit.py`: convert graph/resource tools to ctx-taking functions and count them as `other`. 
+- `researcher_worker.py`: provide generic budget deps to `ReActWorker` and steer the prompt toward quick convergence. +- `workflow_cli_react_worker.py`: provide generic budget deps, use budgeted workflow tool mode, and steer the prompt toward deliberate workflow use and subagent coordination. +- `artifact_health.py`: derive `workflow_tool_calls`, `other_tool_calls`, `budget_exhausted`, and `missing_final_report` from existing rollout artifacts. +- `rollout.py`: include those derived counters in `report.md`. + +## Deleted Files + +```text +(none) +``` + +## Optional Later Files + +If other benchmarks start showing the same loop behavior, apply the same `RunContext[AgentToolBudgetDeps]` pattern to their toolkits: + +```text +ergon_builtins/ + ergon_builtins/ + benchmarks/ + gdpeval/ + toolkit.py + minif2f/ + toolkit.py + swebench_verified/ + toolkit.py +``` + +--- + +## Task 1: Add Generic Tool Budget State + +**Files:** +- Create: `ergon_builtins/ergon_builtins/workers/baselines/tool_budget.py` + +- [ ] **Step 1: Create generic budget types** + +Create `tool_budget.py`: + +```python +from __future__ import annotations + +from typing import Any, Literal + +from pydantic import BaseModel, Field + +ToolBudgetKind = Literal["workflow", "other", "finalization"] +ToolBudgetExhaustedStatus = Literal["TOOL_BUDGET_EXHAUSTED"] + + +class AgentToolBudgetExhaustedResult(BaseModel): + status: ToolBudgetExhaustedStatus = "TOOL_BUDGET_EXHAUSTED" + reason: str + message: str + budget_state: dict[str, Any] # slopcop: ignore[no-typing-any] + + +class AgentToolBudgetPolicy(BaseModel): + model_config = {"frozen": True} + + max_workflow_tool_calls: int = 12 + max_other_tool_calls: int = 12 + warning_at_remaining: int = 3 + + +class AgentToolBudgetDecision(BaseModel): + model_config = {"frozen": True} + + allowed: bool + warning: str | None = None + exhausted: AgentToolBudgetExhaustedResult | None = None + + +class AgentToolBudgetState(BaseModel): + policy: AgentToolBudgetPolicy = 
Field(default_factory=AgentToolBudgetPolicy) + workflow_tool_calls: int = 0 + other_tool_calls: int = 0 + finalization_tool_calls: int = 0 + calls_by_tool: dict[str, int] = Field(default_factory=dict) + + def check(self, tool_name: str, kind: ToolBudgetKind) -> AgentToolBudgetDecision: + self.calls_by_tool[tool_name] = self.calls_by_tool.get(tool_name, 0) + 1 + + if kind == "workflow": + self.workflow_tool_calls += 1 + if self.workflow_tool_calls > self.policy.max_workflow_tool_calls: + return AgentToolBudgetDecision( + allowed=False, + exhausted=self.exhausted_result("workflow tool budget reached"), + ) + remaining = self.policy.max_workflow_tool_calls - self.workflow_tool_calls + elif kind == "finalization": + self.finalization_tool_calls += 1 + return AgentToolBudgetDecision(allowed=True) + else: + self.other_tool_calls += 1 + if self.other_tool_calls > self.policy.max_other_tool_calls: + return AgentToolBudgetDecision( + allowed=False, + exhausted=self.exhausted_result("non-workflow tool budget reached"), + ) + remaining = self.policy.max_other_tool_calls - self.other_tool_calls + + if remaining <= self.policy.warning_at_remaining: + return AgentToolBudgetDecision( + allowed=True, + warning=( + f"TOOL_BUDGET_WARNING: {remaining} {kind} tool calls remain. " + "Converge now using the context already gathered." 
+ ), + ) + return AgentToolBudgetDecision(allowed=True) + + def snapshot(self) -> dict[str, Any]: # slopcop: ignore[no-typing-any] + return { + "workflow_tool_calls": self.workflow_tool_calls, + "max_workflow_tool_calls": self.policy.max_workflow_tool_calls, + "other_tool_calls": self.other_tool_calls, + "max_other_tool_calls": self.policy.max_other_tool_calls, + "finalization_tool_calls": self.finalization_tool_calls, + "calls_by_tool": dict(sorted(self.calls_by_tool.items())), + } + + def exhausted_result(self, reason: str) -> AgentToolBudgetExhaustedResult: + return AgentToolBudgetExhaustedResult( + reason=reason, + message=( + "Stop calling tools in this category. Use the context/resources already " + "available and produce the best possible final output. If the output is " + "incomplete, state what context or resource was missing." + ), + budget_state=self.snapshot(), + ) + + +class AgentToolBudgetDeps(BaseModel): + tool_budget: AgentToolBudgetState + + +def with_budget_warning(result: Any, warning: str | None) -> Any: # slopcop: ignore[no-typing-any] + if warning is None: + return result + if isinstance(result, str): + return f"{result}\n\n{warning}" + if isinstance(result, dict): + updated = dict(result) + updated["tool_budget_warning"] = warning + return updated + return result +``` + +- [ ] **Step 2: Run import smoke check** + +Run: + +```bash +uv run python - <<'PY' +from ergon_builtins.workers.baselines.tool_budget import ( + AgentToolBudgetDeps, + AgentToolBudgetPolicy, + AgentToolBudgetState, +) + +state = AgentToolBudgetState( + policy=AgentToolBudgetPolicy(max_workflow_tool_calls=1, max_other_tool_calls=2), +) +deps = AgentToolBudgetDeps(tool_budget=state) +print(deps.tool_budget.check("workflow", "workflow").allowed) +print(deps.tool_budget.check("workflow", "workflow").allowed) +print(deps.tool_budget.snapshot()) +PY +``` + +Expected: first line `True`, second line `False`, then a snapshot dictionary. 
+ +--- + +## Task 2: Pass Deps Through ReActWorker + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py` + +- [ ] **Step 1: Add a deps hook** + +Add to `ReActWorker`: + +```python + def build_agent_deps(self, context: WorkerContext) -> Any | None: # slopcop: ignore[no-typing-any] + return None +``` + +- [ ] **Step 2: Pass context into `_run_agent`** + +Change: + +```python +async for turn in self._run_agent(task): +``` + +to: + +```python +async for turn in self._run_agent(task, context): +``` + +Change `_run_agent` signature: + +```python + async def _run_agent( + self, + task: BenchmarkTask, + context: WorkerContext, + ) -> AsyncGenerator[GenerationTurn, None]: +``` + +- [ ] **Step 3: Pass deps to pydantic-ai** + +Before `Agent(...)`: + +```python + agent_deps = self.build_agent_deps(context) + deps_type = type(agent_deps) if agent_deps is not None else None +``` + +Change the agent construction to include: + +```python + deps_type=deps_type, +``` + +Change `agent.iter(...)` to include: + +```python + deps=agent_deps, +``` + +- [ ] **Step 4: Make max-iteration exhaustion visible** + +Replace the current `break` on `max_iterations` with: + +```python + for turn in adapter.build_new_turns( + run.ctx.state.message_history, + cursor, + flush_pending=True, + ): + yield turn + raise RuntimeError( + f"ReActWorker exceeded max_iterations={self.max_iterations}" + ) +``` + +- [ ] **Step 5: Run existing focused tests** + +Run: + +```bash +uv run pytest tests/unit/workers/test_react_worker_contract.py tests/unit/builtins/common/test_transcript_adapters.py -q +``` + +Expected: PASS. 
+ +--- + +## Task 3: Budget the Workflow Tool + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py` +- Existing test: `tests/unit/state/test_workflow_cli_tool.py` + +- [ ] **Step 1: Add ctx-aware mode** + +Import: + +```python +from pydantic_ai import RunContext +from ergon_builtins.workers.baselines.tool_budget import ( + AgentToolBudgetDeps, + AgentToolBudgetExhaustedResult, + with_budget_warning, +) +``` + +Add parameter to `make_workflow_cli_tool`: + +```python + budgeted: bool = False, +``` + +Edit the existing function body directly. Do not add a separate wrapper around workflow execution. Because pydantic-ai needs a clear callable signature, use two function definitions inside `make_workflow_cli_tool`: one ctx-taking definition for `budgeted=True`, and the existing no-ctx definition for `budgeted=False`. + +```python + if budgeted: + async def workflow( + ctx: RunContext[AgentToolBudgetDeps], + command: str, + ) -> str | AgentToolBudgetExhaustedResult: + decision = ctx.deps.tool_budget.check("workflow", "workflow") + if not decision.allowed: + assert decision.exhausted is not None + return decision.exhausted + + if worker_context.node_id is None: + raise ValueError("workflow tool requires WorkerContext.node_id") + + output = await asyncio.to_thread( + execute_command, + command, + context=WorkflowCommandContext( + run_id=worker_context.run_id, + node_id=worker_context.node_id, + execution_id=worker_context.execution_id, + sandbox_task_key=sandbox_task_key, + benchmark_type=benchmark_type, + ), + session_factory=session_factory, + service=service_factory(), + ) + if output.exit_code != 0: + detail = output.stderr or output.stdout + result = f"workflow exited {output.exit_code}: {detail}".strip() + elif output.stderr: + result = f"{output.stdout}\n\nstderr:\n{output.stderr}".strip() + else: + result = output.stdout + return with_budget_warning(result, decision.warning) + + return workflow +``` + +Keep the existing no-ctx 
`workflow(command: str)` function as the `budgeted=False` branch: + +```python + async def workflow(command: str) -> str: + if worker_context.node_id is None: + raise ValueError("workflow tool requires WorkerContext.node_id") + + output = await asyncio.to_thread( + execute_command, + command, + context=WorkflowCommandContext( + run_id=worker_context.run_id, + node_id=worker_context.node_id, + execution_id=worker_context.execution_id, + sandbox_task_key=sandbox_task_key, + benchmark_type=benchmark_type, + ), + session_factory=session_factory, + service=service_factory(), + ) + if output.exit_code != 0: + detail = output.stderr or output.stdout + return f"workflow exited {output.exit_code}: {detail}".strip() + if output.stderr: + return f"{output.stdout}\n\nstderr:\n{output.stderr}".strip() + return output.stdout + + return workflow +``` + +- [ ] **Step 2: Preserve existing behavior** + +Run: + +```bash +uv run pytest tests/unit/state/test_workflow_cli_tool.py -q +``` + +Expected: PASS. Existing tests use `budgeted=False`. 
+
+---
+
+## Task 4: Budget Other Tools Used by This Harness
+
+**Files:**
+- Modify: `ergon_builtins/ergon_builtins/tools/research_rubrics_toolkit.py`
+- Modify: `ergon_builtins/ergon_builtins/tools/graph_toolkit.py`
+
+- [ ] **Step 1: Convert ResearchRubrics tools to ctx-taking functions**
+
+In `research_rubrics_toolkit.py`, import:
+
+```python
+from pydantic_ai import RunContext
+from ergon_builtins.workers.baselines.tool_budget import (
+    AgentToolBudgetDeps,
+    AgentToolBudgetExhaustedResult,
+    with_budget_warning,
+)
+```
+
+For each tool function, add `ctx` as the first arg:
+
+```python
+ctx: RunContext[AgentToolBudgetDeps],
+```
+
+At the top of each context-gathering tool:
+
+```python
+decision = ctx.deps.tool_budget.check("<tool_name>", "other")
+if not decision.allowed:
+    assert decision.exhausted is not None
+    return decision.exhausted
+```
+
+For final-output tools such as `write_report_draft` and `edit_report_draft`, use:
+
+```python
+decision = ctx.deps.tool_budget.check("<tool_name>", "finalization")
+```
+
+Do not block finalization tools after `other` is exhausted. The budget exists to force convergence into these tools.
+
+Use the actual function/tool name for each function so `calls_by_tool` remains useful in artifacts.
+
+After the existing result is produced (using each tool's own response type in place of `SearchResponse`):
+
+```python
+return cast(SearchResponse | AgentToolBudgetExhaustedResult, with_budget_warning(resp, decision.warning))
+```
+
+For response types that are Pydantic models, returning `AgentToolBudgetExhaustedResult` on exhaustion is acceptable because the tool result is serialized back to the model. Keep type annotations broad enough, for example:
+
+```python
+) -> SearchResponse | AgentToolBudgetExhaustedResult:
+```
+
+Change each `Tool(..., takes_ctx=False)` to:
+
+```python
+Tool(function=..., takes_ctx=True)
+```
+
+- [ ] **Step 2: Convert graph/resource tools to ctx-taking functions**
+
+In `graph_toolkit.py`, apply the same pattern:
+
+```python
+decision = ctx.deps.tool_budget.check("list_child_resources", "other")
+if not decision.allowed:
+    assert decision.exhausted is not None
+    return decision.exhausted
+```
+
+Update all graph tools to `takes_ctx=True`.
+
+- [ ] **Step 3: Run import smoke checks**
+
+Run:
+
+```bash
+uv run python - <<'PY'
+from ergon_builtins.tools.research_rubrics_toolkit import ResearchRubricsToolkit
+from ergon_builtins.tools.graph_toolkit import ResearchGraphToolkit
+print(ResearchRubricsToolkit)
+print(ResearchGraphToolkit)
+PY
+```
+
+Expected: imports cleanly.
+
+---
+
+## Task 5: Wire Budget Deps Into Current ResearchRubrics Workers
+
+**Files:**
+- Modify: `ergon_builtins/ergon_builtins/workers/research_rubrics/researcher_worker.py`
+- Modify: `ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py`
+
+- [ ] **Step 1: Add policy imports**
+
+In both workers:
+
+```python
+from ergon_builtins.workers.baselines.tool_budget import (
+    AgentToolBudgetDeps,
+    AgentToolBudgetPolicy,
+    AgentToolBudgetState,
+)
+```
+
+- [ ] **Step 2: Add a shared policy**
+
+Use the same generic policy in both files:
+
+```python
+_TOOL_BUDGET_POLICY = AgentToolBudgetPolicy(
+    max_workflow_tool_calls=12,
+    max_other_tool_calls=12,
+    warning_at_remaining=3,
+)
+```
+
+- [ ] **Step 3: Create deps per execution**
+
+In each `execute(...)`, before calling `super().execute(...)` (keyword arguments are required because both types are Pydantic models):
+
+```python
+self._agent_deps = AgentToolBudgetDeps(
+    tool_budget=AgentToolBudgetState(policy=_TOOL_BUDGET_POLICY),
+)
+```
+
+Add method:
+
+```python
+def build_agent_deps(self, context: WorkerContext) -> AgentToolBudgetDeps:
+    return self._agent_deps
+```
+
+These worker instances are currently execution-scoped. If that changes later, move deps creation into a base-class execution context instead of storing on `self`. + +- [ ] **Step 4: Use budgeted workflow tool in manager** + +In `workflow_cli_react_worker.py`, change: + +```python +workflow_tool = make_workflow_cli_tool(...) +``` + +to: + +```python +workflow_tool = make_workflow_cli_tool(..., budgeted=True) +``` + +- [ ] **Step 5: Tighten prompts, but keep them generic** + +Researcher prompt: + +```text +You have a limited non-workflow tool budget. Gather enough context, then stop using tools and write final_output/report.md. If any tool returns TOOL_BUDGET_WARNING or TOOL_BUDGET_EXHAUSTED, immediately produce the best possible final report from the context already gathered. +``` + +Manager prompt: + +```text +For multi-step work, divide and conquer with focused subagents to manage context. Workflow calls are limited, so inspect deliberately, create focused children, avoid duplicate research, and converge after child resources are visible. If any tool returns TOOL_BUDGET_WARNING or TOOL_BUDGET_EXHAUSTED, stop polling/searching and produce the best possible final output from current context/resources. +``` + +- [ ] **Step 6: Run focused worker import** + +Run: + +```bash +uv run python - <<'PY' +from ergon_builtins.workers.research_rubrics.researcher_worker import ResearchRubricsResearcherWorker +from ergon_builtins.workers.research_rubrics.workflow_cli_react_worker import ResearchRubricsWorkflowCliReActWorker +print(ResearchRubricsResearcherWorker.type_slug) +print(ResearchRubricsWorkflowCliReActWorker.type_slug) +PY +``` + +Expected: prints both type slugs. 
+ +--- + +## Task 6: Add Lightweight Rollout Reporting + +**Files:** +- Modify: `tests/real_llm/artifact_health.py` +- Modify: `tests/real_llm/rollout.py` + +- [ ] **Step 1: Count budget signals from existing events** + +In `artifact_health.py`, derive: + +```python +workflow_tool_calls +other_tool_calls +budget_exhausted +missing_final_report +``` + +Implementation rule: + +- If `tool_name == "workflow"`, increment `workflow_tool_calls`. +- Else if event type is `tool_call`, increment `other_tool_calls`. +- If any event payload has `status == "TOOL_BUDGET_EXHAUSTED"`, set `budget_exhausted=True`. +- If no resource path is `final_output/report.md`, set `missing_final_report=True`. + +- [ ] **Step 2: Show counters in rollout report** + +In `rollout.py`, add lines: + +```python +f"- workflow tool calls: {health.workflow_tool_calls}", +f"- other tool calls: {health.other_tool_calls}", +f"- budget exhausted: {health.budget_exhausted}", +f"- missing final report: {health.missing_final_report}", +``` + +- [ ] **Step 3: Run collection smoke** + +Run: + +```bash +uv run pytest tests/real_llm -q --collect-only +``` + +Expected: collection succeeds. + +--- + +## Task 7: Verify With One Real Sample + +**Files:** +- No new source files. + +- [ ] **Step 1: Run focused checks** + +Run: + +```bash +uv run pytest \ + tests/unit/state/test_workflow_cli_tool.py \ + tests/unit/workers/test_react_worker_contract.py \ + tests/unit/builtins/common/test_transcript_adapters.py \ + -q +``` + +Expected: PASS. 
+ +- [ ] **Step 2: Run lint on changed files** + +Run: + +```bash +uv run ruff check \ + ergon_builtins/ergon_builtins/workers/baselines/tool_budget.py \ + ergon_builtins/ergon_builtins/workers/baselines/react_worker.py \ + ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py \ + ergon_builtins/ergon_builtins/tools/research_rubrics_toolkit.py \ + ergon_builtins/ergon_builtins/tools/graph_toolkit.py \ + ergon_builtins/ergon_builtins/workers/research_rubrics/researcher_worker.py \ + ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py \ + tests/real_llm/artifact_health.py \ + tests/real_llm/rollout.py +``` + +Expected: `All checks passed!` + +- [ ] **Step 3: Rebuild and run one sample** + +Run: + +```bash +POSTGRES_PASSWORD=ergon_dev \ +TEST_HARNESS_SECRET=real-llm-secret \ +ENABLE_TEST_HARNESS=1 \ +ENABLE_SMOKE_FIXTURES=0 \ +ERGON_STARTUP_PLUGINS= \ +ERGON_LOGFIRE_PYDANTIC_AI=1 \ +ERGON_LOGFIRE_SERVICE_NAME=ergon-builtins \ +ERGON_LOGFIRE_ENVIRONMENT=real-llm \ +docker compose build api +``` + +Then: + +```bash +POSTGRES_PASSWORD=ergon_dev \ +TEST_HARNESS_SECRET=real-llm-secret \ +ENABLE_TEST_HARNESS=1 \ +ENABLE_SMOKE_FIXTURES=0 \ +ERGON_STARTUP_PLUGINS= \ +ERGON_LOGFIRE_PYDANTIC_AI=1 \ +ERGON_LOGFIRE_SERVICE_NAME=ergon-builtins \ +ERGON_LOGFIRE_ENVIRONMENT=real-llm \ +docker compose up -d --no-build --force-recreate --wait api +``` + +Then: + +```bash +ERGON_REAL_LLM=1 \ +ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react \ +ERGON_REAL_LLM_LIMIT=1 \ +ERGON_REAL_LLM_BUDGET_USD=5 \ +TEST_HARNESS_SECRET=real-llm-secret \ +ENABLE_TEST_HARNESS=1 \ +ENABLE_SMOKE_FIXTURES=0 \ +uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -q -s --assume-stack-up +``` + +Expected improvement: + +- no silent runaway loop. +- report shows `workflow tool calls <= 12`, or budget exhaustion is visible. +- report shows `other tool calls <= 12`, or budget exhaustion is visible. 
+- if the run fails, it fails with persisted transcript/error context that explains whether the budget was exhausted. + +--- + +## Notes + +- This is intentionally simpler than per-tool caps. No Exa-specific budget, no rubric-specific budget, no child-poll-specific budget. +- This still supports better prompt steering, but prompt steering is advisory. The two counters are enforcement. +- We should not add broad unit tests for every tool. Existing workflow tests, import smoke checks, lint, and the one-sample real rollout are enough for this change. +- Do not commit unless explicitly asked. + diff --git a/docs/superpowers/plans/2026-04-28-context-part-chunk-stream.md b/docs/superpowers/plans/2026-04-28-context-part-chunk-stream.md new file mode 100644 index 00000000..d4f00e7a --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-context-part-chunk-stream.md @@ -0,0 +1,1359 @@ +# Context Part Chunk Stream Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace the parallel `GenerationTurn` and context-event payload model with one canonical context-part stream emitted by workers and enriched by core before persistence. + +**Architecture:** Define a single discriminated `ContextPart` union for things that appear in an LLM context/action stream: system prompts, user messages, assistant text, tool calls, tool results, and thinking. Workers yield `ContextPartChunk` values containing a `part` plus optional token metadata; core normalizes and enriches those chunks into persisted `RunContextEvent` rows with sequence, turn id, timestamps, worker key, and run/execution ids. Keep database rows flat enough for SQLModel/JSONB, but make API, dashboard, replay, and RL consumers use typed chunk/log schemas instead of duplicate payload unions. 
This is a clean-break migration: old `*Payload`, `GenerationTurn`, request/response part aliases, and old discriminator names must be gone by the final task. + +**Tech Stack:** Python 3.13, Pydantic v2 discriminated unions, SQLModel JSON columns, pytest, existing Ergon worker/runtime/persistence packages. + +--- + +## Source Of Truth + +The canonical worker-facing stream type should live in `ergon_core.core.generation` or a renamed module such as `ergon_core.core.context_stream`. To avoid a large import churn in the first slice, start in `ergon_core.core.generation`. + +Use these names: + +```python +ContextPart +ContextPartChunk +ContextPartChunkLog +WorkerYield +``` + +`ContextPart` is the only union for LLM context/action parts. + +`ContextPartChunk` is the de facto worker generator type. + +`ContextPartChunkLog` is the core-enriched durable event shape. It is not the database ORM model; it is the typed payload/envelope used when projecting a stored `RunContextEvent`. + +`RunContextEvent` remains the SQLModel row with JSON storage and relational ids. 
+ +--- + +## Change Tree + +```text +ergon/ + ergon_core/ + ergon_core/ + core/ + generation.py # modify: canonical ContextPart/ContextPartChunk/ContextPartChunkLog + api/ + schemas.py # modify: typed REST context event payloads + runs.py # modify: project parsed chunk logs + dashboard/ + event_contracts.py # modify: dashboard context event payload uses chunk log + emitter.py # modify: emit parsed chunk logs + persistence/ + context/ + event_payloads.py # modify/delete duplicate payload union; no final old aliases + models.py # modify: validate JSON as ContextPartChunkLog + repository.py # modify: add persist_chunk enrichment; later delete persist_turn + rl/ + extraction.py # modify: consume chunk-log parts + runtime/ + services/ + task_execution_service.py # modify: persist worker chunks instead of turns + test_support/ + smoke_fixtures/ + smoke_base/ + leaf_base.py # modify: yield ContextPartChunk + recursive.py # modify: yield ContextPartChunk + worker_base.py # modify: yield ContextPartChunk + tests/ + unit/ + architecture/ + test_core_schema_sources.py # modify: guard single context part union + test_model_field_descriptions.py # modify: check chunk-log field descriptions + builtins/ + common/ + test_transcript_adapters.py # modify: assert chunk extraction/replay + dashboard/ + test_event_contract_types.py # modify: assert typed chunk-log dashboard payload + persistence/ + test_context_event_repository.py # modify: persist_chunk tests + state/ + test_context_part_stream.py # add: canonical part/chunk serialization tests + test_context_assembly.py # modify: replay from ContextPartChunkLog + test_generation_turn_build.py # modify/delete after GenerationTurn compatibility removal + workers/ + test_react_worker_contract.py # modify: worker yields chunks + ergon_builtins/ + ergon_builtins/ + common/ + llm_context/ + adapters/ + pydantic_ai.py # modify: build_chunks/build_new_chunks and replay chunk logs + workers/ + baselines/ + react_worker.py # modify: inspect 
ContextPartChunkLog.part + training_stub_worker.py # modify: yield ContextPartChunk + research_rubrics/ + researcher_worker.py # modify if still yielding GenerationTurn + workflow_cli_react_worker.py # modify if still yielding GenerationTurn +``` + +--- + +## File Structure + +**Modify:** +- `ergon_core/ergon_core/core/generation.py` — replace request/response-specific part model as the canonical context stream model while preserving temporary aliases during migration. +- `ergon_core/ergon_core/core/persistence/context/event_payloads.py` — replace the duplicate payload union with canonical context-event type exports only; do not keep old payload aliases in the final state. +- `ergon_core/ergon_core/core/persistence/context/models.py` — validate stored JSON as `ContextPartChunkLog` or the log payload shape. +- `ergon_core/ergon_core/core/persistence/context/repository.py` — replace `persist_turn()` decomposition with `persist_chunk()` enrichment; keep a temporary `persist_turn()` adapter if needed for staged migration. +- `ergon_core/ergon_core/core/api/schemas.py` — type REST context-event DTOs with `ContextPartChunkLog` instead of `dict[str, Any]`. +- `ergon_core/ergon_core/core/api/runs.py` — project stored context events through typed log validation. +- `ergon_core/ergon_core/core/dashboard/event_contracts.py` — use the same typed log schema as REST for context events. +- `ergon_core/ergon_core/core/dashboard/emitter.py` — emit typed enriched context logs. +- `ergon_core/ergon_core/core/rl/extraction.py` — read `event.part` instead of payload-specific classes. +- `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py` — convert PydanticAI messages into `ContextPartChunk` streams and replay logs back into PydanticAI messages. +- `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py` — consume the new typed context stream. 
+- `ergon_builtins/ergon_builtins/workers/baselines/training_stub_worker.py` — yield chunks instead of `GenerationTurn`. +- `ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/*.py` — yield chunks in smoke workers. + +**Tests:** +- `tests/unit/state/test_context_part_stream.py` — new focused tests for canonical union and chunk serialization. +- `tests/unit/persistence/test_context_event_repository.py` — rewrite around `persist_chunk()`. +- `tests/unit/builtins/common/test_transcript_adapters.py` — update PydanticAI adapter tests to assert chunk/log behavior. +- `tests/unit/state/test_context_assembly.py` — update replay tests around `ContextPartChunkLog`. +- `tests/unit/architecture/test_core_schema_sources.py` — add architecture guard against reintroducing duplicate context payload unions. +- Existing focused tests: `tests/unit/state/test_generation_turn_build.py`, `tests/unit/workers/test_react_worker_contract.py`, `tests/unit/dashboard/test_event_contract_types.py`, `tests/unit/architecture/test_model_field_descriptions.py`. 
+ +--- + +### Task 1: Introduce Canonical Context Parts + +**Files:** +- Modify: `ergon_core/ergon_core/core/generation.py` +- Create: `tests/unit/state/test_context_part_stream.py` + +- [ ] **Step 1: Write failing tests for the canonical part union** + +Create `tests/unit/state/test_context_part_stream.py` with: + +```python +from pydantic import TypeAdapter + +from ergon_core.core.generation import ( + AssistantTextPart, + ContextPart, + ContextPartChunk, + ContextPartChunkLog, + SystemPromptPart, + ThinkingPart, + TokenLogprob, + ToolCallPart, + ToolResultPart, + UserMessagePart, +) + + +def test_context_part_discriminates_all_part_kinds() -> None: + adapter = TypeAdapter(ContextPart) + + cases = [ + SystemPromptPart(content="sys"), + UserMessagePart(content="hi"), + AssistantTextPart(content="hello"), + ToolCallPart(tool_call_id="call-1", tool_name="search", args={"q": "x"}), + ToolResultPart(tool_call_id="call-1", tool_name="search", content="ok"), + ThinkingPart(content="reasoning"), + ] + + for part in cases: + dumped = part.model_dump(mode="json") + parsed = adapter.validate_python(dumped) + assert parsed == part + + +def test_context_part_chunk_wraps_part_with_optional_token_metadata() -> None: + chunk = ContextPartChunk( + part=AssistantTextPart(content="answer"), + token_ids=[1, 2], + logprobs=[TokenLogprob(token="answer", logprob=-0.1)], + ) + + dumped = chunk.model_dump(mode="json") + + assert dumped["part"]["part_kind"] == "assistant_text" + assert dumped["token_ids"] == [1, 2] + assert dumped["logprobs"][0]["token"] == "answer" + + +def test_context_part_chunk_log_adds_core_enrichment() -> None: + log = ContextPartChunkLog( + part=ThinkingPart(content="hmm"), + sequence=7, + worker_binding_key="researcher", + turn_id="turn-1", + token_ids=None, + logprobs=None, + ) + + dumped = log.model_dump(mode="json") + + assert dumped["part"]["part_kind"] == "thinking" + assert dumped["sequence"] == 7 + assert dumped["worker_binding_key"] == "researcher" + 
assert dumped["turn_id"] == "turn-1" +``` + +- [ ] **Step 2: Run the failing tests** + +Run: + +```bash +pytest tests/unit/state/test_context_part_stream.py -v +``` + +Expected: FAIL because `AssistantTextPart`, `UserMessagePart`, `ToolResultPart`, `ContextPartChunk`, and `ContextPartChunkLog` do not exist yet. + +- [ ] **Step 3: Implement canonical context stream types** + +Modify `ergon_core/ergon_core/core/generation.py` to define the canonical names. This task may keep request/response subset aliases only if needed to keep the next migration task small; those aliases must be deleted in Task 7 before the plan is complete. + +```python +"""Core model context-stream types. + +These types are used by worker APIs, transcript adapters, persistence, replay, +and RL extraction. Keep them in core so persistence can import them without +loading ``ergon_core.api``. +""" + +from datetime import datetime +from typing import Annotated, Any, Literal + +from ergon_core.core.json_types import JsonObject +from pydantic import BaseModel, Field + + +class TokenLogprob(BaseModel): + """Per-token log probability from the serving backend.""" + + model_config = {"frozen": True} + + token: str + logprob: float + top_logprobs: list[JsonObject] = Field(default_factory=list) + + +class SystemPromptPart(BaseModel): + model_config = {"frozen": True} + part_kind: Literal["system_prompt"] = "system_prompt" + content: str + + +class UserMessagePart(BaseModel): + model_config = {"frozen": True} + part_kind: Literal["user_message"] = "user_message" + content: str + + +class AssistantTextPart(BaseModel): + model_config = {"frozen": True} + part_kind: Literal["assistant_text"] = "assistant_text" + content: str + + +class ToolCallPart(BaseModel): + model_config = {"frozen": True} + part_kind: Literal["tool_call"] = "tool_call" + tool_name: str + tool_call_id: str + args: dict[str, Any] # slopcop: ignore[no-typing-any] + + +class ToolResultPart(BaseModel): + model_config = {"frozen": True} + 
part_kind: Literal["tool_result"] = "tool_result" + tool_call_id: str + tool_name: str + content: str + is_error: bool = False + + +class ThinkingPart(BaseModel): + model_config = {"frozen": True} + part_kind: Literal["thinking"] = "thinking" + content: str + + +ContextPart = Annotated[ + SystemPromptPart + | UserMessagePart + | AssistantTextPart + | ToolCallPart + | ToolResultPart + | ThinkingPart, + Field(discriminator="part_kind"), +] + + +class ContextPartChunk(BaseModel): + """One worker-emitted context/action stream item. + + Core adds run/execution/sequence/timing metadata before persistence. + """ + + model_config = {"frozen": True} + + part: ContextPart + token_ids: list[int] | None = None + logprobs: list[TokenLogprob] | None = None + + +class ContextPartChunkLog(ContextPartChunk): + """Core-enriched context stream item suitable for API/dashboard projection.""" + + sequence: int + worker_binding_key: str + turn_id: str | None = None + started_at: datetime | None = None + completed_at: datetime | None = None + policy_version: str | None = None + + +WorkerYield = ContextPartChunk + +# Temporary migration-only aliases. Task 7 must remove these before completion. 
+UserPromptPart = UserMessagePart +TextPart = AssistantTextPart +ToolReturnPart = ToolResultPart + +ModelRequestPart = Annotated[ + SystemPromptPart | UserMessagePart | ToolResultPart, + Field(discriminator="part_kind"), +] +ModelResponsePart = Annotated[ + AssistantTextPart | ToolCallPart | ThinkingPart, + Field(discriminator="part_kind"), +] + + +class GenerationTurn(BaseModel): + """Deprecated: use ContextPartChunk streams instead.""" + + model_config = {"frozen": True} + + messages_in: list[ModelRequestPart] = Field(default_factory=list) + response_parts: list[ModelResponsePart] = Field(default_factory=list) + tool_results: list[ToolResultPart] = Field(default_factory=list) + turn_token_ids: list[int] | None = None + turn_logprobs: list[TokenLogprob] | None = None + policy_version: str | None = None + started_at: datetime | None = None + completed_at: datetime | None = None +``` + +- [ ] **Step 4: Run the focused tests** + +Run: + +```bash +pytest tests/unit/state/test_context_part_stream.py -v +``` + +Expected: PASS. + +- [ ] **Step 5: Run generation-related tests to expose compatibility fallout** + +Run: + +```bash +pytest tests/unit/state/test_generation_turn_build.py tests/unit/builtins/common/test_transcript_adapters.py -v +``` + +Expected: likely FAIL because existing tests assert old discriminator values such as `tool-call` and old constructor names such as `ToolReturnPart`. 
+ +--- + +### Task 2: Replace Payload Union With Enriched Chunk Log + +**Files:** +- Modify: `ergon_core/ergon_core/core/persistence/context/event_payloads.py` +- Modify: `ergon_core/ergon_core/core/persistence/context/models.py` +- Modify: `tests/unit/architecture/test_model_field_descriptions.py` + +- [ ] **Step 1: Write failing compatibility tests for typed log payload validation** + +Update or add tests that assert the context event row validates its JSON as `ContextPartChunkLog`: + +```python +from ergon_core.core.generation import AssistantTextPart, ContextPartChunkLog +from ergon_core.core.persistence.context.models import RunContextEvent + + +def test_run_context_event_parsed_payload_is_context_part_chunk_log() -> None: + log = ContextPartChunkLog( + part=AssistantTextPart(content="hello"), + sequence=3, + worker_binding_key="worker-a", + turn_id="turn-1", + ) + event = RunContextEvent( + run_id="00000000-0000-0000-0000-000000000001", + task_execution_id="00000000-0000-0000-0000-000000000002", + worker_binding_key="worker-a", + sequence=3, + event_type="assistant_text", + payload=log.model_dump(mode="json"), + ) + + parsed = event.parsed_payload() + + assert isinstance(parsed, ContextPartChunkLog) + assert parsed.part == AssistantTextPart(content="hello") +``` + +If UUID strings are not accepted by SQLModel in this test, use `uuid.UUID(...)` values instead. + +- [ ] **Step 2: Run the failing test** + +Run: + +```bash +pytest tests/unit/persistence/test_context_event_repository.py::test_run_context_event_parsed_payload_is_context_part_chunk_log -v +``` + +Expected: FAIL until `RunContextEvent.parsed_payload()` validates the new log shape. + +- [ ] **Step 3: Collapse `event_payloads.py` into canonical exports** + +Modify `ergon_core/ergon_core/core/persistence/context/event_payloads.py` so the canonical payload is `ContextPartChunkLog`. 
Do not define `SystemPromptPayload`, `UserMessagePayload`, `AssistantTextPayload`, `ToolCallPayload`, `ToolResultPayload`, or `ThinkingPayload`; callers must migrate to `ContextPartChunkLog.part` and the canonical part classes. + +```python +"""Typed context event payload exports. + +The canonical context payload is an enriched ContextPartChunkLog. Event-specific +payload classes were removed in favor of ContextPartChunkLog.part. +""" + +from typing import Literal + +from ergon_core.core.generation import ( + ContextPart, + ContextPartChunk, + ContextPartChunkLog, +) + +ContextEventType = Literal[ + "system_prompt", + "user_message", + "assistant_text", + "tool_call", + "tool_result", + "thinking", +] + +ContextEventPayload = ContextPartChunkLog +``` + +- [ ] **Step 4: Update `RunContextEvent` validation** + +Modify `ergon_core/ergon_core/core/persistence/context/models.py`: + +```python +from ergon_core.core.generation import ContextPartChunkLog +from pydantic import TypeAdapter + +_PAYLOAD_ADAPTER: TypeAdapter[ContextPartChunkLog] = TypeAdapter(ContextPartChunkLog) + + +class RunContextEvent(SQLModel, table=True): + ... + + def parsed_payload(self) -> ContextPartChunkLog: + return _PAYLOAD_ADAPTER.validate_python(self.payload) +``` + +Keep `event_type: str` and `payload: dict[str, Any]` on the SQLModel row because the database stores JSON and indexes `event_type`. + +- [ ] **Step 5: Replace field-description architecture tests** + +Update `tests/unit/architecture/test_model_field_descriptions.py` to check descriptions on `ContextPartChunkLog` if the project requires descriptions for public fields. Do not keep tests against the old payload classes once they are aliases. + +- [ ] **Step 6: Run focused tests** + +Run: + +```bash +pytest tests/unit/persistence/test_context_event_repository.py tests/unit/architecture/test_model_field_descriptions.py -v +``` + +Expected: repository tests still fail until Task 3 replaces `persist_turn()` behavior. 
+ +--- + +### Task 3: Persist Worker Chunks With Core Enrichment + +**Files:** +- Modify: `ergon_core/ergon_core/core/persistence/context/repository.py` +- Modify: `tests/unit/persistence/test_context_event_repository.py` + +- [ ] **Step 1: Write repository tests for `persist_chunk()`** + +Replace turn-oriented tests with chunk-oriented tests: + +```python +from uuid import uuid4 + +from ergon_core.core.generation import ( + AssistantTextPart, + ContextPartChunk, + ThinkingPart, + ToolCallPart, + ToolResultPart, + UserMessagePart, +) + + +async def test_persist_chunk_records_prompt_and_model_output_in_order(session): + repo = ContextEventRepository() + run_id = uuid4() + execution_id = uuid4() + + await repo.persist_chunk( + session, + run_id=run_id, + execution_id=execution_id, + worker_binding_key="worker-a", + chunk=ContextPartChunk(part=UserMessagePart(content="question")), + ) + await repo.persist_chunk( + session, + run_id=run_id, + execution_id=execution_id, + worker_binding_key="worker-a", + chunk=ContextPartChunk(part=ThinkingPart(content="think")), + ) + await repo.persist_chunk( + session, + run_id=run_id, + execution_id=execution_id, + worker_binding_key="worker-a", + chunk=ContextPartChunk(part=AssistantTextPart(content="answer")), + ) + + events = repo.get_for_execution(session, execution_id) + + assert [event.sequence for event in events] == [0, 1, 2] + assert [event.event_type for event in events] == [ + "user_message", + "thinking", + "assistant_text", + ] + assert events[1].parsed_payload().turn_id == events[2].parsed_payload().turn_id + + +async def test_persist_chunk_tool_result_closes_current_turn(session): + repo = ContextEventRepository() + run_id = uuid4() + execution_id = uuid4() + + await repo.persist_chunk( + session, + run_id=run_id, + execution_id=execution_id, + worker_binding_key="worker-a", + chunk=ContextPartChunk( + part=ToolCallPart(tool_call_id="call-1", tool_name="search", args={"q": "x"}) + ), + ) + await repo.persist_chunk( + 
session, + run_id=run_id, + execution_id=execution_id, + worker_binding_key="worker-a", + chunk=ContextPartChunk( + part=ToolResultPart(tool_call_id="call-1", tool_name="search", content="ok") + ), + ) + + events = repo.get_for_execution(session, execution_id) + + assert [event.event_type for event in events] == ["tool_call", "tool_result"] + assert events[0].parsed_payload().turn_id is not None + assert events[1].parsed_payload().turn_id is None +``` + +Adjust fixture names to match the existing `test_context_event_repository.py` session fixture. + +- [ ] **Step 2: Run repository tests to verify failure** + +Run: + +```bash +pytest tests/unit/persistence/test_context_event_repository.py -v +``` + +Expected: FAIL because `persist_chunk()` does not exist. + +- [ ] **Step 3: Implement event type derivation** + +In `ergon_core/ergon_core/core/persistence/context/repository.py`, add: + +```python +from ergon_core.core.generation import ( + AssistantTextPart, + ContextPartChunk, + ContextPartChunkLog, + SystemPromptPart, + ThinkingPart, + ToolCallPart, + ToolResultPart, + UserMessagePart, +) + + +def _event_type_for_part(part: ContextPart) -> str: + return part.part_kind +``` + +If type checkers object to `ContextPart` as an `Annotated` alias in the helper signature, use the explicit union type or accept `object` and narrow via `isinstance`. 
+ +- [ ] **Step 4: Implement turn-id state machine** + +Add private state to the repository: + +```python +def __init__(self) -> None: + self._listeners: list[Callable[[RunContextEvent], Awaitable[None]]] = [] + self._sequence_counters: dict[UUID, int] = {} + self._active_turn_ids: dict[UUID, str] = {} +``` + +Add helpers: + +```python +def _turn_id_for_chunk(self, execution_id: UUID, chunk: ContextPartChunk) -> str | None: + part = chunk.part + if isinstance(part, (AssistantTextPart, ThinkingPart, ToolCallPart)): + turn_id = self._active_turn_ids.get(execution_id) + if turn_id is None: + turn_id = str(uuid4()) + self._active_turn_ids[execution_id] = turn_id + return turn_id + if isinstance(part, ToolResultPart): + self._active_turn_ids.pop(execution_id, None) + return None + if isinstance(part, (SystemPromptPart, UserMessagePart)): + return None + return None +``` + +This deliberately associates `thinking`, `assistant_text`, and `tool_call` chunks emitted contiguously with the same model-output turn. A following `tool_result` closes the active turn. 
+ +- [ ] **Step 5: Implement `persist_chunk()`** + +Add: + +```python +async def persist_chunk( + self, + session: Session, + *, + run_id: UUID, + execution_id: UUID, + worker_binding_key: str, + chunk: ContextPartChunk, +) -> RunContextEvent: + seq = self._next_sequence(execution_id) + turn_id = self._turn_id_for_chunk(execution_id, chunk) + event_type = chunk.part.part_kind + now = datetime.now(UTC) + payload = ContextPartChunkLog( + part=chunk.part, + token_ids=chunk.token_ids, + logprobs=chunk.logprobs, + sequence=seq, + worker_binding_key=worker_binding_key, + turn_id=turn_id, + started_at=now, + completed_at=now, + ) + event = self._make_event( + run_id, + execution_id, + worker_binding_key, + seq, + payload, + started_at=payload.started_at, + completed_at=payload.completed_at, + policy_version=payload.policy_version, + ) + self._sequence_counters[execution_id] = seq + 1 + + session.add(event) + session.commit() + + for listener in self._listeners: + try: + await listener(event) + except Exception: # slopcop: ignore[no-broad-except] + logger.warning("Context event listener failed", exc_info=True) + + return event +``` + +Update `_make_event()` to accept `payload: ContextPartChunkLog` and store `payload.model_dump(mode="json")`. 
+ +- [ ] **Step 6: Keep a temporary `persist_turn()` adapter** + +During migration only, keep `persist_turn()` by decomposing old `GenerationTurn` into chunks: + +```python +async def persist_turn(..., turn: GenerationTurn) -> list[RunContextEvent]: + events: list[RunContextEvent] = [] + for part in turn.messages_in: + events.append(await self.persist_chunk(..., chunk=ContextPartChunk(part=part))) + for part in turn.response_parts: + events.append( + await self.persist_chunk( + ..., + chunk=ContextPartChunk( + part=part, + token_ids=turn.turn_token_ids, + logprobs=turn.turn_logprobs, + ), + ) + ) + for part in turn.tool_results: + events.append(await self.persist_chunk(..., chunk=ContextPartChunk(part=part))) + return events +``` + +This keeps old workers running while the execution service migrates to chunks. + +- [ ] **Step 7: Run persistence tests** + +Run: + +```bash +pytest tests/unit/persistence/test_context_event_repository.py -v +``` + +Expected: PASS after updating any old assertions to inspect `event.parsed_payload().part`. 
+ +--- + +### Task 4: Migrate PydanticAI Adapter To Chunk Streams + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py` +- Modify: `tests/unit/builtins/common/test_transcript_adapters.py` +- Modify: `tests/unit/state/test_generation_turn_build.py` +- Modify: `tests/unit/state/test_context_assembly.py` + +- [ ] **Step 1: Write adapter tests for chunk extraction** + +Update `tests/unit/builtins/common/test_transcript_adapters.py` so PydanticAI transcript extraction returns chunks: + +```python +def test_text_and_thinking_are_context_part_chunks() -> None: + adapter = PydanticAITranscriptAdapter() + + chunks = adapter.build_chunks( + [ + ModelRequest(parts=[UserPromptPart(content="hard question")]), + ModelResponse( + parts=[ + ThinkingPart(content="let me reason"), + TextPart(content="answer"), + ] + ), + ] + ) + + assert [chunk.part.part_kind for chunk in chunks] == [ + "user_message", + "thinking", + "assistant_text", + ] +``` + +Add a tool-call/tool-result test: + +```python +def test_tool_call_and_return_become_context_part_chunks() -> None: + adapter = PydanticAITranscriptAdapter() + + chunks = adapter.build_chunks( + [ + ModelRequest(parts=[UserPromptPart(content="search")]), + ModelResponse( + parts=[ + ToolCallPart( + tool_name="search", + tool_call_id="call-1", + args={"query": "ergon"}, + ) + ] + ), + ModelRequest( + parts=[ + ToolReturnPart( + tool_name="search", + tool_call_id="call-1", + content={"result": "found"}, + ) + ] + ), + ] + ) + + assert [chunk.part.part_kind for chunk in chunks] == [ + "user_message", + "tool_call", + "tool_result", + ] +``` + +- [ ] **Step 2: Run adapter tests to verify failure** + +Run: + +```bash +pytest tests/unit/builtins/common/test_transcript_adapters.py -v +``` + +Expected: FAIL because `build_chunks()` does not exist. 
+ +- [ ] **Step 3: Implement `build_chunks()` and `build_new_chunks()`** + +In `ergon_builtins/ergon_builtins/common/llm_context/adapters/pydantic_ai.py`, add methods parallel to the existing turn methods: + +```python +def build_chunks( + self, + transcript: list[ModelMessage], + *, + flush_pending: bool = True, +) -> list[ContextPartChunk]: + return _build_chunks_from_transcript(transcript, flush_pending=flush_pending) + + +def build_new_chunks( + self, + transcript: list[ModelMessage], + cursor: TranscriptTurnCursor, + *, + flush_pending: bool = False, +) -> list[ContextPartChunk]: + chunks = _build_chunks_from_transcript(transcript, flush_pending=flush_pending) + new_chunks = chunks[cursor.emitted_turn_count :] + cursor.emitted_turn_count = len(chunks) + return new_chunks +``` + +Rename `TranscriptTurnCursor.emitted_turn_count` to `emitted_chunk_count` only if the migration can update all callers in one task. Otherwise leave the field name temporarily and add a follow-up cleanup task. 
+ +- [ ] **Step 4: Implement PydanticAI part conversion** + +Replace old `_extract_request_parts`, `_extract_response_parts`, and `_extract_tool_results` internals with chunk builders: + +```python +def _chunks_from_request(request: ModelRequest) -> list[ContextPartChunk]: + chunks: list[ContextPartChunk] = [] + for part in request.parts: + if isinstance(part, PydanticSystemPromptPart): + chunks.append(ContextPartChunk(part=SystemPromptPart(content=part.content))) + elif isinstance(part, PydanticUserPromptPart) and isinstance(part.content, str): + chunks.append(ContextPartChunk(part=UserMessagePart(content=part.content))) + elif isinstance(part, PydanticToolReturnPart): + chunks.append( + ContextPartChunk( + part=ToolResultPart( + tool_call_id=part.tool_call_id, + tool_name=part.tool_name, + content=_serialize_tool_content(part.content), + ) + ) + ) + return chunks + + +def _chunks_from_response(response: ModelResponse) -> list[ContextPartChunk]: + logprobs = extract_logprobs(response) + chunks: list[ContextPartChunk] = [] + for part in response.parts: + if isinstance(part, PydanticTextPart): + chunks.append( + ContextPartChunk(part=AssistantTextPart(content=part.content), logprobs=logprobs) + ) + logprobs = None + elif isinstance(part, PydanticToolCallPart): + chunks.append( + ContextPartChunk( + part=ToolCallPart( + tool_name=part.tool_name, + tool_call_id=part.tool_call_id, + args=part.args_as_dict(), + ), + logprobs=logprobs, + ) + ) + logprobs = None + elif isinstance(part, PydanticThinkingPart): + chunks.append( + ContextPartChunk(part=ThinkingPart(content=part.content), logprobs=logprobs) + ) + logprobs = None + return chunks +``` + +Only attach turn-level logprobs to the first model-output chunk. This matches the current persisted behavior where sibling events omit the shared token stream after the first model-output event. 
+ +- [ ] **Step 5: Implement replay from chunk logs** + +Update `assemble_replay()` to consume `RunContextEvent.parsed_payload()` as `ContextPartChunkLog`, then switch on `log.part`. + +```python +payload = event.parsed_payload() +part = payload.part +``` + +Map: +- `SystemPromptPart` -> `PydanticSystemPromptPart` +- `UserMessagePart` -> `PydanticUserPromptPart` +- `ToolResultPart` -> `PydanticToolReturnPart` +- `ThinkingPart` -> `PydanticThinkingPart` +- `AssistantTextPart` -> `PydanticTextPart` +- `ToolCallPart` -> `PydanticToolCallPart` + +- [ ] **Step 6: Keep old adapter methods as wrappers** + +Keep `build_turns()` and `build_new_turns()` temporarily by grouping chunks into a deprecated `GenerationTurn` only if old callers still exist at this point. Add comments marking them as migration-only. Task 7 must delete these wrappers; the final codebase must not expose the old turn API. + +- [ ] **Step 7: Run adapter and replay tests** + +Run: + +```bash +pytest tests/unit/builtins/common/test_transcript_adapters.py tests/unit/state/test_context_assembly.py tests/unit/state/test_generation_turn_build.py -v +``` + +Expected: PASS after old tests are rewritten or any migration-only wrappers are correct. These wrappers are not allowed to remain after Task 7. + +--- + +### Task 5: Migrate Worker Interface And Execution Persistence + +**Files:** +- Modify: `ergon_core/ergon_core/core/runtime/services/task_execution_service.py` +- Modify: `ergon_core/ergon_core/api/results.py` +- Modify: worker base API files that type `execute()` return values. 
+- Modify: `ergon_builtins/ergon_builtins/workers/baselines/react_worker.py` +- Modify: `ergon_builtins/ergon_builtins/workers/baselines/training_stub_worker.py` +- Modify: smoke fixture workers under `ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/` +- Modify: `tests/unit/workers/test_react_worker_contract.py` +- Modify: `tests/unit/state/test_research_rubrics_workers.py` + +- [ ] **Step 1: Find all `AsyncGenerator[GenerationTurn` callers** + +Run: + +```bash +rg "AsyncGenerator\\[GenerationTurn|GenerationTurn" ergon_core ergon_builtins tests -n +``` + +Expected: a finite list including builtins workers, smoke fixtures, test support, and execution persistence. + +- [ ] **Step 2: Update worker API type hints** + +Replace worker `execute()` signatures from: + +```python +) -> AsyncGenerator[GenerationTurn, None]: +``` + +to: + +```python +) -> AsyncGenerator[ContextPartChunk, None]: +``` + +Import `ContextPartChunk` from `ergon_core.core.generation`. + +- [ ] **Step 3: Update task execution persistence loop** + +In `task_execution_service.py`, replace the turn persistence call: + +```python +async for turn in worker.execute(task, context=context): + await context_event_repository.persist_turn( + session, + run_id=run_id, + execution_id=execution.id, + worker_binding_key=worker_binding_key, + turn=turn, + ) +``` + +with: + +```python +async for chunk in worker.execute(task, context=context): + await context_event_repository.persist_chunk( + session, + run_id=run_id, + execution_id=execution.id, + worker_binding_key=worker_binding_key, + chunk=chunk, + ) +``` + +Keep exact local variable names consistent with the existing file. 
+ +- [ ] **Step 4: Update simple text-yielding workers** + +For smoke workers that currently yield: + +```python +yield GenerationTurn(response_parts=[TextPart(content="...")]) +``` + +replace with: + +```python +yield ContextPartChunk(part=AssistantTextPart(content="...")) +``` + +For user prompt setup chunks, emit: + +```python +yield ContextPartChunk(part=UserMessagePart(content="...")) +``` + +Only emit prompt chunks if the worker previously included them in `messages_in`; do not invent additional prompt events. + +- [ ] **Step 5: Update `training_stub_worker.py`** + +Replace synthetic `GenerationTurn` creation with chunk lists: + +```python +chunks: list[ContextPartChunk] = [] +chunks.append(ContextPartChunk(part=UserMessagePart(content=f"Task: Synthetic task {task_slug}"))) +chunks.append( + ContextPartChunk( + part=ToolCallPart( + tool_name="stub_tool", + tool_call_id=f"call_{i}", + args={"turn": i, "task": task_slug}, + ), + logprobs=logprobs, + ) +) +chunks.append( + ContextPartChunk( + part=ToolResultPart( + tool_call_id=f"call_{i}", + tool_name="stub_tool", + content=f"Tool result for turn {i} of {task_slug}", + ) + ) +) +``` + +For final assistant output: + +```python +ContextPartChunk( + part=AssistantTextPart(content=f"Synthetic response turn {i}"), + logprobs=logprobs, +) +``` + +- [ ] **Step 6: Update `react_worker.py`** + +Where the worker previously handled `GenerationTurn` outputs or inspected payload classes, switch to chunk/log parts: + +```python +payload = event.parsed_payload() +part = payload.part +if isinstance(part, AssistantTextPart): + ... +``` + +For final assistant message extraction, replace `AssistantTextPayload` checks with `AssistantTextPart`. + +- [ ] **Step 7: Run worker contract tests** + +Run: + +```bash +pytest tests/unit/workers/test_react_worker_contract.py tests/unit/state/test_research_rubrics_workers.py -v +``` + +Expected: PASS after signatures and assertions are migrated. 
+ +--- + +### Task 6: Update REST, Dashboard, And RL Consumers + +**Files:** +- Modify: `ergon_core/ergon_core/core/api/schemas.py` +- Modify: `ergon_core/ergon_core/core/api/runs.py` +- Modify: `ergon_core/ergon_core/core/dashboard/event_contracts.py` +- Modify: `ergon_core/ergon_core/core/dashboard/emitter.py` +- Modify: `ergon_core/ergon_core/core/rl/extraction.py` +- Modify: dashboard generated contracts if this repo checks them in. +- Modify: `tests/unit/dashboard/test_event_contract_types.py` + +- [ ] **Step 1: Type REST context event DTOs with chunk logs** + +Modify `RunContextEventDto`: + +```python +from ergon_core.core.generation import ContextPartChunkLog +from ergon_core.core.persistence.context.event_payloads import ContextEventType + + +class RunContextEventDto(CamelModel): + id: str + task_execution_id: str + task_node_id: str + worker_binding_key: str + sequence: int + event_type: ContextEventType + payload: ContextPartChunkLog + created_at: str + started_at: str | None = None + completed_at: str | None = None +``` + +- [ ] **Step 2: Project typed payloads in REST snapshots** + +In `_context_events_by_task()`, change: + +```python +payload=event.payload, +``` + +to: + +```python +payload=event.parsed_payload(), +``` + +Keep `event_type=cast(ContextEventType, event.event_type)` if type checking requires it. + +- [ ] **Step 3: Type dashboard event contracts with the same payload** + +In `event_contracts.py`, ensure: + +```python +payload: ContextPartChunkLog +``` + +instead of the old `ContextEventPayload` union alias if that alias is still confusing. + +- [ ] **Step 4: Update dashboard emitter payload validation** + +In `emitter.py`, validate as: + +```python +payload=event.parsed_payload() +``` + +instead of constructing a separate TypeAdapter in the emitter. 
+ +- [ ] **Step 5: Update RL extraction** + +Change event handling from payload-class checks to part-class checks: + +```python +payload = event.parsed_payload() +part = payload.part + +if isinstance(part, (SystemPromptPart, UserMessagePart)): + ... +elif isinstance(part, (AssistantTextPart, ToolCallPart, ThinkingPart)): + token_ids = _get_token_ids(payload, tokenizer) +elif isinstance(part, ToolResultPart): + result_tokens = tokenizer.encode(str(part.content)) +``` + +Update `_get_token_ids()` to accept `ContextPartChunkLog` and inspect `payload.part`. + +- [ ] **Step 6: Run REST/dashboard/RL tests** + +Run: + +```bash +pytest tests/unit/dashboard/test_event_contract_types.py tests/unit/state/test_context_assembly.py tests/unit/persistence/test_context_event_repository.py -v +``` + +Expected: PASS after DTOs and consumers use `ContextPartChunkLog`. + +--- + +### Task 7: Add Architecture Guards And Remove Deprecated Turn API + +**Files:** +- Modify: `tests/unit/architecture/test_core_schema_sources.py` +- Modify: `ergon_core/ergon_core/core/generation.py` +- Modify: any remaining files found by `rg`. 
+ +- [ ] **Step 1: Add architecture guard against duplicate context payload unions** + +Add to `tests/unit/architecture/test_core_schema_sources.py`: + +```python +from pathlib import Path + + +def test_context_stream_has_single_discriminated_part_union() -> None: + root = Path(__file__).resolve().parents[3] + generation = root / "ergon_core" / "ergon_core" / "core" / "generation.py" + event_payloads = ( + root + / "ergon_core" + / "ergon_core" + / "core" + / "persistence" + / "context" + / "event_payloads.py" + ) + + generation_text = generation.read_text() + event_payloads_text = event_payloads.read_text() + + assert "ContextPart = Annotated[" in generation_text + assert "SystemPromptPayload |" not in event_payloads_text + assert "AssistantTextPayload |" not in event_payloads_text + assert "ToolCallPayload |" not in event_payloads_text +``` + +- [ ] **Step 2: Run the architecture test** + +Run: + +```bash +pytest tests/unit/architecture/test_core_schema_sources.py -v +``` + +Expected: PASS only after `event_payloads.py` no longer owns a duplicate payload union. + +- [ ] **Step 3: Remove deprecated `GenerationTurn` compatibility** + +Run: + +```bash +rg "GenerationTurn|ModelRequestPart|ModelResponsePart|ToolReturnPart|TextPart|UserPromptPart" ergon_core ergon_builtins tests -n +``` + +Remove remaining old names where possible. Keep `TextPart` only when it refers to `pydantic_ai.messages.TextPart`, and alias it as `PydanticTextPart` in imports to avoid confusion. + +- [ ] **Step 4: Delete compatibility aliases** + +From `generation.py`, remove: + +```python +UserPromptPart = UserMessagePart +TextPart = AssistantTextPart +ToolReturnPart = ToolResultPart +ModelRequestPart = ... +ModelResponsePart = ... +class GenerationTurn(...) +``` + +Only do this once `rg` confirms no production caller depends on those names. 
+ +- [ ] **Step 5: Verify no old payload classes or aliases exist in `event_payloads.py`** + +Run: + +```bash +rg "SystemPromptPayload|UserMessagePayload|AssistantTextPayload|ToolCallPayload|ToolResultPayload|ThinkingPayload" ergon_core ergon_builtins tests -n +``` + +Expected: no production matches. Test matches should be migrated to `ContextPartChunkLog` and canonical part classes. + +Confirm `event_payloads.py` does not define or export: + +```python +SystemPromptPayload +UserMessagePayload +AssistantTextPayload +ToolCallPayload +ToolResultPayload +ThinkingPayload +``` + +Keep: + +```python +ContextEventType +ContextEventPayload = ContextPartChunkLog +``` + +or rename `ContextEventPayload` to `ContextPartChunkLog` everywhere if the alias is no longer useful. + +- [ ] **Step 6: Run full focused suite** + +Run: + +```bash +pytest \ + tests/unit/state/test_context_part_stream.py \ + tests/unit/persistence/test_context_event_repository.py \ + tests/unit/builtins/common/test_transcript_adapters.py \ + tests/unit/state/test_context_assembly.py \ + tests/unit/workers/test_react_worker_contract.py \ + tests/unit/dashboard/test_event_contract_types.py \ + tests/unit/architecture/test_core_schema_sources.py \ + -v +``` + +Expected: PASS. + +- [ ] **Step 7: Run broader unit smoke** + +Run: + +```bash +pytest tests/unit -q +``` + +Expected: PASS, or only unrelated pre-existing failures. Investigate any failures mentioning context events, generation turns, workers, dashboard contracts, replay, or RL extraction. + +--- + +## Migration Notes + +This is a schema/API clean break. Do not preserve backwards compatibility with the old schemas in the final state. + +Temporary adapters are allowed only inside intermediate tasks to make the migration reviewable: +- `GenerationTurn` can exist only until worker execution is moved to chunks. +- request/response subset aliases can exist only until all worker and adapter callers move to `ContextPartChunk`. 
+- old `*Payload` event classes should not be reintroduced as aliases; migrate those callers directly to `ContextPartChunkLog.part`. + +After Task 7, the only canonical stream type should be `ContextPart`, the worker generator type should be `ContextPartChunk`, and the enriched log type should be `ContextPartChunkLog`. + +Do not add a second new union in `event_payloads.py`. Do not leave compatibility exports for the old payload classes. Either outcome recreates the drift this plan is removing. + +--- + +## Self-Review + +**Spec coverage:** The plan implements the requested model: `ContextPart` as the single discriminated union, `ContextPartChunk` as the worker generator type, and `ContextPartChunkLog` as the core-enriched persistence/API shape. + +**Placeholder scan:** No steps rely on `TBD`, unspecified tests, or unnamed files. Commands and expected outcomes are included for each task. + +**Type consistency:** The plan consistently uses `content` for text-bearing parts, `part_kind` for the part discriminator, `token_ids`/`logprobs` for worker-provided token metadata, and `sequence`/`worker_binding_key`/`turn_id` for core-enriched log metadata. + +--- + +## Execution Handoff + +Plan complete and saved to `docs/superpowers/plans/2026-04-28-context-part-chunk-stream.md`. Two execution options: + +**1. Subagent-Driven (recommended)** - dispatch a fresh subagent per task, review between tasks, fast iteration. + +**2. Inline Execution** - execute tasks in this session using executing-plans, batch execution with checkpoints. + +Which approach? 
diff --git a/docs/superpowers/plans/2026-04-28-evaluation-resource-context-and-scoring.md b/docs/superpowers/plans/2026-04-28-evaluation-resource-context-and-scoring.md new file mode 100644 index 00000000..f714978d --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-evaluation-resource-context-and-scoring.md @@ -0,0 +1,909 @@ +# Evaluation Resource Context and Scoring Patch Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make evaluator criteria fetch their own task-scoped resources, judge final artifacts rather than assistant summaries, and preserve evaluator-normalized scores without double-normalizing. + +**Architecture:** Core remains benchmark-agnostic: it exposes task-scoped resource access through `CriterionRuntime`. Benchmark criteria in `ergon_builtins` decide which resources to read, how to sort final outputs vs scratch files, and what to show verifiers or LLM judges. Evaluation persistence assumes all evaluators return normalized scalar task scores. + +**Tech Stack:** Python, Pydantic models, SQLModel, Ergon `CriterionRuntime`, ResearchRubrics LLM judge, real-LLM rollout artifacts. + +--- + +## Code Change Map + +- Modify: `ergon_core/ergon_core/api/criterion_runtime.py` + - Add optional `task_execution_id` to `list_resources`. + - Add `read_resource_by_id` so criteria can read exact SQL rows after listing. + +- Modify: `ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py` + - Implement optional task-execution scoping for `list_resources`. + - Implement `read_resource_by_id`. + - Keep core generic: no final-vs-scratch classification here. + +- Modify: `ergon_builtins/ergon_builtins/benchmarks/researchrubrics/judge_criterion.py` + - Fetch resources from `context.runtime`. + - Classify ResearchRubrics final outputs vs scratch files locally. 
+ - Build the judge prompt from resource content plus final assistant message. + - Record `evaluated_resource_ids` and `evaluation_input`. + +- Modify: `ergon_core/ergon_core/core/runtime/services/evaluation_persistence_service.py` + - Stop re-normalizing `TaskEvaluationResult.score`. + - Store `summary.normalized_score = result.score`. + +- Modify: `ergon_builtins/ergon_builtins/benchmarks/researchrubrics/rubric.py` + - Keep existing ResearchRubrics formula, but clarify metadata with normalized score semantics. + +- Modify: `tests/real_llm/artifact_health.py` + - Detect missing final output via task-scoped resource rows and final-output provenance, not durable blob `file_path`. + +- Tests: + - `tests/unit/state/test_criterion_runtime_di.py` + - `tests/unit/state/test_research_rubrics_benchmark.py` + - `tests/unit/runtime/test_evaluation_summary_contracts.py` + - `tests/unit/runtime/test_real_llm_rollout_artifact_health.py` + +--- + +## Task 1: Extend Core Runtime Resource Access + +**Files:** +- Modify: `ergon_core/ergon_core/api/criterion_runtime.py` +- Modify: `ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py` +- Test: `tests/unit/state/test_criterion_runtime_di.py` + +### Rationale + +Criteria should own context selection. Core should only provide generic resource primitives: + +- list resources for the evaluated task execution by default; +- optionally list resources for an explicit task execution id; +- read exact resources by id to avoid name collisions. + +Core must not know about ResearchRubrics final reports, scratchpads, or judge prompt layout. + +### Patch: Public Protocol + +In `ergon_core/ergon_core/api/criterion_runtime.py`, add `UUID` under `TYPE_CHECKING` or as a normal import. 
Because Protocol signatures need the type at runtime and postponed annotations are not enabled in this file, use a normal import:
+
+```python
+from uuid import UUID
+```
+
+Change the resource methods:
+
+```python
+# ── resource I/O ──────────────────────────────────────────────────
+async def read_resource(self, name: str) -> bytes: ...
+async def read_resource_by_id(self, resource_id: UUID) -> bytes: ...
+async def list_resources(
+    self,
+    task_execution_id: UUID | None = None,
+) -> "list[RunResourceView]": ...
+async def get_all_files_for_task(self) -> "dict[str, bytes]":
+    """Return ``{name: bytes}`` for every resource produced by this task.
+
+    Scoped to the runtime's evaluator-bound task execution. On duplicate
+    ``name``s, the newest ``created_at`` wins. Not size-capped.
+    """
+    ...
+```
+
+### Patch: Concrete Runtime
+
+In `ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py`, keep the existing SQLModel imports:
+
+```python
+from sqlmodel import Session, desc, select
+```
+
+Add exact-id reading after `read_resource`:
+
+```python
+async def read_resource_by_id(self, resource_id: UUID) -> bytes:
+    """Read one worker-published blob by its RunResource primary key."""
+    with get_session() as session:
+        row = session.get(RunResource, resource_id)
+
+        if row is None or row.run_id != self._run_id:
+            raise ResourceNotFoundError(
+                f"No run_resource {resource_id!s} for run {self._run_id}"
+            )
+
+    result = Path(row.file_path).read_bytes()
+    logger.info(
+        "criterion read_resource_by_id run_id=%s resource_id=%s size_bytes=%d",
+        self._run_id,
+        resource_id,
+        len(result),
+    )
+    return result
+```
+
+Replace `list_resources` with task-aware behavior:
+
+```python
+async def list_resources(
+    self,
+    task_execution_id: UUID | None = None,
+) -> list[RunResourceView]:
+    """Return resource DTOs for this run, newest first.
+
+    Defaults to this runtime's evaluated task execution. 
Passing + ``task_execution_id`` lets a benchmark criterion inspect a related task + explicitly without core knowing benchmark semantics. + """ + effective_execution_id = ( + task_execution_id if task_execution_id is not None else self._task_id + ) + with get_session() as session: + stmt = select(RunResource).where(RunResource.run_id == self._run_id) + if effective_execution_id is not None: + stmt = stmt.where(RunResource.task_execution_id == effective_execution_id) + stmt = stmt.order_by(desc(RunResource.created_at)) + rows = list(session.exec(stmt).all()) + return [RunResourceView.from_row(r) for r in rows] +``` + +### Tests + +In `tests/unit/state/test_criterion_runtime_di.py`, update the protocol test expected method set: + +```python +expected = { + "ensure_sandbox", + "upload_files", + "write_file", + "run_command", + "execute_code", + "cleanup", + "read_resource", + "read_resource_by_id", + "list_resources", + "get_all_files_for_task", + "db_read_session", + "event_sink", +} +``` + +Add tests: + +```python +@pytest.mark.asyncio +async def test_list_resources_defaults_to_runtime_task_execution() -> None: + task_execution_id = uuid4() + runtime = _make_runtime(task_id=task_execution_id) + + mock_row = MagicMock() + mock_session = MagicMock() + mock_session.__enter__ = MagicMock(return_value=mock_session) + mock_session.__exit__ = MagicMock(return_value=False) + mock_session.exec.return_value.all.return_value = [mock_row] + + with ( + patch( + "ergon_core.core.runtime.evaluation.criterion_runtime.get_session", + return_value=mock_session, + ), + patch.object(RunResourceView, "from_row", return_value=MagicMock()) as mock_from_row, + ): + result = await runtime.list_resources() + + assert len(result) == 1 + mock_from_row.assert_called_once_with(mock_row) + # Keep this assertion broad: SQLModel statements are hard to compare, but + # this ensures a DB query was issued through the runtime path. 
+ mock_session.exec.assert_called_once() +``` + +```python +@pytest.mark.asyncio +async def test_read_resource_by_id_reads_exact_blob(tmp_path: Path) -> None: + blob = tmp_path / "abc" + blob.write_bytes(b"exact-resource") + + run_id = uuid4() + resource_id = uuid4() + row = MagicMock() + row.id = resource_id + row.run_id = run_id + row.file_path = str(blob) + + runtime = _make_runtime(run_id=run_id) + + mock_session = MagicMock() + mock_session.__enter__ = MagicMock(return_value=mock_session) + mock_session.__exit__ = MagicMock(return_value=False) + mock_session.get.return_value = row + + with patch( + "ergon_core.core.runtime.evaluation.criterion_runtime.get_session", + return_value=mock_session, + ): + result = await runtime.read_resource_by_id(resource_id) + + assert result == b"exact-resource" +``` + +Run: + +```bash +uv run pytest tests/unit/state/test_criterion_runtime_di.py -q +``` + +Expected: all tests pass. + +--- + +## Task 2: Make ResearchRubrics Criterion Fetch and Package Its Own Evidence + +**Files:** +- Modify: `ergon_builtins/ergon_builtins/benchmarks/researchrubrics/judge_criterion.py` +- Test: `tests/unit/state/test_research_rubrics_benchmark.py` + +### Rationale + +ResearchRubrics should judge the actual task artifacts, not the final assistant summary. The built-in criterion should use the generic runtime to fetch resources, then apply ResearchRubrics-specific evidence policy: + +- final outputs first; +- scratch/intermediate resources second; +- final assistant message as status/context only. 
+ +### Patch + +Add imports: + +```python +from uuid import UUID + +from ergon_core.api.run_resource import RunResourceView +``` + +Add constants and a small local evidence type: + +```python +_MAX_RESOURCE_CHARS = 30_000 +_FINAL_OUTPUT_PREFIX = "/workspace/final_output/" + + +class _ResourceEvidence(BaseModel): + model_config = {"frozen": True, "arbitrary_types_allowed": True} + + resource: RunResourceView + content: str + + @property + def resource_id(self) -> str: + return str(self.resource.id) +``` + +Change `evaluate`: + +```python +async def evaluate(self, context: EvaluationContext) -> CriterionResult: + final_outputs, scratch_outputs = await _load_researchrubrics_evidence(context) + user_prompt = _build_user_prompt( + context, + final_outputs=final_outputs, + scratch_outputs=scratch_outputs, + ) + verdict = await call_structured_judge( + messages=[ + JudgeMessage(role="system", content=self.system_prompt), + JudgeMessage(role="user", content=user_prompt), + ], + response_type=ResearchRubricsVerdict, + model=self.model, + ) + evaluated_resource_ids = [ + evidence.resource_id for evidence in [*final_outputs, *scratch_outputs] + ] + return CriterionResult( + name=self.name, + score=self.max_score if verdict.passed else 0.0, + passed=verdict.passed, + weight=self.weight, + feedback=verdict.reasoning, + evaluation_input=_summarize_evaluation_input( + final_outputs=final_outputs, + scratch_outputs=scratch_outputs, + final_assistant_message=context.worker_result.output, + ), + evaluated_resource_ids=evaluated_resource_ids, + metadata={ + "primary_evidence_resource_ids": [e.resource_id for e in final_outputs], + "scratch_evidence_resource_ids": [e.resource_id for e in scratch_outputs], + }, + ) +``` + +Add evidence loading helpers: + +```python +async def _load_researchrubrics_evidence( + context: EvaluationContext, +) -> tuple[list[_ResourceEvidence], list[_ResourceEvidence]]: + if context.runtime is None: + return [], [] + + resources = await 
context.runtime.list_resources() + final_resources = [resource for resource in resources if _is_final_output_resource(resource)] + scratch_resources = [resource for resource in resources if resource not in final_resources] + + final_outputs = await _read_text_resources(context, final_resources) + scratch_outputs = await _read_text_resources(context, scratch_resources) + return final_outputs, scratch_outputs +``` + +```python +async def _read_text_resources( + context: EvaluationContext, + resources: list[RunResourceView], +) -> list[_ResourceEvidence]: + if context.runtime is None: + return [] + + evidence: list[_ResourceEvidence] = [] + for resource in resources: + if not _is_text_like(resource): + continue + content_bytes = await context.runtime.read_resource_by_id(resource.id) + content = content_bytes.decode("utf-8", errors="replace") + if len(content) > _MAX_RESOURCE_CHARS: + content = content[:_MAX_RESOURCE_CHARS] + "\n\n[truncated]" + evidence.append(_ResourceEvidence(resource=resource, content=content)) + return evidence +``` + +```python +def _is_text_like(resource: RunResourceView) -> bool: + return ( + resource.mime_type.startswith("text/") + or resource.mime_type in {"application/json", "application/x-ndjson"} + or resource.name.endswith((".md", ".txt", ".json", ".jsonl", ".csv")) + ) +``` + +```python +def _is_final_output_resource(resource: RunResourceView) -> bool: + origin = resource.metadata.get("sandbox_origin") + return isinstance(origin, str) and origin.startswith(_FINAL_OUTPUT_PREFIX) +``` + +Replace `_build_user_prompt`: + +```python +def _build_user_prompt( + context: EvaluationContext, + *, + final_outputs: list[_ResourceEvidence], + scratch_outputs: list[_ResourceEvidence], +) -> str: + return "\n\n".join( + [ + f"Original research request:\n{context.task.description}", + _format_resource_section( + "Final output resources (primary answer to judge)", + final_outputs, + empty="No final output resources were published.", + ), + 
_format_resource_section( + "Scratch/intermediate resources (supporting context; do not treat as final answer)", + scratch_outputs, + empty="No scratch resources were published.", + ), + ( + "Final assistant message (execution summary/status, not the primary answer):\n" + f"{context.worker_result.output}" + ), + ] + ) +``` + +Add format helpers: + +```python +def _format_resource_section( + title: str, + resources: list[_ResourceEvidence], + *, + empty: str, +) -> str: + if not resources: + return f"{title}:\n{empty}" + blocks = [f"{title}:"] + for evidence in resources: + resource = evidence.resource + origin = resource.metadata.get("sandbox_origin") + blocks.append( + "\n".join( + [ + f"--- resource_id={resource.id} name={resource.name} kind={resource.kind}", + f"mime_type={resource.mime_type} sandbox_origin={origin}", + evidence.content, + ] + ) + ) + return "\n\n".join(blocks) +``` + +```python +def _summarize_evaluation_input( + *, + final_outputs: list[_ResourceEvidence], + scratch_outputs: list[_ResourceEvidence], + final_assistant_message: str, +) -> str: + return "\n".join( + [ + "Evidence used by ResearchRubrics judge:", + "final_outputs=" + + ", ".join(f"{e.resource.name}:{e.resource.id}" for e in final_outputs), + "scratch_outputs=" + + ", ".join(f"{e.resource.name}:{e.resource.id}" for e in scratch_outputs), + "final_assistant_message=" + + final_assistant_message[:1000], + ] + ) +``` + +### Tests + +In `tests/unit/state/test_research_rubrics_benchmark.py`, add a fake runtime and direct unit test for the criterion. 
+ +```python +class _Runtime: + def __init__(self, resources, blobs): + self._resources = resources + self._blobs = blobs + + async def list_resources(self, task_execution_id=None): + return self._resources + + async def read_resource_by_id(self, resource_id): + return self._blobs[resource_id] +``` + +Patch `call_structured_judge` and assert: + +```python +@pytest.mark.asyncio +async def test_researchrubrics_judge_uses_final_resource_content(monkeypatch): + from uuid import uuid4 + from ergon_core.api.evaluation_context import EvaluationContext + from ergon_core.api.results import WorkerOutput + from ergon_core.api.run_resource import RunResourceKind, RunResourceView + from ergon_builtins.benchmarks.researchrubrics.judge_criterion import ( + ResearchRubricsJudgeCriterion, + ResearchRubricsVerdict, + ) + + report_id = uuid4() + scratch_id = uuid4() + run_id = uuid4() + execution_id = uuid4() + report = RunResourceView( + id=report_id, + run_id=run_id, + task_execution_id=execution_id, + kind=RunResourceKind.REPORT, + name="report.md", + mime_type="text/markdown", + file_path="/tmp/blob/report", + size_bytes=12, + content_hash="abc", + error=None, + metadata={"sandbox_origin": "/workspace/final_output/report.md"}, + ) + scratch = RunResourceView( + id=scratch_id, + run_id=run_id, + task_execution_id=execution_id, + kind=RunResourceKind.NOTE, + name="notes.md", + mime_type="text/markdown", + file_path="/tmp/blob/notes", + size_bytes=5, + content_hash="def", + error=None, + metadata={"sandbox_origin": "/workspace/scratch/notes.md"}, + ) + captured = {} + + async def fake_judge(*, messages, response_type, model): + captured["prompt"] = messages[1].content + return ResearchRubricsVerdict(reasoning="report satisfies criterion", passed=True) + + monkeypatch.setattr( + "ergon_builtins.benchmarks.researchrubrics.judge_criterion.call_structured_judge", + fake_judge, + ) + + criterion = ResearchRubricsJudgeCriterion( + name="criterion_0", + 
rubric=RubricCriterion(criterion="Includes sources.", axis="Explicit", weight=2.0), + ) + task = BenchmarkTask( + task_slug="sample", + instance_key="default", + description="Write a report.", + ) + context = EvaluationContext( + run_id=run_id, + task_id=uuid4(), + execution_id=execution_id, + task=task, + worker_result=WorkerOutput(output="Wrote report.md"), + runtime=_Runtime( + [report, scratch], + { + report_id: b"# Findings\nFinal report text", + scratch_id: b"draft notes", + }, + ), + ) + + result = await criterion.evaluate(context) + + assert result.passed is True + assert str(report_id) in result.evaluated_resource_ids + assert str(scratch_id) in result.evaluated_resource_ids + assert "Final output resources" in captured["prompt"] + assert "Final report text" in captured["prompt"] + assert "Scratch/intermediate resources" in captured["prompt"] + assert "draft notes" in captured["prompt"] +``` + +Run: + +```bash +uv run pytest tests/unit/state/test_research_rubrics_benchmark.py -q +``` + +Expected: all tests pass. + +--- + +## Task 3: Align Rollout Artifact Health With Task-Scoped Final Outputs + +**Files:** +- Modify: `tests/real_llm/artifact_health.py` +- Test: `tests/unit/runtime/test_real_llm_rollout_artifact_health.py` + +### Rationale + +Health analysis works on dumped JSONL, not live SQL. It should mirror the same policy: + +- group resources by `task_execution_id`; +- a completed task has a final output if at least one resource has `metadata_json.sandbox_origin` under `/workspace/final_output/`; +- do not compare durable blob `file_path` to logical sandbox paths. 
+ +### Patch + +In `tests/real_llm/artifact_health.py`, add helpers near `_tool_budget_signals`: + +```python +_FINAL_OUTPUT_PREFIX = "/workspace/final_output/" + + +def _resource_metadata(resource: dict[str, Any]) -> dict[str, Any]: # slopcop: ignore[no-typing-any] + metadata = resource.get("metadata_json") or resource.get("metadata") or {} + if isinstance(metadata, str): + return json.loads(metadata) + return metadata if isinstance(metadata, dict) else {} + + +def _is_final_output_resource(resource: dict[str, Any]) -> bool: # slopcop: ignore[no-typing-any] + origin = _resource_metadata(resource).get("sandbox_origin") + return isinstance(origin, str) and origin.startswith(_FINAL_OUTPUT_PREFIX) +``` + +Replace current `missing_final_report` calculation: + +```python +completed_execution_ids = { + str(execution.get("id")) + for execution in executions + if execution.get("status") == "completed" and execution.get("id") is not None +} +final_output_execution_ids = { + str(resource.get("task_execution_id")) + for resource in resources + if resource.get("task_execution_id") is not None and _is_final_output_resource(resource) +} +missing_final_report = bool(completed_execution_ids - final_output_execution_ids) +``` + +This field name can stay `missing_final_report` for now to avoid dashboard churn, but the semantics become “completed task is missing a final-output resource.” + +### Tests + +In `tests/unit/runtime/test_real_llm_rollout_artifact_health.py`, update `_write_minimal_rollout` to optionally write final-output metadata: + +```python +def _write_minimal_rollout( + root: Path, + *, + task_count: int = 1, + evaluation_rows: list[dict] | None = None, + resource_rows: list[dict] | None = None, +) -> None: + ... + execution_ids = [str(uuid4()) for _ in range(task_count)] + ... + _write_jsonl( + db / "run_task_executions.jsonl", + [ + { + "id": execution_ids[idx], + "task_slug": f"task-{idx}", + "status": "completed", + } + for idx in range(task_count) + ], + ) + ... 
+ _write_jsonl( + db / "run_resources.jsonl", + resource_rows + if resource_rows is not None + else [ + { + "id": str(uuid4()), + "task_execution_id": execution_ids[0], + "name": "report.md", + "metadata_json": {"sandbox_origin": "/workspace/final_output/report.md"}, + } + ], + ) +``` + +Add: + +```python +def test_artifact_health_detects_final_output_by_task_resource_metadata(tmp_path: Path) -> None: + execution_id = str(uuid4()) + _write_minimal_rollout( + tmp_path, + task_count=1, + evaluation_rows=[ + { + "id": str(uuid4()), + "score": 0.75, + "summary_json": { + "evaluator_name": "research-rubric", + "normalized_score": 0.75, + "criterion_results": [ + { + "criterion_name": "criterion_0", + "criterion_type": "researchrubrics-llm-judge", + "score": 1.0, + "max_score": 1.0, + "passed": True, + "weight": 1.0, + "status": "passed", + "criterion_description": "Includes citations.", + "feedback": "The report cited source material.", + } + ], + }, + } + ], + resource_rows=[ + { + "id": str(uuid4()), + "task_execution_id": execution_id, + "name": "report.md", + "file_path": "/tmp/ergon-blob/abc", + "metadata_json": {"sandbox_origin": "/workspace/final_output/report.md"}, + } + ], + ) +``` + +If `_write_minimal_rollout` generates execution ids internally, return them from the helper or pass explicit ids. Keep the test focused: final-output detection must use `metadata_json.sandbox_origin`, not durable `file_path`. + +Run: + +```bash +uv run pytest tests/unit/runtime/test_real_llm_rollout_artifact_health.py tests/real_llm/test_artifact_health.py -q +``` + +Expected: all tests pass. 
+ +--- + +## Task 4: Preserve Evaluator-Normalized Scores + +**Files:** +- Modify: `ergon_core/ergon_core/core/runtime/services/evaluation_persistence_service.py` +- Modify: `ergon_builtins/ergon_builtins/benchmarks/researchrubrics/rubric.py` +- Test: `tests/unit/runtime/test_evaluation_summary_contracts.py` +- Test: `tests/unit/state/test_research_rubrics_benchmark.py` + +### Rationale + +New standard: all evaluators return normalized scalar scores in `TaskEvaluationResult.score`. Persistence must record, not reinterpret, that score. + +Current bug: + +```python +total_score = result.score +normalized = total_score / max_score_total if max_score_total > 0 else 0.0 +``` + +For ResearchRubrics, `result.score` is already normalized, so this divides twice. + +### Patch: Persistence + +In `build_evaluation_summary`, replace: + +```python +total_score = result.score +normalized = total_score / max_score_total if max_score_total > 0 else 0.0 +``` + +with: + +```python +normalized = result.score +``` + +Keep `max_score_total` as rubric display metadata: + +```python +return EvaluationSummary( + evaluator_name=result.evaluator_name, + max_score=max_score_total, + normalized_score=normalized, + stages_evaluated=len(stage_names), + stages_passed=stages_passed, + criterion_results=entries, +) +``` + +### Patch: ResearchRubrics Metadata + +In `ergon_builtins/ergon_builtins/benchmarks/researchrubrics/rubric.py`, keep the formula and add explicit score metadata: + +```python +return TaskEvaluationResult( + task_slug=task.task_slug, + score=normalized_score, + passed=total_score > 0, + evaluator_name=self.name, + criterion_results=results, + metadata={ + "score_scale": "normalized_0_1", + "raw_score": total_score, + "max_possible": max_possible, + "min_possible": min_possible, + }, +) +``` + +### Tests + +In `tests/unit/runtime/test_evaluation_summary_contracts.py`, add: + +```python +def test_build_evaluation_summary_preserves_evaluator_normalized_score() -> None: + summary = 
build_evaluation_summary(
+        _service_result(
+            feedback="criterion ran",
+            criterion_score=0.5,
+            criterion_weight=2.0,
+            passed=True,
+        ),
+        evaluation_input=None,
+    )
+
+    assert summary.normalized_score == 0.5
+    assert summary.max_score == 2.0
+```
+
+To make this test prove the no-double-normalization contract, change the helper's `CriterionSpec` for this test case from `max_score=1.0` to `max_score=2.0`; `summary.max_score` is then reported as `2.0`. With the old implementation, `summary.normalized_score` would be `0.25`; with the new contract, it remains `0.5`.
+
+In `tests/unit/state/test_research_rubrics_benchmark.py`, update expected metadata:
+
+```python
+assert result.metadata == {
+    "score_scale": "normalized_0_1",
+    "raw_score": 2.0,
+    "max_possible": 2.0,
+    "min_possible": -1.0,
+}
+```
+
+Run:
+
+```bash
+uv run pytest tests/unit/runtime/test_evaluation_summary_contracts.py tests/unit/state/test_research_rubrics_benchmark.py -q
+```
+
+Expected: all tests pass.
+
+---
+
+## Task 5: Verify With One Real Rollout
+
+**Files:**
+- No new code files.
+
+### Commands
+
+Run focused checks:
+
+```bash
+uv run pytest \
+  tests/unit/state/test_criterion_runtime_di.py \
+  tests/unit/state/test_research_rubrics_benchmark.py \
+  tests/unit/runtime/test_evaluation_summary_contracts.py \
+  tests/unit/runtime/test_real_llm_rollout_artifact_health.py \
+  tests/real_llm/test_artifact_health.py \
+  -q
+```
+
+Expected: all tests pass. 
+
+Run lint/compile for touched files:
+
+```bash
+uv run ruff check \
+  ergon_core/ergon_core/api/criterion_runtime.py \
+  ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py \
+  ergon_core/ergon_core/core/runtime/services/evaluation_persistence_service.py \
+  ergon_builtins/ergon_builtins/benchmarks/researchrubrics/judge_criterion.py \
+  ergon_builtins/ergon_builtins/benchmarks/researchrubrics/rubric.py \
+  tests/real_llm/artifact_health.py \
+  tests/unit/state/test_criterion_runtime_di.py \
+  tests/unit/state/test_research_rubrics_benchmark.py \
+  tests/unit/runtime/test_evaluation_summary_contracts.py \
+  tests/unit/runtime/test_real_llm_rollout_artifact_health.py
+```
+
+Expected: `All checks passed!`
+
+Run compile:
+
+```bash
+uv run python -m compileall -q \
+  ergon_core/ergon_core/api/criterion_runtime.py \
+  ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py \
+  ergon_core/ergon_core/core/runtime/services/evaluation_persistence_service.py \
+  ergon_builtins/ergon_builtins/benchmarks/researchrubrics/judge_criterion.py \
+  ergon_builtins/ergon_builtins/benchmarks/researchrubrics/rubric.py \
+  tests/real_llm/artifact_health.py
+```
+
+Expected: exit code `0`.
+
+After rebuild, rerun one real sample:
+
+```bash
+ERGON_REAL_LLM=1 \
+ERGON_REAL_LLM_MODEL=openrouter:anthropic/claude-opus-4.7 \
+ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react \
+ERGON_REAL_LLM_LIMIT=1 \
+ERGON_REAL_LLM_BUDGET_USD=25 \
+TEST_HARNESS_SECRET=real-llm-secret \
+uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py --assume-stack-up -vv -s
+```
+
+Expected rollout properties:
+
+- terminal status is `completed`;
+- artifact health reports `missing_final_report: False`;
+- `normalized_score` matches `RunTaskEvaluation.score`;
+- criterion `evaluated_resource_ids` contains the report resource id;
+- judge feedback references details from the full final report, not just the final assistant summary. 
+ +--- + +## Non-Goals + +- Do not put final-vs-scratch classification in `ergon_core`. +- Do not include full agent conversation in ResearchRubrics judge prompts by default. +- Do not introduce a new persisted table for evidence bundles. +- Do not preserve compatibility with double-normalized summary scores; new runs should use the normalized score invariant. + diff --git a/ergon_builtins/AGENTS.md b/ergon_builtins/AGENTS.md index cfb169a3..ff1678e9 100644 --- a/ergon_builtins/AGENTS.md +++ b/ergon_builtins/AGENTS.md @@ -100,12 +100,10 @@ EVALUATION is populated by whichever **evaluator** you pass with |---|---|---| | `gdpeval` | `benchmarks/gdpeval/sandbox.py` | GDPEval harness sandbox. | | `minif2f` | `benchmarks/minif2f/sandbox_manager.py` | Lean 4 sandbox with the compiler pre-installed. | +| `researchrubrics` | `benchmarks/researchrubrics/sandbox_manager.py` | ResearchRubrics E2B sandbox with Exa tooling. | +| `researchrubrics-vanilla` | `benchmarks/researchrubrics/sandbox_manager.py` | Same sandbox setup for the vanilla benchmark variant. | | `swebench-verified` | `benchmarks/swebench_verified/sandbox_manager.py` | SWE-Bench instance sandbox; installs repo+deps in `_install_dependencies`. | -(`ResearchRubricsSandboxManager` lives in `ergon_core/core/providers/sandbox/` -and is instantiated directly by `researchrubrics-researcher`; it is not in -`SANDBOX_MANAGERS` because nothing else uses it.) - --- ## Model backends (`MODEL_BACKENDS` in registry_core.py) From 14ee2b8b7f896f794522b076bc52ddae1e241ef5 Mon Sep 17 00:00:00 2001 From: Charlie Masters <69640669+cm2435@users.noreply.github.com> Date: Tue, 28 Apr 2026 16:45:17 +0100 Subject: [PATCH 2/5] Refactor runtime debugging infrastructure Move sandbox, tracing, and Inngest helpers into clearer runtime packages while preserving real-LLM debugging and smoke-test coverage. 
Made-with: Cursor --- .../benchmarks/gdpeval/sandbox.py | 2 +- .../benchmarks/gdpeval/sandbox_utils.py | 2 +- .../benchmarks/gdpeval/toolkit.py | 2 +- .../benchmarks/minif2f/sandbox_manager.py | 2 +- .../researchrubrics/sandbox_manager.py | 14 +- .../swebench_verified/sandbox_manager.py | 4 +- ergon_builtins/ergon_builtins/registry.py | 2 +- .../ergon_builtins/registry_core.py | 2 +- .../ergon_builtins/registry_data.py | 8 +- .../workers/baselines/tool_budget.py | 61 ++ .../research_rubrics/researcher_worker.py | 2 +- .../workflow_cli_react_worker.py | 2 +- ergon_cli/ergon_cli/commands/benchmark.py | 2 +- ergon_core/ergon_core/core/api/app.py | 8 +- .../ergon_core/core/api/test_harness.py | 2 +- .../ergon_core/core/dashboard/emitter.py | 2 +- .../ergon_core/core/providers/__init__.py | 0 .../core/providers/generation/__init__.py | 4 - .../core/runtime/errors/error_payload.py | 35 -- .../runtime/evaluation/criterion_runtime.py | 6 +- .../runtime/evaluation/inngest_executor.py | 2 +- .../core/runtime/evaluation/protocols.py | 2 +- .../runtime/inngest/benchmark_run_start.py | 2 +- .../runtime/inngest/cancel_orphan_subtasks.py | 2 +- .../core/runtime/inngest/check_evaluators.py | 4 +- .../runtime/inngest/cleanup_cancelled_task.py | 2 +- .../{inngest_client.py => inngest/client.py} | 0 .../core/runtime/inngest/complete_workflow.py | 2 +- .../core/runtime/inngest/evaluate_task_run.py | 4 +- .../core/runtime/inngest/execute_task.py | 30 +- .../core/runtime/inngest/fail_workflow.py | 2 +- .../core/runtime/inngest/persist_outputs.py | 6 +- .../runtime/inngest/propagate_execution.py | 4 +- .../registry.py} | 0 .../core/runtime/inngest/run_cleanup.py | 4 +- .../core/runtime/inngest/sandbox_setup.py | 4 +- .../core/runtime/inngest/start_workflow.py | 2 +- .../core/runtime/inngest/worker_execute.py | 41 +- .../core/runtime/services/run_service.py | 2 +- .../services/task_management_service.py | 2 +- .../core/runtime/services/workflow_service.py | 2 +- 
ergon_core/ergon_core/core/runtime/tracing.py | 572 ------------------ .../core/runtime/tracing/__init__.py | 85 +++ .../core/runtime/tracing/attributes.py | 46 ++ .../core/runtime/tracing/contexts.py | 177 ++++++ .../ergon_core/core/runtime/tracing/ids.py | 56 ++ .../ergon_core/core/runtime/tracing/noop.py | 47 ++ .../ergon_core/core/runtime/tracing/otel.py | 135 +++++ .../ergon_core/core/runtime/tracing/sinks.py | 27 + .../ergon_core/core/runtime/tracing/types.py | 67 ++ .../core/{providers => }/sandbox/__init__.py | 2 +- .../core/{providers => }/sandbox/errors.py | 0 .../{providers => }/sandbox/event_sink.py | 0 .../sandbox/instrumentation.py | 4 +- .../core/{providers => }/sandbox/lifecycle.py | 2 +- .../core/{providers => }/sandbox/manager.py | 6 +- .../sandbox/resource_publisher.py | 0 .../core/{providers => }/sandbox/utils.py | 0 .../test_support/sandbox/stub_manager.py | 2 +- .../test_support/smoke_fixtures/sandbox.py | 2 +- .../smoke_fixtures/smoke_base/leaf_base.py | 2 +- .../smoke_fixtures/smoke_base/subworker.py | 2 +- pyproject.toml | 2 +- scripts/spike_openrouter_reasoning.py | 141 +++++ tests/integration/conftest.py | 2 +- .../minif2f/test_sandbox_manager.py | 14 +- .../minif2f/test_verification_integration.py | 2 +- .../researchrubrics/test_sandbox_manager.py | 16 +- .../sandbox/test_required_env_keys.py | 14 +- .../swebench_verified/test_sandbox_manager.py | 10 +- .../test_model_field_descriptions.py | 82 +++ .../test_swebench_sandbox_manager.py | 4 +- .../unit/builtins/test_logfire_pydantic_ai.py | 53 ++ tests/unit/builtins/test_tool_budget.py | 51 ++ .../test_criterion_runtime_reconnect.py | 2 +- .../test_failed_task_sandbox_cleanup.py | 2 +- tests/unit/runtime/test_failure_error_json.py | 41 -- .../test_inngest_criterion_executor.py | 86 +++ .../runtime/test_inngest_package_layout.py | 10 + .../test_worker_execute_output_failure.py | 12 - .../test_ensure_sandbox_idempotence.py | 6 +- .../sandbox/test_sandbox_lifecycle_service.py | 4 +- 
tests/unit/sandbox/test_sandbox_reconnect.py | 28 +- .../test_leaf_sends_completion_message.py | 2 +- .../smoke_base/test_smoke_sandbox_manager.py | 6 +- tests/unit/state/test_criterion_runtime_di.py | 2 +- tests/unit/test_dashboard_emitter_wiring.py | 2 +- 87 files changed, 1286 insertions(+), 823 deletions(-) rename ergon_core/ergon_core/core/providers/sandbox/research_rubrics_manager.py => ergon_builtins/ergon_builtins/benchmarks/researchrubrics/sandbox_manager.py (91%) create mode 100644 ergon_builtins/ergon_builtins/workers/baselines/tool_budget.py delete mode 100644 ergon_core/ergon_core/core/providers/__init__.py delete mode 100644 ergon_core/ergon_core/core/providers/generation/__init__.py delete mode 100644 ergon_core/ergon_core/core/runtime/errors/error_payload.py rename ergon_core/ergon_core/core/runtime/{inngest_client.py => inngest/client.py} (100%) rename ergon_core/ergon_core/core/runtime/{inngest_registry.py => inngest/registry.py} (100%) delete mode 100644 ergon_core/ergon_core/core/runtime/tracing.py create mode 100644 ergon_core/ergon_core/core/runtime/tracing/__init__.py create mode 100644 ergon_core/ergon_core/core/runtime/tracing/attributes.py create mode 100644 ergon_core/ergon_core/core/runtime/tracing/contexts.py create mode 100644 ergon_core/ergon_core/core/runtime/tracing/ids.py create mode 100644 ergon_core/ergon_core/core/runtime/tracing/noop.py create mode 100644 ergon_core/ergon_core/core/runtime/tracing/otel.py create mode 100644 ergon_core/ergon_core/core/runtime/tracing/sinks.py create mode 100644 ergon_core/ergon_core/core/runtime/tracing/types.py rename ergon_core/ergon_core/core/{providers => }/sandbox/__init__.py (72%) rename ergon_core/ergon_core/core/{providers => }/sandbox/errors.py (100%) rename ergon_core/ergon_core/core/{providers => }/sandbox/event_sink.py (100%) rename ergon_core/ergon_core/core/{providers => }/sandbox/instrumentation.py (98%) rename ergon_core/ergon_core/core/{providers => }/sandbox/lifecycle.py (96%) 
rename ergon_core/ergon_core/core/{providers => }/sandbox/manager.py (99%) rename ergon_core/ergon_core/core/{providers => }/sandbox/resource_publisher.py (100%) rename ergon_core/ergon_core/core/{providers => }/sandbox/utils.py (100%) create mode 100644 scripts/spike_openrouter_reasoning.py create mode 100644 tests/unit/architecture/test_model_field_descriptions.py create mode 100644 tests/unit/builtins/test_logfire_pydantic_ai.py create mode 100644 tests/unit/builtins/test_tool_budget.py create mode 100644 tests/unit/runtime/test_inngest_criterion_executor.py create mode 100644 tests/unit/runtime/test_inngest_package_layout.py delete mode 100644 tests/unit/runtime/test_worker_execute_output_failure.py diff --git a/ergon_builtins/ergon_builtins/benchmarks/gdpeval/sandbox.py b/ergon_builtins/ergon_builtins/benchmarks/gdpeval/sandbox.py index 3dfb1c39..fbcd0bdf 100644 --- a/ergon_builtins/ergon_builtins/benchmarks/gdpeval/sandbox.py +++ b/ergon_builtins/ergon_builtins/benchmarks/gdpeval/sandbox.py @@ -7,7 +7,7 @@ import logging from uuid import UUID -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager try: from e2b_code_interpreter import AsyncSandbox # type: ignore[import-untyped] diff --git a/ergon_builtins/ergon_builtins/benchmarks/gdpeval/sandbox_utils.py b/ergon_builtins/ergon_builtins/benchmarks/gdpeval/sandbox_utils.py index f3cae0bd..e7b20fd4 100644 --- a/ergon_builtins/ergon_builtins/benchmarks/gdpeval/sandbox_utils.py +++ b/ergon_builtins/ergon_builtins/benchmarks/gdpeval/sandbox_utils.py @@ -13,7 +13,7 @@ from pydantic import BaseModel, Field if TYPE_CHECKING: - from ergon_core.core.providers.sandbox.manager import BaseSandboxManager + from ergon_core.core.sandbox.manager import BaseSandboxManager logger = logging.getLogger(__name__) diff --git a/ergon_builtins/ergon_builtins/benchmarks/gdpeval/toolkit.py b/ergon_builtins/ergon_builtins/benchmarks/gdpeval/toolkit.py index 
dcae6c1b..a7346f99 100644 --- a/ergon_builtins/ergon_builtins/benchmarks/gdpeval/toolkit.py +++ b/ergon_builtins/ergon_builtins/benchmarks/gdpeval/toolkit.py @@ -25,7 +25,7 @@ ) if TYPE_CHECKING: - from ergon_core.core.providers.sandbox.manager import BaseSandboxManager + from ergon_core.core.sandbox.manager import BaseSandboxManager class QAExchange: diff --git a/ergon_builtins/ergon_builtins/benchmarks/minif2f/sandbox_manager.py b/ergon_builtins/ergon_builtins/benchmarks/minif2f/sandbox_manager.py index 7a6d42e3..e9f8c650 100644 --- a/ergon_builtins/ergon_builtins/benchmarks/minif2f/sandbox_manager.py +++ b/ergon_builtins/ergon_builtins/benchmarks/minif2f/sandbox_manager.py @@ -10,7 +10,7 @@ class ``_install_dependencies`` hook is sufficient — the verify step just import logging from uuid import UUID -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager from ergon_builtins.benchmarks.minif2f.sandbox.utils import ( REGISTRY_PATH, diff --git a/ergon_core/ergon_core/core/providers/sandbox/research_rubrics_manager.py b/ergon_builtins/ergon_builtins/benchmarks/researchrubrics/sandbox_manager.py similarity index 91% rename from ergon_core/ergon_core/core/providers/sandbox/research_rubrics_manager.py rename to ergon_builtins/ergon_builtins/benchmarks/researchrubrics/sandbox_manager.py index 553ebe11..d512b661 100644 --- a/ergon_core/ergon_core/core/providers/sandbox/research_rubrics_manager.py +++ b/ergon_builtins/ergon_builtins/benchmarks/researchrubrics/sandbox_manager.py @@ -2,7 +2,7 @@ Subclasses ``BaseSandboxManager`` to pre-install research tooling (``exa-py``) and scaffold the workspace directory layout used by the research toolkit's -skill handlers. Provides a ``publisher_for`` factory so toolkit methods can +skill handlers. Provides a ``publisher_for`` factory so toolkit methods can trigger ``SandboxResourcePublisher.sync()`` after write operations. 
""" @@ -10,8 +10,8 @@ from typing import ClassVar from uuid import UUID -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager -from ergon_core.core.providers.sandbox.resource_publisher import ( +from ergon_core.core.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.resource_publisher import ( SandboxResourcePublisher, ) @@ -36,9 +36,9 @@ class ResearchRubricsSandboxManager(BaseSandboxManager): """Singleton sandbox manager for researchrubrics benchmarks. - One ``AsyncSandbox`` per root task. ``exa-py`` is installed and the + One ``AsyncSandbox`` per root task. ``exa-py`` is installed and the workspace directory tree is scaffolded at ``create`` time via the - ``_install_dependencies`` override. ``EXA_API_KEY`` from ``settings`` + ``_install_dependencies`` override. ``EXA_API_KEY`` from ``settings`` is injected into the sandbox process env so the in-sandbox Exa skill calls (``exa_search``, ``exa_qa``, ``exa_get_content``) can authenticate. @@ -49,7 +49,7 @@ class ResearchRubricsSandboxManager(BaseSandboxManager): type_slug: ClassVar[str] = "researchrubrics" - # In-sandbox tool keys sourced from ``settings``. The base class's + # In-sandbox tool keys sourced from ``settings``. The base class's # ``_compose_envs`` helper reads ``settings.exa_api_key`` and merges # it into the ``envs`` dict threaded to ``AsyncSandbox.create``. required_env_keys: ClassVar[tuple[str, ...]] = ("EXA_API_KEY",) @@ -67,7 +67,7 @@ async def _install_dependencies( if AsyncSandbox is None: # The class-level ``try: from e2b_code_interpreter ...`` lets us # import this module when e2b isn't installed (documentation builds, - # type-only contexts). Reaching this method with no e2b means + # type-only contexts). Reaching this method with no e2b means # somebody constructed the manager without the optional dep -- # fail fast with a clear message instead of a confusing # ``NoneType is not callable`` deeper down. 
diff --git a/ergon_builtins/ergon_builtins/benchmarks/swebench_verified/sandbox_manager.py b/ergon_builtins/ergon_builtins/benchmarks/swebench_verified/sandbox_manager.py index 94764b22..8f4a7ed2 100644 --- a/ergon_builtins/ergon_builtins/benchmarks/swebench_verified/sandbox_manager.py +++ b/ergon_builtins/ergon_builtins/benchmarks/swebench_verified/sandbox_manager.py @@ -13,8 +13,8 @@ from uuid import UUID from ergon_core.core.persistence.queries import queries -from ergon_core.core.providers.sandbox.errors import SandboxSetupError -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.errors import SandboxSetupError +from ergon_core.core.sandbox.manager import BaseSandboxManager from ergon_builtins.benchmarks.swebench_verified.criterion import make_test_spec from ergon_builtins.benchmarks.swebench_verified.sandbox.utils import resolve_template diff --git a/ergon_builtins/ergon_builtins/registry.py b/ergon_builtins/ergon_builtins/registry.py index 97dc5917..b6453ce2 100644 --- a/ergon_builtins/ergon_builtins/registry.py +++ b/ergon_builtins/ergon_builtins/registry.py @@ -8,7 +8,7 @@ import structlog from ergon_core.api import Benchmark, Evaluator, Worker -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager from ergon_builtins.models.resolution import ( ResolvedModel, diff --git a/ergon_builtins/ergon_builtins/registry_core.py b/ergon_builtins/ergon_builtins/registry_core.py index 3c599194..1068c7e0 100644 --- a/ergon_builtins/ergon_builtins/registry_core.py +++ b/ergon_builtins/ergon_builtins/registry_core.py @@ -10,7 +10,7 @@ from uuid import UUID from ergon_core.api import Benchmark, Evaluator, Worker -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager from ergon_builtins.benchmarks.gdpeval.rubric import StagedRubric from 
ergon_builtins.benchmarks.gdpeval.sandbox import GDPEvalSandboxManager diff --git a/ergon_builtins/ergon_builtins/registry_data.py b/ergon_builtins/ergon_builtins/registry_data.py index 215eaf69..4f75facf 100644 --- a/ergon_builtins/ergon_builtins/registry_data.py +++ b/ergon_builtins/ergon_builtins/registry_data.py @@ -6,14 +6,14 @@ from collections.abc import Callable from ergon_core.api import Benchmark, Evaluator, Worker -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager -from ergon_core.core.providers.sandbox.research_rubrics_manager import ( - ResearchRubricsSandboxManager, -) +from ergon_core.core.sandbox.manager import BaseSandboxManager from ergon_builtins.benchmarks.gdpeval.benchmark import GDPEvalBenchmark from ergon_builtins.benchmarks.researchrubrics.benchmark import ResearchRubricsBenchmark from ergon_builtins.benchmarks.researchrubrics.rubric import ResearchRubricsRubric +from ergon_builtins.benchmarks.researchrubrics.sandbox_manager import ( + ResearchRubricsSandboxManager, +) from ergon_builtins.benchmarks.researchrubrics.vanilla import ( ResearchRubricsVanillaBenchmark, ) diff --git a/ergon_builtins/ergon_builtins/workers/baselines/tool_budget.py b/ergon_builtins/ergon_builtins/workers/baselines/tool_budget.py new file mode 100644 index 00000000..2b590ac5 --- /dev/null +++ b/ergon_builtins/ergon_builtins/workers/baselines/tool_budget.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from typing import Any, Literal + +from pydantic import BaseModel, Field + +ToolBudgetKind = Literal["workflow", "other", "finalization"] +ToolBudgetExhaustedStatus = Literal["TOOL_BUDGET_EXHAUSTED"] + + +class AgentToolBudgetExhaustedResult(BaseModel): + status: ToolBudgetExhaustedStatus = "TOOL_BUDGET_EXHAUSTED" + reason: str + message: str + budget_state: dict[str, Any] # slopcop: ignore[no-typing-any] + + +class AgentToolBudgetState(BaseModel): + max_workflow_tool_calls: int = 12 + max_other_tool_calls: int = 12 + 
workflow_tool_calls: int = 0 + other_tool_calls: int = 0 + finalization_tool_calls: int = 0 + calls_by_tool: dict[str, int] = Field(default_factory=dict) + + def increment(self, tool_name: str, kind: ToolBudgetKind) -> int: + self.calls_by_tool[tool_name] = self.calls_by_tool.get(tool_name, 0) + 1 + + if kind == "workflow": + self.workflow_tool_calls += 1 + return self.workflow_tool_calls + if kind == "finalization": + self.finalization_tool_calls += 1 + return self.finalization_tool_calls + self.other_tool_calls += 1 + return self.other_tool_calls + + def snapshot(self) -> dict[str, Any]: # slopcop: ignore[no-typing-any] + return { + "workflow_tool_calls": self.workflow_tool_calls, + "max_workflow_tool_calls": self.max_workflow_tool_calls, + "other_tool_calls": self.other_tool_calls, + "max_other_tool_calls": self.max_other_tool_calls, + "finalization_tool_calls": self.finalization_tool_calls, + "calls_by_tool": dict(sorted(self.calls_by_tool.items())), + } + + def exhausted_result(self, reason: str) -> AgentToolBudgetExhaustedResult: + return AgentToolBudgetExhaustedResult( + reason=reason, + message=( + "Stop calling tools in this category. Use the context/resources already " + "available and produce the best possible final output. If the output is " + "incomplete, state what context or resource was missing." 
+ ), + budget_state=self.snapshot(), + ) + + +class AgentToolBudgetDeps(BaseModel): + tool_budget: AgentToolBudgetState diff --git a/ergon_builtins/ergon_builtins/workers/research_rubrics/researcher_worker.py b/ergon_builtins/ergon_builtins/workers/research_rubrics/researcher_worker.py index 3c1b981a..e0fd720a 100644 --- a/ergon_builtins/ergon_builtins/workers/research_rubrics/researcher_worker.py +++ b/ergon_builtins/ergon_builtins/workers/research_rubrics/researcher_worker.py @@ -14,7 +14,7 @@ from ergon_core.core.runtime.resources import RunResourceView from ergon_core.api.task_types import BenchmarkTask from ergon_core.api.worker_context import WorkerContext -from ergon_core.core.providers.sandbox.research_rubrics_manager import ( +from ergon_builtins.benchmarks.researchrubrics.sandbox_manager import ( ResearchRubricsSandboxManager, ) diff --git a/ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py b/ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py index 7a7a3a05..dfa624d6 100644 --- a/ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py +++ b/ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py @@ -7,7 +7,7 @@ from ergon_core.core.runtime.resources import RunResourceView from ergon_core.api.task_types import BenchmarkTask from ergon_core.api.worker_context import WorkerContext -from ergon_core.core.providers.sandbox.research_rubrics_manager import ( +from ergon_builtins.benchmarks.researchrubrics.sandbox_manager import ( ResearchRubricsSandboxManager, ) diff --git a/ergon_cli/ergon_cli/commands/benchmark.py b/ergon_cli/ergon_cli/commands/benchmark.py index 72d692fb..1e0993cf 100644 --- a/ergon_cli/ergon_cli/commands/benchmark.py +++ b/ergon_cli/ergon_cli/commands/benchmark.py @@ -19,7 +19,7 @@ from ergon_core.core.persistence.shared.enums import TERMINAL_RUN_STATUSES from ergon_core.core.persistence.telemetry.models import RunRecord 
from ergon_core.core.runtime.events.task_events import WorkflowStartedEvent -from ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.runtime.services.cohort_service import experiment_cohort_service from ergon_core.core.runtime.services.run_service import create_run from ergon_core.core.settings import settings diff --git a/ergon_core/ergon_core/core/api/app.py b/ergon_core/ergon_core/core/api/app.py index ccd1b5cb..9e213661 100644 --- a/ergon_core/ergon_core/core/api/app.py +++ b/ergon_core/ergon_core/core/api/app.py @@ -28,15 +28,15 @@ from ergon_core.core.api.test_harness import router as _test_harness_router from ergon_core.core.dashboard.emitter import dashboard_emitter from ergon_core.core.persistence.shared.db import ensure_db, get_session -from ergon_core.core.providers.sandbox.event_sink import ( +from ergon_core.core.sandbox.event_sink import ( CompoundSandboxEventSink, DashboardEmitterSandboxEventSink, PostgresSandboxEventSink, ) -from ergon_core.core.providers.sandbox.manager import DefaultSandboxManager +from ergon_core.core.sandbox.manager import DefaultSandboxManager from ergon_core.core.rl.rollout_service import RolloutService -from ergon_core.core.runtime.inngest_client import inngest_client -from ergon_core.core.runtime.inngest_registry import ALL_FUNCTIONS +from ergon_core.core.runtime.inngest.client import inngest_client +from ergon_core.core.runtime.inngest.registry import ALL_FUNCTIONS from ergon_core.core.settings import Settings, settings from fastapi import FastAPI diff --git a/ergon_core/ergon_core/core/api/test_harness.py b/ergon_core/ergon_core/core/api/test_harness.py index 117e0836..336b5912 100644 --- a/ergon_core/ergon_core/core/api/test_harness.py +++ b/ergon_core/ergon_core/core/api/test_harness.py @@ -34,7 +34,7 @@ Thread, ) from ergon_core.core.runtime.events.task_events import WorkflowStartedEvent -from 
ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.runtime.services.cohort_service import experiment_cohort_service from ergon_core.core.runtime.services.run_service import create_run from fastapi import APIRouter, Depends, Header, HTTPException, status diff --git a/ergon_core/ergon_core/core/dashboard/emitter.py b/ergon_core/ergon_core/core/dashboard/emitter.py index cd91203d..6719ffe8 100644 --- a/ergon_core/ergon_core/core/dashboard/emitter.py +++ b/ergon_core/ergon_core/core/dashboard/emitter.py @@ -25,7 +25,7 @@ ) from ergon_core.core.persistence.queries import queries from ergon_core.core.runtime.events.task_events import TaskCancelledEvent -from ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.runtime.services.cohort_schemas import CohortSummaryDto from ergon_core.core.runtime.services.cohort_service import experiment_cohort_service from ergon_core.core.runtime.services.cohort_stats_service import ( diff --git a/ergon_core/ergon_core/core/providers/__init__.py b/ergon_core/ergon_core/core/providers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/ergon_core/ergon_core/core/providers/generation/__init__.py b/ergon_core/ergon_core/core/providers/generation/__init__.py deleted file mode 100644 index 765985ec..00000000 --- a/ergon_core/ergon_core/core/providers/generation/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Generation provider namespace. - -Concrete PydanticAI model resolution lives in ``ergon_builtins.models``. 
-""" diff --git a/ergon_core/ergon_core/core/runtime/errors/error_payload.py b/ergon_core/ergon_core/core/runtime/errors/error_payload.py deleted file mode 100644 index e75806d3..00000000 --- a/ergon_core/ergon_core/core/runtime/errors/error_payload.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Structured runtime error payloads for persisted execution failures.""" - -import traceback -from collections.abc import Mapping -from typing import Any - -from ergon_core.core.json_types import JsonObject -from pydantic import BaseModel, Field - - -class RuntimeErrorPayload(BaseModel): - """Persisted shape for task execution failures.""" - - message: str - exception_type: str - phase: str - stack: str - context: dict[str, str] = Field(default_factory=dict) - - -def build_error_json( - exc: BaseException, - *, - phase: str, - context: Mapping[str, Any] | None = None, -) -> JsonObject: - """Return stack-rich, queryable error details for PG persistence.""" - payload = RuntimeErrorPayload( - message=str(exc), - exception_type=type(exc).__name__, - phase=phase, - stack="".join(traceback.format_exception(type(exc), exc, exc.__traceback__)), - context={key: str(value) for key, value in (context or {}).items()}, - ) - return payload.model_dump(mode="json") diff --git a/ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py b/ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py index 545c0d9a..58716002 100644 --- a/ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py +++ b/ergon_core/ergon_core/core/runtime/evaluation/criterion_runtime.py @@ -13,8 +13,8 @@ ) from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.telemetry.models import RunResource -from ergon_core.core.providers.sandbox.errors import SandboxExpiredError -from ergon_core.core.providers.sandbox.event_sink import ( +from ergon_core.core.sandbox.errors import SandboxExpiredError +from ergon_core.core.sandbox.event_sink import ( NoopSandboxEventSink, 
SandboxEventSink, ) @@ -24,7 +24,7 @@ from sqlmodel import Session, desc, select if TYPE_CHECKING: - from ergon_core.core.providers.sandbox.manager import AsyncSandbox, BaseSandboxManager + from ergon_core.core.sandbox.manager import AsyncSandbox, BaseSandboxManager logger = logging.getLogger(__name__) diff --git a/ergon_core/ergon_core/core/runtime/evaluation/inngest_executor.py b/ergon_core/ergon_core/core/runtime/evaluation/inngest_executor.py index 85888538..3810fdd4 100644 --- a/ergon_core/ergon_core/core/runtime/evaluation/inngest_executor.py +++ b/ergon_core/ergon_core/core/runtime/evaluation/inngest_executor.py @@ -27,7 +27,7 @@ ) if TYPE_CHECKING: - from ergon_core.core.providers.sandbox.manager import BaseSandboxManager + from ergon_core.core.sandbox.manager import BaseSandboxManager class InngestCriterionExecutor: diff --git a/ergon_core/ergon_core/core/runtime/evaluation/protocols.py b/ergon_core/ergon_core/core/runtime/evaluation/protocols.py index 59c74083..79f2695a 100644 --- a/ergon_core/ergon_core/core/runtime/evaluation/protocols.py +++ b/ergon_core/ergon_core/core/runtime/evaluation/protocols.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: from sqlmodel import Session - from ergon_core.core.providers.sandbox.event_sink import SandboxEventSink + from ergon_core.core.sandbox.event_sink import SandboxEventSink from ergon_core.core.runtime.resources import RunResourceView __all__ = ["CommandResult", "CriterionRuntime", "SandboxResult"] diff --git a/ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py b/ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py index f517de7a..f504beb7 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py +++ b/ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py @@ -14,7 +14,7 @@ from ergon_core.core.runtime.errors import RegistryLookupError from ergon_core.core.runtime.events.base import InngestEventContract from ergon_core.core.runtime.events.task_events import 
WorkflowStartedEvent -from ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.runtime.services.inngest_function_results import ( BenchmarkRunStartResult, ) diff --git a/ergon_core/ergon_core/core/runtime/inngest/cancel_orphan_subtasks.py b/ergon_core/ergon_core/core/runtime/inngest/cancel_orphan_subtasks.py index 3daf00a8..a79efb7d 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/cancel_orphan_subtasks.py +++ b/ergon_core/ergon_core/core/runtime/inngest/cancel_orphan_subtasks.py @@ -21,7 +21,7 @@ TaskCancelledEvent, TaskFailedEvent, ) -from ergon_core.core.runtime.inngest_client import RUN_CANCEL, inngest_client +from ergon_core.core.runtime.inngest.client import RUN_CANCEL, inngest_client from ergon_core.core.runtime.services.subtask_blocking_service import SubtaskBlockingService from ergon_core.core.runtime.services.subtask_cancellation_service import ( SubtaskCancellationService, diff --git a/ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py b/ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py index cfee52bd..31d00364 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py +++ b/ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py @@ -9,12 +9,12 @@ import logging import inngest -from ergon_core.core.providers.sandbox.lifecycle import terminate_sandbox_by_id +from ergon_core.core.sandbox.lifecycle import terminate_sandbox_by_id from ergon_core.core.runtime.events.task_events import ( TaskCompletedEvent, ) +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.runtime.inngest.evaluate_task_run import evaluate_task_run -from ergon_core.core.runtime.inngest_client import inngest_client from ergon_core.core.runtime.services.child_function_payloads import ( EvaluateTaskRunRequest, ) diff --git a/ergon_core/ergon_core/core/runtime/inngest/cleanup_cancelled_task.py 
b/ergon_core/ergon_core/core/runtime/inngest/cleanup_cancelled_task.py index 649438c8..793bd81b 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/cleanup_cancelled_task.py +++ b/ergon_core/ergon_core/core/runtime/inngest/cleanup_cancelled_task.py @@ -13,7 +13,7 @@ from ergon_core.core.json_types import JsonObject from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.runtime.events.task_events import TaskCancelledEvent -from ergon_core.core.runtime.inngest_client import RUN_CANCEL, inngest_client +from ergon_core.core.runtime.inngest.client import RUN_CANCEL, inngest_client from ergon_core.core.runtime.services.task_cleanup_dto import CleanupResult from ergon_core.core.runtime.services.task_cleanup_service import TaskCleanupService diff --git a/ergon_core/ergon_core/core/runtime/inngest_client.py b/ergon_core/ergon_core/core/runtime/inngest/client.py similarity index 100% rename from ergon_core/ergon_core/core/runtime/inngest_client.py rename to ergon_core/ergon_core/core/runtime/inngest/client.py diff --git a/ergon_core/ergon_core/core/runtime/inngest/complete_workflow.py b/ergon_core/ergon_core/core/runtime/inngest/complete_workflow.py index 20fc5393..0f5a31ef 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/complete_workflow.py +++ b/ergon_core/ergon_core/core/runtime/inngest/complete_workflow.py @@ -10,7 +10,7 @@ from ergon_core.core.persistence.telemetry.models import RunRecord from ergon_core.core.runtime.events.infrastructure_events import RunCleanupEvent from ergon_core.core.runtime.events.task_events import WorkflowCompletedEvent -from ergon_core.core.runtime.inngest_client import RUN_CANCEL, inngest_client +from ergon_core.core.runtime.inngest.client import RUN_CANCEL, inngest_client from ergon_core.core.runtime.services.inngest_function_results import WorkflowCompleteResult from ergon_core.core.runtime.services.orchestration_dto import FinalizeWorkflowCommand from 
ergon_core.core.runtime.services.workflow_finalization_service import ( diff --git a/ergon_core/ergon_core/core/runtime/inngest/evaluate_task_run.py b/ergon_core/ergon_core/core/runtime/inngest/evaluate_task_run.py index 363afed7..aa823f13 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/evaluate_task_run.py +++ b/ergon_core/ergon_core/core/runtime/inngest/evaluate_task_run.py @@ -12,11 +12,11 @@ from ergon_core.api.task_types import BenchmarkTask, EmptyTaskPayload from ergon_core.core.dashboard.emitter import dashboard_emitter from ergon_core.core.persistence.queries import queries -from ergon_core.core.providers.sandbox.manager import DefaultSandboxManager +from ergon_core.core.sandbox.manager import DefaultSandboxManager from ergon_core.core.runtime.errors import ContractViolationError, RegistryLookupError from ergon_core.core.runtime.evaluation.evaluation_schemas import TaskEvaluationContext from ergon_core.core.runtime.evaluation.inngest_executor import InngestCriterionExecutor -from ergon_core.core.runtime.inngest_client import RUN_CANCEL, inngest_client +from ergon_core.core.runtime.inngest.client import RUN_CANCEL, inngest_client from ergon_core.core.runtime.services.child_function_payloads import ( EvaluateTaskRunRequest, ) diff --git a/ergon_core/ergon_core/core/runtime/inngest/execute_task.py b/ergon_core/ergon_core/core/runtime/inngest/execute_task.py index a5add7fc..cd63cd3d 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/execute_task.py +++ b/ergon_core/ergon_core/core/runtime/inngest/execute_task.py @@ -5,11 +5,11 @@ """ import logging +import traceback from datetime import UTC, datetime import inngest from ergon_core.core.runtime.errors import ContractViolationError -from ergon_core.core.runtime.errors.error_payload import build_error_json from ergon_core.core.runtime.events.task_events import ( TaskCompletedEvent, TaskFailedEvent, @@ -18,7 +18,7 @@ from ergon_core.core.runtime.inngest.persist_outputs import persist_outputs_fn from 
ergon_core.core.runtime.inngest.sandbox_setup import sandbox_setup_fn from ergon_core.core.runtime.inngest.worker_execute import worker_execute_fn -from ergon_core.core.runtime.inngest_client import RUN_CANCEL, TASK_CANCEL, inngest_client +from ergon_core.core.runtime.inngest.client import RUN_CANCEL, TASK_CANCEL, inngest_client from ergon_core.core.runtime.services.child_function_payloads import ( PersistOutputsRequest, SandboxSetupRequest, @@ -309,18 +309,22 @@ async def execute_task_fn(ctx: inngest.Context) -> TaskExecuteResult: run_id=payload.run_id, task_id=payload.task_id, error_message=error_msg, - error_json=build_error_json( - exc, - phase="task_execute", - context={ - "task_slug": prepared.task_slug, - "assigned_worker_slug": prepared.assigned_worker_slug, - "worker_type": prepared.worker_type, - "model_target": prepared.model_target, - "node_id": prepared.node_id, - "execution_id": prepared.execution_id, + error_json={ + "message": error_msg, + "exception_type": type(exc).__name__, + "phase": "task_execute", + "stack": "".join( + traceback.format_exception(type(exc), exc, exc.__traceback__) + ), + "context": { + "task_slug": str(prepared.task_slug), + "assigned_worker_slug": str(prepared.assigned_worker_slug), + "worker_type": str(prepared.worker_type), + "model_target": str(prepared.model_target), + "node_id": str(prepared.node_id), + "execution_id": str(prepared.execution_id), }, - ), + }, ) ) diff --git a/ergon_core/ergon_core/core/runtime/inngest/fail_workflow.py b/ergon_core/ergon_core/core/runtime/inngest/fail_workflow.py index 4dda0b84..20f3e853 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/fail_workflow.py +++ b/ergon_core/ergon_core/core/runtime/inngest/fail_workflow.py @@ -10,7 +10,7 @@ from ergon_core.core.runtime.errors import DataIntegrityError from ergon_core.core.runtime.events.infrastructure_events import RunCleanupEvent from ergon_core.core.runtime.events.task_events import WorkflowFailedEvent -from 
ergon_core.core.runtime.inngest_client import RUN_CANCEL, inngest_client +from ergon_core.core.runtime.inngest.client import RUN_CANCEL, inngest_client from ergon_core.core.runtime.services.inngest_function_results import WorkflowFailedResult from ergon_core.core.runtime.tracing import ( CompletedSpan, diff --git a/ergon_core/ergon_core/core/runtime/inngest/persist_outputs.py b/ergon_core/ergon_core/core/runtime/inngest/persist_outputs.py index d162fac2..d4747f3e 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/persist_outputs.py +++ b/ergon_core/ergon_core/core/runtime/inngest/persist_outputs.py @@ -13,13 +13,13 @@ import inngest from ergon_builtins.registry import SANDBOX_MANAGERS -from ergon_core.core.providers.sandbox.manager import ( +from ergon_core.core.sandbox.manager import ( BaseSandboxManager, DefaultSandboxManager, ) -from ergon_core.core.providers.sandbox.resource_publisher import SandboxResourcePublisher +from ergon_core.core.sandbox.resource_publisher import SandboxResourcePublisher from ergon_core.core.runtime.errors import ContractViolationError -from ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.runtime.services.child_function_payloads import PersistOutputsRequest from ergon_core.core.runtime.services.inngest_function_results import PersistOutputsResult from ergon_core.core.runtime.tracing import ( diff --git a/ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py b/ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py index 518c001b..2192bbfe 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py +++ b/ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py @@ -7,7 +7,7 @@ from datetime import UTC, datetime import inngest -from ergon_core.core.providers.sandbox.lifecycle import terminate_sandbox_by_id +from ergon_core.core.sandbox.lifecycle import terminate_sandbox_by_id from 
ergon_core.core.runtime.events.task_events import ( TaskCancelledEvent, TaskCompletedEvent, @@ -16,7 +16,7 @@ WorkflowCompletedEvent, WorkflowFailedEvent, ) -from ergon_core.core.runtime.inngest_client import RUN_CANCEL, inngest_client +from ergon_core.core.runtime.inngest.client import RUN_CANCEL, inngest_client from ergon_core.core.runtime.services.inngest_function_results import TaskPropagateResult from ergon_core.core.runtime.services.orchestration_dto import ( PropagateTaskCompletionCommand, diff --git a/ergon_core/ergon_core/core/runtime/inngest_registry.py b/ergon_core/ergon_core/core/runtime/inngest/registry.py similarity index 100% rename from ergon_core/ergon_core/core/runtime/inngest_registry.py rename to ergon_core/ergon_core/core/runtime/inngest/registry.py diff --git a/ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py b/ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py index 88a83fdc..b8d9fbf1 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py +++ b/ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py @@ -11,10 +11,10 @@ from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.shared.enums import RunStatus from ergon_core.core.persistence.telemetry.models import RunRecord -from ergon_core.core.providers.sandbox.lifecycle import terminate_sandbox_by_id +from ergon_core.core.sandbox.lifecycle import terminate_sandbox_by_id from ergon_core.core.runtime.errors import ConfigurationError, DataIntegrityError from ergon_core.core.runtime.events.infrastructure_events import RunCleanupEvent -from ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.runtime.services.inngest_function_results import RunCleanupResult logger = logging.getLogger(__name__) diff --git a/ergon_core/ergon_core/core/runtime/inngest/sandbox_setup.py b/ergon_core/ergon_core/core/runtime/inngest/sandbox_setup.py index 
6450bade..a785262a 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/sandbox_setup.py +++ b/ergon_core/ergon_core/core/runtime/inngest/sandbox_setup.py @@ -14,9 +14,9 @@ from ergon_builtins.registry import SANDBOX_MANAGERS from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.telemetry.models import RunResource -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager, DefaultSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager, DefaultSandboxManager from ergon_core.core.runtime.errors import DataIntegrityError -from ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.runtime.services.child_function_payloads import SandboxSetupRequest from ergon_core.core.runtime.services.inngest_function_results import SandboxReadyResult from ergon_core.core.runtime.tracing import ( diff --git a/ergon_core/ergon_core/core/runtime/inngest/start_workflow.py b/ergon_core/ergon_core/core/runtime/inngest/start_workflow.py index da57ea8d..040940ab 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/start_workflow.py +++ b/ergon_core/ergon_core/core/runtime/inngest/start_workflow.py @@ -14,7 +14,7 @@ TaskReadyEvent, WorkflowStartedEvent, ) -from ergon_core.core.runtime.inngest_client import RUN_CANCEL, inngest_client +from ergon_core.core.runtime.inngest.client import RUN_CANCEL, inngest_client from ergon_core.core.runtime.services.inngest_function_results import WorkflowStartResult from ergon_core.core.runtime.services.orchestration_dto import InitializeWorkflowCommand from ergon_core.core.runtime.services.workflow_initialization_service import ( diff --git a/ergon_core/ergon_core/core/runtime/inngest/worker_execute.py b/ergon_core/ergon_core/core/runtime/inngest/worker_execute.py index b29540f0..e5947b7c 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/worker_execute.py +++ 
b/ergon_core/ergon_core/core/runtime/inngest/worker_execute.py @@ -7,11 +7,11 @@ """ import logging +import traceback from datetime import UTC, datetime import inngest from ergon_builtins.registry import BENCHMARKS, WORKERS -from ergon_core.api.results import WorkerOutput from ergon_core.api.task_types import BenchmarkTask, EmptyTaskPayload from ergon_core.api.worker_context import WorkerContext from ergon_core.core.dashboard.emitter import dashboard_emitter @@ -20,8 +20,7 @@ from ergon_core.core.persistence.queries import queries from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.runtime.errors import RegistryLookupError -from ergon_core.core.runtime.errors.error_payload import build_error_json -from ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.runtime.services.child_function_payloads import WorkerExecuteRequest from ergon_core.core.runtime.services.inngest_function_results import WorkerExecuteResult from ergon_core.core.runtime.tracing import ( @@ -34,22 +33,6 @@ logger = logging.getLogger(__name__) -def _worker_execute_result_from_output(output: WorkerOutput) -> WorkerExecuteResult: - return WorkerExecuteResult( - success=output.success, - final_assistant_message=output.output, - error=None if output.success else output.output, - ) - - -def _worker_execute_result_from_exception(exc: BaseException) -> WorkerExecuteResult: - return WorkerExecuteResult( - success=False, - error=str(exc), - error_json=build_error_json(exc, phase="worker_execute"), - ) - - @inngest_client.create_function( fn_id="worker-execute", trigger=inngest.TriggerEvent(event="task/worker-execute"), @@ -146,7 +129,19 @@ async def worker_execute_fn(ctx: inngest.Context) -> WorkerExecuteResult: turn_count, error_msg, ) - return _worker_execute_result_from_exception(exc) + return WorkerExecuteResult( + success=False, + error=error_msg, + error_json={ + "message": 
error_msg, + "exception_type": type(exc).__name__, + "phase": "worker_execute", + "stack": "".join( + traceback.format_exception(type(exc), exc, exc.__traceback__) + ), + "context": {}, + }, + ) sink = get_trace_sink() sink.emit_span( @@ -173,7 +168,11 @@ async def worker_execute_fn(ctx: inngest.Context) -> WorkerExecuteResult: ) ) - return _worker_execute_result_from_output(output) + return WorkerExecuteResult( + success=output.success, + final_assistant_message=output.output, + error=None if output.success else output.output, + ) async def _persist_context_events( diff --git a/ergon_core/ergon_core/core/runtime/services/run_service.py b/ergon_core/ergon_core/core/runtime/services/run_service.py index 0d7679bb..bf84f42d 100644 --- a/ergon_core/ergon_core/core/runtime/services/run_service.py +++ b/ergon_core/ergon_core/core/runtime/services/run_service.py @@ -15,7 +15,7 @@ RunCleanupEvent, ) from ergon_core.core.runtime.events.task_events import WorkflowStartedEvent -from ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.settings import settings from ergon_core.core.utils import utcnow diff --git a/ergon_core/ergon_core/core/runtime/services/task_management_service.py b/ergon_core/ergon_core/core/runtime/services/task_management_service.py index c6d72e36..84e84871 100644 --- a/ergon_core/ergon_core/core/runtime/services/task_management_service.py +++ b/ergon_core/ergon_core/core/runtime/services/task_management_service.py @@ -35,7 +35,7 @@ TaskCancelledEvent, TaskReadyEvent, ) -from ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.runtime.services.graph_dto import MutationMeta from ergon_core.core.runtime.services.graph_repository import WorkflowGraphRepository from ergon_core.core.runtime.services.task_management_dto import ( diff --git 
a/ergon_core/ergon_core/core/runtime/services/workflow_service.py b/ergon_core/ergon_core/core/runtime/services/workflow_service.py index 11fa2a0c..dd6008d0 100644 --- a/ergon_core/ergon_core/core/runtime/services/workflow_service.py +++ b/ergon_core/ergon_core/core/runtime/services/workflow_service.py @@ -15,7 +15,7 @@ RunResource, RunTaskExecution, ) -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager, DefaultSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager, DefaultSandboxManager from ergon_core.core.runtime.services.task_management_dto import ( AddSubtaskCommand, AddSubtaskResult, diff --git a/ergon_core/ergon_core/core/runtime/tracing.py b/ergon_core/ergon_core/core/runtime/tracing.py deleted file mode 100644 index 927e7d60..00000000 --- a/ergon_core/ergon_core/core/runtime/tracing.py +++ /dev/null @@ -1,572 +0,0 @@ -"""Tracing facade. - -Defines the TraceSink protocol and data classes that the runtime uses -to emit structured spans. The default sink is NoopTraceSink (discards -everything). When a real backend is wired in (OtelTraceSink), swap the -singleton returned by get_trace_sink(). - -Context factories at the bottom produce deterministic TraceContext -objects from run/task/execution/evaluator UUIDs so span trees are -reproducible across replays. 
- -Target span hierarchy (one trace per run, keyed by run_id):: - - workflow.execute (synthetic root) - │ cohort_id, instance_count - ├── workflow.start - ├── task.execute (per task) - │ instance_key - │ ├── sandbox.setup - │ ├── worker.execute - │ │ └── tool.{tool_name} (per tool call in GenerationTurn) - │ │ turn_index, tool_name, tool_call_id, has_result - │ ├── persist.outputs - │ │ resource_ids - │ └── evaluation.task (per evaluator) - │ └── evaluation.criterion (per criterion) - ├── task.propagate (per completion) - ├── communication.message (per ThreadMessage, optional) - │ thread_id, from_agent_id, to_agent_id, sequence_num - └── workflow.complete OR workflow.failed - -Every span stores relational IDs (run_id, task_id, execution_id, -evaluator_id) for PG lookup — not payload copies. -See otel_tracing_v2.md for full attribute schemas per span. -""" - -import hashlib -import json -import random -from contextlib import contextmanager -from contextvars import ContextVar -from datetime import UTC, datetime -from typing import Protocol -from uuid import UUID - -from opentelemetry import trace as otel_trace -from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor - -try: - from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter -except ImportError: - OTLPSpanExporter = None # type: ignore[assignment,misc] -from ergon_core.core.json_types import JsonObject, JsonValue -from ergon_core.core.settings import settings -from opentelemetry.trace import ( - NonRecordingSpan, - SpanContext, - Status, - StatusCode, - TraceFlags, -) -from opentelemetry.trace.propagation import set_span_in_context -from opentelemetry.trace.span import TraceState -from pydantic import BaseModel, Field - -TRACE_FLAGS_SAMPLED = 0x01 -_MAX_TRACE_ID = (1 << 128) - 1 -_MAX_SPAN_ID = (1 << 64) - 1 -_EMPTY_SPAN_ID = 0 - -_desired_trace_id: ContextVar[int | None] = 
ContextVar("desired_trace_id", default=None) -_desired_span_id: ContextVar[int | None] = ContextVar("desired_span_id", default=None) - -# --------------------------------------------------------------------------- -# Data classes -# --------------------------------------------------------------------------- - - -class TraceContext(BaseModel): - model_config = {"frozen": True} - - trace_id: int - span_id: int - parent_span_id: int | None = None - run_id: UUID | None = None - task_id: UUID | None = None - execution_id: UUID | None = None - evaluator_id: UUID | None = None - attributes: JsonObject = Field(default_factory=dict) - - -class SpanEvent(BaseModel): - model_config = {"frozen": True} - - name: str - timestamp: datetime - attributes: JsonObject = Field(default_factory=dict) - - -class CompletedSpan(BaseModel): - model_config = {"frozen": True} - - name: str - context: TraceContext - start_time: datetime - end_time: datetime - attributes: JsonObject = Field(default_factory=dict) - status_code: int | str = 0 - status_message: str | None = None - events: list[SpanEvent] = Field(default_factory=list) - - -# --------------------------------------------------------------------------- -# TraceSink protocol + noop implementation -# --------------------------------------------------------------------------- - - -class TraceSink(Protocol): - def emit_span(self, span: CompletedSpan) -> None: ... - - def add_event( - self, - context: TraceContext, - name: str, - attributes: JsonObject | None = None, - timestamp: datetime | None = None, - ) -> None: ... - - def child_context( - self, - parent: TraceContext, - *, - span_key: str, - run_id: UUID | None = None, - task_id: UUID | None = None, - execution_id: UUID | None = None, - evaluator_id: UUID | None = None, - attributes: JsonObject | None = None, - ) -> TraceContext: ... - - -class NoopTraceSink: - """Default sink that discards everything. 
Zero overhead.""" - - def emit_span(self, span: CompletedSpan) -> None: - pass - - def add_event( - self, - context: TraceContext, - name: str, - attributes: JsonObject | None = None, - timestamp: datetime | None = None, - ) -> None: - pass - - def child_context( - self, - parent: TraceContext, - *, - span_key: str, - run_id: UUID | None = None, - task_id: UUID | None = None, - execution_id: UUID | None = None, - evaluator_id: UUID | None = None, - attributes: JsonObject | None = None, - ) -> TraceContext: - child_span = span_id_from_key(str(parent.span_id), span_key) - return TraceContext( - trace_id=parent.trace_id, - span_id=child_span, - parent_span_id=parent.span_id, - run_id=parent.run_id if run_id is None else run_id, - task_id=parent.task_id if task_id is None else task_id, - execution_id=parent.execution_id if execution_id is None else execution_id, - evaluator_id=parent.evaluator_id if evaluator_id is None else evaluator_id, - attributes={} if attributes is None else attributes, - ) - - -# --------------------------------------------------------------------------- -# Attribute helpers -# --------------------------------------------------------------------------- - - -def truncate_text(value: str | None, max_length: int | None = None) -> str | None: - if value is None: - return None - limit = max_length or settings.otel_max_attribute_length - if len(value) <= limit: - return value - return f"{value[:limit]}...[truncated]" - - -def safe_json_attribute(value: JsonValue, max_length: int | None = None) -> str: - try: - serialized = json.dumps(value, default=str, separators=(",", ":")) - except (TypeError, ValueError): - serialized = str(value) - return truncate_text(serialized, max_length=max_length) or "" - - -def normalize_attributes(attributes: JsonObject | None) -> JsonObject: - if not attributes: - return {} - normalized: JsonObject = {} - for key, value in attributes.items(): - if value is None: - continue - if isinstance(value, (bool, int, float)): - 
normalized[key] = value - elif isinstance(value, str): - normalized[key] = truncate_text(value) - else: - normalized[key] = safe_json_attribute(value) - return normalized - - -def datetime_to_nanos(value: datetime) -> int: - if value.tzinfo is None: - value = value.replace(tzinfo=UTC) - return int(value.timestamp() * 1_000_000_000) - - -# --------------------------------------------------------------------------- -# Deterministic ID helpers -# --------------------------------------------------------------------------- - - -def trace_id_from_run_id(run_id: UUID) -> int: - """Derive a deterministic 128-bit trace ID from a run UUID.""" - return int(run_id.hex, 16) & _MAX_TRACE_ID - - -def span_id_from_key(*parts: str) -> int: - """Derive a deterministic 64-bit span ID from arbitrary string parts.""" - digest = hashlib.sha256(":".join(parts).encode()).digest()[:8] - return int.from_bytes(digest, "big") & _MAX_SPAN_ID or 1 - - -class DeterministicIdGenerator: - """OTEL ID generator that supports one-shot deterministic overrides.""" - - def generate_trace_id(self) -> int: - override = _desired_trace_id.get() - if override is not None: - return override - return random.getrandbits(128) - - def generate_span_id(self) -> int: - override = _desired_span_id.get() - if override is not None: - return override - return random.getrandbits(64) or 1 - - -@contextmanager -def _id_override(trace_id: int | None = None, span_id: int | None = None): - trace_token = _desired_trace_id.set(trace_id) if trace_id is not None else None - span_token = _desired_span_id.set(span_id) if span_id is not None else None - try: - yield - finally: - if span_token is not None: - _desired_span_id.reset(span_token) - if trace_token is not None: - _desired_trace_id.reset(trace_token) - - -# --------------------------------------------------------------------------- -# OtelTraceSink -# --------------------------------------------------------------------------- - - -class OtelTraceSink: - """OTEL-backed sink 
that exports spans via OTLP/gRPC.""" - - def __init__(self) -> None: - provider = TracerProvider( - resource=Resource.create({"service.name": settings.otel_service_name}), - id_generator=DeterministicIdGenerator(), - ) - exporter = OTLPSpanExporter( - endpoint=settings.otel_exporter_otlp_endpoint, - insecure=settings.otel_exporter_otlp_insecure, - ) - provider.add_span_processor(BatchSpanProcessor(exporter)) - otel_trace.set_tracer_provider(provider) - - self._provider: TracerProvider = provider - self._tracer = otel_trace.get_tracer(settings.otel_service_name) - - def child_context( - self, - parent: TraceContext, - *, - span_key: str, - run_id: UUID | None = None, - task_id: UUID | None = None, - execution_id: UUID | None = None, - evaluator_id: UUID | None = None, - attributes: JsonObject | None = None, - ) -> TraceContext: - return TraceContext( - trace_id=parent.trace_id, - span_id=span_id_from_key(str(parent.trace_id), str(parent.span_id), span_key), - parent_span_id=parent.span_id, - run_id=run_id if run_id is not None else parent.run_id, - task_id=task_id if task_id is not None else parent.task_id, - execution_id=execution_id if execution_id is not None else parent.execution_id, - evaluator_id=evaluator_id if evaluator_id is not None else parent.evaluator_id, - attributes=attributes or {}, - ) - - def add_event( - self, - context: TraceContext, - name: str, - attributes: JsonObject | None = None, - timestamp: datetime | None = None, - ) -> None: - now = timestamp or datetime.now(UTC) - span = CompletedSpan( - name=f"{name}.event", - context=context, - start_time=now, - end_time=now, - attributes=attributes or {}, - events=[SpanEvent(name=name, timestamp=now, attributes=attributes or {})], - ) - self.emit_span(span) - - def emit_span(self, span: CompletedSpan) -> None: - parent_ctx = None - if span.context.parent_span_id not in (None, _EMPTY_SPAN_ID): - span_context = SpanContext( - trace_id=span.context.trace_id, - span_id=span.context.parent_span_id, - 
is_remote=False, - trace_flags=TraceFlags(TRACE_FLAGS_SAMPLED), - trace_state=TraceState(), - ) - parent_ctx = set_span_in_context(NonRecordingSpan(span_context)) - - start_time = datetime_to_nanos(span.start_time) - end_time = datetime_to_nanos(span.end_time) - attrs = normalize_attributes({**span.context.attributes, **span.attributes}) - - with _id_override( - trace_id=span.context.trace_id if span.context.parent_span_id is None else None, - span_id=span.context.span_id, - ): - sdk_span = self._tracer.start_span( - span.name, - context=parent_ctx, - attributes=attrs, - start_time=start_time, - ) - - if str(span.status_code).lower() == "error": - sdk_span.set_status(Status(StatusCode.ERROR, span.status_message)) - else: - sdk_span.set_status(Status(StatusCode.OK)) - - for event in span.events: - sdk_span.add_event( - event.name, - attributes=normalize_attributes(event.attributes), - timestamp=datetime_to_nanos(event.timestamp), - ) - - sdk_span.end(end_time=end_time) - - -# --------------------------------------------------------------------------- -# Process-wide sink -# --------------------------------------------------------------------------- - - -def _create_sink() -> TraceSink: - if not settings.otel_traces_enabled: - return NoopTraceSink() - # The operator explicitly opted in to OTEL. Refuse to silently downgrade - # to a no-op sink — that has caused real "where are my traces?" debugging - # sessions. Surface the construction error so misconfiguration is loud. - return OtelTraceSink() - - -_sink: TraceSink = _create_sink() - - -def get_trace_sink() -> TraceSink: - """Return the process-wide trace sink. - - Each process (uvicorn worker, CLI invocation, test runner) gets its own - sink created at import time. No locking needed — OTEL is stateless - per-process and the collector handles fan-in from multiple exporters. 
- """ - return _sink - - -# --------------------------------------------------------------------------- -# Context factories -# --------------------------------------------------------------------------- - - -def workflow_root_context(run_id: UUID) -> TraceContext: - tid = trace_id_from_run_id(run_id) - return TraceContext( - trace_id=tid, - span_id=span_id_from_key("workflow", str(run_id)), - run_id=run_id, - ) - - -def workflow_start_context(run_id: UUID) -> TraceContext: - root = workflow_root_context(run_id) - return TraceContext( - trace_id=root.trace_id, - span_id=span_id_from_key("workflow_start", str(run_id)), - parent_span_id=root.span_id, - run_id=run_id, - ) - - -def task_execute_context(run_id: UUID, task_id: UUID) -> TraceContext: - root = workflow_root_context(run_id) - return TraceContext( - trace_id=root.trace_id, - span_id=span_id_from_key("task_execute", str(run_id), str(task_id)), - parent_span_id=root.span_id, - run_id=run_id, - task_id=task_id, - ) - - -def sandbox_setup_context(run_id: UUID, task_id: UUID) -> TraceContext: - parent = task_execute_context(run_id, task_id) - return TraceContext( - trace_id=parent.trace_id, - span_id=span_id_from_key("sandbox_setup", str(run_id), str(task_id)), - parent_span_id=parent.span_id, - run_id=run_id, - task_id=task_id, - ) - - -def worker_execute_context( - run_id: UUID, - task_id: UUID, - execution_id: UUID, -) -> TraceContext: - parent = task_execute_context(run_id, task_id) - return TraceContext( - trace_id=parent.trace_id, - span_id=span_id_from_key( - "worker_execute", - str(run_id), - str(task_id), - str(execution_id), - ), - parent_span_id=parent.span_id, - run_id=run_id, - task_id=task_id, - execution_id=execution_id, - ) - - -def persist_outputs_context( - run_id: UUID, - task_id: UUID, - execution_id: UUID, -) -> TraceContext: - parent = task_execute_context(run_id, task_id) - return TraceContext( - trace_id=parent.trace_id, - span_id=span_id_from_key( - "persist_outputs", - str(run_id), - 
str(task_id), - str(execution_id), - ), - parent_span_id=parent.span_id, - run_id=run_id, - task_id=task_id, - execution_id=execution_id, - ) - - -def task_propagate_context(run_id: UUID, task_id: UUID) -> TraceContext: - root = workflow_root_context(run_id) - return TraceContext( - trace_id=root.trace_id, - span_id=span_id_from_key("task_propagate", str(run_id), str(task_id)), - parent_span_id=root.span_id, - run_id=run_id, - task_id=task_id, - ) - - -def workflow_complete_context(run_id: UUID) -> TraceContext: - root = workflow_root_context(run_id) - return TraceContext( - trace_id=root.trace_id, - span_id=span_id_from_key("workflow_complete", str(run_id)), - parent_span_id=root.span_id, - run_id=run_id, - ) - - -def workflow_failed_context(run_id: UUID) -> TraceContext: - root = workflow_root_context(run_id) - return TraceContext( - trace_id=root.trace_id, - span_id=span_id_from_key("workflow_failed", str(run_id)), - parent_span_id=root.span_id, - run_id=run_id, - ) - - -def evaluation_task_context( - run_id: UUID, - task_id: UUID, - execution_id: UUID, - evaluator_id: UUID, -) -> TraceContext: - parent = task_execute_context(run_id, task_id) - return TraceContext( - trace_id=parent.trace_id, - span_id=span_id_from_key( - "evaluation_task", - str(run_id), - str(task_id), - str(execution_id), - str(evaluator_id), - ), - parent_span_id=parent.span_id, - run_id=run_id, - task_id=task_id, - execution_id=execution_id, - evaluator_id=evaluator_id, - ) - - -def evaluation_criterion_context( - run_id: UUID, - task_id: UUID, - execution_id: UUID, - evaluator_id: UUID, - stage_idx: int, - criterion_idx: int, -) -> TraceContext: - parent = evaluation_task_context(run_id, task_id, execution_id, evaluator_id) - return TraceContext( - trace_id=parent.trace_id, - span_id=span_id_from_key( - "evaluation_criterion", - str(run_id), - str(task_id), - str(execution_id), - str(evaluator_id), - str(stage_idx), - str(criterion_idx), - ), - parent_span_id=parent.span_id, - 
run_id=run_id, - task_id=task_id, - execution_id=execution_id, - evaluator_id=evaluator_id, - ) diff --git a/ergon_core/ergon_core/core/runtime/tracing/__init__.py b/ergon_core/ergon_core/core/runtime/tracing/__init__.py new file mode 100644 index 00000000..82db1111 --- /dev/null +++ b/ergon_core/ergon_core/core/runtime/tracing/__init__.py @@ -0,0 +1,85 @@ +"""Tracing facade. + +The runtime emits structured spans through this package while keeping the +existing public import path stable: + + from ergon_core.core.runtime.tracing import get_trace_sink + +Target span hierarchy (one trace per run, keyed by run_id):: + + workflow.execute (synthetic root) + | cohort_id, instance_count + +-- workflow.start + +-- task.execute (per task) + | instance_key + | +-- sandbox.setup + | +-- worker.execute + | | +-- tool.{tool_name} (per tool call in GenerationTurn) + | +-- persist.outputs + | +-- evaluation.task (per evaluator) + | +-- evaluation.criterion (per criterion) + +-- task.propagate (per completion) + +-- communication.message (per ThreadMessage, optional) + +-- workflow.complete OR workflow.failed + +Every span stores relational IDs (run_id, task_id, execution_id, evaluator_id) +for PG lookup, not payload copies. See otel_tracing_v2.md for full attribute +schemas per span. 
+""" + +from ergon_core.core.runtime.tracing.attributes import ( + datetime_to_nanos, + normalize_attributes, + safe_json_attribute, + truncate_text, +) +from ergon_core.core.runtime.tracing.contexts import ( + evaluation_criterion_context, + evaluation_task_context, + persist_outputs_context, + sandbox_setup_context, + task_execute_context, + task_propagate_context, + workflow_complete_context, + workflow_failed_context, + workflow_root_context, + workflow_start_context, + worker_execute_context, +) +from ergon_core.core.runtime.tracing.ids import ( + DeterministicIdGenerator, + span_id_from_key, + trace_id_from_run_id, +) +from ergon_core.core.runtime.tracing.noop import NoopTraceSink +from ergon_core.core.runtime.tracing.otel import OtelTraceSink +from ergon_core.core.runtime.tracing.sinks import get_trace_sink +from ergon_core.core.runtime.tracing.types import CompletedSpan, SpanEvent, TraceContext, TraceSink + +__all__ = [ + "CompletedSpan", + "DeterministicIdGenerator", + "NoopTraceSink", + "OtelTraceSink", + "SpanEvent", + "TraceContext", + "TraceSink", + "datetime_to_nanos", + "evaluation_criterion_context", + "evaluation_task_context", + "get_trace_sink", + "normalize_attributes", + "persist_outputs_context", + "safe_json_attribute", + "sandbox_setup_context", + "span_id_from_key", + "task_execute_context", + "task_propagate_context", + "trace_id_from_run_id", + "truncate_text", + "workflow_complete_context", + "workflow_failed_context", + "workflow_root_context", + "workflow_start_context", + "worker_execute_context", +] diff --git a/ergon_core/ergon_core/core/runtime/tracing/attributes.py b/ergon_core/ergon_core/core/runtime/tracing/attributes.py new file mode 100644 index 00000000..1775b2cd --- /dev/null +++ b/ergon_core/ergon_core/core/runtime/tracing/attributes.py @@ -0,0 +1,46 @@ +"""Helpers for serializing values into OTEL-safe attributes.""" + +import json +from datetime import UTC, datetime + +from ergon_core.core.json_types import JsonObject, 
JsonValue +from ergon_core.core.settings import settings + + +def truncate_text(value: str | None, max_length: int | None = None) -> str | None: + if value is None: + return None + limit = max_length or settings.otel_max_attribute_length + if len(value) <= limit: + return value + return f"{value[:limit]}...[truncated]" + + +def safe_json_attribute(value: JsonValue, max_length: int | None = None) -> str: + try: + serialized = json.dumps(value, default=str, separators=(",", ":")) + except (TypeError, ValueError): + serialized = str(value) + return truncate_text(serialized, max_length=max_length) or "" + + +def normalize_attributes(attributes: JsonObject | None) -> JsonObject: + if not attributes: + return {} + normalized: JsonObject = {} + for key, value in attributes.items(): + if value is None: + continue + if isinstance(value, (bool, int, float)): + normalized[key] = value + elif isinstance(value, str): + normalized[key] = truncate_text(value) + else: + normalized[key] = safe_json_attribute(value) + return normalized + + +def datetime_to_nanos(value: datetime) -> int: + if value.tzinfo is None: + value = value.replace(tzinfo=UTC) + return int(value.timestamp() * 1_000_000_000) diff --git a/ergon_core/ergon_core/core/runtime/tracing/contexts.py b/ergon_core/ergon_core/core/runtime/tracing/contexts.py new file mode 100644 index 00000000..baa01720 --- /dev/null +++ b/ergon_core/ergon_core/core/runtime/tracing/contexts.py @@ -0,0 +1,177 @@ +"""Runtime trace context factories. + +Context factories produce deterministic ``TraceContext`` objects from +run/task/execution/evaluator UUIDs so span trees are reproducible across +replays. 
+""" + +from uuid import UUID + +from ergon_core.core.runtime.tracing.ids import span_id_from_key, trace_id_from_run_id +from ergon_core.core.runtime.tracing.types import TraceContext + + +def workflow_root_context(run_id: UUID) -> TraceContext: + tid = trace_id_from_run_id(run_id) + return TraceContext( + trace_id=tid, + span_id=span_id_from_key("workflow", str(run_id)), + run_id=run_id, + ) + + +def workflow_start_context(run_id: UUID) -> TraceContext: + root = workflow_root_context(run_id) + return TraceContext( + trace_id=root.trace_id, + span_id=span_id_from_key("workflow_start", str(run_id)), + parent_span_id=root.span_id, + run_id=run_id, + ) + + +def task_execute_context(run_id: UUID, task_id: UUID) -> TraceContext: + root = workflow_root_context(run_id) + return TraceContext( + trace_id=root.trace_id, + span_id=span_id_from_key("task_execute", str(run_id), str(task_id)), + parent_span_id=root.span_id, + run_id=run_id, + task_id=task_id, + ) + + +def sandbox_setup_context(run_id: UUID, task_id: UUID) -> TraceContext: + parent = task_execute_context(run_id, task_id) + return TraceContext( + trace_id=parent.trace_id, + span_id=span_id_from_key("sandbox_setup", str(run_id), str(task_id)), + parent_span_id=parent.span_id, + run_id=run_id, + task_id=task_id, + ) + + +def worker_execute_context( + run_id: UUID, + task_id: UUID, + execution_id: UUID, +) -> TraceContext: + parent = task_execute_context(run_id, task_id) + return TraceContext( + trace_id=parent.trace_id, + span_id=span_id_from_key( + "worker_execute", + str(run_id), + str(task_id), + str(execution_id), + ), + parent_span_id=parent.span_id, + run_id=run_id, + task_id=task_id, + execution_id=execution_id, + ) + + +def persist_outputs_context( + run_id: UUID, + task_id: UUID, + execution_id: UUID, +) -> TraceContext: + parent = task_execute_context(run_id, task_id) + return TraceContext( + trace_id=parent.trace_id, + span_id=span_id_from_key( + "persist_outputs", + str(run_id), + str(task_id), + 
str(execution_id), + ), + parent_span_id=parent.span_id, + run_id=run_id, + task_id=task_id, + execution_id=execution_id, + ) + + +def task_propagate_context(run_id: UUID, task_id: UUID) -> TraceContext: + root = workflow_root_context(run_id) + return TraceContext( + trace_id=root.trace_id, + span_id=span_id_from_key("task_propagate", str(run_id), str(task_id)), + parent_span_id=root.span_id, + run_id=run_id, + task_id=task_id, + ) + + +def workflow_complete_context(run_id: UUID) -> TraceContext: + root = workflow_root_context(run_id) + return TraceContext( + trace_id=root.trace_id, + span_id=span_id_from_key("workflow_complete", str(run_id)), + parent_span_id=root.span_id, + run_id=run_id, + ) + + +def workflow_failed_context(run_id: UUID) -> TraceContext: + root = workflow_root_context(run_id) + return TraceContext( + trace_id=root.trace_id, + span_id=span_id_from_key("workflow_failed", str(run_id)), + parent_span_id=root.span_id, + run_id=run_id, + ) + + +def evaluation_task_context( + run_id: UUID, + task_id: UUID, + execution_id: UUID, + evaluator_id: UUID, +) -> TraceContext: + parent = task_execute_context(run_id, task_id) + return TraceContext( + trace_id=parent.trace_id, + span_id=span_id_from_key( + "evaluation_task", + str(run_id), + str(task_id), + str(execution_id), + str(evaluator_id), + ), + parent_span_id=parent.span_id, + run_id=run_id, + task_id=task_id, + execution_id=execution_id, + evaluator_id=evaluator_id, + ) + + +def evaluation_criterion_context( + run_id: UUID, + task_id: UUID, + execution_id: UUID, + evaluator_id: UUID, + stage_idx: int, + criterion_idx: int, +) -> TraceContext: + parent = evaluation_task_context(run_id, task_id, execution_id, evaluator_id) + return TraceContext( + trace_id=parent.trace_id, + span_id=span_id_from_key( + "evaluation_criterion", + str(run_id), + str(task_id), + str(execution_id), + str(evaluator_id), + str(stage_idx), + str(criterion_idx), + ), + parent_span_id=parent.span_id, + run_id=run_id, + 
task_id=task_id, + execution_id=execution_id, + evaluator_id=evaluator_id, + ) diff --git a/ergon_core/ergon_core/core/runtime/tracing/ids.py b/ergon_core/ergon_core/core/runtime/tracing/ids.py new file mode 100644 index 00000000..d01d9c0f --- /dev/null +++ b/ergon_core/ergon_core/core/runtime/tracing/ids.py @@ -0,0 +1,56 @@ +"""Deterministic trace and span ID helpers.""" + +import hashlib +import random +from contextlib import contextmanager +from contextvars import ContextVar +from typing import Iterator +from uuid import UUID + +TRACE_FLAGS_SAMPLED = 0x01 +MAX_TRACE_ID = (1 << 128) - 1 +MAX_SPAN_ID = (1 << 64) - 1 +EMPTY_SPAN_ID = 0 + +_desired_trace_id: ContextVar[int | None] = ContextVar("desired_trace_id", default=None) +_desired_span_id: ContextVar[int | None] = ContextVar("desired_span_id", default=None) + + +def trace_id_from_run_id(run_id: UUID) -> int: + """Derive a deterministic 128-bit trace ID from a run UUID.""" + return int(run_id.hex, 16) & MAX_TRACE_ID + + +def span_id_from_key(*parts: str) -> int: + """Derive a deterministic 64-bit span ID from arbitrary string parts.""" + digest = hashlib.sha256(":".join(parts).encode()).digest()[:8] + return int.from_bytes(digest, "big") & MAX_SPAN_ID or 1 + + +class DeterministicIdGenerator: + """OTEL ID generator that supports one-shot deterministic overrides.""" + + def generate_trace_id(self) -> int: + override = _desired_trace_id.get() + if override is not None: + return override + return random.getrandbits(128) + + def generate_span_id(self) -> int: + override = _desired_span_id.get() + if override is not None: + return override + return random.getrandbits(64) or 1 + + +@contextmanager +def id_override(trace_id: int | None = None, span_id: int | None = None) -> Iterator[None]: + trace_token = _desired_trace_id.set(trace_id) if trace_id is not None else None + span_token = _desired_span_id.set(span_id) if span_id is not None else None + try: + yield + finally: + if span_token is not None: + 
_desired_span_id.reset(span_token) + if trace_token is not None: + _desired_trace_id.reset(trace_token) diff --git a/ergon_core/ergon_core/core/runtime/tracing/noop.py b/ergon_core/ergon_core/core/runtime/tracing/noop.py new file mode 100644 index 00000000..b18809b8 --- /dev/null +++ b/ergon_core/ergon_core/core/runtime/tracing/noop.py @@ -0,0 +1,47 @@ +"""No-op tracing sink.""" + +from datetime import datetime +from uuid import UUID + +from ergon_core.core.json_types import JsonObject +from ergon_core.core.runtime.tracing.ids import span_id_from_key +from ergon_core.core.runtime.tracing.types import CompletedSpan, TraceContext + + +class NoopTraceSink: + """Default sink that discards everything.""" + + def emit_span(self, span: CompletedSpan) -> None: + pass + + def add_event( + self, + context: TraceContext, + name: str, + attributes: JsonObject | None = None, + timestamp: datetime | None = None, + ) -> None: + pass + + def child_context( + self, + parent: TraceContext, + *, + span_key: str, + run_id: UUID | None = None, + task_id: UUID | None = None, + execution_id: UUID | None = None, + evaluator_id: UUID | None = None, + attributes: JsonObject | None = None, + ) -> TraceContext: + child_span = span_id_from_key(str(parent.span_id), span_key) + return TraceContext( + trace_id=parent.trace_id, + span_id=child_span, + parent_span_id=parent.span_id, + run_id=parent.run_id if run_id is None else run_id, + task_id=parent.task_id if task_id is None else task_id, + execution_id=parent.execution_id if execution_id is None else execution_id, + evaluator_id=parent.evaluator_id if evaluator_id is None else evaluator_id, + attributes={} if attributes is None else attributes, + ) diff --git a/ergon_core/ergon_core/core/runtime/tracing/otel.py b/ergon_core/ergon_core/core/runtime/tracing/otel.py new file mode 100644 index 00000000..51b66dc4 --- /dev/null +++ b/ergon_core/ergon_core/core/runtime/tracing/otel.py @@ -0,0 +1,135 @@ +"""OpenTelemetry tracing sink.""" + +from 
datetime import UTC, datetime +from uuid import UUID + +from opentelemetry import trace as otel_trace +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.trace import ( + NonRecordingSpan, + SpanContext, + Status, + StatusCode, + TraceFlags, +) +from opentelemetry.trace.propagation import set_span_in_context +from opentelemetry.trace.span import TraceState + +try: + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +except ImportError: + OTLPSpanExporter = None # type: ignore[assignment,misc] + +from ergon_core.core.json_types import JsonObject +from ergon_core.core.runtime.tracing.attributes import datetime_to_nanos, normalize_attributes +from ergon_core.core.runtime.tracing.ids import ( + EMPTY_SPAN_ID, + TRACE_FLAGS_SAMPLED, + DeterministicIdGenerator, + id_override, + span_id_from_key, +) +from ergon_core.core.runtime.tracing.types import CompletedSpan, SpanEvent, TraceContext +from ergon_core.core.settings import settings + + +class OtelTraceSink: + """OTEL-backed sink that exports spans via OTLP/gRPC.""" + + def __init__(self) -> None: + provider = TracerProvider( + resource=Resource.create({"service.name": settings.otel_service_name}), + id_generator=DeterministicIdGenerator(), + ) + exporter = OTLPSpanExporter( + endpoint=settings.otel_exporter_otlp_endpoint, + insecure=settings.otel_exporter_otlp_insecure, + ) + provider.add_span_processor(BatchSpanProcessor(exporter)) + otel_trace.set_tracer_provider(provider) + + self._provider: TracerProvider = provider + self._tracer = otel_trace.get_tracer(settings.otel_service_name) + + def child_context( + self, + parent: TraceContext, + *, + span_key: str, + run_id: UUID | None = None, + task_id: UUID | None = None, + execution_id: UUID | None = None, + evaluator_id: UUID | None = None, + attributes: JsonObject | None = None, + ) -> TraceContext: + 
return TraceContext( + trace_id=parent.trace_id, + span_id=span_id_from_key(str(parent.trace_id), str(parent.span_id), span_key), + parent_span_id=parent.span_id, + run_id=run_id if run_id is not None else parent.run_id, + task_id=task_id if task_id is not None else parent.task_id, + execution_id=execution_id if execution_id is not None else parent.execution_id, + evaluator_id=evaluator_id if evaluator_id is not None else parent.evaluator_id, + attributes=attributes or {}, + ) + + def add_event( + self, + context: TraceContext, + name: str, + attributes: JsonObject | None = None, + timestamp: datetime | None = None, + ) -> None: + now = timestamp or datetime.now(UTC) + span = CompletedSpan( + name=f"{name}.event", + context=context, + start_time=now, + end_time=now, + attributes=attributes or {}, + events=[SpanEvent(name=name, timestamp=now, attributes=attributes or {})], + ) + self.emit_span(span) + + def emit_span(self, span: CompletedSpan) -> None: + parent_ctx = None + if span.context.parent_span_id not in (None, EMPTY_SPAN_ID): + span_context = SpanContext( + trace_id=span.context.trace_id, + span_id=span.context.parent_span_id, + is_remote=False, + trace_flags=TraceFlags(TRACE_FLAGS_SAMPLED), + trace_state=TraceState(), + ) + parent_ctx = set_span_in_context(NonRecordingSpan(span_context)) + + start_time = datetime_to_nanos(span.start_time) + end_time = datetime_to_nanos(span.end_time) + attrs = normalize_attributes({**span.context.attributes, **span.attributes}) + + with id_override( + trace_id=span.context.trace_id if span.context.parent_span_id is None else None, + span_id=span.context.span_id, + ): + sdk_span = self._tracer.start_span( + span.name, + context=parent_ctx, + attributes=attrs, + start_time=start_time, + ) + + if str(span.status_code).lower() == "error": + sdk_span.set_status(Status(StatusCode.ERROR, span.status_message)) + else: + sdk_span.set_status(Status(StatusCode.OK)) + + for event in span.events: + sdk_span.add_event( + event.name, + 
attributes=normalize_attributes(event.attributes), + timestamp=datetime_to_nanos(event.timestamp), + ) + + sdk_span.end(end_time=end_time) diff --git a/ergon_core/ergon_core/core/runtime/tracing/sinks.py b/ergon_core/ergon_core/core/runtime/tracing/sinks.py new file mode 100644 index 00000000..34607a4b --- /dev/null +++ b/ergon_core/ergon_core/core/runtime/tracing/sinks.py @@ -0,0 +1,27 @@ +"""Process-wide trace sink factory.""" + +from ergon_core.core.runtime.tracing.noop import NoopTraceSink +from ergon_core.core.runtime.tracing.otel import OtelTraceSink +from ergon_core.core.runtime.tracing.types import TraceSink +from ergon_core.core.settings import settings + + +def _create_sink() -> TraceSink: + if not settings.otel_traces_enabled: + return NoopTraceSink() + # The operator explicitly opted in to OTEL. Refuse to silently downgrade + # to a no-op sink, so trace exporter misconfiguration is loud. + return OtelTraceSink() + + +_sink: TraceSink = _create_sink() + + +def get_trace_sink() -> TraceSink: + """Return the process-wide trace sink. + + Each process (uvicorn worker, CLI invocation, test runner) gets its own + sink created at import time. No locking needed; OTEL is stateless + per-process and the collector handles fan-in from multiple exporters. 
+ """ + return _sink diff --git a/ergon_core/ergon_core/core/runtime/tracing/types.py b/ergon_core/ergon_core/core/runtime/tracing/types.py new file mode 100644 index 00000000..2fd6cc6c --- /dev/null +++ b/ergon_core/ergon_core/core/runtime/tracing/types.py @@ -0,0 +1,67 @@ +"""Tracing data contracts.""" + +from datetime import datetime +from typing import Protocol +from uuid import UUID + +from pydantic import BaseModel, Field + +from ergon_core.core.json_types import JsonObject + + +class TraceContext(BaseModel): + model_config = {"frozen": True} + + trace_id: int + span_id: int + parent_span_id: int | None = None + run_id: UUID | None = None + task_id: UUID | None = None + execution_id: UUID | None = None + evaluator_id: UUID | None = None + attributes: JsonObject = Field(default_factory=dict) + + +class SpanEvent(BaseModel): + model_config = {"frozen": True} + + name: str + timestamp: datetime + attributes: JsonObject = Field(default_factory=dict) + + +class CompletedSpan(BaseModel): + model_config = {"frozen": True} + + name: str + context: TraceContext + start_time: datetime + end_time: datetime + attributes: JsonObject = Field(default_factory=dict) + status_code: int | str = 0 + status_message: str | None = None + events: list[SpanEvent] = Field(default_factory=list) + + +class TraceSink(Protocol): + def emit_span(self, span: CompletedSpan) -> None: ... + + def add_event( + self, + context: TraceContext, + name: str, + attributes: JsonObject | None = None, + timestamp: datetime | None = None, + ) -> None: ... + + def child_context( + self, + parent: TraceContext, + *, + span_key: str, + run_id: UUID | None = None, + task_id: UUID | None = None, + execution_id: UUID | None = None, + evaluator_id: UUID | None = None, + attributes: JsonObject | None = None, + ) -> TraceContext: ... 
diff --git a/ergon_core/ergon_core/core/providers/sandbox/__init__.py b/ergon_core/ergon_core/core/sandbox/__init__.py similarity index 72% rename from ergon_core/ergon_core/core/providers/sandbox/__init__.py rename to ergon_core/ergon_core/core/sandbox/__init__.py index 6a0a5e62..288b875c 100644 --- a/ergon_core/ergon_core/core/providers/sandbox/__init__.py +++ b/ergon_core/ergon_core/core/sandbox/__init__.py @@ -1,7 +1,7 @@ """Sandbox management: provisioning, file I/O, lifecycle. Import concrete modules directly, for example -``ergon_core.core.providers.sandbox.manager``. Keeping this package initializer +``ergon_core.core.sandbox.manager``. Keeping this package initializer lightweight avoids import cycles between telemetry models and API DTO modules. """ diff --git a/ergon_core/ergon_core/core/providers/sandbox/errors.py b/ergon_core/ergon_core/core/sandbox/errors.py similarity index 100% rename from ergon_core/ergon_core/core/providers/sandbox/errors.py rename to ergon_core/ergon_core/core/sandbox/errors.py diff --git a/ergon_core/ergon_core/core/providers/sandbox/event_sink.py b/ergon_core/ergon_core/core/sandbox/event_sink.py similarity index 100% rename from ergon_core/ergon_core/core/providers/sandbox/event_sink.py rename to ergon_core/ergon_core/core/sandbox/event_sink.py diff --git a/ergon_core/ergon_core/core/providers/sandbox/instrumentation.py b/ergon_core/ergon_core/core/sandbox/instrumentation.py similarity index 98% rename from ergon_core/ergon_core/core/providers/sandbox/instrumentation.py rename to ergon_core/ergon_core/core/sandbox/instrumentation.py index 30411c08..e4c0e307 100644 --- a/ergon_core/ergon_core/core/providers/sandbox/instrumentation.py +++ b/ergon_core/ergon_core/core/sandbox/instrumentation.py @@ -13,8 +13,8 @@ except ImportError: CommandExitException = Exception # type: ignore[assignment,misc] -from ergon_core.core.providers.sandbox.event_sink import SandboxEventSink -from ergon_core.core.providers.sandbox.utils import ( +from 
ergon_core.core.sandbox.event_sink import SandboxEventSink +from ergon_core.core.sandbox.utils import ( _truncate, bytes_length, coerce_text, diff --git a/ergon_core/ergon_core/core/providers/sandbox/lifecycle.py b/ergon_core/ergon_core/core/sandbox/lifecycle.py similarity index 96% rename from ergon_core/ergon_core/core/providers/sandbox/lifecycle.py rename to ergon_core/ergon_core/core/sandbox/lifecycle.py index 33595810..c6a862c5 100644 --- a/ergon_core/ergon_core/core/providers/sandbox/lifecycle.py +++ b/ergon_core/ergon_core/core/sandbox/lifecycle.py @@ -32,7 +32,7 @@ async def terminate_sandbox_by_id(sandbox_id: str | None) -> SandboxTerminationR try: # reason: avoid import cycle between sandbox manager/event sink and telemetry models. - from ergon_core.core.providers.sandbox.manager import ( + from ergon_core.core.sandbox.manager import ( BaseSandboxManager, ) diff --git a/ergon_core/ergon_core/core/providers/sandbox/manager.py b/ergon_core/ergon_core/core/sandbox/manager.py similarity index 99% rename from ergon_core/ergon_core/core/providers/sandbox/manager.py rename to ergon_core/ergon_core/core/sandbox/manager.py index 7bbab2ab..abd15641 100644 --- a/ergon_core/ergon_core/core/providers/sandbox/manager.py +++ b/ergon_core/ergon_core/core/sandbox/manager.py @@ -8,12 +8,12 @@ from typing import ClassVar, Protocol, runtime_checkable from uuid import UUID -from ergon_core.core.providers.sandbox.errors import SandboxExpiredError -from ergon_core.core.providers.sandbox.event_sink import ( +from ergon_core.core.sandbox.errors import SandboxExpiredError +from ergon_core.core.sandbox.event_sink import ( NoopSandboxEventSink, SandboxEventSink, ) -from ergon_core.core.providers.sandbox.utils import _truncate, coerce_text +from ergon_core.core.sandbox.utils import _truncate, coerce_text from ergon_core.core.settings import settings from pydantic import BaseModel diff --git a/ergon_core/ergon_core/core/providers/sandbox/resource_publisher.py 
b/ergon_core/ergon_core/core/sandbox/resource_publisher.py similarity index 100% rename from ergon_core/ergon_core/core/providers/sandbox/resource_publisher.py rename to ergon_core/ergon_core/core/sandbox/resource_publisher.py diff --git a/ergon_core/ergon_core/core/providers/sandbox/utils.py b/ergon_core/ergon_core/core/sandbox/utils.py similarity index 100% rename from ergon_core/ergon_core/core/providers/sandbox/utils.py rename to ergon_core/ergon_core/core/sandbox/utils.py diff --git a/ergon_core/ergon_core/test_support/sandbox/stub_manager.py b/ergon_core/ergon_core/test_support/sandbox/stub_manager.py index 1674ddb3..eed4676e 100644 --- a/ergon_core/ergon_core/test_support/sandbox/stub_manager.py +++ b/ergon_core/ergon_core/test_support/sandbox/stub_manager.py @@ -4,7 +4,7 @@ from typing import cast from uuid import UUID -from ergon_core.core.providers.sandbox.manager import AsyncSandbox, BaseSandboxManager +from ergon_core.core.sandbox.manager import AsyncSandbox, BaseSandboxManager from ergon_core.test_support.sandbox.sentinel import STUB_SANDBOX_PREFIX logger = logging.getLogger(__name__) diff --git a/ergon_core/ergon_core/test_support/smoke_fixtures/sandbox.py b/ergon_core/ergon_core/test_support/smoke_fixtures/sandbox.py index 0eb556ac..65a28726 100644 --- a/ergon_core/ergon_core/test_support/smoke_fixtures/sandbox.py +++ b/ergon_core/ergon_core/test_support/smoke_fixtures/sandbox.py @@ -12,7 +12,7 @@ from typing import cast from uuid import UUID -from ergon_core.core.providers.sandbox.manager import AsyncSandbox, BaseSandboxManager +from ergon_core.core.sandbox.manager import AsyncSandbox, BaseSandboxManager from ergon_core.core.settings import settings from pydantic import BaseModel diff --git a/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/leaf_base.py b/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/leaf_base.py index cd653656..b7343212 100644 --- a/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/leaf_base.py 
+++ b/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/leaf_base.py @@ -29,7 +29,7 @@ from ergon_core.api.results import WorkerOutput from ergon_core.core.persistence.graph.models import RunGraphNode from ergon_core.core.persistence.shared.db import get_session -from ergon_core.core.providers.sandbox.instrumentation import InstrumentedSandbox +from ergon_core.core.sandbox.instrumentation import InstrumentedSandbox from ergon_core.core.runtime.services.communication_schemas import CreateMessageRequest from ergon_core.core.runtime.services.communication_service import ( communication_service, diff --git a/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/subworker.py b/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/subworker.py index e8dfb88f..9fccef3b 100644 --- a/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/subworker.py +++ b/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/subworker.py @@ -21,7 +21,7 @@ from typing import Protocol, runtime_checkable -from ergon_core.core.providers.sandbox.manager import AsyncSandbox +from ergon_core.core.sandbox.manager import AsyncSandbox from pydantic import BaseModel diff --git a/pyproject.toml b/pyproject.toml index fff065e9..1faf9719 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,7 +112,7 @@ invalid-assignment = "warn" # invalid-assignment: try/except ImportError fallbacks (AsyncSandbox = None, # CommandExitException = Exception) when e2b SDK is unavailable. [[tool.ty.overrides]] -include = ["**/providers/sandbox/**"] +include = ["ergon_core/ergon_core/core/sandbox/**"] [tool.ty.overrides.rules] invalid-argument-type = "warn" unresolved-attribute = "warn" diff --git a/scripts/spike_openrouter_reasoning.py b/scripts/spike_openrouter_reasoning.py new file mode 100644 index 00000000..c8ce9abb --- /dev/null +++ b/scripts/spike_openrouter_reasoning.py @@ -0,0 +1,141 @@ +"""Spike reasoning settings and streamed thinking events. 
+ +Usage: + uv run python scripts/spike_openrouter_reasoning.py + uv run python scripts/spike_openrouter_reasoning.py --model openrouter:anthropic/claude-opus-4.7 + uv run python scripts/spike_openrouter_reasoning.py --model anthropic:claude-opus-4.7 + +The script always prints Ergon's resolved model settings. If OPENROUTER_API_KEY +is available, it also runs one tiny PydanticAI streaming request and reports +whether ThinkingPart / ThinkingPartDelta events are surfaced. +""" + +from __future__ import annotations + +import argparse +import asyncio +import os +from collections import Counter +from typing import Any + +# Register production model backends before resolving OpenRouter targets. +import ergon_builtins.registry # noqa: F401 +from ergon_builtins.models.resolution import resolve_model_target +from pydantic_ai import Agent +from pydantic_ai.messages import ( + PartDeltaEvent, + PartEndEvent, + PartStartEvent, + TextPartDelta, + ThinkingPart, + ThinkingPartDelta, +) + + +def _thinking_content(part: ThinkingPart) -> str: + if part.content: + return part.content + details = part.provider_details + if isinstance(details, dict): + raw_content = details.get("raw_content") + if isinstance(raw_content, str): + return raw_content + return "" + + +async def _run_stream(model: str, prompt: str) -> None: + resolved = resolve_model_target(model) + print(f"resolved.model={resolved.model!r}") + print(f"resolved.capture_model_settings={resolved.capture_model_settings!r}") + + required_key = _required_api_key_name(model) + if required_key and not os.environ.get(required_key): + print(f"{required_key} is not set; skipping live call.") + return + + agent: Agent[None, str] = Agent( + model=resolved.model, + instructions=("Answer briefly. 
Use reasoning if available, then give the final answer."), + output_type=str, + ) + + counts: Counter[str] = Counter() + thinking_chunks: list[str] = [] + + async with agent.iter( + prompt, + model_settings=resolved.capture_model_settings, + ) as run: + async for node in run: + if Agent.is_model_request_node(node) or Agent.is_call_tools_node(node): + async with node.stream(run.ctx) as stream: + async for event in stream: + counts[type(event).__name__] += 1 + _record_part_shape(event, counts) + _record_thinking_event(event, thinking_chunks, counts) + + print(f"event_counts={dict(counts)}") + print(f"thinking_chunk_count={len(thinking_chunks)}") + if thinking_chunks: + preview = "".join(thinking_chunks)[:1000] + print(f"thinking_preview={preview!r}") + else: + print("thinking_preview=None") + + +def _record_part_shape(event: Any, counts: Counter[str]) -> None: + if isinstance(event, PartStartEvent): + counts[f"PartStartEvent:{type(event.part).__name__}"] += 1 + elif isinstance(event, PartDeltaEvent): + counts[f"PartDeltaEvent:{type(event.delta).__name__}"] += 1 + if isinstance(event.delta, TextPartDelta) and event.delta.content_delta: + counts["text_delta_chars"] += len(event.delta.content_delta) + elif isinstance(event, PartEndEvent): + counts[f"PartEndEvent:{type(event.part).__name__}"] += 1 + + +def _record_thinking_event( + event: Any, + thinking_chunks: list[str], + counts: Counter[str], +) -> None: + if isinstance(event, PartStartEvent) and isinstance(event.part, ThinkingPart): + counts["ThinkingPart:start"] += 1 + if content := _thinking_content(event.part): + thinking_chunks.append(content) + elif isinstance(event, PartDeltaEvent) and isinstance( + event.delta, + ThinkingPartDelta, + ): + counts["ThinkingPartDelta"] += 1 + if event.delta.content_delta: + thinking_chunks.append(event.delta.content_delta) + elif isinstance(event, PartEndEvent) and isinstance(event.part, ThinkingPart): + counts["ThinkingPart:end"] += 1 + + +def main() -> None: + parser = 
argparse.ArgumentParser() + parser.add_argument( + "--model", + default="openrouter:anthropic/claude-opus-4.7", + help="Model target to resolve and optionally call.", + ) + parser.add_argument( + "--prompt", + default="In one sentence, explain why task decomposition helps research agents.", + ) + args = parser.parse_args() + asyncio.run(_run_stream(args.model, args.prompt)) + + +def _required_api_key_name(model: str) -> str | None: + if model.startswith(("openrouter:", "openai-responses:")): + return "OPENROUTER_API_KEY" + if model.startswith("anthropic:"): + return "ANTHROPIC_API_KEY" + return None + + +if __name__ == "__main__": + main() diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c9689728..4d18d38b 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -27,7 +27,7 @@ import pytest import pytest_asyncio from ergon_core.core.persistence.shared.db import ensure_db -from ergon_core.core.runtime.inngest_client import inngest_client +from ergon_core.core.runtime.inngest.client import inngest_client from ergon_core.core.settings import settings from inngest._internal import net as inngest_net diff --git a/tests/integration/minif2f/test_sandbox_manager.py b/tests/integration/minif2f/test_sandbox_manager.py index 9df69456..61bf878f 100644 --- a/tests/integration/minif2f/test_sandbox_manager.py +++ b/tests/integration/minif2f/test_sandbox_manager.py @@ -10,7 +10,7 @@ import pytest from ergon_builtins.benchmarks.minif2f.sandbox.utils import resolve_template from ergon_builtins.benchmarks.minif2f.sandbox_manager import MiniF2FSandboxManager -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager # --------------------------------------------------------------------------- # Reset the singleton between tests — BaseSandboxManager stores _instance and @@ -104,12 +104,12 @@ async def test_create_threads_template_kwarg_to_e2b_sdk( 
fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) # settings.e2b_api_key must be truthy for create() to proceed. monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -157,11 +157,11 @@ async def _run(cmd: str, **_kwargs: object) -> MagicMock: fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -188,11 +188,11 @@ async def test_base_class_omits_template_when_unset(monkeypatch: pytest.MonkeyPa ) fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) diff --git a/tests/integration/minif2f/test_verification_integration.py b/tests/integration/minif2f/test_verification_integration.py index 356ec8a0..c5967af2 100644 --- a/tests/integration/minif2f/test_verification_integration.py +++ b/tests/integration/minif2f/test_verification_integration.py @@ -23,7 +23,7 @@ from ergon_core.api.evaluation_context import EvaluationContext from ergon_core.api.results import WorkerOutput from ergon_core.api.task_types import BenchmarkTask, EmptyTaskPayload -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import 
BaseSandboxManager from ergon_core.core.runtime.evaluation.criterion_runtime import ( DefaultCriterionRuntime, ) diff --git a/tests/integration/researchrubrics/test_sandbox_manager.py b/tests/integration/researchrubrics/test_sandbox_manager.py index 7351eb28..a80f1ba6 100644 --- a/tests/integration/researchrubrics/test_sandbox_manager.py +++ b/tests/integration/researchrubrics/test_sandbox_manager.py @@ -16,8 +16,8 @@ from uuid import uuid4 import pytest -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager -from ergon_core.core.providers.sandbox.research_rubrics_manager import ( +from ergon_core.core.sandbox.manager import BaseSandboxManager +from ergon_builtins.benchmarks.researchrubrics.sandbox_manager import ( ResearchRubricsSandboxManager, ) @@ -68,15 +68,15 @@ async def test_create_injects_exa_api_key_into_sandbox_envs( fake_sandbox = _make_fake_sandbox() fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-e2b-key", ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.exa_api_key", + "ergon_core.core.sandbox.manager.settings.exa_api_key", "test-exa-key-xyz", ) @@ -107,15 +107,15 @@ async def test_create_fails_fast_when_required_key_missing_from_settings( fake_sandbox = _make_fake_sandbox() fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-e2b-key", ) monkeypatch.setattr( - 
"ergon_core.core.providers.sandbox.manager.settings.exa_api_key", + "ergon_core.core.sandbox.manager.settings.exa_api_key", "", ) diff --git a/tests/integration/sandbox/test_required_env_keys.py b/tests/integration/sandbox/test_required_env_keys.py index 616733d0..b7533b58 100644 --- a/tests/integration/sandbox/test_required_env_keys.py +++ b/tests/integration/sandbox/test_required_env_keys.py @@ -26,13 +26,13 @@ import pytest from ergon_builtins.benchmarks.gdpeval.sandbox import GDPEvalSandboxManager from ergon_builtins.benchmarks.minif2f.sandbox_manager import MiniF2FSandboxManager +from ergon_builtins.benchmarks.researchrubrics.sandbox_manager import ( + ResearchRubricsSandboxManager, +) from ergon_builtins.benchmarks.swebench_verified.sandbox_manager import ( SWEBenchSandboxManager, ) -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager -from ergon_core.core.providers.sandbox.research_rubrics_manager import ( - ResearchRubricsSandboxManager, -) +from ergon_core.core.sandbox.manager import BaseSandboxManager # Every concrete ``BaseSandboxManager`` subclass ergon ships. Add new # managers here so the env-injection contract is enforced for them too. 
@@ -84,11 +84,11 @@ def _install_async_sandbox_and_e2b_key(monkeypatch: pytest.MonkeyPatch) -> Async fake_sandbox = _make_fake_sandbox() fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-e2b-key", ) return fake_create @@ -128,7 +128,7 @@ async def test_required_env_keys_round_trip_into_sandbox( dummy = f"dummy-{key}-{idx}" expected_envs[key] = dummy monkeypatch.setattr( - f"ergon_core.core.providers.sandbox.manager.settings.{key.lower()}", + f"ergon_core.core.sandbox.manager.settings.{key.lower()}", dummy, ) diff --git a/tests/integration/swebench_verified/test_sandbox_manager.py b/tests/integration/swebench_verified/test_sandbox_manager.py index 40d19255..6318fc4e 100644 --- a/tests/integration/swebench_verified/test_sandbox_manager.py +++ b/tests/integration/swebench_verified/test_sandbox_manager.py @@ -11,7 +11,7 @@ from ergon_builtins.benchmarks.swebench_verified.sandbox_manager import ( SWEBenchSandboxManager, ) -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager # --------------------------------------------------------------------------- # Reset the singleton between tests — BaseSandboxManager stores _instance and @@ -113,12 +113,12 @@ async def test_create_threads_template_kwarg_to_e2b_sdk( fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) # settings.e2b_api_key must be truthy for create() to proceed. 
monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -178,11 +178,11 @@ async def _run(cmd: str, **_kwargs: object) -> MagicMock: fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) diff --git a/tests/unit/architecture/test_model_field_descriptions.py b/tests/unit/architecture/test_model_field_descriptions.py new file mode 100644 index 00000000..1d7e4e35 --- /dev/null +++ b/tests/unit/architecture/test_model_field_descriptions.py @@ -0,0 +1,82 @@ +"""Guards for model field docs that must survive schema export.""" + +from ergon_core.core.dashboard.event_contracts import DashboardContextEventEvent +from ergon_core.core.persistence.context.event_payloads import ( + AssistantTextPayload, + ThinkingPayload, + ToolCallPayload, + ToolResultPayload, + UserMessagePayload, +) +from ergon_core.core.persistence.context.models import RunContextEvent +from ergon_core.core.persistence.graph.models import ( + RunGraphAnnotation, + RunGraphMutation, + RunGraphNode, +) +from ergon_core.core.persistence.telemetry.models import RunResource +from ergon_core.core.runtime.services.graph_dto import ( + GraphAnnotationDto, + GraphEdgeDto, + GraphMutationDto, + GraphNodeDto, +) +from ergon_builtins.benchmarks.swebench_verified.task_schemas import ( + SWEBenchInstance, + SWEBenchTaskPayload, +) +from pydantic import BaseModel + + +def _description(model: type[BaseModel], field_name: str) -> str | None: + return model.model_fields[field_name].description + + +def test_context_event_payload_field_docs_are_schema_metadata() -> None: + assert 
_description(UserMessagePayload, "from_worker_key") + assert _description(AssistantTextPayload, "turn_id") + assert _description(AssistantTextPayload, "turn_token_ids") + assert _description(AssistantTextPayload, "turn_logprobs") + assert _description(ToolCallPayload, "turn_id") + assert _description(ToolCallPayload, "turn_token_ids") + assert _description(ToolCallPayload, "turn_logprobs") + assert _description(ToolResultPayload, "tool_call_id") + assert _description(ToolResultPayload, "result") + assert _description(ThinkingPayload, "turn_id") + assert _description(ThinkingPayload, "turn_token_ids") + assert _description(ThinkingPayload, "turn_logprobs") + + +def test_dashboard_context_event_field_docs_are_schema_metadata() -> None: + assert _description(DashboardContextEventEvent, "id") + assert _description(DashboardContextEventEvent, "task_node_id") + assert _description(DashboardContextEventEvent, "payload") + + +def test_graph_dto_field_docs_are_schema_metadata() -> None: + assert _description(GraphNodeDto, "status") + assert _description(GraphEdgeDto, "status") + assert _description(GraphAnnotationDto, "id") + assert _description(GraphAnnotationDto, "target_id") + assert _description(GraphMutationDto, "id") + assert _description(GraphMutationDto, "target_id") + + +def test_sqlmodel_field_docs_are_schema_metadata() -> None: + assert _description(RunGraphNode, "instance_key") + assert _description(RunGraphNode, "task_slug") + assert _description(RunGraphNode, "status") + assert _description(RunGraphNode, "assigned_worker_slug") + assert _description(RunGraphNode, "parent_node_id") + assert _description(RunGraphNode, "level") + assert _description(RunContextEvent, "event_type") + assert _description(RunContextEvent, "payload") + assert _description(RunGraphAnnotation, "target_type") + assert _description(RunGraphMutation, "mutation_type") + assert _description(RunGraphMutation, "target_type") + assert _description(RunResource, "kind") + + +def 
test_builtin_task_schema_field_docs_are_schema_metadata() -> None: + assert _description(SWEBenchInstance, "hints_text") + assert _description(SWEBenchTaskPayload, "hints_text") diff --git a/tests/unit/benchmarks/test_swebench_sandbox_manager.py b/tests/unit/benchmarks/test_swebench_sandbox_manager.py index 2462cd35..9900d9a7 100644 --- a/tests/unit/benchmarks/test_swebench_sandbox_manager.py +++ b/tests/unit/benchmarks/test_swebench_sandbox_manager.py @@ -55,7 +55,7 @@ async def test_install_runs_setup_and_install_scripts(monkeypatch: pytest.Monkey @pytest.mark.asyncio async def test_install_raises_when_payload_missing(monkeypatch: pytest.MonkeyPatch) -> None: from ergon_core.core.persistence import queries as q_mod - from ergon_core.core.providers.sandbox.errors import SandboxSetupError + from ergon_core.core.sandbox.errors import SandboxSetupError monkeypatch.setattr( q_mod.queries.task_executions, @@ -90,7 +90,7 @@ async def test_install_raises_on_nonzero_exit( """ from ergon_builtins.benchmarks.swebench_verified import sandbox_manager as sm from ergon_core.core.persistence import queries as q_mod - from ergon_core.core.providers.sandbox.errors import SandboxSetupError + from ergon_core.core.sandbox.errors import SandboxSetupError monkeypatch.setattr( q_mod.queries.task_executions, diff --git a/tests/unit/builtins/test_logfire_pydantic_ai.py b/tests/unit/builtins/test_logfire_pydantic_ai.py new file mode 100644 index 00000000..ec8206b2 --- /dev/null +++ b/tests/unit/builtins/test_logfire_pydantic_ai.py @@ -0,0 +1,53 @@ +import importlib + + +def test_logfire_pydantic_ai_instrumentation_is_disabled_by_default(monkeypatch) -> None: + module = importlib.import_module("ergon_builtins.observability.pydantic_ai_logfire") + module._reset_for_tests() + monkeypatch.delenv("ERGON_LOGFIRE_PYDANTIC_AI", raising=False) + + assert module.configure_pydantic_ai_logfire(logfire_module=_FailingLogfire()) is False + + +def 
test_logfire_pydantic_ai_instrumentation_configures_once(monkeypatch) -> None: + module = importlib.import_module("ergon_builtins.observability.pydantic_ai_logfire") + module._reset_for_tests() + monkeypatch.setenv("ERGON_LOGFIRE_PYDANTIC_AI", "1") + monkeypatch.setenv("ERGON_LOGFIRE_SERVICE_NAME", "ergon-test") + monkeypatch.setenv("ERGON_LOGFIRE_ENVIRONMENT", "unit") + monkeypatch.setenv("ERGON_LOGFIRE_CONFIG_DIR", "/tmp/logfire-config") + fake = _FakeLogfire() + + assert module.configure_pydantic_ai_logfire(logfire_module=fake) is True + assert module.configure_pydantic_ai_logfire(logfire_module=fake) is True + + assert fake.configure_calls == [ + { + "send_to_logfire": "if-token-present", + "service_name": "ergon-test", + "environment": "unit", + "config_dir": "/tmp/logfire-config", + "console": False, + } + ] + assert fake.instrument_calls == [{"include_content": True}] + + +class _FailingLogfire: + def configure(self, **kwargs): + raise AssertionError("disabled instrumentation should not configure Logfire") + + def instrument_pydantic_ai(self, **kwargs): + raise AssertionError("disabled instrumentation should not instrument pydantic-ai") + + +class _FakeLogfire: + def __init__(self) -> None: + self.configure_calls = [] + self.instrument_calls = [] + + def configure(self, **kwargs): + self.configure_calls.append(kwargs) + + def instrument_pydantic_ai(self, **kwargs): + self.instrument_calls.append(kwargs) diff --git a/tests/unit/builtins/test_tool_budget.py b/tests/unit/builtins/test_tool_budget.py new file mode 100644 index 00000000..29a60e8d --- /dev/null +++ b/tests/unit/builtins/test_tool_budget.py @@ -0,0 +1,51 @@ +from ergon_builtins.workers.baselines.tool_budget import ( + AgentToolBudgetDeps, + AgentToolBudgetExhaustedResult, + AgentToolBudgetState, +) + + +def test_tool_budget_exhausts_workflow_calls_with_structured_result() -> None: + state = AgentToolBudgetState( + max_workflow_tool_calls=1, + max_other_tool_calls=2, + ) + + first = 
state.increment("workflow", "workflow") + second = state.increment("workflow", "workflow") + exhausted = state.exhausted_result("workflow tool budget reached") + + assert first == 1 + assert second == 2 + assert second > state.max_workflow_tool_calls + assert isinstance(exhausted, AgentToolBudgetExhaustedResult) + assert exhausted.status == "TOOL_BUDGET_EXHAUSTED" + assert exhausted.reason == "workflow tool budget reached" + assert exhausted.budget_state["workflow_tool_calls"] == 2 + + +def test_tool_budget_allows_finalization_after_other_exhaustion() -> None: + state = AgentToolBudgetState( + max_workflow_tool_calls=1, + max_other_tool_calls=1, + ) + + assert state.increment("exa_search", "other") == 1 + assert state.increment("list_child_resources", "other") == 2 + finalization_count = state.increment("write_report_draft", "finalization") + + assert state.other_tool_calls > state.max_other_tool_calls + assert finalization_count == 1 + assert state.finalization_tool_calls == 1 + + +def test_tool_budget_deps_wraps_mutable_state() -> None: + state = AgentToolBudgetState( + max_workflow_tool_calls=1, + max_other_tool_calls=1, + ) + deps = AgentToolBudgetDeps(tool_budget=state) + + deps.tool_budget.increment("exa_search", "other") + + assert deps.tool_budget.other_tool_calls == 1 diff --git a/tests/unit/runtime/test_criterion_runtime_reconnect.py b/tests/unit/runtime/test_criterion_runtime_reconnect.py index 50ef8bab..8d068200 100644 --- a/tests/unit/runtime/test_criterion_runtime_reconnect.py +++ b/tests/unit/runtime/test_criterion_runtime_reconnect.py @@ -12,7 +12,7 @@ from uuid import uuid4 import pytest -from ergon_core.core.providers.sandbox.errors import SandboxExpiredError +from ergon_core.core.sandbox.errors import SandboxExpiredError from ergon_core.core.runtime.evaluation.criterion_runtime import ( CriterionRuntimeOptions, DefaultCriterionRuntime, diff --git a/tests/unit/runtime/test_failed_task_sandbox_cleanup.py 
b/tests/unit/runtime/test_failed_task_sandbox_cleanup.py index e8e52b04..6b226102 100644 --- a/tests/unit/runtime/test_failed_task_sandbox_cleanup.py +++ b/tests/unit/runtime/test_failed_task_sandbox_cleanup.py @@ -1,7 +1,7 @@ from unittest.mock import AsyncMock, patch import pytest -from ergon_core.core.providers.sandbox.lifecycle import ( +from ergon_core.core.sandbox.lifecycle import ( SandboxTerminationReason, SandboxTerminationResult, ) diff --git a/tests/unit/runtime/test_failure_error_json.py b/tests/unit/runtime/test_failure_error_json.py index c4807411..2b4aea1c 100644 --- a/tests/unit/runtime/test_failure_error_json.py +++ b/tests/unit/runtime/test_failure_error_json.py @@ -6,47 +6,6 @@ from ergon_core.core.runtime.services.orchestration_dto import FailTaskExecutionCommand -def test_build_error_json_includes_stack_without_inferred_triage() -> None: - from ergon_core.core.runtime.errors.error_payload import ( - RuntimeErrorPayload, - build_error_json, - ) - - try: - raise RuntimeError( - "Invalid response from OpenAI chat completions endpoint: " - "choices.0.finish_reason input_value=None" - ) - except RuntimeError as exc: - payload = build_error_json(exc, phase="worker_execute") - - assert payload["message"].startswith("Invalid response from OpenAI") - assert payload["exception_type"] == "RuntimeError" - assert payload["phase"] == "worker_execute" - assert "Traceback" in payload["stack"] - assert "finish_reason" in payload["stack"] - assert "category" not in payload - assert "retryable" not in payload - assert RuntimeErrorPayload.model_validate(payload).message == payload["message"] - - -def test_worker_exception_result_carries_structured_error_json() -> None: - from ergon_core.core.runtime.inngest.worker_execute import ( - _worker_execute_result_from_exception, - ) - - try: - raise RuntimeError("provider timeout") - except RuntimeError as exc: - result = _worker_execute_result_from_exception(exc) - - assert result.success is False - assert result.error 
== "provider timeout" - assert result.error_json is not None - assert result.error_json["phase"] == "worker_execute" - assert result.error_json["exception_type"] == "RuntimeError" - - @pytest.mark.asyncio async def test_finalize_failure_preserves_structured_error_json(monkeypatch) -> None: from ergon_core.core.runtime.services import task_execution_service as module diff --git a/tests/unit/runtime/test_inngest_criterion_executor.py b/tests/unit/runtime/test_inngest_criterion_executor.py new file mode 100644 index 00000000..070582c7 --- /dev/null +++ b/tests/unit/runtime/test_inngest_criterion_executor.py @@ -0,0 +1,86 @@ +"""Contracts for Inngest criterion executor runtime wiring.""" + +from uuid import uuid4 + +import pytest +from ergon_core.api.criterion import Criterion +from ergon_core.api.evaluation_context import EvaluationContext +from ergon_core.api.results import CriterionResult +from ergon_core.api.task_types import BenchmarkTask +from ergon_core.core.runtime.evaluation.evaluation_schemas import ( + CriterionSpec, + TaskEvaluationContext, +) +from ergon_core.core.runtime.evaluation.inngest_executor import InngestCriterionExecutor + + +class _Step: + async def run(self, _name, fn, *, output_type): + return await fn() + + +class _Group: + async def parallel(self, fns): + return [await fn() for fn in fns] + + +class _Ctx: + step = _Step() + group = _Group() + + +class _Criterion(Criterion): + type_slug = "test-criterion" + + def __init__(self) -> None: + super().__init__(name="criterion") + self.runtime_task_scope = None + + async def evaluate(self, context: EvaluationContext) -> CriterionResult: + self.runtime_task_scope = context.runtime.task_scope + return CriterionResult(name=self.name, score=1.0, passed=True) + + +@pytest.mark.asyncio +async def test_executor_scopes_criterion_runtime_to_task_execution(monkeypatch) -> None: + execution_id = uuid4() + definition_task_id = uuid4() + captured_options = [] + + class FakeRuntime: + def __init__(self, *, 
context, sandbox_manager, options) -> None: + captured_options.append(options) + self.task_scope = options.task_id + + monkeypatch.setattr( + "ergon_core.core.runtime.evaluation.inngest_executor.DefaultCriterionRuntime", + FakeRuntime, + ) + + criterion = _Criterion() + executor = InngestCriterionExecutor( + _Ctx(), + task_id=definition_task_id, + execution_id=execution_id, + evaluator_id=uuid4(), + sandbox_manager=object(), + ) + + await executor.execute_all( + TaskEvaluationContext( + run_id=uuid4(), + task_input="input", + agent_reasoning="output", + ), + BenchmarkTask( + task_slug="task", + instance_key="default", + description="input", + evaluator_binding_keys=("default",), + ), + "benchmark", + [CriterionSpec(criterion=criterion)], + ) + + assert captured_options[0].task_id == execution_id + assert criterion.runtime_task_scope == execution_id diff --git a/tests/unit/runtime/test_inngest_package_layout.py b/tests/unit/runtime/test_inngest_package_layout.py new file mode 100644 index 00000000..6d2e9f88 --- /dev/null +++ b/tests/unit/runtime/test_inngest_package_layout.py @@ -0,0 +1,10 @@ +import importlib +import importlib.util + + +def test_inngest_infrastructure_lives_in_inngest_package() -> None: + client_module = importlib.import_module("ergon_core.core.runtime.inngest.client") + registry_spec = importlib.util.find_spec("ergon_core.core.runtime.inngest.registry") + + assert client_module.inngest_client is not None + assert registry_spec is not None diff --git a/tests/unit/runtime/test_worker_execute_output_failure.py b/tests/unit/runtime/test_worker_execute_output_failure.py deleted file mode 100644 index f421a542..00000000 --- a/tests/unit/runtime/test_worker_execute_output_failure.py +++ /dev/null @@ -1,12 +0,0 @@ -from ergon_core.api.results import WorkerOutput -from ergon_core.core.runtime.inngest.worker_execute import _worker_execute_result_from_output - - -def test_worker_execute_result_preserves_worker_output_failure() -> None: - result = 
_worker_execute_result_from_output( - WorkerOutput(output="probe failed", success=False), - ) - - assert result.success is False - assert result.final_assistant_message == "probe failed" - assert result.error == "probe failed" diff --git a/tests/unit/sandbox/test_ensure_sandbox_idempotence.py b/tests/unit/sandbox/test_ensure_sandbox_idempotence.py index 0ff301d2..2f17bd82 100644 --- a/tests/unit/sandbox/test_ensure_sandbox_idempotence.py +++ b/tests/unit/sandbox/test_ensure_sandbox_idempotence.py @@ -12,7 +12,7 @@ from uuid import UUID, uuid4 import pytest -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.manager import BaseSandboxManager class _ProbeManager(BaseSandboxManager): @@ -83,11 +83,11 @@ async def test_install_dependencies_runs_exactly_once_on_repeated_create( # `AsyncSandbox` binding in `manager.py` to return our fake sandbox. fake_create = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(create=fake_create), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) diff --git a/tests/unit/sandbox/test_sandbox_lifecycle_service.py b/tests/unit/sandbox/test_sandbox_lifecycle_service.py index c1403a34..fd44f1af 100644 --- a/tests/unit/sandbox/test_sandbox_lifecycle_service.py +++ b/tests/unit/sandbox/test_sandbox_lifecycle_service.py @@ -1,7 +1,7 @@ from unittest.mock import AsyncMock, patch import pytest -from ergon_core.core.providers.sandbox.lifecycle import ( +from ergon_core.core.sandbox.lifecycle import ( SandboxTerminationReason, terminate_sandbox_by_id, ) @@ -10,7 +10,7 @@ @pytest.mark.asyncio async def test_terminate_sandbox_by_id_dispatches_real_ids() -> None: with patch( - "ergon_core.core.providers.sandbox.manager.BaseSandboxManager.terminate_by_sandbox_id", + 
"ergon_core.core.sandbox.manager.BaseSandboxManager.terminate_by_sandbox_id", new=AsyncMock(return_value=True), ) as terminate: result = await terminate_sandbox_by_id("sbx-live-123") diff --git a/tests/unit/sandbox/test_sandbox_reconnect.py b/tests/unit/sandbox/test_sandbox_reconnect.py index bde1e87c..c819fcf8 100644 --- a/tests/unit/sandbox/test_sandbox_reconnect.py +++ b/tests/unit/sandbox/test_sandbox_reconnect.py @@ -8,8 +8,8 @@ from unittest.mock import AsyncMock, MagicMock import pytest -from ergon_core.core.providers.sandbox.errors import SandboxExpiredError -from ergon_core.core.providers.sandbox.manager import BaseSandboxManager +from ergon_core.core.sandbox.errors import SandboxExpiredError +from ergon_core.core.sandbox.manager import BaseSandboxManager class _MinimalManager(BaseSandboxManager): @@ -51,11 +51,11 @@ async def test_reconnect_returns_sandbox_on_success( fake_sandbox = MagicMock() fake_connect = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(connect=fake_connect), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -82,11 +82,11 @@ async def test_reconnect_does_not_register_in_sandboxes_dict( fake_sandbox = MagicMock() fake_connect = AsyncMock(return_value=fake_sandbox) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(connect=fake_connect), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -112,11 +112,11 @@ async def test_reconnect_idempotent_returns_equivalent_handles( fake_sandbox_b = MagicMock() fake_connect = AsyncMock(side_effect=[fake_sandbox_a, fake_sandbox_b]) monkeypatch.setattr( - 
"ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(connect=fake_connect), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -134,7 +134,7 @@ async def test_reconnect_raises_sandbox_expired_on_not_found_exception( monkeypatch: pytest.MonkeyPatch, ) -> None: """SandboxNotFoundException → SandboxExpiredError with sandbox_id preserved.""" - import ergon_core.core.providers.sandbox.manager as mgr_mod + import ergon_core.core.sandbox.manager as mgr_mod class _FakeSandboxNotFound(Exception): pass @@ -165,7 +165,7 @@ async def test_reconnect_raises_sandbox_expired_on_timeout_exception( monkeypatch: pytest.MonkeyPatch, ) -> None: """TimeoutException → SandboxExpiredError.""" - import ergon_core.core.providers.sandbox.manager as mgr_mod + import ergon_core.core.sandbox.manager as mgr_mod class _FakeTimeout(Exception): pass @@ -198,11 +198,11 @@ async def test_reconnect_classifies_by_message_when_sdk_raises_generic_error( side_effect=Exception("HTTP 404: sandbox not found"), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(connect=fake_connect), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) @@ -225,11 +225,11 @@ async def test_reconnect_reraises_unrelated_errors_unchanged( """ fake_connect = AsyncMock(side_effect=ConnectionError("TLS handshake failed")) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.AsyncSandbox", + "ergon_core.core.sandbox.manager.AsyncSandbox", MagicMock(connect=fake_connect), ) monkeypatch.setattr( - "ergon_core.core.providers.sandbox.manager.settings.e2b_api_key", + "ergon_core.core.sandbox.manager.settings.e2b_api_key", "test-key", ) diff --git 
a/tests/unit/smoke_base/test_leaf_sends_completion_message.py b/tests/unit/smoke_base/test_leaf_sends_completion_message.py index 3ba972ab..d51ce270 100644 --- a/tests/unit/smoke_base/test_leaf_sends_completion_message.py +++ b/tests/unit/smoke_base/test_leaf_sends_completion_message.py @@ -11,7 +11,7 @@ import pytest from ergon_core.api import BenchmarkTask from ergon_core.core.persistence.shared.types import AssignedWorkerSlug -from ergon_core.core.providers.sandbox.manager import AsyncSandbox +from ergon_core.core.sandbox.manager import AsyncSandbox from ergon_core.core.runtime.services.communication_schemas import CreateMessageRequest from ergon_core.test_support.smoke_fixtures.smoke_base.leaf_base import BaseSmokeLeafWorker from ergon_core.test_support.smoke_fixtures.smoke_base.subworker import SubworkerResult diff --git a/tests/unit/smoke_base/test_smoke_sandbox_manager.py b/tests/unit/smoke_base/test_smoke_sandbox_manager.py index 35d22663..709801e3 100644 --- a/tests/unit/smoke_base/test_smoke_sandbox_manager.py +++ b/tests/unit/smoke_base/test_smoke_sandbox_manager.py @@ -2,7 +2,7 @@ from uuid import UUID, uuid4 import pytest -from ergon_core.core.providers.sandbox.event_sink import SandboxEventSink +from ergon_core.core.sandbox.event_sink import SandboxEventSink class _RecordingSink(SandboxEventSink): @@ -82,8 +82,8 @@ async def test_smoke_sandbox_health_command_matches_swebench_probe() -> None: @pytest.mark.asyncio async def test_static_teardown_closes_registered_smoke_sandbox() -> None: - from ergon_core.core.providers.sandbox.event_sink import NoopSandboxEventSink - from ergon_core.core.providers.sandbox.manager import BaseSandboxManager + from ergon_core.core.sandbox.event_sink import NoopSandboxEventSink + from ergon_core.core.sandbox.manager import BaseSandboxManager from ergon_core.test_support.smoke_fixtures.sandbox import SmokeSandboxManager sink = _RecordingSink() diff --git a/tests/unit/state/test_criterion_runtime_di.py 
b/tests/unit/state/test_criterion_runtime_di.py index 46b47b72..0e9b755b 100644 --- a/tests/unit/state/test_criterion_runtime_di.py +++ b/tests/unit/state/test_criterion_runtime_di.py @@ -12,7 +12,7 @@ import pytest from ergon_core.core.runtime.evaluation.protocols import CriterionRuntime from ergon_core.core.runtime.resources import RunResourceView -from ergon_core.core.providers.sandbox.event_sink import ( +from ergon_core.core.sandbox.event_sink import ( DashboardEmitterSandboxEventSink, NoopSandboxEventSink, ) diff --git a/tests/unit/test_dashboard_emitter_wiring.py b/tests/unit/test_dashboard_emitter_wiring.py index fc8e1db7..3f080195 100644 --- a/tests/unit/test_dashboard_emitter_wiring.py +++ b/tests/unit/test_dashboard_emitter_wiring.py @@ -32,7 +32,7 @@ { "ergon_core/ergon_core/core/dashboard/emitter.py", "ergon_core/ergon_core/core/dashboard/event_contracts.py", - "ergon_core/ergon_core/core/providers/sandbox/event_sink.py", + "ergon_core/ergon_core/core/sandbox/event_sink.py", } ) From cd21b0cc340e0ae2e2fbaff0c8614dc48cd2b1c6 Mon Sep 17 00:00:00 2001 From: Charlie Masters <69640669+cm2435@users.noreply.github.com> Date: Tue, 28 Apr 2026 16:45:51 +0100 Subject: [PATCH 3/5] docs: trim cleanup plan trailing whitespace Made-with: Cursor --- docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md | 1 - .../plans/2026-04-28-evaluation-resource-context-and-scoring.md | 1 - 2 files changed, 2 deletions(-) diff --git a/docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md b/docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md index 7af71aa7..c611f731 100644 --- a/docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md +++ b/docs/superpowers/plans/2026-04-28-agent-tool-budget-harness.md @@ -808,4 +808,3 @@ Expected improvement: - This still supports better prompt steering, but prompt steering is advisory. The two counters are enforcement. - We should not add broad unit tests for every tool. 
Existing workflow tests, import smoke checks, lint, and the one-sample real rollout are enough for this change. - Do not commit unless explicitly asked. - diff --git a/docs/superpowers/plans/2026-04-28-evaluation-resource-context-and-scoring.md b/docs/superpowers/plans/2026-04-28-evaluation-resource-context-and-scoring.md index f714978d..59306462 100644 --- a/docs/superpowers/plans/2026-04-28-evaluation-resource-context-and-scoring.md +++ b/docs/superpowers/plans/2026-04-28-evaluation-resource-context-and-scoring.md @@ -906,4 +906,3 @@ Expected rollout properties: - Do not include full agent conversation in ResearchRubrics judge prompts by default. - Do not introduce a new persisted table for evidence bundles. - Do not preserve compatibility with double-normalized summary scores; new runs should use the normalized score invariant. - From f629cbe8a2e31e39353414d6ebe70ed50b35ec72 Mon Sep 17 00:00:00 2001 From: Charlie Masters <69640669+cm2435@users.noreply.github.com> Date: Tue, 28 Apr 2026 16:58:46 +0100 Subject: [PATCH 4/5] fix: trim schema trailing whitespace Made-with: Cursor --- ergon_core/ergon_core/core/api/schemas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ergon_core/ergon_core/core/api/schemas.py b/ergon_core/ergon_core/core/api/schemas.py index 8ae9fd87..0d532aab 100644 --- a/ergon_core/ergon_core/core/api/schemas.py +++ b/ergon_core/ergon_core/core/api/schemas.py @@ -248,4 +248,3 @@ class TrainingMetricDto(CamelModel): completion_mean_length: float | None = None step_time_s: float | None = None - From 4875c943ca0ac7e2528b9047fb21c848352be575 Mon Sep 17 00:00:00 2001 From: Charlie Masters <69640669+cm2435@users.noreply.github.com> Date: Tue, 28 Apr 2026 17:00:47 +0100 Subject: [PATCH 5/5] fix: align field docs guard with context stream schema Made-with: Cursor --- ergon_core/ergon_core/core/generation.py | 102 +++++++++++++----- .../test_model_field_descriptions.py | 42 ++++---- 2 files changed, 94 insertions(+), 50 deletions(-) diff --git 
a/ergon_core/ergon_core/core/generation.py b/ergon_core/ergon_core/core/generation.py index 68e3a94a..0178d686 100644 --- a/ergon_core/ergon_core/core/generation.py +++ b/ergon_core/ergon_core/core/generation.py @@ -17,50 +17,76 @@ class TokenLogprob(BaseModel): model_config = {"frozen": True} - token: str - logprob: float - top_logprobs: list[JsonObject] = Field(default_factory=list) + token: str = Field(description="Generated token text.") + logprob: float = Field(description="Natural-log probability assigned to the token.") + top_logprobs: list[JsonObject] = Field( + default_factory=list, + description="Optional model-provider alternatives and probabilities for this position.", + ) class SystemPromptPart(BaseModel): model_config = {"frozen": True} - part_kind: Literal["system_prompt"] = "system_prompt" - content: str + part_kind: Literal["system_prompt"] = Field( + default="system_prompt", + description="Discriminator identifying this context part as a system prompt.", + ) + content: str = Field(description="System instructions supplied to the worker.") class UserMessagePart(BaseModel): model_config = {"frozen": True} - part_kind: Literal["user_message"] = "user_message" - content: str + part_kind: Literal["user_message"] = Field( + default="user_message", + description="Discriminator identifying this context part as a user message.", + ) + content: str = Field(description="User or upstream task message content.") class AssistantTextPart(BaseModel): model_config = {"frozen": True} - part_kind: Literal["assistant_text"] = "assistant_text" - content: str + part_kind: Literal["assistant_text"] = Field( + default="assistant_text", + description="Discriminator identifying this context part as assistant text.", + ) + content: str = Field(description="Assistant response text emitted by the worker.") class ToolCallPart(BaseModel): model_config = {"frozen": True} - part_kind: Literal["tool_call"] = "tool_call" - tool_name: str - tool_call_id: str - args: dict[str, Any] # 
slopcop: ignore[no-typing-any] + part_kind: Literal["tool_call"] = Field( + default="tool_call", + description="Discriminator identifying this context part as a tool call.", + ) + tool_name: str = Field(description="Name of the tool requested by the worker.") + tool_call_id: str = Field(description="Provider-stable identifier for this tool call.") + args: dict[str, Any] = Field( # slopcop: ignore[no-typing-any] + description="JSON-like tool input arguments.", + ) class ToolResultPart(BaseModel): model_config = {"frozen": True} - part_kind: Literal["tool_result"] = "tool_result" - tool_call_id: str - tool_name: str - content: str - is_error: bool = False + part_kind: Literal["tool_result"] = Field( + default="tool_result", + description="Discriminator identifying this context part as a tool result.", + ) + tool_call_id: str = Field(description="Identifier of the tool call this result answers.") + tool_name: str = Field(description="Name of the tool that produced this result.") + content: str = Field(description="Serialized tool result content.") + is_error: bool = Field( + default=False, + description="Whether the tool result represents an error response.", + ) class ThinkingPart(BaseModel): model_config = {"frozen": True} - part_kind: Literal["thinking"] = "thinking" - content: str + part_kind: Literal["thinking"] = Field( + default="thinking", + description="Discriminator identifying this context part as private thinking.", + ) + content: str = Field(description="Reasoning or thinking text emitted by the model.") ContextPart = Annotated[ @@ -82,20 +108,38 @@ class ContextPartChunk(BaseModel): model_config = {"frozen": True} - part: ContextPart - token_ids: list[int] | None = None - logprobs: list[TokenLogprob] | None = None + part: ContextPart = Field(description="Typed context stream payload.") + token_ids: list[int] | None = Field( + default=None, + description="Token IDs associated with this context part when provided by the backend.", + ) + logprobs: 
list[TokenLogprob] | None = Field( + default=None, + description="Per-token log probabilities associated with this context part.", + ) class ContextPartChunkLog(ContextPartChunk): """Core-enriched context stream item suitable for API/dashboard projection.""" - sequence: int - worker_binding_key: str - turn_id: str | None = None - started_at: datetime | None = None - completed_at: datetime | None = None - policy_version: str | None = None + sequence: int = Field(description="Monotonic sequence number within the execution stream.") + worker_binding_key: str = Field(description="Worker binding that emitted this context part.") + turn_id: str | None = Field( + default=None, + description="Stable generation turn identifier shared by related streamed parts.", + ) + started_at: datetime | None = Field( + default=None, + description="Timestamp when generation for this part started.", + ) + completed_at: datetime | None = Field( + default=None, + description="Timestamp when generation for this part completed.", + ) + policy_version: str | None = Field( + default=None, + description="Optional worker or policy version that produced the part.", + ) WorkerYield = ContextPartChunk diff --git a/tests/unit/architecture/test_model_field_descriptions.py b/tests/unit/architecture/test_model_field_descriptions.py index 1d7e4e35..887c47e2 100644 --- a/tests/unit/architecture/test_model_field_descriptions.py +++ b/tests/unit/architecture/test_model_field_descriptions.py @@ -1,12 +1,13 @@ """Guards for model field docs that must survive schema export.""" from ergon_core.core.dashboard.event_contracts import DashboardContextEventEvent -from ergon_core.core.persistence.context.event_payloads import ( - AssistantTextPayload, - ThinkingPayload, - ToolCallPayload, - ToolResultPayload, - UserMessagePayload, +from ergon_core.core.generation import ( + AssistantTextPart, + ContextPartChunkLog, + ThinkingPart, + ToolCallPart, + ToolResultPart, + UserMessagePart, ) from 
ergon_core.core.persistence.context.models import RunContextEvent from ergon_core.core.persistence.graph.models import ( @@ -18,7 +19,7 @@ from ergon_core.core.runtime.services.graph_dto import ( GraphAnnotationDto, GraphEdgeDto, - GraphMutationDto, + GraphMutationRecordDto, GraphNodeDto, ) from ergon_builtins.benchmarks.swebench_verified.task_schemas import ( @@ -33,18 +34,17 @@ def _description(model: type[BaseModel], field_name: str) -> str | None: def test_context_event_payload_field_docs_are_schema_metadata() -> None: - assert _description(UserMessagePayload, "from_worker_key") - assert _description(AssistantTextPayload, "turn_id") - assert _description(AssistantTextPayload, "turn_token_ids") - assert _description(AssistantTextPayload, "turn_logprobs") - assert _description(ToolCallPayload, "turn_id") - assert _description(ToolCallPayload, "turn_token_ids") - assert _description(ToolCallPayload, "turn_logprobs") - assert _description(ToolResultPayload, "tool_call_id") - assert _description(ToolResultPayload, "result") - assert _description(ThinkingPayload, "turn_id") - assert _description(ThinkingPayload, "turn_token_ids") - assert _description(ThinkingPayload, "turn_logprobs") + assert _description(UserMessagePart, "content") + assert _description(AssistantTextPart, "content") + assert _description(ToolCallPart, "tool_call_id") + assert _description(ToolCallPart, "args") + assert _description(ToolResultPart, "tool_call_id") + assert _description(ToolResultPart, "content") + assert _description(ThinkingPart, "content") + assert _description(ContextPartChunkLog, "worker_binding_key") + assert _description(ContextPartChunkLog, "turn_id") + assert _description(ContextPartChunkLog, "token_ids") + assert _description(ContextPartChunkLog, "logprobs") def test_dashboard_context_event_field_docs_are_schema_metadata() -> None: @@ -58,8 +58,8 @@ def test_graph_dto_field_docs_are_schema_metadata() -> None: assert _description(GraphEdgeDto, "status") assert 
_description(GraphAnnotationDto, "id") assert _description(GraphAnnotationDto, "target_id") - assert _description(GraphMutationDto, "id") - assert _description(GraphMutationDto, "target_id") + assert _description(GraphMutationRecordDto, "id") + assert _description(GraphMutationRecordDto, "target_id") def test_sqlmodel_field_docs_are_schema_metadata() -> None: