diff --git a/.github/workflows/e2e-benchmarks.yml b/.github/workflows/e2e-benchmarks.yml index 598881ea..20aa7350 100644 --- a/.github/workflows/e2e-benchmarks.yml +++ b/.github/workflows/e2e-benchmarks.yml @@ -27,7 +27,7 @@ jobs: name: "smoke [${{ matrix.env }}]" strategy: fail-fast: false - max-parallel: 1 + max-parallel: 3 matrix: env: [researchrubrics, minif2f, swebench-verified] runs-on: ubuntu-latest @@ -35,9 +35,11 @@ jobs: env: SMOKE_ENV: ${{ matrix.env }} ENABLE_TEST_HARNESS: "1" + ERGON_STARTUP_PLUGINS: "ergon_core.test_support.smoke_fixtures:register_smoke_fixtures" TEST_HARNESS_SECRET: ${{ secrets.TEST_HARNESS_SECRET || 'ci-test-harness' }} E2B_API_KEY: ${{ secrets.E2B_API_KEY }} GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} + SMOKE_COHORT_SIZE: "1" steps: - uses: actions/checkout@v4 with: @@ -72,6 +74,7 @@ jobs: # Unified compose reads these as overrides (see docker-compose.yml). POSTGRES_PASSWORD: ci_test ENABLE_TEST_HARNESS: "1" + ERGON_STARTUP_PLUGINS: "ergon_core.test_support.smoke_fixtures:register_smoke_fixtures" run: docker compose up -d --build --wait timeout-minutes: 5 diff --git a/docker-compose.yml b/docker-compose.yml index 9086d24d..153cc2be 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -81,7 +81,7 @@ services: - INNGEST_API_BASE_URL=http://inngest-dev:8288 - ERGON_API_BASE_URL=http://api:9000 - ENABLE_TEST_HARNESS=${ENABLE_TEST_HARNESS:-1} - - ENABLE_SMOKE_FIXTURES=${ENABLE_SMOKE_FIXTURES:-1} + - ERGON_STARTUP_PLUGINS=${ERGON_STARTUP_PLUGINS-ergon_core.test_support.smoke_fixtures:register_smoke_fixtures} - TEST_HARNESS_SECRET=${TEST_HARNESS_SECRET:-local-dev} - ERGON_BLOB_ROOT=/tmp/ergon-blob - OTEL_TRACES_ENABLED=false diff --git a/docs/superpowers/plans/2026-04-26-communication-thread-workspace.md b/docs/superpowers/plans/2026-04-26-communication-thread-workspace.md new file mode 100644 index 00000000..b5a67c55 --- /dev/null +++ b/docs/superpowers/plans/2026-04-26-communication-thread-workspace.md @@ -0,0 
+1,558 @@ +# Communication Thread Workspace Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make inter-agent communication a first-class, time-aware workspace view with agent-authored thread summaries, task anchoring, and a clickable WhatsApp-style thread trace. + +**Architecture:** Extend the communication schema from agent tool request through persistence, dashboard DTOs, live events, run snapshots, and frontend rendering. Preserve the current `(run_id, topic)` thread identity and add nullable `summary` metadata so agents can set a human-readable thread summary when creating the first message. Frontend should work with summary absent, but prefer it when present. + +**Tech Stack:** Python, SQLModel/Alembic, Pydantic DTOs, dashboard event contracts, React/TypeScript, Playwright. + +--- + +## File Structure + +- Modify `ergon_core/ergon_core/core/runtime/services/communication_schemas.py`: add nullable `thread_summary` to `CreateMessageRequest`, `ThreadSummary`, and `ThreadWithMessages`. +- Modify `ergon_core/ergon_core/core/persistence/telemetry/models.py`: add nullable `summary` column to `Thread`. +- Create migration under `ergon_core/migrations/versions/`: add nullable `summary` column to `threads`. +- Modify `ergon_core/ergon_core/core/runtime/services/communication_service.py`: persist `thread_summary` only when creating a thread or when an existing thread has no summary. +- Modify `ergon_core/ergon_core/core/api/schemas.py`: add nullable `summary` to `RunCommunicationThreadDto`. +- Modify `ergon_core/ergon_core/core/api/runs.py`: populate `thread.summary`, `thread.task_id`, and `message.task_id` in `_build_communication_threads`. 
+- Modify `ergon_core/ergon_core/core/dashboard/event_contracts.py` only if generated event contracts need explicit schema references refreshed. +- Modify `ergon-dashboard/src/generated/rest/contracts.ts` after schema generation or manually in lockstep if generation is not available in this branch. +- Modify `ergon-dashboard/src/lib/contracts/rest.ts`: ensure normalized `RunCommunicationThread` includes `summary: string | null`. +- Modify `ergon-dashboard/src/components/panels/CommunicationPanel.tsx`: replace always-expanded cards with thread list + selected chat trace. +- Modify `ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.ts`: keep existing time filtering, and ensure summaries/counts are based on visible messages at selected time. +- Test `tests/unit/smoke_base/test_leaf_sends_completion_message.py`: existing callers remain valid without summaries. +- Test `tests/unit/dashboard/test_event_contract_types.py`: DTO exposes `summary`, `task_id`, and `task_execution_id`. +- Add backend unit tests for communication summary persistence and task anchoring. +- Update Playwright tests in `ergon-dashboard/tests/e2e/run.snapshot.spec.ts` or `run.delta.spec.ts` for clickable thread list and chat bubbles. 
+ +--- + +## Task 1: Extend Communication Schema and Persistence + +**Files:** +- Modify: `ergon_core/ergon_core/core/runtime/services/communication_schemas.py` +- Modify: `ergon_core/ergon_core/core/persistence/telemetry/models.py` +- Create: `ergon_core/migrations/versions/_add_thread_summary.py` +- Test: `tests/unit/smoke_base/test_leaf_sends_completion_message.py` + +- [ ] **Step 1: Write compatibility assertion for summary-optional requests** + +Add to `tests/unit/smoke_base/test_leaf_sends_completion_message.py` inside `test_send_completion_message_posts_request`: + +```python +assert req.thread_summary is None +``` + +- [ ] **Step 2: Run test to verify current schema fails** + +Run: + +```bash +pytest tests/unit/smoke_base/test_leaf_sends_completion_message.py::test_send_completion_message_posts_request -q +``` + +Expected: FAIL because `CreateMessageRequest` has no `thread_summary` attribute. + +- [ ] **Step 3: Add nullable summary field to request/response schemas** + +In `communication_schemas.py`, update `CreateMessageRequest`: + +```python +class CreateMessageRequest(BaseModel): + run_id: UUID + from_agent_id: str = Field( + description="ID of the sending agent, e.g. '{run_id}:worker'", + ) + to_agent_id: str = Field( + description="ID of the receiving agent, e.g. '{run_id}:stakeholder'", + ) + thread_topic: str + thread_summary: str | None = Field( + default=None, + description="Optional human-readable summary set when the thread is first created.", + ) + content: str + task_execution_id: UUID | None = None +``` + +Also add `summary: str | None = None` to `ThreadSummary` and `ThreadWithMessages`. + +- [ ] **Step 4: Add persistence field** + +In `models.py`, add to `Thread`: + +```python +summary: str | None = None +``` + +- [ ] **Step 5: Add migration** + +Create an Alembic migration adding: + +```python +op.add_column("threads", sa.Column("summary", sqlmodel.sql.sqltypes.AutoString(), nullable=True)) +``` + +Downgrade removes the column. 
+ +- [ ] **Step 6: Run schema compatibility test** + +Run: + +```bash +pytest tests/unit/smoke_base/test_leaf_sends_completion_message.py::test_send_completion_message_posts_request -q +``` + +Expected: PASS. + +--- + +## Task 2: Persist Agent-Authored Thread Summary + +**Files:** +- Modify: `ergon_core/ergon_core/core/runtime/services/communication_service.py` +- Test: add or extend communication service unit test near existing service tests + +- [ ] **Step 1: Write failing service test** + +Create a test that calls `communication_service.save_message` with: + +```python +CreateMessageRequest( + run_id=run_id, + from_agent_id="leaf-l_1", + to_agent_id="parent", + thread_topic="smoke-completion", + thread_summary="Leaf workers report completion artifacts and probe exit status.", + content="l_1: done exit=0", + task_execution_id=execution_id, +) +``` + +Assert the persisted `Thread.summary` equals the provided summary. + +- [ ] **Step 2: Run test to verify it fails** + +Run: + +```bash +pytest tests/unit -q -k "communication and summary" +``` + +Expected: FAIL because `CommunicationService` does not persist summary. + +- [ ] **Step 3: Update thread creation/update semantics** + +In `CommunicationService.save_message`, pass `thread_summary=request.thread_summary` into `_get_or_create_thread`. 
+ +Update `_get_or_create_thread` signature: + +```python +def _get_or_create_thread( + session, + *, + run_id: UUID, + agent_a_id: str, + agent_b_id: str, + topic: str, + thread_summary: str | None = None, +) -> Thread: +``` + +When an existing thread is found: + +```python +if existing is not None: + if existing.summary is None and thread_summary: + existing.summary = thread_summary + session.add(existing) + return existing +``` + +When creating: + +```python +thread = Thread(run_id=run_id, topic=topic, agent_a_id=a, agent_b_id=b, summary=thread_summary) +``` + +- [ ] **Step 4: Include summary in service response DTOs** + +When building `RunCommunicationThreadDto`, `ThreadSummary`, and `ThreadWithMessages`, populate `summary=thread.summary`. + +- [ ] **Step 5: Run service tests** + +Run: + +```bash +pytest tests/unit -q -k "communication" +``` + +Expected: PASS. + +--- + +## Task 3: Populate Dashboard Thread DTOs and Task Anchors + +**Files:** +- Modify: `ergon_core/ergon_core/core/api/schemas.py` +- Modify: `ergon_core/ergon_core/core/api/runs.py` +- Test: `tests/unit/dashboard/test_event_contract_types.py` +- Test: add focused test for `_build_communication_threads` + +- [ ] **Step 1: Extend DTO contract test** + +Add to `test_event_contract_types.py`: + +```python +def test_thread_dto_exposes_summary_and_task_identity() -> None: + assert "summary" in RunCommunicationThreadDto.model_fields + assert "task_id" in RunCommunicationThreadDto.model_fields + assert "task_id" in RunCommunicationMessageDto.model_fields +``` + +- [ ] **Step 2: Run contract test to verify failure** + +Run: + +```bash +pytest tests/unit/dashboard/test_event_contract_types.py -q +``` + +Expected: FAIL until `summary` is added. 
+ +- [ ] **Step 3: Add summary field to API schema** + +In `RunCommunicationThreadDto`: + +```python +summary: str | None = None +``` + +- [ ] **Step 4: Populate message task IDs in snapshots** + +Update `_build_communication_threads` to accept `execution_task_map: dict[UUID, UUID]`. + +For each message: + +```python +task_id = execution_task_map.get(m.task_execution_id) if m.task_execution_id else None +``` + +Set: + +```python +task_id=str(task_id) if task_id else None +``` + +on `RunCommunicationMessageDto`. + +- [ ] **Step 5: Populate thread task ID** + +For each thread, collect message task IDs. If exactly one unique non-null task ID exists, set `RunCommunicationThreadDto.task_id` to that ID. Otherwise set it to `None`. + +- [ ] **Step 6: Pass execution map from run read service** + +In `run_read_service.py`, change: + +```python +threads=run_api_helpers._build_communication_threads(threads, thread_messages), +``` + +to: + +```python +threads=run_api_helpers._build_communication_threads( + threads, + thread_messages, + execution_task_map, +), +``` + +- [ ] **Step 7: Run backend contract tests** + +Run: + +```bash +pytest tests/unit/dashboard/test_event_contract_types.py -q +``` + +Expected: PASS. + +--- + +## Task 4: Carry Summary and Anchors Through Live Events + +**Files:** +- Modify: `ergon_core/ergon_core/core/runtime/services/communication_service.py` +- Modify: dashboard event contract tests if needed +- Test: add or extend dashboard event contract/unit test + +- [ ] **Step 1: Write failing live event assertion** + +In a communication service test, patch `dashboard_emitter.thread_message_created` and assert the emitted `thread.summary` equals the request `thread_summary`, and emitted `message.task_execution_id` equals request `task_execution_id`. 
+ +- [ ] **Step 2: Run test to verify failure** + +Run: + +```bash +pytest tests/unit -q -k "thread_message_created" +``` + +Expected: FAIL because live DTO currently omits summary and task execution identity in the emitted message. + +- [ ] **Step 3: Populate live DTO fields** + +In `CommunicationService.save_message`, set: + +```python +thread_dto = RunCommunicationThreadDto( + id=str(thread.id), + run_id=str(thread.run_id), + topic=thread.topic, + summary=thread.summary, + agent_a_id=thread.agent_a_id, + agent_b_id=thread.agent_b_id, + created_at=thread.created_at, + updated_at=thread.updated_at, + messages=[], +) +``` + +Set on `message_dto`: + +```python +task_execution_id=str(message.task_execution_id) if message.task_execution_id else None, +``` + +If task ID derivation is available in this service path, also set `task_id`. If not, leave `task_id=None` and rely on snapshot enrichment until the service has an execution lookup helper. + +- [ ] **Step 4: Run live event test** + +Run: + +```bash +pytest tests/unit -q -k "thread_message_created" +``` + +Expected: PASS. + +--- + +## Task 5: Update Frontend Contracts + +**Files:** +- Modify: `ergon-dashboard/src/generated/rest/contracts.ts` +- Modify: `ergon-dashboard/src/lib/contracts/rest.ts` +- Test: `ergon-dashboard/tests/contracts/contracts.test.ts` + +- [ ] **Step 1: Write frontend contract assertion** + +In `tests/contracts/contracts.test.ts`, assert a thread fixture can contain: + +```ts +summary: "Leaf workers report completion artifacts and probe exit status." +``` + +and parsing preserves `thread.summary`. + +- [ ] **Step 2: Run test to verify failure** + +Run: + +```bash +pnpm exec vitest run tests/contracts/contracts.test.ts +``` + +Expected: FAIL if generated contract does not include `summary`. + +- [ ] **Step 3: Update generated/rest contract** + +Add `summary: z.string().nullable().optional()` to `RunCommunicationThreadDto` schema if codegen is not run in this task. 
+ +- [ ] **Step 4: Normalize summary** + +Ensure `RunCommunicationThread` exposes: + +```ts +summary: string | null; +``` + +and normalization defaults missing `summary` to `null`. + +- [ ] **Step 5: Run frontend contract test** + +Run: + +```bash +pnpm exec vitest run tests/contracts/contracts.test.ts +``` + +Expected: PASS. + +--- + +## Task 6: Restyle Communication Panel as Thread List + Chat Trace + +**Files:** +- Modify: `ergon-dashboard/src/components/panels/CommunicationPanel.tsx` +- Test: `ergon-dashboard/tests/e2e/run.snapshot.spec.ts` + +- [ ] **Step 1: Add E2E assertions for clickable thread list** + +In `run.snapshot.spec.ts`, after opening communication tab, assert: + +```ts +await expect(page.getByTestId("communication-thread-list")).toBeVisible(); +await expect(page.getByTestId("communication-thread-card").first()).toContainText("smoke-completion"); +await page.getByTestId("communication-thread-card").first().click(); +await expect(page.getByTestId("communication-chat-trace")).toBeVisible(); +await expect(page.getByTestId("communication-chat-message").first()).toBeVisible(); +``` + +- [ ] **Step 2: Run E2E test to verify failure** + +Run: + +```bash +pnpm exec playwright test tests/e2e/run.snapshot.spec.ts --project=chromium -g "graph selection opens workspace evidence sections" +``` + +Expected: FAIL because the current panel has no thread-list/chat-trace test IDs. + +- [ ] **Step 3: Implement selected thread state** + +In `CommunicationPanel.tsx`, add: + +```ts +const [selectedThreadId, setSelectedThreadId] = useState(threads[0]?.id ?? null); +const selectedThread = threads.find((thread) => thread.id === selectedThreadId) ?? threads[0] ?? null; +``` + +Use `useEffect` to reset selection when `threads[0]?.id` changes. + +- [ ] **Step 4: Render thread list** + +For each thread, render a button card with: + +- topic +- `thread.summary ?? 
summarizeThread(thread)` +- message count +- created/updated time +- participant chips derived from `messages[].fromAgentId` plus `agentAId`/`agentBId` + +- [ ] **Step 5: Render WhatsApp-style chat trace** + +For selected thread, render messages sorted by `sequenceNum` with: + +- sender label from `fromAgentId` +- timestamp from `createdAt` +- content as wrapped text +- bubble alignment keyed by sender +- small metadata row: `#${sequenceNum}` and `taskId` when available + +- [ ] **Step 6: Keep empty state clear** + +If no threads are visible at time `t`, show: + +```tsx +No communication threads yet at this point in the run. +``` + +- [ ] **Step 7: Run E2E test** + +Run: + +```bash +pnpm exec playwright test tests/e2e/run.snapshot.spec.ts --project=chromium -g "graph selection opens workspace evidence sections" +``` + +Expected: PASS. + +--- + +## Task 7: Ensure Time-Step Filtering Reads Correctly + +**Files:** +- Modify: `ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.ts` if needed +- Test: `ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.test.ts` + +- [ ] **Step 1: Add test for visible-at-time thread summaries** + +Create a test where: + +- thread exists at 10:00 +- messages exist at 10:01 and 10:02 +- selected time is 10:01:30 + +Assert returned thread contains only the first message. + +- [ ] **Step 2: Run test** + +Run: + +```bash +pnpm exec tsx --test src/components/workspace/filterTaskEvidenceForTime.test.ts +``` + +Expected: PASS if current filtering already handles this. If it fails, patch only filtering logic. + +--- + +## Task 8: Final Verification + +**Files:** +- All touched files + +- [ ] **Step 1: Run backend unit tests** + +Run: + +```bash +pytest tests/unit/smoke_base/test_leaf_sends_completion_message.py tests/unit/dashboard/test_event_contract_types.py -q +``` + +Expected: PASS. 
+ +- [ ] **Step 2: Run frontend typecheck** + +Run from `ergon-dashboard`: + +```bash +pnpm exec tsc --noEmit +``` + +Expected: exit 0. + +- [ ] **Step 3: Run frontend E2E tests sequentially** + +Run from `ergon-dashboard`: + +```bash +pnpm exec playwright test tests/e2e/run.snapshot.spec.ts --project=chromium +pnpm exec playwright test tests/e2e/run.delta.spec.ts --project=chromium +``` + +Expected: both pass. Run sequentially to avoid shared dev-server port collisions. + +- [ ] **Step 4: Lint recently edited files** + +Use the IDE linter diagnostics for: + +- `CommunicationPanel.tsx` +- `filterTaskEvidenceForTime.ts` +- generated/normalized contract files +- backend communication service/schema files + +Expected: no new linter errors. + +--- + +## Self-Review + +- Spec coverage: covers agent-authored nullable thread summary, first-message creation path, backend summary/task anchoring, live event DTOs, frontend clickable thread list, chat trace, and time-step filtering. +- Placeholder scan: no `TBD`/`TODO` placeholders; migration revision filename remains intentionally parameterized because Alembic generates revision IDs. +- Type consistency: backend uses `thread_summary` for request input and `summary` for persisted/output thread metadata; frontend uses `summary`. diff --git a/docs/superpowers/plans/2026-04-26-core-test-logic-audit.md b/docs/superpowers/plans/2026-04-26-core-test-logic-audit.md new file mode 100644 index 00000000..efe7a06d --- /dev/null +++ b/docs/superpowers/plans/2026-04-26-core-test-logic-audit.md @@ -0,0 +1,1113 @@ +# Core Test Logic Audit Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Remove testing-specific logic that has crept into core runtime code, starting with sandbox sentinel handling, while preserving the non-null `sandbox_id` contract. + +**Architecture:** Core orchestration should depend on provider-owned lifecycle APIs, not on test/stub identities or sentinel parsing. Production sandbox setup requires E2B configuration and must fail loudly if E2B is unavailable; there is no core "no remote sandbox was provisioned" fallback. Test doubles such as `StubSandboxManager`, smoke workers, smoke fixtures, local sandbox implementations, and any placeholder sentinel IDs stay under `ergon_core.test_support` and are only wired through explicitly gated harness/bootstrap paths. + +**Tech Stack:** Python, FastAPI, Inngest, SQLModel, pytest, Playwright, Docker Compose. + +--- + +## Current Findings + +The immediate issue is not that placeholder sandbox IDs exist in tests. The issue is that runtime/core code knows too much about why a placeholder exists. Core should require real sandbox provisioning; test support can still provide sentinel-backed managers for unit/integration tests. + +Current leaks: + +- `ergon_core/ergon_core/core/providers/sandbox/manager.py` defines `StubSandboxManager` and `is_stub_sandbox_id`; the manager can continue to exist, but it belongs under `ergon_core.test_support`, not `core`. +- `ergon_core/ergon_core/core/runtime/inngest/execute_task.py` imports `StubSandboxManager` to mint skipped-task sandbox IDs. +- `ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py` imports `is_stub_sandbox_id` before terminating sandboxes. +- `ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py` imports `is_stub_sandbox_id` before terminating run-level sandboxes. +- `ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py` currently imports `is_stub_sandbox_id` for failed-task cleanup. 
+- `ergon_core/ergon_core/test_support/smoke_fixtures/*` is acceptable test-owned code, but core runtime must not import it. +- `ergon_core/ergon_core/core/api/test_harness.py` is acceptable only because it is explicitly mounted behind `ENABLE_TEST_HARNESS`; this plan adds guardrails so that pattern does not spread. + +## Audit Results + +Audit command run on 2026-04-26: + +```bash +rg "test_support|tests\.|smoke|fake|mock|stub|fixture|ENABLE_TEST|ENABLE_SMOKE|test_harness|StubSandboxManager|is_stub_sandbox_id|stub-sandbox" ergon_core/ergon_core/core +``` + +### Must Fix + +- `ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py` + - Imports `BaseSandboxManager` and `is_stub_sandbox_id`. + - Branches on `is_stub_sandbox_id` during failed-task sandbox cleanup. + - Classification: **test/provider sentinel knowledge leaked into runtime orchestration.** + - Fix: route through provider-owned `terminate_sandbox_by_id`. + +- `ergon_core/ergon_core/core/runtime/inngest/execute_task.py` + - Imports `StubSandboxManager`. + - Creates `stub_sandbox_id` for skipped tasks. + - Comments instruct downstream teardown to inspect `is_stub_sandbox_id`. + - Classification: **test double implementation leaked into task execution.** + - Fix: remove the skipped-task stub path from core. If skipped tasks must emit completion events, they should still have a real sandbox ID from normal setup or the event contract should be redesigned deliberately; do not mint provider placeholders in core. + +- `ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py` + - Imports and branches on `is_stub_sandbox_id`. + - Classification: **runtime teardown knows provider sentinel details.** + - Fix: call provider-owned lifecycle termination API. + +- `ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py` + - Imports and branches on `is_stub_sandbox_id`. + - Classification: **run cleanup knows provider sentinel details.** + - Fix: call provider-owned lifecycle termination API. 
+ +- `ergon_core/ergon_core/core/runtime/events/task_events.py` + - Comments describe `StubSandboxManager` and `is_stub_sandbox_id`. + - Classification: **contract docs encode the wrong abstraction.** + - Fix: document that production task execution uses real sandbox IDs. Test-support managers may emit sentinel IDs, but core consumers must not branch on them. + +- `ergon_core/ergon_core/core/providers/sandbox/manager.py` + - Defines `_STUB_SANDBOX_PREFIX`, `is_stub_sandbox_id`, and `StubSandboxManager`. + - `DefaultSandboxManager.create` delegates to `StubSandboxManager` when `E2B_API_KEY` is missing. + - Classification: **a test double is mixed into core, and core incorrectly treats missing E2B configuration as a recoverable execution mode.** + - Fix: move `StubSandboxManager` to `ergon_core.test_support`; remove the `DefaultSandboxManager` no-E2B fallback and let `BaseSandboxManager.create` fail loudly when `E2B_API_KEY` is absent. + +- `ergon_core/ergon_core/core/api/app.py` + - Imports `ergon_core.test_support.smoke_fixtures.register_smoke_fixtures` under `settings.smoke_fixtures_enabled`. + - Classification: **core app bootstrap imports test-support code.** The flag helps, but the dependency direction is still wrong. + - Fix: replace this with a generic startup-plugin mechanism, then configure the smoke fixture registration callable from local/CI environment. + +### Probably Fix / Rename + +- `ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py` + - Defaults `worker_slug` to `"stub-worker"` and `evaluator_slug` to `"stub-rubric"`. + - Classification: **not necessarily test logic, but the default names read as test doubles inside a production request contract.** + - Fix: make both fields required. The benchmark-run request contract should not invent worker/evaluator defaults. + +- `ergon_core/ergon_core/core/rl/eval_runner.py` + - Defaults `evaluator_type` to `"stub-rubric"` and uses `"stub-worker"` when no `model_base` is provided. 
+ - Classification: **RL/dev utility behavior may be legitimate, but the naming implies test doubles.** + - Fix: make evaluator/model inputs explicit. Do not default to stub worker or stub evaluator slugs. + +- `ergon_core/ergon_core/core/runtime/inngest/cleanup_cancelled_task.py` + - Comment says `release-sandbox — stub`. + - Classification: **stale implementation comment.** + - Fix: update to lifecycle-service language when cancelled-task cleanup is wired. + +### Allowed / No Code Change + +- `ergon_core/ergon_core/core/api/test_harness.py` + - Test-only router behind `ENABLE_TEST_HARNESS`. + - Classification: **allowed explicitly gated integration surface.** + - Constraint: may depend on test concepts, but should stay isolated to this file/package. + +- `ergon_core/ergon_core/core/settings.py` + - Defines `ENABLE_TEST_HARNESS`, `ENABLE_SMOKE_FIXTURES`, and `smoke_fixtures_enabled`. + - Classification: **allowed configuration surface for gated dev/test behavior.** + - Follow-up: `ENABLE_SMOKE_FIXTURES` should be replaced or backed by a generic startup-plugin setting when `core/api/app.py` is fixed. + +- `ergon_core/ergon_core/core/runtime/services/task_management_service.py` + - Comment says tests must seed `RunRecord` via factories/fixtures. + - Classification: **allowed test guidance in invariant documentation.** + - No runtime behavior depends on test code. + +- `ergon_core/ergon_core/core/runtime/errors/delegation_errors.py` + - Comment says missing fixtures in tests should fail loudly. + - Classification: **allowed explanatory comment.** + - No runtime behavior depends on test code. + +- `ergon_core/ergon_core/core/providers/sandbox/manager.py` + - ImportError fallback exception classes for missing E2B SDK. + - Classification: **allowed optional dependency shim, but rename/comment carefully to avoid "test stub" language.** + +- `ergon_core/ergon_core/core/persistence/graph/models.py` + - Comment includes `"canonical-smoke"` as an example worker slug. 
+ - Classification: **allowed example but should be refreshed if smoke naming changes.** + +Desired boundary: + +- Core runtime may say: "terminate the sandbox for this ID." +- Core runtime may not say: "skip because this ID is a stub." +- Core production sandbox creation must fail loudly if E2B is unavailable. +- Test-support managers may use sentinel IDs, but only test-support code should create or name them. +- The `sandbox_id` field should remain non-null for normal lifecycle events that require it. + +## File Structure + +Create: + +- `ergon_core/ergon_core/core/providers/sandbox/lifecycle.py` + - Owns sandbox lifecycle decisions by ID. + - Defines a termination result and termination service. + - Does not define or parse test sentinel IDs. + +- `ergon_core/ergon_core/test_support/sandbox/stub_manager.py` + - Contains `StubSandboxManager` as a test double for unit tests and harness-specific tests. + - Owns its sentinel prefix and any fake sandbox lifecycle bookkeeping. + - Must not be imported by `ergon_core.core`. + +- `tests/unit/sandbox/test_sandbox_lifecycle_service.py` + - Tests real-ID termination dispatch. + - Tests malformed or missing IDs are handled explicitly. + +- `tests/unit/architecture/test_no_test_logic_in_core.py` + - Regression guard that scans core runtime/provider modules for forbidden imports and terms. + - Allows explicitly approved files such as `core/api/test_harness.py` and `core/settings.py`. + +Modify: + +- `ergon_core/ergon_core/core/providers/sandbox/manager.py` + - Remove `is_stub_sandbox_id` from this file. + - Move `StubSandboxManager` to `ergon_core/ergon_core/test_support/sandbox/stub_manager.py`. + - Remove `DefaultSandboxManager.create`'s no-E2B fallback so production setup inherits the loud failure in `BaseSandboxManager.create`. + +- `ergon_core/ergon_core/core/providers/sandbox/__init__.py` + - Export lifecycle primitives that runtime code is allowed to use. 
+ +- `ergon_core/ergon_core/core/runtime/inngest/execute_task.py` + - Stop importing `StubSandboxManager`. + - Remove skipped-task placeholder minting from core. + +- `ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py` + - Replace `is_stub_sandbox_id` branching with the lifecycle service termination call. + +- `ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py` + - Replace `is_stub_sandbox_id` branching with the lifecycle service termination call. + +- `ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py` + - Replace failed-task cleanup branching with the lifecycle service termination call. + +- `ergon_core/ergon_core/core/runtime/events/task_events.py` + - Update comments that reference stub mode or `is_stub_sandbox_id`. + +- `ergon_core/ergon_core/core/api/app.py` + - Remove direct imports from `ergon_core.test_support`. + - Load optional startup hooks through a generic plugin setting. + +- `ergon_core/ergon_core/core/settings.py` + - Add a generic startup plugin setting. + - Keep test harness routing gated, but stop hardcoding smoke fixture registration in core app startup. + +- `tests/unit/runtime/test_failed_task_sandbox_cleanup.py` + - Update mocks to target the lifecycle service instead of `BaseSandboxManager` directly. 
+ +## Task 1: Add Provider-Owned Sandbox Lifecycle API + +**Files:** + +- Create: `ergon_core/ergon_core/core/providers/sandbox/lifecycle.py` +- Test: `tests/unit/sandbox/test_sandbox_lifecycle_service.py` + +- [ ] **Step 1: Write failing lifecycle service tests** + +Create `tests/unit/sandbox/test_sandbox_lifecycle_service.py`: + +```python +from unittest.mock import AsyncMock, patch + +import pytest + +from ergon_core.core.providers.sandbox.lifecycle import ( + SandboxTerminationReason, + terminate_sandbox_by_id, +) + + +@pytest.mark.asyncio +async def test_terminate_sandbox_by_id_dispatches_real_ids() -> None: + with patch( + "ergon_core.core.providers.sandbox.manager.BaseSandboxManager.terminate_by_sandbox_id", + new=AsyncMock(return_value=True), + ) as terminate: + result = await terminate_sandbox_by_id("sbx-live-123") + + terminate.assert_awaited_once_with("sbx-live-123") + assert result.terminated is True + assert result.reason == SandboxTerminationReason.TERMINATED + + +@pytest.mark.asyncio +async def test_terminate_sandbox_by_id_handles_missing_id_explicitly() -> None: + result = await terminate_sandbox_by_id(None) + + assert result.terminated is False + assert result.reason == SandboxTerminationReason.MISSING_ID + assert result.sandbox_id is None +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: + +```bash +PYTHONPATH="ergon_core:ergon_builtins" uv run pytest tests/unit/sandbox/test_sandbox_lifecycle_service.py -q +``` + +Expected: FAIL because `ergon_core.core.providers.sandbox.lifecycle` does not exist yet. + +- [ ] **Step 3: Add lifecycle service** + +Create `ergon_core/ergon_core/core/providers/sandbox/lifecycle.py`: + +```python +"""Provider-owned sandbox lifecycle helpers. + +Runtime orchestration code should not inspect sandbox ID sentinels. It should +delegate lifecycle operations here and let the provider layer terminate real +sandboxes. Test-support sentinel IDs are owned by test-support managers, not by +core runtime. 
+""" + +from __future__ import annotations + +import logging +from enum import StrEnum + +from pydantic import BaseModel + +logger = logging.getLogger(__name__) + + +class SandboxTerminationReason(StrEnum): + TERMINATED = "terminated" + NOT_FOUND_OR_ALREADY_CLOSED = "not_found_or_already_closed" + MISSING_ID = "missing_id" + ERROR = "error" + + +class SandboxTerminationResult(BaseModel): + sandbox_id: str | None + terminated: bool + reason: SandboxTerminationReason + + +async def terminate_sandbox_by_id(sandbox_id: str | None) -> SandboxTerminationResult: + """Terminate a sandbox ID behind one runtime-facing lifecycle boundary.""" + if sandbox_id is None: + return SandboxTerminationResult( + sandbox_id=None, + terminated=False, + reason=SandboxTerminationReason.MISSING_ID, + ) + + try: + from ergon_core.core.providers.sandbox.manager import BaseSandboxManager + + terminated = await BaseSandboxManager.terminate_by_sandbox_id(sandbox_id) + except Exception: # slopcop: ignore[no-broad-except] + logger.error("Failed to terminate sandbox %s", sandbox_id, exc_info=True) + return SandboxTerminationResult( + sandbox_id=sandbox_id, + terminated=False, + reason=SandboxTerminationReason.ERROR, + ) + + return SandboxTerminationResult( + sandbox_id=sandbox_id, + terminated=terminated, + reason=( + SandboxTerminationReason.TERMINATED + if terminated + else SandboxTerminationReason.NOT_FOUND_OR_ALREADY_CLOSED + ), + ) +``` + +- [ ] **Step 4: Run lifecycle tests** + +Run: + +```bash +PYTHONPATH="ergon_core:ergon_builtins" uv run pytest tests/unit/sandbox/test_sandbox_lifecycle_service.py -q +``` + +Expected: PASS. 
+ +## Task 2: Move Stub Sandbox Manager Out of Core + +**Files:** + +- Modify: `ergon_core/ergon_core/core/providers/sandbox/manager.py` +- Modify: `ergon_core/ergon_core/core/providers/sandbox/__init__.py` +- Create: `ergon_core/ergon_core/test_support/sandbox/__init__.py` +- Create: `ergon_core/ergon_core/test_support/sandbox/stub_manager.py` +- Test: `tests/unit/sandbox/test_sandbox_lifecycle_service.py` +- Test: `tests/unit/smoke_base/test_smoke_sandbox_manager.py` + +- [ ] **Step 1: Update exports** + +Modify `ergon_core/ergon_core/core/providers/sandbox/__init__.py` to export the lifecycle API: + +```python +from ergon_core.core.providers.sandbox.lifecycle import ( + SandboxTerminationReason, + SandboxTerminationResult, + terminate_sandbox_by_id, +) +``` + +Add these names to `__all__`: + +```python + "SandboxTerminationReason", + "SandboxTerminationResult", + "terminate_sandbox_by_id", +``` + +- [ ] **Step 2: Move `StubSandboxManager` to test support** + +Create `ergon_core/ergon_core/test_support/sandbox/__init__.py`: + +```python +"""Test-support sandbox doubles.""" + +from ergon_core.test_support.sandbox.stub_manager import StubSandboxManager + +__all__ = ["StubSandboxManager"] +``` + +Create `ergon_core/ergon_core/test_support/sandbox/stub_manager.py`: + +```python +"""Sandbox manager test double. + +This class exists for unit tests and test harnesses that need a concrete +manager object without provisioning E2B. Production/core code must not import +this module. +""" + +from __future__ import annotations + +import logging +from uuid import UUID + +from ergon_core.core.providers.sandbox.manager import AsyncSandbox, BaseSandboxManager + +logger = logging.getLogger(__name__) + + +class _StubSandbox: + def __init__(self, sandbox_id: str) -> None: + self.sandbox_id = sandbox_id + + async def kill(self) -> None: + return None + + +class StubSandboxManager(BaseSandboxManager): + """No-op sandbox manager for tests. 
+ + ``create`` returns a test-owned sentinel ID. Production/core code must not + create or inspect this ID format. + """ + + _PREFIX = "stub-sandbox-" + + async def create( + self, + sandbox_key: UUID, + run_id: UUID, + timeout_minutes: int = 30, + envs: dict[str, str] | None = None, + display_task_id: UUID | None = None, + ) -> str: + sandbox_id = f"{self._PREFIX}{sandbox_key}" + logger.info( + "Returning test stub sandbox id %s for task %s", + sandbox_id, + sandbox_key, + ) + self._ensure_registries(sandbox_key) + self._sandboxes[sandbox_key] = _StubSandbox(sandbox_id) # type: ignore[assignment] + self._run_ids[sandbox_key] = run_id + self._display_task_ids[sandbox_key] = display_task_id or sandbox_key + self._sandbox_manager_classes[sandbox_key] = type(self) + return sandbox_id + + async def _install_dependencies(self, sandbox: AsyncSandbox, task_id: UUID) -> None: + return None + + async def terminate(self, task_id: UUID, reason: str = "completed") -> None: + self._file_registries.pop(task_id, None) + self._created_files_registry.pop(task_id, None) + self._run_ids.pop(task_id, None) + self._display_task_ids.pop(task_id, None) + + async def reset_timeout(self, task_id: UUID, timeout_minutes: int = 30) -> bool: + return True +``` + +Then remove these symbols from `ergon_core/ergon_core/core/providers/sandbox/manager.py`: + +```python +_STUB_SANDBOX_PREFIX = "stub-sandbox-" + + +def is_stub_sandbox_id(sandbox_id: JsonValue) -> bool: + ... + + +class StubSandboxManager(BaseSandboxManager): + ... +``` + +- [ ] **Step 3: Remove core no-E2B fallback** + +Do not add any `DefaultSandboxManager.create` fallback. In `ergon_core/ergon_core/core/providers/sandbox/manager.py`, either delete the `DefaultSandboxManager.create` override entirely or reduce `DefaultSandboxManager` to dependency hooks only: + +```python +class DefaultSandboxManager(BaseSandboxManager): + """No custom dependencies. 
Used by benchmarks without specific sandbox setup.""" + + async def _install_dependencies(self, sandbox: AsyncSandbox, task_id: UUID) -> None: + pass +``` + +This intentionally preserves `BaseSandboxManager.create`'s existing loud failure when `E2B_API_KEY` is missing. + +- [ ] **Step 4: Run sandbox tests** + +Run: + +```bash +PYTHONPATH="ergon_core:ergon_builtins" uv run pytest tests/unit/sandbox/test_sandbox_lifecycle_service.py tests/unit/smoke_base/test_smoke_sandbox_manager.py -q +``` + +Expected: PASS. + +## Task 3: Remove Skipped-Task Placeholder Minting from Task Execution + +**Files:** + +- Modify: `ergon_core/ergon_core/core/runtime/inngest/execute_task.py` +- Test: `tests/unit/runtime/test_child_function_payloads.py` +- Test: `tests/unit/runtime/test_worker_execute_output_failure.py` + +- [ ] **Step 1: Remove skipped-task stub manager import** + +In `ergon_core/ergon_core/core/runtime/inngest/execute_task.py`, replace: + +```python +from ergon_core.core.providers.sandbox.manager import StubSandboxManager +``` + +with no sandbox-manager import. `execute_task.py` should not import `StubSandboxManager`, `make_noop_sandbox_id`, or any test-support sandbox module. + +Then replace the skipped-task block: + +```python +if prepared.skipped: + logger.info( + "task-execute skipped task_id=%s reason=%s", + payload.task_id, + prepared.skip_reason, + ) + stub_sandbox_id = await StubSandboxManager().create( + prepared.node_id, + run_id=payload.run_id, + display_task_id=prepared.node_id, + ) + await _emit_task_completed(payload, prepared, stub_sandbox_id) + return TaskExecuteResult( + run_id=payload.run_id, + task_id=payload.task_id, + execution_id=prepared.execution_id, + success=True, + skipped=True, + skip_reason=prepared.skip_reason, + ) +``` + +with: + +```python +if prepared.skipped: + raise ContractViolationError( + "Skipped task execution cannot emit task/completed without a real sandbox_id. 
"
+            "Introduce a first-class task/skipped event before supporting skipped tasks."
+        )
+```
+
+Rationale: production has no "no remote sandbox was provisioned" path. If skipped tasks become a real product feature, they need their own explicit event/propagation contract instead of fake sandbox IDs. Confirm `ContractViolationError` is imported in `execute_task.py`; add the import if it is not already present, otherwise the new raise site fails with `NameError`.
+
+- [ ] **Step 2: Add a regression test for skipped-task contract failure**
+
+Add a focused unit test for `execute_task_fn`'s skipped branch if there is an existing task-execution test harness. If no focused harness exists, add this behavior to the architecture guard:
+
+```python
+def test_core_task_execution_does_not_mint_placeholder_sandbox_ids() -> None:
+    path = CORE / "runtime" / "inngest" / "execute_task.py"
+    text = path.read_text()
+
+    assert "StubSandboxManager" not in text
+    assert "make_noop_sandbox_id" not in text
+    assert "stub_sandbox_id" not in text
+```
+
+- [ ] **Step 3: Compile task execution module**
+
+Run:
+
+```bash
+PYTHONPATH="ergon_core:ergon_builtins" uv run python -m py_compile ergon_core/ergon_core/core/runtime/inngest/execute_task.py
+```
+
+Expected: no output.
+
+- [ ] **Step 4: Run targeted runtime tests**
+
+Run:
+
+```bash
+PYTHONPATH="ergon_core:ergon_builtins" uv run pytest tests/unit/runtime/test_child_function_payloads.py tests/unit/runtime/test_worker_execute_output_failure.py -q
+```
+
+Expected: PASS. 
+ +## Task 4: Route All Runtime Teardown Through Lifecycle Service + +**Files:** + +- Modify: `ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py` +- Modify: `ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py` +- Modify: `ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py` +- Modify: `tests/unit/runtime/test_failed_task_sandbox_cleanup.py` + +- [ ] **Step 1: Update failed-task cleanup test** + +Change `tests/unit/runtime/test_failed_task_sandbox_cleanup.py` to patch the provider lifecycle API: + +```python +from unittest.mock import AsyncMock, patch + +import pytest + +from ergon_core.core.providers.sandbox.lifecycle import ( + SandboxTerminationReason, + SandboxTerminationResult, +) +from ergon_core.core.runtime.inngest.propagate_execution import _terminate_failed_task_sandbox + + +@pytest.mark.asyncio +async def test_failed_task_sandbox_cleanup_delegates_to_lifecycle_service() -> None: + result = SandboxTerminationResult( + sandbox_id="sbx-real", + terminated=True, + reason=SandboxTerminationReason.TERMINATED, + ) + with patch( + "ergon_core.core.runtime.inngest.propagate_execution.terminate_sandbox_by_id", + new=AsyncMock(return_value=result), + ) as terminate: + await _terminate_failed_task_sandbox("sbx-real") + + terminate.assert_awaited_once_with("sbx-real") +``` + +- [ ] **Step 2: Update failed-task cleanup implementation** + +In `ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py`, replace: + +```python +from ergon_core.core.providers.sandbox.manager import BaseSandboxManager, is_stub_sandbox_id +``` + +with: + +```python +from ergon_core.core.providers.sandbox.lifecycle import terminate_sandbox_by_id +``` + +Replace `_terminate_failed_task_sandbox` with: + +```python +async def _terminate_failed_task_sandbox(sandbox_id: str | None) -> None: + result = await terminate_sandbox_by_id(sandbox_id) + if not result.terminated: + logger.info( + "failed-task sandbox cleanup did not terminate sandbox_id=%s 
reason=%s", + result.sandbox_id, + result.reason, + ) +``` + +- [ ] **Step 3: Update evaluator cleanup** + +In `ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py`, replace imports of `BaseSandboxManager` and `is_stub_sandbox_id` with: + +```python +from ergon_core.core.providers.sandbox.lifecycle import terminate_sandbox_by_id +``` + +Replace `_terminate_sandbox` with: + +```python +async def _terminate_sandbox(sandbox_id: str) -> None: + """Terminate the task's sandbox through the provider lifecycle boundary.""" + result = await terminate_sandbox_by_id(sandbox_id) + logger.info( + "Evaluator sandbox cleanup sandbox_id=%s terminated=%s reason=%s", + result.sandbox_id, + result.terminated, + result.reason, + ) +``` + +- [ ] **Step 4: Update run cleanup** + +In `ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py`, replace: + +```python +from ergon_core.core.providers.sandbox.manager import ( + BaseSandboxManager, + is_stub_sandbox_id, +) +``` + +with: + +```python +from ergon_core.core.providers.sandbox.lifecycle import terminate_sandbox_by_id +``` + +Replace the branch over `sandbox_id` with: + +```python +sandbox_result = await terminate_sandbox_by_id( + sandbox_id if isinstance(sandbox_id, str) else None +) +sandbox_terminated = sandbox_result.terminated + +if sandbox_id is not None and not isinstance(sandbox_id, str): + logger.warning( + "run-cleanup run_id=%s: sandbox_id has unexpected type %s, skipping termination", + run_id, + type(sandbox_id).__name__, + ) +``` + +- [ ] **Step 5: Run targeted teardown tests** + +Run: + +```bash +PYTHONPATH="ergon_core:ergon_builtins" uv run pytest tests/unit/runtime/test_failed_task_sandbox_cleanup.py tests/unit/sandbox/test_sandbox_lifecycle_service.py -q +``` + +Expected: PASS. 
+ +## Task 5: Remove Test-Support Imports from App Bootstrap + +**Files:** + +- Modify: `ergon_core/ergon_core/core/api/app.py` +- Modify: `ergon_core/ergon_core/core/settings.py` +- Test: `tests/unit/test_test_harness.py` +- Test: `tests/unit/architecture/test_no_test_logic_in_core.py` + +- [ ] **Step 1: Add a generic startup-plugin setting** + +In `ergon_core/ergon_core/core/settings.py`, add a string setting that can hold comma-separated import specs: + +```python +startup_plugin_specs: str = Field( + default="", + validation_alias=AliasChoices("ERGON_STARTUP_PLUGINS"), +) +``` + +Add a helper property: + +```python +@property +def startup_plugins(self) -> tuple[str, ...]: + return tuple( + spec.strip() + for spec in self.startup_plugin_specs.split(",") + if spec.strip() + ) +``` + +Keep `enable_test_harness` for mounting the harness router. Treat `enable_smoke_fixtures` as compatibility only until callers are moved to `ERGON_STARTUP_PLUGINS`. + +- [ ] **Step 2: Add an app-local plugin loader** + +In `ergon_core/ergon_core/core/api/app.py`, add: + +```python +from importlib import import_module + + +def _run_startup_plugins(plugin_specs: tuple[str, ...]) -> None: + for spec in plugin_specs: + module_name, sep, attr_name = spec.partition(":") + if not sep or not module_name or not attr_name: + raise RuntimeError( + "Invalid ERGON_STARTUP_PLUGINS entry " + f"{spec!r}; expected 'module:function'" + ) + module = import_module(module_name) + plugin = getattr(module, attr_name) + plugin() +``` + +Then replace: + +```python +if settings.smoke_fixtures_enabled: + from ergon_core.test_support.smoke_fixtures import register_smoke_fixtures + + register_smoke_fixtures() +``` + +with: + +```python +_run_startup_plugins(settings.startup_plugins) +``` + +- [ ] **Step 3: Preserve local/CI smoke fixture registration through configuration** + +Update local and CI smoke environment setup to use: + +```bash 
+ERGON_STARTUP_PLUGINS=ergon_core.test_support.smoke_fixtures:register_smoke_fixtures +``` + +Candidate files to inspect and update: + +- `docker-compose.yml` +- `.github/workflows/e2e-benchmarks.yml` +- `scripts/smoke_local_up.sh` +- `scripts/smoke_local_run.sh` + +Do not keep a direct import of `ergon_core.test_support` in `core/api/app.py`. + +- [ ] **Step 4: Add tests for startup plugin loading** + +In `tests/unit/test_test_harness.py`, add a focused test for `_run_startup_plugins` using a standard-library callable that is safe to call, or a tiny in-test module fixture if one already exists. If a direct callable test is awkward, test invalid config instead: + +```python +import pytest + +from ergon_core.core.api.app import _run_startup_plugins + + +def test_startup_plugin_loader_rejects_invalid_specs() -> None: + with pytest.raises(RuntimeError, match="expected 'module:function'"): + _run_startup_plugins(("ergon_core.test_support.smoke_fixtures",)) +``` + +- [ ] **Step 5: Run app/test-harness tests** + +Run: + +```bash +PYTHONPATH="ergon_core:ergon_builtins" uv run pytest tests/unit/test_test_harness.py -q +``` + +Expected: PASS. 
+
+## Task 6: Add Architecture Guard for Test Logic in Core
+
+**Files:**
+
+- Create: `tests/unit/architecture/test_no_test_logic_in_core.py`
+
+- [ ] **Step 1: Write architecture guard**
+
+Create `tests/unit/architecture/test_no_test_logic_in_core.py`:
+
+```python
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[3]
+CORE = ROOT / "ergon_core" / "ergon_core" / "core"
+
+ALLOWED_FILES = {
+    CORE / "api" / "test_harness.py",
+    CORE / "settings.py",
+}
+
+FORBIDDEN_IMPORT_SNIPPETS = (
+    "ergon_core.test_support",
+    "from tests.", "import tests.",  # bare "tests." would false-positive on e.g. "requests."
+)
+
+FORBIDDEN_CORE_TEST_DOUBLE_TERMS = (
+    "StubSandboxManager",
+    "is_stub_sandbox_id",
+    "stub-sandbox-",
+)
+
+
+def _core_python_files() -> list[Path]:
+    return [
+        path
+        for path in CORE.rglob("*.py")
+        if path not in ALLOWED_FILES and "__pycache__" not in path.parts
+    ]
+
+
+def test_core_does_not_import_test_support_or_tests() -> None:
+    offenders: list[str] = []
+    for path in _core_python_files():
+        text = path.read_text()
+        for snippet in FORBIDDEN_IMPORT_SNIPPETS:
+            if snippet in text:
+                offenders.append(f"{path.relative_to(ROOT)} contains {snippet!r}")
+
+    assert offenders == []
+
+
+def test_core_does_not_define_or_branch_on_stub_sandbox_terms() -> None:
+    offenders: list[str] = []
+    for path in _core_python_files():
+        text = path.read_text()
+        for term in FORBIDDEN_CORE_TEST_DOUBLE_TERMS:
+            if term in text:
+                offenders.append(f"{path.relative_to(ROOT)} contains {term!r}")
+
+    assert offenders == []
+```
+
+- [ ] **Step 2: Run architecture guard**
+
+Run:
+
+```bash
+PYTHONPATH="ergon_core:ergon_builtins" uv run pytest tests/unit/architecture/test_no_test_logic_in_core.py -q
+```
+
+Expected: PASS after Tasks 1-4. If it fails, move the offending logic behind provider/test-support boundaries instead of weakening the guard. 
+ +## Task 7: Clean Comments and Event Contract Language + +**Files:** + +- Modify: `ergon_core/ergon_core/core/runtime/events/task_events.py` +- Modify: `ergon_core/ergon_core/core/runtime/inngest/execute_task.py` +- Modify: `ergon_core/ergon_core/core/providers/sandbox/manager.py` + +- [ ] **Step 1: Remove "stub" and no-sandbox fallback terminology from core comments** + +Replace comments that describe no-E2B behavior as "stub mode" or "provider no-op sandbox ID" with loud production setup language. + +In `task_events.py`, replace the existing stub comment with: + +```python +# Production task execution emits real sandbox IDs. Test-support managers may +# use sentinel IDs, but core event consumers must not parse or branch on those +# sentinel formats. +``` + +- [ ] **Step 2: Search for remaining core stub terms** + +Run: + +```bash +rg "StubSandboxManager|is_stub_sandbox_id|stub-sandbox|stub mode|stub sandbox" ergon_core/ergon_core/core +``` + +Expected: no matches. If matches remain in core runtime/provider code, rewrite them or move the implementation to `test_support`. + +## Task 8: Run Focused Verification + +**Files:** + +- No source edits. + +- [ ] **Step 1: Compile touched Python files** + +Run: + +```bash +PYTHONPATH="ergon_core:ergon_builtins" uv run python -m py_compile \ + ergon_core/ergon_core/core/providers/sandbox/lifecycle.py \ + ergon_core/ergon_core/core/providers/sandbox/manager.py \ + ergon_core/ergon_core/core/runtime/inngest/execute_task.py \ + ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py \ + ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py \ + ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py +``` + +Expected: no output. 
+ +- [ ] **Step 2: Run unit tests** + +Run: + +```bash +PYTHONPATH="ergon_core:ergon_builtins" uv run pytest \ + tests/unit/sandbox/test_sandbox_lifecycle_service.py \ + tests/unit/runtime/test_failed_task_sandbox_cleanup.py \ + tests/unit/runtime/test_worker_execute_output_failure.py \ + tests/unit/smoke_base/test_smoke_sandbox_manager.py \ + tests/unit/architecture/test_no_test_logic_in_core.py \ + -q +``` + +Expected: PASS. + +- [ ] **Step 3: Run local canonical smoke e2es** + +Run: + +```bash +ERGON_DATABASE_URL=postgresql://ergon:ergon_dev@localhost:5433/ergon \ +INNGEST_API_BASE_URL=http://localhost:8289 \ +INNGEST_DEV=1 \ +INNGEST_EVENT_KEY=dev \ +ERGON_API_BASE_URL=http://127.0.0.1:9000 \ +PLAYWRIGHT_BASE_URL=http://127.0.0.1:3001 \ +ENABLE_TEST_HARNESS=1 \ +TEST_HARNESS_SECRET=local-dev \ +SCREENSHOT_DIR=/tmp/playwright \ +SMOKE_COHORT_SIZE=1 \ +PYTHONPATH="ergon_core:ergon_builtins" \ +uv run pytest tests/e2e/test_researchrubrics_smoke.py tests/e2e/test_minif2f_smoke.py tests/e2e/test_swebench_smoke.py -q -s --timeout=300 --tb=short +``` + +Expected: all three benchmark smoke tests pass. The sad-path shape should remain: + +- `l_2` fails. +- `l_3` is blocked and never starts. +- Independent leaves complete. +- Run status is `FAILED`. +- Sandbox lifecycle events are symmetric for created/closed sandbox IDs. + +## Task 9: Close Remaining Audit Findings + +**Files:** + +- Modify: `ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py` +- Modify: `ergon_core/ergon_core/core/rl/eval_runner.py` +- Modify: `ergon_core/ergon_core/core/runtime/inngest/cleanup_cancelled_task.py` +- Update: `tests/unit/architecture/test_no_test_logic_in_core.py` + +- [ ] **Step 1: Re-run the audit search and compare against the inventory** + +Run: + +```bash +rg "test_support|tests\\.|smoke|fake|mock|stub|fixture|ENABLE_TEST|ENABLE_SMOKE" ergon_core/ergon_core/core +``` + +Expected remaining matches should be limited to the `Allowed / No Code Change` section above. 
Any match from `Must Fix` should be gone. + +- [ ] **Step 2: Fix request-contract defaults that read as test doubles** + +In `ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py`, make worker/evaluator slugs explicit instead of defaulting to test-looking values: + +```python +class BenchmarkRunRequest(InngestEventContract): + """CLI sends this to request a full benchmark run.""" + + name: ClassVar[str] = "benchmark/run-request" + + benchmark_slug: str + model: str + worker_slug: str + evaluator_slug: str + cohort_name: str = "" # slopcop: ignore[no-str-empty-default] +``` + +Then update call sites/tests that construct `BenchmarkRunRequest` without `worker_slug` or `evaluator_slug` so they pass concrete slugs. + +- [ ] **Step 3: Require explicit RL eval runner inputs** + +In `ergon_core/ergon_core/core/rl/eval_runner.py`, replace `"stub-rubric"` defaults with required evaluator arguments: + +```python +async def watch_and_evaluate( + checkpoint_dir: str, + benchmark_type: str, + *, + evaluator_type: str, + model_base: str, + poll_interval_s: int = 60, + eval_limit: int | None = None, + on_checkpoint_cmd: str | None = None, + external_cmd_timeout_s: int = 600, +) -> None: +``` + +For `_run_local_eval`, make `model_base` required: + +```python +async def _run_local_eval( + ckpt: CheckpointInfo, + *, + benchmark_type: str, + evaluator_type: str, + model_base: str, + eval_limit: int | None, +) -> int: +``` + +Then replace: + +```python +model_target = f"vllm:{ckpt.path}" if model_base else "stub-worker" +``` + +with: + +```python +model_target = f"vllm:{ckpt.path}" +``` + +Apply the same required `evaluator_type` and `model_base` signatures to `evaluate_checkpoint`. Callers must pass concrete values. + +- [ ] **Step 4: Update cancelled-task cleanup comment** + +In `ergon_core/ergon_core/core/runtime/inngest/cleanup_cancelled_task.py`, replace: + +```python +2. release-sandbox — stub (pending sandbox management module) +``` + +with: + +```python +2. 
release-sandbox — routed through the sandbox lifecycle provider when an + execution has an associated sandbox. +``` + +- [ ] **Step 5: Keep the allowed list narrow** + +Do not add broad exemptions to `tests/unit/architecture/test_no_test_logic_in_core.py`. The allowed files should remain: + +```python +ALLOWED_FILES = { + CORE / "api" / "test_harness.py", + CORE / "settings.py", +} +``` + +If the architecture guard catches a new file, fix the dependency direction instead of adding the file to `ALLOWED_FILES`. + +- [ ] **Step 6: Fix any new offenders with the same pattern** + +For each offender: + +1. Move test-owned implementation into `ergon_core/ergon_core/test_support`. +2. Leave only a production abstraction in `ergon_core/ergon_core/core`. +3. Wire the test implementation from an explicitly gated bootstrap path. +4. Add the offender term to `tests/unit/architecture/test_no_test_logic_in_core.py` if it should never recur. + +- [ ] **Step 7: Re-run architecture guard** + +Run: + +```bash +PYTHONPATH="ergon_core:ergon_builtins" uv run pytest tests/unit/architecture/test_no_test_logic_in_core.py -q +``` + +Expected: PASS. + +## Self-Review + +- Spec coverage: covers the selected sandbox stub leak and widens scope to an architectural audit for test logic in core. +- Placeholder scan: no unresolved implementation placeholders are used as plan content; the audit task gives concrete classification and remediation rules. +- Type consistency: lifecycle names are consistent across tasks: `terminate_sandbox_by_id`, `SandboxTerminationResult`, and `SandboxTerminationReason`. +- Scope check: intentionally scoped to test-logic leakage in core, with sandbox lifecycle as the first concrete refactor and a guardrail test to prevent recurrence. 
+ diff --git a/docs/superpowers/plans/2026-04-26-mas-navigation-cli.md b/docs/superpowers/plans/2026-04-26-mas-navigation-cli.md new file mode 100644 index 00000000..33d93d9c --- /dev/null +++ b/docs/superpowers/plans/2026-04-26-mas-navigation-cli.md @@ -0,0 +1,1439 @@ +# Workflow Agent CLI Implementation Plan + +> **Superseded:** this single-file plan has been split into [`workflow-agent-cli/`](workflow-agent-cli/). Start at [`workflow-agent-cli/README.md`](workflow-agent-cli/README.md). Keep this file only as historical context while reviewers migrate. + +> **For agentic workers:** REQUIRED SUB-SKILL: Use `superpowers:subagent-driven-development` (recommended) or `superpowers:executing-plans` to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build `ergon workflow ...`, an agent-local command surface for the benchmark-task worker to navigate its current workflow topology/resources and explicitly materialize useful visible resources into its workspace without exposing raw SQL. + +**Architecture:** Put scoped Postgres queries and resource materialization policy in `ergon_core`, command parsing in `ergon_cli`, and the agent-callable wrapper in `ergon_builtins/tools`. This is not a general operator/debugging CLI: it is invoked by the benchmark agent, runs in the local worker/API process, reads local Postgres directly via `get_session()`, and uses the existing sandbox manager to copy approved resources into the current E2B workspace. + +**Tech Stack:** Python, argparse, SQLModel, existing `get_session()` / `ensure_db()`, pydantic-ai `Tool`, pytest. + +--- + +## Package Placement + +There is no `arcane_builtins` package in this workspace. 
The right homes are: + +- Core scoped read logic: `ergon_core/ergon_core/core/runtime/services/workflow_navigation_service.py` +- Core materialization policy: `ergon_core/ergon_core/core/runtime/services/workflow_resource_materialization_service.py` +- DTOs: `ergon_core/ergon_core/core/runtime/services/workflow_navigation_dto.py` +- CLI parser and handlers: `ergon_cli/ergon_cli/main.py` and `ergon_cli/ergon_cli/commands/workflow.py` +- Agent wrapper: `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py` +- Optional worker wiring: initially a target ReAct/research worker, not all workers by default + +Keep `ergon_builtins/ergon_builtins/tools/graph_toolkit.py` intact for now. It is resource-discovery-specific and research-named; `workflow` should be more general and command-shaped. + +## Resource Ownership and Copy Semantics + +Resources are immutable published artifacts. A task may read a visible resource from another task in the same run, including a resource from a different branch of the control DAG, but it must never mutate the source row or source bytes. + +The workflow CLI must treat copying as a fork: + +- **Read is context.** Reading `resource-content` does not change graph state. +- **Materialize is fork.** Copying a resource into the current agent workspace creates a new `RunResource` row owned by the current task execution. +- **Publish is ownership.** If the current task edits the copied file and publishes it later, the edited artifact is another new resource owned by the current task execution. +- **Lineage is evidence.** The copied resource row records `copied_from_resource_id=`, and later edited outputs should preserve provenance in metadata where practical. +- **Control edges schedule work; resource lineage explains information flow.** Materializing a resource from a divergent DAG branch must not add a control dependency edge. 
+ +Example: + +```text +task_a publishes: + resource_id=res_a + task_execution_id=task_a_execution + name="paper.pdf" + content_hash=abc + +task_b materializes res_a: + resource_id=res_b_copy + task_execution_id=task_b_execution + name="paper (copy).pdf" + content_hash=abc + copied_from_resource_id=res_a + metadata.sandbox_destination="/workspace/imported/task-a/paper (copy).pdf" + +task_b edits and publishes: + resource_id=res_b_edited + task_execution_id=task_b_execution + name="paper_annotated.pdf" + content_hash=def + metadata.derived_from_resource_ids=["res_a", "res_b_copy"] +``` + +If task A later republishes a newer `paper.pdf`, task B's copy remains pinned to the old `resource_id` and `content_hash`. Rerun/staleness logic can use resource lineage to flag B as potentially stale, but A never mutates B's copy. + +## Code Write Locations + +Review this section before implementation. These are the proposed new and modified files. + +### New Core Runtime Files + +- Create `ergon_core/ergon_core/core/runtime/services/workflow_navigation_dto.py` + - Owns Pydantic DTOs returned by the workflow inspection service. + - Intended types: `WorkflowTaskRef`, `WorkflowExecutionRef`, `WorkflowResourceRef`, `WorkflowDependencyRef`, `WorkflowBlockerRef`, `WorkflowNextActionRef`, `WorkflowMaterializedResourceRef`. + +- Create `ergon_core/ergon_core/core/runtime/services/workflow_navigation_service.py` + - Owns scoped Postgres reads for the current run. + - Implements task listing, task tree traversal, dependency inspection, resource visibility, task blockers, next actions, and resource content reads. + - Must not query evaluation tables. + +- Create `ergon_core/ergon_core/core/runtime/services/workflow_resource_materialization_service.py` + - Owns the policy-checked "copy visible resource into my current E2B workspace" operation. + - Reads source `RunResource` metadata/bytes from local Postgres/blob store. + - Creates a current-task-owned copied `RunResource` row with a new ID. 
+ - Writes the bytes to the current task sandbox under a controlled workspace path. + - Records provenance via `copied_from_resource_id` and metadata. + +### New Migration File + +- Create `ergon_core/migrations/versions/_add_copied_from_resource_id.py` + - Adds nullable `run_resources.copied_from_resource_id`. + - Adds a self-referential foreign key to `run_resources.id`. + - Adds an index for lineage queries. + +### Modified Persistence Files + +- Modify `ergon_core/ergon_core/core/persistence/telemetry/models.py` + - Adds `RunResourceKind.IMPORT`. + - Adds nullable `RunResource.copied_from_resource_id`. + +- Modify `ergon_core/ergon_core/core/persistence/queries.py` + - Extends `ResourcesQueries.append(...)` to accept `copied_from_resource_id`. + - Adds small lineage read helpers if needed by workflow inspection/tests. + +### New CLI Files + +- Create `ergon_cli/ergon_cli/commands/workflow.py` + - Owns all `ergon workflow inspect ...` and `ergon workflow manage ...` command handlers. + - Uses `WorkflowNavigationService` for reads. + - Uses `WorkflowResourceMaterializationService` for `manage materialize-resource`. + - Uses existing graph/task management services for mutations. + - Handles text/JSON rendering, `--explain`, output caps, and `--dry-run`. + +### Modified CLI Files + +- Modify `ergon_cli/ergon_cli/main.py` + - Imports `handle_workflow`. + - Registers the top-level `workflow` command. + - Registers nested `inspect` and `manage` subcommands. + - Dispatches `args.command == "workflow"` to `handle_workflow(args)`. + +### New Builtins Tool File + +- Create `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py` + - Provides the local pydantic-ai `workflow(command=...)` tool. + - Injects `WorkerContext.run_id` and `WorkerContext.node_id`. + - Rejects user-supplied run/experiment/cohort scope arguments. + - Calls the CLI in-process rather than spawning a shell command. + - Enforces leaf vs manager command permissions. 
+ +### New Proof-of-Concept Worker File + +- Create `ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py` + - Defines `ResearchRubricsWorkflowCliReActWorker`. + - Reuses the research-rubrics ReAct/toolkit behavior, but adds the local `workflow(command=...)` tool. + - Uses worker slug `researchrubrics-workflow-cli-react`. + - Adds prompt guidance that tells agents to start with `inspect task-tree`, `inspect task-workspace`, `inspect next-actions`, or `inspect resource-list --scope input`. + +### Modified Registry File + +- Modify `ergon_builtins/ergon_builtins/registry_data.py` + - Imports `ResearchRubricsWorkflowCliReActWorker`. + - Registers `"researchrubrics-workflow-cli-react": ResearchRubricsWorkflowCliReActWorker`. + - Leaves the existing `"researchrubrics-researcher"` worker unchanged. + +### New Tests + +- Create `tests/unit/runtime/test_workflow_navigation_service.py` + - Tests core service behavior: current-run reads, immediate-upstream resource semantics, task tree traversal, blockers, next actions, and cross-run resource rejection. + +- Create `tests/unit/runtime/test_workflow_resource_materialization_service.py` + - Tests materialize-resource semantics: same-run visibility, new copied resource ID, copied name, `copied_from_resource_id`, controlled destination path, import manifest, collision handling, and no mutation of the source resource. + +- Create `tests/unit/cli/test_workflow_cli.py` + - Tests parser/handler behavior for `inspect` and `manage` commands. + - Tests text and JSON output. + - Tests invalid UUIDs, duplicate slugs, and mutation `--dry-run`. + +- Create `tests/unit/state/test_workflow_cli_tool.py` + - Tests the pydantic-ai wrapper. + - Verifies scope injection, denial of user-supplied `--run-id`, denial of `inspect resource-list --scope run`, leaf vs manager permissions, multiline rejection, and structured failures. 
+ +- Create `tests/unit/runtime/test_workflow_input_resource_semantics.py` + - Tests the canonical input-resource policy on a diamond and line graph. + - Ensures a task sees only immediate predecessor resources by default, not transitive ancestors. + +### Modified Existing Tests + +- Modify `tests/unit/state/test_research_rubrics_workers.py` + - Adds coverage for `ResearchRubricsWorkflowCliReActWorker`. + - Asserts the new worker exposes the `workflow` tool. + - Asserts the existing `ResearchRubricsResearcherWorker` behavior remains unchanged. + +- Modify `tests/real_llm/benchmarks/test_researchrubrics.py` + - Adds environment overrides or a dedicated rollout path for `researchrubrics-workflow-cli-react`. + - Supports running the final workflow-CLI acceptance rollout with `--limit 5`. + - Persists enough rollout artifacts to inspect whether and how the agent used `workflow(...)`. + +### Files Explicitly Not Planned For V1 + +- Do not expose eval tables. +- Do not modify dashboard UI files. +- Do not modify `ergon_builtins/ergon_builtins/tools/graph_toolkit.py`; keep it as the existing research/resource toolkit. +- Do not add E2B-side CLI execution or require E2B-to-localhost networking. + +## Command Surface + +Build two explicit command groups: + +- `workflow inspect ...`: read-only task topology, dependency, resource, and workspace inspection. +- `workflow manage ...`: state-changing task/dependency management commands that wrap the existing task lifecycle services. + +The command surface is intended for the currently executing benchmark-task agent. It reads Postgres directly through scoped services, but the model never receives a SQL console and never selects another run. 
+ +Every command is scoped by injected runtime context: + +- `run_id`: injected from `WorkerContext.run_id` +- `node_id`: injected from `WorkerContext.node_id` +- `execution_id`: available to the wrapper from `WorkerContext.execution_id` if needed later + +For direct developer testing, `ergon workflow ...` may still accept explicit `--run-id` and `--node-id`, but that is test/debug plumbing. The agent wrapper strips or rejects user-supplied scope arguments and supplies the real values. + +Keep mutation commands out of `inspect`. If a command changes graph state, it belongs under `manage` and should return an explicit mutation result. + +All commands should support `--format text|json` (default `text`) and `--explain` (adds a short explanation of the scope/policy being applied). Commands that can produce large output should also support caps: `--limit`, `--max-chars`, or `--max-bytes` as appropriate. + +All `manage ...` commands must support `--dry-run`. Dry-run resolves slugs to node IDs, checks visibility and service preconditions, and prints the mutation that would happen without writing to Postgres or emitting events. + +## Inspect Commands + +### `workflow inspect task-list` + +Lists task nodes in a run. + +Examples: + +```bash +workflow("inspect task-list") +workflow("inspect task-list --children") +workflow("inspect task-list --level 2") +workflow("inspect task-list --under d_root --level 3") +workflow("inspect task-list --format json") +``` + +Behavior: + +- Reads `RunGraphNode` rows for the run. +- `--children` restricts to direct children of the current node. +- `--level N` returns tasks at absolute graph level `N`. +- `--under TASK_SLUG_OR_NODE_ID --level N` returns tasks at relative level `N` inside that subtree. +- `--status STATUS` filters by node status. +- Table columns: `node_id_short`, `task_slug`, `status`, `level`, `parent_node_id_short`, `worker`. +- JSON output returns full IDs and the same fields. 
+- Does not read evaluations or context events. + +### `workflow inspect task-tree` + +Shows a recursive task subtree, grouped by level. This should be the default orientation command for agents because it behaves like a mixture of `cd` and `ls`, but with a self-describing name. + +Examples: + +```bash +workflow("inspect task-tree") +workflow("inspect task-tree --from current --depth 2") +workflow("inspect task-tree --from root --depth 3") +workflow("inspect task-tree --from d_root --level 2") +workflow("inspect task-tree --from d_root --status pending") +``` + +Behavior: + +- `--from current` starts at the current node; this is the default. +- `--from root` starts at the root node for the current run. +- `--from TASK_SLUG_OR_NODE_ID` starts at that node. +- `--depth N` prints descendants up to `N` levels below the start node. +- `--level N` prints only descendants at relative level `N` below the start node. +- Uses `parent_node_id` containment, not dependency edges. + +Example output: + +```text +TREE from=parent depth=2 + +level +0 + parent running node=9af1c2aa worker=researchrubrics-smoke-worker + +level +1 + d_root completed node=8a31c4f2 worker=researchrubrics-smoke-leaf + l_1 completed node=0c72d1aa worker=researchrubrics-smoke-leaf + s_a completed node=f71e5a10 worker=researchrubrics-smoke-leaf + +level +2 + d_left completed node=19de72aa parent=d_root worker=researchrubrics-smoke-leaf + d_right completed node=34b7a901 parent=d_root worker=researchrubrics-smoke-leaf + l_2 pending node=aa10e821 parent=l_1 worker=researchrubrics-smoke-leaf +``` + +For "show every task on this level recursive down", the agent uses: + +```bash +workflow("inspect task-tree --from current --level 2") +``` + +### `workflow inspect task-details` + +Shows one task node plus latest execution summary. 
+ +Examples: + +```bash +workflow("inspect task-details") +workflow("inspect task-details --task-slug d_left --include-output") +workflow("inspect task-details --task-slug d_left --format json") +``` + +Behavior: + +- With no selector, shows the current node. +- With `--task-slug`, resolves exactly one node in the current run. +- If `--task-slug` matches multiple nodes in a run, exits non-zero and prints the matching node IDs. +- Includes latest `RunTaskExecution` status, attempt number, timestamps, and resource count. +- `--include-output` includes a truncated `final_assistant_message`, default max 1200 chars, configurable with `--max-chars`. +- Does not expose evaluation feedback. + +### `workflow inspect task-dependencies` + +Shows graph dependencies for a task. + +Examples: + +```bash +workflow("inspect task-dependencies") +workflow("inspect task-dependencies --task-slug d_join --direction upstream") +workflow("inspect task-dependencies --task-slug d_left --direction downstream --format json") +``` + +Behavior: + +- Reads `RunGraphEdge`. +- `--direction upstream` lists incoming edges: source task -> current task. +- `--direction downstream` lists outgoing edges: current task -> target task. +- `--direction both` lists both. +- Table columns: `direction`, `edge_status`, `source_slug`, `source_status`, `target_slug`, `target_status`, `edge_id_short`. + +### `workflow inspect task-blockers` + +Explains why a task is not ready, not completed, or cannot proceed. + +Examples: + +```bash +workflow("inspect task-blockers") +workflow("inspect task-blockers --task-slug d_join") +workflow("inspect task-blockers --task-slug l_3 --format json") +``` + +Behavior: + +- Defaults to current node. +- Reports unsatisfied upstream dependencies, failed upstream dependencies, blocked/cancelled status, running children, and missing input resources if inferable. +- Does not mutate anything. +- Includes suggested next inspection commands. 
+ +Example output: + +```text +Task blockers: d_join + +Readiness: + blocked: yes + reason: waiting_for_upstream + +Upstream dependencies: + d_left completed edge=satisfied resources=2 + d_right running edge=pending resources=0 + +Next useful commands: + workflow("inspect task-details --task-slug d_right") + workflow("inspect resource-list --scope input") +``` + +### `workflow inspect next-actions` + +Gives the agent a concise recovery/orientation summary for the current visible run scope. + +Examples: + +```bash +workflow("inspect next-actions") +workflow("inspect next-actions --include-completed") +``` + +Behavior: + +- Lists ready, pending, blocked, failed, and cancelled tasks visible to the current agent. +- Suggests concrete commands to inspect or manage the highest-priority items. +- For leaf agents, suggestions include only `inspect ...` commands. +- For manager-capable agents, suggestions may include `manage ... --dry-run` commands. + +Example output: + +```text +Next actions + +Blocked: + l_3 blocked because l_2 failed + inspect: workflow("inspect task-details --task-slug l_2 --include-output") + manager dry-run: workflow("manage restart-task --task-slug l_2 --dry-run") + +Ready: + d_join has all upstream inputs satisfied + inspect: workflow("inspect task-workspace --task-slug d_join") + +Input resources: + current task has 4 input resources + inspect: workflow("inspect resource-list --scope input") +``` + +### `workflow inspect resource-list` + +Lists visible resources. 
+ +Examples: + +```bash +workflow("inspect resource-list --scope input") +workflow("inspect resource-list --scope upstream") +workflow("inspect resource-list --scope children") +workflow("inspect resource-list --scope descendants --max-depth 3") +workflow("inspect resource-list --scope visible --limit 20") +workflow("inspect resource-list --scope own --kind report") +workflow("inspect resource-list --scope input --format json") +``` + +Scopes: + +- `input`: resources produced by latest successful executions of immediate upstream dependency nodes. This is the default for task-scoped agents. +- `upstream`: same as `input` for v1; kept as a readable alias. +- `own`: resources produced by the current node's latest execution. +- `children`: resources produced by direct child task executions. +- `descendants`: resources produced by descendants up to `--max-depth`, default 3. +- `visible`: same-run resources the current profile is allowed to see, including resources from divergent DAG branches. This is needed for opportunistic collaboration, but it must still exclude eval/private/system resources and be capped by `--limit`. +- `run`: do not expose in v1. Even current-run-wide raw resources are broader than a benchmark-task agent needs by default. + +Table columns: `resource_id_short`, `kind`, `name`, `task_slug`, `size_bytes`, `mime_type`, `created_at`, `content_hash_short`. + +### `workflow inspect resource-content` + +Reads resource content from the blob path stored in `RunResource.file_path`. + +Examples: + +```bash +workflow("inspect resource-content --resource-id $RESOURCE_ID") +workflow("inspect resource-content --resource-id $RESOURCE_ID --max-bytes 20000") +``` + +Behavior: + +- Verifies the resource belongs to the injected current `run_id`. +- Verifies resource ID is visible under the active scope policy before reading bytes. +- Reads bytes from `RunResource.file_path`. +- If bytes decode as UTF-8, prints text. 
+- If not UTF-8, prints a short binary summary and exits 0 unless `--raw` is supplied later. +- Caps output with `--max-bytes`, default 64 KiB. + +### `workflow inspect resource-location` + +Returns metadata and local blob path for a resource without dumping content. + +Example: + +```bash +workflow("inspect resource-location --resource-id $RESOURCE_ID") +``` + +Behavior: + +- Useful for humans and tests. +- Agent wrapper may hide `file_path` if path leakage becomes a concern; v1 can expose it because it is a host blob path and already part of resource metadata. + +### `workflow inspect task-workspace` + +Shows the full task workspace snapshot: task, execution, upstream dependencies, downstream dependents, input resources, own resources, children, and suggested next commands. + +Examples: + +```bash +workflow("inspect task-workspace") +workflow("inspect task-workspace --task-slug d_join") +workflow("inspect task-workspace --task-slug d_join --include-output") +``` + +Behavior: + +- Defaults to current node. +- Uses only current-run data. +- No evaluation rows. +- Output should be compact and sectioned so the agent can orient in one call. + +## Manage Commands + +`manage` means "state-changing", not necessarily "manager-only". + +- Graph lifecycle commands require a manager-capable wrapper profile: create/restart/abandon/update task graph state. +- `manage materialize-resource` is a current-task workspace/import operation and should be available to ordinary task agents when the source resource is visible under policy. + +### `workflow manage create-task` + +Adds one dynamic subtask under the current node by wrapping `TaskManagementService.add_subtask`. 
+ +Examples: + +```bash +workflow("manage create-task --task-slug summarize_left --worker researchrubrics-smoke-leaf --description 'Summarize left branch'") +workflow("manage create-task --task-slug join --worker researchrubrics-smoke-leaf --description 'Join summaries' --depends-on summarize_left --depends-on summarize_right") +workflow("manage create-task --task-slug join --worker researchrubrics-smoke-leaf --description 'Join summaries' --depends-on summarize_left --dry-run") +``` + +Behavior: + +- Parent is always the current node unless a privileged manager profile later allows `--parent`. +- Creates a `RunGraphNode` with `parent_node_id=current_node_id`. +- Creates dependency edges for `--depends-on` slugs/node IDs. +- Returns created node ID, slug, and status. +- With `--dry-run`, validates parent, dependency references, and worker slug, then prints the proposed node and edges without writing. + +### `workflow manage create-task-plan` + +Adds multiple dynamic subtasks in one transaction by wrapping `TaskManagementService.plan_subtasks`. + +Examples: + +```bash +workflow("manage create-task-plan --json '[{\"task_slug\":\"a\",\"description\":\"Do A\",\"assigned_worker_slug\":\"researchrubrics-smoke-leaf\"},{\"task_slug\":\"b\",\"description\":\"Do B\",\"assigned_worker_slug\":\"researchrubrics-smoke-leaf\",\"depends_on\":[\"a\"]}]'") +workflow("manage create-task-plan --json '[{\"task_slug\":\"a\",\"description\":\"Do A\",\"assigned_worker_slug\":\"researchrubrics-smoke-leaf\"}]' --dry-run") +``` + +Behavior: + +- This is the safest way to add a local DAG. +- Rejects cycles/duplicates through existing service validation. +- Returns created nodes and root slugs. +- With `--dry-run`, runs the same validation and returns the normalized plan without inserting nodes. + +### `workflow manage create-dependency` + +Adds a dependency edge between two existing sibling/visible tasks. 
+ +Examples: + +```bash +workflow("manage create-dependency --source summarize_left --target join") +workflow("manage create-dependency --source summarize_left --target join --dry-run") +``` + +Behavior: + +- Uses `WorkflowGraphRepository.add_edge`. +- Source and target must resolve inside the current run and be visible to the current agent profile. +- Fails if the edge would create a cycle. +- New edge status starts as `pending`. +- With `--dry-run`, resolves source/target and checks cycle risk without adding the edge. + +### `workflow manage restart-task` + +Resets a terminal task back to pending by wrapping `TaskManagementService.restart_task`. + +Examples: + +```bash +workflow("manage restart-task --task-slug l_2") +workflow("manage restart-task --node-id aa10e821-...") +workflow("manage restart-task --task-slug l_2 --dry-run") +``` + +Behavior: + +- Only terminal tasks can be reset. +- Existing service handles downstream invalidation/reset behavior. +- Returns old status and invalidated downstream node IDs. +- With `--dry-run`, reports whether the task is restartable and which downstream nodes would be invalidated. + +### `workflow manage abandon-task` + +Abandons/cancels a task by wrapping `TaskManagementService.cancel_task`. + +Examples: + +```bash +workflow("manage abandon-task --task-slug l_3") +workflow("manage abandon-task --node-id 98db73a2-...") +workflow("manage abandon-task --task-slug l_3 --dry-run") +``` + +Behavior: + +- Transitions the target to cancelled when allowed. +- Emits the existing cancellation event through the service path. +- Returns old status and cascade count. +- With `--dry-run`, reports whether cancellation is allowed and the descendant cascade count without writing or emitting events. + +### `workflow manage update-task-description` + +Updates a non-running task description by wrapping `TaskManagementService.refine_task`. 
+
+Examples:
+
+```bash
+workflow("manage update-task-description --task-slug l_3 --description 'Retry with the corrected input file'")
+workflow("manage update-task-description --task-slug l_3 --description 'Retry with the corrected input file' --dry-run")
+```
+
+Behavior:
+
+- Fails on running tasks.
+- Returns old and new description.
+- With `--dry-run`, validates mutability and prints the old/new description without writing.
+
+### `workflow manage materialize-resource`
+
+Copies one immutable, visible `RunResource` from local Postgres/blob storage into the current agent's E2B workspace and records the copy as a new current-task-owned resource.
+
+This is a fork operation, not a mutation of the source task's artifact.
+
+Examples:
+
+```bash
+workflow("manage materialize-resource --resource-id $RESOURCE_ID")
+workflow("manage materialize-resource --resource-id $RESOURCE_ID --destination imported/task-a/paper.pdf")
+workflow("manage materialize-resource --resource-id $RESOURCE_ID --destination imported/task-a/paper.pdf --dry-run")
+```
+
+Behavior:
+
+- Resolves `--resource-id` to a source `RunResource` in the current run.
+- Requires the source resource to be visible to the current agent profile.
+- Rejects evaluation/private/system resources and cross-run resource IDs.
+- Reads bytes from the source resource's content-addressed `file_path`.
+- Writes bytes into the current E2B sandbox under `/workspace/`.
+- If `--destination` is omitted, uses a collision-safe default like `/workspace/imported/<source-task-slug>/<source-resource-name>`.
+- Rejects absolute destinations, `..`, symlink escapes, and paths outside `/workspace`.
+- Creates a new `RunResource` row owned by the current task execution:
+  - new `id`
+  - `task_execution_id=current_execution_id`
+  - `kind=import`
+  - `name="<source name> (copy)"` unless an explicit destination name is provided
+  - same `file_path` and `content_hash` as the source resource
+  - `copied_from_resource_id=<source resource id>`
+  - metadata containing source task/node identifiers, source name/hash, sandbox destination, and materialized timestamp
+- Updates `/workspace/.ergon/resource_imports.json` in the sandbox with the source resource ID, copied resource ID, content hash, and destination path so future tools/debuggers can reconstruct the local workspace import history.
+- Does not add a control edge to the DAG.
+- With `--dry-run`, validates source visibility, destination normalization, collision behavior, sandbox target, and the copied resource name without writing to Postgres or E2B.
+
+Ordering:
+
+- Validate source visibility and destination before side effects.
+- Write bytes to the sandbox destination first.
+- Append the copied `RunResource` row only after the sandbox write succeeds.
+- Update the import manifest after the copied row exists so it can include the new copied resource ID.
+- If the manifest update fails after the file copy/resource row succeeds, return a structured warning rather than pretending the source was not materialized.
+
+V1 lineage boundary:
+
+- The materialized copy row has strong lineage via `copied_from_resource_id`.
+- Arbitrary later edits are B-owned outputs. They must never mutate A's row.
+- Do not pretend to infer every arbitrary transformation automatically in v1. If the edited output is later published, it is enough that the run has the materialization row, the import manifest, and the tool-call context events to explain how B got the source bytes. A richer many-to-many `run_resource_lineage` table can come later if synthesis from multiple copied resources becomes central.
+ +Output: + +```text +materialized resource + source: res_a paper.pdf sha256:abc... + copy: res_b paper (copy).pdf kind=import + copied_from_resource_id: res_a + sandbox_path: /workspace/imported/task-a/paper (copy).pdf + note: source resource was not modified +``` + +JSON output should include at least: + +```json +{ + "source_resource_id": "res_a", + "copied_resource_id": "res_b", + "copied_from_resource_id": "res_a", + "source_content_hash": "abc", + "copied_content_hash": "abc", + "sandbox_path": "/workspace/imported/task-a/paper (copy).pdf", + "source_mutated": false +} +``` + +## Deferred Commands + +Do not build these in v1: + +- `ergon workflow messages send`: useful workflow primitive, but not needed for the first read/navigation surface. +- Arbitrary `manage remove-dependency`: higher risk than `create-dependency` because it can unexpectedly unblock or strand work; add after mutation auditing is clearer. + +## Agent Invocation Model + +The agent runs locally in the worker process. The environment runs in E2B. Therefore this command surface should read local Postgres directly, then let existing sandbox tools handle E2B environment actions. Do not put this command inside the E2B sandbox. + +The model must not pass, override, or discover `--run-id`; the wrapper injects the current `WorkerContext.run_id`. The model also must not inspect other runs in the same experiment, cohort, benchmark, or different experiments. + +### Single Tool Path + +Create `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py` with: + +```python +def make_workflow_cli_tool( + *, + context: WorkerContext, + allowed_scopes: frozenset[str] = frozenset({"input", "own", "upstream", "children", "descendants", "visible"}), + manager_capable: bool = False, +) -> Callable[..., Awaitable[WorkflowCliToolResponse]]: + ... 
+```
+
+The pydantic-ai tool signature should be:
+
+```python
+async def workflow(command: str, timeout_s: int = 10) -> WorkflowCliToolResponse:
+    """Run a scoped workflow navigation command, for example:
+    `inspect task-tree`, `inspect task-details --task-slug d_left`,
+    `inspect task-dependencies --task-slug d_join`,
+    `inspect resource-list --scope input`, `inspect resource-content --resource-id ...`,
+    `manage materialize-resource --resource-id ...`,
+    or manager-profile commands like `manage restart-task --task-slug l_2`.
+    `run_id`, current `node_id`, current `execution_id`, and sandbox context
+    are injected automatically.
+    """
+```
+
+Important behavior:
+
+- The model passes only the subcommand string, e.g. `inspect resource-list --scope input`.
+- The wrapper prepends `workflow` and injects `--run-id <run_id>`, `--node-id <node_id>`, `--execution-id <execution_id>`, `--sandbox-id <sandbox_id>`, and `--sandbox-task-key <sandbox_task_key>` when the command supports them.
+- The wrapper rejects any user-supplied `--run-id`, `--node-id`, `--execution-id`, `--sandbox-id`, `--sandbox-task-key`, `--definition-id`, `--experiment-id`, or `--cohort-id` argument in v1. Current-run/current-task scope is an invariant, not a prompt instruction.
+- The wrapper calls the CLI handler in-process via `ergon_cli.main._main(argv)` with stdout/stderr captured, not via subprocess.
+- The wrapper rejects disallowed tokens in v1: shell metacharacters are irrelevant for in-process argv parsing, but still reject newlines and commands starting with `run`, `eval`, `doctor`, etc.
+- The wrapper does not allow `inspect resource-list --scope run`.
+
+### Bash Path
+
+For local agents, the "bash" path should mean local command execution, not the current E2B sandbox bash tool. The existing `bash_sandbox_tool.py` runs inside E2B and cannot see local Postgres.
+
+Two options:
+
+- Preferred v1: use the single `workflow(command=...)` tool.
+- Later: add a local bash-like tool that only whitelists `ergon workflow ...`, but do not mix it with E2B bash. + +System prompt guidance for ReAct agents should say: + +```text +Use the `workflow` tool to inspect the workflow run graph and resources. +Start with `inspect task-tree`, `inspect task-workspace`, `inspect next-actions`, or `inspect resource-list --scope input`. +Do not assume transitive dependencies are inputs; use `inspect task-dependencies` if unsure. +Use `manage materialize-resource --resource-id ... --dry-run` before importing a useful resource into your workspace. +Before mutating the task graph, run the same graph-lifecycle `manage ...` command with `--dry-run`. +``` + +## Permissions Model + +V1 policy is simple and explicit: + +- Agent wrapper: automatically scoped to one `run_id` and current `node_id`; the model cannot choose a run. +- Direct CLI invocation with explicit IDs exists only for developer tests/debugging, not as a model-facing capability. +- Agent wrapper default read scopes: `input`, `own`, `upstream`, `children`, `descendants`, `visible`. +- Agent wrapper same-run collaboration scope: `visible`, capped by `--limit`, for resources from divergent branches that are useful context but not control dependencies. +- Agent wrapper denied scope: `run`. +- Agent wrapper denied cross-run scope: other runs from the same experiment, cohort, benchmark, or other experiments. +- Graph mutation commands require a manager-capable profile. Manager agents may get `manage create-task`, `manage create-task-plan`, `manage create-dependency`, `manage restart-task`, `manage abandon-task`, and `manage update-task-description`. +- Resource materialization is allowed for leaf and manager agents, but only for visible same-run resources and only into the current task sandbox/workspace. +- Manager-capable agents should be instructed to use `--dry-run` before every non-trivial mutation. V1 enforces support for dry-run but does not require a confirm token. 
+- No eval tables are queried by any command. +- No raw SQL is accepted. + +Later profiles can be layered on top: + +- `leaf`: input, own, upstream summaries, and capped same-run visible resource discovery/materialization. +- `manager`: children, descendants, task lifecycle mutations. +- `evaluator`: target task outputs only. +- `cohort_observer` / `meta_analyst`: cross-run summaries without raw resource content and without evaluation leakage, only if explicitly assigned. + +Future safety layer: + +- `--confirm-token` for destructive mutations. A dry run would produce a short token that must be echoed back on the real command. Do not build this in v1 unless mutation behavior proves too risky in tests. + +## Implementation File Plan + +This is the master file/folder plan for implementation. + +### Added Files + +```text +ergon_core/ + migrations/ + versions/ + _add_copied_from_resource_id.py + ergon_core/ + core/ + runtime/ + services/ + workflow_navigation_dto.py + workflow_navigation_service.py + workflow_resource_materialization_service.py + +ergon_cli/ + ergon_cli/ + commands/ + workflow.py + +ergon_builtins/ + ergon_builtins/ + tools/ + workflow_cli_tool.py + workers/ + research_rubrics/ + workflow_cli_react_worker.py + +tests/ + unit/ + runtime/ + test_workflow_navigation_service.py + test_workflow_input_resource_semantics.py + test_workflow_resource_materialization_service.py + cli/ + test_workflow_cli.py + state/ + test_workflow_cli_tool.py +``` + +### Edited Files + +```text +ergon_cli/ + ergon_cli/ + main.py + +ergon_core/ + ergon_core/ + core/ + persistence/ + telemetry/ + models.py + queries.py + +ergon_builtins/ + ergon_builtins/ + registry_data.py + +tests/ + unit/ + state/ + test_research_rubrics_workers.py + real_llm/ + benchmarks/ + test_researchrubrics.py +``` + +`ergon_builtins/ergon_builtins/workers/research_rubrics/researcher_worker.py` is intentionally not edited for the POC path. 
+ +### Deleted Files + +```text +(none) +``` + +### Optional Edited Files If We Include Automatic Sandbox Input Threading Now + +```text +ergon_core/ + ergon_core/ + api/ + worker_context.py + core/ + runtime/ + inngest/ + execute_task.py + services/ + orchestration_dto.py + task_execution_service.py +``` + +These optional edits are only for automatic input materialization at sandbox creation time. They are separate from `workflow manage materialize-resource`, which is an explicit on-demand copy/fork operation initiated by the current agent. + +## Task 0: Add Resource Copy Lineage Schema + +**Files:** + +- Modify: `ergon_core/ergon_core/core/persistence/telemetry/models.py` +- Modify: `ergon_core/ergon_core/core/persistence/queries.py` +- Create: `ergon_core/migrations/versions/_add_copied_from_resource_id.py` + +- [ ] **Step 1: Add failing persistence tests** + +Add focused tests that create a source `RunResource`, append a copied resource, and assert: + +- copied row has a new `id` +- copied row keeps the same `content_hash` and `file_path` +- copied row has `kind="import"` +- copied row has `copied_from_resource_id=` +- source row is unchanged + +- [ ] **Step 2: Update `RunResourceKind` and `RunResource`** + +File path: `ergon_core/ergon_core/core/persistence/telemetry/models.py` + +```python +class RunResourceKind(StrEnum): + OUTPUT = "output" + REPORT = "report" + ARTIFACT = "artifact" + SEARCH_CACHE = "search_cache" + NOTE = "note" + IMPORT = "import" + """Copied snapshot materialized from another RunResource into a task workspace.""" + +class RunResource(SQLModel, table=True): + # ... 
+ copied_from_resource_id: UUID | None = Field( + default=None, + foreign_key="run_resources.id", + index=True, + ) +``` + +- [ ] **Step 3: Update resource append helper** + +File path: `ergon_core/ergon_core/core/persistence/queries.py` + +Extend `ResourcesQueries.append(...)` with: + +```python +copied_from_resource_id: UUID | None = None +``` + +and pass it to `RunResource(...)`. + +- [ ] **Step 4: Add migration** + +File path: `ergon_core/migrations/versions/_add_copied_from_resource_id.py` + +Migration behavior: + +- `upgrade()` adds nullable `copied_from_resource_id` UUID column to `run_resources` +- creates a self-referential foreign key to `run_resources.id` +- creates an index on `run_resources.copied_from_resource_id` +- `downgrade()` drops index, foreign key, and column + +Run: + +```bash +pytest tests/unit/runtime/test_workflow_resource_materialization_service.py -v +``` + +Expected: schema/helper tests pass once the materialization service is implemented. + +## Task 1: Add Workflow Navigation DTOs and Service + +**Files:** + +- Create: `ergon_core/ergon_core/core/runtime/services/workflow_navigation_dto.py` +- Create: `ergon_core/ergon_core/core/runtime/services/workflow_navigation_service.py` +- Create: `ergon_core/ergon_core/core/runtime/services/workflow_resource_materialization_service.py` +- Test: `tests/unit/runtime/test_workflow_navigation_service.py` +- Test: `tests/unit/runtime/test_workflow_resource_materialization_service.py` + +- [ ] **Step 1: Write failing service tests** + +Add tests for: + +- `list_tasks(run_id)` returns all run nodes ordered by level/task_slug. +- `list_tasks(run_id, parent_node_id=...)` returns direct children only. +- `list_deps(..., direction="upstream")` returns incoming edges with source/target summaries. +- `list_resources(..., scope="input")` returns resources from latest completed executions of immediate upstream nodes only. 
+- `list_resources(..., scope="visible")` can include same-run resources from divergent branches while still excluding cross-run/eval/private resources. +- `get_resource_content` rejects a resource from a different run. +- `get_task_blockers(...)` reports pending upstream dependencies and failed upstream dependencies. +- `get_next_actions(...)` suggests inspect-only commands for leaf profiles and dry-run manage commands for manager profiles. +- `materialize_resource(...)` creates a new current-task-owned `kind=import` resource row with a new ID, copied name, same hash/blob path, and `copied_from_resource_id`. +- `materialize_resource(...)` writes or updates `/workspace/.ergon/resource_imports.json` in the sandbox with source/copy/destination metadata. +- `materialize_resource(..., dry_run=True)` validates source/destination without writing to Postgres or E2B. +- `materialize_resource(...)` rejects cross-run resources, invisible resources, absolute destinations, `..`, and destination collisions unless overwrite/versioning behavior is explicit. + +Use a tiny graph fixture: + +```text +a -> b -> c +x +``` + +Give `a`, `b`, and `x` one completed execution each; give `a` and `b` one resource each. Assert `c` input resources include only `b` resource, not `a`. 
+ +- [ ] **Step 2: Implement DTOs** + +Define frozen Pydantic models: + +File path: `ergon_core/ergon_core/core/runtime/services/workflow_navigation_dto.py` + +```python +class WorkflowTaskRef(BaseModel): + model_config = {"frozen": True} + node_id: UUID + task_slug: str + status: str + level: int + parent_node_id: UUID | None = None + assigned_worker_slug: str | None = None + +class WorkflowExecutionRef(BaseModel): + model_config = {"frozen": True} + execution_id: UUID + status: str + attempt_number: int + final_assistant_message: str | None = None + +class WorkflowResourceRef(BaseModel): + model_config = {"frozen": True} + resource_id: UUID + run_id: UUID + task_execution_id: UUID | None + node_id: UUID | None + task_slug: str | None + kind: str + name: str + mime_type: str + size_bytes: int + file_path: str + content_hash: str | None = None + copied_from_resource_id: UUID | None = None + created_at: datetime + +class WorkflowDependencyRef(BaseModel): + model_config = {"frozen": True} + edge_id: UUID + edge_status: str + source: WorkflowTaskRef + target: WorkflowTaskRef + +class WorkflowBlockerRef(BaseModel): + model_config = {"frozen": True} + task: WorkflowTaskRef + reason: str + details: list[str] = Field(default_factory=list) + suggested_commands: list[str] = Field(default_factory=list) + +class WorkflowNextActionRef(BaseModel): + model_config = {"frozen": True} + priority: str + task: WorkflowTaskRef | None = None + summary: str + suggested_commands: list[str] = Field(default_factory=list) + +class WorkflowMaterializedResourceRef(BaseModel): + model_config = {"frozen": True} + source_resource_id: UUID + copied_resource_id: UUID | None + copied_from_resource_id: UUID + source_name: str + copied_name: str + source_content_hash: str | None + copied_content_hash: str | None + sandbox_path: str + dry_run: bool = False + source_mutated: bool = False +``` + +- [ ] **Step 3: Implement read service** + +Implement `WorkflowNavigationService` methods: + +File path: 
`ergon_core/ergon_core/core/runtime/services/workflow_navigation_service.py` + +```python +class WorkflowNavigationService: + def list_tasks(self, session: Session, *, run_id: UUID, parent_node_id: UUID | None = None) -> list[WorkflowTaskRef]: ... + def get_task(self, session: Session, *, run_id: UUID, node_id: UUID | None, task_slug: str | None) -> WorkflowTaskRef: ... + def get_latest_execution(self, session: Session, *, node_id: UUID) -> RunTaskExecution | None: ... + def list_dependencies(self, session: Session, *, run_id: UUID, node_id: UUID, direction: Literal["upstream", "downstream", "both"]) -> list[WorkflowDependencyRef]: ... + def list_resources(self, session: Session, *, run_id: UUID, node_id: UUID | None, scope: Literal["input", "upstream", "own", "children", "descendants", "visible", "run"], kind: str | None = None, max_depth: int = 3, limit: int = 50) -> list[WorkflowResourceRef]: ... + def read_resource_bytes(self, session: Session, *, run_id: UUID, resource_id: UUID, max_bytes: int) -> bytes: ... + def get_task_blockers(self, session: Session, *, run_id: UUID, node_id: UUID) -> list[WorkflowBlockerRef]: ... + def get_next_actions(self, session: Session, *, run_id: UUID, node_id: UUID, manager_capable: bool) -> list[WorkflowNextActionRef]: ... +``` + +For `input` / `upstream`, use incoming edges to the current node, get each source node's latest completed execution, then collect `RunResource` rows for those execution IDs. 
+ +Implement `WorkflowResourceMaterializationService` separately from read-only navigation: + +File path: `ergon_core/ergon_core/core/runtime/services/workflow_resource_materialization_service.py` + +```python +class WorkflowResourceMaterializationService: + async def materialize_resource( + self, + session: Session, + *, + run_id: UUID, + current_node_id: UUID, + current_execution_id: UUID, + sandbox_task_key: UUID, + benchmark_type: str, + resource_id: UUID, + destination: str | None, + dry_run: bool, + ) -> WorkflowMaterializedResourceRef: ... +``` + +The service should use the benchmark's sandbox manager class and existing `BaseSandboxManager.upload_file(...)` to write into the live E2B sandbox. It should not create a new low-level E2B upload primitive. + +- [ ] **Step 4: Run service tests** + +Run: + +```bash +pytest tests/unit/runtime/test_workflow_navigation_service.py tests/unit/runtime/test_workflow_resource_materialization_service.py -v +``` + +Expected: all tests pass. + +## Task 2: Add `ergon workflow` CLI Commands + +**Files:** + +- Create: `ergon_cli/ergon_cli/commands/workflow.py` +- Modify: `ergon_cli/ergon_cli/main.py` +- Test: `tests/unit/cli/test_workflow_cli.py` + +- [ ] **Step 1: Write failing parser/handler tests** + +Add tests that call `handle_workflow(args)` directly and at least one parser integration test using `build_parser().parse_args(...)`. + +Test cases: + +- `ergon workflow inspect task-list --run-id ` renders slugs. +- `ergon workflow inspect task-details --run-id --task-slug b --include-output` renders latest output excerpt. +- `ergon workflow inspect task-dependencies --run-id --task-slug c --direction upstream` renders `b -> c`. +- `ergon workflow inspect resource-list --run-id --node-id --scope input` includes `b` resource only. +- `ergon workflow inspect resource-content --run-id --resource-id ` prints file content. +- `ergon workflow inspect task-blockers --run-id --node-id ` explains why a task is blocked. 
+- `ergon workflow inspect next-actions --run-id --node-id ` prints suggested commands. +- `ergon workflow manage materialize-resource --run-id --node-id --execution-id --sandbox-task-key --resource-id --destination imported/a/report.pdf --dry-run` reports the copy/fork without DB or E2B writes. +- `ergon workflow manage materialize-resource ...` creates a new `kind=import` resource row with `copied_from_resource_id` and writes the file to `/workspace/imported/a/report (copy).pdf`. +- Every `manage ... --dry-run` command validates and reports the planned mutation without DB writes. +- invalid UUID returns exit code 1. +- duplicate `--task-slug` returns exit code 1 and helpful message. + +- [ ] **Step 2: Register parser** + +In `ergon_cli/ergon_cli/main.py`: + +- Import `handle_workflow`. +- Add top-level `workflow`. +- Add nested subcommands: + - `inspect task-list` + - `inspect task-tree` + - `inspect task-details` + - `inspect task-dependencies` + - `inspect task-blockers` + - `inspect next-actions` + - `inspect resource-list` + - `inspect resource-content` + - `inspect resource-location` + - `inspect task-workspace` + - `manage create-task` + - `manage create-task-plan` + - `manage create-dependency` + - `manage restart-task` + - `manage abandon-task` + - `manage update-task-description` + - `manage materialize-resource` +- Add branch: + +File path: `ergon_cli/ergon_cli/main.py` + +```python +elif args.command == "workflow": + return handle_workflow(args) +``` + +- [ ] **Step 3: Implement handler** + +In `commands/workflow.py`, implement: + +File path: `ergon_cli/ergon_cli/commands/workflow.py` + +```python +def handle_workflow(args: Namespace) -> int: + ensure_db() + ... +``` + +Use `WorkflowNavigationService`, `WorkflowResourceMaterializationService`, `get_session()`, `render_table`, and `json.dumps(..., default=str)` for `--format json`. + +Keep output agent-friendly: + +- Plain table by default. +- Compact JSON when requested. +- No rich formatting. 
+- Stable field names. + +- [ ] **Step 4: Run CLI tests** + +Run: + +```bash +pytest tests/unit/cli/test_workflow_cli.py -v +``` + +Expected: all tests pass. + +## Task 3: Add Agent-Facing Workflow CLI Tool + +**Files:** + +- Create: `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py` +- Test: `tests/unit/state/test_workflow_cli_tool.py` + +- [ ] **Step 1: Write failing wrapper tests** + +Test cases: + +- `workflow("inspect task-list")` injects `--run-id` and returns stdout/exit code. +- `workflow("inspect resource-list --scope input")` injects `--node-id`. +- `workflow("inspect resource-list --scope visible --limit 20")` is allowed for same-run collaboration discovery. +- `workflow("inspect resource-list --scope run")` is denied by default. +- `workflow("manage materialize-resource --resource-id --destination imported/a/report.pdf --dry-run")` is allowed for leaf wrappers and injects current execution/sandbox context. +- `workflow("manage restart-task --task-slug l_2 --dry-run")` is allowed only for manager-capable wrappers. +- `workflow("manage restart-task --task-slug l_2")` is denied for leaf wrappers. +- User-supplied `--execution-id`, `--sandbox-id`, or `--sandbox-task-key` is rejected. +- `workflow("../bad")` or multiline input is rejected. +- Non-zero CLI exit returns a structured failure, not an exception. + +- [ ] **Step 2: Implement response DTOs** + +Use: + +File path: `ergon_builtins/ergon_builtins/tools/workflow_cli_tool.py` + +```python +class WorkflowCliToolSuccess(BaseModel): + kind: Literal["success"] = "success" + stdout: str + stderr: str + exit_code: int + +class WorkflowCliToolFailure(BaseModel): + kind: Literal["failure"] = "failure" + error: str + stdout: str = "" + stderr: str = "" + exit_code: int = 1 +``` + +- [ ] **Step 3: Implement `make_workflow_cli_tool`** + +Use `shlex.split(command)` to build argv. Capture stdout/stderr with `contextlib.redirect_stdout` and `redirect_stderr`. Call `await ergon_cli.main._main(argv)`. 
+ +Do not spawn a subprocess. + +- [ ] **Step 4: Run wrapper tests** + +Run: + +```bash +pytest tests/unit/state/test_workflow_cli_tool.py -v +``` + +Expected: all tests pass. + +## Task 4: Add a ResearchRubrics Workflow CLI ReAct Worker + +**Files:** + +- New worker: `ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py` +- Registry edit: `ergon_builtins/ergon_builtins/registry_data.py` +- Test: update `tests/unit/state/test_research_rubrics_workers.py` or add a focused worker wiring test beside it. + +- [ ] **Step 1: Create the proof-of-concept worker** + +Create `ResearchRubricsWorkflowCliReActWorker` as the first consumer of the workflow CLI tool. + +Do not alter `ResearchRubricsResearcherWorker` for this proof of concept. Keeping the new behavior behind a separate worker slug makes it easy to compare workflow-CLI behavior against the existing research-rubrics agent. + +- [ ] **Step 2: Add the workflow tool to the new worker** + +File path: `ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py` + +```python +class ResearchRubricsWorkflowCliReActWorker(ResearchRubricsResearcherWorker): + """Research-rubrics ReAct worker with the workflow CLI tool enabled.""" +``` + +Inside `execute()`, mirror the existing research-rubrics runtime tool composition and add: + +```python +workflow_tool = make_workflow_cli_tool(context=context) +self.tools = [*rr_tools, *graph_tools, Tool(function=workflow_tool, takes_ctx=False)] +``` + +If pydantic-ai expects the callable directly rather than prewrapped `Tool`, mirror the existing toolkit pattern. + +- [ ] **Step 3: Update system prompt** + +Add a short instruction: + +File path: `ergon_builtins/ergon_builtins/workers/research_rubrics/workflow_cli_react_worker.py` + +```text +Use the `workflow` tool to inspect task topology and resources. Start with +`inspect task-tree`, `inspect task-workspace`, or +`inspect resource-list --scope input`. 
If a visible resource from another task +is useful, run `manage materialize-resource --resource-id ... --dry-run` +before importing it into your workspace. Use `--dry-run` before any graph +lifecycle `manage ...` mutation command. +``` + +- [ ] **Step 4: Register the new worker slug** + +File path: `ergon_builtins/ergon_builtins/registry_data.py` + +```python +from ergon_builtins.workers.research_rubrics.workflow_cli_react_worker import ( + ResearchRubricsWorkflowCliReActWorker, +) + +WORKERS: dict[str, Callable[..., Worker]] = { + "researchrubrics-researcher": ResearchRubricsResearcherWorker, + "researchrubrics-workflow-cli-react": ResearchRubricsWorkflowCliReActWorker, +} +``` + +- [ ] **Step 5: Run worker wiring test** + +Run: + +```bash +pytest tests/unit/state/test_research_rubrics_workers.py -v +``` + +Expected: + +- `researchrubrics-workflow-cli-react` is registered. +- `ResearchRubricsWorkflowCliReActWorker` exposes the `workflow` tool. +- The prompt recommends `inspect task-workspace`, `inspect resource-list --scope input`, `manage materialize-resource --dry-run`, and `--dry-run` before graph lifecycle `manage` commands. +- Existing `ResearchRubricsResearcherWorker` assertions remain unchanged and do not require `workflow`. + +## Task 5: Add Contract Tests Around Input Resource Semantics + +**Files:** + +- Test: `tests/unit/runtime/test_workflow_input_resource_semantics.py` +- Optional later implementation: thread computed IDs into `PreparedTaskExecution.input_resource_ids`. + +- [ ] **Step 1: Test the default policy** + +Build a graph: + +```text +d_root -> d_left +d_root -> d_right +d_left -> d_join +d_right -> d_join +l_1 -> l_2 -> l_3 +``` + +Assert: + +- `d_join` input resources are exactly latest resources from `d_left` and `d_right`. +- `l_3` input resources are exactly latest resources from `l_2`, not `l_1`. +- roots and singletons have empty input resources. 
+ +- [ ] **Step 2: Decide whether to wire sandbox inputs now** + +If included in this implementation, add: + +- `input_resource_ids` to `PreparedTaskExecution` +- computation in `TaskExecutionService.prepare` +- pass through `_setup_sandbox` in `execute_task.py` +- optional `WorkerContext.input_resource_ids` + +If not included, keep the CLI/tool behavior independent and leave sandbox auto-materialization as the next implementation plan. + +## Task 6: Add Real-LLM Acceptance Rollout + +**Files:** + +- Modify: `tests/real_llm/benchmarks/test_researchrubrics.py` + +- [ ] **Step 1: Parameterize the rollout worker and sample limit** + +Keep the existing defaults for the current research-rubrics rollout, but allow the final acceptance run to choose the workflow-CLI worker and five samples: + +```python +model = os.environ.get("ERGON_REAL_LLM_MODEL", _DEFAULT_MODEL) +benchmark = os.environ.get("ERGON_REAL_LLM_BENCHMARK", "researchrubrics") +worker = os.environ.get("ERGON_REAL_LLM_WORKER", "researchrubrics-researcher") +evaluator = os.environ.get("ERGON_REAL_LLM_EVALUATOR", "research-rubric") +limit = os.environ.get("ERGON_REAL_LLM_LIMIT", "1") +``` + +Then pass `limit` into the CLI invocation instead of hard-coding `"1"`. + +- [ ] **Step 2: Preserve rollout artifact capture** + +Keep the existing artifact behavior: CLI stdout/stderr, table dumps, dashboard screenshots, manifest, and `report.md`. The final review should use these artifacts to inspect the agent's actual behavior rather than asserting a brittle exact tool-call sequence. + +- [ ] **Step 3: Run the final acceptance rollout** + +Run: + +```bash +ERGON_REAL_LLM=1 \ +ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react \ +ERGON_REAL_LLM_LIMIT=5 \ +uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -v -s +``` + +Expected: + +- The real-LLM test reaches a terminal run status: `completed`, `failed`, or `cancelled`. +- The rollout artifacts are written under `tests/real_llm/.rollouts/-/`. 
+- The manifest records `worker=researchrubrics-workflow-cli-react` and `limit=5`. +- The generated `report.md` plus dumped persistence rows provide enough evidence to inspect whether the agent invoked `workflow(...)`, which workflow commands it chose, whether it materialized any copied resources, and whether those commands helped it orient around task topology/resources. + +This is the final acceptance criterion for the feature. Unit and focused integration tests remain the normal correctness gate; this rollout is the observational gate for whether the workflow CLI is useful to a real ResearchRubrics agent. + +## Verification + +Run focused tests: + +```bash +pytest tests/unit/runtime/test_workflow_navigation_service.py tests/unit/runtime/test_workflow_resource_materialization_service.py tests/unit/cli/test_workflow_cli.py tests/unit/state/test_workflow_cli_tool.py -v +``` + +Run affected worker tests: + +```bash +pytest tests/unit/state/test_research_rubrics_workers.py -v +``` + +If sandbox input threading is included, also run: + +```bash +pytest tests/unit/runtime/test_workflow_input_resource_semantics.py -v +``` + +Final acceptance rollout: + +```bash +ERGON_REAL_LLM=1 \ +ERGON_REAL_LLM_WORKER=researchrubrics-workflow-cli-react \ +ERGON_REAL_LLM_LIMIT=5 \ +uv run pytest tests/real_llm/benchmarks/test_researchrubrics.py -v -s +``` diff --git a/docs/superpowers/plans/2026-04-26-run-workspace-interaction-corrections.md b/docs/superpowers/plans/2026-04-26-run-workspace-interaction-corrections.md new file mode 100644 index 00000000..4ee687c7 --- /dev/null +++ b/docs/superpowers/plans/2026-04-26-run-workspace-interaction-corrections.md @@ -0,0 +1,932 @@ +# Run Workspace Interaction Corrections Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Bring the run workspace back into alignment with the design brief: tabbed right-hand task workspace, always-live graph with bottom timeline-driven snapshots, cleaner concurrent activity visualization, and no dead controls. + +**Architecture:** Keep `RunWorkspacePage` as the orchestration point, but replace the `live | timeline` mode split with a single `snapshotSequence: number | null`. A null snapshot means live; a selected sequence replays graph mutations with `replayToSequence()`. `TaskWorkspace` becomes a tabbed inspector, and `buildRunActivities()` stops mixing every event type into the concurrent activity stack. + +**Tech Stack:** Next.js App Router, React client components, React Flow, node:test via `tsx --test`, Playwright for browser verification. + +--- + +## File Structure + +- Modify: `ergon-dashboard/src/components/run/RunWorkspacePage.tsx` + - Owns run-level live state, snapshot selection, mutation loading, graph replay, header controls, and rerun button state. +- Modify: `ergon-dashboard/src/components/workspace/TaskWorkspace.tsx` + - Converts the right drawer from stacked sections to tabs: `Overview`, `Actions`, `Communication`, `Outputs`, `State transitions`, `Evaluation`. +- Modify: `ergon-dashboard/src/features/activity/buildRunActivities.ts` + - Narrows the concurrent activity data model to execution/concurrency bars plus graph/key-event markers. +- Modify: `ergon-dashboard/src/features/activity/components/ActivityStackTimeline.tsx` + - Removes play/pause/speed controls and uses click-only marker navigation. +- Modify: `ergon-dashboard/src/features/activity/components/ActivityBar.tsx` + - Keeps bar rendering, but supports clearer marker-vs-span styling. +- Create: `ergon-dashboard/src/features/activity/snapshotSequence.ts` + - Pure helper for resolving a clicked activity to the nearest replay mutation sequence. 
+- Create: `ergon-dashboard/src/features/activity/snapshotSequence.test.ts` + - Unit tests for direct sequence and timestamp-to-nearest-mutation behavior. +- Modify: `ergon-dashboard/src/features/activity/buildRunActivities.test.ts` + - Regression tests for “do not flood the stack with context/sandbox command detail”. +- Create/modify: `ergon-dashboard/tests/e2e/run-workspace-interactions.spec.ts` + - Browser-level checks for drawer tabs, timeline click rollback, no live/timeline toggle, no playback controls, no active dead rerun. + +--- + +## Current State Summary + +The backend and existing frontend data structures already support graph replay through: + +- `GET /api/runs/[runId]/mutations` +- `parseGraphMutationDtoArray()` +- `replayToSequence(mutations, currentSequence, emptyState, snapshotCache)` + +The current UI does not consistently use that support because: + +- `RunWorkspacePage` only fetches mutations after entering `timelineMode === "timeline"`. +- Activity clicks only rewind when `activity.sequence !== null`. +- Most visible activity bars have `sequence: null` because they represent execution/sandbox/context spans, not graph mutations. +- `buildRunActivities()` mixes execution spans, sandbox spans, sandbox commands, context events, event markers, and graph mutations into one stacked view. +- The right drawer is still a stacked section list, not tabs. +- The rerun button is visually active but has no `onClick`. + +--- + +## Task 1: Snapshot State Model In `RunWorkspacePage` + +**Files:** +- Modify: `ergon-dashboard/src/components/run/RunWorkspacePage.tsx` +- Test: `ergon-dashboard/tests/e2e/run-workspace-interactions.spec.ts` + +- [ ] **Step 1: Write failing E2E test for removed mode controls** + +Create or extend `tests/e2e/run-workspace-interactions.spec.ts`: + +```ts +import { test, expect } from "@playwright/test"; + +const BASE = process.env.BASE_URL ?? 
"http://localhost:3001";
+const COHORT_ID = "a39ee959-376d-490c-8705-22f0c3e32d1e";
+const RUN_ID = "4028c6d2-d9db-4c5a-be21-d9223d46b4ca";
+
+test("run workspace is always live and has no manual live/timeline or playback controls", async ({ page }) => {
+  await page.goto(`${BASE}/cohorts/${COHORT_ID}/runs/${RUN_ID}`);
+  await expect(page.locator('[data-testid="graph-canvas"]')).toBeVisible();
+
+  await expect(page.locator('[data-testid="mode-live"]')).toHaveCount(0);
+  await expect(page.locator('[data-testid="mode-timeline"]')).toHaveCount(0);
+  await expect(page.locator('[data-testid="activity-play-toggle"]')).toHaveCount(0);
+  await expect(page.locator('[data-testid^="speed-"]')).toHaveCount(0);
+});
+```
+
+- [ ] **Step 2: Run E2E test and verify it fails**
+
+Run:
+
+```bash
+cd ergon-dashboard
+BASE_URL=http://localhost:3001 pnpm exec playwright test tests/e2e/run-workspace-interactions.spec.ts -g "always live"
+```
+
+Expected: FAIL because `mode-live`, `mode-timeline`, play/pause, and speed controls are currently rendered.
+
+- [ ] **Step 3: Replace mode state with snapshot state**
+
+In `RunWorkspacePage.tsx`, replace:
+
+```ts
+const [timelineMode, setTimelineMode] = useState<"live" | "timeline">("live");
+const [currentSequence, setCurrentSequence] = useState(0);
+const [isPlaying, setIsPlaying] = useState(false);
+const [playbackSpeed, setPlaybackSpeed] = useState(1);
+```
+
+with:
+
+```ts
+const [snapshotSequence, setSnapshotSequence] = useState<number | null>(null);
+const currentSequence = snapshotSequence ?? 
0; +``` + +- [ ] **Step 4: Always fetch mutations once per run** + +Replace the current mutation `useEffect()` guard: + +```ts +if (timelineMode !== "timeline") return; +``` + +with unconditional loading: + +```ts +useEffect(() => { + let cancelled = false; + fetch(`/api/runs/${runId}/mutations`) + .then((res) => res.json()) + .then((data) => { + if (cancelled) return; + const parsed = parseGraphMutationDtoArray(data); + setMutations(parsed); + snapshotCache.current.clear(); + const requestedSequence = requestedSequenceRef.current; + requestedSequenceRef.current = null; + if (requestedSequence !== null) { + setSnapshotSequence(nearestMutationAtOrBefore(parsed, requestedSequence)?.sequence ?? null); + } + }) + .catch(() => { + if (!cancelled) setMutations([]); + }); + return () => { + cancelled = true; + }; +}, [runId]); +``` + +- [ ] **Step 5: Replay only when `snapshotSequence !== null`** + +Change `displayState` to: + +```ts +const displayState = useMemo(() => { + if (snapshotSequence === null || mutations.length === 0) return runState; + if (!runState) return runState; + const emptyState: WorkflowRunState = { + ...runState, + tasks: new Map(), + totalTasks: 0, + totalLeafTasks: 0, + completedTasks: 0, + runningTasks: 0, + failedTasks: 0, + }; + return replayToSequence(mutations, snapshotSequence, emptyState, snapshotCache.current); +}, [runState, mutations, snapshotSequence]); +``` + +- [ ] **Step 6: Remove header mode toggle** + +Delete the whole `role="tablist"` block that renders `mode-live` and `mode-timeline`. + +Change header chip text from: + +```tsx +{timelineMode === "live" ? "live" : `seq ${currentSequence}`} · {formatSeconds(...)} +``` + +to: + +```tsx +{snapshotSequence === null ? "live" : `snapshot · seq ${snapshotSequence}`} · {formatSeconds(...)} +``` + +- [ ] **Step 7: Update keyboard behavior** + +Remove the `t` shortcut entirely. 
Change `Esc` behavior to:
+
+```ts
+if (e.key === "Escape") {
+  if (selectedTaskId) { setSelectedTaskId(null); return; }
+  if (snapshotSequence !== null) { setSnapshotSequence(null); return; }
+  if (statusFilter) { setStatusFilter(null); return; }
+  return;
+}
+```
+
+Change arrow stepping to use `snapshotSequence !== null`.
+
+- [ ] **Step 8: Run E2E test and verify it passes**
+
+Run:
+
+```bash
+cd ergon-dashboard
+BASE_URL=http://localhost:3001 pnpm exec playwright test tests/e2e/run-workspace-interactions.spec.ts -g "always live"
+```
+
+Expected: PASS.
+
+---
+
+## Task 2: Activity Click Resolves To Snapshot Sequence
+
+**Files:**
+- Create: `ergon-dashboard/src/features/activity/snapshotSequence.ts`
+- Create: `ergon-dashboard/src/features/activity/snapshotSequence.test.ts`
+- Modify: `ergon-dashboard/src/components/run/RunWorkspacePage.tsx`
+
+- [ ] **Step 1: Write failing unit tests**
+
+Create `src/features/activity/snapshotSequence.test.ts`:
+
+```ts
+import assert from "node:assert/strict";
+import test from "node:test";
+
+import type { GraphMutationDto } from "@/features/graph/contracts/graphMutations";
+import type { RunActivity } from "./types";
+import { resolveActivitySnapshotSequence } from "./snapshotSequence";
+
+function mutation(sequence: number, createdAt: string): GraphMutationDto {
+  return {
+    id: `m-${sequence}`,
+    run_id: "run-1",
+    sequence,
+    mutation_type: "node.status_changed",
+    target_type: "node",
+    target_id: "task-1",
+    old_value: null,
+    new_value: { status: "running" },
+    actor: "worker",
+    reason: null,
+    created_at: createdAt,
+  } as GraphMutationDto;
+}
+
+function activity(overrides: Partial<RunActivity>): RunActivity {
+  return {
+    id: "a-1",
+    kind: "execution",
+    label: "activity",
+    taskId: "task-1",
+    sequence: null,
+    startAt: "2026-04-26T10:00:05.000Z",
+    endAt: "2026-04-26T10:00:08.000Z",
+    isInstant: false,
+    actor: "worker",
+    sourceKind: "execution.span",
+    metadata: {},
+    ...overrides,
+  };
+}
+
+test("uses explicit 
activity sequence when present", () => { + assert.equal( + resolveActivitySnapshotSequence(activity({ sequence: 67 }), [ + mutation(1, "2026-04-26T10:00:00.000Z"), + ]), + 67, + ); +}); + +test("uses nearest mutation at or before activity start time when sequence is absent", () => { + assert.equal( + resolveActivitySnapshotSequence(activity({ startAt: "2026-04-26T10:00:05.000Z" }), [ + mutation(10, "2026-04-26T10:00:00.000Z"), + mutation(20, "2026-04-26T10:00:04.000Z"), + mutation(30, "2026-04-26T10:00:06.000Z"), + ]), + 20, + ); +}); + +test("returns null when no mutation can represent the activity time", () => { + assert.equal( + resolveActivitySnapshotSequence(activity({ startAt: "2026-04-26T09:59:00.000Z" }), [ + mutation(10, "2026-04-26T10:00:00.000Z"), + ]), + null, + ); +}); +``` + +- [ ] **Step 2: Run unit test and verify it fails** + +Run: + +```bash +cd ergon-dashboard +npx tsx --test src/features/activity/snapshotSequence.test.ts +``` + +Expected: FAIL because `snapshotSequence.ts` does not exist. + +- [ ] **Step 3: Implement helper** + +Create `src/features/activity/snapshotSequence.ts`: + +```ts +import type { GraphMutationDto } from "@/features/graph/contracts/graphMutations"; +import type { RunActivity } from "./types"; + +export function resolveActivitySnapshotSequence( + activity: RunActivity, + mutations: GraphMutationDto[], +): number | null { + if (activity.sequence !== null) return activity.sequence; + + const activityMs = Date.parse(activity.startAt); + if (!Number.isFinite(activityMs)) return null; + + let selected: GraphMutationDto | null = null; + for (const mutation of mutations) { + const mutationMs = Date.parse(mutation.created_at); + if (!Number.isFinite(mutationMs)) continue; + if (mutationMs > activityMs) break; + selected = mutation; + } + return selected?.sequence ?? 
null; +} +``` + +- [ ] **Step 4: Use helper in `handleActivityClick`** + +In `RunWorkspacePage.tsx`, import: + +```ts +import { resolveActivitySnapshotSequence } from "@/features/activity/snapshotSequence"; +``` + +Replace: + +```ts +if (activity.sequence !== null) { + requestedSequenceRef.current = activity.sequence; + if (timelineMode !== "timeline") setTimelineMode("timeline"); + handleSequenceChange(activity.sequence); +} +``` + +with: + +```ts +const snapshot = resolveActivitySnapshotSequence(activity, mutations); +if (snapshot !== null) { + setSnapshotSequence(snapshot); +} +``` + +- [ ] **Step 5: Run tests** + +Run: + +```bash +cd ergon-dashboard +npx tsx --test src/features/activity/snapshotSequence.test.ts +npm run build +``` + +Expected: unit test PASS and build PASS. + +--- + +## Task 3: Simplify Concurrent Activity Stack Data + +**Files:** +- Modify: `ergon-dashboard/src/features/activity/buildRunActivities.ts` +- Modify: `ergon-dashboard/src/features/activity/buildRunActivities.test.ts` +- Modify: `ergon-dashboard/src/features/activity/components/ActivityStackTimeline.tsx` +- Modify: `ergon-dashboard/src/features/activity/components/ActivityBar.tsx` + +- [ ] **Step 1: Write failing test for reduced activity stack noise** + +Add to `src/features/activity/buildRunActivities.test.ts`: + +```ts +test("buildRunActivities keeps concurrent stack focused on execution spans and graph markers", () => { + const runState = makeRunStateWithOneExecutionAndSandboxCommand(); + const activities = buildRunActivities({ + runState, + events: [ + { + id: "message-1", + kind: "thread.message", + at: "2026-04-26T10:00:01.000Z", + taskId: "task-1", + actor: "worker", + preview: "verbose message", + sequence: null, + }, + ] as any, + mutations: [ + { + id: "mutation-1", + run_id: runState.id, + sequence: 1, + mutation_type: "node.status_changed", + target_type: "node", + target_id: "task-1", + old_value: null, + new_value: { status: "running" }, + actor: "worker", + reason: 
null, + created_at: "2026-04-26T10:00:00.000Z", + } as any, + ], + currentSequence: null, + }); + + assert.equal(activities.some((a) => a.sourceKind === "execution.span"), true); + assert.equal(activities.some((a) => a.sourceKind === "graph.mutation"), true); + assert.equal(activities.some((a) => a.sourceKind === "sandbox.command"), false); + assert.equal(activities.some((a) => a.sourceKind === "thread.message"), false); +}); +``` + +Define `makeRunStateWithOneExecutionAndSandboxCommand()` in the test file using the existing `WorkflowRunState` shape from nearby tests. It must include: + +```ts +tasks: new Map([["task-1", { id: "task-1", name: "task", status: TaskStatus.COMPLETED, parentId: null, childIds: [], dependsOnIds: [], isLeaf: true, level: 0, assignedWorkerId: "w1", assignedWorkerName: "worker", startedAt: "2026-04-26T10:00:00.000Z", completedAt: "2026-04-26T10:00:10.000Z" }]]) +executionsByTask: new Map([["task-1", [{ id: "exec-1", taskId: "task-1", attemptNumber: 1, status: TaskStatus.COMPLETED, agentId: "w1", agentName: "worker", startedAt: "2026-04-26T10:00:00.000Z", completedAt: "2026-04-26T10:00:10.000Z", finalAssistantMessage: null, outputResourceIds: [], errorMessage: null, score: null, evaluationDetails: {} }]]]) +sandboxesByTask: new Map([["task-1", { taskId: "task-1", sandboxId: "sandbox-1", status: "closed", template: "default", createdAt: "2026-04-26T10:00:00.000Z", closedAt: "2026-04-26T10:00:10.000Z", closeReason: null, commands: [{ command: "pytest", stdout: "", stderr: "", exitCode: 0, durationMs: 1000, timestamp: "2026-04-26T10:00:01.000Z" }] }]]) +``` + +- [ ] **Step 2: Run test and verify it fails** + +Run: + +```bash +cd ergon-dashboard +npx tsx --test src/features/activity/buildRunActivities.test.ts +``` + +Expected: FAIL because sandbox command and message activities are currently included. 
+ +- [ ] **Step 3: Narrow `buildRunActivities()` output** + +Change: + +```ts +return [ + ...executionActivities(input.runState, selectedTime), + ...sandboxActivities(input.runState, selectedTime), + ...contextActivities(input.runState), + ...eventMarkerActivities(input.events), + ...graphMutationActivities(input.mutations), +].sort(compareActivity); +``` + +to: + +```ts +return [ + ...executionActivities(input.runState, selectedTime), + ...graphMutationActivities(input.mutations), +].sort(compareActivity); +``` + +Do not delete helper functions yet unless `npm run build` reports unused exports/imports. This keeps the diff small and allows future detail views to reuse them if needed. + +- [ ] **Step 4: Update activity copy** + +In `ActivityStackTimeline.tsx`, change: + +```tsx +
+<div ...>Concurrent activity</div>
+<div ...>Bars stack only when they overlap.</div>
+```
+
+to:
+
+```tsx
+<div ...>Concurrent execution</div>
+Bars are task attempts; dots are graph snapshots. +``` + +Change footer hints: + +```tsx +Bar = task execution +Dot = graph mutation snapshot +Click any item = inspect at that time +``` + +- [ ] **Step 5: Verify visual density** + +Run: + +```bash +cd ergon-dashboard +npx tsx --test src/features/activity/buildRunActivities.test.ts +npm run build +``` + +Expected: tests PASS, build PASS. Browser should show fewer rows and fewer visual elements in the bottom stack. + +--- + +## Task 4: Remove Playback Controls From Activity Stack + +**Files:** +- Modify: `ergon-dashboard/src/features/activity/components/ActivityStackTimeline.tsx` +- Modify: `ergon-dashboard/src/components/run/RunWorkspacePage.tsx` +- Test: `ergon-dashboard/tests/e2e/run-workspace-interactions.spec.ts` + +- [ ] **Step 1: Extend failing E2E test** + +Extend the Task 1 E2E test to assert: + +```ts +await expect(page.locator('[data-testid="activity-step-back"]')).toHaveCount(0); +await expect(page.locator('[data-testid="activity-step-forward"]')).toHaveCount(0); +await expect(page.locator('[data-testid="activity-play-toggle"]')).toHaveCount(0); +``` + +- [ ] **Step 2: Remove props from `ActivityStackTimelineProps`** + +Delete: + +```ts +isPlaying: boolean; +speed: number; +onTogglePlay: () => void; +onSpeedChange: (speed: number) => void; +``` + +Also delete: + +```ts +const SPEED_OPTIONS = [0.5, 1, 2, 4] as const; +const MIN_DELAY_MS = 50; +const MAX_DELAY_MS = 2000; +const timerRef = useRef | null>(null); +const currentSequenceRef = useRef(currentSequence); +stepForward +stepBack +useEffect that schedules playback +``` + +- [ ] **Step 3: Delete playback UI** + +Remove the entire `isTimeline && (...)` button group containing `activity-step-back`, `activity-play-toggle`, `activity-step-forward`, and `speed-*`. 
+ +- [ ] **Step 4: Update caller** + +In `RunWorkspacePage.tsx`, change: + +```tsx + setIsPlaying((p) => !p)} + speed={playbackSpeed} + onSpeedChange={setPlaybackSpeed} + onActivityClick={handleActivityClick} +/> +``` + +to: + +```tsx + +``` + +- [ ] **Step 5: Run tests** + +Run: + +```bash +cd ergon-dashboard +BASE_URL=http://localhost:3001 pnpm exec playwright test tests/e2e/run-workspace-interactions.spec.ts -g "always live" +npm run build +``` + +Expected: PASS. + +--- + +## Task 5: Tabbed Right-Hand Workspace Drawer + +**Files:** +- Modify: `ergon-dashboard/src/components/workspace/TaskWorkspace.tsx` +- Test: `ergon-dashboard/tests/e2e/run-workspace-interactions.spec.ts` + +- [ ] **Step 1: Write failing E2E test for drawer tabs and criteria visibility** + +Add: + +```ts +test("task workspace uses tabs and exposes evaluation criteria tab", async ({ page }) => { + await page.goto(`${BASE}/cohorts/${COHORT_ID}/runs/${RUN_ID}`); + await expect(page.locator('[data-testid="graph-canvas"]')).toBeVisible(); + await page.locator('[data-testid^="graph-node-"]').first().click(); + + await expect(page.locator('[data-testid="workspace-tab-overview"]')).toBeVisible(); + await expect(page.locator('[data-testid="workspace-tab-actions"]')).toBeVisible(); + await expect(page.locator('[data-testid="workspace-tab-communication"]')).toBeVisible(); + await expect(page.locator('[data-testid="workspace-tab-outputs"]')).toBeVisible(); + await expect(page.locator('[data-testid="workspace-tab-transitions"]')).toBeVisible(); + await expect(page.locator('[data-testid="workspace-tab-evaluation"]')).toBeVisible(); + + await expect(page.locator('[data-testid="workspace-overview"]')).toBeVisible(); + await expect(page.locator('[data-testid="workspace-actions"]')).toHaveCount(0); + + await page.locator('[data-testid="workspace-tab-evaluation"]').click(); + await expect(page.locator('[data-testid="workspace-evaluation"]')).toBeVisible(); +}); +``` + +- [ ] **Step 2: Run E2E test and verify it 
fails** + +Run: + +```bash +cd ergon-dashboard +BASE_URL=http://localhost:3001 pnpm exec playwright test tests/e2e/run-workspace-interactions.spec.ts -g "workspace uses tabs" +``` + +Expected: FAIL because drawer uses stacked sections. + +- [ ] **Step 3: Add tab state and tab metadata** + +In `TaskWorkspace.tsx`, import: + +```ts +import { useMemo, useState } from "react"; +``` + +Add: + +```ts +type WorkspaceTab = "overview" | "actions" | "communication" | "outputs" | "transitions" | "evaluation"; + +const WORKSPACE_TABS: { id: WorkspaceTab; label: string }[] = [ + { id: "overview", label: "Overview" }, + { id: "actions", label: "Actions" }, + { id: "communication", label: "Communication" }, + { id: "outputs", label: "Outputs" }, + { id: "transitions", label: "State transitions" }, + { id: "evaluation", label: "Evaluation" }, +]; +``` + +Inside component: + +```ts +const [activeTab, setActiveTab] = useState("overview"); +``` + +- [ ] **Step 4: Render tab strip below header** + +Insert after header metadata: + +```tsx + +``` + +- [ ] **Step 5: Replace stacked sections with single active panel** + +Replace the current scroll region contents with: + +```tsx +
+ {activeTab === "overview" && ( + + {/* existing dependency overview block */} + + )} + {activeTab === "actions" && ( + + + + )} + {activeTab === "communication" && ( + + + + )} + {activeTab === "outputs" && ( + + + + )} + {activeTab === "transitions" && ( + + + + )} + {activeTab === "evaluation" && ( + + + + )} +
+``` + +Move the existing overview dependency JSX into a local `overviewPanel` constant to avoid duplicating it. + +- [ ] **Step 6: Ensure evaluation panel shows criteria absence clearly** + +In `EvaluationPanel.tsx`, change the empty state text to: + +```tsx +

+<div ...>No evaluation criteria recorded yet</div>
+<div ...>This task has no criterionResults in the persisted evaluation payload.</div>
+```
+
+If `evaluation` exists but `criterionResults.length === 0`, render:
+
+```tsx
+<div ...>No criteria were recorded for this evaluation payload.</div>
+``` + +- [ ] **Step 7: Run tests** + +Run: + +```bash +cd ergon-dashboard +BASE_URL=http://localhost:3001 pnpm exec playwright test tests/e2e/run-workspace-interactions.spec.ts -g "workspace uses tabs" +npm run build +``` + +Expected: PASS. + +--- + +## Task 6: Rerun Button Behavior + +**Files:** +- Modify: `ergon-dashboard/src/components/run/RunWorkspacePage.tsx` +- Test: `ergon-dashboard/tests/e2e/run-workspace-interactions.spec.ts` + +- [ ] **Step 1: Write failing E2E test that rerun is not a dead active button** + +Add: + +```ts +test("rerun control is explicit about unavailable backend action", async ({ page }) => { + await page.goto(`${BASE}/cohorts/${COHORT_ID}/runs/${RUN_ID}`); + const rerun = page.locator('[data-testid="rerun-button"]'); + await expect(rerun).toBeVisible(); + await expect(rerun).toBeDisabled(); + await expect(rerun).toHaveAttribute("title", /not wired/i); +}); +``` + +- [ ] **Step 2: Run E2E and verify it fails** + +Run: + +```bash +cd ergon-dashboard +BASE_URL=http://localhost:3001 pnpm exec playwright test tests/e2e/run-workspace-interactions.spec.ts -g "rerun control" +``` + +Expected: FAIL because current button has no `data-testid`, is enabled, and has no title explaining state. + +- [ ] **Step 3: Make rerun visibly disabled** + +Replace current rerun button: + +```tsx + +``` + +with: + +```tsx + +``` + +- [ ] **Step 4: Run test** + +Run: + +```bash +cd ergon-dashboard +BASE_URL=http://localhost:3001 pnpm exec playwright test tests/e2e/run-workspace-interactions.spec.ts -g "rerun control" +npm run build +``` + +Expected: PASS. 
+ +--- + +## Task 7: End-To-End Snapshot Rollback Verification + +**Files:** +- Modify: `ergon-dashboard/tests/e2e/run-workspace-interactions.spec.ts` + +- [ ] **Step 1: Write E2E test that clicking bottom activity changes graph snapshot label** + +Add: + +```ts +test("clicking bottom activity marker locks graph to snapshot sequence", async ({ page }) => { + await page.goto(`${BASE}/cohorts/${COHORT_ID}/runs/${RUN_ID}`); + await expect(page.locator('[data-testid="graph-canvas"]')).toBeVisible(); + + const firstActivity = page.locator('[data-testid^="activity-bar-"]').first(); + await expect(firstActivity).toBeVisible(); + await firstActivity.click(); + + await expect(page.locator('[data-testid="snapshot-lock-label"]')).toBeVisible(); + await expect(page.locator('[data-testid="snapshot-pin"]')).toBeVisible(); + await expect(page.locator('[data-testid="run-header"]')).toContainText(/snapshot · seq|seq \d+/); +}); +``` + +If `ActivityBar` does not currently expose `data-testid^="activity-bar-"`, add it in `ActivityBar.tsx`: + +```tsx +data-testid={`activity-bar-${item.activity.id}`} +``` + +- [ ] **Step 2: Run E2E and verify failure before fixes** + +Run: + +```bash +cd ergon-dashboard +BASE_URL=http://localhost:3001 pnpm exec playwright test tests/e2e/run-workspace-interactions.spec.ts -g "clicking bottom activity" +``` + +Expected: FAIL before Tasks 1-4; PASS after Tasks 1-4. + +- [ ] **Step 3: Manual visual check** + +Open: + +```text +http://localhost:3001/cohorts/a39ee959-376d-490c-8705-22f0c3e32d1e/runs/4028c6d2-d9db-4c5a-be21-d9223d46b4ca +``` + +Expected: + +- No Live/Timeline segmented control. +- No play/pause/speed controls. +- Bottom area is less dense. +- Clicking a bar/marker changes header chip to `snapshot · seq N`. +- `Esc` returns header chip to `live`. +- Graph node statuses/visibility match the selected sequence. + +--- + +## Verification Checklist + +- [ ] `npx tsx --test src/features/activity/snapshotSequence.test.ts` passes. 
+- [ ] `npx tsx --test src/features/activity/buildRunActivities.test.ts` passes. +- [ ] `npx tsx --test src/lib/timeFormat.test.ts` still passes. +- [ ] `npx tsx --test src/hooks/useRunState.socketHydration.test.ts` still passes. +- [ ] `npm run build` passes. +- [ ] `BASE_URL=http://localhost:3001 pnpm exec playwright test tests/e2e/run-workspace-interactions.spec.ts` passes. +- [ ] Browser smoke check shows no Next.js overlay, graph nodes render, drawer tabs render, activity stack is navigable. + +--- + +## Spec Coverage Review + +- Right drawer tabs: Task 5. +- Evaluation criteria visibility: Task 5, Step 6. +- Remove explicit live/timeline mode: Task 1. +- Bottom timeline drives graph replay: Tasks 1, 2, 7. +- Data structure support: Task 2 confirms mutation replay supports timestamp-to-sequence lookup. +- Concurrent activity clutter: Task 3. +- Remove pause/play/speed controls: Task 4. +- Dead rerun button: Task 6. + +Known follow-up outside this plan: implement a real rerun backend action if product wants rerun to work. This plan only makes the current dead button honest and non-interactive because no confirmed dashboard rerun API exists in the current frontend. diff --git a/docs/superpowers/plans/2026-04-26-trace-spans-ux-refinements.md b/docs/superpowers/plans/2026-04-26-trace-spans-ux-refinements.md new file mode 100644 index 00000000..2849f92e --- /dev/null +++ b/docs/superpowers/plans/2026-04-26-trace-spans-ux-refinements.md @@ -0,0 +1,116 @@ +# Trace Spans UX Refinements + +Date: 2026-04-26 + +## Context + +The immutable Trace Spans direction is right: the bottom trace should act as a fixed map of the completed run, while clicking or arrowing between events moves the cursor, replays the graph above, and updates the workspace detail. The next set of issues is about legibility: dense events overlap, hover metadata is hard to read, tooltips crop, and some events appear to be hidden or inaccessible. + +## 1. 
JSON Metadata Is Hard To Read + +### Problem + +The hover metadata is too raw. It technically exposes useful details, but the user has to parse JSON to answer the basic question: "what is this event?" + +### Proposed Fix + +Use a two-level hover card: + +- Summary header: event kind, label, task, sequence, and timestamp. +- Important fields table: fields such as `mutationType`, `targetId`, `actor`, `reason`, `status`, `toolName`, `exitCode`, or `score`. +- Raw JSON collapsed by default under a `Raw payload` disclosure. + +The raw JSON should remain available for debugging, but it should not be the first thing the user has to read. + +### Acceptance Criteria + +- Hovering an event answers "what is this?" without reading raw JSON. +- Raw JSON is still available on demand. +- The same summary fields are reused in the pinned workspace activity detail. + +## 2. Hover Cards Crop Off The Top + +### Problem + +Hover cards can be clipped when the event is near the top of the Trace Spans component. This likely happens because the tooltip is rendered inside an overflow-constrained timeline container. + +### Proposed Fix + +Make the tooltip viewport-aware: + +- Render the hover card as `position: fixed`, or via a small portal attached to `document.body`. +- Compute the hovered marker/bar bounding box. +- Prefer placing the card above the event when there is room. +- Flip below the event when there is not enough space above. +- Clamp left and right positions to the viewport. +- Give the card a `max-height` with internal scroll for larger payloads. + +### Acceptance Criteria + +- Hovering any visible event never clips the tooltip outside the viewport. +- The hover card remains readable for top-row, bottom-row, far-left, and far-right events. +- Keyboard focus should show the same preview behavior as mouse hover. + +## 3. Too Much Overlap + +### Problem + +Point events and duration spans currently compete for the same visual space. 
Dense regions become hard to read because markers pile up on top of bars or on top of each other. + +### Proposed Fix + +Separate the visual grammar: + +- Span rows show only duration bars: task executions and sandbox lifetimes. +- Point events render on marker rails, not directly as miniature bars inside the same span rows. +- Dense point events are clustered when they fall within a few pixels of each other. +- A cluster renders as a numbered bubble such as `+4`. +- Hovering or clicking a cluster opens a small list of the events inside that time window. +- Add optional kind filters so the user can hide/show `graph`, `context`, `message`, `artifact`, `evaluation`, and `sandbox` markers. + +### Acceptance Criteria + +- Overlapping work remains visible as stable bars. +- Dense point events remain inspectable without becoming a pile of dots. +- Markers do not change span row assignment. +- Clusters expose every hidden event through hover or click. + +## 4. Missing Dots / Possible Bottom Cropping + +### Problem + +When moving between examples, the UI reports multiple steps/events, but the corresponding dots are not always visible on the end swim lanes. This may mean markers are rendered below the visible component, hidden by overflow, or compressed into inaccessible rows. + +### Proposed Fix + +Audit and stabilize the timeline height and scroll behavior: + +- Ensure the timeline content height derives from the full layout, including marker rails and bottom padding. +- Add an assertion that every rendered `layout.item` is inside the scrollable timeline bounds. +- Make vertical overflow explicit: if the trace has more rows than fit, the panel should visibly scroll or offer an expand control. +- Add a trace status line, for example: `17 trace rows · 84 events · 0 hidden`. +- If filters are added, report hidden counts explicitly, for example: `12 hidden by filters`. 
+- Consider an "Expand trace" control for dense runs so users can inspect the full trace without fighting a 300px dock. + +### Acceptance Criteria + +- If the UI reports events at a point, the user can scroll or expand to see them. +- No event markers silently render outside the component. +- The component distinguishes between events that are hidden by filters, collapsed into clusters, or simply below the current scroll position. + +## Implementation Notes + +This should follow the immutable Trace Spans acceptance criterion: + +- Clicking or arrowing between point events must not change Trace Span bar lengths. +- Clicking or arrowing between point events must not change row assignments. +- Clicking or arrowing should only move the cursor/pin, update selected marker state, replay the top graph, and update the workspace detail. + +Suggested implementation order: + +1. Stabilize immutable trace derivation and layout. +2. Add marker rails and clustering. +3. Add the improved legend and trace status line. +4. Add viewport-aware hover cards. +5. Reuse the same event summary/debug payload in the workspace detail. +6. Add e2e coverage for no clipping, no relayout on marker click, and cluster inspection. diff --git a/docs/superpowers/plans/mas-run-visual-debugger/00-program.md b/docs/superpowers/plans/mas-run-visual-debugger/00-program.md new file mode 100644 index 00000000..8c25fe0f --- /dev/null +++ b/docs/superpowers/plans/mas-run-visual-debugger/00-program.md @@ -0,0 +1,110 @@ +# MAS Run Visual Debugger Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace the current unreadable MAS run view with a visual debugger that shows the whole recursive graph at selected time `T`, an overlap-based bottom activity stack, and a task-scoped workspace drawer. 
+ +**Architecture:** Keep graph mutation replay as the source of topology truth and derive the activity stack from existing run state plus graph mutations. Avoid fixed agent lanes: agents can join/leave dynamically, while tasks, events, and wall-clock timestamps are stable. Introduce small frontend domain modules for activity derivation/layout and keep backend DTO changes additive and narrow. + +**Tech Stack:** Next.js/React, TypeScript, React Flow, Tailwind CSS, Zod contracts, Playwright e2e, FastAPI test harness DTOs. + +--- + +## 1. Goals and non-goals + +**Goals** + +- Render the full recursive task graph as it existed at selected sequence/time `T`. +- Move the timeline into a bottom dock that visualizes concurrency by stacking overlapping activity bars. +- Keep the right-hand workspace task-scoped and time-aware: selecting a node at `T` shows resources, executions, messages, context events, and evaluations available at `T`. +- Make activity layout independent of agent cardinality. Agent/worker names are labels and filter metadata only. +- Preserve the live mode. Timeline mode must be opt-in and must not make the live dashboard feel stale. +- Add focused Playwright coverage that proves the graph canvas, activity stack, sequence scrubber, and workspace drawer are all usable on canonical MAS smoke runs. +- Use the mockup `ergon-dashboard/mockups/mas-activity-stack-debugger.html` as the UX target, not as code to copy directly. + +**Non-goals** + +- No rewrite of backend execution/control flow. +- No persistent "agent timeline" DTO. +- No new graph database model. +- No attempt to solve arbitrary huge-graph navigation in the first PR. The first PR should make the existing 9-leaf smoke and representative MAS samples readable. +- No replacing React Flow. +- No pixel-perfect visual snapshot testing in phase 1. Screenshot artifacts are review aids; assertions target stable structure and visibility. + +--- + +## 2. 
UX invariants + +- **Whole graph at T:** timeline scrub changes graph state, not graph scope. Collapsed containers are allowed for readability, but nodes are not silently omitted due to focus. +- **Concurrency by overlap:** overlapping work appears stacked vertically in the bottom dock. Vertical position means "needed another row because time overlaps", not "agent N". +- **Stable categories, unstable actors:** kind chips (`Execution`, `Graph`, `Talk`, `Artifact`, `Evaluation`, `Context`) are stable; worker/agent labels are secondary. +- **Task identity everywhere:** clicking an activity with `taskId` selects the graph node and opens the workspace. Clicking a graph node highlights related activity. +- **Replay is deterministic:** the same snapshot + mutation list + selected sequence produces the same graph and activity view. +- **Missing duration is explicit:** instant events render as markers; spans render as bars. Do not fake long durations for resources/messages/evaluations. + +--- + +## 3. DTO stance + +Production DTO changes should be avoided in the first phase unless implementation proves a real gap. + +Existing production data already gives the frontend enough to build the first activity stack: + +- `RunSnapshotDto` -> tasks, executions, resources, sandboxes, threads, evaluations. +- `dashboard/graph.mutation` + `/api/runs/{runId}/mutations` -> sequence, mutation kind, actor, reason, `created_at`. +- `context.event` state -> task execution, task node, event type, created/started/completed times where available. +- `task_evaluation_updated` -> task-scoped evaluation marker. 
+ +Additive DTO work is still planned for testability and future precision: + +- Extend the **test harness** run-state DTO with activity-stack facts that Playwright can assert without reverse-engineering layout from pixels: mutation count, execution spans, context-event count, evaluation task IDs, and graph node IDs already exist; add `activity_event_count`, `activity_span_count`, and `max_concurrency` in Phase C if needed. +- Add production REST fields only if the current generated `RunSnapshotDto` lacks a timestamp needed for an honest bar. The likely candidate is evaluation duration (`startedAt`/`completedAt`) if evaluations become spans rather than instant markers. + +--- + +## 4. File map + +**New frontend domain files** + +- `ergon-dashboard/src/features/activity/types.ts` — `RunActivity`, `ActivityKind`, `ActivityStackRow`, layout result types. +- `ergon-dashboard/src/features/activity/buildRunActivities.ts` — pure derivation from `WorkflowRunState`, `RunEvent[]`, and `GraphMutationDto[]`. +- `ergon-dashboard/src/features/activity/stackLayout.ts` — overlap packing algorithm. +- `ergon-dashboard/src/features/activity/components/ActivityStackTimeline.tsx` — bottom dock UI. +- `ergon-dashboard/src/features/activity/components/ActivityBar.tsx` — single bar/marker renderer. + +**Modified frontend files** + +- `ergon-dashboard/src/components/run/RunWorkspacePage.tsx` — three-pane debugger shell, timeline mode wiring, selection/highlight coordination. +- `ergon-dashboard/src/components/dag/DAGCanvas.tsx` — accept highlight props and expose stable graph container/node test IDs. +- `ergon-dashboard/src/features/graph/components/MutationTimeline.tsx` — either retire after Phase B or reduce to sequence controls reused by `ActivityStackTimeline`. +- `ergon-dashboard/src/components/workspace/TaskWorkspace.tsx` — filter task-scoped collections to selected sequence/time when timeline mode is active. 
+- `ergon-dashboard/src/lib/runEvents.ts` — keep flat event stream derivation, but do not make it own activity packing. + +**Modified tests** + +- `ergon-dashboard/tests/helpers/dashboardFixtures.ts` — add a concurrent MAS fixture with overlapping executions/context events. +- `ergon-dashboard/tests/e2e/_shared/smoke.ts` — assert activity stack presence and screenshots. +- `ergon-dashboard/tests/helpers/backendHarnessClient.ts` — add narrow test harness fields only if backend exposes them. + +--- + +## 5. Merge checklist + +- [ ] `pnpm --dir ergon-dashboard test` or the repository's frontend unit command is green for activity derivation/layout tests. +- [ ] `pnpm --dir ergon-dashboard run check` or current frontend type/lint command is green. +- [ ] Playwright smoke opens a canonical MAS run, enters timeline mode, sees `activity-stack-region`, scrubs sequence, opens workspace from a graph node, and captures run screenshots. +- [ ] Activity stack never creates rows from agent names. +- [ ] E2E screenshot shows full recursive graph at selected `T`, not focus-filtered branch-only graph. +- [ ] Implementation handoff includes PNGs of every new UI panel: full debugger page, graph canvas, activity stack bottom dock, and workspace drawer open on a selected task. +- [ ] Existing live run updates still render without requiring mutation fetch success. +- [ ] No production backend DTO changes unless justified in `01-contracts-and-state.md`. + +--- + +## 6. Open decisions + +1. **Activity source for graph mutations:** default to using `/api/runs/{runId}/mutations` in timeline mode. If live mode needs graph mutation bars before entering timeline, also retain recent socket mutation events in `useRunState`. +2. **Evaluation duration:** default to instant marker at `evaluation.createdAt`. Upgrade to span only if backend has real start/end timestamps. +3. 
**Viewport fit:** default to React Flow `fitView` on initial load and sequence changes only when user has not manually panned/zoomed. +4. **Saved layout state:** defer persistence of pane sizes/zoom to a follow-up. +5. **Virtualization:** defer until the activity count in a smoke run or real run demonstrably causes UI lag. diff --git a/docs/superpowers/plans/mas-run-visual-debugger/01-contracts-and-state.md b/docs/superpowers/plans/mas-run-visual-debugger/01-contracts-and-state.md new file mode 100644 index 00000000..827393e5 --- /dev/null +++ b/docs/superpowers/plans/mas-run-visual-debugger/01-contracts-and-state.md @@ -0,0 +1,232 @@ +# 01 — Contracts and State + +**Status:** draft. +**Scope:** DTO inventory, frontend activity model, deterministic replay contract, and exact places where additive DTO changes may be needed. + +Cross-refs: program goals in [`00-program.md`](00-program.md), UI work in [`02-frontend-implementation.md`](02-frontend-implementation.md), e2e contract in [`03-tests-and-e2e.md`](03-tests-and-e2e.md). + +--- + +## 1. Existing contract inventory + +### Production snapshot/state + +- `ergon-dashboard/src/lib/contracts/rest.ts` + - `RunSnapshot` already includes tasks, executions, resources, sandboxes, threads, and evaluations via generated schemas. + - `RunExecutionAttempt` has `startedAt` and `completedAt`, which are true span endpoints. + - `RunSandbox` has `createdAt` and `closedAt`, which are true span endpoints. + - `RunSandboxCommand` has `timestamp` and `durationMs`, which can render as short command spans. + - `RunTaskEvaluation` currently behaves like an instant marker unless start/end timestamps are present in generated schema. + +- `ergon-dashboard/src/lib/types.ts` + - `WorkflowRunState` is the in-memory source for current display state. + - `TaskState.history` records task transitions with sequence/time/actor/reason. 
+ +### Graph mutations + +- `ergon-dashboard/src/features/graph/contracts/graphMutations.ts` + - `GraphMutationDto` has `sequence`, `mutation_type`, `target_id`, `actor`, `reason`, `created_at`. + - This is sufficient for graph mutation markers and sequence scrubbing. + +- `ergon-dashboard/src/features/graph/state/graphMutationReducer.ts` + - `replayToSequence` is the topology/status replay engine. + - Activity derivation should consume its result; it should not duplicate graph replay. + +### Unified event stream + +- `ergon-dashboard/src/lib/runEvents.ts` + - `buildRunEvents()` already flattens workflow lifecycle, task transitions, sandbox events, messages, evaluations, resources, context events, and unhandled mutations. + - Keep this useful for event rows and activity markers, but implement span packing in a separate `features/activity` module. + +--- + +## 2. Frontend domain model + +Create `ergon-dashboard/src/features/activity/types.ts`. + +```typescript +import type { RunEventKind } from "@/lib/runEvents"; + +export type ActivityKind = + | "execution" + | "graph" + | "message" + | "artifact" + | "evaluation" + | "context" + | "sandbox"; + +export interface RunActivity { + id: string; + kind: ActivityKind; + label: string; + taskId: string | null; + sequence: number | null; + startAt: string; + endAt: string | null; + isInstant: boolean; + actor: string | null; + sourceKind: RunEventKind | "execution.span" | "sandbox.span" | "graph.mutation"; + metadata: Record; +} + +export interface ActivityStackItem { + activity: RunActivity; + row: number; + leftPct: number; + widthPct: number; +} + +export interface ActivityStackLayout { + items: ActivityStackItem[]; + rowCount: number; + startMs: number; + endMs: number; + maxConcurrency: number; +} +``` + +Rules: + +- `startAt` is always required. +- `endAt` is `null` for markers. +- `isInstant` is true when `endAt === null` or when duration is below the render minimum. +- `taskId` can be null for workflow-level events. 
+- `actor` is metadata only; it must not become a lane key. + +--- + +## 3. Activity derivation + +Create `ergon-dashboard/src/features/activity/buildRunActivities.ts`. + +Inputs: + +```typescript +import type { GraphMutationDto } from "@/features/graph/contracts/graphMutations"; +import type { RunEvent } from "@/lib/runEvents"; +import type { WorkflowRunState } from "@/lib/types"; +import type { RunActivity } from "./types"; + +export interface BuildRunActivitiesInput { + runState: WorkflowRunState | null; + events: RunEvent[]; + mutations: GraphMutationDto[]; + currentSequence: number | null; +} + +export function buildRunActivities(input: BuildRunActivitiesInput): RunActivity[] { + if (!input.runState) return []; + return [ + ...executionActivities(input.runState), + ...sandboxActivities(input.runState), + ...contextActivities(input.runState), + ...eventMarkerActivities(input.events), + ...graphMutationActivities(input.mutations), + ].sort(compareActivity); +} +``` + +Derivation rules: + +- Executions: one span per `ExecutionAttemptState` with non-null `startedAt`; use `completedAt` when available, otherwise render open span through selected/current time. +- Sandboxes: one span per `SandboxState`; use `closedAt` when available. +- Sandbox commands: marker or short span using `timestamp + durationMs`. +- Context events: span if both `startedAt` and `completedAt` exist; otherwise marker at `createdAt`. +- Thread messages, resources, evaluations, workflow lifecycle: marker activities from `RunEvent`. +- Graph mutations: marker activities from `GraphMutationDto`. +- Duplicate suppression: do not render both a `task.transition` event and a `graph.mutation` marker as identical labels if they share the same sequence/task/status. Prefer the graph mutation marker for sequence navigation and keep task transition in the event stream. + +--- + +## 4. Stack layout + +Create `ergon-dashboard/src/features/activity/stackLayout.ts`. 
+ +```typescript +import type { ActivityStackLayout, RunActivity } from "./types"; + +export interface StackActivityOptions { + minMarkerWidthPct: number; + minSpanWidthPct: number; +} + +export function stackActivities( + activities: RunActivity[], + options: StackActivityOptions = { minMarkerWidthPct: 0.35, minSpanWidthPct: 0.75 }, +): ActivityStackLayout { + const timed = activities + .map((activity) => toTimedActivity(activity)) + .sort((a, b) => a.startMs - b.startMs || a.endMs - b.endMs || a.activity.id.localeCompare(b.activity.id)); + + if (timed.length === 0) { + return { items: [], rowCount: 0, startMs: 0, endMs: 0, maxConcurrency: 0 }; + } + + const startMs = Math.min(...timed.map((a) => a.startMs)); + const endMs = Math.max(...timed.map((a) => a.endMs)); + const spanMs = Math.max(1, endMs - startMs); + const rowEnds: number[] = []; + let maxConcurrency = 0; + + const items = timed.map(({ activity, startMs: itemStartMs, endMs: itemEndMs }) => { + const row = firstFreeRow(rowEnds, itemStartMs); + rowEnds[row] = itemEndMs; + maxConcurrency = Math.max(maxConcurrency, rowEnds.filter((rowEnd) => rowEnd > itemStartMs).length); + + const leftPct = ((itemStartMs - startMs) / spanMs) * 100; + const rawWidthPct = ((itemEndMs - itemStartMs) / spanMs) * 100; + const widthPct = activity.isInstant + ? options.minMarkerWidthPct + : Math.max(options.minSpanWidthPct, rawWidthPct); + + return { activity, row, leftPct, widthPct }; + }); + + return { items, rowCount: rowEnds.length, startMs, endMs, maxConcurrency }; +} +``` + +Acceptance rules: + +- Two overlapping spans must be placed on different rows. +- Adjacent non-overlapping spans can reuse the same row. +- Instant markers should not force every later item onto a new row; give them a small render interval only for collision. +- Layout must be deterministic for identical inputs. + +--- + +## 5. DTO change decision tree + +Use this decision tree before editing backend schema files: + +1. 
Can the UI derive the fact from `WorkflowRunState`, `RunEvent[]`, or `GraphMutationDto[]` without lying about time? If yes, do not change production DTOs. +2. Is the missing fact only needed by Playwright? If yes, add it to `ergon_core/core/api/test_harness.py` and `ergon-dashboard/tests/helpers/backendHarnessClient.ts`, not production REST. +3. Is the missing fact needed by users and already persisted? If yes, add it to the production API schema and generated frontend contracts. +4. Is the missing fact not persisted? Stop and design the backend persistence change separately; do not smuggle fake frontend fields into the UI. + +Likely first-PR DTO edits: + +- **Test harness only:** add `activity_event_count`, `activity_span_count`, `max_concurrency` after the frontend derivation is stable enough to calculate the same values in backend or harness queries. +- **No production DTO edit:** keep evaluations as markers unless persisted evaluation span timestamps already exist. + +--- + +## 6. Unit test checklist + +Create `ergon-dashboard/src/features/activity/buildRunActivities.test.ts`. + +- [ ] Execution with start/end becomes a span with `kind: "execution"`. +- [ ] Running execution with no end becomes open span using current selected time. +- [ ] Resource event becomes instant `kind: "artifact"` marker. +- [ ] Evaluation event becomes instant `kind: "evaluation"` marker. +- [ ] Graph mutation becomes instant `kind: "graph"` marker with sequence. +- [ ] Actor names appear in metadata but not in row assignment input. + +Create `ergon-dashboard/src/features/activity/stackLayout.test.ts`. + +- [ ] Non-overlapping spans reuse one row. +- [ ] Overlapping spans use two rows. +- [ ] Three-way overlap reports `maxConcurrency === 3`. +- [ ] Instant markers do not permanently block a row. +- [ ] Same input order-independent set produces identical `row` assignments after sorting. 
diff --git a/docs/superpowers/plans/mas-run-visual-debugger/02-frontend-implementation.md b/docs/superpowers/plans/mas-run-visual-debugger/02-frontend-implementation.md new file mode 100644 index 00000000..5c4e44b2 --- /dev/null +++ b/docs/superpowers/plans/mas-run-visual-debugger/02-frontend-implementation.md @@ -0,0 +1,321 @@ +# 02 — Frontend Implementation + +**Status:** draft. +**Scope:** component boundaries, UI behavior, and task-by-task implementation plan for the visual debugger shell. + +Cross-refs: contracts in [`01-contracts-and-state.md`](01-contracts-and-state.md), tests in [`03-tests-and-e2e.md`](03-tests-and-e2e.md), phase order in [`04-phases.md`](04-phases.md). + +--- + +## 1. Target layout + +The run page becomes a three-region visual debugger: + +- Header/status strip remains at the top with run status, cohort breadcrumb, live/timeline toggle, and connection state. +- Main region is the React Flow recursive graph, showing the whole graph at selected `T`. +- Bottom dock is `ActivityStackTimeline`, always horizontal time, vertical rows allocated by overlap. +- Right drawer is `TaskWorkspace`, opened by graph node or activity click. + +The accepted mockup is `ergon-dashboard/mockups/mas-activity-stack-debugger.html`. + +--- + +## 2. Component map + +### New components + +- `ergon-dashboard/src/features/activity/components/ActivityStackTimeline.tsx` + - Props: activities, current sequence, selected task, selected activity, callbacks. + - Owns time ruler, row rendering, legend, scrubber controls, and empty state. + +- `ergon-dashboard/src/features/activity/components/ActivityBar.tsx` + - Props: stack item, selected/highlight booleans, click handler. + - Renders span or marker using kind-specific styling. + +### Modified components + +- `RunWorkspacePage.tsx` + - Replaces old bottom `MutationTimeline` region with activity stack. + - Creates `activities = buildRunActivities({ runState: displayState, events, mutations, currentSequence })`. 
+ - Tracks `selectedActivityId`.
+ - Activity click sets current sequence if present and selects `taskId` if present.
+ - Graph node click selects task and highlights related activities.
+
+- `DAGCanvas.tsx`
+ - Accepts `highlightedTaskIds?: Set<string>`.
+ - Passes selected/highlight information through node data.
+ - Keeps depth expansion controls, search, minimap, and React Flow controls.
+ - Ensures canvas has `data-testid="graph-canvas"` and individual graph elements keep `graph-node-{taskId}` / `graph-container-{taskId}`.
+
+- `TaskWorkspace.tsx`
+ - Accepts `selectedTime?: string | null` or `currentSequence?: number | null`.
+ - Filters task collections for timeline mode only:
+ - resources with `createdAt <= selectedTime`
+ - executions with `startedAt <= selectedTime`
+ - sandbox commands with `timestamp <= selectedTime`
+ - thread messages with `createdAt <= selectedTime`
+ - context events with `createdAt <= selectedTime`
+ - evaluation with `createdAt <= selectedTime`
+ - Live mode keeps current behavior.
+
+---
+
+## 3. 
Task 1: Activity domain module + +**Files:** + +- Create: `ergon-dashboard/src/features/activity/types.ts` +- Create: `ergon-dashboard/src/features/activity/buildRunActivities.ts` +- Create: `ergon-dashboard/src/features/activity/stackLayout.ts` +- Test: `ergon-dashboard/src/features/activity/buildRunActivities.test.ts` +- Test: `ergon-dashboard/src/features/activity/stackLayout.test.ts` + +- [ ] **Step 1: Write tests for activity derivation** + +```typescript +import { describe, expect, it } from "vitest"; +import { buildRunActivities } from "./buildRunActivities"; + +describe("buildRunActivities", () => { + it("renders execution attempts as spans and graph mutations as sequence markers", () => { + const activities = buildRunActivities({ + runState: makeRunStateWithExecution({ + taskId: "task-a", + startedAt: "2026-04-26T10:00:00.000Z", + completedAt: "2026-04-26T10:00:05.000Z", + }), + events: [], + mutations: [ + makeGraphMutation({ + sequence: 12, + target_id: "task-a", + mutation_type: "node.status_changed", + created_at: "2026-04-26T10:00:01.000Z", + }), + ], + currentSequence: 12, + }); + + expect(activities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ kind: "execution", taskId: "task-a", isInstant: false }), + expect.objectContaining({ kind: "graph", taskId: "task-a", sequence: 12, isInstant: true }), + ]), + ); + }); +}); +``` + +- [ ] **Step 2: Write tests for stack packing** + +```typescript +import { describe, expect, it } from "vitest"; +import { stackActivities } from "./stackLayout"; + +describe("stackActivities", () => { + it("puts overlapping spans on separate rows and reuses rows after overlap ends", () => { + const layout = stackActivities([ + activity("a", "2026-04-26T10:00:00.000Z", "2026-04-26T10:00:10.000Z"), + activity("b", "2026-04-26T10:00:05.000Z", "2026-04-26T10:00:12.000Z"), + activity("c", "2026-04-26T10:00:12.000Z", "2026-04-26T10:00:15.000Z"), + ]); + + expect(layout.rowCount).toBe(2); + 
expect(layout.maxConcurrency).toBe(2); + expect(layout.items.find((item) => item.activity.id === "c")?.row).toBe(0); + }); +}); +``` + +- [ ] **Step 3: Implement derivation and packing** + +Implement the interfaces and functions from [`01-contracts-and-state.md`](01-contracts-and-state.md). Keep the implementation pure and free of React. + +- [ ] **Step 4: Run unit tests** + +Run: `pnpm --dir ergon-dashboard test src/features/activity` + +Expected: activity tests pass; no browser required. + +--- + +## 4. Task 2: Activity stack UI + +**Files:** + +- Create: `ergon-dashboard/src/features/activity/components/ActivityStackTimeline.tsx` +- Create: `ergon-dashboard/src/features/activity/components/ActivityBar.tsx` +- Modify: `ergon-dashboard/src/lib/statusTokens.ts` only if existing colors cannot cover activity kinds. + +- [ ] **Step 1: Add render contract** + +`ActivityStackTimeline` must expose: + +- `data-testid="activity-stack-region"` on the dock root. +- `data-testid="activity-stack-row"` per rendered row. +- `data-testid="activity-bar-{activity.id}"` per activity. +- `data-kind` and `data-task-id` on activity bars. +- `data-testid="activity-current-sequence"` for the visible selected sequence. + +- [ ] **Step 2: Implement timeline controls** + +Controls required in first pass: + +- Step back/forward by available graph mutation sequence. +- Play/pause using mutation timestamps, preserving current `MutationTimeline` min/max delay behavior. +- Drag/scrub range input using sequence numbers. +- Kind legend showing counts. + +- [ ] **Step 3: Implement click behavior** + +Activity click behavior: + +```typescript +function handleActivityClick(activity: RunActivity) { + setSelectedActivityId(activity.id); + if (activity.sequence !== null) setCurrentSequence(activity.sequence); + if (activity.taskId) setSelectedTaskId(activity.taskId); +} +``` + +- [ ] **Step 4: Add empty and partial-data states** + +Empty states: + +- No run state: "Run state is still loading." 
+- Run has no activities: "No activity has been recorded for this run yet." +- Timeline mode has no mutations: show activities from snapshot timestamps but disable sequence scrub. + +--- + +## 5. Task 3: Wire into `RunWorkspacePage` + +**Files:** + +- Modify: `ergon-dashboard/src/components/run/RunWorkspacePage.tsx` +- Modify: `ergon-dashboard/src/features/graph/components/MutationTimeline.tsx` only if extracting reusable controls. + +- [ ] **Step 1: Build activities from display state** + +Add: + +```typescript +const activities = useMemo( + () => + buildRunActivities({ + runState: displayState, + events, + mutations, + currentSequence: timelineMode === "timeline" ? currentSequence : null, + }), + [displayState, events, mutations, timelineMode, currentSequence], +); +``` + +- [ ] **Step 2: Replace timeline region** + +Replace the old `MutationTimeline` bottom panel with: + +```tsx +
+  <ActivityStackTimeline
+    activities={activities}
+    currentSequence={currentSequence}
+    onPlayToggle={() => setIsPlaying((prev) => !prev)}
+    onSpeedChange={setPlaybackSpeed}
+    onActivityClick={handleActivityClick}
+  />
+</div>
+``` + +- [ ] **Step 3: Preserve event stream** + +Keep `UnifiedEventStream` as a collapsible secondary inspector, not the primary bottom timeline. + +- [ ] **Step 4: Run frontend check** + +Run: `pnpm --dir ergon-dashboard run check` + +Expected: TypeScript and lint pass. + +--- + +## 6. Task 4: Time-aware workspace + +**Files:** + +- Modify: `ergon-dashboard/src/components/workspace/TaskWorkspace.tsx` +- Test: add or extend component/unit tests near existing workspace tests if present. + +- [ ] **Step 1: Add selected time prop** + +`RunWorkspacePage` computes: + +```typescript +const selectedTimelineTime = useMemo(() => { + if (timelineMode !== "timeline") return null; + return mutations.find((mutation) => mutation.sequence === currentSequence)?.created_at ?? null; +}, [timelineMode, mutations, currentSequence]); +``` + +- [ ] **Step 2: Filter visible task evidence** + +Inside `TaskWorkspace`, apply filtering only when `selectedTimelineTime` is non-null. Use ISO string comparison after converting both sides to milliseconds with `Date.parse`. + +- [ ] **Step 3: Show time badge** + +Add a small badge in the workspace header: + +`Viewing evidence available at seq {currentSequence}` + +Only render in timeline mode. + +--- + +## 7. Task 5: Graph highlighting and readability + +**Files:** + +- Modify: `ergon-dashboard/src/components/dag/DAGCanvas.tsx` +- Modify: `ergon-dashboard/src/components/dag/TaskNode.tsx` +- Modify: `ergon-dashboard/src/features/graph/components/ContainerNode.tsx` +- Modify: `ergon-dashboard/src/features/graph/components/LeafNode.tsx` +- Modify: `ergon-dashboard/src/features/graph/layout/hierarchicalLayout.ts` only for collision/readability fixes. + +- [ ] **Step 1: Add highlight data** + +Pass node data flags: + +```typescript +isSelected: task.id === selectedTaskId, +isHighlighted: highlightedTaskIds.has(task.id), +``` + +- [ ] **Step 2: Keep whole graph at T** + +Do not filter nodes by selected activity/task. 
Highlight related nodes while preserving full topology. + +- [ ] **Step 3: Improve fit and spacing only where measured** + +If overlap persists in the 9-leaf smoke graph, tune `MIN_CONTAINER_WIDTH`, `CONTAINER_PADDING`, and dagre separation constants in `layoutTypes.ts` / `hierarchicalLayout.ts`. Do not introduce a second graph layout engine in this PR. + +--- + +## 8. Task 6: Remove or demote old mutation strip + +**Files:** + +- Modify or delete: `ergon-dashboard/src/features/graph/components/MutationTimeline.tsx` + +Decision after Task 2: + +- If controls are reused, rename to `SequenceControls.tsx`. +- If no code is reused, delete the component and update imports. + +Acceptance: the only bottom timeline users see is activity-stack based. diff --git a/docs/superpowers/plans/mas-run-visual-debugger/03-tests-and-e2e.md b/docs/superpowers/plans/mas-run-visual-debugger/03-tests-and-e2e.md new file mode 100644 index 00000000..1afc4ed5 --- /dev/null +++ b/docs/superpowers/plans/mas-run-visual-debugger/03-tests-and-e2e.md @@ -0,0 +1,275 @@ +# 03 — Tests and E2E + +**Status:** draft. +**Scope:** frontend unit tests, dashboard fixture tests, Playwright smoke assertions, screenshot capture points, and optional backend harness DTO additions. + +Cross-refs: test-refactor north star in `docs/superpowers/plans/test-refactor/03-dashboard-and-playwright.md`, implementation tasks in [`02-frontend-implementation.md`](02-frontend-implementation.md). + +--- + +## 1. Test strategy + +Use five layers: + +- **Pure unit tests:** prove activity derivation and stack packing without React or browser layout. +- **Golden fixture semantic tests:** pump realistic serialized MAS run data through replay, activity derivation, stack layout, and graph layout. +- **Coarse browser geometry checks:** assert catastrophic overlaps do not happen without pinning exact pixels. 
+- **Dashboard fixture e2e:** seed a deterministic concurrent run through dashboard harness routes and assert the visual debugger contract quickly. +- **Canonical smoke e2e:** run against real backend state and capture screenshots for graph + activity stack review. + +Do not assert pixel-perfect bar positions. Assert stable structure, counts, selected state, and task/sequence coordination. +Use local PNG dumps for human visual review while building; do not make PNG diffs a CI gate in the first PR. + +--- + +## 2. Unit tests + +### Activity derivation tests + +File: `ergon-dashboard/src/features/activity/buildRunActivities.test.ts` + +Required cases: + +- `ExecutionAttemptState.startedAt/completedAt` -> execution span. +- open running execution -> execution span ending at selected timeline time. +- resource event -> artifact marker. +- thread message -> message marker. +- task evaluation -> evaluation marker. +- context event with start/end -> context span. +- graph mutation -> graph marker with sequence. +- no agent lane key is emitted. + +### Stack layout tests + +File: `ergon-dashboard/src/features/activity/stackLayout.test.ts` + +Required cases: + +- non-overlap reuses row. +- overlap allocates rows. +- three-way overlap reports max concurrency. +- instant marker has minimum render width. +- deterministic order independent of input order. + +### Golden fixture semantic tests + +Files: + +- `ergon-dashboard/tests/fixtures/mas-runs/concurrent-mas-run.json` +- `ergon-dashboard/src/features/activity/goldenFixture.test.ts` +- `ergon-dashboard/src/features/graph/layout/goldenLayout.test.ts` +- `ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.test.ts` + +Required cases: + +- replaying fixture mutations to checkpoint sequence `T` yields the whole expected graph at `T`. +- graph layout for the fixture has no overlapping node/container boxes using coarse rectangle checks. +- activity stack reports expected max concurrency for overlapping executions. 
+- activity rows are not grouped by agent or worker identity. +- task evidence filtering hides resources/messages/evaluations created after selected time. + +Full details live in [`06-fast-feedback-and-visual-review.md`](06-fast-feedback-and-visual-review.md). + +--- + +## 3. Dashboard fixture update + +Modify `ergon-dashboard/tests/helpers/dashboardFixtures.ts`. + +Add a fixture run with: + +- root task plus at least 5 child tasks. +- two executions overlapping between `12:00:10` and `12:00:20`. +- one sandbox command inside an execution span. +- one thread message marker. +- one resource marker. +- one evaluation marker attached to a non-root task. +- graph mutations with sequences spanning node add/status events. + +Suggested helper shape: + +```typescript +export function concurrentMasRunState(): SerializedWorkflowRunState { + return serializedRunState({ + scenario: "concurrent-mas-debugger", + }); +} +``` + +If `serializedRunState` is not currently parameterized, extract current fixture setup into small helpers first. Keep old fixture behavior unchanged for existing specs. + +--- + +## 4. Playwright dashboard fixture spec + +Create `ergon-dashboard/tests/e2e/activity-stack.spec.ts`. 
+ +Core assertions: + +```typescript +test("run visual debugger shows recursive graph, activity stack, and time-aware workspace", async ({ page }) => { + const client = new DashboardHarnessClient(page); + const { cohortId, runId } = await client.seedConcurrentMasRun(); + + await page.goto(`/cohorts/${cohortId}/runs/${runId}`); + + await expect(page.getByTestId("run-header")).toBeVisible(); + await expect(page.getByTestId("graph-canvas")).toBeVisible(); + await expect(page.getByTestId("activity-stack-region")).toBeVisible(); + await expect(page.getByTestId("activity-stack-row")).toHaveCountGreaterThan(1); + expect( + await overlappingPairsFor(page, '[data-testid^="graph-node-"]'), + ).toEqual([]); + + const firstExecution = page.locator('[data-testid^="activity-bar-"][data-kind="execution"]').first(); + await expect(firstExecution).toBeVisible(); + await firstExecution.click(); + + await expect(page.getByTestId("workspace-region")).toBeVisible(); + await expect(page.getByTestId("workspace-header")).toBeVisible(); + + await page.getByTestId("activity-step-forward").click(); + await expect(page.getByTestId("activity-current-sequence")).toContainText(/seq/i); +}); +``` + +If Playwright's matcher set lacks `toHaveCountGreaterThan`, replace with: + +```typescript +expect(await page.getByTestId("activity-stack-row").count()).toBeGreaterThan(1); +``` + +Add coarse geometry helpers in the spec or shared helper: + +```typescript +async function overlappingPairsFor(page: Page, selector: string): Promise<[number, number][]> { + const boxes = await page.locator(selector).evaluateAll((elements) => + elements.map((element) => { + const rect = element.getBoundingClientRect(); + return { x: rect.x, y: rect.y, width: rect.width, height: rect.height }; + }), + ); + return overlappingPairs(boxes, { tolerancePx: 2 }); +} +``` + +The overlap assertion is intentionally coarse. It catches the broken layout class we care about without becoming a pixel-perfect visual test. 
+ +### Local-only PNG dumps + +The fixture spec should dump review screenshots only when explicitly requested: + +```bash +VISUAL_DEBUGGER_SCREENSHOTS=1 pnpm --dir ergon-dashboard exec playwright test tests/e2e/activity-stack.spec.ts --project=chromium +``` + +Output: + +```text +ergon-dashboard/tmp/visual-debugger/run-full.png +ergon-dashboard/tmp/visual-debugger/graph-canvas.png +ergon-dashboard/tmp/visual-debugger/activity-stack.png +ergon-dashboard/tmp/visual-debugger/workspace-open.png +``` + +These PNGs are for local human review while building. They should not run in CI and should not be committed. + +--- + +## 5. Canonical smoke e2e changes + +Modify `ergon-dashboard/tests/e2e/_shared/smoke.ts`. + +Add to `assertRunWorkspace` after `graph-canvas` assertion: + +```typescript +await expect(page.getByTestId("activity-stack-region")).toBeVisible(); + +const activityBars = page.locator('[data-testid^="activity-bar-"]'); +await expect(activityBars.first()).toBeVisible(); + +if (state.mutation_count > 0) { + await page.getByTestId("mode-timeline").click(); + await expect(page.getByTestId("timeline-region")).toBeVisible(); + await expect(page.getByTestId("activity-current-sequence")).toContainText(/seq/i); +} +``` + +Screenshot additions: + +- `/-visual-debugger-full.png` — full run page. +- `/-activity-stack.png` — bottom dock if Playwright can screenshot locator reliably. +- Keep existing happy/sad screenshots until the new ones prove stable. + +--- + +## 6. 
Optional backend harness DTO additions + +Only add these after frontend derivation is implemented and the e2e test needs backend truth for concurrency: + +Modify backend `/api/test/read/run/{run_id}/state` DTO to include: + +```json +{ + "activity_event_count": 37, + "activity_span_count": 12, + "max_concurrency": 4 +} +``` + +Modify `ergon-dashboard/tests/helpers/backendHarnessClient.ts`: + +```typescript +export interface BackendRunState { + activity_event_count?: number; + activity_span_count?: number; + max_concurrency?: number; +} +``` + +Rules: + +- These fields are optional in TypeScript while the backend branch catches up. +- Do not block the visual debugger UI on these fields. +- If added, Playwright may assert `max_concurrency >= 2` for the smoke run. + +--- + +## 7. Accessibility and stable selectors + +Required test IDs: + +- `activity-stack-region` +- `activity-stack-row` +- `activity-bar-{activityId}` +- `activity-current-sequence` +- `activity-step-back` +- `activity-step-forward` +- `activity-play-toggle` +- `activity-speed-control` +- `graph-canvas` +- `graph-node-{taskId}` +- `graph-container-{taskId}` +- `workspace-region` +- `workspace-header` + +Required ARIA labels: + +- Activity bar button: `Open activity {label}`. +- Sequence scrubber: `Run timeline sequence`. +- Play/pause: `Play timeline` / `Pause timeline`. + +--- + +## 8. Acceptance gate + +- [ ] Pure activity tests pass. +- [ ] Golden fixture semantic/layout tests pass. +- [ ] Dashboard fixture e2e passes locally. +- [ ] Fixture e2e coarse graph overlap check passes. +- [ ] Local PNG dump works when `VISUAL_DEBUGGER_SCREENSHOTS=1` is set. +- [ ] Canonical smoke e2e still passes locally. +- [ ] Screenshots show more than one activity row for concurrent samples. +- [ ] Clicking an activity with a task opens the workspace for that task. +- [ ] Scrubbing sequence updates graph status/topology via existing replay. +- [ ] No assertion relies on agent lane count. 
diff --git a/docs/superpowers/plans/mas-run-visual-debugger/04-phases.md b/docs/superpowers/plans/mas-run-visual-debugger/04-phases.md new file mode 100644 index 00000000..fdcb4ba0 --- /dev/null +++ b/docs/superpowers/plans/mas-run-visual-debugger/04-phases.md @@ -0,0 +1,230 @@ +# 04 — Phases, Deliverables, Acceptance Gates + +**Status:** draft. +**Scope:** delivery order for the frontend visual debugger branch. One PR is preferred if phases stay small; split after Phase C if review size gets uncomfortable. + +Cross-refs: program in [`00-program.md`](00-program.md), frontend tasks in [`02-frontend-implementation.md`](02-frontend-implementation.md), test contract in [`03-tests-and-e2e.md`](03-tests-and-e2e.md). + +--- + +## Delivery shape + +Each phase should be a clean commit with: + +- Scope: files touched. +- Deliverables: what now works. +- Acceptance gate: exact tests/commands before moving on. + +Do not start the next phase while the current phase is red. + +--- + +## Phase A — Plan and branch scaffold + +**Scope** + +- Create branch `feature/mas-run-visual-debugger-plan`. +- Add this plan folder. +- Keep mockups unmodified except as design reference. + +**Deliverables** + +- `docs/superpowers/plans/mas-run-visual-debugger/` exists. +- Branch records the implementation approach before app edits. + +**Acceptance gate** + +- `git branch --show-current` prints `feature/mas-run-visual-debugger-plan`. +- Plan docs are readable and self-contained. + +--- + +## Phase B — Pure activity model + +**Scope** + +- `ergon-dashboard/tests/fixtures/mas-runs/concurrent-mas-run.json` +- `ergon-dashboard/src/features/activity/types.ts` +- `ergon-dashboard/src/features/activity/buildRunActivities.ts` +- `ergon-dashboard/src/features/activity/stackLayout.ts` +- Unit tests for both modules. +- Golden fixture semantic tests from [`06-fast-feedback-and-visual-review.md`](06-fast-feedback-and-visual-review.md). 
+ +**Deliverables** + +- Activity derivation from `WorkflowRunState`, `RunEvent[]`, and `GraphMutationDto[]`. +- Deterministic overlap stack layout. +- Realistic MAS fixture replay proves concurrency is derived from overlap, not agent lanes. +- No React component changes yet. + +**Acceptance gate** + +- `pnpm --dir ergon-dashboard test src/features/activity` +- Golden fixture tests pass locally. +- `pnpm --dir ergon-dashboard run check` + +**Not in this phase** + +- No UI replacement. +- No backend DTO changes. + +--- + +## Phase C — Bottom activity stack UI + +**Scope** + +- `ActivityStackTimeline.tsx` +- `ActivityBar.tsx` +- Wire into `RunWorkspacePage.tsx` behind existing timeline/live mode controls. +- Keep old `MutationTimeline` available until this phase is green. + +**Deliverables** + +- Bottom dock renders activity rows and bars. +- Sequence controls still work. +- Activity click selects task/sequence. +- Empty states are clear. + +**Acceptance gate** + +- `pnpm --dir ergon-dashboard run check` +- Local dashboard fixture page renders without runtime errors. +- Manual browser check against seeded run: graph visible, activity stack visible, workspace opens from activity. + +**Not in this phase** + +- No graph layout tuning unless the new dock breaks existing graph rendering. +- No smoke e2e assertions yet. + +--- + +## Phase D — Time-aware workspace and graph highlights + +**Scope** + +- `TaskWorkspace.tsx` filters task evidence by selected timeline time. +- `DAGCanvas.tsx`/node components accept selected and highlighted task IDs. +- Preserve whole graph at selected `T`. + +**Deliverables** + +- Selecting an activity highlights graph task and opens workspace. +- Selecting a graph node highlights related activity bars. +- Workspace indicates timeline time/sequence. +- Evidence that did not exist at selected time is hidden in timeline mode. + +**Acceptance gate** + +- `pnpm --dir ergon-dashboard run check` +- Component/unit tests for time filtering pass. 
+- Manual check: scrub backward before a resource appears; workspace no longer shows that resource. + +**Not in this phase** + +- No persisted UI preferences. +- No virtualization. + +--- + +## Phase E — Dashboard fixture e2e + +**Scope** + +- Add concurrent MAS dashboard fixture in `tests/helpers/dashboardFixtures.ts`. +- Add `tests/e2e/activity-stack.spec.ts`. +- Add selectors/ARIA labels required by [`03-tests-and-e2e.md §7`](03-tests-and-e2e.md). +- Add coarse browser geometry checks for graph node overlap. + +**Deliverables** + +- Fast deterministic Playwright test proves the visual debugger contract without real backend execution. +- Screenshot artifact captures the accepted layout shape when `VISUAL_DEBUGGER_SCREENSHOTS=1` is set locally. +- Browser geometry check catches catastrophic overlapping graph boxes without pixel-perfect assertions. +- Local-only PNG dump path works behind `VISUAL_DEBUGGER_SCREENSHOTS=1`. + +**Acceptance gate** + +- `pnpm --dir ergon-dashboard exec playwright test tests/e2e/activity-stack.spec.ts` +- Coarse graph overlap check passes. +- `VISUAL_DEBUGGER_SCREENSHOTS=1 pnpm --dir ergon-dashboard exec playwright test tests/e2e/activity-stack.spec.ts --project=chromium` writes PNGs under `ergon-dashboard/tmp/visual-debugger/`. +- Local PNG review command shows at least two activity rows and full graph canvas. + +**Not in this phase** + +- No backend harness DTO additions unless the fixture spec cannot cover a critical contract. + +--- + +## Phase F — Canonical smoke e2e hardening + +**Scope** + +- Update `tests/e2e/_shared/smoke.ts`. +- Extend screenshot capture points. +- Optionally extend `BackendRunState` and backend harness DTO with `activity_event_count`, `activity_span_count`, `max_concurrency`. + +**Deliverables** + +- Real smoke run opens the new visual debugger. +- Playwright proves graph, activity stack, sequence controls, and workspace are usable. +- Screenshots are useful for PR review. 
+- No CI visual-diff gate is introduced. + +**Acceptance gate** + +- Local smoke Playwright spec green for at least one benchmark. +- Full e2e matrix remains green before merge. +- Smoke screenshot artifacts are generated as review aids only. +- If harness DTO fields are added, backend unit/integration harness tests pass. + +**Not in this phase** + +- No production DTO expansion unless a user-facing timestamp gap is proven. + +--- + +## Phase G — Cleanup and docs + +**Scope** + +- Delete or rename obsolete `MutationTimeline.tsx`. +- Update dashboard architecture docs if they describe the old event stream/timeline split. +- Add a short note in PR description linking to the accepted mockup and this plan folder. + +**Deliverables** + +- No dead imports/components. +- Standing docs match the shipped dashboard behavior. + +**Acceptance gate** + +- `pnpm --dir ergon-dashboard run check` +- `rg -n "MutationTimeline" ergon-dashboard/src` returns either no matches or only the intentional renamed/reused sequence-control component. +- Final Playwright screenshots attached to PR. +- Final implementation review presents local PNGs for all new UI panels: full debugger page, graph canvas, activity stack bottom dock, and workspace drawer open on a selected task. + +--- + +## Phase size estimates + +| Phase | Scope | Est. diff size | +|---|---|---| +| A | Plan folder | ~500 lines docs | +| B | Activity pure model + golden fixture tests | ~650 LoC | +| C | Activity UI + RunWorkspace wiring + local PNG dump | ~750 LoC | +| D | Workspace filtering + graph highlights | ~300 LoC | +| E | Fixture e2e + browser geometry checks | ~350 LoC | +| F | Smoke hardening + optional harness DTO | ~200-500 LoC | +| G | Cleanup/docs | ~100 LoC | + +--- + +## Failure modes + +- **Activity bars look like lanes:** remove any row grouping by actor/agent. Rows are only collision rows. 
+- **Graph disappears while scrubbing:** inspect `replayToSequence` input state and current sequence; do not filter by selected task. +- **Workspace shows future evidence:** compare evidence timestamps to selected mutation `created_at`. +- **PNG review reveals cramped layout:** tune spacing/styling, then keep semantic and geometry tests green. Do not add pixel-perfect screenshot diffs in the first PR. +- **E2E flakes on exact counts:** assert minimum visibility and backend DTO truth, not pixel geometry. +- **Backend DTO temptation:** use the decision tree in `01-contracts-and-state.md`; most first-pass needs are frontend-derived. diff --git a/docs/superpowers/plans/mas-run-visual-debugger/05-implementation-shape.md b/docs/superpowers/plans/mas-run-visual-debugger/05-implementation-shape.md new file mode 100644 index 00000000..5fd6a4a3 --- /dev/null +++ b/docs/superpowers/plans/mas-run-visual-debugger/05-implementation-shape.md @@ -0,0 +1,302 @@ +# 05 — Implementation Shape, File Ownership, and Refactor Boundaries + +**Status:** draft for review. +**Scope:** the reviewer-facing "how" plan: what domains the frontend should have after this work, which files are added, which files are refactored, which files are deleted or deliberately left alone, and how tests are laid out. + +Cross-refs: product/DTO stance in [`00-program.md`](00-program.md), activity contracts in [`01-contracts-and-state.md`](01-contracts-and-state.md), phase gates in [`04-phases.md`](04-phases.md). + +--- + +## 1. Target domain map + +After the implementation, the run dashboard should have these frontend domains: + +| Domain | Responsibility | Owns | Must not own | +|---|---|---|---| +| `features/activity` | Turn run state into time-based activity, pack overlaps into stack rows, render bottom dock. | Activity types, derivation, overlap layout, activity timeline UI. | Graph replay, workspace evidence rendering, backend fetching. 
| +| `features/graph` | Reconstruct and render recursive task topology at selected sequence/time. | Graph mutation contracts, replay reducer, React Flow layout, node components. | Activity stacking, agent lanes, workspace filtering. | +| `components/workspace` | Show task-scoped evidence for the selected task. | Resources, executions, sandbox commands, messages, context events, evaluations for one task. | Timeline packing, graph topology. | +| `components/run` | Page orchestration and cross-panel selection state. | Live/timeline mode, selected task, selected activity, selected sequence, panel composition. | Pure derivation algorithms. | +| `lib/runEvents` | Normalize existing state into a chronological event stream. | Event union, event labels/colors, stream rows. | Visual timeline row allocation. | +| `tests/e2e` + `tests/helpers` | Prove the visual debugger contract with fixture and smoke runs. | Stable selectors, seeded concurrent fixture, screenshot capture, harness assertions. | Pixel-perfect visual diffs. | + +The most important boundary: **activity stack rows are not a domain concept**. They are a layout result. The domain concept is a `RunActivity` with task/time/kind metadata. + +--- + +## 2. 
Intended folder layout + +Target new files: + +```text +ergon-dashboard/src/features/activity/ + types.ts + buildRunActivities.ts + stackLayout.ts + goldenFixture.test.ts + buildRunActivities.test.ts + stackLayout.test.ts + components/ + ActivityStackTimeline.tsx + ActivityBar.tsx + ActivityKindLegend.tsx + SequenceControls.tsx +``` + +Target modified existing files: + +```text +ergon-dashboard/src/components/run/ + RunWorkspacePage.tsx + +ergon-dashboard/src/components/dag/ + DAGCanvas.tsx + TaskNode.tsx + +ergon-dashboard/src/features/graph/components/ + ContainerNode.tsx + LeafNode.tsx + MutationTimeline.tsx + +ergon-dashboard/src/features/graph/layout/ + hierarchicalLayout.ts + layoutTypes.ts + +ergon-dashboard/src/components/workspace/ + TaskWorkspace.tsx + +ergon-dashboard/src/lib/ + runEvents.ts + statusTokens.ts +``` + +Target test files: + +```text +ergon-dashboard/tests/helpers/ + dashboardFixtures.ts + testHarnessClient.ts + backendHarnessClient.ts + +ergon-dashboard/tests/fixtures/mas-runs/ + concurrent-mas-run.json + nested-delegation-run.json + README.md + +ergon-dashboard/tests/e2e/ + activity-stack.spec.ts + _shared/smoke.ts +``` + +Optional backend files if the e2e harness needs additive DTO truth: + +```text +ergon_core/ergon_core/core/api/test_harness.py +tests/unit/test_test_harness.py +tests/integration/smokes/test_smoke_harness.py +``` + +--- + +## 3. Add, refactor, delete, leave alone + +### Add + +| File | Why it exists | +|---|---| +| `features/activity/types.ts` | Shared activity vocabulary: `RunActivity`, `ActivityKind`, `ActivityStackLayout`, `ActivityStackItem`. | +| `features/activity/buildRunActivities.ts` | Pure state-to-activity derivation. Lets tests verify semantics without React. | +| `features/activity/stackLayout.ts` | Pure overlap packing. Keeps "concurrency stack" independent from rendering. | +| `features/activity/components/ActivityStackTimeline.tsx` | Bottom dock shell: time ruler, rows, controls, legend, selection. 
| +| `features/activity/components/ActivityBar.tsx` | Single activity marker/span renderer. Keeps bar styling out of the dock shell. | +| `features/activity/components/ActivityKindLegend.tsx` | Small count/filter legend if `ActivityStackTimeline.tsx` gets too large. | +| `features/activity/components/SequenceControls.tsx` | Reusable play/step/speed controls extracted from old mutation timeline behavior. | +| `features/activity/buildRunActivities.test.ts` | Unit coverage for event/span semantics. | +| `features/activity/stackLayout.test.ts` | Unit coverage for overlap packing and max concurrency. | +| `features/activity/goldenFixture.test.ts` | Pumps realistic MAS fixture data through replay/activity/stack derivation. | +| `tests/fixtures/mas-runs/concurrent-mas-run.json` | Stable local fixture for semantic layout and browser visual review. | +| `tests/fixtures/mas-runs/nested-delegation-run.json` | Optional second fixture for deeper recursive nesting once the first path is green. | +| `tests/e2e/activity-stack.spec.ts` | Fast fixture-driven UI contract for the new debugger. | + +### Refactor + +| File | Refactor | +|---|---| +| `RunWorkspacePage.tsx` | Becomes the cross-panel coordinator. It should compute display state, activities, selected time, selected task/activity, and pass props down. It should not implement activity derivation inline. | +| `DAGCanvas.tsx` | Adds highlight props and preserves graph-level controls. No activity logic here. | +| `TaskNode.tsx`, `ContainerNode.tsx`, `LeafNode.tsx` | Add selected/highlight styling and stable test IDs. Avoid redesigning node semantics. | +| `TaskWorkspace.tsx` | Adds time-aware filtering by selected sequence time. Keep the existing evidence sections. | +| `MutationTimeline.tsx` | Either deleted after replacement, or split so reusable sequence controls move to `features/activity/components/SequenceControls.tsx`. 
| +| `hierarchicalLayout.ts`, `layoutTypes.ts` | Only tune spacing if smoke screenshots still show overlap. Keep dagre and current recursive container model. | +| `runEvents.ts` | Remains event normalization. It may gain helper exports, but it should not pack visual rows. | +| `dashboardFixtures.ts` | Adds a deterministic concurrent MAS fixture, preserving existing fixture exports. | +| `_shared/smoke.ts` | Adds activity stack assertions and screenshots without making visual pixel claims. | +| `activity-stack.spec.ts` | Adds coarse DOM bounding-box overlap checks and optional local screenshot dumping behind `VISUAL_DEBUGGER_SCREENSHOTS=1`. | + +### Delete + +Delete only after the activity stack is wired and tested: + +| File | Delete condition | +|---|---| +| `features/graph/components/MutationTimeline.tsx` | Delete if no code is reused by `SequenceControls.tsx`. | + +No other deletions are planned for the first visual debugger PR. + +### Leave alone + +| Area | Reason | +|---|---| +| Backend execution/control-flow services | The UI problem is representational; backend task orchestration does not need to change. | +| Graph mutation persistence model | Existing sequence/time mutation contract is the right replay primitive. | +| React Flow dependency | The current rendering stack already supports recursive graph rendering. | +| Cohort pages | This work is scoped to run detail pages and smoke screenshots. | +| Production REST schemas | Avoid production DTO expansion unless a real user-facing timestamp gap is proven. | + +--- + +## 4. 
Data flow after refactor + +```text +REST snapshot / socket updates + | + v +useRunState(runId) --------------------+ + | | + v | +WorkflowRunState | + | | + +--> replayToSequence() ----> displayState at T ----> DAGCanvas + | | + +--> buildRunEvents() ---------+ + | | +/api/runs/{runId}/mutations -----------+ + | + v +buildRunActivities(displayState, events, mutations, currentSequence) + | + v +stackActivities(activities) + | + v +ActivityStackTimeline + | + +--> select task/activity/sequence + | + v +RunWorkspacePage state + | + +--> DAGCanvas highlight/selection + +--> TaskWorkspace selected task + selected time +``` + +Selection rules: + +- Graph node click sets `selectedTaskId`. +- Activity click sets `selectedActivityId`, sets `selectedTaskId` when present, and jumps to `activity.sequence` when present. +- Sequence scrub changes `currentSequence`; it does not clear task selection unless the selected task does not exist at that sequence. +- Workspace reads selected task from `displayState`, not live state, when timeline mode is active. + +--- + +## 5. Test layout + +### Pure unit tests + +```text +ergon-dashboard/src/features/activity/buildRunActivities.test.ts +ergon-dashboard/src/features/activity/stackLayout.test.ts +``` + +These tests should use small inline fixture builders. Do not import Playwright, React, or browser APIs. + +### Component-level tests if local harness exists + +If the dashboard already has React component tests, add: + +```text +ergon-dashboard/src/features/activity/components/ActivityStackTimeline.test.tsx +``` + +This test should assert: + +- rows render from layout items. +- clicking a bar calls `onActivityClick`. +- controls call `onSequenceChange`. + +If the project does not have component-test infrastructure, skip this and rely on pure unit + Playwright. + +### Fixture e2e + +```text +ergon-dashboard/tests/e2e/activity-stack.spec.ts +``` + +This is the fast UI contract: + +- seed concurrent MAS fixture. +- open run page. 
+- assert graph, stack, and workspace regions. +- assert more than one stack row. +- assert no catastrophic graph-node bounding-box overlaps. +- click activity -> workspace opens. +- scrub sequence -> current sequence indicator changes. +- dump local PNGs only when `VISUAL_DEBUGGER_SCREENSHOTS=1`. + +### Golden fixture semantic tests + +```text +ergon-dashboard/src/features/activity/goldenFixture.test.ts +ergon-dashboard/src/features/graph/layout/goldenLayout.test.ts +ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.test.ts +``` + +These are the fast feedback loop for the exact failure mode we want to avoid: + +- replay fixture to selected sequence `T`; +- assert whole graph expected at `T`; +- assert no overlapping graph boxes in pure layout output; +- assert activity stack max concurrency; +- assert row assignment does not depend on agent/worker identity; +- assert future task evidence is hidden in timeline mode. + +### Local PNG review + +```text +ergon-dashboard/tmp/visual-debugger/ + run-full.png + graph-canvas.png + activity-stack.png + workspace-open.png +``` + +Generated only by local command: + +```bash +VISUAL_DEBUGGER_SCREENSHOTS=1 pnpm --dir ergon-dashboard exec playwright test tests/e2e/activity-stack.spec.ts --project=chromium +``` + +These files are for human review while building. They should not be committed and should not be required in CI. + +### Smoke e2e + +```text +ergon-dashboard/tests/e2e/_shared/smoke.ts +``` + +This is the real integration contract: + +- backend harness proves graph/resources/evaluations are real. +- dashboard proves visual debugger renders real run state. +- screenshots capture full page and activity stack. + +--- + +## 6. Review questions before implementation + +1. Should `features/activity` own `SequenceControls.tsx`, or should sequence controls live under `features/graph` because sequences come from graph mutations? +2. 
Should `ActivityStackTimeline` support filtering by kind in the first PR, or only render the legend counts? +3. Should `TaskWorkspace` hide future evidence in timeline mode, or show it disabled with "after selected time" labels? +4. Should the old event stream stay visible by default, or be collapsed once the activity stack exists? +5. Should fixture e2e be required before smoke e2e changes, or can smoke drive the first UI contract directly? +6. Should `nested-delegation-run.json` ship in the first PR, or should the first PR use only `concurrent-mas-run.json` and add the deeper fixture after the UI stabilizes? diff --git a/docs/superpowers/plans/mas-run-visual-debugger/06-fast-feedback-and-visual-review.md b/docs/superpowers/plans/mas-run-visual-debugger/06-fast-feedback-and-visual-review.md new file mode 100644 index 00000000..d7a16b52 --- /dev/null +++ b/docs/superpowers/plans/mas-run-visual-debugger/06-fast-feedback-and-visual-review.md @@ -0,0 +1,309 @@ +# 06 — Fast Feedback, TDD, and Local Visual Review + +**Status:** draft. +**Scope:** the feedback loop that prevents another unreadable MAS layout from landing: test-first semantic layout checks, coarse browser geometry assertions, and local-only PNG dumps for human visual review. + +Cross-refs: test contract in [`03-tests-and-e2e.md`](03-tests-and-e2e.md), implementation shape in [`05-implementation-shape.md`](05-implementation-shape.md). + +--- + +## 1. Why this exists + +The prior UI failure was not mainly a data-fetching failure. It was a semantics/layout failure: + +- recursive task containers were hard to read; +- graph state at selected time `T` was not clearly represented; +- timeline lanes implied stable agents even though agents/workers can join and leave; +- overlapping work was not represented as concurrency; +- visual density problems were not caught by tests. + +This plan adds a fast feedback loop before full e2e smoke: + +1. Pure TDD tests for semantics and layout algorithms. +2. 
Coarse browser geometry checks for catastrophic overlap. +3. Local-only PNG dumps that humans inspect while building the UI. + +PNG review is required for development/review discipline, but it is **not** a CI gate in the first PR. + +--- + +## 2. Test-first policy for this feature + +Use TDD for the core behavior: + +- write the failing semantic/layout test; +- run it and confirm it fails for the expected reason; +- implement the smallest code to pass; +- keep the test as a regression guard. + +Do this for: + +- activity derivation; +- activity overlap packing; +- graph snapshot at sequence `T`; +- no graph node overlap for the golden fixture; +- activity click -> task/sequence selection; +- workspace time filtering. + +Do not use TDD for throwaway visual CSS tweaking. For CSS, use local PNG review and coarse browser checks. + +--- + +## 3. Golden fixture data + +Add deterministic fixture data that represents the MAS case we care about. + +Target files: + +```text +ergon-dashboard/tests/fixtures/mas-runs/ + concurrent-mas-run.json + nested-delegation-run.json + README.md +``` + +`concurrent-mas-run.json` should include: + +- full serialized run snapshot; +- graph mutations sorted by sequence; +- expected sequence checkpoints; +- expected graph node IDs/slugs at each checkpoint; +- expected activity concurrency facts. + +Example shape: + +```json +{ + "name": "concurrent-mas-run", + "runState": {}, + "mutations": [], + "checkpoints": [ + { + "sequence": 12, + "expectedTaskSlugs": ["root", "d_root", "d_left", "d_right", "d_join", "l_1"], + "expectedVisibleResourceNames": [], + "expectedMaxConcurrency": 3 + } + ] +} +``` + +Rules: + +- Keep fixture JSON small enough to review. +- Prefer real captured run shape when available, then minimize it. +- Do not include secrets, model outputs, or large artifacts. +- If the fixture comes from a real run/VCR capture, sanitize IDs only if tests do not depend on specific UUID shape. + +--- + +## 4. 
Pure semantic layout tests
+
+Create:
+
+```text
+ergon-dashboard/src/features/activity/goldenFixture.test.ts
+ergon-dashboard/src/features/graph/layout/goldenLayout.test.ts
+ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.test.ts
+```
+
+### Activity fixture test
+
+This test pumps fixture data through pure functions:
+
+```typescript
+import fixture from "../../../tests/fixtures/mas-runs/concurrent-mas-run.json";
+import { parseGraphMutationDtoArray } from "@/features/graph/contracts/graphMutations";
+import { replayToSequence } from "@/features/graph/state/graphMutationReducer";
+import { buildRunActivities } from "./buildRunActivities";
+import { stackActivities } from "./stackLayout";
+import { buildRunEvents } from "@/lib/runEvents";
+import { deserializeRunState } from "@/lib/runState";
+
+it("derives concurrency from overlapping activity rather than agent lanes", () => {
+  const liveState = deserializeRunState(fixture.runState);
+  const mutations = parseGraphMutationDtoArray(fixture.mutations);
+  const checkpoint = fixture.checkpoints.find((c) => c.sequence === 12)!;
+  const displayState = replayToSequence(mutations, checkpoint.sequence, emptyRunStateFrom(liveState), new Map());
+  const events = buildRunEvents(displayState);
+  const activities = buildRunActivities({ runState: displayState, events, mutations, currentSequence: checkpoint.sequence });
+  const stack = stackActivities(activities);
+
+  expect(stack.maxConcurrency).toBe(checkpoint.expectedMaxConcurrency);
+  expect(activities.map((activity) => activity.kind)).toEqual(
+    expect.arrayContaining(["execution", "graph", "artifact", "evaluation"]),
+  );
+  expect(stack.items.some((item) => item.activity.actor && item.row === Number(item.activity.actor))).toBe(false);
+});
+```
+
+The exact helper names can change during implementation, but the assertion intent must stay:
+
+- concurrency comes from overlap;
+- activities are not grouped by agent/worker lane;
+- graph mutations remain 
sequence-addressable. + +### Graph layout fixture test + +This test runs the same fixture through replay + layout and asserts no overlapping rendered boxes. + +```typescript +it("lays out the whole recursive graph at sequence T without overlapping node boxes", () => { + const displayState = replayFixtureToSequence("concurrent-mas-run", 12); + const result = computeHierarchicalLayout( + displayState.tasks, + calculateExpandedContainers(displayState.tasks, Infinity), + "", + undefined, + null, + "LR", + new Set(), + ); + + expect(new Set(result.nodes.map((node) => node.id))).toEqual(expectedWholeGraphNodeIdsAtSequence(12)); + expect(findOverlappingNodeBoxes(result.nodes)).toEqual([]); +}); +``` + +`findOverlappingNodeBoxes` should compare coarse rectangles from React Flow node `position`, `width`, and `height`. This is not a pixel-perfect visual diff; it catches catastrophic overlap. + +### Workspace time filtering test + +Extract filtering into a pure helper if `TaskWorkspace.tsx` is otherwise hard to test: + +```text +ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.ts +ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.test.ts +``` + +Assert: + +- resource created after selected time is hidden; +- execution started before selected time is visible; +- message created after selected time is hidden; +- live mode returns unfiltered evidence. + +--- + +## 5. Browser geometry checks + +Add coarse checks to `ergon-dashboard/tests/e2e/activity-stack.spec.ts`. 
+ +Use DOM bounding boxes for rendered elements: + +```typescript +async function boxesFor(page: Page, selector: string) { + return page.locator(selector).evaluateAll((elements) => + elements.map((element) => { + const rect = element.getBoundingClientRect(); + return { x: rect.x, y: rect.y, width: rect.width, height: rect.height }; + }), + ); +} + +function overlappingPairs(boxes: { x: number; y: number; width: number; height: number }[]) { + const pairs: [number, number][] = []; + for (let i = 0; i < boxes.length; i++) { + for (let j = i + 1; j < boxes.length; j++) { + if (boxesOverlap(boxes[i], boxes[j])) pairs.push([i, j]); + } + } + return pairs; +} + +expect(overlappingPairs(await boxesFor(page, '[data-testid^="graph-node-"]'))).toEqual([]); +``` + +Rules: + +- Use coarse overlap checks only. +- Do not assert exact coordinates. +- Ignore tiny overlaps below 2px if React Flow transform/subpixel rendering creates false positives. +- Keep these checks on fixture e2e first; only add to real smoke if stable. + +--- + +## 6. Local-only PNG dump + +Add a developer-only screenshot command/spec path. This is for us while building and reviewing. It does **not** need to run in CI. 
+ +Target output: + +```text +ergon-dashboard/tmp/visual-debugger/ + run-full.png + graph-canvas.png + activity-stack.png + workspace-open.png +``` + +Suggested command: + +```bash +pnpm --dir ergon-dashboard exec playwright test tests/e2e/activity-stack.spec.ts --project=chromium +``` + +The spec should write screenshots when `VISUAL_DEBUGGER_SCREENSHOTS=1`: + +```typescript +const shouldDumpScreenshots = process.env.VISUAL_DEBUGGER_SCREENSHOTS === "1"; + +if (shouldDumpScreenshots) { + await page.screenshot({ + path: "tmp/visual-debugger/run-full.png", + fullPage: true, + }); + await page.getByTestId("graph-canvas").screenshot({ + path: "tmp/visual-debugger/graph-canvas.png", + }); + await page.getByTestId("activity-stack-region").screenshot({ + path: "tmp/visual-debugger/activity-stack.png", + }); + await page.getByTestId("workspace-region").screenshot({ + path: "tmp/visual-debugger/workspace-open.png", + }); +} +``` + +Recommended local command: + +```bash +VISUAL_DEBUGGER_SCREENSHOTS=1 pnpm --dir ergon-dashboard exec playwright test tests/e2e/activity-stack.spec.ts --project=chromium +``` + +Review rules: + +- Inspect PNGs locally during development. +- Look for cramped graph, overlapping containers, unreadable labels, poor activity row density, confusing color hierarchy, and workspace clipping. +- Treat final implementation review as incomplete until the implementer presents the four panel PNGs to the user/reviewer: `run-full.png`, `graph-canvas.png`, `activity-stack.png`, and `workspace-open.png`. +- Do not commit PNGs from `tmp/visual-debugger/`. +- Do not block CI on PNG generation or screenshot diffs in the first PR. + +--- + +## 7. What becomes a hard gate + +Hard gates: + +- pure semantic tests pass; +- fixture e2e renders graph/stack/workspace; +- coarse graph node overlap check passes for golden fixture; +- no test asserts fixed agent lane counts; +- local screenshot command works when run manually. 
+ +Not hard gates in first PR: + +- pixel-perfect screenshot diff; +- exact `x/y` coordinate assertions; +- local PNG files existing in CI; +- visual comparison against the HTML mockup. + +--- + +## 8. Phase impact + +This adds work to the phase plan: + +- Phase B adds golden fixture semantic tests before implementing activity/layout code. +- Phase E adds browser geometry overlap checks and local screenshot dumping behind `VISUAL_DEBUGGER_SCREENSHOTS=1` to fixture e2e. +- Phase F keeps screenshot artifacts for real smoke/PR review, but no CI visual-diff gate. diff --git a/docs/superpowers/plans/mas-run-visual-debugger/README.md b/docs/superpowers/plans/mas-run-visual-debugger/README.md new file mode 100644 index 00000000..f3677b19 --- /dev/null +++ b/docs/superpowers/plans/mas-run-visual-debugger/README.md @@ -0,0 +1,22 @@ +# MAS Run Visual Debugger — plan folder + +**Status:** draft for review — branch planning only; no frontend implementation landed yet. +**Date:** 2026-04-26. +**Branch:** `feature/mas-run-visual-debugger-plan`. +**Design reference:** `ergon-dashboard/mockups/mas-activity-stack-debugger.html`. + +## Read order + +1. [`00-program.md`](00-program.md) — product goal, non-goals, UX invariants, DTO stance, merge checklist. +2. [`05-implementation-shape.md`](05-implementation-shape.md) — reviewer-facing "how": domains, file ownership, add/refactor/delete plan, test layout. +3. [`01-contracts-and-state.md`](01-contracts-and-state.md) — event/DTO inventory, activity-stack domain model, replay rules, and where backend contract changes are actually needed. +4. [`02-frontend-implementation.md`](02-frontend-implementation.md) — component and layout plan for the three-pane visual debugger. +5. [`03-tests-and-e2e.md`](03-tests-and-e2e.md) — unit/component/e2e coverage, screenshot contract, and harness DTO additions. +6. 
[`06-fast-feedback-and-visual-review.md`](06-fast-feedback-and-visual-review.md) — TDD fixture loop, coarse layout geometry checks, and local-only PNG review workflow. +7. [`04-phases.md`](04-phases.md) — phased delivery order with acceptance gates. + +## Principle + +The dashboard should be a visual debugger for a MAS run, not an agent swimlane view. The durable axes are graph state, task-scoped events, and wall-clock overlap. Agents/workers are labels on events, not layout anchors. + +When documents disagree, `00-program.md` wins. When `00-program.md` and code reality disagree, update `00-program.md` first and re-review before implementing. diff --git a/ergon-dashboard/.gitignore b/ergon-dashboard/.gitignore index cc0e351e..8032f00e 100644 --- a/ergon-dashboard/.gitignore +++ b/ergon-dashboard/.gitignore @@ -10,6 +10,7 @@ /coverage /test-results/ /playwright-report/ +/tmp/ # next.js /.next/ diff --git a/ergon-dashboard/docs/design-audit/01-design-spec-summary.md b/ergon-dashboard/docs/design-audit/01-design-spec-summary.md new file mode 100644 index 00000000..4df688d1 --- /dev/null +++ b/ergon-dashboard/docs/design-audit/01-design-spec-summary.md @@ -0,0 +1,177 @@ +# Ergon Design Spec — Summary of Intent + +> Source: `ergon.zip` — a 12-slide HTML/CSS/JS design deck (1920×1080 per slide). + +## Core philosophy + +**"Light, dense, neutral. One accent. Status colors carry the meaning; chrome stays out of the way so the graph can speak."** + +Surface is `Light · paper`. Typography is `Inter` (UI) + `JetBrains Mono` (data / code). The only accent is indigo — used *exclusively* for selection rings and snapshot pins, never decoratively. + +--- + +## The 8 surfaces + +The design spec defines 8 distinct UI surfaces, presented as 12 slides (some are transition specs): + +### 1. Cohort list (slide 03) + +- **Global topbar**: `56px` height, white card background, 1px bottom border. 
+ - Left: **Ergon logo + wordmark**, then a **5-tab nav**: `Cohorts | Runs | Training | Models | Settings`. + - Right: **Search bar** (`⌕ Search cohorts, runs, tasks… ⌘K`), a **primary CTA button** (`+ New cohort`), and a **user avatar circle** (`JM`). +- **Page header**: `Workspace · diamond` kicker, `Cohorts` h1, subtitle (`38 active · 2 need attention · last activity 4m ago`). +- **Filter segments** (two rows): + - Status: `All · 42 | Active · 38 | Running · 6 | Needs attention · 2 | Archived · 4` + - Sort: `Recent | Score | Failure rate | Runs` +- **Dense data table** inside a `.card`: + - 7 columns: `Cohort | Runs | Avg score | Failure | Runtime · last activity | Status | ›` + - Header: 11px uppercase, `#98a2b1` (faint). + - Rows: cohort name + mono ID, mono data cells, `pill--solid` status badges, right-aligned chevron. + - 8 sample rows shown (swe-bench, princeton-shrimp, swe-gym, etc.). +- **Footer**: `Showing 8 of 42 cohorts` + live update indicator with green dot. + +### 2. Cohort detail (slide 04) + +- Same topbar as above. +- **Breadcrumb**: `Cohorts › swe-bench-verified · sonnet-4.5 · v0.7.2`. +- **Header**: Cohort name h1, subtitle (`500 runs · started 2026-04-25 18:12 · created by jm`), action buttons: `Compare | Re-run failed | Open in training`. +- **5 summary metric tiles** (key metrics, each in a `.card`): + - `Resolution: 62.4%` (▲ 3.1pp vs v0.7.1) + - `Runs · pass / fail: 312 / 188` (progress bar) + - `Avg runtime: 2:14` (min · p95 4:32) + - `Avg tasks: 11.4` (2.1 levels deep · 1.7 retries) + - `Cost: $84.20` ($0.17 / run · 41M tokens) +- **Two-column split below**: + - Left: Score distribution chart (scatter/histogram/curve toggle, SVG scatter of pass vs fail vs running). + - Right: Runs list card with header (`500 total · 6 running`), filter segments (`All | Running | Failed`), and scrollable run rows (id, status pill, time, score). + +### 3. Run workspace — live (slide 05) + +The main debugger surface. Three-row grid: `auto 1fr 300px`. 
+ +**Row 1 — Run header strip** (card background, border-bottom): +- Left: breadcrumb (`Cohorts › swe-bench-verified · sonnet-4.5 › django__django-12345`), run name + status pill + `live · 1m 42s` kicker. +- Right: **inline stats**: `Tasks: 2·2·1·5 | Tokens: 142k | Cost: $0.18 | Score: —`, then `Re-run` button and `⋯` ghost button. + +**Row 2 — Graph stage** (dot-grid paper background): +- Floating controls top-left: zoom `+−⌂`, depth selector `1|2|3|all`, search input. +- **Minimap** top-right (200×130px card with colored rectangles + accent selection rect). +- **Legend** bottom-left (completed/running/ready/pending/failed dots). +- **Graph SVG**: dashed container boxes (`diamond_root`), nodes with status-colored fills and dot indicators. Edges between containers with I/O ports. + +**Row 3 — Activity stack dock** (300px, light `#fafbfc` background, NOT dark): +- Header bar: `ACTIVITY STACK` label + `Live · auto-tail` green pill + `seq 0 — 214 · streaming` + right legend dots. +- **Left rubric**: "Concurrent activity / Bars stack only when they overlap." +- **Time axis**: mono timestamps (21:33 → 21:40). +- **Stacked bars**: event bars colored by **kind** (NOT status): + - graph_mutation = magenta/pink + - task_execution = violet/purple + - tool_call = amber + - message = cyan + - resource = green + - eval = red + - transition = blue +- Each bar has a **start marker** (circle) and **rounded pill shape**. +- **NOW cursor**: green pulsing line + `NOW` pill at leading edge. +- **Footer hints**: "Color = kind | Vertical stack = overlap | Click bar = select task/span | Click ● = lock graph above to that snapshot | Auto-tailing · new events append at right" + +### 4. Run workspace — drawer open + snapshot (slide 06) + +Same as slide 05 but with: +- **Snapshot pin** on the timeline at seq 42 (indigo vertical line + `SEQ 42` pill). +- Header gets: `graph · seq 42 · 21:36:14` chip. 
+- **Right drawer** (460px, `shadow-pop`, inside graph stage): + - Header: `Task workspace` title, pin/close buttons, task name `run_failing_test` + running pill, breadcrumb path. + - **Tab row**: `Overview | Transitions | Generations | Resources | Evals (2) | Logs`. + - **Content sections**: Worker info, Transitions (status pill pairs with seq + times), Current turn (tool call card with command + error), Evals on this task (judge running + harness passed cards with scores), Resources at seq 42 (file list). + - **Footer bar**: `Open in workspace` button + `Jump to live →`. + +### 5. Recursive nesting (slide 07) + +Same workspace, but graph at depth=2 shows: +- `diamond_root` outer container with L→R flow. +- `planning` (2 nodes), `exploration` (contains `repro_loop` nested sub-DAG with 6 nodes, retry back-edges), `implementation` (4 nodes, fan-out/join), `evaluation` (3 nodes). +- I/O ports (triangles) on container edges. +- Inter-container edges with arrow markers. +- `task ›` input label, `› result` output label. + +### 6. Edge states (slide 08) + +Three cards: Empty cohort, Run · failed, Connection · stale + Unhandled mutation + No graph. + +### 7–9. Transitions (slides 09–11) + +Three storyboard transition specs: +- T1: Cohort row → run workspace (320ms, shared element morph). +- T2: Graph node click → drawer (260ms, selection ring + slide). +- T3: Click event → graph snapshot (180ms per node delta). + +### 10. Information architecture (slide 12) + +Four-column summary: `Cohorts → Cohort detail → Run workspace → Task drawer`. "The dashboard is a funnel." 
+ +--- + +## Design system tokens (from styles.css) + +### Surfaces +| Token | Value | Usage | +|-------|-------|-------| +| `--paper` | `#f6f7f9` | Page background | +| `--paper-2` | `#eef0f3` | Secondary surface, kickers | +| `--paper-3` | `#e6e9ee` | Tertiary | +| `--card` | `#ffffff` | Card backgrounds | +| `--ink` | `#0c1118` | Primary text | +| `--ink-2` | `#1f2733` | Secondary text | +| `--muted` | `#64707f` | Muted text | +| `--faint` | `#98a2b1` | Faint text, column headers | +| `--line` | `#e2e6ec` | Borders | +| `--line-strong` | `#cdd3dc` | Stronger borders, dashes | + +### Status colors (oklch) +| Status | Value | +|--------|-------| +| pending | `oklch(0.72 0.02 250)` — slate | +| ready | `oklch(0.74 0.10 240)` — sky | +| running | `oklch(0.78 0.14 80)` — amber | +| completed | `oklch(0.70 0.13 155)` — emerald | +| failed | `oklch(0.68 0.18 22)` — rose | +| cancelled | `oklch(0.62 0.02 260)` | + +### Accent +| Token | Value | +|-------|-------| +| `--accent` | `oklch(0.62 0.16 252)` — indigo | +| `--accent-soft` | `oklch(0.94 0.04 252)` | +| `--accent-ink` | `oklch(0.32 0.12 252)` | + +### Activity stack kind colors (from deck.js) +| Kind | Fill | Text | +|------|------|------| +| graph_mutation | `oklch(0.78 0.14 305)` magenta | white | +| task_execution | `oklch(0.74 0.16 295)` violet | white | +| tool_call | `oklch(0.78 0.16 60)` amber | dark | +| message | `oklch(0.76 0.13 200)` cyan | dark | +| resource | `oklch(0.74 0.13 155)` green | dark | +| eval | `oklch(0.70 0.18 25)` red | white | +| transition | `oklch(0.74 0.10 240)` blue | white | + +### Typography +| Role | Size | Tracking | +|------|------|----------| +| Display | 56px | -3% | +| Title | 28px | -2% | +| Body | 14px | 0 | +| UI | 12px | 0 | +| Caption | 11px | +1% | + +### Spacing / radii +- `--radius`: 10px (cards) +- `--radius-sm`: 6px (pills, nodes) +- Topbar: 56px height, 24px horizontal padding +- Page content: 32–48px padding +- Cards: `shadow-sm` border + +### Fonts +- `Inter` 
400/500/600 — sans body +- `JetBrains Mono` 400/500 — monospace data diff --git a/ergon-dashboard/docs/design-audit/02-error-taxonomy.md b/ergon-dashboard/docs/design-audit/02-error-taxonomy.md new file mode 100644 index 00000000..0efd8ed4 --- /dev/null +++ b/ergon-dashboard/docs/design-audit/02-error-taxonomy.md @@ -0,0 +1,379 @@ +# Error Taxonomy — Design Spec vs Current Implementation + +Severity levels: +- **S1 — Missing surface**: An entire page/component from the spec doesn't exist. +- **S2 — Missing component**: A defined UI element within an existing page is absent. +- **S3 — Wrong styling**: The element exists but doesn't match the spec's visual treatment. +- **S4 — Wrong behavior**: The element exists but interactions differ from the spec. +- **S5 — Polish / refinement**: Minor spacing, typography, or color drift. + +--- + +## Category 1: Missing Surfaces (S1) + +### 1.1 — Global topbar with 5-tab navigation + +**Spec**: Every page in the app has a shared 56px topbar with: `Ergon logo | Cohorts | Runs | Training | Models | Settings` nav tabs, global search bar (`⌕ Search cohorts, runs, tasks… ⌘K`), primary CTA button context-dependent (e.g., `+ New cohort`), and user avatar circle. + +**Current**: No shared topbar component exists. `ClientLayout.tsx` only renders `ConnectionStatus`. Each page builds its own ad-hoc header: +- `CohortListView` has its own header with title + stats + filters but **no nav tabs, no search, no user avatar**. +- `RunWorkspacePage` builds a breadcrumb-based header with logo link, no nav tabs. +- `/training` page has no topbar at all. + +**Impact**: Users have no way to navigate between Cohorts/Runs/Training/Models/Settings. The entire app feels like disconnected pages rather than a unified shell. 
+ +**Files**: `src/components/common/ClientLayout.tsx`, `src/app/layout.tsx` + +--- + +### 1.2 — Cohort detail page (slide 04) — partial + +**Spec**: Full cohort detail with breadcrumb, 5 summary metric tiles (Resolution, Runs pass/fail, Avg runtime, Avg tasks, Cost), score distribution chart (scatter/histogram/curve), and runs list with status filters. + +**Current**: `CohortDetailView.tsx` exists (216 lines) but: +- **Missing**: Summary metric tiles (Resolution %, pass/fail bar, avg runtime, avg tasks, cost). +- **Missing**: Score distribution chart (scatter/histogram/curve toggle). +- **Missing**: Action buttons (`Compare | Re-run failed | Open in training`). +- Has a runs list but with less structure than the spec's card-in-card with filter segments. + +**Files**: `src/components/cohorts/CohortDetailView.tsx` + +--- + +### 1.3 — Edge states page (slide 08) + +**Spec**: Defines 3 edge-state treatments: empty cohort (with CTA), failed run (error banner + replay), stale connection (socket fallback + unhandled mutation warning + no-graph placeholder). + +**Current**: Basic error/loading states exist in individual components but **no designed empty states** matching the spec. No "Launch cohort" CTA, no replay-from-seq button, no styled connection-stale treatment, no "Run hasn't emitted nodes yet" placeholder. + +**Files**: Scattered across `CohortListView`, `RunWorkspacePage`, `TaskWorkspace` + +--- + +### 1.4 — Transitions (slides 09–11) + +**Spec**: Three animated transitions with exact motion specs: +- T1: Cohort row → run workspace (shared element morph, 320ms) +- T2: Graph node → drawer (ring + slide, 260ms) +- T3: Event click → graph snapshot (per-node delta, 180ms) + +**Current**: No View Transitions API, no FLIP animations. Navigation is standard Next.js page transitions. Drawer appears via CSS `slideInRight` animation (basic). No shared element morphing, no staggered rise. 
+ +**Files**: `src/app/globals.css` (has `slideInRight`/`slideOutRight` keyframes but they're simple slides, not the spec's multi-element choreography) + +--- + +## Category 2: Missing Components (S2) + +### 2.1 — Run header: inline key metrics (Tasks breakdown, Tokens, Cost, Score) + +**Spec**: Run header shows `Tasks: 2·2·1·5 | Tokens: 142k | Cost: $0.18 | Score: —` in a stats row separated by a border-right divider from action buttons. + +**Current**: Header shows `Tasks [total] | Turns [completed] | Score [%]` — **missing Tokens and Cost entirely**. The breakdown format is wrong (spec shows by-status counts, current shows totals). No divider styling between stats and buttons. + +**Files**: `RunWorkspacePage.tsx` lines 303–312 + +--- + +### 2.2 — Graph floating controls: minimap + +**Spec**: 200×130px minimap card top-right with colored rectangles per container + accent selection rectangle. Hides when drawer is open. + +**Current**: React Flow's built-in `<MiniMap />` component is rendered in `DAGCanvas.tsx` but it uses React Flow's default rendering, not the spec's custom styled minimap with container-level colored blocks. + +**Files**: `src/components/dag/DAGCanvas.tsx` + +--- + +### 2.3 — Graph legend (bottom-left) + +**Spec**: Floating card bottom-left with colored dots: `completed | running | ready | pending | failed`. + +**Current**: No floating legend exists in the graph area. Status colors are implied by nodes but there's no key. + +**Files**: `src/components/dag/DAGCanvas.tsx` + +--- + +### 2.4 — Task drawer tab row + +**Spec**: Drawer has a tab strip: `Overview | Transitions | Generations | Resources | Evals (badge) | Logs`. + +**Current**: `TaskWorkspace.tsx` renders sections as stacked `WorkspaceSection` accordions — **no tab navigation**. All sections are visible at once in a scroll, not switched by tabs.
+ +**Files**: `src/components/workspace/TaskWorkspace.tsx` + +--- + +### 2.5 — Task drawer: Worker info section + +**Spec**: Shows worker avatar square (initials), worker name (e.g., `explorer.B`), version kicker (`v0.7.2`), turn counter (`turn 3 of ≤ 8`). + +**Current**: `TaskWorkspace` shows worker name as plain text in the header. No avatar, no version badge, no turn counter in the spec format. + +**Files**: `src/components/workspace/TaskWorkspace.tsx` + +--- + +### 2.6 — Task drawer: Current turn detail card + +**Spec**: Card with tool call info: `tool · run_command | 2.1s · exit 1`, command line, error output in red mono. + +**Current**: No "Current turn" card. Execution info exists in `CommunicationPanel` and `SandboxPanel` but not in the spec's format. + +**Files**: `src/components/workspace/TaskWorkspace.tsx`, `src/components/panels/` + +--- + +### 2.7 — Task drawer: Evals section with rich cards + +**Spec**: Eval cards with: running judge (progress bar, streaming preview text), completed harness (score `0.84 / 1.0`, assertion count). `+ Attach eval` button. + +**Current**: `EvaluationPanel.tsx` (82 lines) exists but is a minimal display. No progress bars, no streaming preview, no judge vs harness distinction, no attach button. + +**Files**: `src/components/panels/EvaluationPanel.tsx` + +--- + +### 2.8 — Task drawer: Resources section + +**Spec**: File list with mono filename + version + size badges. + +**Current**: `ResourcePanel.tsx` (234 lines) exists and shows resources, but styling doesn't match the spec's compact file-row format. + +**Files**: `src/components/panels/ResourcePanel.tsx` + +--- + +### 2.9 — Task drawer footer bar + +**Spec**: Pinned footer with `Open in workspace` button + `Jump to live →` ghost button. + +**Current**: No footer bar. The drawer has a close button but no workspace/jump actions. 
+ +**Files**: `src/components/workspace/TaskWorkspace.tsx` + +--- + +### 2.10 — Activity stack: footer hint row + +**Spec**: Below the bars: `Color = kind | Vertical stack = overlap | Click bar = select task/span | Click ● = lock graph above to that snapshot | Auto-tailing`. + +**Current**: Bottom of the activity stack has event-type filter pills (`EXECUTION 3 | GRAPH 18 | TALK 1 | ARTIFACT 1 | EVALUATION 1 | CONTEXT 1 | SANDBOX 1`) instead of the spec's legend/hint row. These pills are functional filters not present in the spec at all. + +**Files**: `src/features/activity/components/ActivityStackTimeline.tsx` + +--- + +### 2.11 — Activity stack: snapshot pin + NOW cursor + +**Spec**: When viewing a snapshot, an indigo vertical line + `SEQ N` pill appears on the timeline. The live cursor is a green pulsing line + `NOW` pill. + +**Current**: The timeline has a blue vertical cursor line for current sequence but no styled `SEQ N` pill and no green `NOW` marker with pulse animation. + +**Files**: `src/features/activity/components/ActivityStackTimeline.tsx` + +--- + +### 2.12 — Graph: I/O ports on container edges + +**Spec**: Small filled triangles on container edges marking input/output flow direction. + +**Current**: No I/O port markers. Container nodes use React Flow handles but without the spec's triangle decorators. + +**Files**: `src/features/graph/components/ContainerNode.tsx` + +--- + +### 2.13 — Cohort list: table row structure + +**Spec**: 7-column grid with: cohort name + mono ID, mono runs count, mono avg score, color-coded failure %, runtime + last activity, solid status pill, right chevron. + +**Current**: `CohortListView` has a table but column structure and density differ. Missing the mono cohort_ID sub-line, missing the color-coded failure percentage, missing the right chevron. 
+ +**Files**: `src/components/cohorts/CohortListView.tsx` + +--- + +## Category 3: Wrong Styling (S3) + +### 3.1 — Activity stack: dark vs light + +**Spec (slide 05–07)**: Activity stack dock uses **light** background (`#fafbfc`, border-top `var(--line)`). It's the same paper-family surface as the rest of the workspace. Header text is dark, bars are saturated. + +**Current**: Activity stack uses **dark** background (`#070b12`, near-black), with light text. This was intentional in the recent visual pass and makes the bars pop, but it contradicts the spec's light treatment. The reference screenshots (slide-07-final.png) appear dark, suggesting the spec may have been updated — but the HTML source code clearly uses `#fafbfc`. + +**Note**: The final rendered screenshots show a dark dock, so this may be an intentional design evolution. Worth confirming with the designer. + +**Files**: `RunWorkspacePage.tsx` line 411, `ActivityStackTimeline.tsx` + +--- + +### 3.2 — Activity bar colors: kind-based vs current palette + +**Spec**: 7 distinct oklch kind colors (magenta, violet, amber, cyan, green, red, blue). Bars have **start marker circles** (circle at left edge with border). + +**Current**: `ActivityBar.tsx` has `KIND_STYLES` mapping but the color values and set of kinds may not perfectly match the spec's 7 oklch values. Need to audit each one. + +**Files**: `src/features/activity/components/ActivityBar.tsx` + +--- + +### 3.3 — Node styling: compact vs verbose + +**Spec**: Nodes are compact rectangles (~60px height) with: title (13px Inter semibold), status sub-line (10px JetBrains Mono), status dot (top-right 3.5px circle). Status color is fill + stroke, not badges. + +**Current**: `LeafNode.tsx` (317 lines) renders much larger nodes with: status label text ("RUNNING"), task name, description, worker name, start time, and various icons. Nodes are visually heavy with multi-line content and orange/yellow decorative dots. 
+ +**Files**: `src/features/graph/components/LeafNode.tsx`, `src/features/graph/components/ContainerNode.tsx` + +--- + +### 3.4 — Container styling: quiet chrome vs heavy borders + +**Spec**: Containers use dashed stroke (`stroke-dasharray="4 4"`), semi-transparent fill (`rgba(255,255,255,0.55)`), with title (12px semibold) + sub-label (10px mono) in the header region. Running container gets a colored stroke (amber/etc). Very lightweight. + +**Current**: `ContainerNode.tsx` uses depth-colored left borders, solid background, heavier visual treatment. The "chrome" is more prominent than the spec intends. + +**Files**: `src/features/graph/components/ContainerNode.tsx` + +--- + +### 3.5 — Fonts: Geist vs Inter + JetBrains Mono + +**Spec**: `Inter` + `JetBrains Mono` loaded from Google Fonts. + +**Current**: Layout loads `Geist` sans + mono via `next/font/local`, and `globals.css` body still has `font-family: Arial`. Neither `Inter` nor `JetBrains Mono` are loaded. + +**Files**: `src/app/layout.tsx`, `src/app/globals.css` + +--- + +### 3.6 — Status pill styling + +**Spec**: Two pill variants: +- Outline: white bg, 1px border, color swatch dot, 11px/500. +- Solid: tinted bg, tinted border, tinted text (e.g., running = amber bg `oklch(0.96 0.04 80)`). + +**Current**: `StatusBadge.tsx` (202 lines) exists and renders pills, but the color values and variant structure may not match the spec's oklch palette exactly. + +**Files**: `src/components/common/StatusBadge.tsx` + +--- + +### 3.7 — Run header breadcrumb density + +**Spec**: Breadcrumb uses `›` separator, mono run ID, run name as `h1` (20px), status pill + `live · 1m 42s` kicker all in one tight row. Below that, stats are separated by a vertical `border-right` divider. + +**Current**: Breadcrumb uses `/` separator, different typography density, stats row doesn't have the divider treatment. + +**Files**: `RunWorkspacePage.tsx` lines 268–380 + +--- + +### 3.8 — Workspace drawer width + +**Spec**: Drawer is `460px` wide. 
+ +**Current**: Workspace region is `360px` wide (`w-[360px]`). + +**Files**: `RunWorkspacePage.tsx` line 454 + +--- + +### 3.9 — Graph stage background + +**Spec**: Dot-grid pattern: `radial-gradient(circle, rgb(15 23 42 / 0.04) 1px, transparent 1px)` at `22px 22px` on `var(--paper)`. + +**Current**: React Flow's built-in `<Background />` component with dot pattern. May not match the exact dot size/opacity/spacing. + +**Files**: `src/components/dag/DAGCanvas.tsx` + +--- + +### 3.10 — Card shadow and border radius + +**Spec**: `--radius: 10px`, `--shadow-sm: 0 1px 2px rgb(12 17 24 / 0.04)`, `border: 1px solid var(--line)`. + +**Current**: Various border-radius values used inline. Some match, some use Tailwind defaults. + +**Files**: Various components + +--- + +## Category 4: Wrong Behavior (S4) + +### 4.1 — Drawer is right overlay, not page section + +**Spec**: Drawer overlays the graph stage from the right edge (inside the graph's coordinate space). Graph reflows left when drawer opens. Drawer has `shadow-pop`. + +**Current**: Recent visual pass moved it to a right-side overlay (`absolute right-4 top-4`), which is closer. But it's positioned relative to `
` not the graph stage, and the graph does NOT reflow. + +**Files**: `RunWorkspacePage.tsx` lines 452–470 + +--- + +### 4.2 — Event click → graph snapshot lock + +**Spec**: Clicking an event marker (●) in the timeline locks the graph above to that sequence point. The timeline continues tailing live. A snapshot pin (indigo) appears at the locked sequence. + +**Current**: Clicking an activity switches to timeline mode and changes `currentSequence`, but the visual treatment (pin, split between locked graph and live tail) doesn't match. There's no visible distinction between "graph locked to seq X" and "just scrolled to seq X". + +**Files**: `RunWorkspacePage.tsx` `handleActivityClick`, `ActivityStackTimeline.tsx` + +--- + +### 4.3 — Depth selector integration + +**Spec**: Floating card in graph stage with buttons `1 | 2 | 3 | all`. + +**Current**: `DepthSelector.tsx` exists (157 lines) and is rendered by `DAGCanvas`, but it's part of a `RunStatusBar`-adjacent filter bar above the graph, not a floating card inside the graph canvas. + +**Files**: `src/features/graph/components/DepthSelector.tsx`, `src/components/dag/DAGCanvas.tsx` + +--- + +## Category 5: Polish / Refinement (S5) + +### 5.1 — Activity stack left rubric + +**Spec**: 140px left column with "Concurrent activity / Bars stack only when they overlap." text block. + +**Current**: Left rail has "Concurrent activity / Bars stack only / when they overlap" but sizing and typography may be cramped (noted in the PR description). + +--- + +### 5.2 — Activity stack time axis + +**Spec**: Mono 10px timestamps in an 8-column grid with `· now` suffix on the current time slot. + +**Current**: Time axis exists but the density and styling may differ. + +--- + +### 5.3 — Graph status bar overlap + +**Spec**: No separate filter bar above the graph. Filters are floating cards inside the graph canvas. + +**Current**: `RunStatusBar` renders as an absolute-positioned bar that overlaps with the graph (noted in the PR description). 
+ +--- + +### 5.4 — Segmented controls + +**Spec**: `.seg` component — inline-flex, 1px border, 7px radius, 2px padding, active tab has card bg + shadow. + +**Current**: Tab controls use Tailwind classes that approximate this but may not match the exact padding/radius/shadow values. + +--- + +## Summary counts + +| Severity | Count | +|----------|-------| +| S1 — Missing surface | 4 | +| S2 — Missing component | 13 | +| S3 — Wrong styling | 10 | +| S4 — Wrong behavior | 3 | +| S5 — Polish | 4 | +| **Total** | **34** | diff --git a/ergon-dashboard/docs/design-audit/03-fix-plan-overview.md b/ergon-dashboard/docs/design-audit/03-fix-plan-overview.md new file mode 100644 index 00000000..36b00cf4 --- /dev/null +++ b/ergon-dashboard/docs/design-audit/03-fix-plan-overview.md @@ -0,0 +1,58 @@ +# Fix Plan — Overview + +## Phasing strategy + +Work is split into 5 phases, each shippable independently. Phases are ordered by **structural impact** (foundations first, polish last), so that later phases build on correct bones. + +| Phase | Name | Effort | Scope | +|-------|------|--------|-------| +| P0 | Design system foundations | 1 day | Tokens, fonts, shared topbar, app shell | +| P1 | Graph + drawer rework | 1–2 days | Node styling, containers, drawer, floating controls | +| P2 | Activity stack alignment | 1 day | Light/dark decision, kind colors, NOW/snapshot pins, hints | +| P3 | Cohort surfaces | 1–2 days | Cohort list columns, cohort detail metrics/chart, edge states | +| P4 | Interactions + polish | 1 day | Transitions, keyboard shortcuts, responsive, final pixel audit | + +Each phase has its own detailed plan document below. + +## Dependencies + +``` +P0 ──→ P1 ──→ P2 + ╲ ╲ + ──→ P3 ──→ P4 +``` + +P0 must land first (everything depends on the shared shell and token system). P1 and P3 can run in parallel after P0. P2 depends on P1 (graph stage layout affects activity stack positioning). P4 is the final sweep. + +## Principles + +1. 
**Don't break existing tests.** Run `npm run typecheck` + focused e2e tests after every meaningful file change. +2. **Extend the visual debugger screenshots.** Add new screenshot assertions for each new surface (cohort list, cohort detail, topbar) so regressions are caught. +3. **Use CSS custom properties from the spec**, not hardcoded hex in Tailwind classes. Centralize tokens in `globals.css` and reference them via `var(--token)`. +4. **Prefer editing existing components** over creating new ones. Only create new files for genuinely new surfaces (e.g., the global Topbar). + +## Files to create + +| File | Purpose | +|------|---------| +| `src/components/shell/Topbar.tsx` | Global navigation bar | +| `src/components/shell/AppShell.tsx` | Layout wrapper with topbar + content area | +| (none else new — all other work is editing existing files) | + +## Files to substantially edit + +| File | Changes | +|------|---------| +| `src/app/globals.css` | Add full design token set, remove Arial fallback, Inter + JB Mono | +| `src/app/layout.tsx` | Swap Geist for Inter + JetBrains Mono, wrap in AppShell | +| `src/components/common/ClientLayout.tsx` | Integrate Topbar, or replace with AppShell | +| `src/components/run/RunWorkspacePage.tsx` | Header → use Topbar, drawer width, stats row, graph layout | +| `src/features/graph/components/LeafNode.tsx` | Compact node styling per spec | +| `src/features/graph/components/ContainerNode.tsx` | Dashed, lightweight container chrome | +| `src/components/dag/DAGCanvas.tsx` | Floating controls, legend, minimap styling | +| `src/components/workspace/TaskWorkspace.tsx` | Tab navigation, worker info, turn card, evals, footer | +| `src/features/activity/components/ActivityStackTimeline.tsx` | Light/dark, pins, hints, kind legend | +| `src/features/activity/components/ActivityBar.tsx` | Kind color alignment, start markers | +| `src/components/cohorts/CohortListView.tsx` | Table columns, density, chevron | +| 
`src/components/cohorts/CohortDetailView.tsx` | Metric tiles, chart, action buttons | +| `src/components/common/StatusBadge.tsx` | Pill variants to match spec oklch values | diff --git a/ergon-dashboard/docs/design-audit/04-P0-design-foundations.md b/ergon-dashboard/docs/design-audit/04-P0-design-foundations.md new file mode 100644 index 00000000..b26a6efa --- /dev/null +++ b/ergon-dashboard/docs/design-audit/04-P0-design-foundations.md @@ -0,0 +1,214 @@ +# P0 — Design System Foundations + +**Goal**: Establish the correct token system, fonts, and shared app shell so every subsequent phase builds on the right base. + +**Addresses**: 1.1 (topbar), 3.5 (fonts), 3.10 (tokens), 5.4 (segmented controls) + +--- + +## Task 0.1 — Design tokens in globals.css + +**File**: `src/app/globals.css` + +Replace the minimal `:root` block with the full spec token set: + +```css +:root { + /* Surfaces */ + --paper: #f6f7f9; + --paper-2: #eef0f3; + --paper-3: #e6e9ee; + --card: #ffffff; + --ink: #0c1118; + --ink-2: #1f2733; + --muted: #64707f; + --faint: #98a2b1; + --line: #e2e6ec; + --line-strong: #cdd3dc; + + /* Status — oklch */ + --pending: oklch(0.72 0.02 250); + --ready: oklch(0.74 0.10 240); + --running: oklch(0.78 0.14 80); + --completed: oklch(0.70 0.13 155); + --failed: oklch(0.68 0.18 22); + --cancelled: oklch(0.62 0.02 260); + + /* Accent — indigo, selection/pin only */ + --accent: oklch(0.62 0.16 252); + --accent-soft: oklch(0.94 0.04 252); + --accent-ink: oklch(0.32 0.12 252); + + /* Radii */ + --radius: 10px; + --radius-sm: 6px; + + /* Shadows */ + --shadow-sm: 0 1px 2px rgb(12 17 24 / 0.04); + --shadow: 0 1px 2px rgb(12 17 24 / 0.05), 0 4px 12px rgb(12 17 24 / 0.04); + --shadow-pop: 0 8px 24px rgb(12 17 24 / 0.08), 0 1px 2px rgb(12 17 24 / 0.05); + + /* Fonts */ + --font: "Inter", ui-sans-serif, system-ui, -apple-system, sans-serif; + --mono: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, monospace; +} +``` + +Remove the `font-family: Arial, Helvetica, 
sans-serif` from the body rule. Add `font-family: var(--font);`. + +Remove dark-mode `prefers-color-scheme` overrides (the spec is light-only). + +**Checklist**: +- [ ] All 22 tokens present +- [ ] Body uses `var(--font)` +- [ ] No dark-mode overrides remain +- [ ] Existing components don't break (hardcoded hex still works, but new work should prefer tokens) + +--- + +## Task 0.2 — Swap fonts: Geist → Inter + JetBrains Mono + +**File**: `src/app/layout.tsx` + +Current loads Geist via `next/font/local`. Replace with: + +```tsx +import { Inter, JetBrains_Mono } from "next/font/google"; + +const inter = Inter({ subsets: ["latin"], variable: "--font-inter" }); +const jetbrainsMono = JetBrains_Mono({ + subsets: ["latin"], + variable: "--font-jetbrains-mono", +}); +``` + +Apply both CSS variable classes to ``. Update `globals.css` to reference them: + +```css +:root { + --font: var(--font-inter), ui-sans-serif, system-ui, sans-serif; + --mono: var(--font-jetbrains-mono), ui-monospace, SFMono-Regular, monospace; +} +``` + +**Checklist**: +- [ ] `Inter` and `JetBrains Mono` are loaded from Google Fonts via next/font +- [ ] CSS variables `--font` and `--mono` resolve correctly +- [ ] No Geist references remain +- [ ] Typecheck passes + +--- + +## Task 0.3 — Tailwind config: add features/ content path + +**File**: `tailwind.config.ts` + +Add `./src/features/**/*.{js,ts,jsx,tsx}` to the `content` array so Tailwind purges correctly for feature components. 
+ +Also extend the theme to reference CSS custom properties where helpful: + +```ts +theme: { + extend: { + colors: { + paper: "var(--paper)", + "paper-2": "var(--paper-2)", + card: "var(--card)", + ink: "var(--ink)", + muted: "var(--muted)", + faint: "var(--faint)", + line: "var(--line)", + accent: "var(--accent)", + }, + borderRadius: { + card: "var(--radius)", + sm: "var(--radius-sm)", + }, + boxShadow: { + card: "var(--shadow-sm)", + pop: "var(--shadow-pop)", + }, + fontFamily: { + sans: ["var(--font)"], + mono: ["var(--mono)"], + }, + }, +}, +``` + +--- + +## Task 0.4 — Create global Topbar component + +**New file**: `src/components/shell/Topbar.tsx` + +Spec defines: +- 56px height, white bg, 1px bottom border +- Left: Ergon logo (22×22 dark square with cutout) + "Ergon" wordmark + 5 nav links +- Right: Search bar (280px) + context CTA button (optional) + user avatar (28px circle) + +Nav links: `Cohorts | Runs | Training | Models | Settings` +- Active link: dark text + `var(--paper-2)` background, 6px radius +- Inactive: muted text + +```tsx +interface TopbarProps { + activeTab?: "cohorts" | "runs" | "training" | "models" | "settings"; + cta?: { label: string; href?: string; onClick?: () => void }; +} +``` + +The search bar is non-functional for now (placeholder) — will wire up in P4. + +The user avatar uses initials — can be hardcoded `JM` for now or pulled from a context. + +--- + +## Task 0.5 — Create AppShell layout wrapper + +**New file**: `src/components/shell/AppShell.tsx` + +Wraps every page: `` + `
<Topbar />` + `<main>{children}</main
`. The main area takes the remaining viewport height. + +**Edit**: `src/components/common/ClientLayout.tsx` or `src/app/layout.tsx` to include `<AppShell>` around page content. + +Page-specific headers (like the run breadcrumb bar) render **below** the topbar, inside the page component. The topbar is always present. + +--- + +## Task 0.6 — StatusBadge / pill alignment + +**File**: `src/components/common/StatusBadge.tsx` + +Update pill color values to match the spec's oklch values. Add the two variants: + +1. **Outline pill** (`.pill`): white bg, 1px border `var(--line)`, 6px color swatch dot, 11px/500 text. +2. **Solid pill** (`.pill--solid`): tinted bg/border/text per status: + - running: bg `oklch(0.96 0.04 80)`, border `oklch(0.85 0.10 80)`, text `oklch(0.42 0.12 65)` + - completed: bg `oklch(0.96 0.04 155)`, border `oklch(0.85 0.10 155)`, text `oklch(0.40 0.12 155)` + - failed: bg `oklch(0.96 0.04 22)`, border `oklch(0.85 0.10 22)`, text `oklch(0.40 0.16 22)` + - etc. + +Add `pulse` animation for running swatch dot. + +--- + +## Task 0.7 — Segmented control component + +Either add a reusable `<SegmentedControl>` component or define a Tailwind utility pattern matching: +- `inline-flex border border-line rounded-[7px] bg-paper p-0.5 text-xs` +- Active segment: `bg-card text-ink shadow-card rounded-[5px]` +- Inactive: `text-muted` + +This pattern is used in: cohort list filters, cohort detail chart toggle, run header Live/Timeline toggle, graph depth selector, runs list filter.
+ +--- + +## Verification + +After P0: +- [ ] `npm run typecheck` passes +- [ ] All existing e2e tests still pass +- [ ] Every page has the 5-tab topbar +- [ ] Fonts render as Inter + JetBrains Mono (visual check) +- [ ] Token CSS variables are applied to body diff --git a/ergon-dashboard/docs/design-audit/05-P1-graph-and-drawer.md b/ergon-dashboard/docs/design-audit/05-P1-graph-and-drawer.md new file mode 100644 index 00000000..a82b6eb5 --- /dev/null +++ b/ergon-dashboard/docs/design-audit/05-P1-graph-and-drawer.md @@ -0,0 +1,178 @@ +# P1 — Graph Stage + Drawer Rework + +**Goal**: Bring the graph canvas, node rendering, container chrome, floating controls, and task drawer into spec alignment. + +**Addresses**: 2.1 (stats), 2.2 (minimap), 2.3 (legend), 2.4 (tabs), 2.5 (worker), 2.6 (turn card), 2.7 (evals), 2.8 (resources), 2.9 (footer), 2.12 (I/O ports), 3.3 (nodes), 3.4 (containers), 3.8 (drawer width), 3.9 (graph bg), 4.1 (drawer position), 4.3 (depth selector) + +--- + +## Task 1.1 — Compact leaf node styling + +**File**: `src/features/graph/components/LeafNode.tsx` (317 lines → target ~120) + +Current nodes are tall, verbose cards with status label text, description, worker, timestamps, and decorative dots. The spec wants: + +- **Height**: ~50–60px +- **Content**: Task name (13px Inter 600) + status sub-line (10px JetBrains Mono) in status color +- **Status indicator**: 3.5px circle, top-right corner, filled with status color +- **Fill**: Status-tinted background (e.g., running = `oklch(0.97 0.04 80)`) +- **Stroke**: Status-tinted border (e.g., running = `oklch(0.85 0.10 80)`) +- **Border radius**: 6px +- **Selection**: 2px indigo ring at 2px offset (via `--accent`) + +Remove: status text label ("RUNNING"), description line, worker name, start timestamp, decorative triple-dots. 
+ +Status color map (from deck.js `NODE` function): +``` +completed: [bg: "oklch(0.96 0.04 155)", border: "oklch(0.85 0.10 155)", text: "oklch(0.40 0.12 155)"] +running: [bg: "oklch(0.97 0.04 80)", border: "oklch(0.85 0.10 80)", text: "oklch(0.42 0.12 65)"] +ready: [bg: "oklch(0.97 0.03 240)", border: "oklch(0.86 0.08 240)", text: "oklch(0.40 0.12 240)"] +pending: [bg: "#ffffff", border: "#e2e6ec", text: "#98a2b1"] +failed: [bg: "oklch(0.97 0.04 22)", border: "oklch(0.85 0.10 22)", text: "oklch(0.40 0.16 22)"] +``` + +--- + +## Task 1.2 — Lightweight container chrome + +**File**: `src/features/graph/components/ContainerNode.tsx` (157 lines) + +Current: solid background with colored left border by depth. Spec wants: + +- **Fill**: `rgba(255,255,255,0.55)` (semi-transparent white) +- **Stroke**: `#cdd3dc` dashed (`stroke-dasharray: 4 4`); running containers get status-colored stroke +- **Header**: container title (12px Inter 600) left, sub-label (10px mono, muted) right-aligned +- **Border radius**: 8px +- **No depth-colored left border** — depth is conveyed by nesting level only + +Remove the `getLevelColor` depth border. Container "running" state gets an amber border instead of dashed gray. + +--- + +## Task 1.3 — Floating graph controls + +**File**: `src/components/dag/DAGCanvas.tsx` + +Replace the current status filter bar above the graph with floating control cards **inside** the graph stage: + +**Top-left cluster** (z-5, flex row, gap-8px): +1. **Zoom card**: `+ | − | ⌂` icon buttons in a `.card` with 4px padding +2. **Depth card**: `DEPTH` section-title + segmented `1 | 2 | 3 | all` buttons +3. **Search card**: `SEARCH` section-title + mono placeholder `find a task…` + +**Top-right** (z-5): +- **Minimap**: 200×130px card with custom rendering (colored rectangles per container status + accent selection rect). Hide when drawer is open. 
+ +**Bottom-left** (z-5): +- **Legend**: card with flex row of colored dots + labels: `completed | running | ready | pending | failed` + +Move `RunStatusBar` (currently absolute-positioned above graph) into the floating control cluster or remove it — the status counts are redundant with the legend and the header stats. + +--- + +## Task 1.4 — Run header stats row + +**File**: `src/components/run/RunWorkspacePage.tsx` (header section, lines ~302–313) + +Current stats: `Tasks [total] | Turns [completed] | Score [%]` + +Spec stats: `Tasks: 2·2·1·5 | Tokens: 142k | Cost: $0.18 | Score: —` + +Changes: +- Tasks shows breakdown by status: `completed · running · ready · pending` (dot-separated) +- Add `Tokens` (from `runState` if available, else `—`) +- Add `Cost` (from `runState` if available, else `—`) +- Keep `Score` +- Stats block has `border-right: 1px solid var(--line)` + `padding-right: 8px` separating it from action buttons +- Each stat: `section-title` label (11px uppercase faint) + `mono` value (14px ink) + +--- + +## Task 1.5 — Task drawer: width, position, tab navigation + +**File**: `src/components/workspace/TaskWorkspace.tsx` + +### Width +Change from `w-[360px]` to `w-[460px]` in `RunWorkspacePage.tsx` (line 454). + +### Position +Drawer should be positioned inside the graph stage `
graph-region` div, not inside the `workspace-region
`. This means moving the workspace-region section to be a child of graph-region, with `position: absolute; top: 16px; right: 16px; bottom: 16px;`. + +### Tab navigation +Replace the stacked `WorkspaceSection` accordion pattern with a tab strip: + +``` +Overview | Transitions | Generations | Resources | Evals (N) | Logs +``` + +- Active tab: bottom 2px border in `var(--ink)`, no border-radius +- Inactive: ghost button styling +- Evals tab shows a count badge pill + +Each tab renders its corresponding panel. Only one panel visible at a time. + +### Header structure +``` +[section-title: TASK WORKSPACE] [Pin button] [Close button] +[h3: task_name] [status pill] +[mono caption: task / parent / name · seq N] +``` + +--- + +## Task 1.6 — Task drawer: content sections + +### Worker section +- 24×24 rounded-6px avatar square (dark bg, white initials) +- Worker name (font-weight 500) +- Version kicker (mono 10px in `var(--paper-2)` bg) +- Turn counter right-aligned: `turn N of ≤ M` + +### Transitions section +- Status pill pairs: `pending → ready` with sequence + time on the right +- Trigger description sub-line (11px muted, indented) + +### Current turn section +- Card with `var(--paper-2)` background +- Header: `tool · tool_name` + `duration · exit code` right-aligned +- Command line in mono +- Error output in mono, colored `oklch(0.40 0.16 22)` (failed red) + +### Evals section +- Judge card: status dot + eval name + status pill, kicker with model info, progress bar, streaming preview text (truncated mono) +- Harness card: status dot + eval name + passed pill, kicker with type info, score display (`0.84 / 1.0`), assertion count +- `+ Attach eval` ghost button + +### Resources section +- File rows: mono filename + `version · size` right-aligned, each in a 6px-radius bordered row + +### Footer bar +- Pinned to bottom of drawer +- `var(--paper)` background, top border +- `Open in workspace` button + `Jump to live →` ghost button right-aligned + +--- + +## Task 1.7 — Graph 
edge styling + +**File**: `src/components/dag/edges/GraphDependencyEdge.tsx` + +Edges should use: +- Default: `#cdd3dc` stroke, 1.5px width +- Active (connected to running container): status color +- Bezier curves with configurable curvature +- Arrow markers at endpoints + +--- + +## Verification + +After P1: +- [ ] Nodes are compact (≤60px height) with correct status fills +- [ ] Containers use dashed borders, no depth-colored left bar +- [ ] Floating controls (zoom, depth, search, minimap, legend) are inside graph canvas +- [ ] Drawer is 460px, positioned inside graph stage, has tabs +- [ ] Drawer sections render correctly per active tab +- [ ] `npm run typecheck` passes +- [ ] `activity-stack.spec.ts` still passes (may need selector updates for new DOM structure) +- [ ] New screenshot: `graph-canvas-compact.png` shows compact nodes diff --git a/ergon-dashboard/docs/design-audit/06-P2-activity-stack.md b/ergon-dashboard/docs/design-audit/06-P2-activity-stack.md new file mode 100644 index 00000000..3b67e77f --- /dev/null +++ b/ergon-dashboard/docs/design-audit/06-P2-activity-stack.md @@ -0,0 +1,141 @@ +# P2 — Activity Stack Alignment + +**Goal**: Bring the activity stack (timeline dock) into spec alignment — surface treatment, kind colors, cursor/pin markers, hint row. + +**Addresses**: 2.10 (footer hints), 2.11 (snapshot pin + NOW cursor), 3.1 (light vs dark), 3.2 (kind colors), 5.1 (left rubric), 5.2 (time axis) + +--- + +## Decision: Light vs Dark + +The spec HTML source uses `background: #fafbfc` (very light) for the activity stack dock. However, the rendered reference screenshots (`slide-07-final.png`) appear to show a **dark** dock. + +Looking closely at the final rendered screenshot, the dock IS dark — the deck.js generates a light-background container in the HTML but the rendered screenshots were taken with the dark styling. The current implementation already uses dark (`#070b12`). 
+ +**Recommendation**: Keep the **dark dock** — it provides better contrast for the saturated activity bars, and the rendered screenshots (which represent the designer's final intent) show dark. But adjust the header/rubric text styling to match the screenshot: lighter gray text on dark, not the spec HTML's dark-on-light treatment. + +If the user prefers light, the changes are: swap `bg-[#070b12]` → `bg-[#fafbfc]`, border color → `var(--line)`, text → `var(--ink)`. + +--- + +## Task 2.1 — Kind color alignment + +**File**: `src/features/activity/components/ActivityBar.tsx` + +Audit `KIND_STYLES` map against the spec's 7 kind colors: + +``` +graph_mutation: fill oklch(0.78 0.14 305) text white // magenta +task_execution: fill oklch(0.74 0.16 295) text white // violet +tool_call: fill oklch(0.78 0.16 60) text #1a1207 // amber +message: fill oklch(0.76 0.13 200) text #06181c // cyan +resource: fill oklch(0.74 0.13 155) text #06180e // green +eval: fill oklch(0.70 0.18 25) text white // red +transition: fill oklch(0.74 0.10 240) text white // blue +``` + +Also check the `ActivityKind` type in `src/features/activity/types.ts` — ensure all 7 kinds are defined and mapped. + +Each bar should also have a **start marker circle**: 4.5px radius at the left edge, same fill as bar, with a 2px dark stroke (`#0c1118` or `#fafbfc` depending on light/dark dock). + +--- + +## Task 2.2 — NOW cursor (live leading edge) + +**File**: `src/features/activity/components/ActivityStackTimeline.tsx` + +When in live mode, render at the rightmost event position + 30px: + +1. **Vertical line**: 2px wide, `oklch(0.66 0.18 145)` (green), full height of the stack area, pulsing animation. +2. **NOW pill**: positioned above the line, green background, white mono text `NOW` with a pulsing white dot. +3. **Soft fade gradient**: 60px wide linear-gradient from transparent to `oklch(0.96 0.05 145 / 0.35)` at the leading edge, suggesting live append. 
+ +--- + +## Task 2.3 — Snapshot pin (locked sequence) + +When the graph is locked to a sequence (via clicking an event marker or timeline scrub): + +1. **Indigo vertical line**: 2px, `var(--accent)`, full stack height. +2. **SEQ N pill**: above the line, accent background, white mono text with sequence number. +3. The NOW cursor continues to show at the live edge (both are visible simultaneously). + +In the header bar, add: `graph locked · seq N` in accent color when a snapshot is active. + +--- + +## Task 2.4 — Header bar refinement + +Current header has: label, live pill, seq range. + +Spec header has two sides: +- **Left**: `ACTIVITY STACK` label + `rows are overlap layers, not fixed lanes · streams in real time` description + `Live · auto-tail` green pill + `seq 0 — 214 · streaming` + optional `graph locked · seq N` (accent). +- **Right**: Kind legend — colored dots with labels for all 7 event kinds. + +Move the event-type filter pills currently at the bottom to the right side of the header as a **legend** (non-interactive dots + labels), not clickable filters. + +If filtering by kind is important to keep, add it as a subtle interaction (clicking a legend dot toggles that kind) but the default should be "all visible, legend is informational". + +--- + +## Task 2.5 — Footer hint row + +Replace the current bottom filter pills with: + +``` +Color = kind · Vertical stack = overlap · Click bar = select task/span · +Click ● = lock graph above to that snapshot · Auto-tailing · new events append at right +``` + +Style: `font-size: 10px; color: #a8b0bd;` (faint), flex row with `·` separators. + +--- + +## Task 2.6 — Left rubric + +Current left rubric: "Concurrent activity / Bars stack only / when they overlap" + +Spec: 140px-wide column with: +``` +Concurrent activity (font-weight 600, ink color) +Bars stack only when they overlap. (normal weight, muted) +``` + +Ensure the column is exactly 140px, with a 16px gap to the bar area. 
Typography should use the spec's 11px size with 1.45 line-height. + +--- + +## Task 2.7 — Time axis + +Spec: mono 10px timestamps in an 8-column grid (e.g., `21:33 | 21:34 | ... | 21:39 · now | 21:40`). + +Current implementation likely already has a time axis. Verify: +- Uses `var(--mono)` font +- 10px size +- 8 evenly spaced columns +- Current time slot gets `· now` suffix in green +- Future slots are dimmed (`#cdd3dc`) + +--- + +## Task 2.8 — Playback controls alignment + +The spec shows: `⏮⏮ | ▶ | ⏭⏭` buttons + `0.5x | 1x | 2x | 4x` speed selector + `SEQ 0 — 42 OF 214` display. + +Current has: Play button + speed dropdown + sequence display. Ensure the layout matches (centered in the header bar, between left info and right legend). + +--- + +## Verification + +After P2: +- [ ] Activity bars use the 7 spec kind colors +- [ ] Start marker circles visible on each bar +- [ ] NOW cursor with green pulse at live edge +- [ ] Snapshot pin at locked sequence (indigo) +- [ ] Header has left info + right legend layout +- [ ] Footer has hint text row +- [ ] Left rubric is 140px, properly styled +- [ ] Time axis is 8-column mono with `· now` marker +- [ ] `activity-stack.spec.ts` still passes (update selectors if needed) +- [ ] New screenshot: `activity-stack-aligned.png` diff --git a/ergon-dashboard/docs/design-audit/07-P3-cohort-surfaces.md b/ergon-dashboard/docs/design-audit/07-P3-cohort-surfaces.md new file mode 100644 index 00000000..ff71c563 --- /dev/null +++ b/ergon-dashboard/docs/design-audit/07-P3-cohort-surfaces.md @@ -0,0 +1,134 @@ +# P3 — Cohort Surfaces + +**Goal**: Bring the cohort list, cohort detail, and edge states into spec alignment. + +**Addresses**: 1.2 (cohort detail), 1.3 (edge states), 2.13 (table row structure), and partial 3.6 (pill styling applied here) + +--- + +## Task 3.1 — Cohort list table alignment + +**File**: `src/components/cohorts/CohortListView.tsx` (~669 lines) + +### Page header +Current has its own header with title + stats + filters. 
With P0's Topbar in place, this page should: + +1. Remove any duplicate navigation chrome. +2. Keep the page-level header: `Workspace · diamond` kicker, `Cohorts` h1, subtitle. +3. **Add two rows of segmented controls** (both currently partially exist): + - Status: `All · 42 | Active · 38 | Running · 6 | Needs attention · 2 | Archived · 4` + - Sort: `Recent | Score | Failure rate | Runs` + +### Table columns (7-column grid) +Spec: `grid-template-columns: 2.6fr 1fr 1fr 1fr 1.4fr 1fr 0.8fr` + +| Column | Header | Cell content | +|--------|--------|-------------| +| Cohort | `COHORT` | Name (font-weight 600) + mono ID sub-line (`cohort_001`) | +| Runs | `RUNS` | Mono number | +| Avg score | `AVG SCORE` | Mono percentage | +| Failure | `FAILURE` | Mono percentage, color-coded: >30% red, >15% amber, else muted | +| Runtime · last activity | `RUNTIME · LAST ACTIVITY` | Runtime + relative time (muted text) | +| Status | `STATUS` | `pill--solid` badge | +| (chevron) | — | Right-aligned `›` in faint color | + +Header row: `padding: 12px 20px; border-bottom: 1px solid var(--line); font-size: 11px; color: var(--faint); text-transform: uppercase; letter-spacing: 0.08em;` + +Data rows: `padding: 14px 20px; border-bottom: 1px solid var(--line); font-size: 13px; align-items: center;` + +### Footer +`Showing N of M cohorts` + `Updated HH:MM:SS · live ●` (green dot). + +--- + +## Task 3.2 — Cohort detail: metric tiles + +**File**: `src/components/cohorts/CohortDetailView.tsx` (~216 lines) + +### Breadcrumb +`Cohorts › cohort-name` with muted `Cohorts` link and ink-colored current name. + +### Header +- Cohort name h1 (30px, -0.025em tracking) +- Subtitle: `N runs · started DATE · created by USER` +- Action buttons: `Compare | Re-run failed | Open in training` (primary = `Open in training`) + +### 5 summary metric tiles +5-column grid, each a `.card` with `padding: 18px 20px`: + +1. 
**Resolution**: `section-title` label, large number (34px, -0.02em), delta sub-line (green for improvement). +2. **Runs · pass / fail**: large `312 / 188` (188 in muted), 6px progress bar below (green/red split). +3. **Avg runtime**: large `2:14`, sub-line `min · p95 4:32`. +4. **Avg tasks**: large `11.4`, sub-line `2.1 levels deep · 1.7 retries`. +5. **Cost**: large `$84.20`, sub-line `$0.17 / run · 41M tokens`. + +Data comes from `useCohortDetail().detail.summary` — wire up whatever fields are available from the API. Use `—` for missing fields. + +### Two-column content split +Below the tiles: `grid-template-columns: 1.05fr 1fr; gap: 16px`. + +**Left card: Score distribution chart** +- Header: `SCORE DISTRIBUTION` section-title + description + `Scatter | Histogram | Curve` segmented control +- Chart area: For MVP, render a simple SVG scatter plot (or use a lightweight chart library). Show completed runs as green dots, failed as red dots, running as amber dots with white stroke. Axes: x = runtime, y = score. +- If no charting library is available, render a placeholder with the correct card frame. 
+ +**Right card: Runs list** +- Header: `RUNS` section-title + `N total · M running` + `All | Running | Failed` filter segment +- Scrollable list of run rows: mono run ID, status pill, mono runtime, mono score +- Each row: `grid-template-columns: 1.6fr 0.7fr 0.7fr 1fr; gap: 12px; padding: 13px 20px;` + +--- + +## Task 3.3 — Edge states + +### Empty cohort +When a cohort has 0 runs, show in the runs area: +- Dashed border container, paper background, centered content +- 48×48 icon (⊘ in paper-2 bg), h3 "No runs yet", description text, primary "Launch cohort" button + +### Failed run +When a run status is `failed`, show at the top of the run workspace: +- Rose-tinted card (`oklch(0.98 0.02 22)` bg, `oklch(0.85 0.10 22)` border) +- Failed pill + seq/time info +- Error task name, mono error message (pre-wrap) +- "Last good state" section with info +- `Re-run from seq 0` primary button + `Replay` secondary + +### Connection stale +In `ConnectionStatus.tsx` or as a banner: +- Cancelled dot + "Live socket disconnected" message +- "Falling back to REST · refresh every 5s" sub-text +- If there are unhandled mutations: warning card with amber border + +### No graph yet +In `DAGCanvas` when there are no nodes: +- Dashed border container, centered: "Run hasn't emitted nodes yet" + +--- + +## Task 3.4 — Runs page (tab in topbar) + +The spec has a `Runs` tab in the topbar. This is currently the `/run/[runId]` legacy route, but there's no "all runs" index page. + +For now, the `/` route maps to cohort list. The `Runs` tab can either: +- Link to `/` with a different view mode (table of all runs across cohorts) +- Be marked as "coming soon" in the nav +- Show a filtered version of the cohort list + +Recommendation: Skip for now, mark as future work. The tab should exist in the topbar but can link to `/` with a `?view=runs` param or similar. 
+ +--- + +## Verification + +After P3: +- [ ] Cohort list table has 7 columns with correct headers and cell formats +- [ ] Failure % is color-coded +- [ ] Cohort detail shows 5 metric tiles +- [ ] Score chart area exists (even if placeholder) +- [ ] Runs list in cohort detail has filter segments +- [ ] Empty cohort state renders correctly +- [ ] Failed run state renders correctly +- [ ] No-graph state renders correctly +- [ ] `cohort.snapshot.spec.ts` still passes +- [ ] New screenshots for cohort list and detail diff --git a/ergon-dashboard/docs/design-audit/08-P4-interactions-polish.md b/ergon-dashboard/docs/design-audit/08-P4-interactions-polish.md new file mode 100644 index 00000000..27ed5ac9 --- /dev/null +++ b/ergon-dashboard/docs/design-audit/08-P4-interactions-polish.md @@ -0,0 +1,151 @@ +# P4 — Interactions + Polish + +**Goal**: Add transition animations, wire up keyboard shortcuts, responsive adjustments, and final pixel audit. + +**Addresses**: 1.4 (transitions), 4.2 (snapshot lock), remaining S5 items + +--- + +## Task 4.1 — Drawer enter/exit animation + +**File**: `src/app/globals.css`, `RunWorkspacePage.tsx` + +Current `slideInRight` / `slideOutRight` animations exist but are basic. Enhance: + +1. **Drawer enter** (260ms): slides from right edge, starts 28px offset and 55% opacity → settles at 0 offset, 100% opacity. Uses `cubic-bezier(.22, 1, .36, 1)`. +2. **Selection ring** on the clicked node appears in 80ms (independent of drawer). +3. **Graph reflow**: When drawer opens, the graph stage should compress left by the drawer width. This can be done by adding a right margin/padding to the React Flow container when `isInspectorOpen`. 
+ +```css +@keyframes drawerEnter { + from { transform: translateX(28px); opacity: 0.55; } + to { transform: translateX(0); opacity: 1; } +} +@keyframes drawerExit { + from { transform: translateX(0); opacity: 1; } + to { transform: translateX(28px); opacity: 0; } +} +``` + +--- + +## Task 4.2 — Snapshot lock visual distinction + +**Files**: `RunWorkspacePage.tsx`, `ActivityStackTimeline.tsx` + +When the user clicks an event marker (●) in the timeline: + +1. Graph locks to that sequence (existing behavior via `handleActivityClick`). +2. **Visual indicators**: + - Header chip shows `graph · seq N · TIME` in mono (existing partial implementation). + - Activity stack gets indigo snapshot pin (Task 2.3). + - The live NOW cursor continues to pulse at the right edge. +3. **Esc key** clears the snapshot lock (returns graph to live). +4. **Arrow keys** (←/→) when graph is locked should step ±1 affected node in the mutation log. + +--- + +## Task 4.3 — Keyboard shortcuts completion + +**File**: `RunWorkspacePage.tsx` (already has keydown handler) + +Verify all shortcuts from the spec: +- `Esc`: clear selection → clear snapshot lock → clear filter (cascade) +- `t` / `T`: toggle live/timeline +- `e` / `E`: toggle event stream +- `1-6`: filter by status (existing) +- `⌘K`: focus search (when implemented) +- `⌘D`: open/close drawer (if a node is selected) + +--- + +## Task 4.4 — Cohort row → run workspace transition (T1) + +This is the most complex transition. For MVP: + +1. **Navigate** from `/cohorts/:id` to `/cohorts/:id/runs/:runId` using Next.js router. +2. **During navigation**: the clicked row gets an accent outline (80ms) before the page change. +3. **On the run page**: the header animates in from a compact state (row height → full header height) over 320ms. +4. **Graph + activity stack** rise from below with 60ms stagger. 
+ +Implementation options: +- **View Transitions API** (if browser support is acceptable): Use `document.startViewTransition()` with shared element names on the row chip and header chip. +- **FLIP technique**: Measure row position before navigation, apply inverse transform on mount, animate to identity. +- **Simpler fallback**: Just do a cross-fade between pages. The spec says `reducedMotion: "Cross-fade only · 120 ms · no rise/morph"` — this can be the default implementation for now, with enhanced animation as a follow-up. + +--- + +## Task 4.5 — Event marker → snapshot transition (T3) + +When clicking an event marker: +1. Snapshot pin appears on timeline (180ms). +2. Graph nodes whose status differs at the snapshot sequence animate their fill color (180ms per node delta). +3. Nodes that don't change stay still. + +Implementation: Compare `displayState` at live vs at snapshot sequence. For each node where status changed, apply a CSS transition on `background-color` and `border-color`. + +--- + +## Task 4.6 — Responsive adjustments + +The spec is designed for 1920×1080 and is dense. For smaller viewports: + +- **< 1440px**: Stats row in run header wraps or becomes a dropdown. +- **< 1280px**: Drawer collapses to a bottom sheet instead of right panel. +- **< 1024px**: Activity stack collapses to a thin strip (just the NOW cursor). +- **< 768px**: Topbar hamburger menu for nav tabs. + +These are suggestions — the spec doesn't explicitly define responsive breakpoints. Implement as progressive enhancement. + +--- + +## Task 4.7 — Final pixel audit + +After all phases, do a side-by-side comparison with the spec screenshots: + +1. Open the spec deck in a browser: `open /tmp/ergon-design-spec/index.html` +2. Screenshot each slide at 1920×1080. +3. Compare with the dashboard at the same viewport size. +4. Document any remaining deltas. 
+ +Key areas to check: +- [ ] Font rendering (Inter + JetBrains Mono at correct weights) +- [ ] Color token accuracy (oklch values rendering as expected) +- [ ] Spacing / padding values +- [ ] Border radius consistency +- [ ] Shadow values +- [ ] Animation timing + +--- + +## Task 4.8 — Extended visual debugger screenshots + +Add new e2e test screenshots to `tmp/visual-debugger/`: + +``` +cohort-list.png — Full cohort list page with topbar +cohort-detail.png — Cohort detail with metric tiles + chart +run-workspace-live.png — Run workspace in live mode (updated) +run-workspace-drawer.png — Run workspace with drawer open + snapshot pin +graph-compact.png — Zoomed graph showing compact node styling +activity-stack-full.png — Activity stack with NOW cursor + hint row +empty-cohort.png — Empty cohort state +failed-run.png — Failed run state +``` + +Update `activity-stack.spec.ts` to capture these, gated behind `VISUAL_DEBUGGER_SCREENSHOTS=1`. + +--- + +## Verification + +After P4: +- [ ] Drawer animates in/out with spec timing +- [ ] Graph reflows when drawer opens +- [ ] Snapshot lock has visible pin + header chip +- [ ] Arrow keys step through snapshots +- [ ] Cross-fade on page transitions (at minimum) +- [ ] All 8 new screenshots captured +- [ ] `npm run typecheck` passes +- [ ] Full e2e suite passes +- [ ] Side-by-side with spec screenshots shows high fidelity diff --git a/ergon-dashboard/docs/design-audit/09-verification-strategy.md b/ergon-dashboard/docs/design-audit/09-verification-strategy.md new file mode 100644 index 00000000..8c448d3b --- /dev/null +++ b/ergon-dashboard/docs/design-audit/09-verification-strategy.md @@ -0,0 +1,285 @@ +# Verification Strategy — Making Each Phase Agent-Delegatable + +## Current state of verification + +| Layer | What exists | What's missing | +|-------|------------|----------------| +| **Typecheck** | `npm run typecheck` — full TS coverage | Nothing — this works | +| **E2E harness** | Seed/reset in-memory fixtures, DOM 
assertions on testids, structure | No assertions on *styling* — only that elements exist | +| **Screenshots** | `VISUAL_DEBUGGER_SCREENSHOTS=1` dumps PNGs to `tmp/visual-debugger/` | **Manual inspection only** — no baselines, no pixel-diff, no automated comparison | +| **Visual regression** | None | No `toHaveScreenshot()`, no Playwright visual comparisons, no baseline images | +| **Unit tests** | Some for layout/mutation logic | No tests for design tokens, component rendering, pill colors | +| **Backend data** | Harness seeds cohorts + runs with realistic data | Missing fields: tokens, cost, resolution %, avg tasks per run | + +**Bottom line**: An agent can currently verify that code compiles and that DOM elements exist. It **cannot** verify that the UI *looks right*. That's the gap. + +--- + +## Three pillars of agent-verifiable design work + +### Pillar 1: Structural E2E assertions (DOM correctness) + +For each new/changed component, the agent needs testid-based assertions that verify the **structure** is correct — right elements present, right text content, right hierarchy. 
+ +**What to add per phase**: + +#### P0 — Topbar + tokens +```typescript +// New spec: topbar.spec.ts +test("topbar renders on cohort list", async ({ page }) => { + await seedHarness(page, createDashboardSeed()); + await page.goto("/"); + const topbar = page.getByTestId("topbar"); + await expect(topbar).toBeVisible(); + // Nav tabs + for (const tab of ["Cohorts", "Runs", "Training", "Models", "Settings"]) { + await expect(topbar.getByRole("link", { name: tab })).toBeVisible(); + } + // Active tab + await expect(topbar.getByRole("link", { name: "Cohorts" })).toHaveAttribute("aria-current", "page"); + // Search bar + await expect(topbar.getByPlaceholder(/search/i)).toBeVisible(); + // User avatar + await expect(topbar.getByTestId("user-avatar")).toBeVisible(); +}); + +test("topbar renders on run page with Runs active", async ({ page }) => { + // ...seed + navigate to run + await expect(topbar.getByRole("link", { name: "Runs" })).toHaveAttribute("aria-current", "page"); +}); +``` + +#### P1 — Graph + drawer +```typescript +// Extend activity-stack.spec.ts or new graph.spec.ts +test("graph has floating controls", async ({ page }) => { + await expect(page.getByTestId("graph-zoom-controls")).toBeVisible(); + await expect(page.getByTestId("graph-depth-selector")).toBeVisible(); + await expect(page.getByTestId("graph-search")).toBeVisible(); + await expect(page.getByTestId("graph-legend")).toBeVisible(); + await expect(page.getByTestId("graph-minimap")).toBeVisible(); +}); + +test("drawer has tab navigation", async ({ page }) => { + // click a node to open drawer + await page.getByTestId("graph-canvas").locator(".react-flow__node").first().click(); + const drawer = page.getByTestId("workspace-region"); + for (const tab of ["Overview", "Transitions", "Generations", "Resources", "Evals", "Logs"]) { + await expect(drawer.getByRole("tab", { name: new RegExp(tab) })).toBeVisible(); + } +}); + +test("drawer is 460px wide", async ({ page }) => { + // ...open drawer + const box = 
await page.getByTestId("workspace-region").boundingBox(); + expect(box?.width).toBeCloseTo(460, -1); // within 5px — toBeCloseTo(x, -1) asserts |Δ| < 10/2 +}); + +test("run header shows tokens and cost", async ({ page }) => { + await expect(page.getByTestId("stat-tokens")).toBeVisible(); + await expect(page.getByTestId("stat-cost")).toBeVisible(); +}); +``` + +#### P2 — Activity stack +```typescript +test("activity stack has NOW cursor in live mode", async ({ page }) => { + await expect(page.getByTestId("now-cursor")).toBeVisible(); + await expect(page.getByTestId("now-cursor-pill")).toHaveText(/NOW/); +}); + +test("activity stack shows snapshot pin after event click", async ({ page }) => { + // click an activity bar + await page.getByTestId("activity-stack-region").locator("[data-activity-id]").first().click(); + await expect(page.getByTestId("snapshot-pin")).toBeVisible(); +}); + +test("activity stack has kind legend in header", async ({ page }) => { + for (const kind of ["graph mutation", "task", "tool call", "message", "resource", "eval"]) { + await expect(page.getByTestId("activity-kind-legend").getByText(kind)).toBeVisible(); + } +}); + +test("activity stack has footer hints", async ({ page }) => { + await expect(page.getByTestId("activity-footer-hints")).toBeVisible(); + await expect(page.getByTestId("activity-footer-hints")).toContainText("Color = kind"); +}); +``` + +#### P3 — Cohorts +```typescript +// New spec: cohort-design.spec.ts +test("cohort list has 7-column header", async ({ page }) => { + const headers = page.getByTestId("cohort-table-header"); + for (const col of ["Cohort", "Runs", "Avg score", "Failure", "Runtime", "Status"]) { + await expect(headers.getByText(col, { exact: false })).toBeVisible(); + } +}); + +test("cohort detail has 5 metric tiles", async ({ page }) => { + // navigate to cohort detail + for (const metric of ["resolution", "runs-pass-fail", "avg-runtime", "avg-tasks", "cost"]) { + await expect(page.getByTestId(`metric-tile-${metric}`)).toBeVisible(); + } +}); + 
+### Pillar 2: Visual regression via Playwright `toHaveScreenshot()` + +This is the **most important missing piece**. Playwright has built-in visual comparison: + +```typescript +// First run generates baseline images in tests/e2e/*.spec.ts-snapshots/ +// Subsequent runs compare against baselines with configurable threshold +await expect(page).toHaveScreenshot("cohort-list.png", { + maxDiffPixelRatio: 0.01, // 1% tolerance +}); + +await expect(page.getByTestId("graph-region")).toHaveScreenshot("graph-compact-nodes.png", { + maxDiffPixelRatio: 0.02, +}); +``` + +**Setup needed**: +1. Add `expect.toHaveScreenshot.maxDiffPixelRatio` to `playwright.config.ts` +2. Generate baseline screenshots from the *completed* design work +3. Commit baselines to `tests/e2e/*.spec.ts-snapshots/` (Playwright's convention) + +**Per-phase screenshot gates**: + +| Phase | Screenshots to baseline | +|-------|------------------------| +| P0 | `topbar.png`, `cohort-list-with-topbar.png` | +| P1 | `graph-compact-nodes.png`, `graph-floating-controls.png`, `drawer-open.png`, `drawer-tabs.png` | +| P2 | `activity-stack-live.png`, `activity-stack-snapshot.png` | +| P3 | `cohort-list-table.png`, `cohort-detail-tiles.png`, `empty-cohort.png` | +| P4 | Full page screenshots at 1920×1080 matching spec slides | + +**Workflow for agents**: The agent runs `npx playwright test --update-snapshots` after making changes, then the baselines get committed. On review, a human checks the baseline diffs. For subsequent agents, the baselines serve as regression gates. 
+ +### Pillar 3: Computed style assertions (CSS correctness) + +For design-token work where screenshots are overkill but DOM assertions aren't enough: + +```typescript +test("body uses Inter font", async ({ page }) => { + const fontFamily = await page.evaluate(() => + getComputedStyle(document.body).fontFamily + ); + expect(fontFamily).toContain("Inter"); +}); + +test("status pill uses correct oklch colors", async ({ page }) => { + const pill = page.locator("[data-status='running'] .swatch").first(); + const bg = await pill.evaluate((el) => getComputedStyle(el).backgroundColor); + // oklch(0.78 0.14 80) ≈ rgb(226, 185, 77) — check approximate + expect(bg).toMatch(/rgb\(2[12]\d, 1[78]\d, [67]\d\)/); +}); + +test("drawer width is 460px", async ({ page }) => { + const width = await page.getByTestId("workspace-region").evaluate( + (el) => el.getBoundingClientRect().width + ); + expect(width).toBeCloseTo(460, -1); +}); +``` + +--- + +## Backend gaps — what's missing from the API + +The design spec shows data that **does not exist** in the current API contracts: + +| Spec field | Where shown | API status | +|------------|------------|------------| +| **Tokens** (per run) | Run header: `Tokens: 142k` | **Not in schema**. Not on `CohortRunRow`, `WorkflowRunState`, or `CohortSummary`. | +| **Cost** (per run) | Run header: `Cost: $0.18` | **Not in schema**. | +| **Cost** (per cohort) | Cohort detail tile: `$84.20` | **Not in schema**. | +| **Resolution %** | Cohort detail tile: `62.4%` | **Computable**: `completed / total` from `status_counts`, but no explicit `resolution_rate` field. | +| **Avg tasks per run** | Cohort detail tile: `11.4` | **Not in schema**. Individual runs have `totalTasks` but no cohort-level aggregate. | +| **Depth levels** | Cohort detail tile: `2.1 levels deep` | **Not in schema**. | +| **Retries** | Cohort detail tile: `1.7 retries` | **Not in schema**. | +| **Tokens** (per cohort) | Cohort detail tile: `41M tokens` | **Not in schema**. 
| +| **p95 runtime** | Cohort detail tile: `p95 4:32` | **Not in schema**. Only `average_duration_ms`. | + +### Options + +**Option A: Backend implements these fields** — Add token/cost tracking to the Ergon core, aggregate at cohort level. This is the "right" answer but requires backend work. + +**Option B: Compute client-side where possible** — Resolution = `completed / total`. Avg tasks requires iterating runs (expensive). Tokens/cost need backend support. + +**Option C: Show what we have, use `—` for missing** — The dashboard fixtures can seed fake values. The harness already uses arbitrary data. We can add `tokens`, `cost` fields to the **fixture** data and show them in the UI, with the real backend catching up later. + +**Recommendation: Option C for now.** Extend the test fixtures to include `tokens` and `cost` fields on run/cohort data. The UI renders them. The harness tests verify the rendering. When the backend adds real fields, the UI just works. + +### Fixture extensions needed + +In `tests/helpers/dashboardFixtures.ts`, extend: + +```typescript +// On CohortSummary extras: +extras: { + total_tokens: 41_000_000, + total_cost_usd: 84.20, + avg_tasks_per_run: 11.4, + avg_depth: 2.1, + avg_retries: 1.7, + p95_duration_ms: 272_000, // 4:32 +} + +// On CohortRunRow or WorkflowRunState extras: +extras: { + total_tokens: 142_000, + cost_usd: 0.18, +} +``` + +Since the Zod schemas use `.passthrough()`, extra fields survive parsing. + +--- + +## Putting it together: the agent delegation loop + +For each phase, the agent receives: + +1. **The plan document** (e.g., `04-P0-design-foundations.md`) +2. **A test spec file** with all structural assertions pre-written (failing) +3. **Baseline screenshots** (if phase > P0 — generated from the previous phase's output) + +The agent's job: +1. Implement the changes described in the plan +2. Run `npm run typecheck` — must pass +3. Run the phase's test spec — all assertions must pass +4. 
Run `npx playwright test --update-snapshots` to generate new baselines +5. Run the full e2e suite — no regressions + +The verification is **automated except for baseline review**. A human reviews the screenshot baselines once; after that, agents can't regress them. + +### Concrete test files to write BEFORE delegating + +| Phase | Test file to pre-write | Assertions | +|-------|----------------------|------------| +| P0 | `tests/e2e/topbar.spec.ts` | Topbar visible on all pages, nav tabs, search, avatar, active tab, font-family check | +| P1 | `tests/e2e/graph-design.spec.ts` | Floating controls, compact nodes (bounding box height ≤ 80px), drawer width, drawer tabs, stats row content | +| P2 | `tests/e2e/activity-design.spec.ts` | NOW cursor, snapshot pin, kind legend, footer hints, left rubric width | +| P3 | `tests/e2e/cohort-design.spec.ts` | 7-column table, metric tiles, empty state, chart area | +| P4 | `tests/e2e/visual-regression.spec.ts` | `toHaveScreenshot()` for all 8 key views | + +--- + +## Implementation order for verification infra + +Before delegating ANY phase to an agent: + +1. **Enable `toHaveScreenshot`** in playwright config (set threshold, snapshot dir) +2. **Write the failing test specs** for P0 (topbar.spec.ts) +3. **Extend fixtures** with tokens/cost/resolution extras +4. **Add testids** to the plan documents so the agent knows where to place them +5. Delegate P0 with the test spec as the acceptance criterion + +After P0 lands: +6. Generate P0 screenshot baselines +7. Write failing specs for P1 + P2 (can parallelize) +8. Delegate P1 and P3 in parallel +9. After P1: write P2 specs, delegate P2 +10. 
After all: write P4 visual regression spec, delegate P4 diff --git a/ergon-dashboard/mockups/trace-spans-semantic-bands.html b/ergon-dashboard/mockups/trace-spans-semantic-bands.html new file mode 100644 index 00000000..eeaf8cc1 --- /dev/null +++ b/ergon-dashboard/mockups/trace-spans-semantic-bands.html @@ -0,0 +1,643 @@ + + + + + + Trace Spans Semantic Bands Mockup + + + +
+
+
+

Trace Spans: semantic bands mockup

+
+ Bands carry meaning. Sub-rows inside a band are only collision avoidance. The trace layout stays immutable while the blue cursor moves. +
+
+
+ execution span + sandbox span + graph + tool/context + message + artifact + evaluation +
+
+ +
+
+ + Selected: d_left execution +
+
+ Related events stay bright across semantic bands. Unrelated events fade. Temporary connectors show why the selected work span matters without drawing every relationship all the time. +
+ +
+ +
+ + + +
+
Trace spans
+
10:32
+
10:34
+
10:36
+
10:38
+
10:40
+
10:42
+
+ +
+ +
+
d_root execution
+
x_a execution
+
d_left execution
+ + +
+
+ +
+ +
+
+
+

Graph mutation: node.status_changed

+
Summary first, raw payload second.
+
+
task
d_root
+
sequence
36
+
time
10:37:14
+
actor
manager
+
status
running to completed
+
+
+ Raw payload +
{
+  "mutation_type": "node.status_changed",
+  "target_id": "d_root",
+  "new_value": { "status": "completed" }
+}
+
+
+
+
+ + +
+
+5
+
+
+ +
+ +
+
+ + + + +
+
+
+
+ +
+ +
+
+ +
+
+
+ +
+ +
+ + +
+
+
+
+ + +
+ +
+ 24 trace rows across 5 semantic bands · 93 events · 0 hidden + Band = semantic category · sub-row = collision avoidance · cursor = selected replay point +
+
+ + diff --git a/ergon-dashboard/scripts/generate-rest-contracts.mjs b/ergon-dashboard/scripts/generate-rest-contracts.mjs index 00825f29..558239b9 100644 --- a/ergon-dashboard/scripts/generate-rest-contracts.mjs +++ b/ergon-dashboard/scripts/generate-rest-contracts.mjs @@ -19,4 +19,7 @@ if (markerIndex === -1) { const schemasOnlySource = source.slice(0, markerIndex).trimEnd(); -writeFileSync(contractsPath, `${schemasOnlySource}\n`); +writeFileSync( + contractsPath, + `/* eslint-disable @typescript-eslint/no-empty-object-type */\n${schemasOnlySource}\n`, +); diff --git a/ergon-dashboard/src/app/api/health/health.test.ts b/ergon-dashboard/src/app/api/health/health.test.ts new file mode 100644 index 00000000..6412c1c5 --- /dev/null +++ b/ergon-dashboard/src/app/api/health/health.test.ts @@ -0,0 +1,169 @@ +import assert from "node:assert/strict"; +import { describe, it } from "node:test"; + +/** + * Unit tests for the /api/health endpoint logic. + * + * These tests verify that: + * 1. The health check returns "healthy" when all imports and API are OK + * 2. SSR import failures are surfaced as "degraded" with actionable messages + * 3. Ergon API failures are surfaced as "degraded" + * 4. Both failing at once reports both errors + */ + +// Extracted health-check logic (mirrors what route.ts does, testable without Next.js runtime) +interface HealthResult { + status: "healthy" | "degraded"; + checks: Record; + errors: string[]; +} + +async function runHealthChecks(deps: { + importSSRModules: () => Promise<{ parseRunSnapshot: unknown; TaskStatus: unknown }>; + fetchErgonApi: (path: string) => Promise<{ ok: boolean; status: number }>; +}): Promise { + const checks: Record = {}; + const errors: string[] = []; + + try { + const { parseRunSnapshot, TaskStatus } = await deps.importSSRModules(); + checks.ssr_imports = + typeof parseRunSnapshot === "function" && typeof TaskStatus !== "undefined" + ? 
"ok" + : "fail"; + } catch (e) { + checks.ssr_imports = "fail"; + errors.push(`SSR import failure: ${e instanceof Error ? e.message : String(e)}`); + } + + try { + const res = await deps.fetchErgonApi("/cohorts?limit=1"); + checks.ergon_api = res.ok ? "ok" : "fail"; + if (!res.ok) errors.push(`Ergon API returned ${res.status}`); + } catch (e) { + checks.ergon_api = "fail"; + errors.push(`Ergon API unreachable: ${e instanceof Error ? e.message : String(e)}`); + } + + const healthy = Object.values(checks).every((v) => v === "ok"); + return { status: healthy ? "healthy" : "degraded", checks, errors }; +} + +describe("Health check logic", () => { + const okImport = async () => ({ + parseRunSnapshot: () => {}, + TaskStatus: { COMPLETED: "completed" }, + }); + + const okApi = async () => ({ ok: true, status: 200 }); + + it("returns healthy when imports and API both succeed", async () => { + const result = await runHealthChecks({ + importSSRModules: okImport, + fetchErgonApi: okApi, + }); + + assert.equal(result.status, "healthy"); + assert.equal(result.checks.ssr_imports, "ok"); + assert.equal(result.checks.ergon_api, "ok"); + assert.equal(result.errors.length, 0); + }); + + it("returns degraded with SSR error when imports fail (stale build)", async () => { + const result = await runHealthChecks({ + importSSRModules: async () => { + throw new Error("Cannot find module './421.js'"); + }, + fetchErgonApi: okApi, + }); + + assert.equal(result.status, "degraded"); + assert.equal(result.checks.ssr_imports, "fail"); + assert.equal(result.checks.ergon_api, "ok"); + assert.equal(result.errors.length, 1); + assert.match(result.errors[0], /Cannot find module/); + assert.match(result.errors[0], /SSR import failure/); + }); + + it("returns degraded when Ergon API is unreachable", async () => { + const result = await runHealthChecks({ + importSSRModules: okImport, + fetchErgonApi: async () => { + throw new Error("fetch failed: ECONNREFUSED"); + }, + }); + + 
assert.equal(result.status, "degraded"); + assert.equal(result.checks.ssr_imports, "ok"); + assert.equal(result.checks.ergon_api, "fail"); + assert.equal(result.errors.length, 1); + assert.match(result.errors[0], /Ergon API unreachable/); + }); + + it("returns degraded when Ergon API returns non-200", async () => { + const result = await runHealthChecks({ + importSSRModules: okImport, + fetchErgonApi: async () => ({ ok: false, status: 503 }), + }); + + assert.equal(result.status, "degraded"); + assert.equal(result.checks.ergon_api, "fail"); + assert.match(result.errors[0], /Ergon API returned 503/); + }); + + it("reports both errors when both SSR and API fail", async () => { + const result = await runHealthChecks({ + importSSRModules: async () => { + throw new Error("Cannot find module './999.js'"); + }, + fetchErgonApi: async () => { + throw new Error("ECONNREFUSED"); + }, + }); + + assert.equal(result.status, "degraded"); + assert.equal(result.checks.ssr_imports, "fail"); + assert.equal(result.checks.ergon_api, "fail"); + assert.equal(result.errors.length, 2); + }); + + it("returns fail for ssr_imports when parseRunSnapshot is not a function", async () => { + const result = await runHealthChecks({ + importSSRModules: async () => ({ + parseRunSnapshot: "not-a-function", + TaskStatus: { COMPLETED: "completed" }, + }), + fetchErgonApi: okApi, + }); + + assert.equal(result.status, "degraded"); + assert.equal(result.checks.ssr_imports, "fail"); + }); +}); + +describe("SSR error classification", () => { + function classifySSRError(msg: string): string { + if (msg.includes("Cannot find module")) { + return "Stale build — the .next cache is corrupted. 
Restart the dev server (rm -rf .next && docker compose restart dashboard)."; + } + return `Server-side data fetch failed: ${msg}`; + } + + it("classifies 'Cannot find module' as stale build", () => { + const result = classifySSRError("Cannot find module './421.js'"); + assert.match(result, /Stale build/); + assert.match(result, /rm -rf .next/); + }); + + it("classifies other errors as generic fetch failure", () => { + const result = classifySSRError("ECONNREFUSED 127.0.0.1:9000"); + assert.match(result, /Server-side data fetch failed/); + assert.match(result, /ECONNREFUSED/); + }); + + it("classifies timeout as generic fetch failure", () => { + const result = classifySSRError("The operation was aborted due to timeout"); + assert.match(result, /Server-side data fetch failed/); + assert.match(result, /timeout/); + }); +}); diff --git a/ergon-dashboard/src/app/api/health/route.ts b/ergon-dashboard/src/app/api/health/route.ts new file mode 100644 index 00000000..59e49aba --- /dev/null +++ b/ergon-dashboard/src/app/api/health/route.ts @@ -0,0 +1,55 @@ +import { NextResponse } from "next/server"; +import { config } from "@/lib/config"; +import { fetchErgonApi } from "@/lib/serverApi"; + +/** + * GET /api/health + * + * Lightweight probe that exercises the SSR import graph — the exact code path + * that breaks when .next chunks go stale. Returns build metadata + upstream + * Ergon API reachability so the client can surface actionable error toasts + * instead of silent data loss. + */ +export async function GET() { + const checks: Record = {}; + const errors: string[] = []; + + // 1. Verify critical SSR modules are importable (catches "Cannot find module './421.js'" class of bugs) + try { + const rest = await import("@/lib/contracts/rest"); + const types = await import("@/lib/types"); + checks.ssr_imports = + typeof rest.parseRunSnapshot === "function" && typeof types.TaskStatus !== "undefined" + ? 
"ok" + : "fail"; + } catch (e) { + checks.ssr_imports = "fail"; + errors.push(`SSR import failure: ${e instanceof Error ? e.message : String(e)}`); + } + + // 2. Verify upstream Ergon API is reachable + try { + const res = await fetchErgonApi("/cohorts?limit=1"); + checks.ergon_api = res.ok ? "ok" : "fail"; + if (!res.ok) errors.push(`Ergon API returned ${res.status}`); + } catch (e) { + checks.ergon_api = "fail"; + errors.push(`Ergon API unreachable: ${e instanceof Error ? e.message : String(e)}`); + } + + const healthy = Object.values(checks).every((v) => v === "ok"); + + return NextResponse.json( + { + status: healthy ? "healthy" : "degraded", + checks, + errors: errors.length > 0 ? errors : undefined, + build: { + nodeEnv: config.nodeEnv, + timestamp: process.env.BUILD_TIMESTAMP ?? null, + pid: process.pid, + }, + }, + { status: healthy ? 200 : 503 }, + ); +} diff --git a/ergon-dashboard/src/app/api/runs/[runId]/mutations/route.ts b/ergon-dashboard/src/app/api/runs/[runId]/mutations/route.ts index 8358a635..9da3ad3a 100644 --- a/ergon-dashboard/src/app/api/runs/[runId]/mutations/route.ts +++ b/ergon-dashboard/src/app/api/runs/[runId]/mutations/route.ts @@ -1,6 +1,8 @@ import { NextResponse } from "next/server"; +import { config } from "@/lib/config"; import { fetchErgonApi } from "@/lib/serverApi"; +import { getHarnessRunMutations } from "@/lib/testing/dashboardHarness"; interface RouteContext { params: Promise<{ @@ -12,6 +14,12 @@ export async function GET(_request: Request, context: RouteContext) { const { runId } = await context.params; try { + if (config.enableTestHarness) { + const harnessMutations = getHarnessRunMutations(runId); + if (harnessMutations) { + return NextResponse.json(harnessMutations); + } + } const response = await fetchErgonApi(`/runs/${runId}/mutations`); const body = await response.json(); if (response.ok) { diff --git a/ergon-dashboard/src/app/cohorts/[cohortId]/runs/[runId]/page.tsx 
b/ergon-dashboard/src/app/cohorts/[cohortId]/runs/[runId]/page.tsx index 6dbeea93..6bc16e39 100644 --- a/ergon-dashboard/src/app/cohorts/[cohortId]/runs/[runId]/page.tsx +++ b/ergon-dashboard/src/app/cohorts/[cohortId]/runs/[runId]/page.tsx @@ -16,6 +16,7 @@ export default async function CohortRunPage({ params }: CohortRunPageProps) { const { cohortId, runId } = await params; let initialRunState: SerializedWorkflowRunState | null = null; let initialCohortDetail: CohortDetail | null = null; + let ssrError: string | null = null; if (config.enableTestHarness) { initialRunState = getHarnessRun(runId); @@ -28,11 +29,18 @@ export default async function CohortRunPage({ params }: CohortRunPageProps) { ]); if (runResponse.ok) { initialRunState = parseRunSnapshot(await runResponse.json()); + } else { + ssrError = `Run API returned ${runResponse.status}`; } if (cohortResponse.ok) { initialCohortDetail = parseCohortDetail(await cohortResponse.json()); } - } catch { + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + console.error(`[CohortRunPage] SSR fetch failed for run ${runId}:`, msg); + ssrError = msg.includes("Cannot find module") + ? "Stale build — the .next cache is corrupted. Restart the dev server (rm -rf .next && docker compose restart dashboard)." 
+ : `Server-side data fetch failed: ${msg}`; initialRunState = null; initialCohortDetail = null; } @@ -44,6 +52,7 @@ export default async function CohortRunPage({ params }: CohortRunPageProps) { runId={runId} initialRunState={initialRunState} initialCohortDetail={initialCohortDetail} + ssrError={ssrError} /> ); } diff --git a/ergon-dashboard/src/app/globals.css b/ergon-dashboard/src/app/globals.css index 82550148..6ee91630 100644 --- a/ergon-dashboard/src/app/globals.css +++ b/ergon-dashboard/src/app/globals.css @@ -2,22 +2,61 @@ @tailwind components; @tailwind utilities; -:root { - --background: #ffffff; - --foreground: #171717; -} +/* ═══════════════════════════════════════════════════════════ + Ergon Design System Tokens + Source: ergon.zip design deck · v0.1 · 2026.04.26 + ═══════════════════════════════════════════════════════════ */ -@media (prefers-color-scheme: dark) { - :root { - --background: #0a0a0a; - --foreground: #ededed; - } +:root { + /* Surfaces */ + --paper: #f6f7f9; + --paper-2: #eef0f3; + --paper-3: #e6e9ee; + --card: #ffffff; + --ink: #0c1118; + --ink-2: #1f2733; + --muted: #64707f; + --faint: #98a2b1; + --line: #e2e6ec; + --line-strong: #cdd3dc; + + /* Status — oklch single chroma family */ + --status-pending: oklch(0.72 0.02 250); + --status-ready: oklch(0.74 0.10 240); + --status-running: oklch(0.78 0.14 80); + --status-completed: oklch(0.70 0.13 155); + --status-failed: oklch(0.68 0.18 22); + --status-cancelled: oklch(0.62 0.02 260); + + /* Accent — indigo; used ONLY for selection and snapshot pin */ + --accent: oklch(0.62 0.16 252); + --accent-soft: oklch(0.94 0.04 252); + --accent-ink: oklch(0.32 0.12 252); + + /* Radii */ + --radius: 10px; + --radius-sm: 6px; + + /* Shadows */ + --shadow-sm: 0 1px 2px rgb(12 17 24 / 0.04); + --shadow: 0 1px 2px rgb(12 17 24 / 0.05), 0 4px 12px rgb(12 17 24 / 0.04); + --shadow-pop: 0 8px 24px rgb(12 17 24 / 0.08), 0 1px 2px rgb(12 17 24 / 0.05); + + /* Fonts — resolved by next/font CSS variables */ + 
--font: var(--font-inter), ui-sans-serif, system-ui, -apple-system, sans-serif; + --mono: var(--font-jetbrains-mono), ui-monospace, SFMono-Regular, Menlo, monospace; + + /* Legacy compat — old references still work */ + --background: var(--paper); + --foreground: var(--ink); } body { - color: var(--foreground); - background: var(--background); - font-family: Arial, Helvetica, sans-serif; + color: var(--ink); + background: var(--paper); + font-family: var(--font); + font-feature-settings: "ss01", "cv11"; + -webkit-font-smoothing: antialiased; } @layer utilities { @@ -26,86 +65,68 @@ body { } } -/* React Flow Customizations */ +/* ═══════════════════════════════════════════════════════════ + React Flow Overrides + ═══════════════════════════════════════════════════════════ */ + .react-flow__node { - /* Remove default react-flow node styles */ background: transparent; border: none; padding: 0; } .react-flow__handle { - /* Better handle visibility */ - opacity: 0.8; + opacity: 0; transition: opacity 0.2s; } .react-flow__node:hover .react-flow__handle { - opacity: 1; + opacity: 0.6; } -/* Dark mode adjustments for react-flow */ -@media (prefers-color-scheme: dark) { - .react-flow__controls button { - background: #1f2937; - border-color: #374151; - color: #e5e7eb; - } - - .react-flow__controls button:hover { - background: #374151; - } - - .react-flow__controls button svg { - fill: #e5e7eb; - } - - .react-flow__minimap { - background: #1f2937; - } - - .react-flow__minimap-mask { - fill: rgba(0, 0, 0, 0.5); - } - - .react-flow__background pattern { - stroke: #374151; - } -} +/* ═══════════════════════════════════════════════════════════ + Status Animations + ═══════════════════════════════════════════════════════════ */ -/* Animation for running status */ @keyframes status-pulse { - 0%, 100% { - opacity: 1; - transform: scale(1); - } - 50% { - opacity: 0.7; - transform: scale(1.02); - } + 0%, 100% { opacity: 1; transform: scale(1); } + 50% { opacity: 0.55; transform: 
scale(0.85); } } .animate-status-pulse { - animation: status-pulse 2s ease-in-out infinite; + animation: status-pulse 1.6s ease-in-out infinite; +} + +/* ═══════════════════════════════════════════════════════════ + Panel Animations + ═══════════════════════════════════════════════════════════ */ + +@keyframes drawerEnter { + from { transform: translateX(28px); opacity: 0.55; } + to { transform: translateX(0); opacity: 1; } +} + +@keyframes drawerExit { + from { transform: translateX(0); opacity: 1; } + to { transform: translateX(28px); opacity: 0; } +} + +.animate-drawer-enter { + animation: drawerEnter 0.26s cubic-bezier(0.22, 1, 0.36, 1) forwards; +} + +.animate-drawer-exit { + animation: drawerExit 0.2s ease-in forwards; } -/* Slide-in animation for panels */ @keyframes slideInRight { - from { - transform: translateX(100%); - } - to { - transform: translateX(0); - } + from { transform: translateX(100%); } + to { transform: translateX(0); } } @keyframes slideOutRight { - from { - transform: translateX(0); - } - to { - transform: translateX(100%); - } + from { transform: translateX(0); } + to { transform: translateX(100%); } } .animate-slide-in-right { @@ -116,76 +137,65 @@ body { animation: slideOutRight 0.2s ease-in forwards; } -/* Responsive adjustments */ -@media (max-width: 640px) { - /* Hide minimap on small screens */ - .react-flow__minimap { - display: none; - } - - /* Smaller controls on mobile */ - .react-flow__controls { - transform: scale(0.85); - transform-origin: bottom left; - } +/* ═══════════════════════════════════════════════════════════ + Custom Edge Animation + ═══════════════════════════════════════════════════════════ */ + +@keyframes ergon-edge-dash { + to { stroke-dashoffset: -24; } } -@media (max-width: 768px) { - /* Adjust panel positioning on tablets */ - .react-flow__panel { - max-width: calc(100vw - 2rem); - } +.ergon-edge-animated { + stroke-dasharray: 6 6; + animation: ergon-edge-dash 0.6s linear infinite; } -/* Custom scrollbar 
for better aesthetics */ +/* ═══════════════════════════════════════════════════════════ + Scrollbar + ═══════════════════════════════════════════════════════════ */ + @layer utilities { .scrollbar-thin { scrollbar-width: thin; - scrollbar-color: rgb(156, 163, 175) transparent; + scrollbar-color: var(--line-strong) transparent; } - + .scrollbar-thin::-webkit-scrollbar { width: 6px; height: 6px; } - + .scrollbar-thin::-webkit-scrollbar-track { background: transparent; } - + .scrollbar-thin::-webkit-scrollbar-thumb { - background-color: rgb(156, 163, 175); + background-color: var(--line-strong); border-radius: 3px; } - + .scrollbar-thin::-webkit-scrollbar-thumb:hover { - background-color: rgb(107, 114, 128); + background-color: var(--muted); } } -/* Dark mode scrollbar */ -@media (prefers-color-scheme: dark) { - .scrollbar-thin { - scrollbar-color: rgb(75, 85, 99) transparent; - } - - .scrollbar-thin::-webkit-scrollbar-thumb { - background-color: rgb(75, 85, 99); - } - - .scrollbar-thin::-webkit-scrollbar-thumb:hover { - background-color: rgb(107, 114, 128); +/* ═══════════════════════════════════════════════════════════ + Responsive + ═══════════════════════════════════════════════════════════ */ + +@media (max-width: 640px) { + .react-flow__minimap { + display: none; } -} -/* Custom React Flow edges — dashed “flow” when task is running */ -@keyframes ergon-edge-dash { - to { - stroke-dashoffset: -24; + .react-flow__controls { + transform: scale(0.85); + transform-origin: bottom left; } } -.ergon-edge-animated { - stroke-dasharray: 6 6; - animation: ergon-edge-dash 0.6s linear infinite; +@media (max-width: 768px) { + .react-flow__panel { + max-width: calc(100vw - 2rem); + } } diff --git a/ergon-dashboard/src/app/layout.tsx b/ergon-dashboard/src/app/layout.tsx index 26c9081b..dc5cae44 100644 --- a/ergon-dashboard/src/app/layout.tsx +++ b/ergon-dashboard/src/app/layout.tsx @@ -1,18 +1,19 @@ import type { Metadata } from "next"; -import localFont from 
"next/font/local"; +import { Inter, JetBrains_Mono } from "next/font/google"; import "./globals.css"; import { SocketProvider } from "@/providers/SocketProvider"; import { ClientLayout } from "@/components/common/ClientLayout"; -const geistSans = localFont({ - src: "./fonts/GeistVF.woff", - variable: "--font-geist-sans", - weight: "100 900", +const inter = Inter({ + subsets: ["latin"], + variable: "--font-inter", + display: "swap", }); -const geistMono = localFont({ - src: "./fonts/GeistMonoVF.woff", - variable: "--font-geist-mono", - weight: "100 900", + +const jetbrainsMono = JetBrains_Mono({ + subsets: ["latin"], + variable: "--font-jetbrains-mono", + display: "swap", }); export const metadata: Metadata = { @@ -28,7 +29,7 @@ export default function RootLayout({ return ( {children} diff --git a/ergon-dashboard/src/app/models/page.tsx b/ergon-dashboard/src/app/models/page.tsx new file mode 100644 index 00000000..90280a09 --- /dev/null +++ b/ergon-dashboard/src/app/models/page.tsx @@ -0,0 +1,28 @@ +export default function ModelsPage() { + return ( +
+
+
+ + Workspace + +

+ Models +

+

+ View trained model checkpoints, evaluation scores, and deployment status. +

+
+
+
+
+
+

Coming soon

+

+ The model registry is under development. Use the training page to track active sessions. +

+
+
+
+ ); +} diff --git a/ergon-dashboard/src/app/run/[runId]/page.tsx b/ergon-dashboard/src/app/run/[runId]/page.tsx index b3e8a38d..14547e95 100644 --- a/ergon-dashboard/src/app/run/[runId]/page.tsx +++ b/ergon-dashboard/src/app/run/[runId]/page.tsx @@ -14,6 +14,7 @@ interface LegacyRunPageProps { export default async function RunPage({ params }: LegacyRunPageProps) { const { runId } = await params; let initialRunState: SerializedWorkflowRunState | null = null; + let ssrError: string | null = null; if (config.enableTestHarness) { initialRunState = getHarnessRun(runId); @@ -22,11 +23,18 @@ export default async function RunPage({ params }: LegacyRunPageProps) { const response = await fetchErgonApi(`/runs/${runId}`); if (response.ok) { initialRunState = parseRunSnapshot(await response.json()); + } else { + ssrError = `Run API returned ${response.status}`; } - } catch { + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + console.error(`[RunPage] SSR fetch failed for run ${runId}:`, msg); + ssrError = msg.includes("Cannot find module") + ? "Stale build — the .next cache is corrupted. Restart the dev server." + : `Server-side data fetch failed: ${msg}`; initialRunState = null; } } - return ; + return ; } diff --git a/ergon-dashboard/src/app/runs/page.tsx b/ergon-dashboard/src/app/runs/page.tsx new file mode 100644 index 00000000..e9bbbe37 --- /dev/null +++ b/ergon-dashboard/src/app/runs/page.tsx @@ -0,0 +1,28 @@ +export default function RunsPage() { + return ( +
+
+
+ + Workspace + +

+ Runs +

+

+ Browse all runs across cohorts. Filter by status, benchmark, or time range. +

+
+
+
+
+
+

Coming soon

+

+ The cross-cohort runs view is under development. For now, access runs through individual cohort pages. +

+
+
+
+ ); +} diff --git a/ergon-dashboard/src/app/settings/page.tsx b/ergon-dashboard/src/app/settings/page.tsx new file mode 100644 index 00000000..4a7343a5 --- /dev/null +++ b/ergon-dashboard/src/app/settings/page.tsx @@ -0,0 +1,28 @@ +export default function SettingsPage() { + return ( +
+
+
+ + Workspace + +

+ Settings +

+

+ Configure workspace preferences, API keys, and notification thresholds. +

+
+
+
+
+
+

Coming soon

+

+ Workspace settings are under development. +

+
+
+
+ ); +} diff --git a/ergon-dashboard/src/app/training/page.tsx b/ergon-dashboard/src/app/training/page.tsx index e0d3f146..3af3e48a 100644 --- a/ergon-dashboard/src/app/training/page.tsx +++ b/ergon-dashboard/src/app/training/page.tsx @@ -9,6 +9,7 @@ import { TrainingMetricsChart, type MetricPoint, } from "@/components/charts/TrainingMetricsChart"; +import { formatClockTime } from "@/lib/timeFormat"; interface TrainingSessionSummary { id: string; @@ -119,7 +120,7 @@ export default function TrainingPage() { {sessions.map((s) => ( ))} diff --git a/ergon-dashboard/src/components/cohorts/CohortDetailView.tsx b/ergon-dashboard/src/components/cohorts/CohortDetailView.tsx index d9fe70b1..c071391d 100644 --- a/ergon-dashboard/src/components/cohorts/CohortDetailView.tsx +++ b/ergon-dashboard/src/components/cohorts/CohortDetailView.tsx @@ -1,9 +1,10 @@ "use client"; import Link from "next/link"; +import { useState } from "react"; import { useCohortDetail } from "@/hooks/useCohortDetail"; -import { CohortRunRow, RunLifecycleStatus } from "@/lib/types"; +import { CohortRunRow, CohortSummary, RunLifecycleStatus } from "@/lib/types"; import { StatusBadge } from "@/components/common/StatusBadge"; import { getCohortDisplayStatus } from "@/lib/cohortStatus"; import { CohortDetail } from "@/lib/types"; @@ -14,6 +15,11 @@ function formatScore(score: number | null | undefined): string { return `${(score * 100).toFixed(1)}%`; } +function formatCost(value: number | null): string { + if (value == null) return "—"; + return `$${value.toFixed(2)}`; +} + const startedAtDisplayFormatter = new Intl.DateTimeFormat(undefined, { dateStyle: "medium", timeStyle: "short", @@ -26,43 +32,305 @@ function formatStartedAt(iso: string | null): { text: string; dateTime: string | return { text: startedAtDisplayFormatter.format(d), dateTime: iso }; } +/* ────────────────────────────────────────────────────────── */ +/* Metric Tiles */ +/* ────────────────────────────────────────────────────────── */ + 
+function MetricTile({ + title, + value, + sub, + children, +}: { + title: string; + value: string; + sub?: string; + children?: React.ReactNode; +}) { + return ( +
+
+ {title} +
+
+ {value} +
+ {sub && ( +
{sub}
+ )} + {children} +
+ ); +} + +interface CohortDetailStats { + averageCostUsd: number | null; + averageTasks: number | null; + completed: number; + failed: number; + scores: number[]; + totalCostUsd: number | null; + totalRuns: number; +} + +function buildDetailStats(summary: CohortSummary, runs: CohortRunRow[]): CohortDetailStats { + const totalRuns = runs.length || summary.total_runs; + const completed = + runs.length > 0 + ? runs.filter((run) => run.status === "completed").length + : summary.status_counts.completed; + const failed = + runs.length > 0 + ? runs.filter((run) => run.status === "failed").length + : summary.status_counts.failed; + const scores = runs + .map((run) => run.final_score) + .filter((score): score is number => score !== null); + const taskCounts = runs + .map((run) => run.total_tasks) + .filter((count): count is number => count !== null); + const costs = runs + .map((run) => run.total_cost_usd) + .filter((cost): cost is number => cost !== null); + const totalCostUsd = costs.length > 0 ? costs.reduce((sum, cost) => sum + cost, 0) : null; + + return { + averageCostUsd: costs.length > 0 && totalCostUsd !== null ? totalCostUsd / costs.length : null, + averageTasks: + taskCounts.length > 0 + ? taskCounts.reduce((sum, count) => sum + count, 0) / taskCounts.length + : null, + completed, + failed, + scores, + totalCostUsd, + totalRuns, + }; +} + +function ResolutionTile({ stats }: { stats: CohortDetailStats }) { + const total = stats.totalRuns; + const completed = stats.completed; + const pct = total > 0 ? Math.round((completed / total) * 100) : 0; + + return ( + + ); +} + +function RunsPassFailTile({ stats }: { stats: CohortDetailStats }) { + const completed = stats.completed; + const failed = stats.failed; + const total = stats.totalRuns; + const greenPct = total > 0 ? (completed / total) * 100 : 0; + const redPct = total > 0 ? (failed / total) * 100 : 0; + + return ( + +
+ {greenPct > 0 && ( +
+ )} + {redPct > 0 && ( +
+ )} +
+ + ); +} + +type DistributionMetric = "score" | "runtime" | "tasks" | "cost"; + +const distributionMetrics: Array<{ key: DistributionMetric; label: string }> = [ + { key: "score", label: "Score" }, + { key: "runtime", label: "Runtime" }, + { key: "tasks", label: "Tasks" }, + { key: "cost", label: "Cost" }, +]; + +function metricValue(run: CohortRunRow, metric: DistributionMetric): number | null { + switch (metric) { + case "score": + return run.final_score; + case "runtime": + return run.running_time_ms; + case "tasks": + return run.total_tasks; + case "cost": + return run.total_cost_usd; + } +} + +function formatMetricValue(metric: DistributionMetric, value: number): string { + switch (metric) { + case "score": + return formatScore(value); + case "runtime": + return formatDurationMs(value); + case "tasks": + return value.toFixed(0); + case "cost": + return formatCost(value); + } +} + +function RunDistribution({ cohortId, runs }: { cohortId: string; runs: CohortRunRow[] }) { + const [selectedMetric, setSelectedMetric] = useState("score"); + const selectedLabel = + distributionMetrics.find((metric) => metric.key === selectedMetric)?.label ?? "Score"; + const points = runs + .map((run, index) => ({ + index, + run, + value: metricValue(run, selectedMetric), + })) + .filter((point): point is { index: number; run: CohortRunRow; value: number } => point.value !== null); + const values = points.map((point) => point.value); + const min = selectedMetric === "score" ? 0 : Math.min(...values); + const max = selectedMetric === "score" ? 1 : Math.max(...values); + + function leftPct(value: number): number { + if (values.length === 0 || min === max) return 50; + return ((value - min) / (max - min)) * 100; + } + + return ( +
+
+
+

+ {selectedLabel} distribution +

+

+ One dot per run. Use the metric controls to spot slow, costly, or unusually large runs. +

+
+
+ {distributionMetrics.map((metric) => ( + + ))} +
+
+ + {points.length === 0 ? ( +
+ No {selectedLabel.toLowerCase()} values are available yet. +
+ ) : ( +
+
+
+ {points.map((point) => { + const valueLabel = formatMetricValue(selectedMetric, point.value); + return ( + + + {point.run.run_id} {selectedLabel} {valueLabel} + + + ); + })} +
+
+ {formatMetricValue(selectedMetric, min)} + {points.length} run{points.length === 1 ? "" : "s"} + {formatMetricValue(selectedMetric, max)} +
+
+ )} +
+ ); +} + +/* ────────────────────────────────────────────────────────── */ +/* Run Row */ +/* ────────────────────────────────────────────────────────── */ + function CohortRunRowCard({ cohortId, run }: { cohortId: string; run: CohortRunRow }) { const started = formatStartedAt(run.started_at); return (
- + {run.run_id}
-
+
{run.cohort_name} {run.run_id.slice(0, 8)}...
{run.error_message && ( -
{run.error_message}
+
+ {run.error_message} +
)}
-
Benchmark
-
{run.cohort_name}
+
Benchmark
+
{run.cohort_name}
-
Status
-
{run.status}
+
Status
+
{run.status}
-
Started
-
+
Started
+
{started.dateTime ? (
-
Runtime
-
+
Runtime
+
{formatDurationMs(run.running_time_ms)}
-
Score
-
+
Score
+
{formatScore(run.final_score)}
@@ -88,6 +356,36 @@ function CohortRunRowCard({ cohortId, run }: { cohortId: string; run: CohortRunR ); } +/* ────────────────────────────────────────────────────────── */ +/* Empty State */ +/* ────────────────────────────────────────────────────────── */ + +function EmptyRunsState() { + return ( +
+ +

No runs yet

+

+ This cohort has no runs. Launch a benchmark run targeting this cohort to get started. +

+ +
+ ); +} + +/* ────────────────────────────────────────────────────────── */ +/* Main View */ +/* ────────────────────────────────────────────────────────── */ + export function CohortDetailView({ cohortId, initialDetail = null, @@ -99,7 +397,7 @@ export function CohortDetailView({ if (isLoading) { return ( -
+
Loading cohort...
); @@ -107,108 +405,92 @@ export function CohortDetailView({ if (!detail) { return ( -
+
{error ?? "Cohort not found"}
); } const { summary, runs } = detail; + const stats = buildDetailStats(summary, runs); return ( -
-
+
+
- Experiment Cohorts + Cohorts
-

+

{summary.name}

-

+

{summary.description ?? "Monitor cohort progress, inspect runs, and drill into task-level evidence."}

-
-
Model
-
- {summary.metadata_summary.model_name ?? "—"} -
-
{error && ( -
+
{error}
)} -
-
-
Total runs
-
- {summary.total_runs} -
-
-
-
Executing
-
- {summary.status_counts.executing} -
-
-
-
Completed
-
- {summary.status_counts.completed} -
-
-
-
Failed
-
- {summary.status_counts.failed} -
-
-
-
Average score
-
- {formatScore(summary.average_score)} -
-
-
-
Failure rate
-
- {formatScore(summary.failure_rate)} -
-
+ {/* 5-tile summary row */} +
+ + + + +
+ + + {/* Runs section */}
-

Runs

-

+

Runs

+

Select a run to inspect graph topology and task workspace evidence.

-
- {runs.map((run) => ( - - ))} -
+ {runs.length === 0 ? ( + + ) : ( +
+ {runs.map((run) => ( + + ))} +
+ )}
diff --git a/ergon-dashboard/src/components/cohorts/CohortListView.tsx b/ergon-dashboard/src/components/cohorts/CohortListView.tsx index 62905fb8..0dd02222 100644 --- a/ergon-dashboard/src/components/cohorts/CohortListView.tsx +++ b/ergon-dashboard/src/components/cohorts/CohortListView.tsx @@ -5,7 +5,6 @@ import { useMemo, useState } from "react"; import { useCohorts } from "@/hooks/useCohorts"; import { StatusBadge } from "@/components/common/StatusBadge"; -import { SearchInput } from "@/components/common/SearchInput"; import { getCohortDisplayStatus } from "@/lib/cohortStatus"; import { CohortSummary } from "@/lib/types"; @@ -101,122 +100,49 @@ function sortCohorts(cohorts: CohortSummary[], sortKey: SortKey): CohortSummary[ return sorted; } -function QuickFilterButton({ - label, - count, - active, - onClick, -}: { - label: string; - count: number; - active: boolean; - onClick: () => void; -}) { - return ( - - ); -} - -function ArchiveActionButton({ - cohort, - isUpdating, - onToggle, -}: { - cohort: CohortSummary; - isUpdating: boolean; - onToggle: (cohort: CohortSummary) => Promise; -}) { - const isArchived = cohort.status === "archived"; - - return ( - - ); -} - -function SummaryCard({ - label, +function SegmentedControl({ + options, value, - helper, + onChange, }: { - label: string; - value: string | number; - helper: string; + options: { key: T; label: string; count?: number }[]; + value: T; + onChange: (key: T) => void; }) { return ( -
-
{label}
-
{value}
-
{helper}
+
+ {options.map((opt) => ( + + ))}
); } -function ProgressBar({ cohort }: { cohort: CohortSummary }) { - const total = Math.max(cohort.total_runs, 1); - const segments = [ - { - key: "completed", - value: cohort.status_counts.completed, - className: "bg-emerald-500", - }, - { - key: "failed", - value: cohort.status_counts.failed, - className: "bg-red-500", - }, - { - key: "executing", - value: cohort.status_counts.executing + cohort.status_counts.evaluating, - className: "bg-blue-500", - }, - { - key: "pending", - value: cohort.status_counts.pending, - className: "bg-gray-300 dark:bg-gray-700", - }, - ].filter((segment) => segment.value > 0); +function failureColor(rate: number): string { + if (rate > 0.30) return "oklch(0.50 0.16 22)"; + if (rate > 0.15) return "oklch(0.50 0.10 80)"; + return "var(--muted)"; +} - return ( -
-
- Run progress - - {cohort.status_counts.completed + cohort.status_counts.failed}/{cohort.total_runs} finished - -
-
- {segments.map((segment) => ( -
- ))} -
-
- ); +function formatTimeHHMMSS(): string { + const now = new Date(); + return [now.getHours(), now.getMinutes(), now.getSeconds()] + .map((n) => String(n).padStart(2, "0")) + .join(":"); } export function CohortListView() { @@ -243,10 +169,6 @@ export function CohortListView() { [cohorts], ); - const totalRuns = useMemo( - () => cohorts.reduce((sum, cohort) => sum + cohort.total_runs, 0), - [cohorts], - ); const activeCohorts = useMemo( () => visibleCohortList.filter((cohort) => cohort.status === "active").length, [visibleCohortList], @@ -279,387 +201,203 @@ export function CohortListView() { } }; + // Suppress unused-var lint — archive toggle is still wired but hidden in the new grid rows + void updatingCohortIds; + void handleArchiveToggle; + if (isLoading) { return ( -
+
Loading cohorts...
); } return ( -
+
-
-
-

- Ergon Dashboard -

-

- Experiment Cohorts -

-

- Monitor cohorts first, then drill into runs and task workspaces from the same - operator surface. -

-
-
-
Visible cohorts
-
- {visibleCohorts} -
-
+
+ + Workspace + +

+ Cohorts +

+

+ Monitor cohorts first, then drill into runs and task workspaces from the same + operator surface. +

-
+
{error && (
{error}
)} - {cohorts.length > 0 && ( -
- - - - -
- )} - {cohorts.length === 0 ? (
-

+

No cohorts yet

-

+

Start a benchmark run with a compulsory cohort name to create the first cohort.

) : (
-
-
-
-
- Find the right cohort faster -
-
- Search by cohort, model, benchmark, prompt version, creator, or description. -
-
-
- Showing {filteredCohorts.length} of{" "} - {statusFilter === "archived" ? archivedCohorts : visibleCohorts} cohorts -
-
- -
- setStatusFilter("all")} - /> - setStatusFilter("needs-attention")} - /> - setStatusFilter("running")} - /> - setStatusFilter("active")} - /> - setStatusFilter("archived")} - /> -
- -
- +
+ setQuery(e.target.value)} + placeholder="Filter cohorts…" + className="w-[220px] rounded-[var(--radius-sm)] border border-[var(--line)] bg-[var(--paper)] px-3 py-1.5 text-xs text-[var(--ink)] placeholder:text-[var(--faint)] focus:border-[var(--accent)] focus:outline-none" + data-testid="cohort-search-input" /> - -
-
+ + value={statusFilter} + onChange={setStatusFilter} + options={[ + { key: "all", label: "All", count: visibleCohorts }, + { key: "active", label: "Active", count: activeCohorts }, + { key: "running", label: "Running", count: runningCohorts }, + { key: "needs-attention", label: "Needs attention", count: cohortsNeedingAttention }, + { key: "archived", label: "Archived", count: archivedCohorts }, + ]} + /> + + value={sortKey} + onChange={setSortKey} + options={[ + { key: "recent", label: "Recent" }, + { key: "score", label: "Score" }, + { key: "failure", label: "Failure rate" }, + { key: "runs", label: "Runs" }, + ]} + /> +
{filteredCohorts.length === 0 ? ( -
-

+
+

No cohorts match these filters

-

+

Try clearing the search, changing the status filter, or sorting by a different signal.

) : ( - <> -
-
Cohort
-
Runs
-
Running
-
Completed
-
Failure rate
-
Avg score
-
Latest activity
-
Actions
+
+ {/* Table header */} +
+ {["Cohort", "Runs", "Avg score", "Failure", "Runtime", "Status", ""].map( + (col) => ( +
+ {col} +
+ ), + )}
+ {/* Table rows */} {filteredCohorts.map((cohort) => ( -
-
-
-
- - {cohort.name} - - - {cohort.status_counts.failed > 0 && ( - - Needs attention - - )} -
- - {cohort.description && ( -

- {cohort.description} -

- )} - -
- - Model: {cohort.metadata_summary.model_name ?? "—"} - - - By: {cohort.created_by ?? "Unknown"} - - - Avg runtime: {formatDurationMs(cohort.average_duration_ms)} - -
- -
- -
-
- -
- {cohort.total_runs} -
-
- {cohort.status_counts.executing + cohort.status_counts.evaluating} -
-
- {cohort.status_counts.completed} -
-
- {formatPercent(cohort.failure_rate)} + {/* Cohort name + sub ID */} +
+
+ {cohort.name}
-
- {formatPercent(cohort.average_score)} -
-
-
- {formatRelativeTime(getLatestActivityAt(cohort))} -
-
{new Date(cohort.created_at).toLocaleDateString()}
-
-
- - Open - - +
+ {cohort.cohort_id.slice(0, 12)}
-
-
-
-

- {cohort.name} -

- - {cohort.status_counts.failed > 0 && ( - - Needs attention - - )} -
- - {cohort.description && ( -

- {cohort.description} -

- )} - -
- - Model: {cohort.metadata_summary.model_name ?? "—"} - - - Created by: {cohort.created_by ?? "Unknown"} - - - Latest activity: {formatRelativeTime(getLatestActivityAt(cohort))} - - - Created: {new Date(cohort.created_at).toLocaleDateString()} - -
- -
- -
-
+ {/* Runs */} +
+ {cohort.total_runs} +
-
-
-
Runs
-
- {cohort.total_runs} -
-
-
-
Completed
-
- {cohort.status_counts.completed} -
-
-
-
Running
-
- {cohort.status_counts.executing + cohort.status_counts.evaluating} -
-
-
-
Failure rate
-
- {formatPercent(cohort.failure_rate)} -
-
-
-
Avg score
-
- {formatPercent(cohort.average_score)} -
-
-
-
Avg runtime
-
- {formatDurationMs(cohort.average_duration_ms)} -
-
-
- -
- - Open cohort - - -
+ {/* Avg score */} +
+ {formatPercent(cohort.average_score)} +
+ + {/* Failure rate */} +
+ {formatPercent(cohort.failure_rate)} +
+ + {/* Runtime · last activity */} +
+ {formatDurationMs(cohort.average_duration_ms)} + + · {formatRelativeTime(getLatestActivityAt(cohort))} +
-
+ + {/* Status */} +
+ +
+ + {/* Chevron */} +
+ ))} - + + {/* Footer */} +
+ + Showing {filteredCohorts.length} of{" "} + {statusFilter === "archived" ? archivedCohorts : visibleCohorts} cohorts + + + Updated {formatTimeHHMMSS()} · live + + +
+
)}
)} diff --git a/ergon-dashboard/src/components/common/BuildHealthToast.tsx b/ergon-dashboard/src/components/common/BuildHealthToast.tsx new file mode 100644 index 00000000..76fed334 --- /dev/null +++ b/ergon-dashboard/src/components/common/BuildHealthToast.tsx @@ -0,0 +1,92 @@ +"use client"; + +import { useState } from "react"; +import { useBuildHealth } from "@/hooks/useBuildHealth"; + +export function BuildHealthToast() { + const { status, errors, check } = useBuildHealth(); + const [dismissed, setDismissed] = useState(false); + + if (status !== "degraded" || dismissed) return null; + + const hasSSRFailure = errors.some( + (e) => e.includes("SSR import") || e.includes("Cannot find module"), + ); + const hasApiFailure = errors.some((e) => e.includes("Ergon API")); + + let headline: string; + let advice: string; + + if (hasSSRFailure) { + headline = "Stale build detected"; + advice = + "The Next.js dev server has a corrupted cache. " + + "Run: rm -rf .next && docker compose restart dashboard"; + } else if (hasApiFailure) { + headline = "Backend API unreachable"; + advice = + "The Ergon API is not responding. Check that the API container is running: " + + "docker compose ps api"; + } else { + headline = "Dashboard health degraded"; + advice = errors[0] ?? "Unknown issue — check server logs."; + } + + return ( +
+ + + + +
+

+ {headline} +

+

{advice}

+ {errors.length > 1 && ( +
+ + {errors.length} details + +
    + {errors.map((e, i) => ( +
  • {e}
  • + ))} +
+
+ )} +
+ +
+ + +
+
+ ); +} diff --git a/ergon-dashboard/src/components/common/ClientLayout.tsx b/ergon-dashboard/src/components/common/ClientLayout.tsx index 93938c5c..4eb77423 100644 --- a/ergon-dashboard/src/components/common/ClientLayout.tsx +++ b/ergon-dashboard/src/components/common/ClientLayout.tsx @@ -1,13 +1,8 @@ "use client"; -/** - * ClientLayout - Client-side layout wrapper. - * - * Includes components that need client-side functionality, - * like the ConnectionStatus banner. - */ - +import { BuildHealthToast } from "./BuildHealthToast"; import { ConnectionStatus } from "./ConnectionStatus"; +import { Topbar } from "@/components/shell/Topbar"; interface ClientLayoutProps { children: React.ReactNode; @@ -15,9 +10,11 @@ interface ClientLayoutProps { export function ClientLayout({ children }: ClientLayoutProps) { return ( - <> - {children} +
+ +
{children}
- + +
); } diff --git a/ergon-dashboard/src/components/common/StatusBadge.tsx b/ergon-dashboard/src/components/common/StatusBadge.tsx index 38004bdf..6ab5abc9 100644 --- a/ergon-dashboard/src/components/common/StatusBadge.tsx +++ b/ergon-dashboard/src/components/common/StatusBadge.tsx @@ -1,175 +1,149 @@ "use client"; -/** - * StatusBadge - Color-coded status indicator for tasks and runs. - * - * Displays task status with appropriate colors: - * - pending: gray - * - ready: blue - * - running: yellow (with pulse animation) - * - completed: green - * - failed: red - */ - import { ExperimentCohortStatus, RunLifecycleStatus, TaskStatus } from "@/lib/types"; -// Status type includes TaskStatus enum values and run-level status strings -// Note: TaskStatus.RUNNING = "running", TaskStatus.COMPLETED = "completed", etc. -// So "running" | "completed" | "failed" are already covered by TaskStatus type StatusType = TaskStatus | RunLifecycleStatus | ExperimentCohortStatus; -interface StatusBadgeProps { - status: StatusType; - size?: "sm" | "md"; - showLabel?: boolean; -} - interface StatusConfig { - bg: string; - text: string; - ring: string; label: string; + dot: string; + solidBg: string; + solidBorder: string; + solidText: string; animate?: boolean; - color: string; } const statusConfig: Record = { [TaskStatus.PENDING]: { - bg: "bg-gray-100 dark:bg-gray-800", - text: "text-gray-600 dark:text-gray-400", - ring: "ring-gray-200 dark:ring-gray-700", label: "Pending", - color: "#9ca3af", + dot: "var(--status-pending)", + solidBg: "var(--paper-2)", + solidBorder: "var(--line)", + solidText: "var(--muted)", }, [TaskStatus.READY]: { - bg: "bg-blue-100 dark:bg-blue-900/30", - text: "text-blue-600 dark:text-blue-400", - ring: "ring-blue-200 dark:ring-blue-800", label: "Ready", - color: "#3b82f6", + dot: "var(--status-ready)", + solidBg: "oklch(0.97 0.03 240)", + solidBorder: "oklch(0.86 0.08 240)", + solidText: "oklch(0.40 0.12 240)", }, [TaskStatus.RUNNING]: { - bg: "bg-yellow-100 
dark:bg-yellow-900/30", - text: "text-yellow-700 dark:text-yellow-400", - ring: "ring-yellow-200 dark:ring-yellow-800", label: "Running", + dot: "var(--status-running)", + solidBg: "oklch(0.96 0.04 80)", + solidBorder: "oklch(0.85 0.10 80)", + solidText: "oklch(0.42 0.12 65)", animate: true, - color: "#eab308", }, [TaskStatus.COMPLETED]: { - bg: "bg-green-100 dark:bg-green-900/30", - text: "text-green-600 dark:text-green-400", - ring: "ring-green-200 dark:ring-green-800", label: "Completed", - color: "#22c55e", + dot: "var(--status-completed)", + solidBg: "oklch(0.96 0.04 155)", + solidBorder: "oklch(0.85 0.10 155)", + solidText: "oklch(0.40 0.12 155)", }, [TaskStatus.FAILED]: { - bg: "bg-red-100 dark:bg-red-900/30", - text: "text-red-600 dark:text-red-400", - ring: "ring-red-200 dark:ring-red-800", label: "Failed", - color: "#ef4444", + dot: "var(--status-failed)", + solidBg: "oklch(0.96 0.04 22)", + solidBorder: "oklch(0.85 0.10 22)", + solidText: "oklch(0.40 0.16 22)", }, [TaskStatus.CANCELLED]: { - bg: "bg-gray-100 dark:bg-gray-800", - text: "text-gray-500 dark:text-gray-400", - ring: "ring-gray-200 dark:ring-gray-700", label: "Cancelled", - color: "#9ca3af", + dot: "var(--status-cancelled)", + solidBg: "var(--paper-2)", + solidBorder: "var(--line)", + solidText: "var(--muted)", }, executing: { - bg: "bg-yellow-100 dark:bg-yellow-900/30", - text: "text-yellow-700 dark:text-yellow-400", - ring: "ring-yellow-200 dark:ring-yellow-800", label: "Executing", + dot: "var(--status-running)", + solidBg: "oklch(0.96 0.04 80)", + solidBorder: "oklch(0.85 0.10 80)", + solidText: "oklch(0.42 0.12 65)", animate: true, - color: "#eab308", }, evaluating: { - bg: "bg-violet-100 dark:bg-violet-900/30", - text: "text-violet-700 dark:text-violet-400", - ring: "ring-violet-200 dark:ring-violet-800", label: "Evaluating", + dot: "oklch(0.74 0.16 295)", + solidBg: "oklch(0.96 0.04 295)", + solidBorder: "oklch(0.85 0.10 295)", + solidText: "oklch(0.40 0.16 295)", animate: true, - 
color: "#8b5cf6", }, active: { - bg: "bg-blue-100 dark:bg-blue-900/30", - text: "text-blue-700 dark:text-blue-400", - ring: "ring-blue-200 dark:ring-blue-800", label: "Active", - color: "#3b82f6", + dot: "var(--status-ready)", + solidBg: "oklch(0.97 0.03 240)", + solidBorder: "oklch(0.86 0.08 240)", + solidText: "oklch(0.40 0.12 240)", }, archived: { - bg: "bg-gray-100 dark:bg-gray-800", - text: "text-gray-600 dark:text-gray-400", - ring: "ring-gray-200 dark:ring-gray-700", label: "Archived", - color: "#9ca3af", + dot: "var(--status-cancelled)", + solidBg: "var(--paper-2)", + solidBorder: "var(--line)", + solidText: "var(--muted)", }, }; -// Default config for unknown statuses const defaultConfig: StatusConfig = { - bg: "bg-gray-100 dark:bg-gray-800", - text: "text-gray-600 dark:text-gray-400", - ring: "ring-gray-200 dark:ring-gray-700", label: "Unknown", - color: "#9ca3af", + dot: "var(--faint)", + solidBg: "var(--paper-2)", + solidBorder: "var(--line)", + solidText: "var(--muted)", }; +interface StatusBadgeProps { + status: StatusType; + variant?: "outline" | "solid"; + size?: "sm" | "md"; + showLabel?: boolean; +} + export function StatusBadge({ status, + variant = "solid", size = "md", showLabel = true, }: StatusBadgeProps) { + const sizeClass = size === "sm" ? "text-[10px] px-1.5 py-px" : "text-[11px] px-2 py-0.5"; const config = statusConfig[status] || defaultConfig; - const sizeClasses = { - sm: { - badge: "px-1.5 py-0.5 text-xs", - dot: "w-1.5 h-1.5", - }, - md: { - badge: "px-2 py-1 text-sm", - dot: "w-2 h-2", - }, - }; - - const sizes = sizeClasses[size]; - - return ( - - {/* Status dot */} - + if (variant === "outline") { + return ( + - {config.animate && ( - - )} + {showLabel && {config.label}} + ); + } - {/* Label */} + return ( + + {showLabel && {config.label}} ); } -/** - * Compact dot-only status indicator for use in tight spaces. 
- */ export function StatusDot({ status, size = "md", @@ -178,23 +152,18 @@ export function StatusDot({ size?: "sm" | "md" | "lg"; }) { const config = statusConfig[status] || defaultConfig; - - const sizeClasses = { - sm: "w-2 h-2", - md: "w-3 h-3", - lg: "w-4 h-4", - }; + const sizeClasses = { sm: "size-2", md: "size-3", lg: "size-4" }; return ( {config.animate && ( )} diff --git a/ergon-dashboard/src/components/common/TransitionChip.tsx b/ergon-dashboard/src/components/common/TransitionChip.tsx index c37a81dc..b519efc5 100644 --- a/ergon-dashboard/src/components/common/TransitionChip.tsx +++ b/ergon-dashboard/src/components/common/TransitionChip.tsx @@ -13,6 +13,7 @@ import { TaskStatus, TaskTrigger } from "@/lib/types"; import { tokensFor } from "@/lib/statusTokens"; +import { formatClockTimeMs } from "@/lib/timeFormat"; const TRIGGER_LABELS: Record = { [TaskTrigger.WORKFLOW_STARTED]: "workflow started", @@ -26,16 +27,8 @@ const TRIGGER_LABELS: Record = { function formatTimeMs(iso: string | null): string { if (!iso) return "—"; - try { - return new Date(iso).toLocaleTimeString("en-GB", { - hour: "2-digit", - minute: "2-digit", - second: "2-digit", - fractionalSecondDigits: 3, - }); - } catch { - return iso; - } + const label = formatClockTimeMs(iso); + return label === "—" ? 
iso : label; } interface TransitionChipProps { diff --git a/ergon-dashboard/src/components/dag/DAGCanvas.tsx b/ergon-dashboard/src/components/dag/DAGCanvas.tsx index 4815ec0d..ef653852 100644 --- a/ergon-dashboard/src/components/dag/DAGCanvas.tsx +++ b/ergon-dashboard/src/components/dag/DAGCanvas.tsx @@ -5,23 +5,22 @@ * * Features: * - Hierarchical dagre layout with nested container rendering - * - Depth-based expansion control via DepthSelector + * - Depth-based expansion control via floating controls * - Search/filter tasks by name * - Live updates via useRunState hook * - Zoom/pan controls */ -import { useCallback, useEffect, useState, useMemo } from "react"; +import { useCallback, useEffect, useState, useMemo, useRef } from "react"; import { ReactFlow, Edge, Background, - Controls, MiniMap, useNodesState, useEdgesState, + useReactFlow, ConnectionLineType, - Panel, BackgroundVariant, } from "@xyflow/react"; import "@xyflow/react/dist/style.css"; @@ -29,7 +28,6 @@ import "@xyflow/react/dist/style.css"; import { TaskStatus, type WorkflowRunState } from "@/lib/types"; import { nodeTypes, type TaskNodeType } from "./TaskNode"; import { GraphDependencyEdge } from "./edges/GraphDependencyEdge"; -import { DepthSelector } from "@/features/graph/components/DepthSelector"; import { GraphExpansionProvider } from "@/features/graph/hooks/useGraphExpansion"; import { computeHierarchicalLayout, calculateExpandedContainers } from "@/features/graph/layout/hierarchicalLayout"; import { DEFAULT_EXPANDED_DEPTH } from "@/features/graph/layout/layoutTypes"; @@ -44,6 +42,7 @@ interface DAGCanvasProps { isSubscribed?: boolean; onTaskClick?: (taskId: string) => void; selectedTaskId?: string | null; + highlightedTaskIds?: ReadonlySet; } /** @@ -71,6 +70,139 @@ function getMinimapNodeColor(node: TaskNodeType): string { } } +/* ─── Floating control cards ────────────────────────────────────── */ + +const cardClass = + "bg-[var(--card)] border border-[var(--line)] rounded-lg shadow-card"; + 
+function ZoomControls() { + const { zoomIn, zoomOut, fitView } = useReactFlow(); + const btn = + "flex items-center justify-center w-7 h-7 text-sm font-semibold text-[var(--muted)] hover:text-[var(--ink)] hover:bg-[var(--paper)] rounded transition-colors"; + return ( +
+ + + + + +
+ ); +} + +function DepthSelectorCard({ + maxAvailableDepth, + currentDepth, + onDepthChange, +}: { + maxAvailableDepth: number; + currentDepth: number | "all"; + onDepthChange: (depth: number | "all") => void; +}) { + const depths: (number | "all")[] = []; + for (let i = 1; i <= Math.min(maxAvailableDepth, 3); i++) depths.push(i); + depths.push("all"); + + return ( +
+ + Depth + +
+ {depths.map((d) => { + const isActive = currentDepth === d; + return ( + + ); + })} +
+
+ ); +} + +function SearchCard({ + searchQuery, + onSearchChange, + matchCount, +}: { + searchQuery: string; + onSearchChange: (value: string) => void; + matchCount: number; +}) { + return ( +
+ + Search + + + {searchQuery && ( + + {matchCount} + + )} +
+ ); +} + +const LEGEND_ITEMS: { status: string; label: string; cssVar: string }[] = [ + { status: "completed", label: "completed", cssVar: "var(--status-completed)" }, + { status: "running", label: "running", cssVar: "var(--status-running)" }, + { status: "ready", label: "ready", cssVar: "var(--status-ready)" }, + { status: "pending", label: "pending", cssVar: "var(--status-pending)" }, + { status: "failed", label: "failed", cssVar: "var(--status-failed)" }, +]; + +function LegendCard() { + return ( +
+ {LEGEND_ITEMS.map((item) => ( +
+ + + {item.label} + +
+ ))} +
+ ); +} + +/* ─── Main canvas ───────────────────────────────────────────────── */ + function DAGCanvasInner({ runId, runState, @@ -79,6 +211,7 @@ function DAGCanvasInner({ isSubscribed = false, onTaskClick, selectedTaskId, + highlightedTaskIds = new Set(), }: DAGCanvasProps) { const [expandedDepth, setExpandedDepth] = useState(DEFAULT_EXPANDED_DEPTH); const [manualExpansions, setManualExpansions] = useState>(new Set()); @@ -87,6 +220,8 @@ function DAGCanvasInner({ const [edges, setEdges, onEdgesChange] = useEdgesState([]); const [containerDims, setContainerDims] = useState>(new Map()); const [prevTaskIds, setPrevTaskIds] = useState>(new Set()); + const { fitView: rfFitView } = useReactFlow(); + const fitViewTimer = useRef | null>(null); const newNodeIds = useMemo(() => { if (!runState?.tasks) return new Set(); @@ -104,7 +239,6 @@ function DAGCanvasInner({ } }, [runState?.tasks]); - // Compute max available depth from tasks const maxAvailableDepth = useMemo(() => { if (!runState?.tasks) return 0; let max = 0; @@ -114,12 +248,10 @@ function DAGCanvasInner({ return max; }, [runState?.tasks]); - // Compute expanded containers from depth + manual overrides const expandedContainers = useMemo(() => { if (!runState?.tasks) return new Set(); const maxDepth = expandedDepth === "all" ? 
Infinity : expandedDepth; const fromDepth = calculateExpandedContainers(runState.tasks, maxDepth); - // Merge manual expansions (toggled individually) for (const id of manualExpansions) { if (fromDepth.has(id)) { fromDepth.delete(id); @@ -133,7 +265,6 @@ function DAGCanvasInner({ return fromDepth; }, [runState?.tasks, expandedDepth, manualExpansions]); - // Calculate matching node count const matchCount = useMemo(() => { if (!searchQuery.trim() || !runState?.tasks) return 0; const searchLower = searchQuery.toLowerCase().trim(); @@ -150,7 +281,6 @@ function DAGCanvasInner({ return count; }, [searchQuery, runState?.tasks]); - // Compute hierarchical layout when data changes useEffect(() => { if (!runState?.tasks || runState.tasks.size === 0) return; @@ -162,11 +292,17 @@ function DAGCanvasInner({ selectedTaskId, "LR", newNodeIds, + highlightedTaskIds, ); setNodes(result.nodes as TaskNodeType[]); setEdges(result.edges); setContainerDims(result.containerDimensions); + + if (fitViewTimer.current) clearTimeout(fitViewTimer.current); + fitViewTimer.current = setTimeout(() => { + rfFitView({ padding: 0.2, duration: 200 }); + }, 100); }, [ runState?.tasks, expandedContainers, @@ -174,17 +310,17 @@ function DAGCanvasInner({ onTaskClick, selectedTaskId, newNodeIds, + highlightedTaskIds, setNodes, setEdges, + rfFitView, ]); - // Handle depth change — reset manual overrides when depth changes const handleDepthChange = useCallback((depth: number | "all") => { setExpandedDepth(depth); setManualExpansions(new Set()); }, []); - // Toggle individual container expansion const toggleExpand = useCallback((taskId: string) => { setManualExpansions((prev) => { const next = new Set(prev); @@ -197,7 +333,6 @@ function DAGCanvasInner({ }); }, []); - // Handle search change const handleSearchChange = useCallback((value: string) => { setSearchQuery(value); }, []); @@ -207,25 +342,17 @@ function DAGCanvasInner({ [expandedContainers, toggleExpand, containerDims], ); - // Loading state if 
(isLoading) { return ( -
-
+
+
- + +
-
- +
+ {isNotFoundError ? (
-

+

{isNotFoundError ? "Run Data Unavailable" : "Connection Error"}

-

{error}

-

+

{error}

+

Run ID: {runId}

@@ -280,18 +401,12 @@ function DAGCanvasInner({ ); } - // Empty state if (!runState?.tasks || runState.tasks.size === 0) { return ( -
+
-
- +
+
-

+

Waiting for tasks...

-

+

{isSubscribed ? "Subscribed to run updates. Tasks will appear when the workflow starts." : "Connecting to server..."}

-

+

Run ID: {runId}

@@ -317,7 +432,7 @@ function DAGCanvasInner({ } return ( -
+
- {/* Background */} - {/* Controls */} - - - {/* MiniMap */} - - {/* Top Left: Depth Selector + Search */} - -
- -
-
- - {searchQuery && ( - - {matchCount} match{matchCount !== 1 ? "es" : ""} - - )} -
- {/* Depth selector on mobile */} -
- -
-
- - {/* Run Info Panel */} - -
-

- {runState.name} -

-
- - - {runState.status} - - {runState.durationSeconds !== null && ( - {Math.round(runState.durationSeconds)}s - )} -
-
-
+ + {/* Floating controls — top-left */} +
+ + + +
+ + {/* Floating controls — bottom-left */} +
+ +
); diff --git a/ergon-dashboard/src/components/dag/TaskNode.tsx b/ergon-dashboard/src/components/dag/TaskNode.tsx index 7527be91..dfe589c8 100644 --- a/ergon-dashboard/src/components/dag/TaskNode.tsx +++ b/ergon-dashboard/src/components/dag/TaskNode.tsx @@ -66,6 +66,7 @@ function TaskNodeComponent({ data }: NodeProps) { onClick={onClick} selected={selected} dimmed={dimmed} + highlighted={highlighted} containerWidth={dims?.width ?? 260} containerHeight={dims?.height ?? 100} layoutDirection={graphLayoutDirection} diff --git a/ergon-dashboard/src/components/panels/CommunicationPanel.tsx b/ergon-dashboard/src/components/panels/CommunicationPanel.tsx index 58f654f7..3617da3c 100644 --- a/ergon-dashboard/src/components/panels/CommunicationPanel.tsx +++ b/ergon-dashboard/src/components/panels/CommunicationPanel.tsx @@ -1,20 +1,43 @@ "use client"; +import { useEffect, useMemo, useState } from "react"; + import { CommunicationThreadState } from "@/lib/types"; +import { formatClockTimeSeconds } from "@/lib/timeFormat"; function formatTime(timestamp: string): string { - const date = new Date(timestamp); - return date.toLocaleTimeString("en-US", { - hour12: false, - hour: "2-digit", - minute: "2-digit", - second: "2-digit", - }); + return formatClockTimeSeconds(timestamp); } function speakerLabel(agentId: string): string { const suffix = agentId.split(":").pop() ?? agentId; - return suffix.replaceAll("_", " "); + return suffix.replaceAll("_", " ").replaceAll("-", " "); +} + +function threadSummary(thread: CommunicationThreadState): string { + if (thread.summary) return thread.summary; + const participants = participantLabels(thread); + if (thread.messages.length === 0) { + return participants.length > 0 + ? `Thread between ${participants.join(", ")}.` + : "No messages are visible at this point in the run."; + } + const first = thread.messages[0]; + const preview = first.content.length > 96 ? 
`${first.content.slice(0, 96)}...` : first.content; + return `${thread.messages.length} message${thread.messages.length === 1 ? "" : "s"} · ${preview}`; +} + +function participantLabels(thread: CommunicationThreadState): string[] { + const ids = new Set([thread.agentAId, thread.agentBId]); + for (const message of thread.messages) { + ids.add(message.fromAgentId); + ids.add(message.toAgentId); + } + return [...ids].filter(Boolean).map(speakerLabel); +} + +function messageAlignment(message: CommunicationThreadState["messages"][number], index: number) { + return index % 2 === 0 || message.fromAgentId === "parent" ? "justify-start" : "justify-end"; } export function CommunicationPanel({ @@ -22,54 +45,169 @@ export function CommunicationPanel({ }: { threads: CommunicationThreadState[]; }) { + const sortedThreads = useMemo( + () => + [...threads].sort( + (a, b) => + Date.parse(a.createdAt) - Date.parse(b.createdAt) || + a.topic.localeCompare(b.topic), + ), + [threads], + ); + const [selectedThreadId, setSelectedThreadId] = useState( + sortedThreads[0]?.id ?? null, + ); + + useEffect(() => { + if (sortedThreads.length === 0) { + setSelectedThreadId(null); + return; + } + if (!selectedThreadId || !sortedThreads.some((thread) => thread.id === selectedThreadId)) { + setSelectedThreadId(sortedThreads[0].id); + } + }, [selectedThreadId, sortedThreads]); + if (threads.length === 0) { return ( -
-

No communication yet

+
+

No communication threads yet

Messages will appear here as threads evolve.

); } + const selectedThread = + sortedThreads.find((thread) => thread.id === selectedThreadId) ?? sortedThreads[0] ?? null; + const selectedMessages = selectedThread + ? [...selectedThread.messages].sort((a, b) => a.sequenceNum - b.sequenceNum) + : []; + return ( -
- {threads.map((thread) => ( -
-
-
-
-
{thread.topic}
-
- {speakerLabel(thread.agentAId)} ↔ {speakerLabel(thread.agentBId)} +
+
+ {sortedThreads.map((thread) => { + const selected = thread.id === selectedThread?.id; + const participants = participantLabels(thread); + return ( +
-
- {thread.messages.map((message) => ( -
-
- {speakerLabel(message.fromAgentId)} - {formatTime(message.createdAt)} +
+ {formatTime(thread.createdAt)} + {formatTime(thread.updatedAt)} +
+ + ); + })} +
+ +
+ {selectedThread && ( + <> +
+
+
+
+ {selectedThread.topic} +
+
+ Started {formatTime(selectedThread.createdAt)} + {selectedThread.taskId ? ` · linked task ${selectedThread.taskId}` : ""} +
-

- {message.content} -

+ + {selectedMessages.length} message{selectedMessages.length === 1 ? "" : "s"} +
- ))} -
-
- ))} +

+ {threadSummary(selectedThread)} +

+
+ +
+ {selectedMessages.length === 0 && ( +
+ No messages are visible at this point in the run. +
+ )} + {selectedMessages.map((message, index) => ( +
+
+
+ + {speakerLabel(message.fromAgentId)} + + + {formatTime(message.createdAt)} + +
+

+ {message.content} +

+
+ #{message.sequenceNum} + {message.taskId && task {message.taskId}} + {message.taskExecutionId && ( + exec {message.taskExecutionId} + )} +
+
+
+ ))} +
+ + )} +
); } diff --git a/ergon-dashboard/src/components/panels/EvaluationPanel.tsx b/ergon-dashboard/src/components/panels/EvaluationPanel.tsx index d8fe29c4..66f90111 100644 --- a/ergon-dashboard/src/components/panels/EvaluationPanel.tsx +++ b/ergon-dashboard/src/components/panels/EvaluationPanel.tsx @@ -6,6 +6,20 @@ function formatPercent(score: number): string { return `${(score * 100).toFixed(1)}%`; } +function EvaluationCriteriaEmpty({ detail }: { detail: string }) { + return ( +
+

+ No evaluation criteria recorded yet +

+

{detail}

+
+ ); +} + export function EvaluationPanel({ evaluation, }: { @@ -13,10 +27,7 @@ export function EvaluationPanel({ }) { if (!evaluation) { return ( -
-

No evaluation yet

-

Judgment surfaces will update when evaluation arrives.

-
+ ); } @@ -49,34 +60,38 @@ export function EvaluationPanel({
-
- {evaluation.criterionResults.map((criterion) => ( -
-
-
-
- {criterion.stageName}: {criterion.criterionDescription} + {evaluation.criterionResults.length === 0 ? ( + + ) : ( +
+ {evaluation.criterionResults.map((criterion) => ( +
+
+
+
+ {criterion.stageName}: {criterion.criterionDescription} +
+
+ {criterion.criterionType} +
-
- {criterion.criterionType} +
+ {criterion.score} / {criterion.maxScore}
-
- {criterion.score} / {criterion.maxScore} -
+ {criterion.feedback ? ( +

+ {criterion.feedback} +

+ ) : null}
- {criterion.feedback ? ( -

- {criterion.feedback} -

- ) : null} -
- ))} -
+ ))} +
+ )}
); } diff --git a/ergon-dashboard/src/components/panels/ResourcePanel.tsx b/ergon-dashboard/src/components/panels/ResourcePanel.tsx index 61ea576a..70ff7d02 100644 --- a/ergon-dashboard/src/components/panels/ResourcePanel.tsx +++ b/ergon-dashboard/src/components/panels/ResourcePanel.tsx @@ -12,6 +12,7 @@ import { useState } from "react"; import { ResourceViewerDialog } from "@/components/viewers/ResourceViewerDialog"; import { ResourceState } from "@/lib/types"; +import { formatDate } from "@/lib/timeFormat"; interface ResourcePanelProps { resources: ResourceState[]; @@ -41,7 +42,7 @@ function formatRelativeTime(timestamp: string): string { if (diffSeconds < 60) return "just now"; if (diffSeconds < 3600) return `${Math.floor(diffSeconds / 60)}m ago`; if (diffSeconds < 86400) return `${Math.floor(diffSeconds / 3600)}h ago`; - return time.toLocaleDateString(); + return formatDate(time); } /** diff --git a/ergon-dashboard/src/components/panels/SandboxPanel.tsx b/ergon-dashboard/src/components/panels/SandboxPanel.tsx index 3f86c1ac..48eb824c 100644 --- a/ergon-dashboard/src/components/panels/SandboxPanel.tsx +++ b/ergon-dashboard/src/components/panels/SandboxPanel.tsx @@ -12,6 +12,7 @@ import { useState } from "react"; import { SandboxState, SandboxCommandState } from "@/lib/types"; +import { formatClockTimeSeconds } from "@/lib/timeFormat"; interface SandboxPanelProps { sandbox: SandboxState | undefined; @@ -30,13 +31,7 @@ function formatDuration(ms: number | null): string { * Format timestamp to time string. 
*/ function formatTime(timestamp: string): string { - const date = new Date(timestamp); - return date.toLocaleTimeString("en-US", { - hour12: false, - hour: "2-digit", - minute: "2-digit", - second: "2-digit", - }); + return formatClockTimeSeconds(timestamp); } interface CommandItemProps { diff --git a/ergon-dashboard/src/components/run/RunWorkspacePage.tsx b/ergon-dashboard/src/components/run/RunWorkspacePage.tsx index 28de8fbd..378bda4a 100644 --- a/ergon-dashboard/src/components/run/RunWorkspacePage.tsx +++ b/ergon-dashboard/src/components/run/RunWorkspacePage.tsx @@ -5,19 +5,21 @@ import { useEffect, useMemo, useRef, useState } from "react"; import { DAGCanvas } from "@/components/dag/DAGCanvas"; import { StatusBadge } from "@/components/common/StatusBadge"; -import { RunStatusBar } from "@/components/run/RunStatusBar"; + import { UnifiedEventStream } from "@/components/run/UnifiedEventStream"; import { TaskWorkspace } from "@/components/workspace/TaskWorkspace"; -import { MutationTimeline } from "@/features/graph/components/MutationTimeline"; +import { ActivityStackTimeline } from "@/features/activity/components/ActivityStackTimeline"; +import { buildRunActivities } from "@/features/activity/buildRunActivities"; +import { resolveActivitySnapshotSequence } from "@/features/activity/snapshotSequence"; +import type { RunActivity } from "@/features/activity/types"; import { parseGraphMutationDtoArray, type GraphMutationDto, } from "@/features/graph/contracts/graphMutations"; -import { replayToSequence } from "@/features/graph/state/graphMutationReducer"; +import { createReplayInitialState, replayToSequence } from "@/features/graph/state/graphMutationReducer"; import { useCohortDetail } from "@/hooks/useCohortDetail"; import { useRunState } from "@/hooks/useRunState"; import { buildRunEvents } from "@/lib/runEvents"; -import type { WorkflowRunState } from "@/lib/types"; import { CohortDetail, RunLifecycleStatus, SerializedWorkflowRunState, TaskStatus } from 
"@/lib/types"; function formatSeconds(value: number | null): string { @@ -31,75 +33,105 @@ function formatPercent(value: number | null): string { return `${(value * 100).toFixed(1)}%`; } +function nearestMutationAtOrBefore( + mutations: GraphMutationDto[], + sequence: number, +): GraphMutationDto | null { + let selected: GraphMutationDto | null = null; + for (const mutation of mutations) { + if (mutation.sequence > sequence) break; + selected = mutation; + } + return selected ?? mutations[0] ?? null; +} + export function RunWorkspacePage({ runId, cohortId, initialRunState = null, initialCohortDetail = null, + ssrError = null, }: { runId: string; cohortId?: string; initialRunState?: SerializedWorkflowRunState | null; initialCohortDetail?: CohortDetail | null; + ssrError?: string | null; }) { const [selectedTaskId, setSelectedTaskId] = useState(null); + const [selectedActivityId, setSelectedActivityId] = useState(null); const [selectionNotice, setSelectionNotice] = useState(null); const [statusFilter, setStatusFilter] = useState(null); - const [isStreamOpen, setIsStreamOpen] = useState(true); + const [isStreamOpen, setIsStreamOpen] = useState(false); const { runState, isLoading, error, isSubscribed } = useRunState(runId, initialRunState); const { detail } = useCohortDetail(cohortId ?? "", initialCohortDetail); - // Timeline playback state - const [timelineMode, setTimelineMode] = useState<"live" | "timeline">("live"); - const [currentSequence, setCurrentSequence] = useState(0); - const [isPlaying, setIsPlaying] = useState(false); - const [playbackSpeed, setPlaybackSpeed] = useState(1); + // A null snapshot means the graph follows live state; a sequence replays + // mutations to that point. + const [snapshotSequence, setSnapshotSequence] = useState(null); + const currentSequence = snapshotSequence ?? 
0; const [mutations, setMutations] = useState([]); - const snapshotCache = useRef(new Map()); + const requestedSequenceRef = useRef(null); + const pendingActivityResolutionRef = useRef(null); + const selectedActivityIdRef = useRef(null); + const mutationsLoadedRef = useRef(false); + + useEffect(() => { + selectedActivityIdRef.current = selectedActivityId; + }, [selectedActivityId]); - // Fetch mutations when entering timeline mode + // Fetch mutations once per run load so snapshot selection is always ready. useEffect(() => { - if (timelineMode !== "timeline") return; let cancelled = false; + mutationsLoadedRef.current = false; + pendingActivityResolutionRef.current = null; fetch(`/api/runs/${runId}/mutations`) .then((res) => res.json()) .then((data) => { if (cancelled) return; const parsed = parseGraphMutationDtoArray(data); + mutationsLoadedRef.current = true; setMutations(parsed); - snapshotCache.current.clear(); - setCurrentSequence( - parsed.length > 0 ? parsed[parsed.length - 1].sequence : 0, - ); + const requestedSequence = requestedSequenceRef.current; + requestedSequenceRef.current = null; + if (requestedSequence !== null) { + setSnapshotSequence(nearestMutationAtOrBefore(parsed, requestedSequence)?.sequence ?? null); + return; + } + + const pendingActivity = pendingActivityResolutionRef.current; + pendingActivityResolutionRef.current = null; + if (pendingActivity && selectedActivityIdRef.current === pendingActivity.id) { + const sequence = resolveActivitySnapshotSequence(pendingActivity, parsed); + const resolvedSequence = + sequence === null + ? null + : (nearestMutationAtOrBefore(parsed, sequence)?.sequence ?? 
sequence); + setSnapshotSequence(resolvedSequence); + } }) .catch(() => { - if (!cancelled) setMutations([]); + if (cancelled) return; + mutationsLoadedRef.current = true; + pendingActivityResolutionRef.current = null; + setMutations([]); }); return () => { cancelled = true; }; - }, [timelineMode, runId]); + }, [runId]); - // Build display state: replay for timeline mode, live state otherwise + // Build display state: replay only for an explicit snapshot; otherwise live. const displayState = useMemo(() => { - if (timelineMode === "live" || mutations.length === 0) return runState; + if (snapshotSequence === null || mutations.length === 0) return runState; if (!runState) return runState; - const emptyState: WorkflowRunState = { - ...runState, - tasks: new Map(), - totalTasks: 0, - totalLeafTasks: 0, - completedTasks: 0, - runningTasks: 0, - failedTasks: 0, - }; + const replayBaseState = createReplayInitialState(runState, mutations, snapshotSequence); return replayToSequence( mutations, - currentSequence, - emptyState, - snapshotCache.current, + snapshotSequence, + replayBaseState, ); - }, [timelineMode, runState, mutations, currentSequence]); + }, [runState, mutations, snapshotSequence]); const runRow = useMemo(() => { if (!cohortId || !detail) return null; @@ -111,9 +143,9 @@ export function RunWorkspacePage({ return displayState.tasks.get(selectedTaskId) ?? null; }, [displayState, selectedTaskId]); - // D7: status counts for the RunStatusBar. Only leaf tasks so the totals + // Status counts shown in the run header. Only leaf tasks so the totals // match the "units of work" the user is tracking (parents double-count). 
- const { leafStatusCounts, leafTotal } = useMemo(() => { + const { leafStatusCounts } = useMemo(() => { const empty: Record = { [TaskStatus.PENDING]: 0, [TaskStatus.READY]: 0, @@ -132,12 +164,42 @@ export function RunWorkspacePage({ return { leafStatusCounts: empty, leafTotal: total }; }, [displayState]); - // D4: Unified event log — derived from displayState so timeline scrubbing - // trims the feed in lockstep. + // D4: Unified event log for the replayed inspector view. const events = useMemo(() => buildRunEvents(displayState), [displayState]); + // Trace spans are an immutable map of the full run. Replay moves the cursor + // over this map; it should not relayout or clip completed spans. + const traceEvents = useMemo(() => buildRunEvents(runState), [runState]); - // D7: keyboard shortcuts — Esc closes selection, `t` toggles timeline, - // `e` toggles event stream, `1-6` filters by lifecycle status. + const activities = useMemo( + () => + buildRunActivities({ + runState, + events: traceEvents, + mutations, + currentSequence: snapshotSequence, + }), + [runState, traceEvents, mutations, snapshotSequence], + ); + + const selectedActivity = useMemo( + () => activities.find((activity) => activity.id === selectedActivityId) ?? null, + [activities, selectedActivityId], + ); + + const selectedTimelineTime = useMemo(() => { + if (snapshotSequence === null) return null; + return nearestMutationAtOrBefore(mutations, snapshotSequence)?.created_at ?? null; + }, [mutations, snapshotSequence]); + + const highlightedTaskIds = useMemo(() => { + const ids = new Set(); + if (selectedTaskId) ids.add(selectedTaskId); + if (selectedActivity?.taskId) ids.add(selectedActivity.taskId); + return ids; + }, [selectedActivity, selectedTaskId]); + + // D7: keyboard shortcuts — Esc unwinds UI state, `e` toggles event stream, + // `1-6` filters by lifecycle status. 
useEffect(() => { const STATUS_ORDER: TaskStatus[] = [ TaskStatus.PENDING, @@ -155,20 +217,36 @@ export function RunWorkspacePage({ return; } } + if (e.key === "Escape") { - if (selectedTaskId) setSelectedTaskId(null); - else if (statusFilter) setStatusFilter(null); - return; - } - if (e.key === "t" || e.key === "T") { - setTimelineMode((prev) => (prev === "live" ? "timeline" : "live")); - if (timelineMode === "timeline") setIsPlaying(false); + if (selectedTaskId) { setSelectedTaskId(null); return; } + if (snapshotSequence !== null) { setSnapshotSequence(null); return; } + if (statusFilter) { setStatusFilter(null); return; } return; } + if (e.key === "e" || e.key === "E") { setIsStreamOpen((prev) => !prev); return; } + + if (e.key === "ArrowLeft" && snapshotSequence !== null) { + const idx = mutations.findIndex((m) => m.sequence === snapshotSequence); + if (idx > 0) setSnapshotSequence(mutations[idx - 1].sequence); + return; + } + if (e.key === "ArrowRight" && snapshotSequence !== null) { + const idx = mutations.findIndex((m) => m.sequence === snapshotSequence); + if (idx >= 0 && idx < mutations.length - 1) setSnapshotSequence(mutations[idx + 1].sequence); + return; + } + + if ((e.key === "d" || e.key === "D") && (e.metaKey || e.ctrlKey)) { + e.preventDefault(); + if (selectedTaskId) setSelectedTaskId(null); + return; + } + const idx = Number(e.key) - 1; if (!Number.isNaN(idx) && idx >= 0 && idx < STATUS_ORDER.length) { const next = STATUS_ORDER[idx]; @@ -177,7 +255,7 @@ export function RunWorkspacePage({ }; window.addEventListener("keydown", handler); return () => window.removeEventListener("keydown", handler); - }, [selectedTaskId, statusFilter, timelineMode]); + }, [selectedTaskId, statusFilter, mutations, snapshotSequence]); useEffect(() => { if (!selectedTaskId || !displayState) return; @@ -192,164 +270,165 @@ export function RunWorkspacePage({ const handleTaskClick = (taskId: string) => { setSelectionNotice(null); + pendingActivityResolutionRef.current = 
null; + selectedActivityIdRef.current = null; + setSelectedActivityId(null); setSelectedTaskId((prev) => (prev === taskId ? null : taskId)); }; + const handleSequenceChange = (sequence: number) => { + pendingActivityResolutionRef.current = null; + const mutation = nearestMutationAtOrBefore(mutations, sequence); + setSnapshotSequence(mutation?.sequence ?? sequence); + }; + + const handleActivityClick = (activity: RunActivity) => { + setSelectionNotice(null); + requestedSequenceRef.current = null; + selectedActivityIdRef.current = activity.id; + setSelectedActivityId(activity.id); + const sequence = resolveActivitySnapshotSequence(activity, mutations); + if (sequence !== null) { + handleSequenceChange(sequence); + } else { + setSnapshotSequence(null); + pendingActivityResolutionRef.current = mutationsLoadedRef.current ? null : activity; + } + if (activity.taskId) { + setSelectedTaskId(activity.taskId); + } + }; + return ( -
+
+ {/* Run header strip */}
-
-
- - Experiment Cohorts - +
+
+ Cohorts + {cohortId && ( <> - / {detail?.summary.name ?? "Cohort"} + )} - / - {runId.slice(0, 8)}... + {runId.slice(0, 8)}…
+
+

+ {runState?.name ?? runRow?.run_id ?? "Run"} +

+ + + {snapshotSequence === null ? "live" : `snapshot · seq ${snapshotSequence}`} · {formatSeconds(runState?.durationSeconds ?? null)} + +
+
-
+
+ {/* Key metrics */} +
-
-

- {runState?.name ?? runRow?.run_id ?? "Run"} -

- -
- {(["live", "timeline"] as const).map((mode) => { - const active = timelineMode === mode; - return ( - - ); - })} -
- -
-
- Workflow: {runState?.name ?? "—"} - Started: {runState?.startedAt ? new Date(runState.startedAt).toLocaleString("en-GB", { dateStyle: "medium", timeStyle: "medium" }) : "—"} -
+
Tasks
+ + {leafStatusCounts[TaskStatus.COMPLETED]}·{leafStatusCounts[TaskStatus.RUNNING]}·{leafStatusCounts[TaskStatus.READY]}·{leafStatusCounts[TaskStatus.PENDING]} +
- -
-
-
Runtime
-
- {formatSeconds(runState?.durationSeconds ?? (runRow?.running_time_ms != null ? runRow.running_time_ms / 1000 : null))} -
-
-
-
Score
-
- {formatPercent(runState?.finalScore ?? runRow?.final_score ?? null)} -
-
-
-
Tasks
-
- {runState?.totalTasks ?? "—"} -
-
-
-
Failed tasks
-
- {runState?.failedTasks ?? "—"} -
-
-
-
- - {leafTotal > 0 && ( -
- +
+
Tokens
+
- )} - - {error && ( -
- {error} +
+
Cost
+ +
+
+
Score
+ + {formatPercent(runState?.finalScore ?? runRow?.final_score ?? null)} +
- )} +
+ + + + +
-
+ Server-side error: + {ssrError} +
+ )} + + {error && !ssrError && ( +
+ {error} +
+ )} + +
{selectionNotice && (
{selectionNotice}
)}
0 ? 300 : 0, + paddingRight: isInspectorOpen ? 476 : 0, + }} >
- {timelineMode === "timeline" && mutations.length > 0 && ( + {activities.length > 0 && (
- setIsPlaying((p) => !p)} - speed={playbackSpeed} - onSpeedChange={setPlaybackSpeed} + selectedTaskId={selectedTaskId} + selectedActivityId={selectedActivityId} + onActivityClick={handleActivityClick} />
)} {isStreamOpen && events.length > 0 && (
{ - if (timelineMode !== "timeline") setTimelineMode("timeline"); - setCurrentSequence(seq); + requestedSequenceRef.current = seq; + handleSequenceChange(seq); }} />
)} {isInspectorOpen ? ( -
+
setSelectedTaskId(null)} onJumpToSequence={(seq) => { - if (timelineMode !== "timeline") setTimelineMode("timeline"); - setCurrentSequence(seq); + requestedSequenceRef.current = seq; + handleSequenceChange(seq); }} + selectedTime={selectedTimelineTime} + selectedSequence={snapshotSequence} + selectedActivity={selectedActivity} />
) : (
-
+
Task inspection
-

- Graph first, then open a focused task workspace +

+ Click node → workspace drawer

-

- Select a task node to inspect its outputs, execution attempts, actions, - communication, and evaluation without keeping the entire page in a cramped - permanent split view. -

+

State, outputs, turns, and evals appear scoped to the selected sequence.

{selectedTask && ( -
- Ready to inspect {selectedTask.name}. +
+ Ready to inspect {selectedTask.name}.
)}
diff --git a/ergon-dashboard/src/components/run/UnifiedEventStream.tsx b/ergon-dashboard/src/components/run/UnifiedEventStream.tsx index 8a93eecd..92f6df62 100644 --- a/ergon-dashboard/src/components/run/UnifiedEventStream.tsx +++ b/ergon-dashboard/src/components/run/UnifiedEventStream.tsx @@ -21,19 +21,12 @@ import { type RunEvent, type RunEventKind, } from "@/lib/runEvents"; +import { formatClockTimeMs } from "@/lib/timeFormat"; import { TransitionChip } from "@/components/common/TransitionChip"; function formatTime(iso: string): string { - try { - return new Date(iso).toLocaleTimeString("en-GB", { - hour: "2-digit", - minute: "2-digit", - second: "2-digit", - fractionalSecondDigits: 3, - }); - } catch { - return iso; - } + const label = formatClockTimeMs(iso); + return label === "—" ? iso : label; } function formatRelative(iso: string, anchorMs: number | null): string { diff --git a/ergon-dashboard/src/components/shell/Topbar.tsx b/ergon-dashboard/src/components/shell/Topbar.tsx new file mode 100644 index 00000000..01517a1a --- /dev/null +++ b/ergon-dashboard/src/components/shell/Topbar.tsx @@ -0,0 +1,96 @@ +"use client"; + +import Link from "next/link"; +import { usePathname } from "next/navigation"; + +const NAV_ITEMS = [ + { label: "Cohorts", href: "/" }, + { label: "Runs", href: "/runs" }, + { label: "Training", href: "/training" }, + { label: "Models", href: "/models" }, + { label: "Settings", href: "/settings" }, +] as const; + +function isActive(href: string, pathname: string): boolean { + if (href === "/") { + return pathname === "/" || pathname.startsWith("/cohorts"); + } + if (href === "/runs") { + return pathname.startsWith("/run/") || pathname.startsWith("/runs"); + } + return pathname.startsWith(href); +} + +export function Topbar() { + const pathname = usePathname(); + + return ( +
+
+ {/* Logo + wordmark */} + + + + + Ergon + + + {/* Navigation — hidden on small screens */} + +
+ +
+ {/* Search — hidden on smaller viewports */} +
+ + Search cohorts, runs, tasks… + + ⌘K + +
+ + {/* User avatar */} +
+ JM +
+
+
+ ); +} diff --git a/ergon-dashboard/src/components/workspace/TaskWorkspace.tsx b/ergon-dashboard/src/components/workspace/TaskWorkspace.tsx index 95411354..139e637c 100644 --- a/ergon-dashboard/src/components/workspace/TaskWorkspace.tsx +++ b/ergon-dashboard/src/components/workspace/TaskWorkspace.tsx @@ -1,5 +1,7 @@ "use client"; +import { useEffect, useRef, useState } from "react"; + import { useTaskDetails } from "@/hooks/useTaskDetails"; import { StatusBadge } from "@/components/common/StatusBadge"; import { CommunicationPanel } from "@/components/panels/CommunicationPanel"; @@ -8,11 +10,69 @@ import { ResourcePanel } from "@/components/panels/ResourcePanel"; import { SandboxPanel } from "@/components/panels/SandboxPanel"; import { TaskTransitionLog } from "@/components/workspace/TaskTransitionLog"; import { ContextEventLog } from "@/features/graph/components/ContextEventLog"; +import type { RunActivity } from "@/features/activity/types"; import type { WorkflowRunState } from "@/lib/types"; +import { formatClockTime } from "@/lib/timeFormat"; import { formatTaskWallTimestamp } from "@/features/graph/utils/taskTiming"; +import { filterTaskEvidenceForTime } from "./filterTaskEvidenceForTime"; function EmptySection({ message }: { message: string }) { - return
{message}
; + return
{message}
; +} + +const ACTIVITY_KIND_TITLE: Record = { + execution: "Execution", + graph: "Graph mutation", + message: "Message", + artifact: "Artifact", + evaluation: "Evaluation", + context: "Context event", + sandbox: "Sandbox", +}; + +function ActivityDetail({ activity }: { activity: RunActivity }) { + const metadata = Object.entries(activity.metadata) + .filter(([, value]) => value !== null && value !== "") + .slice(0, 4); + const debugPayload = JSON.stringify(activity.debug, null, 2); + + return ( +
+
+ Selected activity +
+
+ {ACTIVITY_KIND_TITLE[activity.kind]}: {activity.label} +
+
+ Band: {activity.band} + Source: {activity.sourceKind} + Actor: {activity.actor ?? "—"} + Started: {formatClockTime(activity.startAt)} + Sequence: {activity.sequence ?? "—"} + Task: {activity.lineage.taskId ?? "—"} + Execution: {activity.lineage.taskExecutionId ?? "—"} + Sandbox: {activity.lineage.sandboxId ?? "—"} + {activity.endAt && Ended: {formatClockTime(activity.endAt)}} + {metadata.map(([key, value]) => ( + + {key}: {String(value)} + + ))} +
+
+ + Raw JSON + + + {debugPayload} + +
+
+ ); } function WorkspaceSection({ @@ -26,10 +86,10 @@ function WorkspaceSection({ }) { return (
-

+

{title}

{children} @@ -37,28 +97,125 @@ function WorkspaceSection({ ); } +type WorkspaceTabId = "overview" | "actions" | "communication" | "outputs" | "transitions" | "evaluation"; + +const WORKSPACE_TABS: Array<{ + id: WorkspaceTabId; + label: string; + testId: string; +}> = [ + { id: "overview", label: "Overview", testId: "workspace-tab-overview" }, + { id: "actions", label: "Actions", testId: "workspace-tab-actions" }, + { id: "communication", label: "Communication", testId: "workspace-tab-communication" }, + { id: "outputs", label: "Outputs", testId: "workspace-tab-outputs" }, + { id: "transitions", label: "Transitions", testId: "workspace-tab-transitions" }, + { id: "evaluation", label: "Evaluation", testId: "workspace-tab-evaluation" }, +]; + +function workspaceTabButtonId(id: WorkspaceTabId) { + return `workspace-tab-button-${id}`; +} + +function workspaceTabPanelId(id: WorkspaceTabId) { + return `workspace-tab-panel-${id}`; +} + +function WorkspaceTabPanel({ + tabId, + children, +}: { + tabId: WorkspaceTabId; + children: React.ReactNode; +}) { + return ( +
+ {children} +
+ ); +} + export function TaskWorkspace({ runState, taskId, error, onClearSelection, onJumpToSequence, + selectedTime = null, + selectedSequence = null, + selectedActivity = null, }: { runState: WorkflowRunState | null; taskId: string | null; error: string | null; onClearSelection?: () => void; onJumpToSequence?: (sequence: number) => void; + selectedTime?: string | null; + selectedSequence?: number | null; + selectedActivity?: RunActivity | null; }) { const { task, resources, executions, sandbox, threads, evaluation, dependencies, isLoading } = useTaskDetails(runState, taskId); + const [activeTab, setActiveTab] = useState("overview"); + const tabButtonRefs = useRef>({ + overview: null, + actions: null, + communication: null, + outputs: null, + transitions: null, + evaluation: null, + }); + + useEffect(() => { + setActiveTab("overview"); + }, [taskId]); + + function activateTab(tabId: WorkspaceTabId, shouldFocus = false) { + setActiveTab(tabId); + if (shouldFocus) { + requestAnimationFrame(() => tabButtonRefs.current[tabId]?.focus()); + } + } + + function handleTabKeyDown(event: React.KeyboardEvent, tabId: WorkspaceTabId) { + const currentIndex = WORKSPACE_TABS.findIndex((tab) => tab.id === tabId); + if (currentIndex === -1) return; + + let nextIndex: number | null = null; + if (event.key === "ArrowRight") { + nextIndex = (currentIndex + 1) % WORKSPACE_TABS.length; + } else if (event.key === "ArrowLeft") { + nextIndex = (currentIndex - 1 + WORKSPACE_TABS.length) % WORKSPACE_TABS.length; + } else if (event.key === "Home") { + nextIndex = 0; + } else if (event.key === "End") { + nextIndex = WORKSPACE_TABS.length - 1; + } + + if (nextIndex === null) return; + event.preventDefault(); + activateTab(WORKSPACE_TABS[nextIndex].id, true); + } const contextEvents = taskId && runState ? (runState.contextEventsByTask.get(taskId) ?? 
[]) : []; + const filteredEvidence = filterTaskEvidenceForTime({ + resources, + executions, + sandbox, + threads, + evaluation, + contextEvents, + selectedTime, + }); if (!taskId) { return (
Select a task from the graph to open the focused task workspace. @@ -69,7 +226,7 @@ export function TaskWorkspace({ if (isLoading) { return (
Loading task workspace... @@ -80,7 +237,7 @@ export function TaskWorkspace({ if (!task || error) { return (
{error ?? "Task not found"} @@ -88,53 +245,55 @@ export function TaskWorkspace({ ); } - const primarySection = - resources.length > 0 - ? "outputs" - : evaluation - ? "evaluation" - : threads.length > 0 - ? "communication" - : sandbox - ? "sandbox" - : "overview"; - const started = formatTaskWallTimestamp(task.startedAt); const ended = formatTaskWallTimestamp(task.completedAt); return ( -
+
-
-

{task.name}

+
+
+
+ Task workspace +
+

{task.name}

+
+ {selectedSequence !== null && ( + + Viewing seq {selectedSequence} + + )} {onClearSelection && ( )}
-
+
Worker: {task.assignedWorkerName ?? "—"} Level: {task.level} Leaf task: {task.isLeaf ? "yes" : "no"} - Attempts: {executions.length || 0} - Outputs: {resources.length} + Attempts: {filteredEvidence.executions.length || 0} + Outputs: {filteredEvidence.resources.length} Started:{" "} {started.dateTime ? ( @@ -148,7 +307,7 @@ export function TaskWorkspace({ @@ -158,42 +317,51 @@ export function TaskWorkspace({
{task.description && ( -

{task.description}

+

{task.description}

)} + {selectedActivity && }
-
- - - +
+ {WORKSPACE_TABS.map((tab) => { + const selected = activeTab === tab.id; - - - + return ( + + ); + })} +
-
- {primarySection === "outputs" && ( - - - - )} - {primarySection === "evaluation" && ( - - - - )} - {primarySection === "communication" && ( - - - - )} - {primarySection === "sandbox" && ( - - - - )} - {primarySection === "overview" && ( - +
+ {activeTab === "overview" && ( + +
Waiting on
@@ -221,100 +389,90 @@ export function TaskWorkspace({
- )} -
- -
- -
-
-
Waiting on
- {dependencies.waitingOn.length === 0 ? ( - - ) : ( -
    - {dependencies.waitingOn.map((dep) => ( -
  • {dep.name}
  • - ))} -
- )} -
-
-
Blocking
- {dependencies.blocking.length === 0 ? ( - - ) : ( -
    - {dependencies.blocking.map((dep) => ( -
  • {dep.name}
  • - ))} -
- )} -
-
-
+ + )} - - {executions.length === 0 ? ( - - ) : ( + {activeTab === "actions" && ( + +
- {executions.map((execution) => ( -
-
-
- Attempt {execution.attemptNumber} -
- -
-
- Agent: {execution.agentName ?? "—"} - - Started: {execution.startedAt ? new Date(execution.startedAt).toLocaleTimeString() : "—"} - - - Completed: {execution.completedAt ? new Date(execution.completedAt).toLocaleTimeString() : "—"} - + + + + {filteredEvidence.executions.length === 0 ? ( + + ) : ( +
+ {filteredEvidence.executions.map((execution) => ( +
+
+
+ Attempt {execution.attemptNumber} +
+ +
+
+ Agent: {execution.agentName ?? "—"} + + Started: {execution.startedAt ? formatClockTime(execution.startedAt) : "—"} + + + Completed: {execution.completedAt ? formatClockTime(execution.completedAt) : "—"} + +
+ {execution.errorMessage && ( +
+ {execution.errorMessage} +
+ )} +
+ ))}
- {execution.errorMessage && ( -
- {execution.errorMessage} -
- )} -
- ))} + )} + + + + +
- )} - + + + )} - {primarySection !== "communication" && ( + {activeTab === "communication" && ( + - + - )} + + )} - {primarySection !== "outputs" && ( + {activeTab === "outputs" && ( + - + - )} + + )} - {primarySection !== "evaluation" && ( - - + {activeTab === "transitions" && ( + + + - )} + + )} - {primarySection !== "sandbox" && ( - - + {activeTab === "evaluation" && ( + + + - )} - -
+
+ )}
); diff --git a/ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.test.ts b/ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.test.ts new file mode 100644 index 00000000..13d9985e --- /dev/null +++ b/ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.test.ts @@ -0,0 +1,81 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import fixture from "../../../tests/fixtures/mas-runs/concurrent-mas-run.json"; +import { deserializeRunState } from "@/lib/runState"; +import { filterTaskEvidenceForTime } from "./filterTaskEvidenceForTime"; + +const searchTaskId = "10000000-0000-4000-8000-000000000002"; + +test("filterTaskEvidenceForTime hides task evidence created after the selected timeline time", () => { + const runState = deserializeRunState(fixture.runState); + const filtered = filterTaskEvidenceForTime({ + resources: runState.resourcesByTask.get(searchTaskId) ?? [], + executions: runState.executionsByTask.get(searchTaskId) ?? [], + sandbox: runState.sandboxesByTask.get(searchTaskId), + threads: runState.threads, + evaluation: runState.evaluationsByTask.get(searchTaskId) ?? null, + contextEvents: runState.contextEventsByTask.get(searchTaskId) ?? [], + selectedTime: "2026-04-26T12:00:10.000Z", + }); + + assert.equal(filtered.resources.length, 0); + assert.equal(filtered.executions.length, 1); + assert.equal(filtered.sandbox?.commands.length, 0); + assert.equal(filtered.contextEvents.length, 1); +}); + +test("filterTaskEvidenceForTime returns unfiltered task evidence in live mode", () => { + const runState = deserializeRunState(fixture.runState); + const filtered = filterTaskEvidenceForTime({ + resources: runState.resourcesByTask.get(searchTaskId) ?? [], + executions: runState.executionsByTask.get(searchTaskId) ?? [], + sandbox: runState.sandboxesByTask.get(searchTaskId), + threads: runState.threads, + evaluation: runState.evaluationsByTask.get(searchTaskId) ?? 
null, + contextEvents: runState.contextEventsByTask.get(searchTaskId) ?? [], + selectedTime: null, + }); + + assert.equal(filtered.resources.length, 1); + assert.equal(filtered.sandbox?.commands.length, 1); +}); + +test("filterTaskEvidenceForTime keeps only thread messages visible at selected time", () => { + const runState = deserializeRunState(fixture.runState); + const thread = runState.threads[0]; + const filtered = filterTaskEvidenceForTime({ + resources: [], + executions: [], + sandbox: undefined, + threads: [ + { + ...thread, + createdAt: "2026-04-26T12:00:10.000Z", + messages: [ + { + ...thread.messages[0], + id: "visible-message", + content: "visible", + createdAt: "2026-04-26T12:00:20.000Z", + }, + { + ...thread.messages[0], + id: "future-message", + content: "future", + createdAt: "2026-04-26T12:00:30.000Z", + }, + ], + }, + ], + evaluation: null, + contextEvents: [], + selectedTime: "2026-04-26T12:00:25.000Z", + }); + + assert.equal(filtered.threads.length, 1); + assert.deepEqual( + filtered.threads[0].messages.map((message) => message.content), + ["visible"], + ); +}); diff --git a/ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.ts b/ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.ts new file mode 100644 index 00000000..d5f831a8 --- /dev/null +++ b/ergon-dashboard/src/components/workspace/filterTaskEvidenceForTime.ts @@ -0,0 +1,76 @@ +import type { + CommunicationThreadState, + ContextEventState, + ExecutionAttemptState, + ResourceState, + SandboxState, + TaskEvaluationState, +} from "@/lib/types"; + +export interface TaskEvidence { + resources: ResourceState[]; + executions: ExecutionAttemptState[]; + sandbox: SandboxState | undefined; + threads: CommunicationThreadState[]; + evaluation: TaskEvaluationState | null; + contextEvents: ContextEventState[]; +} + +export interface FilterTaskEvidenceForTimeInput extends TaskEvidence { + selectedTime: string | null; +} + +function atOrBefore(value: string | null | 
undefined, selectedMs: number): boolean { + if (!value) return false; + const parsed = Date.parse(value); + return Number.isFinite(parsed) && parsed <= selectedMs; +} + +export function filterTaskEvidenceForTime({ + resources, + executions, + sandbox, + threads, + evaluation, + contextEvents, + selectedTime, +}: FilterTaskEvidenceForTimeInput): TaskEvidence { + if (!selectedTime) { + return { resources, executions, sandbox, threads, evaluation, contextEvents }; + } + + const selectedMs = Date.parse(selectedTime); + if (!Number.isFinite(selectedMs)) { + return { resources, executions, sandbox, threads, evaluation, contextEvents }; + } + + const filteredSandbox = sandbox + ? { + ...sandbox, + commands: sandbox.commands.filter((command) => + atOrBefore(command.timestamp, selectedMs), + ), + } + : undefined; + + return { + resources: resources.filter((resource) => atOrBefore(resource.createdAt, selectedMs)), + executions: executions.filter((execution) => + atOrBefore(execution.startedAt, selectedMs), + ), + sandbox: filteredSandbox, + threads: threads + .map((thread) => ({ + ...thread, + messages: (thread.messages ?? []).filter((message) => + atOrBefore(message.createdAt, selectedMs), + ), + })) + .filter((thread) => atOrBefore(thread.createdAt, selectedMs) || (thread.messages ?? []).length > 0), + evaluation: + evaluation && atOrBefore(evaluation.createdAt, selectedMs) ? 
evaluation : null, + contextEvents: contextEvents.filter((event) => + atOrBefore(event.createdAt, selectedMs), + ), + }; +} diff --git a/ergon-dashboard/src/features/activity/buildRunActivities.test.ts b/ergon-dashboard/src/features/activity/buildRunActivities.test.ts new file mode 100644 index 00000000..9183939a --- /dev/null +++ b/ergon-dashboard/src/features/activity/buildRunActivities.test.ts @@ -0,0 +1,296 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import fixture from "../../../tests/fixtures/mas-runs/concurrent-mas-run.json"; +import { parseGraphMutationDtoArray } from "@/features/graph/contracts/graphMutations"; +import type { RunEvent } from "@/lib/runEvents"; +import { buildRunEvents } from "@/lib/runEvents"; +import { deserializeRunState } from "@/lib/runState"; +import { TaskStatus, TaskTrigger } from "@/lib/types"; +import { buildRunActivities } from "./buildRunActivities"; +import { resolveActivitySnapshotSequence } from "./snapshotSequence"; + +test("buildRunActivities surfaces semantic activity kinds without creating actor lanes", () => { + const runState = deserializeRunState(fixture.runState); + const mutations = parseGraphMutationDtoArray(fixture.mutations); + const noisyTaskId = "10000000-0000-4000-8000-000000000002"; + runState.sandboxesByTask.set(noisyTaskId, { + sandboxId: "sandbox-noisy", + taskId: noisyTaskId, + template: "python", + timeoutMinutes: 30, + status: "closed", + createdAt: "2025-01-01T00:00:05.000Z", + closedAt: "2025-01-01T00:00:20.000Z", + closeReason: "completed", + commands: [ + { + command: "pnpm test --verbose", + stdout: null, + stderr: null, + exitCode: 0, + durationMs: 1000, + timestamp: "2025-01-01T00:00:10.000Z", + }, + ], + }); + runState.executionsByTask.set(noisyTaskId, [ + { + id: "execution-noisy", + taskId: noisyTaskId, + attemptNumber: 1, + status: TaskStatus.COMPLETED, + agentId: "agent-a", + agentName: "worker-1", + startedAt: "2025-01-01T00:00:04.000Z", + completedAt: 
"2025-01-01T00:00:16.000Z", + finalAssistantMessage: null, + outputResourceIds: [], + errorMessage: null, + score: null, + evaluationDetails: {}, + }, + ]); + runState.contextEventsByTask.set(noisyTaskId, [ + { + id: "context-noisy", + taskExecutionId: "execution-noisy", + taskNodeId: noisyTaskId, + workerBindingKey: "worker-1", + sequence: 12, + eventType: "tool_call", + payload: { + event_type: "tool_call", + tool_call_id: "tool-call-noisy", + tool_name: "shell", + args: { command: "pnpm test" }, + turn_id: "turn-noisy", + turn_token_ids: null, + turn_logprobs: null, + }, + createdAt: "2025-01-01T00:00:12.000Z", + startedAt: "2025-01-01T00:00:12.000Z", + completedAt: "2025-01-01T00:00:13.000Z", + }, + ]); + runState.threads = [ + ...runState.threads, + { + id: "thread-noisy", + runId: runState.id, + taskId: noisyTaskId, + topic: "coordination", + agentAId: "agent-a", + agentBId: "agent-b", + createdAt: "2025-01-01T00:00:12.000Z", + updatedAt: "2025-01-01T00:00:12.000Z", + messages: [ + { + id: "message-noisy", + threadId: "thread-noisy", + threadTopic: "coordination", + runId: runState.id, + taskId: noisyTaskId, + taskExecutionId: null, + fromAgentId: "agent-a", + toAgentId: "agent-b", + content: "Verbose coordination message", + sequenceNum: 99, + createdAt: "2025-01-01T00:00:12.000Z", + }, + ], + }, + ]; + const markerEvents: RunEvent[] = [ + { + id: "marker-workflow-started", + kind: "workflow.started", + at: "2025-01-01T00:00:06.000Z", + runName: "Marker workflow", + }, + { + id: "marker-workflow-completed", + kind: "workflow.completed", + at: "2025-01-01T00:00:07.000Z", + status: "completed", + finalScore: 1, + error: null, + }, + { + id: "marker-task-transition", + kind: "task.transition", + at: "2025-01-01T00:00:08.000Z", + taskId: noisyTaskId, + taskName: "Noisy task", + from: TaskStatus.READY, + to: TaskStatus.RUNNING, + trigger: TaskTrigger.WORKER_STARTED, + reason: null, + actor: "worker-1", + }, + { + id: "marker-thread-message", + kind: 
"thread.message", + at: "2025-01-01T00:00:09.000Z", + taskId: noisyTaskId, + threadId: "thread-noisy", + authorRole: "agent", + preview: "Marker message", + }, + { + id: "marker-task-evaluation", + kind: "task.evaluation", + at: "2025-01-01T00:00:11.000Z", + taskId: noisyTaskId, + score: 0.9, + passed: true, + }, + { + id: "marker-resource-published", + kind: "resource.published", + at: "2025-01-01T00:00:13.000Z", + taskId: noisyTaskId, + name: "artifact.json", + mimeType: "application/json", + sizeBytes: 128, + }, + { + id: "marker-unhandled-mutation", + kind: "unhandled.mutation", + at: "2025-01-01T00:00:14.000Z", + taskId: noisyTaskId, + sequence: 13, + mutationType: "unknown_marker", + note: "Unhandled marker mutation", + }, + ]; + const events = [...buildRunEvents(runState), ...markerEvents]; + + const activities = buildRunActivities({ + runState, + events, + mutations, + currentSequence: 14, + }); + + assert.ok( + activities.some( + (activity) => + activity.kind === "execution" && + activity.taskId === noisyTaskId && + activity.isInstant === false, + ), + ); + assert.ok( + activities.some( + (activity) => + activity.kind === "graph" && + activity.sequence === 10 && + activity.taskId === "10000000-0000-4000-8000-000000000003", + ), + ); + assert.deepEqual( + [...new Set(activities.map((activity) => String(activity.kind)))].sort(), + [ + "artifact", + "context", + "evaluation", + "execution", + "graph", + "message", + "sandbox", + ], + ); + assert.ok(activities.some((activity) => String(activity.label).includes("pnpm test"))); + assert.ok(activities.some((activity) => String(activity.label).includes("artifact.json"))); + assert.ok(activities.some((activity) => String(activity.label).includes("tool_call"))); + assert.ok(activities.some((activity) => String(activity.label).includes("Marker message"))); + assert.ok(activities.some((activity) => String(activity.label).includes("Evaluation"))); + assert.ok( + activities.some( + (activity) => + activity.kind === 
"execution" && + activity.band === "work" && + activity.lineage.taskExecutionId === "execution-noisy", + ), + ); + assert.ok( + activities.some( + (activity) => + activity.kind === "context" && + activity.band === "tools" && + activity.lineage.taskExecutionId === "execution-noisy", + ), + ); + assert.ok( + activities.some( + (activity) => + activity.kind === "message" && + activity.band === "communication" && + activity.lineage.taskId === noisyTaskId, + ), + ); + assert.ok( + activities.some( + (activity) => + activity.kind === "artifact" && + activity.band === "outputs" && + activity.lineage.taskId === noisyTaskId, + ), + ); + assert.equal( + activities.some((activity) => "laneId" in activity.metadata), + false, + ); +}); + +test("completed trace spans keep full duration when replaying an earlier sequence", () => { + const runState = deserializeRunState(fixture.runState); + const mutations = parseGraphMutationDtoArray(fixture.mutations); + const events = buildRunEvents(runState); + + const activities = buildRunActivities({ + runState, + events, + mutations, + currentSequence: 10, + }); + + const execution = activities.find( + (activity) => activity.id === "execution:30000000-0000-4000-8000-000000000001", + ); + const sandbox = activities.find((activity) => activity.id === "sandbox:sandbox-search"); + const graphMarker = activities.find( + (activity) => activity.kind === "graph" && activity.sequence === 10, + ); + + assert.equal(execution?.startAt, "2026-04-26T12:00:05.000Z"); + assert.equal(execution?.endAt, "2026-04-26T12:00:24.000Z"); + assert.equal(sandbox?.startAt, "2026-04-26T12:00:04.000Z"); + assert.equal(sandbox?.endAt, "2026-04-26T12:00:26.000Z"); + assert.equal(execution?.metadata.openEnded, false); + assert.equal(sandbox?.metadata.openEnded, false); + assert.equal(graphMarker?.debug?.source, "graph.mutation"); +}); + +test("context/tool event sequence does not masquerade as graph replay sequence", () => { + const runState = 
deserializeRunState(fixture.runState); + const mutations = parseGraphMutationDtoArray(fixture.mutations); + const activities = buildRunActivities({ + runState, + events: buildRunEvents(runState), + mutations, + currentSequence: null, + }); + + const toolActivity = activities.find( + (activity) => activity.id === "context:60000000-0000-4000-8000-000000000001", + ); + + assert.equal(toolActivity?.kind, "context"); + assert.equal(toolActivity?.sequence, null); + assert.equal( + toolActivity ? resolveActivitySnapshotSequence(toolActivity, mutations) : null, + 10, + ); +}); diff --git a/ergon-dashboard/src/features/activity/buildRunActivities.ts b/ergon-dashboard/src/features/activity/buildRunActivities.ts new file mode 100644 index 00000000..1c65ee3e --- /dev/null +++ b/ergon-dashboard/src/features/activity/buildRunActivities.ts @@ -0,0 +1,342 @@ +import type { GraphMutationDto } from "@/features/graph/contracts/graphMutations"; +import type { + ContextEventState, + ExecutionAttemptState, + SandboxCommandState, + WorkflowRunState, +} from "@/lib/types"; +import type { RunEvent } from "@/lib/runEvents"; +import type { RunActivity } from "./types"; + +export interface BuildRunActivitiesInput { + runState: WorkflowRunState | null; + events: RunEvent[]; + mutations: GraphMutationDto[]; + currentSequence: number | null; +} + +function isFiniteTime(value: string | null | undefined): value is string { + return typeof value === "string" && Number.isFinite(Date.parse(value)); +} + +function compareActivity(a: RunActivity, b: RunActivity): number { + if (a.startAt !== b.startAt) return a.startAt.localeCompare(b.startAt); + const aSeq = a.sequence ?? -1; + const bSeq = b.sequence ?? -1; + if (aSeq !== bSeq) return aSeq - bSeq; + return a.id.localeCompare(b.id); +} + +function executionLabel(execution: ExecutionAttemptState, run: WorkflowRunState): string { + const task = run.tasks.get(execution.taskId); + return task?.name ?? 
`Attempt ${execution.attemptNumber}`; +} + +function truncate(value: string, length = 64): string { + return value.length > length ? `${value.slice(0, length - 1)}…` : value; +} + +function addMs(timestamp: string, durationMs: number | null): string | null { + if (durationMs === null || durationMs <= 0) return null; + const startMs = Date.parse(timestamp); + if (!Number.isFinite(startMs)) return null; + return new Date(startMs + durationMs).toISOString(); +} + +function executionActivities( + run: WorkflowRunState, +): RunActivity[] { + const activities: RunActivity[] = []; + for (const executions of run.executionsByTask.values()) { + for (const execution of executions) { + if (!isFiniteTime(execution.startedAt)) continue; + const endAt = execution.completedAt; + activities.push({ + id: `execution:${execution.id}`, + kind: "execution", + band: "work", + label: executionLabel(execution, run), + taskId: execution.taskId, + sequence: null, + startAt: execution.startedAt, + endAt, + isInstant: !endAt || endAt === execution.startedAt, + actor: execution.agentName, + sourceKind: "execution.span", + metadata: { + attemptNumber: execution.attemptNumber, + status: execution.status, + agentId: execution.agentId, + openEnded: endAt === null, + }, + lineage: { + taskId: execution.taskId, + taskExecutionId: execution.id, + agentId: execution.agentId, + }, + debug: { + source: "execution.span", + payload: execution, + }, + }); + } + } + return activities; +} + +function sandboxCommandLabel(command: SandboxCommandState): string { + return `cmd: ${truncate(command.command)}`; +} + +function sandboxActivities( + run: WorkflowRunState, +): RunActivity[] { + const activities: RunActivity[] = []; + for (const sandbox of run.sandboxesByTask.values()) { + if (isFiniteTime(sandbox.createdAt)) { + const endAt = sandbox.closedAt; + activities.push({ + id: `sandbox:${sandbox.sandboxId}`, + kind: "sandbox", + band: "work", + label: `sandbox: ${sandbox.template ?? 
sandbox.sandboxId}`, + taskId: sandbox.taskId, + sequence: null, + startAt: sandbox.createdAt, + endAt, + isInstant: !endAt || endAt === sandbox.createdAt, + actor: null, + sourceKind: "sandbox.span", + metadata: { + sandboxId: sandbox.sandboxId, + status: sandbox.status, + closeReason: sandbox.closeReason, + openEnded: endAt === null, + }, + lineage: { + taskId: sandbox.taskId, + sandboxId: sandbox.sandboxId, + }, + debug: { + source: "sandbox.span", + payload: { + ...sandbox, + commands: undefined, + }, + }, + }); + } + + for (let i = 0; i < sandbox.commands.length; i++) { + const command = sandbox.commands[i]; + if (!isFiniteTime(command.timestamp)) continue; + const endAt = addMs(command.timestamp, command.durationMs); + activities.push({ + id: `sandbox.command:${sandbox.sandboxId}:${i}`, + kind: "sandbox", + band: "tools", + label: sandboxCommandLabel(command), + taskId: sandbox.taskId, + sequence: null, + startAt: command.timestamp, + endAt, + isInstant: !endAt || endAt === command.timestamp, + actor: null, + sourceKind: "sandbox.command", + metadata: { + sandboxId: sandbox.sandboxId, + exitCode: command.exitCode, + durationMs: command.durationMs, + }, + lineage: { + taskId: sandbox.taskId, + sandboxId: sandbox.sandboxId, + }, + debug: { + source: "sandbox.command", + payload: command, + }, + }); + } + } + return activities; +} + +function contextLabel(event: ContextEventState): string { + const payloadType = + typeof event.payload === "object" && + event.payload !== null && + "event_type" in event.payload + ? String((event.payload as { event_type?: unknown }).event_type) + : null; + return payloadType ?? event.eventType; +} + +function contextActivities(run: WorkflowRunState): RunActivity[] { + const activities: RunActivity[] = []; + for (const [taskId, events] of run.contextEventsByTask.entries()) { + for (const event of events) { + const startAt = event.startedAt ?? 
event.createdAt; + if (!isFiniteTime(startAt)) continue; + const endAt = event.completedAt; + activities.push({ + id: `context:${event.id}`, + kind: "context", + band: "tools", + label: contextLabel(event), + taskId, + sequence: null, + startAt, + endAt, + isInstant: !endAt || endAt === startAt, + actor: event.workerBindingKey ?? null, + sourceKind: endAt ? "context.span" : "context.event", + metadata: { + eventId: event.id, + eventType: event.eventType, + contextSequence: event.sequence ?? null, + taskExecutionId: event.taskExecutionId, + }, + lineage: { + taskId, + taskExecutionId: event.taskExecutionId, + workerBindingKey: event.workerBindingKey, + }, + debug: { + source: "context.event", + payload: event, + }, + }); + } + } + return activities; +} + +function eventMarkerActivities(events: RunEvent[]): RunActivity[] { + return events.flatMap((event): RunActivity[] => { + switch (event.kind) { + case "thread.message": + return [ + { + id: `message:${event.id}`, + kind: "message", + band: "communication", + label: truncate(event.preview), + taskId: event.taskId ?? null, + sequence: event.sequence ?? null, + startAt: event.at, + endAt: null, + isInstant: true, + actor: event.authorRole, + sourceKind: event.kind, + metadata: { + threadId: event.threadId, + }, + lineage: { + taskId: event.taskId ?? null, + threadId: event.threadId, + }, + debug: { + source: event.kind, + payload: event, + }, + }, + ]; + case "resource.published": + return [ + { + id: `artifact:${event.id}`, + kind: "artifact", + band: "outputs", + label: `artifact: ${event.name}`, + taskId: event.taskId ?? null, + sequence: event.sequence ?? null, + startAt: event.at, + endAt: null, + isInstant: true, + actor: null, + sourceKind: event.kind, + metadata: { + mimeType: event.mimeType, + sizeBytes: event.sizeBytes, + }, + lineage: { + taskId: event.taskId ?? 
null, + }, + debug: { + source: event.kind, + payload: event, + }, + }, + ]; + case "task.evaluation": + return [ + { + id: `evaluation:${event.id}`, + kind: "evaluation", + band: "outputs", + label: `Evaluation ${event.passed === null ? "updated" : event.passed ? "passed" : "failed"}`, + taskId: event.taskId ?? null, + sequence: event.sequence ?? null, + startAt: event.at, + endAt: null, + isInstant: true, + actor: null, + sourceKind: event.kind, + metadata: { + score: event.score, + passed: event.passed, + }, + lineage: { + taskId: event.taskId ?? null, + }, + debug: { + source: event.kind, + payload: event, + }, + }, + ]; + default: + return []; + } + }); +} + +function graphMutationActivities(mutations: GraphMutationDto[]): RunActivity[] { + return mutations.map((mutation) => ({ + id: `graph:${mutation.id}`, + kind: "graph", + band: "graph", + label: mutation.mutation_type, + taskId: mutation.target_type === "node" ? mutation.target_id : null, + sequence: mutation.sequence, + startAt: mutation.created_at, + endAt: null, + isInstant: true, + actor: mutation.actor, + sourceKind: "graph.mutation", + metadata: { + mutationType: mutation.mutation_type, + targetType: mutation.target_type, + reason: mutation.reason, + }, + lineage: { + taskId: mutation.target_type === "node" ? 
mutation.target_id : null, + }, + debug: { + source: "graph.mutation", + payload: mutation, + }, + })); +} + +export function buildRunActivities(input: BuildRunActivitiesInput): RunActivity[] { + if (!input.runState) return []; + return [ + ...executionActivities(input.runState), + ...sandboxActivities(input.runState), + ...contextActivities(input.runState), + ...eventMarkerActivities(input.events), + ...graphMutationActivities(input.mutations), + ].sort(compareActivity); +} diff --git a/ergon-dashboard/src/features/activity/components/ActivityBar.tsx b/ergon-dashboard/src/features/activity/components/ActivityBar.tsx new file mode 100644 index 00000000..e122bd82 --- /dev/null +++ b/ergon-dashboard/src/features/activity/components/ActivityBar.tsx @@ -0,0 +1,143 @@ +"use client"; + +import type { ActivityStackItem, ActivityKind, RunActivity } from "@/features/activity/types"; + +const KIND_STYLES: Record< + ActivityKind, + { fill: string; text: string; label: string; legendLabel: string } +> = { + graph: { + fill: "oklch(0.78 0.14 305)", + text: "white", + label: "Graph mutation", + legendLabel: "graph mutation", + }, + execution: { + fill: "oklch(0.74 0.16 295)", + text: "white", + label: "Execution", + legendLabel: "task", + }, + message: { + fill: "oklch(0.74 0.14 70)", + text: "white", + label: "Message", + legendLabel: "message", + }, + artifact: { + fill: "oklch(0.72 0.16 145)", + text: "white", + label: "Artifact", + legendLabel: "artifact", + }, + evaluation: { + fill: "oklch(0.68 0.18 345)", + text: "white", + label: "Evaluation", + legendLabel: "evaluation", + }, + context: { + fill: "oklch(0.66 0.12 230)", + text: "white", + label: "Context", + legendLabel: "context/tool", + }, + sandbox: { + fill: "oklch(0.70 0.12 195)", + text: "white", + label: "Sandbox", + legendLabel: "sandbox", + }, +}; + +export function activityKindLabel(kind: ActivityKind): string { + return KIND_STYLES[kind].label; +} + +export function activityKindLegendLabel(kind: 
ActivityKind): string { + return KIND_STYLES[kind].legendLabel; +} + +export function activityKindColor(kind: ActivityKind): string { + return KIND_STYLES[kind].fill; +} + +export const ALL_ACTIVITY_KINDS = Object.keys(KIND_STYLES) as ActivityKind[]; + +function testIdFor(activity: RunActivity): string { + return `activity-bar-${activity.id.replace(/[^a-zA-Z0-9_-]/g, "-")}`; +} + +export function ActivityBar({ + item, + selected, + highlighted, + current, + relation, + onClick, + onHoverStart, + onHoverEnd, +}: { + item: ActivityStackItem; + selected: boolean; + highlighted: boolean; + current: boolean; + relation: "focused" | "related" | "dimmed" | "none"; + onClick: (activity: RunActivity) => void; + onHoverStart: (activity: RunActivity) => void; + onHoverEnd: () => void; +}) { + const { activity, leftPct, widthPct } = item; + const styles = KIND_STYLES[activity.kind]; + const isMarker = activity.isInstant; + + return ( + + ); +} diff --git a/ergon-dashboard/src/features/activity/components/ActivityStackTimeline.tsx b/ergon-dashboard/src/features/activity/components/ActivityStackTimeline.tsx new file mode 100644 index 00000000..f2b84d13 --- /dev/null +++ b/ergon-dashboard/src/features/activity/components/ActivityStackTimeline.tsx @@ -0,0 +1,389 @@ +"use client"; + +import { useMemo, useState } from "react"; + +import type { GraphMutationDto } from "@/features/graph/contracts/graphMutations"; +import { ACTIVITY_BAND_ORDER, stackActivities } from "@/features/activity/stackLayout"; +import type { ActivityBand, RunActivity } from "@/features/activity/types"; +import { resolveCurrentActivityId } from "@/features/activity/currentActivity"; +import { formatClockTime } from "@/lib/timeFormat"; +import { ActivityBar, activityKindLegendLabel, activityKindColor } from "./ActivityBar"; + +interface ActivityStackTimelineProps { + activities: RunActivity[]; + mutations: GraphMutationDto[]; + currentSequence: number; + selectedTaskId: string | null; + selectedActivityId: string 
| null; + onActivityClick: (activity: RunActivity) => void; +} + +const ROW_HEIGHT = 31; +const BAND_LABELS: Record = { + work: { + title: "Work spans", + note: "Executions and sandbox lifetimes.", + }, + graph: { + title: "Graph changes", + note: "Node and edge mutations.", + }, + tools: { + title: "Tools / context", + note: "Tool calls, commands, observations.", + }, + communication: { + title: "Communication", + note: "Messages and coordination.", + }, + outputs: { + title: "Outputs / evals", + note: "Artifacts, scores, pass/fail.", + }, +}; +const STACK_ACTIVITY_KINDS = [ + "execution", + "graph", + "context", + "sandbox", + "message", + "artifact", + "evaluation", +] as const; + +function timePositionPct(timestamp: string, startMs: number, endMs: number): number | null { + const ms = Date.parse(timestamp); + if (!Number.isFinite(ms)) return null; + const spanMs = Math.max(1, endMs - startMs); + return Math.min(100, Math.max(0, ((ms - startMs) / spanMs) * 100)); +} + +function lineageValueMatches( + a: string | null | undefined, + b: string | null | undefined, +): boolean { + return Boolean(a && b && a === b); +} + +function areActivitiesRelated(a: RunActivity, b: RunActivity): boolean { + if (a.id === b.id) return true; + return ( + lineageValueMatches(a.lineage.taskExecutionId, b.lineage.taskExecutionId) || + lineageValueMatches(a.lineage.sandboxId, b.lineage.sandboxId) || + lineageValueMatches(a.lineage.threadId, b.lineage.threadId) || + lineageValueMatches(a.lineage.taskId, b.lineage.taskId) + ); +} + +function debugPreview(activity: RunActivity): string { + return JSON.stringify( + { + kind: activity.kind, + band: activity.band, + label: activity.label, + source: activity.debug.source, + lineage: activity.lineage, + metadata: activity.metadata, + payload: activity.debug.payload, + }, + null, + 2, + ); +} + +function ActivityLineageCard({ + activity, + related, +}: { + activity: RunActivity; + related: RunActivity[]; +}) { + const relatedSummary = related + 
.filter((candidate) => candidate.id !== activity.id) + .slice(0, 6); + + return ( +
+
+ Lineage +
+
+ {activity.kind}: {activity.label} +
+
+ Band: {activity.band} + Source: {activity.debug.source} + Task: {activity.lineage.taskId ?? "—"} + Execution: {activity.lineage.taskExecutionId ?? "—"} + Sandbox: {activity.lineage.sandboxId ?? "—"} + Seq: {activity.sequence ?? "—"} +
+ {relatedSummary.length > 0 && ( +
+
+ Related events +
+
    + {relatedSummary.map((candidate) => ( +
  • + {candidate.kind} + {" · "} + {candidate.label} +
  • + ))} +
+
+ )} +
+ + Raw payload + + + {debugPreview(activity)} + +
+
+ ); +} + +export function ActivityStackTimeline({ + activities, + mutations, + currentSequence, + selectedTaskId, + selectedActivityId, + onActivityClick, +}: ActivityStackTimelineProps) { + const [hoveredActivityId, setHoveredActivityId] = useState(null); + const layout = useMemo(() => stackActivities(activities), [activities]); + const maxSequence = mutations.length > 0 ? mutations[mutations.length - 1].sequence : 0; + const minSequence = mutations.length > 0 ? mutations[0].sequence : 0; + const currentMutation = mutations.find((mutation) => mutation.sequence === currentSequence); + const hasMutations = mutations.length > 0; + const isReplayLocked = currentSequence > 0; + const snapshotLeftPct = currentMutation + ? timePositionPct(currentMutation.created_at, layout.startMs, layout.endMs) + : null; + const currentActivityId = resolveCurrentActivityId( + activities, + currentMutation?.created_at ?? null, + currentMutation?.sequence ?? null, + ); + + if (activities.length === 0) { + return ( +
+ No activity has been recorded for this run yet. +
+ ); + } + + const timeSlots = 8; + const timeRange = layout.endMs - layout.startMs; + const timeLabels = Array.from({ length: timeSlots }, (_, i) => { + const ms = layout.startMs + (timeRange / (timeSlots - 1)) * i; + return formatClockTime(ms); + }); + const focusActivity = + activities.find((activity) => activity.id === hoveredActivityId) ?? + activities.find((activity) => activity.id === selectedActivityId) ?? + null; + const relatedActivities = focusActivity + ? activities.filter((activity) => areActivitiesRelated(focusActivity, activity)) + : []; + const relatedActivityIds = new Set(relatedActivities.map((activity) => activity.id)); + + return ( +
+ {/* Header bar */} +
+
+
+ Concurrent execution{" "} + + bars are task attempts; dots are graph snapshots. + +
+ + {!isReplayLocked && ( + + + Live · auto-tail + + )} + + + seq {minSequence} — {maxSequence || currentMutation?.sequence || currentSequence} · {isReplayLocked ? "replay" : "streaming"} + + + {isReplayLocked && ( + + graph locked · seq {currentSequence} + + )} + +
+ + {/* Kind legend */} +
+ + + Span + + + + Point event + + {STACK_ACTIVITY_KINDS.map((kind) => ( + + + {activityKindLegendLabel(kind)} + + ))} +
+
+ + {focusActivity && ( + + )} + + {/* Stack content */} +
+
+
+
Trace spans
+ Band = semantic category. Sub-row = visual overlap. +
+ +
+ {timeLabels.map((label, i) => ( + + {label} + {i === timeSlots - 2 && !isReplayLocked && · now} + + ))} +
+
+ +
+ {ACTIVITY_BAND_ORDER.map((band) => { + const bandLayout = layout.bands.find((entry) => entry.band === band); + if (!bandLayout) return null; + const bandItems = layout.items.filter((item) => item.activity.band === band); + const labels = BAND_LABELS[band]; + return ( +
+
+
+ {labels.title} +
+
{labels.note}
+
+
+ {Array.from({ length: bandLayout.rowCount }).map((_, row) => ( +
+ ))} + + {bandItems.map((item) => { + const relation = !focusActivity + ? "none" + : item.activity.id === focusActivity.id + ? "focused" + : relatedActivityIds.has(item.activity.id) + ? "related" + : "dimmed"; + return ( +
+ setHoveredActivityId(activity.id)} + onHoverEnd={() => setHoveredActivityId(null)} + /> +
+ ); + })} + + {/* Snapshot pin (indigo) */} + {hasMutations && isReplayLocked && snapshotLeftPct !== null && ( + <> +
+
+ SEQ {currentSequence} +
+ + )} + + {/* NOW cursor (green, live mode) */} + {!isReplayLocked && ( + <> +
+
+ + NOW +
+ + )} +
+
+ ); + })} +
+ + {/* Footer hints */} +
+ {layout.rowCount} trace rows across {layout.bands.length} semantic bands + Hover = lineage focus + Click = inspect graph snapshot +
+
+
+ ); +} diff --git a/ergon-dashboard/src/features/activity/currentActivity.test.ts b/ergon-dashboard/src/features/activity/currentActivity.test.ts new file mode 100644 index 00000000..47b35657 --- /dev/null +++ b/ergon-dashboard/src/features/activity/currentActivity.test.ts @@ -0,0 +1,75 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import type { RunActivity } from "./types"; +import { resolveCurrentActivityId } from "./currentActivity"; + +function activity(id: string, startAt: string, sequence: number | null = null): RunActivity { + return { + id, + kind: "graph", + band: "graph", + label: id, + taskId: null, + sequence, + startAt, + endAt: null, + isInstant: true, + actor: null, + sourceKind: "graph.mutation", + metadata: {}, + lineage: {}, + debug: { source: "graph.mutation", payload: { id } }, + }; +} + +test("resolveCurrentActivityId chooses latest activity at or before cursor time", () => { + assert.equal( + resolveCurrentActivityId( + [ + activity("before", "2026-04-26T12:00:05.000Z"), + activity("current", "2026-04-26T12:00:08.000Z"), + activity("after", "2026-04-26T12:00:09.000Z"), + ], + "2026-04-26T12:00:08.500Z", + ), + "current", + ); +}); + +test("resolveCurrentActivityId breaks timestamp ties by highest graph sequence", () => { + assert.equal( + resolveCurrentActivityId( + [ + activity("older-seq", "2026-04-26T12:00:08.000Z", 10), + activity("newer-seq", "2026-04-26T12:00:08.000Z", 14), + ], + "2026-04-26T12:00:08.000Z", + ), + "newer-seq", + ); +}); + +test("resolveCurrentActivityId does not choose future graph sequence at same timestamp", () => { + assert.equal( + resolveCurrentActivityId( + [ + activity("current-seq", "2026-04-26T12:00:08.000Z", 10), + activity("future-seq", "2026-04-26T12:00:08.000Z", 14), + ], + "2026-04-26T12:00:08.000Z", + 10, + ), + "current-seq", + ); +}); + +test("resolveCurrentActivityId returns null before the first activity", () => { + assert.equal( + resolveCurrentActivityId( + 
[activity("after", "2026-04-26T12:00:09.000Z")], + "2026-04-26T12:00:08.000Z", + ), + null, + ); +}); diff --git a/ergon-dashboard/src/features/activity/currentActivity.ts b/ergon-dashboard/src/features/activity/currentActivity.ts new file mode 100644 index 00000000..6308e694 --- /dev/null +++ b/ergon-dashboard/src/features/activity/currentActivity.ts @@ -0,0 +1,45 @@ +import type { RunActivity } from "./types"; + +function parseTime(value: string): number { + const parsed = Date.parse(value); + return Number.isFinite(parsed) ? parsed : Number.NEGATIVE_INFINITY; +} + +export function resolveCurrentActivityId( + activities: RunActivity[], + currentTimestamp: string | null, + currentSequence: number | null = null, +): string | null { + if (!currentTimestamp) return null; + const currentMs = Date.parse(currentTimestamp); + if (!Number.isFinite(currentMs)) return null; + + let selected: RunActivity | null = null; + let selectedMs = Number.NEGATIVE_INFINITY; + for (const activity of activities) { + const activityMs = parseTime(activity.startAt); + if (activityMs > currentMs) continue; + if ( + currentSequence !== null && + activity.sequence !== null && + activity.sequence > currentSequence + ) { + continue; + } + if ( + activityMs > selectedMs || + (activityMs === selectedMs && + (activity.sequence ?? Number.NEGATIVE_INFINITY) > + (selected?.sequence ?? Number.NEGATIVE_INFINITY)) || + (activityMs === selectedMs && + (activity.sequence ?? Number.NEGATIVE_INFINITY) === + (selected?.sequence ?? Number.NEGATIVE_INFINITY) && + (!selected || activity.id > selected.id)) + ) { + selected = activity; + selectedMs = activityMs; + } + } + + return selected?.id ?? 
null; +} diff --git a/ergon-dashboard/src/features/activity/goldenFixture.test.ts b/ergon-dashboard/src/features/activity/goldenFixture.test.ts new file mode 100644 index 00000000..c8eecbe6 --- /dev/null +++ b/ergon-dashboard/src/features/activity/goldenFixture.test.ts @@ -0,0 +1,63 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import fixture from "../../../tests/fixtures/mas-runs/concurrent-mas-run.json"; +import { parseGraphMutationDtoArray } from "@/features/graph/contracts/graphMutations"; +import { replayToSequence } from "@/features/graph/state/graphMutationReducer"; +import { buildRunEvents } from "@/lib/runEvents"; +import { deserializeRunState } from "@/lib/runState"; +import type { WorkflowRunState } from "@/lib/types"; +import { buildRunActivities } from "./buildRunActivities"; +import { stackActivities } from "./stackLayout"; + +function emptyRunStateFrom(runState: WorkflowRunState): WorkflowRunState { + return { + ...runState, + tasks: new Map(), + resourcesByTask: new Map(), + executionsByTask: new Map(), + sandboxesByTask: new Map(), + threads: [], + contextEventsByTask: new Map(), + evaluationsByTask: new Map(), + totalTasks: 0, + totalLeafTasks: 0, + completedTasks: 0, + runningTasks: 0, + failedTasks: 0, + cancelledTasks: 0, + edges: new Map(), + annotationsByTarget: new Map(), + unhandledMutations: [], + }; +} + +test("golden concurrent fixture replays the whole graph at selected sequence and stacks overlapping activity", () => { + const runState = deserializeRunState(fixture.runState); + const mutations = parseGraphMutationDtoArray(fixture.mutations); + const checkpoint = fixture.checkpoints.find((entry) => entry.sequence === 14); + assert.ok(checkpoint); + + const displayState = replayToSequence( + mutations, + checkpoint.sequence, + emptyRunStateFrom(runState), + new Map(), + ); + const activities = buildRunActivities({ + runState, + events: buildRunEvents(runState), + mutations, + currentSequence: 
checkpoint.sequence, + }); + const stack = stackActivities(activities); + + assert.deepEqual( + new Set(displayState.tasks.keys()), + new Set(checkpoint.expectedTaskIds), + ); + assert.ok(stack.maxConcurrency >= checkpoint.expectedMaxConcurrency); + assert.ok(activities.some((activity) => activity.kind === "context")); + assert.ok(activities.some((activity) => activity.kind === "artifact")); + assert.ok(activities.some((activity) => activity.kind === "evaluation")); +}); diff --git a/ergon-dashboard/src/features/activity/snapshotSequence.test.ts b/ergon-dashboard/src/features/activity/snapshotSequence.test.ts new file mode 100644 index 00000000..0e61e03c --- /dev/null +++ b/ergon-dashboard/src/features/activity/snapshotSequence.test.ts @@ -0,0 +1,134 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import type { GraphMutationDto } from "@/features/graph/contracts/graphMutations"; +import type { RunActivity } from "./types"; +import { resolveActivitySnapshotSequence } from "./snapshotSequence"; + +function activity(overrides: Partial = {}): RunActivity { + return { + id: "activity-1", + kind: "execution", + band: "work", + label: "Activity", + taskId: "task-1", + sequence: null, + startAt: "2026-04-26T12:00:10.000Z", + endAt: null, + isInstant: true, + actor: null, + sourceKind: "execution.span", + metadata: {}, + lineage: { taskId: "task-1", taskExecutionId: "activity-1" }, + ...overrides, + debug: overrides.debug ?? 
{ source: "execution.span", payload: { id: "activity-1" } }, + }; +} + +function mutation(sequence: number, createdAt: string): GraphMutationDto { + return { + id: "00000000-0000-4000-8000-000000000001", + run_id: "00000000-0000-4000-8000-000000000002", + sequence, + mutation_type: "node.added", + target_type: "node", + target_id: "00000000-0000-4000-8000-000000000003", + actor: "system", + old_value: null, + new_value: {}, + reason: null, + created_at: createdAt, + }; +} + +test("uses explicit activity sequence when present", () => { + const result = resolveActivitySnapshotSequence( + activity({ sequence: 7, startAt: "not-a-date" }), + [mutation(1, "2026-04-26T12:00:00.000Z")], + ); + + assert.equal(result, 7); +}); + +test("uses nearest mutation at or before activity start time when sequence is absent", () => { + const result = resolveActivitySnapshotSequence( + activity({ startAt: "2026-04-26T12:00:10.000Z" }), + [ + mutation(1, "2026-04-26T12:00:00.000Z"), + mutation(2, "2026-04-26T12:00:05.000Z"), + mutation(3, "2026-04-26T12:00:15.000Z"), + ], + ); + + assert.equal(result, 2); +}); + +test("uses matching mutation timestamp and highest sequence for timestamp ties", () => { + const result = resolveActivitySnapshotSequence( + activity({ startAt: "2026-04-26T12:00:10.000Z" }), + [ + mutation(1, "2026-04-26T12:00:00.000Z"), + mutation(2, "2026-04-26T12:00:10.000Z"), + mutation(3, "2026-04-26T12:00:10.000Z"), + ], + ); + + assert.equal(result, 3); +}); + +test("uses nearest prior timestamp even when mutation timestamps are not monotonic", () => { + const result = resolveActivitySnapshotSequence( + activity({ startAt: "2026-04-26T12:00:10.000Z" }), + [ + mutation(1, "2026-04-26T12:00:00.000Z"), + mutation(2, "2026-04-26T12:00:15.000Z"), + mutation(3, "2026-04-26T12:00:05.000Z"), + ], + ); + + assert.equal(result, 3); +}); + +test("ignores invalid mutation timestamps while considering later valid candidates", () => { + const result = resolveActivitySnapshotSequence( 
+ activity({ startAt: "2026-04-26T12:00:10.000Z" }), + [ + mutation(1, "2026-04-26T12:00:00.000Z"), + mutation(2, "2026-04-26T12:00:15.000Z"), + mutation(3, "not-a-date"), + mutation(4, "2026-04-26T12:00:05.000Z"), + ], + ); + + assert.equal(result, 4); +}); + +test("returns null when no mutation can represent activity time", () => { + const result = resolveActivitySnapshotSequence( + activity({ startAt: "2026-04-26T12:00:10.000Z" }), + [mutation(1, "2026-04-26T12:00:15.000Z")], + ); + + assert.equal(result, null); +}); + +test("ignores invalid mutation timestamps and returns null for invalid activity timestamps", () => { + assert.equal( + resolveActivitySnapshotSequence( + activity({ startAt: "not-a-date" }), + [mutation(1, "2026-04-26T12:00:00.000Z")], + ), + null, + ); + + assert.equal( + resolveActivitySnapshotSequence( + activity({ startAt: "2026-04-26T12:00:10.000Z" }), + [ + mutation(1, "not-a-date"), + mutation(2, "2026-04-26T12:00:05.000Z"), + ], + ), + 2, + ); +}); diff --git a/ergon-dashboard/src/features/activity/snapshotSequence.ts b/ergon-dashboard/src/features/activity/snapshotSequence.ts new file mode 100644 index 00000000..0e3a3c4d --- /dev/null +++ b/ergon-dashboard/src/features/activity/snapshotSequence.ts @@ -0,0 +1,28 @@ +import type { GraphMutationDto } from "@/features/graph/contracts/graphMutations"; +import type { RunActivity } from "./types"; + +export function resolveActivitySnapshotSequence( + activity: RunActivity, + mutations: GraphMutationDto[], +): number | null { + if (activity.sequence !== null) return activity.sequence; + + const activityMs = Date.parse(activity.startAt); + if (!Number.isFinite(activityMs)) return null; + + let selected: GraphMutationDto | null = null; + let selectedMs = Number.NEGATIVE_INFINITY; + for (const mutation of mutations) { + const mutationMs = Date.parse(mutation.created_at); + if (!Number.isFinite(mutationMs)) continue; + if (mutationMs > activityMs) continue; + if ( + mutationMs > selectedMs || + 
(mutationMs === selectedMs && (!selected || mutation.sequence > selected.sequence)) + ) { + selected = mutation; + selectedMs = mutationMs; + } + } + return selected?.sequence ?? null; +} diff --git a/ergon-dashboard/src/features/activity/stackLayout.test.ts b/ergon-dashboard/src/features/activity/stackLayout.test.ts new file mode 100644 index 00000000..b1dfaefb --- /dev/null +++ b/ergon-dashboard/src/features/activity/stackLayout.test.ts @@ -0,0 +1,156 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import type { RunActivity } from "./types"; +import { stackActivities } from "./stackLayout"; + +function activity( + id: string, + startAt: string, + endAt: string | null, + actor: string | null = null, +): RunActivity { + return { + id, + kind: "execution", + band: "work", + label: id, + taskId: id, + sequence: null, + startAt, + endAt, + isInstant: endAt === null, + actor, + sourceKind: "execution.span", + metadata: {}, + lineage: { taskId: id, taskExecutionId: id }, + debug: { source: "execution.span", payload: { id } }, + }; +} + +function marker(id: string, startAt: string): RunActivity { + return { + id, + kind: "graph", + band: "graph", + label: id, + taskId: id, + sequence: null, + startAt, + endAt: null, + isInstant: true, + actor: null, + sourceKind: "graph.mutation", + metadata: {}, + lineage: { taskId: id }, + debug: { source: "graph.mutation", payload: { id } }, + }; +} + +test("stackActivities allocates rows by time overlap and reuses rows", () => { + const layout = stackActivities([ + activity("a", "2026-04-26T12:00:00.000Z", "2026-04-26T12:00:10.000Z", "agent-a"), + activity("b", "2026-04-26T12:00:05.000Z", "2026-04-26T12:00:12.000Z", "agent-b"), + activity("c", "2026-04-26T12:00:12.000Z", "2026-04-26T12:00:15.000Z", "agent-a"), + ]); + + const rowById = new Map(layout.items.map((item) => [item.activity.id, item.row])); + + assert.equal(layout.rowCount, 2); + assert.equal(layout.maxConcurrency, 2); + 
assert.equal(rowById.get("a"), rowById.get("c")); + assert.notEqual(rowById.get("a"), rowById.get("b")); +}); + +test("stackActivities reports three-way concurrency and does not group by actor", () => { + const layout = stackActivities([ + activity("a", "2026-04-26T12:00:00.000Z", "2026-04-26T12:00:20.000Z", "agent-a"), + activity("b", "2026-04-26T12:00:05.000Z", "2026-04-26T12:00:21.000Z", "agent-b"), + activity("c", "2026-04-26T12:00:10.000Z", "2026-04-26T12:00:15.000Z", "agent-a"), + ]); + + const rowsForAgentA = layout.items + .filter((item) => item.activity.actor === "agent-a") + .map((item) => item.row); + + assert.equal(layout.maxConcurrency, 3); + assert.deepEqual(new Set(rowsForAgentA).size, 2); +}); + +test("stackActivities computes point-in-time concurrency instead of interval intersections", () => { + const layout = stackActivities([ + activity("long", "2026-04-26T12:00:00.000Z", "2026-04-26T12:00:30.000Z"), + activity("early", "2026-04-26T12:00:05.000Z", "2026-04-26T12:00:10.000Z"), + activity("late", "2026-04-26T12:00:20.000Z", "2026-04-26T12:00:25.000Z"), + ]); + + assert.equal(layout.maxConcurrency, 2); +}); + +test("stackActivities stacks instant markers when their visual footprints overlap", () => { + const layout = stackActivities([ + activity("span", "2026-04-26T12:00:00.000Z", "2026-04-26T12:00:30.000Z"), + marker("m1", "2026-04-26T12:00:05.000Z"), + marker("m2", "2026-04-26T12:00:05.050Z"), + marker("m3", "2026-04-26T12:00:10.000Z"), + ]); + const rowById = new Map(layout.items.map((item) => [item.activity.id, item.row])); + + assert.equal(layout.bands.find((band) => band.band === "work")?.rowCount, 1); + assert.equal(layout.bands.find((band) => band.band === "graph")?.rowCount, 2); + assert.equal(layout.rowCount, 3); + assert.equal(layout.maxConcurrency, 1); + assert.equal(rowById.get("m1"), 0); + assert.equal(rowById.get("m2"), 1); + assert.equal(rowById.get("m3"), 0); +}); + +test("stackActivities prevents marker and duration item covering 
inside non-work bands", () => { + const layout = stackActivities([ + { + ...activity("tool-span", "2026-04-26T12:00:05.000Z", "2026-04-26T12:00:10.000Z"), + kind: "context", + band: "tools", + }, + { + ...marker("tool-point", "2026-04-26T12:00:07.000Z"), + kind: "context", + band: "tools", + }, + { + ...marker("message-point", "2026-04-26T12:00:07.000Z"), + kind: "message", + band: "communication", + }, + { + ...marker("artifact-point", "2026-04-26T12:00:07.050Z"), + kind: "artifact", + band: "communication", + }, + ]); + const bandByName = new Map(layout.bands.map((band) => [band.band, band])); + const rowById = new Map(layout.items.map((item) => [item.activity.id, item.row])); + + assert.equal(bandByName.get("tools")?.rowCount, 2); + assert.equal(rowById.get("tool-span"), 0); + assert.equal(rowById.get("tool-point"), 1); + assert.equal(bandByName.get("communication")?.rowCount, 2); + assert.notEqual(rowById.get("message-point"), rowById.get("artifact-point")); +}); + +test("stackActivities packs rows independently inside semantic bands", () => { + const layout = stackActivities([ + { ...activity("work-a", "2026-04-26T12:00:00.000Z", "2026-04-26T12:00:20.000Z"), band: "work" }, + { ...activity("work-b", "2026-04-26T12:00:05.000Z", "2026-04-26T12:00:15.000Z"), band: "work" }, + { ...activity("tool-a", "2026-04-26T12:00:05.000Z", "2026-04-26T12:00:15.000Z"), kind: "context", band: "tools" }, + ]); + + const bandByName = new Map(layout.bands.map((band) => [band.band, band])); + const rowById = new Map(layout.items.map((item) => [item.activity.id, item.row])); + + assert.equal(bandByName.get("work")?.rowCount, 2); + assert.equal(bandByName.get("tools")?.rowCount, 1); + assert.equal(rowById.get("work-a"), 0); + assert.equal(rowById.get("work-b"), 1); + assert.equal(rowById.get("tool-a"), 0); +}); diff --git a/ergon-dashboard/src/features/activity/stackLayout.ts b/ergon-dashboard/src/features/activity/stackLayout.ts new file mode 100644 index 00000000..67c3959b --- 
/dev/null +++ b/ergon-dashboard/src/features/activity/stackLayout.ts @@ -0,0 +1,134 @@ +import type { ActivityBand, ActivityStackLayout, RunActivity } from "./types"; + +export interface StackActivityOptions { + minMarkerWidthPct?: number; + minSpanWidthPct?: number; + markerDurationMs?: number; +} + +interface TimedActivity { + activity: RunActivity; + startMs: number; + endMs: number; +} + +const DEFAULT_MARKER_DURATION_MS = 250; +const DEFAULT_MIN_MARKER_WIDTH_PCT = 1.6; +const ROW_GUTTER_PCT = 0.15; +export const ACTIVITY_BAND_ORDER: ActivityBand[] = [ + "work", + "graph", + "tools", + "communication", + "outputs", +]; + +function firstFreeRow(rowEnds: number[], start: number): number { + const row = rowEnds.findIndex((end) => end <= start); + return row === -1 ? rowEnds.length : row; +} + +function parseTime(value: string): number { + const parsed = Date.parse(value); + return Number.isFinite(parsed) ? parsed : 0; +} + +function toTimedActivity( + activity: RunActivity, + markerDurationMs: number, +): TimedActivity { + const startMs = parseTime(activity.startAt); + const rawEndMs = activity.endAt ? parseTime(activity.endAt) : startMs; + const endMs = + activity.isInstant || rawEndMs <= startMs + ? startMs + markerDurationMs + : rawEndMs; + return { activity, startMs, endMs }; +} + +function computeMaxSpanConcurrency(timed: TimedActivity[]): number { + const events = timed + .filter((item) => !item.activity.isInstant) + .flatMap((item) => [ + { at: item.startMs, delta: 1 }, + { at: item.endMs, delta: -1 }, + ]) + .sort((a, b) => a.at - b.at || a.delta - b.delta); + if (events.length === 0) return 0; + let max = 0; + let active = 0; + for (const event of events) { + active += event.delta; + max = Math.max(max, active); + } + return max; +} + +export function stackActivities( + activities: RunActivity[], + options: StackActivityOptions = {}, +): ActivityStackLayout { + const minMarkerWidthPct = options.minMarkerWidthPct ?? 
DEFAULT_MIN_MARKER_WIDTH_PCT; + const minSpanWidthPct = options.minSpanWidthPct ?? 0.75; + const markerDurationMs = options.markerDurationMs ?? DEFAULT_MARKER_DURATION_MS; + const timed = activities + .map((activity) => toTimedActivity(activity, markerDurationMs)) + .sort( + (a, b) => + a.startMs - b.startMs || + a.endMs - b.endMs || + a.activity.id.localeCompare(b.activity.id), + ); + + if (timed.length === 0) { + return { items: [], bands: [], rowCount: 0, startMs: 0, endMs: 0, maxConcurrency: 0 }; + } + + const startMs = Math.min(...timed.map((item) => item.startMs)); + const endMs = Math.max(...timed.map((item) => item.endMs)); + const spanMs = Math.max(1, endMs - startMs); + const items = []; + const bands = []; + + for (const band of ACTIVITY_BAND_ORDER) { + const bandTimed = timed.filter((item) => item.activity.band === band); + if (bandTimed.length === 0) continue; + + const rowEnds: number[] = []; + const bandItems = bandTimed.map(({ activity, startMs: itemStartMs, endMs: itemEndMs }) => { + const leftPct = ((itemStartMs - startMs) / spanMs) * 100; + const rawWidthPct = ((itemEndMs - itemStartMs) / spanMs) * 100; + const widthPct = Math.max( + activity.isInstant ? 
minMarkerWidthPct : minSpanWidthPct, + rawWidthPct, + ); + const row = firstFreeRow(rowEnds, leftPct); + rowEnds[row] = leftPct + widthPct + ROW_GUTTER_PCT; + + return { activity, row, leftPct, widthPct }; + }); + + const rowCount = Math.max(1, rowEnds.length); + bands.push({ band, rowCount }); + items.push(...bandItems); + } + + const maxConcurrency = computeMaxSpanConcurrency(timed); + const rowCount = bands.reduce((sum, band) => sum + band.rowCount, 0); + + return { + items: items.sort( + (a, b) => + ACTIVITY_BAND_ORDER.indexOf(a.activity.band) - + ACTIVITY_BAND_ORDER.indexOf(b.activity.band) || + a.activity.startAt.localeCompare(b.activity.startAt) || + Number(a.activity.isInstant) - Number(b.activity.isInstant) || + a.activity.id.localeCompare(b.activity.id), + ), + bands, + rowCount, + startMs, + endMs, + maxConcurrency, + }; +} diff --git a/ergon-dashboard/src/features/activity/types.ts b/ergon-dashboard/src/features/activity/types.ts new file mode 100644 index 00000000..9388bda4 --- /dev/null +++ b/ergon-dashboard/src/features/activity/types.ts @@ -0,0 +1,73 @@ +import type { RunEventKind } from "@/lib/runEvents"; + +export type ActivityKind = + | "execution" + | "graph" + | "message" + | "artifact" + | "evaluation" + | "context" + | "sandbox"; + +export type ActivityBand = + | "work" + | "graph" + | "tools" + | "communication" + | "outputs"; + +export interface ActivityLineage { + taskId?: string | null; + taskExecutionId?: string | null; + sandboxId?: string | null; + agentId?: string | null; + workerBindingKey?: string | null; + threadId?: string | null; +} + +export interface RunActivity { + id: string; + kind: ActivityKind; + band: ActivityBand; + label: string; + taskId: string | null; + sequence: number | null; + startAt: string; + endAt: string | null; + isInstant: boolean; + actor: string | null; + sourceKind: + | RunEventKind + | "execution.span" + | "sandbox.span" + | "sandbox.command" + | "context.span" + | "graph.mutation"; + metadata: Record; 
+ lineage: ActivityLineage; + debug: { + source: string; + payload: unknown; + }; +} + +export interface ActivityStackItem { + activity: RunActivity; + row: number; + leftPct: number; + widthPct: number; +} + +export interface ActivityBandLayout { + band: ActivityBand; + rowCount: number; +} + +export interface ActivityStackLayout { + items: ActivityStackItem[]; + bands: ActivityBandLayout[]; + rowCount: number; + startMs: number; + endMs: number; + maxConcurrency: number; +} diff --git a/ergon-dashboard/src/features/graph/components/ContainerNode.tsx b/ergon-dashboard/src/features/graph/components/ContainerNode.tsx index dc580d46..e0c6b5f4 100644 --- a/ergon-dashboard/src/features/graph/components/ContainerNode.tsx +++ b/ergon-dashboard/src/features/graph/components/ContainerNode.tsx @@ -3,10 +3,6 @@ import { memo } from "react"; import { Handle, Position } from "@xyflow/react"; import type { TaskState, TaskStatus } from "@/lib/types"; -import { TaskGraphStatusIcon } from "@/components/dag/TaskGraphStatusIcon"; -import { getLevelColor } from "@/features/graph/theme/levelColors"; -import { getTaskTimingPrimaryLine } from "@/features/graph/utils/taskTiming"; -import { tokensFor } from "@/lib/statusTokens"; interface ContainerNodeProps { task: TaskState; @@ -15,24 +11,26 @@ interface ContainerNodeProps { onClick?: (taskId: string) => void; selected?: boolean; dimmed?: boolean; + highlighted?: boolean; containerWidth: number; containerHeight: number; layoutDirection?: "TB" | "LR"; maxGraphDepth?: number; } -function ContainerNodeComponent({ - task, - isExpanded, - onToggleExpand, - onClick, - selected = false, - dimmed = false, - containerWidth, - containerHeight, - layoutDirection = "LR", - maxGraphDepth, -}: ContainerNodeProps) { +function ContainerNodeComponent(props: ContainerNodeProps) { + const { + task, + isExpanded, + onToggleExpand, + onClick, + selected = false, + dimmed = false, + highlighted = false, + containerWidth, + containerHeight, + layoutDirection = 
"LR", + } = props; const handleClick = (e: React.MouseEvent) => { e.stopPropagation(); onClick?.(task.id); @@ -43,112 +41,96 @@ function ContainerNodeComponent({ onToggleExpand(task.id); }; - const tokens = tokensFor(task.status); - const borderColor = tokens.border; - - const depthForPalette = Math.max(maxGraphDepth ?? task.level, task.level); - const levelHex = getLevelColor(task.level, depthForPalette); - const targetPos = layoutDirection === "LR" ? Position.Left : Position.Top; const sourcePos = layoutDirection === "LR" ? Position.Right : Position.Bottom; - const timingLine = getTaskTimingPrimaryLine(task); + + const isRunning = task.status === ("running" as TaskStatus); + const borderColor = isRunning ? "var(--status-running)" : "#cdd3dc"; return (
+ {/* Header row */}
-
- -
- - {tokens.label} - -

- {task.name} -

- - {task.assignedWorkerName && ( - - - - - {task.assignedWorkerName} +
+ + {task.name} - )} - - - {task.childIds.length} subtask{task.childIds.length !== 1 ? "s" : ""} - +
- {timingLine && ( +
- {timingLine} + {task.childIds.length} subtask{task.childIds.length !== 1 ? "s" : ""} - )} - + + + + +
- {task.status === ("running" as TaskStatus) && !dimmed && ( -
+ {isRunning && !dimmed && ( +
)}
); diff --git a/ergon-dashboard/src/features/graph/components/LeafNode.tsx b/ergon-dashboard/src/features/graph/components/LeafNode.tsx index cda0e636..01d01e67 100644 --- a/ergon-dashboard/src/features/graph/components/LeafNode.tsx +++ b/ergon-dashboard/src/features/graph/components/LeafNode.tsx @@ -2,10 +2,6 @@ import { memo, useEffect, useState } from "react"; import type { TaskState, TaskStatus } from "@/lib/types"; -import { TaskGraphStatusIcon } from "@/components/dag/TaskGraphStatusIcon"; -import { getLevelColor } from "@/features/graph/theme/levelColors"; -import { getTaskTimingPrimaryLine } from "@/features/graph/utils/taskTiming"; -import { tokensFor } from "@/lib/statusTokens"; import { Handle, Position } from "@xyflow/react"; interface LeafNodeProps { @@ -19,42 +15,76 @@ interface LeafNodeProps { maxGraphDepth?: number; } -/** - * CornerBadge — slot-1-style corner status indicator. A solid circular badge - * ringed with white (or dark ring on dark mode) that overlaps the node's - * top-right corner; pulses on RUNNING. This replaces the floating - * `TaskGraphStatusIcon` which read as just another icon, not a status. - */ -function CornerBadge({ status }: { status: TaskStatus }) { - const tokens = tokensFor(status); +const STATUS_STYLES: Record< + string, + { bg: string; border: string; text: string } +> = { + completed: { + bg: "oklch(0.96 0.04 155)", + border: "oklch(0.85 0.10 155)", + text: "oklch(0.40 0.12 155)", + }, + running: { + bg: "oklch(0.97 0.04 80)", + border: "oklch(0.85 0.10 80)", + text: "oklch(0.42 0.12 65)", + }, + ready: { + bg: "oklch(0.97 0.03 240)", + border: "oklch(0.86 0.08 240)", + text: "oklch(0.40 0.12 240)", + }, + pending: { + bg: "#ffffff", + border: "#e2e6ec", + text: "#98a2b1", + }, + failed: { + bg: "oklch(0.97 0.04 22)", + border: "oklch(0.85 0.10 22)", + text: "oklch(0.40 0.16 22)", + }, +}; + +const FALLBACK_STYLE = STATUS_STYLES.pending; + +function getStatusStyle(status: string) { + return STATUS_STYLES[status] ?? 
FALLBACK_STYLE; +} + +function StatusDot({ status }: { status: string }) { + const style = getStatusStyle(status); + const isRunning = status === "running"; return ( -
- -
+ ); } -function LeafNodeComponent({ - task, - variant, - onClick, - selected = false, - dimmed = false, - highlighted = false, - layoutDirection = "LR", - maxGraphDepth, -}: LeafNodeProps) { +function LeafNodeComponent(props: LeafNodeProps) { + const { + task, + onClick, + selected = false, + dimmed = false, + highlighted = false, + layoutDirection = "LR", + } = props; const [isAnimating, setIsAnimating] = useState(false); const [prevStatus, setPrevStatus] = useState(task.status); const targetPos = layoutDirection === "LR" ? Position.Left : Position.Top; const sourcePos = layoutDirection === "LR" ? Position.Right : Position.Bottom; - const depthForPalette = Math.max(maxGraphDepth ?? task.level, task.level); - const levelHex = getLevelColor(task.level, depthForPalette); useEffect(() => { if (task.status !== prevStatus) { @@ -69,249 +99,72 @@ function LeafNodeComponent({ onClick?.(task.id); }; - const tokens = tokensFor(task.status); - const borderColor = tokens.border; - const bgColor = tokens.softBg; - const timingLine = getTaskTimingPrimaryLine(task); - - if (variant === "compact") { - const timingHint = timingLine ? `\n${timingLine}` : ""; - return ( -
- - - - {task.name} - - -
- ); - } - - if (variant === "standard") { - return ( -
- - - -
-
- - {task.status} - -
- -

- {task.name} -

+ const ss = getStatusStyle(task.status); - {task.assignedWorkerName && ( -
- - - - {task.assignedWorkerName} -
- )} - {timingLine && ( -

{timingLine}

- )} -
+ const statusLabel = + task.status === ("running" as TaskStatus) + ? `running${task.assignedWorkerName ? ` · ${task.assignedWorkerName}` : ""}` + : task.status; - - - {task.status === ("running" as TaskStatus) && !dimmed && ( -
- )} -
- ); - } - - // Full variant — matches original TaskNode rendering return (
- + + -
- {/* Header: Status + Level */} -
-
- - {task.status} - -
- - L{task.level} - -
- - {/* Task Name */} -

- {task.name} -

- - {/* Description */} - {task.description && task.description.length < 60 && ( -

- {task.description} -

- )} - - {/* Worker Assignment */} - {task.assignedWorkerName && ( -
- - - - {task.assignedWorkerName} -
- )} - {timingLine && ( -

{timingLine}

- )} - - {/* Leaf indicator */} - {task.isLeaf && ( -
- - Leaf task (no children) - - -
- )} +
+ {task.name} +
- {/* Children count indicator (collapsed container) */} - {!task.isLeaf && task.childIds.length > 0 && ( -
- - - - {task.childIds.length} subtasks -
- )} +
+ {statusLabel}
- - {/* Running pulse ring */} - {task.status === ("running" as TaskStatus) && !dimmed && ( -
- )}
); } diff --git a/ergon-dashboard/src/features/graph/components/MutationTimeline.tsx b/ergon-dashboard/src/features/graph/components/MutationTimeline.tsx index 13479c4d..261ef7f6 100644 --- a/ergon-dashboard/src/features/graph/components/MutationTimeline.tsx +++ b/ergon-dashboard/src/features/graph/components/MutationTimeline.tsx @@ -2,6 +2,7 @@ import { useCallback, useEffect, useMemo, useRef, useState } from "react"; import type { GraphMutationDto } from "@/features/graph/contracts/graphMutations"; +import { formatClockTimeMs } from "@/lib/timeFormat"; interface MutationTimelineProps { mutations: GraphMutationDto[]; @@ -134,12 +135,7 @@ export function MutationTimeline({ } const formattedTime = currentMutation - ? new Date(currentMutation.created_at).toLocaleTimeString("en-GB", { - hour: "2-digit", - minute: "2-digit", - second: "2-digit", - fractionalSecondDigits: 3, - }) + ? formatClockTimeMs(currentMutation.created_at) : "—"; const seqSpan = Math.max(1, maxSequence - minSequence); diff --git a/ergon-dashboard/src/features/graph/components/events/AssistantTextEvent.tsx b/ergon-dashboard/src/features/graph/components/events/AssistantTextEvent.tsx index 49a11245..da174719 100644 --- a/ergon-dashboard/src/features/graph/components/events/AssistantTextEvent.tsx +++ b/ergon-dashboard/src/features/graph/components/events/AssistantTextEvent.tsx @@ -1,4 +1,5 @@ import type { ContextEventPayload } from "@/lib/contracts/contextEvents"; +import { ContextEventCard, formatDuration } from "./ContextEventCard"; interface Props { payload: Extract; @@ -8,16 +9,13 @@ interface Props { export function AssistantTextEvent({ payload, startedAt, completedAt }: Props) { return ( -
- {startedAt && completedAt && ( - - {Math.round( - (new Date(completedAt).getTime() - new Date(startedAt).getTime()) / 100, - ) / 10} - s - - )} +

{payload.text}

-
+ ); } diff --git a/ergon-dashboard/src/features/graph/components/events/ContextEventCard.tsx b/ergon-dashboard/src/features/graph/components/events/ContextEventCard.tsx new file mode 100644 index 00000000..e9573b07 --- /dev/null +++ b/ergon-dashboard/src/features/graph/components/events/ContextEventCard.tsx @@ -0,0 +1,125 @@ +import type { ReactNode } from "react"; + +type Tone = "amber" | "blue" | "green" | "gray" | "indigo" | "purple" | "red"; + +const TONE_STYLES: Record = { + amber: { + border: "border-amber-200/80", + bg: "bg-amber-50/80", + pill: "bg-amber-100 text-amber-800 ring-amber-200", + text: "text-amber-800", + }, + blue: { + border: "border-sky-200/80", + bg: "bg-sky-50/80", + pill: "bg-sky-100 text-sky-800 ring-sky-200", + text: "text-sky-800", + }, + green: { + border: "border-emerald-200/80", + bg: "bg-emerald-50/80", + pill: "bg-emerald-100 text-emerald-800 ring-emerald-200", + text: "text-emerald-800", + }, + gray: { + border: "border-[var(--line)]", + bg: "bg-[var(--paper)]", + pill: "bg-[var(--card)] text-[var(--muted)] ring-[var(--line)]", + text: "text-[var(--muted)]", + }, + indigo: { + border: "border-indigo-200/80", + bg: "bg-indigo-50/80", + pill: "bg-indigo-100 text-indigo-800 ring-indigo-200", + text: "text-indigo-800", + }, + purple: { + border: "border-purple-200/80", + bg: "bg-purple-50/80", + pill: "bg-purple-100 text-purple-800 ring-purple-200", + text: "text-purple-800", + }, + red: { + border: "border-red-200/80", + bg: "bg-red-50/80", + pill: "bg-red-100 text-red-800 ring-red-200", + text: "text-red-800", + }, +}; + +export function formatDuration(startedAt: string | null, completedAt: string | null): string | null { + if (!startedAt || !completedAt) return null; + const durationMs = new Date(completedAt).getTime() - new Date(startedAt).getTime(); + if (!Number.isFinite(durationMs) || durationMs < 0) return null; + return `${Math.round(durationMs / 100) / 10}s`; +} + +export function ContextEventCard({ + tone, + title, + 
subtitle, + badge, + duration, + children, + payloadLabel, + payload, +}: { + tone: Tone; + title: string; + subtitle?: string | null; + badge?: string | null; + duration?: string | null; + children?: ReactNode; + payloadLabel?: string; + payload?: unknown; +}) { + const styles = TONE_STYLES[tone]; + + return ( +
+
+
+
+ + {title} + + {badge && ( + + {badge} + + )} +
+ {subtitle && ( +
+ {subtitle} +
+ )} +
+ {duration && ( + + {duration} + + )} +
+ + {children &&
{children}
} + + {payloadLabel && ( +
+ + {payloadLabel} + +
+            {typeof payload === "string" ? payload : JSON.stringify(payload, null, 2)}
+          
+
+ )} +
+ ); +} diff --git a/ergon-dashboard/src/features/graph/components/events/SystemPromptEvent.tsx b/ergon-dashboard/src/features/graph/components/events/SystemPromptEvent.tsx index 7305eecb..026a8a30 100644 --- a/ergon-dashboard/src/features/graph/components/events/SystemPromptEvent.tsx +++ b/ergon-dashboard/src/features/graph/components/events/SystemPromptEvent.tsx @@ -1,4 +1,5 @@ import type { ContextEventPayload } from "@/lib/contracts/contextEvents"; +import { ContextEventCard } from "./ContextEventCard"; interface Props { payload: Extract; @@ -6,9 +7,11 @@ interface Props { export function SystemPromptEvent({ payload }: Props) { return ( -
- System Prompt -
{payload.text}
-
+ ); } diff --git a/ergon-dashboard/src/features/graph/components/events/ThinkingEvent.tsx b/ergon-dashboard/src/features/graph/components/events/ThinkingEvent.tsx index 77df44ed..0dbf342f 100644 --- a/ergon-dashboard/src/features/graph/components/events/ThinkingEvent.tsx +++ b/ergon-dashboard/src/features/graph/components/events/ThinkingEvent.tsx @@ -1,4 +1,5 @@ import type { ContextEventPayload } from "@/lib/contracts/contextEvents"; +import { ContextEventCard, formatDuration } from "./ContextEventCard"; interface Props { payload: Extract; @@ -8,19 +9,13 @@ interface Props { export function ThinkingEvent({ payload, startedAt, completedAt }: Props) { return ( -
- - Thinking - {startedAt && completedAt && ( - - {Math.round( - (new Date(completedAt).getTime() - new Date(startedAt).getTime()) / 100, - ) / 10} - s - - )} - -

{payload.text}

-
+ +

{payload.text}

+
); } diff --git a/ergon-dashboard/src/features/graph/components/events/ToolCallEvent.tsx b/ergon-dashboard/src/features/graph/components/events/ToolCallEvent.tsx index f87c5167..f21e27c8 100644 --- a/ergon-dashboard/src/features/graph/components/events/ToolCallEvent.tsx +++ b/ergon-dashboard/src/features/graph/components/events/ToolCallEvent.tsx @@ -1,4 +1,5 @@ import type { ContextEventPayload } from "@/lib/contracts/contextEvents"; +import { ContextEventCard, formatDuration } from "./ContextEventCard"; interface Props { payload: Extract; @@ -8,19 +9,14 @@ interface Props { export function ToolCallEvent({ payload, startedAt, completedAt }: Props) { return ( -
- - {payload.tool_name} - {startedAt && completedAt && ( - - {Math.round( - (new Date(completedAt).getTime() - new Date(startedAt).getTime()) / 100, - ) / 10} - s - - )} - -
{JSON.stringify(payload.args, null, 2)}
-
+ ); } diff --git a/ergon-dashboard/src/features/graph/components/events/ToolResultEvent.tsx b/ergon-dashboard/src/features/graph/components/events/ToolResultEvent.tsx index 867c38a8..97aa4159 100644 --- a/ergon-dashboard/src/features/graph/components/events/ToolResultEvent.tsx +++ b/ergon-dashboard/src/features/graph/components/events/ToolResultEvent.tsx @@ -1,4 +1,5 @@ import type { ContextEventPayload } from "@/lib/contracts/contextEvents"; +import { ContextEventCard } from "./ContextEventCard"; interface Props { payload: Extract; @@ -6,20 +7,13 @@ interface Props { export function ToolResultEvent({ payload }: Props) { return ( -
- - {payload.tool_name} result - {payload.is_error && ( - error - )} - -
{JSON.stringify(payload.result, null, 2)}
-
+ ); } diff --git a/ergon-dashboard/src/features/graph/components/events/UserMessageEvent.tsx b/ergon-dashboard/src/features/graph/components/events/UserMessageEvent.tsx index 8dcf314e..515498f2 100644 --- a/ergon-dashboard/src/features/graph/components/events/UserMessageEvent.tsx +++ b/ergon-dashboard/src/features/graph/components/events/UserMessageEvent.tsx @@ -1,4 +1,5 @@ import type { ContextEventPayload } from "@/lib/contracts/contextEvents"; +import { ContextEventCard } from "./ContextEventCard"; interface Props { payload: Extract; @@ -6,13 +7,12 @@ interface Props { export function UserMessageEvent({ payload }: Props) { return ( -
- {payload.from_worker_key && ( - - from {payload.from_worker_key} - - )} +

{payload.text}

-
+
); } diff --git a/ergon-dashboard/src/features/graph/contracts/graphMutations.test.ts b/ergon-dashboard/src/features/graph/contracts/graphMutations.test.ts index 0c51a088..8225a7e1 100644 --- a/ergon-dashboard/src/features/graph/contracts/graphMutations.test.ts +++ b/ergon-dashboard/src/features/graph/contracts/graphMutations.test.ts @@ -9,9 +9,11 @@ import assert from "node:assert/strict"; import test from "node:test"; import { MutationTypeSchema } from "./graphMutations"; -import { applyGraphMutation } from "../state/graphMutationReducer"; +import { applyGraphMutation, createReplayInitialState, replayToSequence } from "../state/graphMutationReducer"; import type { WorkflowRunState } from "@/lib/types"; +import { TaskStatus } from "@/lib/types"; import type { DashboardGraphMutationData } from "@/lib/contracts/events"; +import type { GraphMutationDto } from "./graphMutations"; function emptyState(): WorkflowRunState { return { @@ -35,6 +37,7 @@ function emptyState(): WorkflowRunState { completedTasks: 0, runningTasks: 0, failedTasks: 0, + cancelledTasks: 0, finalScore: null, error: null, edges: new Map(), @@ -126,3 +129,312 @@ for (const mutationType of ALL_MUTATION_TYPES) { test("ALL_MUTATION_TYPES matches MutationTypeSchema.options (no stale snapshot)", () => { assert.deepEqual(ALL_MUTATION_TYPES, MutationTypeSchema.options); }); + +test("replay base preserves snapshot hierarchy while dependency edges remain dependencies", () => { + const runState = emptyState(); + runState.tasks = new Map([ + [ + "11111111-1111-4111-8111-111111111111", + { + id: "11111111-1111-4111-8111-111111111111", + name: "root", + description: "root", + status: TaskStatus.RUNNING, + parentId: null, + childIds: [ + "22222222-2222-4222-8222-222222222222", + "33333333-3333-4333-8333-333333333333", + ], + dependsOnIds: [], + assignedWorkerId: null, + assignedWorkerName: "parent", + startedAt: "2026-04-26T12:00:00.000Z", + completedAt: null, + isLeaf: false, + level: 0, + }, + ], + [ + 
"22222222-2222-4222-8222-222222222222", + { + id: "22222222-2222-4222-8222-222222222222", + name: "dependency", + description: "dependency", + status: TaskStatus.COMPLETED, + parentId: "11111111-1111-4111-8111-111111111111", + childIds: [], + dependsOnIds: [], + assignedWorkerId: null, + assignedWorkerName: "worker-a", + startedAt: "2026-04-26T12:00:01.000Z", + completedAt: "2026-04-26T12:00:05.000Z", + isLeaf: true, + level: 1, + }, + ], + [ + "33333333-3333-4333-8333-333333333333", + { + id: "33333333-3333-4333-8333-333333333333", + name: "dependent", + description: "dependent", + status: TaskStatus.RUNNING, + parentId: "11111111-1111-4111-8111-111111111111", + childIds: [], + dependsOnIds: ["22222222-2222-4222-8222-222222222222"], + assignedWorkerId: "future-agent-id", + assignedWorkerName: "worker-b", + startedAt: "2026-04-26T12:00:06.000Z", + completedAt: null, + isLeaf: true, + level: 1, + }, + ], + ]); + + const mutations: GraphMutationDto[] = [ + graphNodeAdded(0, "11111111-1111-4111-8111-111111111111", "root"), + graphNodeAdded(1, "22222222-2222-4222-8222-222222222222", "dependency"), + graphNodeAdded(2, "33333333-3333-4333-8333-333333333333", "dependent"), + { + id: "44444444-4444-4444-8444-444444444444", + run_id: "00000000-0000-0000-0000-000000000000", + sequence: 3, + mutation_type: "edge.added", + target_type: "edge", + target_id: "44444444-4444-4444-8444-444444444444", + actor: "manager", + old_value: null, + new_value: { + source_node_id: "22222222-2222-4222-8222-222222222222", + target_node_id: "33333333-3333-4333-8333-333333333333", + status: "pending", + }, + reason: "manager_decision", + created_at: "2026-04-26T12:00:03.000Z", + }, + ]; + + const base = createReplayInitialState(runState, mutations, 3); + const replayed = mutations.reduce( + (state, mutation) => + applyGraphMutation(state, { ...mutation, timestamp: mutation.created_at }), + base, + ); + + const dependent = replayed.tasks.get("33333333-3333-4333-8333-333333333333"); + 
assert.equal(dependent?.parentId, "11111111-1111-4111-8111-111111111111"); + assert.deepEqual(dependent?.dependsOnIds, ["22222222-2222-4222-8222-222222222222"]); + assert.equal(dependent?.level, 1); +}); + +test("replay base does not leak future dependency edges or node field changes", () => { + const runState = emptyState(); + runState.tasks = new Map([ + [ + "11111111-1111-4111-8111-111111111111", + { + id: "11111111-1111-4111-8111-111111111111", + name: "root", + description: "root", + status: TaskStatus.RUNNING, + parentId: null, + childIds: [ + "22222222-2222-4222-8222-222222222222", + "33333333-3333-4333-8333-333333333333", + ], + dependsOnIds: [], + assignedWorkerId: null, + assignedWorkerName: "parent", + startedAt: "2026-04-26T12:00:00.000Z", + completedAt: null, + isLeaf: false, + level: 0, + }, + ], + [ + "22222222-2222-4222-8222-222222222222", + { + id: "22222222-2222-4222-8222-222222222222", + name: "source", + description: "source updated", + status: TaskStatus.COMPLETED, + parentId: "11111111-1111-4111-8111-111111111111", + childIds: [], + dependsOnIds: [], + assignedWorkerId: "future-agent-id", + assignedWorkerName: "future-worker", + startedAt: null, + completedAt: null, + isLeaf: true, + level: 1, + }, + ], + [ + "33333333-3333-4333-8333-333333333333", + { + id: "33333333-3333-4333-8333-333333333333", + name: "target", + description: "target", + status: TaskStatus.PENDING, + parentId: "11111111-1111-4111-8111-111111111111", + childIds: [], + dependsOnIds: ["22222222-2222-4222-8222-222222222222"], + assignedWorkerId: null, + assignedWorkerName: "worker-b", + startedAt: null, + completedAt: null, + isLeaf: true, + level: 1, + }, + ], + ]); + + const mutations: GraphMutationDto[] = [ + graphNodeAdded(0, "11111111-1111-4111-8111-111111111111", "root"), + graphNodeAdded(1, "22222222-2222-4222-8222-222222222222", "source"), + graphNodeAdded(2, "33333333-3333-4333-8333-333333333333", "target"), + { + id: "66666666-6666-4666-8666-666666666666", + run_id: 
"00000000-0000-0000-0000-000000000000", + sequence: 3, + mutation_type: "node.field_changed", + target_type: "node", + target_id: "22222222-2222-4222-8222-222222222222", + actor: "manager", + old_value: { description: "source" }, + new_value: { field: "description", value: "source updated" }, + reason: "update later", + created_at: "2026-04-26T12:00:03.000Z", + }, + { + id: "77777777-7777-4777-8777-777777777777", + run_id: "00000000-0000-0000-0000-000000000000", + sequence: 4, + mutation_type: "edge.added", + target_type: "edge", + target_id: "77777777-7777-4777-8777-777777777777", + actor: "manager", + old_value: null, + new_value: { + source_node_id: "22222222-2222-4222-8222-222222222222", + target_node_id: "33333333-3333-4333-8333-333333333333", + status: "pending", + }, + reason: "manager_decision", + created_at: "2026-04-26T12:00:04.000Z", + }, + ]; + + const replayed = replayToSequence( + mutations, + 2, + createReplayInitialState(runState, mutations, 2), + ); + + const source = replayed.tasks.get("22222222-2222-4222-8222-222222222222"); + const target = replayed.tasks.get("33333333-3333-4333-8333-333333333333"); + assert.equal(source?.description, "source"); + assert.equal(source?.assignedWorkerId, null); + assert.equal(source?.assignedWorkerName, "worker"); + assert.deepEqual(target?.dependsOnIds, []); +}); + +test("dependency edges between root-level tasks do not become containment", () => { + const runState = emptyState(); + runState.tasks = new Map([ + [ + "22222222-2222-4222-8222-222222222222", + { + id: "22222222-2222-4222-8222-222222222222", + name: "source", + description: "source", + status: TaskStatus.COMPLETED, + parentId: null, + childIds: [], + dependsOnIds: [], + assignedWorkerId: null, + assignedWorkerName: "worker-a", + startedAt: null, + completedAt: null, + isLeaf: true, + level: 0, + }, + ], + [ + "33333333-3333-4333-8333-333333333333", + { + id: "33333333-3333-4333-8333-333333333333", + name: "target", + description: "target", + status: 
TaskStatus.PENDING, + parentId: null, + childIds: [], + dependsOnIds: ["22222222-2222-4222-8222-222222222222"], + assignedWorkerId: null, + assignedWorkerName: "worker-b", + startedAt: null, + completedAt: null, + isLeaf: true, + level: 0, + }, + ], + ]); + const mutations: GraphMutationDto[] = [ + graphNodeAdded(0, "22222222-2222-4222-8222-222222222222", "source"), + graphNodeAdded(1, "33333333-3333-4333-8333-333333333333", "target"), + { + id: "88888888-8888-4888-8888-888888888888", + run_id: "00000000-0000-0000-0000-000000000000", + sequence: 2, + mutation_type: "edge.added", + target_type: "edge", + target_id: "88888888-8888-4888-8888-888888888888", + actor: "manager", + old_value: null, + new_value: { + source_node_id: "22222222-2222-4222-8222-222222222222", + target_node_id: "33333333-3333-4333-8333-333333333333", + status: "pending", + }, + reason: "manager_decision", + created_at: "2026-04-26T12:00:02.000Z", + }, + ]; + + const replayed = replayToSequence( + mutations, + 2, + createReplayInitialState(runState, mutations, 2), + ); + const target = replayed.tasks.get("33333333-3333-4333-8333-333333333333"); + assert.equal(target?.parentId, null); + assert.equal(target?.level, 0); + assert.deepEqual(target?.dependsOnIds, ["22222222-2222-4222-8222-222222222222"]); +}); + +function graphNodeAdded( + sequence: number, + targetId: string, + slug: string, +): GraphMutationDto { + return { + id: `55555555-5555-4555-8555-55555555555${sequence}`, + run_id: "00000000-0000-0000-0000-000000000000", + sequence, + mutation_type: "node.added", + target_type: "node", + target_id: targetId, + actor: "manager", + old_value: null, + new_value: { + task_slug: slug, + instance_key: "default", + description: slug, + status: "pending", + assigned_worker_slug: "worker", + }, + reason: "manager_decision", + created_at: `2026-04-26T12:00:0${sequence}.000Z`, + }; +} diff --git a/ergon-dashboard/src/features/graph/layout/goldenLayout.test.ts 
b/ergon-dashboard/src/features/graph/layout/goldenLayout.test.ts new file mode 100644 index 00000000..ad3c1153 --- /dev/null +++ b/ergon-dashboard/src/features/graph/layout/goldenLayout.test.ts @@ -0,0 +1,85 @@ +import assert from "node:assert/strict"; +import test from "node:test"; +import type { Node } from "@xyflow/react"; + +import fixture from "../../../../tests/fixtures/mas-runs/concurrent-mas-run.json"; +import { parseGraphMutationDtoArray } from "@/features/graph/contracts/graphMutations"; +import { createReplayInitialState, replayToSequence } from "@/features/graph/state/graphMutationReducer"; +import { deserializeRunState } from "@/lib/runState"; +import { calculateExpandedContainers, computeHierarchicalLayout } from "./hierarchicalLayout"; +import { NODE_VARIANTS, getNodeVariant } from "./layoutTypes"; + +interface Rect { + id: string; + parentId: string | undefined; + x: number; + y: number; + width: number; + height: number; +} + +function rectFor(node: Node): Rect { + const task = (node.data as { task?: { level: number } }).task; + const variant = getNodeVariant(task?.level ?? 1); + const style = node.style as { width?: number; height?: number } | undefined; + return { + id: node.id, + parentId: node.parentId, + x: node.position.x, + y: node.position.y, + width: Number(style?.width ?? NODE_VARIANTS[variant].width), + height: Number(style?.height ?? 
NODE_VARIANTS[variant].height), + }; +} + +function overlaps(a: Rect, b: Rect): boolean { + return ( + a.x < b.x + b.width && + a.x + a.width > b.x && + a.y < b.y + b.height && + a.y + a.height > b.y + ); +} + +function overlappingSiblingPairs(nodes: Node[]): Array<[string, string]> { + const rects = nodes.map(rectFor); + const pairs: Array<[string, string]> = []; + for (let i = 0; i < rects.length; i++) { + for (let j = i + 1; j < rects.length; j++) { + if (rects[i].parentId !== rects[j].parentId) continue; + if (overlaps(rects[i], rects[j])) pairs.push([rects[i].id, rects[j].id]); + } + } + return pairs; +} + +test("golden layout renders the full recursive graph without overlapping sibling boxes", () => { + const runState = deserializeRunState(fixture.runState); + const mutations = parseGraphMutationDtoArray(fixture.mutations); + const checkpoint = fixture.checkpoints.find((entry) => entry.sequence === 14); + assert.ok(checkpoint); + const displayState = replayToSequence( + mutations, + checkpoint.sequence, + createReplayInitialState(runState, mutations, checkpoint.sequence), + new Map(), + ); + const result = computeHierarchicalLayout( + displayState.tasks, + calculateExpandedContainers(displayState.tasks, Infinity), + "", + undefined, + null, + "LR", + new Set(), + ); + + assert.deepEqual(new Set(result.nodes.map((node) => node.id)), new Set(checkpoint.expectedTaskIds)); + assert.deepEqual(overlappingSiblingPairs(result.nodes), []); + for (const taskId of checkpoint.expectedTaskIds) { + const expected = runState.tasks.get(taskId); + const actual = displayState.tasks.get(taskId); + assert.equal(actual?.parentId, expected?.parentId ?? 
null); + assert.equal(actual?.level, expected?.level); + } +}); diff --git a/ergon-dashboard/src/features/graph/layout/hierarchicalLayout.ts b/ergon-dashboard/src/features/graph/layout/hierarchicalLayout.ts index 631d8ba3..1fec3c23 100644 --- a/ergon-dashboard/src/features/graph/layout/hierarchicalLayout.ts +++ b/ergon-dashboard/src/features/graph/layout/hierarchicalLayout.ts @@ -239,6 +239,7 @@ export function computeHierarchicalLayout( selectedTaskId?: string | null, direction: "TB" | "LR" = "LR", newNodeIds: ReadonlySet = new Set(), + highlightedTaskIds: ReadonlySet = new Set(), ): LayoutedGraph { const containerDimensions = new Map(); const allNodes: TaskNodeType[] = []; @@ -340,6 +341,7 @@ export function computeHierarchicalLayout( const localPos = localPositions.get(cid) ?? { x: 0, y: 0 }; const isMatch = !searchLower || matchingNodeIds.has(cid); + const childContainerDimensions = containerDimensions.get(cid); allNodes.push({ id: cid, type: "taskNode", @@ -354,11 +356,19 @@ export function computeHierarchicalLayout( onClick: onTaskClick, selected: cid === selectedTaskId, dimmed: searchLower ? !isMatch : false, - highlighted: searchLower ? isMatch : false, + highlighted: (searchLower ? isMatch : false) || highlightedTaskIds.has(cid), isNew: newNodeIds.has(cid), maxGraphDepth, graphLayoutDirection: direction, }, + ...(expandedContainers.has(cid) && childContainerDimensions + ? { + style: { + width: childContainerDimensions.width, + height: childContainerDimensions.height, + }, + } + : {}), }); } @@ -461,7 +471,7 @@ export function computeHierarchicalLayout( onClick: onTaskClick, selected: taskId === selectedTaskId, dimmed: searchLower ? !isMatch : false, - highlighted: searchLower ? isMatch : false, + highlighted: (searchLower ? 
isMatch : false) || highlightedTaskIds.has(taskId), isNew: newNodeIds.has(taskId), maxGraphDepth, graphLayoutDirection: direction, diff --git a/ergon-dashboard/src/features/graph/layout/layoutTypes.ts b/ergon-dashboard/src/features/graph/layout/layoutTypes.ts index ffc145b1..1af3e508 100644 --- a/ergon-dashboard/src/features/graph/layout/layoutTypes.ts +++ b/ergon-dashboard/src/features/graph/layout/layoutTypes.ts @@ -17,15 +17,15 @@ export interface LayoutedGraph { } export const NODE_VARIANTS = { - full: { width: 220, height: 120 }, - standard: { width: 180, height: 90 }, - compact: { width: 140, height: 50 }, + full: { width: 190, height: 88 }, + standard: { width: 160, height: 64 }, + compact: { width: 122, height: 46 }, } as const; -export const MIN_CONTAINER_WIDTH = 260; -export const MIN_CONTAINER_HEIGHT = 100; -export const CONTAINER_HEADER_HEIGHT = 50; -export const CONTAINER_PADDING = 20; +export const MIN_CONTAINER_WIDTH = 240; +export const MIN_CONTAINER_HEIGHT = 92; +export const CONTAINER_HEADER_HEIGHT = 32; +export const CONTAINER_PADDING = 16; export const DEFAULT_EXPANDED_DEPTH = 2; export const MAX_VISIBLE_NODES = 150; diff --git a/ergon-dashboard/src/features/graph/state/graphMutationReducer.ts b/ergon-dashboard/src/features/graph/state/graphMutationReducer.ts index 1a4c1d1a..a705dcb2 100644 --- a/ergon-dashboard/src/features/graph/state/graphMutationReducer.ts +++ b/ergon-dashboard/src/features/graph/state/graphMutationReducer.ts @@ -308,19 +308,26 @@ function applyEdgeAdded( const updatedTarget = { ...target }; const updatedSource = { ...source }; - if (updatedTarget.parentId === null) { + const sourceAlreadyContainsTarget = updatedSource.childIds.includes(value.target_node_id); + const isContainmentEdge = + updatedTarget.parentId === value.source_node_id || + (updatedTarget.parentId === null && + (ctx.reason === "parent-child" || sourceAlreadyContainsTarget)); + + if (isContainmentEdge) { updatedTarget.parentId = value.source_node_id; 
updatedTarget.level = updatedSource.level + 1; - updatedSource.childIds = [...updatedSource.childIds, value.target_node_id]; - if (updatedSource.isLeaf) { + updatedSource.childIds = sourceAlreadyContainsTarget + ? updatedSource.childIds + : [...updatedSource.childIds, value.target_node_id]; + if (updatedSource.isLeaf && !sourceAlreadyContainsTarget) { updatedSource.isLeaf = false; state.totalLeafTasks -= 1; } } else if (updatedTarget.parentId !== value.source_node_id) { - updatedTarget.dependsOnIds = [ - ...updatedTarget.dependsOnIds, - value.source_node_id, - ]; + updatedTarget.dependsOnIds = updatedTarget.dependsOnIds.includes(value.source_node_id) + ? updatedTarget.dependsOnIds + : [...updatedTarget.dependsOnIds, value.source_node_id]; } state.tasks.set(value.source_node_id, updatedSource); @@ -444,6 +451,107 @@ function recalculateMetrics(state: WorkflowRunState): void { const SNAPSHOT_INTERVAL = 50; +function nodeIdsAddedAtOrBefore( + mutations: GraphMutationDto[], + upToSequence: number, +): Set { + const ids = new Set(); + for (const mutation of mutations) { + if (mutation.sequence > upToSequence) break; + if (mutation.mutation_type === "node.added") { + ids.add(mutation.target_id); + } + } + return ids; +} + +function initialNodeValueById( + mutations: GraphMutationDto[], + upToSequence: number, +): Map { + const values = new Map(); + for (const mutation of mutations) { + if (mutation.sequence > upToSequence) break; + if (mutation.mutation_type !== "node.added") continue; + const value = NodeAddedValueSchema.parse(mutation.new_value); + values.set(mutation.target_id, value); + } + return values; +} + +function countStatus( + tasks: Map, + status: TaskStatus, +): number { + let count = 0; + for (const task of tasks.values()) { + if (task.status === status) count += 1; + } + return count; +} + +/** + * Build the initial state used for timeline replay from the persisted REST + * snapshot's structural metadata. 
+ * + * The graph mutation WAL records dependency edges, but it does not encode the + * containment tree (`parentId`, `childIds`, `level`). Replaying from a blank + * task map would therefore mistake dependency edges for parent-child edges and + * produce a different layout from a refresh. This seeds only nodes that already + * existed at `upToSequence`, then lets status/annotation mutations replay on top. + */ +export function createReplayInitialState( + runState: WorkflowRunState, + mutations: GraphMutationDto[], + upToSequence: number, +): WorkflowRunState { + const includedNodeIds = nodeIdsAddedAtOrBefore(mutations, upToSequence); + const initialNodeValues = initialNodeValueById(mutations, upToSequence); + const tasks = new Map(); + + for (const nodeId of includedNodeIds) { + const task = runState.tasks.get(nodeId); + const initialValue = initialNodeValues.get(nodeId); + if (!task) continue; + + const parentId = + task.parentId && includedNodeIds.has(task.parentId) ? task.parentId : null; + const childIds = task.childIds.filter((childId) => includedNodeIds.has(childId)); + + tasks.set(nodeId, { + ...task, + name: initialValue?.task_slug ?? task.name, + description: initialValue?.description ?? task.description, + status: (initialValue?.status as TaskStatus | undefined) ?? task.status, + assignedWorkerId: null, + assignedWorkerName: initialValue?.assigned_worker_slug ?? null, + parentId, + childIds, + dependsOnIds: [], + startedAt: null, + completedAt: null, + isLeaf: childIds.length === 0, + level: parentId === null ? 
0 : task.level, + history: [], + lastTrigger: null, + }); + } + + return { + ...runState, + tasks, + totalTasks: tasks.size, + totalLeafTasks: Array.from(tasks.values()).filter((task) => task.isLeaf).length, + completedTasks: countStatus(tasks, TaskStatus.COMPLETED), + runningTasks: countStatus(tasks, TaskStatus.RUNNING), + failedTasks: countStatus(tasks, TaskStatus.FAILED), + cancelledTasks: countStatus(tasks, TaskStatus.CANCELLED), + edges: new Map(), + annotationsByTarget: new Map(), + unhandledMutations: [], + }; +} + /** * Replay mutations up to a given sequence number from an initial state. * Used by the timeline scrubber for WAL playback. diff --git a/ergon-dashboard/src/generated/events/schemas/DashboardThreadMessageCreatedEvent.schema.json b/ergon-dashboard/src/generated/events/schemas/DashboardThreadMessageCreatedEvent.schema.json index 24d9b177..0717b7c1 100644 --- a/ergon-dashboard/src/generated/events/schemas/DashboardThreadMessageCreatedEvent.schema.json +++ b/ergon-dashboard/src/generated/events/schemas/DashboardThreadMessageCreatedEvent.schema.json @@ -106,6 +106,18 @@ "title": "Topic", "type": "string" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "agentAId": { "title": "Agentaid", "type": "string" diff --git a/ergon-dashboard/src/generated/rest/contracts.ts b/ergon-dashboard/src/generated/rest/contracts.ts index fee60197..d013a213 100644 --- a/ergon-dashboard/src/generated/rest/contracts.ts +++ b/ergon-dashboard/src/generated/rest/contracts.ts @@ -1,5 +1,13 @@ +/* eslint-disable @typescript-eslint/no-empty-object-type */ import { z } from "zod"; +type JsonValue = + | (JsonScalar | Array | {}) + | Array | {}>; +type JsonScalar = + | (string | number | number | boolean | null) + | Array; + const RunTaskDto = z.object({ id: z.string(), name: z.string(), @@ -119,6 +127,7 @@ const RunCommunicationThreadDto = z.object({ runId: z.string(), taskId: z.union([z.string(), 
z.null()]).optional(), topic: z.string(), + summary: z.union([z.string(), z.null()]).optional(), agentAId: z.string(), agentBId: z.string(), createdAt: z.string().datetime({ offset: true }), @@ -146,6 +155,7 @@ const RunSnapshotDto = z.object({ completedTasks: z.number().int().optional().default(0), failedTasks: z.number().int().optional().default(0), runningTasks: z.number().int().optional().default(0), + cancelledTasks: z.number().int().optional().default(0), finalScore: z.union([z.number(), z.null()]).optional(), error: z.union([z.string(), z.null()]).optional(), }); @@ -162,6 +172,122 @@ const HTTPValidationError = z .object({ detail: z.array(ValidationError) }) .partial() .passthrough(); +const NodeAddedMutation = z + .object({ + mutation_type: z.string().optional().default("node.added"), + task_slug: z.string(), + instance_key: z.string(), + description: z.string(), + status: z.string(), + assigned_worker_slug: z.union([z.string(), z.null()]), + }) + .passthrough(); +const NodeRemovedMutation = z + .object({ + mutation_type: z.string().optional().default("node.removed"), + task_slug: z.string(), + instance_key: z.string(), + description: z.string(), + status: z.string(), + assigned_worker_slug: z.union([z.string(), z.null()]), + }) + .passthrough(); +const NodeStatusChangedMutation = z + .object({ + mutation_type: z.string().optional().default("node.status_changed"), + status: z.string(), + }) + .passthrough(); +const NodeFieldChangedMutation = z + .object({ + mutation_type: z.string().optional().default("node.field_changed"), + field: z.enum(["description", "assigned_worker_slug"]), + value: z.union([z.string(), z.null()]), + }) + .passthrough(); +const EdgeAddedMutation = z + .object({ + mutation_type: z.string().optional().default("edge.added"), + source_node_id: z.string(), + target_node_id: z.string(), + status: z.string(), + }) + .passthrough(); +const EdgeRemovedMutation = z + .object({ + mutation_type: z.string().optional().default("edge.removed"), + 
source_node_id: z.string(), + target_node_id: z.string(), + status: z.string(), + }) + .passthrough(); +const EdgeStatusChangedMutation = z + .object({ + mutation_type: z.string().optional().default("edge.status_changed"), + status: z.string(), + }) + .passthrough(); +const JsonScalar = z.union([ + z.string(), + z.number(), + z.number(), + z.boolean(), + z.null(), +]); +const JsonValue: z.ZodType = z.lazy(() => + z.union([JsonScalar, z.array(JsonValue), z.record(z.string(), JsonValue)]) +); +const JsonObject = z.record(z.string(), JsonValue); +const AnnotationSetMutation = z + .object({ + mutation_type: z.string().optional().default("annotation.set"), + namespace: z.string(), + payload: JsonObject, + }) + .passthrough(); +const AnnotationDeletedMutation = z + .object({ + mutation_type: z.string().optional().default("annotation.deleted"), + namespace: z.string(), + payload: JsonObject, + }) + .passthrough(); +const RunGraphMutationDto = z.object({ + id: z.string(), + run_id: z.string(), + sequence: z.number().int(), + mutation_type: z.string(), + target_type: z.string(), + target_id: z.string(), + actor: z.string(), + old_value: z.union([ + z.discriminatedUnion("mutation_type", [ + NodeAddedMutation, + NodeRemovedMutation, + NodeStatusChangedMutation, + NodeFieldChangedMutation, + EdgeAddedMutation, + EdgeRemovedMutation, + EdgeStatusChangedMutation, + AnnotationSetMutation, + AnnotationDeletedMutation, + ]), + z.null(), + ]), + new_value: z.discriminatedUnion("mutation_type", [ + NodeAddedMutation, + NodeRemovedMutation, + NodeStatusChangedMutation, + NodeFieldChangedMutation, + EdgeAddedMutation, + EdgeRemovedMutation, + EdgeStatusChangedMutation, + AnnotationSetMutation, + AnnotationDeletedMutation, + ]), + reason: z.union([z.string(), z.null()]), + created_at: z.string(), +}); const definition_id = z.union([z.string(), z.null()]).optional(); const TrainingCurvePointDto = z.object({ runId: z.string(), @@ -233,6 +359,8 @@ const CohortRunRowDto = z completed_at: 
z.union([z.string(), z.null()]).optional(), running_time_ms: z.union([z.number(), z.null()]).optional(), final_score: z.union([z.number(), z.null()]).optional(), + total_tasks: z.union([z.number(), z.null()]).optional(), + total_cost_usd: z.union([z.number(), z.null()]).optional(), error_message: z.union([z.string(), z.null()]).optional(), }) .passthrough(); @@ -314,6 +442,19 @@ export const schemas = { RunSnapshotDto, ValidationError, HTTPValidationError, + NodeAddedMutation, + NodeRemovedMutation, + NodeStatusChangedMutation, + NodeFieldChangedMutation, + EdgeAddedMutation, + EdgeRemovedMutation, + EdgeStatusChangedMutation, + JsonScalar, + JsonValue, + JsonObject, + AnnotationSetMutation, + AnnotationDeletedMutation, + RunGraphMutationDto, definition_id, TrainingCurvePointDto, TrainingSessionDto, diff --git a/ergon-dashboard/src/generated/rest/openapi.json b/ergon-dashboard/src/generated/rest/openapi.json index d04cfac1..906046ed 100644 --- a/ergon-dashboard/src/generated/rest/openapi.json +++ b/ergon-dashboard/src/generated/rest/openapi.json @@ -50,6 +50,106 @@ } } }, + "/runs/{run_id}/mutations": { + "get": { + "tags": [ + "runs" + ], + "summary": "Get Mutations", + "description": "Return the append-only mutation log for a run, ordered by sequence.\n\nUsed by the Timeline scrubber to replay DAG state at any point in time.", + "operationId": "get_mutations_runs__run_id__mutations_get", + "parameters": [ + { + "name": "run_id", + "in": "path", + "required": true, + "schema": { + "type": "string", + "format": "uuid", + "title": "Run Id" + } + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/RunGraphMutationDto" + }, + "title": "Response Get Mutations Runs Run Id Mutations Get" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": 
"#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/runs/{run_id}/resources/{resource_id}/content": { + "get": { + "tags": [ + "runs" + ], + "summary": "Get Resource Content", + "description": "Stream the blob bytes for a RunResource.\n\nUsed by the dashboard's file-viewer modal. Enforces:\n- resource must belong to the named run (no cross-run leaks);\n- resolved path must sit under ``ERGON_BLOB_ROOT`` (traversal guard);\n- size <= ``_RESOURCE_CONTENT_MAX_BYTES`` (413 otherwise).", + "operationId": "get_resource_content_runs__run_id__resources__resource_id__content_get", + "parameters": [ + { + "name": "run_id", + "in": "path", + "required": true, + "schema": { + "type": "string", + "format": "uuid", + "title": "Run Id" + } + }, + { + "name": "resource_id", + "in": "path", + "required": true, + "schema": { + "type": "string", + "format": "uuid", + "title": "Resource Id" + } + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, "/runs/training/curves": { "get": { "tags": [ @@ -579,6 +679,54 @@ }, "components": { "schemas": { + "AnnotationDeletedMutation": { + "properties": { + "mutation_type": { + "type": "string", + "const": "annotation.deleted", + "title": "Mutation Type", + "default": "annotation.deleted" + }, + "namespace": { + "type": "string", + "title": "Namespace" + }, + "payload": { + "$ref": "#/components/schemas/JsonObject" + } + }, + "type": "object", + "required": [ + "namespace", + "payload" + ], + "title": "AnnotationDeletedMutation", + "description": "annotation.deleted \u2014 tombstone." 
+ }, + "AnnotationSetMutation": { + "properties": { + "mutation_type": { + "type": "string", + "const": "annotation.set", + "title": "Mutation Type", + "default": "annotation.set" + }, + "namespace": { + "type": "string", + "title": "Namespace" + }, + "payload": { + "$ref": "#/components/schemas/JsonObject" + } + }, + "type": "object", + "required": [ + "namespace", + "payload" + ], + "title": "AnnotationSetMutation", + "description": "annotation.set." + }, "BatchStatus": { "type": "string", "enum": [ @@ -686,6 +834,28 @@ ], "title": "Final Score" }, + "total_tasks": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Total Tasks" + }, + "total_cost_usd": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Total Cost Usd" + }, "error_message": { "anyOf": [ { @@ -864,6 +1034,86 @@ "title": "CohortSummaryDto", "description": "Summary row for cohort list and live updates." }, + "EdgeAddedMutation": { + "properties": { + "mutation_type": { + "type": "string", + "const": "edge.added", + "title": "Mutation Type", + "default": "edge.added" + }, + "source_node_id": { + "type": "string", + "title": "Source Node Id" + }, + "target_node_id": { + "type": "string", + "title": "Target Node Id" + }, + "status": { + "type": "string", + "title": "Status" + } + }, + "type": "object", + "required": [ + "source_node_id", + "target_node_id", + "status" + ], + "title": "EdgeAddedMutation", + "description": "edge.added \u2014 full edge snapshot." 
+ }, + "EdgeRemovedMutation": { + "properties": { + "mutation_type": { + "type": "string", + "const": "edge.removed", + "title": "Mutation Type", + "default": "edge.removed" + }, + "source_node_id": { + "type": "string", + "title": "Source Node Id" + }, + "target_node_id": { + "type": "string", + "title": "Target Node Id" + }, + "status": { + "type": "string", + "title": "Status" + } + }, + "type": "object", + "required": [ + "source_node_id", + "target_node_id", + "status" + ], + "title": "EdgeRemovedMutation", + "description": "edge.removed." + }, + "EdgeStatusChangedMutation": { + "properties": { + "mutation_type": { + "type": "string", + "const": "edge.status_changed", + "title": "Mutation Type", + "default": "edge.status_changed" + }, + "status": { + "type": "string", + "title": "Status" + } + }, + "type": "object", + "required": [ + "status" + ], + "title": "EdgeStatusChangedMutation", + "description": "edge.status_changed." + }, "EpisodeFailure": { "properties": { "run_id": { @@ -905,6 +1155,200 @@ "type": "object", "title": "HTTPValidationError" }, + "JsonObject": { + "additionalProperties": { + "$ref": "#/components/schemas/JsonValue" + }, + "type": "object" + }, + "JsonScalar": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "JsonValue": { + "anyOf": [ + { + "$ref": "#/components/schemas/JsonScalar" + }, + { + "items": { + "$ref": "#/components/schemas/JsonValue" + }, + "type": "array" + }, + { + "additionalProperties": { + "$ref": "#/components/schemas/JsonValue" + }, + "type": "object" + } + ] + }, + "NodeAddedMutation": { + "properties": { + "mutation_type": { + "type": "string", + "const": "node.added", + "title": "Mutation Type", + "default": "node.added" + }, + "task_slug": { + "type": "string", + "title": "Task Slug" + }, + "instance_key": { + "type": "string", + "title": "Instance Key" + }, + "description": { + "type": "string", + 
"title": "Description" + }, + "status": { + "type": "string", + "title": "Status" + }, + "assigned_worker_slug": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Assigned Worker Slug" + } + }, + "type": "object", + "required": [ + "task_slug", + "instance_key", + "description", + "status", + "assigned_worker_slug" + ], + "title": "NodeAddedMutation", + "description": "node.added \u2014 full node snapshot." + }, + "NodeFieldChangedMutation": { + "properties": { + "mutation_type": { + "type": "string", + "const": "node.field_changed", + "title": "Mutation Type", + "default": "node.field_changed" + }, + "field": { + "type": "string", + "enum": [ + "description", + "assigned_worker_slug" + ], + "title": "Field" + }, + "value": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Value" + } + }, + "type": "object", + "required": [ + "field", + "value" + ], + "title": "NodeFieldChangedMutation", + "description": "node.field_changed." + }, + "NodeRemovedMutation": { + "properties": { + "mutation_type": { + "type": "string", + "const": "node.removed", + "title": "Mutation Type", + "default": "node.removed" + }, + "task_slug": { + "type": "string", + "title": "Task Slug" + }, + "instance_key": { + "type": "string", + "title": "Instance Key" + }, + "description": { + "type": "string", + "title": "Description" + }, + "status": { + "type": "string", + "title": "Status" + }, + "assigned_worker_slug": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Assigned Worker Slug" + } + }, + "type": "object", + "required": [ + "task_slug", + "instance_key", + "description", + "status", + "assigned_worker_slug" + ], + "title": "NodeRemovedMutation", + "description": "node.removed \u2014 node snapshot at removal time." 
+ }, + "NodeStatusChangedMutation": { + "properties": { + "mutation_type": { + "type": "string", + "const": "node.status_changed", + "title": "Mutation Type", + "default": "node.status_changed" + }, + "status": { + "type": "string", + "title": "Status" + } + }, + "type": "object", + "required": [ + "status" + ], + "title": "NodeStatusChangedMutation", + "description": "node.status_changed." + }, "PollResponse": { "properties": { "batch_id": { @@ -1050,6 +1494,17 @@ "type": "string", "title": "Topic" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Summary" + }, "agentAId": { "type": "string", "title": "Agentaid" @@ -1385,6 +1840,169 @@ ], "title": "RunExecutionAttemptDto" }, + "RunGraphMutationDto": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "run_id": { + "type": "string", + "title": "Run Id" + }, + "sequence": { + "type": "integer", + "title": "Sequence" + }, + "mutation_type": { + "type": "string", + "title": "Mutation Type" + }, + "target_type": { + "type": "string", + "title": "Target Type" + }, + "target_id": { + "type": "string", + "title": "Target Id" + }, + "actor": { + "type": "string", + "title": "Actor" + }, + "old_value": { + "anyOf": [ + { + "oneOf": [ + { + "$ref": "#/components/schemas/NodeAddedMutation" + }, + { + "$ref": "#/components/schemas/NodeRemovedMutation" + }, + { + "$ref": "#/components/schemas/NodeStatusChangedMutation" + }, + { + "$ref": "#/components/schemas/NodeFieldChangedMutation" + }, + { + "$ref": "#/components/schemas/EdgeAddedMutation" + }, + { + "$ref": "#/components/schemas/EdgeRemovedMutation" + }, + { + "$ref": "#/components/schemas/EdgeStatusChangedMutation" + }, + { + "$ref": "#/components/schemas/AnnotationSetMutation" + }, + { + "$ref": "#/components/schemas/AnnotationDeletedMutation" + } + ], + "discriminator": { + "propertyName": "mutation_type", + "mapping": { + "annotation.deleted": "#/components/schemas/AnnotationDeletedMutation", + 
"annotation.set": "#/components/schemas/AnnotationSetMutation", + "edge.added": "#/components/schemas/EdgeAddedMutation", + "edge.removed": "#/components/schemas/EdgeRemovedMutation", + "edge.status_changed": "#/components/schemas/EdgeStatusChangedMutation", + "node.added": "#/components/schemas/NodeAddedMutation", + "node.field_changed": "#/components/schemas/NodeFieldChangedMutation", + "node.removed": "#/components/schemas/NodeRemovedMutation", + "node.status_changed": "#/components/schemas/NodeStatusChangedMutation" + } + } + }, + { + "type": "null" + } + ], + "title": "Old Value" + }, + "new_value": { + "oneOf": [ + { + "$ref": "#/components/schemas/NodeAddedMutation" + }, + { + "$ref": "#/components/schemas/NodeRemovedMutation" + }, + { + "$ref": "#/components/schemas/NodeStatusChangedMutation" + }, + { + "$ref": "#/components/schemas/NodeFieldChangedMutation" + }, + { + "$ref": "#/components/schemas/EdgeAddedMutation" + }, + { + "$ref": "#/components/schemas/EdgeRemovedMutation" + }, + { + "$ref": "#/components/schemas/EdgeStatusChangedMutation" + }, + { + "$ref": "#/components/schemas/AnnotationSetMutation" + }, + { + "$ref": "#/components/schemas/AnnotationDeletedMutation" + } + ], + "title": "New Value", + "discriminator": { + "propertyName": "mutation_type", + "mapping": { + "annotation.deleted": "#/components/schemas/AnnotationDeletedMutation", + "annotation.set": "#/components/schemas/AnnotationSetMutation", + "edge.added": "#/components/schemas/EdgeAddedMutation", + "edge.removed": "#/components/schemas/EdgeRemovedMutation", + "edge.status_changed": "#/components/schemas/EdgeStatusChangedMutation", + "node.added": "#/components/schemas/NodeAddedMutation", + "node.field_changed": "#/components/schemas/NodeFieldChangedMutation", + "node.removed": "#/components/schemas/NodeRemovedMutation", + "node.status_changed": "#/components/schemas/NodeStatusChangedMutation" + } + } + }, + "reason": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } 
+ ], + "title": "Reason" + }, + "created_at": { + "type": "string", + "title": "Created At" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "id", + "run_id", + "sequence", + "mutation_type", + "target_type", + "target_id", + "actor", + "old_value", + "new_value", + "reason", + "created_at" + ], + "title": "RunGraphMutationDto", + "description": "One entry in the append-only mutation log for a run.\n\nField names are snake_case to match the frontend GraphMutationDtoSchema.\nCamelModel is intentionally not used here \u2014 the frontend contract uses snake_case." + }, "RunResourceDto": { "properties": { "id": { @@ -1716,6 +2334,11 @@ "title": "Runningtasks", "default": 0 }, + "cancelledTasks": { + "type": "integer", + "title": "Cancelledtasks", + "default": 0 + }, "finalScore": { "anyOf": [ { @@ -2407,4 +3030,4 @@ } } } -} \ No newline at end of file +} diff --git a/ergon-dashboard/src/hooks/useBuildHealth.ts b/ergon-dashboard/src/hooks/useBuildHealth.ts new file mode 100644 index 00000000..20c6787a --- /dev/null +++ b/ergon-dashboard/src/hooks/useBuildHealth.ts @@ -0,0 +1,74 @@ +"use client"; + +import { useState, useEffect, useCallback, useRef } from "react"; + +export type BuildHealthStatus = "unknown" | "healthy" | "degraded"; + +interface HealthResponse { + status: "healthy" | "degraded"; + checks: Record; + errors?: string[]; + build: { nodeEnv: string; timestamp: string | null; pid: number }; +} + +interface BuildHealth { + status: BuildHealthStatus; + errors: string[]; + lastChecked: number | null; + check: () => Promise; +} + +const POLL_INTERVAL_MS = 60_000; +const DEGRADED_RETRY_MS = 10_000; + +export function useBuildHealth(): BuildHealth { + const [status, setStatus] = useState("unknown"); + const [errors, setErrors] = useState([]); + const [lastChecked, setLastChecked] = useState(null); + const timerRef = useRef | null>(null); + + const check = useCallback(async () => { + try { + const res = await fetch("/api/health", { + 
cache: "no-store", + signal: AbortSignal.timeout(5000), + }); + + if (!res.ok) { + const body = await res.json().catch(() => ({})) as Partial; + setStatus("degraded"); + setErrors(body.errors ?? [`Health check returned ${res.status}`]); + } else { + const body = (await res.json()) as HealthResponse; + setStatus(body.status === "healthy" ? "healthy" : "degraded"); + setErrors(body.errors ?? []); + } + } catch (e) { + setStatus("degraded"); + setErrors([ + `Health check failed: ${e instanceof Error ? e.message : "network error"}. ` + + "The dev server may need a restart (docker compose restart dashboard).", + ]); + } + setLastChecked(Date.now()); + }, []); + + useEffect(() => { + check(); + + const schedule = () => { + const interval = status === "degraded" ? DEGRADED_RETRY_MS : POLL_INTERVAL_MS; + timerRef.current = setTimeout(async () => { + await check(); + schedule(); + }, interval); + }; + + schedule(); + return () => { + if (timerRef.current) clearTimeout(timerRef.current); + }; + }, [check, status]); + + return { status, errors, lastChecked, check }; +} diff --git a/ergon-dashboard/src/hooks/useRunState.socketHydration.test.ts b/ergon-dashboard/src/hooks/useRunState.socketHydration.test.ts new file mode 100644 index 00000000..d5643eef --- /dev/null +++ b/ergon-dashboard/src/hooks/useRunState.socketHydration.test.ts @@ -0,0 +1,12 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import { shouldRequestSocketSnapshot } from "./useRunState"; + +test("does not request socket full-state snapshot when REST or SSR state is already hydrated", () => { + assert.equal(shouldRequestSocketSnapshot(true), false); +}); + +test("requests socket full-state snapshot when no REST or SSR state is available yet", () => { + assert.equal(shouldRequestSocketSnapshot(false), true); +}); diff --git a/ergon-dashboard/src/hooks/useRunState.ts b/ergon-dashboard/src/hooks/useRunState.ts index f51fcd26..0c217ecd 100644 --- 
a/ergon-dashboard/src/hooks/useRunState.ts +++ b/ergon-dashboard/src/hooks/useRunState.ts @@ -103,6 +103,10 @@ function normalizeSandboxCommandState(command: RunSandboxCommand): SandboxComman }; } +export function shouldRequestSocketSnapshot(hasHydratedRunState: boolean): boolean { + return !hasHydratedRunState; +} + export function useRunState( runId: string, initialRunState: SerializedWorkflowRunState | null = null, @@ -501,17 +505,21 @@ export function useRunState( setIsSubscribed(true); setIsLoading((prev) => (hasRunStateRef.current ? false : prev)); - // Request full run state from server - console.log("[useRunState] Requesting full state for run", runId, "socket.connected:", socket.connected); - socket.emit("request:run", runId); - - // Set up a retry in case the first request is lost - retryTimeout = setTimeout(() => { - if (socket.connected) { - console.log("[useRunState] Retrying request:run for", runId); - socket.emit("request:run", runId); - } - }, 1000); + if (shouldRequestSocketSnapshot(hasRunStateRef.current)) { + // Request full run state only when REST/SSR did not hydrate us. 
+ console.log("[useRunState] Requesting full state for run", runId, "socket.connected:", socket.connected); + socket.emit("request:run", runId); + + // Set up a retry in case the first request is lost + retryTimeout = setTimeout(() => { + if (socket.connected && shouldRequestSocketSnapshot(hasRunStateRef.current)) { + console.log("[useRunState] Retrying request:run for", runId); + socket.emit("request:run", runId); + } + }, 1000); + } else { + console.log("[useRunState] Skipping full socket state request; REST/SSR snapshot is already loaded", runId); + } } // Set up event listeners diff --git a/ergon-dashboard/src/lib/contracts/rest.ts b/ergon-dashboard/src/lib/contracts/rest.ts index 4b43f43a..cc596656 100644 --- a/ergon-dashboard/src/lib/contracts/rest.ts +++ b/ergon-dashboard/src/lib/contracts/rest.ts @@ -105,12 +105,23 @@ export interface CohortSummary } export interface CohortRunRow - extends Omit { + extends Omit< + RawCohortRunRow, + | "completed_at" + | "error_message" + | "final_score" + | "running_time_ms" + | "started_at" + | "total_cost_usd" + | "total_tasks" + > { completed_at: string | null; error_message: string | null; final_score: number | null; running_time_ms: number | null; started_at: string | null; + total_cost_usd: number | null; + total_tasks: number | null; } export interface CohortDetail { @@ -323,6 +334,7 @@ function normalizeRunCommunicationThread(thread: RawRunCommunicationThread): Run ...thread, messages: (thread.messages ?? []).map(normalizeRunCommunicationMessage), taskId: thread.taskId ?? null, + summary: thread.summary ?? null, }; } @@ -355,6 +367,8 @@ export function parseCohortDetail(input: unknown): CohortDetail { final_score: run.final_score ?? null, running_time_ms: run.running_time_ms ?? null, started_at: run.started_at ?? null, + total_cost_usd: run.total_cost_usd ?? null, + total_tasks: run.total_tasks ?? 
null, })), }; } diff --git a/ergon-dashboard/src/lib/runState.ts b/ergon-dashboard/src/lib/runState.ts index 8e6b269b..04c53a8f 100644 --- a/ergon-dashboard/src/lib/runState.ts +++ b/ergon-dashboard/src/lib/runState.ts @@ -151,6 +151,7 @@ export function deserializeRunState(input: unknown): WorkflowRunState { completedTasks: data.completedTasks, runningTasks: data.runningTasks, failedTasks: data.failedTasks, + cancelledTasks: data.cancelledTasks, finalScore: data.finalScore ?? null, error: data.error ?? null, edges: new Map(), diff --git a/ergon-dashboard/src/lib/state/store.ts b/ergon-dashboard/src/lib/state/store.ts index ac11fcd1..2ac40ad7 100644 --- a/ergon-dashboard/src/lib/state/store.ts +++ b/ergon-dashboard/src/lib/state/store.ts @@ -132,6 +132,7 @@ class DashboardStore { completedTasks: 0, runningTasks: 0, failedTasks: 0, + cancelledTasks: 0, finalScore: null, error: null, edges: new Map(), diff --git a/ergon-dashboard/src/lib/testing/dashboardHarness.ts b/ergon-dashboard/src/lib/testing/dashboardHarness.ts index 663ce60b..16955fcb 100644 --- a/ergon-dashboard/src/lib/testing/dashboardHarness.ts +++ b/ergon-dashboard/src/lib/testing/dashboardHarness.ts @@ -26,6 +26,7 @@ declare global { | { cohorts: CohortSummary[]; cohortDetails: Record; + mutationsByRun: Record; } | undefined; } @@ -34,6 +35,7 @@ export interface DashboardHarnessSeedPayload { cohorts?: CohortSummary[]; cohortDetails?: Record; runs?: SerializedWorkflowRunState[]; + mutations?: Record; } function getHarnessState() { @@ -41,6 +43,7 @@ function getHarnessState() { global.__dashboardHarness = { cohorts: [], cohortDetails: {}, + mutationsByRun: {}, }; } return global.__dashboardHarness; @@ -58,6 +61,7 @@ export function resetDashboardHarness(): void { const harness = getHarnessState(); harness.cohorts = []; harness.cohortDetails = {}; + harness.mutationsByRun = {}; } export function seedDashboardHarness(payload: DashboardHarnessSeedPayload): void { @@ -67,6 +71,7 @@ export function 
seedDashboardHarness(payload: DashboardHarnessSeedPayload): void const harness = getHarnessState(); harness.cohorts = payload.cohorts ?? []; harness.cohortDetails = payload.cohortDetails ?? {}; + harness.mutationsByRun = payload.mutations ?? {}; for (const run of payload.runs ?? []) { store.seedRun(deserializeRunState(run)); @@ -120,6 +125,11 @@ export function getHarnessRun(runId: string): SerializedWorkflowRunState | null return run ? serializeRunState(run) : null; } +export function getHarnessRunMutations(runId: string): unknown[] | null { + requireHarnessEnabled(); + return getHarnessState().mutationsByRun[runId] ?? null; +} + export function emitHarnessRunCompleted(data: { runId: string; status: "completed" | "failed"; diff --git a/ergon-dashboard/src/lib/timeFormat.test.ts b/ergon-dashboard/src/lib/timeFormat.test.ts new file mode 100644 index 00000000..47f2534a --- /dev/null +++ b/ergon-dashboard/src/lib/timeFormat.test.ts @@ -0,0 +1,21 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import { formatClockTime, formatClockTimeMs, formatClockTimeSeconds } from "./timeFormat"; + +test("formatClockTime is stable for UTC timestamps regardless of runtime local timezone", () => { + assert.equal(formatClockTime("2026-04-26T10:24:15.000Z"), "10:24"); +}); + +test("formatClockTime returns dash for invalid timestamps", () => { + assert.equal(formatClockTime("not-a-date"), "—"); + assert.equal(formatClockTime(Number.NaN), "—"); +}); + +test("formatClockTimeMs includes seconds and milliseconds with stable timezone", () => { + assert.equal(formatClockTimeMs("2026-04-26T10:24:15.123Z"), "10:24:15.123"); +}); + +test("formatClockTimeSeconds includes seconds with stable timezone", () => { + assert.equal(formatClockTimeSeconds("2026-04-26T10:24:15.123Z"), "10:24:15"); +}); diff --git a/ergon-dashboard/src/lib/timeFormat.ts b/ergon-dashboard/src/lib/timeFormat.ts new file mode 100644 index 00000000..f314cf2f --- /dev/null +++ 
b/ergon-dashboard/src/lib/timeFormat.ts @@ -0,0 +1,42 @@ +export function formatClockTime(value: string | number | Date): string { + return formatDateTime(value, { + hour: "2-digit", + minute: "2-digit", + }); +} + +export function formatClockTimeMs(value: string | number | Date): string { + return formatDateTime(value, { + hour: "2-digit", + minute: "2-digit", + second: "2-digit", + fractionalSecondDigits: 3, + }); +} + +export function formatClockTimeSeconds(value: string | number | Date): string { + return formatDateTime(value, { + hour: "2-digit", + minute: "2-digit", + second: "2-digit", + }); +} + +export function formatDate(value: string | number | Date): string { + return formatDateTime(value, { + year: "numeric", + month: "2-digit", + day: "2-digit", + }); +} + +function formatDateTime(value: string | number | Date, options: Intl.DateTimeFormatOptions): string { + const date = value instanceof Date ? value : new Date(value); + if (!Number.isFinite(date.getTime())) return "—"; + + return new Intl.DateTimeFormat("en-GB", { + ...options, + hour12: false, + timeZone: "UTC", + }).format(date); +} diff --git a/ergon-dashboard/src/lib/types.ts b/ergon-dashboard/src/lib/types.ts index 59507d92..8e7e9e02 100644 --- a/ergon-dashboard/src/lib/types.ts +++ b/ergon-dashboard/src/lib/types.ts @@ -340,6 +340,7 @@ export interface WorkflowRunState { completedTasks: number; runningTasks: number; failedTasks: number; + cancelledTasks: number; // Result finalScore: number | null; diff --git a/ergon-dashboard/tailwind.config.ts b/ergon-dashboard/tailwind.config.ts index 987c895c..3dc93ca3 100644 --- a/ergon-dashboard/tailwind.config.ts +++ b/ergon-dashboard/tailwind.config.ts @@ -5,6 +5,7 @@ const config: Config = { content: [ "./src/pages/**/*.{js,ts,jsx,tsx,mdx}", "./src/components/**/*.{js,ts,jsx,tsx,mdx}", + "./src/features/**/*.{js,ts,jsx,tsx,mdx}", "./src/app/**/*.{js,ts,jsx,tsx,mdx}", ], theme: { @@ -12,6 +13,32 @@ const config: Config = { colors: { background: 
"var(--background)", foreground: "var(--foreground)", + paper: "var(--paper)", + "paper-2": "var(--paper-2)", + "paper-3": "var(--paper-3)", + card: "var(--card)", + ink: "var(--ink)", + "ink-2": "var(--ink-2)", + muted: "var(--muted)", + faint: "var(--faint)", + line: "var(--line)", + "line-strong": "var(--line-strong)", + accent: "var(--accent)", + "accent-soft": "var(--accent-soft)", + "accent-ink": "var(--accent-ink)", + }, + borderRadius: { + card: "var(--radius)", + "card-sm": "var(--radius-sm)", + }, + boxShadow: { + card: "var(--shadow-sm)", + "card-md": "var(--shadow)", + pop: "var(--shadow-pop)", + }, + fontFamily: { + sans: ["var(--font)"], + mono: ["var(--mono)"], }, }, }, diff --git a/ergon-dashboard/tests/contracts/contracts.test.ts b/ergon-dashboard/tests/contracts/contracts.test.ts index 4bb0f770..cfd735da 100644 --- a/ergon-dashboard/tests/contracts/contracts.test.ts +++ b/ergon-dashboard/tests/contracts/contracts.test.ts @@ -100,7 +100,9 @@ test("cohort detail parser accepts harness payload", () => { const parsed = parseCohortDetail(cohortDetail); assert.equal(parsed.summary.cohort_id, FIXTURE_IDS.cohortId); - assert.equal((parsed.runs ?? []).length, 1); + assert.equal((parsed.runs ?? 
[]).length, 3); + assert.equal(parsed.runs[0]?.total_tasks, 10); + assert.equal(parsed.runs[0]?.total_cost_usd, 0.12); }); test("workflow started event parser validates recursive task trees", () => { @@ -173,6 +175,7 @@ test("dashboard nested DTO event parser accepts backend snake-case payloads", () run_id: thread.runId, task_id: thread.taskId, topic: thread.topic, + summary: "Leaf workers report completion artifacts and probe exit status.", agent_a_id: thread.agentAId, agent_b_id: thread.agentBId, created_at: thread.createdAt, @@ -193,6 +196,11 @@ test("dashboard nested DTO event parser accepts backend snake-case payloads", () }, }); + assert.equal( + parsedThread.thread.summary, + "Leaf workers report completion artifacts and probe exit status.", + ); + const parsedEvaluation = parseDashboardTaskEvaluationUpdatedData({ run_id: FIXTURE_IDS.runId, task_id: FIXTURE_IDS.solveTaskId, diff --git a/ergon-dashboard/tests/e2e/_shared/smoke.ts b/ergon-dashboard/tests/e2e/_shared/smoke.ts index a0c8e337..7f98c562 100644 --- a/ergon-dashboard/tests/e2e/_shared/smoke.ts +++ b/ergon-dashboard/tests/e2e/_shared/smoke.ts @@ -54,6 +54,11 @@ async function screenshot(target: Page, out: string): Promise { await target.screenshot({ path: out, fullPage: true }); } +async function locatorScreenshot(target: Locator, out: string): Promise { + await fs.mkdir(path.dirname(out), { recursive: true }); + await target.screenshot({ path: out }); +} + function graphElementForTask(page: Page, taskId: string): Locator { return page .locator( @@ -106,15 +111,22 @@ async function openWorkspaceForGraphTask(page: Page, taskId: string): Promise { + await expect(page.getByTestId("activity-play-toggle")).toHaveCount(0); + await expect(page.getByTestId("activity-speed-control")).toHaveCount(0); + await expect(page.getByTestId("activity-step-back")).toHaveCount(0); + await expect(page.getByTestId("activity-step-forward")).toHaveCount(0); +} + async function assertRunWorkspace( page: Page, state: 
BackendRunState, runId: string, ): Promise { await expect(page.getByTestId("run-header")).toBeVisible(); - await expect(page.getByTestId("run-status-bar")).toBeVisible(); - await expect(page.getByTestId("run-status-count-completed")).toBeVisible(); await expect(page.getByTestId("graph-canvas")).toBeVisible(); + await expect(page.getByTestId("activity-stack-region")).toBeVisible(); + await expect(page.locator('[data-testid^="activity-bar-"]').first()).toBeVisible(); const evaluatedTaskIds = new Set(state.evaluations.map((evaluation) => evaluation.task_id)); const selected = await selectRenderedGraphTask(page, state, runId, evaluatedTaskIds); @@ -122,13 +134,28 @@ async function assertRunWorkspace( await openWorkspaceForGraphTask(page, selected.id); await expect(page.getByTestId("workspace-region")).toBeVisible(); await expect(page.getByTestId("workspace-header")).toContainText(selected.task_slug); + await expect(page.getByTestId("workspace-tab-overview")).toBeVisible(); + await expect(page.getByTestId("workspace-tab-actions")).toBeVisible(); + await expect(page.getByTestId("workspace-tab-communication")).toBeVisible(); + await expect(page.getByTestId("workspace-tab-outputs")).toBeVisible(); + await expect(page.getByTestId("workspace-tab-transitions")).toBeVisible(); + await expect(page.getByTestId("workspace-tab-evaluation")).toBeVisible(); + + await page.getByTestId("workspace-tab-actions").click(); await expect(page.getByTestId("workspace-actions")).toBeVisible(); - await expect(page.getByTestId("workspace-outputs")).toBeVisible(); await expect(page.getByTestId("workspace-executions")).toBeVisible(); await expect(page.getByTestId("workspace-sandbox")).toBeVisible(); + + await page.getByTestId("workspace-tab-outputs").click(); + await expect(page.getByTestId("workspace-outputs")).toBeVisible(); + + await page.getByTestId("workspace-tab-communication").click(); await expect(page.getByTestId("workspace-communication")).toBeVisible(); + + await 
page.getByTestId("workspace-tab-transitions").click(); await expect(page.getByTestId("workspace-transitions")).toBeVisible(); + await page.getByTestId("workspace-tab-evaluation").click(); if (evaluatedTaskIds.has(selected.id)) { await expect(page.getByTestId("workspace-evaluation")).toContainText("Total score"); } else { @@ -143,8 +170,10 @@ async function assertRunWorkspace( await expect(page.locator('[data-testid^="event-row-"]').first()).toBeVisible(); if (state.mutation_count > 0) { - await page.getByTestId("mode-timeline").click(); + await page.locator('[data-testid^="activity-bar-"]').first().click(); await expect(page.getByTestId("timeline-region")).toBeVisible(); + await expect(page.getByTestId("activity-current-sequence")).toContainText(/seq/i); + await expectNoTimelinePlaybackControls(page); } } @@ -201,6 +230,14 @@ export function defineSmokeSpec(cfg: SmokeSpecConfig): void { page, path.join(screenshotDir, cfg.env, `${run_id}-happy.png`), ); + await screenshot( + page, + path.join(screenshotDir, cfg.env, `${run_id}-visual-debugger-full.png`), + ); + await locatorScreenshot( + page.getByTestId("activity-stack-region"), + path.join(screenshotDir, cfg.env, `${run_id}-activity-stack.png`), + ); if (cfg.extraRunAssertions) { await cfg.extraRunAssertions(page, run_id); @@ -208,10 +245,9 @@ export function defineSmokeSpec(cfg: SmokeSpecConfig): void { return; } - // sad-path run assertions (researchrubrics-only today). A failed leaf - // returns score-zero output so persistence still runs. - expect(state.status).toBe("completed"); - expect(state.resource_count).toBeGreaterThanOrEqual(17); + // Canonical sad path: l_2 fails, l_3 blocks, independent leaves complete. 
+ expect(state.status).toBe("failed"); + expect(state.resource_count).toBeGreaterThanOrEqual(15); expect(state.executions.length).toBe(state.execution_count); expect(state.mutations.length).toBe(state.mutation_count); expect(state.thread_count).toBeGreaterThan(0); @@ -219,11 +255,11 @@ export function defineSmokeSpec(cfg: SmokeSpecConfig): void { const statusBySlug = new Map( state.graph_nodes.filter((n) => n.level > 0).map((n) => [n.task_slug, n.status]), ); - for (const slug of EXPECTED_SUBTASK_SLUGS) { + for (const slug of EXPECTED_SUBTASK_SLUGS.filter((s) => !["l_2", "l_3"].includes(s))) { expect(statusBySlug.get(slug)).toBe("completed"); } - const failedEval = state.evaluations.some((e) => e.score === 0.0); - expect(failedEval).toBe(true); + expect(statusBySlug.get("l_2")).toBe("failed"); + expect(statusBySlug.get("l_3")).toBe("blocked"); const cohortId = await client.getCohortId(cohortKey); await page.goto(`/cohorts/${cohortId}/runs/${run_id}`); @@ -232,6 +268,14 @@ export function defineSmokeSpec(cfg: SmokeSpecConfig): void { page, path.join(screenshotDir, cfg.env, `${run_id}-sad.png`), ); + await screenshot( + page, + path.join(screenshotDir, cfg.env, `${run_id}-visual-debugger-full.png`), + ); + await locatorScreenshot( + page.getByTestId("activity-stack-region"), + path.join(screenshotDir, cfg.env, `${run_id}-activity-stack.png`), + ); }); } diff --git a/ergon-dashboard/tests/e2e/activity-stack.spec.ts b/ergon-dashboard/tests/e2e/activity-stack.spec.ts new file mode 100644 index 00000000..24fcfe2b --- /dev/null +++ b/ergon-dashboard/tests/e2e/activity-stack.spec.ts @@ -0,0 +1,192 @@ +import { expect, Page, test } from "@playwright/test"; +import * as fs from "node:fs/promises"; +import * as path from "node:path"; + +import { + CONCURRENT_MAS_FIXTURE_IDS, + createConcurrentMasDashboardSeed, +} from "../helpers/dashboardFixtures"; +import { acquireHarnessLock, resetHarness, seedHarness } from "../helpers/harnessClient"; + +interface Box { + x: number; + y: 
number; + width: number; + height: number; +} + +test.describe.configure({ mode: "serial" }); + +let releaseHarnessLock: (() => Promise) | null = null; + +test.beforeEach(async ({ request }) => { + releaseHarnessLock = await acquireHarnessLock(); + try { + await resetHarness(request); + await seedHarness(request, createConcurrentMasDashboardSeed()); + } catch (error) { + await releaseHarnessLock(); + releaseHarnessLock = null; + throw error; + } +}); + +test.afterEach(async () => { + await releaseHarnessLock?.(); + releaseHarnessLock = null; +}); + +function boxesOverlap(a: Box, b: Box, tolerancePx = 2): boolean { + return ( + a.x + tolerancePx < b.x + b.width && + a.x + a.width > b.x + tolerancePx && + a.y + tolerancePx < b.y + b.height && + a.y + a.height > b.y + tolerancePx + ); +} + +async function overlappingPairsFor(page: Page, selector: string): Promise<[number, number][]> { + const boxes = await page.locator(selector).evaluateAll((elements) => + elements.map((element) => { + const rect = element.getBoundingClientRect(); + return { x: rect.x, y: rect.y, width: rect.width, height: rect.height }; + }), + ); + const pairs: [number, number][] = []; + for (let i = 0; i < boxes.length; i++) { + for (let j = i + 1; j < boxes.length; j++) { + if (boxesOverlap(boxes[i], boxes[j])) pairs.push([i, j]); + } + } + return pairs; +} + +async function activityGeometry(page: Page): Promise> { + return page.locator('[data-activity-id]').evaluateAll((elements) => { + return Object.fromEntries( + elements.map((element) => { + return [ + element.getAttribute("data-activity-id") ?? 
"", + { + x: Number(element.getAttribute("data-left-pct")), + y: Number(element.getAttribute("data-row")), + width: Number(element.getAttribute("data-width-pct")), + height: 1, + }, + ]; + }), + ); + }); +} + +function expectGeometryStable(before: Record, after: Record) { + for (const [id, box] of Object.entries(before)) { + const next = after[id]; + expect(next, `${id} still exists after replay selection`).toBeTruthy(); + expect(Math.round(next.x * 1000), `${id} left pct`).toBe(Math.round(box.x * 1000)); + expect(Math.round(next.y), `${id} y`).toBe(Math.round(box.y)); + expect(Math.round(next.width * 1000), `${id} width pct`).toBe(Math.round(box.width * 1000)); + expect(Math.round(next.height), `${id} height`).toBe(Math.round(box.height)); + } +} + +async function dumpScreenshots(page: Page) { + if (process.env.VISUAL_DEBUGGER_SCREENSHOTS !== "1") return; + const outDir = path.join(process.cwd(), "tmp", "visual-debugger"); + await fs.mkdir(outDir, { recursive: true }); + await page.screenshot({ path: path.join(outDir, "run-full.png"), fullPage: true }); + await page.getByTestId("activity-stack-region").screenshot({ + path: path.join(outDir, "activity-stack.png"), + }); + await page.getByTestId("workspace-region").screenshot({ + path: path.join(outDir, "workspace-open.png"), + }); +} + +async function dumpGraphScreenshot(page: Page) { + if (process.env.VISUAL_DEBUGGER_SCREENSHOTS !== "1") return; + const outDir = path.join(process.cwd(), "tmp", "visual-debugger"); + await fs.mkdir(outDir, { recursive: true }); + await page.getByTestId("graph-canvas").screenshot({ + path: path.join(outDir, "graph-canvas.png"), + }); +} + +async function expectNoTimelinePlaybackControls(page: Page) { + await expect(page.getByTestId("activity-play-toggle")).toHaveCount(0); + await expect(page.getByTestId("activity-speed-control")).toHaveCount(0); + await expect(page.getByTestId("activity-step-back")).toHaveCount(0); + await 
expect(page.getByTestId("activity-step-forward")).toHaveCount(0); +} + +test("visual debugger renders graph, activity stack, and time-aware workspace", async ({ page }) => { + await page.goto( + `/cohorts/${CONCURRENT_MAS_FIXTURE_IDS.cohortId}/runs/${CONCURRENT_MAS_FIXTURE_IDS.runId}`, + ); + + await expect(page.getByTestId("run-header")).toBeVisible(); + await expect(page.getByTestId("graph-canvas")).toBeVisible(); + await expect(page.getByTestId("activity-stack-region")).toBeVisible(); + await expect(page.getByTestId("activity-kind-legend")).toContainText("Span"); + await expect(page.getByTestId("activity-kind-legend")).toContainText("Point event"); + await expect(page.getByTestId("activity-band-work")).toBeVisible(); + await expect(page.getByTestId("activity-band-graph")).toBeVisible(); + await expect(page.getByTestId("activity-band-tools")).toBeVisible(); + await expect(page.getByTestId("activity-band-communication")).toBeVisible(); + await expect(page.getByTestId("activity-band-outputs")).toBeVisible(); + await expectNoTimelinePlaybackControls(page); + expect(await page.getByTestId("activity-stack-row").count()).toBeGreaterThan(1); + await expect + .poll( + async () => + (await overlappingPairsFor(page, "[data-activity-id]")).length, + { timeout: 5000 }, + ) + .toBe(0); + await expect + .poll( + async () => + ( + await overlappingPairsFor( + page, + '.react-flow__node:has([data-testid^="graph-node-"])', + ) + ).length, + { timeout: 5000 }, + ) + .toBe(0); + await dumpGraphScreenshot(page); + + const graphActivity = page + .locator('[data-activity-id^="graph:"]:not([data-task-id=""])') + .first(); + await expect(graphActivity).toBeVisible(); + const beforeGeometry = await activityGeometry(page); + await graphActivity.hover(); + await expect(page.getByTestId("activity-debug-preview")).toBeVisible(); + await expect(page.getByTestId("activity-debug-preview")).toContainText("Lineage"); + await 
expect(page.getByTestId("activity-debug-preview")).toContainText("graph.mutation"); + expect(await page.locator('[data-relation="dimmed"]').count()).toBeGreaterThan(0); + await graphActivity.click(); + expectGeometryStable(beforeGeometry, await activityGeometry(page)); + await expect(page.locator('[data-current="true"]')).toHaveCount(1); + await expect(graphActivity).toHaveAttribute("data-current", "true"); + + await expect(page.getByTestId("workspace-region")).toBeVisible(); + await expect(page.getByTestId("workspace-header")).toBeVisible(); + await expect(page.getByTestId("workspace-activity-detail")).toBeVisible(); + await expect(page.getByTestId("workspace-activity-detail")).toContainText("Graph mutation"); + await expect(page.getByTestId("workspace-activity-detail")).toContainText("payload"); + await expect(page.getByTestId("workspace-activity-detail")).toContainText("graph.mutation"); + await expect(page.getByTestId("workspace-timeline-badge")).toContainText("seq"); + await expectNoTimelinePlaybackControls(page); + + await page.keyboard.press("Escape"); + const toolActivity = page.locator('[data-activity-id^="context:"]').first(); + await expect(toolActivity).toBeVisible(); + await toolActivity.click(); + await expect(page.getByTestId("workspace-timeline-badge")).toContainText(/seq [1-9]/); + await expect(page.getByTestId("snapshot-pin").first()).toBeVisible(); + + await dumpScreenshots(page); +}); diff --git a/ergon-dashboard/tests/e2e/cohort.snapshot.spec.ts b/ergon-dashboard/tests/e2e/cohort.snapshot.spec.ts index 74b3abaf..e8ba4d3c 100644 --- a/ergon-dashboard/tests/e2e/cohort.snapshot.spec.ts +++ b/ergon-dashboard/tests/e2e/cohort.snapshot.spec.ts @@ -1,28 +1,56 @@ import { expect, test } from "@playwright/test"; import { createDashboardSeed, FIXTURE_IDS } from "../helpers/dashboardFixtures"; -import { resetHarness, seedHarness } from "../helpers/harnessClient"; +import { acquireHarnessLock, resetHarness, seedHarness } from "../helpers/harnessClient"; + 
+test.describe.configure({ mode: "serial" }); + +let releaseHarnessLock: (() => Promise) | null = null; test.beforeEach(async ({ request }) => { - await resetHarness(request); - await seedHarness(request, createDashboardSeed()); + releaseHarnessLock = await acquireHarnessLock(); + try { + await resetHarness(request); + await seedHarness(request, createDashboardSeed()); + } catch (error) { + await releaseHarnessLock(); + releaseHarnessLock = null; + throw error; + } +}); + +test.afterEach(async () => { + await releaseHarnessLock?.(); + releaseHarnessLock = null; }); test("cohort index renders cohort-first snapshot truth", async ({ page }) => { await page.goto("/"); - await expect(page.getByTestId("cohort-index-header")).toContainText("Experiment Cohorts"); + await expect(page.getByTestId("cohort-index-header")).toContainText("Cohorts"); await expect(page.getByTestId(`cohort-row-${FIXTURE_IDS.cohortId}`)).toContainText( "minif2f-react-worker-gpt5v3", ); - await expect(page.getByTestId(`cohort-row-${FIXTURE_IDS.cohortId}`)).toContainText("Runs"); + await expect(page.getByTestId("cohort-index-list")).toContainText("Runs"); }); test("cohort detail renders summary and run list", async ({ page }) => { await page.goto(`/cohorts/${FIXTURE_IDS.cohortId}`); await expect(page.getByTestId("cohort-header")).toContainText("minif2f-react-worker-gpt5v3"); - await expect(page.getByTestId("cohort-summary-cards")).toContainText("Total runs"); + await expect(page.getByTestId("cohort-summary-cards")).toContainText("Runs · pass / fail"); + await expect(page.getByTestId("cohort-summary-cards")).toContainText("3 of 3 runs"); + await expect(page.getByTestId("cohort-summary-cards")).toContainText("Avg tasks"); + await expect(page.getByTestId("cohort-summary-cards")).toContainText("10.0"); + await expect(page.getByRole("button", { name: "Compare" })).toHaveCount(0); + await expect(page.getByRole("button", { name: "Re-run failed" })).toHaveCount(0); + await expect(page.getByRole("button", { 
name: "Open in training" })).toHaveCount(0); + await expect(page.getByTestId("cohort-run-distribution")).toBeVisible(); + await expect(page.getByTestId("cohort-run-distribution")).toContainText("Score distribution"); + await expect(page.getByTestId("cohort-distribution-point")).toHaveCount(3); + await page.getByTestId("cohort-distribution-metric-runtime").click(); + await expect(page.getByTestId("cohort-run-distribution")).toContainText("Runtime distribution"); + await expect(page.getByTestId("cohort-distribution-point")).toHaveCount(3); const runRow = page.getByTestId(`cohort-run-row-${FIXTURE_IDS.runId}`); await expect(runRow).toContainText("minif2f-react-worker-gpt5v3"); await expect(runRow).toContainText("Started"); diff --git a/ergon-dashboard/tests/e2e/health.spec.ts b/ergon-dashboard/tests/e2e/health.spec.ts new file mode 100644 index 00000000..e774e624 --- /dev/null +++ b/ergon-dashboard/tests/e2e/health.spec.ts @@ -0,0 +1,63 @@ +import { test, expect } from "@playwright/test"; + +/** + * E2E tests for the /api/health endpoint. + * + * Validates that: + * - The health endpoint is reachable and returns structured JSON + * - SSR imports are exercised (catches stale .next cache) + * - The response shape matches the expected schema + */ + +const BASE = process.env.BASE_URL ?? 
"http://localhost:3001"; + +test.describe("Health endpoint", () => { + test("returns 200 with healthy status when build is fresh", async ({ request }) => { + const res = await request.get(`${BASE}/api/health`); + expect(res.status()).toBe(200); + + const body = await res.json(); + expect(body.status).toBe("healthy"); + expect(body.checks).toHaveProperty("ssr_imports", "ok"); + expect(body.checks).toHaveProperty("ergon_api"); + expect(body.build).toHaveProperty("nodeEnv"); + expect(body.build).toHaveProperty("pid"); + expect(typeof body.build.pid).toBe("number"); + }); + + test("response schema includes all expected fields", async ({ request }) => { + const res = await request.get(`${BASE}/api/health`); + const body = await res.json(); + + expect(body).toHaveProperty("status"); + expect(body).toHaveProperty("checks"); + expect(body).toHaveProperty("build"); + expect(["healthy", "degraded"]).toContain(body.status); + + for (const value of Object.values(body.checks)) { + expect(["ok", "fail"]).toContain(value); + } + }); + + test("SSR import check exercises the actual module graph", async ({ request }) => { + const res = await request.get(`${BASE}/api/health`); + const body = await res.json(); + + expect(body.checks.ssr_imports).toBe("ok"); + if (body.checks.ssr_imports === "fail") { + expect(body.errors).toBeDefined(); + expect(body.errors.length).toBeGreaterThan(0); + expect(body.errors[0]).toContain("SSR import"); + } + }); +}); + +test.describe("Build health toast (UI)", () => { + test("toast is hidden when build is healthy", async ({ page }) => { + await page.goto(`${BASE}/`); + await page.waitForLoadState("networkidle"); + + const toast = page.locator('[data-testid="build-health-toast"]'); + await expect(toast).not.toBeVisible(); + }); +}); diff --git a/ergon-dashboard/tests/e2e/minif2f.smoke.spec.ts b/ergon-dashboard/tests/e2e/minif2f.smoke.spec.ts index 49f8323b..162b67a6 100644 --- a/ergon-dashboard/tests/e2e/minif2f.smoke.spec.ts +++ 
b/ergon-dashboard/tests/e2e/minif2f.smoke.spec.ts @@ -1,8 +1,7 @@ /** * Canonical smoke Playwright spec for the minif2f leg. * - * 3 happy-path cohort runs. No sad slot. All assertions in the - * shared factory (./._shared/smoke.ts). + * One canonical sad-path run. All assertions live in the shared factory. */ import { defineSmokeSpec } from "./_shared/smoke"; diff --git a/ergon-dashboard/tests/e2e/researchrubrics.smoke.spec.ts b/ergon-dashboard/tests/e2e/researchrubrics.smoke.spec.ts index 6a29289f..80955747 100644 --- a/ergon-dashboard/tests/e2e/researchrubrics.smoke.spec.ts +++ b/ergon-dashboard/tests/e2e/researchrubrics.smoke.spec.ts @@ -1,8 +1,7 @@ /** * Canonical smoke Playwright spec for the researchrubrics leg. * - * Cohort shape: 2 happy + 1 sad (see docs/superpowers/plans/test-refactor/00-program.md §3.2). - * All assertions defined in the shared factory. + * One canonical sad-path run. All assertions live in the shared factory. */ import { defineSmokeSpec } from "./_shared/smoke"; diff --git a/ergon-dashboard/tests/e2e/run.delta.spec.ts b/ergon-dashboard/tests/e2e/run.delta.spec.ts index ef3d2a4c..a1b4f239 100644 --- a/ergon-dashboard/tests/e2e/run.delta.spec.ts +++ b/ergon-dashboard/tests/e2e/run.delta.spec.ts @@ -4,14 +4,31 @@ import { createDashboardSeed, createDeltaContextEvent, createDeltaThread, + createEmptyCriteriaEvaluation, createUpdatedEvaluation, FIXTURE_IDS, } from "../helpers/dashboardFixtures"; -import { resetHarness, seedHarness } from "../helpers/harnessClient"; +import { acquireHarnessLock, resetHarness, seedHarness } from "../helpers/harnessClient"; + +test.describe.configure({ mode: "serial" }); + +let releaseHarnessLock: (() => Promise) | null = null; test.beforeEach(async ({ request }) => { - await resetHarness(request); - await seedHarness(request, createDashboardSeed()); + releaseHarnessLock = await acquireHarnessLock(); + try { + await resetHarness(request); + await seedHarness(request, createDashboardSeed()); + } catch (error) { 
+ await releaseHarnessLock(); + releaseHarnessLock = null; + throw error; + } +}); + +test.afterEach(async () => { + await releaseHarnessLock?.(); + releaseHarnessLock = null; }); test("run header reacts to controlled completion delta", async ({ page }) => { @@ -56,12 +73,15 @@ test("communication and evaluation react to controlled deltas", async ({ page }) }); expect(evaluationResponse.ok()).toBeTruthy(); + await page.getByTestId("workspace-tab-communication").click(); await expect(page.getByTestId("workspace-communication")).toContainText( "I am rewriting the final proof around that parity split now.", ); + await page.getByTestId("workspace-tab-evaluation").click(); await expect(page.getByTestId("workspace-evaluation")).toContainText( "The updated proof compiles cleanly and closes every goal", ); + await page.getByTestId("workspace-tab-actions").click(); await expect(page.getByTestId("workspace-actions")).not.toContainText( "I am rewriting the final proof around that parity split now.", ); @@ -71,6 +91,7 @@ test("workspace actions react to controlled context event deltas", async ({ page await page.goto(`/cohorts/${FIXTURE_IDS.cohortId}/runs/${FIXTURE_IDS.runId}`); await page.getByTestId(`graph-node-${FIXTURE_IDS.solveTaskId}`).click(); + await page.getByTestId("workspace-tab-actions").click(); await expect(page.getByTestId("workspace-actions")).toContainText("lean_check"); const response = await page.request.post("/api/test/dashboard/events/context-event", { data: { @@ -83,3 +104,25 @@ test("workspace actions react to controlled context event deltas", async ({ page await expect(page.getByTestId("workspace-actions")).toContainText("lake_build"); }); + +test("evaluation tab shows a clear empty criteria state", async ({ page }) => { + await page.goto(`/cohorts/${FIXTURE_IDS.cohortId}/runs/${FIXTURE_IDS.runId}`); + await page.getByTestId(`graph-node-${FIXTURE_IDS.solveTaskId}`).click(); + + const response = await 
page.request.post("/api/test/dashboard/events/task-evaluation", { + data: { + runId: FIXTURE_IDS.runId, + taskId: FIXTURE_IDS.solveTaskId, + evaluation: createEmptyCriteriaEvaluation(), + }, + }); + expect(response.ok()).toBeTruthy(); + + await page.getByTestId("workspace-tab-evaluation").click(); + await expect(page.getByTestId("evaluation-criteria-empty")).toContainText( + "No evaluation criteria recorded yet", + ); + await expect(page.getByTestId("evaluation-criteria-empty")).toContainText( + "This task has no criterionResults in the persisted evaluation payload.", + ); +}); diff --git a/ergon-dashboard/tests/e2e/run.snapshot.spec.ts b/ergon-dashboard/tests/e2e/run.snapshot.spec.ts index 47c0279e..f0ef092a 100644 --- a/ergon-dashboard/tests/e2e/run.snapshot.spec.ts +++ b/ergon-dashboard/tests/e2e/run.snapshot.spec.ts @@ -1,20 +1,109 @@ import { expect, test } from "@playwright/test"; -import { createDashboardSeed, FIXTURE_IDS } from "../helpers/dashboardFixtures"; -import { resetHarness, seedHarness } from "../helpers/harnessClient"; +import { + CONCURRENT_MAS_FIXTURE_IDS, + createDashboardSeed, + FIXTURE_IDS, +} from "../helpers/dashboardFixtures"; +import { acquireHarnessLock, resetHarness, seedHarness } from "../helpers/harnessClient"; + +test.describe.configure({ mode: "serial" }); + +let releaseHarnessLock: (() => Promise) | null = null; test.beforeEach(async ({ request }) => { - await resetHarness(request); - await seedHarness(request, createDashboardSeed()); + releaseHarnessLock = await acquireHarnessLock(); + try { + await resetHarness(request); + await seedHarness(request, createDashboardSeed()); + } catch (error) { + await releaseHarnessLock(); + releaseHarnessLock = null; + throw error; + } }); +test.afterEach(async () => { + await releaseHarnessLock?.(); + releaseHarnessLock = null; +}); + +async function expectNoTimelinePlaybackControls(page: import("@playwright/test").Page) { + await expect(page.getByTestId("mode-live")).toHaveCount(0); + await 
expect(page.getByTestId("mode-timeline")).toHaveCount(0); + await expect(page.getByTestId("activity-play-toggle")).toHaveCount(0); + await expect(page.getByTestId("activity-speed-control")).toHaveCount(0); + await expect(page.getByTestId("activity-step-back")).toHaveCount(0); + await expect(page.getByTestId("activity-step-forward")).toHaveCount(0); +} + test("run page keeps cohort breadcrumb context", async ({ page }) => { await page.goto(`/cohorts/${FIXTURE_IDS.cohortId}/runs/${FIXTURE_IDS.runId}`); await expect(page.getByTestId("run-breadcrumb-cohort")).toContainText( "minif2f-react-worker-gpt5v3", ); - await expect(page.getByTestId("run-header")).toContainText("amc12a_2008_p25"); + await expect(page.getByTestId("run-header")).toContainText("parallel"); +}); + +test("run workspace does not expose manual live or timeline mode controls", async ({ page }) => { + await page.goto(`/cohorts/${FIXTURE_IDS.cohortId}/runs/${FIXTURE_IDS.runId}`); + + await expect(page.getByTestId("graph-canvas")).toBeVisible(); + await expectNoTimelinePlaybackControls(page); +}); + +test("run workspace shows rerun as unavailable until backend support exists", async ({ page }) => { + await page.goto(`/cohorts/${FIXTURE_IDS.cohortId}/runs/${FIXTURE_IDS.runId}`); + + const rerunButton = page.getByTestId("rerun-button"); + await expect(rerunButton).toBeVisible(); + await expect(rerunButton).toBeDisabled(); + await expect(rerunButton).toHaveAttribute("title", /not wired/i); +}); + +test("snapshot selection does not expose playback or speed controls", async ({ page }) => { + await page.goto( + `/cohorts/${CONCURRENT_MAS_FIXTURE_IDS.cohortId}/runs/${CONCURRENT_MAS_FIXTURE_IDS.runId}`, + ); + + await expect(page.getByTestId("activity-stack-region")).toBeVisible(); + const activity = page.locator('[data-activity-id^="graph:"]').first(); + await expect(activity).toBeVisible(); + await activity.click(); + + await expect(page.getByTestId("activity-current-sequence")).toContainText("replay"); + await 
expectNoTimelinePlaybackControls(page); +}); + +test("activity marker locks graph and header to snapshot until Escape returns to live", async ({ + page, +}) => { + await page.goto( + `/cohorts/${CONCURRENT_MAS_FIXTURE_IDS.cohortId}/runs/${CONCURRENT_MAS_FIXTURE_IDS.runId}`, + ); + + await expect(page.getByTestId("graph-canvas")).toBeVisible(); + const validateCitationsNode = page.getByTestId( + "graph-node-10000000-0000-4000-8000-000000000006", + ); + await expect(validateCitationsNode).toHaveAttribute("data-task-status", "completed"); + + const snapshotMarker = page.getByTestId( + "activity-bar-graph-70000000-0000-4000-8000-000000000014", + ); + await expect(snapshotMarker).toBeVisible(); + await snapshotMarker.click(); + + await expect(page.getByTestId("snapshot-lock-label")).toBeVisible(); + await expect(page.getByTestId("snapshot-pin").first()).toBeVisible(); + await expect(page.getByTestId("run-header")).toContainText("snapshot · seq 14"); + await expect(validateCitationsNode).toHaveAttribute("data-task-status", "pending"); + + await page.keyboard.press("Escape"); + await expect(page.getByTestId("run-header")).toContainText(/live/i); + await expect(page.getByTestId("snapshot-lock-label")).toHaveCount(0); + await expect(validateCitationsNode).toHaveAttribute("data-task-status", "completed"); }); test("graph selection opens workspace evidence sections", async ({ page }) => { @@ -26,16 +115,80 @@ test("graph selection opens workspace evidence sections", async ({ page }) => { await expect(page.getByTestId("workspace-header")).toContainText("Write proof"); await expect(page.getByTestId("workspace-close")).toBeVisible(); + await expect(page.getByTestId("workspace-tab-overview")).toBeVisible(); + await expect(page.getByTestId("workspace-tab-actions")).toBeVisible(); + await expect(page.getByTestId("workspace-tab-communication")).toBeVisible(); + await expect(page.getByTestId("workspace-tab-outputs")).toBeVisible(); + await 
expect(page.getByTestId("workspace-tab-transitions")).toBeVisible(); + await expect(page.getByTestId("workspace-tab-evaluation")).toBeVisible(); + await expect(page.getByTestId("workspace-overview")).toBeVisible(); + await expect(page.getByTestId("workspace-actions")).toHaveCount(0); + + const overviewTab = page.getByTestId("workspace-tab-overview"); + const actionsTab = page.getByTestId("workspace-tab-actions"); + await expect(overviewTab).toHaveAttribute("id", "workspace-tab-button-overview"); + await expect(overviewTab).toHaveAttribute("aria-controls", "workspace-tab-panel-overview"); + await expect(overviewTab).toHaveAttribute("aria-selected", "true"); + await expect(page.locator("#workspace-tab-panel-overview")).toHaveAttribute("role", "tabpanel"); + await expect(page.locator("#workspace-tab-panel-overview")).toHaveAttribute( + "aria-labelledby", + "workspace-tab-button-overview", + ); + await expect(page.locator("#workspace-tab-panel-overview")).toHaveAttribute("tabindex", "0"); + + await overviewTab.focus(); + await page.keyboard.press("ArrowRight"); + await expect(actionsTab).toBeFocused(); + await expect(actionsTab).toHaveAttribute("aria-selected", "true"); + await expect(page.locator("#workspace-tab-panel-actions")).toHaveAttribute("role", "tabpanel"); + + await page.getByTestId("workspace-tab-actions").click(); await expect(page.getByTestId("workspace-actions")).toContainText("lean_check"); + await expect(page.getByTestId("workspace-action-card").first()).toBeVisible(); + await expect(page.getByTestId("workspace-action-summary").first()).toContainText("Tool call"); + await expect(page.getByTestId("workspace-action-payload").first()).toContainText("Arguments"); + await expect(page.getByTestId("workspace-executions")).toContainText("Attempt 1"); + await expect(page.getByTestId("workspace-sandbox")).toContainText("lake env lean proof.lean"); + + await page.getByTestId("workspace-tab-communication").click(); + await 
expect(page.getByTestId("communication-thread-list")).toBeVisible(); + await expect(page.getByTestId("communication-thread-card").first()).toContainText("task_clarification"); + await expect(page.getByTestId("communication-chat-trace")).toBeVisible(); + await expect(page.getByTestId("communication-chat-message").first()).toBeVisible(); + const communicationLayout = await page.evaluate(() => { + const list = document.querySelector('[data-testid="communication-thread-list"]'); + const chat = document.querySelector('[data-testid="communication-chat-trace"]'); + if (!list || !chat) return null; + const listBox = list.getBoundingClientRect(); + const chatBox = chat.getBoundingClientRect(); + return { listBottom: listBox.bottom, chatTop: chatBox.top }; + }); + expect(communicationLayout).not.toBeNull(); + expect(communicationLayout!.chatTop).toBeGreaterThanOrEqual(communicationLayout!.listBottom); + await expect + .poll(async () => + page.getByTestId("workspace-communication").evaluate((element) => ({ + clientWidth: element.clientWidth, + scrollWidth: element.scrollWidth, + })), + ) + .toEqual(expect.objectContaining({ scrollWidth: expect.any(Number) })); + const communicationOverflow = await page + .getByTestId("workspace-communication") + .evaluate((element) => element.scrollWidth - element.clientWidth); + expect(communicationOverflow).toBeLessThanOrEqual(1); await expect(page.getByTestId("workspace-communication")).toContainText( "Can I use the standard divisibility lemma here?", ); + + await page.getByTestId("workspace-tab-evaluation").click(); + await expect(page.getByTestId("workspace-evaluation")).toBeVisible(); await expect(page.getByTestId("workspace-evaluation")).toContainText( "Proof compiles and closes all goals", ); + + await page.getByTestId("workspace-tab-outputs").click(); await expect(page.getByTestId("workspace-outputs")).toContainText("proof.lean"); - await expect(page.getByTestId("workspace-executions")).toContainText("Attempt 1"); - await 
expect(page.getByTestId("workspace-sandbox")).toContainText("lake env lean proof.lean"); }); test("persisted run snapshot remains inspectable after refresh", async ({ page }) => { @@ -46,6 +199,8 @@ test("persisted run snapshot remains inspectable after refresh", async ({ page } await page.getByTestId(`graph-node-${FIXTURE_IDS.solveTaskId}`).click(); await expect(page.getByTestId("workspace-header")).toContainText("Write proof"); + await page.getByTestId("workspace-tab-outputs").click(); await expect(page.getByTestId("workspace-outputs")).toContainText("proof.lean"); + await page.getByTestId("workspace-tab-actions").click(); await expect(page.getByTestId("workspace-executions")).toContainText("Attempt 1"); }); diff --git a/ergon-dashboard/tests/e2e/swebench-verified.smoke.spec.ts b/ergon-dashboard/tests/e2e/swebench-verified.smoke.spec.ts index 34ef8787..1e48c762 100644 --- a/ergon-dashboard/tests/e2e/swebench-verified.smoke.spec.ts +++ b/ergon-dashboard/tests/e2e/swebench-verified.smoke.spec.ts @@ -1,8 +1,7 @@ /** * Canonical smoke Playwright spec for the swebench-verified leg. * - * 3 happy-path cohort runs. No sad slot. All assertions in the - * shared factory (./._shared/smoke.ts). + * One canonical sad-path run. All assertions live in the shared factory. */ import { defineSmokeSpec } from "./_shared/smoke"; diff --git a/ergon-dashboard/tests/fixtures/mas-runs/README.md b/ergon-dashboard/tests/fixtures/mas-runs/README.md new file mode 100644 index 00000000..3057be6a --- /dev/null +++ b/ergon-dashboard/tests/fixtures/mas-runs/README.md @@ -0,0 +1,5 @@ +# MAS Run Fixtures + +These fixtures are small, sanitized run snapshots used by frontend semantic layout tests and local visual review. + +They intentionally avoid model outputs, secrets, and large resources. Keep them reviewable: add only the task graph, timestamps, mutations, and evidence needed to prove visual debugger behavior. 
diff --git a/ergon-dashboard/tests/fixtures/mas-runs/concurrent-mas-run.json b/ergon-dashboard/tests/fixtures/mas-runs/concurrent-mas-run.json new file mode 100644 index 00000000..72ff1d0b --- /dev/null +++ b/ergon-dashboard/tests/fixtures/mas-runs/concurrent-mas-run.json @@ -0,0 +1,574 @@ +{ + "name": "concurrent-mas-run", + "runState": { + "id": "99999999-9999-4999-8999-999999999999", + "experimentId": "33333333-3333-4333-8333-333333333333", + "name": "Concurrent MAS run", + "status": "executing", + "rootTaskId": "10000000-0000-4000-8000-000000000001", + "startedAt": "2026-04-26T12:00:00.000Z", + "completedAt": null, + "durationSeconds": null, + "totalTasks": 6, + "totalLeafTasks": 4, + "completedTasks": 1, + "runningTasks": 3, + "failedTasks": 0, + "cancelledTasks": 0, + "finalScore": null, + "error": null, + "tasks": { + "10000000-0000-4000-8000-000000000001": { + "id": "10000000-0000-4000-8000-000000000001", + "name": "Root investigation", + "description": "Coordinate a multi-agent research pass.", + "status": "running", + "parentId": null, + "childIds": [ + "10000000-0000-4000-8000-000000000002", + "10000000-0000-4000-8000-000000000003", + "10000000-0000-4000-8000-000000000004" + ], + "dependsOnIds": [], + "isLeaf": false, + "level": 0, + "assignedWorkerId": null, + "assignedWorkerName": "planner", + "startedAt": "2026-04-26T12:00:00.000Z", + "completedAt": null + }, + "10000000-0000-4000-8000-000000000002": { + "id": "10000000-0000-4000-8000-000000000002", + "name": "Search literature", + "description": "Find candidate references.", + "status": "running", + "parentId": "10000000-0000-4000-8000-000000000001", + "childIds": [], + "dependsOnIds": [], + "isLeaf": true, + "level": 1, + "assignedWorkerId": null, + "assignedWorkerName": "researcher-a", + "startedAt": "2026-04-26T12:00:05.000Z", + "completedAt": null + }, + "10000000-0000-4000-8000-000000000003": { + "id": "10000000-0000-4000-8000-000000000003", + "name": "Check claims", + "description": "Verify 
extracted claims.", + "status": "running", + "parentId": "10000000-0000-4000-8000-000000000001", + "childIds": [], + "dependsOnIds": [], + "isLeaf": true, + "level": 1, + "assignedWorkerId": null, + "assignedWorkerName": "researcher-b", + "startedAt": "2026-04-26T12:00:08.000Z", + "completedAt": null + }, + "10000000-0000-4000-8000-000000000004": { + "id": "10000000-0000-4000-8000-000000000004", + "name": "Synthesize answer", + "description": "Write the final synthesis after evidence is available.", + "status": "pending", + "parentId": "10000000-0000-4000-8000-000000000001", + "childIds": [ + "10000000-0000-4000-8000-000000000005", + "10000000-0000-4000-8000-000000000006" + ], + "dependsOnIds": [ + "10000000-0000-4000-8000-000000000002", + "10000000-0000-4000-8000-000000000003" + ], + "isLeaf": false, + "level": 1, + "assignedWorkerId": null, + "assignedWorkerName": "writer", + "startedAt": null, + "completedAt": null + }, + "10000000-0000-4000-8000-000000000005": { + "id": "10000000-0000-4000-8000-000000000005", + "name": "Draft narrative", + "description": "Draft the explanation.", + "status": "pending", + "parentId": "10000000-0000-4000-8000-000000000004", + "childIds": [], + "dependsOnIds": [], + "isLeaf": true, + "level": 2, + "assignedWorkerId": null, + "assignedWorkerName": "writer-a", + "startedAt": null, + "completedAt": null + }, + "10000000-0000-4000-8000-000000000006": { + "id": "10000000-0000-4000-8000-000000000006", + "name": "Validate citations", + "description": "Validate citation coverage.", + "status": "completed", + "parentId": "10000000-0000-4000-8000-000000000004", + "childIds": [], + "dependsOnIds": [], + "isLeaf": true, + "level": 2, + "assignedWorkerId": null, + "assignedWorkerName": "writer-b", + "startedAt": "2026-04-26T12:00:18.000Z", + "completedAt": "2026-04-26T12:00:25.000Z" + } + }, + "resourcesByTask": { + "10000000-0000-4000-8000-000000000002": [ + { + "id": "20000000-0000-4000-8000-000000000001", + "taskId": 
"10000000-0000-4000-8000-000000000002", + "taskExecutionId": "30000000-0000-4000-8000-000000000001", + "name": "references.md", + "mimeType": "text/markdown", + "filePath": "/workspace/references.md", + "sizeBytes": 1024, + "createdAt": "2026-04-26T12:00:22.000Z" + } + ] + }, + "executionsByTask": { + "10000000-0000-4000-8000-000000000002": [ + { + "id": "30000000-0000-4000-8000-000000000001", + "taskId": "10000000-0000-4000-8000-000000000002", + "attemptNumber": 1, + "status": "running", + "startedAt": "2026-04-26T12:00:05.000Z", + "completedAt": "2026-04-26T12:00:24.000Z", + "finalAssistantMessage": "Found candidate references.", + "errorMessage": null, + "score": null, + "agentId": "agent-a", + "agentName": "researcher-a", + "evaluationDetails": {}, + "outputResourceIds": [ + "20000000-0000-4000-8000-000000000001" + ] + } + ], + "10000000-0000-4000-8000-000000000003": [ + { + "id": "30000000-0000-4000-8000-000000000002", + "taskId": "10000000-0000-4000-8000-000000000003", + "attemptNumber": 1, + "status": "running", + "startedAt": "2026-04-26T12:00:08.000Z", + "completedAt": "2026-04-26T12:00:21.000Z", + "finalAssistantMessage": "Claims verified.", + "errorMessage": null, + "score": null, + "agentId": "agent-b", + "agentName": "researcher-b", + "evaluationDetails": {}, + "outputResourceIds": [] + } + ], + "10000000-0000-4000-8000-000000000006": [ + { + "id": "30000000-0000-4000-8000-000000000003", + "taskId": "10000000-0000-4000-8000-000000000006", + "attemptNumber": 1, + "status": "completed", + "startedAt": "2026-04-26T12:00:18.000Z", + "completedAt": "2026-04-26T12:00:25.000Z", + "finalAssistantMessage": "Citation check passed.", + "errorMessage": null, + "score": null, + "agentId": "agent-c", + "agentName": "writer-b", + "evaluationDetails": {}, + "outputResourceIds": [] + } + ] + }, + "sandboxesByTask": { + "10000000-0000-4000-8000-000000000002": { + "sandboxId": "sandbox-search", + "taskId": "10000000-0000-4000-8000-000000000002", + "template": "research", 
+ "timeoutMinutes": 30, + "status": "closed", + "createdAt": "2026-04-26T12:00:04.000Z", + "closedAt": "2026-04-26T12:00:26.000Z", + "closeReason": "completed", + "commands": [ + { + "command": "python search.py", + "stdout": "ok", + "stderr": null, + "exitCode": 0, + "durationMs": 2000, + "timestamp": "2026-04-26T12:00:12.000Z" + } + ] + } + }, + "threads": [ + { + "id": "40000000-0000-4000-8000-000000000001", + "runId": "99999999-9999-4999-8999-999999999999", + "taskId": "10000000-0000-4000-8000-000000000003", + "topic": "task_clarification", + "agentAId": "agent-b", + "agentBId": "stakeholder", + "createdAt": "2026-04-26T12:00:09.000Z", + "updatedAt": "2026-04-26T12:00:16.000Z", + "messages": [ + { + "id": "40000000-0000-4000-8000-000000000002", + "threadId": "40000000-0000-4000-8000-000000000001", + "threadTopic": "task_clarification", + "runId": "99999999-9999-4999-8999-999999999999", + "taskId": "10000000-0000-4000-8000-000000000003", + "taskExecutionId": "30000000-0000-4000-8000-000000000002", + "fromAgentId": "agent-b", + "toAgentId": "stakeholder", + "content": "Should I reject ungrounded claims?", + "sequenceNum": 0, + "createdAt": "2026-04-26T12:00:16.000Z" + } + ] + } + ], + "evaluationsByTask": { + "10000000-0000-4000-8000-000000000006": { + "id": "50000000-0000-4000-8000-000000000001", + "runId": "99999999-9999-4999-8999-999999999999", + "taskId": "10000000-0000-4000-8000-000000000006", + "totalScore": 1, + "maxScore": 1, + "normalizedScore": 1, + "stagesEvaluated": 1, + "stagesPassed": 1, + "failedGate": null, + "createdAt": "2026-04-26T12:00:27.000Z", + "criterionResults": [ + { + "id": "50000000-0000-4000-8000-000000000002", + "stageNum": 0, + "stageName": "citation_validation", + "criterionNum": 0, + "criterionType": "code_rule", + "criterionDescription": "Citations validate", + "evaluationInput": null, + "score": 1, + "maxScore": 1, + "feedback": "ok", + "evaluatedActionIds": [], + "evaluatedResourceIds": [], + "error": null + } + ] + } + }, + 
"contextEventsByTask": { + "10000000-0000-4000-8000-000000000002": [ + { + "id": "60000000-0000-4000-8000-000000000001", + "taskExecutionId": "30000000-0000-4000-8000-000000000001", + "taskNodeId": "10000000-0000-4000-8000-000000000002", + "workerBindingKey": "researcher-a", + "sequence": 0, + "eventType": "tool_call", + "payload": { + "event_type": "tool_call", + "tool_name": "search", + "args": { + "query": "MAS layout" + } + }, + "createdAt": "2026-04-26T12:00:10.000Z", + "startedAt": "2026-04-26T12:00:10.000Z", + "completedAt": "2026-04-26T12:00:14.000Z" + } + ] + } + }, + "mutations": [ + { + "id": "70000000-0000-4000-8000-000000000001", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 1, + "mutation_type": "node.added", + "target_type": "node", + "target_id": "10000000-0000-4000-8000-000000000001", + "actor": "planner", + "old_value": null, + "new_value": { + "task_slug": "Root investigation", + "instance_key": "root", + "description": "Coordinate a multi-agent research pass.", + "status": "pending", + "assigned_worker_slug": "planner" + }, + "reason": "workflow started", + "created_at": "2026-04-26T12:00:00.000Z" + }, + { + "id": "70000000-0000-4000-8000-000000000002", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 2, + "mutation_type": "node.status_changed", + "target_type": "node", + "target_id": "10000000-0000-4000-8000-000000000001", + "actor": "planner", + "old_value": { + "status": "pending" + }, + "new_value": { + "status": "running" + }, + "reason": "worker started", + "created_at": "2026-04-26T12:00:00.000Z" + }, + { + "id": "70000000-0000-4000-8000-000000000003", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 3, + "mutation_type": "node.added", + "target_type": "node", + "target_id": "10000000-0000-4000-8000-000000000002", + "actor": "planner", + "old_value": null, + "new_value": { + "task_slug": "Search literature", + "instance_key": "search", + "description": "Find candidate references.", + 
"status": "pending", + "assigned_worker_slug": "researcher-a" + }, + "reason": "delegate", + "created_at": "2026-04-26T12:00:03.000Z" + }, + { + "id": "70000000-0000-4000-8000-000000000004", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 4, + "mutation_type": "edge.added", + "target_type": "edge", + "target_id": "70000000-0000-4000-8000-100000000004", + "actor": "planner", + "old_value": null, + "new_value": { + "source_node_id": "10000000-0000-4000-8000-000000000001", + "target_node_id": "10000000-0000-4000-8000-000000000002", + "status": "active" + }, + "reason": "parent-child", + "created_at": "2026-04-26T12:00:03.100Z" + }, + { + "id": "70000000-0000-4000-8000-000000000005", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 5, + "mutation_type": "node.added", + "target_type": "node", + "target_id": "10000000-0000-4000-8000-000000000003", + "actor": "planner", + "old_value": null, + "new_value": { + "task_slug": "Check claims", + "instance_key": "check", + "description": "Verify extracted claims.", + "status": "pending", + "assigned_worker_slug": "researcher-b" + }, + "reason": "delegate", + "created_at": "2026-04-26T12:00:04.000Z" + }, + { + "id": "70000000-0000-4000-8000-000000000006", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 6, + "mutation_type": "edge.added", + "target_type": "edge", + "target_id": "70000000-0000-4000-8000-100000000006", + "actor": "planner", + "old_value": null, + "new_value": { + "source_node_id": "10000000-0000-4000-8000-000000000001", + "target_node_id": "10000000-0000-4000-8000-000000000003", + "status": "active" + }, + "reason": "parent-child", + "created_at": "2026-04-26T12:00:04.100Z" + }, + { + "id": "70000000-0000-4000-8000-000000000007", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 7, + "mutation_type": "node.added", + "target_type": "node", + "target_id": "10000000-0000-4000-8000-000000000004", + "actor": "planner", + "old_value": null, + 
"new_value": { + "task_slug": "Synthesize answer", + "instance_key": "synth", + "description": "Write the final synthesis after evidence is available.", + "status": "pending", + "assigned_worker_slug": "writer" + }, + "reason": "delegate", + "created_at": "2026-04-26T12:00:05.000Z" + }, + { + "id": "70000000-0000-4000-8000-000000000008", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 8, + "mutation_type": "edge.added", + "target_type": "edge", + "target_id": "70000000-0000-4000-8000-100000000008", + "actor": "planner", + "old_value": null, + "new_value": { + "source_node_id": "10000000-0000-4000-8000-000000000001", + "target_node_id": "10000000-0000-4000-8000-000000000004", + "status": "active" + }, + "reason": "parent-child", + "created_at": "2026-04-26T12:00:05.100Z" + }, + { + "id": "70000000-0000-4000-8000-000000000009", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 9, + "mutation_type": "node.status_changed", + "target_type": "node", + "target_id": "10000000-0000-4000-8000-000000000002", + "actor": "researcher-a", + "old_value": { + "status": "pending" + }, + "new_value": { + "status": "running" + }, + "reason": "worker started", + "created_at": "2026-04-26T12:00:05.000Z" + }, + { + "id": "70000000-0000-4000-8000-000000000010", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 10, + "mutation_type": "node.status_changed", + "target_type": "node", + "target_id": "10000000-0000-4000-8000-000000000003", + "actor": "researcher-b", + "old_value": { + "status": "pending" + }, + "new_value": { + "status": "running" + }, + "reason": "worker started", + "created_at": "2026-04-26T12:00:08.000Z" + }, + { + "id": "70000000-0000-4000-8000-000000000011", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 11, + "mutation_type": "node.added", + "target_type": "node", + "target_id": "10000000-0000-4000-8000-000000000005", + "actor": "writer", + "old_value": null, + "new_value": { + "task_slug": "Draft 
narrative", + "instance_key": "draft", + "description": "Draft the explanation.", + "status": "pending", + "assigned_worker_slug": "writer-a" + }, + "reason": "delegate nested", + "created_at": "2026-04-26T12:00:12.000Z" + }, + { + "id": "70000000-0000-4000-8000-000000000012", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 12, + "mutation_type": "edge.added", + "target_type": "edge", + "target_id": "70000000-0000-4000-8000-100000000012", + "actor": "writer", + "old_value": null, + "new_value": { + "source_node_id": "10000000-0000-4000-8000-000000000004", + "target_node_id": "10000000-0000-4000-8000-000000000005", + "status": "active" + }, + "reason": "parent-child", + "created_at": "2026-04-26T12:00:12.100Z" + }, + { + "id": "70000000-0000-4000-8000-000000000013", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 13, + "mutation_type": "node.added", + "target_type": "node", + "target_id": "10000000-0000-4000-8000-000000000006", + "actor": "writer", + "old_value": null, + "new_value": { + "task_slug": "Validate citations", + "instance_key": "validate", + "description": "Validate citation coverage.", + "status": "pending", + "assigned_worker_slug": "writer-b" + }, + "reason": "delegate nested", + "created_at": "2026-04-26T12:00:13.000Z" + }, + { + "id": "70000000-0000-4000-8000-000000000014", + "run_id": "99999999-9999-4999-8999-999999999999", + "sequence": 14, + "mutation_type": "edge.added", + "target_type": "edge", + "target_id": "70000000-0000-4000-8000-100000000014", + "actor": "writer", + "old_value": null, + "new_value": { + "source_node_id": "10000000-0000-4000-8000-000000000004", + "target_node_id": "10000000-0000-4000-8000-000000000006", + "status": "active" + }, + "reason": "parent-child", + "created_at": "2026-04-26T12:00:13.100Z" + } + ], + "checkpoints": [ + { + "sequence": 10, + "expectedTaskIds": [ + "10000000-0000-4000-8000-000000000001", + "10000000-0000-4000-8000-000000000002", + 
"10000000-0000-4000-8000-000000000003", + "10000000-0000-4000-8000-000000000004" + ], + "expectedMaxConcurrency": 1, + "selectedTime": "2026-04-26T12:00:10.000Z", + "hiddenFutureResourceName": "references.md" + }, + { + "sequence": 14, + "expectedTaskIds": [ + "10000000-0000-4000-8000-000000000001", + "10000000-0000-4000-8000-000000000002", + "10000000-0000-4000-8000-000000000003", + "10000000-0000-4000-8000-000000000004", + "10000000-0000-4000-8000-000000000005", + "10000000-0000-4000-8000-000000000006" + ], + "expectedMaxConcurrency": 2, + "selectedTime": "2026-04-26T12:00:18.000Z", + "hiddenFutureResourceName": "references.md" + } + ] +} diff --git a/ergon-dashboard/tests/helpers/dashboardFixtures.ts b/ergon-dashboard/tests/helpers/dashboardFixtures.ts index 92aba500..beaabe38 100644 --- a/ergon-dashboard/tests/helpers/dashboardFixtures.ts +++ b/ergon-dashboard/tests/helpers/dashboardFixtures.ts @@ -1,4 +1,5 @@ import type { DashboardHarnessSeedPayload } from "../../src/lib/testing/dashboardHarness"; +import concurrentMasFixture from "../fixtures/mas-runs/concurrent-mas-run.json"; import type { CommunicationThreadState, ContextEventState, @@ -27,6 +28,14 @@ export const FIXTURE_IDS = { deltaToolCallEventId: "dddddddd-dddd-4ddd-8ddd-dddddddddddd", } as const; +export const CONCURRENT_MAS_FIXTURE_IDS = { + cohortId: "12121212-1212-4121-8121-121212121212", + experimentId: "33333333-3333-4333-8333-333333333333", + runId: "99999999-9999-4999-8999-999999999999", + searchTaskId: "10000000-0000-4000-8000-000000000002", + checkTaskId: "10000000-0000-4000-8000-000000000003", +} as const; + function taskState(task: Partial & Pick): TaskState { return { parentId: null, @@ -275,6 +284,7 @@ function serializedRunState(): SerializedWorkflowRunState { completedTasks: 1, runningTasks: 1, failedTasks: 0, + cancelledTasks: 0, finalScore: null, error: null, }; @@ -386,6 +396,22 @@ export function createUpdatedEvaluation(): TaskEvaluationState { }; } +export function 
createEmptyCriteriaEvaluation(): TaskEvaluationState { + return { + id: FIXTURE_IDS.evaluationId, + runId: FIXTURE_IDS.runId, + taskId: FIXTURE_IDS.solveTaskId, + totalScore: 0, + maxScore: 0, + normalizedScore: 0, + stagesEvaluated: 0, + stagesPassed: 0, + failedGate: null, + createdAt: "2026-03-18T12:00:31.000Z", + criterionResults: [], + }; +} + export function createDashboardSeed(): DashboardHarnessSeedPayload { const runState = serializedRunState(); const summary = { @@ -395,18 +421,18 @@ export function createDashboardSeed(): DashboardHarnessSeedPayload { created_by: "playwright", created_at: "2026-03-18T11:59:00.000Z", status: "active" as const, - total_runs: 1, + total_runs: 3, status_counts: { pending: 0, - executing: 1, + executing: 0, evaluating: 0, - completed: 0, + completed: 3, failed: 0, }, - average_score: null, - best_score: null, - worst_score: null, - average_duration_ms: null, + average_score: 1, + best_score: 1, + worst_score: 1, + average_duration_ms: 24_000, failure_rate: 0, metadata_summary: { code_commit_sha: "abc1234", @@ -441,22 +467,139 @@ export function createDashboardSeed(): DashboardHarnessSeedPayload { definition_id: FIXTURE_IDS.experimentId, cohort_id: FIXTURE_IDS.cohortId, cohort_name: summary.name, - status: "executing", + status: "completed", created_at: "2026-03-18T11:59:30.000Z", started_at: "2026-03-18T12:00:00.000Z", - completed_at: null, + completed_at: "2026-03-18T12:00:24.000Z", running_time_ms: 24_000, - final_score: null, + final_score: 1, + total_tasks: 10, + total_cost_usd: 0.12, + error_message: null, + }, + { + run_id: "22222222-2222-4222-8222-222222222223", + definition_id: FIXTURE_IDS.experimentId, + cohort_id: FIXTURE_IDS.cohortId, + cohort_name: summary.name, + status: "completed", + created_at: "2026-03-18T12:00:30.000Z", + started_at: "2026-03-18T12:01:00.000Z", + completed_at: "2026-03-18T12:01:22.000Z", + running_time_ms: 22_000, + final_score: 1, + total_tasks: 10, + total_cost_usd: 0.14, + error_message: 
null, + }, + { + run_id: "22222222-2222-4222-8222-222222222224", + definition_id: FIXTURE_IDS.experimentId, + cohort_id: FIXTURE_IDS.cohortId, + cohort_name: summary.name, + status: "completed", + created_at: "2026-03-18T12:01:30.000Z", + started_at: "2026-03-18T12:02:00.000Z", + completed_at: "2026-03-18T12:02:26.000Z", + running_time_ms: 26_000, + final_score: 1, + total_tasks: 10, + total_cost_usd: 0.16, error_message: null, }, ], }; + const concurrent = createConcurrentMasSeedOnly(); return { - cohorts: [summary], + cohorts: [summary, ...(concurrent.cohorts ?? [])], cohortDetails: { [FIXTURE_IDS.cohortId]: detail, + ...(concurrent.cohortDetails ?? {}), }, - runs: [runState], + runs: [runState, ...(concurrent.runs ?? [])], + mutations: concurrent.mutations, }; } + +function createConcurrentMasSeedOnly(): DashboardHarnessSeedPayload { + const summary = { + cohort_id: CONCURRENT_MAS_FIXTURE_IDS.cohortId, + name: "concurrent-mas-visual-debugger", + description: "Deterministic concurrent MAS fixture for visual debugger tests.", + created_by: "playwright", + created_at: "2026-04-26T11:59:00.000Z", + status: "active" as const, + total_runs: 1, + status_counts: { + pending: 0, + executing: 1, + evaluating: 0, + completed: 0, + failed: 0, + }, + average_score: null, + best_score: null, + worst_score: null, + average_duration_ms: null, + failure_rate: 0, + metadata_summary: { + code_commit_sha: "visual-debugger", + repo_dirty: false, + prompt_version: "visual-debugger-fixture", + worker_version: "fixture", + model_provider: "fixture", + model_name: "fixture", + sandbox_config: { + template: "research", + timeout_minutes: 30, + }, + dispatch_config: { + scenario: "concurrent-mas", + }, + }, + stats_updated_at: "2026-04-26T12:00:30.000Z", + extras: { + benchmark_counts: { + visual_debugger: 1, + }, + latest_run_at: "2026-04-26T12:00:00.000Z", + }, + }; + + const detail = { + summary, + runs: [ + { + run_id: CONCURRENT_MAS_FIXTURE_IDS.runId, + definition_id: 
CONCURRENT_MAS_FIXTURE_IDS.experimentId, + cohort_id: CONCURRENT_MAS_FIXTURE_IDS.cohortId, + cohort_name: summary.name, + status: "executing", + created_at: "2026-04-26T11:59:30.000Z", + started_at: "2026-04-26T12:00:00.000Z", + completed_at: null, + running_time_ms: 30_000, + final_score: null, + total_tasks: null, + total_cost_usd: null, + error_message: null, + }, + ], + }; + + return { + cohorts: [summary], + cohortDetails: { + [CONCURRENT_MAS_FIXTURE_IDS.cohortId]: detail, + }, + runs: [concurrentMasFixture.runState as SerializedWorkflowRunState], + mutations: { + [CONCURRENT_MAS_FIXTURE_IDS.runId]: concurrentMasFixture.mutations, + }, + } as DashboardHarnessSeedPayload; +} + +export function createConcurrentMasDashboardSeed(): DashboardHarnessSeedPayload { + return createDashboardSeed(); +} diff --git a/ergon-dashboard/tests/helpers/harnessClient.ts b/ergon-dashboard/tests/helpers/harnessClient.ts index 3171a313..4b905c93 100644 --- a/ergon-dashboard/tests/helpers/harnessClient.ts +++ b/ergon-dashboard/tests/helpers/harnessClient.ts @@ -1,7 +1,49 @@ import type { APIRequestContext } from "@playwright/test"; +import { mkdir, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; import type { DashboardHarnessSeedPayload } from "../../src/lib/testing/dashboardHarness"; +const HARNESS_LOCK_DIR = join(tmpdir(), "ergon-dashboard-shared-harness.lock"); +const HARNESS_LOCK_TIMEOUT_MS = 30_000; +const HARNESS_LOCK_RETRY_MS = 50; + +function delay(ms: number) { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} + +function getErrorCode(error: unknown): string | null { + if (typeof error !== "object" || error === null || !("code" in error)) { + return null; + } + return String(error.code); +} + +export async function acquireHarnessLock(): Promise<() => Promise> { + const startedAt = Date.now(); + + while (true) { + try { + await mkdir(HARNESS_LOCK_DIR); + return async () => { + await rm(HARNESS_LOCK_DIR, 
{ force: true, recursive: true }); + }; + } catch (error) { + const code = getErrorCode(error); + if (code !== "EEXIST") { + throw error; + } + if (Date.now() - startedAt > HARNESS_LOCK_TIMEOUT_MS) { + throw new Error("Timed out waiting for dashboard harness lock"); + } + await delay(HARNESS_LOCK_RETRY_MS); + } + } +} + export async function resetHarness(request: APIRequestContext) { const response = await request.post("/api/test/dashboard/reset"); if (!response.ok()) { diff --git a/ergon_cli/ergon_cli/commands/eval.py b/ergon_cli/ergon_cli/commands/eval.py index c3875728..f93c4235 100644 --- a/ergon_cli/ergon_cli/commands/eval.py +++ b/ergon_cli/ergon_cli/commands/eval.py @@ -20,7 +20,7 @@ async def _watch(args: Namespace) -> int: await watch_and_evaluate( checkpoint_dir=args.checkpoint_dir, benchmark_type=args.benchmark, - evaluator_type=args.evaluator or "stub-rubric", + evaluator_type=args.evaluator, model_base=args.model_base, poll_interval_s=args.poll_interval, eval_limit=args.eval_limit, @@ -35,7 +35,7 @@ async def _checkpoint(args: Namespace) -> int: await evaluate_checkpoint( checkpoint_path=args.checkpoint, benchmark_type=args.benchmark, - evaluator_type=args.evaluator or "stub-rubric", + evaluator_type=args.evaluator, model_base=args.model_base, eval_limit=args.eval_limit, ) diff --git a/ergon_cli/ergon_cli/composition/__init__.py b/ergon_cli/ergon_cli/composition/__init__.py index f3ad39e6..2f872393 100644 --- a/ergon_cli/ergon_cli/composition/__init__.py +++ b/ergon_cli/ergon_cli/composition/__init__.py @@ -39,8 +39,8 @@ def build_experiment( # otherwise ``task_execution_service._prepare_graph_native`` will # raise ``ConfigurationError: No ExperimentDefinitionWorker with # binding_key='{env}-smoke-leaf'`` when the first subtask fires. - # ``researchrubrics-sadpath-smoke-worker`` additionally needs the - # failing leaf binding so ``l_2`` can resolve. 
+ # ``{env}-sadpath-smoke-worker`` additionally needs the failing leaf + # binding so ``l_2`` can resolve. if _is_smoke_worker(worker_slug): return _build_smoke_experiment( benchmark=benchmark, diff --git a/ergon_cli/ergon_cli/main.py b/ergon_cli/ergon_cli/main.py index 75997f3e..19d6905e 100644 --- a/ergon_cli/ergon_cli/main.py +++ b/ergon_cli/ergon_cli/main.py @@ -97,8 +97,8 @@ def build_parser() -> argparse.ArgumentParser: eval_watch = eval_sub.add_parser("watch", help="Watch for new checkpoints and evaluate") eval_watch.add_argument("--checkpoint-dir", required=True, help="Directory to watch") eval_watch.add_argument("--benchmark", required=True, help="Benchmark slug") - eval_watch.add_argument("--evaluator", default=None, help="Evaluator slug") - eval_watch.add_argument("--model-base", default=None, help="Base model for local eval") + eval_watch.add_argument("--evaluator", required=True, help="Evaluator slug") + eval_watch.add_argument("--model-base", required=True, help="Base model for local eval") eval_watch.add_argument("--poll-interval", type=int, default=60, help="Seconds between scans") eval_watch.add_argument("--eval-limit", type=int, default=None, help="Max tasks per eval") eval_watch.add_argument( @@ -110,8 +110,8 @@ def build_parser() -> argparse.ArgumentParser: eval_ckpt = eval_sub.add_parser("checkpoint", help="Evaluate a single checkpoint") eval_ckpt.add_argument("--checkpoint", required=True, help="Checkpoint path") eval_ckpt.add_argument("--benchmark", required=True, help="Benchmark slug") - eval_ckpt.add_argument("--evaluator", default=None, help="Evaluator slug") - eval_ckpt.add_argument("--model-base", default=None, help="Base model for local eval") + eval_ckpt.add_argument("--evaluator", required=True, help="Evaluator slug") + eval_ckpt.add_argument("--model-base", required=True, help="Base model for local eval") eval_ckpt.add_argument("--eval-limit", type=int, default=None, help="Max tasks") # -- onboard / doctor 
------------------------------------------------------ diff --git a/ergon_core/ergon_core/core/api/app.py b/ergon_core/ergon_core/core/api/app.py index d6fab4c1..c251d1d4 100644 --- a/ergon_core/ergon_core/core/api/app.py +++ b/ergon_core/ergon_core/core/api/app.py @@ -24,6 +24,7 @@ from ergon_core.core.api.rollouts import init_service as init_rollout_service from ergon_core.core.api.rollouts import router as rollouts_router from ergon_core.core.api.runs import router as runs_router +from ergon_core.core.api.startup_plugins import run_startup_plugins from ergon_core.core.api.test_harness import router as _test_harness_router from ergon_core.core.dashboard.emitter import dashboard_emitter from ergon_core.core.persistence.shared.db import ensure_db, get_session @@ -93,19 +94,6 @@ async def lifespan(app: FastAPI): if settings.enable_test_harness: app.include_router(_test_harness_router) -if settings.smoke_fixtures_enabled: - # Register the canonical-smoke WORKERS / EVALUATORS into this - # process's registry dicts. Inngest's ``worker_execute_fn`` runs - # inside this container, so if the smoke fixtures are only imported - # host-side (in pytest's process) the container's dicts stay empty - # and every smoke run fails at worker resolution. The flag is - # separate from ``ENABLE_TEST_HARNESS`` because real-LLM rollouts - # need the read-only harness endpoints without replacing production - # benchmark registries with smoke fixtures. - # Test-support package kept outside ``tests`` so runtime entrypoints - # never import pytest-owned modules. 
- from ergon_core.test_support.smoke_fixtures import register_smoke_fixtures - - register_smoke_fixtures() +run_startup_plugins(settings.startup_plugins) inngest.fast_api.serve(app, inngest_client, ALL_FUNCTIONS) diff --git a/ergon_core/ergon_core/core/api/runs.py b/ergon_core/ergon_core/core/api/runs.py index b9457164..6bf766e4 100644 --- a/ergon_core/ergon_core/core/api/runs.py +++ b/ergon_core/ergon_core/core/api/runs.py @@ -289,6 +289,7 @@ def _task_keyed_sandboxes( def _build_communication_threads( threads: list[Thread], messages: list[ThreadMessage], + execution_task_map: dict[UUID, UUID], ) -> list[RunCommunicationThreadDto]: msgs_by_thread: dict[UUID, list[ThreadMessage]] = defaultdict(list) for m in sorted(messages, key=lambda m: m.sequence_num): @@ -296,11 +297,22 @@ def _build_communication_threads( result: list[RunCommunicationThreadDto] = [] for t in threads: + thread_messages = msgs_by_thread.get(t.id, []) + task_ids = { + task_id + for message in thread_messages + if message.task_execution_id is not None + for task_id in [execution_task_map.get(message.task_execution_id)] + if task_id is not None + } + thread_task_id = next(iter(task_ids)) if len(task_ids) == 1 else None result.append( RunCommunicationThreadDto( id=str(t.id), run_id=str(t.run_id), + task_id=str(thread_task_id) if thread_task_id else None, topic=t.topic, + summary=t.summary, agent_a_id=t.agent_a_id, agent_b_id=t.agent_b_id, created_at=t.created_at, @@ -311,6 +323,11 @@ def _build_communication_threads( thread_id=str(m.thread_id), run_id=str(m.run_id), thread_topic=t.topic, + task_id=( + str(execution_task_map[m.task_execution_id]) + if m.task_execution_id and m.task_execution_id in execution_task_map + else None + ), task_execution_id=str(m.task_execution_id) if m.task_execution_id else None, from_agent_id=m.from_agent_id, to_agent_id=m.to_agent_id, @@ -318,7 +335,7 @@ def _build_communication_threads( sequence_num=m.sequence_num, created_at=m.created_at, ) - for m in 
msgs_by_thread.get(t.id, []) + for m in thread_messages ], ) ) diff --git a/ergon_core/ergon_core/core/api/schemas.py b/ergon_core/ergon_core/core/api/schemas.py index 9de4bfa3..bc524176 100644 --- a/ergon_core/ergon_core/core/api/schemas.py +++ b/ergon_core/ergon_core/core/api/schemas.py @@ -140,6 +140,7 @@ class RunCommunicationThreadDto(CamelModel): run_id: str task_id: str | None = None topic: str + summary: str | None = None agent_a_id: str agent_b_id: str created_at: datetime diff --git a/ergon_core/ergon_core/core/api/startup_plugins.py b/ergon_core/ergon_core/core/api/startup_plugins.py new file mode 100644 index 00000000..c61c03fd --- /dev/null +++ b/ergon_core/ergon_core/core/api/startup_plugins.py @@ -0,0 +1,15 @@ +"""Optional startup plugin loader.""" + +from importlib import import_module + + +def run_startup_plugins(plugin_specs: tuple[str, ...]) -> None: + for spec in plugin_specs: + module_name, sep, attr_name = spec.partition(":") + if not sep or not module_name or not attr_name: + raise RuntimeError( + f"Invalid ERGON_STARTUP_PLUGINS entry {spec!r}; expected 'module:function'" + ) + module = import_module(module_name) + plugin = getattr(module, attr_name) # slopcop: ignore[no-hasattr-getattr] + plugin() diff --git a/ergon_core/ergon_core/core/persistence/telemetry/models.py b/ergon_core/ergon_core/core/persistence/telemetry/models.py index 34451c3a..89062405 100644 --- a/ergon_core/ergon_core/core/persistence/telemetry/models.py +++ b/ergon_core/ergon_core/core/persistence/telemetry/models.py @@ -396,6 +396,7 @@ class Thread(SQLModel, table=True): id: UUID = Field(default_factory=uuid4, primary_key=True) run_id: UUID = Field(foreign_key="runs.id", index=True) topic: str + summary: str | None = None agent_a_id: str = Field(index=True) agent_b_id: str = Field(index=True) created_at: datetime = Field(default_factory=_utcnow, sa_type=TZDateTime) diff --git a/ergon_core/ergon_core/core/providers/sandbox/__init__.py 
b/ergon_core/ergon_core/core/providers/sandbox/__init__.py index 80381500..6a0a5e62 100644 --- a/ergon_core/ergon_core/core/providers/sandbox/__init__.py +++ b/ergon_core/ergon_core/core/providers/sandbox/__init__.py @@ -1,35 +1,8 @@ -"""Sandbox management: provisioning, file I/O, lifecycle.""" +"""Sandbox management: provisioning, file I/O, lifecycle. -from ergon_core.core.providers.sandbox.errors import ( - SandboxError, - SandboxExpiredError, - SandboxSetupError, -) -from ergon_core.core.providers.sandbox.event_sink import ( - CompoundSandboxEventSink, - DashboardEmitterSandboxEventSink, - NoopSandboxEventSink, - PostgresSandboxEventSink, - SandboxEventSink, -) -from ergon_core.core.providers.sandbox.manager import ( - BaseSandboxManager, - DefaultSandboxManager, - DownloadedFile, - DownloadedFiles, -) +Import concrete modules directly, for example +``ergon_core.core.providers.sandbox.manager``. Keeping this package initializer +lightweight avoids import cycles between telemetry models and API DTO modules. 
+""" -__all__ = [ - "BaseSandboxManager", - "CompoundSandboxEventSink", - "DashboardEmitterSandboxEventSink", - "DefaultSandboxManager", - "DownloadedFile", - "DownloadedFiles", - "NoopSandboxEventSink", - "PostgresSandboxEventSink", - "SandboxError", - "SandboxEventSink", - "SandboxExpiredError", - "SandboxSetupError", -] +__all__: list[str] = [] diff --git a/ergon_core/ergon_core/core/providers/sandbox/event_sink.py b/ergon_core/ergon_core/core/providers/sandbox/event_sink.py index e2d9aa6c..296df796 100644 --- a/ergon_core/ergon_core/core/providers/sandbox/event_sink.py +++ b/ergon_core/ergon_core/core/providers/sandbox/event_sink.py @@ -4,7 +4,6 @@ from uuid import UUID from ergon_core.core.persistence.shared.db import get_session -from ergon_core.core.persistence.telemetry.models import SandboxCommandWalEntry, SandboxEvent @runtime_checkable @@ -151,6 +150,11 @@ async def sandbox_created( timeout_minutes: int, template: str | None = None, ) -> None: + # reason: avoid import cycle with ergon_core.api package exports. + from ergon_core.core.persistence.telemetry.models import ( + SandboxEvent, + ) + with get_session() as s: s.add( SandboxEvent( @@ -175,6 +179,11 @@ async def sandbox_command( exit_code: int | None = None, duration_ms: int | None = None, ) -> None: + # reason: avoid import cycle with ergon_core.api package exports. + from ergon_core.core.persistence.telemetry.models import ( + SandboxCommandWalEntry, + ) + with get_session() as s: s.add( SandboxCommandWalEntry( @@ -199,6 +208,11 @@ async def sandbox_closed( ) -> None: if run_id is None: return + # reason: avoid import cycle with ergon_core.api package exports. 
+ from ergon_core.core.persistence.telemetry.models import ( + SandboxEvent, + ) + with get_session() as s: s.add( SandboxEvent( diff --git a/ergon_core/ergon_core/core/providers/sandbox/lifecycle.py b/ergon_core/ergon_core/core/providers/sandbox/lifecycle.py new file mode 100644 index 00000000..33595810 --- /dev/null +++ b/ergon_core/ergon_core/core/providers/sandbox/lifecycle.py @@ -0,0 +1,56 @@ +"""Runtime-facing sandbox lifecycle helpers.""" + +import logging +from enum import StrEnum + +from pydantic import BaseModel + +logger = logging.getLogger(__name__) + + +class SandboxTerminationReason(StrEnum): + TERMINATED = "terminated" + NOT_FOUND_OR_ALREADY_CLOSED = "not_found_or_already_closed" + MISSING_ID = "missing_id" + ERROR = "error" + + +class SandboxTerminationResult(BaseModel): + sandbox_id: str | None + terminated: bool + reason: SandboxTerminationReason + + +async def terminate_sandbox_by_id(sandbox_id: str | None) -> SandboxTerminationResult: + """Terminate a sandbox behind a single runtime-facing boundary.""" + if sandbox_id is None: + return SandboxTerminationResult( + sandbox_id=None, + terminated=False, + reason=SandboxTerminationReason.MISSING_ID, + ) + + try: + # reason: avoid import cycle between sandbox manager/event sink and telemetry models. 
+ from ergon_core.core.providers.sandbox.manager import ( + BaseSandboxManager, + ) + + terminated = await BaseSandboxManager.terminate_by_sandbox_id(sandbox_id) + except Exception: # slopcop: ignore[no-broad-except] + logger.error("Failed to terminate sandbox %s", sandbox_id, exc_info=True) + return SandboxTerminationResult( + sandbox_id=sandbox_id, + terminated=False, + reason=SandboxTerminationReason.ERROR, + ) + + return SandboxTerminationResult( + sandbox_id=sandbox_id, + terminated=terminated, + reason=( + SandboxTerminationReason.TERMINATED + if terminated + else SandboxTerminationReason.NOT_FOUND_OR_ALREADY_CLOSED + ), + ) diff --git a/ergon_core/ergon_core/core/providers/sandbox/manager.py b/ergon_core/ergon_core/core/providers/sandbox/manager.py index 406a35c0..7bbab2ab 100644 --- a/ergon_core/ergon_core/core/providers/sandbox/manager.py +++ b/ergon_core/ergon_core/core/providers/sandbox/manager.py @@ -8,7 +8,6 @@ from typing import ClassVar, Protocol, runtime_checkable from uuid import UUID -from ergon_core.api.json_types import JsonValue from ergon_core.core.providers.sandbox.errors import SandboxExpiredError from ergon_core.core.providers.sandbox.event_sink import ( NoopSandboxEventSink, @@ -33,7 +32,7 @@ class UploadableResource(Protocol): except ImportError: AsyncSandbox = None # type: ignore[assignment,misc] - # Fallback stubs so `except (TimeoutException, SandboxNotFoundException)` + # Fallback exception classes so `except (TimeoutException, SandboxNotFoundException)` # stays syntactically valid when the e2b SDK is unavailable. They will # never actually be raised because the sandbox code paths require e2b. class _MissingE2BError(Exception): # slopcop: ignore[no-broad-except] @@ -97,7 +96,8 @@ def __init__(self) -> None: # Sink is configured process-wide via set_event_sink() in app lifespan. 
# Do not accept event_sink= here; the singleton pattern (see __new__ above) # makes constructor-level sink assignment a last-write-wins stomp on shared - # class state. Tests must use set_event_sink() in fixture setup. + # class state. Local verification harnesses should use set_event_sink() + # during setup. pass @classmethod @@ -105,12 +105,12 @@ def set_event_sink(cls, sink: SandboxEventSink) -> None: """Install a process-level event sink on this manager subclass. Called once during FastAPI lifespan startup for each concrete subclass. - Tests may call this in fixture setup and reset with - ``NoopSandboxEventSink()`` in teardown. + Local verification harnesses may call this during setup and reset with + ``NoopSandboxEventSink()`` during teardown. Assigns directly to ``cls._event_sink`` (not to the base class attribute), so each subclass carries its own sink and subclasses can - be individually targeted in tests. + be individually targeted by local verification harnesses. Production callers MUST NOT call this after startup. The only sanctioned call site is inside the ``lifespan`` context manager in @@ -534,6 +534,10 @@ async def terminate_by_sandbox_id(sandbox_id: str) -> bool: task_id, BaseSandboxManager, ) + if manager_cls is not BaseSandboxManager: + await manager_cls().terminate(task_id) + return True + display_task_id = BaseSandboxManager._display_task_ids.get(task_id, task_id) run_id = BaseSandboxManager._run_ids.get(task_id) try: @@ -619,98 +623,7 @@ class state. ``reconnect`` deliberately does NOT register the class DefaultSandboxManager(BaseSandboxManager): - """No custom dependencies. Used by benchmarks without specific sandbox setup. - - If ``E2B_API_KEY`` is not configured (e.g. CI stub runs) this manager - transparently delegates to ``StubSandboxManager`` -- the task still - runs, but no E2B sandbox is provisioned and the returned sandbox_id is - a well-formed stub id (see :func:`is_stub_sandbox_id`). 
- """ - - async def create( - self, - sandbox_key: UUID, - run_id: UUID, - timeout_minutes: int = 30, - envs: dict[str, str] | None = None, - display_task_id: UUID | None = None, - ) -> str: - if not settings.e2b_api_key: - return await StubSandboxManager().create( - sandbox_key, - run_id=run_id, - timeout_minutes=timeout_minutes, - envs=envs, - display_task_id=display_task_id, - ) - return await super().create( - sandbox_key, - run_id=run_id, - timeout_minutes=timeout_minutes, - envs=envs, - display_task_id=display_task_id, - ) + """No custom dependencies. Used by benchmarks without specific sandbox setup.""" async def _install_dependencies(self, sandbox: AsyncSandbox, task_id: UUID) -> None: pass - - -# ── Stub sandbox manager ────────────────────────────────────────────────── - -_STUB_SANDBOX_PREFIX = "stub-sandbox-" - - -def is_stub_sandbox_id(sandbox_id: JsonValue) -> bool: - """Return True iff ``sandbox_id`` was produced by :class:`StubSandboxManager`. - - Stub sandbox ids are produced by the CI / no-E2B-key code path. Any - teardown or download code that touches the E2B API must skip when this - returns True, otherwise the call will fail (no API key, no sandbox - exists on the E2B side). - - Accepts JSON values because some call sites read ``sandbox_id`` out of - persisted JSON summaries before checking whether teardown should skip. - """ - return isinstance(sandbox_id, str) and sandbox_id.startswith(_STUB_SANDBOX_PREFIX) - - -class StubSandboxManager(BaseSandboxManager): - """No-op sandbox manager used when E2B is not configured. - - ``create`` returns a synthetic id (``stub-sandbox-``). ``terminate`` - and other lifecycle methods are no-ops. Consumers that must distinguish - the stub path can call :func:`is_stub_sandbox_id` -- the sentinel string - ``"skipped"`` and the ``SandboxId = str | Literal["skipped"]`` union it - required have both been retired. 
- """ - - async def create( - self, - sandbox_key: UUID, - run_id: UUID, - timeout_minutes: int = 30, - envs: dict[str, str] | None = None, - display_task_id: UUID | None = None, - ) -> str: - stub_id = f"{_STUB_SANDBOX_PREFIX}{sandbox_key}" - logger.info( - "E2B_API_KEY not set — returning stub sandbox id %s for task %s (stub mode)", - stub_id, - sandbox_key, - ) - self._ensure_registries(sandbox_key) - self._run_ids[sandbox_key] = run_id - self._display_task_ids[sandbox_key] = display_task_id or sandbox_key - return stub_id - - async def _install_dependencies(self, sandbox: AsyncSandbox, task_id: UUID) -> None: - return None - - async def terminate(self, task_id: UUID, reason: str = "completed") -> None: - self._file_registries.pop(task_id, None) - self._created_files_registry.pop(task_id, None) - self._run_ids.pop(task_id, None) - self._display_task_ids.pop(task_id, None) - - async def reset_timeout(self, task_id: UUID, timeout_minutes: int = 30) -> bool: - return True diff --git a/ergon_core/ergon_core/core/rl/eval_runner.py b/ergon_core/ergon_core/core/rl/eval_runner.py index 4545bf75..748a682a 100644 --- a/ergon_core/ergon_core/core/rl/eval_runner.py +++ b/ergon_core/ergon_core/core/rl/eval_runner.py @@ -21,8 +21,8 @@ async def watch_and_evaluate( checkpoint_dir: str, benchmark_type: str, *, - evaluator_type: str = "stub-rubric", - model_base: str | None = None, + evaluator_type: str, + model_base: str, poll_interval_s: int = 60, eval_limit: int | None = None, on_checkpoint_cmd: str | None = None, @@ -94,7 +94,7 @@ async def _run_local_eval( *, benchmark_type: str, evaluator_type: str, - model_base: str | None, + model_base: str, eval_limit: int | None, ) -> int: """Run benchmark evaluation locally via the CLI. Returns exit code. @@ -102,7 +102,7 @@ async def _run_local_eval( Uses the checkpoint path as the vLLM model target so each checkpoint is actually evaluated (not just the base model). 
""" - model_target = f"vllm:{ckpt.path}" if model_base else "stub-worker" + model_target = f"vllm:{ckpt.path}" cmd = [ "ergon", @@ -154,8 +154,8 @@ async def evaluate_checkpoint( checkpoint_path: str, benchmark_type: str, *, - evaluator_type: str = "stub-rubric", - model_base: str | None = None, + evaluator_type: str, + model_base: str, eval_limit: int | None = None, ) -> int: """One-shot checkpoint evaluation. Returns exit code.""" diff --git a/ergon_core/ergon_core/core/runtime/events/task_events.py b/ergon_core/ergon_core/core/runtime/events/task_events.py index 87bc1b07..9fd8f217 100644 --- a/ergon_core/ergon_core/core/runtime/events/task_events.py +++ b/ergon_core/ergon_core/core/runtime/events/task_events.py @@ -8,15 +8,11 @@ from ergon_core.core.runtime.events.base import InngestEventContract -# SandboxId is just a str. Previously the type was ``str | Literal["skipped"]`` -# with a ``SANDBOX_SKIPPED = "skipped"`` sentinel returned by -# ``DefaultSandboxManager.create`` when no E2B_API_KEY was configured. That -# sentinel forced every downstream consumer to branch on a magic string. -# Stub/CI mode is now served by ``StubSandboxManager`` which returns a -# structurally identifiable ID (see ``is_stub_sandbox_id``); event payloads -# carry a plain ``str`` sandbox_id exactly like the real path, and -# ``TaskFailedEvent.sandbox_id`` is ``str | None`` because a task can fail -# before sandbox-setup runs (in which case there really is no sandbox). +# Production task execution emits real sandbox IDs. Test-support managers may +# use sentinel IDs, but core event consumers must not parse or branch on those +# sentinel formats. ``TaskFailedEvent.sandbox_id`` is ``str | None`` because a +# task can fail before sandbox setup runs, in which case there really is no +# sandbox. 
SandboxId = str diff --git a/ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py b/ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py index 28cdeb05..f517de7a 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py +++ b/ergon_core/ergon_core/core/runtime/inngest/benchmark_run_start.py @@ -30,8 +30,8 @@ class BenchmarkRunRequest(InngestEventContract): benchmark_slug: str model: str - worker_slug: str = "stub-worker" - evaluator_slug: str = "stub-rubric" + worker_slug: str + evaluator_slug: str cohort_name: str = "" # slopcop: ignore[no-str-empty-default] diff --git a/ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py b/ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py index 072ad3c9..cfee52bd 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py +++ b/ergon_core/ergon_core/core/runtime/inngest/check_evaluators.py @@ -9,10 +9,7 @@ import logging import inngest -from ergon_core.core.providers.sandbox.manager import ( - BaseSandboxManager, - is_stub_sandbox_id, -) +from ergon_core.core.providers.sandbox.lifecycle import terminate_sandbox_by_id from ergon_core.core.runtime.events.task_events import ( TaskCompletedEvent, ) @@ -101,15 +98,11 @@ async def check_and_run_evaluators(ctx: inngest.Context) -> EvaluatorsResult: async def _terminate_sandbox(sandbox_id: str) -> None: - """Terminate the task's sandbox if one was created.""" - if is_stub_sandbox_id(sandbox_id): - return - try: - await BaseSandboxManager.terminate_by_sandbox_id(sandbox_id) - logger.info("Terminated sandbox %s after evaluation", sandbox_id) - except Exception: # slopcop: ignore[no-broad-except] - logger.error( - "Failed to terminate sandbox %s — potential sandbox leak", - sandbox_id, - exc_info=True, - ) + """Terminate the task's sandbox through the provider lifecycle boundary.""" + result = await terminate_sandbox_by_id(sandbox_id) + logger.info( + "Evaluator sandbox cleanup sandbox_id=%s terminated=%s 
reason=%s", + result.sandbox_id, + result.terminated, + result.reason, + ) diff --git a/ergon_core/ergon_core/core/runtime/inngest/cleanup_cancelled_task.py b/ergon_core/ergon_core/core/runtime/inngest/cleanup_cancelled_task.py index e4920f43..683416e3 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/cleanup_cancelled_task.py +++ b/ergon_core/ergon_core/core/runtime/inngest/cleanup_cancelled_task.py @@ -2,7 +2,8 @@ Two durable steps: 1. update-db-rows — mark execution CANCELLED (idempotent) -2. release-sandbox — stub (pending sandbox management module) +2. release-sandbox — routed through the sandbox lifecycle provider when an + execution has an associated sandbox. """ import logging diff --git a/ergon_core/ergon_core/core/runtime/inngest/execute_task.py b/ergon_core/ergon_core/core/runtime/inngest/execute_task.py index 74fd1e17..494ee874 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/execute_task.py +++ b/ergon_core/ergon_core/core/runtime/inngest/execute_task.py @@ -8,7 +8,6 @@ from datetime import UTC, datetime import inngest -from ergon_core.core.providers.sandbox.manager import StubSandboxManager from ergon_core.core.runtime.errors import ContractViolationError from ergon_core.core.runtime.events.task_events import ( TaskCompletedEvent, @@ -206,27 +205,11 @@ async def execute_task_fn(ctx: inngest.Context) -> TaskExecuteResult: prepared = await _prepare_execution(ctx, svc, payload) if prepared.skipped: - logger.info( - "task-execute skipped task_id=%s reason=%s", - payload.task_id, - prepared.skip_reason, - ) - # ``TaskCompletedEvent.sandbox_id`` is required, so mint a stub id - # representing "this task completed without provisioning a sandbox". - # Downstream teardown uses ``is_stub_sandbox_id`` to short-circuit. 
- stub_sandbox_id = await StubSandboxManager().create( - prepared.node_id, - run_id=payload.run_id, - display_task_id=prepared.node_id, - ) - await _emit_task_completed(payload, prepared, stub_sandbox_id) - return TaskExecuteResult( + raise ContractViolationError( + "Skipped task execution cannot emit task/completed without a real sandbox_id. " + "Introduce a first-class task/skipped event before supporting skipped tasks.", run_id=payload.run_id, task_id=payload.task_id, - execution_id=prepared.execution_id, - success=True, - skipped=True, - skip_reason=prepared.skip_reason, ) sandbox_result = await _setup_sandbox(ctx, payload, prepared) @@ -241,6 +224,7 @@ async def execute_task_fn(ctx: inngest.Context) -> TaskExecuteResult: worker_result = await _run_worker(ctx, payload, prepared, sandbox_result) if not worker_result.success: + await _persist_outputs(ctx, payload, prepared, sandbox_result) raise RuntimeError(worker_result.error or "Worker execution failed") persist_result = await _persist_outputs(ctx, payload, prepared, sandbox_result) diff --git a/ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py b/ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py index d60e3478..87b1ae89 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py +++ b/ergon_core/ergon_core/core/runtime/inngest/propagate_execution.py @@ -7,6 +7,7 @@ from datetime import UTC, datetime import inngest +from ergon_core.core.providers.sandbox.lifecycle import terminate_sandbox_by_id from ergon_core.core.runtime.events.task_events import ( TaskCancelledEvent, TaskCompletedEvent, @@ -30,6 +31,7 @@ task_propagate_context, ) + logger = logging.getLogger(__name__) @@ -160,6 +162,7 @@ async def propagate_task_failure_fn(ctx: inngest.Context) -> TaskPropagateResult node_id=payload.node_id, ) ) + await _terminate_failed_task_sandbox(payload.sandbox_id) # BLOCKED successors are a DB write only — no task/cancelled events. 
# propagation.invalidated_targets is always empty from the failure path. @@ -188,3 +191,13 @@ async def propagate_task_failure_fn(ctx: inngest.Context) -> TaskPropagateResult workflow_failed=(propagation.workflow_terminal_state == WorkflowTerminalState.FAILED), ) return result + + +async def _terminate_failed_task_sandbox(sandbox_id: str | None) -> None: + result = await terminate_sandbox_by_id(sandbox_id) + if not result.terminated: + logger.info( + "failed-task sandbox cleanup did not terminate sandbox_id=%s reason=%s", + result.sandbox_id, + result.reason, + ) diff --git a/ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py b/ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py index e433ca7e..88a83fdc 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py +++ b/ergon_core/ergon_core/core/runtime/inngest/run_cleanup.py @@ -11,10 +11,7 @@ from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.shared.enums import RunStatus from ergon_core.core.persistence.telemetry.models import RunRecord -from ergon_core.core.providers.sandbox.manager import ( - BaseSandboxManager, - is_stub_sandbox_id, -) +from ergon_core.core.providers.sandbox.lifecycle import terminate_sandbox_by_id from ergon_core.core.runtime.errors import ConfigurationError, DataIntegrityError from ergon_core.core.runtime.events.infrastructure_events import RunCleanupEvent from ergon_core.core.runtime.inngest_client import inngest_client @@ -67,17 +64,12 @@ async def _cleanup_run(run_id: UUID, status: str, error_message: str | None) -> raise DataIntegrityError("RunRecord", run_id) sandbox_id = run.parsed_summary().get("sandbox_id") - sandbox_terminated = False + sandbox_result = await terminate_sandbox_by_id( + sandbox_id if isinstance(sandbox_id, str) else None + ) + sandbox_terminated = sandbox_result.terminated - if is_stub_sandbox_id(sandbox_id): - logger.info( - "run-cleanup run_id=%s: sandbox_id=%s is a stub (no E2B sandbox exists), 
skipping termination", - run_id, - sandbox_id, - ) - elif sandbox_id and isinstance(sandbox_id, str): - sandbox_terminated = await BaseSandboxManager.terminate_by_sandbox_id(sandbox_id) - elif sandbox_id is not None: + if sandbox_id is not None and not isinstance(sandbox_id, str): logger.warning( "run-cleanup run_id=%s: sandbox_id has unexpected type %s, skipping termination", run_id, diff --git a/ergon_core/ergon_core/core/runtime/inngest/worker_execute.py b/ergon_core/ergon_core/core/runtime/inngest/worker_execute.py index 6d677b4d..e6c3a8e9 100644 --- a/ergon_core/ergon_core/core/runtime/inngest/worker_execute.py +++ b/ergon_core/ergon_core/core/runtime/inngest/worker_execute.py @@ -13,6 +13,7 @@ from pydantic import BaseModel from ergon_builtins.registry import BENCHMARKS, WORKERS from ergon_core.api.generation import GenerationTurn +from ergon_core.api.results import WorkerOutput from ergon_core.api.task_types import BenchmarkTask, EmptyTaskPayload from ergon_core.api.worker_context import WorkerContext from ergon_core.core.dashboard.emitter import dashboard_emitter @@ -32,6 +33,14 @@ logger = logging.getLogger(__name__) +def _worker_execute_result_from_output(output: WorkerOutput) -> WorkerExecuteResult: + return WorkerExecuteResult( + success=output.success, + final_assistant_message=output.output, + error=None if output.success else output.output, + ) + + @inngest_client.create_function( fn_id="worker-execute", trigger=inngest.TriggerEvent(event="task/worker-execute"), @@ -155,10 +164,7 @@ async def worker_execute_fn(ctx: inngest.Context) -> WorkerExecuteResult: ) ) - return WorkerExecuteResult( - success=True, - final_assistant_message=output.output, - ) + return _worker_execute_result_from_output(output) async def _persist_context_events( diff --git a/ergon_core/ergon_core/core/runtime/services/cohort_schemas.py b/ergon_core/ergon_core/core/runtime/services/cohort_schemas.py index 8185ae24..b0a0d8da 100644 --- 
a/ergon_core/ergon_core/core/runtime/services/cohort_schemas.py +++ b/ergon_core/ergon_core/core/runtime/services/cohort_schemas.py @@ -50,6 +50,8 @@ class CohortRunRowDto(BaseModel): completed_at: datetime | None = None running_time_ms: int | None = None final_score: float | None = None + total_tasks: int | None = None + total_cost_usd: float | None = None error_message: str | None = None diff --git a/ergon_core/ergon_core/core/runtime/services/cohort_service.py b/ergon_core/ergon_core/core/runtime/services/cohort_service.py index f4176f5c..5cb73e3d 100644 --- a/ergon_core/ergon_core/core/runtime/services/cohort_service.py +++ b/ergon_core/ergon_core/core/runtime/services/cohort_service.py @@ -9,6 +9,7 @@ ExperimentCohortStatus, RunRecord, ) +from ergon_core.core.persistence.graph.models import RunGraphNode from ergon_core.core.runtime.services.cohort_schemas import ( CohortDetailDto, CohortRunRowDto, @@ -17,7 +18,7 @@ UpdateCohortRequest, ) from ergon_core.core.utils import utcnow -from sqlmodel import select +from sqlmodel import func, select class ExperimentCohortService: @@ -79,7 +80,22 @@ def get_detail(self, cohort_id: UUID) -> CohortDetailDto | None: runs = list( session.exec(select(RunRecord).where(RunRecord.cohort_id == cohort_id)).all() ) - run_rows = [self._build_run_row(cohort, run) for run in runs] + task_counts = ( + { + run_id: count + for run_id, count in session.exec( + select(RunGraphNode.run_id, func.count(RunGraphNode.id)) + .where(RunGraphNode.run_id.in_([run.id for run in runs])) + .group_by(RunGraphNode.run_id) + ).all() + } + if runs + else {} + ) + run_rows = [ + self._build_run_row(cohort, run, int(task_counts.get(run.id, 0)) or None) + for run in runs + ] return CohortDetailDto(summary=summary, runs=run_rows) def get_summary(self, cohort_id: UUID) -> CohortSummaryDto | None: @@ -143,7 +159,11 @@ def _build_summary( ) @staticmethod - def _build_run_row(cohort: ExperimentCohort, run: RunRecord) -> CohortRunRowDto: + def _build_run_row( + 
cohort: ExperimentCohort, + run: RunRecord, + total_tasks: int | None = None, + ) -> CohortRunRowDto: running_time_ms: int | None = None if run.started_at is not None: end_time = run.completed_at or utcnow() @@ -152,7 +172,11 @@ def _build_run_row(cohort: ExperimentCohort, run: RunRecord) -> CohortRunRowDto: score: float | None = None summary = run.parsed_summary() if summary: - score = summary.get("normalized_score") or summary.get("final_score") + raw_score = summary.get("normalized_score") + if raw_score is None: + raw_score = summary.get("final_score") + score = float(raw_score) if isinstance(raw_score, int | float) else None + total_cost_usd = summary.get("total_cost_usd") if summary else None return CohortRunRowDto( run_id=run.id, @@ -165,6 +189,10 @@ def _build_run_row(cohort: ExperimentCohort, run: RunRecord) -> CohortRunRowDto: completed_at=run.completed_at, running_time_ms=running_time_ms, final_score=score, + total_tasks=total_tasks, + total_cost_usd=( + float(total_cost_usd) if isinstance(total_cost_usd, int | float) else None + ), error_message=run.error_message, ) diff --git a/ergon_core/ergon_core/core/runtime/services/communication_schemas.py b/ergon_core/ergon_core/core/runtime/services/communication_schemas.py index 53b12629..d16e6f77 100644 --- a/ergon_core/ergon_core/core/runtime/services/communication_schemas.py +++ b/ergon_core/ergon_core/core/runtime/services/communication_schemas.py @@ -19,6 +19,10 @@ class CreateMessageRequest(BaseModel): description="ID of the receiving agent, e.g. 
'{run_id}:stakeholder'", ) thread_topic: str + thread_summary: str | None = Field( + default=None, + description="Optional human-readable summary set when the thread is first created.", + ) content: str task_execution_id: UUID | None = None @@ -45,6 +49,7 @@ class ThreadSummary(BaseModel): thread_id: UUID run_id: UUID topic: str + summary: str | None = None agent_a_id: str agent_b_id: str message_count: int @@ -56,6 +61,7 @@ class ThreadWithMessages(BaseModel): thread_id: UUID run_id: UUID topic: str + summary: str | None = None agent_a_id: str agent_b_id: str messages: list[MessageResponse] diff --git a/ergon_core/ergon_core/core/runtime/services/communication_service.py b/ergon_core/ergon_core/core/runtime/services/communication_service.py index c0f5ad9f..04d06778 100644 --- a/ergon_core/ergon_core/core/runtime/services/communication_service.py +++ b/ergon_core/ergon_core/core/runtime/services/communication_service.py @@ -35,6 +35,7 @@ async def save_message(self, request: CreateMessageRequest) -> MessageResponse: agent_a_id=request.from_agent_id, agent_b_id=request.to_agent_id, topic=request.thread_topic, + thread_summary=request.thread_summary, ) seq_num = ( @@ -80,6 +81,7 @@ async def save_message(self, request: CreateMessageRequest) -> MessageResponse: id=str(thread.id), run_id=str(thread.run_id), topic=thread.topic, + summary=thread.summary, agent_a_id=thread.agent_a_id, agent_b_id=thread.agent_b_id, created_at=thread.created_at, @@ -95,6 +97,7 @@ async def save_message(self, request: CreateMessageRequest) -> MessageResponse: to_agent_id=message.to_agent_id, content=message.content, sequence_num=message.sequence_num, + task_execution_id=str(message.task_execution_id) if message.task_execution_id else None, created_at=message.created_at, ) try: @@ -153,6 +156,7 @@ def get_all_threads_for_run(self, run_id: UUID) -> list[ThreadSummary]: thread_id=thread.id, run_id=thread.run_id, topic=thread.topic, + summary=thread.summary, agent_a_id=thread.agent_a_id, 
agent_b_id=thread.agent_b_id, message_count=count, @@ -174,6 +178,7 @@ def get_thread_with_messages(self, thread_id: UUID) -> ThreadWithMessages | None thread_id=thread.id, run_id=thread.run_id, topic=thread.topic, + summary=thread.summary, agent_a_id=thread.agent_a_id, agent_b_id=thread.agent_b_id, messages=messages, @@ -193,6 +198,7 @@ def _get_or_create_thread( agent_a_id: str, agent_b_id: str, topic: str, + thread_summary: str | None = None, ) -> Thread: # Threads are keyed by (run_id, topic) only — all senders on the same # topic share one thread per run (broadcast/group semantics). @@ -201,10 +207,19 @@ def _get_or_create_thread( stmt = select(Thread).where(Thread.run_id == run_id).where(Thread.topic == topic) existing = session.exec(stmt).first() if existing is not None: + if existing.summary is None and thread_summary: + existing.summary = thread_summary + session.add(existing) return existing a, b = sorted([agent_a_id, agent_b_id]) - thread = Thread(run_id=run_id, topic=topic, agent_a_id=a, agent_b_id=b) + thread = Thread( + run_id=run_id, + topic=topic, + summary=thread_summary, + agent_a_id=a, + agent_b_id=b, + ) session.add(thread) try: session.flush() diff --git a/ergon_core/ergon_core/core/runtime/services/run_read_service.py b/ergon_core/ergon_core/core/runtime/services/run_read_service.py index aba8a421..f8a2b811 100644 --- a/ergon_core/ergon_core/core/runtime/services/run_read_service.py +++ b/ergon_core/ergon_core/core/runtime/services/run_read_service.py @@ -164,7 +164,11 @@ def build_run_snapshot(self, run_id: UUID) -> RunSnapshotDto | None: ), context_events_by_task=dict(context_events_by_task), sandboxes_by_task=run_api_helpers._task_keyed_sandboxes(run_summary), - threads=run_api_helpers._build_communication_threads(threads, thread_messages), + threads=run_api_helpers._build_communication_threads( + threads, + thread_messages, + execution_task_map, + ), started_at=run.started_at or run.created_at, completed_at=run.completed_at, 
duration_seconds=duration_seconds, diff --git a/ergon_core/ergon_core/core/settings.py b/ergon_core/ergon_core/core/settings.py index 83588e68..e2643d71 100644 --- a/ergon_core/ergon_core/core/settings.py +++ b/ergon_core/ergon_core/core/settings.py @@ -56,9 +56,9 @@ class Settings(BaseSettings): default=False, validation_alias=AliasChoices("ENABLE_TEST_HARNESS"), ) - enable_smoke_fixtures: bool | None = Field( - default=None, - validation_alias=AliasChoices("ENABLE_SMOKE_FIXTURES"), + startup_plugin_specs: str = Field( + default="", + validation_alias=AliasChoices("ERGON_STARTUP_PLUGINS"), ) @property @@ -70,12 +70,8 @@ def runs_dir(self) -> Path: return self.data_dir / "runs" @property - def smoke_fixtures_enabled(self) -> bool: - return ( - self.enable_smoke_fixtures - if self.enable_smoke_fixtures is not None - else self.enable_test_harness - ) + def startup_plugins(self) -> tuple[str, ...]: + return tuple(spec.strip() for spec in self.startup_plugin_specs.split(",") if spec.strip()) def missing_values(self, names: list[str]) -> list[str]: return [ diff --git a/ergon_core/ergon_core/test_support/sandbox/__init__.py b/ergon_core/ergon_core/test_support/sandbox/__init__.py new file mode 100644 index 00000000..295929ef --- /dev/null +++ b/ergon_core/ergon_core/test_support/sandbox/__init__.py @@ -0,0 +1,18 @@ +"""Test-support sandbox doubles.""" + +from ergon_core.test_support.sandbox.sentinel import is_stub_sandbox_id + +__all__ = ["StubSandboxManager", "is_stub_sandbox_id"] + + +def __getattr__( + name: str, +) -> object: # slopcop: ignore[no-typing-any] -- module-level lazy export hook. + if name == "StubSandboxManager": + # reason: avoid importing manager/test doubles unless explicitly requested. 
+ from ergon_core.test_support.sandbox.stub_manager import ( + StubSandboxManager, + ) + + return StubSandboxManager + raise AttributeError(name) diff --git a/ergon_core/ergon_core/test_support/sandbox/sentinel.py b/ergon_core/ergon_core/test_support/sandbox/sentinel.py new file mode 100644 index 00000000..1bc9abe8 --- /dev/null +++ b/ergon_core/ergon_core/test_support/sandbox/sentinel.py @@ -0,0 +1,9 @@ +"""Sentinel helpers for test-support sandbox IDs.""" + +STUB_SANDBOX_PREFIX = "stub-sandbox-" + + +def is_stub_sandbox_id( + sandbox_id: object, # slopcop: ignore[no-typing-any] -- sentinel check accepts arbitrary persisted JSON values. +) -> bool: + return isinstance(sandbox_id, str) and sandbox_id.startswith(STUB_SANDBOX_PREFIX) diff --git a/ergon_core/ergon_core/test_support/sandbox/stub_manager.py b/ergon_core/ergon_core/test_support/sandbox/stub_manager.py new file mode 100644 index 00000000..1674ddb3 --- /dev/null +++ b/ergon_core/ergon_core/test_support/sandbox/stub_manager.py @@ -0,0 +1,53 @@ +"""Sandbox manager test double.""" + +import logging +from typing import cast +from uuid import UUID + +from ergon_core.core.providers.sandbox.manager import AsyncSandbox, BaseSandboxManager +from ergon_core.test_support.sandbox.sentinel import STUB_SANDBOX_PREFIX + +logger = logging.getLogger(__name__) + + +class _StubSandbox: + def __init__(self, sandbox_id: str) -> None: + self.sandbox_id = sandbox_id + + async def kill(self) -> None: + return None + + +class StubSandboxManager(BaseSandboxManager): + """No-op sandbox manager for tests.""" + + async def create( + self, + sandbox_key: UUID, + run_id: UUID, + timeout_minutes: int = 30, + envs: dict[str, str] | None = None, + display_task_id: UUID | None = None, + ) -> str: + stub_id = f"{STUB_SANDBOX_PREFIX}{sandbox_key}" + logger.info("Returning test stub sandbox id %s for task %s", stub_id, sandbox_key) + self._ensure_registries(sandbox_key) + self._sandboxes[sandbox_key] = cast("AsyncSandbox", 
_StubSandbox(stub_id)) + self._run_ids[sandbox_key] = run_id + self._display_task_ids[sandbox_key] = display_task_id or sandbox_key + self._sandbox_manager_classes[sandbox_key] = type(self) + return stub_id + + async def _install_dependencies(self, sandbox: AsyncSandbox, task_id: UUID) -> None: + return None + + async def terminate(self, task_id: UUID, reason: str = "completed") -> None: + self._sandboxes.pop(task_id, None) + self._file_registries.pop(task_id, None) + self._created_files_registry.pop(task_id, None) + self._run_ids.pop(task_id, None) + self._display_task_ids.pop(task_id, None) + self._sandbox_manager_classes.pop(task_id, None) + + async def reset_timeout(self, task_id: UUID, timeout_minutes: int = 30) -> bool: + return True diff --git a/ergon_core/ergon_core/test_support/smoke_fixtures/__init__.py b/ergon_core/ergon_core/test_support/smoke_fixtures/__init__.py index f60f5b68..275204b8 100644 --- a/ergon_core/ergon_core/test_support/smoke_fixtures/__init__.py +++ b/ergon_core/ergon_core/test_support/smoke_fixtures/__init__.py @@ -29,18 +29,20 @@ ) from ergon_core.test_support.smoke_fixtures.sandbox import SmokeSandboxManager from ergon_core.test_support.smoke_fixtures.workers.minif2f_smoke import ( + MiniF2FFailingLeafWorker, + MiniF2FSadPathSmokeWorker, MiniF2FSmokeLeafWorker, MiniF2FSmokeWorker, ) from ergon_core.test_support.smoke_fixtures.workers.researchrubrics_smoke import ( - ResearchRubricsSmokeLeafWorker, - ResearchRubricsSmokeWorker, -) -from ergon_core.test_support.smoke_fixtures.workers.researchrubrics_smoke_sadpath import ( ResearchRubricsFailingLeafWorker, ResearchRubricsSadPathSmokeWorker, + ResearchRubricsSmokeLeafWorker, + ResearchRubricsSmokeWorker, ) from ergon_core.test_support.smoke_fixtures.workers.swebench_smoke import ( + SweBenchFailingLeafWorker, + SweBenchSadPathSmokeWorker, SweBenchSmokeLeafWorker, SweBenchSmokeWorker, ) @@ -79,12 +81,16 @@ def register_smoke_fixtures() -> None: 
WORKERS[ResearchRubricsSadPathSmokeWorker.type_slug] = ResearchRubricsSadPathSmokeWorker WORKERS[ResearchRubricsFailingLeafWorker.type_slug] = ResearchRubricsFailingLeafWorker - # MiniF2F happy-path + # MiniF2F happy + sad-path WORKERS[MiniF2FSmokeWorker.type_slug] = MiniF2FSmokeWorker WORKERS[MiniF2FSmokeLeafWorker.type_slug] = MiniF2FSmokeLeafWorker + WORKERS[MiniF2FSadPathSmokeWorker.type_slug] = MiniF2FSadPathSmokeWorker + WORKERS[MiniF2FFailingLeafWorker.type_slug] = MiniF2FFailingLeafWorker EVALUATORS[MiniF2FSmokeRubric.type_slug] = MiniF2FSmokeRubric - # SWE-Bench Verified happy-path + # SWE-Bench Verified happy + sad-path WORKERS[SweBenchSmokeWorker.type_slug] = SweBenchSmokeWorker WORKERS[SweBenchSmokeLeafWorker.type_slug] = SweBenchSmokeLeafWorker + WORKERS[SweBenchSadPathSmokeWorker.type_slug] = SweBenchSadPathSmokeWorker + WORKERS[SweBenchFailingLeafWorker.type_slug] = SweBenchFailingLeafWorker EVALUATORS[SweBenchSmokeRubric.type_slug] = SweBenchSmokeRubric diff --git a/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/sadpath.py b/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/sadpath.py new file mode 100644 index 00000000..a09e9bf6 --- /dev/null +++ b/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/sadpath.py @@ -0,0 +1,83 @@ +"""Shared smoke sad-path helpers. + +The canonical sad path routes ``l_2`` to a failing leaf. ``l_3`` depends +on ``l_2``, so runtime propagation should leave ``l_3`` blocked and never +started while independent branches continue normally. 
+""" + +from typing import ClassVar + +from e2b_code_interpreter import AsyncSandbox # type: ignore[import-untyped] + +from ergon_core.api import WorkerContext +from ergon_core.core.persistence.shared.types import AssignedWorkerSlug, TaskSlug +from ergon_core.core.runtime.services.task_management_dto import SubtaskSpec +from ergon_core.test_support.smoke_fixtures.smoke_base.subworker import SubworkerResult + + +class AlwaysFailSubworker: + """Writes partial work and runs a probe before returning failure.""" + + async def work(self, node_id: str, sandbox: AsyncSandbox) -> SubworkerResult: + partial_path = f"/workspace/final_output/partial_{node_id}.md" + await sandbox.files.write( + partial_path, + ( + f"# Partial work {node_id}\n\n" + "This content was written before a deliberate failure. If smoke " + "sees this as a RunResource row, partial serialization works.\n" + ), + ) + + pre_check = await sandbox.commands.run( + f"wc -l {partial_path}", + timeout=5, + ) + if pre_check.exit_code != 0: + raise RuntimeError( + "AlwaysFailSubworker: precondition failed - expected wc to " + f"succeed but got exit={pre_check.exit_code}. Sad-path design " + "assumes partial work completes cleanly before the failure result.", + ) + + return SubworkerResult( + file_path=partial_path, + probe_stdout=( + f"SmokeSadPathError: deliberate failure of {node_id} after " + f"writing {partial_path} and running probe " + f"(exit={pre_check.exit_code}). Smoke asserts the partial file + " + "probe WAL survive." 
+ ), + probe_exit_code=1, + ) + + +class SadPathSmokeWorkerMixin: + """Route ``l_2`` to a failing leaf without changing smoke topology.""" + + FAILING_SLUGS: ClassVar[frozenset[str]] = frozenset({"l_2"}) + FAILING_LEAF_SLUG: ClassVar[str] + leaf_slug: ClassVar[str] + + def _spec_for(self, slug, deps, desc): + leaf_slug = self.FAILING_LEAF_SLUG if slug in self.FAILING_SLUGS else self.leaf_slug + return SubtaskSpec( + task_slug=TaskSlug(slug), + description=desc, + assigned_worker_slug=AssignedWorkerSlug(leaf_slug), + depends_on=[TaskSlug(d) for d in deps], + ) + + +class FailingSmokeLeafMixin: + """Suppress happy-path completion messages for deliberate failing leaves.""" + + async def _send_completion_message( + self, + context: WorkerContext, + result: SubworkerResult, + ) -> None: + return None + + +__all__ = ["AlwaysFailSubworker", "FailingSmokeLeafMixin", "SadPathSmokeWorkerMixin"] diff --git a/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/worker_base.py b/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/worker_base.py index d2e43cba..ecdc78fe 100644 --- a/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/worker_base.py +++ b/ergon_core/ergon_core/test_support/smoke_fixtures/smoke_base/worker_base.py @@ -19,6 +19,7 @@ from ergon_core.api import BenchmarkTask, Worker, WorkerContext from ergon_core.api.generation import GenerationTurn, TextPart +from ergon_core.api.results import WorkerOutput from ergon_core.core.persistence.graph.status_conventions import TERMINAL_STATUSES from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.shared.types import ( @@ -61,6 +62,10 @@ class SmokeWorkerBase(Worker): # (see tests/e2e/_asserts.py ``_assert_run_turn_counts``). 
PARENT_TURN_COUNT: ClassVar[int] = 3 + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self._last_child_statuses: dict[str, str] = {} + @final async def execute( self, @@ -138,9 +143,24 @@ async def execute( parent_node_id=context.node_id, ) if children and all(c.status in _CHILD_WAIT_TERMINAL_STATUSES for c in children): + self._last_child_statuses = {c.task_slug: c.status for c in children} break await asyncio.sleep(2) + def get_output(self, context: WorkerContext) -> WorkerOutput: + non_completed = { + slug: status + for slug, status in self._last_child_statuses.items() + if status != "completed" + } + if non_completed: + return WorkerOutput( + output=f"child tasks did not all complete: {non_completed}", + success=False, + metadata={"child_statuses": self._last_child_statuses}, + ) + return super().get_output(context) + def _spec_for( self, slug: str, diff --git a/ergon_core/ergon_core/test_support/smoke_fixtures/workers/minif2f_smoke.py b/ergon_core/ergon_core/test_support/smoke_fixtures/workers/minif2f_smoke.py index 05ad2ed4..8f661f69 100644 --- a/ergon_core/ergon_core/test_support/smoke_fixtures/workers/minif2f_smoke.py +++ b/ergon_core/ergon_core/test_support/smoke_fixtures/workers/minif2f_smoke.py @@ -12,6 +12,11 @@ from e2b_code_interpreter import AsyncSandbox # type: ignore[import-untyped] from ergon_core.test_support.smoke_fixtures.smoke_base.leaf_base import BaseSmokeLeafWorker +from ergon_core.test_support.smoke_fixtures.smoke_base.sadpath import ( + AlwaysFailSubworker, + FailingSmokeLeafMixin, + SadPathSmokeWorkerMixin, +) from ergon_core.test_support.smoke_fixtures.smoke_base.subworker import SubworkerResult from ergon_core.test_support.smoke_fixtures.smoke_base.worker_base import SmokeWorkerBase @@ -63,7 +68,24 @@ class MiniF2FSmokeLeafWorker(BaseSmokeLeafWorker): subworker_cls = MiniF2FSubworker +class MiniF2FFailingLeafWorker(FailingSmokeLeafMixin, BaseSmokeLeafWorker): + """Registered leaf that fails after partial work.""" 
+ + type_slug = "minif2f-smoke-leaf-failing" + subworker_cls = AlwaysFailSubworker + + +class MiniF2FSadPathSmokeWorker(SadPathSmokeWorkerMixin, SmokeWorkerBase): + """Parent that routes ``l_2`` to the failing leaf.""" + + type_slug = "minif2f-sadpath-smoke-worker" + leaf_slug = "minif2f-smoke-leaf" + FAILING_LEAF_SLUG = "minif2f-smoke-leaf-failing" + + __all__ = [ + "MiniF2FFailingLeafWorker", + "MiniF2FSadPathSmokeWorker", "MiniF2FSmokeLeafWorker", "MiniF2FSmokeWorker", "MiniF2FSubworker", diff --git a/ergon_core/ergon_core/test_support/smoke_fixtures/workers/researchrubrics_smoke.py b/ergon_core/ergon_core/test_support/smoke_fixtures/workers/researchrubrics_smoke.py index 092ed281..11cce463 100644 --- a/ergon_core/ergon_core/test_support/smoke_fixtures/workers/researchrubrics_smoke.py +++ b/ergon_core/ergon_core/test_support/smoke_fixtures/workers/researchrubrics_smoke.py @@ -19,6 +19,11 @@ from e2b_code_interpreter import AsyncSandbox # type: ignore[import-untyped] from ergon_core.test_support.smoke_fixtures.smoke_base.leaf_base import BaseSmokeLeafWorker +from ergon_core.test_support.smoke_fixtures.smoke_base.sadpath import ( + AlwaysFailSubworker, + FailingSmokeLeafMixin, + SadPathSmokeWorkerMixin, +) from ergon_core.test_support.smoke_fixtures.smoke_base.subworker import SubworkerResult from ergon_core.test_support.smoke_fixtures.smoke_base.worker_base import SmokeWorkerBase @@ -69,7 +74,24 @@ class ResearchRubricsSmokeLeafWorker(BaseSmokeLeafWorker): subworker_cls = ResearchRubricsSubworker +class ResearchRubricsFailingLeafWorker(FailingSmokeLeafMixin, BaseSmokeLeafWorker): + """Registered leaf that fails after partial work.""" + + type_slug = "researchrubrics-smoke-leaf-failing" + subworker_cls = AlwaysFailSubworker + + +class ResearchRubricsSadPathSmokeWorker(SadPathSmokeWorkerMixin, SmokeWorkerBase): + """Parent that routes ``l_2`` to the failing leaf.""" + + type_slug = "researchrubrics-sadpath-smoke-worker" + leaf_slug = "researchrubrics-smoke-leaf" + 
FAILING_LEAF_SLUG = "researchrubrics-smoke-leaf-failing" + + __all__ = [ + "ResearchRubricsFailingLeafWorker", + "ResearchRubricsSadPathSmokeWorker", "ResearchRubricsSmokeLeafWorker", "ResearchRubricsSmokeWorker", "ResearchRubricsSubworker", diff --git a/ergon_core/ergon_core/test_support/smoke_fixtures/workers/researchrubrics_smoke_sadpath.py b/ergon_core/ergon_core/test_support/smoke_fixtures/workers/researchrubrics_smoke_sadpath.py index fda081bf..9ccc38f4 100644 --- a/ergon_core/ergon_core/test_support/smoke_fixtures/workers/researchrubrics_smoke_sadpath.py +++ b/ergon_core/ergon_core/test_support/smoke_fixtures/workers/researchrubrics_smoke_sadpath.py @@ -1,123 +1,10 @@ -"""ResearchRubrics score-zero sad-path fixture. +"""Compatibility imports for the ResearchRubrics sad-path fixture.""" -Used in researchrubrics cohort slot 3 (see -``docs/superpowers/plans/test-refactor/00-program.md §3.2``). Routes -``l_2`` to a failing leaf that DOES real work (file write + sandbox -command) BEFORE raising; the rest of the 9-subtask topology is -unchanged. - -Driver asserts partial artifact + pre-fail WAL entry persist, all leaves -complete, and the run evaluation scores zero because l_2 reports a failed -probe result. -""" - -from typing import ClassVar - -from e2b_code_interpreter import AsyncSandbox # type: ignore[import-untyped] -from ergon_core.api import WorkerContext - -from ergon_core.core.persistence.shared.types import AssignedWorkerSlug, TaskSlug -from ergon_core.core.runtime.services.task_management_dto import SubtaskSpec - -from ergon_core.test_support.smoke_fixtures.smoke_base.leaf_base import BaseSmokeLeafWorker -from ergon_core.test_support.smoke_fixtures.smoke_base.subworker import SubworkerResult -from ergon_core.test_support.smoke_fixtures.smoke_base.worker_base import SmokeWorkerBase - - -class AlwaysFailSubworker: - """Does TWO units of real work, then returns a failing probe result. - - Proves the partial-work-persists-on-failure path. 
When the leaf - fails after partial work: - - 1. The partial file we wrote to ``/workspace/final_output/`` still - becomes a ``RunResource`` row (the runtime's persist step runs - regardless of worker exit outcome). - 2. The sandbox command we already ran still emits a - ``sandbox_command`` event / WAL entry (the command path writes - synchronously, before our raise). - 3. The leaf's task row still completes because worker execution itself - completed and output persistence should remain exercised. - 4. The reused smoke criterion scores the run zero after reading the - failed probe result. - """ - - async def work(self, node_id: str, sandbox: AsyncSandbox) -> SubworkerResult: - # Action 1: write partial artifact — must land as a RunResource. - partial_path = f"/workspace/final_output/partial_{node_id}.md" - await sandbox.files.write( - partial_path, - ( - f"# Partial work {node_id}\n\n" - "This content was written before a deliberate failure. If smoke " - "sees this as a RunResource row, partial serialization works.\n" - ), - ) - - # Action 2: run a sandbox command — must emit sandbox_command WAL. - pre_check = await sandbox.commands.run( - f"wc -l {partial_path}", - timeout=5, - ) - if pre_check.exit_code != 0: - raise RuntimeError( - "AlwaysFailSubworker: precondition failed — expected wc to " - f"succeed but got exit={pre_check.exit_code}. Sad-path design " - "assumes partial work completes cleanly before the raise.", - ) - - # Action 3: deliberate failure via WorkerOutput.success=False. This - # exercises the failed-task path without bypassing output persistence. - return SubworkerResult( - file_path=partial_path, - probe_stdout=( - f"SmokeSadPathError: deliberate failure of {node_id} after " - f"writing {partial_path} and running probe " - f"(exit={pre_check.exit_code}). Smoke asserts the partial file + " - "probe WAL survive." 
- ), - probe_exit_code=1, - ) - - -class ResearchRubricsFailingLeafWorker(BaseSmokeLeafWorker): - """Registered leaf that always fails after 2 units of real work.""" - - type_slug = "researchrubrics-smoke-leaf-failing" - subworker_cls = AlwaysFailSubworker - - async def _send_completion_message( - self, - context: WorkerContext, - result: SubworkerResult, - ) -> None: - """Preserve sad-path invariant: failed l_2 does not report completion.""" - return None - - -class ResearchRubricsSadPathSmokeWorker(SmokeWorkerBase): - """Parent that routes ``l_2`` to the failing leaf; everything else - routes to the normal leaf. - - Topology stays identical (still 9 subtasks, same deps); only the leaf - binding for ``l_2`` differs. ``execute`` is still ``@final``; the - hook is ``_spec_for``. - """ - - type_slug = "researchrubrics-sadpath-smoke-worker" - leaf_slug = "researchrubrics-smoke-leaf" # default for everything EXCEPT l_2 - - FAILING_SLUGS: ClassVar[frozenset[str]] = frozenset({"l_2"}) - FAILING_LEAF_SLUG: ClassVar[str] = "researchrubrics-smoke-leaf-failing" - - def _spec_for(self, slug, deps, desc): - leaf_slug = self.FAILING_LEAF_SLUG if slug in self.FAILING_SLUGS else self.leaf_slug - return SubtaskSpec( - task_slug=TaskSlug(slug), - description=desc, - assigned_worker_slug=AssignedWorkerSlug(leaf_slug), - depends_on=[TaskSlug(d) for d in deps], - ) +from ergon_core.test_support.smoke_fixtures.smoke_base.sadpath import AlwaysFailSubworker +from ergon_core.test_support.smoke_fixtures.workers.researchrubrics_smoke import ( + ResearchRubricsFailingLeafWorker, + ResearchRubricsSadPathSmokeWorker, +) __all__ = [ diff --git a/ergon_core/ergon_core/test_support/smoke_fixtures/workers/swebench_smoke.py b/ergon_core/ergon_core/test_support/smoke_fixtures/workers/swebench_smoke.py index cd3e7c04..4bad9cf9 100644 --- a/ergon_core/ergon_core/test_support/smoke_fixtures/workers/swebench_smoke.py +++ b/ergon_core/ergon_core/test_support/smoke_fixtures/workers/swebench_smoke.py @@ 
-10,6 +10,11 @@ from e2b_code_interpreter import AsyncSandbox # type: ignore[import-untyped] from ergon_core.test_support.smoke_fixtures.smoke_base.leaf_base import BaseSmokeLeafWorker +from ergon_core.test_support.smoke_fixtures.smoke_base.sadpath import ( + AlwaysFailSubworker, + FailingSmokeLeafMixin, + SadPathSmokeWorkerMixin, +) from ergon_core.test_support.smoke_fixtures.smoke_base.subworker import SubworkerResult from ergon_core.test_support.smoke_fixtures.smoke_base.worker_base import SmokeWorkerBase @@ -61,7 +66,24 @@ class SweBenchSmokeLeafWorker(BaseSmokeLeafWorker): subworker_cls = SweBenchSubworker +class SweBenchFailingLeafWorker(FailingSmokeLeafMixin, BaseSmokeLeafWorker): + """Registered leaf that fails after partial work.""" + + type_slug = "swebench-smoke-leaf-failing" + subworker_cls = AlwaysFailSubworker + + +class SweBenchSadPathSmokeWorker(SadPathSmokeWorkerMixin, SmokeWorkerBase): + """Parent that routes ``l_2`` to the failing leaf.""" + + type_slug = "swebench-sadpath-smoke-worker" + leaf_slug = "swebench-smoke-leaf" + FAILING_LEAF_SLUG = "swebench-smoke-leaf-failing" + + __all__ = [ + "SweBenchFailingLeafWorker", + "SweBenchSadPathSmokeWorker", "SweBenchSmokeLeafWorker", "SweBenchSmokeWorker", "SweBenchSubworker", diff --git a/ergon_core/migrations/versions/0a1b2c3d4e5f_add_thread_summary.py b/ergon_core/migrations/versions/0a1b2c3d4e5f_add_thread_summary.py new file mode 100644 index 00000000..78f779cf --- /dev/null +++ b/ergon_core/migrations/versions/0a1b2c3d4e5f_add_thread_summary.py @@ -0,0 +1,26 @@ +"""add_thread_summary + +Revision ID: 0a1b2c3d4e5f +Revises: a2b3c4d5e6f7 +Create Date: 2026-04-26 19:45:00.000000 + +Add an optional human-readable summary for communication threads.
+""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +revision: str = "0a1b2c3d4e5f" +down_revision: Union[str, None] = "a2b3c4d5e6f7" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column("threads", sa.Column("summary", sa.String(), nullable=True)) + + +def downgrade() -> None: + op.drop_column("threads", "summary") diff --git a/ergon_paper_overleaf_edit/checklist.tex b/ergon_paper_overleaf_edit/checklist.tex deleted file mode 100644 index b0bbc72e..00000000 --- a/ergon_paper_overleaf_edit/checklist.tex +++ /dev/null @@ -1,224 +0,0 @@ -\section*{NeurIPS Paper Checklist} - -\begin{enumerate} - -\item {\bf Claims} - \item[] Question: Do the main claims made in the abstract and introduction accurately reflect the paper's contributions and scope? - \item[] Answer: \answerYes{} - \item[] Justification: The abstract and \S\ref{sec:intro} claim (i) that current agent research publishes reported numbers without rollouts, producing cross-community fragmentation and cross-harness variance, and (ii) that rollout cards plus drops manifests address both problems; these are substantiated by the 50-repo survey (Appendix~\ref{app:survey}), the 37-pair variance catalogue (Appendix~\ref{app:variance-catalogue}), the format specification in \S\ref{sec:system} and Appendix~\ref{app:system}, and the two experiments of \S\ref{sec:validation}. Scope limits (proof-of-concept, two RQ1 pairings, one RQ2 reconciliation on a single benchmark, two public submissions) are stated explicitly in \S\ref{sec:validation:questions} and revisited in the Limitations paragraph of \S\ref{sec:discussion}. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the abstract and introduction do not include the claims made in the paper. 
- \item The abstract and/or introduction should clearly state the claims made, including the contributions made in the paper and important assumptions and limitations. A \answerNo{} or \answerNA{} answer to this question will not be perceived well by the reviewers. - \item The claims made should match theoretical and experimental results, and reflect how much the results can be expected to generalize to other settings. - \item It is fine to include aspirational goals as motivation as long as it is clear that these goals are not attained by the paper. - \end{itemize} - -\item {\bf Limitations} - \item[] Question: Does the paper discuss the limitations of the work performed by the authors? - \item[] Answer: \answerYes{} - \item[] Justification: \S\ref{sec:discussion} contains an explicit Limitations paragraph that enumerates: representative coverage of five communities from a long tail, 50 repositories from a much larger population, 37 variance pairs from the documented-comparison subset; citation-based (not reproduced) training-side transform-variance evidence; single-analyst cross-community reanalysis (one target community per task family); and single-benchmark reconciliation (SWE-bench Verified, two published submissions). The paper's claims are scoped to these bounds. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the paper has no limitation while the answer \answerNo{} means that the paper has limitations, but those are not discussed in the paper. - \item The authors are encouraged to create a separate ``Limitations'' section in their paper. - \item The paper should point out any strong assumptions and how robust the results are to violations of these assumptions (e.g., independence assumptions, noiseless settings, model well-specification, asymptotic approximations only holding locally). The authors should reflect on how these assumptions might be violated in practice and what the implications would be. 
- \item The authors should reflect on the scope of the claims made, e.g., if the approach was only tested on a few datasets or with a few runs. In general, empirical results often depend on implicit assumptions, which should be articulated. - \item The authors should reflect on the factors that influence the performance of the approach. For example, a facial recognition algorithm may perform poorly when image resolution is low or images are taken in low lighting. Or a speech-to-text system might not be used reliably to provide closed captions for online lectures because it fails to handle technical jargon. - \item The authors should discuss the computational efficiency of the proposed algorithms and how they scale with dataset size. - \item If applicable, the authors should discuss possible limitations of their approach to address problems of privacy and fairness. - \item While the authors might fear that complete honesty about limitations might be used by reviewers as grounds for rejection, a worse outcome might be that reviewers discover limitations that aren't acknowledged in the paper. The authors should use their best judgment and recognize that individual actions in favor of transparency play an important role in developing norms that preserve the integrity of the community. Reviewers will be specifically instructed to not penalize honesty concerning limitations. - \end{itemize} - -\item {\bf Theory assumptions and proofs} - \item[] Question: For each theoretical result, does the paper provide the full set of assumptions and a complete (and correct) proof? - \item[] Answer: \answerNA{} - \item[] Justification: The paper does not state theorems or prove formal results. Projection operators in Appendix~\ref{app:projections} are defined constructively as pseudocode specifications with accompanying drops manifests; preservation claims are enumerative (Table~\ref{tab:preservation}) and follow directly from the projection definitions, not from proof obligations. 
- \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the paper does not include theoretical results. - \item All the theorems, formulas, and proofs in the paper should be numbered and cross-referenced. - \item All assumptions should be clearly stated or referenced in the statement of any theorems. - \item The proofs can either appear in the main paper or the supplemental material, but if they appear in the supplemental material, the authors are encouraged to provide a short proof sketch to provide intuition. - \item Inversely, any informal proof provided in the core of the paper should be complemented by formal proofs provided in appendix or supplemental material. - \item Theorems and Lemmas that the proof relies upon should be properly referenced. - \end{itemize} - - \item {\bf Experimental result reproducibility} - \item[] Question: Does the paper fully disclose all the information needed to reproduce the main experimental results of the paper to the extent that it affects the main claims and/or conclusions of the paper (regardless of whether the code and data are provided or not)? - \item[] Answer: \answerYes{} - \item[] Justification: \S\ref{sec:validation:setup} specifies the three task families, agent backbone, task-specific tools, and grading procedures; Appendix~\ref{app:setup} consolidates the per-benchmark action space (\S\ref{app:actions}), benchmark selection (\S\ref{app:benchmarks}), flexible-agent details (\S\ref{app:fivescaffolds}), and cross-harness reconciliation methodology (\S\ref{app:reconciliation}); Appendix~\ref{app:system} specifies the rollout-card format bit-for-bit. The reference implementation is open-source; all reported numbers are either deterministic functions of published rollouts (RQ2) or derivable from the released rollout cards (RQ1). - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the paper does not include experiments. 
- \item If the paper includes experiments, a \answerNo{} answer to this question will not be perceived well by the reviewers: Making the paper reproducible is important, regardless of whether the code and data are provided or not. - \item If the contribution is a dataset and\slash or model, the authors should describe the steps taken to make their results reproducible or verifiable. - \item Depending on the contribution, reproducibility can be accomplished in various ways. For example, if the contribution is a novel architecture, describing the architecture fully might suffice, or if the contribution is a specific model and empirical evaluation, it may be necessary to either make it possible for others to replicate the model with the same dataset, or provide access to the model. In general. releasing code and data is often one good way to accomplish this, but reproducibility can also be provided via detailed instructions for how to replicate the results, access to a hosted model (e.g., in the case of a large language model), releasing of a model checkpoint, or other means that are appropriate to the research performed. - \item While NeurIPS does not require releasing code, the conference does require all submissions to provide some reasonable avenue for reproducibility, which may depend on the nature of the contribution. For example - \begin{enumerate} - \item If the contribution is primarily a new algorithm, the paper should make it clear how to reproduce that algorithm. - \item If the contribution is primarily a new model architecture, the paper should describe the architecture clearly and fully. - \item If the contribution is a new model (e.g., a large language model), then there should either be a way to access this model for reproducing the results or a way to reproduce the model (e.g., with an open-source dataset or instructions for how to construct the dataset). 
- \item We recognize that reproducibility may be tricky in some cases, in which case authors are welcome to describe the particular way they provide for reproducibility. In the case of closed-source models, it may be that access to the model is limited in some way (e.g., to registered users), but it should be possible for other researchers to have some path to reproducing or verifying the results. - \end{enumerate} - \end{itemize} - - -\item {\bf Open access to data and code} - \item[] Question: Does the paper provide open access to the data and code, with sufficient instructions to faithfully reproduce the main experimental results, as described in supplemental material? - \item[] Answer: \answerYes{} - \item[] Justification: Ergon (the recording substrate and reference implementation) is released as open source with README, architecture documentation, and benchmark harness; an anonymised artefact bundle accompanies this submission. The SWE-agent and Agentless SWE-bench Verified submissions used in RQ2 are already public in the \texttt{swe-bench-submissions} S3 bucket. Rollout cards for the MiniF2F and Research Rubrics experiments will be released alongside the paper, as will the cross-harness reconciliation pipeline and convention specification (\S\ref{app:reconciliation}). - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that paper does not include experiments requiring code. - \item Please see the NeurIPS code and data submission guidelines (\url{https://neurips.cc/public/guides/CodeSubmissionPolicy}) for more details. - \item While we encourage the release of code and data, we understand that this might not be possible, so \answerNo{} is an acceptable answer. Papers cannot be rejected simply for not including code, unless this is central to the contribution (e.g., for a new open-source benchmark). - \item The instructions should contain the exact command and environment needed to run to reproduce the results. 
See the NeurIPS code and data submission guidelines (\url{https://neurips.cc/public/guides/CodeSubmissionPolicy}) for more details. - \item The authors should provide instructions on data access and preparation, including how to access the raw data, preprocessed data, intermediate data, and generated data, etc. - \item The authors should provide scripts to reproduce all experimental results for the new proposed method and baselines. If only a subset of experiments are reproducible, they should state which ones are omitted from the script and why. - \item At submission time, to preserve anonymity, the authors should release anonymized versions (if applicable). - \item Providing as much information as possible in supplemental material (appended to the paper) is recommended, but including URLs to data and code is permitted. - \end{itemize} - - -\item {\bf Experimental setting/details} - \item[] Question: Does the paper specify all the training and test details (e.g., data splits, hyperparameters, how they were chosen, type of optimizer) necessary to understand the results? - \item[] Answer: \answerYes{} - \item[] Justification: The paper reports no training; all rollouts are inference-only against an API-served model. \S\ref{sec:validation:setup} specifies the model backbone, per-task-family turn cap, action space, and grading procedure. Appendix~\ref{app:setup} specifies the benchmark subsets used (\S\ref{app:benchmarks}), the full agent tool inventory and system-prompt structure (\S\ref{app:fivescaffolds}), and the convention choices in the SWE-bench reconciliation (\S\ref{app:reconciliation}). - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the paper does not include experiments. - \item The experimental setting should be presented in the core of the paper to a level of detail that is necessary to appreciate the results and make sense of them. 
- \item The full details can be provided either with the code, in appendix, or as supplemental material. - \end{itemize} - -\item {\bf Experiment statistical significance} - \item[] Question: Does the paper report error bars suitably and correctly defined or other appropriate information about the statistical significance of the experiments? - \item[] Answer: \answerYes{} - \item[] Justification: The RQ2 cross-harness reconciliation is a deterministic re-grading of already-released trajectories, so no stochastic variation enters the pipeline; we report exact denominators and decomposition deltas rather than error bars. For the RQ1 rollouts (MiniF2F and Research Rubrics), we report Wilson-score 95\% confidence intervals for the rate statistics (abandonment ratios, role-differentiation fractions) and specify the rollout count driving each interval; the cross-community analyses are qualitative proof-of-concept claims at the single-seed level, which is stated explicitly. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the paper does not include experiments. - \item The authors should answer \answerYes{} if the results are accompanied by error bars, confidence intervals, or statistical significance tests, at least for the experiments that support the main claims of the paper. - \item The factors of variability that the error bars are capturing should be clearly stated (for example, train/test split, initialization, random drawing of some parameter, or overall run with given experimental conditions). - \item The method for calculating the error bars should be explained (closed form formula, call to a library function, bootstrap, etc.) - \item The assumptions made should be given (e.g., Normally distributed errors). - \item It should be clear whether the error bar is the standard deviation or the standard error of the mean. - \item It is OK to report 1-sigma error bars, but one should state it. 
The authors should preferably report a 2-sigma error bar than state that they have a 96\% CI, if the hypothesis of Normality of errors is not verified. - \item For asymmetric distributions, the authors should be careful not to show in tables or figures symmetric error bars that would yield results that are out of range (e.g., negative error rates). - \item If error bars are reported in tables or plots, the authors should explain in the text how they were calculated and reference the corresponding figures or tables in the text. - \end{itemize} - -\item {\bf Experiments compute resources} - \item[] Question: For each experiment, does the paper provide sufficient information on the computer resources (type of compute workers, memory, time of execution) needed to reproduce the experiments? - \item[] Answer: \answerYes{} - \item[] Justification: No model training is performed. Agent rollouts for RQ1 are inference-only calls to an API-hosted LLM backbone; rollout counts and per-rollout token budgets are reported in Appendix~\ref{app:fivescaffolds}. The RQ2 reconciliation pipeline (ingestion of SWE-agent and Agentless submissions, re-grading under two conventions) runs on a single developer workstation in minutes; compute requirements are stated in \S\ref{app:reconciliation}. No GPU is required for any reported experiment. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the paper does not include experiments. - \item The paper should indicate the type of compute workers CPU or GPU, internal cluster, or cloud provider, including relevant memory and storage. - \item The paper should provide the amount of compute required for each of the individual experimental runs as well as estimate the total compute. - \item The paper should disclose whether the full research project required more compute than the experiments reported in the paper (e.g., preliminary or failed experiments that didn't make it into the paper). 
- \end{itemize} - -\item {\bf Code of ethics} - \item[] Question: Does the research conducted in the paper conform, in every respect, with the NeurIPS Code of Ethics \url{https://neurips.cc/public/EthicsGuidelines}? - \item[] Answer: \answerYes{} - \item[] Justification: The paper proposes a publication format and reference implementation for agent research. It involves no human subjects and no data scraping beyond publicly-released benchmark submissions whose licenses permit re-analysis. The reference implementation is released as open source. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the authors have not reviewed the NeurIPS Code of Ethics. - \item If the authors answer \answerNo, they should explain the special circumstances that require a deviation from the Code of Ethics. - \item The authors should make sure to preserve anonymity (e.g., if there is a special consideration due to laws or regulations in their jurisdiction). - \end{itemize} - - -\item {\bf Broader impacts} - \item[] Question: Does the paper discuss both potential positive societal impacts and negative societal impacts of the work performed? - \item[] Answer: \answerYes{} - \item[] Justification: Positive impacts --- improved auditability of agent-research claims, cross-harness reconciliation, and legibility of methodology differences --- are discussed in \S\ref{sec:discussion} (``Publication cost and adoption'' and ``Detection, not resolution''). Negative impacts are minimal: a publication format itself cannot be weaponised, and the rollouts released alongside benchmarks inherit the data-handling norms of their source benchmarks (MiniF2F, SWE-bench Verified, Research Rubrics). No new scraped or high-risk data is introduced. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that there is no societal impact of the work performed. 
- \item If the authors answer \answerNA{} or \answerNo, they should explain why their work has no societal impact or why the paper does not address societal impact. - \item Examples of negative societal impacts include potential malicious or unintended uses (e.g., disinformation, generating fake profiles, surveillance), fairness considerations (e.g., deployment of technologies that could make decisions that unfairly impact specific groups), privacy considerations, and security considerations. - \item The conference expects that many papers will be foundational research and not tied to particular applications, let alone deployments. However, if there is a direct path to any negative applications, the authors should point it out. For example, it is legitimate to point out that an improvement in the quality of generative models could be used to generate Deepfakes for disinformation. On the other hand, it is not needed to point out that a generic algorithm for optimizing neural networks could enable people to train models that generate Deepfakes faster. - \item The authors should consider possible harms that could arise when the technology is being used as intended and functioning correctly, harms that could arise when the technology is being used as intended but gives incorrect results, and harms following from (intentional or unintentional) misuse of the technology. - \item If there are negative societal impacts, the authors could also discuss possible mitigation strategies (e.g., gated release of models, providing defenses in addition to attacks, mechanisms for monitoring misuse, mechanisms to monitor how a system learns from feedback over time, improving the efficiency and accessibility of ML). - \end{itemize} - -\item {\bf Safeguards} - \item[] Question: Does the paper describe safeguards that have been put in place for responsible release of data or models that have a high risk for misuse (e.g., pre-trained language models, image generators, or scraped datasets)? 
- \item[] Answer: \answerNA{} - \item[] Justification: The paper releases a format specification (rollout cards) and a reference recording substrate (Ergon). Neither is a pre-trained model, generative system, or scraped dataset. The rollouts released alongside are bounded by the source benchmarks' existing release policies. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the paper poses no such risks. - \item Released models that have a high risk for misuse or dual-use should be released with necessary safeguards to allow for controlled use of the model, for example by requiring that users adhere to usage guidelines or restrictions to access the model or implementing safety filters. - \item Datasets that have been scraped from the Internet could pose safety risks. The authors should describe how they avoided releasing unsafe images. - \item We recognize that providing effective safeguards is challenging, and many papers do not require this, but we encourage authors to take this into account and make a best faith effort. - \end{itemize} - -\item {\bf Licenses for existing assets} - \item[] Question: Are the creators or original owners of assets (e.g., code, data, models), used in the paper, properly credited and are the license and terms of use explicitly mentioned and properly respected? - \item[] Answer: \answerYes{} - \item[] Justification: MiniF2F~\citep{zheng2022minif2f}, SWE-bench Verified, SWE-agent and Agentless submissions, the Research Rubrics benchmark, and the LLM API backbones used for rollout generation and rubric grading are all cited in \S\ref{sec:validation} and \S\ref{sec:related}. Asset licenses and version identifiers are catalogued in Appendix~\ref{app:benchmarks}, including the specific SWE-bench submission commits and the API model versions used. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the paper does not use existing assets. 
- \item The authors should cite the original paper that produced the code package or dataset. - \item The authors should state which version of the asset is used and, if possible, include a URL. - \item The name of the license (e.g., CC-BY 4.0) should be included for each asset. - \item For scraped data from a particular source (e.g., website), the copyright and terms of service of that source should be provided. - \item If assets are released, the license, copyright information, and terms of use in the package should be provided. For popular datasets, \url{paperswithcode.com/datasets} has curated licenses for some datasets. Their licensing guide can help determine the license of a dataset. - \item For existing datasets that are re-packaged, both the original license and the license of the derived asset (if it has changed) should be provided. - \item If this information is not available online, the authors are encouraged to reach out to the asset's creators. - \end{itemize} - -\item {\bf New assets} - \item[] Question: Are new assets introduced in the paper well documented and is the documentation provided alongside the assets? - \item[] Answer: \answerYes{} - \item[] Justification: Three new assets accompany the paper: (i) the rollout-card format specification (Appendix~\ref{app:system}); (ii) Ergon, the reference implementation, with a README, architecture documentation, and RFC stream for pending integrations (Appendix~\ref{app:integrations}); (iii) ingestion adapters for SWE-agent and Agentless submissions with accompanying drops manifests (\S\ref{app:reconciliation}). All three are documented in-repository and anonymised for submission. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the paper does not release new assets. - \item Researchers should communicate the details of the dataset\slash code\slash model as part of their submissions via structured templates. This includes details about training, license, limitations, etc. 
- \item The paper should discuss whether and how consent was obtained from people whose asset is used. - \item At submission time, remember to anonymize your assets (if applicable). You can either create an anonymized URL or include an anonymized zip file. - \end{itemize} - -\item {\bf Crowdsourcing and research with human subjects} - \item[] Question: For crowdsourcing experiments and research with human subjects, does the paper include the full text of instructions given to participants and screenshots, if applicable, as well as details about compensation (if any)? - \item[] Answer: \answerNA{} - \item[] Justification: The paper involves no crowdsourcing and no human subjects. The Research Rubrics LLM-based evaluator is an API grading transform, not a human participant; the 50-repo survey and the 37-pair variance catalogue are authored by the paper's authors against public artefacts. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the paper does not involve crowdsourcing nor research with human subjects. - \item Including this information in the supplemental material is fine, but if the main contribution of the paper involves human subjects, then as much detail as possible should be included in the main paper. - \item According to the NeurIPS Code of Ethics, workers involved in data collection, curation, or other labor should be paid at least the minimum wage in the country of the data collector. - \end{itemize} - -\item {\bf Institutional review board (IRB) approvals or equivalent for research with human subjects} - \item[] Question: Does the paper describe potential risks incurred by study participants, whether such risks were disclosed to the subjects, and whether Institutional Review Board (IRB) approvals (or an equivalent approval/review based on the requirements of your country or institution) were obtained? - \item[] Answer: \answerNA{} - \item[] Justification: No human subjects research is performed. 
No IRB review is applicable. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the paper does not involve crowdsourcing nor research with human subjects. - \item Depending on the country in which research is conducted, IRB approval (or equivalent) may be required for any human subjects research. If you obtained IRB approval, you should clearly state this in the paper. - \item We recognize that the procedures for this may vary significantly between institutions and locations, and we expect authors to adhere to the NeurIPS Code of Ethics and the guidelines for their institution. - \item For initial submissions, do not include any information that would break anonymity (if applicable), such as the institution conducting the review. - \end{itemize} - -\item {\bf Declaration of LLM usage} - \item[] Question: Does the paper describe the usage of LLMs if it is an important, original, or non-standard component of the core methods in this research? Note that if the LLM is used only for writing, editing, or formatting purposes and does \emph{not} impact the core methodology, scientific rigor, or originality of the research, declaration is not required. - %this research? - \item[] Answer: \answerYes{} - \item[] Justification: LLMs are the central object of study. The flexible-agent worker of \S\ref{sec:validation:setup} is an LLM-backed scaffold running on an API-hosted backbone; the Research Rubrics evaluator is a GPT-4o-mini grading transform; the SWE-bench reconciliation analyses trajectories produced by third-party LLM agents (SWE-agent, Agentless) on a common API-hosted backbone. All model identities, versions, and prompting interfaces are named in \S\ref{sec:validation:setup} and Appendix~\ref{app:fivescaffolds}. - \item[] Guidelines: - \begin{itemize} - \item The answer \answerNA{} means that the core method development in this research does not involve LLMs as any important, original, or non-standard components. 
- \item Please refer to our LLM policy in the NeurIPS handbook for what should or should not be described. - \end{itemize} - -\end{enumerate} diff --git a/ergon_paper_overleaf_edit/figures/ergon_dashboard.pdf b/ergon_paper_overleaf_edit/figures/ergon_dashboard.pdf deleted file mode 100644 index 6e81f05e..00000000 Binary files a/ergon_paper_overleaf_edit/figures/ergon_dashboard.pdf and /dev/null differ diff --git a/ergon_paper_overleaf_edit/figures/ergon_schema.png b/ergon_paper_overleaf_edit/figures/ergon_schema.png deleted file mode 100644 index ddea0d5e..00000000 Binary files a/ergon_paper_overleaf_edit/figures/ergon_schema.png and /dev/null differ diff --git a/ergon_paper_overleaf_edit/main.pdf b/ergon_paper_overleaf_edit/main.pdf deleted file mode 100644 index be6ea0d2..00000000 Binary files a/ergon_paper_overleaf_edit/main.pdf and /dev/null differ diff --git a/ergon_paper_overleaf_edit/main.tex b/ergon_paper_overleaf_edit/main.tex deleted file mode 100644 index 26918a28..00000000 --- a/ergon_paper_overleaf_edit/main.tex +++ /dev/null @@ -1,3581 +0,0 @@ -\documentclass{article} - -% Evaluations & Datasets track, anonymous submission. -% For camera-ready, switch to: \usepackage[eandd, final]{neurips_2026} -% For arXiv preprint: \usepackage[eandd, preprint]{neurips_2026} -\usepackage[eandd]{neurips_2026} - -\usepackage[utf8]{inputenc} -\usepackage[T1]{fontenc} -\usepackage{hyperref} -\usepackage{url} -\usepackage{booktabs} -\usepackage{array} -\usepackage{enumitem} -\usepackage{amsfonts} -\usepackage{amsmath,amssymb} -\usepackage{nicefrac} -\usepackage{microtype} -\usepackage{xcolor} -\usepackage{graphicx} -\graphicspath{{figures/}} -% Note: algorithm + algpseudocode packages commented out during -% 2026-04-21 reorg --- sandbox texlive install lacks texlive-science. -% Currently unused (no algorithmic/algorithm environments in main.tex or -% checklist.tex). Re-enable when adding pseudocode blocks, and install -% texlive-science (apt) on the build machine. 
-% \usepackage{algorithm} -% \usepackage{algpseudocode} -\usepackage{subcaption} -\usepackage{multirow} -\usepackage{listings} -\usepackage{pifont} - -% --- Draft helpers (remove for camera-ready) --- -\newcommand{\todo}[1]{\textcolor{red}{\textbf{[TODO: #1]}}} -\newcommand{\placeholder}[2]{% - \begin{center} - \fcolorbox{black}{gray!15}{% - \parbox{#1\linewidth}{\centering\vspace{1.2cm}\textit{#2}\vspace{1.2cm}}% - } - \end{center} -} -% v5.6 placeholder for experimental numbers: use \expnum{X.X} for values TBD pending Experiment A -\newcommand{\expnum}[1]{\textcolor{red}{\textbf{[#1]}}} - -\lstset{ - basicstyle=\ttfamily\footnotesize, - breaklines=true, - frame=single, - columns=fullflexible, - showstringspaces=false, - language=Python, - keywordstyle=\color{blue!70!black}, - commentstyle=\color{gray}, - stringstyle=\color{green!50!black}, -} - -\title{Rollout Cards for Agent Research} -% NOTE (v5.7, 2026-04-21): retitled from "Ergon: An Async Multi-Agent Gym for -% Decomposed Long-Horizon Work" to foreground the rollout-card publication-format -% contribution and anchor in the Datasheets/Model Cards genre, per the pivot -% reflected in the current abstract / §1 / §2 framing. Ergon reference -% implementation now introduced in §3 body, not in the title. - -\author{% - Anonymous Authors -} - -\begin{document} - -\maketitle - -% ============================================================================ -% v5.6 DRAFT --- fresh draft per planning/rewrite-plan.md v5.6. -% This file is the active draft (renamed from main_v55.tex during the -% 2026-04-21 repo reorg); archive/main_old.tex is the v5.4-era pre-pivot -% reference. Body sections drafted chapter-by-chapter per Execution Order -% Sessions 2-10. Appendices carried from archive/main_old.tex per Appendix -% Handling table; see TODO comments on each appendix for disposition. 
-% ============================================================================ - -% ============================================================================ -% ABSTRACT -% ============================================================================ -\begin{abstract} -Agent research publishes reported numbers --- accuracy, cost, -wall-clock time, gradient update --- without publishing the -rollouts those numbers were computed from. Two problems -follow. First, numbers are produced by a \emph{reporting -convention} applied to each rollout (a grading script, a -failure-handling rule, a loss function), and different -conventions applied to the same rollouts yield different -numbers. Because rollouts are discarded, these divergences -cannot be reconciled after the fact. Second, each research -community publishes only the aggregates its own conventions -compute, so replicating or extending a result along a dimension -the original harness ignored requires generating new rollouts -entirely. - -We audit 50 popular training and evaluation repositories and -consolidate 37 documented cross-harness variance pairs across -task-success, cost, and timing metrics; none of the audited -harnesses surface failed rollouts alongside reported metrics. -The same pattern appears in gradient-signal computation -during training. - -We propose \emph{rollout cards}: an event-sourced publication -format in which the rollout is the artefact and any reporting -convention can be applied post-hoc. Two experiments evaluate -the proposal. The first assesses whether rollout cards let -one community recover another community's key metrics from -rollouts the original harness would have discarded. The -second reconciles the 15.6pp published SWE-agent/Agentless -gap on SWE-bench Verified, attributing approximately -\expnum{C TBD}pp to a single failure-classification -convention (\texttt{no\_generation} accounting) rather than -method differences. 
-\end{abstract} - -% ============================================================================ -\section{Introduction} -\label{sec:intro} -% ---------------------------------------------------------------------------- -% Session 2: Draft per Section Spec rows ``Sec.~1 opening'' + ``Sec.~1 contributions''. -% Beat: zoo-of-frameworks opening, HF-vs-Meta as single damage illustration -% (14.9pp gap between official and independent Llama 3.1 reproductions). -% Brief pointer to training-side / tracing depth without naming Sec.~2's specific -% 37-pair/50-repo numbers. Closes with three contributions bullets: -% (1) consolidated audit of transform disagreement (Sec.~2) -% (2) rollout-card proposal + Ergon reference implementation (Sec.~3) -% (3) two cross-community reanalysis case studies + cross-harness reconciliation (Sec.~4) -% Target: 1.0 page total. -% ---------------------------------------------------------------------------- - -Language-model capabilities have grown faster than the research -ecosystem evaluating them. The infrastructure that trains and -tests these models has fragmented into a zoo of benchmark -harnesses, RL trainers, agent frameworks, and observability -middleware, each with its own conventions for how a rollout -is recorded, how a headline result is computed, and what is silently -dropped along the way. A single evaluation now routes a model's -behaviour through a stack of four or five ecosystem choices -before producing the number that appears in a paper's headline -table. - -These choices are not cosmetic. Identical LLaMA-65B weights -score 63.7 on MMLU under the Berkeley evaluation code and 48.8 -under \texttt{lm-evaluation-harness}: a 14.9-point gap -attributable to prompt-template and answer-extraction -conventions alone \citep{beeching2023openllm, biderman2024}. -Neither pipeline stores the raw token traces a third party -would need to recompute the score under their own convention. 
- -The pattern extends beyond task-success metrics: cost figures -diverge by factors of two between tracing frameworks monitoring -the same API calls \citep{opentelemetry3163, langfuse12306}, -wall-clock measurements differ by more than 3$\times$ between -RL trainers running the same workload \citep{hu2025openrlhf}, -and training-side research has recently documented -gradient-signal differences between RL libraries applying -different loss-aggregation conventions to identical rollouts -\citep{liu2025drgrpo}. Where the underlying rollouts have been preserved, they -agree. The disagreement enters when each pipeline folds -those rollouts into a number; whichever convention the harness -happened to ship adjudicates it silently. - -We argue that the field has conflated two layers: -\emph{recording} (what the rollout captures) and -\emph{reporting} (aggregates computed from it). Current practice -publishes the aggregate and discards the rollout, which makes -every convention disagreement between harnesses a disagreement -without a tiebreaker. We propose that the rollout itself be -the published artefact. - -\paragraph{Contributions.} We make three contributions. - -\begin{itemize}\itemsep 2pt - \item We perform a systematic audit of 50 popular training - and evaluation repositories at file-and-line granularity - (\S\ref{sec:problem}). We pin every claim to a specific - commit SHA for two-click GitHub verification. - \textbf{We find that none of the 50 surface failed rollouts - alongside their reported metrics. Eleven silently drop - failures from their headline numbers, including FastChat - (the reference scoring code behind MT-Bench and Chatbot - Arena).} We consolidate 37 cases where the same workload - yields different published numbers, spanning up to 14.9pp - on task success (LLaMA-65B MMLU, Berkeley vs - lm-evaluation-harness), 2$\times$ on cost (Anthropic cached - tokens), and 3.13$\times$ on latency (GSM8K-GRPO epoch, - TRL vs OpenRLHF). 
- \item \textbf{We propose \emph{rollout cards}} - (\S\ref{sec:system}): a publication-format genre in the - Datasheets \citep{gebru2021datasheets} and Model Cards - \citep{mitchell2019modelcards} lineage, and implement them - as Ergon, a reference gym whose event-sourced format - $\tau_E$ projects into the canonical trajectory formats of - five research communities. Each projection ships an - explicit manifest of what it drops. - \item We evaluate the proposal with two experiments - (\S\ref{sec:validation}). First, we show that we can record - rollouts under one community's native convention and - re-analyse them under another community's (MiniF2F - binary-grading rollouts re-analysed as MCTS tree-search - quantities; Research Rubrics scalar-score rollouts as MAS - per-agent role differentiation). \textbf{Second, we ingest - the published SWE-agent and Agentless submissions to - SWE-bench Verified, re-grade both under a uniform - convention, and find that their 15.6pp published gap - decomposes into \expnum{C TBD}pp of harness convention - (the treatment of no-generation outcomes) and - \expnum{M TBD}pp of residual method difference.} -\end{itemize} - -The remainder of the paper develops these contributions in -order: Section~\ref{sec:problem} the audit, -Section~\ref{sec:system} the format and its reference -implementation, Section~\ref{sec:validation} the two -experiments. - -% ============================================================================ -\section{Recording vs Reporting in the Current Ecosystem} -\label{sec:problem} -% ---------------------------------------------------------------------------- -% Sessions 3-4: Draft per Section Spec rows ``Sec.~2.0 opening'' + ``Sec.~2.1'' + ``Sec.~2.2''. 
-% Structure: -% Sec.~2.0 opening (0.25 page) --- roadmap paragraph with first-use glosses for -% "transform"/"operator" and "recording vs reporting" per Terminology Glossary -% Sec.~2.1 Transform disagreement across the ecosystem (1.25 pages) --- gap-first: -% headline divergences (three-number hook: LLaMA-65B MMLU 14.9pp, -% Anthropic cached-token 2$\times$, TRL vs OpenRLHF 3.13$\times$) with vendor- -% acknowledgement framing, omnibus table (5-7 flagship rows across three -% metric families, full 37-pair in Appendix B), 50-repo audit as mechanism -% with three score-3 callouts + Inspect AI positive control, training-side -% close with Dr. GRPO + verl #2165 (honesty discipline: these are transform -% differences on identical rollouts, not rollout-format disparities). -% Sec.~2.2 Different communities record different things (0.5 page, ~250 words) -% --- prose illustration with 5 communities, dual-purpose framing -% (illustrative + specifically what Ergon ships projections for), -% closing bridge sentence (~40 words) naming 5 concrete needs. -% -% Drafting source: SURVEY\_master.md for Sec.~2.1 numbers, community\_recording\_needs.md for Sec.~2.2 prose. -% Target: 2.0 pages total. -% ---------------------------------------------------------------------------- - -Two observations recur across the agent research ecosystem. -First, the field disagrees on reported numbers computed from -what are, or could be, identical rollouts: pass rates on the -same model weights, cost totals on the same API calls, and -wall-clock times on the same training runs all differ, -sometimes substantially, depending on which \emph{reporting -convention} (grading script, aggregation rule, failure-handling -rule, timing rule, loss function) a harness applies to the -rollout. Second, -different research communities want to ask different questions -of their rollouts, and the canonical trajectory formats each -community publishes reflect which question the community is -asking. 
Taking the two together, \emph{recording} (what the -rollout captures) and \emph{reporting} (aggregates computed -from it) are distinct layers that should be published -separately. - -\subsection{Convention disagreement across the ecosystem} -\label{sec:problem:variance} - -We consolidate 37 cross-harness variance pairs where the same -workload yields different published numbers. The pairs span -three metric families (task success, cost/tokens, -latency/timing) and hold at every layer of the infrastructure -stack; in several cases the disagreement is already documented -by the infrastructure projects themselves. - -On task success, HuggingFace's Open LLM Leaderboard post-mortem -documents a 14.9-point gap on LLaMA-65B MMLU between the -Berkeley evaluation code (63.7) and \texttt{lm-evaluation-harness} -(48.8): identical model weights, identical benchmark split, -different prompt-template and answer-extraction conventions -\citep{beeching2023openllm}. -On cost/tokens, OpenTelemetry PR \#3163 formalises a convention -disagreement between the OpenAI/Vertex token-accounting model -(cached tokens counted inside \texttt{input\_tokens}) and -Anthropic's (cached tokens separated into -\texttt{cache\_read\_input\_tokens} + -\texttt{cache\_creation\_input\_tokens}); the same API request -under the two conventions produces published token counts -differing by a factor of 2.0$\times$, with the delta affecting -downstream cost estimators that read \texttt{input\_tokens} as -canonical \citep{opentelemetry3163}. On latency/timing, a -head-to-head between two RL training frameworks reports -5{,}189s/epoch (TRL) versus 1{,}657s/epoch (OpenRLHF) on -GSM8K-GRPO with identical hardware, identical model, and -identical algorithm; \citet[Table 4]{hu2025openrlhf} attribute -the 3.13$\times$ gap to framework-level dispatch conventions -rather than algorithmic difference. - -Several of these findings are not new. 
HuggingFace's -post-mortem names the MMLU variance explicitly -\citep{beeching2023openllm}. NVIDIA NeMo's documentation states -that simple-evals GPQA and \texttt{lm-evaluation-harness} GPQA -are ``distinct, non-comparable metrics'' \citep{nvidiaNemoGPQA}. -OpenTelemetry PR~\#3163 exists because the cache-token -disagreement was reaching production tracing infrastructure -\citep{opentelemetry3163}. These acknowledgements live -scattered across post-mortems, vendor documentation, and -infrastructure-project pull requests; the field has not -consolidated them or recognised them as instances of the same -phenomenon. Table~\ref{tab:omnibus} samples six flagship pairs -across the three families; the full 37-pair catalogue is in -Appendix~\ref{app:variance-catalogue}. - -\begin{table}[h] -\centering -\small -\setlength{\tabcolsep}{5pt} -\caption{Flagship cross-harness variance pairs across three -metric families. Six of 37; the full catalogue with per-pair -sources is in Appendix~\ref{app:variance-catalogue}. 
$\Delta$ -is the gap between published numbers on an identical workload; -the SWE-bench Verified Docker/Modal gap is described by the -harness authors as ``systematic'' rather than a single-number -delta \citep{swebenchHarness}.} -\label{tab:omnibus} -\begin{tabular}{@{}lllll@{}} -\toprule -\textbf{Family} & \textbf{Setup} & \textbf{Harness A} & \textbf{Harness B} & \textbf{$\Delta$} \\ -\midrule -Task success & LLaMA-65B MMLU 5-shot & Berkeley (63.7) & lm-eval-harness (48.8) & \textbf{14.9pp} \\ -Task success & Mistral/Mixtral MMLU & best prompt template & worst prompt template & up to \textbf{24.6pp} \\ -Cost/tokens & Anthropic cached API call & OTel input-inclusive & Anthropic separated & \textbf{2.0$\times$} \\ -Cost/tokens & Sonnet 3.5 on Aider & Edit benchmark (\$0) & Refactor+Polyglot (\$14.41) & \textbf{\$14.41} \\ -Latency/timing & SWE-bench Verified & Docker backend & Modal backend & ``systematic'' \\ -Latency/timing & GSM8K-GRPO, identical HW & TRL (5{,}189s/epoch) & OpenRLHF (1{,}657s/epoch) & \textbf{3.13$\times$} \\ -\bottomrule -\end{tabular} -\end{table} - -Each pair above is the default behaviour of a widely-used -harness applied to widely-used models on widely-used -benchmarks, produced without misconfiguration. Where rollouts -have been preserved, they agree. The disagreement enters at -the heuristic each harness applies at reporting time, and the -code paths producing it default to silent behaviour. We audited -50 popular training and evaluation repositories at -file-and-line granularity, pinning each to a specific SHA during -the April 2026 audit window and scoring its behaviour against five failure -scenarios (rollout exception during generation; -reward-function exception or \texttt{None}; zero-length or -unparseable output; mid-rollout environment failure; process -killed mid-rollout). 
Of the 50 repositories, \textbf{none -default to surfacing failures alongside metrics}: 11 exhibit -catastrophic silent dropping, 31 drop silently, and 9 log -failures but exclude them from the denominator with no -per-category counter. Four score-3 cases illustrate the pattern. FastChat, the -reference implementation behind two of the most-cited public -LLM evaluation numbers (MT-Bench and Chatbot Arena), sets -\texttt{rating = -1} on judge parse-failure and then filters -\texttt{df[df["score"] != -1]} before computing the per-model -mean and Elo, so a model that reliably produces unparseable -outputs appears to have no score rather than a bad -one.\footnote{\texttt{fastchat/llm\_judge/common.py:L175-L187}; -\texttt{show\_result.py:L20, :L49}; -\texttt{elo\_analysis.py:L49-L92} at SHA \texttt{587d5cf}.} - -Three further cases in the same mould. -\texttt{openai/evals} retries API errors indefinitely -(\texttt{backoff.on\_predicate} with no -\texttt{max\_tries}), so any task that does not time out -eventually reports -success.\footnote{\texttt{evals/utils/api\_utils.py:10-22} at -SHA \texttt{8eac7a7}.} PrimeIntellect's \texttt{verifiers} -coerces reward-function exceptions to \texttt{0.0} and -includes them in the group-mean advantage, poisoning the -gradient for every other rollout in the -group.\footnote{\texttt{rubric.py:144-158, :208, :217, :325-333} -at SHA \texttt{e27633b}.} \texttt{ToolBench} promotes a -fraction of \texttt{give\_up\_node} trajectories to -\texttt{valid\_data=True} at random, and SFT for -\texttt{ToolLLaMA} trains on the promoted -failures.\footnote{\texttt{DFS.py:84-91}; -\texttt{preprocess\_toolllama\_data.py:44-45} at SHA -\texttt{d56fdd8}.} - -The best-engineered harness in the survey, -Inspect AI, defaults to \texttt{fail\_on\_error=True} and -emits per-sample error counters. 
Under -\texttt{fail\_on\_error=False} (common for long runs) its -denominator becomes non-errored samples with no per-category -counter.\footnote{Inspect AI at SHA \texttt{36231d6}.} Even -the positive control falls into silent bias under real-world -usage pressure. - -The pattern extends to training-side computation as well, on -identical rollouts: \citet{liu2025drgrpo} documents +7.3 to -+15.7pp swings on AIME~2024 from a single loss-aggregation -convention, and verl issue~\#2165 \citep{verl2165} documents a -tokenization-channel divergence on Qwen3-4B GRPO reproduced -independently by six users. Neither is a rollout-format -disparity; both would be auditable if the rollouts were -published. We defer the full citation context to -Sec.~\ref{sec:related}. - -\subsection{Different communities record different things} -\label{sec:problem:communities} - -Convention disagreement does not happen in a vacuum. Five -research communities, five different canonical trajectory -formats, each shaped by the questions that community's central -analyses need to ask. \emph{Long-horizon LLM-agentic -RL} records multi-turn interactions as flat token-concatenated -rollouts with per-turn reward attribution, because the -community's central analyses are defined over exactly that -structure: LOOP's learned behaviours (consulting documentation, -minimising confabulation, recovering from setbacks) -\citep{chen2025loop} and RAGEN's Echo Trap and reward-variance -cliffs \citep{wang2025ragen}. \emph{Fixed-role multi-agent -systems} record per-agent streams with stable identities, -because MAST's 14-failure-mode taxonomy over 1{,}642 annotated -traces \citep{cemri2025mast} is computed against per-agent -attribution and full conversation histories, and its most -prevalent category (``specification and system design,'' -41.8\% of failures) turns on per-agent role identification. 
-\emph{Hierarchical and macro-action RL} records step-indexed -trajectories with explicit option-termination events, because -deliberation cost, option domination, and premature -termination \citep{bacon2017optioncritic} are functions over -those events. - -\emph{Recursive language models} record the agent-call tree -with per-node message histories and typed delegation -primitives, because ReDel's overcommitment/undercommitment -diagnostics \citep{zhu2024redel} are functions over the tree's -shape. \emph{MCTS-based LLM training} records the search tree -with per-node visit counts, Q-values, and abandoned branches, -because rStar-Math's process preference model -\citep{rstarmath2025} trains on preference pairs constructed -from exactly those abandoned branches. These five communities -are illustrative, not exhaustive: interactive theorem proving, -agentic RAG, robot learning, and embodied agents ask their own -questions of their rollouts. We focus on these five because -Ergon specifies projections for them, with the implemented -subset used in \S\ref{sec:validation} and the remaining -projections documented as extension points in -Appendix~\ref{app:integrations}. We expect the same pattern to -hold for communities whose projections we have not yet specified. - -A shared substrate must therefore preserve what these examples -point to: per-agent identity, containment and termination -structure, tool-call content with reasoning channels, -experimental and environmental context, and first-class -failure recording. It must do so while supporting the -analyses different communities want to run post-hoc. - -% ============================================================================ -\section{Ergon: Recording as a Shared Substrate} -\label{sec:system} - -\begin{figure*}[t] -\centering -\includegraphics[width=\textwidth]{figures/ergon_dashboard.pdf} -\caption{One sample from a rollout card, viewed through the -Ergon dashboard. 
A $\tau_E$ record is a bundle of seven -JSONL/JSON artefacts on disk. \emph{Top:} the sample's -task-execution DAG with per-worker swim lanes and -manager-agent interventions as annotations. \emph{Bottom:} -three reducer views ($\pi_{\text{step}}$, -$\pi_{\text{per-agent}}$, $\rho_{\text{leaderboard}}$) with -their drops manifests (\S\ref{sec:system:drops}), stream -excerpts, the rubric decomposition producing the reported -scalar, and on-disk stream sizes. The dashboard is one -consumer of the format; any $\tau_E$-conformant archive loads -(\S\ref{app:system:format-spec}).} -\label{fig:dashboard} -\end{figure*} - -% ---------------------------------------------------------------------------- -% Sessions 5-6: Draft per Section Spec rows ``Sec.~3 opening'' + ``Sec.~3.1'' + ``Sec.~3.2'' + ``Sec.~3.3''. -% Structure: -% Sec.~3 opening (~45 min, folded into Sec.~3.1) --- Ergon as reference implementation -% of rollout cards, pick up Sec.~2.2's named needs through design exposition -% Sec.~3.1 $\tau_E$ definition (0.75 page) --- event-sourced trajectory structure, each -% field introduced with motivation mapped to Sec.~2.2's named needs and -% community citations from community\_recording\_needs.md -% Sec.~3.2 Projections and drops manifests (0.75 page) --- projections as transforms -% applied post-hoc, drops manifest as methodology contribution, -% trainer-adapters-as-transforms as training-time generalisation -% Sec.~3.3 NEW --- Rollout cards as community standard (0.5 page) --- genre -% positioning: Datasheets / Model Cards / Evaluation Cards precedent, -% rollout cards as category, Ergon as reference implementation, -% downward-compatibility claim, opt-in adoption path. -% -% Drafting source: community\_recording\_needs.md for Sec.~3.1 field motivations. -% Appendix C carries the schema detail pushed out of Sec.~3.1 body. -% Target: 2.0 pages total. 
-% ---------------------------------------------------------------------------- - -% v5.6.1 NOTE (Session 11 TODO): The Ergon-internal exposition that -% previously lived here as §3 opening, §3.1 "The Ergon trajectory", and -% §3.2 "Projection operators and drops manifests" has been moved to: -% - Appendix C §C.1 "The τ_E trajectory representation" (the tuple -% τ_E = (N, E, C, A, M), the five-component design exposition -% mapping to §2.2 needs, the nine-value mutation vocabulary, -% apply_mutation() mechanism, synchronous-transactional durability, -% concurrency-recorded-faithfully property, bounded-loss-under-crash -% property) -% - Appendix C opening paragraph (projection formalism π: τ_E → (T_π, D_π), -% typed-vocabulary erasure types, π-preserved formalism, -% "Trainer adapters are projections" paragraph with TRL/VERL/OpenRLHF) -% Session 11 appendix work drafts these in full per the v5.6.1 plan. -% -% The body now proceeds directly from §3 section header into §3.1 -% "What a rollout card contains" (format spec), §3.2 "Reporting what -% was dropped" (drops-manifest methodology), §3.3 "Ergon: a reference -% implementation" (short declarative summary), §3.4 "Rollout cards as -% a community standard" (genre positioning). - -Section~\ref{sec:system} develops the proposal in three parts. -Section~\ref{sec:system:format} specifies what a rollout card -contains: the streams, the vocabulary, and the extensibility -rules. Section~\ref{sec:system:drops} introduces \emph{drops -manifests}, the methodology for reporting what each reduction -of a rollout to an aggregate discarded. -Section~\ref{sec:system:ergon} positions rollout cards in the -Datasheets and Model Cards lineage and summarises Ergon as a -reference implementation. Full system details (schema, -mutation log, trainer adapters, crash durability) are in -Appendix~\ref{app:system}. 
- -\subsection{What a rollout card contains} -\label{sec:system:format} - -A \emph{rollout card} is the portable, publication-ready -serialisation of one completed run: a self-describing archive -of JSONL streams plus a manifest and an optional blob -directory, readable by a downstream consumer without access to -the producing runtime. This is the \emph{record} side of -\S\ref{sec:problem}'s recording-vs-reporting decoupling; -\S\ref{sec:system:drops} describes the \emph{reporting} side. -Figure~\ref{fig:dashboard} shows one sample from a rollout -archive, viewed through the Ergon dashboard -(\S\ref{sec:system:ergon}); its panels preview the format's -components before we name them. The bundle is -medium-independent --- distributable as a directory, a zip, or -a HuggingFace dataset, and implementable over any storage -backend that preserves the row semantics of -Appendix~\ref{app:system:format-spec}. - -Each stream addresses one of the five recording needs of -\S\ref{sec:problem:communities}. The event stream records -the reasoning-and-action trace on a monotonic per-task-execution -sequence axis as a typed vocabulary of events (full payload -union in Appendix~\ref{app:system:postgres}). -The node stream is the containment tree of task-execution -nodes, each carrying a parent pointer, a worker identity stable -across the node's lifetime, and a status drawn from an ontology -declared per-run in the annotation namespace. -The edge stream holds the dependency DAG separately from the -parent-child tree, so fire-and-forget delegation and -sibling-diamond dependencies are both representable. -The annotation stream is a namespace-keyed versioned store -that reserves one namespace for task payloads and leaves all -others to experiment code, projections, or external -instrumentation. 
-The mutation stream records structural change on the same -sequence axis as events, drawn from a closed nine-value -vocabulary (Appendix~\ref{app:system:mutations}); failures -appear here as status transitions rather than exceptions -routed around the recording path, the mechanism underneath -\S\ref{sec:problem:variance}'s 0-of-50 failure-surfacing -finding. -A run manifest carries metadata, a format version, and content -hashes; a blob store holds content-addressed overflow for -payloads above an inline size cap (default 64\,KB). The five streams cover the -community needs of \S\ref{sec:problem:communities} directly: -per-agent identity and failure attribution -\citep{cemri2025mast, wang2024naht}, structured reasoning-% -action traces \citep{wang2025ragen, chen2025loop}, containment -and termination semantics \citep{bacon2017optioncritic, -zhu2024redel}, and concurrent dispatch. - -The format is lossless: every row in the streams corresponds -directly to a row persisted during the run, and downstream -analyses read whichever streams and namespaces they recognise. -Extensibility is via namespaces --- a team studying cross-agent -tool negotiation can claim a \texttt{tool-negotiation} -namespace and record per-edge metadata that other consumers -simply ignore, and the card remains a valid card. Ergon ships -a reference exporter (\texttt{ergon export-rollout-card <run-id>}), a Pydantic validator -(\texttt{ergon validate-rollout-card <path>}), and publishes -one example card per demonstration of \S\ref{sec:validation} -as a HuggingFace dataset with the camera-ready codebase. - -\subsection{Reporting what was dropped} -\label{sec:system:drops} - -Publishing the rollout is half of the reconciliation story; -the other half is publishing what was discarded in going from -rollout to reported number. 
A \emph{drops manifest} is the -typed-vocabulary record accompanying any reduction of a -rollout to an aggregate: a list of entries naming the erasure -type, the rollout element it points at, and the reported -quantity the removal affects. We draw erasure types from a -small vocabulary covering the erasures Sec.~\ref{sec:problem} -documented in practice: failure exclusion, cache-accounting -convention, intermediate-reward erasure, concurrency collapse, -containment flattening, reasoning truncation, and attribution -loss. -Figure~\ref{fig:dashboard}'s reducer-views panel displays the -drops manifests for three projections over the sample shown: -$\pi_{\text{step}}$ records three erasure classes (concurrency -collapse, containment flattening, intermediate-reward erasure), -$\pi_{\text{per-agent}}$ records two, and -$\rho_{\text{leaderboard}}$ records five. -This addresses -Sec.~\ref{sec:problem:variance}'s diagnoses directly: 0 of 50 -audited repositories surface failures alongside metrics -because they do not record that failures were excluded; the 37 -cross-harness variance pairs disagree untraceably because -neither harness records which erasures its own convention -applied. Making reductions legible requires naming the -erasures. Drops manifests are additive to any reducer: a team -re-grading SWE-bench trajectories under their own convention -can ship a drops manifest without adopting Ergon, and a -reviewer auditing two harnesses' disagreement on the same -rollout can compare their drops manifests directly. Ergon's -projection operators (Appendix~\ref{app:projections}) are one -instantiation, shipping a drops manifest per target format; the -vocabulary itself is the proposal. - -\subsection{Ergon: a reference implementation} -\label{sec:system:ergon} - -We implement the format as Ergon, an async gym with a -persistent recording substrate. 
Ergon hosts existing agent -frameworks (LangGraph, CrewAI, AutoGen, Claude Code) via -lightweight adapters and feeds RL trainers (TRL, VERL, -OpenRLHF) through format projections that preserve the full -trajectory underneath. Ergon writes streams synchronously -under Postgres transactions; the -\texttt{ergon export-rollout-card <run-id>} command emits a -rollout card from any completed run, and the -Figure~\ref{fig:dashboard} dashboard is a reference reader -over any $\tau_E$-conformant archive. Ergon is one -implementation of the format: any framework matching the -specification (Appendix~\ref{app:system:format-spec}) can -publish rollout cards. Adoption is additive: a paper can ship -cards without changing its headline table, and later work can -re-analyse them under whatever convention is useful. The -paper still reports the scalar; it also publishes the rollout; -the convention that produced the scalar is separable, -inspectable, and replaceable. - -% ============================================================================ -\section{Experiments} -\label{sec:validation} - -% ---------------------------------------------------------------------------- -% Session 7 draft. Three subsections: -% 4.1 Research Questions --- RQ1 (cross-community reanalysis) + RQ2 -% (cross-harness reconciliation). Frames what §4 tests. -% 4.2 Experimental setup --- three setups: MiniF2F + Research Rubrics -% recorded natively; SWE-bench ingested from published submissions. -% Behavioural quantities compressed to a table. -% 4.3 Results and discussion --- RQ1 findings from MiniF2F + RR rollouts; -% RQ2 headline finding (convention accounts for ~4pp of 15.6pp gap); -% decomposition under Convention A and B. -% -% Placeholder numbers in \expnum{} pending: -% - Experiment A rollouts (MiniF2F + RR, flexible-decomposition agent) -% - Weekend 1 SWE-bench reconciliation pipeline -% -% §4.6 durability under faults --- REMOVED FROM BODY in v5.6.1. Moved to -% Appendix as Session 11 work. 
Body cites appendix briefly in §4.2. -% ---------------------------------------------------------------------------- - -\subsection{Research questions} -\label{sec:validation:questions} - -We evaluate the proposal with two experiments, each addressing -one of the two problems diagnosed in \S\ref{sec:problem}: -RQ1 targets the cross-community recording mismatch of -\S\ref{sec:problem:communities}, RQ2 targets the -convention-disagreement reconciliation of -\S\ref{sec:problem:variance}. Both use rollout cards as a -shared artefact against which we apply conventions post-hoc; -neither requires re-running agents. The experimental scope is a proof of concept. Two pairings in -RQ1 and one reconciliation in RQ2 are enough to establish that -the format is structurally richer than any single community's -canonical format. We do not claim that this two-pairing scope -exhausts the community pairs for which the format is -adequate. - -\paragraph{RQ1: Cross-community reanalysis.} Can a rollout -card recorded under one research community's native scorer be -consumed by a different community's scorer to recover a -quantity the original publication did not report? Positive -evidence would show that the format is rich enough to serve -communities whose questions differ from the recording -community's, directly from the rollout rather than by -re-generating trajectories under a new harness. - -\paragraph{RQ2: Cross-harness reconciliation.} Can -published trajectories from two harnesses reporting different -scores on the same benchmark be ingested into a common rollout -card format, re-graded under a uniform convention, and the -reported score gap decomposed into harness-attributable and -method-attributable components? Positive evidence would show -that rollout cards provide a substrate for making otherwise -untraceable reported-number disagreements auditable. - -\subsection{Setup} -\label{sec:validation:setup} - -The experiments use three task families. 
For the first two -(MiniF2F and Research Rubrics) we run a flexible-decomposition -agent natively in Ergon to produce rollouts; these exercise -RQ1. For the third (SWE-bench Verified) we ingest rollouts from -two published submissions (SWE-agent and Agentless, both on -GPT-4o); this exercises RQ2. The two experiments draw on six -behavioural quantities, each a canonical analysis of one of the -five communities of Sec.~\ref{sec:problem:communities}; -Appendix~\ref{app:quantities} catalogues the full set and we -compute three in \S\ref{sec:validation:results}. Ergon commits -every mutation synchronously -(Appendix~\ref{app:system:mutations}), so a mid-rollout crash -leaves the run graph in a well-defined partial state. - -\paragraph{Cross-community reanalysis (MiniF2F, Research -Rubrics).} A single \emph{flexible-decomposition agent} runs -on both task families with a shared backbone -(\expnum{model\_target TBD}) and a turn cap of -\expnum{N TBD}. Its system prompt describes four -subtask-decomposition tools (spawn, cancel, wait-on, and -report-result) mechanically, with no task-specific strategic -guidance, so observed decomposition structure comes from the -agent rather than the prompt. Ergon records trajectories -directly as rollout cards (Sec.~\ref{sec:system:format}). - -\textbf{MiniF2F} ($\sim$30 problems, Lean~4 theorem proving -\citep{zheng2022minif2f}) uses \texttt{lean\_repl} / -\texttt{lean\_check} and binary grading. -\textbf{Research Rubrics} ($\sim$30 questions, long-horizon -open-ended research) uses \texttt{web\_search} / -\texttt{read\_document} with rubric-weighted GPT-4o-mini -evaluation. - -For RQ1 we pair each task family with a community whose -native scorer was \emph{not} the one the task's designers had -in mind. 
MiniF2F, which we record for binary proof-success, -pairs with the MCTS-based training community -\citep{rstarmath2025, feng2024restmcts}, whose quantities -include abandonment ratio by tree depth and sub-goal -semantics at abandoned nodes. Research Rubrics, which we -record for scalar rubric scores, pairs with the fixed-role -MAS community \citep{cemri2025mast}, whose quantities include -per-agent role specialisation; we capture it via -sibling-embedding distance across workers the agent spawns -from the same parent. In each pair, the target quantity is -one the source scorer discards but the recorded rollout -carries. - -\paragraph{Cross-harness reconciliation (SWE-bench -Verified).} The paper's motivating gap is SWE-agent's 23.2\% -vs.\ Agentless's 38.8\% on GPT-4o-2024-05-13 (15.6pp). Both -submissions publish complete trajectories through -\texttt{swe-bench-submissions}; we ingest each into a rollout -card and re-grade under a uniform convention. The ingestion -is the main engineering step: SWE-agent ships per-instance -JSON with a flat conversation history and a denormalised -\texttt{trajectory} array, Agentless ships per-instance -Python-logger plaintext with -\texttt{ChatCompletion(\ldots)}~reprs across a seven-stage -pipeline. Each ingestion produces a drops manifest -(Appendix~\ref{app:reconciliation}) naming what the source -format could not carry, exercising the methodology of -Sec.~\ref{sec:system:drops}. - -One convention choice matters enough to foreground: the -treatment of \texttt{no\_generation} outcomes, where the -harness records no agent submission. SWE-agent produces 50 -such outcomes (10\% of Verified); Agentless produces 4 -(0.8\%). We report under two conventions. -\emph{Convention A} excludes no-generation cases from the -denominator (isolating attempts from non-attempts); -\emph{Convention B} includes them as zeros (what the -published headlines implicitly use). 
The choice surfaces a -harness-level decision that neither published paper's -headline makes visible. - -\subsection{Results} -\label{sec:validation:results} - -Results come in three parts: RQ1a recovers MCTS-community -quantities from MiniF2F rollouts, RQ1b recovers MAS-community -quantities from Research Rubrics rollouts, and RQ2 decomposes -the SWE-agent/Agentless gap on SWE-bench Verified. - -\paragraph{RQ1a: MiniF2F $\to$ MCTS-community quantities.} -We record \expnum{N TBD} MiniF2F rollouts under binary grading -and recover abandonment ratios as a function of containment -depth: at depth~1 decompositions, -\expnum{A\_1 TBD}\% are abandoned before completion; at -depth~2, \expnum{A\_2 TBD}\%; at depth~3, \expnum{A\_3 TBD}\%. -Clustering the assistant-text content at abandoned -nodes yields \expnum{K TBD} distinct sub-goal categories -(e.g.\ \expnum{category descriptions TBD}), matching the -negative-preference training signal rStar-Math's PPM uses -\citep{rstarmath2025}. We cannot derive either quantity from -the binary pass/fail the MiniF2F headline reports; we compute -both post-hoc from the rollout card's node and event streams -without re-running any proofs. - -\paragraph{RQ1b: Research Rubrics $\to$ MAS-community -quantities.} We record \expnum{N TBD} Research Rubrics -rollouts under rubric-score grading and compute per-agent -sibling embedding distance: \expnum{D TBD}\% of sibling pairs -have cosine distance above \expnum{threshold TBD}, indicating -role differentiation in the sense of MAST's duplicate-agent -failure mode \citep{cemri2025mast}. The rubric-scored headline -reports a scalar per question; we can compute the -role-specialisation analysis from the same rollouts, and -\expnum{Q TBD} of \expnum{N TBD} exhibit the duplicate-agent -pattern invisible in the rubric number. 
Neither -task family was designed for the community whose quantities -we recover, so we claim only that the relevant quantities are -\emph{computable} from the rollouts (which current practice -of publishing community-native aggregates denies), not -numerical parity with a native harness. - -\paragraph{RQ2: Cross-harness reconciliation on SWE-bench.} -Approximately \expnum{convention TBD}pp of the 15.6pp -published SWE-agent/Agentless gap on SWE-bench Verified -resolves to a single harness convention --- the treatment of -\texttt{no\_generation} outcomes, where SWE-agent produces 50 -such instances (10\% of Verified) and Agentless produces 4; -the remaining \expnum{M TBD}pp is method-attributable. Neither -publication's reported number makes this classification choice -visible. Table~\ref{tab:reconciliation} decomposes the gap: -under Convention~B (the published convention, treating -no-generation as zero) the re-graded gap is -\expnum{gap\_B TBD}pp; under Convention~A (excluding -no-generation from the denominator) the remaining -gap is \expnum{gap\_A TBD}pp, attributable to how each harness -elicits patches. We take the decomposition itself to be the -primary deliverable: a gap that published as a pure method -difference turns out to have a substantial harness-convention -component, invisible without the rollout-card substrate. - -\begin{table}[h] -\centering -\small -\caption{Decomposition of the 15.6pp published gap between -SWE-agent and Agentless (GPT-4o-2024-05-13, SWE-bench -Verified). Under Convention~B, no-generation outcomes count -as zeros (the published default); under -Convention~A, they are excluded from the denominator. 
-Ingestion drops manifests are in -Appendix~\ref{app:reconciliation}.} -\label{tab:reconciliation} -\begin{tabular}{@{}lcc@{}} -\toprule - & \textbf{SWE-agent} & \textbf{Agentless} \\ -\midrule -Published score & 23.2\% (116/500) & 38.8\% (194/500) \\ -Re-graded, Convention~B & \expnum{X TBD}\% & \expnum{Y TBD}\% \\ -Re-graded, Convention~A & \expnum{X' TBD}\% & \expnum{Y' TBD}\% \\ -\midrule -Published gap & \multicolumn{2}{c}{15.6pp} \\ -Convention-attributable (B $\to$ A delta) & \multicolumn{2}{c}{\expnum{C TBD}pp} \\ -Method-attributable (under A) & \multicolumn{2}{c}{\expnum{M TBD}pp} \\ -\bottomrule -\end{tabular} -\end{table} - -A second observation cuts the other way. Agentless at SHA -\texttt{5ce5888} defaults missing regression-test results to -\texttt{[0]*10000} rather than surfacing them as failures -(Appendix~\ref{app:survey}, Patterns~P1 and~P4), so instances -where its own pipeline errors out are charged against the -method. The silent-drop bias therefore runs in the wrong -direction to absorb the convention gap: if anything, it -\emph{amplifies} the 15.6pp. The audit pins -\texttt{agentless/test/run\_regression\_tests.py} at the -named SHA, and a reader can verify the \texttt{[0]*10000} -default in two clicks. That SWE-agent JSON and Agentless -plaintext-log formats both ingest into the same target with -documented drops manifests (Appendix~\ref{app:reconciliation}) -shows the rollout-card format accommodates heterogeneous -sources, not only data recorded natively. - -% ============================================================================ -\section{Related Work} -\label{sec:related} -% ---------------------------------------------------------------------------- -% Session 9: Draft per Section Spec row ``Sec.~5 Related Work''. Target 0.5 page. -% Content: Datasheets / Model Cards / Evaluation Cards as direct genre -% precedent (anchored in Sec.~3.3). Transform-variance literature: Biderman et al. -% 2024 (prompt-template 24.6pp), Dr. 
GRPO, verl #2165 (tokenization-channel -% divergence). DAPO as secondary instance citation. Existing Sec.~5 content -% (Gym/Gymnasium, PettingZoo/JaxMARL, agent frameworks, recursive toolkits, -% schema-alignment efforts AgentOhana/VerlTool/Agent Lightning) carries -% through compressed. -% ---------------------------------------------------------------------------- - -We position rollout cards against four adjacent threads: the -documentation-standards genre they extend, convention-variance -documentation in evaluation and training, the -environment-and-framework landscape Ergon slots into, and -schema-alignment efforts at training-input time. - -\paragraph{Genre precedent.} Rollout cards sit alongside -Datasheets for Datasets \citep{gebru2021datasheets} and Model -Cards for Model Reporting \citep{mitchell2019modelcards}: -structured artefacts published alongside a research object to -make downstream use legible. Sec.~\ref{sec:system:format} -adapts the pattern to agent trajectories. - -\paragraph{Convention variance in adjacent spaces.} -\citet{biderman2024} find 24.6pp variance on LLaMA-7B MMLU -across three popular harnesses driven by prompt-template -choice. On the training side, \citet{liu2025drgrpo} document -+7.3 to +15.7pp on AIME~2024 from a single GRPO -loss-aggregation convention, and verl -issue~\#2165~\citep{verl2165} documents tokenization-channel -divergence on Qwen3-4B GRPO with six independent -reproductions. Those works identify and fix specific -conventions; the present work proposes infrastructure for -making future disagreements of this kind auditable before -they calcify. - -\paragraph{Environment and framework landscape.} -Gym~\citep{brockman2016gym} and -Gymnasium~\citep{towers2024gymnasium} are the canonical -precedent for a shared evaluation interface in RL; Ergon makes -the same move at a higher level of abstraction, standardising -recording, rollout execution, and trainer integration for -decomposed long-horizon work. 
-PettingZoo~\citep{terry2021pettingzoo} and -JaxMARL~\citep{rutherford2024jaxmarl} extend Gym to -fixed-population multi-agent with joint actions; dynamic -population, asynchronous dispatch, mid-rollout cancellation, -and non-tree dependency sit outside that paradigm. Agent -frameworks (LangGraph~\citep{langgraph2024}, -CrewAI~\citep{crewai2024}, AutoGen~\citep{wu2024autogen}, -Claude Code~\citep{claudecode2024}) and inference-time -recursive toolkits (ReDel~\citep{zhu2024redel}, -RLM~\citep{zhang2025rlm}, -AgentOrchestra~\citep{agentorchestra2025}) run under Ergon's -substrate with automatic recording. - -\paragraph{Schema alignment efforts.} Recent -surveys~\citep{yehudai2025agenteval, cemri2025mast, -yang2025agentprotocols} flag fragmentation across agent -trajectory formats, and unification efforts address it at -training time: -AgentOhana~\citep{zhang2024agentohana} standardises -trajectories into a unified training loader, -VerlTool~\citep{jiang2025verltool} aligns tool-integration -logic across agentic RL codebases, and Agent -Lightning~\citep{luo2025agentlightning} decouples LLM -generation from agent logic. Those proposals unify -\emph{training inputs}; we propose a \emph{publication -format}, with drops manifests -(Sec.~\ref{sec:system:drops}) supplying a per-format account -of what each canonical shape cannot represent. - -% ============================================================================ -\section{Discussion} -\label{sec:discussion} -% ---------------------------------------------------------------------------- -% Session 9: Draft per Section Spec row ``Sec.~6 Discussion''. Target 0.75 page. 
-% Content beats: -% (1) Policy paragraph --- publishing rollouts orders of magnitude cheaper -% than hosting models, CERN/genomics/astronomy precedents, opt-in -% per-researcher adoption -% (2) Honest-limitation paragraph --- rollout cards enable DETECTION not -% forced RESOLUTION; analogy to Datasheets/Model Cards (documentation -% standards don't force methodological convergence) -% (3) Dual-role community framing --- five Sec.~2.2 communities illustrate -% general claim AND are specifically the five we ship projections for; -% pattern generalises to ITP / agentic RAG / robot learning / -% embodied agents but concrete support requires further projections -% (4) Further limitations --- coverage representative, training-side -% citation-based not reproduced, case studies single-analyst -% (5) Future work --- transform-provenance tooling; shared transform -% libraries; reconciliation services; projections for additional -% communities. -% ---------------------------------------------------------------------------- - -We discuss four aspects of the proposal in turn: publication -cost and the adoption path, the distinction between detection -and resolution, scope limitations of the present work, and -future directions. - -\paragraph{Publication cost and adoption.} Publishing rollout -cards is orders of magnitude cheaper than hosting models: a -rollout card for a completed agent run is at most a few -megabytes of text and event records, where a model checkpoint -is tens to hundreds of gigabytes. The precedent for domain-wide -rollout publishing exists elsewhere: CERN Open -Data~\citep{cernopendata} releases detector-event records from -LHC experiments; genomic consortia (1000 Genomes, UK Biobank) -publish per-individual read-level data alongside aggregate -findings; astronomy surveys (SDSS, Gaia) publish -observation-level exposures with derived catalogues. In each -case the shared artefact is the observation, not only the -analysis. 
Agent research can make the same move. Adoption is -opt-in per researcher and requires no coordinated ecosystem -pivot: a paper that ships a rollout card alongside its headline -number loses nothing and gains auditability; a reader who wants -to reanalyse can do so against the rollout without asking for -the original harness. - -\paragraph{Detection over resolution.} Rollout cards let a -reader \emph{detect} convention disagreement. They do not by -themselves force a \emph{resolution}. A second researcher -applying a different convention to the same rollouts will -generally produce a different number than the original paper, -and we consider this a feature: the disagreement becomes -legible, because both conventions' drops manifests are -publishable and comparable, and reconcilable, because a -uniform convention can be applied to both. The Datasheets and -Model Cards analogy holds here too: documentation standards -make practice legible; they do not force methodological -convergence. We expect a field that adopts rollout cards to -have more transparent disagreements than it has today. - -\paragraph{Limitations.} Coverage is representative: we draw -five communities from a long tail, audit 50 repositories from -a much larger population, and select 37 variance pairs from -the subset where documentation made comparison possible. We -cite the training-side convention-variance evidence -(Dr.~GRPO, verl~\#2165) rather than reproduce it here. The -cross-community reanalysis is single-analyst: we pair each -task family with a single target community rather than a -multi-community panel. The cross-harness reconciliation -demonstrates the format on two published submissions on one -benchmark; we leave generalisation across benchmarks and model -variants to future work. -We claim the general pattern (conventions applied without -rollouts make disagreement untraceable) is robust across -these scope bounds, but not that every specific number -generalises. 
- -\paragraph{Future work.} Two directions. First, -convention-provenance tooling: a rollout card plus its drops -manifest is a consumable artefact for automated variance -analysis; a tool that ingests both and flags convention -divergences would make the SWE-bench finding of -Sec.~\ref{sec:validation:results} routine rather than ad hoc. -Second, training-side rollout cards: -Sec.~\ref{sec:problem:variance}'s training-side evidence -implies an analogue at the gradient-computation layer where -the loss and advantage rules play the role of the reporting -convention; we have not worked out what that looks like. - -% ============================================================================ -\section{Conclusion} -\label{sec:conclusion} -% ============================================================================ - -Agent research publishes reported numbers without the rollouts -those numbers were computed from, and the resulting coupling -between recording and reporting produces both cross-community -fragmentation and cross-harness variance that cannot be -reconciled after the fact. We proposed \emph{rollout cards}: -an event-sourced publication format in which the rollout -itself is the research artefact, and a later researcher can -apply any reporting convention (grading script, aggregation -rule, training loss) post-hoc. We introduced drops manifests as a -paper-level methodological concept accompanying any reduction -of a rollout to an aggregate, and presented Ergon as a -reference implementation at the intersection of the agent and -RL stacks. Two experiments demonstrated the proposal: -cross-community reanalysis on MiniF2F and Research Rubrics -rollouts showed that recorded trajectories carry quantities -the recording community's scorer discarded, and -cross-harness reconciliation on SWE-bench Verified showed that -a substantial fraction of a widely-published score gap -resolves to a harness convention invisible in the reported -numbers. 
If the field adopts the format, reported-number disagreements -become legible and auditable. That is the tractable -precondition we argue for: an ecosystem in which the field can -compare cross-community and cross-harness results on their -merits rather than on the conventions of their producers. - -% ============================================================================ -% BIBLIOGRAPHY -% ============================================================================ -\bibliographystyle{plainnat} -\bibliography{references} - -% ============================================================================ -% APPENDICES -% ============================================================================ -\appendix - - -% ---------------------------------------------------------------------------- -% Appendix A --- 50-repo Failure-Handling Survey. -% Source of record: ergon_survey/SURVEY_v4.md (commit-pinned edition, April 2026). -% Structure: selection, methodology, rubric, distribution, per-repo entries -% (one-liner each; top-11 score-3 with 2-3-sentence writeup), verification -% process, coverage snapshot. -% ---------------------------------------------------------------------------- - -\section{50-repo Failure-Handling Survey} -\label{app:survey} - -This appendix reports the code-level evidence underlying the -failure-surfacing claim in Sec.~\ref{sec:problem}. The final -sample contains 50 in-scope repositories selected by -category-stratified purposive sampling across the code paths by -which LLM-agent and LLM-training papers publish benchmark -numbers, training metrics, or leaderboard ranks. Each repository -was shallow-cloned at a pinned commit SHA during the April 2026 -audit window and scored against a seven-pattern failure-handling -rubric. Every source-level claim below is backed by a -commit-pinned GitHub permalink in the supplementary survey -document \texttt{SURVEY\_v4.md}; pinned SHAs are listed in -\S\ref{app:survey:shas}. 
- -\subsection{Repository selection} -\label{app:survey:selection} - -\paragraph{Sampling frame and operational scope filter.} -The survey uses \emph{category-stratified purposive sampling} -over the code pipeline through which an LLM-agent paper -publishes a benchmark number. A repository is in-scope if and -only if, at its pinned SHA, it contains code that (a)~is -actively maintained (last commit within the past 12 months), -(b)~produces a benchmark number, training-pipeline metric, or -leaderboard rank that a published paper would cite, and -(c)~implements the result aggregation itself rather than -delegating all aggregation to a separately-versioned downstream -harness. Star-count was not used as a threshold; verified -in-scope repositories range from $\sim\!150$ stars (SciCode -164, MLAgentBench 320) to $\sim\!40$k+ (FastChat, aider), -demonstrating that the in-scope/canonical-harness filter -dominates any star-based floor. Operational scope is enforced -by two exclusions, documented as examples rather than -concealed: Online-Mind2Web was on the candidate list but -excluded after repeated clone errors prevented SHA pinning; -OpenHands was spot-checked (SHA \texttt{3b17f27}) and -documented OUT-OF-SCOPE because its evaluation harness lives in -a separately-versioned repository -(\texttt{github.com/OpenHands/benchmarks}) and the main repo -contains no result-aggregation, scoring, or denominator-handling -code. This is the operational scope filter working as intended: -OpenHands is the foundation layer (analogous to how PyTorch is -the foundation for training frameworks) and is correctly -excluded by criterion~(c). 
- -\paragraph{Sample composition.} -The 50 repositories are grouped into five strata chosen to cover -the major places where agent-research numbers are produced: -\emph{(i)} SWE-bench-family harnesses and agent scaffolds; -\emph{(ii)} RL and agentic-RL training frameworks; -\emph{(iii)} general evaluation harnesses, LLM-as-judge systems, -and leaderboard infrastructure; \emph{(iv)} web, GUI, mobile, -scientific-agent, and ML-engineering benchmarks; and -\emph{(v)} function-calling, multi-agent, and general -agent-scaffold repositories. Selection was driven by operational -role rather than popularity: a repository was included when it -contained aggregation, scoring, denominator handling, failure -handling, reward computation, or training-metric code that could -affect a number cited in a paper. - -\paragraph{Repository strata.} -\emph{(i)} SWE-bench-family harnesses and agent scaffolds -(SWE-bench, SWE-agent, mini-swe-agent, live-swe-agent, -SWE-bench\_Pro-os, SWE-smith, Agentless, aider); -\emph{(ii)}~RL training frameworks (TRL, verl, OpenRLHF, rllm, -slime, ART, Agent-R1, RAGEN, MARTI, MATPO, RL-Factory, -Trinity-RFT, ms-swift, verifiers, open-r1); \emph{(iii)}~general -eval harnesses and LLM-as-judge leaderboards (openai/evals, -simple-evals, SciCode, lm-evaluation-harness, lighteval, -inspect\_ai, FastChat/MT-Bench + Chatbot Arena, HELM, ragas, -deepeval, MTEB, promptfoo, BIG-bench); \emph{(iv)}~web/GUI, -mobile, and ML-engineering / scientific-agent benchmarks -(WebArena, VisualWebArena, OSWorld, android\_world, Mind2Web, -MLE-bench, MLAgentBench, ScienceAgentBench); and \emph{(v)}~% -function-calling and general multi-agent / scaffold repositories -(BFCL/gorilla, ToolBench, AgentBench, AgentBoard, tau-bench, -Self-rewarding-reasoning-LLM). The final sample is 50 in-scope -repositories, with two exclusions (Online-Mind2Web, OpenHands) -documented as scope-filter exemplars rather than concealed. 
- -\subsection{Audit methodology} -\label{app:survey:method} - -Each repository was shallow-cloned once into -\texttt{ergon\_survey/clones/\{repo\}} and pinned at \texttt{git -rev-parse HEAD}. The pinned SHA list in -\S\ref{app:survey:shas} is the verification baseline for every -permalink. Every citation in this appendix -resolves to -\texttt{github.com/\{owner\}/\{repo\}/blob/\{SHA\}/\{path\}\#L\{start\}-L\{end\}} -against that SHA. Each claim identifies the code path that -controls failure handling, denominator construction, reward -coercion, or result aggregation for a number the repository -can publish. Two representative per-repo reports are included -verbatim in the supplementary materials to illustrate the -template. - -\subsection{Severity rubric: seven patterns of silent drop} -\label{app:survey:rubric} - -Every repository was scored 0--3 against a rubric defined by -seven recurring code patterns: -\emph{(P1)} \texttt{None}-to-\texttt{0.0} reward coercion -(reward-function exception silently yields a numeric zero that -enters aggregation alongside genuine zero-reward rollouts); -\emph{(P2)} variance-based rollout filters that discard entire -rollout groups with zero std (collapse with the remaining group -in published aggregates); \emph{(P3)} -save-on-clean-exit patterns bypassed by -\texttt{KeyboardInterrupt} or early-exit cost-killers, removing -instances from the submission file rather than the denominator; -\emph{(P4)} survivor denominators --- reporting -\texttt{sum(scores) / len(scores)} over instances that reached -the aggregation step, with the pre-aggregation drop count not -preserved; \emph{(P5)} multi-turn N-destroys-1-to-N-1 patterns, -where a mid-rollout tool-call exception discards the entire -trajectory including its already-completed turns; \emph{(P6)} -LLM-judge regex-miss-to-fixed-score, where a judge returning -an unparseable response is coerced to a specific numeric score -indistinguishable from a legitimate one (e.g.\ \texttt{0.0}, 
-\texttt{min(choice\_scores)}, \texttt{NOT\_ATTEMPTED}); and -\emph{(P7)} outcome-as-ground-truth SFT filtering, where a -downstream supervised dataset is built from only those -trajectories that scored correctly on the biased harness, -propagating the bias across a model-generation boundary. A -repository scores 3 if a single pattern is catastrophic and -observable at the headline output (the failure changes the -number a paper would cite); 2 if one or more patterns are -present but the reader cannot tell from the published CSV; 1 if -failures are logged but excluded from the denominator with no -per-category counter; 0 if all failures are surfaced alongside -metrics. No repository in the survey defaults to score 0. - -\subsection{Score distribution} -\label{app:survey:distribution} - -\begin{table}[h] -\centering -\small -\caption{50-repo severity scores. No repository defaults to -surfacing failures alongside metrics. OpenHands is documented -OUT-OF-SCOPE (no in-repo aggregation harness; benchmarks live -in a separately-versioned repository) and not counted in the -50. 
Simple-evals appears in both score-3 (SimpleQA subset) and -score-1 (non-SimpleQA subset); the $11{+}31{+}9{=}51$ listings -correspond to 50 unique repositories.} -\label{tab:survey:distribution} -\begin{tabular}{@{}clp{0.55\linewidth}@{}} -\toprule -\textbf{Score} & \textbf{Count} & \textbf{Repositories} \\ -\midrule -3 & 11 & SWE-bench, openai/evals, simple-evals (SimpleQA), -VisualWebArena, ToolBench, Self-rewarding-reasoning-LLM, -PrimeIntellect verifiers, Trinity-RFT, ms-swift (GRPO), BFCL -(gorilla), FastChat (MT-Bench + Chatbot Arena) \\ -\addlinespace -2 & 31 & Agent-R1, MARTI, SWE-bench\_Pro-os, RAGEN, OpenRLHF, -TRL, ART, slime, tau-bench, rllm, live-swe-agent, SWE-agent, -mini-swe-agent, MATPO, verl, AgentBoard, AgentBench, -RL-Factory, SWE-smith, lm-evaluation-harness, WebArena, -MLAgentBench, MLE-bench, lighteval, Agentless, aider, Mind2Web, -open-r1, ragas, MTEB, promptfoo \\ -\addlinespace -1 & 9 & SciCode, inspect\_ai, ScienceAgentBench, android\_world, -OSWorld (harness), simple-evals (non-SimpleQA subset), HELM, -BIG-bench, deepeval \\ -\addlinespace -0 & 0 & --- \\ -\bottomrule -\end{tabular} -\end{table} - -Inspect AI (score 1) is the positive control referenced in -Sec.~\ref{sec:problem}: its default -\texttt{fail\_on\_error=True} surfaces per-sample errors, but -the commonly-used \texttt{fail\_on\_error=False} opt-in -degrades silently with no per-category counter -(\texttt{inspect\_ai/\_eval/task/error.py:L26} and -\texttt{inspect\_ai/scorer/\_metrics/accuracy.py:L33} at SHA -\texttt{36231d6}). - -\subsection{Per-repo entries} -\label{app:survey:entries} - -We summarise each repository's score with a one-line evidence -pointer; the eleven score-3 repositories receive a short -2--3-sentence writeup naming the specific code site(s) and -user-visible consequence. All paths are relative to the repo -root at the pinned SHA (\S\ref{app:survey:shas}). Full -permalinks are in \texttt{SURVEY\_v4.md}. 
- -\paragraph{Score-3 repositories (11).} - -\textbf{SWE-bench.} The grading loop aggregates over -\texttt{instances\_to\_run}, filtered before aggregation -(\texttt{swebench/harness/run\_evaluation.py:L465-L470}), while -empty patches are documented as filtered -pre-submission (\texttt{docs/faq.md:L39}). A container-level -infrastructure failure stays in the denominator as a fail, but -an empty-patch instance is removed from the denominator -altogether --- and no output field separates the two cases in -the published submission. - -\textbf{openai/evals.} -\texttt{backoff.on\_predicate} at -\texttt{evals/utils/api\_utils.py:L10-L22} retries transient -predicates indefinitely with no \texttt{max\_tries}, so long -runs silently absorb retry-exhaust events rather than surfacing -them. The MMMU eval treats errors as wrong answers -(\texttt{elsuite/mmmu/eval.py:L158}) and the modelgraded -classifier coerces \texttt{INVALID\_STR} to -\texttt{min(choice\_scores)} -(\texttt{elsuite/modelgraded/classify\_utils.py:L99-L101}): -three distinct P6 patterns in one library. - -\textbf{simple-evals (SimpleQA subset).} The regex -fallback at \texttt{simpleqa\_eval.py:L126} defaults unparseable -responses to \texttt{NOT\_ATTEMPTED}, which the headline metric -(\texttt{accuracy\_given\_attempted = correct / (correct + -incorrect)}) then structurally excludes -(\texttt{simpleqa\_eval.py:L169-L176}). A model that hedges on -every ambiguous question looks arbitrarily accurate under this -metric --- NOT\_ATTEMPTED tasks do not appear in the -denominator. - -\textbf{VisualWebArena.} An \texttt{assert "correct" -in response} at -\texttt{browser\_env/helper\_functions.py:L606} raises on -unparseable judge output, causing the outer try/except at -\texttt{run.py:L453-L461} to drop the entire task. Image-fetch -errors at \texttt{image\_utils.py:L44} propagate through the -same path. Dropped tasks are absent from the output, not -counted as failed. 
- -\textbf{ToolBench (9/9 \textsc{verified}).} The DFS -search code at \texttt{DFS.py:L84-L91} randomly promotes a -fraction of \texttt{give\_up\_node} trajectories to -\texttt{valid\_data=True}, and SFT preprocessing at -\texttt{preprocess/preprocess\_toolllama\_data.py:L44-L45} -filters on \texttt{valid\_data} alone --- so ToolLLaMA trains -on the promoted failures. This is the paper's paradigm example -of pattern~P7 (outcome-as-ground-truth SFT). - -\textbf{Self-rewarding-reasoning-LLM (4/4 -\textsc{verified}).} -\texttt{infer\_math/}\allowbreak\texttt{reward\_labeling.py:L1612-L1622} grants -correctness against the ground-truth answer on the -\emph{initial} chain-of-thought; self-correction attempts must -match that \texttt{first\_reward} to be counted -(\texttt{process\_prompt\_turn3.py:L24-L62}), with a hard cap -$N=3$. There is no \texttt{try/except} anywhere in the -generation module (\texttt{gen\_hf.py}): any infra failure -silently removes a rollout. - -\textbf{PrimeIntellect verifiers (5/5 -\textsc{verified}).} Reward-function exceptions at -\texttt{rubric.py:L144-L158} yield \texttt{0.0}, and group -reward-function exceptions at \texttt{rubric.py:L208,~L217} -yield \texttt{[0.0]*N}. The errored zeros are then included in -the group-mean advantage at \texttt{rubric.py:L325-L333} -(\texttt{avg\_reward = sum(aggregated\_rewards) / num\_states}; -\texttt{t["advantage"] = state["advantage"]}), so an -infrastructure error in one rollout biases the policy gradient -for the other rollouts in its group. - -\textbf{Trinity-RFT.} -\texttt{asyncio.TimeoutError} at -\texttt{trinity/explorer/scheduler.py:L214-L216} is packaged -into \texttt{Status(ok=False,~metrics=list(),~message=\ldots)}, -but the consumer path at \texttt{scheduler.py:L532-L602} -(\texttt{get\_results}) never inspects \texttt{status.ok} --- -errored payloads flow into training with \texttt{ok=False} -undetected. 
The over-rollout cancellation grace period -(\texttt{scheduler.py:L559-L572}, 30\,s default from -\texttt{config.py:L147}) means timeouts can also truncate -sub-tasks mid-run (\texttt{scheduler.py:L490-L506}) without -surfacing the partial-completion to the rollout log. - -\textbf{ms-swift (GRPO) (8/8 \textsc{verified}).} -None-rewards are lifted to NaN at -\texttt{swift/rlhf\_trainers/grpo\_trainer.py:L359, L367, -L379}, then aggregated with \texttt{.nansum(dim=1)} -(\texttt{grpo\_trainer.py:L473}); DAPO's -\texttt{max\_resample\_times} revert (\texttt{grpo\_trainer.py:% -L684-L733}) and \texttt{overlong\_filter} mask-zeroing -(\texttt{grpo\_trainer.py:L1123-L1129}) both silently remove -rollouts from the gradient. The LLM-judge path -(\texttt{rewards/rm\_plugin.py:L216-L226}) coerces regex-parse -failures to \texttt{0.0} (pattern P6). - -\textbf{BFCL / gorilla.} The weighted overall -accuracy formula in -\texttt{berkeley-function-call-leaderboard/bfcl\_eval/% -eval\_checker/eval\_runner\_helper.py:L509-L519} applies -$[10,10,10,30,40]$ weights over category runners that each -silently decrement \texttt{correct\_count} on parse failure -while leaving the denominator \texttt{len(model\_result)} -unchanged (per-category dispatch at \texttt{eval\_runner.py:% -L668}). \texttt{eval(func\_call)} exceptions become literal -\texttt{"Error during execution: \ldots"} strings -(\texttt{multi\_turn\_eval/multi\_turn\_utils.py:L97-L98}), -scored as a failed call without category attribution. - -\textbf{FastChat (MT-Bench + Chatbot Arena).} FastChat is the -reference implementation for two of the most-cited public LLM -evaluation numbers: MT-Bench (LLM-as-judge single-turn and -multi-turn scoring) and Chatbot Arena (pairwise Elo ranking). 
A -GPT-4 or Claude judge whose response fails to match the -\texttt{[[score]]} or \texttt{[score]} regex is coerced to -\texttt{rating = -1} (P6) at -\texttt{fastchat/llm\_judge/common.py:L175-L187}; MT-Bench -aggregation then filters \texttt{df[df["score"] != -1]} before -computing the per-model mean (P2+P4) at -\texttt{show\_result.py:L20}. The pairwise path is structurally -identical: parse-failure becomes \texttt{winner = "error"} -(\texttt{common.py:L282-L304}), filtered out before Elo -computation (\texttt{show\_result.py:L49}; -\texttt{elo\_analysis.py:L49-L92}). A rank swap driven by -differential parse-failure rates across judged models is not -recoverable from the headline MT-Bench or Arena Elo number --- -the attrition count is not emitted. At SHA \texttt{587d5cf}. - -\paragraph{Score-2 repositories (31, one-liner each).} - -\begin{description}[leftmargin=0pt,itemindent=0pt,labelindent=0pt,labelsep=0.4em,itemsep=2pt,topsep=2pt,parsep=0pt,font=\normalfont\bfseries] -\sloppy -\item[SWE-agent:] cost/kill \texttt{preds.unlink()} on -\texttt{early\_exit}/\texttt{None} removes instances from -\texttt{preds.json} -(\texttt{sweagent/run/run\_batch.py:L397-L401}). - -\item[mini-swe-agent:] \texttt{finally: save()} at -\texttt{src/minisweagent/run/}\allowbreak\texttt{benchmarks/swebench.py:L171} -is bypassed by \texttt{KeyboardInterrupt} (closed issue \#329); -streaming save at \texttt{default.py:L94}. - -\item[live-swe-agent:] configs never set \texttt{output\_path} -(\texttt{README.md:L71}). - -\item[SWE-bench\_Pro-os:] \texttt{None} return at -\texttt{evaluation/swe\_bench\_pro\_eval.py:L346-L349} is -collapsed to \texttt{False} by callers at \texttt{:L490-L505, -L571}. - -\item[AgentBench:] accuracy denominator is -\texttt{result.error is None} only -(\texttt{src/assigner.py:L368-L372}). - -\item[AgentBoard:] timeouts write to \texttt{error.txt} and -never \texttt{scores.append} -(\texttt{agentboard/tasks/webbrowse.py:L275-L279}). 
- -\item[tau-bench:] errors yield stubs with no retry -(\texttt{tau\_bench/run.py:L89-L96}). - -\item[SWE-smith:] broad \texttt{except} at -\texttt{scripts/collect\_trajs.py:L77-L79}; -\texttt{random.sample} cap at -\texttt{scripts/combine\_trajs.py:L86-L91}. - -\item[TRL:] \texttt{None}$\to$NaN$\to$\texttt{nansum} at -\texttt{trl/trainer/grpo\_trainer.py:L1228-L1230, L2132}. - -\item[verl:] PRIME reward timeouts$\to$\texttt{0.0} at -\texttt{verl/workers/reward\_manager/}\allowbreak\texttt{prime.py:L37-L42, L83}; -missing \texttt{return\_exceptions=True} at -\texttt{verl/experimental/agent\_loop/}\allowbreak\texttt{agent\_loop.py:L603} -(closed issue \#5956). - -\item[OpenRLHF:] remote RM -exception$\to$\texttt{reward=None} -(\texttt{openrlhf/utils/agent.py:L298-L299}; open issue -\#1139). - -\item[rllm (Berkeley Sky):] no-valid-trajectory episodes -dropped at \texttt{rllm/engine/}\allowbreak\texttt{agent\_workflow\_engine.py:L266}; -\texttt{dropped\_episodes} never written into -\texttt{DataProto.meta\_info} at \texttt{:L249} (open issue -\#382). - -\item[slime:] oversampling discard + zero-std filter at -\texttt{slime/rollout/sglang\_rollout.py:L449}. - -\item[ART:] \texttt{drop\_zero\_advantage\_trajectories=True} -default at \texttt{src/art/preprocessing/tokenize.py:L158}. - -\item[Agent-R1:] length-truncated trajectories published as -complete at -\texttt{agent\_r1/agent\_flow/agent\_env\_loop.py:L120-L129}. - -\item[RAGEN:] variance filter skips whole training step at -\texttt{ragen/trainer/}\allowbreak\texttt{agent\_trainer.py:L1054-L1056}. - -\item[MARTI:] dynamic filter drops saturated groups at -\texttt{marti/trainer/ppo\_trainer.py:L481-L510}. - -\item[MATPO:] MCP + judge failures$\to$reward 0.0 at -\texttt{verl/tools/mcp\_tool.py:L137, L150} and -\texttt{llm\_judge.py:L265, L312, L356}. 
- -\item[RL-Factory:] malformed tool -JSON$\to$\texttt{continue} at -\texttt{verl/workers/rollout/}\allowbreak\texttt{sglang\_rollout/}\allowbreak\texttt{sglang\_rollout.py:L914-L916}; -PRIME timeouts$\to$0.0 at \texttt{prime.py:L141-L146}. - -\item[lm-evaluation-harness:] left-truncate with -\texttt{eval\_logger.warning} only -(\texttt{lm\_eval/models/huggingface.py:L1360-L1368}; closed -issues \#3419, \#3352, \#3161, \#1323). - -\item[WebArena:] outer \texttt{except Exception: -log-and-continue} at \texttt{run.py:L217-L364}; headline -\texttt{sum(scores) / len(scores)} over survivors at -\texttt{run.py:L365}; \texttt{env.step} swallow with -\texttt{terminated=False} on infra failure at -\texttt{browser\_env/envs.py:L239-L248}. - -\item[MLAgentBench:] subprocess -crash$\to$\texttt{"EnvError"} string at -\texttt{low\_level\_actions.py:L181-L220} + -\texttt{environment.py:L328-L334}; hard-coded magic baselines -at \texttt{plot.py:L249-L274}; fresh random GT per call at -\texttt{fathomnet/eval.py:L11}. - -\item[MLE-bench:] aggregation \texttt{pad\_missing=False} -drops incomplete seeds at -\texttt{experiments/aggregate\_grading\_reports.py:L69-L135}; -\texttt{MedalInfo} logic at -\texttt{mlebench/grade\_helpers.py:L123-L133}. - -\item[lighteval:] LiteLLM returns empty -\texttt{LitellmModel}\allowbreak\texttt{Response()} on Azure -content-filter and retry-exhaust -(\texttt{litellm\_model.py:L243, L254}); empty response scored 0 -at \texttt{metrics\_sample.py:L151-L152}; left-truncate prompts -without \texttt{truncated\_}\allowbreak\texttt{tokens\_count} at -\texttt{vllm\_model.py:L374-L397}. 
- -\item[Agentless:] missing regression-test results default to -\texttt{[0]{*}10000} at -\texttt{agentless/test/run\_regression\_tests.py} (P1+P4), -collapsing absent results into a vector of 10k zeros; -\texttt{\_post\_process\_multifile\_repair} returns an empty -tuple on exception, and SWE-bench filters empty patches -pre-submission (\texttt{docs/faq.md:L39}), removing the -instance from the denominator. Directly relevant to -\S\ref{sec:validation}'s RQ2 reconciliation of the -SWE-agent/Agentless 15.6pp gap (at SHA \texttt{5ce5888}). - -\item[aider:] benchmark-runner broad \texttt{except} at -\texttt{benchmark/benchmark.py:L666-L676} stores the exception -as \texttt{\{"exception": \ldots\}} row; aggregation at -\texttt{:L502-L512} divides by \texttt{len(rows)}, so -exception rows contribute 1 to the denominator and 0 to the -numerator without surfacing the infra-flake count. - -\item[Mind2Web:] \texttt{IndexError} on out-of-bounds choice -index caught at -\texttt{src/action\_prediction/metric.py:L203-L209} -(\texttt{logger.info}-only), falls through to score-0 + kept in -denominator (P1); \texttt{src/action\_prediction/}% -\texttt{dataloader.py:L277} applies a train-only positive-% -candidate filter, creating a train/eval data-distribution -asymmetry (P7). - -\item[open-r1:] E2B broad \texttt{except} at -\texttt{src/open\_r1/rewards/code\_providers.py:L107-L112} -coerces sandbox exceptions to \texttt{[0.0]{*}len(scripts)} -rewards with print-only surface; MorphProvider identical -pattern at \texttt{:L248-L249}; IOI/Codeforces paths enter -TRL's \texttt{None}$\to$\texttt{nansum} pipeline via -\texttt{:L396-L397, L449-L450}. 
- -\item[ragas:] Pydantic parser fallback at -\texttt{ragas/prompt/pydantic\_prompt.py:L315-L334} (self-fix -loop raises \texttt{RagasOutputParserException}); executor -broad \texttt{except} returns \texttt{float("nan")} at -\texttt{ragas/executor.py:L64-L86}; \texttt{safe\_nanmean} at -\texttt{ragas/utils.py:L46-L55} silently shrinks the -denominator by the NaN count across faithfulness, answer-% -relevancy, and context-precision metrics. - -\item[MTEB:] leaderboard aggregate filters out models with any -NaN task score at \texttt{mteb/leaderboard/\_create\_table.py:% -L136-L149} (P4 selection bias); per-language aggregates at -\texttt{mteb/results/task\_result.py:L526-L563} compute over -produced scores with no attempt-vs-score coverage counter. - -\item[promptfoo (TypeScript):] any LLM-judge parse failure -coerced to \texttt{\{pass: false, score: 0\}} at -\texttt{src/matchers/llmGrading.ts:L499, L515} (P6); fail -helpers at \texttt{src/matchers/shared.ts:L25-L38} return the -same hardcoded score indistinguishably from legitimate zeros; -provider-error path at \texttt{src/evaluator.ts:L1046-L1051} -produces \texttt{score: 0} without a \texttt{providerErrored} -flag. -\end{description} - -\paragraph{Score-1 repositories (9, one-liner each).} - -\begin{description}[leftmargin=0pt,itemindent=0pt,labelindent=0pt,labelsep=0.4em,itemsep=2pt,topsep=2pt,parsep=0pt,font=\normalfont\bfseries] -\sloppy -\item[SciCode:] partial-submission aggregation at -\texttt{eval/scripts/}\allowbreak\texttt{test\_generated\_}\allowbreak\texttt{code.py:L125-L127}. - -\item[inspect\_ai:] \texttt{fail\_on\_error=True} by default -(\texttt{\_eval/task/error.py:L26}), but -\texttt{fail\_on\_error=False} opt-in degrades silently -(\texttt{scorer/\_metrics/accuracy.py:L33}). 
- -\item[ScienceAgentBench:] swallows -\texttt{EvaluationError/BuildImageError/Exception} -(\texttt{evaluation/harness/run\_evaluation.py:L118-L136}); -discards \texttt{timed\_out} flag (\texttt{:L113}); GPT-4o -judge regex-miss$\to$score 0 at -\texttt{gpt4\_visual\_judge.py:L70-L72}. - -\item[android\_world:] \texttt{np.nan} + -\texttt{DataFrame.mean} asymmetric drop at -\texttt{task\_evals/task\_eval.py:L249-L259} and -\texttt{android\_world/suite\_utils.py:L544-L558, L681-L698}. - -\item[OSWorld (harness):] bare-\texttt{except} ``Time -limit exceeded'' at \texttt{run.py:L205-L218}; -\texttt{env.evaluate()} \texttt{FileNotFoundError}$\to$0 at -\texttt{desktop\_env/desktop\_env.py:L485-L487}; -Verified-subset non-comparability disclaimer at -\texttt{README.md:L36}. - -\item[simple-evals (non-SimpleQA subset):] honest grading on -multi-choice, regex miss$\to$wrong on GPQA/MMLU (see -Appendix~\ref{app:variance-catalogue}, Smoking-Gun~\#2). - -\item[HELM:] \texttt{Stat.add(None)} silently skips None -results at \texttt{src/helm/benchmark/metrics/statistic.py:L35}; -judge parse returns \texttt{None} at \texttt{gpt4\_audio\_}% -\texttt{critique\_metrics.py:L70-L72} and is absorbed by the -\texttt{Stat.add(None)} skip. The per-scenario attrition rate -is recoverable from the full output JSON but is not in the -headline report (at SHA \texttt{83bde5c}). - -\item[BIG-bench:] aggregate metric computation at -\texttt{bigbench/api/results.py:L560-L576} uses -\texttt{statistics.mean(d[m] for d in per\_task if m in d)}, -silently shifting the denominator by the count of tasks that -did not produce metric \texttt{m}. Per-task results are -preserved in raw output so the shift is recoverable, but the -headline mean does not emit a coverage counter (at SHA -\texttt{092b196}). 
- -\item[deepeval:] \texttt{JSONDecodeError} on LLM-judge output -re-raised as \texttt{ValueError} at -\texttt{deepeval/utils.py:L405-L413} and caught broadly by the -test-runner; \texttt{safe\_a\_measure} at -\texttt{deepeval/progress\_context.py:L299-L305} catches any -metric-evaluation exception. Per-test failure messages are -surfaced but parse-failure rates are not aggregated into a -headline counter (at SHA \texttt{f917b5a}). -\end{description} - -\subsection{Verification} -\label{app:survey:verification} - -Every source-level claim in this appendix is tied to a pinned -commit SHA and a line-level permalink in the supplementary -survey document. Claims were retained only when the cited lines -directly supported the stated failure-handling behaviour. The -audit therefore fixes both the repository version and the exact -code location being asserted, making each entry independently -checkable without relying on moving default branches. - -\subsection{Pinned commit SHAs} -\label{app:survey:shas} - -Every permalink in this appendix resolves against the -following pinned commits. Repositories are grouped by survey -stratum; full 40-character SHAs and line-level permalinks are -provided in the supplementary survey document. - -\emph{SWE-bench-family harnesses and agent scaffolds.} -SWE-bench \texttt{f7bbbb2}; SWE-agent \texttt{0f4f3bb}; -mini-swe-agent \texttt{bc85a45}; live-swe-agent -\texttt{8d7dd86}; SWE-bench\_Pro-os \texttt{0c64e26}; -SWE-smith \texttt{9b74ac0}; Agentless \texttt{5ce5888}; -aider \texttt{f09d7065}. - -\emph{RL and agentic-RL training frameworks.} -TRL \texttt{88826fd}; verl \texttt{6eeb571}; OpenRLHF -\texttt{64c1cc4}; rllm \texttt{ea623cc}; slime -\texttt{5b688aa}; ART \texttt{679b236}; Agent-R1 -\texttt{38bdfc1}; RAGEN \texttt{20daedc}; MARTI -\texttt{a2fe2c7}; MATPO \texttt{3c41d62}; RL-Factory -\texttt{d0abc1d}; Trinity-RFT \texttt{9051d63}; ms-swift -\texttt{c4902f3}; verifiers \texttt{e27633b}; open-r1 -\texttt{1416fa0}. 
- -\emph{General evaluation harnesses and leaderboard infrastructure.} -openai/evals \texttt{8eac7a7}; simple-evals \texttt{ee3b031}; -SciCode \texttt{e3158ea}; lm-evaluation-harness -\texttt{c1c4bea}; lighteval \texttt{10b9104}; inspect\_ai -\texttt{36231d6}; FastChat \texttt{587d5cf}; HELM -\texttt{83bde5c}; ragas \texttt{298b6827}; deepeval -\texttt{f917b5a}; MTEB \texttt{9363ea75}; promptfoo -\texttt{3dc5843}; BIG-bench \texttt{092b196}. - -\emph{Web, GUI, mobile, scientific-agent, and ML-engineering benchmarks.} -WebArena \texttt{dce0468}; VisualWebArena \texttt{89f5af2}; -OSWorld \texttt{c7e54d2}; android\_world \texttt{d9c569f}; -Mind2Web \texttt{33bd95c}; MLE-bench \texttt{2451bcb}; -MLAgentBench \texttt{5d71205}; ScienceAgentBench -\texttt{1cf1375}. - -\emph{Function-calling, multi-agent, and general agent-scaffold repositories.} -gorilla (BFCL) \texttt{6ea5797}; ToolBench \texttt{d56fdd8}; -AgentBench \texttt{d1e4a10}; AgentBoard \texttt{bb7255e}; -tau-bench \texttt{59a200c}; Self-rewarding-reasoning-LLM -\texttt{372bea9}. The out-of-scope observation (OpenHands, -documented as a scope-filter exemplar) is pinned at -\texttt{3b17f27}. - -\subsection{Coverage snapshot} -\label{app:survey:coverage} - -The final sample contains 50 in-scope repositories and two -documented exclusions. It spans SWE-bench-family harnesses, -agent scaffolds, -RLHF trainers, agentic-RL trainers, multi-agent RL, web/GUI -agents, mobile agents, ML-engineering benchmarks, -scientific-agent benchmarks, function-calling benchmarks, -general eval frameworks, LLM-judge infrastructure, autonomous -coding assistants, RAG-eval frameworks, embedding/retrieval -benchmarks, red-team/prompt-eval tooling, and -rejection-sampling SFT pipelines. -The severity distribution is 11 score-3 entries, 31 score-2 -entries, 9 score-1 entries, and 0 score-0 entries. 
Because -\texttt{simple-evals} is scored separately for SimpleQA and -non-SimpleQA behaviour, these 51 entries correspond to 50 -unique repositories. The seven silent-drop patterns -cover reward coercion, rollout filtering, bypassed saves, -survivor denominators, partial-trajectory drops, LLM-judge -parse$\to$fixed score, and outcome-as-ground-truth SFT -filters. -The audit contains 50 pinned SHAs and approximately 160 -verified source citations. Every code claim in this appendix is a -commit-pinned two-click audit: if any claim fails to hold up -at its permalink, that is a bug in this appendix, not a vague -disagreement about code that has since moved --- the SHAs fix -the audit target. - -% ---------------------------------------------------------------------------- -% Appendix B NEW --- 37-pair variance catalogue. Draft fresh in Session 11. -% Source: SURVEY\_master.md. Structure: full table organised by metric family -% --- task success (22 pairs), cost/tokens (9 pairs), latency/timing (6 pairs). -% Near-misses section for leads not meeting evidence bar. -% ---------------------------------------------------------------------------- - -\section{37-pair Variance Catalogue} -\label{app:variance-catalogue} - -This appendix enumerates the 37 cross-harness variance pairs -sampled in Table~\ref{tab:omnibus}, organised by metric family: -22 task-success pairs (\S\ref{app:vc:task}), 9 cost/token -pairs (\S\ref{app:vc:cost}), and 6 latency/timing pairs -(\S\ref{app:vc:latency}). Four task-success benchmarks are -then examined in \emph{smoking-gun} detail -(\S\ref{app:vc:smoking}) --- same dataset, multiple in-survey -implementations, different numbers on identical workloads. -Training-side evidence beyond the rollout interface -(Dr.\ GRPO, DAPO, TRL's five GRPO variants, verl~\#2165) is -consolidated in \S\ref{app:vc:training}. 
A de-duplication log -(\S\ref{app:vc:dedup}) records which family each pair is -counted in, and \S\ref{app:vc:nearmisses} catalogues leads that -did not meet the evidence bar. - -\subsection{Evidence hierarchy} -\label{app:vc:hierarchy} - -Every pair is graded against the same four-tier evidence -hierarchy, uniform across the three metric families: - -\begin{enumerate} -\item \textbf{Tier 1 --- strongest.} Same model and workload; -published numbers disagree across two harnesses. (Direct -empirical evidence.) -\item \textbf{Tier 2.} Explicit vendor or framework -documentation acknowledging the convention difference between -harnesses. (Vendor-admission-equivalent.) -\item \textbf{Tier 3.} Code-level convention divergence with -no cross-framework normalization, published under the same -metric name. (Structural divergence visible at source.) -\item \textbf{Tier 4.} Maintainer issue-tracker statement that -the convention differs from another framework. (Maintainer-% -acknowledged divergence.) -\end{enumerate} -Anything weaker is demoted to near-misses -(\S\ref{app:vc:nearmisses}). Every pair below cites either a -numeric delta, a vendor acknowledgement, a code-level -convention divergence, or a maintainer statement. - -\subsection{Task-success pairs (22)} -\label{app:vc:task} - -Same model weights, same benchmark split, different published -accuracy depending on which harness computed the aggregate. - -\begin{table}[h] -\centering -\footnotesize -\setlength{\tabcolsep}{3pt} -\caption{22 task-success variance pairs. $\Delta$ is the gap -between published numbers on an identical workload. 
``post-% -mortem'' refers to the HuggingFace Open LLM Leaderboard -post-mortem \citep{beeching2023openllm}.} -\label{tab:vc:task} -\begin{tabular}{@{}l>{\raggedright\arraybackslash}p{0.12\linewidth}>{\raggedright\arraybackslash}p{0.13\linewidth}>{\raggedright\arraybackslash}p{0.14\linewidth}>{\raggedright\arraybackslash}p{0.16\linewidth}>{\raggedright\arraybackslash}p{0.12\linewidth}>{\raggedright\arraybackslash}p{0.16\linewidth}@{}} -\toprule -\textbf{\#} & \textbf{Model} & \textbf{Benchmark} & -\textbf{Harness A} & \textbf{Harness B} & -\textbf{$\Delta$} & \textbf{Source} \\ -\midrule -T1 & LLaMA-65B & MMLU 5-shot & Berkeley orig. & lm-eval-harness & 14.9pp & post-mortem \\ -T2 & LLaMA-30B & MMLU 5-shot & Berkeley orig. & lm-eval-harness & 12.6pp & post-mortem \\ -T3 & Falcon-40B & MMLU 5-shot & Berkeley orig. & lm-eval-harness & 10.5pp & post-mortem \\ -T4 & Llama-3.1-70B-Inst & MMLU-Pro & Meta self (CoT) & OLL v2 / lighteval & $\sim$18.5pp & Meta card vs OLL \\ -T5 & Qwen2.5-72B & MMLU-Pro & Self-report & OLL v2 & $\sim$22pp & Qwen docs vs OLL \\ -T6 & GPT-4o (24-05-13) & SWE-b.\ Verified & SWE-agent & Agentless 1.5 & 15.6pp & swe-b.\ subs.\ S3 \\ -T7 & Claude 3.5 Sonnet & SWE-b.\ Verified & Anthropic Tools & OpenHands CA 2.1 & 4.0pp & SWE-b.\ leaderb. \\ -T8--T13 & Various & SWE-b.\ Lite/Ver. & Various & Various & 3.4--19.6pp & SWE-b.\ leaderb. \\ -T14 & Mistral-7B & MMLU (prompt) & Template A & Template B & up to 24.6pp & Biderman et al.\ \\ -T15 & Mixtral-8x7B & MMLU (prompt) & Template A & Template B & up to 24.6pp & Biderman et al.\ \\ -T16 & Mistral-7B & ARC (prompt) & Template A & Template B & up to 24.6pp & Biderman et al.\ \\ -T17 & Llama-3.1-70B & MATH & Meta (full test) & Leaderboard (500) & label collision & Meta vs leaderb. 
\\ -T18 & Claude 3.5 Sonnet & MATH & ``MATH'' & ``MATH'' & label overload & multi-source \\ -T19 & Llama-3.1-70B & GPQA & simple-evals & lm-eval-harness & vendor admis.\ & NVIDIA NeMo \\ -T20 & Llama-3.1-70B & GPQA & Self-report & Leaderboard & cross-harness & Meta + leaderb.\ \\ -T21 & Llama-3.1-70B & Various & Meta model card & Meta \texttt{eval\_details.md} & micro vs macro & Meta own docs \\ -T22 & Llama-3.1-70B & Various & \texttt{acc\_char} & full-string match & convention & Meta own docs \\ -\bottomrule -\end{tabular} -\end{table} - -T1 (LLaMA-65B MMLU 14.9pp) is the flagship single pair --- -identical weights, different published score, explicitly -discussed in a HuggingFace post-mortem. T6 (GPT-4o SWE-bench -Verified 15.6pp) is the target of the reconciliation -experiment in Sec.~\ref{sec:validation:results}: -trajectories for both runs are in the public SWE-bench -submissions S3 bucket, making the reconciliation tractable. -T14--T16 (Biderman et al.) are not cross-harness pairs but -within-harness prompt-template pairs, included because the -24.6pp gap is the strongest published evidence that the -instrument-level variance is at least as large as the -signals it purports to measure. - -\subsection{Cost / token pairs (9)} -\label{app:vc:cost} - -Same model, same workload, different published token counts or -dollar costs depending on which tool applies which convention. -Table~\ref{tab:vc:cost} summarises; for numerical examples and -code-level references see the supplementary survey document and -the cost-latency divergence report in supplementary materials. - -\begin{table}[h] -\centering -\footnotesize -\setlength{\tabcolsep}{3pt} -\caption{9 cost / token-accounting variance pairs. 
Same model, -same workload, different published numbers.} -\label{tab:vc:cost} -\begin{tabular}{@{}l>{\raggedright\arraybackslash}p{0.15\linewidth}>{\raggedright\arraybackslash}p{0.22\linewidth}>{\raggedright\arraybackslash}p{0.22\linewidth}>{\raggedright\arraybackslash}p{0.28\linewidth}@{}} -\toprule -\textbf{\#} & \textbf{Setup} & \textbf{Harness A} & -\textbf{Harness B} & \textbf{$\Delta$} \\ -\midrule -C1 & Anthropic w/ \texttt{cache\_control} & OTel \texttt{input\_tokens} (inclusive) & Anthropic-native (exclusive) & 2.0$\times$ inflation (260{,}421 vs 130{,}213) \\ -C2 & Claude + cache\_control & LiteLLM cost estimator & Anthropic Console billing & +68\% overestimate (\$0.091 vs \$0.054) \\ -C3 & Claude (streaming) + cache-hit & LangChain.js accumulator & single \texttt{message\_delta} & exactly 2$\times$ cache\_read \\ -C4 & SWE-b.\ V.\ retry-firing task & SWE-agent (every attempt) & mini-swe-agent (last attempt) & 30--100\% per-task overstatement \\ -C5 & Anthropic/OpenAI prompt-cache & inspect\_ai (cache-aware, tiered) & 7 peer harnesses (no cache parse) & up to 90\% on cached portion \\ -C6 & Aider leaderboards (same model date) & Aider Edit & Aider Refactor / Polyglot & \$0 $\to$ \$14.41 (same model) \\ -C7 & Same rollout batch, GRPO training & verl \texttt{loss\_agg\_mode=}\allowbreak\texttt{token-mean} & verl \texttt{seq-mean-}\allowbreak\texttt{token-sum} & different gradient magnitudes \\ -C8 & Cross-vendor ``tokens used'' & tiktoken (\texttt{o200k\_base}) & Anthropic BPE & $\sim$10\% systematic \\ -C9 & OpenAI reasoning models (o1/o3) & harness reading \texttt{completion\_}\allowbreak\texttt{tokens} only & harness reading \texttt{..details.}\allowbreak\texttt{reasoning\_tokens} & silent under-report \\ -\bottomrule -\end{tabular} -\end{table} - -\paragraph{Code-level sources.} -{\footnotesize\raggedright -\textbf{C1:} OpenTelemetry PR~\#3163; Langfuse issue~\#12306. -\textbf{C2:} BerriAI/litellm issue~\#9812. -\textbf{C3:} langchainjs issue~\#10249. 
-\textbf{C4:} \texttt{sweagent/agent/models.py} L744--L838; -\texttt{minisweagent/models/litellm\_model.py} L80--L93. -\textbf{C5:} \texttt{inspect\_ai/model/\_openai.py} L776--L782; -\texttt{inspect\_ai/model/\_model.py} L2085--L2091; -\texttt{inspect\_ai/model/\_providers/anthropic.py} L1137--L1169. -\textbf{C6:} \texttt{edit\_leaderboard.yml}, -\texttt{refactor\_leaderboard.yml}, -\texttt{polyglot\_leaderboard.yml} in \texttt{Aider-AI/aider}. -\textbf{C7:} \texttt{verl/trainer/ppo/core\_algos.py} L1168--L1195. -\textbf{C8:} tiktoken and Anthropic tokenizer documentation; -openai/tiktoken issue~\#474. -\textbf{C9:} OpenAI API reference for reasoning-model -\texttt{usage.completion\_tokens\_details}. -\par} - -C1 is the flagship: OpenTelemetry PR~\#3163 formalises the -Anthropic/OpenAI/Vertex convention disagreement at the -observability-middleware layer, and Langfuse issue \#12306 -exhibits the numerical 260{,}421-vs-130{,}213 gap on a single -request. C6 is the flagship intra-vendor disagreement: the -same organisation's three leaderboards (Edit, Refactor, -Polyglot) publish \$0, \$8.46, and \$14.41 for the same model -on comparable tasks, with no convention statement in any of -the three YAMLs. - -\subsection{Latency / timing pairs (6)} -\label{app:vc:latency} - -Same workload, same hardware, different published wall-clock -times depending on which harness's timer wraps which phase. - -\begin{table}[h] -\centering -\footnotesize -\setlength{\tabcolsep}{3pt} -\caption{6 latency / wall-clock variance pairs. 
Clock-start -and clock-stop differ across harnesses that log the same -metric name.} -\label{tab:vc:latency} -\begin{tabular}{@{}l>{\raggedright\arraybackslash}p{0.17\linewidth}>{\raggedright\arraybackslash}p{0.22\linewidth}>{\raggedright\arraybackslash}p{0.22\linewidth}>{\raggedright\arraybackslash}p{0.26\linewidth}@{}} -\toprule -\textbf{\#} & \textbf{Setup} & \textbf{Harness A} & -\textbf{Harness B} & \textbf{$\Delta$} \\ -\midrule -L1 & SWE-b.\ V.\ (same task) & Docker backend & Modal backend & systematic Modal inflation \\ -L2 & OSWorld (same task) & \texttt{lib\_run\_single.py} & \texttt{scripts/python/}\allowbreak\texttt{run\_maestro.py} & $\ge$30\,s (hardcoded sleep) + 5--15\,s VM start \\ -L3 & GRPO step timer & TRL \texttt{training\_}\allowbreak\texttt{step} (grad+opt only) & verl \texttt{marked\_timer(}\allowbreak\texttt{"step")} (full epoch) & up to 5$\times$ on rollout-dominated runs \\ -L4 & Qwen2.5-7B GSM8K-GRPO, same HW & TRL & OpenRLHF & 3.13$\times$ (5{,}189\,s vs 1{,}657\,s) \\ -L5 & GRPO throughput & verl \texttt{perf/throughput} (excl.\ padding) & TRL effective-token throughput & different numerators on same HW \\ -L6 & inspect\_ai timers & \texttt{sample\_working\_time} & \texttt{sample\_waiting\_time} & ambiguity declared in-tool \\ -\bottomrule -\end{tabular} -\end{table} - -\paragraph{Code-level sources.} -{\footnotesize\raggedright -\textbf{L1:} \texttt{swebench/harness/docker\_utils.py} -L203--L217 vs -\texttt{swebench/harness/modal\_eval/run\_evaluation\_modal.py} -L307--L319. -\textbf{L2:} \texttt{lib\_run\_single.py} \#L616 vs -\texttt{scripts/python/run\_maestro.py} \#L351. -\textbf{L3:} \texttt{trl/trainer/grpo\_trainer.py} L1111--L1120; -\texttt{verl/trainer/ppo/metric\_utils.py} L313--L346. -\textbf{L4:} arXiv:2501.03262 Table~4; EMNLP-demos 2025. -\textbf{L5:} \texttt{verl/trainer/ppo/metric\_utils.py} -L337--L345. -\textbf{L6:} \texttt{inspect\_ai/\_util/working.py}. 
-\par} - -L4 is the flagship: published, peer-reviewed, same hardware, -same model, same algorithm --- a 3.13$\times$ wall-clock gap -attributable to framework-level dispatch rather than algorithm -difference. L1 and L2 are in-repository disagreements: both -backends in SWE-bench Verified and both harnesses in OSWorld -ship under the same repo at the same commit, reporting the -same metric name, with no statement that the two are not -comparable. - -\subsection{Smoking-gun benchmarks: same dataset, multiple in-survey implementations} -\label{app:vc:smoking} - -Four benchmarks are implemented in 3--4 surveyed repositories -each. Below, each implementation's grading site and -failure-handling semantics are read off at the pinned SHAs -(\S\ref{app:survey:shas}). For each benchmark, a single -realistic failure mode produces a different headline number in -each implementation --- and none of the implementations emits -a counter disclosing which regime it fell into. - -\paragraph{Smoking-Gun~\#1: SWE-bench (4 implementations).} -\emph{Official SWE-bench} grades via \texttt{bad\_codes} = -\{APPLY\_PATCH\_FAIL, RESET\_FAILED, TESTS\_ERROR, -TESTS\_TIMEOUT\} -(\texttt{swebench/harness/grading.py:L61-L70}; -\texttt{constants/\_\_init\_\_.py:L80-L89}) with empty patches -pre-filtered from submission (\texttt{docs/faq.md:L39}). Container -timeout stays in the denominator as a fail; empty-patch -instances are removed entirely. \emph{PrimeIntellect verifiers} -reads the same \texttt{bad\_codes} set but wraps -\texttt{test\_spec} fetching in a 5$\times$ tenacity retry; any -uncaught exception flows through \texttt{rubric.py:L144-L158} -to \texttt{0.0} and enters the group-mean advantage baseline ---- a container timeout can bias the policy gradient for the -other rollouts in its group rather than merely decrementing -the numerator. 
\emph{SWE-agent} delegates grading to -\texttt{sb-cli submit} -(\texttt{sweagent/run/hooks/swe\_bench\_evaluate.py:L42-L55}); -the pre-submission \texttt{unlink} at -\texttt{run\_batch.py:L397-L401} removes crashed or -cost-killed instances from \texttt{preds.json} entirely, so -server-side grading sees a smaller denominator instead of a -failed row. \emph{mini-swe-agent} has no local grading; its -\texttt{finally: save()} at -\texttt{benchmarks/swebench.py:L171} is bypassed by -\texttt{KeyboardInterrupt} (closed issue \#329), also -producing an absent-from-submission pattern. On one realistic -mid-test Docker timeout, the four harnesses report: Official -$0/N$, verifiers $1/N$ (possible retry recovery), SWE-agent -$0/(N-1)$, mini-swe-agent $0/(N-1)$. Inspect~AI has \emph{no -SWE-bench bridge} at the pinned SHA. - -\paragraph{Smoking-Gun~\#2: GPQA (3 implementations).} -\emph{simple-evals} at \texttt{gpqa\_eval.py:L59} uses -\texttt{re.search(ANSWER\_PATTERN\_MULTICHOICE, response\_text)} -and \texttt{extracted\_answer = match.group(1) if match else -None}; a regex miss --- which a hedging response triggers --- -is scored 0 and stays in the denominator. \emph{lm-evaluation-% -harness} uses log-likelihood of the gold-choice token over the -various \texttt{gpqa\_*\_zeroshot.yaml} / -\texttt{\_n\_shot.yaml} / \texttt{\_cot\_*.yaml} tasks; the -model does not generate free text in the scored branch, so -hedging is immune, but silent left-truncation at -\texttt{huggingface.py:L1360-L1368} still affects long -contexts. \emph{lighteval} scores via the standard pipeline at -\texttt{tasks/tasks/gpqa.py} with aggregation through -\texttt{info\_loggers.py:L326-L400}; API retry-exhaust or -content-filter yields an empty -\texttt{LitellmModelResponse()} (\texttt{litellm\_model.py:% -L243, L254}), scored as wrong via -\texttt{metrics\_sample.py:L151-L152} (\texttt{if not pred: -return 0}). 
On a single hedging model output: simple-evals~0, -lm-eval-harness likely correct, lighteval~wrong --- three -different headline accuracies from the same model reasoning. -Inspect~AI has \emph{no GPQA task file} at the pinned SHA. - -\paragraph{Smoking-Gun~\#3: MATH / MATH-500 (3 -implementations).} \emph{simple-evals} at -\texttt{math\_eval.py:L55} calls \texttt{check\_equality(self.% -equality\_checker, row["Answer"], extracted\_answer)}, which -itself calls the LLM equality checker; an API failure raises -(dropping the sample) or is coerced to 0 by the outer handler. -\emph{lighteval} uses majority@n with a symbolic -\texttt{math\_normalizer} over \texttt{tasks/tasks/math\_500.py} -and \texttt{math.py}; normalizer exceptions are caught upstream -and scored wrong at -\texttt{metrics\_sample.py:L151-L152}. \emph{PrimeIntellect -verifiers} routes reward functions through -\texttt{rubric.py:L144-L158}; exceptions become \texttt{0.0} -and enter the group advantage baseline. On a LaTeX edge case -such as \texttt{\textbackslash frac\{1\}\{2\}} versus -\texttt{0.5}: simple-evals's LLM judge may call these equal, -lighteval's normalizer may not, and verifiers's reward -exception enters training as an errored zero --- same -completion, three different outcomes. - -\paragraph{Smoking-Gun~\#4: MMLU (3 implementations).} -\emph{simple-evals} (\texttt{mmlu\_eval.py}) extracts answers -by regex on generated text --- same miss-is-wrong pattern as -GPQA. \emph{lm-evaluation-harness} -(\texttt{lm\_eval/tasks/mmlu}: 12+ variants spanning main, -redux, pro, flan, etc.) uses log-likelihood over A/B/C/D -tokens with \texttt{acc}/\texttt{acc\_norm}; silent prompt -truncation at \texttt{huggingface.py:L1360-L1368} logs the -truncated-prompt sample as a complete result. -\emph{lighteval} uses exact-match on the choice label -(\texttt{tasks/tasks/mmlu.py}), with left-truncation at -\texttt{vllm\_model.py:L374-L397} warning but recording no -\texttt{truncated\_tokens\_count}. 
On a long-context MMLU-Pro -prompt that overflows a 4k model: simple-evals generates a -regex answer against the truncated prompt the model saw -(right or wrong depending on the content); -lm-eval-harness left-truncates and logs as complete (silently -wrong); lighteval left-truncates, records no truncation -counter, and logs as complete (silently wrong, with no -recoverable audit trail). Three implementations, three -behaviours, no \texttt{n\_prompt\_truncated} counter in any -output. - -\subsection{Training-side evidence beyond the rollout interface} -\label{app:vc:training} - -Operator disagreement extends into the gradient-signal -computation at training time. The paper cites these as -published findings rather than original contributions. - -\textbf{Dr.\ GRPO} \citep{liu2025drgrpo} corrects only the -loss-aggregation convention --- the length-normalization bias -documented in ``all popular open-source PPO implementations'' ---- and reports +7.3 points on AIME 2024 vs SimpleRL-Zero-7B -and +15.7 points on AIME 2024 vs Prime-Zero-7B, with no change -to data, model, or algorithm. \textbf{DAPO} -\citep{yu2025dapo} is a catalogue of framework-level defaults -(zero-std group filter, token-vs-sequence loss aggregation, -decoupled clip ranges) that had to be flipped to match -DeepSeek-R1; the paper reports 50 points on AIME 2024 for -Qwen2.5-32B on the defaults-flipped run. All three defaults -are silently different across TRL, verl, and OpenRLHF. -\textbf{TRL's own library} ships five mutually-incompatible -GRPO variants --- \texttt{grpo}, \texttt{dr\_grpo}, -\texttt{dapo}, \texttt{bnpo}, and -\texttt{importance\_sampling\_level}$\in$\{\texttt{token}, -\texttt{sequence}\} --- in a single config file -(\texttt{trl/trl/trainer/grpo\_config.py:L228-L243, -L668-L677}); two papers reporting ``we trained with GRPO on -task X'' are structurally non-comparable without specifying -which variant. 
\textbf{verl~\#2165} \citep{verl2165} is a -tokenization-channel divergence on Qwen3-4B GRPO where the -training-side FSDP tokenizer inserts -\texttt{\textbackslash n\textbackslash n} into -assistant turns that the rollout engine (vLLM~/~SGLang) never -saw, shifting the assistant-turn token mask by tens of tokens -across $\sim$40k-token conversations. The finding is reproduced -by six independent users on the thread, upstream-confirmed in -QwenLM/Qwen3~\#1826 and Qwen/Qwen3-1.7B HF discussion~\#9, and -guarded separately by verl's own -\texttt{tokenization\_sanity\_check\_mode} (distinct from the -dtype guard in discussion~\#5984) --- proving that tokenization-% -class mismatch exists as a separate failure mode even when all -upstream dtypes match. - -\textbf{Silent rollout-dropping mechanisms (beyond the 50-repo -audit).} verl~\#1170 (rollouts silently return empty strings, -training proceeds); OpenRLHF~\#1108 (reward curves diverge -between vLLM 0.8.1 and 0.8.3 on the same seed); verl -discussion~\#5984 (tool built specifically to detect silent -per-token log-prob divergence between training FSDP and vLLM -rollouts); ms-swift~\#9096 (gemma-4 rollouts return garbage, -training continues). \textbf{Numerical-layer evidence.} BF16 -backends have been documented to produce divergent log-probs -between training and rollout engines even with identical -weights (``Defeating the Training-Inference Mismatch via -FP16,'' arXiv:2510.26788); the vLLM engineering blog documents -separate backends breaking the on-policy assumption of -policy-gradient methods; THUDM's \texttt{slime} ships a -\texttt{train\_infer\_mismatch\_helper} because divergence is -expected. - -\subsection{De-duplication log} -\label{app:vc:dedup} - -Several findings could plausibly appear in more than one -family. 
For audit transparency we record where each is counted -and why: - -\begin{itemize} -\item \textbf{Dr.\ GRPO}, \textbf{DAPO}, \textbf{TRL's five -GRPO variants}: counted in \S\ref{app:vc:training} -(training-side); not in \S\ref{app:vc:cost} despite touching -per-step token accounting. Rationale: the load-bearing claim is -about gradient signal, not token counts. -\item \textbf{verl~\#2165} (Qwen3 tokenization-channel -divergence): counted in \S\ref{app:vc:training} (training-% -side); not in \S\ref{app:survey} despite involving silent -per-rollout mismatch. Rationale: Appendix~\ref{app:survey} -audits \emph{failure-handling} patterns; verl~\#2165 is a -\emph{convention-disagreement} finding, structurally closer to -Dr.\ GRPO than to \texttt{try/except/return None}. -\item \textbf{TRL vs OpenRLHF 3.13$\times$ wall-clock} (L4): -counted in \S\ref{app:vc:latency}; not in -\S\ref{app:vc:training} despite the training-pipeline context. -The headline delta is wall-clock, not gradient signal. -\item \textbf{verl \texttt{loss\_agg\_mode} variants} (C7): -counted in \S\ref{app:vc:cost}; not in \S\ref{app:vc:latency} -despite affecting per-step timing. -\item \textbf{verl throughput metric} (L5): counted in -\S\ref{app:vc:latency}; not in \S\ref{app:vc:cost} despite -involving token counting. -\item \textbf{inspect\_ai \texttt{sample\_working\_time} vs -\texttt{sample\_waiting\_time}} (L6): counted in -\S\ref{app:vc:latency}; not in \S\ref{app:survey} despite -inspect\_ai's role as the positive control for -failure-handling. -\end{itemize} -No finding is counted twice across the 37 pairs. - -\subsection{Near misses} -\label{app:vc:nearmisses} - -Leads that did not meet the evidence bar -(\S\ref{app:vc:hierarchy}) but warrant human follow-up. 
- -\textbf{Cost / tokens.} Anthropic -\texttt{/v1/messages/count\_tokens} versus tiktoken estimate ---- Anthropic publishes a dedicated counting endpoint, -community harnesses routinely substitute tiktoken for speed, -the endpoint returns different numbers on the same input; -documented but no published benchmark pair with materially -different totals attributable to this choice. Anthropic -\texttt{/cost} underreport vs dashboard (anthropics/claude-% -code~\#1063): intra-vendor, not cross-harness. HuggingFace -Open LLM Leaderboard v1$\to$v2 transition (including the -lighteval vs lm-eval-harness switch): full methodology change -with no explicit ``efficiency numbers not comparable across -versions'' statement. - -\textbf{Latency.} HELM \texttt{efficiency.json} publishes -per-model latency/throughput but is frozen at pre-2023 models -with no modern cross-comparison. OpenAI vs Azure OpenAI -latency differences on the same model are documented but with -no published benchmark pair. - -\textbf{Task success.} Anecdotal ``our replication differs -from the original paper'' issue-tracker threads that do not -cite a specific convention difference. - -% ---------------------------------------------------------------------------- -% Appendix C --- Projection Operators to Community Trace Shapes. -% EDIT per Appendix Handling table: cut pseudocode from six subsections -% (terminology-glossary-enforced principle under v5.6), keep Target/Preserved/ -% Dropped paragraphs. EXPAND with $\tau_E$ schema details moved out of Sec.~3.1 body -% during Session 11 appendix work. -% Content initially carried verbatim from main.tex lines 983-1388, then edited. -% ---------------------------------------------------------------------------- - -\section{Projection Operators to Community Trace Shapes} -\label{app:projections} - -This appendix defines each projection -$\pi: \tau_E \to (T_\pi, D_\pi)$ whose structural preservation -properties appear in Table~\ref{tab:preservation}. 
For each we -give: (i) the target trace format as a formal object; -(ii) pseudocode for the projection function operating on -$\tau_E = (N, E, C, A, M)$; (iii) the drops manifest $D_\pi$ as -a list of typed erasure records; (iv) a characterisation of which -substrate structure the projection preserves. Reference -implementations are in the released codebase; the pseudocode here -is the specification, not the implementation. Each drops manifest -record is a tuple -$(\textit{erasure\_type}, \textit{element\_ref}, -\textit{metric\_class})$ per~\S\ref{app:projections}. - -Figure~\ref{fig:sixproj} gives the visual summary before the -definitions: one SWE-Bench Verified rollout $\tau_E$ rendered -through all six projections, with each projection's drops -manifest overlaid. - -\begin{figure}[h] -\placeholder{0.95}{% - \textbf{Figure C.1: One SWE-Bench Verified trajectory, six - representations.} Centre: substrate state $\tau_E$ for one - rollout, rendered with containment tree (vertical), dependency - DAG (horizontal edges), reasoning-log events (timeline), and - typed mutations (marked). Five surrounding panels: the same - trajectory under each projection below. Drops manifest records - overlaid on each panel, colour-coded by erasure type (concurrency - collapse, cancellation erasure, edge deletion, containment - flattening, reasoning truncation, attribution loss). - $\pi_{\text{step}}$ and $\pi_{\text{per-agent}}$ each show - large shaded regions (structural erasure); - $\pi_{\text{call-tree}}$ serialises concurrent siblings into an - ordering; $\pi_{\text{macro}}$ collapses sub-tree reasoning - into aggregate effects; $\pi_{\text{json-log}}$ matches the - substrate up to schema encoding. -} -\caption{One SWE-Bench Verified trajectory rendered through all -six representations, with drops manifests overlaid.} -\label{fig:sixproj} -\end{figure} - -\paragraph{Preservation, operationally.} We use ``projection'' -informally --- a function that loses information. 
A behavioural -quantity $\mu: \tau_E \to \mathbb{R}$ is \emph{$\pi$-preserved} -iff there exists a computable $g$ such that -$g(\pi(\tau)) = \mu(\tau)$ for all $\tau$; that is, $\mu$ factors -through $\pi$. Table~\ref{tab:preservation} records three cases -per (projection, quantity) pair: (i)~\textbf{preserved} -(\ding{51}): such a $g$ exists given $T_\pi$'s canonical schema -alone; (ii)~\textbf{erased} (\ding{55}): no such $g$ exists -because $T_\pi$ lacks the relevant channel; (iii)~\textbf{partial} -($\sim$): $g$ exists on a subset of $\tau_E$ but not universally, -admitting a numeric preservation fraction -$\rho_{\pi,\mu} \in (0,1)$ whose computation requires running -$\pi$ against a trajectory corpus and is beyond the scope of this -paper. Each cell is therefore a concrete question (``does $g$ -exist given $T_\pi$'s schema?'') rather than a qualitative -judgement. - -\begin{table}[h] -\centering -\footnotesize -\setlength{\tabcolsep}{4pt} -\caption{Structural preservation matrix. Rows: the six projections -defined below plus the raw substrate $\tau_E$. Columns: the six -community-native behavioural quantities of \S\ref{sec:validation} -(\textbf{Step-ret}: step-indexed return under episode mask, long-horizon -LLM-agentic RL~\citep{chen2025loop, wang2025ragen}; -\textbf{Opt-term}: option termination frequency, -HRL/macro-action~\citep{bacon2017optioncritic}; \textbf{Call-dep}: -call-tree depth distribution, recursive LMs~\citep{zhu2024redel}; -\textbf{Sib-dist}: per-agent sibling embedding distance, fixed-role -MAS~\citep{cemri2025mast}; \textbf{MCTS-ent}: node visit entropy -across frontier, MCTS-based training~\citep{rstarmath2025, -feng2024restmcts}; \textbf{Async-dp}: dispatch-and-wait rate, -async MAS). \ding{51}~=~preserved under $\pi$; -\ding{55}~=~$T_\pi$ has no channel that could carry the elements -contributing to the quantity; $\sim$~=~partially preserved. 
Cells -follow from the projection pseudocode in -\S\S\ref{app:proj:step}--\ref{app:proj:json-log}.} -\label{tab:preservation} -\begin{tabular}{@{}lcccccc@{}} -\toprule -\textbf{Projection} & \textbf{Step-ret} & \textbf{Opt-term} & \textbf{Call-dep} & \textbf{Sib-dist} & \textbf{MCTS-ent} & \textbf{Async-dp} \\ -\midrule -$\pi_{\text{step}}$ (long-horizon RL) & \ding{51} & \ding{55} & \ding{55} & \ding{55} & \ding{55} & \ding{55} \\ -$\pi_{\text{macro}}$ (hierarchical) & $\sim$ & \ding{51} & \ding{51} & \ding{55} & \ding{55} & $\sim$ \\ -$\pi_{\text{call-tree}}$ (recursive LM) & $\sim$ & $\sim$ & \ding{51} & $\sim$ & \ding{55} & \ding{55} \\ -$\pi_{\text{per-agent}}$ (fixed-role MAS) & $\sim$ & \ding{55} & \ding{55} & \ding{51} & \ding{55} & $\sim$ \\ -$\pi_{\text{mcts}}$ (search-tree) & \ding{55} & \ding{55} & $\sim$ & \ding{55} & \ding{51} & \ding{55} \\ -\midrule -$\pi_{\text{json-log}}$ (production ref.) & \ding{51} & \ding{51} & \ding{51} & \ding{51} & \ding{51} & \ding{51} \\ -$\tau_E$ (Ergon raw) & \ding{51} & \ding{51} & \ding{51} & \ding{51} & \ding{51} & \ding{51} \\ -\bottomrule -\end{tabular} -\end{table} - -Table~\ref{tab:preservation} reads off the projection definitions -below. The pattern is consistent: the five community projections -each preserve their own native quantity on the diagonal and -(mostly) erase the others; $\pi_{\text{step}}$ carries no -structural channel beyond the token-indexed step sequence and so -erases every non-self-community question; -$\pi_{\text{json-log}}$ preserves every quantity by construction -but is not a training target for any of the five communities. -The off-diagonal $\sim$ cells (e.g., -$\pi_{\text{call-tree}} \times$~Sib-dist) indicate cases where -$T_\pi$ carries some but not all of the channel $\mu$ requires: -$\pi_{\text{call-tree}}$ preserves sibling \emph{relations} but -not the reasoning-log content embeddings MAST's role-differentiation -metric computes over. 
- -A methodological note: Table~\ref{tab:preservation} asks whether -$\mu$ factors through $\pi$, not whether any proxy for $\mu$ -does. A weaker proxy claim --- say, ``fraction of episodes -terminated before the environment terminates'' as a surrogate for -option termination under $\pi_{\text{step}}$ --- may survive -erasure of the canonical quantity. Whether such a proxy counts -as the same claim is a judgement the framework does not -adjudicate; it establishes only that the canonical $\mu$, as -defined over $\tau_E$, does not factor through $\pi$. - -\subsection{$\pi_{\text{step}}$: step-indexed $(o_t, a_t, r_t)$ tuples} -\label{app:proj:step} - -\paragraph{Target format.} A token-indexed flat prompt-response -tensor pair, matching the \texttt{DataProto} -schema~\citep{sheng2024hybridflow} consumed by VERL, OpenRLHF, -and TRL: $(\texttt{prompts}, \texttt{responses}, -\texttt{attention\_mask}, \texttt{response\_mask}, -\texttt{token\_level\_scores}, \texttt{advantages})$, where -\texttt{token\_level\_scores} is a per-token reward tensor of -shape $[\text{batch}, \text{response\_length}]$. Equivalently, a -sequence $T_{\text{step}} = [(o_1, a_1, r_1), (o_2, a_2, r_2), -\ldots]$ where each step corresponds to a token position and -$r_t \in \mathbb{R}$ is its assigned reward. Multi-turn -interactions are handled by concatenating turn tokens into one -flat response per rollout; no per-turn identity is preserved in -the tensor shape. - -\paragraph{Projection pseudocode.} -\begin{lstlisting} -def pi_step(tau_E): - N, E, C, A, M = tau_E - T_step = [] - D = [] # drops manifest - # Walk M in sequence order; emit one step per - # (assistant_text | tool_call -> tool_result) pair - # from the root node's reasoning log only. 
- root = find_root(N) - events = [e for e in C if e.node_id == root.id] - for e in events_by_sequence(events): - if e.kind == "assistant_text" or e.kind == "tool_call": - a_t = e.payload - o_t = next_observation_after(e, C) - r_t = reward_for(e, A) # from annotation store - T_step.append((o_t, a_t, r_t)) - # Record drops - for n in N: - if n.id != root.id: - D.append(("containment_flattening", n.id, "depth")) - for edge in E: - D.append(("edge_deletion", edge.id, "tree_likeness")) - for m in M: - if (m.kind in ("node.added", "node.status_changed") - and m.node_id != root.id): - D.append(("cancellation_erasure" - if "cancel" in m.payload else - "attribution_loss", - m.id, "late_cancel")) - for e in C: - if e.node_id != root.id: - D.append(("reasoning_truncation", e.id, "role_diff")) - return T_step, D -\end{lstlisting} - -\paragraph{Preserved.} The root worker's linear sequence of -assistant text and tool interactions with its own environment. -\paragraph{Dropped.} Everything else: the containment tree beyond -the root, all dependency edges, all cross-worker causal structure, -all cancellation events as typed policy actions (they appear, if -at all, only as the absence of subsequent children), per-worker -identity beyond the root, and the reasoning logs of all -non-root workers. - -\subsection{$\pi_{\text{per-agent}}$: per-worker streams with partner annotations} -\label{app:proj:per-agent} - -\paragraph{Target format.} A mapping from worker identifier to a -per-worker stream: -$T_{\text{per-agent}} = \{ w_i \mapsto \sigma_i \}_{i=1}^{k}$, -where each $\sigma_i$ is a per-worker stream of observations and -actions for worker $w_i$ under a topology declared before the -rollout, optionally annotated with role tags or partner-type -beliefs.
This shape covers fixed-role multi-agent LLM systems, -MARL with a centralised critic, proposer/critic debate setups, -and --- as a special case of coordination under partner -uncertainty --- ad hoc teamwork~\citep{mirsky2022aht,wang2024naht}. -Cross-stream references are limited to dependency edges between -streams, which the format does not natively carry. - -\paragraph{Projection pseudocode.} -\begin{lstlisting} -def pi_per_agent(tau_E): - N, E, C, A, M = tau_E - workers = group_by(N, key=lambda n: n.assigned_worker_key) - T_per_agent = {} - D = [] - for w, nodes in workers.items(): - stream = [] - events = [e for n in nodes for e in C if e.node_id == n.id] - for e in events_by_sequence(events): - if e.kind in ("assistant_text", "tool_call", - "tool_result"): - stream.append(event_to_stream_record(e)) - T_per_agent[w] = stream - # Drops - for n in N: - if n.parent_node_id: - D.append(("containment_flattening", n.id, "depth")) - for edge in E: - src_w = worker_of(edge.src, N) - dst_w = worker_of(edge.dst, N) - if src_w != dst_w: - D.append(("edge_deletion", edge.id, "tree_likeness")) - for m in M: - if (m.kind == "node.status_changed" - and m.payload.status == "cancelled"): - D.append(("cancellation_erasure", m.id, "late_cancel")) - return T_per_agent, D -\end{lstlisting} - -\paragraph{Preserved.} Per-agent trajectory streams indexed by -agent identity, matching the shape MARTI~\citep{marti2025} -distributes to individual policy trainers. Partial concurrency -information: a consumer comparing wall-clock timestamps across -streams can detect overlapping activity. Partial role -differentiation: embedding analysis across the per-agent streams -remains possible, though the stream-identity boundary is now a -projection artefact rather than a containment-tree fact.
- -\paragraph{Dropped.} The format is agent-major rather than -work-major: task-level structure --- which subtask is shared -across which agents, which agents are blocked on each other's -dispatches, which branches were cancelled as typed events --- is -carried by the workflow class in systems like MARTI (Multi-Agent -Debate, Mixture-of-Agents, Chain-of-Agents) and is not -recoverable from the per-agent streams alone. Specifically: -containment depth (the parent-child relation flattens to stream -identity); cross-worker dependency edges; cancellation as a -typed action (worker terminations appear only as stream -endings); async dispatch-and-wait (the dispatch is in one -stream, the result in another, with no expressible causal link). - -\subsection{$\pi_{\text{call-tree}}$: nested sub-LM call tree} -\label{app:proj:call-tree} - -\paragraph{Target format.} A tree $T_{\text{call-tree}}$ whose -nodes are sub-LM invocations and whose edges represent -parent-calls-child. Each node carries its prompt, its response, -and its list of (serialised) child invocations. Concurrent siblings -are linearised into a canonical child ordering, typically by -start-time. 
- -\paragraph{Projection pseudocode.} -\begin{lstlisting} -def pi_call_tree(tau_E): - N, E, C, A, M = tau_E - def build(node): - children = [n for n in N - if n.parent_node_id == node.id] - # Linearise by start time; this is the concurrency erasure - children.sort(key=lambda n: n.created_at) - return { - "prompt": assembled_prompt(node, C), - "response": assembled_response(node, C), - "children": [build(c) for c in children], - } - root = find_root(N) - T_call_tree = build(root) - D = [] - for edge in E: - D.append(("edge_deletion", edge.id, "tree_likeness")) - for n in N: - sibs = siblings(n, N) - concurrent_sibs = [s for s in sibs - if overlaps_in_time(s, n, M)] - for s in concurrent_sibs: - D.append(("concurrency_collapse", (n.id, s.id), - "width")) - for m in M: - if m.kind == "node.status_changed" - and m.payload.status == "cancelled": - D.append(("cancellation_erasure", m.id, "late_cancel")) - return T_call_tree, D -\end{lstlisting} - -\paragraph{Preserved.} Containment depth (the tree structure is -the format). Per-node reasoning. Parent-child attribution. - -\paragraph{Dropped.} Concurrency (siblings are serialised into a -canonical ordering, reducing measured width to one). Dependency -edges (the format has a tree, not a DAG). Cancellation as a typed -action (appears only as truncated child subtrees). Async -dispatch-and-wait (result is a return value, not an event with a -sequence stamp distinct from its dispatch). - -\subsection{$\pi_{\text{macro}}$: hierarchical macro-action decomposition} -\label{app:proj:macro} - -\paragraph{Target format.} An option-tagged state-action -trajectory: each primitive step $(s_t, a_t)$ carries an -additional label $\omega_t$ identifying the currently-active -option, plus a termination flag $\beta_t \in \{0, 1\}$ marking -when the option ended and the meta-policy selected a new -one~\citep{bacon2017optioncritic}. 
The training signal is the -termination gradient plus the intra-option policy gradient; by -construction the framework is single-active-option, so at each -step exactly one $\omega_t$ is active. - -\paragraph{Projection pseudocode.} -\begin{lstlisting} -def pi_macro(tau_E): - N, E, C, A, M = tau_E - root = find_root(N) - T_macro = [] - for child in direct_children_in_time_order(root, N): - macro = { - "s_start": state_at_start(child, M), - "s_end": state_at_end(child, M), - "t_start": child.created_at, - "t_end": child.completed_at, - "effect": aggregate_subtree(child, N, A), - } - T_macro.append(macro) - D = [] - for n in N: - if n.parent_node_id and n.parent_node_id != root.id: - D.append(("reasoning_truncation", n.id, "role_diff")) - for edge in E: - D.append(("edge_deletion", edge.id, "tree_likeness")) - for m in M: - if m.kind == "node.status_changed" - and m.payload.status == "cancelled": - subtree_descendant = is_descendant(m.node_id, root, N) - if subtree_descendant: - D.append(("cancellation_erasure", m.id, - "late_cancel")) - return T_macro, D -\end{lstlisting} - -\paragraph{Preserved.} Intra-option state-action transitions -(each primitive step $(s_t, a_t)$ is retained with its option -label $\omega_t$). Option boundaries, marked by the termination -flag $\beta_t$. Temporal extension of each option. Aggregate -effect and wall-clock timestamps per option. - -\paragraph{Dropped.} Concurrent option execution: the framework -is single-active-option by construction~\citep{bacon2017optioncritic}, -so any concurrency in $\tau_E$ is serialised. Cross-option -dependencies: the options framework has no concept of one -option's output being an input to another. Cancellation as a -typed action: termination is a learned probability $\beta_\omega(s)$ -rather than an event the agent chose, so there is no record of -\emph{why} a subtree was abandoned. Sub-worker identity within -an option's subtree. 
- -\subsection{$\pi_{\text{mcts}}$: search tree with visit and value statistics} -\label{app:proj:mcts} - -\paragraph{Target format.} A search tree -$T_{\text{mcts}} = (V, E_{\text{tree}}, B)$ where each vertex -$v \in V$ carries a tuple $(s_v, a_v, N_v, Q_v)$ recording the -state, the action taken from the parent, the visit count, and -the backed-up value estimate; $E_{\text{tree}}$ is the -parent-child relation over $V$; and $B$ is the set of backup -edges $(r, v)$ recording which rollout $r$ contributed its -terminal return to the value estimate at $v$. This is the format -produced by rStar-Math~\citep{rstarmath2025} and -ReST-MCTS*~\citep{restmcts2024}, consumed by AlphaZero-style -training with PUCT visit distributions as policy targets and -backed-up returns as value targets. - -\paragraph{Projection pseudocode.} -\begin{lstlisting} -def pi_mcts(tau_E): - N, E, C, A, M = tau_E - # Identify nodes whose containment subtree constitutes - # an MCTS rollout (by annotation tag or worker key). - rollouts = [n for n in N if is_mcts_rollout(n, A)] - # Build search-tree vertices by aggregating across rollouts - # that share a state prefix. 
- V = {} # keyed by (s, a) pair - backup_edges = [] - for r in rollouts: - for step_node in traverse_rollout(r, N): - s = state_at(step_node, M, A) - a = action_taken(step_node, C) - key = (s, a) - if key not in V: - V[key] = {"s": s, "a": a, "N_visits": 0, - "Q_value": 0.0} - V[key]["N_visits"] += 1 - # Back up terminal return from this rollout - G = terminal_return(r, A) - V[key]["Q_value"] = ( - (V[key]["Q_value"] * (V[key]["N_visits"] - 1) + G) - / V[key]["N_visits"] - ) - backup_edges.append((r.id, key)) - # Parent-child edges in the search tree - E_tree = build_tree_edges(V, rollouts) - T_mcts = (V, E_tree, backup_edges) - # Drops manifest - D = [] - for n in N: - if not is_mcts_rollout_member(n, A): - # Any delegation structure outside the search recipe - D.append(("attribution_loss", n.id, "depth")) - for edge in E: - if not is_tree_edge_in(edge, E_tree): - D.append(("edge_deletion", edge.id, "tree_likeness")) - for m in M: - if m.kind == "node.status_changed" - and m.payload.status == "cancelled": - D.append(("cancellation_erasure", m.id, "late_cancel")) - for e in C: - if e.kind in ("assistant_text", "thinking") - and not is_part_of_rollout_state(e, rollouts): - D.append(("reasoning_truncation", e.id, "role_diff")) - return T_mcts, D -\end{lstlisting} - -\paragraph{Preserved.} Containment depth along the search-tree -spine. Per-node visit counts and value backups (the signal the -MCTS training step consumes). The rollout-to-node backup edges -that explain which trajectory's terminal return updated which -vertex value. - -\paragraph{Dropped.} Concurrency beyond fixed-width -$(s, a)$-keyed vertices (sibling rollouts that explore the same -$(s, a)$ merge into a single vertex). Delegation structure -outside the search recipe (any node not part of an MCTS rollout, -e.g., concurrent tool calls not on the search tree, is dropped). -Cancellation as a typed event (a cancelled rollout contributes -no backup edge but leaves no explicit cancellation record). 
-Cross-rollout dependency edges (the format has a tree by -construction). Per-node reasoning text beyond what defines the -state. - -\subsection{$\pi_{\text{json-log}}$: production-orchestration JSON event log (reference projection)} -\label{app:proj:json-log} - -\paragraph{Target format.} An ordered sequence of JSON records -$T_{\text{json-log}} = [r_1, r_2, \ldots]$ where each $r_i$ is a -record with fields -$\{\textit{sequence}, \textit{timestamp}, \textit{kind}, -\textit{node\_ref}, \textit{parent\_ref}, \textit{payload}\}$. -The record set is a near-isomorphism of the mutation log $M$ -extended with assistant text events from $C$. - -\paragraph{Projection pseudocode.} -\begin{lstlisting} -def pi_json_log(tau_E): - N, E, C, A, M = tau_E - records = [] - combined = list(M) + [e for e in C - if e.kind in ("assistant_text", - "tool_call", - "tool_result", - "thinking")] - for item in sorted(combined, key=lambda x: x.sequence): - records.append({ - "sequence": item.sequence, - "timestamp": item.timestamp, - "kind": item.kind, - "node_ref": getattr(item, "node_id", None), - "parent_ref": parent_ref_of(item, N, E), - "payload": item.payload, - }) - D = [] - # Drops manifest is empty of structural erasures; - # only schema-serialisation losses apply (e.g., binary blobs - # in annotations truncated to base64, if policy dictates). - return records, D -\end{lstlisting} - -\paragraph{Preserved.} All structural mutations and all typed -reasoning events, in sequence order, with full parent references -and timestamps. The format is a near-isomorphism of $(M, C)$. - -\paragraph{Dropped.} No structural erasure. 
The limitation of -$\pi_{\text{json-log}}$ relative to $\tau_E$ is not lost -expressiveness but the absence of (i) a formal projection interface -to other community formats for cross-community analysis, and -(ii) the drops manifest abstraction itself, so that downstream -consumers of a JSON event log that re-encode it into their own -format do so without explicit enumeration of what is lost. Ergon -supplies both. - -\subsection{Adding a further projection} -\label{app:proj:extension} - -A new community trace format is added by writing one projection -function $\pi_{\text{new}}: \tau_E \to (T_{\text{new}}, D_{\text{new}})$ -in the same pattern as above. The substrate $\tau_E$ does not -change; the drops manifest schema does not change; the structural -preservation matrix gains one row. Declarative -planning~\citep{ada2023llmp}, embodied agent traces, and -world-model-based planning are candidate further targets, which -we leave to future work. - - -% ---------------------------------------------------------------------------- -% Appendix D --- Ergon System Details. MINOR EDIT per Appendix Handling table: -% "gradient-ready" -> "RL-trainer-compatible". Otherwise carry verbatim. -% Content initially carried from main.tex lines 1406-1448, then edited. -% ---------------------------------------------------------------------------- - -\section{Ergon System Details} -\label{app:system} - -\subsection{Rollout-card format specification} -\label{app:system:format-spec} - -The rollout-card format of \S\ref{sec:system:format} is a -medium-independent bundle: any backend preserving the row -semantics below can emit and consume valid cards. Ergon's -Postgres schema (\S\ref{app:system:postgres}) is one such -backend; a zip archive written by -\texttt{ergon export-rollout-card} is another; a HuggingFace -dataset shard is a third. The dashboard of -Figure~\ref{fig:dashboard} is a reference \emph{reader} that -loads any of these. 
- -\paragraph{Bundle layout.} A card is a logical directory of -seven artefacts: \texttt{manifest.json} (run-level metadata, -format version, content hashes); five append-only JSON-lines -streams (\texttt{events.jsonl}, \texttt{nodes.jsonl}, -\texttt{edges.jsonl}, \texttt{annotations.jsonl}, -\texttt{mutations.jsonl}); and an optional \texttt{blobs/} -directory holding content-addressed overflow for payloads above -the inline size cap (default 64\,KB). The directory may be -distributed as-is, packed into a zip or tarball, written to an -object-storage prefix, or materialised as a HuggingFace dataset -repository with the streams as split files. Nothing in the -format assumes Postgres, Python, or any particular runtime. - -\paragraph{Row schemas.} Each stream is a newline-delimited JSON -file; each row has the columns in Table~\ref{tab:rowschemas}. -Column types are JSON primitives (string, integer, object, -null); \texttt{payload} and -\texttt{old\_value}/\texttt{new\_value} columns are arbitrary -JSON objects whose shape depends on a row-level discriminator -(\texttt{event\_type} for events, \texttt{mutation\_type} for -mutations). The discriminator-specific payload shapes for the -Ergon reference implementation are documented in -\S\ref{app:system:postgres} (events) and -Table~\ref{tab:mutations} (mutations). - -\begin{table}[h] -\centering -\small -\caption{Rollout-card JSONL row schemas. Each stream carries a -monotonic \texttt{sequence}; its scope (per-run or -per-\texttt{task\_execution\_id}) is noted. 
Discriminator -columns select the \texttt{payload} shape.} -\label{tab:rowschemas} -\footnotesize -\setlength{\tabcolsep}{3pt} -\begin{tabular}{@{}l>{\raggedright\arraybackslash}p{0.72\linewidth}@{}} -\toprule -\textbf{Stream} & \textbf{Row schema} \\ -\midrule -\texttt{events.jsonl} & \texttt{(event\_id, task\_execution\_id, worker\_binding\_key, sequence, event\_type, turn\_id, payload, started\_at, completed\_at, policy\_version)}; \texttt{sequence} monotonic per \texttt{task\_execution\_id}. \\ -\texttt{nodes.jsonl} & \texttt{(node\_id, parent\_id, instance\_key, task\_key, status, assigned\_worker\_key, level, created\_at, updated\_at)}. \\ -\texttt{edges.jsonl} & \texttt{(source\_node\_id, target\_node\_id, status, created\_at, updated\_at)}; \texttt{status} $\in$ \{\texttt{pending}, \texttt{satisfied}, \texttt{invalidated}\}. \\ -\texttt{annotations.jsonl} & \texttt{(target\_type, target\_id, namespace, sequence, payload, created\_at)}; latest \texttt{sequence} within \texttt{(target, namespace)} is current; prior rows retained. \\ -\texttt{mutations.jsonl} & \texttt{(sequence, mutation\_type, target\_type, target\_id, actor, old\_value, new\_value, reason, created\_at)}; \texttt{sequence} monotonic per run. 
\\ -\bottomrule -\end{tabular} -\end{table} - -\paragraph{Semantics-preserving invariants.} Any backend -emitting a card must honour four invariants regardless of how -rows are stored: (i) \texttt{mutations.jsonl} is strictly -append-only --- reversals appear as new mutations, deletions as -tombstones (\S\ref{app:system:mutations}); (ii) -\texttt{events.jsonl} is append-only per -\texttt{task\_execution\_id}; (iii) \texttt{annotations.jsonl} -is namespace-keyed, with the latest \texttt{sequence} within a -\texttt{(target, namespace)} pair as the current value and -prior rows retained for replay; (iv) the DAG implied by -\texttt{edges.jsonl} is acyclic at every point in the run's -lifetime (Ergon enforces this at write time via the -\texttt{edge.added} invariant; \S\ref{app:system:mutations}). -Cards emitted from Ergon's Postgres backend satisfy these by -construction; a third-party backend writing cards directly -must enforce them itself. - -\paragraph{Extensibility.} \texttt{manifest.json} carries a -format version (\texttt{schema: "ergon.tau\_e/0.4"}); consumers -check the major version and tolerate minor-version additions. -Additional columns on existing streams are permitted and must -be ignored by consumers that do not recognise them. The public -extension point for new metadata is -\texttt{annotations.jsonl}'s \texttt{namespace} field: a -consumer claims a namespace, writes domain-specific payloads -under it, and the format guarantees that round-tripping a card -through a reader that does not understand the namespace -preserves those rows intact. One namespace, -\texttt{ergon.task}, is reserved for task payloads; all others -are available to experiments, projections, or external -instrumentation. - -\paragraph{Reference implementations.} The library ships three -artefacts. 
A Pydantic model set plus JSON-schema export for -the row types above -(\texttt{ergon\_core/core/persistence/rollout\_card/models.py}); -a zip-archive exporter -(\texttt{ergon export-rollout-card}) that emits the -bundle from any completed Postgres run; and a validator -(\texttt{ergon validate-rollout-card}) that checks -bundle-layout and invariant conformance against any candidate -card. The dashboard frontend consumes cards through the same -Pydantic models, and therefore renders any conformant bundle ---- not just Ergon-emitted ones. - -\subsection{Postgres reference backend} -\label{app:system:postgres} - -Ergon's reference backend for the rollout-card format of -\S\ref{app:system:format-spec} is a Postgres schema of ten -SQLModel tables: one root (\texttt{RunRecord}) and nine per-run -subtables organised into three layers. Internally, Ergon models -a run as a tuple $\tau_E = (N, E, C, A, M)$ --- a containment -tree of nodes $N$, a dependency DAG of edges $E$, a typed -reasoning-and-action log $C$, a namespace-keyed annotation -store $A$, and a mutation log $M$. The graph layer concretises -$(N, E, A, M)$ as four tables; the execution layer concretises -$C$ and wraps it in a retry-aware per-attempt wrapper; the communication layer carries inter-agent messages -for multi-agent runs. All nine subtables foreign-key to -\texttt{runs.id}. Figure~\ref{fig:schema} gives the full -topology; schema definitions live in -\texttt{ergon\_core/core/persistence/}\allowbreak\texttt{\{graph,context,telemetry\}/models.py}. - -\begin{figure}[t] -\centering -\makebox[\linewidth][c]{\includegraphics[width=1.15\linewidth]{ergon_schema.png}} -\caption{Ergon's ten-table trajectory schema. All per-run -tables foreign-key to \texttt{runs.id}; the formal-model tags -\texttt{[N]}, \texttt{[E]}, \texttt{[C]}, \texttt{[A]}, -\texttt{[M]} identify the five tables that back each component -of $\tau_E$. 
Dashed borders mark append-only write-ahead-log -tables (INSERT only); solid borders mark mutable tables (INSERT -+ UPDATE). The single cross-group foreign key -\texttt{node\_id}: Execution~$\to$~Node is the join that lets -reasoning events and generation turns recover their position -in the task DAG.} -\label{fig:schema} -\end{figure} - -\paragraph{Graph layer.} \texttt{RunGraphNode} -(\texttt{persistence/graph/models.py:L44-L89}) stores the -containment tree directly --- each node carries its own -\texttt{parent\_node\_id} and integer \texttt{level}, so the -full hierarchy is one indexed SELECT rather than a recursive -CTE. \texttt{RunGraphEdge} (\texttt{:L96-L116}) stores data -dependencies with status $\in\{$\texttt{pending}, -\texttt{satisfied}, \texttt{invalidated}$\}$. -\texttt{RunGraphAnnotation} (\texttt{:L123-L165}) and -\texttt{RunGraphMutation} (\texttt{:L172-L186}) are both -append-only: each carries a per-run monotonic -\texttt{sequence}, and any state at any past point in a run is -reconstructed by replaying mutations up to that sequence -against an empty graph. Annotations additionally carry a -\texttt{namespace} so multiple subsystems (experiment config, -trainer hints, dashboard state) can version their metadata -independently on the same target. - -\paragraph{Execution layer.} \texttt{RunTaskExecution} -(\texttt{persistence/telemetry/models.py:L96-L157}) wraps one -attempt at a node: if a worker retries, each attempt gets its -own row with an incremented \texttt{attempt\_number}, so the -context events and generation turns of failed attempts are -preserved alongside the successful one. -\texttt{RunContextEvent} (\texttt{persistence/context/models.py:L25-L49}) -is the typed reasoning log --- the substrate backing $C$. 
It -is append-only, with a \texttt{sequence} unique per -\texttt{task\_execution\_id}, and a discriminated-union -\texttt{payload} whose shape depends on \texttt{event\_type}: -\begin{description}[leftmargin=0pt,itemindent=0pt,labelindent=0pt,labelsep=0.4em,itemsep=1pt,topsep=2pt,parsep=0pt,font=\normalfont\bfseries] -\sloppy -\item[system\_prompt, user\_message:] plain \texttt{text} - (user messages additionally carry \texttt{from\_worker\_key} - to attribute inter-worker sends). -\item[assistant\_text, thinking:] \texttt{text}, - \texttt{turn\_id}, and optional \texttt{turn\_token\_ids} / - \texttt{turn\_logprobs} (populated for vLLM, absent for - cloud APIs that do not expose token-level information). -\item[tool\_call:] \texttt{tool\_call\_id}, \texttt{tool\_name}, - \texttt{args}, plus the same token/logprob fields. -\item[tool\_result:] \texttt{tool\_call\_id}, - \texttt{tool\_name}, \texttt{result}, \texttt{is\_error}. -\end{description} -\texttt{RunGenerationTurn} -(\texttt{persistence/telemetry/models.py:L383-L464}) is the -per-model-call convenience extraction: one row per call, with -the raw response object, an extracted \texttt{response\_text}, -\texttt{token\_ids\_json}, \texttt{logprobs\_json}, and -\texttt{tool\_calls\_json}. It is redundant with -\texttt{RunContextEvent} --- the event log is the substrate, -the turn table is a reader-friendly index --- but carrying -both avoids rehydrating token arrays from event payloads on -every training step. - -\paragraph{Communication layer.} \texttt{Thread} -(\texttt{persistence/telemetry/models.py:L343-L353}) and -\texttt{ThreadMessage} (\texttt{:L360-L376}) are used only by -multi-agent runs. A thread is a durable channel between two -named agents within a run; each \texttt{ThreadMessage} carries -the sending agent's current \texttt{task\_execution\_id}, so a -replay can recover which reasoning step authored which -message. Single-agent runs leave both tables empty. 
- -\paragraph{Engine.} The production deployment is PostgreSQL -15; SQLite is used only for test fixtures, which forces JSON -payload columns to use the broadly portable \texttt{JSON} type -rather than \texttt{JSONB}. The consequence is that projection -code (\S\ref{app:projections}) must read JSON payloads -client-side rather than push them through Postgres path -operators, but this is cheap because projections run at export -time, not on the rollout hot path. Migrations are managed by -Alembic at \texttt{ergon\_core/migrations/}. - -\subsection{Mutation Kinds} -\label{app:system:mutations} - -Every change to the run graph flows through a single -dispatcher, \texttt{GraphRepository.\_log\_mutation} -(\texttt{ergon\_core/core/runtime/services/}\allowbreak\texttt{graph\_repository.py:L848-L890}), -which allocates the next per-run \texttt{sequence} (line -L848-L856: \texttt{SELECT MAX(sequence)+1} within the current -run), writes one \texttt{RunGraphMutation} row with -\texttt{old\_value}/\texttt{new\_value} snapshots, and fires -registered listeners asynchronously. Mutations are strictly -append-only: reversals produce new mutations, not edits to -prior ones; deletions are tombstones (see -\texttt{annotation.deleted} below). The system defines nine -mutation kinds (literal declaration at -\texttt{persistence/graph/models.py:L24-L34}), summarised in -Table~\ref{tab:mutations}. - -\begin{table}[h] -\centering -\small -\caption{The nine mutation kinds. \emph{Payload} lists the -domain-specific fields beyond the common -\texttt{(sequence, target\_type, target\_id, actor, -old\_value, new\_value)}. 
\emph{Invariants} are checked in the -dispatch path at -\texttt{runtime/services/graph\_repository.py}; when no entry -appears, the repository performs no domain validation and -defers enforcement to the experiment layer (per the repository -contract at \texttt{graph\_repository.py:L7-L9}).} -\label{tab:mutations} -\footnotesize -\setlength{\tabcolsep}{3pt} -\begin{tabular}{@{}l>{\raggedright\arraybackslash}p{0.22\linewidth}>{\raggedright\arraybackslash}p{0.42\linewidth}l@{}} -\toprule -\textbf{Kind} & \textbf{Payload} & \textbf{Invariants} & \textbf{File:L-L} \\ -\midrule -\texttt{node.added} & \texttt{task\_slug}, \texttt{instance\_key}, \texttt{description}, \texttt{status}, \texttt{assigned\_worker\_slug} & None. & \texttt{:L302-L312} \\ -\texttt{node.removed} & (same as above) & Cascades \texttt{edge.removed} for all incident edges first, then marks node terminal (node rows are not deleted). & \texttt{:L314-L358} \\ -\texttt{node.status\_changed} & \texttt{status} & If \texttt{only\_if\_not\_terminal=True}, skip if already in \{\texttt{COMPLETED}, \texttt{FAILED}, \texttt{CANCELLED}\}. & \texttt{:L360-L400} \\ -\texttt{node.field\_changed} & \texttt{field}, \texttt{value} & \texttt{field}~$\in$~\{\texttt{description}, \texttt{assigned\_worker\_slug}\} (whitelist at \texttt{:L62}); else \texttt{ValueError}. & \texttt{:L402-L434} \\ -\texttt{edge.added} & \texttt{source\_node\_id}, \texttt{target\_node\_id}, \texttt{status} & Both endpoints must exist (\texttt{DanglingEdgeError}); new edge must not create a cycle (DFS at \texttt{:L892-L915}). & \texttt{:L438-L474} \\ -\texttt{edge.removed} & (same as above) & None; edge row marked terminal (not deleted). & \texttt{:L476-L502} \\ -\texttt{edge.status\_changed} & \texttt{status} & None at repo layer; lifecycle \texttt{pending}~$\to$~\texttt{satisfied} / \texttt{invalidated} is experiment-layer policy. 
& \texttt{:L504-L531} \\ -\texttt{annotation.set} & \texttt{namespace}, \texttt{payload} & None; each call inserts a new annotation row (no upsert). Latest \texttt{sequence} within namespace is the current value. & \texttt{:L535-L573} \\ -\texttt{annotation.deleted} & \texttt{namespace}, \texttt{payload} & Soft delete: inserts a tombstone row with empty payload so the append-only log retains full history. & \texttt{:L647-L685} \\ -\bottomrule -\end{tabular} -\end{table} - -Three invariants are worth calling out because they resolve -otherwise-tricky concurrent-write conditions without -distributed coordination: - -\paragraph{Acyclicity on \texttt{edge.added} -(\texttt{:L892-L915}).} Each \texttt{edge.added} runs a DFS -from \texttt{target\_node\_id} following outgoing edges; if -the walk reaches \texttt{source\_node\_id}, the insertion is -rejected with \texttt{CycleError}. Enforcing this at the -repository rather than the experiment layer means every run -graph is a DAG by construction, which is what downstream -projections ($\pi_{\text{call-tree}}$, -$\pi_{\text{per-agent}}$) rely on for their topological walks. - -\paragraph{Terminal-write guard on -\texttt{node.status\_changed} (\texttt{:L381-L382}).} When -called with \texttt{only\_if\_not\_terminal=True}, the mutation -is skipped if the node already holds a terminal status -(\texttt{COMPLETED}, \texttt{FAILED}, or \texttt{CANCELLED} -per \texttt{status\_conventions.py:L23}). This single check -resolves cascade-cancellation races: concurrent paths that -both attempt to write a terminal status converge on -first-writer-wins without requiring a distributed lock. - -\paragraph{Tombstone semantics on -\texttt{annotation.deleted} (\texttt{:L662-L672}).} A -``deletion'' inserts a new annotation row with an empty -payload rather than removing prior rows. 
This preserves the -ability to replay the full annotation timeline --- the -diagnostic dashboards and the counterfactual-replay tooling -both depend on being able to reconstruct every historical -payload at any sequence, including the ones that were later -cleared. - -\paragraph{Edge-status lifecycle.} Conventionally -\texttt{pending}~$\to$~\texttt{satisfied} on dependency -resolution and \texttt{pending}~$\to$~\texttt{invalidated} on -upstream cancellation; backward transitions are not -prevented at the repository, but the experiment layer that -schedules workers treats \texttt{invalidated} as terminal. -The string values are constants in -\texttt{status\_conventions.py:L30-L32} and are not enforced -by a DB-level check constraint --- custom experiment graphs -are free to add their own edge statuses, at the cost of being -invisible to default dashboards. - -\subsection{Trainer Adapters} -\label{app:verl} - -Ergon decouples rollout execution from trainer execution by -serving a small HTTP surface that all three supported trainers -(TRL, VERL, OpenRLHF) consume. The server is a FastAPI -application at \texttt{ergon\_core/core/api/rollouts.py:L24-L89}; -trainer-side adapters are -$<100$-line shims in -\texttt{ergon\_infra/ergon\_infra/adapters/}. The trainers pull; -the server never pushes. - -\paragraph{Endpoints.} Four routes, all under -\texttt{/rollouts}. \texttt{POST /submit} accepts a -\texttt{SubmitRequest} (\texttt{core/rl/rollout\_types.py:L21-L35}) -carrying \texttt{definition\_id}, \texttt{num\_episodes}, -\texttt{policy\_version}, and an optional -\texttt{model\_target\_override}; it returns -\texttt{SubmitResponse} with a \texttt{batch\_id}, a list of -\texttt{run\_ids}, and the initial \texttt{BatchStatus} (status -202). 
\texttt{GET /\{batch\_id\}} returns a \texttt{PollResponse} -(\texttt{:L61-L69}) with \texttt{status}, \texttt{completed}, -\texttt{total}, the ordered list of completed -\texttt{Trajectory}~objects, and a list of -\texttt{EpisodeFailure}~records for any runs that crashed. -\texttt{DELETE /\{batch\_id\}} cancels an in-flight batch. -\texttt{POST /sync-weights} triggers an optional vLLM reload -for full-weight RFT scenarios (\texttt{:L67-L88}). Batch and -per-episode state are persisted in Postgres -(\texttt{RolloutBatch}, \texttt{RolloutBatchRun}), so trainers -can resume polling after either a trainer-side or server-side -restart. - -\paragraph{Intermediate representation.} The -\texttt{Trajectory} object (\texttt{rollout\_types.py:L38-L51}) -is the one wire-format contract the three adapters share. It -is a flat tuple: -\begin{center} -\texttt{(run\_id, agent\_id, prompt\_ids, completion\_ids, -logprobs, env\_mask, reward, num\_turns)}. -\end{center} -It is constructed at poll time by -\texttt{extract\_agent\_trajectories} in -\texttt{core/rl/extraction.py:L49-L117}. The extraction walks -the rollout's context events $C$ --- not its generation turns ---- so the wire format is grounded in the substrate's -append-only log rather than in a denormalised turn view. -Concretely: the function groups \texttt{RunContextEvent}~rows -by \texttt{worker\_binding\_key}, builds the prompt from the -initial \texttt{system\_prompt}/\texttt{user\_message} events, -and then walks the remaining events in sequence order. -Model-authored events (\texttt{assistant\_text}, -\texttt{tool\_call}, \texttt{thinking}) contribute their -\texttt{turn\_token\_ids} to \texttt{completion\_ids} with -\texttt{env\_mask=1} and their \texttt{turn\_logprobs} to -\texttt{logprobs}; environment events (\texttt{tool\_result}) -contribute tokenised text to \texttt{completion\_ids} with -\texttt{env\_mask=0} and zero-padded \texttt{logprobs}. 
The -scalar \texttt{reward} is attached by -\texttt{RewardStrategy.assign} (\texttt{extraction.py:L103}) -from the run's \texttt{RunTaskEvaluation} rows. Because -$\tau_E$ is event-sourced, the entire IR is reproducible at -export time against the pinned Postgres row versions --- the -same rollout replayed tomorrow produces byte-identical IR. - -\paragraph{Adapters.} Each adapter is a shim that calls -\texttt{submit}, polls \texttt{\{batch\_id\}}, and maps the -returned \texttt{Trajectory} into the native batch type the -trainer expects. Table~\ref{tab:adapters} summarises; the -longest file is $\sim$90 lines. - -\begin{table}[h] -\centering -\small -\caption{Trainer adapters. All three are thin shims over the -shared \texttt{/rollouts} endpoints; the framework-specific -work is field renaming and wrapping in the native batch type.} -\label{tab:adapters} -\begin{tabular}{@{}l>{\raggedright\arraybackslash}p{0.24\linewidth}>{\raggedright\arraybackslash}p{0.20\linewidth}>{\raggedright\arraybackslash}p{0.34\linewidth}@{}} -\toprule -\textbf{Trainer} & \textbf{Adapter file} & \textbf{Output type} & \textbf{Field renames and quirks} \\ -\midrule -TRL (GRPO) & \texttt{trl\_http.py:L25-L91} & \texttt{dict} (GRPOTrainer contract) & \texttt{reward}~$\to$~\texttt{completion\_reward}; synchronous \texttt{httpx.Client}; batch of $n$ trajectories per call. \\ -VERL & \texttt{verl\_http.py:L23-L82} & \texttt{AgentLoopOutput} & \texttt{completion\_ids} $\to$ \texttt{response\_ids}; \texttt{env\_mask} $\to$ \texttt{response\_mask}; async; single episode per call (streaming integration). \\ -OpenRLHF & \texttt{openrlhf\_http.py}\allowbreak\texttt{:L24-L84} & \texttt{dict} (\texttt{input\_ids}, \texttt{response\_ids}, \texttt{logprobs}, \texttt{reward}) & Drops \texttt{env\_mask} (OpenRLHF's return shape does not carry turn masks); module-level \texttt{configure()} for one-time setup; async; single episode per call. 
\\ -\bottomrule -\end{tabular} -\end{table} - -Because the trainers consume the same \texttt{Trajectory} -contract, a model trained under one trainer can be evaluated -against another trainer's harness without re-exporting: the -Ergon substrate is written once, and the format difference is -absorbed in the adapter layer. A new trainer is added by -writing one adapter file; no changes to the substrate, the -server, or the existing adapters are required. The repository -ships no SkyRL adapter at submission time, but the pattern -supports one at the same cost as the existing three. - - -% ---------------------------------------------------------------------------- -% Appendix E --- Tech-Stack Integrations List. -% Reframed (v5.6) from v4's Extended Community / Framework Capability Matrix. -% Purpose: descriptive list of what Ergon plugs into / exports to --- at the -% agents layer (Pydantic AI, LangGraph, CrewAI, AutoGen, Google ADK, Claude -% Code), RL layer (TRL, VERL, OpenRLHF, SkyRL, ProRL Agent, AReaL, AgentGym-RL, -% RAGEN, MARTI, etc.), and trajectory-format export targets (the five -% projections' canonical formats). Not a normative capability matrix; a -% descriptive integrations map. -% Content: written fresh in Session 11, filling in real integration status. -% Framing: "Ergon hosts existing agent frameworks (X, Y, Z) via adapters, -% feeds existing RL trainers (A, B, C) via projections, and exports to the -% five community canonical trajectory formats surveyed in Sec.~2.2." -% ---------------------------------------------------------------------------- - -\section{Tech-Stack Integrations} -\label{app:integrations} - -This appendix is a descriptive integrations \emph{map}, not a -normative capability matrix. 
For each of three layers --- agent -frameworks the substrate hosts as inner runtimes, RL trainers -the substrate feeds via HTTP adapters, and canonical trajectory -formats the substrate projects to --- we list the concrete -integrations shipped at submission time, together with the -integrations on the near-term roadmap. Rows marked -\emph{integrated} are exercised by the experiments in -\S\ref{sec:validation} or by the Ergon test suite; rows marked -\emph{planned} are scoped in RFCs in the Ergon repository under -\texttt{docs/rfcs/active/} and do not contribute to any result -reported in this paper. - -\paragraph{Agents layer.} Ergon hosts third-party agent loops -as inner runtimes behind an \texttt{AgentRuntimeAdapter} -contract: the adapter wraps the framework's loop, serialises -each turn as a \texttt{RunContextEvent} (see -\S\ref{app:system:postgres}), and replays stored events to -reconstruct framework-native message state on resumption. One -adapter ships today; five are scoped -(Table~\ref{tab:integrations:agents}). - -\begin{table}[h] -\centering -\small -\setlength{\tabcolsep}{4pt} -\caption{Agent-framework integrations. Integrated rows are -exercised by the flexible-agent worker. 
Planned rows are scoped -in the referenced RFC in the Ergon repository.} -\label{tab:integrations:agents} -\begin{tabular}{@{}l l >{\raggedright\arraybackslash}p{0.42\linewidth}@{}} -\toprule -\textbf{Framework} & \textbf{Status} & \textbf{Evidence / RFC} \\ -\midrule -Pydantic AI & integrated & \texttt{workers/baselines/}\allowbreak\texttt{react\_worker.py:L28-L105} wraps \texttt{Agent.iter()}; context replay at \texttt{persistence/context/}\allowbreak\texttt{assembly.py:L94-L130} \\ -LangGraph & planned & \texttt{agent-framework-adapter-layer} (state-graph replay) \\ -CrewAI & planned & \texttt{agent-framework-adapter-layer} (task-delegation shim) \\ -AutoGen & planned & \texttt{agent-framework-adapter-layer} (per-agent \texttt{worker\_binding\_key}) \\ -Google ADK & planned & \texttt{agent-framework-adapter-layer} (state-machine replay) \\ -Claude Code & planned & \texttt{agent-framework-adapter-layer} (closest native fit) \\ -\bottomrule -\end{tabular} -\end{table} - -\paragraph{RL trainers.} Ergon serves a FastAPI -\texttt{/rollouts} surface consumed by framework-specific -shims; the handshake is documented in Appendix~\ref{app:verl}. -Three shims ship today --- TRL, VERL, OpenRLHF --- each -$\sim$80--90 lines. Six are scoped -(Table~\ref{tab:integrations:trainers}). - -\begin{table}[h] -\centering -\small -\setlength{\tabcolsep}{4pt} -\caption{RL-trainer integrations. Integrated rows are the three -adapters compared in Table~\ref{tab:adapters}. 
Planned rows are -scoped in \texttt{rl-trainer-adapter-expansion}, which also -formalises the HTTP handshake as a \texttt{TrainerHttpAdapter} -\mbox{Protocol}.} -\label{tab:integrations:trainers} -\begin{tabular}{@{}l l >{\raggedright\arraybackslash}p{0.42\linewidth}@{}} -\toprule -\textbf{Trainer} & \textbf{Status} & \textbf{Evidence / RFC} \\ -\midrule -TRL (GRPO) & integrated & \texttt{adapters/trl\_http.py:L1-L92} \\ -VERL & integrated & \texttt{adapters/verl\_http.py:L1-L83} (\texttt{@register("ergon")}) \\ -OpenRLHF & integrated & \texttt{adapters/openrlhf\_http.py:L1-L85} \\ -SkyRL & planned & \texttt{rl-trainer-adapter-expansion} \\ -ProRL Agent & planned & \texttt{rl-trainer-adapter-expansion} \\ -AReaL & planned & \texttt{rl-trainer-adapter-expansion} \\ -AgentGym-RL & planned & \texttt{rl-trainer-adapter-expansion} \\ -RAGEN & planned & \texttt{rl-trainer-adapter-expansion} \\ -MARTI & planned & \texttt{rl-trainer-adapter-expansion} \\ -\bottomrule -\end{tabular} -\end{table} - -\paragraph{Projection / export formats.} The five canonical -trajectory shapes surveyed in \S\ref{sec:system:format} each -correspond to a projection operator over the substrate. Two -ship today (step-indexed tuples and per-agent streams, both -produced by \texttt{extract\_agent\_trajectories} at -\texttt{core/rl/extraction.py:L49-L117}); three are scoped -(Table~\ref{tab:integrations:projections}). The three planned -projections are pure reads over the existing schema -(option-tagged, call-tree) or reserve a new annotation -namespace (MCTS) --- no DB migration is required for any of -them. - -\begin{table}[h] -\centering -\small -\setlength{\tabcolsep}{4pt} -\caption{Projection-operator integrations. The five shapes map -to the trajectory-format survey in \S\ref{sec:system:format}. 
-Planned rows are one RFC each.} -\label{tab:integrations:projections} -\begin{tabular}{@{}l l >{\raggedright\arraybackslash}p{0.42\linewidth}@{}} -\toprule -\textbf{Projection} & \textbf{Status} & \textbf{Evidence / RFC} \\ -\midrule -Step-indexed tuples & integrated & \texttt{core/rl/extraction.py:L49-L117} \\ -Per-agent streams & integrated & same, keyed by \texttt{worker\_binding\_key} \\ -Option-tagged (semi-MDP) & planned & \texttt{projection-operator-option-tagged} \\ -Call-tree (nested) & planned & \texttt{projection-operator-call-tree} \\ -MCTS search-tree & planned & \texttt{projection-operator-mcts} (\texttt{mcts.*} annotation namespace) \\ -\bottomrule -\end{tabular} -\end{table} - -Of the twenty rows in the three tables, six are integrated and -fourteen are planned. The integrated subset is sufficient for -the experiments reported in \S\ref{sec:validation}; the planned -subset is what Ergon must ship to fully back the substrate -framing of \S\ref{sec:system}. Each planned RFC scopes the -code additions required and notes the paper-parity dependency -explicitly. - -% ---------------------------------------------------------------------------- -% Appendix F --- Experimental Setup. Consolidates former appendices F (Agent -% Action Spaces), G (Benchmark Details), H (Flexible Agent and Projection -% Details), and I (Cross-harness Reconciliation Methodology) into one -% themed appendix. Old \section labels are preserved as \label aliases on -% the new subsections so body refs (\ref{app:reconciliation}, -% \ref{app:benchmarks}) continue to resolve. 
-% ---------------------------------------------------------------------------- - -\section{Experimental Setup} -\label{app:setup} - -This appendix consolidates the experimental-setup details supporting -\S\ref{sec:validation}: per-benchmark agent action spaces -(\S\ref{app:actions}), benchmark selection and statistics -(\S\ref{app:benchmarks}), the flexible-agent scaffold used as the -workload-generating system (\S\ref{app:fivescaffolds}), and the -cross-harness reconciliation methodology for the SWE-Bench Verified -comparison (\S\ref{app:reconciliation}). - -\subsection{Agent action spaces} -\label{app:actions} - -Table~\ref{tab:actions} lists the full action space available to -the flexible-agent worker on each benchmark. Every benchmark -agent has access to the four subtask-decomposition tools (shared -across all benchmarks); task-specific tools vary. - -\begin{table}[h] -\centering -\small -\caption{Per-benchmark agent action space. The four -subtask-decomposition tools are shared across all benchmarks; -remaining tools are task-specific.} -\label{tab:actions} -\begin{tabular}{@{}lll@{}} -\toprule -\textbf{Benchmark} & \textbf{Tool} & \textbf{What it does} \\ -\midrule -\textit{All} & \texttt{spawn\_subtask} & Create a child subtask with a description and required inputs \\ -\textit{All} & \texttt{cancel\_subtask} & Cancel a subtask by ID (late cancellation is distinct from completion) \\ -\textit{All} & \texttt{wait\_on\_subtask} & Block on a subtask ID until it reaches a terminal status \\ -\textit{All} & \texttt{report\_result} & Write this task's result and mark the task complete \\ -\midrule -MiniF2F & \texttt{lean\_repl} & \todo{short description} \\ -MiniF2F & \texttt{lean\_check} & \todo{short description} \\ -\midrule -Research Rubrics & \texttt{web\_search} & \todo{short description (Tavily)} \\ -Research Rubrics & \texttt{read\_document} & \todo{short description (httpx+trafilatura)} \\ -\midrule -SWE-Bench Verified & \texttt{bash} & \todo{short 
description} \\ -SWE-Bench Verified & \texttt{edit\_file} & \todo{short description} \\ -\bottomrule -\end{tabular} -\end{table} - -The subtask-decomposition tools are exposed uniformly: the -worker's system prompt describes them mechanically and gives no -guidance on when to use them, so observed delegation structure -comes from the agent rather than from prompting (see -\S\ref{sec:validation:setup}). - - -\subsection{Benchmark details} -\label{app:benchmarks} - -\placeholder{0.95}{% - \textbf{Table A2: MiniF2F problem selection.} - Difficulty levels, problem categories, proof-length statistics, - subset used in \S\ref{sec:validation}. -} - -\placeholder{0.95}{% - \textbf{Table A3: Research Rubrics question distribution.} - Topic distribution, rubric criteria counts, evaluator - configuration. -} - -\placeholder{0.95}{% - \textbf{Table A4: SWE-Bench Verified subset.} - Per-repository counts in the 100--150 instance subset used in - \S\ref{sec:validation}, sampling procedure, comparison to full - Verified statistics, E2B template details. -} - - -\subsection{Flexible agent and projection details} -\label{app:fivescaffolds} - -\subsubsection{Benchmark delegation toolkit} -\label{app:benchmark-action-space} - -The flexible-agent worker of \S\ref{sec:validation} is given a -particular set of delegation actions that mutate the substrate. -This action space is a design choice of our benchmark, not a -feature of Ergon itself --- another research team could expose a -different set of delegation actions against the same substrate -and run a different benchmark. The toolkit we expose consists of -six verbs, each corresponding to a named mutation pattern on -$(N, E, A)$: -\begin{itemize}\itemsep 0pt - \item \texttt{add\_subtask}: creates a single child node and - dispatches it for asynchronous execution. - \item \texttt{plan\_subtasks}: atomically creates a sub-DAG of - children validated by Kahn's algorithm, with root children - dispatched to a concurrency-15 executor. 
- \item \texttt{cancel\_task}: marks a node terminal, cascades to - descendants, and invalidates outgoing dependency edges - (cancelled sub-trees remain in $(N, E)$ for analysis). - \item \texttt{refine\_task}: edits a non-running node's - description with field-history preserved. - \item \texttt{restart\_task}: resets a terminal node to - \textsc{pending} and cascades invalidation to downstream - dependencies. - \item \texttt{list\_subtasks} and \texttt{get\_subtask}: - read-only observations of the current containment tree. -\end{itemize} -Each verb maps to a named mutation type in $M$, so downstream -evaluators distinguish delegation, cancellation, and refinement -from ordinary tool use without custom parsing --- independently -of what specific verbs our benchmark chose to expose. - -\subsubsection{Agent details and projections} - -\todo{For the flexible-agent worker of \S\ref{sec:validation}: -system prompt, full tool inventory (benchmark action space above -plus task-specific tools), one-shot example if any, and the -turn-budget and stopping rules. For each of the five projection -operators $\pi_{\text{step}}$, $\pi_{\text{per-agent}}$, -$\pi_{\text{call-tree}}$, $\pi_{\text{macro}}$, -$\pi_{\text{json-log}}$: one worked example on one trajectory -from each task family, the drops manifest it produces, and -discussion of which behavioural metrics each projection -structurally cannot preserve.} - - -% ---------------------------------------------------------------------------- -% Appendix I (Dataset Onboarding Guide), Appendix J (Automated Experimental -% Infrastructure), and Appendix K (Fault-Injection Methodology) were cut in -% 2026-04-21. Rationale: all three had zero body-text citations (or the one -% K cite was rewritten to point at Appendix D's WAL/mutation-log invariants). 
-% I was a how-to-use-Ergon tutorial that belongs in the repo README; J was -% orchestration lore that belongs in the repo README; K (fault injection) is -% not part of the new experiment setup. Downstream appendices (formerly L/M/N) -% are automatically relabelled to I/J/K by LaTeX's \section counter; no -% manual renumbering needed. -% ---------------------------------------------------------------------------- - - -\subsection{Cross-harness reconciliation methodology} -\label{app:reconciliation} - -\todo{Session 11: Cross-harness reconciliation methodology appendix. Source: Weekend 1 Sec.~4.5 experiment. -Structure: (a) convention specification --- union convention justified against alternatives (SWE-agent's convention, Agentless's convention, minimal common denominator); (b) download and re-grading code walkthrough; (c) what the re-grading preserves vs what it cannot preserve (explicit drops manifest for the reconciliation convention itself); (d) drops manifests for the SWE-agent $\to$ rollout-card and Agentless $\to$ rollout-card ingestions per Sec.~4.2 ``Cross-harness reconciliation''; (e) \texttt{no\_generation} convention analysis --- 50 instances SWE-agent, 4 Agentless; why Convention A (exclude) and Convention B (include as zero) bracket the reasonable range of harness-level choices.} - -% ---------------------------------------------------------------------------- -% Appendix J (Additional Rollout Traces) was cut in 2026-04-21. The per- -% benchmark trace figures (Fig A3 MiniF2F, Fig A4 Research Rubrics, Fig A5 -% SWE-Bench Verified) were dropped --- they add rollout colour but don't -% back any claim. The crown-jewel six-projection figure (formerly Fig A2) -% was relocated to Appendix C (app:projections) as Fig~\ref{fig:sixproj} -% since it is the visual statement of the projection-loss argument that -% appendix makes in prose. LaTeX auto-renumbers the Benchmark Card below -% from appendix K to J. 
-% ----------------------------------------------------------------------------
-
-% ----------------------------------------------------------------------------
-% Appendix G --- Behavioural Quantities (moved from §4.2 body on 2026-04-22
-% per signposting / length pass; see proposed_edits_signposting.md C6).
-% ----------------------------------------------------------------------------
-
-\section{Behavioural Quantities}
-\label{app:quantities}
-
-This appendix catalogues the six behavioural quantities
-referenced in \S\ref{sec:validation:setup}. Each is a
-canonical analysis of one of the five research communities
-surveyed in \S\ref{sec:problem:communities}. The set is
-illustrative: a rollout card supports arbitrarily many
-analyses, and the three quantities exercised in
-\S\ref{sec:validation:results} (abandonment ratio by depth,
-per-agent sibling embedding distance, and the
-\texttt{no\_generation} split in the SWE-Bench reconciliation)
-are drawn from this set.
-
-\begin{table}[h]
-\centering
-\small
-\caption{Six behavioural quantities referenced in
-\S\ref{sec:validation:setup}, each the canonical analysis of
-one of the five research communities from
-\S\ref{sec:problem:communities}. 
The set is illustrative: a -rollout card supports arbitrarily many such analyses.} -\label{tab:quantities} -\begin{tabular}{@{}p{0.32\textwidth}p{0.28\textwidth}p{0.32\textwidth}@{}} -\toprule -\textbf{Quantity} & \textbf{Community} & \textbf{Computed from} \\ -\midrule -Step-indexed return under episode mask & Long-horizon LLM-agentic RL \citep{chen2025loop, wang2025ragen} & \texttt{events.jsonl} token spans \\ -Option termination frequency & HRL / macro-action \citep{bacon2017optioncritic} & \texttt{mutations.jsonl} status transitions \\ -Call-tree depth distribution & Recursive LMs \citep{zhu2024redel} & \texttt{nodes.jsonl} \texttt{parent\_id} chains \\ -Per-agent sibling embedding distance & Fixed-role MAS \citep{cemri2025mast} & \texttt{nodes.jsonl} + \texttt{events.jsonl} \\ -Node visit entropy across frontier & MCTS-based training \citep{rstarmath2025, feng2024restmcts} & \texttt{annotations.jsonl} (MCTS namespace) \\ -Dispatch-and-wait rate & Async MAS & \texttt{events.jsonl} + \texttt{mutations.jsonl} \\ -\bottomrule -\end{tabular} -\end{table} - -% ---------------------------------------------------------------------------- -% Appendix J --- Benchmark Card: Evaluative Role, Assumptions, Limitations. -% CARRY VERBATIM. 
-% ---------------------------------------------------------------------------- - -\section{Benchmark Card: Evaluative Role, Assumptions, and Limitations} -\label{app:benchmarkcard} - -\paragraph{Evaluative role.} -The Ergon benchmark suite, paired with the trajectory -representation and projection operators, supports evaluative -claims about dynamic-delegation agents that were previously not -comparable across research communities: (i) whether an agent -uses concurrent dispatch when it is available; (ii) whether an -agent cancels sub-tasks that are not making progress; (iii) -whether an agent's delegation structure contains dependency -diamonds that exceed tree-shape formalisms; (iv) whether the -agent's reasoning expresses intent that its recorded structural -actions fail to realise; (v) whether a community's preferred -trajectory format preserves each of these behaviours under its -projection operator. - -\paragraph{Assumptions.} -\begin{itemize} - \item The tool implementations (web search, document retrieval, - Lean REPL, SWE-Bench harness) are stable at their released - versions and latency distributions; substantial drift in - tool-provider behaviour may invalidate fault-injection - calibration. - \item The rubric-based evaluator (GPT-4o-mini, for Research - Rubrics) is treated as a noisy but calibrated judge; we do not - claim evaluator infallibility and provide inter-rater agreement - statistics in Appendix~\ref{app:benchmarks}. - \item The flexible-agent worker is given access to the - subtask-decomposition tools but not prompted to use them; - observed delegation structure reflects the backbone model's - untutored decomposition under a minimal system prompt, not a - researcher-imposed strategy. -\end{itemize} - -\paragraph{Limitations.} -\begin{itemize} - \item Three task families; generalisation claims to other - long-horizon settings should be made cautiously. The - behavioural coverage of \S\ref{sec:validation} is complementary - but not exhaustive. 
- \item LLM-judge failure modes (reward hacking, distribution - shift) are documented but not fully characterised. - \item Tool-provider API drift may require re-calibration; - reproducibility bundle pins tool-provider snapshots. SWE-Bench - Verified's per-repository environment specs are pinned to the - \texttt{swebench} package version used. -\end{itemize} - -\paragraph{Intended use.} -Evaluation of dynamic-delegation policies via the substrate and -projection-preservation framework. The benchmark suite is \emph{not} -intended as a stand-alone prover, a factual QA benchmark, or a -general agentic capability test; claims in those directions should -use purpose-built benchmarks and metrics. - -\paragraph{Failure modes.} -Rubric reward hacking on Research Rubrics; MiniF2F proof leakage -from the backbone's training data; SWE-Bench test-pattern overfit -on popular repositories; tool-provider API drift; LLM-judge -sycophancy. - - - -% ============================================================================ -% Mandatory NeurIPS 2026 Paper Checklist -% ============================================================================ -\input{checklist.tex} - -\end{document} diff --git a/ergon_paper_overleaf_edit/neurips_2026.sty b/ergon_paper_overleaf_edit/neurips_2026.sty deleted file mode 100644 index c2ac0132..00000000 --- a/ergon_paper_overleaf_edit/neurips_2026.sty +++ /dev/null @@ -1,437 +0,0 @@ -% partial rewrite of the LaTeX2e package for submissions to the -% Conference on Neural Information Processing Systems (NeurIPS): -% -% - uses more LaTeX conventions -% - line numbers at submission time replaced with aligned numbers from -% lineno package -% - \nipsfinalcopy replaced with [final] package option -% - automatically loads times package for authors -% - loads natbib automatically; this can be suppressed with the -% [nonatbib] package option -% - adds foot line to first page identifying the conference -% - adds preprint option for submission to e.g. 
arXiv -% - conference acronym modified -% - update foot line to display the track name -% -% Roman Garnett (garnett@wustl.edu) and the many authors of -% nips15submit_e.sty, including MK and drstrip@sandia -% -% last revision: January 2026 - -\NeedsTeXFormat{LaTeX2e} -\ProvidesPackage{neurips_2026}[2026-01-29 NeurIPS 2026 submission/camera-ready style file] - -% declare final option, which creates camera-ready copy -\newif\if@neuripsfinal\@neuripsfinalfalse -\DeclareOption{final}{ - \@neuripsfinaltrue - \@anonymousfalse -} - -% declare nonatbib option, which does not load natbib in case of -% package clash (users can pass options to natbib via -% \PassOptionsToPackage) -\newif\if@natbib\@natbibtrue -\DeclareOption{nonatbib}{ - \@natbibfalse -} - -% declare preprint option, which creates a preprint version ready for -% upload to, e.g., arXiv -\newif\if@preprint\@preprintfalse -\DeclareOption{preprint}{ - \@preprinttrue - \@anonymousfalse -} - -% determine the track of the paper in camera-ready mode -\newif\if@main\@maintrue -\DeclareOption{main}{ - \@maintrue - \newcommand{\@trackname}{\@neuripsordinal\ Conference on Neural Information Processing Systems (NeurIPS \@neuripsyear).} -} -\newif\if@position\@positionfalse -\DeclareOption{position}{ - \@positiontrue - \newcommand{\@trackname}{\@neuripsordinal\ Conference on Neural Information Processing Systems (NeurIPS \@neuripsyear). Position Paper Track.} -} -\newif\if@eandd\@eanddfalse -\DeclareOption{eandd}{ - \@eanddtrue -\if@neuripsfinal\@anonymousfalse\else\if@preprint\@anonymousfalse\else\@anonymoustrue\fi\fi - \newcommand{\@trackname}{\@neuripsordinal\ Conference on Neural Information Processing Systems (NeurIPS \@neuripsyear). Track on Evaluations and Datasets.} -} -\newif\if@creativeai\@creativeaifalse -\DeclareOption{creativeai}{ - \@creativeaitrue - \@anonymousfalse - \newcommand{\@trackname}{\@neuripsordinal\ Conference on Neural Information Processing Systems (NeurIPS \@neuripsyear). 
Creative AI Track.} -} - -% For anonymous or non-anonymous -\newif\if@anonymous\@anonymoustrue - -% For workshop papers -\newcommand{\@workshoptitle}{} -\newcommand{\workshoptitle}[1]{\renewcommand{\@workshoptitle}{#1}} - -\newif\if@workshop\@workshopfalse -\DeclareOption{sglblindworkshop}{ - \@workshoptrue - \@anonymousfalse - \newcommand{\@trackname}{\@neuripsordinal\ Conference on Neural Information Processing Systems (NeurIPS \@neuripsyear). Workshop: \@workshoptitle.} -} -\DeclareOption{dblblindworkshop}{ - \@workshoptrue - \newcommand{\@trackname}{\@neuripsordinal\ Conference on Neural Information Processing Systems (NeurIPS \@neuripsyear). Workshop: \@workshoptitle.} -} -\DeclareOption{nonanonymous}{ - \@anonymousfalse -} - -\ProcessOptions\relax - -% fonts -\renewcommand{\rmdefault}{ptm} -\renewcommand{\sfdefault}{phv} - -% change this every year for notice string at bottom -\newcommand{\@neuripsordinal}{40th} -\newcommand{\@neuripsyear}{2026} -\newcommand{\@neuripslocation}{Sydney} - -% acknowledgments -\usepackage{environ} -\newcommand{\acksection}{\section*{Acknowledgments and Disclosure of Funding}} -\NewEnviron{ack}{% - \acksection - \BODY -} - - -% load natbib unless told otherwise -\if@natbib - \RequirePackage{natbib} -\fi - - - - - -% set page geometry -\usepackage[verbose=true,letterpaper]{geometry} -\AtBeginDocument{ - \newgeometry{ - textheight=9in, - textwidth=5.5in, - top=1in, - headheight=12pt, - headsep=25pt, - footskip=30pt - } - \@ifpackageloaded{fullpage} - {\PackageWarning{neurips_2026}{fullpage package not allowed! 
Overwriting formatting.}} - {} -} - -\widowpenalty=10000 -\clubpenalty=10000 -\flushbottom -\sloppy - - -% font sizes with reduced leading -\renewcommand{\normalsize}{% - \@setfontsize\normalsize\@xpt\@xipt - \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@ - \abovedisplayshortskip \z@ \@plus 3\p@ - \belowdisplayskip \abovedisplayskip - \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@ -} -\normalsize -\renewcommand{\small}{% - \@setfontsize\small\@ixpt\@xpt - \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@ - \abovedisplayshortskip \z@ \@plus 2\p@ - \belowdisplayskip \abovedisplayskip - \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@ -} -\renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt} -\renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt} -\renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt} -\renewcommand{\large}{\@setfontsize\large\@xiipt{14}} -\renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}} -\renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}} -\renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}} -\renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}} - - -% Force \tiny to be no smaller than 6pt -\renewcommand{\tiny}{\fontsize{6pt}{7pt}\selectfont} - -% Force \scriptsize to be no smaller than 7pt -\renewcommand{\scriptsize}{\fontsize{7pt}{8pt}\selectfont} - -% Force \footnotesize to be no smaller than 8pt -\renewcommand{\footnotesize}{\fontsize{8pt}{9.5pt}\selectfont} - -% sections with less space -\providecommand{\section}{} -\renewcommand{\section}{% - \@startsection{section}{1}{\z@}% - {-2.0ex \@plus -0.5ex \@minus -0.2ex}% - { 1.5ex \@plus 0.3ex \@minus 0.2ex}% - {\large\bf\raggedright}% -} -\providecommand{\subsection}{} -\renewcommand{\subsection}{% - \@startsection{subsection}{2}{\z@}% - {-1.8ex \@plus -0.5ex \@minus -0.2ex}% - { 0.8ex \@plus 0.2ex}% - {\normalsize\bf\raggedright}% -} -\providecommand{\subsubsection}{} -\renewcommand{\subsubsection}{% - \@startsection{subsubsection}{3}{\z@}% - {-1.5ex \@plus 
-0.5ex \@minus -0.2ex}% - { 0.5ex \@plus 0.2ex}% - {\normalsize\bf\raggedright}% -} -\providecommand{\paragraph}{} -\renewcommand{\paragraph}{% - \@startsection{paragraph}{4}{\z@}% - {1.5ex \@plus 0.5ex \@minus 0.2ex}% - {-1em}% - {\normalsize\bf}% -} -\providecommand{\subparagraph}{} -\renewcommand{\subparagraph}{% - \@startsection{subparagraph}{5}{\z@}% - {1.5ex \@plus 0.5ex \@minus 0.2ex}% - {-1em}% - {\normalsize\bf}% -} -\providecommand{\subsubsubsection}{} -\renewcommand{\subsubsubsection}{% - \vskip5pt{\noindent\normalsize\rm\raggedright}% -} - -% float placement -\renewcommand{\topfraction }{0.85} -\renewcommand{\bottomfraction }{0.4} -\renewcommand{\textfraction }{0.1} -\renewcommand{\floatpagefraction}{0.7} - -\newlength{\@neuripsabovecaptionskip}\setlength{\@neuripsabovecaptionskip}{7\p@} -\newlength{\@neuripsbelowcaptionskip}\setlength{\@neuripsbelowcaptionskip}{\z@} - -\setlength{\abovecaptionskip}{\@neuripsabovecaptionskip} -\setlength{\belowcaptionskip}{\@neuripsbelowcaptionskip} - -% swap above/belowcaptionskip lengths for tables -\renewenvironment{table} - {\setlength{\abovecaptionskip}{\@neuripsbelowcaptionskip}% - \setlength{\belowcaptionskip}{\@neuripsabovecaptionskip}% - \@float{table}} - {\end@float} - -% footnote formatting -\setlength{\footnotesep }{6.65\p@} -\setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@} -\renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@} -\setcounter{footnote}{0} - -% paragraph formatting -\setlength{\parindent}{\z@} -\setlength{\parskip }{5.5\p@} - -% list formatting -\setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@} -\setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@} -\setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} -\setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} -\setlength{\leftmargin }{3pc} -\setlength{\leftmargini }{\leftmargin} -\setlength{\leftmarginii }{2em} -\setlength{\leftmarginiii}{1.5em} -\setlength{\leftmarginiv }{1.0em} -\setlength{\leftmarginv }{0.5em} 
-\def\@listi {\leftmargin\leftmargini} -\def\@listii {\leftmargin\leftmarginii - \labelwidth\leftmarginii - \advance\labelwidth-\labelsep - \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@ - \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ - \itemsep \parsep} -\def\@listiii{\leftmargin\leftmarginiii - \labelwidth\leftmarginiii - \advance\labelwidth-\labelsep - \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ - \parsep \z@ - \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@ - \itemsep \topsep} -\def\@listiv {\leftmargin\leftmarginiv - \labelwidth\leftmarginiv - \advance\labelwidth-\labelsep} -\def\@listv {\leftmargin\leftmarginv - \labelwidth\leftmarginv - \advance\labelwidth-\labelsep} -\def\@listvi {\leftmargin\leftmarginvi - \labelwidth\leftmarginvi - \advance\labelwidth-\labelsep} - -% create title -\providecommand{\maketitle}{} -\renewcommand{\maketitle}{% - \par - \begingroup - \renewcommand{\thefootnote}{\fnsymbol{footnote}} - % for perfect author name centering - \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}} - % The footnote-mark was overlapping the footnote-text, - % added the following to fix this problem (MK) - \long\def\@makefntext##1{% - \parindent 1em\noindent - \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1 - } - \thispagestyle{empty} - \@maketitle - \@thanks - \@notice - \endgroup - \let\maketitle\relax - \let\thanks\relax -} - -% rules for title box at top of first page -\newcommand{\@toptitlebar}{ - \hrule height 4\p@ - \vskip 0.25in - \vskip -\parskip% -} -\newcommand{\@bottomtitlebar}{ - \vskip 0.29in - \vskip -\parskip - \hrule height 1\p@ - \vskip 0.09in% -} - -% create title (includes both anonymized and non-anonymized versions) -\providecommand{\@maketitle}{} -\renewcommand{\@maketitle}{% - \vbox{% - \hsize\textwidth - \linewidth\hsize - \vskip 0.1in - \@toptitlebar - \centering - {\LARGE\bf \@title\par} - \@bottomtitlebar - \if@anonymous - \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@} - Anonymous Author(s) \\ - Affiliation \\ - Address \\ - \texttt{email} 
\\ - \end{tabular}% - \else - \def\And{% - \end{tabular}\hfil\linebreak[0]\hfil% - \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% - } - \def\AND{% - \end{tabular}\hfil\linebreak[4]\hfil% - \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% - } - \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}% - \fi - \vskip 0.3in \@minus 0.1in - } -} - -% add conference notice to bottom of first page -\newcommand{\ftype@noticebox}{8} -\newcommand{\@notice}{% - % give a bit of extra room back to authors on first page - \enlargethispage{2\baselineskip}% - \@float{noticebox}[b]% - \footnotesize\@noticestring% - \end@float% -} - -% abstract styling -\renewenvironment{abstract}% -{% - \vskip 0.075in% - \centerline% - {\large\bf Abstract}% - \vspace{0.5ex}% - \begin{quote}% -} -{ - \par% - \end{quote}% - \vskip 1ex% -} - -% For the paper checklist -\newcommand{\answerYes}[1][]{\textcolor{blue}{[Yes]#1}} -\newcommand{\answerNo}[1][]{\textcolor{orange}{[No]#1}} -\newcommand{\answerNA}[1][]{\textcolor{gray}{[N/A]#1}} -\newcommand{\answerTODO}[1][]{\textcolor{red}{\bf [TODO]}} -\newcommand{\justificationTODO}[1][]{\textcolor{red}{\bf [TODO]}} - -% handle tweaks for camera-ready copy vs. submission copy -\if@preprint - \newcommand{\@noticestring}{% - Preprint.% - } -\else - \if@neuripsfinal - \newcommand{\@noticestring}{ - \@trackname - } - \else - \newcommand{\@noticestring}{% - Submitted to \@neuripsordinal\/ Conference on Neural Information Processing Systems (NeurIPS \@neuripsyear). 
Do not distribute.% - } - - % hide the acknowledgements - \NewEnviron{hide}{} - \let\ack\hide - \let\endack\endhide - - % line numbers for submission - \RequirePackage{lineno} - \linenumbers - - % fix incompatibilities between lineno and amsmath, if required, by - % transparently wrapping linenomath environments around amsmath - % environments - \AtBeginDocument{% - \@ifpackageloaded{amsmath}{% - \newcommand*\patchAmsMathEnvironmentForLineno[1]{% - \expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname - \expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname - \renewenvironment{#1}% - {\linenomath\csname old#1\endcsname}% - {\csname oldend#1\endcsname\endlinenomath}% - }% - \newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{% - \patchAmsMathEnvironmentForLineno{#1}% - \patchAmsMathEnvironmentForLineno{#1*}% - }% - \patchBothAmsMathEnvironmentsForLineno{equation}% - \patchBothAmsMathEnvironmentsForLineno{align}% - \patchBothAmsMathEnvironmentsForLineno{flalign}% - \patchBothAmsMathEnvironmentsForLineno{alignat}% - \patchBothAmsMathEnvironmentsForLineno{gather}% - \patchBothAmsMathEnvironmentsForLineno{multline}% - } - {} - } - \fi -\fi - - -\endinput diff --git a/ergon_paper_overleaf_edit/references.bib b/ergon_paper_overleaf_edit/references.bib deleted file mode 100644 index 615f262f..00000000 --- a/ergon_paper_overleaf_edit/references.bib +++ /dev/null @@ -1,916 +0,0 @@ -% ============================================================================ -% Ergon paper references -% Organised by role in the paper: -% A. LLM RL infrastructure (the capability/taxonomy table) -% B. Five-communities citations (§2 fragmentation) -% C. MARL theory anchors -% D. Evaluation-substrate precedents (Gym lineage) -% E. Event sourcing / durable execution -% F. Positioning / related work (distinguishing citations) -% G. 
Prompting-era origin cites (footnote level) -% -% Entries marked with "% VERIFY" require author/date confirmation before -% submission. Entries marked with "% DROP-IF-UNUSED" should be removed if -% the draft does not actually cite them. -% ============================================================================ - -% ============================================================================ -% A. LLM RL Infrastructure -% ============================================================================ - -@misc{vonwerra2022trl, - title={{TRL}: Transformer Reinforcement Learning}, - author={von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi}, - year={2022}, - publisher={GitHub}, - howpublished={\url{https://github.com/huggingface/trl}}, -} - -@inproceedings{sheng2025verl, - title={{HybridFlow}: A Flexible and Efficient {RLHF} Framework}, - author={Sheng, Guangming and Cao, Chi and Gao, Zilingfeng and others}, - booktitle={EuroSys}, - year={2025}, -} - -@article{xi2025agentgym, - title={{AgentGym-RL}: Training {LLM} Agents for Long-Horizon Decision Making through Multi-Turn Reinforcement Learning}, - author={Xi, Zhiheng and others}, - journal={arXiv preprint arXiv:2509.08755}, - year={2025}, -} - -@article{wang2025ragen, - title={{RAGEN}: Understanding Self-Evolution in {LLM} Agents via Multi-Turn Reinforcement Learning}, - author={Wang, Zihan and others}, - journal={arXiv preprint arXiv:2504.20073}, - year={2025}, -} - -@misc{marti2025, - title={{MARTI}: A Framework for Multi-Agent {LLM} Systems Reinforced Training and Inference}, - author={{TsinghuaC3I}}, - year={2025}, - howpublished={\url{https://github.com/TsinghuaC3I/MARTI}}, - note={GitHub repository}, -} - -@article{mei2025areal, - title={{AReaL}: A Large-Scale Asynchronous Reinforcement Learning System for Language Reasoning}, - author={Mei, Zhiyu and others}, - journal={arXiv preprint arXiv:2505.24298}, - year={2025}, -} 
- -@misc{primeintellect_primerl, - title={{PRIME-RL}: Agentic {RL} Training at Scale}, - author={{Prime Intellect}}, - year={2025}, - howpublished={\url{https://github.com/PrimeIntellect-ai/prime-rl}}, - note={GitHub repository; see also INTELLECT-3 technical report, arXiv:2512.16144}, -} - -@misc{brown2025verifiers, - title={{Verifiers}: Environments for {LLM} Reinforcement Learning}, - author={Brown, William}, - year={2025}, - howpublished={\url{https://github.com/PrimeIntellect-ai/verifiers}}, - note={Core abstraction is \texttt{MultiTurnEnv}, a single-agent rollout loop}, -} - -@article{cao2025skyrlagent, - title={{SkyRL-Agent}: Efficient {RL} Training for Multi-turn {LLM} Agent}, - author={Cao, Shiyi and Li, Dacheng and Zhao, Fangzhou and Yuan, Shuo and Hegde, Sumanth R. and Chen, Connor and Ruan, Charlie and Griggs, Tyler and Liu, Shu and Tang, Eric and Liaw, Richard and Moritz, Philipp and Zaharia, Matei and Gonzalez, Joseph E. and Stoica, Ion}, - journal={arXiv preprint arXiv:2511.16108}, - year={2025}, -} - -@misc{comlrl2025, - title={{CoMLRL}: Cooperative Multi-{LLM} Reinforcement Learning}, - author={{CoMLRL Contributors}}, - year={2025}, - howpublished={\url{https://openmlrl.github.io/CoMLRL/}}, - note={Decentralised cooperative {MARL} training for {LLMs}; Dec-POMDP formalism}, -} - -@article{recollab2025, - title={{ReCoLLAB}: Retrieval-Augmented {LLMs} for Cooperative Ad-hoc Teammate Modeling}, - author={Anonymous}, - journal={arXiv preprint arXiv:2512.22129}, - year={2025}, - note={Uses Overcooked + policy libraries; no LLM-RL training framework}, -} - -@article{liu2025gem, - title={{GEM}: A Gym for Agentic {LLMs}}, - author={Liu, Jian and Sims, Matthew and Duan, Jiaxin and others}, - journal={arXiv preprint}, - year={2025}, - note={VERIFY exact venue/date}, -} - -@article{silver2025welcome, - title={Welcome to the Era of Experience}, - author={Silver, David and Sutton, Richard S.}, - journal={Preprint}, - year={2025}, -} - -% SkyRL, OpenRLHF, 
ProRL-Agent: VERIFY exact citations before submission. -@article{skyrl2025, - title={{SkyRL}-Agent: An End-to-End {RL} Framework for Long-Horizon {LLM} Agents}, - author={Anonymous}, - journal={arXiv preprint arXiv:2511.16108}, - year={2025}, - note={VERIFY authorship}, -} - -@misc{openrlhf, - title={{OpenRLHF}: An Easy-to-Use, Scalable and High-Performance {RLHF} Framework}, - author={OpenRLHF Team}, - year={2024}, - howpublished={\url{https://github.com/OpenRLHF/OpenRLHF}}, -} - -@article{zhang2026prorlagent, - title={{ProRL Agent}: Rollout-as-a-Service for {RL} Training of Multi-Turn {LLM} Agents}, - author={Zhang, Hao and others}, - journal={arXiv preprint arXiv:2603.18815}, - year={2026}, - note={NVIDIA; integrated into NeMo Gym; decouples rollout lifecycle from training loop}, -} - - -@article{yu2025dapo, - title={{DAPO}: An Open-Source {LLM} Reinforcement Learning System at Scale}, - author={Yu, Qiying and others}, - journal={arXiv preprint arXiv:2503.14476}, - year={2025}, - note={Built on verl; single-agent math reasoning RL}, -} - -@article{wang2026ragenv2, - title={{RAGEN}-v2: Understanding Reasoning Collapse in Multi-Turn Agent Reinforcement Learning}, - author={Wang, Zihan and Gui, Chi and others}, - journal={arXiv preprint}, - year={2026}, - note={Documents failure modes of single-agent long-horizon RL}, -} - -@misc{nemogym2025, - title={{NeMo-RL}: A Scalable and Efficient Post-Training Library}, - author={{NVIDIA}}, - year={2025}, - howpublished={\url{https://github.com/NVIDIA-NeMo/RL}}, -} - -% ============================================================================ -% B. 
Five-communities citations (§2 fragmentation) -% ============================================================================ - -% --- Anchor for long-horizon framing --- - -@article{kwa2025metr, - title={Measuring {AI} Ability to Complete Long Tasks}, - author={Kwa, Thomas and West, Ben and Becker, Joel and Deng, Amy and Garcia, Katharyn and Hasin, Max and Jawhar, Sami and Kinniment, Megan and Rush, Nate and Von Arx, Sydney and others}, - journal={arXiv preprint arXiv:2503.14499}, - year={2025}, - note={{METR} time-horizon trajectory; single-agent task horizons doubling every $\sim$7 months}, -} - -% --- Community 1: Long-horizon LLM-agentic RL --- - -@article{chen2025loop, - title={Reinforcement Learning for Long-Horizon Interactive {LLM} Agents}, - author={Chen, Kevin and Cusumano-Towner, Marco and Huval, Brody and Petrenko, Aleksei and Hamburger, Jackson and Koltun, Vladlen and Kr{\"a}henb{\"u}hl, Philipp}, - journal={arXiv preprint arXiv:2502.01600}, - year={2025}, -} - -@article{odysseybench2025, - title={{OdysseyBench}: Evaluating {LLM} Agents on Long-Horizon Complex Office Application Workflows}, - author={Anonymous}, - journal={arXiv preprint arXiv:2508.09124}, - year={2025}, - note={VERIFY authorship}, -} - -% --- Community 2: Ad hoc teamwork --- - -@inproceedings{villin2025minimax, - title={A Minimax Approach to Ad Hoc Teamwork}, - author={Villin, Victor and Kleine Buening, Thomas and Dimitrakakis, Christos}, - booktitle={AAMAS}, - year={2025}, -} - -@article{wang2024naht, - title={{N}-Agent Ad Hoc Teamwork}, - author={Wang, Caroline and others}, - journal={NeurIPS}, - year={2024}, -} - -@article{zhang2025maht, - title={Multi-party Agent Relation Sampling for Multi-party Ad Hoc Teamwork}, - author={Zhang, Beiwen and Liang, Yongheng and Wu, Hejun}, - journal={arXiv preprint arXiv:2510.25340}, - year={2025}, -} - -@inproceedings{mirsky2022aht, - title={A Survey of Ad Hoc Teamwork Research}, - author={Mirsky, Reuth and Carlucho, Ignacio and Rahman, Arrasy 
and Fosong, Elliot and Macke, William and Sridharan, Mohan and Stone, Peter and Albrecht, Stefano V.}, - booktitle={European Workshop on Multi-Agent Systems (EUMAS)}, - year={2022}, -} - -@inproceedings{rahman2021openmarl, - title={Towards Open Ad Hoc Teamwork Using Graph-based Policy Learning}, - author={Rahman, Arrasy and H{\"o}pner, Niklas and Christianos, Filippos and Albrecht, Stefano V.}, - booktitle={ICML}, - year={2021}, -} - -@inproceedings{sun2025collaboverc, - title={{Collab-Overcooked}: Benchmarking and Evaluating Large Language Models as Collaborative Agents}, - author={Sun, Haochen and Zhang, Shuwen and Niu, Lujie and Ren, Lei and Xu, Hao and Fu, Hao and Zhao, Fangkun and Yuan, Caixia and Wang, Xiaojie}, - booktitle={Conference on Empirical Methods in Natural Language Processing (EMNLP)}, - year={2025}, - note={Survey Table 1 enumerates LLM-MAS benchmarks (RocoBench, VillagerBench, LLMARENA, CivRealm, BattleAgentBench, TDW-MAT, CuisineWorld) -- all classical-port, ad-hoc infrastructure}, -} - -@article{barrett2017plastic, - title={Making Friends on the Fly: Cooperating with New Teammates}, - author={Barrett, Samuel and Rosenfeld, Avi and Kraus, Sarit and Stone, Peter}, - journal={Artificial Intelligence}, - volume={242}, - pages={132--171}, - year={2017}, - note={Canonical {PLASTIC} policy-library method for AHT}, -} - -@article{ruhdorfer2025ogc, - title={The {O}vercooked {G}eneralisation {C}hallenge: Evaluating Cooperation under Environment and Partner Diversity}, - author={Ruhdorfer, Constantin and Boyle, Matthew and Albrecht, Stefano V.}, - journal={Transactions on Machine Learning Research (TMLR)}, - year={2025}, - note={JAX-accelerated Overcooked; integrates with JaxMARL and minimax}, -} - -% --- Community 3: Recursive language models --- - -@article{zhang2025rlm, - title={Recursive Language Models}, - author={Zhang, Alex L. 
and Kraska, Tim and Khattab, Omar}, - journal={arXiv preprint arXiv:2512.24601}, - year={2025}, -} - -@inproceedings{zhu2024redel, - title={{ReDel}: A Toolkit for {LLM}-Powered Recursive Multi-Agent Systems}, - author={Zhu, Andrew and Dugan, Liam and Callison-Burch, Chris}, - booktitle={EMNLP Demo Track}, - year={2024}, -} - -@misc{primeintellect_rlm2026, - title={Recursive Language Models: The Paradigm of 2026}, - author={{Prime Intellect}}, - year={2026}, - howpublished={\url{https://www.primeintellect.ai/blog/rlm}}, - note={Blog post}, -} - -@article{agentorchestra2025, - title={{AgentOrchestra}: A Hierarchical Multi-Agent Framework for General-Purpose Task Solving}, - author={Anonymous}, - journal={arXiv preprint arXiv:2506.12508}, - year={2025}, - note={VERIFY authorship}, -} - -@article{sun2025ctxfold, - title={Scaling Long-Horizon {LLM} Agent via Context-Folding}, - author={Sun, Weiwei and Lu, Miao and Ling, Zhe and Liu, Kai and Yao, Xin and Yang, Yi and Chen, Jing}, - journal={arXiv preprint arXiv:2510.11967}, - year={2025}, - note={Agent actively branches rollout and returns summary; inference-time decomposition}, -} - -@article{ye2025agentfold, - title={{AgentFold}: Long-Horizon Web Agents with Proactive Context Management}, - author={Ye, Rui and Zhang, Zhijie and Li, Kuan and Yin, Haoyu and Tao, Zhiyi and Zhao, Yaqi and Su, Liangtao and Zhang, Liang and Qiao, Zhen and Wang, Xuanjing and others}, - journal={arXiv preprint arXiv:2510.24699}, - year={2025}, - note={Context folding via multi-scale state summaries; inference-time}, -} - -@inproceedings{schroeder2025thread, - title={{THREAD}: Thinking Deeper with Recursive Spawning}, - author={Schroeder, Philip and Grand, Gabriel and Dafny, Nathaniel and Kim, Yoon and Andreas, Jacob}, - booktitle={NeurIPS}, - year={2025}, - note={Recursive sub-LM spawning; inference-time only}, -} - -@article{grand2025discipl, - title={Self-Steering Language Models}, - author={Grand, Gabriel and Pepe, Joshua B. 
and Andreas, Jacob and Tenenbaum, Joshua B.}, - journal={arXiv preprint arXiv:2504.07081}, - year={2025}, - note={DisCIPL; planner-LM generates inference programs; no training}, -} - -@article{yu2025memagent, - title={{MemAgent}: Reshaping Long-Context {LLM} with Multi-Conv {RL}-Based Memory Agent}, - author={Yu, Hongli and Chen, Tinghong and Feng, Jiangtao and Chen, Jiarui and Dai, Wenbin and Yu, Qi and Zhang, Yi and Ma, Wei and Liu, Jingjing and Wang, Minlie and Zhou, Hao}, - journal={arXiv preprint arXiv:2507.02259}, - year={2025}, - note={RL-trained context management; still single-agent}, -} - -% --- Community 4: Classical hierarchical / MacDec-POMDP RL --- - -@inproceedings{bacon2017optioncritic, - title={The Option-Critic Architecture}, - author={Bacon, Pierre-Luc and Harb, Jean and Precup, Doina}, - booktitle={AAAI}, - year={2017}, -} - -@article{amato2019macdec, - title={Modeling and Planning with Macro-Actions in Decentralized {POMDPs}}, - author={Amato, Christopher and Konidaris, George and Kaelbling, Leslie Pack and How, Jonathan P.}, - journal={Journal of Artificial Intelligence Research}, - volume={64}, - pages={817--859}, - year={2019}, -} - -@article{xiao2022asynchronous, - title={Asynchronous Actor-Critic for Multi-Agent Reinforcement Learning}, - author={Xiao, Yuchen and Tan, Weihao and Amato, Christopher}, - journal={NeurIPS}, - year={2022}, - note={arXiv:2209.10113}, -} - -@inproceedings{jung2025acac, - title={Agent-Centric Actor-Critic for Asynchronous Multi-Agent Reinforcement Learning}, - author={Jung, Whiyoung and Hong, Sunghoon and Yoon, Deunsol and Lee, Kanghoon and Lim, Woohyung}, - booktitle={ICML}, - year={2025}, - note={PMLR 267:28481--28502}, -} - -@inproceedings{vezhnevets2017feudal, - title={{FeUdal} Networks for Hierarchical Reinforcement Learning}, - author={Vezhnevets, Alexander Sasha and Osindero, Simon and Schaul, Tom and Heess, Nicolas and Jaderberg, Max and Silver, David and Kavukcuoglu, Koray}, - booktitle={ICML}, - 
year={2017}, - note={Manager/Worker decomposition; canonical deep HRL architecture}, -} - -@article{pateria2021hrl, - title={Hierarchical Reinforcement Learning: A Comprehensive Survey}, - author={Pateria, Shubham and Subagdja, Budhitama and Tan, Ah-hwee and Quek, Chai}, - journal={ACM Computing Surveys}, - volume={54}, - number={5}, - pages={1--35}, - year={2021}, -} - -@article{bacciu2024fgrl, - title={{F}eudal Graph Reinforcement Learning}, - author={Bacciu, Davide and Errica, Federico and Galanti, Tomer and Micheli, Alessio and Cini, Andrea}, - journal={arXiv preprint arXiv:2304.05099}, - year={2024}, - note={Extends feudal RL to graph neural networks; non-LLM}, -} - -% --- Community 5: Production agent orchestration (infrastructure, not RL) --- - -@misc{claudecode2024, - title={Claude Code: Terminal-Based Agentic Coding Tool}, - author={Anthropic}, - year={2024}, - howpublished={\url{https://docs.claude.com/en/docs/claude-code}}, -} - -@misc{anthropic2025subagents, - title={Subagents in the Claude Agent {SDK}}, - author={Anthropic}, - year={2025}, - howpublished={\url{https://platform.claude.com/docs/en/agent-sdk/subagents}}, - note={Isolated context windows; summary-only return to parent; no durable persistence}, -} - -@misc{shihipar2026claudeagent, - title={Building Agents with the Claude Agent {SDK}}, - author={Shihipar, Thariq}, - year={2026}, - howpublished={\url{https://www.anthropic.com/engineering/building-agents-with-the-claude-agent-sdk}}, - note={Engineering blog; canonical description of sub-agent design rationale}, -} - -@misc{googleadk2025, - title={Agent Development Kit ({ADK})}, - author={Google}, - year={2025}, - howpublished={\url{https://google.github.io/adk-docs/}}, -} - -@misc{langgraph2024, - title={{LangGraph}: Building Stateful, Multi-Actor Applications with {LLMs}}, - author={{LangChain}}, - year={2024}, - howpublished={\url{https://langchain-ai.github.io/langgraph/}}, -} - -@misc{crewai2024, - title={{CrewAI}: Framework for 
Orchestrating Role-Playing Autonomous {AI} Agents}, - author={CrewAI}, - year={2024}, - howpublished={\url{https://www.crewai.com/}}, -} - -@inproceedings{wu2024autogen, - title={{AutoGen}: Enabling Next-Gen {LLM} Applications via Multi-Agent Conversation}, - author={Wu, Qingyun and others}, - booktitle={COLM}, - year={2024}, -} - -% --- Anchor: Masters et al. POSG formalism (self-citation, double-blind safe) --- - -@article{masters2025workflowposg, - title={A {POSG} Formulation of Autonomous Workflow Management}, - author={Anonymous}, - journal={arXiv preprint arXiv:2510.02557}, - year={2025}, - note={Cited anonymously per double-blind policy; to be de-anonymised on acceptance}, -} - -% ============================================================================ -% C. MARL Theory Anchors -% ============================================================================ - -@book{albrecht2024marl, - title={Multi-Agent Reinforcement Learning: Foundations and Modern Approaches}, - author={Albrecht, Stefano V. and Christianos, Filippos and Sch{\"a}fer, Lukas}, - publisher={MIT Press}, - year={2024}, -} - -@inproceedings{hansen2004posg, - title={Dynamic Programming for Partially Observable Stochastic Games}, - author={Hansen, Eric A. and Bernstein, Daniel S. and Zilberstein, Shlomo}, - booktitle={AAAI}, - year={2004}, -} - -@inproceedings{terry2021pettingzoo, - title={{PettingZoo}: Gym for Multi-Agent Reinforcement Learning}, - author={Terry, J. K. and Black, Benjamin and Grammel, Nathaniel and others}, - booktitle={NeurIPS Datasets and Benchmarks}, - year={2021}, -} - -@article{rutherford2024jaxmarl, - title={{JaxMARL}: Multi-Agent {RL} Environments and Algorithms in {JAX}}, - author={Rutherford, Alexander and Ellis, Benjamin and others}, - journal={NeurIPS Datasets and Benchmarks}, - year={2024}, -} - -% ============================================================================ -% D. 
Evaluation-Substrate Precedents (Gym lineage) -% ============================================================================ - -@article{brockman2016gym, - title={{OpenAI} Gym}, - author={Brockman, Greg and Cheung, Vicki and Pettersson, Ludwig and Schneider, Jonas and Schulman, John and Tang, Jie and Zaremba, Wojciech}, - journal={arXiv preprint arXiv:1606.01540}, - year={2016}, -} - -@article{towers2024gymnasium, - title={{Gymnasium}: A Standard Interface for Reinforcement Learning Environments}, - author={Towers, Mark and others}, - journal={arXiv preprint arXiv:2407.17032}, - year={2024}, -} - -% ============================================================================ -% E. Event Sourcing / Durable Execution -% ============================================================================ - -@misc{fowler2005event, - title={Event Sourcing}, - author={Fowler, Martin}, - year={2005}, - howpublished={\url{https://martinfowler.com/eaaDev/EventSourcing.html}}, -} - -@misc{temporal2024, - title={Temporal: Durable Execution Platform}, - author={{Temporal Technologies}}, - year={2024}, - howpublished={\url{https://temporal.io/}}, -} - -@misc{inngest2024, - title={{Inngest}: Event-Driven Durable Workflows for Developers}, - author={Inngest}, - year={2024}, - howpublished={\url{https://www.inngest.com/}}, -} - -% ============================================================================ -% F. 
Positioning / Distinguishing Citations -% ============================================================================ - -@article{rstarmath2025, - title={{rStar-Math}: Small {LLMs} Can Master Math Reasoning with Self-Evolved Deep Thinking}, - author={Guan, Xinyu and others}, - journal={arXiv preprint arXiv:2501.04519}, - year={2025}, -} - -@article{restmcts2024, - title={{ReST-MCTS*}: {LLM} Self-Training via Process Reward Guided Tree Search}, - author={Zhang, Dan and others}, - journal={arXiv preprint arXiv:2406.03816}, - year={2024}, -} - -@article{bengio2021gflow, - title={{GFlowNet} Foundations}, - author={Bengio, Yoshua and Lahlou, Salem and Deleu, Tristan and Hu, Edward J. and Tiwari, Mo and Bengio, Emmanuel}, - journal={arXiv preprint arXiv:2111.09266}, - year={2021}, -} - -@inproceedings{malkin2022trajbalance, - title={Trajectory Balance: Improved Credit Assignment in {GFlowNets}}, - author={Malkin, Nikolay and Jain, Moksh and Bengio, Emmanuel and Sun, Chen and Bengio, Yoshua}, - booktitle={NeurIPS}, - year={2022}, -} - -@article{liu2023lostmiddle, - title={Lost in the Middle: How Language Models Use Long Contexts}, - author={Liu, Nelson F. 
and Lin, Kevin and Hewitt, John and Paranjape, Ashwin and Bevilacqua, Michele and Petroni, Fabio and Liang, Percy}, - journal={Transactions of the Association for Computational Linguistics}, - year={2024}, - note={Empirical evidence that effective context is much smaller than nominal context}, -} - -@article{hsieh2024ruler, - title={{RULER}: What's the Real Context Size of Your Long-Context Language Models?}, - author={Hsieh, Cheng-Ping and Sun, Simeng and Kriman, Samuel and Acharya, Shantanu and Rekesh, Dima and Jia, Fei and Ginsburg, Boris}, - journal={arXiv preprint arXiv:2404.06654}, - year={2024}, -} - -@article{shao2024deepseekmath, - title={{DeepSeekMath}: Pushing the Limits of Mathematical Reasoning in Open Language Models}, - author={Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Bi, Xiao and Zhang, Haowei and Zhang, Mingchuan and Li, Y.K. and Wu, Y. and Guo, Daya}, - journal={arXiv preprint arXiv:2402.03300}, - year={2024}, - note={Introduces Group Relative Policy Optimization (GRPO)}, -} - -@article{yao2023tot, - title={Tree of Thoughts: Deliberate Problem Solving with Large Language Models}, - author={Yao, Shunyu and Yu, Dian and Zhao, Jeffrey and Shafran, Izhak and Griffiths, Thomas L. 
and Cao, Yuan and Narasimhan, Karthik}, - journal={NeurIPS}, - year={2023}, -} - -@article{deepseekr1, - title={{DeepSeek-R1}: Incentivizing Reasoning Capability in {LLMs} via Reinforcement Learning}, - author={{DeepSeek-AI}}, - journal={arXiv preprint arXiv:2501.12948}, - year={2025}, -} - -@article{searchr1, - title={{Search-R1}: Training {LLMs} to Reason and Leverage Search Engines with Reinforcement Learning}, - author={Jin, Bowen and others}, - journal={arXiv preprint arXiv:2503.09516}, - year={2025}, -} - -@article{webrl2024, - title={{WebRL}: Training {LLM} Web Agents via Self-Evolving Online Curriculum Reinforcement Learning}, - author={Qi, Zehan and others}, - journal={arXiv preprint arXiv:2411.02337}, - year={2024}, -} - -@article{deepresearcher2025, - title={{DeepResearcher}: Scaling Deep Research via Reinforcement Learning in Real-World Environments}, - author={Zheng, Yuxiang and others}, - journal={arXiv preprint arXiv:2504.03160}, - year={2025}, -} - -% ============================================================================ -% G. Positioning: Agent Benchmarks -% ============================================================================ - -@article{liu2023agentbench, - title={{AgentBench}: Evaluating {LLMs} as Agents}, - author={Liu, Xiao and Yu, Hao and Zhang, Hanchen and others}, - journal={arXiv preprint arXiv:2308.03688}, - year={2023}, -} - -@article{jimenez2024swebench, - title={{SWE-bench}: Can Language Models Resolve Real-World {GitHub} Issues?}, - author={Jimenez, Carlos E. 
and Yang, John and Wettig, Alexander and others}, - journal={ICLR}, - year={2024}, -} - -@article{zheng2022minif2f, - title={{MiniF2F}: a cross-system benchmark for formal {Olympiad-level} mathematics}, - author={Zheng, Kunhao and Han, Jesse Michael and Polu, Stanislas}, - journal={ICLR}, - year={2022}, -} - -@article{ada2023llmp, - title={{Ada}: Learning Adaptive Planning Representations with Natural Language Guidance}, - author={Wong, Lionel and Mao, Jiayuan and Sharma, Pratyusha and Siegel, Zachary S. and Feng, Jiahai and Korneev, Noa and Tenenbaum, Joshua B. and Andreas, Jacob}, - journal={arXiv preprint arXiv:2312.08566}, - year={2023}, -} - -@article{appworld2024, - title={{AppWorld}: A Controllable World of Apps and People for Benchmarking Interactive Coding Agents}, - author={Trivedi, Harsh and others}, - journal={ACL}, - year={2024}, -} - -% ============================================================================ -% H. Prompting-Era Origin Cites -% ============================================================================ - -@article{yao2023react, - title={{ReAct}: Synergizing Reasoning and Acting in Language Models}, - author={Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and Du, Nan and Shafran, Izhak and Narasimhan, Karthik and Cao, Yuan}, - journal={ICLR}, - year={2023}, -} - -@article{khot2023decomp, - title={Decomposed Prompting: A Modular Approach for Solving Complex Tasks}, - author={Khot, Tushar and others}, - journal={ICLR}, - year={2023}, -} - -@article{sheng2024hybridflow, - title={{HybridFlow}: A Flexible and Efficient {RLHF} Framework}, - author={Sheng, Guangming and Zhang, Chi and Ye, Zilingfeng and Wu, Xibin and Zhang, Wang and Zhang, Ru and Peng, Yanghua and Lin, Haibin and Wu, Chuan}, - journal={arXiv preprint arXiv:2409.19256}, - year={2024}, -} - -@article{yehudai2025agenteval, - title={Survey on Evaluation of {LLM}-based Agents}, - author={Yehudai, Asaf and Eden, Lilach and Li, Alan and Uziel, Guy and Zhao, Yilun and Bar-Haim, Roy 
and Cohan, Arman and Shmueli-Scheuer, Michal}, - journal={arXiv preprint arXiv:2503.16416}, - year={2025}, -} - -@article{jiang2025verltool, - title={{VerlTool}: Towards Holistic Agentic Reinforcement Learning with Tool Use}, - author={Jiang, Dongfu and Ji, Yi and Nguyen, Xuan-Phi and Zhuang, Yiran and Zhang, Quy-Anh and Li, Xiang and Chen, Wenhu}, - journal={arXiv preprint arXiv:2509.01055}, - year={2025}, -} - -@article{luo2025agentlightning, - title={Agent Lightning: Train ANY {AI} Agents with Reinforcement Learning}, - author={Luo, Xufang and Wei, Chengrui and Zhou, Yushuo and Wu, Menglin and Zhang, Wenyi and Yang, Yuge and Qiu, Xiao and Huang, Xu and Li, Dongsheng and Yang, Mao}, - journal={arXiv preprint arXiv:2508.03680}, - year={2025}, -} - -@inproceedings{cemri2025mast, - title={Why Do Multi-Agent {LLM} Systems Fail?}, - author={Cemri, Mert and Pan, Melissa Z and Yang, Shuyi and Agrawal, Lakshya A and Chopra, Bhavya and Tiwari, Rishabh and Keutzer, Kurt and Parameswaran, Aditya and Klein, Dan and Ramchandran, Kannan and Zaharia, Matei and Gonzalez, Joseph E and Stoica, Ion}, - booktitle={arXiv preprint arXiv:2503.13657}, - year={2025}, -} - -@article{zhang2024agentohana, - title={{AgentOhana}: Design Unified Data and Training Pipeline for Effective Agent Learning}, - author={Zhang, Jianguo and Lan, Tian and Murthy, Rithesh and Liu, Zhiwei and Yao, Weiran and Niebles, Juan Carlos and Wang, Huan and Xu, Ran and Xiong, Caiming}, - journal={arXiv preprint arXiv:2402.15506}, - year={2024}, -} - -@article{yang2025agentprotocols, - title={A Survey of {AI} Agent Protocols}, - author={Yang, Yingxuan and Chai, Huacan and Song, Yuanyi and Qi, Siyuan and Wen, Muning and Li, Ning and Liao, Junwei and Hu, Haoyi and Lin, Jianghao and Chang, Gaowei and Liu, Weiwen and Wen, Ying and Yu, Yong and Zhang, Weinan}, - journal={arXiv preprint arXiv:2504.16736}, - year={2025}, -} - -@article{suttonoptions, - title={Between {MDPs} and semi-{MDPs}: A framework for temporal 
abstraction in reinforcement learning}, - author={Sutton, Richard S and Precup, Doina and Singh, Satinder}, - journal={Artificial Intelligence}, - volume={112}, - number={1-2}, - pages={181--211}, - year={1999}, -} - -@inproceedings{bradtke1994smdp, - title={Reinforcement Learning Methods for Continuous-Time {M}arkov Decision Problems}, - author={Bradtke, Steven J. and Duff, Michael O.}, - booktitle={Advances in Neural Information Processing Systems 7 (NIPS 1994)}, - pages={393--400}, - year={1994}, -} - -@article{hoare1978csp, - title={Communicating Sequential Processes}, - author={Hoare, C. A. R.}, - journal={Communications of the {ACM}}, - volume={21}, - number={8}, - pages={666--677}, - year={1978}, -} - -@inproceedings{hewitt1973actor, - title={A Universal Modular {ACTOR} Formalism for Artificial Intelligence}, - author={Hewitt, Carl and Bishop, Peter and Steiger, Richard}, - booktitle={Proceedings of the 3rd International Joint Conference on Artificial Intelligence (IJCAI)}, - pages={235--245}, - year={1973}, -} - -@phdthesis{armstrong2003erlang, - title={Making Reliable Distributed Systems in the Presence of Software Errors}, - author={Armstrong, Joe}, - school={Royal Institute of Technology, Stockholm}, - year={2003}, -} - -@inproceedings{orseau2016interruptible, - title={Safely Interruptible Agents}, - author={Orseau, Laurent and Armstrong, Stuart}, - booktitle={Proceedings of the 32nd Conference on Uncertainty in Artificial Intelligence (UAI)}, - pages={557--566}, - year={2016}, -} - -@inproceedings{elmhamdi2017dynamicinterruptibility, - title={Dynamic Safe Interruptibility for Decentralized Multi-Agent Reinforcement Learning}, - author={El Mhamdi, El Mahdi and Guerraoui, Rachid and Hendrikx, Hadrien and Maurer, Alexandre}, - booktitle={Advances in Neural Information Processing Systems (NeurIPS)}, - year={2017}, -} - -@article{skiadopoulos2022dbos, - title={{DBOS}: A {DBMS}-oriented Operating System}, - author={Skiadopoulos, Athinagoras and Li, Qian and 
Kraft, Peter and Kraska, Kostis and Stonebraker, Michael and Zaharia, Matei and others}, - journal={Proceedings of the {VLDB} Endowment}, - volume={15}, - number={1}, - pages={21--30}, - year={2022}, -} - -@article{masters2024manageragent, - title={Orchestrating Human-{AI} Teams: The Manager Agent as a Unifying Research Challenge}, - author={Anonymous}, - booktitle={International Conference on Distributed Artificial Intelligence (DAI)}, - journal={arXiv preprint arXiv:2510.02557}, - year={2025}, - note={Cited anonymously per double-blind policy; to be de-anonymised on acceptance. Same paper as masters2025workflowposg, which is the POSG-formalism anchor.}, -} - -@article{biderman2024, - title={Lessons from the Trenches on Reproducible Evaluation of Language Models}, - author={Biderman, Stella and Schoelkopf, Hailey and Sutawika, Lintang and Gao, Leo and Tow, Jonathan and Abbasi, Baber and Aji, Alham Fikri and Ammanamanchi, Pawan Sasanka and Black, Sidney and Clive, Jordan and others}, - journal={arXiv preprint arXiv:2405.14782}, - year={2024}, -} - -@article{liu2025drgrpo, - title={Understanding R1-Zero-Like Training: A Critical Perspective}, - author={Liu, Zichen and Chen, Changyu and Li, Wenjun and Qi, Penghui and Pang, Tianyu and Du, Chao and Lee, Wee Sun and Lin, Min}, - journal={arXiv preprint arXiv:2503.20783}, - year={2025}, - note={Introduces Dr. GRPO; documents loss-aggregation convention gains of +7.3 to +15.7pp on AIME 2024.}, -} - -@misc{verl2165, - title={Training-engine and rollout-engine tokenization divergence on {Qwen3} {GRPO} ({verl} issue \#2165)}, - author={{verl}}, - year={2025}, - note={volcengine/verl issue \#2165, November 2025. 
Six independent reproductions; upstream-confirmed in QwenLM/Qwen3 \#1826.}, - howpublished={\url{https://github.com/volcengine/verl/issues/2165}}, -} - -@article{gebru2021datasheets, - title={Datasheets for Datasets}, - author={Gebru, Timnit and Morgenstern, Jamie and Vecchione, Briana and Vaughan, Jennifer Wortman and Wallach, Hanna and Iii, Hal Daumé and Crawford, Kate}, - journal={Communications of the ACM}, - volume={64}, - number={12}, - pages={86--92}, - year={2021}, - publisher={ACM New York, NY, USA}, -} - -@inproceedings{mitchell2019modelcards, - title={Model Cards for Model Reporting}, - author={Mitchell, Margaret and Wu, Simone and Zaldivar, Andrew and Barnes, Parker and Vasserman, Lucy and Hutchinson, Ben and Spitzer, Elena and Raji, Inioluwa Deborah and Gebru, Timnit}, - booktitle={Proceedings of the Conference on Fairness, Accountability, and Transparency}, - pages={220--229}, - year={2019}, -} - -@misc{beeching2023openllm, - title={What's going on with the Open {LLM} Leaderboard?}, - author={Beeching, Edward and Fourrier, Cl{\'e}mentine and Habib, Nathan and Han, Sheon and Lambert, Nathan and Rajani, Nazneen and Sanseviero, Omar and Tunstall, Lewis and Wolf, Thomas}, - year={2023}, - howpublished={\url{https://huggingface.co/blog/open-llm-leaderboard-mmlu}}, - note={Post-mortem analysis of the Open LLM Leaderboard MMLU variance on LLaMA-65B, LLaMA-30B, and Falcon-40B.}, -} - -@article{hu2025openrlhf, - title={{OpenRLHF}: An Easy-to-Use, Scalable and High-Performance {RLHF} Framework}, - author={Hu, Jian and Wu, Xibin and Zhu, Zilin and Xianyu and Wang, Weixun and Zhang, Dehao and Cao, Yu}, - journal={arXiv preprint arXiv:2501.03262}, - year={2025}, - note={Reports 3.13$\times$ wall-clock gap vs TRL on GSM8K-GRPO with identical model, hardware, and algorithm (Table 4).}, -} - -@misc{opentelemetry3163, - title={{GenAI}: semantic conventions for {LLM} cache-token accounting (pull request \#3163)}, - author={{OpenTelemetry}}, - year={2024}, - 
howpublished={\url{https://github.com/open-telemetry/semantic-conventions/pull/3163}}, - note={Formalises a convention disagreement between the OpenAI/Vertex cache-token model (cached tokens counted inside \texttt{input\_tokens}) and Anthropic's (cached tokens separated into \texttt{cache\_read\_input\_tokens} + \texttt{cache\_creation\_input\_tokens}).}, -} - -@misc{langfuse12306, - title={Anthropic cached-prompt token double-counting (issue \#12306)}, - author={{Langfuse}}, - year={2024}, - howpublished={\url{https://github.com/langfuse/langfuse/issues/12306}}, - note={Production-tracing instance of the cache-token convention disagreement documented in OpenTelemetry PR \#3163.}, -} - -@misc{nvidiaNemoGPQA, - title={{GPQA} dataset documentation: evaluation harness comparison}, - author={{NVIDIA}}, - year={2024}, - howpublished={\url{https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/llama/gpqa.html}}, - note={States that simple-evals {GPQA} and \texttt{lm-evaluation-harness} {GPQA} are ``distinct, non-comparable metrics.''}, -} - -@misc{aiderLeaderboards, - title={Aider {LLM} leaderboards}, - author={{Aider}}, - year={2025}, - howpublished={\url{https://github.com/Aider-AI/aider/tree/main/aider/website/_data}}, - note={Same organisation publishes three leaderboards for Claude 3.5 Sonnet at different reported costs: Edit (\$0), Refactor (\$8.46), Polyglot (\$14.41), across \texttt{edit\_leaderboard.yml}, \texttt{refactor\_leaderboard.yml}, \texttt{polyglot\_leaderboard.yml}.}, -} - -@misc{swebenchHarness, - title={{SWE-bench} evaluation harness: timing backend divergence}, - author={{SWE-bench}}, - year={2024}, - howpublished={\url{https://github.com/princeton-nlp/SWE-bench}}, - note={Docker backend timer at \texttt{swebench/harness/docker\_utils.py:203-217} wraps bash-executed eval only; Modal backend timer at \texttt{swebench/harness/modal\_eval/run\_evaluation\_modal.py:307-319} wraps container startup + Python startup. 
Same logged metric, different code regions.}, -} - -@inproceedings{feng2024restmcts, - title={{ReST-MCTS*}: {LLM} Self-Training via Process Reward Guided Tree Search}, - author={Feng, Xidong and Wan, Ziyu and Wen, Muning and Wen, Ying and Zhang, Weinan and Wang, Jun}, - booktitle={Advances in Neural Information Processing Systems}, - year={2024}, - note={Cited in Sec.~4 for MCTS-based training community's native visit-entropy and abandoned-branch transforms.}, -} - -@misc{cernopendata, - title={{CERN} Open Data Portal}, - author={{CERN}}, - howpublished={\url{https://opendata.cern.ch}}, - year={2024}, - note={Public release of LHC detector-event records (CMS, ATLAS, LHCb, ALICE); cited as precedent for domain-wide observation-level publishing.}, -} diff --git a/scripts/smoke_local_up.sh b/scripts/smoke_local_up.sh index 263e9f55..e1d24e0d 100755 --- a/scripts/smoke_local_up.sh +++ b/scripts/smoke_local_up.sh @@ -48,6 +48,7 @@ Stack is up. Export these in your shell before running smoke: export ERGON_API_BASE_URL=http://127.0.0.1:9000 export PLAYWRIGHT_BASE_URL=http://127.0.0.1:3001 export ENABLE_TEST_HARNESS=1 + export ERGON_STARTUP_PLUGINS=ergon_core.test_support.smoke_fixtures:register_smoke_fixtures export TEST_HARNESS_SECRET=local-dev export SCREENSHOT_DIR=/tmp/playwright export E2B_API_KEY= # required for real sandbox runs diff --git a/tests/e2e/_asserts.py b/tests/e2e/_asserts.py index 05359cbe..09409faf 100644 --- a/tests/e2e/_asserts.py +++ b/tests/e2e/_asserts.py @@ -29,7 +29,7 @@ from ergon_core.core.api.schemas import RunTaskDto from ergon_core.core.persistence.graph.models import RunGraphNode -from ergon_core.core.persistence.graph.status_conventions import COMPLETED +from ergon_core.core.persistence.graph.status_conventions import BLOCKED, COMPLETED, FAILED from ergon_core.core.persistence.shared.db import get_session from ergon_core.core.persistence.telemetry.models import ( RunResource, @@ -170,7 +170,9 @@ def _assert_sandbox_command_wal(run_id: 
UUID) -> None: ).all(), ) probes = [e for e in entries if "wc" in e.command or "probe" in e.command] - assert len(probes) >= 9, f"expected ≥9 probe WAL entries, got {len(probes)}" + # Canonical sad-path smokes block l_3 before it starts, so the eight + # executed leaves should emit probe commands while l_3 emits none. + assert len(probes) >= 8, f"expected ≥8 probe WAL entries, got {len(probes)}" def _assert_sandbox_lifecycle_events(run_id: UUID) -> None: @@ -293,8 +295,8 @@ def _assert_temporal_ordering(run_id: UUID) -> None: Uses ``RunTaskExecution.started_at`` / ``completed_at`` via ``node_id`` join. Only checks edges whose both endpoints reached - at least ``started`` state. The sad path still runs ``l_3`` because the - failing leaf completes the task with a score-zero output. + at least ``started`` state. Blocked descendants are skipped because + they should never have execution timestamps. """ with get_session() as s: leaves = list( @@ -366,11 +368,23 @@ def _assert_cohort_membership(cohort_key: str, run_ids: list[UUID]) -> None: def _assert_sadpath_graph_cascade(run_id: UUID) -> None: - """Score-zero sad path: all graph nodes complete, l_2 produces failed output.""" + """Canonical sad path: l_2 fails, l_3 blocks, independent leaves complete.""" snapshot = require_run_snapshot(run_id) - leaves = [task for task in snapshot.tasks.values() if task.level > 0] + tasks = list(snapshot.tasks.values()) + leaves = [task for task in tasks if task.level > 0] + root_tasks = [task for task in tasks if task.level == 0] by_slug = {task.name: task for task in leaves} - for slug in EXPECTED_SUBTASK_SLUGS: + assert len(root_tasks) == 1, f"expected 1 root task, got {len(root_tasks)}" + assert root_tasks[0].status != COMPLETED, ( + f"parent task should not complete when a child fails, got {root_tasks[0].status}" + ) + assert by_slug["l_2"].status == FAILED, f"l_2 expected FAILED, got {by_slug['l_2'].status}" + assert by_slug["l_3"].status == BLOCKED, f"l_3 expected BLOCKED, got 
{by_slug['l_3'].status}" + assert by_slug["l_3"].started_at is None, "blocked l_3 should never start" + assert not snapshot.executions_by_task.get(by_slug["l_3"].id), ( + "blocked l_3 should not have execution attempts" + ) + for slug in set(EXPECTED_SUBTASK_SLUGS) - {"l_2", "l_3"}: assert by_slug[slug].status == COMPLETED, ( f"{slug} expected COMPLETED, got {by_slug[slug].status}" ) @@ -427,28 +441,30 @@ def _assert_sadpath_partial_wal(run_id: UUID) -> None: def _assert_sadpath_thread_messages(run_id: UUID) -> None: - """Happy path sends 9 messages; sad l_2 suppresses completion reporting.""" + """Sad path sends messages for the 7 completed leaves only.""" snapshot = require_run_snapshot(run_id) thread = next( (thread for thread in snapshot.threads if thread.topic == "smoke-completion"), None ) assert thread is not None, "no smoke-completion thread created" msgs = sorted(thread.messages, key=lambda msg: msg.sequence_num) - assert len(msgs) == 8, f"expected 8 completion messages (l_2 suppressed), got {len(msgs)}" + assert len(msgs) == 7, ( + f"expected 7 completion messages (l_2 failed, l_3 blocked), got {len(msgs)}" + ) from_slugs = {m.from_agent_id.removeprefix("leaf-") for m in msgs} assert "l_2" not in from_slugs, ( f"l_2 sent a completion message despite suppression: {from_slugs}" ) - assert from_slugs == set(EXPECTED_SUBTASK_SLUGS) - {"l_2"} + assert "l_3" not in from_slugs, ( + f"l_3 sent a completion message despite being blocked: {from_slugs}" + ) + assert from_slugs == set(EXPECTED_SUBTASK_SLUGS) - {"l_2", "l_3"} def _assert_sadpath_evaluation(run_id: UUID) -> None: - """Reusing happy-path criterion on sad-path run must return score 0.""" + """Sad-path run should not produce a successful final score.""" snapshot = require_run_snapshot(run_id) - evals = list(snapshot.evaluations_by_task.values()) - assert len(evals) == 1 - assert evals[0].total_score == 0.0 - assert snapshot.final_score == 0.0 + assert snapshot.final_score in (None, 0.0) # 
============================================================================= diff --git a/tests/e2e/_submit.py b/tests/e2e/_submit.py index e4468eea..1ffd3122 100644 --- a/tests/e2e/_submit.py +++ b/tests/e2e/_submit.py @@ -7,13 +7,12 @@ ergon internals, do not call ``build_experiment`` / ``create_run`` / ``inngest.send`` in-process, and do not register worker / evaluator slugs in the test process. All of that lives inside the api container -(see ``register_smoke_fixtures()`` called by ``app.py`` when -``ENABLE_SMOKE_FIXTURES=1``). Single source of truth for fixtures ⇒ no -host / container staleness risk. +(see ``ERGON_STARTUP_PLUGINS`` registering smoke fixtures in the API +container). Single source of truth for fixtures ⇒ no host / container +staleness risk. -Each slot can use a different ``(worker_slug, criterion_slug)`` pair — -used by the researchrubrics leg which has 2 happy + 1 sad slot. Empty -slots list is valid (returns ``[]``) but unlikely in practice. +Each slot can use a different ``(worker_slug, criterion_slug)`` pair. +Empty slots list is valid (returns ``[]``) but unlikely in practice. """ from __future__ import annotations diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index b5df287c..68bfd310 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -15,8 +15,7 @@ from sqlmodel import Session # NOTE: smoke fixture registration now lives exclusively inside the api -# container — see the conditional ``register_smoke_fixtures()`` call in -# ``ergon_core/core/api/app.py`` gated on ``ENABLE_SMOKE_FIXTURES=1``. +# container via ``ERGON_STARTUP_PLUGINS``. # Host-side pytest is a black-box client (``_submit.py`` → HTTP) and # doesn't need the fixtures in its own process. 
Keeping the registry # single-sourced eliminates the drift window where a fixture edit diff --git a/tests/e2e/test_minif2f_smoke.py b/tests/e2e/test_minif2f_smoke.py index 19f80b87..ddb17578 100644 --- a/tests/e2e/test_minif2f_smoke.py +++ b/tests/e2e/test_minif2f_smoke.py @@ -1,8 +1,4 @@ -"""MiniF2F canonical smoke — cohort of 3 happy runs against real E2B. - -No sad-path slot (researchrubrics leg carries that for the whole -matrix). Structure identical to ``test_researchrubrics_smoke.py``. -""" +"""MiniF2F canonical sad-path smoke against real E2B.""" from __future__ import annotations @@ -11,32 +7,29 @@ import os import pathlib import subprocess -import uuid from datetime import datetime, timezone import pytest from tests.e2e._asserts import ( - _assert_blob_roundtrip, _assert_cohort_membership, - _assert_minif2f_artifacts, - _assert_run_evaluation, - _assert_run_graph, - _assert_run_resources, - _assert_run_turn_counts, + _assert_sadpath_evaluation, + _assert_sadpath_graph_cascade, + _assert_sadpath_partial_artifact, + _assert_sadpath_partial_wal, + _assert_sadpath_thread_messages, _assert_sandbox_command_wal, _assert_sandbox_lifecycle_events, _assert_temporal_ordering, - _assert_thread_messages_ordered, - wait_for_terminal, + wait_for_terminal_status, ) from tests.e2e._submit import submit_cohort ENV = "minif2f" -WORKER = f"{ENV}-smoke-worker" +WORKER = f"{ENV}-sadpath-smoke-worker" CRITERION = f"{ENV}-smoke-criterion" -# ``SMOKE_COHORT_SIZE`` override for local dev; CI uses default 3. -COHORT_SIZE = int(os.environ.get("SMOKE_COHORT_SIZE", "3")) +# ``SMOKE_COHORT_SIZE`` override for local/dev deep checks; CI uses default 1. 
+COHORT_SIZE = int(os.environ.get("SMOKE_COHORT_SIZE", "1")) PER_RUN_TIMEOUT = 270 @@ -54,20 +47,25 @@ async def test_smoke_cohort(tmp_path: pathlib.Path) -> None: assert len(run_ids) == COHORT_SIZE await asyncio.gather( - *(wait_for_terminal(rid, timeout_seconds=PER_RUN_TIMEOUT) for rid in run_ids), + *( + wait_for_terminal_status( + rid, + expected_statuses=frozenset({"failed"}), + timeout_seconds=PER_RUN_TIMEOUT, + ) + for rid in run_ids + ), ) for rid in run_ids: - _assert_run_graph(rid) - _assert_run_resources(rid) - _assert_run_turn_counts(rid) - _assert_sandbox_command_wal(rid) + _assert_sadpath_graph_cascade(rid) + _assert_sadpath_partial_artifact(rid) + _assert_sadpath_partial_wal(rid) + _assert_sadpath_thread_messages(rid) + _assert_sadpath_evaluation(rid) _assert_sandbox_lifecycle_events(rid) - _assert_thread_messages_ordered(rid) - _assert_blob_roundtrip(rid) + _assert_sandbox_command_wal(rid) _assert_temporal_ordering(rid) - _assert_run_evaluation(rid) - _assert_minif2f_artifacts(rid) _assert_cohort_membership(cohort_key, run_ids) @@ -79,7 +77,7 @@ async def test_smoke_cohort(tmp_path: pathlib.Path) -> None: ) _invoke_playwright( cohort_key=cohort_key, - cohort=[{"run_id": str(rid), "kind": "happy"} for rid in run_ids], + cohort=[{"run_id": str(rid), "kind": "sad"} for rid in run_ids], screenshot_dir=screenshot_dir, ) diff --git a/tests/e2e/test_researchrubrics_smoke.py b/tests/e2e/test_researchrubrics_smoke.py index 912215a2..d032e801 100644 --- a/tests/e2e/test_researchrubrics_smoke.py +++ b/tests/e2e/test_researchrubrics_smoke.py @@ -1,15 +1,14 @@ -"""ResearchRubrics canonical smoke — cohort of 3 (2 happy + 1 sad) against real E2B. +"""ResearchRubrics canonical sad-path smoke against real E2B. Per-run assertion dispatch on slot ``kind``: -- ``happy`` slots run the full happy-path assertion block (§2.5 of - ``docs/superpowers/plans/test-refactor/02-drivers-and-asserts.md``). 
-- ``sad`` slot (slot 3) runs the sad-path block (§10) — line-cascade - failure invariants. +- The single slot routes ``l_2`` to a failing leaf. +- ``l_3`` depends on ``l_2`` and must remain blocked / unstarted. +- Independent branches must still complete. -Cohort-level: ``_assert_cohort_membership`` checks all 3 runs are -visible on ``/cohort/{key}``. Playwright subprocess runs at the end -with a JSON-encoded cohort array so the shared factory can dispatch +Cohort-level: ``_assert_cohort_membership`` checks all submitted runs +are visible on ``/cohort/{key}``. Playwright subprocess runs at the +end with a JSON-encoded cohort array so the shared factory can dispatch per-kind assertions in the UI. """ @@ -20,20 +19,12 @@ import os import pathlib import subprocess -import uuid -from dataclasses import dataclass from datetime import datetime, timezone -from typing import Literal import pytest from tests.e2e._asserts import ( - _assert_blob_roundtrip, _assert_cohort_membership, - _assert_run_evaluation, - _assert_run_graph, - _assert_run_resources, - _assert_run_turn_counts, _assert_sadpath_evaluation, _assert_sadpath_graph_cascade, _assert_sadpath_partial_artifact, @@ -42,45 +33,17 @@ _assert_sandbox_command_wal, _assert_sandbox_lifecycle_events, _assert_temporal_ordering, - _assert_thread_messages_ordered, - wait_for_terminal, wait_for_terminal_status, ) from tests.e2e._submit import submit_cohort ENV = "researchrubrics" -HAPPY_WORKER = f"{ENV}-smoke-worker" -SAD_WORKER = f"{ENV}-sadpath-smoke-worker" +WORKER = f"{ENV}-sadpath-smoke-worker" CRITERION = f"{ENV}-smoke-criterion" PER_RUN_TIMEOUT = 270 # seconds; < pytest's 300s --timeout -@dataclass(frozen=True) -class CohortSlot: - worker_slug: str - criterion_slug: str - kind: Literal["happy", "sad"] - - -def _build_cohort() -> tuple[CohortSlot, ...]: - """Build the cohort using the ``SMOKE_COHORT_SIZE`` env-var override. - - ``SMOKE_COHORT_SIZE`` controls the number of *happy* slots (default 2). 
- One sad-path slot is always appended — every cohort must exercise the - line-cascade failure path regardless of size. - - Size=1 → 1 happy + 1 sad. Size=2 (default) → 2 happy + 1 sad. - """ - size = int(os.environ.get("SMOKE_COHORT_SIZE", "2")) - if size <= 0: - raise ValueError(f"SMOKE_COHORT_SIZE must be >= 1, got {size}") - - slots: list[CohortSlot] = [CohortSlot(HAPPY_WORKER, CRITERION, "happy") for _ in range(size)] - slots.append(CohortSlot(SAD_WORKER, CRITERION, "sad")) - return tuple(slots) - - -COHORT: tuple[CohortSlot, ...] = _build_cohort() +COHORT_SIZE = int(os.environ.get("SMOKE_COHORT_SIZE", "1")) @pytest.mark.e2e @@ -88,41 +51,30 @@ def _build_cohort() -> tuple[CohortSlot, ...]: async def test_smoke_cohort(tmp_path: pathlib.Path) -> None: cohort_key = f"ci-smoke-{ENV}-{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%S')}" - # ── Phase 1: submit the cohort (mixed worker slugs) ─────────────── run_ids = await submit_cohort( benchmark_slug=ENV, - slots=[(s.worker_slug, s.criterion_slug) for s in COHORT], + slots=[(WORKER, CRITERION)] * COHORT_SIZE, cohort_key=cohort_key, timeout=PER_RUN_TIMEOUT, ) - assert len(run_ids) == len(COHORT) - slotted: list[tuple[CohortSlot, uuid.UUID]] = list(zip(COHORT, run_ids)) + assert len(run_ids) == COHORT_SIZE - # ── Phase 2: wait for terminal state ────────────────────────────── await asyncio.gather( *( - wait_for_terminal(rid, timeout_seconds=PER_RUN_TIMEOUT) - if slot.kind == "happy" - else wait_for_terminal_status( + wait_for_terminal_status( rid, - expected_statuses=frozenset({"completed"}), + expected_statuses=frozenset({"failed"}), timeout_seconds=PER_RUN_TIMEOUT, ) - for slot, rid in slotted + for rid in run_ids ), ) - # ── Phase 3: per-run assertions (dispatched on kind) ────────────── - for slot, rid in slotted: - if slot.kind == "happy": - _assert_happy_run(rid) - else: - _assert_sad_run(rid) + for rid in run_ids: + _assert_sad_run(rid) - # ── Phase 3b: cohort-level invariant ────────────────────────────── 
_assert_cohort_membership(cohort_key, run_ids) - # ── Phase 4: Playwright subprocess (screenshots per run) ────────── screenshot_dir_env = os.environ.get("SCREENSHOT_DIR") screenshot_dir = ( pathlib.Path(screenshot_dir_env) @@ -131,71 +83,22 @@ async def test_smoke_cohort(tmp_path: pathlib.Path) -> None: ) _invoke_playwright( cohort_key=cohort_key, - cohort=[{"run_id": str(rid), "kind": s.kind} for s, rid in slotted], + cohort=[{"run_id": str(rid), "kind": "sad"} for rid in run_ids], screenshot_dir=screenshot_dir, ) - # Phase 5 (finalizer) — see tests/e2e/conftest.py ``_screenshot_uploader``. - -def _assert_happy_run(rid: uuid.UUID) -> None: - _assert_run_graph(rid) - _assert_run_resources(rid) - _assert_run_turn_counts(rid) - _assert_sandbox_command_wal(rid) - _assert_sandbox_lifecycle_events(rid) - _assert_thread_messages_ordered(rid) - _assert_blob_roundtrip(rid) - _assert_temporal_ordering(rid) - _assert_run_evaluation(rid) - # Env-specific content check is inside the criterion + also rerun here - # via _assert_env_content_happy below. - _assert_env_content_happy(rid) - - -def _assert_sad_run(rid: uuid.UUID) -> None: +def _assert_sad_run(rid) -> None: _assert_sadpath_graph_cascade(rid) _assert_sadpath_partial_artifact(rid) _assert_sadpath_partial_wal(rid) _assert_sadpath_thread_messages(rid) _assert_sadpath_evaluation(rid) - _assert_sandbox_command_wal(rid) _assert_sandbox_lifecycle_events(rid) + _assert_sandbox_command_wal(rid) _assert_temporal_ordering(rid) -def _assert_env_content_happy(rid: uuid.UUID) -> None: - """Out-of-band re-verification that each happy leaf produced a - well-formed ``report_*.md``. 
Duplicates what - ``ResearchRubricsSmokeCriterion._verify_env_content`` does inside - the workflow — if the criterion regresses silently, this catches it.""" - from pathlib import Path - - from sqlmodel import select - - from ergon_core.core.persistence.shared.db import get_session - from ergon_core.core.persistence.telemetry.models import RunResource - - with get_session() as s: - reports = list( - s.exec( - select(RunResource) - .where(RunResource.run_id == rid) - .where( - RunResource.name.like("report_%.md"), # ty: ignore[unresolved-attribute] - ) - .where(RunResource.kind == "report"), # blob-store only (host-accessible) - ).all(), - ) - assert len(reports) == 9, f"expected 9 reports, got {len(reports)}" - for r in reports: - body = Path(r.file_path).read_bytes() - assert body.startswith(b"# Research report"), ( - f"{r.name}: missing `# Research report` header" - ) - assert len(body.strip()) >= 20, f"{r.name}: body < 20 bytes" - - def _invoke_playwright( *, cohort_key: str, diff --git a/tests/e2e/test_swebench_smoke.py b/tests/e2e/test_swebench_smoke.py index ba08191c..9889d5f9 100644 --- a/tests/e2e/test_swebench_smoke.py +++ b/tests/e2e/test_swebench_smoke.py @@ -1,9 +1,4 @@ -"""SWE-Bench Verified canonical smoke — cohort of 3 happy runs against real E2B. - -No sad-path slot (researchrubrics leg carries that). Structure -identical to ``test_minif2f_smoke.py``; differs only in env slug and -spec filename. 
-""" +"""SWE-Bench Verified canonical sad-path smoke against real E2B.""" from __future__ import annotations @@ -12,24 +7,21 @@ import os import pathlib import subprocess -import uuid from datetime import datetime, timezone import pytest from tests.e2e._asserts import ( - _assert_blob_roundtrip, _assert_cohort_membership, - _assert_run_evaluation, - _assert_run_graph, - _assert_run_resources, - _assert_run_turn_counts, + _assert_sadpath_evaluation, + _assert_sadpath_graph_cascade, + _assert_sadpath_partial_artifact, + _assert_sadpath_partial_wal, + _assert_sadpath_thread_messages, _assert_sandbox_command_wal, _assert_sandbox_lifecycle_events, - _assert_swebench_artifacts, _assert_temporal_ordering, - _assert_thread_messages_ordered, - wait_for_terminal, + wait_for_terminal_status, ) from tests.e2e._submit import submit_cohort @@ -39,10 +31,10 @@ # maps 1:1 to the spec filename. ENV = "swebench-verified" WORKER_PREFIX = "swebench" -WORKER = f"{WORKER_PREFIX}-smoke-worker" +WORKER = f"{WORKER_PREFIX}-sadpath-smoke-worker" CRITERION = f"{WORKER_PREFIX}-smoke-criterion" -# ``SMOKE_COHORT_SIZE`` override for local dev; CI uses default 3. -COHORT_SIZE = int(os.environ.get("SMOKE_COHORT_SIZE", "3")) +# ``SMOKE_COHORT_SIZE`` override for local/dev deep checks; CI uses default 1. 
+COHORT_SIZE = int(os.environ.get("SMOKE_COHORT_SIZE", "1")) PER_RUN_TIMEOUT = 270 @@ -60,20 +52,25 @@ async def test_smoke_cohort(tmp_path: pathlib.Path) -> None: assert len(run_ids) == COHORT_SIZE await asyncio.gather( - *(wait_for_terminal(rid, timeout_seconds=PER_RUN_TIMEOUT) for rid in run_ids), + *( + wait_for_terminal_status( + rid, + expected_statuses=frozenset({"failed"}), + timeout_seconds=PER_RUN_TIMEOUT, + ) + for rid in run_ids + ), ) for rid in run_ids: - _assert_run_graph(rid) - _assert_run_resources(rid) - _assert_run_turn_counts(rid) - _assert_sandbox_command_wal(rid) + _assert_sadpath_graph_cascade(rid) + _assert_sadpath_partial_artifact(rid) + _assert_sadpath_partial_wal(rid) + _assert_sadpath_thread_messages(rid) + _assert_sadpath_evaluation(rid) _assert_sandbox_lifecycle_events(rid) - _assert_thread_messages_ordered(rid) - _assert_blob_roundtrip(rid) + _assert_sandbox_command_wal(rid) _assert_temporal_ordering(rid) - _assert_run_evaluation(rid) - _assert_swebench_artifacts(rid) _assert_cohort_membership(cohort_key, run_ids) @@ -85,7 +82,7 @@ async def test_smoke_cohort(tmp_path: pathlib.Path) -> None: ) _invoke_playwright( cohort_key=cohort_key, - cohort=[{"run_id": str(rid), "kind": "happy"} for rid in run_ids], + cohort=[{"run_id": str(rid), "kind": "sad"} for rid in run_ids], screenshot_dir=screenshot_dir, ) diff --git a/tests/real_llm/benchmarks/test_smoke_stub.py b/tests/real_llm/benchmarks/test_smoke_stub.py index e549c752..097fb557 100644 --- a/tests/real_llm/benchmarks/test_smoke_stub.py +++ b/tests/real_llm/benchmarks/test_smoke_stub.py @@ -49,7 +49,7 @@ async def test_harness_canary_smoke_stub( env = { **os.environ, "ENABLE_TEST_HARNESS": "1", - "ENABLE_SMOKE_FIXTURES": "1", + "ERGON_STARTUP_PLUGINS": "ergon_core.test_support.smoke_fixtures:register_smoke_fixtures", "ERGON_DATABASE_URL": os.environ.get( "ERGON_DATABASE_URL", "postgresql://ergon:ergon_dev@127.0.0.1:5433/ergon", diff --git 
a/tests/unit/architecture/test_no_test_logic_in_core.py b/tests/unit/architecture/test_no_test_logic_in_core.py new file mode 100644 index 00000000..1bde37d1 --- /dev/null +++ b/tests/unit/architecture/test_no_test_logic_in_core.py @@ -0,0 +1,60 @@ +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] +CORE = ROOT / "ergon_core" / "ergon_core" / "core" + +ALLOWED_FILES = { + CORE / "api" / "test_harness.py", + CORE / "settings.py", +} + +FORBIDDEN_IMPORT_SNIPPETS = ( + "ergon_core.test_support", + "tests.", +) + +FORBIDDEN_CORE_TEST_DOUBLE_TERMS = ( + "StubSandboxManager", + "is_stub_sandbox_id", + "stub-sandbox-", +) + + +def _core_python_files() -> list[Path]: + return [ + path + for path in CORE.rglob("*.py") + if path not in ALLOWED_FILES and "__pycache__" not in path.parts + ] + + +def test_core_does_not_import_test_support_or_tests() -> None: + offenders: list[str] = [] + for path in _core_python_files(): + text = path.read_text() + for snippet in FORBIDDEN_IMPORT_SNIPPETS: + if snippet in text: + offenders.append(f"{path.relative_to(ROOT)} contains {snippet!r}") + + assert offenders == [] + + +def test_core_does_not_define_or_branch_on_stub_sandbox_terms() -> None: + offenders: list[str] = [] + for path in _core_python_files(): + text = path.read_text() + for term in FORBIDDEN_CORE_TEST_DOUBLE_TERMS: + if term in text: + offenders.append(f"{path.relative_to(ROOT)} contains {term!r}") + + assert offenders == [] + + +def test_core_task_execution_does_not_mint_placeholder_sandbox_ids() -> None: + path = CORE / "runtime" / "inngest" / "execute_task.py" + text = path.read_text() + + assert "StubSandboxManager" not in text + assert "make_noop_sandbox_id" not in text + assert "stub_sandbox_id" not in text diff --git a/tests/unit/architecture/test_smoke_fixture_package_boundary.py b/tests/unit/architecture/test_smoke_fixture_package_boundary.py index 6f50f458..f8542ee8 100644 --- a/tests/unit/architecture/test_smoke_fixture_package_boundary.py 
+++ b/tests/unit/architecture/test_smoke_fixture_package_boundary.py @@ -13,7 +13,10 @@ def test_runtime_entrypoints_do_not_import_tests_smoke_fixtures() -> None: text = path.read_text() assert "tests.e2e._fixtures" not in text assert "ergon_core.dev.smoke_fixtures" not in text - assert "ergon_core.test_support.smoke_fixtures" in text + assert ( + "ergon_core.test_support.smoke_fixtures" + not in Path("ergon_core/ergon_core/core/api/app.py").read_text() + ) def test_smoke_fixtures_live_in_test_support_package() -> None: diff --git a/tests/unit/cli/test_eval_cli_required_fields.py b/tests/unit/cli/test_eval_cli_required_fields.py new file mode 100644 index 00000000..86098fa7 --- /dev/null +++ b/tests/unit/cli/test_eval_cli_required_fields.py @@ -0,0 +1,18 @@ +import pytest + +from ergon_cli.main import build_parser + + +@pytest.mark.parametrize("action", ["watch", "checkpoint"]) +def test_eval_commands_require_evaluator_and_model_base(action: str) -> None: + parser = build_parser() + args = ["eval", action] + if action == "watch": + args.extend(["--checkpoint-dir", "/tmp/checkpoints", "--benchmark", "minif2f"]) + else: + args.extend(["--checkpoint", "/tmp/checkpoints/checkpoint-1", "--benchmark", "minif2f"]) + + with pytest.raises(SystemExit) as exc_info: + parser.parse_args(args) + + assert exc_info.value.code == 2 diff --git a/tests/unit/dashboard/test_communication_threads.py b/tests/unit/dashboard/test_communication_threads.py new file mode 100644 index 00000000..5b7b0208 --- /dev/null +++ b/tests/unit/dashboard/test_communication_threads.py @@ -0,0 +1,83 @@ +from uuid import uuid4 + +from ergon_core.core.api.runs import _build_communication_threads +from ergon_core.core.persistence.telemetry.models import Thread, ThreadMessage + + +def test_build_communication_threads_populates_summary_and_task_anchors() -> None: + run_id = uuid4() + thread_id = uuid4() + execution_id = uuid4() + task_id = uuid4() + thread = Thread( + id=thread_id, + run_id=run_id, + 
topic="smoke-completion", + summary="Leaf workers report completion artifacts and probe exit status.", + agent_a_id="leaf-l_1", + agent_b_id="parent", + ) + message = ThreadMessage( + thread_id=thread_id, + run_id=run_id, + task_execution_id=execution_id, + from_agent_id="leaf-l_1", + to_agent_id="parent", + content="l_1: done exit=0", + sequence_num=1, + ) + + result = _build_communication_threads( + [thread], + [message], + {execution_id: task_id}, + ) + + assert len(result) == 1 + dto = result[0] + assert dto.summary == "Leaf workers report completion artifacts and probe exit status." + assert dto.task_id == str(task_id) + assert dto.messages[0].task_id == str(task_id) + assert dto.messages[0].task_execution_id == str(execution_id) + + +def test_build_communication_threads_keeps_run_level_thread_when_messages_span_tasks() -> None: + run_id = uuid4() + thread_id = uuid4() + execution_a = uuid4() + execution_b = uuid4() + thread = Thread( + id=thread_id, + run_id=run_id, + topic="smoke-completion", + agent_a_id="leaf-l_1", + agent_b_id="parent", + ) + messages = [ + ThreadMessage( + thread_id=thread_id, + run_id=run_id, + task_execution_id=execution_a, + from_agent_id="leaf-l_1", + to_agent_id="parent", + content="l_1: done exit=0", + sequence_num=1, + ), + ThreadMessage( + thread_id=thread_id, + run_id=run_id, + task_execution_id=execution_b, + from_agent_id="leaf-l_2", + to_agent_id="parent", + content="l_2: done exit=0", + sequence_num=2, + ), + ] + + result = _build_communication_threads( + [thread], + messages, + {execution_a: uuid4(), execution_b: uuid4()}, + ) + + assert result[0].task_id is None diff --git a/tests/unit/dashboard/test_event_contract_types.py b/tests/unit/dashboard/test_event_contract_types.py index 5311f0b3..e984decc 100644 --- a/tests/unit/dashboard/test_event_contract_types.py +++ b/tests/unit/dashboard/test_event_contract_types.py @@ -24,5 +24,11 @@ def test_thread_message_dto_exposes_execution_identity() -> None: assert 
"task_execution_id" in RunCommunicationMessageDto.model_fields +def test_thread_dto_exposes_summary_and_task_identity() -> None: + assert "summary" in RunCommunicationThreadDto.model_fields + assert "task_id" in RunCommunicationThreadDto.model_fields + assert "task_id" in RunCommunicationMessageDto.model_fields + + def test_cohort_updated_event_uses_cohort_summary_dto() -> None: assert CohortUpdatedEvent.model_fields["summary"].annotation is CohortSummaryDto diff --git a/tests/unit/runtime/test_communication_service.py b/tests/unit/runtime/test_communication_service.py new file mode 100644 index 00000000..f64c8115 --- /dev/null +++ b/tests/unit/runtime/test_communication_service.py @@ -0,0 +1,114 @@ +from collections.abc import Iterator +from uuid import uuid4 + +import pytest +from sqlalchemy.pool import StaticPool +from sqlmodel import Session, SQLModel, create_engine, select + +from ergon_core.core.runtime.services import communication_service as module +from ergon_core.core.runtime.services.communication_schemas import CreateMessageRequest + +Thread = module.Thread + + +@pytest.fixture() +def session_factory() -> Iterator[tuple[Session, object]]: + engine = create_engine( + "sqlite://", + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + SQLModel.metadata.create_all(engine) + + def _get_session() -> Session: + return Session(engine) + + yield _get_session + + +@pytest.mark.asyncio +async def test_save_message_persists_thread_summary_and_emits_it( + monkeypatch: pytest.MonkeyPatch, + session_factory, +) -> None: + emitted: list[tuple[object, object]] = [] + + async def _record_thread_event(*, run_id: object, thread: object, message: object) -> None: + emitted.append((thread, message)) + + monkeypatch.setattr(module, "get_session", session_factory) + monkeypatch.setattr(module.dashboard_emitter, "thread_message_created", _record_thread_event) + + run_id = uuid4() + execution_id = uuid4() + summary = "Leaf workers report completion artifacts 
and probe exit status." + + response = await module.CommunicationService().save_message( + CreateMessageRequest( + run_id=run_id, + from_agent_id="leaf-l_1", + to_agent_id="parent", + thread_topic="smoke-completion", + thread_summary=summary, + content="l_1: done exit=0", + task_execution_id=execution_id, + ) + ) + + with session_factory() as session: + thread = session.exec(select(Thread).where(Thread.id == response.thread_id)).one() + + assert thread.summary == summary + assert emitted + thread_dto, message_dto = emitted[0] + assert thread_dto.summary == summary + assert message_dto.task_execution_id == str(execution_id) + + +@pytest.mark.asyncio +async def test_save_message_backfills_missing_summary_without_overwriting_existing( + monkeypatch: pytest.MonkeyPatch, + session_factory, +) -> None: + async def _ignore_thread_event(*, run_id: object, thread: object, message: object) -> None: + return None + + monkeypatch.setattr(module, "get_session", session_factory) + monkeypatch.setattr(module.dashboard_emitter, "thread_message_created", _ignore_thread_event) + + service = module.CommunicationService() + run_id = uuid4() + await service.save_message( + CreateMessageRequest( + run_id=run_id, + from_agent_id="leaf-l_1", + to_agent_id="parent", + thread_topic="smoke-completion", + content="l_1: done exit=0", + ) + ) + await service.save_message( + CreateMessageRequest( + run_id=run_id, + from_agent_id="leaf-l_2", + to_agent_id="parent", + thread_topic="smoke-completion", + thread_summary="Completion reports from leaf workers.", + content="l_2: done exit=0", + ) + ) + await service.save_message( + CreateMessageRequest( + run_id=run_id, + from_agent_id="leaf-l_3", + to_agent_id="parent", + thread_topic="smoke-completion", + thread_summary="Replacement summary should not win.", + content="l_3: done exit=0", + ) + ) + + with session_factory() as session: + thread = session.exec(select(Thread).where(Thread.run_id == run_id)).one() + + assert thread.summary == "Completion 
reports from leaf workers." diff --git a/tests/unit/runtime/test_failed_task_sandbox_cleanup.py b/tests/unit/runtime/test_failed_task_sandbox_cleanup.py new file mode 100644 index 00000000..1fb202c9 --- /dev/null +++ b/tests/unit/runtime/test_failed_task_sandbox_cleanup.py @@ -0,0 +1,25 @@ +from unittest.mock import AsyncMock, patch + +import pytest + +from ergon_core.core.providers.sandbox.lifecycle import ( + SandboxTerminationReason, + SandboxTerminationResult, +) +from ergon_core.core.runtime.inngest.propagate_execution import _terminate_failed_task_sandbox + + +@pytest.mark.asyncio +async def test_failed_task_sandbox_cleanup_delegates_to_lifecycle_service() -> None: + result = SandboxTerminationResult( + sandbox_id="sandbox-real", + terminated=True, + reason=SandboxTerminationReason.TERMINATED, + ) + with patch( + "ergon_core.core.runtime.inngest.propagate_execution.terminate_sandbox_by_id", + new=AsyncMock(return_value=result), + ) as terminate: + await _terminate_failed_task_sandbox("sandbox-real") + + terminate.assert_awaited_once_with("sandbox-real") diff --git a/tests/unit/runtime/test_worker_execute_output_failure.py b/tests/unit/runtime/test_worker_execute_output_failure.py new file mode 100644 index 00000000..f421a542 --- /dev/null +++ b/tests/unit/runtime/test_worker_execute_output_failure.py @@ -0,0 +1,12 @@ +from ergon_core.api.results import WorkerOutput +from ergon_core.core.runtime.inngest.worker_execute import _worker_execute_result_from_output + + +def test_worker_execute_result_preserves_worker_output_failure() -> None: + result = _worker_execute_result_from_output( + WorkerOutput(output="probe failed", success=False), + ) + + assert result.success is False + assert result.final_assistant_message == "probe failed" + assert result.error == "probe failed" diff --git a/tests/unit/sandbox/test_sandbox_lifecycle_service.py b/tests/unit/sandbox/test_sandbox_lifecycle_service.py new file mode 100644 index 00000000..2753ab48 --- /dev/null +++ 
b/tests/unit/sandbox/test_sandbox_lifecycle_service.py @@ -0,0 +1,30 @@ +from unittest.mock import AsyncMock, patch + +import pytest + +from ergon_core.core.providers.sandbox.lifecycle import ( + SandboxTerminationReason, + terminate_sandbox_by_id, +) + + +@pytest.mark.asyncio +async def test_terminate_sandbox_by_id_dispatches_real_ids() -> None: + with patch( + "ergon_core.core.providers.sandbox.manager.BaseSandboxManager.terminate_by_sandbox_id", + new=AsyncMock(return_value=True), + ) as terminate: + result = await terminate_sandbox_by_id("sbx-live-123") + + terminate.assert_awaited_once_with("sbx-live-123") + assert result.terminated is True + assert result.reason == SandboxTerminationReason.TERMINATED + + +@pytest.mark.asyncio +async def test_terminate_sandbox_by_id_handles_missing_id_explicitly() -> None: + result = await terminate_sandbox_by_id(None) + + assert result.terminated is False + assert result.reason == SandboxTerminationReason.MISSING_ID + assert result.sandbox_id is None diff --git a/tests/unit/sandbox/test_stub_sandbox_id.py b/tests/unit/sandbox/test_stub_sandbox_id.py index 47b8dc4a..daed92c1 100644 --- a/tests/unit/sandbox/test_stub_sandbox_id.py +++ b/tests/unit/sandbox/test_stub_sandbox_id.py @@ -1,7 +1,7 @@ -"""Tests for is_stub_sandbox_id() sentinel check.""" +"""Tests for test-support stub sandbox IDs.""" import pytest -from ergon_core.core.providers.sandbox.manager import is_stub_sandbox_id +from ergon_core.test_support.sandbox import is_stub_sandbox_id @pytest.mark.parametrize( diff --git a/tests/unit/smoke_base/test_e2e_workflow_limits.py b/tests/unit/smoke_base/test_e2e_workflow_limits.py index f869fb0c..5fc338ba 100644 --- a/tests/unit/smoke_base/test_e2e_workflow_limits.py +++ b/tests/unit/smoke_base/test_e2e_workflow_limits.py @@ -1,11 +1,11 @@ from pathlib import Path -def test_e2e_smoke_matrix_is_serialized_for_e2b_quota() -> None: +def test_e2e_smoke_matrix_runs_benchmarks_in_parallel() -> None: workflow = 
Path(".github/workflows/e2e-benchmarks.yml").read_text() strategy_start = workflow.index(" strategy:") runs_on_start = workflow.index(" runs-on:", strategy_start) strategy_block = workflow[strategy_start:runs_on_start] - assert " max-parallel: 1\n" in strategy_block + assert " max-parallel: 3\n" in strategy_block diff --git a/tests/unit/smoke_base/test_leaf_sends_completion_message.py b/tests/unit/smoke_base/test_leaf_sends_completion_message.py index 5827c0bb..22865dac 100644 --- a/tests/unit/smoke_base/test_leaf_sends_completion_message.py +++ b/tests/unit/smoke_base/test_leaf_sends_completion_message.py @@ -102,6 +102,7 @@ async def _record(request: CreateMessageRequest) -> MagicMock: assert req.from_agent_id == "leaf-l_2" assert req.to_agent_id == "parent" assert req.thread_topic == "smoke-completion" + assert req.thread_summary is None assert "l_2" in req.content assert "exit=0" in req.content diff --git a/tests/unit/smoke_base/test_registry_smoke_entries.py b/tests/unit/smoke_base/test_registry_smoke_entries.py index e09069b2..893b09ea 100644 --- a/tests/unit/smoke_base/test_registry_smoke_entries.py +++ b/tests/unit/smoke_base/test_registry_smoke_entries.py @@ -54,6 +54,8 @@ def test_minif2f_slugs_registered() -> None: assert "minif2f-smoke-worker" in WORKERS assert "minif2f-smoke-leaf" in WORKERS + assert "minif2f-sadpath-smoke-worker" in WORKERS + assert "minif2f-smoke-leaf-failing" in WORKERS assert "minif2f-smoke-criterion" in EVALUATORS @@ -65,6 +67,8 @@ def test_swebench_slugs_registered() -> None: assert "swebench-smoke-worker" in WORKERS assert "swebench-smoke-leaf" in WORKERS + assert "swebench-sadpath-smoke-worker" in WORKERS + assert "swebench-smoke-leaf-failing" in WORKERS assert "swebench-smoke-criterion" in EVALUATORS diff --git a/tests/unit/smoke_base/test_sadpath_worker_routing.py b/tests/unit/smoke_base/test_sadpath_worker_routing.py index fa416e2f..9cad3fe5 100644 --- a/tests/unit/smoke_base/test_sadpath_worker_routing.py +++ 
b/tests/unit/smoke_base/test_sadpath_worker_routing.py @@ -7,41 +7,56 @@ from uuid import uuid4 from ergon_core.core.persistence.shared.types import AssignedWorkerSlug, TaskSlug -from ergon_core.test_support.smoke_fixtures.workers.researchrubrics_smoke_sadpath import ( +import pytest + +from ergon_core.test_support.smoke_fixtures.workers.minif2f_smoke import ( + MiniF2FSadPathSmokeWorker, +) +from ergon_core.test_support.smoke_fixtures.workers.researchrubrics_smoke import ( ResearchRubricsSadPathSmokeWorker, ) +from ergon_core.test_support.smoke_fixtures.workers.swebench_smoke import ( + SweBenchSadPathSmokeWorker, +) -def _worker() -> ResearchRubricsSadPathSmokeWorker: - return ResearchRubricsSadPathSmokeWorker( +@pytest.mark.parametrize( + ("worker_cls", "happy_leaf", "failing_leaf"), + [ + ( + ResearchRubricsSadPathSmokeWorker, + "researchrubrics-smoke-leaf", + "researchrubrics-smoke-leaf-failing", + ), + (MiniF2FSadPathSmokeWorker, "minif2f-smoke-leaf", "minif2f-smoke-leaf-failing"), + (SweBenchSadPathSmokeWorker, "swebench-smoke-leaf", "swebench-smoke-leaf-failing"), + ], +) +def test_l_2_routed_to_failing_leaf(worker_cls, happy_leaf: str, failing_leaf: str) -> None: + worker = worker_cls( name="unit-test", model=None, task_id=uuid4(), sandbox_id="sbx-unit", ) - - -def test_l_2_routed_to_failing_leaf() -> None: - worker = _worker() spec = worker._spec_for("l_2", ("l_1",), "Line 2") assert spec.task_slug == TaskSlug("l_2") - assert spec.assigned_worker_slug == AssignedWorkerSlug( - "researchrubrics-smoke-leaf-failing", - ) + assert spec.assigned_worker_slug == AssignedWorkerSlug(failing_leaf) assert spec.depends_on == [TaskSlug("l_1")] - -def test_all_other_slugs_use_happy_leaf() -> None: - worker = _worker() for slug in ("d_root", "d_left", "d_right", "d_join", "l_1", "l_3", "s_a", "s_b"): spec = worker._spec_for(slug, (), "…") - assert spec.assigned_worker_slug == AssignedWorkerSlug( - "researchrubrics-smoke-leaf", - ), f"{slug} should use happy leaf, got 
{spec.assigned_worker_slug}" + assert spec.assigned_worker_slug == AssignedWorkerSlug(happy_leaf), ( + f"{slug} should use happy leaf, got {spec.assigned_worker_slug}" + ) -def test_only_l_2_is_in_failing_slugs() -> None: +@pytest.mark.parametrize( + "worker_cls", + [ResearchRubricsSadPathSmokeWorker, MiniF2FSadPathSmokeWorker, SweBenchSadPathSmokeWorker], +) +def test_only_l_2_is_in_failing_slugs(worker_cls) -> None: """Sanity: future additions to FAILING_SLUGS should be conscious. If this assertion tightens, the sad-path driver's invariants must be updated in lock-step (8 messages vs 7, partial count, etc.).""" - assert ResearchRubricsSadPathSmokeWorker.FAILING_SLUGS == frozenset({"l_2"}) + assert worker_cls.FAILING_SLUGS == frozenset({"l_2"}) diff --git a/tests/unit/smoke_base/test_smoke_sandbox_manager.py b/tests/unit/smoke_base/test_smoke_sandbox_manager.py index 8e0692fe..661cceb7 100644 --- a/tests/unit/smoke_base/test_smoke_sandbox_manager.py +++ b/tests/unit/smoke_base/test_smoke_sandbox_manager.py @@ -1,3 +1,4 @@ +from pathlib import Path from uuid import UUID, uuid4 import pytest @@ -94,11 +95,16 @@ async def test_static_teardown_closes_registered_smoke_sandbox() -> None: try: sandbox_id = await manager.create(task_id, run_id=run_id) + tempdir = SmokeSandboxManager._tempdirs[task_id] + tempdir_path = Path(tempdir.name) terminated = await BaseSandboxManager.terminate_by_sandbox_id(sandbox_id) assert terminated is True assert manager.get_sandbox(task_id) is None + assert sandbox_id not in SmokeSandboxManager._sandbox_ids + assert task_id not in SmokeSandboxManager._tempdirs + assert not tempdir_path.exists() assert sink.closed == [(str(run_id), sandbox_id)] finally: SmokeSandboxManager.set_event_sink(NoopSandboxEventSink()) diff --git a/tests/unit/test_app_mounts_harness_conditionally.py b/tests/unit/test_app_mounts_harness_conditionally.py index 8310bdf9..dd7ce8f4 100644 --- a/tests/unit/test_app_mounts_harness_conditionally.py +++ 
b/tests/unit/test_app_mounts_harness_conditionally.py @@ -8,6 +8,7 @@ def _reload_app_with(monkeypatch: pytest.MonkeyPatch, env_value: str | None): + monkeypatch.delenv("ERGON_STARTUP_PLUGINS", raising=False) if env_value is None: monkeypatch.delenv("ENABLE_TEST_HARNESS", raising=False) else: diff --git a/tests/unit/test_test_harness.py b/tests/unit/test_test_harness.py index b967e2da..3f34038d 100644 --- a/tests/unit/test_test_harness.py +++ b/tests/unit/test_test_harness.py @@ -8,6 +8,7 @@ from fastapi.testclient import TestClient from ergon_core.core.api import test_harness +from ergon_core.core.api.startup_plugins import run_startup_plugins from ergon_core.core.api.test_harness import get_session_dep, router @@ -108,3 +109,8 @@ def test_reset_requires_secret_header(monkeypatch: pytest.MonkeyPatch) -> None: client = TestClient(app) resp = client.post("/api/test/write/reset", json={"cohort_prefix": "ci-smoke-"}) assert resp.status_code == 401 + + +def test_startup_plugin_loader_rejects_invalid_specs() -> None: + with pytest.raises(RuntimeError, match="expected 'module:function'"): + run_startup_plugins(("ergon_core.test_support.smoke_fixtures",))