diff --git a/.forge/ralph/apply-worker-cli-v5/prd.json b/.forge/ralph/apply-worker-cli-v5/prd.json deleted file mode 100644 index 4437cd4..0000000 --- a/.forge/ralph/apply-worker-cli-v5/prd.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "project": "Webster", - "branchName": "ralph/apply-worker-cli-v5", - "prdFile": "prd.md", - "description": "Expose the existing apply-worker core as a weekly CLI and prove mutation, commit trailers, and validation blocking with integration tests.", - "userStories": [ - { - "id": "US-001", - "title": "Add apply-worker CLI wrapper", - "description": "As a Webster operator, I want a Bun CLI that runs the existing apply-worker core against a weekly history directory so that selected proposal issues become validated experiment commits without manual orchestration.", - "acceptanceCriteria": [ - "Add a script entrypoint following repo conventions, likely scripts/apply-worker/cli.ts, with a #!/usr/bin/env bun shebang and import.meta.main guard.", - "The CLI accepts a week directory argument and reads proposal.md plus decision.json from that directory.", - "The CLI uses existing exports from scripts/apply-worker.ts for parseDecision, parseProposal, applyMutation, runValidation, buildCommitMessage, commitExperiment, emitSkip, and writeApplyLog instead of duplicating US-001 through US-004 core logic.", - "For each selected issue, the CLI applies mutations, runs lint, type-check, and format-check before committing, and creates one Git commit only when all three validation checks pass.", - "Every successful experiment commit message includes the existing trailer format Experiment-Id: exp-NN-slug validated by buildCommitMessage.", - "String mismatches or validation failures are recorded as skipped experiments in apply-log.json and emit structured skip rows through emitSkip without creating a commit for that experiment.", - "Missing week directory, missing proposal.md, or missing decision.json produces a clear non-zero CLI error without mutating 
files.", - "Type-check passes", - "Lint passes with zero warnings", - "Format check passes", - "Tests pass" - ], - "technicalNotes": "Build on scripts/apply-worker.ts:287-317 for parseDecision/parseProposal, 319-357 for applyMutation, 359-369 for runValidation, 372-393 for buildCommitMessage, 395-429 for commitExperiment, and 442-457 for emitSkip/writeApplyLog. Mirror CLI entrypoint/error shape from scripts/critic-genealogy.ts:676-694 and package script style from package.json scripts. Fixture artifact shapes are visible in history/2026-04-23/decision.json and history/2026-04-23/proposal.md.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: scripts/apply-worker-cli.ts, scripts/apply-worker.ts, package.json, .forge/ralph/apply-worker-cli-v5/prd.md." - }, - { - "id": "US-002", - "title": "Add apply-worker integration tests", - "description": "As a Webster maintainer, I want integration tests around the apply-worker CLI so that successful proposals mutate and commit correctly while broken proposals are blocked before commit.", - "acceptanceCriteria": [ - "Add Bun tests under scripts/__tests__/ using the existing bun:test style from scripts/__tests__/memory.test.ts.", - "Create fixture proposal.md and decision.json inputs that match the existing weekly schema and proposal format shown in history/2026-04-23 artifacts.", - "A successful fixture run verifies the target file content changed from the Before block to the After block.", - "A successful fixture run verifies git history contains a commit message trailer exactly matching Experiment-Id: exp-01- for the applied issue.", - "A successful fixture run verifies apply-log.json records the applied experiment with status applied and a commit_sha.", - "A deliberately broken proposal or validation-breaking mutation does not create a Git commit for that experiment.", - "The blocked fixture verifies apply-log.json and/or skips.jsonl records the terminal skip reason, 
including validation failure when the output fails the lint/type/format floor.", - "Tests isolate Git state and filesystem mutations in temporary directories or fixture repositories and do not mutate Webster's real history or site files.", - "bun run validate passes before committing the story.", - "Type-check passes", - "Lint passes with zero warnings", - "Format check passes", - "Tests pass" - ], - "technicalNotes": "Use scripts/__tests__/memory.test.ts:1-18 as the temp-path and cleanup pattern. Exercise the CLI from US-001 rather than only unit-testing helper functions. Keep fixtures minimal but schema-faithful to history/2026-04-23/decision.json:1-25 and proposal markdown sections from history/2026-04-23/proposal.md:1-28. Inspect commit messages with git log in the isolated fixture repo. The broken-output case should prove runValidation from scripts/apply-worker.ts:359-369 prevents commitExperiment from scripts/apply-worker.ts:395-429.", - "dependsOn": [ - "US-001" - ], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/__tests__/apply-worker-cli.test.ts." - } - ] -} diff --git a/.forge/ralph/apply-worker-cli-v5/prd.md b/.forge/ralph/apply-worker-cli-v5/prd.md deleted file mode 100644 index 75910b0..0000000 --- a/.forge/ralph/apply-worker-cli-v5/prd.md +++ /dev/null @@ -1,180 +0,0 @@ -# Apply Worker CLI v5 — Product Requirements - -## Overview - -**Problem**: Webster has the apply-worker core for parsing `proposal.md`, applying text mutations, running validation, emitting skip rows, writing `apply-log.json`, and building experiment commit messages, but the weekly operator still lacks a single CLI entrypoint and integration-level proof that the full proposal-to-commit path is safe. -**Solution**: Add a thin CLI wrapper around the existing apply-worker core and integration tests that exercise successful mutation commits, correct `Experiment-Id` trailers, and validation-blocked broken proposals. 
-**Branch**: `ralph/apply-worker-cli-v5` - ---- - -## Goals & Success - -### Primary Goal - -Expose the merged apply-worker core as a production CLI that can be run against a weekly history directory and can commit only validated experiment mutations. - -### Success Metrics - -| Metric | Target | How Measured | -|--------|--------|--------------| -| CLI entrypoint exists | `scripts/apply-worker/cli.ts` or repo-convention equivalent invokes the core from a weekly directory | Code review and `bun` execution in tests | -| Valid experiment commits | Every applied experiment creates a Git commit with `Experiment-Id: exp-NN-slug` | Integration test inspects `git log --format=%B` | -| Broken output is blocked | A deliberately broken proposal does not create a commit | Integration test compares commit count and apply log/skip output | -| Quality floor | Type, lint, format, validators, markdownlint, and tests pass | `bun run validate` | - -### Non-Goals (Out of Scope) - -- Reimplementing US-001 through US-004 — the core parser, text mutation engine, validation gate, skip-row emission, and apply-log writer already exist in `scripts/apply-worker.ts`. -- Multi-kind proposal routing — tracked separately in Layer 10 #47-#49. -- Visual review or critic rerun gates — downstream of the apply step and not part of this remaining scope. -- Changing proposal or decision schemas — this story consumes the existing `proposal.md` and `decision.json` shapes. - ---- - -## User & Context - -### Target User - -- **Who**: Webster implementation operator running the weekly landing-page improvement loop. -- **Role**: Takes a redesigner proposal and operator decision from `history//`, applies selected edits, and promotes only safe experiments. -- **Current Pain**: The core code exists but the operator cannot reliably run one command that reads weekly artifacts, applies each selected issue, validates, commits, and records blocked experiments. - -### User Journey - -1. 
**Trigger**: The weekly council produces `history//proposal.md` and `history//decision.json`. -2. **Action**: The operator runs the apply-worker CLI against that week directory. -3. **Outcome**: Each valid selected issue lands as its own commit with an experiment trailer; invalid or validation-breaking issues are skipped and recorded without a commit. - ---- - -## UX Requirements - -### Interaction Model - -Command-line only. The CLI should follow existing script conventions: executable Bun TypeScript files under `scripts/`, `#!/usr/bin/env bun`, `import.meta.main` guard, explicit usage/error output, and non-zero exits for bad invocation. Existing entrypoint patterns appear in `scripts/critic-genealogy.ts:676-694`, `scripts/validate-agents.ts:129`, and `scripts/validate-findings.ts:108`. - -Likely command shape: - -```bash -bun scripts/apply-worker/cli.ts history/2026-04-23 -``` - -The CLI reads: - -- `/proposal.md` -- `/decision.json` - -The CLI writes: - -- `/apply-log.json` -- `/skips.jsonl` when an experiment is skipped -- `/memory.jsonl` skip rows via the existing helper -- one Git commit per validated experiment - -### States to Handle - -| State | Description | Behavior | -|-------|-------------|----------| -| Empty | Week directory or required files are missing | Print usage/error and exit non-zero without mutating files | -| Loading | CLI is applying one selected proposal issue | Log current experiment id/title and validation status to stdout/stderr | -| Error | Proposal parse, string mismatch, validation failure, git add, or git commit fails | Record terminal skip when applicable; fail clearly for unrecoverable setup/git errors | -| Success | All selected issues were either committed or explicitly skipped | Write `apply-log.json`; exit 0 if the run completed deterministically | - ---- - -## Technical Context - -### Patterns to Follow - -- **Apply-worker core**: `scripts/apply-worker.ts:287-317` — `parseDecision` and `parseProposal` already map weekly 
artifacts into selected proposal issues. -- **Mutation engine**: `scripts/apply-worker.ts:319-357` — `applyMutation` performs exact string replacement and returns `string_mismatch` instead of silently proceeding. -- **Validation floor**: `scripts/apply-worker.ts:359-369` — `runValidation` runs lint, type-check, and format-check; the CLI must treat any failure as a no-commit skip for that experiment. -- **Commit trailer format**: `scripts/apply-worker.ts:372-393` — `buildCommitMessage` validates `exp-NN-slug` and emits `Experiment-Id: ${expId}`. -- **Git commit helper**: `scripts/apply-worker.ts:395-429` — `commitExperiment` stages touched files and parses the commit SHA. -- **Skip/apply-log writers**: `scripts/apply-worker.ts:442-457` — `emitSkip` and `writeApplyLog` already write terminal skip rows and `apply-log.json`. -- **Fixture schemas**: `history/2026-04-23/decision.json:1-25` shows `week` plus `selected_issues`; `history/2026-04-23/proposal.md:1-28` shows issue headings, target files, and Before/After blocks. -- **Test pattern**: `scripts/__tests__/memory.test.ts:1-18` uses `bun:test`, temp paths, and explicit cleanup; use the same style for integration fixtures. -- **CLI error pattern**: `scripts/critic-genealogy.ts:676-694` guards `import.meta.main`, maps usage errors to exit 2, and unexpected failures to exit 1. 
- ### Types & Interfaces - ```typescript -export interface DecisionJSON { - week: string; - selected_issues: DecisionIssue[]; -} - -export interface ProposalIssue { - index: number; - severity: Severity; - title: string; - files_touched: string[]; - mutations: RawMutation[]; -} - -export interface ApplyExperiment { - exp_id: string; - severity: Severity; - title: string; - status: "applied" | "skipped"; - mutations: MutationResult[]; - commit_sha?: string; - skip_reason?: "string_mismatch" | "lint_failure" | "type_failure" | "format_failure"; - skip_details?: Record<string, unknown>; -} - -export interface ApplyLogJSON { - week: string; - run_timestamp: string; - experiments: ApplyExperiment[]; - validation_summary: { - lint_passed: boolean; - type_check_passed: boolean; - format_check_passed: boolean; - }; -} -``` - -### Architecture Notes - -- Build strictly on top of `scripts/apply-worker.ts`; do not duplicate parser, mutation, validation, skip, log, or commit helpers. -- If the current file must be split to support `scripts/apply-worker/cli.ts`, preserve public exports and avoid changing landed US-001-US-004 behavior except where CLI orchestration needs a missing exported helper. -- Each selected issue should be treated as a separate experiment with deterministic id `exp-${NN}-${slug}` through the existing `buildCommitMessage` guard. -- The hard floor is lint + type-check + format-check before commit. Full repository validation (`bun run validate`) remains the story completion gate. -- Integration tests may need to run in a temporary Git repository or carefully isolated fixture repo so real Webster history is not mutated. 
- ---- - -## Implementation Summary - -### Story Overview - -| ID | Title | Priority | Dependencies | -|----|-------|----------|--------------| -| US-001 | Add apply-worker CLI wrapper | 1 | — | -| US-002 | Add apply-worker integration tests | 2 | US-001 | - -### Dependency Graph - -```text -US-001 (CLI wrapper around existing core) - ↓ -US-002 (integration tests for commits, mutation, and validation blocking) -``` - ---- - -## Validation Requirements - -Every story must pass: - -- [ ] Type-check: `bun run type-check` -- [ ] Lint: `bun run lint --max-warnings 0` -- [ ] Tests: `bun run test` -- [ ] Format: `bun run format:check` -- [ ] Full gate before final commit: `bun run validate` - ---- - -Generated: 2026-04-24T07:47:46Z diff --git a/.forge/ralph/genealogy-gov-v1/prd.json b/.forge/ralph/genealogy-gov-v1/prd.json deleted file mode 100644 index 705e9da..0000000 --- a/.forge/ralph/genealogy-gov-v1/prd.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "project": "Webster", - "branchName": "ralph/genealogy-gov-v1", - "prdFile": "prd.md", - "description": "Implement Feature #55 genealogy governance layers 2-4: dedup, 13-week cap, and archive-on-idle.", - "userStories": [ - { - "id": "US-001", - "title": "Layer 2 embedding dedup blocks overlapping critic specs", - "description": "As a Webster operator, I want new critic specs rejected when their scope substantially overlaps an existing critic so that genealogy does not create duplicate weekly agents.", - "acceptanceCriteria": [ - "Add an orchestrator-side dedup helper in scripts/critic-genealogy.ts that compares a candidate NewCriticSpec against active CriticSummary entries using embedding cosine similarity over scope and description text.", - "Reject candidate specs with cosine similarity >= 0.60 to any active critic before registerAgent() performs POST /v1/agents.", - "Governance rejection prints the closest existing critic name, similarity score, and candidate scope without registering an agent or creating a session.", - 
"Dry-run mode still runs and reports the dedup decision before printing a would-register spec.", - "Unit tests cover below-threshold allow, exactly-0.60 reject, and above-threshold reject behavior without live Anthropic network calls.", - "Type-check passes", - "Tests pass", - "bun run validate passes" - ], - "technicalNotes": "Modify scripts/critic-genealogy.ts around NewCriticSpec/CriticSummary definitions (lines 31-60), active critic loading (lines 155-168), and the main flow before registerAgent() (lines 457+ and main registration section). Follow fail-fast error style from lines 339-356. Add tests in scripts/__tests__/critic-genealogy.test.ts mirroring direct helper tests at lines 93-120 and 160-203. Existing agents expose metadata.scope and description in agents/*-critic.json.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: scripts/critic-genealogy.ts, scripts/__tests__/critic-genealogy.test.ts, .forge/ralph/genealogy-gov-v1/prd.md." 
- }, - { - "id": "US-002", - "title": "Layer 3 13-week cap with operator soft override", - "description": "As a Webster operator, I want a maximum of 3 new critics per 13 weeks with a deliberate override flag so that genealogy growth is bounded but not blocked in exceptional cases.", - "acceptanceCriteria": [ - "Add a CLI soft-override flag, named --override-quarterly-cap or equivalent, to parseArgs() and CLIArgs in scripts/critic-genealogy.ts.", - "Count spawned critic specs from history//genealogy/spec.json in the rolling 13-week window ending at args.weekDate.", - "Block registration when the count is already 3 or more and the override flag is false.", - "Allow registration when the count is 3 or more only if the override flag is true, and print an explicit operator override message.", - "Ignore malformed or missing non-genealogy history directories only when they are irrelevant; malformed in-window genealogy spec data fails loudly with an actionable error.", - "Unit tests cover count 0, count 2 allow, count 3 block, count 3 with override allow, and boundary dates at exactly 13 weeks.", - "Type-check passes", - "Tests pass", - "bun run validate passes" - ], - "technicalNotes": "Extend scripts/critic-genealogy.ts parseArgs() at lines 72-105 and printUsage() at lines 107-115. Use writeArtifacts() output convention at lines 570-585: each spawn writes history//genealogy/spec.json. Gate the main flow before registerAgent() and createSession(). Add parseArgs tests beside scripts/__tests__/critic-genealogy.test.ts lines 44-75 and helper tests with temporary history fixtures.", - "dependsOn": [ - "US-001" - ], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/critic-genealogy.ts, scripts/__tests__/critic-genealogy.test.ts." 
- }, - { - "id": "US-003", - "title": "Layer 4 archive idle spawned critics", - "description": "As a Webster operator, I want spawned critics with no promoted findings in 8 weeks archived so that inactive council members stop consuming weekly attention and tokens.", - "acceptanceCriteria": [ - "Add archive-on-idle logic in scripts/critic-genealogy.ts that evaluates spawned critics and moves idle specs from agents/.json to agents/archive/.json.", - "A spawned critic is archived only when it has 0 findings promoted across the last 8 weeks; critics with at least 1 promoted finding in that window remain active.", - "Original committed baseline critics are not archived by the idle rule unless history proves they were genealogy-spawned.", - "loadExistingCritics() continues to load only active agents/*-critic.json files and excludes agents/archive/*.json by path.", - "Archive actions create agents/archive/ if missing and preserve the JSON spec byte-for-byte except for formatting caused by existing JSON write conventions if needed.", - "Unit tests cover idle spawned critic archived, active spawned critic retained, original critic retained, and archived critic excluded from active critic summaries.", - "Type-check passes", - "Tests pass", - "bun run validate passes" - ], - "technicalNotes": "Implement in scripts/critic-genealogy.ts near loadExistingCritics() lines 155-168 and main startup before active critic summary logging. Use agents/*-critic.json naming shown by agents/brand-voice-critic.json and peers. Use genealogy provenance from history//genealogy/spec.json created by writeArtifacts() lines 570-585. Add tests in scripts/__tests__/critic-genealogy.test.ts next to loadExistingCritics tests at lines 78-90; use temporary fixture directories or exported pure helpers to avoid mutating real agents during tests.", - "dependsOn": [ - "US-001", - "US-002" - ], - "priority": 3, - "passes": true, - "notes": "Implemented in iteration 3. 
Files: scripts/critic-genealogy.ts, scripts/__tests__/critic-genealogy.test.ts." - } - ] -} diff --git a/.forge/ralph/genealogy-gov-v1/prd.md b/.forge/ralph/genealogy-gov-v1/prd.md deleted file mode 100644 index bf766f1..0000000 --- a/.forge/ralph/genealogy-gov-v1/prd.md +++ /dev/null @@ -1,172 +0,0 @@ -# Genealogy Governance Layers 2-4 — Product Requirements - -## Overview - -**Problem**: Webster can now spawn new critics at runtime, but without code-level governors the council can duplicate existing critic scopes, exceed a sensible growth rate, and keep idle critics in weekly runs indefinitely. That creates token-waste drift and weakens the demo claim that genealogy is controlled rather than chaotic. -**Solution**: Implement Q5.1 governance layers 2-4 in the existing genealogy registration path: embedding-based deduplication before registration, a 13-week cap with operator soft override, and archive-on-idle pruning for critics with no promoted findings across 8 weeks. -**Branch**: `ralph/genealogy-gov-v1` - ---- - -## Goals & Success - -### Primary Goal - -Bound runtime critic spawning while preserving legitimate, operator-overridable genealogy growth. 
- -### Success Metrics - -| Metric | Target | How Measured | -|--------|--------|--------------| -| Duplicate critic rejection | New critic specs with >=60% embedding cosine similarity to an existing critic are rejected before `POST /v1/agents` | Unit tests around `scripts/critic-genealogy.ts` registration path | -| Quarterly spawn cap | More than 3 new critics in any rolling 13-week window is blocked unless an operator override flag is present | Unit tests using historical `history/*/genealogy/spec.json` fixtures | -| Idle critic retirement | Spawned critics with 0 promoted findings over 8 weeks are moved to `agents/archive/` and excluded from active critic loading | Unit tests around archive-on-idle logic and `loadExistingCritics()` behavior | -| Validation | `bun run validate` passes with zero lint warnings | Project validation command | - -### Non-Goals (Out of Scope) - -- Layer 1 prompt rubric edits in `prompts/second-wbs-session.md` — explicitly deferred until `feat/orch-memory-planner-v2` PR #6 merges because that branch is actively modifying the same file. -- Redesigning planner or redesigner request schemas — Feature #55 scope is governance layers 2-4 only. -- Deleting retired critics from Git history or the Managed Agents API — Layer 4 archives local specs recoverably rather than destructive deletion. -- Building live embedding infrastructure beyond this path — the dedup check is local to `scripts/critic-genealogy.ts` new-critic registration. - ---- - -## User & Context - -### Target User - -- **Who**: Webster operator running weekly landing-page improvement sessions. -- **Role**: Maintains a council of Claude Managed Agents and reviews automated changes before submission or deployment. -- **Current Pain**: Runtime genealogy is powerful, but every extra critic is a recurring weekly cost. Duplicate or idle critics turn the council into an expensive echo chamber. - -### User Journey - -1. 
**Trigger**: Planner or genealogy detection identifies a possible unowned concern and `scripts/critic-genealogy.ts` prepares a new critic spec. -2. **Action**: The orchestrator-side genealogy script evaluates overlap, recent spawn count, and idle critic state before registering or invoking agents. -3. **Outcome**: Legitimate critics are registered and invoked; duplicate or over-cap critics are blocked with explicit evidence; idle spawned critics are archived before future council runs. - ---- - -## UX Requirements - -### Interaction Model - -This is backend/CLI orchestration. The primary interface remains: - -```bash -bun scripts/critic-genealogy.ts --branch <name> [--week YYYY-MM-DD] [--lp-target URL] [--dry-run] -bun scripts/critic-genealogy.ts --fixtures <dir> [--week YYYY-MM-DD] [--lp-target URL] [--dry-run] -``` - -Layer 3 adds an operator soft-override flag, for example `--override-quarterly-cap`, that allows a human-approved spawn when the 13-week cap has already been reached. Layer 4 archive-on-idle should run from the same script before active critic loading/registration so archived critics are not considered active council members. 
- -### States to Handle - -| State | Description | Behavior | -|-------|-------------|----------| -| Empty | No spawned genealogy history or no archived critics yet | Dedup still compares against current `agents/*-critic.json`; cap count is 0; retire pass no-ops | -| Loading | Embedding similarity or API-backed registration is in progress | Script prints explicit progress and continues existing fail-fast error behavior | -| Error | Embedding request fails, malformed history exists, archive move fails, or cap blocks without override | Script exits non-zero for operational errors; governance blocks print actionable reason and skip registration | -| Success | New spec is below 60% overlap, under cap or operator-overridden, and idle critics are archived | Script registers/invokes as today and writes artifacts; archive pass moves idle specs to `agents/archive/` | - ---- - -## Technical Context - -### Patterns to Follow - -- **Similar implementation**: `scripts/critic-genealogy.ts:155-168` — active critics are discovered from `agents/*-critic.json`; Layer 4 should exclude `agents/archive/` by keeping archived files outside this glob. -- **Similar implementation**: `scripts/critic-genealogy.ts:457-477` — `registerAgent()` is the correct choke point before `POST /v1/agents`; Layer 2 and Layer 3 checks should run before this call. -- **Similar implementation**: `scripts/critic-genealogy.ts:570-585` — `writeArtifacts()` records genealogy specs under `history//genealogy/`; Layer 3 can count recent spawns from these artifacts. -- **Component pattern**: `scripts/critic-genealogy.ts:72-105` — CLI flags are parsed with explicit mutually-exclusive validation and `CLIError`; add the soft-override flag here. -- **Error handling pattern**: `scripts/critic-genealogy.ts:141-152` and `scripts/critic-genealogy.ts:339-356` — invalid state fails loudly with clear error messages, no silent fallback. 
-- **Test pattern**: `scripts/__tests__/critic-genealogy.test.ts:44-75` — CLI parsing tests assert accepted and rejected flags. -- **Test pattern**: `scripts/__tests__/critic-genealogy.test.ts:78-90` — active critic loading behavior is unit-tested directly. -- **Test pattern**: `scripts/__tests__/critic-genealogy.test.ts:160-203` — generated agent JSON behavior is tested with direct helpers and schema validation. - -### Types & Interfaces - -```typescript -interface NewCriticSpec { - name: string; - scope: string; - description: string; - rationale: string; - focus_owned: string[]; - focus_not_owned: string[]; - severity_rubric: string; -} - -interface AgentJSON { - name: string; - description: string; - model: string; - system: string; - tools: unknown[]; - mcp_servers?: unknown[]; - metadata?: Record<string, unknown>; -} - -interface CriticSummary { - name: string; - scope: string; - description: string; -} - -interface CLIArgs { - branch: string | null; - fixtures: string | null; - weekDate: string; - lpTarget: string; - dryRun: boolean; - // add: overrideQuarterlyCap: boolean; -} -``` - -### Architecture Notes - -- Feature #55 is governed by `context/FEATURES.md:170` and Q5.1 in `context/DOMAIN-MODEL.md:303-333`; use the user's updated thresholds for this PRD: 60% cosine overlap, max 3 critics per 13 weeks, and 0 promoted findings in 8 weeks. -- Existing critic specs live in `agents/*-critic.json`; active critics include the five original critics plus `visual-design-critic.json` if present. -- Spawn artifacts live under `history/<week>/genealogy/spec.json`, created by `writeArtifacts()`. -- Registration currently happens through `registerAgent()` after `spliceNewSpec()` creates an `AgentJSON`; governance should block before remote agent creation and before session creation. -- Promoted findings evidence should come from existing history artifacts where available. If implementation needs a source of truth, prefer explicit history rows over inferring from current findings text. 
-- Validation follows `CLAUDE.md`: zero lint warnings, full type check, format check, tests, and `bun run validate` before declaring done. - ---- - -## Implementation Summary - -### Story Overview - -| ID | Title | Priority | Dependencies | -|----|-------|----------|--------------| -| US-001 | Layer 2 embedding dedup blocks overlapping critic specs | 1 | — | -| US-002 | Layer 3 13-week cap with operator soft override | 2 | US-001 | -| US-003 | Layer 4 archive idle spawned critics | 3 | US-001, US-002 | - -### Dependency Graph - -```text -US-001 (dedup guard before registration) - ↓ -US-002 (rolling 13-week cap + soft override) - ↓ -US-003 (archive-on-idle pruning) -``` - ---- - -## Validation Requirements - -Every story must pass: - -- [ ] Type-check: `bun run type-check` -- [ ] Lint: `bun run lint --max-warnings 0` -- [ ] Tests: `bun run test` -- [ ] Format: `bun run format:check` -- [ ] Full project gate: `bun run validate` - ---- - -Generated: 2026-04-24T00:00:00.000Z diff --git a/.forge/ralph/genealogy-gov-v1/progress.txt b/.forge/ralph/genealogy-gov-v1/progress.txt deleted file mode 100644 index 41625c8..0000000 --- a/.forge/ralph/genealogy-gov-v1/progress.txt +++ /dev/null @@ -1,88 +0,0 @@ -## Codebase Patterns - -### Archive-on-idle uses explicit decision owners -- **Where**: `scripts/critic-genealogy.ts` -- **Pattern**: Treat `history//genealogy/spec.json` as spawn provenance and `history//decision.json:selected_issues[].owner` as promoted-finding evidence; move agent specs with `renameSync` to preserve bytes. -- **Example**: `archiveIdleSpawnedCritics(agentsDir, historyRoot, weekDate)` runs before `loadExistingCritics()` in `main()`. - -### Governance helpers stay pure and injectable -- **Where**: `scripts/critic-genealogy.ts` -- **Pattern**: Put governance decisions in exported pure helpers and inject data providers, then call the helper in `main()` immediately before the side-effecting API boundary. 
-- **Example**: `evaluateCriticDedup(candidate, critics, embed)` runs before `registerAgent()` and is tested with deterministic vectors. - ---- - -## 2026-04-24T00:00:00.000Z — US-001: Layer 2 embedding dedup blocks overlapping critic specs - -**Status**: PASSED -**Files changed**: -- `scripts/critic-genealogy.ts` — added cosine-similarity dedup helper over candidate/existing critic scope and description, governance rejection output, and pre-registration gate. -- `scripts/__tests__/critic-genealogy.test.ts` — added no-network unit coverage for below-threshold allow, exact 0.60 reject, and above-threshold reject behavior. -- `.forge/ralph/genealogy-gov-v1/prd.md` — fixed markdownlint MD036 formatting so `bun run validate` can pass. -- `.forge/ralph/genealogy-gov-v1/prd.json` — marked US-001 complete. - -**Acceptance criteria verified**: -- [x] Add an orchestrator-side dedup helper in `scripts/critic-genealogy.ts` that compares a candidate `NewCriticSpec` against active `CriticSummary` entries using embedding cosine similarity over scope and description text. -- [x] Reject candidate specs with cosine similarity >= 0.60 to any active critic before `registerAgent()` performs `POST /v1/agents`. -- [x] Governance rejection prints the closest existing critic name, similarity score, and candidate scope without registering an agent or creating a session. -- [x] Dry-run mode still runs and reports the dedup decision before printing a would-register spec. -- [x] Unit tests cover below-threshold allow, exactly-0.60 reject, and above-threshold reject behavior without live Anthropic network calls. -- [x] Type-check passes. -- [x] Tests pass. -- [x] `bun run validate` passes. - -**Learnings**: -- `bun run validate` includes markdownlint over `.forge/ralph/**/*.md`; generated PRD footer emphasis triggered MD036 and had to be normalized. 
-- The local package has no `cli` script, so Ralph workflow event emits fail harmlessly with `Script not found "cli"` under the required `|| true` guard. - ---- -## 2026-04-24T08:29:55.000Z — US-002: Layer 3 13-week cap with operator soft override - -**Status**: PASSED -**Files changed**: -- `scripts/critic-genealogy.ts` — added `--override-quarterly-cap`, rolling 13-week genealogy spawn counting, malformed in-window spec validation, and pre-registration cap gate. -- `scripts/__tests__/critic-genealogy.test.ts` — added no-network unit coverage for count 0, count 2 allow, count 3 block, count 3 override allow, exactly-13-week boundary inclusion, and malformed in-window genealogy spec failure. -- `.forge/ralph/genealogy-gov-v1/prd.json` — marked US-002 complete. - -**Acceptance criteria verified**: -- [x] Add a CLI soft-override flag, named `--override-quarterly-cap` or equivalent, to `parseArgs()` and `CLIArgs` in `scripts/critic-genealogy.ts`. -- [x] Count spawned critic specs from `history//genealogy/spec.json` in the rolling 13-week window ending at `args.weekDate`. -- [x] Block registration when the count is already 3 or more and the override flag is false. -- [x] Allow registration when the count is 3 or more only if the override flag is true, and print an explicit operator override message. -- [x] Ignore malformed or missing non-genealogy history directories only when they are irrelevant; malformed in-window genealogy spec data fails loudly with an actionable error. -- [x] Unit tests cover count 0, count 2 allow, count 3 block, count 3 with override allow, and boundary dates at exactly 13 weeks. -- [x] Type-check passes. -- [x] Tests pass. -- [x] `bun run validate` passes. - -**Learnings**: -- The registration choke point now has two pure governance gates before API side effects: dedup first, then quarterly cap before `registerAgent()` and `createSession()`. 
-- Treat the 13-week boundary as inclusive: a spawn exactly 91 days before `args.weekDate` counts toward the cap. -- Non-date history directories can be ignored; in-window `genealogy/spec.json` files must parse as agent-like JSON so broken genealogy artifacts fail loudly. - ---- -## 2026-04-24T09:15:00.000Z — US-003: Layer 4 archive idle spawned critics - -**Status**: PASSED -**Files changed**: -- `scripts/critic-genealogy.ts` — added genealogy-spawn provenance loading, promoted-finding owner parsing from decision history, archive-on-idle moves to `agents/archive/`, and startup pruning before active critic loading. -- `scripts/__tests__/critic-genealogy.test.ts` — added unit coverage for idle spawned critic archival, active spawned critic retention, original critic retention, archived critic exclusion from active summaries, promoted owner parsing, and spawned provenance loading. -- `.forge/ralph/genealogy-gov-v1/prd.json` — marked US-003 complete. - -**Acceptance criteria verified**: -- [x] Add archive-on-idle logic in `scripts/critic-genealogy.ts` that evaluates spawned critics and moves idle specs from `agents/.json` to `agents/archive/.json`. -- [x] A spawned critic is archived only when it has 0 findings promoted across the last 8 weeks; critics with at least 1 promoted finding in that window remain active. -- [x] Original committed baseline critics are not archived by the idle rule unless history proves they were genealogy-spawned. -- [x] `loadExistingCritics()` continues to load only active `agents/*-critic.json` files and excludes `agents/archive/*.json` by path. -- [x] Archive actions create `agents/archive/` if missing and preserve the JSON spec byte-for-byte except for formatting caused by existing JSON write conventions if needed. -- [x] Unit tests cover idle spawned critic archived, active spawned critic retained, original critic retained, and archived critic excluded from active critic summaries. -- [x] Type-check passes. -- [x] Tests pass. 
-- [x] `bun run validate` passes. - -**Learnings**: -- Spawned-critic provenance should come from `history//genealogy/spec.json`, not from current agent names alone, so baseline critics are safe by default. -- Promoted-finding evidence is explicit in `decision.json:selected_issues[].owner`; missing decision files mean no promoted findings for that week, while malformed present decision files fail loudly. -- `renameSync` preserves archived agent JSON bytes and avoids rewriting specs during governance pruning. - ---- diff --git a/.forge/ralph/orch-memory-planner-v1/prd.json b/.forge/ralph/orch-memory-planner-v1/prd.json deleted file mode 100644 index 74f15e8..0000000 --- a/.forge/ralph/orch-memory-planner-v1/prd.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "project": "Webster", - "branchName": "feat/orch-memory-planner-v1", - "prdFile": "prd.md", - "description": "Add the orchestrator step that marshals memory + verdict + monitor context, invokes webster-planner via the Managed Agents flow, parses the JSON response, writes history//plan.md, and appends a verdict-ready event to history/memory.jsonl.", - "userStories": [ - { - "id": "US-001", - "title": "Memory marshaling helper", - "description": "As a Webster operator, I want a pure TypeScript helper that reads memory.jsonl tail plus recent verdicts plus the monitor anomaly report and returns a single concatenated user.message text so that the planner always receives the same shape of context.", - "acceptanceCriteria": [ - "Add scripts/planner-context.ts exporting marshalPlannerContext(opts: { memoryPath: string; verdictDir: string; monitorPath: string; tailN?: number }): string.", - "The function uses the feature #51 tailN helper from src/memory (or scripts/memory) to read the last N=50 events from memoryPath; it does not re-implement tail logic.", - "The function reads the two most recent history//verdict.json files under verdictDir sorted by week slug descending; missing verdict files are skipped without throwing.", - 
"The function reads the monitor anomaly report text file at monitorPath; a missing monitor file is skipped without throwing.", - "The returned string begins with a MEMORY_TAIL section, then a RECENT_VERDICTS section, then a MONITOR_ANOMALIES section, each delimited by a stable header the test file can match.", - "When all three inputs are empty or missing, the function returns a string that explicitly labels the cold-start state rather than an empty string.", - "Add a Bun test under scripts/__tests__/planner-context.test.ts with fixtures under tmp paths verifying: tailN wiring, two-verdict ordering, missing-file skips, and cold-start labeling.", - "bun run type-check, bun run lint --max-warnings 0, bun test, bun run format:check pass." - ], - "technicalNotes": "Follow the style in scripts/critic-genealogy.ts and scripts/memory.ts (or src/memory). Use readFileSync + path.join; do not use async file APIs unless the existing modules already do. Keep the function free of network I/O. Reuse the feature #51 export rather than re-reading JSONL lines directly.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: scripts/planner-context.ts, scripts/__tests__/planner-context.test.ts." 
- }, - { - "id": "US-002", - "title": "Planner invocation + plan writer", - "description": "As a Webster operator, I want a helper that invokes the webster-planner Managed Agent using the marshaled context, parses the structured response, writes history//plan.md, and appends a verdict-ready row to history/memory.jsonl so that downstream critics can consume the plan.", - "acceptanceCriteria": [ - "Add scripts/planner-invoke.ts exporting invokePlanner(opts: { contextText: string; week: string; historyDir: string; apiKey: string }): Promise<{ planPath: string; plan: PlanRecord }>.", - "The function looks up or registers the webster-planner agent via POST /v1/agents, mirroring the find-or-register pattern in scripts/critic-genealogy.ts:440-556.", - "The function creates a session via POST /v1/sessions, sends contextText as the user.message event, and polls until the session is idle.", - "The function extracts the final assistant text and parses it as JSON with fields classification, next_action, direction_hint, optional new_critic_request, and rationale; next_action must be one of promote_and_experiment, hold_baseline, revert_and_retry, explore_broadly.", - "The function writes history//plan.md as human-readable markdown that embeds the parsed JSON in a fenced code block and prints the parsed fields as a bulleted summary above the fence.", - "The function appends one event row to history/memory.jsonl with event = 'verdict-ready' using the feature #51 appendEvent helper and includes refs.plan = relative path to the written plan.md.", - "Invalid JSON, missing required fields, or unknown next_action values raise an Error with a descriptive message and do NOT write plan.md or append to memory.jsonl.", - "Add a Bun test under scripts/__tests__/planner-invoke.test.ts that mocks fetch (global.fetch or bun:test mock) to exercise: happy path with a valid JSON response, malformed response rejection, and the memory.jsonl append side effect.", - "bun run type-check, bun run lint 
--max-warnings 0, bun test, bun run format:check pass." - ], - "technicalNotes": "Reuse helper names / patterns from scripts/critic-genealogy.ts (registerAgent, createSession, sendUserMessage, pollUntilIdle) rather than duplicating low-level fetch code. Keep the HTTP base URL configurable via env (ANTHROPIC_API_BASE defaulting to https://api.anthropic.com). Import appendEvent from the same module feature #51 added. Do NOT edit agents/webster-planner.json (owned by feature #50) and do NOT edit prompts/second-wbs-session.md in this story (US-003 owns that).", - "dependsOn": [ - "US-001" - ], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/planner-invoke.ts, scripts/__tests__/planner-invoke.test.ts." - }, - { - "id": "US-003", - "title": "Orchestrator step in prompts/second-wbs-session.md", - "description": "As a Webster operator, I want a new orchestration step in prompts/second-wbs-session.md that runs BEFORE critic fan-out and calls the US-002 helper with US-001 inputs so that the planner's plan.md is ready for critics to read.", - "acceptanceCriteria": [ - "Edit prompts/second-wbs-session.md to add a new numbered step titled 'Run planner' placed BEFORE the critic fan-out step.", - "The step shows the bash/bun invocation that marshals context via scripts/planner-context.ts and invokes the planner via scripts/planner-invoke.ts, with the week argument set to the current ISO week folder name under history/.", - "The step specifies that on planner error the run halts with a non-zero exit status and a pointer to the error message.", - "The step references history//plan.md as the output artifact consumed by later steps.", - "Update README.md or an adjacent doc section if the prior council flow explicitly enumerated the steps, so the step count remains accurate.", - "bun run validate passes." - ], - "technicalNotes": "Edit prompts/second-wbs-session.md only \u2014 do not change orchestrator-owned I/O in the helpers. 
The step should read like the existing numbered steps: plain bash with comments and exit-on-error semantics. Do not implement runtime invocation of critics from this file; that remains in the later fan-out step.", - "dependsOn": [ - "US-001", - "US-002" - ], - "priority": 3, - "passes": true, - "notes": "Implemented in iteration 3. Files: prompts/second-wbs-session.md, README.md." - } - ] -} diff --git a/.forge/ralph/orch-memory-planner-v1/prd.md b/.forge/ralph/orch-memory-planner-v1/prd.md deleted file mode 100644 index d50e0c7..0000000 --- a/.forge/ralph/orch-memory-planner-v1/prd.md +++ /dev/null @@ -1,65 +0,0 @@ -# Orchestrator Memory Marshaling + Planner Invocation (L11 #52) - -## Problem - -Webster's council flow fans out critics + redesigner, but week-over-week -learning currently has no explicit planner step. Feature #50 ships the -`webster-planner` Managed Agent spec. Feature #51 ships the -`history/memory.jsonl` event substrate + `appendEvent` / `tailN` helpers. -This feature wires the two together: an orchestrator step that runs BEFORE -critic fan-out, invokes the planner, writes `plan.md`, and logs the event. - -Per ADR-0001 the orchestrator owns all JSONL I/O. The planner agent never -touches disk — it receives marshaled context as `user.message` text and -returns structured output the orchestrator parses. - -## Scope - -- Add a TypeScript helper module that marshals the planner's input context. -- Add a TypeScript helper module that invokes the planner via the - Anthropic Agents Managed-Agents flow and writes the decoded `plan.md`. -- Add a new orchestration step to `prompts/second-wbs-session.md` that - calls the helpers BEFORE the critic fan-out step. - -Out of scope (covered by separate features): - -- Plan → critic context wiring (#53). -- Cold-start explore-broadly defaults (#54) — this feature must not - crash when memory tail is empty, but the dedicated cold-start logic is #54. -- Critic-genealogy invocation of `new_critic_request` (#55). 
- -## Invariants - -- Orchestrator-owned I/O. No disk writes from inside the planner agent - prompt or tool definitions. -- Append-only `history/memory.jsonl`. Use the `appendEvent` helper from - feature #51. Never mutate prior rows. -- Zero lint warnings. `bun run validate` must pass. -- No silent fallbacks. If the planner call fails or returns unparseable - output, surface the error — do not fabricate a plan. -- No API keys in committed code. Load from environment. - -## Stories - -### US-001 — Memory marshaling helper - -Add `scripts/planner-context.ts` exporting a pure function that reads the -last N memory events plus recent verdict files plus the monitor anomaly -report and returns a single concatenated text payload suitable for the -planner's `user.message`. - -### US-002 — Planner invocation + plan writer - -Add `scripts/planner-invoke.ts` exporting a function that registers the -`webster-planner` agent (idempotent lookup), creates a session, sends the -marshaled user message, polls until idle, parses the planner's JSON -response, writes `history/<week>/plan.md`, and appends a `verdict-ready` -row to `history/memory.jsonl` via the feature #51 helper. - -### US-003 — Orchestrator integration step - -Edit `prompts/second-wbs-session.md` to add a new numbered step that runs -BEFORE the critic fan-out step. The step invokes the helper from US-002 -using the marshaled context from US-001, writes `plan.md` into the current -week's `history/<week>/` directory, and halts the run if the planner call -returns an error.
diff --git a/.forge/ralph/orch-memory-planner-v1/progress.txt b/.forge/ralph/orch-memory-planner-v1/progress.txt deleted file mode 100644 index 09c3091..0000000 --- a/.forge/ralph/orch-memory-planner-v1/progress.txt +++ /dev/null @@ -1,81 +0,0 @@ -## Codebase Patterns - -### Planner invocation fails closed before disk writes -- **Where**: `scripts/planner-invoke.ts` -- **Pattern**: Parse and validate the final assistant JSON before creating `history//plan.md` or appending to `history/memory.jsonl`; failed planner output leaves no partial plan artifact. -- **Example**: `const plan = parsePlanRecord(extractFinalAssistantText(snapshot));` - -### Orchestrator memory helpers import the substrate directly -- **Where**: `scripts/planner-context.ts` -- **Pattern**: Higher-level orchestrator helpers should import `tailN` from `scripts/memory.ts` instead of re-reading JSONL lines. -- **Example**: `const memoryEvents = readMemoryTail(opts.tailN ?? DEFAULT_TAIL_N, opts.memoryPath);` - ---- - -## 2026-04-24 — US-001: Memory marshaling helper - -**Status**: PASSED -**Files changed**: -- `scripts/planner-context.ts` — added `marshalPlannerContext` with memory tail, recent verdict, monitor anomaly, and cold-start sections. -- `scripts/__tests__/planner-context.test.ts` — added Bun coverage for tailN wiring, verdict ordering, missing-file skips, and cold-start labeling. - -**Acceptance criteria verified**: -- [x] Add scripts/planner-context.ts exporting marshalPlannerContext(opts: { memoryPath: string; verdictDir: string; monitorPath: string; tailN?: number }): string. -- [x] The function uses the feature #51 tailN helper from src/memory (or scripts/memory) to read the last N=50 events from memoryPath; it does not re-implement tail logic. -- [x] The function reads the two most recent history//verdict.json files under verdictDir sorted by week slug descending; missing verdict files are skipped without throwing. 
-- [x] The function reads the monitor anomaly report text file at monitorPath; a missing monitor file is skipped without throwing. -- [x] The returned string begins with a MEMORY_TAIL section, then a RECENT_VERDICTS section, then a MONITOR_ANOMALIES section, each delimited by a stable header the test file can match. -- [x] When all three inputs are empty or missing, the function returns a string that explicitly labels the cold-start state rather than an empty string. -- [x] Add a Bun test under scripts/__tests__/planner-context.test.ts with fixtures under tmp paths verifying: tailN wiring, two-verdict ordering, missing-file skips, and cold-start labeling. -- [x] bun run type-check, bun run lint --max-warnings 0, bun test, bun run format:check pass. - -**Learnings**: -- `scripts/memory.ts` exposes `tailN(n, logPath)`, so callers pass the count first and the memory path second. -- No prior `progress.txt` existed for this PRD directory, so this iteration created it with the reusable pattern section. - ---- - -## 2026-04-24 — US-002: Planner invocation + plan writer - -**Status**: PASSED -**Files changed**: -- `scripts/planner-invoke.ts` — added `invokePlanner` with Managed Agent lookup/registration, session creation, context message send, idle polling, strict plan JSON parsing, plan markdown writing, and `verdict-ready` memory append. -- `scripts/__tests__/planner-invoke.test.ts` — added Bun fetch-mock coverage for a valid planner response, malformed JSON rejection, unknown `next_action` rejection, and memory append side effects. - -**Acceptance criteria verified**: -- [x] Add scripts/planner-invoke.ts exporting invokePlanner(opts: { contextText: string; week: string; historyDir: string; apiKey: string }): Promise<{ planPath: string; plan: PlanRecord }>. -- [x] The function looks up or registers the webster-planner agent via POST /v1/agents, mirroring the find-or-register pattern in scripts/critic-genealogy.ts:440-556. 
-- [x] The function creates a session via POST /v1/sessions, sends contextText as the user.message event, and polls until the session is idle. -- [x] The function extracts the final assistant text and parses it as JSON with fields classification, next_action, direction_hint, optional new_critic_request, and rationale; next_action must be one of promote_and_experiment, hold_baseline, revert_and_retry, explore_broadly. -- [x] The function writes history//plan.md as human-readable markdown that embeds the parsed JSON in a fenced code block and prints the parsed fields as a bulleted summary above the fence. -- [x] The function appends one event row to history/memory.jsonl with event = 'verdict-ready' using the feature #51 appendEvent helper and includes refs.plan = relative path to the written plan.md. -- [x] Invalid JSON, missing required fields, or unknown next_action values raise an Error with a descriptive message and do NOT write plan.md or append to memory.jsonl. -- [x] Add a Bun test under scripts/__tests__/planner-invoke.test.ts that mocks fetch (global.fetch or bun:test mock) to exercise: happy path with a valid JSON response, malformed response rejection, and the memory.jsonl append side effect. -- [x] bun run type-check, bun run lint --max-warnings 0, bun test, bun run format:check pass. - -**Learnings**: -- `environments/webster-council-env.id` is available in this worktree, so planner session creation can mirror genealogy sessions by passing `environment_id` without expanding the public `invokePlanner` options. -- `agents/webster-planner.json` is absent in this branch even though feature #50 is marked landed elsewhere; the helper supports registration when the spec exists and reuses an already-registered planner without reading the missing file. -- `bun run cli` is not defined in this repo, so Ralph workflow event emissions failed harmlessly under the required `|| true` wrapper. 
- ---- -## 2026-04-24 — US-003: Orchestrator step in prompts/second-wbs-session.md - -**Status**: PASSED -**Files changed**: -- `prompts/second-wbs-session.md` — added fail-closed Step 3 planner orchestration before critic fan-out, marshaling context through `scripts/planner-context.ts`, invoking `scripts/planner-invoke.ts`, committing `history/<week>/plan.md` + `history/memory.jsonl`, and passing `PLAN_PATH` to downstream agents. -- `README.md` — updated architecture diagram, weekly-flow step count, runtime estimate, and weekly-run summary to include the planner step. - -**Acceptance criteria verified**: -- [x] Edit prompts/second-wbs-session.md to add a new numbered step titled 'Run planner' placed BEFORE the critic fan-out step. -- [x] The step shows the bash/bun invocation that marshals context via scripts/planner-context.ts and invokes the planner via scripts/planner-invoke.ts, with the week argument set to the current ISO week folder name under history/. -- [x] The step specifies that on planner error the run halts with a non-zero exit status and a pointer to the error message. -- [x] The step references history/<week>/plan.md as the output artifact consumed by later steps. -- [x] Update README.md or an adjacent doc section if the prior council flow explicitly enumerated the steps, so the step count remains accurate. -- [x] bun run validate passes. - -**Learnings**: -- The weekly runner uses `WEEK_DATE=$(date -u +%Y-%m-%d)` as its history folder slug, so the planner step reuses that existing ISO-8601 UTC folder naming pattern instead of introducing a second week format. -- `bun --eval` receives user arguments at `process.argv.slice(1)`, which keeps the prompt-only invocation small without adding CLI code to the helper modules.
- ---- diff --git a/.forge/ralph/planner-agent-spec-v5/prd.json b/.forge/ralph/planner-agent-spec-v5/prd.json deleted file mode 100644 index 4328b68..0000000 --- a/.forge/ralph/planner-agent-spec-v5/prd.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "project": "Webster", - "branchName": "ralph/planner-agent-spec-v5", - "prdFile": "prd.md", - "description": "Add a schema-valid Opus 4.7 webster-planner Managed Agent spec and tests for its plan.md contract and registration-flow shape.", - "userStories": [ - { - "id": "US-001", - "title": "Add schema-valid planner Managed Agent spec", - "description": "As a Webster implementation operator, I want agents/webster-planner.json to exist as an Opus 4.7 Managed Agent spec so that later orchestration can register and invoke the planner.", - "acceptanceCriteria": [ - "Create agents/webster-planner.json with name \"webster-planner\", model \"claude-opus-4-7\", and required fields accepted by scripts/schemas/agent.schema.json.", - "The spec uses field \"system\" and does not include rejected fields such as \"system_prompt\" or \"callable_agents\".", - "The system prompt states that user.message supplies marshaled memory context: memory.jsonl tail, last two weeks verdict context, and monitor anomaly report.", - "The system prompt defines the plan.md JSON fields: classification, next_action, direction_hint, optional new_critic_request, and rationale.", - "The next_action enum in the system prompt includes exactly promote_and_experiment, hold_baseline, revert_and_retry, and explore_broadly.", - "The system prompt instructs cold-start/week-1/no-prior-verdict handling to use explore_broadly.", - "Feature #52 and #53 behavior is not implemented in this story.", - "bun run validate:agents passes" - ], - "technicalNotes": "Modify agents/webster-planner.json. Mirror the spec shape in agents/webster-redesigner.json and agents/webster-monitor.json: name, description, model, system, tools, optional mcp_servers, metadata. 
Follow scripts/schemas/agent.schema.json constraints: required name/description/model/system/tools, no additional properties, model enum includes claude-opus-4-7, metadata.role must be one of critic/monitor/redesigner/orchestrator. Use metadata { role: \"orchestrator\", scope: \"planning\" } because the schema does not currently allow role \"planner\". Scope guard: do not edit prompts/second-wbs-session.md, scripts/memory.ts, or council fan-out code for runtime invocation.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: agents/webster-planner.json, .forge/ralph/planner-agent-spec-v5/prd.md." - }, - { - "id": "US-002", - "title": "Add planner output contract tests", - "description": "As a Webster implementation operator, I want tests for the planner's plan.md contract so that future orchestration can rely on stable fields and action values.", - "acceptanceCriteria": [ - "Add a Bun test that reads agents/webster-planner.json and asserts its system prompt contains all required output fields: classification, next_action, direction_hint, new_critic_request, and rationale.", - "Add a Bun test that asserts the planner system prompt contains all four allowed next_action values: promote_and_experiment, hold_baseline, revert_and_retry, explore_broadly.", - "Add a Bun test that asserts the planner system prompt describes cold-start behavior for week 1/no prior verdict and ties it to explore_broadly.", - "Add a Bun test that asserts the planner system prompt names all three input context sources: memory.jsonl, verdict, and monitor anomaly report or alerts.", - "Tests fail if agents/webster-planner.json is missing or invalid JSON.", - "bun test passes" - ], - "technicalNotes": "Add tests under scripts/__tests__ using the existing Bun style in scripts/__tests__/validate-agents.test.ts and scripts/__tests__/critic-genealogy.test.ts: import { describe, expect, test } from \"bun:test\", read JSON with readFileSync, and resolve 
ROOT via import.meta.dir. Keep tests focused on the agent spec contract; do not create runtime planner invocation helpers because feature #52 owns invocation.", - "dependsOn": [ - "US-001" - ], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/__tests__/planner-agent-contract.test.ts." - }, - { - "id": "US-003", - "title": "Add registration-flow guard tests and validate", - "description": "As a Webster implementation operator, I want tests that guard the Managed Agents registration shape so that the planner can be posted to /v1/agents and invoked through the existing session pattern later.", - "acceptanceCriteria": [ - "Add or extend a test to validate agents/webster-planner.json against scripts/schemas/agent.schema.json with AJV 2020, matching scripts/__tests__/validate-agents.test.ts patterns.", - "Add a test that asserts the planner spec has registration-compatible top-level fields only and no research-preview callable_agents field.", - "Add a test or assertion that the planner spec includes tools with type agent_toolset_20260401, matching the Managed Agents beta pattern in existing agent specs.", - "Add a test assertion or technical note in the test name referencing the existing registration/session flow in scripts/critic-genealogy.ts: find/register agent, create session, send user.message, poll until idle.", - "Run bun run format:check, bun run type-check, bun run lint --max-warnings 0, bun run validate:agents, bun test, and bun run validate before declaring completion.", - "Do not implement scripts that call /v1/sessions or write history//plan.md; that remains feature #52." - ], - "technicalNotes": "Use the same AJV setup as scripts/__tests__/validate-agents.test.ts: Ajv2020 from ajv/dist/2020.js plus addFormats.default(ajv). 
Registration flow references should be grounded in scripts/critic-genealogy.ts:440-556, where registerAgent POSTs to /v1/agents, createSession POSTs to /v1/sessions, sendUserMessage POSTs to /sessions/{id}/events, and pollUntilIdle reads /sessions/{id}. This story should only test that the planner spec is compatible with that flow, not duplicate or export those helpers.", - "dependsOn": [ - "US-001", - "US-002" - ], - "priority": 3, - "passes": true, - "notes": "Implemented in iteration 3. Files: scripts/__tests__/planner-agent-contract.test.ts." - } - ] -} diff --git a/.forge/ralph/planner-agent-spec-v5/prd.md b/.forge/ralph/planner-agent-spec-v5/prd.md deleted file mode 100644 index 9ae0459..0000000 --- a/.forge/ralph/planner-agent-spec-v5/prd.md +++ /dev/null @@ -1,151 +0,0 @@ -# Planner Agent Spec — Product Requirements - -## Overview - -**Problem**: Webster's weekly council can critique and redesign, but Layer 11 needs an experiment-aware planning brain before the critics run. Without a schema-valid `webster-planner` Managed Agent spec, later orchestration work (#52) has no registered agent to invoke and no stable `plan.md` contract to hand to critics (#53). -**Solution**: Add `agents/webster-planner.json` as an Opus 4.7 Managed Agent spec that matches the existing Managed Agents beta schema, reads marshaled memory context supplied by the orchestrator, and emits a `plan.md` containing a strict JSON object with `classification`, `next_action`, `direction_hint`, optional `new_critic_request`, and `rationale`. -**Branch**: `ralph/planner-agent-spec-v5` - ---- - -## Goals & Success - -### Primary Goal - -Ship the planner agent spec and tests that prove it is schema-valid and aligned with Webster's registration/invocation pattern, without implementing the later orchestrator memory marshaling or council integration features. 
- -### Success Metrics - -| Metric | Target | How Measured | -|--------|--------|--------------| -| Managed Agent schema validity | `agents/webster-planner.json` passes the committed schema | `bun run validate:agents` and `bun test` | -| Planner output contract coverage | Tests verify required `plan.md` JSON fields and `next_action` enum values | New/updated Bun tests | -| Registration-flow alignment | Tests assert planner uses `POST /v1/agents`-compatible fields and no research-preview fields | New/updated Bun tests referencing existing schema and critic-genealogy flow | -| Scope containment | No orchestrator prompt, memory helper, or council fan-out implementation changes | Git diff review | - -### Non-Goals (Out of Scope) - -- Implementing orchestrator memory marshaling or planner invocation — explicitly owned by feature #52. -- Passing `plan.md` into critics/redesigner or spawning genealogy from planner output — explicitly owned by feature #53. -- Building cold-start orchestration behavior beyond planner spec instructions — feature #54 owns runtime cold-start plumbing. -- Changing the Managed Agent schema shape unless strictly required for the new `orchestrator` metadata role already allowed by `scripts/schemas/agent.schema.json`. - ---- - -## User & Context - -### Target User - -- **Who**: Webster implementation operators preparing the Layer 11 planner + experiment-aware council. -- **Role**: They maintain Managed Agent specs, validation gates, and orchestration scripts for the hackathon submission. -- **Current Pain**: Later features cannot safely invoke a planner because there is no registered-agent spec or tested `plan.md` output contract. - -### User Journey - -1. **Trigger**: Operator picks feature #50 from `context/FEATURES.md` and needs a schema-valid planner agent spec. -2. **Action**: Operator adds `agents/webster-planner.json`, runs validation/tests, and confirms it follows the beta Managed Agents registration shape. -3. 
**Outcome**: Feature #52 can register/invoke this planner via `/v1/agents`, `/v1/sessions`, events, and polling, then persist the returned `plan.md`. - ---- - -## UX Requirements - -### Interaction Model - -Backend/spec-only. Users do not interact with UI. The planner is registered through the same Managed Agents beta API shape used by existing specs and later invoked by orchestration code using the five-step pattern visible in `scripts/critic-genealogy.ts:440-556`: find/register agent, create session, send `user.message`, poll session status, inspect output. - -### States to Handle - -| State | Description | Behavior | -|-------|-------------|----------| -| Empty | Memory tail and prior verdicts are absent in week 1 | Planner instructions must choose `next_action: "explore_broadly"` and explain cold-start classification. | -| Loading | Runtime session is polling after a planner `user.message` | Out of scope for #50; covered by existing pattern in `scripts/critic-genealogy.ts:503-556` and future #52. | -| Error | Marshaled memory is contradictory, malformed, or missing key sections | Planner instructions must still emit valid `plan.md` JSON and state uncertainty in `rationale`. | -| Success | Planner has memory tail, verdicts, and monitor anomaly report | Planner emits a single `plan.md` JSON object with an allowed `next_action` and concrete `direction_hint`. | - ---- - -## Technical Context - -### Patterns to Follow - -- **Managed Agent spec pattern**: `agents/webster-redesigner.json` — Opus 4.7 agent with `name`, `description`, `model`, long `system`, `tools`, `mcp_servers`, and `metadata`. -- **Monitor context pattern**: `agents/webster-monitor.json` — reads analytics inputs, handles missing prior week, and writes structured output without proposing fixes. 
-- **Registration + session pattern**: `scripts/critic-genealogy.ts:440-556` — `findAgentByName`, `registerAgent`, `createSession`, `sendUserMessage`, and `pollUntilIdle` use `/v1/agents`, `/v1/sessions`, `/events`, and polling with `managed-agents-2026-04-01` beta headers. -- **Schema validation pattern**: `scripts/schemas/agent.schema.json` — requires `name`, `description`, `model`, `system`, and `tools`; rejects `system_prompt`, `callable_agents`, and unknown models. -- **Agent validation tests**: `scripts/__tests__/validate-agents.test.ts` — compiles the schema with AJV 2020 and validates every `agents/*.json` file. -- **Registration gotcha tests**: `scripts/__tests__/critic-genealogy.test.ts` — verifies generated specs preserve tools/MCP servers and remain valid against `agent.schema.json`. - -### Types & Interfaces - -```typescript -// Existing schema-level contract from scripts/schemas/agent.schema.json -type PlannerAgentSpec = { - name: string; - description: string; - model: "claude-opus-4-7" | "claude-opus-4-7-20260101"; - system: string; - tools: Array<{ type: "agent_toolset_20260401" } | { type: "mcp_toolset"; mcp_server_name: string }>; - mcp_servers?: Array<{ type: "url"; name: string; url: string }>; - metadata?: { role?: "orchestrator"; scope?: string }; -}; - -type PlannerPlan = { - classification: string; - next_action: "promote_and_experiment" | "hold_baseline" | "revert_and_retry" | "explore_broadly"; - direction_hint: string; - new_critic_request?: { - scope: string; - rationale: string; - evidence_refs: string[]; - }; - rationale: string; -}; -``` - -### Architecture Notes - -- The planner is an Opus 4.7 Managed Agent per Q1 ADR-0001 and `context/FEATURES.md` feature #50. -- The agent must not read repository files itself for memory; #50's spec should state that the orchestrator supplies marshaled `memory.jsonl` tail, last two weeks of verdicts, and monitor anomaly report in `user.message`. 
-- The planner output contract is `plan.md` whose body contains one JSON object; tests can assert the system prompt includes the required schema fields and enum values. -- The spec should likely reuse the GitHub MCP toolset pattern from `webster-redesigner`/`webster-monitor` only if the planner is instructed to commit `plan.md` itself. Feature #52 says the orchestrator extracts output and writes `history/<week>/plan.md`, so the planner spec can be agent-toolset-only unless existing Managed Agent registration expectations require MCP parity. -- Metadata should use `role: "orchestrator"` and `scope: "planning"` because `scripts/schemas/agent.schema.json` already allows `orchestrator` but not `planner`. - ---- - -## Implementation Summary - -### Story Overview - -| ID | Title | Priority | Dependencies | -|----|-------|----------|--------------| -| US-001 | Add schema-valid planner Managed Agent spec | 1 | — | -| US-002 | Add planner output contract tests | 2 | US-001 | -| US-003 | Add registration-flow guard tests and validate | 3 | US-001, US-002 | - -### Dependency Graph - -```text -US-001 (agent spec) - ↓ -US-002 (plan.md output contract tests) - ↓ -US-003 (registration-flow guard tests + validation) -``` - ---- - -## Validation Requirements - -Every story must pass: - -- [ ] Type-check: `bun run type-check` -- [ ] Lint: `bun run lint --max-warnings 0` -- [ ] Tests: `bun run test` -- [ ] Format: `bun run format:check` -- [ ] Agent schema validation: `bun run validate:agents` -- [ ] Full validation before completion: `bun run validate` - ---- - -Generated: 2026-04-24T00:00:00.000Z diff --git a/.forge/ralph/seed-demo-arc-w3w4-v5/prd.json b/.forge/ralph/seed-demo-arc-w3w4-v5/prd.json deleted file mode 100644 index 6499e5f..0000000 --- a/.forge/ralph/seed-demo-arc-w3w4-v5/prd.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "project": "Webster", - "branchName": "ralph/seed-demo-arc-w3w4-v5", - "prdFile": "prd.md", - "description": "Complete feature #57 by extending the existing demo
arc seeder from W1/W2 through W3/W4 with 6-of-7 lane coverage and one W4 genealogy spawn.", - "userStories": [ - { - "id": "US-003", - "title": "Add W3 gate-fail and auto-rollback seeding", - "description": "As a Webster demo operator, I want W3 demo-arc artifacts for archive-gate-fail, auto-rollback, and hold outcomes so that the demo can show failure learning without touching live history.", - "acceptanceCriteria": [ - "`bun scripts/seed-demo-arc.ts` writes `history/demo-arc/demo-W3/proposal.md`, `decision.json`, and `verdict.json`.", - "W3 verdict includes `exp-05-mid-section-image-swap` with outcome `archive-gate-fail` and a failing `bounce_rate` gate.", - "W3 verdict includes `exp-06-cta-color-shift` with outcome `auto-rollback`, classification `hurt`, and `reward_delta_pct` of `-11`.", - "W3 verdict includes `exp-07-subhead-rewrite` with outcome `hold` and classification `neutral`.", - "`history/demo-arc/baselines.jsonl` records W3 lane statuses as `archived-gate-fail`, `rolled-back`, and no promoted baseline for the held experiment.", - "`history/demo-arc/memory.jsonl` contains W3 rows whose final events reflect archive/rollback/hold behavior rather than labeling every W3 experiment as a promotion.", - "Running the seeder twice produces deterministic W3 output under `history/demo-arc/` and does not write outside that directory.", - "`bun run validate` passes." - ], - "technicalNotes": "Build on `scripts/seed-demo-arc.ts` only. Reuse existing W3 entries in `EXPERIMENT_SPECS` at `scripts/seed-demo-arc.ts:240-291`; mirror `writeW1`/`writeW2` at `scripts/seed-demo-arc.ts:459-476` with a `writeW3`. Update shared baseline/memory helpers at `scripts/seed-demo-arc.ts:437-457` if needed so lane-specific statuses/events are represented correctly. Preserve `initDemoArcDir()` isolation under `history/demo-arc/` from `scripts/seed-demo-arc.ts:340-351`. 
Follow the locked W3 narrative in `context/DOMAIN-MODEL.md:415-420` and feature #57 scope in `context/FEATURES.md:172`.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: scripts/seed-demo-arc.ts, history/demo-arc/demo-W3/proposal.md, history/demo-arc/demo-W3/decision.json, history/demo-arc/demo-W3/verdict.json, history/demo-arc/baselines.jsonl, history/demo-arc/memory.jsonl." - }, - { - "id": "US-004", - "title": "Add W4 conservative wins and genealogy spawn", - "description": "As a Webster demo operator, I want W4 demo-arc artifacts plus one spawned critic artifact set so that the demo closes the loop from W3 failure to critic genealogy and safe recovery wins.", - "acceptanceCriteria": [ - "`bun scripts/seed-demo-arc.ts` writes `history/demo-arc/demo-W4/proposal.md`, `decision.json`, and `verdict.json`.", - "W4 verdict includes `exp-08-hero-safety-copy` and `exp-09-cta-size-adjust` as passing conservative experiments.", - "Outcome coverage across W1-W4 includes exactly these six lanes at minimum: `promote-fast-track`, `promote-fallback`, `promote-gate-win`, `archive-gate-fail`, `auto-rollback`, and `hold`.", - "The seeder writes one W4 genealogy-spawned critic spec for a bounce-risk concern under `history/demo-arc/demo-W4/genealogy/`.", - "The genealogy artifacts include a `NewCriticSpec`-shaped JSON payload and an `AgentJSON`-shaped critic registration payload using the existing exported interfaces.", - "W4 memory rows include a `gap-detected` event or equivalent genealogy trigger referencing the W3 bounce/gate-fail pattern and the spawned critic.", - "The script completion message reflects seeding through demo-W4 instead of demo-W2.", - "Running the seeder twice produces deterministic W4/genealogy output under `history/demo-arc/` and does not write outside that directory.", - "`bun run validate` passes." - ], - "technicalNotes": "Depends on US-003's lane-correct baseline and memory helper behavior. 
Reuse existing W4 entries in `EXPERIMENT_SPECS` at `scripts/seed-demo-arc.ts:292-338`; mirror the existing week writer pattern at `scripts/seed-demo-arc.ts:459-476` with `writeW4`. Use the existing `AgentJSON` and `NewCriticSpec` interfaces from `scripts/seed-demo-arc.ts:68-87` for deterministic genealogy JSON files. `initDemoArcDir()` already creates `history/demo-arc/demo-W4/genealogy` at `scripts/seed-demo-arc.ts:348`; write artifacts there. Follow the W4 table row and Git-state expectations in `context/DOMAIN-MODEL.md:421-429`. Do not call real Managed Agent APIs or alter `agents/` live specs.", - "dependsOn": ["US-003"], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/seed-demo-arc.ts, history/demo-arc/demo-W4/proposal.md, history/demo-arc/demo-W4/decision.json, history/demo-arc/demo-W4/verdict.json, history/demo-arc/demo-W4/genealogy/new-critic-spec.json, history/demo-arc/demo-W4/genealogy/agent-registration.json, history/demo-arc/baselines.jsonl, history/demo-arc/memory.jsonl, context/FEATURES.md." - } - ] -} diff --git a/.forge/ralph/seed-demo-arc-w3w4-v5/prd.md b/.forge/ralph/seed-demo-arc-w3w4-v5/prd.md deleted file mode 100644 index 23f24cd..0000000 --- a/.forge/ralph/seed-demo-arc-w3w4-v5/prd.md +++ /dev/null @@ -1,153 +0,0 @@ -# Seed Demo Arc W3/W4 — Product Requirements - -## Overview - -**Problem**: Feature #57 is only half shipped. `scripts/seed-demo-arc.ts` already seeds W1/W2, but the demo arc still cannot show the dramatic W3 failure/rollback beat or the W4 critic-genealogy response promised in the Webster narrative. -**Solution**: Extend the existing seeder with the already-modeled W3 and W4 experiment specs, artifact writers, baseline/memory rows, and W4 genealogy artifacts. Do not rework US-001 or US-002. 
-**Branch**: `ralph/seed-demo-arc-w3w4-v5` - ---- - -## Goals & Success - -### Primary Goal - -Complete feature #57 by adding only US-003 and US-004 so `bun scripts/seed-demo-arc.ts` creates a complete, idempotent four-week demo arc under `history/demo-arc/`. - -### Success Metrics - -| Metric | Target | How Measured | -| ------ | ------ | ------------ | -| Week coverage | W1, W2, W3, and W4 artifacts exist | Run seeder and inspect `history/demo-arc/demo-W*/` | -| Outcome coverage | 6 of 7 Q4 lanes represented | Inspect `verdict.json` outcomes across all weeks | -| Genealogy proof | One W4 spawned critic artifact set exists | Inspect `history/demo-arc/demo-W4/genealogy/` | -| Runtime safety | No live history mutation | Seeder writes only beneath `history/demo-arc/` | -| Quality gate | Validation green | `bun run validate` | - -### Non-Goals (Out of Scope) - -- Re-implementing W1/W2 scaffold or artifact writers — already landed in `fb3256e`. -- Creating real Managed Agents through the Anthropic API — this is a deterministic mock seeder. -- Touching live weekly history outside `history/demo-arc/` — demo data must remain isolated. -- Covering the 7th outcome lane — the locked hero claim is deliberately 6/7. - ---- - -## User & Context - -### Target User - -- **Who**: Webster implementation operator preparing the hackathon demo. -- **Role**: Maintains deterministic run artifacts that let the council/planner story be replayed. -- **Current Pain**: The seeded output stops at W2, so the best narrative beats are absent. - -### User Journey - -1. **Trigger**: Operator needs a four-week mock arc for the submission demo. -2. **Action**: Operator runs `bun scripts/seed-demo-arc.ts`. -3. **Outcome**: `history/demo-arc/` contains W1-W4 proposals, decisions, verdicts, memory, baselines, and W4 genealogy artifacts. - ---- - -## UX Requirements - -### Interaction Model - -CLI-only deterministic seed script. 
The user runs `bun scripts/seed-demo-arc.ts`; the script recreates `history/demo-arc/` from scratch and prints a completion message. - -### States to Handle - -| State | Description | Behavior | -| ----- | ----------- | -------- | -| Empty | `history/demo-arc/` does not exist | Create directory tree and all artifacts | -| Loading | Script is running | Synchronous file writes; no progress UI required | -| Error | Filesystem or type errors occur | Let Bun/Node error surface; no silent fallback | -| Success | Seeder completes | W1-W4 artifacts are present and deterministic | - ---- - -## Technical Context - -### Patterns to Follow - -- **Existing seeder scaffold**: `scripts/seed-demo-arc.ts:12-129` — constants, demo week identifiers, and TypeScript interfaces already define the artifact model. -- **Existing W3/W4 data**: `scripts/seed-demo-arc.ts:240-338` — W3 and W4 `EXPERIMENT_SPECS` already encode experiment IDs, outcomes, gates, and insights. -- **Artifact writer pattern**: `scripts/seed-demo-arc.ts:365-435` — proposal, decision, verdict, baseline, and memory writes are pure helper functions. -- **Existing W1/W2 orchestration**: `scripts/seed-demo-arc.ts:459-479` — `writeW1`, `writeW2`, and `main` show the intended week writer shape. -- **Locked domain narrative**: `context/DOMAIN-MODEL.md:411-431` — Q9 table defines W3/W4 experiments, outcomes, and genealogy demo beat. -- **Feature tracking**: `context/FEATURES.md:172` — #57 status and remaining scope are canonical. -- **Validation rules**: `CLAUDE.md:18-31` and `package.json:scripts.validate` — type-check, lint, format, agent/findings validation, markdownlint, and tests are mandatory. 
- -### Types & Interfaces - -```typescript -type OutcomeLane = - | "promote-fast-track" - | "promote-fallback" - | "promote-gate-win" - | "archive-gate-fail" - | "auto-rollback" - | "hold"; - -interface ExperimentSpec extends ExperimentVerdict { - week: DemoWeek; - target_files: string[]; - proposed_change: string; - rationale: string; - baseline_sha: string; - verdict_ready_insight: string; - promote_insight: string; -} - -interface NewCriticSpec { - name: string; - scope: string; - description: string; - rationale: string; - focus_owned: string[]; - focus_not_owned: string[]; - severity_rubric: string; -} -``` - -### Architecture Notes - -- `initDemoArcDir()` currently creates all week directories and `demo-W4/genealogy`, so US-003/US-004 should add writers rather than new directory bootstrapping. -- `buildBaselineRows()` currently marks every row as `promoted`; US-003 must preserve `archived-gate-fail` and `rolled-back` statuses for W3 lanes. -- `buildWeekMemoryRows()` currently emits `promote` for every final event; US-003 must emit event names matching each outcome where relevant, especially rollback and skip/hold semantics. -- W4 genealogy should use the existing `AgentJSON` and `NewCriticSpec` shapes and write deterministic local JSON/Markdown artifacts under `history/demo-arc/demo-W4/genealogy/`. 
- ---- - -## Implementation Summary - -### Story Overview - -| ID | Title | Priority | Dependencies | -| -- | ----- | -------- | ------------ | -| US-003 | Add W3 gate-fail and auto-rollback seeding | 1 | -- | -| US-004 | Add W4 conservative wins and genealogy spawn | 2 | US-003 | - -### Dependency Graph - -```text -US-003 (W3 artifact writers + lane-correct baseline/memory rows) - ↓ -US-004 (W4 artifact writers + genealogy spawn artifacts) -``` - ---- - -## Validation Requirements - -Every story must pass: - -- [ ] Type-check: `bun run type-check` -- [ ] Lint: `bun run lint --max-warnings 0` -- [ ] Tests: `bun run test` -- [ ] Format: `bun run format:check` -- [ ] Full validation: `bun run validate` - ---- - -Generated: 2026-04-24T07:47:55Z diff --git a/.forge/ralph/seed-demo-arc-w3w4-v5/progress.txt b/.forge/ralph/seed-demo-arc-w3w4-v5/progress.txt deleted file mode 100644 index 2318beb..0000000 --- a/.forge/ralph/seed-demo-arc-w3w4-v5/progress.txt +++ /dev/null @@ -1,88 +0,0 @@ -## Codebase Patterns - -### Deterministic local genealogy artifacts -- **Where**: `scripts/seed-demo-arc.ts` -- **Pattern**: Model demo-only critic genealogy with typed constants satisfying `NewCriticSpec` and `AgentJSON`, then write those JSON payloads under the week-local `history/demo-arc/demo-W4/genealogy/` directory. Do not mutate live `agents/` specs or call Managed Agent APIs from the seeder. -- **Example**: `BOUNCE_GUARD_CRITIC_SPEC satisfies NewCriticSpec` and `BOUNCE_GUARD_AGENT_JSON satisfies AgentJSON`. - -### Outcome lane mapping for demo baselines and memory -- **Where**: `scripts/seed-demo-arc.ts` -- **Pattern**: Keep experiment specs as the single source of truth, then derive baseline status and final memory event from `experiment.outcome`. -- **Example**: `archive-gate-fail -> archived-gate-fail + regression`, `auto-rollback -> rolled-back + rollback`, `hold -> no baseline row + skip`. 
- ---- - -## 2026-04-24T07:55:39Z — US-003: Add W3 gate-fail and auto-rollback seeding - -**Status**: PASSED -**Files changed**: -- `scripts/seed-demo-arc.ts` — added W3 writer and lane-specific baseline/memory helpers. -- `history/demo-arc/demo-W3/proposal.md` — seeded W3 proposal artifacts. -- `history/demo-arc/demo-W3/decision.json` — seeded W3 planner decision artifacts. -- `history/demo-arc/demo-W3/verdict.json` — seeded W3 verdict artifacts. -- `history/demo-arc/baselines.jsonl` — added W3 archive/rollback rows with no held-experiment promotion. -- `history/demo-arc/memory.jsonl` — added W3 verdict-ready plus regression/rollback/skip final events. -- `.forge/ralph/seed-demo-arc-w3w4-v5/prd.md` — removed emphasis from generated footer so repository markdown validation passes. - -**Acceptance criteria verified**: -- [x] `bun scripts/seed-demo-arc.ts` writes `history/demo-arc/demo-W3/proposal.md`, `decision.json`, and `verdict.json`. -- [x] W3 verdict includes `exp-05-mid-section-image-swap` with outcome `archive-gate-fail` and a failing `bounce_rate` gate. -- [x] W3 verdict includes `exp-06-cta-color-shift` with outcome `auto-rollback`, classification `hurt`, and `reward_delta_pct` of `-11`. -- [x] W3 verdict includes `exp-07-subhead-rewrite` with outcome `hold` and classification `neutral`. -- [x] `history/demo-arc/baselines.jsonl` records W3 lane statuses as `archived-gate-fail`, `rolled-back`, and no promoted baseline for the held experiment. -- [x] `history/demo-arc/memory.jsonl` contains W3 rows whose final events reflect archive/rollback/hold behavior rather than labeling every W3 experiment as a promotion. -- [x] Running the seeder twice produces deterministic W3 output under `history/demo-arc/` and does not write outside that directory. -- [x] `bun run validate` passes. - -**Learnings**: -- `progress.txt` was absent at iteration start, so this iteration created it with a codebase pattern section. 
-- The existing W3 specs already contained the required verdict details; implementation only needed orchestration plus derived baseline/memory semantics. -- `bun run validate` initially failed on the generated PRD footer being emphasis-only markdown; removing the emphasis made markdownlint pass. - -**Verification**: -- `bun scripts/seed-demo-arc.ts && cp -R history/demo-arc /tmp/demo-arc-first && bun scripts/seed-demo-arc.ts && diff -qr /tmp/demo-arc-first history/demo-arc` -- `jq '.experiments[] | {exp_id,outcome,classification,reward_delta_pct,gates}' history/demo-arc/demo-W3/verdict.json` -- `grep 'exp-0[567]' history/demo-arc/baselines.jsonl` -- `grep 'demo-W3' history/demo-arc/memory.jsonl` -- `bun run type-check && bun run lint --max-warnings 0 && bun run test && bun run format:check && bun run validate` - ---- - -## 2026-04-24T07:59:41Z — US-004: Add W4 conservative wins and genealogy spawn - -**Status**: PASSED -**Files changed**: -- `scripts/seed-demo-arc.ts` — added W4 writer, deterministic bounce-guard genealogy payloads, W4 gap-detected memory row, and demo-W4 completion output. -- `history/demo-arc/demo-W4/proposal.md` — seeded W4 proposal artifacts. -- `history/demo-arc/demo-W4/decision.json` — seeded W4 planner decision artifacts. -- `history/demo-arc/demo-W4/verdict.json` — seeded W4 verdict artifacts for exp-08 and exp-09. -- `history/demo-arc/demo-W4/genealogy/new-critic-spec.json` — seeded `NewCriticSpec`-shaped bounce-guard critic payload. -- `history/demo-arc/demo-W4/genealogy/agent-registration.json` — seeded `AgentJSON`-shaped critic registration payload. -- `history/demo-arc/baselines.jsonl` — added W4 promoted baseline rows. -- `history/demo-arc/memory.jsonl` — added W4 gap-detected genealogy trigger and W4 verdict/promote rows. -- `context/FEATURES.md` — marked feature #57 done. - -**Acceptance criteria verified**: -- [x] `bun scripts/seed-demo-arc.ts` writes `history/demo-arc/demo-W4/proposal.md`, `decision.json`, and `verdict.json`. 
-- [x] W4 verdict includes `exp-08-hero-safety-copy` and `exp-09-cta-size-adjust` as passing conservative experiments. -- [x] Outcome coverage across W1-W4 includes exactly these six lanes at minimum: `promote-fast-track`, `promote-fallback`, `promote-gate-win`, `archive-gate-fail`, `auto-rollback`, and `hold`. -- [x] The seeder writes one W4 genealogy-spawned critic spec for a bounce-risk concern under `history/demo-arc/demo-W4/genealogy/`. -- [x] The genealogy artifacts include a `NewCriticSpec`-shaped JSON payload and an `AgentJSON`-shaped critic registration payload using the existing exported interfaces. -- [x] W4 memory rows include a `gap-detected` event or equivalent genealogy trigger referencing the W3 bounce/gate-fail pattern and the spawned critic. -- [x] The script completion message reflects seeding through demo-W4 instead of demo-W2. -- [x] Running the seeder twice produces deterministic W4/genealogy output under `history/demo-arc/` and does not write outside that directory. -- [x] `bun run validate` passes. - -**Learnings**: -- W4 experiment specs already contained the conservative passing outcomes, so implementation needed orchestration and genealogy artifact emission rather than new experiment modeling. -- The seeder's exported interfaces can enforce local demo payload shape with `satisfies` while still avoiding live Managed Agent registration. -- Prettier reformats the long `context/FEATURES.md` table row when feature #57 status changes. 
- -**Verification**: -- `bun scripts/seed-demo-arc.ts && cp -R history/demo-arc /tmp/demo-arc-first && bun scripts/seed-demo-arc.ts && diff -qr /tmp/demo-arc-first history/demo-arc` -- `jq -r '.experiments[].outcome' history/demo-arc/demo-W*/verdict.json | sort -u` -- `jq '.experiments[] | {exp_id,outcome,classification,reward_delta_pct,gates}' history/demo-arc/demo-W4/verdict.json` -- `grep 'gap-detected' history/demo-arc/memory.jsonl` -- `bun run type-check && bun run lint --max-warnings 0 && bun run test && bun run format:check && bun run validate` - ---- diff --git a/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.json b/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.json deleted file mode 100644 index 3bdf936..0000000 --- a/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "project": "webster", - "branchName": "ralph/webster-feature-number-58-pair-alpha-secondary-sub", - "prdFile": "prd.md", - "description": "Seed Pair Alpha secondary SaaS and local-service substrates with deterministic mock run artifacts.", - "userStories": [ - { - "id": "US-001", - "title": "Add deterministic secondary substrate model and HTML writers", - "description": "As a Webster implementation operator, I want deterministic SaaS and local-service HTML fixtures so that the submission can demonstrate council generalization beyond the primary substrate.", - "acceptanceCriteria": [ - "Create `scripts/seed-secondary-substrates.ts` with `#!/usr/bin/env bun` and pure TypeScript imports from Node/Bun standard libraries only.", - "Define typed constants for exactly two substrates: `saas-alpha` and `local-service-alpha`.", - "Write `site/secondary/saas-alpha/index.html` and `site/secondary/local-service-alpha/index.html` as complete single-file HTML landing pages.", - "HTML output is deterministic and contains no remote scripts, remote stylesheets, or network-fetching code.", - "Export constants or 
helper functions needed by tests without executing `main()` on import.", - "Type-check passes", - "Tests pass" - ], - "technicalNotes": "Mirror `scripts/seed-demo-arc.ts:8-17` for ROOT/path constants and fs/path imports. Follow `scripts/seed-demo-arc.ts:21-64` for literal-union types/interfaces. The new script owns `site/secondary/` only for site output and must not touch `site/before/` or `site/after/`.", - "dependsOn": [], - "priority": 1, - "passes": true, - "notes": "Implemented in iteration 1. Files: scripts/seed-secondary-substrates.ts, site/secondary/saas-alpha/index.html, site/secondary/local-service-alpha/index.html." - }, - { - "id": "US-002", - "title": "Write secondary mock run artifacts", - "description": "As a Webster implementation operator, I want onboard and two weekly mock runs per secondary substrate so that the demo can show a complete two-cycle council arc for each new vertical.", - "acceptanceCriteria": [ - "Create `history/secondary-arc/saas-alpha/{onboard,week-1,week-2}/` and `history/secondary-arc/local-service-alpha/{onboard,week-1,week-2}/`.", - "Every run folder contains exactly the required artifact names: `proposal.md`, `decision.json`, `verdict.json`, and `apply-log.json`.", - "Each `proposal.md` includes experiment blocks with exp IDs, kind, target files, proposed change, and rationale.", - "Each `decision.json` includes substrate, run, selected issues, reasoning, and monitor signal fields that mirror the `history/demo-arc` decision convention.", - "Each `verdict.json` includes substrate, run, experiments, reward delta, p-value, classification, and outcome fields that mirror the `history/demo-arc` verdict convention.", - "Each `apply-log.json` records applied status, touched files, skipped rows, and notes for that run.", - "Type-check passes", - "Tests pass" - ], - "technicalNotes": "Follow writer shape in `scripts/seed-demo-arc.ts:393-419`: build typed objects, write `JSON.stringify(value, null, 2)` plus trailing newline, and 
generate Markdown proposal bodies from typed experiment specs. Use the outcome lane names from `scripts/seed-demo-arc.ts:25-31`.", - "dependsOn": [ - "US-001" - ], - "priority": 2, - "passes": true, - "notes": "Implemented in iteration 2. Files: scripts/seed-secondary-substrates.ts, history/secondary-arc/*/{onboard,week-1,week-2}/{proposal.md,decision.json,verdict.json,apply-log.json}." - }, - { - "id": "US-003", - "title": "Wire CLI package script and scope guards", - "description": "As a Webster implementation operator, I want a single package command with strict output boundaries so that seeding is repeatable and cannot corrupt primary demo artifacts.", - "acceptanceCriteria": [ - "Add `seed:secondary` to `package.json` scripts with value `bun scripts/seed-secondary-substrates.ts`.", - "The script removes/recreates or overwrites only `site/secondary/` and `history/secondary-arc/`.", - "The script never reads from or writes to `history/demo-arc/`, `site/before/`, or `site/after/`.", - "Running `bun run seed:secondary` exits 0 and prints a concise deterministic success message.", - "`main()` is guarded with `if (import.meta.main)` so tests can import the module safely.", - "Type-check passes", - "Tests pass" - ], - "technicalNotes": "Mirror CLI/export pattern in `scripts/seed-demo-arc.ts:485-510`. Add the package script near existing scripts in `package.json:12-23`. Protected paths are explicit feature requirements from `context/FEATURES.md:173` and the PRD input.", - "dependsOn": [ - "US-002" - ], - "priority": 3, - "passes": true, - "notes": "Implemented in iteration 3. Files: package.json; verified scripts/seed-secondary-substrates.ts guarded main and owned output boundaries." 
- }, - { - "id": "US-004", - "title": "Add Bun tests for layout, idempotency, and protected paths", - "description": "As a Webster maintainer, I want automated tests around the seeder so that future changes cannot break file layout, determinism, or safety constraints.", - "acceptanceCriteria": [ - "Create `scripts/__tests__/seed-secondary-substrates.test.ts` using Bun's `describe`, `test`, and `expect` APIs.", - "Test verifies both secondary HTML files and all six run folders exist after seeding.", - "Test verifies every run folder contains `proposal.md`, `decision.json`, `verdict.json`, and `apply-log.json`.", - "Test captures contents of all seeded files, runs the seeder a second time, and asserts byte-identical contents for idempotency.", - "Test fingerprints `history/demo-arc/`, `site/before/`, and `site/after/` before and after seeding and asserts they are unchanged.", - "`bun test` passes.", - "`bun run validate` passes." - ], - "technicalNotes": "Follow filesystem testing style in `scripts/__tests__/memory.test.ts:1-85`: import Bun test helpers, use fs/path utilities, and cleanly assert deterministic data. Existing tests import source modules directly, as shown by `scripts/__tests__/critic-genealogy.test.ts:1-18`.", - "dependsOn": [ - "US-003" - ], - "priority": 4, - "passes": true, - "notes": "Implemented in iteration 4. Files: scripts/__tests__/seed-secondary-substrates.test.ts." - } - ] -} diff --git a/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.md b/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.md deleted file mode 100644 index 3af1442..0000000 --- a/.forge/ralph/webster-feature-number-58-pair-alpha-secondary-sub/prd.md +++ /dev/null @@ -1,186 +0,0 @@ -# Pair Alpha Secondary Substrates — Product Requirements - -## Overview - -**Problem**: Webster's current demo arc proves the council loop on one primary landing page only. 
Without secondary substrates, judges and operators cannot see whether the planner, critic council, verdict model, and mock history conventions generalize beyond the healthcare landing page. -**Solution**: Build `scripts/seed-secondary-substrates.ts`, a deterministic Bun/TypeScript seeder that creates two synthetic single-file secondary landing pages plus mock onboard/week-1/week-2 run artifacts for each substrate. -**Branch**: `ralph/webster-feature-number-58-pair-alpha-secondary-sub` - ---- - -## Goals & Success - -### Primary Goal - -Create a demo-safe Pair Alpha substrate package that proves Webster can operate on a B2B SaaS landing page and a B2C local-service landing page without touching the primary demo arc or before/after site fork. - -### Success Metrics - -| Metric | Target | How Measured | -|--------|--------|--------------| -| Secondary site files created | `site/secondary/saas-alpha/index.html` and `site/secondary/local-service-alpha/index.html` exist | `bun run seed:secondary` then file existence assertions | -| Mock run layout complete | Each substrate has `onboard`, `week-1`, and `week-2` folders with `proposal.md`, `decision.json`, `verdict.json`, `apply-log.json` | Unit test enumerates expected paths under `history/secondary-arc/<substrate>/` | -| Idempotent deterministic output | Re-running the seeder produces byte-identical files | Test snapshots file contents before and after a second run | -| Scope safety | Seeder never mutates `history/demo-arc/`, `site/before/`, or `site/after/` | Test fingerprints protected directories before/after seeding | -| Validation green | `bun run validate` and `bun test` pass | Local command output | - -### Non-Goals (Out of Scope) - -- Live analytics ingestion — this is a synthetic seed artifact, not runtime telemetry. -- E-commerce substrate — explicitly held out by operator decision; Pair Alpha is SaaS + local service only. -- Modifying `history/demo-arc/` — the primary demo arc is canonical and must remain untouched.
-- Modifying `site/before/` or `site/after/` — those directories are the primary before/after fork and are not part of the secondary-substrate proof. -- Network calls or external API integration — deterministic mock data only. - ---- - -## User & Context - -### Target User - -- **Who**: Webster implementation operator preparing the hackathon submission. -- **Role**: Needs a fast, repeatable local command that seeds extra demo evidence. -- **Current Pain**: Current mock history is convincing for one primary substrate, but does not demonstrate cross-vertical generalization. - -### User Journey - -1. **Trigger**: Operator needs to show that Webster can run its council loop beyond the primary healthcare landing page. -2. **Action**: Operator runs `bun run seed:secondary`. -3. **Outcome**: Two synthetic landing pages and six mock run folders appear in stable locations, ready for demo narration and automated checks. - ---- - -## UX Requirements - -### Interaction Model - -CLI-only seed workflow: - -```bash -bun run seed:secondary -``` - -The command should be silent except for a short success message. It should be safe to run repeatedly in local development and CI. The script must use pure TypeScript/Bun stdlib file operations and no network calls. 
- -### States to Handle - -| State | Description | Behavior | -|-------|-------------|----------| -| Empty | `site/secondary/` or `history/secondary-arc/` does not exist | Create directories and all expected files | -| Loading | Seeder is writing deterministic files | Synchronous file writes are acceptable; no progress UI required | -| Error | Filesystem write fails | Let the thrown error fail the command; do not silently swallow | -| Success | All secondary files are written | Print deterministic success line and exit 0 | - ---- - -## Technical Context - -### Patterns to Follow - -- **Similar implementation**: `scripts/seed-demo-arc.ts:8-17` — use Bun TypeScript, `node:fs`, `node:path`, `ROOT`, and constants for output directories. -- **Type pattern**: `scripts/seed-demo-arc.ts:21-64` — define string-literal unions and interfaces for experiment kinds, verdict outcomes, decisions, verdicts, and run rows. -- **Seed lifecycle pattern**: `scripts/seed-demo-arc.ts:345-354` — initialize owned output directories deterministically. For this feature, remove/recreate only `history/secondary-arc/` and `site/secondary/`, never protected primary paths. -- **Artifact writer pattern**: `scripts/seed-demo-arc.ts:393-419` — emit pretty-printed JSON files with trailing newline and Markdown proposal files. -- **CLI entry/export pattern**: `scripts/seed-demo-arc.ts:485-510` — `main()` gated by `if (import.meta.main)` and export constants/helpers for tests. -- **Package script pattern**: `package.json:12-23` — add a new script beside existing validation/test scripts. -- **Test pattern**: `scripts/__tests__/memory.test.ts:1-85` — Bun test with `describe`, `test`, `expect`, filesystem setup/cleanup, and deterministic assertions. 
- -### Types & Interfaces - -```typescript -type SecondarySubstrate = "saas-alpha" | "local-service-alpha"; -type SecondaryRun = "onboard" | "week-1" | "week-2"; -type ExperimentKind = "text" | "component" | "asset" | "css"; -type OutcomeLane = - | "promote-fast-track" - | "promote-fallback" - | "promote-gate-win" - | "archive-gate-fail" - | "auto-rollback" - | "hold"; - -interface SecondaryDecisionJSON { - substrate: SecondarySubstrate; - run: SecondaryRun; - selected_issues: Array<{ - exp_id: string; - kind: ExperimentKind; - target_files: string[]; - proposed_change: string; - expected_outcome_lane: OutcomeLane; - }>; - reasoning: string; - monitor_signal: string; -} - -interface SecondaryVerdictJSON { - substrate: SecondarySubstrate; - run: SecondaryRun; - experiments: Array<{ - exp_id: string; - kind: ExperimentKind; - reward_delta_pct: number; - p_value: number; - classification: "improved" | "hurt" | "neutral"; - outcome: OutcomeLane; - }>; -} - -interface SecondaryApplyLogJSON { - substrate: SecondarySubstrate; - run: SecondaryRun; - applied: boolean; - touched_files: string[]; - skipped: Array<{ exp_id: string; reason: string }>; - notes: string; -} -``` - -### Architecture Notes - -- `context/FEATURES.md:173` defines feature #58 as Layer 11 Pair Alpha: SaaS B2B + local service B2C synthetic HTMLs plus onboard/week-1/week-2 mock runs. -- The script owns only `site/secondary/` and `history/secondary-arc/`. -- Mock run artifact filenames must match the existing demo-run convention plus the new apply log: `proposal.md`, `decision.json`, `verdict.json`, `apply-log.json`. -- Artifact JSON shape should mirror `history/demo-arc` conventions: selected issues in `decision.json`, experiment verdict rows in `verdict.json`, Markdown experiment blocks in `proposal.md`. -- Tests should import exported constants/helpers from `scripts/seed-secondary-substrates.ts` rather than shelling out where possible, then separately verify package script presence if useful. 
- ---- - -## Implementation Summary - -### Story Overview - -| ID | Title | Priority | Dependencies | -|----|-------|----------|--------------| -| US-001 | Add deterministic secondary substrate model and HTML writers | 1 | — | -| US-002 | Write secondary mock run artifacts | 2 | US-001 | -| US-003 | Wire CLI/package script and scope guards | 3 | US-002 | -| US-004 | Add Bun tests for layout, idempotency, and protected paths | 4 | US-003 | - -### Dependency Graph - -```text -US-001 (substrate data + HTML writers) - ↓ -US-002 (history/secondary-arc artifact writers) - ↓ -US-003 (main + package script + protected path discipline) - ↓ -US-004 (tests) -``` - ---- - -## Validation Requirements - -Every story must pass: - -- [ ] Type-check: `bun run type-check` -- [ ] Lint: `bun run lint --max-warnings 0` -- [ ] Tests: `bun run test` -- [ ] Format: `bun run format:check` -- [ ] Full validation: `bun run validate` - ---- - -Generated: 2026-04-24T00:00:00.000Z diff --git a/.gitignore b/.gitignore index e47423f..40b2195 100644 --- a/.gitignore +++ b/.gitignore @@ -83,8 +83,34 @@ demo-output/videos/ /plan.md /research.md - # Claude Design polish handoff bundles — committed per-slot only after review skills/webster-video/polish-slots/**/handoff/ skills/webster-video/polish-slots/handoff-shared/ skills/webster-video/polish-slots.zip + +# Internal tracking docs — preserved in ~/Vault/Projects/webster/internal-tracking/ +context/EXPANSION-TASKS.md +context/E2E-IMPLEMENTATION-TRACKER.md +context/SITE-FORK-CHECKLIST.md +context/ROADMAP.md +context/VIDEO-PLAN.md +context/VIDEO-PLAN-90s.md +context/v2-design.md + +# Intermediate session prompts — only first/second-wbs and sim-council are public-facing +prompts/third-wbs-session.md +prompts/fourth-wbs-session.md +prompts/sim-audit-fix-session.md +prompts/composition-session.md +prompts/e2e-demo-run-session.md +prompts/sim-runner.md + +# History operator notes (story belongs in README/AGENTS.md, not duplicated) +history/AGENTS.md 
+history/CLAUDE.md + +# Polish-session worktree prompts (local hand-off only) +ONBOARDING-V2-PROMPT.md + +# Personal launchd plist (hardcoded user paths) — preserved in vault +deploy/webster-dispatcher.plist diff --git a/AGENTS.md b/AGENTS.md index 2f98ae8..797fc51 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -22,9 +22,9 @@ Two active workstreams: 2. `context/ARCHITECTURE.md` — current system design 3. `context/FEATURES.md` — shipped state + stream allocation 4. `context/VISION.md` — canonical north-star for the active hackathon expansion. If about to code or make an architectural call, this doc tells you whether you're drifting. -5. `context/EXPANSION-TASKS.md` — topologically ordered tasks with acceptance criteria -6. `context/QUALITY-GATES.md` — validation rules (mirror Forge pattern) -7. `~/Vault/Projects/webster/webster-decision-log.md` — architectural decisions with rationale +5. `context/QUALITY-GATES.md` — validation rules (mirror Forge pattern) +6. `~/Vault/Projects/webster/webster-decision-log.md` — architectural decisions with rationale +7. `~/Vault/Projects/webster/internal-tracking/context/EXPANSION-TASKS.md` — local-only task tracker for the hackathon expansion (vault, not in repo) ## Communication with Richie @@ -103,7 +103,7 @@ bun run validate ## Task pickup protocol (hackathon expansion) -1. Check `context/EXPANSION-TASKS.md` — pick next unblocked task in topological order. Do NOT skip T0. +1. Check `~/Vault/Projects/webster/internal-tracking/context/EXPANSION-TASKS.md` (vault, local-only) — pick next unblocked task in topological order. Do NOT skip T0. 2. Re-read the task's acceptance criteria 3. Read every file the task touches before editing 4. Implement minimally — no scope expansion, no drive-by refactors @@ -147,4 +147,4 @@ State the conflict. Don't paper over it. Consult `~/Vault/Projects/webster/webster-decision-log.md` — every locked decision with rationale. 
-If a path isn't clear and VISION.md / EXPANSION-TASKS.md don't answer, leave a `[STUCK]` or `[QUESTION]` prefix in your session output. Don't compose around it. +If a path isn't clear and VISION.md doesn't answer, leave a `[STUCK]` or `[QUESTION]` prefix in your session output. Don't compose around it. diff --git a/README.md b/README.md index af1c102..d38dced 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,18 @@ Current state: 175 tests passing, 0 lint warnings, 0 type errors, 18 JSON specs - `git` with commit-signing configured - An Anthropic API key stored in macOS keychain under service `anthropic-webster`. First-session will show the exact `security add-generic-password` command if missing. +### The `wbs` alias (project convention) + +The `wbs @prompts/...` commands below assume a shell alias that launches Claude Code into Webster's dispatcher mode (Opus 4.7, 1M context, custom system prompt at `.claude/dispatcher.md`, custom settings at `.claude/dispatcher-settings.json`). Add to your shell rc: + +```bash +alias wbs='cd ~/Projects/webster && claude --dangerously-skip-permissions --model claude-opus-4-7 \ + --settings .claude/dispatcher-settings.json \ + --system-prompt "$(cat .claude/dispatcher.md)"' +``` + +Or run the equivalent `claude --settings ... --system-prompt ...` directly without aliasing. Either works. + ### Bootstrap (one-time) ```bash diff --git a/context/E2E-IMPLEMENTATION-TRACKER.md b/context/E2E-IMPLEMENTATION-TRACKER.md deleted file mode 100644 index 08003f4..0000000 --- a/context/E2E-IMPLEMENTATION-TRACKER.md +++ /dev/null @@ -1,206 +0,0 @@ -# Webster E2E Implementation Tracker - -> Handoff file for compaction recovery. Read this first if the session is resumed. Last updated: 2026-04-25. - -## Operating mode - -- Execute directly; do not wait for approval unless a task has unresolved ambiguity. -- Preserve production Webster: - - Do not modify the existing production `agents/webster-*` specs unless the task explicitly says so. 
- - Do not modify `prompts/second-wbs-session.md`. -- Validate before claiming completion. -- Prefer narrow reads and targeted edits. -- Use subagents for repo-wide audits or isolated review so the main context stays lean. - -## Current repo state summary - -Verified by main session + subagent scout: - -| Task | Current status | Evidence / notes | -| ------------------------ | -------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| T0 Pass-7 fixes | Done in tree | `scripts/apply-worker-cli.ts`, `scripts/apply-worker.ts`, `.husky/pre-commit`, `scripts/anthropic-agents.ts`, `scripts/planner-invoke.ts`, `scripts/critic-genealogy.ts`; targeted tests passed. Perl byte-count acceptance says 13 but actual correct count is 14. | -| T1 Memory stores | Appears implemented | `scripts/provision-memory-stores.ts`, tests exist. Needs validation/live run if not already done. | -| T2 Sim agents | Appears implemented | 18 `agents/webster-{lp,site}-sim-*.json` specs and `scripts/register-sim-agents.ts` exist. | -| T3 Contexts | Appears implemented | `demo-landing-page/context/*`, `demo-sites/northwest-reno/context/*`, `scripts/context-schema.ts` exist. | -| T4 Ugly sites | Partial / uncommitted | Ugly files exist but `git status` shows modified/untracked assets. Needs browser/render check and commit cleanup. | -| T5 Synthetic analytics | Appears implemented | `scripts/synthetic-analytics.ts` and tests exist. | -| T6 Sim council fork | Appears implemented | `prompts/sim-council.md` and tests exist. | -| T7 Simulation wrapper | Appears implemented | `scripts/run-simulation.ts` and tests exist. | -| T8 Entrypoints | Appears implemented | `scripts/run-simulation-lp.ts`, `scripts/run-simulation-site.ts` exist. 
| -| T9 Manifest/final sheets | Appears implemented | `scripts/build-demo-manifest.ts` and tests exist. | -| T10 Full dry run | Not done | No evidence of full dual 10-week run and handoff. | -| T11 Auto-capture | Implemented / needs authenticated screenshot | Added capture script, bridge, preflight, screenshot manifest, package scripts, and CAPTURE_TRIGGER emission. `browser-use` requires `-b real --profile Default`; current local Console session is logged out, so authenticated PNG acceptance still needs Richie login. | -| T12 Onboarding v2 | Not started | Missing `scripts/onboarding/*` and status flow. Secondary/case-study path. | -| T13 Empire Asphalt | Not started / blocked | Blocked on consent artifact. Missing brand corpus and external demo repo. | - -## Current validation state - -Latest full validation is green: - -```bash -bun run validate -# 181 pass, 0 fail -``` - -T11 preflight correctly fails until Richie logs into Anthropic Console in local Chrome `Default` profile: - -```bash -bun run sim:preflight -# AUTH_EXPIRED: Anthropic Console Memory Stores page is not authenticated/reachable -``` - -Auth-expired capture path was verified against the current local Chrome state: - -```bash -bun scripts/capture-mem-stores.ts '' -# exits 1 with AUTH_EXPIRED because local Anthropic Console is not logged in -``` - -Targeted T0 tests passed earlier: - -```bash -bun test scripts/__tests__/anthropic-agents.test.ts scripts/__tests__/critic-genealogy.test.ts scripts/__tests__/apply-worker-cli.test.ts -# 50 pass, 0 fail -``` - -## Immediate next steps - -### Step A — Restore green baseline - -Done. `bun run validate` is green. 
- -### Step B — Verify implemented tasks T1-T9 before adding new code - -Run fast, scoped checks first: - -```bash -bun test \ - scripts/__tests__/provision-memory-stores.test.ts \ - scripts/__tests__/register-sim-agents.test.ts \ - scripts/__tests__/context-schema.test.ts \ - scripts/__tests__/synthetic-analytics.test.ts \ - scripts/__tests__/sim-council.test.ts \ - scripts/__tests__/run-simulation.test.ts \ - scripts/__tests__/run-simulation-entrypoints.test.ts \ - scripts/__tests__/build-demo-manifest.test.ts -``` - -Then run full `bun run validate` again. - -### Step C — Finalize T4 uncommitted ugly-site state - -1. Review ugly-site diffs: - - ```bash - git diff -- demo-landing-page/ugly demo-sites/northwest-reno/ugly - find demo-landing-page/ugly demo-sites/northwest-reno/ugly -maxdepth 3 -type f - ``` - -2. Confirm no JS and no external network resources: - - ```bash - rg -n " Topologically ordered. Implement in sequence. Do NOT skip T0. Read `context/VISION.md` before each task and re-read it before marking any task done. - -## Session start protocol - -When a new session starts on this repo with a prompt like "Go" or "start" or "continue": - -1. **Read the first-actions list in `AGENTS.md`** in full (including `context/VISION.md` and this file) before writing any code -2. **Start T0 immediately** — no confirmation needed to begin work -3. **Stop after T0 completes** (validate green + committed). Report completion to Richie in 3–5 lines: what changed, test results, commit hash. Wait for his green-light before starting T1. -4. **From T1 onward, proceed task-by-task without waiting for approval** BUT before starting each new task, post a 2-line announcement: - - Line 1: `Starting T: ` - - Line 2: `Files I'll touch: ` - - This gives Richie visibility to interrupt if the approach is drifting without blocking the default path. - -5. **At any point**, if ambiguity exceeds what VISION.md + this file answer: stop and surface `[STUCK]` with a concrete question. 
Do not compose around it. - -## Per-task loop - -1. Re-read the task's acceptance criteria here -2. Read the files the task touches before editing them -3. Implement minimally — no scope expansion, no drive-by refactors, no "while I'm here" -4. Write the tests listed in acceptance criteria -5. `bun run validate` must be green -6. Conventional commit (`fix:` for T0, `feat:` for expansion tasks). One task = one commit (or one small series) -7. Before marking done, re-read VISION.md's "what's locked" + the task's acceptance criteria. If anything drifted, revisit. - -## Day-by-day target - -- **Day 1**: T0, T1, T3, T4 (infrastructure + assets, parallel-friendly) -- **Day 2**: T2, T5 (agent specs + synthetic analytics) -- **Day 3**: T6, T7, T8 + first dry run -- **Day 4**: T9, T10 + diagnose/re-run if needed + handoff - ---- - -## T0 — Pass-7 review fixes - -**Status**: blocking. 4 of 5 fixes touch simulation-path code; skipping T0 risks contaminating the demo with known bugs. - -**Files**: - -- `scripts/apply-worker-cli.ts:142` — og_card dims 1200x630 → 1536x1024 (or closest supported) -- `scripts/apply-worker.ts:733-739` — `runtime_failure` drops from visual-veto branch, falls through to `apply-fail` -- `.husky/pre-commit:13-15` — add `chomp;` + `print "$_\0"` in perl pipeline -- `scripts/critic-genealogy.ts` — wrap `fetchSessionSnapshot` call in `main()` with try/catch; persist spec.json + snapshot-error sentinel + agent JSON on failure; exit non-zero after commitArtifacts -- Extract shared paginated `findAgentByName` helper, import from both `scripts/planner-invoke.ts` and `scripts/critic-genealogy.ts` - -**Accept**: - -- `bun run validate` green -- New/updated unit tests: `runtime_failure → apply-fail`, snapshot-fetch-fail still writes spec.json, pagination helper finds name on page 2 -- `printf 'foo.ts\0bar.md\0baz.txt\0' | perl -0ne 'chomp; print "$_\0" if /\.(ts|js|json|md|jsonc)$/;' | wc -c` returns 14 -- Conventional commits (one per fix, or one bundled `fix: 
apply pass 7 review items`) - ---- - -## T1 — Memory store provisioning - -**Depends on**: T0 - -Create `scripts/provision-memory-stores.ts` — idempotent provisioner that creates 12 memory stores via `POST /v1/memory_stores` (beta header `managed-agents-2026-04-01`). - -**Stores** (6 per substrate): - -| Store name | Writer | Readers | -| ------------------------------------- | ----------------------------- | ----------------------------- | -| `webster-council-memory-lp` | orchestrator (RW) | all LP sim agents (read_only) | -| `webster-planner-memory-lp` | planner (RW) | planner (RW) | -| `webster-redesigner-memory-lp` | redesigner (RW) | redesigner (RW) | -| `webster-genealogy-memory-lp` | orchestrator (RW) | genealogy logic (read_only) | -| `webster-conversion-critic-memory-lp` | conversion-critic (RW) | conversion-critic (RW) | -| `webster-visual-reviewer-memory-lp` | visual-reviewer (RW) | visual-reviewer (RW) | -| (same 6 names with `-site` suffix) | (parallel for site substrate) | (parallel) | - -**Output**: `context/memory-stores.json`: - -```json -{ - "lp": { - "council": "memstore_01...", - "planner": "memstore_01...", - "redesigner": "memstore_01...", - "genealogy": "memstore_01...", - "conversion-critic": "memstore_01...", - "visual-reviewer": "memstore_01..." - }, - "site": { ... } -} -``` - -**Accept**: - -- Running script twice produces identical output (idempotent by name lookup — if store with `name` already exists, reuse its ID) -- `context/memory-stores.json` contains 12 entries keyed by substrate + role -- Unit test mocks the API, verifies idempotency + error handling (network fail + partial completion resume) -- Script is safe to re-run after partial failure - ---- - -## T2 — 18 new sim-specific agent specs - -**Depends on**: T0 - -Create 18 new MCP-native agent specs. 
**Existing 9 `webster-*` agents are NOT modified.** - -**LP sim set** (9 files under `agents/`): - -- `webster-lp-sim-monitor.json` (Haiku 4.5) -- `webster-lp-sim-seo-critic.json` (Sonnet 4.6) -- `webster-lp-sim-brand-voice-critic.json` (Sonnet 4.6) -- `webster-lp-sim-fh-compliance-critic.json` (Sonnet 4.6) -- `webster-lp-sim-conversion-critic.json` (Sonnet 4.6) -- `webster-lp-sim-copy-critic.json` (Sonnet 4.6) -- `webster-lp-sim-redesigner.json` (Opus 4.7) -- `webster-lp-sim-planner.json` (Opus 4.7) -- `webster-lp-sim-visual-reviewer.json` (Opus 4.7) - -**Site sim set** (9 files under `agents/`): - -- `webster-site-sim-monitor.json` -- `webster-site-sim-seo-critic.json` -- `webster-site-sim-brand-voice-critic.json` -- `webster-site-sim-licensing-and-warranty-critic.json` (replaces fh-compliance slot, Sonnet 4.6) -- `webster-site-sim-conversion-critic.json` -- `webster-site-sim-copy-critic.json` -- `webster-site-sim-redesigner.json` -- `webster-site-sim-planner.json` -- `webster-site-sim-visual-reviewer.json` - -**System prompt differences from existing `webster-*` agents**: - -- **No WebFetch**. All site reads via `get_file_contents` (GitHub MCP) at the demo branch ref passed in user.message (e.g. `ref: demo-sim-lp/w03`) -- **No LP_TARGET URL** reference. Replace with substrate-appropriate context block -- **Context paths substrate-specific**: LP agents read `demo-landing-page/context/business.md`; site agents read `demo-sites/northwest-reno/context/business.md` -- **Site pages (site set only)**: redesigner + critics reference the 3-page structure (`/`, `/services`, `/free-estimate`) -- **licensing-and-warranty-critic**: scoped to contractor licensing number display, insurance claims, warranty terms, service-area clarity -- **Brand-voice critic**: reads `brand.json` + `business.md`, enforces voice + do_not_use - -**Registration**: via idempotent `POST /v1/agents` (by-name lookup before POST). 
Wrap in `scripts/register-sim-agents.ts` or extend existing registration script. - -**Accept**: - -- All 18 specs validate against existing JSON schema -- `scripts/register-sim-agents.ts` idempotent: re-running doesn't duplicate -- Spec schema tests cover both sets -- No reference to `LP_TARGET` or WebFetch anywhere in the 18 new specs -- Existing 9 `webster-*` agents unchanged (diff check) - ---- - -## T3 — Prefilled contexts - -**Depends on**: T0. Can run in parallel with T2. - -### 3a — Richer Health (LP) - -Directory: `demo-landing-page/context/` - -- `business.md` — copy from existing `context/business.md` (already Richer-Health-scoped) -- `personas.json` — 3 personas extracted from `.claude/skills/nicolette-richer/references/brand-bible.md`. Each persona: `{id, name, archetype, goals, anxieties, conversion_triggers, behavior_hints}`. Suggested: "credentials-conscious-executive" / "curious-self-starter" / "skeptical-researcher". -- `brand.json` — structured: `{voice, tone, palette, typography, signature_phrases, do_not_use}`. Extract from brand bible. - -### 3b — Northwest Home Renovations (site) - -Directory: `demo-sites/northwest-reno/context/` - -- `business.md` — invent from scratch. Fields: business name, owner ("Sam Reyes"), location (Pacific Northwest, non-specific town), services (kitchen / bath / deck renovation), license number (fictional, e.g. WA-CONTR-NWR-2024), warranty terms ("5-year workmanship, 10-year structural"), insurance ("$2M liability"), tone ("competent, direct, trust-heavy"). -- `personas.json` — 3 B2C homeowner personas: "first-time-homeowner-anxious" (scared of being scammed), "price-comparing-pragmatist" (getting 3 quotes), "warranty-conscious-veteran" (has been burned before). -- `brand.json` — palette (navy/white/safety-orange OR forest-green/cream/brass — pick one, document choice), typography (clear sans-serif + utility), voice (direct + trust-heavy), do_not_use (no superlatives, no "world-class", no generic "quality"). 
- -**Accept**: - -- Both contexts validate against a shared schema you define in the task (even a simple Zod schema in `scripts/context-schema.ts` is fine) -- Both brand extracts are rich enough to give the brand-voice critic concrete rules to enforce (at least 5 do_not_use items, palette with hex codes, typography with font families) -- No cross-contamination (contractor context never references Richer Health; LP context never references Northwest Reno) - ---- - -## T4 — Ugly sites - -**Depends on**: T3 (needs brand.json to know what the ideal is, so we can deliberately violate it). Can run in parallel with T2. - -### 4a — Richer Health ugly - -Directory: `demo-landing-page/ugly/` - -- `index.html` — single file, intentionally unpolished -- `style.css` — inline acceptable; keep minimal -- `README.md` — "Intentionally ugly. Do NOT improve outside simulation." - -**Characteristics** (each is something a specific critic should flag): - -- Generic stock hero image (not Nicolette's actual photo) — conversion / brand-voice / SEO ding -- Vague headline "Health & Wellness Coaching" — copy / conversion ding -- No credentials anywhere — fh-compliance / brand-voice ding -- Weak CTA "Learn More" — conversion ding -- Times New Roman everywhere — brand-voice / visual-review ding -- Center-aligned body text, no hierarchy — visual-review / copy ding -- No testimonials / social proof — conversion ding - -Reference (human-read only, not committed to repo references): existing `site/before/index.html` for layout structure. Do NOT copy — derive an intentionally-worse version. 
- -### 4b — Contractor ugly (3 pages) - -Directory: `demo-sites/northwest-reno/ugly/` - -- `index.html` (home) + `style.css` -- `services.html` -- `free-estimate.html` -- `README.md` - -**Characteristics**: - -- Home: Times New Roman, clip-art header, no photos of real work, generic phrases ("Best in the business!"), CTA is bare text link "Contact us" -- Services: a bulleted list with no descriptions, no prices, no warranties mentioned -- Free-estimate: unlabeled form inputs, no required-field markers, no phone number option, no expected-response-time -- Cross-page: inconsistent nav, no footer, no license number anywhere, no insurance mention, no before/after photos - -**Accept**: - -- Both ugly states commit to dedicated demo branches (`demo-sim-lp/w00`, `demo-sim-site/w00`) -- No JavaScript, no external network resources (self-contained HTML/CSS) -- Loaded in a browser they render (no broken markup); they're ugly, not broken -- Diff against `brand.json` shows broad violation — every persona and every brand rule has something to attack - ---- - -## T5 — Synthetic Analytics Agent - -**Depends on**: T3 - -Build `scripts/synthetic-analytics.ts` — generates per-week analytics reacting to current site state. 
- -**Inputs** (JSON file passed via CLI or stdin): - -```ts -{ - substrate: "lp" | "site", - week: number, // 0-indexed, 0 = baseline - weekDate: string, // ISO, for seasonality - sitePath: string, // absolute path to site dir for current week - contextPath: string, // absolute path to context dir - previousAnalytics?: AnalyticsJson, // week N-1, absent on week 0 - seed: string // determinism -} -``` - -**Output**: - -- `analytics.json` — schema matches existing `scripts/analytics-ingestion.ts` (`sessions`, `bounce_rate`, `avg_time_s`, `scroll_depth_{25,50,75,100}`, `cta_clicks` per CTA, `section_engagement[]`) -- `analytics-reasoning.md` — per-persona narrative of why metrics moved (3–5 sentences each) - -**Agent invocation**: - -- Uses `/v1/messages` (not Managed Agents) for simplicity — synthetic analytics is one-shot, no memory needed -- Model: Opus 4.7 (judgment-heavy) -- System prompt includes: persona distribution (5000 users × 3 personas, fixed), hard continuity (±15% per metric unless justified), seasonality hints, realistic event variance, no bias toward specific gaps - -**Accept**: - -- Golden-file test: given fixed seed + fixed week-0 HTML + personas, produces identical analytics.json on re-run -- Continuity test: given week-0 output as previousAnalytics + SAME site (unchanged), week-1 deltas stay within ±5% per metric (no change = no reason to swing) -- Continuity test: given week-0 output + MUTATED site (hero copy improved), week-1 bounce_rate drops by 5–20%, justification in reasoning.md -- Schema-compatibility test: output `analytics.json` parses cleanly via existing `analytics-ingestion.ts` normalizer - ---- - -## T6 — Sim orchestrator fork - -**Depends on**: T2, T3, T4 - -Fork `prompts/second-wbs-session.md` → `prompts/sim-council.md`. Parameterize the hardcoded values. - -**Changes from source**: - -- Header block takes env vars: `SUBSTRATE` (`lp`|`site`), `WEEK_DATE`, `BRANCH` (e.g. 
`demo-sim-lp/w03`), `AGENT_SET` (`webster-lp-sim`|`webster-site-sim`), `CONTEXT_PATH`, `SITE_PATH`, `MEMORY_STORES_JSON` -- Drop the `LP_TARGET=https://certified.richerhealth.ca` line and remove all WebFetch-based critic instructions (sim agents already read via MCP) -- Drop the 10-week mock-history seeder (Step 1) — simulation wrapper generates fresh analytics per week via T5 -- Agent IDs sourced from `context/sim-agents.json` (produced by T2's registration script), keyed by `$AGENT_SET` -- Memory-store attachment in every `POST /v1/sessions` call — attach the role-appropriate store from `$MEMORY_STORES_JSON` - -**Accept**: - -- `sim-council.md` passes shellcheck on its bash blocks -- Running with `SUBSTRATE=lp WEEK_DATE=2026-02-01 BRANCH=demo-sim-lp/w00 ... wbs @prompts/sim-council.md` produces a week-0 council run with all agents invoked via sim IDs -- Production `prompts/second-wbs-session.md` untouched (diff check) - ---- - -## T7 — Simulation wrapper - -**Depends on**: T5, T6 - -Build `scripts/run-simulation.ts` — library + CLI that loops N weeks for one substrate. - -**Flow per week**: - -1. Checkout/create demo branch `demo-sim-<substrate>/w<NN>` -2. If week 0: commit the ugly site; else use previous week's branch as base -3. Call Synthetic Analytics Agent (T5) → write `history/<substrate>/w<NN>/analytics.json` -4. Spawn `prompts/sim-council.md` with env vars for this week -5. After orchestrator completes: capture screenshots at 3 breakpoints × all pages using Playwright on the local file (no deploy needed — Playwright can open file:// URLs) -6. Write memory-store summaries via REST API (council + planner + redesigner insights) -7. 
Bundle week artifacts into `demo-output/<substrate>/week-NN/` - -**Accept**: - -- Config-driven (substrate specifier, week count, paths) — not substrate-hardcoded -- Unit test with mock council (no real API calls) runs 2-week loop end-to-end -- Screenshot capture works with Playwright headless on `demo-landing-page/ugly/index.html` (file://) -- Fixed seed → identical demo branch HEAD after N weeks - ---- - -## T8 — Per-substrate invocations - -**Depends on**: T7 - -Thin entry scripts: - -- `scripts/run-simulation-lp.ts` — calls `run-simulation.ts` with `substrate=lp` + LP paths + 10 weeks -- `scripts/run-simulation-site.ts` — calls `run-simulation.ts` with `substrate=site` + site paths + 10 weeks - -**Accept**: - -- `bun scripts/run-simulation-lp.ts` runs 10 weeks end-to-end, ~30–45 min -- `bun scripts/run-simulation-site.ts` same -- Output directories `demo-output/landing-page/` and `demo-output/northwest-reno/` both populated with week-00 through week-10 artifacts -- Memory Stores Console shows 12 entries populated - ---- - -## T9 — Demo manifest + final sheets - -**Depends on**: T8 - -Build `scripts/build-demo-manifest.ts` — aggregates simulation output. - -**Per-substrate outputs**: - -- `demo-output/<substrate>/demo-manifest.json` — machine-parseable index of all weeks, screenshots, council artifacts, genealogy events, memory-store references -- `demo-output/<substrate>/final-sheet.png` — side-by-side week-0 vs week-10 desktop hero shot (ffmpeg or ImageMagick) - -**Accept**: - -- Manifest validates against a schema you define -- Final sheet is visually compelling (real improvement visible) -- Manifest includes absolute paths the downstream video-composition session can feed to Remotion - ---- - -## T10 — End-to-end dry run + handoff - -**Depends on**: T7, T8, T9 - -Run both simulations. Inspect outputs. Decide. 
- -**Accept — all must be true before handoff to video composition**: - -- Both `demo-output/` substrates contain full 10-week progressions -- Screenshots visually coherent (no blank pages, no JS errors, layouts render at all 3 breakpoints) -- Memory Stores Console shows 12 stores with content (open one, verify it contains meaningful summaries) -- Genealogy log shows what happened (a spawn, or a diagnosed-then-fixed non-spawn, or an explicit "no spawn in 10 weeks" with investigation notes) -- If no spawn and no budget to re-run: accept outcome, update VISION.md risk section with the finding, proceed to video composition with improvement-only narrative - -**Handoff deliverable** (for fresh Claude Code session to compose video): - -- `demo-output/<substrate>/demo-manifest.json` × 2 -- `demo-output/<substrate>/final-sheet.png` × 2 -- Memory-Stores-Console screenshots (captured manually by Richie) -- Nicolette clip (recorded separately by Richie) -- Onboarding skill recording (recorded separately by Richie) -- Brand bible content for copy/narration reference - ---- - -## Validation checkpoints - -Before moving to the next task, verify: - -1. `bun run validate` green -2. Committed (conventional commit message) -3. Re-read VISION.md's "what's locked" section — did you drift? -4. Flag anything unexpected with `[STUCK]` prefix before continuing - -## When genuinely stuck - -- Re-read VISION.md. The vision is the real contract. -- Surface the block to Richie. Don't produce composed-looking workarounds. -- Visible struggle > invisible corner-cutting. - ---- - -## Tier 2 implementation tasks (case-study + auto-capture support) - -> Added 2026-04-25. These tasks support the Tier 2 demo asset (Empire Asphalt onboarding case study video + automated Anthropic Console screenshot capture for Beat 5). Specced in `context/ONBOARDING-CASE-STUDY.md` and `prompts/sim-runner.md`. 
T11 is **blocking** T8/T10 because the sim must emit capture triggers and the bridge must consume them; T12 and T13 are case-study-only and can run parallel to T8–T10. - -## T11 — Auto-capture infrastructure - -**Depends on**: T7 (sim wrapper) -**Blocks**: T8 (sim invocations should emit capture triggers from the start), T10 handoff (Memory Stores screenshots are part of the deliverable) - -Wire capture-trigger emission into the sim wrapper, build the bridge process that reads triggers and spawns captures, and build the capture script that drives the `browser-use` CLI. - -**Before writing any T11 code (5-min pre-flight):** manually drive `browser-use` once against the real Anthropic Console memory stores page and capture the actual selectors: - -```bash -browser-use --profile "Default" open https://console.anthropic.com # navigate to memory stores via the UI -browser-use state # dump real selectors and URL -``` - -Copy the real list-page URL and a real container selector from the `state` output into `scripts/capture-mem-stores.ts`. The `[data-testid='memory-stores-list']` selector and `/settings/memory-stores` path used in design docs are intuition, not verified — replacing them with what `browser-use state` actually returns prevents a silent hang in the capture script. 
- -**Code:** - -- Modify `scripts/simulation-core.ts` to emit `CAPTURE_TRIGGER` JSON lines on stdout at weeks 1, 5, and 10 (exact format spec in `prompts/sim-runner.md` "Trigger protocol") -- Add `scripts/capture-mem-stores.ts` — accepts `{substrate, week, output}` from a trigger payload, shells out to `browser-use --profile "Default"` for navigation + screenshot, verifies the captured PNG is not a login page (size + text heuristic), exits 0 on success or non-zero with `AUTH_EXPIRED` on stderr if logged out -- Add `scripts/sim-capture-bridge.ts` — reads stdin line-by-line, passes through unchanged to its own stdout, parses lines that match `{"event":"CAPTURE_TRIGGER",...}`, spawns the capture script for each, halts the pipe on capture failure -- Add `bun run sim:preflight` script — checks: 18 sim agents registered, 12 memory stores provisioned, `console.anthropic.com` reachable via `browser-use`, `bun run` for sim scripts compiles -- Add `bun run sim:emit-manifest` script — at end of sim, walks `assets/memory-stores-screenshots/` and writes `manifest.json` consolidating the 6 PNG paths and per-week sizes - -**Accept:** - -- `bun run sim:preflight` returns 0 against a fully-provisioned environment -- A 1-week dry run (force `CAPTURE_TRIGGER` at week 1) writes a real authenticated Anthropic Console screenshot to `assets/memory-stores-screenshots/lp/week-1.png` — file > 100KB, visibly contains the memory stores list page (not a login screen) -- An auth-expired dry run (intentionally signed out of Console) makes the capture script exit non-zero with `AUTH_EXPIRED` on stderr, and the bridge halts the pipe rather than silently continuing -- Trigger protocol JSON format exactly matches `prompts/sim-runner.md` "Trigger protocol" section - -## T12 — `webster-onboarding` v2 skill + verify-all script - -**Depends on**: T1 (memory provisioning script), T2 (production agent specs already registered) -**Blocks**: case-study video recording - -Rewrite the onboarding skill from 
the b3fd05f baseline to fit the v2 phase model and v2 stack. Build the rollup verify script the skill drives at P3/P4 gates. - -**Code:** - -- `skills/webster-onboarding/SKILL.md` — phase model (P0–P5), status file at `context/onboarding-status.json`, dynamic Q&A in P1, key-safety disclaimer at P2, machine-checked gates at each phase boundary, resume-from-status-file at startup. Full spec in `context/ONBOARDING-CASE-STUDY.md` "Skill design — webster-onboarding v2" -- `scripts/onboarding/verify-env.ts` — reads `.env.local`, hits each provider's verify endpoint, returns ok/fail without echoing key values -- `scripts/onboarding/verify-all.ts` — runs all P3 + P4 checks (env + repo + memory stores + agents) as a single rollup; supports `--phase {p3,p4}` flag -- `scripts/onboarding/scaffold-repo.ts` — creates a fresh GitHub repo under the user's account, scaffolds an Astro starter using brand identity from `context/business.yaml` - -**Accept:** - -- `bun run onboarding:verify-all` exits 0 only when all of: `.env.local` has the 3 keys verified live, target GitHub repo is reachable via the user's PAT, `GET /v1/agents` returns ≥9 production agents, `GET /v1/memory_stores` returns ≥6 stores -- Skill, run twice on a fresh environment, produces identical state (idempotent) -- A test run on a clean environment, with all gates failing intentionally, reports the specific failing check + remediation hint, persists the status file, and resumes correctly when re-run after fixes -- No key values appear in stdout, stderr, or any committed file at any point - -## T13 — Empire Asphalt Paving substrate prep - -**Depends on**: dad consent (logged at `assets/onboarding-case-study/dad-consent.txt`) -**Blocks**: case-study video recording - -Hand-craft the ugly v0 of dad's site, fill the brand corpus, and create a fresh GitHub repo for the case study install to land into. 
- -**Code + assets:** - -- `context/brand-corpus/` populated with: logo.png, business-card.jpg, past-jobs/{1..3}.jpg, service-list.md, reviews.md, voice-notes.md (full spec in `context/ONBOARDING-CASE-STUDY.md` "Brand corpus") -- Fresh GitHub repo `richsak/empire-paving-demo` (private) containing a hand-crafted ugly v0 — single Astro page with the brand colors (`#1B47A1` royal blue, `#F9D71C` yellow), bad layout, missing trust signals, no responsive breakpoints. Acceptable to piggyback on T4's ugly-site fork script if it generalizes cleanly. -- `assets/onboarding-case-study/dad-consent.txt` — one-line acknowledgment confirming dad has agreed to use of business name, logo, and paraphrased quotes in the submission video. Do not commit a PII-heavy version. - -**Accept:** - -- `git clone richsak/empire-paving-demo` succeeds and the cloned site builds (`bun run build`) without errors -- The ugly v0 visibly uses the Empire palette and identity (not generic gray) -- Dad consent artifact exists in `assets/onboarding-case-study/` -- Brand corpus directory contains all 6 corpus categories, with at least placeholder contents for any items dad doesn't have real assets for (e.g. reviews paraphrased from real reviews if Google reviews are sparse) diff --git a/context/ONBOARDING-CASE-STUDY.md b/context/ONBOARDING-CASE-STUDY.md index e0c4654..97be371 100644 --- a/context/ONBOARDING-CASE-STUDY.md +++ b/context/ONBOARDING-CASE-STUDY.md @@ -23,7 +23,7 @@ This is **not** a role-play. 
Richie narrates from the operator/builder perspecti | Q1 | asset = case study video, not role-play | dad's domain is real, dad's quote is real (paraphrased), Richie remains himself | | Q2 | persona dissolved — Richie is Richie, dad is the user | no character swap | | Q3 | skill v2 = thin shell + scripts | matches Layer 4 architecture; UX layer over orchestration | -| Q4 | skill provisions full v2 stack: 9 agents + 6 memory stores + first council | matches video marquee feature | +| Q4 | skill provisions full v2 stack: 10 production agents + 6 memory stores + first council | matches video marquee feature | | Q5 | skill = brand context + infra wiring only; site code is upstream | Claude Design zip → Astro is a separate future skill | | Q6 | substrate = Empire Asphalt Paving (`empireasphalt.ca` parked, repo modern but undeployed) | strongest narrative — "domain owned, no real site, Webster built it" | | Q7 | context capture has 3 sources: URL scrape, file uploads, dynamic Q&A | fills brand memory from whatever surfaces exist | @@ -51,7 +51,7 @@ This is **not** a role-play. Richie narrates from the operator/builder perspecti | 8–33s | P1 Context capture | drag `logo.png`, `business-card.jpg`, `past-jobs/`, `voice-notes.md` into chat; skill auto-asks 2–3 dynamic gap-fills (voice register, do-not-use list, target customer) | Richie VO paraphrasing dad: _"Eighteen years paving. Family business. Premium handcraft, not the cheap-truck guys."_ | | 33–41s | P2 Prep checklist | checklist appears in chat: Anthropic key, GitHub access, Cloudflare token | VO: _"Three keys. He pastes them on his own machine. The skill never sees them."_ | | 41–56s | P3 Execute | user pastes keys locally (off-screen disclaimer overlay: _"Keys never typed in chat — pasted into `.env.local` on dad's machine"_); GitHub repo scaffolded; `.env.local` appears | VO: _"Skill writes nothing it can't see. 
Keys stay local."_ | -| 56–68s | P4 Verify | green checks roll in: env ✓ / repo ✓ / 6 memory stores provisioned ✓ / 9 agents registered ✓ | VO: _"Six memory stores. Nine agents. Wired in seconds."_ (deliberately vague — actual install time will be measured at recording and the pacing edited to match what the visuals show) | +| 56–68s | P4 Verify | green checks roll in: env ✓ / repo ✓ / 6 memory stores provisioned ✓ / 10 production agents registered ✓ | VO: _"Six memory stores. Ten agents. Wired in seconds."_ (deliberately vague — actual install time will be measured at recording and the pacing edited to match what the visuals show) | | 68–90s | P5 First council | session ID flashes; PR URL surfaces; week-1 redesign of dad's site appears in browser tab; cut to Webster wordmark | VO: _"First council fires. Reads his brand. Proposes week-one redesign. Dad reviews. Merges if he likes it."_ + paraphrased dad quote: _"He told me, 'I don't even need to think about it.'"_ | **Hard length**: 90s. **Floor**: 60s collapse via the drop priority below. 
@@ -127,14 +127,14 @@ P5 First council — trigger session, surface PR URL, end ### Phase exit gates (machine-checked) -| phase | gate | check | -| ----- | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| P0 | soft | user typed "ready" | -| P1 | hard | `context/business.yaml` exists + ≥1 source signal recorded | -| P2 | hard | checklist all `[x]` | -| P3 | hard rollup | `bun run onboarding:verify-all` green: `.env.local` exists + `gh repo view` ok + `GET /v1/agents` returns 9 + `GET /v1/memory_stores` returns 6 | -| P4 | hard | same rollup re-runs green | -| P5 | hard | session_id returned + PR URL surfaced | +| phase | gate | check | +| ----- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| P0 | soft | user typed "ready" | +| P1 | hard | `context/business.yaml` exists + ≥1 source signal recorded | +| P2 | hard | checklist all `[x]` | +| P3 | hard rollup | `bun run onboarding:verify-all` green: `.env.local` exists + `gh repo view` ok + `GET /v1/agents` returns the count of production specs in `agents/*.json` (currently 10) + `GET /v1/memory_stores` returns ≥6 | +| P4 | hard | same rollup re-runs green | +| P5 | hard | session_id returned + PR URL surfaced | Gate failure → show the specific check that failed + remediation hint + halt with status file preserved. User fixes, re-runs skill, resume from same phase. diff --git a/context/ROADMAP.md b/context/ROADMAP.md deleted file mode 100644 index 8ba1823..0000000 --- a/context/ROADMAP.md +++ /dev/null @@ -1,192 +0,0 @@ -# Webster Roadmap — the map - -> Single source of truth for "where am I, what's next, what did I sign up for." -> Read top-to-bottom when lost. Regenerate from `context/FEATURES.md` if it drifts. 
- -## The one-paragraph map - -Webster is a **Council of Claude Managed Agents** that autonomously redesigns a small-business landing page, week after week, with **Opus 4.7 spawning new critics at runtime** when it spots patterns existing critics miss. The novel mechanic is **Critic Genealogy** — agents creating agents. The hackathon submission for Anthropic × Cerebral Valley "Built with Opus 4.7" is due **Sunday April 26 2026, 8PM EST** (~70h from now). Target prize lanes: Managed Agents $5K (62-72/100) + Creative Exploration $5K (48-58/100) + Grand $50K (18-25/100). - -## North-star invariant - -**Validate before human approval.** Every change passes the full validation stack — static critics → runtime gate → visual reviewer → autoresearch verdict — before it reaches a PR in Richie's inbox. Human is the last ratchet, not the first debugger. If a feature doesn't connect to this principle, it's out of scope. - -## Where we are right now (2026-04-23) - -- **Branch**: `main`, 5 commits ahead of `origin/main` (push-blocked by permission policy — Richie's action) -- **Submission runway**: ~70 hours to deadline -- **Shipped**: Layers 1–4 + 7 — 24 features in ~12 hours of focus-work -- **In-progress**: 1 (Layer 1 live-artifact pattern) -- **Blocked**: Layer 6 video (5 features, Richie voice record) -- **Open loops**: 3 deadline-critical + 4 scope-expansion layers (see below) -- **Full submission scope** (updated): Layers 1–11 — everything ships before 4/26 - -## Timeline correction (my estimation bias) - -My earlier estimates assumed calendar-hours. Actual tempo: **24 features in 12 hours** with Forge + Pi workers dispatched in parallel. The remaining 4 layers (L8 + L9 + L10 + L11) total ~55 focus-hours of feature work, which at your tempo compresses to ~18–25h of your wall time. Fits in 70h runway with room for the video, form, and a voice-surgery polish pass. 
- -Operating assumption from here forward: **every feature you name is in pre-submission scope unless you explicitly say otherwise.** - -## Layer-by-layer truth - -| Layer | Theme | Status | Features | -| ----- | ------------------------------------------ | -------------------------------------- | ---------------------------------------- | -| L1 | Routine + Orchestrator | shipped | #2–6 done; #1 cut; #5 in-progress | -| L2 | 7 Managed Agent Critics | shipped | #7–#12 done | -| L3 | **Critic Genealogy (HERO)** | shipped, live-validated | #13–#17 done | -| L4 | Onboarding Skill | shipped | #18, #19, #23, #24 done; #20–#22 cut | -| L5 | Substrate + Mock History | core shipped | #27 done; #25, #26, #28 cut | -| L6 | Meta Video | blocked | #29–#33 waiting on voice record | -| L7 | Polish | mostly shipped | #34–#36 done; #37 todo (Richie action) | -| L8 | **v2: Apply worker, text-only** | planned — ships FIRST | #38 done; #39a–e, #40a–d todo | -| L10 | **v2.5: Designer scope expansion** | planned — ships SECOND | #47–#49 todo (kind+constraints+verifier) | -| L9 | **v3: Visual review + Autoresearch** | planned — ships THIRD (0bb9db2) | #41a–d, #42–#46 todo | -| L11 | **v4: Planner + experiment-aware council** | planned — ships LAST (closes the loop) | #50–#53 todo (NEW this session) | - -## What's new THIS session (session 4) - -- `61cfae4` — `site/before/` + `site/after/` forked from live `certified.richerhealth.ca`; 5-issue proposal applied by hand to `after/` -- `475e129` — `context/v2-design.md` grill-me answers; Layer 8 decomposed into #39a-e + #40a-d -- `a1cb0e5` — advisor-caught regression fix: "No more patient churn" restored in Issue 4 hero -- `0bb9db2` — Layer 9 added (9 sub-features: visual-reviewer chain + autoresearch chain) + 6 hero screenshots as motivating evidence -- `f34858d` — `context/ROADMAP.md` — single source of truth for roadmap + narrative -- **PENDING (this phase)**: `context/DOMAIN-MODEL.md` (NEW) + Layer 10 + Layer 11 added to FEATURES.md - 
-All commits local-only. Push permission policy blocks direct push to main; Richie-action item. - -## Architectural shift locked this session - -Autoresearch is NOT a back-end post-merge feedback loop. It's the **input** to the next council run. A **planning agent** sits before the 5 critics + redesigner, reads last week's verdict + what-changed, decides experiment direction for this week, then the council runs with `plan.md` as context. - -This is the shift from **autonomous weekly redesigner** → **autonomous experiment agent**. See `context/DOMAIN-MODEL.md` for the formal model + week lifecycle + grill-me questions on the 7 remaining architectural decisions. - -## Deadline-critical loops for 4/26 - -**Human-only actions** (no Claude can do these): - -1. **Cerebral Valley submission form** (#37; ~15 min; Richie-only) -2. **Demo video voice record** (Layer 6 blocker; ~1h record + Saturday assembly) -3. **Push local commits to origin/main** (1-min terminal action, required before submission) - -**Scope-expansion layers that ALSO ship pre-submission** (Claude+worker parallelizable): - -- **Layer 8** — apply worker text-only (~18h feature work) -- **Layer 10** — designer scope expansion (~7h feature work) -- **Layer 9** — visual review + autoresearch measurement (~18h feature work) -- **Layer 11** — planner agent + experiment-aware council (~12h feature work, NEW) - -Ordering: 1–3 anytime. 4–7 in dependency order (L8 → L10 → L9 → L11). Grill-me on DOMAIN-MODEL.md open questions unblocks L11 implementation. - -## Pre-submission roadmap — L8 → L10 → L9 → L11 (dependency order) - -All four layers exist to make Webster **genuinely autonomous**, not just autonomously-change-producing. Build order matters: - -### L8 (v2) — Apply worker, text-only | ~18h total - -**Why it exists**: today the council emits `proposal.md`. No code changes. L8 turns proposal into PR diffs.
Text-level only — council says "change X to Y", apply runs find-replace, runs lint/type/format, emits a PR. - -| # | Feature | Hours | -| ------ | ------------------------------------------------------------------------- | ----- | -| #38 | site/ fork — DONE session 4 | ✅ | -| #39a | Apply worker core (Pi worker via Forge, worktree-isolated) | 4–6 | -| #39b | Runtime validation gate (Playwright: CTAs resolve, no JS errors) | 2–3 | -| #39c | Critic re-run gate (0 new CRITICAL, ≤2 new HIGH; 3-iter fix loop) | 2 | -| #39d | Per-cluster PR emission (1–3 issues/PR, max 3 PRs/week) | 3 | -| #39e | CF Pages preview URL wiring | 1–2 | -| #40a–d | Image-gen tool (tool schema, backend, brand persistence, #39 integration) | 7 | - -**Testable when**: `wbs @prompts/fifth-wbs-session.md` produces a PR with real code diffs, not just `proposal.md`. - -### L10 (v2.5) — Designer scope expansion | ~7h total - -**Why it exists**: session-4 proved text-only proposals aren't enough. Longer copy needs smaller font-size to keep hero rhythm. Without L10, the council is a **copy-editor council**, not a **design council**. L10 lets the designer propose CSS/layout/component changes as first-class issues. - -| # | Feature | Hours | -| --- | ------------------------------------------------------------------------------------- | ----- | -| #47 | Proposal schema v2 (kind-aware: text/css/component/asset + constraints block) | 2 | -| #48 | Apply worker multi-kind routing (tool per kind) | 3 | -| #49 | Visual-reviewer constraint verifier (asserts declared constraints in rendered output) | 2 | - -**Testable when**: council proposes "shorter subhead + 0.75× hero font-size + 3-line desktop H1 constraint" as ONE atomic issue; apply worker executes all three together; visual-reviewer confirms constraint met. - -### L9 (v3) — Visual review + Autoresearch | ~18h total - -**Why it exists**: L8 and L10 ship changes. L9 **verifies they work**. 
Two halves: - -**Visual reviewer** (runs immediately post-apply, pre-PR): - -| # | Feature | Hours | -| ---- | --------------------------------------------------------------------------------------------------- | ----- | -| #41a | `agents/webster-visual-reviewer.json` spec (Opus 4.7) | 1 | -| #41b | `skills/webster-browser-audit/SKILL.md` (Playwright screenshot + a11y tree + interaction recording) | 3 | -| #41c | Proposal-intent verifier (content presence + overflow detection) | 2 | -| #41d | #39 integration (3-iteration fix-hint loop back to apply worker) | 1 | - -**Autoresearch** (runs post-merge, week+ cycles): - -| # | Feature | Hours | -| --- | ------------------------------------------------------------------------------------- | ----- | -| #42 | Analytics ingestion (CF Worker pixel → D1 or PostHog/GA4 webhook) | 3 | -| #43 | Baseline tracker + change log | 2 | -| #44 | Verdict engine (proxy-first fast signal + CVR slow confirm; asymmetric rollback gate) | 3 | -| #45 | Auto-rollback worker (git revert → CF preview → draft PR for override) | 2 | -| #46 | Baseline promoter (2-week sustained improvement → new baseline) | 1 | - -**Testable when**: visual-reviewer blocks a known-bad session-4-style regression; autoresearch rolls back a week that hurts proxy metrics; baseline promoter advances after 2 good weeks. - -### L11 (v4) — Planner + experiment-aware council | ~12h total | NEW - -**Why it exists**: L9 measures last week's experiment. But measurement without decision is half a loop. L11 adds a **planning agent** that sits BEFORE the 5 critics + redesigner, reads last week's verdict + what-changed, decides direction for this week (promote / hold / rollback), and feeds `plan.md` as context to the council run. Closes the autonomy loop — Webster becomes an **experiment agent**, not a weekly redesigner. 
- -| # | Feature | Hours | -| --- | ----------------------------------------------------------------------------------------------------------------- | ----- | -| #50 | `agents/webster-planner.json` (Opus 4.7) — reads verdict + what-changed, decides next-experiment direction | 2 | -| #51 | Verdict → plan pipeline — orchestrator invokes planner with verdict.json + proposal.md + apply-log + monitor data | 3 | -| #52 | Plan → council integration — critics + redesigner read plan.md as input context | 3 | -| #53 | Cold-start behavior — week 1 with no prev verdict; planner outputs "explore broadly" default plan | 2 | - -**Testable when**: week N+1 council run reads week N verdict automatically; planner outputs plan.md before critics spawn; critics + redesigner have plan.md in context; end-to-end cycle (propose → apply → review → merge → measure → verdict → plan → propose) runs in simulator without human touch between measure and plan. - -**Grill-me questions blocking L11**: 7 open decisions listed in `context/DOMAIN-MODEL.md`. Richie answers → implementation unblocks. - -## Decisions waiting on you - -Ranked by blast radius: - -1. **Push path for 5 local commits** — direct push to main, OR PR branch? (blocks submission) -2. **Cerebral Valley submission form** (#37) — Richie-only 15-min task -3. **Voice record scheduling** — Sat AM? blocks Layer 6 video (~3h cleanup after) -4. **L11 grill-me answers** — 7 open questions in `context/DOMAIN-MODEL.md` unblock planner implementation -5. **Session-4 hero voice-surgery** — revert copy to BEFORE wording (85/100), or trim line 3 (75/100), or keep as cautionary-tale artifact (45/100)? My pick: option 1 after v2 apply worker lands, as the first-ever apply-worker PR demo -6. **`[R-confirm]` in `context/v2-design.md`** (3 items): visual-regression cost threshold, `gpt-image-1` as image backend default, PR `summary.json` alongside markdown - -## Three things to hold in your head - -Everything else is noise until these land: - -1. 
**Submit by 4/26** — form + video + push origin (human-only actions) -2. **Answer L11 grill-me** — 7 questions in `context/DOMAIN-MODEL.md` unblock the planner agent and the full-cycle autonomous claim -3. **Kick off L8 #39a** — apply worker core. First real PR with mutated code. Everything downstream (L10 → L9 → L11) layers on top - -The rest exists. Those three are the **bottleneck path**. Full submission scope is now all 11 layers — ~18–25h wall time at your tempo, fits in 70h runway. - -## How this doc relates to the rest - -- `context/FEATURES.md` — canonical per-row status. This doc quotes it; FEATURES.md is authoritative for "what's `todo` vs `done`." -- `context/ARCHITECTURE.md` — system diagram + layer breakdown. This doc is the narrative overlay. -- `context/v2-design.md` — grill-me answers + rationale for Layer 8 decomposition. This doc summarizes; v2-design.md is the detailed record. -- `~/Vault/Projects/webster/Webster.md` — cross-session hub + pitch. -- `~/Vault/Projects/webster/webster-open-loops.md` — action queue (vault-scoped, broader than this doc's 3 open loops). - -## How to use this doc - -- **Lost** → read top to bottom in 5 min -- **Before a session** → skim "what's new this session" + "three things to hold in your head" -- **After a decision** → update "decisions waiting on you" or ask me to -- **On a pull request** → cross-check "Layer-by-layer truth" table - -This doc is the truth-source for roadmap questions. If `context/FEATURES.md` contradicts this about per-row state, FEATURES.md wins. If anything contradicts this about layer-narrative or ordering, this wins. - ---- - -Last regenerated: 2026-04-23 (session 4 Phase 5, after Layer 9 commit + Layer 10 proposal). 
diff --git a/context/SITE-FORK-CHECKLIST.md b/context/SITE-FORK-CHECKLIST.md deleted file mode 100644 index 964bb08..0000000 --- a/context/SITE-FORK-CHECKLIST.md +++ /dev/null @@ -1,94 +0,0 @@ -# site/ Fork Checklist - -Run this the moment `site/` is forked from `certified.richerhealth.ca`. Everything here is a one-shot onboarding for the submitted code's own quality gates. Root-level webster gates (JSON schemas, findings validator, markdownlint) already run against the repo and will continue to; this page covers what to add _inside_ `site/`. - -## Build surface - -- `site/package.json` exists with Astro scripts -- `site/bun.lock` committed -- `bun install --frozen-lockfile` in `site/` succeeds on CI -- `bun run build` in `site/` succeeds (will flip on the `site-build` job in `.github/workflows/test.yml`) - -## site/ toolchain to install - -```bash -cd site -bun add -D @astrojs/check astro-eslint-parser eslint-plugin-astro prettier-plugin-astro -``` - -## site/eslint.config.js - -```js -import js from "@eslint/js"; -import tseslint from "typescript-eslint"; -import astro from "eslint-plugin-astro"; - -export default tseslint.config( - { ignores: ["dist", ".astro", "node_modules"] }, - js.configs.recommended, - ...tseslint.configs.strict, - ...astro.configs.recommended, -); -``` - -## site/.prettierrc (inherits from root) - -```json -{ "plugins": ["prettier-plugin-astro"] } -``` - -## site/package.json scripts - -```json -{ - "scripts": { - "dev": "astro dev", - "build": "astro check && astro build", - "preview": "astro preview", - "lint": "eslint . --cache --max-warnings 0", - "format:check": "prettier --check .", - "type-check": "astro check" - } -} -``` - -## Root workflow flips - -Once `site/package.json` exists, the `site-build` job in `.github/workflows/test.yml` starts running. 
Additions to make at the same time: - -- Add `site-lint` job running `bun run lint --max-warnings 0` in `site/` -- Add `site-format` job running `bun run format:check` in `site/` -- Remove `continue-on-error` from any remaining site-build steps once it's stable - -## Pre-commit hook bump - -When `site/` lands, append to `.husky/pre-commit`: - -```sh -if [ -d site ]; then - (cd site && bun run lint --max-warnings 0 && bun run format:check) || exit 1 -fi -``` - -## Playwright (Day 5 polish, optional) - -If time holds after core fan-out + redesigner works: - -```bash -cd site -bun add -D @playwright/test -bunx playwright install chromium -``` - -One smoke test confirming the redesigned LP renders and the Acuity booking CTA is present at `site/tests/hero.spec.ts`. Run in CI matrix against Cloudflare preview URLs. - -## Do NOT add preemptively - -These buy nothing until `site/` exists, and installing them now balloons the root `node_modules`: - -- `astro` / `@astrojs/cloudflare` -- `eslint-plugin-astro` / `astro-eslint-parser` -- `prettier-plugin-astro` -- `@playwright/test` - -They go in `site/package.json` when `site/` lands. diff --git a/context/VIDEO-PLAN-90s.md b/context/VIDEO-PLAN-90s.md deleted file mode 100644 index 1735ee0..0000000 --- a/context/VIDEO-PLAN-90s.md +++ /dev/null @@ -1,189 +0,0 @@ -# Demo Video Plan — 90-second cut (genealogy spine) - -> Active spec for the 2026-04-26 hackathon submission. Supersedes `context/VIDEO-PLAN.md` for this submission. The 180s plan in `VIDEO-PLAN.md` is preserved as historical artifact for any post-submission re-cut. - -## Why 90s, not 180s - -Submission deadline is 2026-04-26 (one calendar day from 2026-04-25). Nicolette interview window is post-deadline, so Beat 1 of the original 180s plan is unavailable. Without Beat 1 the 180s arc loses its emotional opener. 
Recasting around the genealogy moment — the only capability that strictly requires Opus 4.7 reasoning — produces a tighter, harder-hitting submission than a degraded 3-min cut. - -## Hard constraints - -- **Deadline**: 2026-04-26 -- **Length target**: 90s (acceptable floor: 80s) -- **Voice**: Richie records; no AI voice; no Nicolette clip -- **Composition stack**: Forge Remotion (per locked decision in `VIDEO-PLAN.md`) -- **Real artifacts only**: every on-screen artifact is a real file path / commit / agent name from the Webster repo. No fabrication. -- **Spine**: runtime agent genealogy — Webster spawned `visual-design-critic` in the live `history/2026-04-23/` council run - -## 5-beat structure - -| # | Beat | Time | Spoken (~) | Hero visual | Asset source | -| --- | ---------- | --------- | ---------- | ------------------------------------------------------------------------- | ----------------------------------------------------------- | -| 1 | Hook | 0:00–0:10 | 17 words | Black → "Webster" text card → council diagram zoom | Forge-Remotion-authored title card | -| 2 | Setup | 0:10–0:25 | 29 words | 7-node council fan-out animation | Forge Remotion comp from `agents/*.json` registry | -| 3 | The Moment | 0:25–0:55 | 61 words | Cursor scrolling `spec.json` → terminal `POST /v1/agents` → first finding | `history/2026-04-23/genealogy/spec.json` + screen recording | -| 4 | Receipt | 0:55–1:15 | 35 words | Genealogy log on screen → before/after LP morph | `history/2026-04-23/genealogy/` + sim `final-sheet.png` | -| 5 | Frame | 1:15–1:30 | 27 words | Feature grid page scroll → end card | `assets/feature-grid/index.html` (Tier 3 item #1 output) | - -Total spoken: ~169 words at ~130 wpm = ~78s. Buffer: 12s of silence/visual breathing, distributed unevenly (most at end of Beat 3 and Beat 5). - -## VO recording script (Richie reads top to bottom) - -Read at ~130 wpm. Record per-line takes; 3-5 takes minimum per line. 
Pauses between sentences are fine — they get cut at edit. Save raw takes to `assets/voiceover/raw/<beat>-<take>.wav` (or `.mp3`). External mic only — laptop mic kills credibility. - -### Beat 1 — Hook (~7s spoken) - -**Take 1** (declarative, intimate, set the tone for the whole video): - -> "I taught a website to improve itself." - -**Take 2** (matter-of-fact, slightly faster): - -> "Every week, a council of Claude agents debates it." - -### Beat 2 — Setup (~12s spoken) - -**Take 1** (factual, clipped): - -> "Seven Managed Agents — five critics, a planner, a redesigner." - -**Take 2** (slightly slower, give "audit" weight): - -> "Each owns one slice of the audit." - -**Take 3** (three-beat rhythm, slight pause between): - -> "They propose. Apply. Verify." - -### Beat 3 — The Moment (~26s spoken — the hero beat) - -Slow ~10% on this beat. The phrase "they wrote a new agent" is the emotional pivot — let it breathe. - -**Take 1** (date-stamp opener, factual): - -> "April twenty-third — they hit a problem nothing could solve." - -**Take 2** (recite the three categories with slight pauses between, mirroring the visual cut): - -> "Three critics flagged the same kind of issue. Hero imagery. Layout rhythm. Visual hierarchy." - -**Take 3** (declarative, slight resignation in tone): - -> "All three said it was outside their scope." - -**Take 4** (the punchline — slow down, lean in): - -> "So they wrote a new agent. With code. From scratch." - -### Beat 4 — Receipt (~16s spoken) - -**Take 1** (clipped, technical, receipts-energy): - -> "Visual-design-critic. Sonnet four-point-six." - -**Take 2** (factual, even): - -> "Registered through the Managed Agents API. Same session." - -**Take 3** (declarative, slight pride): - -> "Six critics now. Append-only. Every spawn auditable." - -**Take 4** (warmth — the only soft moment in the video, slow it down): - -> "Like growing a new sense."
- -### Beat 5 — Frame (~12s spoken) - -**Take 1** (bookend energy, mirrors Beat 1 opener): - -> "A website that improves itself." - -**Take 2** (declarative, hackathon claim — slight smile in voice): - -> "Built with Opus four-point-seven." - -**Take 3** (read the URL in natural English: "github" + "dot com" as words, not letters): - -> "Receipts at github dot com slash richsak slash webster." - -## Recording order recommendation - -Record in this order to warm up your voice: - -1. **Beat 4 takes** (technical, low stakes, easy warm-up) -2. **Beat 5 takes** (3 short lines, closer energy) -3. **Beat 2 takes** (mid-stakes, three-beat rhythm) -4. **Beat 1 takes** (set the tone — your voice should be warm here, not stiff) -5. **Beat 3 takes** (highest stakes — save your best vocal energy for the hero beat, do these last when you're warmest) - -Total session length target: 30-45 min including retakes. - -## Anti-goals - -- Don't try to "sound like a podcast voiceover." That's the AI-slop voice. Sound like you. -- Don't rush Beat 3 punchlines. "From scratch." needs air around it. -- Don't add words. Read the locked script verbatim. Tone variation only. -- Don't pronounce "github.com" as "github-dot-com." Read "dot com" as natural words. -- Don't go up at the end of Beat 5. Land flat. -- Don't try one continuous take. Per-line takes give the editor (or composition session) room. - -## Forge Remotion composition handoff brief - -The video composition session reads this doc and assembles in Forge Remotion. Order: - -### 1. 
Asset readiness check (composition session blocks until all green) - -- [ ] `assets/voiceover/raw/` populated with per-line takes per the 5-beat script above -- [ ] `history/2026-04-23/genealogy/spec.json` — exists (verified 2026-04-25) -- [ ] `history/2026-04-23/genealogy/rationale.md` — exists (verified 2026-04-25) -- [ ] `demo-output/lp/week-{1,10}/screenshots/1440/index.png` — for Beat 4 before/after morph (T8 sim run output) -- [ ] `assets/feature-grid/index.html` — for Beat 5 scroll (Tier 3 item #1 output) -- [ ] Council fan-out diagram source — Forge Remotion authors from scratch using `agents/*.json` names - -### 2. Composition phases - -1. **Beat 1**: Title card (Forge Remotion text comp). Black → white "Webster" lockup → quick scale-zoom into council diagram (transition into Beat 2). VO Take 1 + Take 2 layered. -2. **Beat 2**: Council fan-out animation. 7 nodes appearing in sequence (planner → 5 critics → redesigner). Use real agent names from `agents/*.json` as node labels. VO Takes 1-3 layered with the node-appearance choreography. -3. **Beat 3**: Cursor-scrolling-`spec.json` screen recording → cut to terminal showing `POST /v1/agents` (mocked from real session record at `history/2026-04-23/genealogy/session.json`) → cut to first finding rendering. VO Takes 1-4 layered with cuts. SFX stinger optional on "they wrote a new agent." -4. **Beat 4**: Genealogy folder structure on screen (real `history/2026-04-23/genealogy/` listing) → before/after morph of Richer Health LP at week 0 vs week 10 (sim outputs). VO Takes 1-4 layered. Soften visual energy for Take 4 ("Like growing a new sense"). -5. **Beat 5**: Quick scroll-through of `assets/feature-grid/index.html` → end card with "Built with Opus 4.7" lockup + GitHub URL + small QR code linking to repo. VO Takes 1-3 layered. - -### 3. Length check - -- Target: 90s. Floor: 80s. -- If overshoot: trim Beat 2 first (drop "They propose. Apply. Verify." — saves ~3s) → trim Beat 4 by removing "Append-only. 
Every spawn auditable." (saves ~3s) → only trim Beat 3 if absolutely required. -- If undershoot: do NOT pad. Submit short. - -### 4. Output format - -- 1080p MP4 -- Confirm Cerebral Valley submission format requirements before bake. - -### 5. Pre-submission gates - -- Verify Beat 4 Take 1 narration matches what actually spawned: `visual-design-critic` (NOT visual-reviewer or visual-critic — exact name from `history/2026-04-23/genealogy/spec.json`) -- Verify Beat 2 agent count matches reality at the time of the demonstrated council run (5 critics + planner + redesigner = 7, before the spawn) -- Verify Beat 4 post-spawn count: 6 critics -- Watch end-to-end at least once on a phone (most judges watch on phones) - -### 6. Submit - -- Cerebral Valley form -- GitHub repo link -- DM if relevant - -## Open dependencies before composition can run - -| Asset | Status | Owner | Blocks | -| ---------------------------------------------------- | ------------------------------- | ---------------------- | ------------ | -| Voiceover takes | not recorded | Richie | Beats 1-5 | -| Sim outputs (`demo-output/lp/week-{1,10}/`) | not run | T0-T13 finishing track | Beat 4 morph | -| Feature grid HTML (`assets/feature-grid/index.html`) | not built | Tier 3 item #1 session | Beat 5 | -| Genealogy artifacts | available (verified 2026-04-25) | n/a | n/a | - -## When in doubt - -- Tone questions: read the existing VO script in `VIDEO-PLAN.md` for register reference (warm, first-person, conversational; not declamatory). -- Asset ambiguity: surface `[STUCK]` to Richie. Do not silently improvise. -- Anything else: the spine is genealogy. If a composition decision pulls focus away from the genealogy moment (Beat 3), reject it.
diff --git a/context/VIDEO-PLAN.md b/context/VIDEO-PLAN.md deleted file mode 100644 index 3a3c28f..0000000 --- a/context/VIDEO-PLAN.md +++ /dev/null @@ -1,665 +0,0 @@ -# Demo Video Plan — Webster Hackathon - -> **SUPERSEDED for the 2026-04-26 submission cut.** See `context/VIDEO-PLAN-90s.md` for the active 90-second genealogy-spine spec. This doc is preserved as a historical artifact (180s 6-beat plan with Nicolette clip in Beat 1) for any post-submission re-cut. -> Session-durable working doc. Evolves across compactions. If resuming mid-session, read this first, then `VISION.md`. - -## Session purpose - -Grill through the 6-beat demo video arc until every decision is locked. Output is a shot-list / narrative spec that the video-composition session (fresh Claude Code + Forge Remotion) builds against. This is NOT the simulation implementation track (that's `EXPANSION-TASKS.md`). - -## Hard constraints - -- **Deadline**: 2026-04-28 (Built with Opus 4.7 by Anthropic × Cerebral Valley) -- **Today**: 2026-04-24 (4 days to submission, 3 full work days) -- **Locked scope**: `VISION.md` — two substrates, 10 weeks × 2 sims, hybrid memory, pure-organic genealogy -- **Human-in-loop assets** (Richie records separately): - - Nicolette clip (A/B testing pain) - - Voiceover narration - - Onboarding skill role-play (contractor persona) - - Memory Stores Console screenshots -- **Composition stack**: Claude Design (claude.ai/design, research preview, code-powered animations + UI) for diagram + artifact micro-UIs; Forge Remotion for final video composition (voiceover + Nicolette clip + Claude Design output + screenshot timelapses + transitions). Deployed via fresh session after sim assets exist.
- -## Implementation track status (as of compaction) - -- T0 Pass-7 fixes: ✓ shipped -- T1 memory stores: ✓ shipped -- T2 sim agent specs: ✓ shipped -- T3 substrate contexts: ✓ shipped -- T4 ugly baselines: ✓ shipped -- T5 Synthetic Analytics Agent: next -- T6–T10: queued - -Video planning runs in parallel with T5–T10. The video-composition session happens AFTER T10 when all assets exist. - -## Video arc (6 beats) - -1. **Problem** — Nicolette clip (manual A/B testing pain) -2. **Solution intro** — Voiceover + council UI animation -3. **LP timelapse** — Richer Health 10-week timelapse (one veto/skip beat) -4. **Site timelapse** — Northwest Home Renovations 10-week timelapse -5. **Genealogy reveal** — Memory Stores Console + spawn moment -6. **Close** — Tagline, CTA, Anthropic framing - -## Grill status - -| Beat | Budget (target/floor) | Status | Next action | -| ---- | --------------------- | ----------------------------------- | -------------------------------- | -| 1 | 45s / 35s | LOCKED (talking points + drop list) | Richie sends to Nicolette | -| 2 | 35s / 25s | LOCKED | (composition session implements) | -| 3 | 35s / 25s | LOCKED | (composition session implements) | -| 4 | 30s / 22s | LOCKED | (composition session implements) | -| 5 | 25s / 18s | LOCKED with fallback rules | (composition session implements) | -| 6 | 10s / 8s | LOCKED | (composition session implements) | - -## Composition session brief (handoff start) - -This document is the locked spec. The video-composition session reads it and executes. Order: - -1. 
**Confirm asset readiness**: - - `demo-output/lp/week-{1..10}/` populated (screenshots at 375/768/1440, manifests, council reasoning, analytics.json) - - `demo-output/site/week-{1..10}/` populated - - Memory Stores Console screenshots captured at relative weeks (Richie) - - Nicolette clip recorded (Richie) - - Voiceover recorded per Beat 2/3/4/5/6 scripts (Richie) - - Real artifact bodies extracted: best critic finding (Beat 2 Window 1) + actual `POST /v1/agents` request+response (Beat 2 Window 2) - - If genealogy didn't spawn → see Beat 5 fallback rules; do NOT silently improvise - -2. **Build animated assets in Claude Design** (claude.ai/design): - - Beat 2: council diagram + 2 artifact windows (per Beat 2 spec, including 5 critic nodes with `xxx-critic` + role subtitles) - - Beat 3 + Beat 4: animated bounce-rate line chart, fed by `analytics.json` per substrate - - Beat 5: composite Console-styled UI with genealogy tree, week N captions, spawn animation - - Beat 6: final frame composition (Webster wordmark + "Built with Opus 4.7" lockup + GitHub URL + small QR) - -3. **Compose in Forge Remotion**: - - Sequence: Beat 1 (45s) → Beat 2 (35s) → Beat 3 (35s) → Beat 4 (30s) → Beat 5 (25s) → Beat 6 (10s) = 180s - - Embed Claude Design output via `` / `