From d1dac1474c26fab4832527370fd0228c51e24694 Mon Sep 17 00:00:00 2001
From: NagyVikt
Date: Thu, 23 Apr 2026 15:47:43 +0200
Subject: [PATCH] Cut executor prompt replay through checkpoint-first guidance

Guardex already warned about fragmented loops, but it still underspecified how to collapse raw terminal chatter into compact state. This tightens the repo contract, managed template, and gx prompt task loop so operators checkpoint after milestones, keep only the latest checkpoint summaries in active context, and treat execution logs as separate from reasoning context.

Constraint: Guardex does not control Codex internal prompt assembly, so the fix had to land in repo guidance and prompt surfaces rather than the launcher.
Rejected: Patch templates/scripts/codex-agent.sh invocation | launcher only starts Codex and already records compact session metadata.
Confidence: medium
Scope-risk: narrow
Directive: Keep future token-efficiency guidance anchored on checkpoint summaries and summarized tool outcomes instead of longer anti-loop prose.
Tested: node --test test/prompt.test.js; openspec validate agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43 --type change --strict Not-tested: openspec validate --specs (repo reported no items found to validate) --- AGENTS.md | 4 ++++ .../proposal.md | 8 +++++++ .../spec.md | 21 +++++++++++++++++++ .../tasks.md | 21 +++++++++++++++++++ src/context.js | 2 ++ templates/AGENTS.multiagent-safety.md | 3 +++ test/prompt.test.js | 7 +++++++ 7 files changed, 66 insertions(+) create mode 100644 openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/proposal.md create mode 100644 openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/specs/reduce-prompt-replay-checkpoint-guidance/spec.md create mode 100644 openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/tasks.md diff --git a/AGENTS.md b/AGENTS.md index 757e9e6..2a59f53 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -24,6 +24,7 @@ When writing complex features or significant refactors, use an ExecPlan (as desc - Low output alone is not a defect. A bounded run that finishes in roughly <=10 steps is usually fine; low output stretched across 20+ steps with rising input is fragmentation. - Treat obvious follow-on actions as part of the active phase; do not stop for tiny internal checkpoints. - If context grows or the session becomes fragmented, write a short working summary and continue from it. +- Checkpoint after each milestone or roughly every 15-25 tool calls: keep only `task`, `done`, `current status`, `next`, and the latest meaningful evidence; drop the raw transcript from active context. ### Token Discipline @@ -332,8 +333,11 @@ Default: less word, same proof. - Front-load scaffold/path discovery into one grouped inspection pass. Avoid serial `ls` / `find` / `rg` / `cat` retries that only rediscover the same path state. 
- Treat repeated `write_stdin`, repeated `sed` / `cat` peeks, and tiny diagnostic follow-up checks as strong negative signals. If they appear alongside climbing input cost, stop the probe loop and batch the next phase. - Tool / hook summaries stay tiny: command, status, last meaningful lines only. Drop routine hook boilerplate. +- Keep raw terminal interaction out of long-lived context. For `write_stdin` or interactive babysitting, retain only process, action sent, current result, and next action. +- Keep execution log separate from reasoning context: full commands/stdout belong in logs, while prompt context keeps only the latest 1-2 checkpoints plus the newest tool-result summary. - Treat local edit/commit, remote publish/PR, CI diagnosis, and cleanup as bounded phases. Do not spend fresh narration or approval turns on obvious safe follow-ons inside an already authorized phase unless the risk changes. - When a session turns fragmented, collapse back to inspect once, patch once, verify once, and summarize once. +- Use a fixed checkpoint shape when compacting: `Task`, `Done`, `Current status`, and `Next`. - Keep `.omx/notepad.md` lean: live handoffs only. Use exactly `branch`, `task`, `blocker`, `next step`, and `evidence`; move narrative proof into OpenSpec artifacts, PRs, or command output. 
## OMX Caveman Style diff --git a/openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/proposal.md b/openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/proposal.md new file mode 100644 index 0000000..8c13c11 --- /dev/null +++ b/openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/proposal.md @@ -0,0 +1,8 @@ +# Proposal: reduce prompt replay checkpoint guidance + +Guardex already warns against `write_stdin` churn and fragmented loops, but the repo contract still leaves too much room for raw terminal chatter, repeated progress narration, and long execution transcripts to linger in active reasoning context. This change tightens the operator-facing guidance so prompt assembly favors checkpoints and compact tool-result summaries instead of replaying the whole run. + +- teach the repo contract to checkpoint after milestones or roughly every 15-25 tool calls +- require raw interactive shell chatter to collapse into process/action/result/next summaries +- teach `gx prompt` to keep execution logs separate from reasoning context +- lock the new wording with focused prompt/snippet tests diff --git a/openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/specs/reduce-prompt-replay-checkpoint-guidance/spec.md b/openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/specs/reduce-prompt-replay-checkpoint-guidance/spec.md new file mode 100644 index 0000000..7868363 --- /dev/null +++ b/openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/specs/reduce-prompt-replay-checkpoint-guidance/spec.md @@ -0,0 +1,21 @@ +# reduce-prompt-replay-checkpoint-guidance Specification + +## ADDED Requirements + +### Requirement: Token-efficiency guidance teaches checkpoint compaction +Guardex prompt and template guidance MUST tell operators to replace long rolling transcripts with compact checkpoints 
when a run becomes fragmented or crosses multiple milestones. + +#### Scenario: Managed AGENTS template covers checkpoint-only context +- **WHEN** an operator uses the managed multi-agent safety template for token-sensitive work +- **THEN** the template tells them to keep raw terminal interaction out of long-lived context +- **AND** it instructs them to retain only the latest one or two checkpoints plus the latest tool-result summary +- **AND** it names a fixed checkpoint shape containing `Task`, `Done`, `Current status`, and `Next` + +### Requirement: gx prompt task loop teaches transcript-vs-context separation +The `gx prompt` task loop MUST describe how to checkpoint a run instead of replaying raw terminal chatter. + +#### Scenario: Task loop output includes checkpoint instructions +- **WHEN** an operator runs `gx prompt` +- **THEN** the task loop output includes checkpoint wording that names `Task -> Done -> Current status -> Next` +- **AND** it tells operators to keep only the latest checkpoint(s) in active context +- **AND** it tells operators to summarize tool results while keeping execution logs separate from reasoning context diff --git a/openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/tasks.md b/openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/tasks.md new file mode 100644 index 0000000..50cb777 --- /dev/null +++ b/openspec/changes/agent-codex-reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43/tasks.md @@ -0,0 +1,21 @@ +# Tasks + +## 1. Spec +- [x] 1.1 Capture the prompt-replay checkpointing problem and intended guidance surfaces in `proposal.md` +- [x] 1.2 Define the prompt-guidance requirement in `spec.md` + +## 2. Tests +- [x] 2.1 Extend focused prompt/snippet tests for checkpoint compaction wording + +## 3. 
Implementation +- [x] 3.1 Tighten repo `AGENTS.md` token/context rules around checkpoints and transcript compaction +- [x] 3.2 Keep managed template parity in `templates/AGENTS.multiagent-safety.md` +- [x] 3.3 Extend `gx prompt` task-loop output with checkpoint and context-separation guidance + +## 4. Verification +- [x] 4.1 Run focused prompt integration tests + +## 5. Cleanup +- [ ] 5.1 Commit with Lore protocol message +- [ ] 5.2 Run `gx branch finish --branch "agent/codex/reduce-prompt-replay-checkpoint-guidance-2026-04-23-15-43" --base main --via-pr --wait-for-merge --cleanup` +- [ ] 5.3 Record PR URL and final `MERGED` evidence diff --git a/src/context.js b/src/context.js index 6fa31a4..4ec49da 100644 --- a/src/context.js +++ b/src/context.js @@ -447,6 +447,8 @@ const AI_SETUP_PARTS = [ 'gx branch start "" ""', 'then gx locks claim --branch "" -> inspect once -> patch once -> verify once -> gx branch finish', 'batch discovery, git/PR, and CI by phase; avoid repeated peeks or stdin loops', + 'checkpoint after each milestone: Task -> Done -> Current status -> Next; keep only the latest checkpoint(s) in active context', + 'summarize tool results, keep stdin/process chatter ephemeral, and keep execution log separate from reasoning context', ], execLines: [ 'gx branch start "" ""', diff --git a/templates/AGENTS.multiagent-safety.md b/templates/AGENTS.multiagent-safety.md index cb13f7a..f512e59 100644 --- a/templates/AGENTS.multiagent-safety.md +++ b/templates/AGENTS.multiagent-safety.md @@ -21,8 +21,11 @@ Default: less word, same proof. - Front-load scaffold/path discovery into one grouped inspection pass. Avoid serial `ls` / `find` / `rg` / `cat` retries that only rediscover the same path state. - Treat repeated `write_stdin`, repeated `sed` / `cat` peeks, and tiny diagnostic follow-up checks as strong negative signals. If they appear alongside climbing input cost, stop the probe loop and batch the next phase. 
- Tool / hook summaries stay tiny: command, status, last meaningful lines only. Drop routine hook boilerplate. +- Keep raw terminal interaction out of long-lived context. For `write_stdin` or interactive babysitting, retain only process, action sent, current result, and next action. +- Keep execution log separate from reasoning context: full commands/stdout belong in logs, while prompt context keeps only the latest 1-2 checkpoints plus the newest tool-result summary. - Treat local edit/commit, remote publish/PR, CI diagnosis, and cleanup as bounded phases. Do not spend fresh narration or approval turns on obvious safe follow-ons inside an already authorized phase unless the risk changes. - When a session turns fragmented, collapse back to inspect once, patch once, verify once, and summarize once. +- Use a fixed checkpoint shape when compacting: `Task`, `Done`, `Current status`, and `Next`. - Keep `.omx/notepad.md` lean: live handoffs only. Use exactly `branch`, `task`, `blocker`, `next step`, and `evidence`; move narrative proof into OpenSpec artifacts, PRs, or command output. 
## OMX Caveman Style diff --git a/test/prompt.test.js b/test/prompt.test.js index 93d9ec6..d561297 100644 --- a/test/prompt.test.js +++ b/test/prompt.test.js @@ -75,6 +75,8 @@ test('prompt outputs AI setup instructions', () => { assert.match(result.stdout, /gx locks claim/); assert.match(result.stdout, /inspect once -> patch once -> verify once -> gx branch finish/); assert.match(result.stdout, /avoid repeated peeks or stdin loops/); + assert.match(result.stdout, /checkpoint after each milestone: Task -> Done -> Current status -> Next/); + assert.match(result.stdout, /keep execution log separate from reasoning context/); assert.match(result.stdout, /gx finish --all/); assert.match(result.stdout, /\/opsx:propose/); assert.match(result.stdout, /https:\/\/github\.com\/apps\/pull/); @@ -104,6 +106,8 @@ test('prompt --part outputs only the selected checklist slices', () => { assert.match(result.stdout, /^Task loop:/m); assert.match(result.stdout, /gx branch start "" ""/); assert.match(result.stdout, /inspect once -> patch once -> verify once -> gx branch finish/); + assert.match(result.stdout, /checkpoint after each milestone: Task -> Done -> Current status -> Next/); + assert.match(result.stdout, /keep execution log separate from reasoning context/); assert.match(result.stdout, /^Finish:/m); assert.match(result.stdout, /gx finish --all/); assert.doesNotMatch(result.stdout, /GitGuardex \(gx\) setup checklist/); @@ -148,6 +152,9 @@ test('prompt --snippet prints the managed AGENTS template with token budget and assert.match(result.stdout, //); assert.match(result.stdout, /## Token \/ Context Budget/); assert.match(result.stdout, /Default: less word, same proof\./); + assert.match(result.stdout, /Keep raw terminal interaction out of long-lived context/); + assert.match(result.stdout, /Keep execution log separate from reasoning context/); + assert.match(result.stdout, /Use a fixed checkpoint shape when compacting: `Task`, `Done`, `Current status`, and `Next`\./); 
assert.match(result.stdout, /## OMX Caveman Style/); assert.match(result.stdout, /Answer order stays fixed: answer first, cause next, fix or next step last\./); });