From 8fa025f18546ea5fdb1a9cdc52ec6cad97dc1fcd Mon Sep 17 00:00:00 2001 From: Axel Delafosse Date: Wed, 25 Mar 2026 14:19:36 -0700 Subject: [PATCH 1/4] Add clean-vm agent skill --- .agents/skills/clean-vm/SKILL.md | 155 +++++++++++++++++++++ .agents/skills/clean-vm/agents/openai.yaml | 7 + 2 files changed, 162 insertions(+) create mode 100644 .agents/skills/clean-vm/SKILL.md create mode 100644 .agents/skills/clean-vm/agents/openai.yaml diff --git a/.agents/skills/clean-vm/SKILL.md b/.agents/skills/clean-vm/SKILL.md new file mode 100644 index 0000000..78a2e59 --- /dev/null +++ b/.agents/skills/clean-vm/SKILL.md @@ -0,0 +1,155 @@ +--- +name: clean-vm +description: Safely clean a local development VM when repeated loop, Claude, or Codex runs leave behind stale tmux sessions, orphaned agent processes, inactive Next.js or Storybook servers, closed-browser clutter, or unused git worktrees. Use this skill for machine cleanup, port/process triage, and reclaiming loop-created worktrees without disrupting active work. +--- + +# Clean VM + +Use this skill when the machine has stale local dev state and you need a careful cleanup pass. +Prefer proving an item is inactive over killing it by name. + +## Safety Rules + +- Start with detection. Do not kill or remove anything until you can explain why it is stale. +- Treat loop manifests in `~/.loop/runs/*/*/manifest.json` as the source of truth for paired runs. +- A loop run is active when its manifest state is `submitted`, `working`, `reviewing`, or `input-required` and either its `pid` is alive or its `tmuxSession` still exists. +- Never kill or remove anything tied to an attached tmux session. +- Never remove the current repo checkout or the worktree containing `pwd`. +- Never delete a dirty worktree automatically. Report it and leave it alone unless the user explicitly asks to discard changes. +- Avoid broad `pkill` patterns. Prefer per-PID `kill -TERM` after inspection. +- Browser closure is destructive. 
Only do it when the user explicitly wants browser cleanup as part of the VM reset. + +## Workflow + +### 1. Snapshot loop state + +Collect the current state first. + +```bash +tmux ls 2>/dev/null +tmux list-panes -a -F '#{session_name} #{pane_dead} #{pane_current_command} #{pane_current_path}' 2>/dev/null +find ~/.loop/runs -maxdepth 5 -name manifest.json 2>/dev/null +git worktree list --porcelain +``` + +For loop manifests, inspect `cwd`, `pid`, `state`, `updatedAt`, and `tmuxSession`. +Useful states from loop are: + +- active: `submitted`, `working`, `reviewing`, `input-required` +- inactive: `completed`, `failed`, `stopped` + +If a manifest claims to be active but both the `pid` and `tmuxSession` are gone, treat it as stale. +If a manifest still looks active but `updatedAt` is very old, treat it as suspicious and report it before killing anything that still has a live PID. + +### 2. Classify Claude and Codex processes + +Prefer loop-aware checks before process-name matching. + +For each manifest candidate: + +1. Check whether the `pid` is still alive. +2. Check whether the `tmuxSession` still exists with `tmux has-session -t `. +3. If both are gone, the run is stale. + +For non-loop Claude or Codex processes, only kill them if you can prove they are orphaned or tied to stale loop state. +Inspect first: + +```bash +pgrep -af '(^|/)(claude|codex)( |$)' +ps -o pid=,ppid=,etime=,tty=,command= -p +lsof -a -d cwd -p +``` + +Safer rule: + +- kill only when the process is detached from a live tmux session, not tied to an active manifest, and clearly belongs to stale local work +- otherwise report it and leave it running + +When killing, use: + +```bash +kill -TERM +sleep 2 +kill -0 2>/dev/null && kill -KILL +``` + +### 3. Clean inactive Next.js and Storybook servers + +Only target dev servers that are not part of active work. +Inspect listening processes and map them back to a cwd before killing them. 
+ +```bash +lsof -nP -iTCP -sTCP:LISTEN | grep -E 'node|next|storybook' +pgrep -af 'next dev|next-server|storybook|start-storybook' +lsof -a -d cwd -p +tmux list-panes -a -F '#{session_name} #{pane_dead} #{pane_current_command} #{pane_current_path}' 2>/dev/null +``` + +Good cleanup candidates: + +- server process cwd belongs to a loop worktree whose tmux session is gone +- server process cwd is not open in any live tmux pane +- server process cwd belongs to a repo/worktree with no active manifest +- long-lived local dev server with no attached tmux and no recent interactive owner + +Do not kill a server just because its command contains `node`. + +### 4. Close browser windows only on explicit cleanup requests + +This is macOS-only and destructive. Skip it on non-macOS hosts or if the user did not ask for browser cleanup. + +Use AppleScript and report failures instead of retrying aggressively: + +```bash +osascript -e 'tell application "System Events" to if exists process "Google Chrome" then tell application "Google Chrome" to close every window' +osascript -e 'tell application "System Events" to if exists process "Safari" then tell application "Safari" to close every window' +``` + +If automation permissions block the command, report that and continue. + +### 5. Remove unused worktrees carefully + +Use `git worktree list --porcelain` to classify worktrees. 
+ +Safe removals: + +- entries already marked `prunable` +- missing loop-created worktrees after `git worktree prune` +- clean loop-created worktrees whose matching run is stale, whose tmux session is gone, and whose path is not open in a live tmux pane + +Inspect before removing: + +```bash +git worktree list --porcelain +git -C status --short +tmux list-panes -a -F '#{session_name} #{pane_dead} #{pane_current_path}' 2>/dev/null +``` + +Rules: + +- never remove the main worktree +- never remove the worktree that contains the current shell cwd +- never remove a worktree referenced by an active manifest `cwd` +- if the worktree is dirty, report it and skip it + +Cleanup commands: + +```bash +git worktree prune +git worktree remove +git worktree remove --force +``` + +Use `--force` only after a plain `git worktree remove ` fails because the worktree is locked or still registered elsewhere. A dirty `git status --short` result still means `skip`, even if `--force` would succeed. + +## Report + +End with a short cleanup report that includes: + +- processes killed, with PID and reason +- dev servers stopped, with cwd and reason +- browser actions taken or skipped +- worktrees pruned or removed +- anything suspicious you left alone because it was active, dirty, or ambiguous + +If any item is ambiguous, prefer `skipped` over `cleaned`. diff --git a/.agents/skills/clean-vm/agents/openai.yaml b/.agents/skills/clean-vm/agents/openai.yaml new file mode 100644 index 0000000..2e60d74 --- /dev/null +++ b/.agents/skills/clean-vm/agents/openai.yaml @@ -0,0 +1,7 @@ +interface: + display_name: "Clean VM" + short_description: "Clean stale loop runs, tabs, and worktrees" + default_prompt: "Use $clean-vm to clean this loop VM without touching active sessions." 
+ +policy: + allow_implicit_invocation: false From 8dd957028be32734e7029659ceb9b27f86abd4ee Mon Sep 17 00:00:00 2001 From: Axel Delafosse Date: Wed, 25 Mar 2026 14:29:57 -0700 Subject: [PATCH 2/4] Add clean-vm skill --- .agents/skills/clean-vm/SKILL.md | 150 ++---- .agents/skills/clean-vm/agents/openai.yaml | 2 +- .agents/skills/clean-vm/scripts/clean_vm.py | 545 ++++++++++++++++++++ 3 files changed, 587 insertions(+), 110 deletions(-) create mode 100644 .agents/skills/clean-vm/scripts/clean_vm.py diff --git a/.agents/skills/clean-vm/SKILL.md b/.agents/skills/clean-vm/SKILL.md index 78a2e59..f6c9349 100644 --- a/.agents/skills/clean-vm/SKILL.md +++ b/.agents/skills/clean-vm/SKILL.md @@ -1,146 +1,78 @@ --- name: clean-vm -description: Safely clean a local development VM when repeated loop, Claude, or Codex runs leave behind stale tmux sessions, orphaned agent processes, inactive Next.js or Storybook servers, closed-browser clutter, or unused git worktrees. Use this skill for machine cleanup, port/process triage, and reclaiming loop-created worktrees without disrupting active work. +description: "Safely clean the local loop VM by reporting and removing stale loop runs, inactive Next.js or Storybook servers, optional browser windows, and unused loop-created worktrees without disturbing active tmux-backed sessions." --- # Clean VM -Use this skill when the machine has stale local dev state and you need a careful cleanup pass. -Prefer proving an item is inactive over killing it by name. +Use this skill when repeated loop, Claude, or Codex runs leave the machine in a bad state. +Start with the bundled script. It does a dry run by default and only mutates the machine with `--apply`. -## Safety Rules - -- Start with detection. Do not kill or remove anything until you can explain why it is stale. -- Treat loop manifests in `~/.loop/runs/*/*/manifest.json` as the source of truth for paired runs. 
-- A loop run is active when its manifest state is `submitted`, `working`, `reviewing`, or `input-required` and either its `pid` is alive or its `tmuxSession` still exists. -- Never kill or remove anything tied to an attached tmux session. -- Never remove the current repo checkout or the worktree containing `pwd`. -- Never delete a dirty worktree automatically. Report it and leave it alone unless the user explicitly asks to discard changes. -- Avoid broad `pkill` patterns. Prefer per-PID `kill -TERM` after inspection. -- Browser closure is destructive. Only do it when the user explicitly wants browser cleanup as part of the VM reset. - -## Workflow - -### 1. Snapshot loop state - -Collect the current state first. - -```bash -tmux ls 2>/dev/null -tmux list-panes -a -F '#{session_name} #{pane_dead} #{pane_current_command} #{pane_current_path}' 2>/dev/null -find ~/.loop/runs -maxdepth 5 -name manifest.json 2>/dev/null -git worktree list --porcelain -``` - -For loop manifests, inspect `cwd`, `pid`, `state`, `updatedAt`, and `tmuxSession`. -Useful states from loop are: - -- active: `submitted`, `working`, `reviewing`, `input-required` -- inactive: `completed`, `failed`, `stopped` - -If a manifest claims to be active but both the `pid` and `tmuxSession` are gone, treat it as stale. -If a manifest still looks active but `updatedAt` is very old, treat it as suspicious and report it before killing anything that still has a live PID. - -### 2. Classify Claude and Codex processes - -Prefer loop-aware checks before process-name matching. - -For each manifest candidate: - -1. Check whether the `pid` is still alive. -2. Check whether the `tmuxSession` still exists with `tmux has-session -t `. -3. If both are gone, the run is stale. +## Default Workflow -For non-loop Claude or Codex processes, only kill them if you can prove they are orphaned or tied to stale loop state. -Inspect first: +1. 
Run a dry run first: ```bash -pgrep -af '(^|/)(claude|codex)( |$)' -ps -o pid=,ppid=,etime=,tty=,command= -p -lsof -a -d cwd -p +python3 .agents/skills/clean-vm/scripts/clean_vm.py ``` -Safer rule: +2. Review the report. The script only targets: -- kill only when the process is detached from a live tmux session, not tied to an active manifest, and clearly belongs to stale local work -- otherwise report it and leave it running +- loop manifests under `~/.loop/runs/` +- loop helper processes tied to stale run dirs +- Next.js and Storybook servers running inside stale loop worktrees +- loop-created worktrees from `git worktree list --porcelain` -When killing, use: +3. Apply the cleanup once the plan looks safe: ```bash -kill -TERM -sleep 2 -kill -0 2>/dev/null && kill -KILL +python3 .agents/skills/clean-vm/scripts/clean_vm.py --apply ``` -### 3. Clean inactive Next.js and Storybook servers - -Only target dev servers that are not part of active work. -Inspect listening processes and map them back to a cwd before killing them. - -```bash -lsof -nP -iTCP -sTCP:LISTEN | grep -E 'node|next|storybook' -pgrep -af 'next dev|next-server|storybook|start-storybook' -lsof -a -d cwd -p -tmux list-panes -a -F '#{session_name} #{pane_dead} #{pane_current_command} #{pane_current_path}' 2>/dev/null -``` - -Good cleanup candidates: - -- server process cwd belongs to a loop worktree whose tmux session is gone -- server process cwd is not open in any live tmux pane -- server process cwd belongs to a repo/worktree with no active manifest -- long-lived local dev server with no attached tmux and no recent interactive owner - -Do not kill a server just because its command contains `node`. - -### 4. Close browser windows only on explicit cleanup requests - -This is macOS-only and destructive. Skip it on non-macOS hosts or if the user did not ask for browser cleanup. - -Use AppleScript and report failures instead of retrying aggressively: +4. 
Only close browser windows when the user explicitly wants browser cleanup: ```bash -osascript -e 'tell application "System Events" to if exists process "Google Chrome" then tell application "Google Chrome" to close every window' -osascript -e 'tell application "System Events" to if exists process "Safari" then tell application "Safari" to close every window' +python3 .agents/skills/clean-vm/scripts/clean_vm.py --apply --browsers ``` -If automation permissions block the command, report that and continue. +## Safety Rules -### 5. Remove unused worktrees carefully +- Always inspect the dry run before using `--apply`. +- Treat loop manifests in `~/.loop/runs` as the source of truth for paired runs. +- Keep any run whose manifest state is `submitted`, `working`, `reviewing`, or `input-required` and whose `pid` or `tmuxSession` is still live. +- Never mass-kill `claude`, `codex`, or `node`. Kill only per PID after the script proves the process belongs to stale loop state. +- Never remove the main worktree, the worktree containing the current `pwd`, or a dirty worktree. +- The script does not auto-force worktree removal. If a plain `git worktree remove` fails, it reports the failure and leaves escalation to a manual follow-up. +- Treat any live tmux session as in use even if the manifest looks stale. +- Browser cleanup is opt-in and macOS-only. -Use `git worktree list --porcelain` to classify worktrees. 
+## What the Script Checks -Safe removals: +- repo identity via `git rev-parse --git-common-dir`, using the same repo id scheme as loop +- run manifests under `~/.loop/runs` +- tmux liveness with `tmux has-session -t ` +- helper processes whose command line references a stale run dir +- dev servers matching `next dev`, `next-server`, `storybook`, or `start-storybook` +- worktrees from `git worktree list --porcelain` -- entries already marked `prunable` -- missing loop-created worktrees after `git worktree prune` -- clean loop-created worktrees whose matching run is stale, whose tmux session is gone, and whose path is not open in a live tmux pane +## Manual Fallback -Inspect before removing: +If the script cannot classify something safely, leave it alone and inspect it manually: ```bash +tmux ls 2>/dev/null +tmux list-panes -a -F '#{session_name} #{pane_dead} #{pane_current_command} #{pane_current_path}' 2>/dev/null git worktree list --porcelain -git -C status --short -tmux list-panes -a -F '#{session_name} #{pane_dead} #{pane_current_path}' 2>/dev/null +lsof -nP -iTCP -sTCP:LISTEN | grep -E 'next|storybook|node' +ps -axo pid=,tty=,command= | grep -E 'claude|codex|next dev|storybook' ``` -Rules: - -- never remove the main worktree -- never remove the worktree that contains the current shell cwd -- never remove a worktree referenced by an active manifest `cwd` -- if the worktree is dirty, report it and skip it - -Cleanup commands: +Useful loop states: -```bash -git worktree prune -git worktree remove -git worktree remove --force -``` +- active: `submitted`, `working`, `reviewing`, `input-required` +- inactive: `completed`, `failed`, `stopped` -Use `--force` only after a plain `git worktree remove ` fails because the worktree is locked or still registered elsewhere. A dirty `git status --short` result still means `skip`, even if `--force` would succeed. +If a manifest claims to be active but both the `pid` and `tmuxSession` are gone, treat it as stale. 
## Report diff --git a/.agents/skills/clean-vm/agents/openai.yaml b/.agents/skills/clean-vm/agents/openai.yaml index 2e60d74..655a13b 100644 --- a/.agents/skills/clean-vm/agents/openai.yaml +++ b/.agents/skills/clean-vm/agents/openai.yaml @@ -1,7 +1,7 @@ interface: display_name: "Clean VM" short_description: "Clean stale loop runs, tabs, and worktrees" - default_prompt: "Use $clean-vm to clean this loop VM without touching active sessions." + default_prompt: "Use $clean-vm to dry-run cleanup for this loop VM, then apply it without touching active sessions." policy: allow_implicit_invocation: false diff --git a/.agents/skills/clean-vm/scripts/clean_vm.py b/.agents/skills/clean-vm/scripts/clean_vm.py new file mode 100644 index 0000000..9b97baa --- /dev/null +++ b/.agents/skills/clean-vm/scripts/clean_vm.py @@ -0,0 +1,545 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import re +import signal +import subprocess +import sys +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +ACTIVE_STATES = {"submitted", "working", "reviewing", "input-required"} +AGENT_RE = re.compile(r"(^|/)(claude|codex)(\s|$)") +LOOP_HELPER_MARKERS = ("__bridge-mcp", "__codex-tmux-proxy") +SAFE_NAME_RE = re.compile(r"[^a-z0-9-]+") +SERVER_RE = re.compile(r"next dev|next-server|storybook|start-storybook") + + +class CleanVmError(Exception): + pass + + +@dataclass +class RepoContext: + repo_id: str + repo_root: Path + start_cwd: Path + + +@dataclass +class ProcessInfo: + pid: int + tty: str + command: str + + +@dataclass +class RunInfo: + cwd: Optional[Path] + pid: Optional[int] + pid_alive: bool + run_dir: Path + run_id: str + state: str + tmux_alive: bool + tmux_session: str + + @property + def active(self) -> bool: + return self.state in ACTIVE_STATES and (self.pid_alive or self.tmux_alive) + + +@dataclass +class WorktreeInfo: + path: Path + prunable: bool + run_id: 
Optional[str] + + +def run_command( + args: list[str], cwd: Optional[Path] = None +) -> subprocess.CompletedProcess[str]: + return subprocess.run( + args, + capture_output=True, + cwd=str(cwd) if cwd else None, + text=True, + ) + + +def sanitize_base(value: str) -> str: + cleaned = SAFE_NAME_RE.sub("-", value.lower()).strip("-") + return cleaned or "loop" + + +def git_output(repo: Path, args: list[str]) -> str: + result = run_command(["git", *args], cwd=repo) + if result.returncode == 0: + return result.stdout.strip() + message = result.stderr.strip() or result.stdout.strip() or "git command failed" + raise CleanVmError(message) + + +def resolve_repo(start: Path) -> RepoContext: + repo_root = Path( + git_output(start, ["rev-parse", "--path-format=absolute", "--show-toplevel"]) + ).resolve() + common_dir = Path( + git_output(start, ["rev-parse", "--path-format=absolute", "--git-common-dir"]) + ).resolve() + label = common_dir.parent.name + seed = str(common_dir) + repo_id = f"{sanitize_base(label)}-{hashlib.sha256(seed.encode()).hexdigest()[:12]}" + return RepoContext( + repo_id=repo_id, + repo_root=repo_root, + start_cwd=start.resolve(), + ) + + +def pid_exists(pid: Optional[int]) -> bool: + if not pid or pid <= 0: + return False + try: + os.kill(pid, 0) + except OSError: + return False + return True + + +def tmux_session_exists(session: str) -> bool: + if not session: + return False + return run_command(["tmux", "has-session", "-t", session]).returncode == 0 + + +def parse_int(value: object) -> Optional[int]: + if isinstance(value, int): + return value + if isinstance(value, str) and value.isdigit(): + return int(value) + return None + + +def parse_string(value: object) -> str: + return value if isinstance(value, str) else "" + + +def normalize_run_state(state: str) -> str: + if state in ACTIVE_STATES: + return state + if state in {"active", "running"}: + return "working" + if state == "done": + return "completed" + return state + + +def is_inside(root: Path, 
child: Optional[Path]) -> bool: + if child is None: + return False + try: + child.resolve().relative_to(root.resolve()) + except ValueError: + return False + return True + + +def paths_overlap(left: Optional[Path], right: Optional[Path]) -> bool: + if left is None or right is None: + return False + return is_inside(left, right) or is_inside(right, left) + + +def ancestor_pids() -> set[int]: + protected: set[int] = set() + pid = os.getpid() + while pid > 1 and pid not in protected: + protected.add(pid) + result = run_command(["ps", "-o", "ppid=", "-p", str(pid)]) + parent = result.stdout.strip() + if not parent.isdigit(): + break + next_pid = int(parent) + if next_pid == pid: + break + pid = next_pid + return protected + + +def process_cwd(pid: int) -> Optional[Path]: + result = run_command(["lsof", "-a", "-d", "cwd", "-Fn", "-p", str(pid)]) + if result.returncode != 0: + return None + for line in result.stdout.splitlines(): + if line.startswith("n"): + return Path(line[1:]).resolve() + return None + + +def process_list() -> list[ProcessInfo]: + result = run_command(["ps", "-axo", "pid=,tty=,command="]) + items: list[ProcessInfo] = [] + for line in result.stdout.splitlines(): + parts = line.strip().split(None, 2) + if len(parts) != 3 or not parts[0].isdigit(): + continue + items.append(ProcessInfo(pid=int(parts[0]), tty=parts[1], command=parts[2])) + return items + + +def live_tmux_paths() -> list[Path]: + result = run_command( + ["tmux", "list-panes", "-a", "-F", "#{pane_dead} #{pane_current_path}"] + ) + if result.returncode != 0: + return [] + paths: list[Path] = [] + for line in result.stdout.splitlines(): + parts = line.strip().split(None, 1) + if len(parts) != 2 or parts[0] != "0": + continue + paths.append(Path(parts[1]).resolve()) + return paths + + +def load_runs(context: RepoContext) -> tuple[list[RunInfo], list[str]]: + repo_runs = Path.home() / ".loop" / "runs" / context.repo_id + runs: list[RunInfo] = [] + warnings: list[str] = [] + if not 
repo_runs.exists(): + return runs, warnings + for path in sorted(repo_runs.glob("*/manifest.json")): + try: + data = json.loads(path.read_text()) + except Exception as exc: + warnings.append(f"skipped invalid manifest {path}: {exc}") + continue + cwd_value = parse_string(data.get("cwd")) + tmux_session = parse_string(data.get("tmuxSession") or data.get("tmux_session")) + pid = parse_int(data.get("pid")) + runs.append( + RunInfo( + cwd=Path(cwd_value).resolve() if cwd_value else None, + pid=pid, + pid_alive=pid_exists(pid), + run_dir=path.parent, + run_id=parse_string(data.get("runId") or data.get("run_id")) or path.parent.name, + state=normalize_run_state( + parse_string(data.get("state") or data.get("status")) or "unknown" + ), + tmux_alive=tmux_session_exists(tmux_session), + tmux_session=tmux_session, + ) + ) + return runs, warnings + + +def parse_worktree_run_id(path: Path) -> Optional[str]: + if "-loop-" not in path.name: + return None + run_id = path.name.split("-loop-", 1)[1] + return run_id or None + + +def load_worktrees(context: RepoContext) -> list[WorktreeInfo]: + output = git_output(context.repo_root, ["worktree", "list", "--porcelain"]) + worktrees: list[WorktreeInfo] = [] + block: dict[str, str] = {} + for line in [*output.splitlines(), ""]: + if line: + key, _, value = line.partition(" ") + block[key] = value + continue + path_value = block.get("worktree") + if path_value: + path = Path(path_value).resolve() + worktrees.append( + WorktreeInfo( + path=path, + prunable="prunable" in block, + run_id=parse_worktree_run_id(path), + ) + ) + block = {} + return worktrees + + +def classify_worktrees( + context: RepoContext, runs: list[RunInfo], tmux_paths: list[Path] +) -> tuple[list[tuple[WorktreeInfo, str]], list[str], bool]: + active_run_ids = {run.run_id for run in runs if run.active} + active_cwds = [run.cwd for run in runs if run.active and run.cwd] + removable: list[tuple[WorktreeInfo, str]] = [] + notes: list[str] = [] + needs_prune = False + for 
worktree in load_worktrees(context): + if worktree.path == context.repo_root: + notes.append(f"kept main worktree {worktree.path}") + continue + if is_inside(worktree.path, context.start_cwd): + notes.append(f"kept current worktree {worktree.path}") + continue + if any(is_inside(worktree.path, cwd) for cwd in active_cwds): + notes.append(f"kept active worktree {worktree.path}") + continue + if any(is_inside(worktree.path, path) for path in tmux_paths): + notes.append(f"kept worktree open in tmux {worktree.path}") + continue + if tmux_session_exists(worktree.path.name): + notes.append(f"kept tmux-backed worktree {worktree.path}") + continue + if worktree.prunable: + needs_prune = True + notes.append(f"prunable worktree {worktree.path}") + continue + if worktree.run_id is None: + notes.append(f"skipped non-loop worktree {worktree.path}") + continue + if worktree.run_id in active_run_ids: + notes.append(f"kept run-backed worktree {worktree.path}") + continue + status = run_command(["git", "-C", str(worktree.path), "status", "--porcelain"]) + if status.stdout.strip(): + notes.append(f"skipped dirty worktree {worktree.path}") + continue + removable.append((worktree, "loop worktree is stale and clean")) + return removable, notes, needs_prune + + +def classify_run_processes( + runs: list[RunInfo], + protected: set[int], + processes: list[ProcessInfo], + tmux_paths: list[Path], +) -> tuple[list[tuple[int, str]], list[str]]: + active_roots = [run.cwd for run in runs if run.active and run.cwd] + kill: dict[int, str] = {} + notes: list[str] = [] + for run in runs: + if run.active: + notes.append(f"kept active run {run.run_id}") + continue + if run.pid and run.pid not in protected and pid_exists(run.pid) and not run.tmux_alive: + cwd = process_cwd(run.pid) + if any(is_inside(root, cwd) for root in active_roots) or any( + paths_overlap(cwd, path) for path in tmux_paths + ): + notes.append(f"left stale run pid {run.pid} alone because it is in active work") + else: + 
kill[run.pid] = f"stale loop run {run.run_id} ({run.state})" + for process in processes: + if process.pid in protected or process.pid == run.pid: + continue + if str(run.run_dir) not in process.command or run.tmux_alive: + continue + cwd = process_cwd(process.pid) + if any(is_inside(root, cwd) for root in active_roots) or any( + paths_overlap(cwd, path) for path in tmux_paths + ): + continue + if cwd is None and not any( + marker in process.command for marker in LOOP_HELPER_MARKERS + ): + notes.append( + f"left helper process {process.pid} alone because its cwd is unknown" + ) + continue + kill[process.pid] = f"helper for stale run {run.run_id}" + return sorted(kill.items()), notes + + +def classify_agent_processes( + runs: list[RunInfo], protected: set[int], processes: list[ProcessInfo] +) -> list[str]: + active_roots = [run.cwd for run in runs if run.active and run.cwd] + notes: list[str] = [] + for process in processes: + if process.pid in protected or not AGENT_RE.search(process.command): + continue + cwd = process_cwd(process.pid) + if any(is_inside(root, cwd) for root in active_roots): + continue + notes.append(f"left standalone agent process {process.pid} alone") + return notes + + +def classify_servers( + removable_worktrees: list[tuple[WorktreeInfo, str]], + runs: list[RunInfo], + protected: set[int], + processes: list[ProcessInfo], + tmux_paths: list[Path], +) -> tuple[list[tuple[int, str]], list[str]]: + removable_roots = [worktree.path for worktree, _ in removable_worktrees] + active_roots = [run.cwd for run in runs if run.active and run.cwd] + kill: list[tuple[int, str]] = [] + notes: list[str] = [] + for process in processes: + if process.pid in protected or not SERVER_RE.search(process.command): + continue + cwd = process_cwd(process.pid) + if cwd is None: + notes.append(f"left server {process.pid} alone because cwd is unknown") + continue + if any(paths_overlap(cwd, path) for path in tmux_paths): + notes.append(f"kept tmux-backed server 
def terminate_pid(pid: int, apply: bool) -> str:
    """Stop one process, escalating from SIGTERM to SIGKILL.

    Args:
        pid: Id of the process to stop. Must be positive.
        apply: When false nothing is signalled; the dry-run phrase is returned.

    Returns:
        A short status phrase. ``main`` prefixes it to ``"pid <n>: <reason>"``.
    """
    if pid <= 0:
        # kill(2) treats 0 and negative ids as "my process group" / "a process
        # group" / "everything I may signal". A cleanup tool must never do that.
        return "skipped invalid"
    if not apply:
        return "would kill"
    try:
        os.kill(pid, signal.SIGTERM)
    except ProcessLookupError:
        return "already gone"
    except PermissionError:
        # The process exists but belongs to someone else; do not claim success.
        return "not permitted to kill"
    except OSError:
        return "already gone"
    # Give the process a moment to exit cleanly before forcing it.
    time.sleep(1)
    if pid_exists(pid):
        try:
            os.kill(pid, signal.SIGKILL)
        except PermissionError:
            return "not permitted to kill"
        except OSError:
            return "already gone"
    return "killed"


def prune_worktrees(context: RepoContext, apply: bool) -> str:
    """Run ``git worktree prune`` in the repo root and describe the outcome.

    Args:
        context: Repo context; only ``repo_root`` is read here.
        apply: When false the command is not run.

    Returns:
        A one-line description of what was (or would be) done.
    """
    if not apply:
        return "would run git worktree prune"
    result = run_command(["git", "worktree", "prune"], cwd=context.repo_root)
    if result.returncode == 0:
        return "ran git worktree prune"
    # Prefer stderr, then stdout, then a generic message.
    message = (
        result.stderr.strip() or result.stdout.strip() or "git worktree prune failed"
    )
    return f"failed git worktree prune: {message}"


def remove_worktree(context: RepoContext, path: Path, apply: bool) -> str:
    """Remove one worktree with ``git worktree remove`` (never ``--force``).

    Args:
        context: Repo context; only ``repo_root`` is read here.
        path: Worktree directory to remove.
        apply: When false the command is not run.

    Returns:
        A one-line description of what was (or would be) done.
    """
    if not apply:
        return f"would remove {path}"
    result = run_command(
        ["git", "worktree", "remove", str(path)], cwd=context.repo_root
    )
    if result.returncode == 0:
        return f"removed {path}"
    message = (
        result.stderr.strip() or result.stdout.strip() or "git worktree remove failed"
    )
    return f"failed to remove {path}: {message}"


def close_browsers(apply: bool) -> list[str]:
    """Close every Chrome and Safari window through AppleScript.

    Args:
        apply: When false nothing is closed; planned actions are reported.

    Returns:
        One status line per browser, or a single "skipped" line when the
        host is not macOS.
    """
    if sys.platform != "darwin":
        return ["skipped browser cleanup on non-macOS host"]
    actions: list[str] = []
    for app in ("Google Chrome", "Safari"):
        if not apply:
            actions.append(f"would close {app} windows")
            continue
        # Ask System Events whether the app is running first, so that the
        # script does not address an application that is not running.
        result = run_command(
            [
                "osascript",
                "-e",
                f'tell application "System Events" to set appRunning to exists process "{app}"',
                "-e",
                f'if appRunning then tell application "{app}" to close every window',
            ]
        )
        if result.returncode == 0:
            actions.append(f"closed {app} windows")
            continue
        message = result.stderr.strip() or result.stdout.strip() or "unknown error"
        actions.append(f"failed to close {app} windows: {message}")
    return actions


def print_section(title: str, lines: list[str]) -> None:
    """Print a titled bullet list, or ``none`` when ``lines`` is empty."""
    print(f"{title}:")
    if not lines:
        print(" none")
        return
    for line in lines:
        print(f" - {line}")


def main() -> int:
    """Parse arguments, classify stale state, act on it, and print a report.

    Returns:
        ``1`` when the repo cannot be resolved, otherwise ``0``.
    """
    parser = argparse.ArgumentParser(
        description="Clean stale loop runs, servers, browsers, and worktrees."
    )
    parser.add_argument("--apply", action="store_true", help="Apply cleanup changes.")
    parser.add_argument(
        "--browsers",
        action="store_true",
        help="Close Safari and Chrome windows on macOS.",
    )
    parser.add_argument(
        "--repo",
        default=".",
        help="Repo path or worktree path for the loop checkout to inspect.",
    )
    args = parser.parse_args()

    try:
        context = resolve_repo(Path(args.repo))
    except CleanVmError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1

    # Snapshot everything first so that all classifiers see the same state.
    runs, warnings = load_runs(context)
    processes = process_list()
    protected = ancestor_pids()
    tmux_paths = live_tmux_paths()

    removable_worktrees, worktree_notes, needs_prune = classify_worktrees(
        context, runs, tmux_paths
    )
    run_kills, run_notes = classify_run_processes(
        runs, protected, processes, tmux_paths
    )
    agent_notes = classify_agent_processes(runs, protected, processes)
    server_kills, server_notes = classify_servers(
        removable_worktrees, runs, protected, processes, tmux_paths
    )

    # The same PID can be flagged by more than one classifier. Signal it once
    # and keep the first reason, so the report has one line per process and a
    # second attempt does not wait a second only to print "already gone".
    kill_candidates: dict[int, str] = {}
    for pid, reason in [*run_kills, *server_kills]:
        kill_candidates.setdefault(pid, reason)
    process_actions: list[str] = [
        f"{terminate_pid(pid, args.apply)} pid {pid}: {reason}"
        for pid, reason in kill_candidates.items()
    ]

    worktree_actions: list[str] = []
    if needs_prune:
        worktree_actions.append(prune_worktrees(context, args.apply))
    for worktree, reason in removable_worktrees:
        outcome = remove_worktree(context, worktree.path, args.apply)
        worktree_actions.append(f"{outcome} ({reason})")

    browser_actions = (
        close_browsers(args.apply) if args.browsers else ["skipped browser cleanup"]
    )

    print(f"mode: {'apply' if args.apply else 'dry-run'}")
    print(f"repo: {context.repo_root}")
    print(f"repo id: {context.repo_id}")
    print_section("runs", warnings + run_notes)
    print_section("processes", process_actions + agent_notes)
    print_section("servers", server_notes)
    print_section("worktrees", worktree_actions + worktree_notes)
    print_section("browsers", browser_actions)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
def exact_tmux_target(session: str) -> str:
    """Return a tmux target that matches ``session`` by exact name only.

    tmux otherwise falls back to prefix and pattern matching, so a name such
    as ``loop-1`` could resolve to ``loop-10``. A leading ``=`` disables that.
    """
    return f"={session}"


def tmux_session_exists(session: str) -> bool:
    """Report whether a tmux session with exactly this name exists.

    An empty name is answered with ``False`` without invoking tmux.
    """
    if not session:
        return False
    result = run_command(["tmux", "has-session", "-t", exact_tmux_target(session)])
    return result.returncode == 0


def tmux_session_has_live_panes(session: str) -> bool:
    """Report whether any pane in any window of the session is still running.

    Args:
        session: Exact tmux session name.

    Returns:
        ``True`` when at least one pane is not marked dead and its pane
        process passes ``pid_exists`` (or its pid cannot be parsed).
        ``False`` when the session is missing, tmux fails, or every pane
        is dead.
    """
    if not tmux_session_exists(session):
        return False
    result = run_command(
        [
            "tmux",
            "list-panes",
            # Without -s the target is read as a *window*, so only the
            # session's current window would be listed and live panes in
            # other windows would be missed.
            "-s",
            "-t",
            exact_tmux_target(session),
            "-F",
            "#{pane_dead} #{pane_pid}",
        ]
    )
    if result.returncode != 0:
        return False
    for line in result.stdout.splitlines():
        fields = line.strip().split(None, 1)
        if len(fields) != 2:
            continue
        dead_flag, pane_pid = fields
        if dead_flag != "0":
            continue
        if not pane_pid.isdigit():
            # Pane is not dead but its pid is unreadable: err on the side of
            # "alive" so that nothing tied to it gets cleaned up.
            return True
        if pid_exists(int(pane_pid)):
            return True
    return False
notes.append(f"kept tmux-backed worktree {worktree.path}") continue if worktree.prunable: From 0cc6e14adce4b568e3acb2502d8376c73251e57b Mon Sep 17 00:00:00 2001 From: Axel Delafosse Date: Wed, 25 Mar 2026 15:09:32 -0700 Subject: [PATCH 4/4] Clean up stale bridge servers --- .agents/skills/clean-vm/SKILL.md | 2 + .agents/skills/clean-vm/scripts/clean_vm.py | 96 +++++++++++++++++++-- 2 files changed, 92 insertions(+), 6 deletions(-) diff --git a/.agents/skills/clean-vm/SKILL.md b/.agents/skills/clean-vm/SKILL.md index ab056c3..9550f93 100644 --- a/.agents/skills/clean-vm/SKILL.md +++ b/.agents/skills/clean-vm/SKILL.md @@ -19,6 +19,7 @@ python3 .agents/skills/clean-vm/scripts/clean_vm.py 2. Review the report. The script only targets: - loop manifests under `~/.loop/runs/` +- direct `loop __bridge-mcp <run-dir> <source>` processes for stale runs - loop helper processes tied to stale run dirs - Next.js and Storybook servers running inside stale loop worktrees - loop-created worktrees from `git worktree list --porcelain` @@ -52,6 +53,7 @@ python3 .agents/skills/clean-vm/scripts/clean_vm.py --apply --browsers - run manifests under `~/.loop/runs` - tmux liveness with exact session targets like `tmux has-session -t =<session>` plus a live-pane check from `tmux list-panes` - helper processes whose command line references a stale run dir +- direct loop bridge MCP server processes whose run dir is stale or orphaned - dev servers matching `next dev`, `next-server`, `storybook`, or `start-storybook` - worktrees from `git worktree list --porcelain` diff --git a/.agents/skills/clean-vm/scripts/clean_vm.py b/.agents/skills/clean-vm/scripts/clean_vm.py index 7901663..dfeef10 100644 --- a/.agents/skills/clean-vm/scripts/clean_vm.py +++ b/.agents/skills/clean-vm/scripts/clean_vm.py @@ -7,6 +7,7 @@ import json import os import re +import shlex import signal import subprocess import sys @@ -17,6 +18,7 @@ ACTIVE_STATES = {"submitted", "working", "reviewing", "input-required"} AGENT_RE =
BRIDGE_SUBCOMMAND = "__bridge-mcp"
LOOP_HELPER_MARKERS = ("__bridge-mcp", "__codex-tmux-proxy")
SAFE_NAME_RE = re.compile(r"[^a-z0-9-]+")
SERVER_RE = re.compile(r"next dev|next-server|storybook|start-storybook")


@dataclass
class ProcessInfo:
    # One row of the ``ps`` snapshot taken by ``process_list``.
    pid: int
    ppid: int
    tty: str
    command: str


@dataclass
class BridgeProcess:
    # A ``__bridge-mcp <run_dir> <source>`` invocation found in ``ps`` output.
    pid: int
    run_dir: Path
    source: str


def process_list() -> list[ProcessInfo]:
    """Snapshot all processes as ``ProcessInfo`` rows.

    Rows whose pid or ppid column is not numeric, or that have fewer than
    four columns, are dropped.
    """
    result = run_command(["ps", "-axo", "pid=,ppid=,tty=,command="])
    items: list[ProcessInfo] = []
    for line in result.stdout.splitlines():
        # Split at most three times so the command keeps its inner spaces.
        columns = line.strip().split(None, 3)
        if len(columns) != 4:
            continue
        pid_text, ppid_text, tty, command = columns
        if not (pid_text.isdigit() and ppid_text.isdigit()):
            continue
        items.append(
            ProcessInfo(
                pid=int(pid_text),
                ppid=int(ppid_text),
                tty=tty,
                command=command,
            )
        )
    return items


def parse_bridge_process(process: ProcessInfo) -> Optional[BridgeProcess]:
    """Recognise a ``__bridge-mcp <run_dir> <source>`` command line.

    Args:
        process: A row from the process snapshot.

    Returns:
        A ``BridgeProcess`` with the run dir resolved to an absolute path, or
        ``None`` when the subcommand is absent, an operand is missing, or the
        source is neither ``claude`` nor ``codex``.
    """
    try:
        tokens = shlex.split(process.command)
    except ValueError:
        # ``ps`` prints argv joined by spaces without shell quoting, so a
        # stray quote in a path is data, not syntax. Fall back to a plain
        # whitespace split instead of ignoring the process.
        tokens = process.command.split()
    try:
        index = tokens.index(BRIDGE_SUBCOMMAND)
    except ValueError:
        return None
    operands = tokens[index + 1 : index + 3]
    if len(operands) != 2:
        return None
    run_dir, source = operands
    if source not in {"claude", "codex"}:
        return None
    return BridgeProcess(
        pid=process.pid,
        run_dir=Path(run_dir).resolve(),
        source=source,
    )
def classify_bridge_processes(
    runs: list[RunInfo], protected: set[int], processes: list[ProcessInfo]
) -> tuple[list[tuple[int, str]], list[str]]:
    """Decide which loop bridge MCP servers are unused and can be stopped.

    A bridge is kept only when its run is active and its parent is a live
    process other than init. It is reported, not killed, when its run dir
    matches no known run.

    Args:
        runs: Known runs; ``run_dir``, ``run_id`` and ``active`` are read.
        protected: PIDs that must never be touched.
        processes: Process snapshot to scan.

    Returns:
        ``(kill, notes)``: ``kill`` holds ``(pid, reason)`` pairs and
        ``notes`` holds report lines for bridges left alone.
    """
    kill: list[tuple[int, str]] = []
    notes: list[str] = []
    runs_by_dir = {run.run_dir.resolve(): run for run in runs}
    for process in processes:
        if process.pid in protected:
            continue
        bridge = parse_bridge_process(process)
        if bridge is None:
            continue
        run = runs_by_dir.get(bridge.run_dir)
        if run is None:
            notes.append(
                f"left bridge server {bridge.pid} alone because run dir is unknown"
            )
            continue
        # When the launching agent dies the bridge is re-parented to PID 1,
        # which always exists, so a bare liveness check on ppid can never see
        # an orphan. Treat a parent of 0/1 as "no parent".
        # NOTE(review): assumes PID 1 is init/launchd, not the agent itself
        # (e.g. an agent running as PID 1 in a container) - confirm.
        parent_alive = (
            process.ppid > 1
            and process.ppid != bridge.pid
            and pid_exists(process.ppid)
        )
        if run.active and parent_alive:
            continue
        if run.active:
            detail = f"orphaned parent for active run {run.run_id}"
        else:
            detail = f"stale run {run.run_id}"
        kill.append(
            (
                bridge.pid,
                f"unused loop-bridge server for {detail} ({bridge.source})",
            )
        )
    return kill, notes
+651,7 @@ def main() -> int: print(f"repo: {context.repo_root}") print(f"repo id: {context.repo_id}") print_section("runs", warnings + run_notes) - print_section("processes", process_actions + agent_notes) + print_section("processes", process_actions + bridge_notes + agent_notes) print_section("servers", server_notes) print_section("worktrees", worktree_actions + worktree_notes) print_section("browsers", browser_actions)