diff --git a/.gitignore b/.gitignore index 77dd181..50275e8 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ build/ # MCP files openspace/config/config_mcp.json +.mcp.json # Logs logs/ @@ -46,6 +47,7 @@ showcase/.openspace/* # GDPVal benchmark cache gdpval_bench/.openspace/* !gdpval_bench/.openspace/*.db +gdpval_bench/results/ # Embedding cache embedding_cache/ @@ -73,3 +75,6 @@ openspace/skills/* node_modules/ # Frontend local dependency link frontend/node_modules + +# Local scratch +tmp/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..3854817 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,87 @@ +# AGENTS + +## Project Skill Bucket + +For this repository, the project-scoped OpenSpace skill bucket is: + +- `~/.codex/projects/openspace/skills` +- index: `~/.codex/projects/openspace/SKILL_INDEX.md` + +Routing preference for work inside this repo: + +1. project bucket `openspace` +2. shared local bucket `default` +3. common global skills + +Mirror OpenSpace's own pattern: +- first run `/Users/admin/.codex/tools/route_codex_skills_via_openspace.py` +- prefilter by skill header metadata first +- only open the most likely 1-2 `SKILL.md` files +- avoid scanning every project skill file unless the user explicitly asks + +## Codex Desktop Sidecar Evolution + +Use this workflow when the user is coding in Codex Desktop with their normal subscription login and wants OpenSpace to do post-task skill capture through the isolated `openspace_evolution` sidecar. + +Rules: +- Keep the main coding workflow unchanged. +- Do not switch the main Codex Desktop session to a provider-backed model. +- Do not modify code as part of sidecar evolution unless the user separately asks for code changes. +- Do not let OpenSpace take over the main task. +- Use the sidecar only for post-task skill capture. +- Prefer at most 1 new high-reuse skill per invocation unless the user explicitly asks for more. 
+ +When the user asks for sidecar self-evolution, call: +- `openspace_evolution.evolve_from_context` + +Trigger phrases: +- `sidecar 自进化一下` +- `做一次 sidecar 自进化` +- `对当前这轮工作做一次 sidecar 自进化` +- `用 sidecar 沉淀一个 skill` +- `基于当前改动做一次 sidecar skill capture` +- `不要改代码,做一次 sidecar 自进化` + +If the user uses one of these phrases, default to this workflow automatically unless they explicitly ask for a different behavior. + +Derive the tool inputs from: +- the current conversation +- the current `git diff` +- the key changed files + +Behavior: +- Infer a concise `task` +- Infer a concise but specific `summary` +- Pass the most relevant changed files in `file_paths` +- Use `max_skills = 1` by default +- After the tool returns, report: + - the skill name + - the skill path + - why the skill is worth keeping + +Recommended user-facing invocation: + +```text +对当前这轮工作做一次 sidecar 自进化。不要改代码,不要接管任务。请调用 openspace_evolution.evolve_from_context,基于当前对话、git diff 和关键改动,自动提炼 task/summary,最多生成 1 个高复用 skill,并告诉我 skill 名称、路径、为什么值得保留。 +``` + +## New Project Bootstrap + +If the user wants this sidecar workflow in a new repository, treat it as a project bootstrap task first. + +Bootstrap order: +- Add or update a project launcher before relying on sidecar evolution. +- Point `OPENSPACE_WORKSPACE` at the new repository root. +- Keep the user's main Codex Desktop workflow unchanged. +- Do not modify global `~/.codex` defaults unless the user explicitly asks. 
+ +Expected bootstrap outputs: +- a project-level launcher such as `scripts/codex-desktop-evolution` +- a project-level `AGENTS.md` section documenting the sidecar trigger phrases +- sidecar skill output routed to `~/.codex-openspace-desktop/projects/<project-name>/skills` + +When a user asks to initialize a new project for this workflow, default to: +- creating the launcher first +- wiring `OPENSPACE_WORKSPACE` to the repository root +- preserving the normal Codex Desktop login path +- only then enabling phrases like `sidecar 自进化一下` diff --git a/README.md b/README.md index 871298d..c15b45d 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,22 @@ pip install -e . openspace-mcp --help # verify installation ``` +> [!TIP] +> **Recommended split routing for OpenAI-compatible gateways** +> +> If your main model runs through an OpenAI-compatible provider or local relay (for example `gpt-5.4` via `http://127.0.0.1:8080/v1`), the recommended default is: +> - keep the main LLM on that provider via `OPENSPACE_LLM_*` +> - keep skill-router embeddings local via `OPENSPACE_SKILL_EMBEDDING_BACKEND=local` +> - use `OPENSPACE_SKILL_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5` +> +> Why this is the default recommendation: +> - lower latency for routing and prefilter +> - no dependence on a remote `/v1/embeddings` endpoint +> - no extra token spend for embedding generation +> - stronger main LLM still handles final reasoning and selection +> +> Architecture notes and flow diagram: [`docs/current-routing-flow.md`](docs/current-routing-flow.md) + > [!TIP] > **Slow clone?** The `assets/` folder (~50 MB of images) makes the default clone large. Use this lightweight alternative to skip it: > ```bash diff --git a/README_CN.md b/README_CN.md index 09aee71..0575f3f 100644 --- a/README_CN.md +++ b/README_CN.md @@ -157,6 +157,22 @@ pip install -e . 
openspace-mcp --help # 验证安装 ``` +> [!TIP] +> **OpenAI 兼容网关下,默认推荐双路由方案 A** +> +> 如果你的主模型走的是 OpenAI 兼容 provider 或本地 relay(例如 `gpt-5.4` 走 `http://127.0.0.1:8080/v1`),当前最推荐的默认配置是: +> - 主 LLM 继续走 `OPENSPACE_LLM_*` +> - skill router 的 embedding 走本地:`OPENSPACE_SKILL_EMBEDDING_BACKEND=local` +> - 本地 embedding 模型使用:`OPENSPACE_SKILL_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5` +> +> 这样选的原因: +> - 路由和预筛选延迟更低 +> - 不依赖远程 `/v1/embeddings` +> - embedding 不额外消耗 provider token +> - 更强的主 LLM 仍然负责最终推理和选择 +> +> 架构说明和流程图见:[`docs/current-routing-flow.md`](docs/current-routing-flow.md) + > [!TIP] > **Clone 太慢?** `assets/` 目录包含约 50 MB 的图片文件,导致仓库较大。使用以下轻量方式跳过它: > ```bash diff --git a/context/local-machine/admin-macos/README.md b/context/local-machine/admin-macos/README.md new file mode 100644 index 0000000..8e38241 --- /dev/null +++ b/context/local-machine/admin-macos/README.md @@ -0,0 +1,28 @@ +## Admin macOS Local Context + +This directory keeps machine-specific snapshots that are useful for future +deployment, migration, and debugging on other machines, while avoiding noise in +the repo root. + +Included here: + +- `mcp/repo-local.mcp.json` + - snapshot of the repo-local MCP wiring that was used during local debugging +- `gdpval_bench/...` + - selected benchmark result snapshots that were useful during local call-rate + and provider-path investigation + +Intentional choices: + +- absolute local paths are preserved because they are part of the context +- localhost API base values are preserved because they document the local stack +- secrets are not preserved + - any benchmark config copied here has API keys redacted + +Intentionally omitted from this snapshot: + +- SQLite/WAL benchmark databases +- raw recording directories +- ad hoc `tmp/` scratch files + +Those source locations remain local-only and are ignored via `.gitignore`. 
diff --git a/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke/config.json b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke/config.json new file mode 100644 index 0000000..c8a5631 --- /dev/null +++ b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke/config.json @@ -0,0 +1,19 @@ +{ + "clawwork_root": "/tmp/openspace-bench-mp4o3U", + "gdpval_path": null, + "model": "gpt-5.4", + "max_iterations": 20, + "backend_scope": [ + "shell" + ], + "use_clawwork_productivity": false, + "run_name": "codex_callrate_smoke", + "max_tasks": 3, + "per_occupation": null, + "sectors": null, + "occupations": null, + "task_ids": null, + "record_call_details": true, + "enable_evaluation": false, + "concurrency": 1 +} \ No newline at end of file diff --git a/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref/config.json b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref/config.json new file mode 100644 index 0000000..2d190a5 --- /dev/null +++ b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref/config.json @@ -0,0 +1,19 @@ +{ + "clawwork_root": "/tmp/openspace-bench-noref-JwyV4l", + "gdpval_path": null, + "model": "gpt-5.4", + "max_iterations": 20, + "backend_scope": [ + "shell" + ], + "use_clawwork_productivity": false, + "run_name": "codex_callrate_smoke_noref", + "max_tasks": 3, + "per_occupation": null, + "sectors": null, + "occupations": null, + "task_ids": null, + "record_call_details": true, + "enable_evaluation": false, + "concurrency": 1 +} \ No newline at end of file diff --git a/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref/phase1_results.jsonl b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref/phase1_results.jsonl new file mode 100644 index 0000000..933c773 --- /dev/null +++ b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref/phase1_results.jsonl @@ -0,0 +1,6 @@ +{"task_id": 
"0112fc9b-c3b2-4084-8993-5a4abb1f54f1", "phase": "phase1", "occupation": "Nurse Practitioners", "sector": "Health Care and Social Assistance", "task_value_usd": 0.0, "status": "error", "tokens": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "llm_calls": 0, "cost_usd": 0.0, "wall_time_sec": 0.01, "agent_prompt_tokens": 0, "agent_completion_tokens": 0, "agent_total_tokens": 0, "agent_llm_calls": 0, "call_details": []}, "execution": {"iterations": 0, "tool_calls": 0, "time_sec": 0.01}, "skills": {"before": 2, "after": 2, "new_this_task": 0, "evolved": [], "used": []}, "evaluation": {"has_evaluation": false}, "timestamp": "2026-04-12T02:37:44.273256"} +{"task_id": "02314fc6-a24e-42f4-a8cd-362cae0f0ec1", "phase": "phase1", "occupation": "General and Operations Managers", "sector": "Retail Trade", "task_value_usd": 0.0, "status": "error", "tokens": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "llm_calls": 0, "cost_usd": 0.0, "wall_time_sec": 0.0, "agent_prompt_tokens": 0, "agent_completion_tokens": 0, "agent_total_tokens": 0, "agent_llm_calls": 0, "call_details": []}, "execution": {"iterations": 0, "tool_calls": 0, "time_sec": 0.0}, "skills": {"before": 2, "after": 2, "new_this_task": 0, "evolved": [], "used": []}, "evaluation": {"has_evaluation": false}, "timestamp": "2026-04-12T02:37:44.278249"} +{"task_id": "02aa1805-c658-4069-8a6a-02dec146063a", "phase": "phase1", "occupation": "Project Management Specialists", "sector": "Professional, Scientific, and Technical Services", "task_value_usd": 0.0, "status": "error", "tokens": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "llm_calls": 0, "cost_usd": 0.0, "wall_time_sec": 0.0, "agent_prompt_tokens": 0, "agent_completion_tokens": 0, "agent_total_tokens": 0, "agent_llm_calls": 0, "call_details": []}, "execution": {"iterations": 0, "tool_calls": 0, "time_sec": 0.0}, "skills": {"before": 2, "after": 2, "new_this_task": 0, "evolved": [], "used": []}, "evaluation": 
{"has_evaluation": false}, "timestamp": "2026-04-12T02:37:44.282846"} +{"task_id": "0112fc9b-c3b2-4084-8993-5a4abb1f54f1", "phase": "phase1", "occupation": "Nurse Practitioners", "sector": "Health Care and Social Assistance", "task_value_usd": 0.0, "status": "error", "tokens": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "llm_calls": 0, "cost_usd": 0.0, "wall_time_sec": 0.0, "agent_prompt_tokens": 0, "agent_completion_tokens": 0, "agent_total_tokens": 0, "agent_llm_calls": 0, "call_details": []}, "execution": {"iterations": 0, "tool_calls": 0, "time_sec": 0.0}, "skills": {"before": 2, "after": 2, "new_this_task": 0, "evolved": [], "used": []}, "evaluation": {"has_evaluation": false}, "timestamp": "2026-04-12T02:38:19.523517"} +{"task_id": "02314fc6-a24e-42f4-a8cd-362cae0f0ec1", "phase": "phase1", "occupation": "General and Operations Managers", "sector": "Retail Trade", "task_value_usd": 0.0, "status": "error", "tokens": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "llm_calls": 0, "cost_usd": 0.0, "wall_time_sec": 0.0, "agent_prompt_tokens": 0, "agent_completion_tokens": 0, "agent_total_tokens": 0, "agent_llm_calls": 0, "call_details": []}, "execution": {"iterations": 0, "tool_calls": 0, "time_sec": 0.0}, "skills": {"before": 2, "after": 2, "new_this_task": 0, "evolved": [], "used": []}, "evaluation": {"has_evaluation": false}, "timestamp": "2026-04-12T02:38:19.526964"} +{"task_id": "02aa1805-c658-4069-8a6a-02dec146063a", "phase": "phase1", "occupation": "Project Management Specialists", "sector": "Professional, Scientific, and Technical Services", "task_value_usd": 0.0, "status": "error", "tokens": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "llm_calls": 0, "cost_usd": 0.0, "wall_time_sec": 0.0, "agent_prompt_tokens": 0, "agent_completion_tokens": 0, "agent_total_tokens": 0, "agent_llm_calls": 0, "call_details": []}, "execution": {"iterations": 0, "tool_calls": 0, "time_sec": 0.0}, "skills": {"before": 2, 
"after": 2, "new_this_task": 0, "evolved": [], "used": []}, "evaluation": {"has_evaluation": false}, "timestamp": "2026-04-12T02:38:19.529894"} diff --git a/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref/skills_snapshot.json b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref/skills_snapshot.json new file mode 100644 index 0000000..8c4d283 --- /dev/null +++ b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref/skills_snapshot.json @@ -0,0 +1,70 @@ +[ + { + "skill_id": "single-retry-transient-failures__v0_cb599f5d", + "name": "single-retry-transient-failures", + "description": "Perform one minimal retry for likely transient upstream or infrastructure errors before deeper investigation, and record whether the failure is intermittent or reproducible.", + "path": "/Users/admin/PycharmProjects/openspace/openspace/skills/single-retry-transient-failures/SKILL.md", + "is_active": true, + "category": "workflow", + "tags": [], + "visibility": "private", + "creator_id": "", + "lineage": { + "origin": "imported", + "generation": 0, + "parent_skill_ids": [], + "source_task_id": null, + "change_summary": "", + "content_diff": "--- a/.upload_meta.json\n+++ b/.upload_meta.json\n@@ -0,0 +1,7 @@\n+{\n+ \"origin\": \"captured\",\n+ \"parent_skill_ids\": [],\n+ \"change_summary\": \"Captured a workflow for handling likely transient upstream failures by doing one minimal retry first and explicitly recording whether the issue is intermittent or reproducible.\",\n+ \"created_by\": \"gpt-5.4\",\n+ \"tags\": []\n+}\n\n--- a/SKILL.md\n+++ b/SKILL.md\n@@ -0,0 +1,228 @@\n+---\n+name: single-retry-transient-failures\n+description: Perform one minimal retry for likely transient upstream or infrastructure errors before deeper investigation, and record whether the failure is intermittent or reproducible.\n+---\n+\n+# Single Retry for Transient Failures\n+\n+## Purpose\n+\n+Use this workflow when a task fails with a likely 
transient infrastructure problem, such as:\n+\n+- HTTP 5xx from an upstream service\n+- gateway/proxy failures like 502, 503, 504\n+- temporary connection resets or timeouts\n+- rate-limit edge cases that may clear immediately\n+- brief platform instability not obviously caused by local changes\n+\n+The goal is to quickly distinguish:\n+\n+- transient platform instability, vs.\n+- a reproducible product or code issue\n+\n+This avoids wasting time debugging or changing code before confirming the problem persists.\n+\n+## When to use this\n+\n+Apply this skill when all of the following are true:\n+\n+1. The failure appears external or infrastructural.\n+2. There is no strong evidence yet that a recent code/config change caused it.\n+3. Retrying once is low-risk and does not create harmful side effects.\n+4. You can re-run the smallest possible failing step.\n+\n+Do not use this as an excuse for repeated blind retries.\n+\n+## Core rule\n+\n+Before deeper investigation, perform exactly one minimal retry of the failing operation.\n+\n+- Minimal retry = rerun only the smallest step needed to test whether the issue persists.\n+- One retry only = enough to test intermittency without masking a real defect.\n+\n+After the retry, explicitly record the outcome.\n+\n+## Procedure\n+\n+### 1) Confirm the error looks transient\n+\n+Treat the issue as potentially transient if signals include:\n+\n+- HTTP 502/503/504\n+- upstream service unavailable\n+- network timeout during dependency call\n+- temporary DNS/TLS/proxy/connectivity failure\n+- infrastructure-generated error page\n+- job failure with no local stack trace pointing to your code\n+\n+If the error clearly points to deterministic local logic, skip this skill and debug normally.\n+\n+### 2) Check retry safety\n+\n+Before retrying, verify:\n+\n+- rerunning will not duplicate destructive side effects\n+- rerunning will not create bad state\n+- if side effects exist, there is idempotency protection or safe rollback\n+- 
credentials, environment, and inputs remain unchanged\n+\n+If retry is unsafe, do not retry blindly. Instead document why and investigate more carefully.\n+\n+### 3) Perform one minimal retry\n+\n+Retry only the failing step, not the entire workflow unless necessary.\n+\n+Examples:\n+\n+- rerun one API request, not the whole deployment\n+- rerun one test shard, not the whole test suite\n+- rerun one build/download step, not all CI jobs\n+- repeat the same command with the same inputs\n+\n+Keep the retry minimal:\n+\n+- same inputs\n+- same environment\n+- no code changes\n+- no config changes\n+- no workaround added yet\n+\n+This preserves the diagnostic value of the retry.\n+\n+### 4) Record both attempts clearly\n+\n+Capture:\n+\n+- original failure timestamp\n+- original error summary\n+- whether the retry used identical inputs\n+- retry result: passed or failed\n+- if failed again, whether the error is materially the same\n+\n+Use short, explicit wording so future follow-up can tell what happened.\n+\n+Example status notes:\n+\n+- \"Initial run failed with upstream 502. Performed one identical retry; retry succeeded. Treat as transient platform issue unless it recurs.\"\n+- \"Initial run failed with 503. One minimal retry failed with same 503. Issue appears reproducible; continue investigation.\"\n+- \"Initial timeout retried once and succeeded. 
No code changes made; likely intermittent dependency instability.\"\n+\n+### 5) Branch based on the retry result\n+\n+#### If the retry succeeds\n+\n+Conclude that the failure was likely intermittent.\n+\n+Then:\n+\n+- avoid unnecessary code changes\n+- note that the issue is not currently reproducible\n+- proceed with the task if appropriate\n+- flag possible platform instability if relevant\n+- monitor for recurrence\n+\n+Do not over-investigate unless:\n+- the task requires high confidence in reliability, or\n+- the same transient pattern is recurring often\n+\n+#### If the retry fails again\n+\n+Treat the issue as likely reproducible.\n+\n+Then:\n+\n+- begin deeper investigation\n+- inspect logs, traces, and recent changes\n+- isolate the failing subsystem\n+- consider fixes, mitigations, or escalations\n+\n+The single retry has now served its purpose: it ruled out a one-off blip.\n+\n+## Decision table\n+\n+| Situation | Action |\n+|---|---|\n+| Likely upstream/infrastructure 5xx, safe to retry | Retry once minimally |\n+| Retry succeeds | Record as likely transient; avoid unnecessary changes |\n+| Retry fails similarly | Treat as reproducible; investigate deeply |\n+| Retry unsafe due to side effects | Do not retry blindly; document and investigate safely |\n+| Error clearly caused by deterministic local logic | Skip this workflow |\n+\n+## Recommended phrasing for reports\n+\n+Use concise language that preserves diagnostic value.\n+\n+### Success on retry\n+\n+- \"Observed likely transient upstream failure; one minimal retry succeeded with no changes.\"\n+- \"Issue appears intermittent rather than reproducible.\"\n+- \"No product/code change made because retry cleared the failure.\"\n+\n+### Failure on retry\n+\n+- \"Performed one minimal retry to test intermittency; failure reproduced.\"\n+- \"Issue is not a one-off transient; continuing root-cause investigation.\"\n+\n+### Retry not attempted\n+\n+- \"Likely transient error observed, but retry was not safe 
due to non-idempotent side effects.\"\n+\n+## Guardrails\n+\n+### Do\n+\n+- retry once\n+- retry the smallest possible unit\n+- keep inputs unchanged\n+- record what happened\n+- use the retry result to guide next steps\n+\n+### Do not\n+\n+- keep retrying until success\n+- modify code before confirming reproducibility\n+- rerun a large workflow when a smaller reproduction is available\n+- hide the fact that the first failure occurred\n+- assume success on retry means the system is healthy\n+\n+## Example workflow\n+\n+### Example 1: Upstream 502 during a task\n+\n+1. A command fails with HTTP 502 from a dependency.\n+2. No local code changes explain it.\n+3. Rerun the same command once with the same inputs.\n+4. If it succeeds, record:\n+ - \"Initial 502 appears transient; single retry succeeded.\"\n+5. Continue without code changes.\n+\n+### Example 2: Timeout during CI dependency fetch\n+\n+1. Dependency download times out.\n+2. Retry only the fetch step once.\n+3. If it times out again, record:\n+ - \"Single retry reproduced timeout; investigating network/dependency issue.\"\n+4. 
Move into deeper diagnosis.\n+\n+## Why this works\n+\n+A single minimal retry is a fast diagnostic tool:\n+\n+- low effort\n+- low risk when safe\n+- preserves signal\n+- prevents premature debugging\n+- separates intermittent infrastructure instability from true reproducible defects\n+\n+This is especially useful in automated environments, CI systems, remote execution, and tasks that depend on external services.\n+\n+## Completion checklist\n+\n+Before finishing, confirm:\n+\n+- [ ] The error was plausibly transient\n+- [ ] Retry was safe\n+- [ ] Only one minimal retry was performed\n+- [ ] No code/config changes were made before the retry\n+- [ ] The retry outcome was explicitly documented\n+- [ ] Next actions were chosen based on whether the issue reproduced", + "content_snapshot": { + ".upload_meta.json": "{\n \"origin\": \"captured\",\n \"parent_skill_ids\": [],\n \"change_summary\": \"Captured a workflow for handling likely transient upstream failures by doing one minimal retry first and explicitly recording whether the issue is intermittent or reproducible.\",\n \"created_by\": \"gpt-5.4\",\n \"tags\": []\n}\n", + "SKILL.md": "---\nname: single-retry-transient-failures\ndescription: Perform one minimal retry for likely transient upstream or infrastructure errors before deeper investigation, and record whether the failure is intermittent or reproducible.\n---\n\n# Single Retry for Transient Failures\n\n## Purpose\n\nUse this workflow when a task fails with a likely transient infrastructure problem, such as:\n\n- HTTP 5xx from an upstream service\n- gateway/proxy failures like 502, 503, 504\n- temporary connection resets or timeouts\n- rate-limit edge cases that may clear immediately\n- brief platform instability not obviously caused by local changes\n\nThe goal is to quickly distinguish:\n\n- transient platform instability, vs.\n- a reproducible product or code issue\n\nThis avoids wasting time debugging or changing code before confirming the problem 
persists.\n\n## When to use this\n\nApply this skill when all of the following are true:\n\n1. The failure appears external or infrastructural.\n2. There is no strong evidence yet that a recent code/config change caused it.\n3. Retrying once is low-risk and does not create harmful side effects.\n4. You can re-run the smallest possible failing step.\n\nDo not use this as an excuse for repeated blind retries.\n\n## Core rule\n\nBefore deeper investigation, perform exactly one minimal retry of the failing operation.\n\n- Minimal retry = rerun only the smallest step needed to test whether the issue persists.\n- One retry only = enough to test intermittency without masking a real defect.\n\nAfter the retry, explicitly record the outcome.\n\n## Procedure\n\n### 1) Confirm the error looks transient\n\nTreat the issue as potentially transient if signals include:\n\n- HTTP 502/503/504\n- upstream service unavailable\n- network timeout during dependency call\n- temporary DNS/TLS/proxy/connectivity failure\n- infrastructure-generated error page\n- job failure with no local stack trace pointing to your code\n\nIf the error clearly points to deterministic local logic, skip this skill and debug normally.\n\n### 2) Check retry safety\n\nBefore retrying, verify:\n\n- rerunning will not duplicate destructive side effects\n- rerunning will not create bad state\n- if side effects exist, there is idempotency protection or safe rollback\n- credentials, environment, and inputs remain unchanged\n\nIf retry is unsafe, do not retry blindly. 
Instead document why and investigate more carefully.\n\n### 3) Perform one minimal retry\n\nRetry only the failing step, not the entire workflow unless necessary.\n\nExamples:\n\n- rerun one API request, not the whole deployment\n- rerun one test shard, not the whole test suite\n- rerun one build/download step, not all CI jobs\n- repeat the same command with the same inputs\n\nKeep the retry minimal:\n\n- same inputs\n- same environment\n- no code changes\n- no config changes\n- no workaround added yet\n\nThis preserves the diagnostic value of the retry.\n\n### 4) Record both attempts clearly\n\nCapture:\n\n- original failure timestamp\n- original error summary\n- whether the retry used identical inputs\n- retry result: passed or failed\n- if failed again, whether the error is materially the same\n\nUse short, explicit wording so future follow-up can tell what happened.\n\nExample status notes:\n\n- \"Initial run failed with upstream 502. Performed one identical retry; retry succeeded. Treat as transient platform issue unless it recurs.\"\n- \"Initial run failed with 503. One minimal retry failed with same 503. Issue appears reproducible; continue investigation.\"\n- \"Initial timeout retried once and succeeded. 
No code changes made; likely intermittent dependency instability.\"\n\n### 5) Branch based on the retry result\n\n#### If the retry succeeds\n\nConclude that the failure was likely intermittent.\n\nThen:\n\n- avoid unnecessary code changes\n- note that the issue is not currently reproducible\n- proceed with the task if appropriate\n- flag possible platform instability if relevant\n- monitor for recurrence\n\nDo not over-investigate unless:\n- the task requires high confidence in reliability, or\n- the same transient pattern is recurring often\n\n#### If the retry fails again\n\nTreat the issue as likely reproducible.\n\nThen:\n\n- begin deeper investigation\n- inspect logs, traces, and recent changes\n- isolate the failing subsystem\n- consider fixes, mitigations, or escalations\n\nThe single retry has now served its purpose: it ruled out a one-off blip.\n\n## Decision table\n\n| Situation | Action |\n|---|---|\n| Likely upstream/infrastructure 5xx, safe to retry | Retry once minimally |\n| Retry succeeds | Record as likely transient; avoid unnecessary changes |\n| Retry fails similarly | Treat as reproducible; investigate deeply |\n| Retry unsafe due to side effects | Do not retry blindly; document and investigate safely |\n| Error clearly caused by deterministic local logic | Skip this workflow |\n\n## Recommended phrasing for reports\n\nUse concise language that preserves diagnostic value.\n\n### Success on retry\n\n- \"Observed likely transient upstream failure; one minimal retry succeeded with no changes.\"\n- \"Issue appears intermittent rather than reproducible.\"\n- \"No product/code change made because retry cleared the failure.\"\n\n### Failure on retry\n\n- \"Performed one minimal retry to test intermittency; failure reproduced.\"\n- \"Issue is not a one-off transient; continuing root-cause investigation.\"\n\n### Retry not attempted\n\n- \"Likely transient error observed, but retry was not safe due to non-idempotent side effects.\"\n\n## 
Guardrails\n\n### Do\n\n- retry once\n- retry the smallest possible unit\n- keep inputs unchanged\n- record what happened\n- use the retry result to guide next steps\n\n### Do not\n\n- keep retrying until success\n- modify code before confirming reproducibility\n- rerun a large workflow when a smaller reproduction is available\n- hide the fact that the first failure occurred\n- assume success on retry means the system is healthy\n\n## Example workflow\n\n### Example 1: Upstream 502 during a task\n\n1. A command fails with HTTP 502 from a dependency.\n2. No local code changes explain it.\n3. Rerun the same command once with the same inputs.\n4. If it succeeds, record:\n - \"Initial 502 appears transient; single retry succeeded.\"\n5. Continue without code changes.\n\n### Example 2: Timeout during CI dependency fetch\n\n1. Dependency download times out.\n2. Retry only the fetch step once.\n3. If it times out again, record:\n - \"Single retry reproduced timeout; investigating network/dependency issue.\"\n4. 
Move into deeper diagnosis.\n\n## Why this works\n\nA single minimal retry is a fast diagnostic tool:\n\n- low effort\n- low risk when safe\n- preserves signal\n- prevents premature debugging\n- separates intermittent infrastructure instability from true reproducible defects\n\nThis is especially useful in automated environments, CI systems, remote execution, and tasks that depend on external services.\n\n## Completion checklist\n\nBefore finishing, confirm:\n\n- [ ] The error was plausibly transient\n- [ ] Retry was safe\n- [ ] Only one minimal retry was performed\n- [ ] No code/config changes were made before the retry\n- [ ] The retry outcome was explicitly documented\n- [ ] Next actions were chosen based on whether the issue reproduced" + }, + "created_at": "2026-04-12T02:38:19.517472", + "created_by": "" + }, + "tool_dependencies": [], + "critical_tools": [], + "total_selections": 0, + "total_applied": 0, + "total_completions": 0, + "total_fallbacks": 0, + "recent_analyses": [], + "first_seen": "2026-04-12T02:38:19.517476", + "last_updated": "2026-04-12T02:38:19.517477" + }, + { + "skill_id": "verify-before-confirming__v1_acf0d1bc", + "name": "verify-before-confirming", + "description": "Require a minimal evidence check before asserting that a status or success condition is true.", + "path": "/Users/admin/PycharmProjects/openspace/openspace/skills/verify-before-confirming/SKILL.md", + "is_active": true, + "category": "workflow", + "tags": [], + "visibility": "private", + "creator_id": "", + "lineage": { + "origin": "imported", + "generation": 0, + "parent_skill_ids": [], + "source_task_id": null, + "change_summary": "", + "content_diff": "--- a/.upload_meta.json\n+++ b/.upload_meta.json\n@@ -0,0 +1,9 @@\n+{\n+ \"origin\": \"fixed\",\n+ \"parent_skill_ids\": [\n+ \"verify-before-confirming__v0_109e009a\"\n+ ],\n+ \"change_summary\": \"Added claim-matching verification guidance so confirmations must validate the exact named tool or report the claim as 
unverified.\",\n+ \"created_by\": \"openspace\",\n+ \"tags\": []\n+}\n\n--- a/SKILL.md\n+++ b/SKILL.md\n@@ -0,0 +1,126 @@\n+---\n+name: verify-before-confirming\n+description: Require a minimal evidence check before asserting that a status or success condition is true.\n+---\n+\n+# Verify Before Confirming\n+\n+Use this workflow when a task asks for status confirmation, success confirmation, or a short “is it working?” answer. Do not assert success based only on expectation, prior context, or a single unverified statement. First perform at least one lightweight verification step, then answer briefly with the verified result.\n+\n+## When to Use\n+\n+Apply this skill when the user asks you to confirm whether something is:\n+\n+- working\n+- complete\n+- running\n+- connected\n+- deployed\n+- fixed\n+- available\n+- healthy\n+- successful\n+\n+Examples:\n+\n+- “Confirm the relay is working.”\n+- “Is the service up?”\n+- “Check whether the job completed.”\n+- “Verify the recording was saved.”\n+- “Make sure the integration is active.”\n+\n+## Workflow\n+\n+1. Identify the claim being confirmed.\n+ - Example: “The relay is working.”\n+ - Example: “The build completed successfully.”\n+ - Example: “The recording exists.”\n+\n+2. Choose the smallest available verification method.\n+ Prefer the least invasive check that can provide real evidence, such as:\n+ - reading recent logs\n+ - checking a status endpoint\n+ - listing expected output files\n+ - inspecting a recording/artifact\n+ - running a diagnostic command\n+ - checking a process, job, or service status\n+ - verifying timestamps, exit codes, or health checks\n+ The verification must match the exact claim being confirmed. 
Do not substitute a nearby or weaker check that only shows general availability, routing, or partial system access.\n+\n+ - If the claim names a specific tool, service, artifact, integration, or endpoint, verify that named thing directly when possible.\n+ - Do not treat generic introspection, provider discovery, or session visibility as proof that a specific tool invocation works unless the claim is specifically about introspection or discovery.\n+ - For tool-availability or connectivity claims, prefer directly invoking or inspecting the named tool itself with a safe read-only or no-op check when available.\n+ - If the specific thing cannot be inspected or invoked, say the claim remains unverified rather than inferring success from related signals.\n+\n+3. Run or inspect the verification source before responding.\n+ Do not skip this step unless no verification source or tool access exists.\n+\n+4. Interpret the evidence conservatively.\n+ - If the check passes, say what was verified.\n+ - If the check is inconclusive, say it is inconclusive and mention what was checked.\n+ - If the check fails, state the failure rather than asserting success.\n+ - If the evidence only confirms a related capability but not the exact claim, explicitly say that the exact claim was not verified.\n+\n+5. 
Final answer: one short sentence when the user requested only confirmation.\n+ Include the verified evidence in compressed form.\n+\n+## Response Patterns\n+\n+Verified success:\n+\n+“Verified: the service health check returned OK.”\n+\n+“Confirmed: the latest log shows the relay connected successfully.”\n+\n+“Verified: the expected recording file exists and was updated today.”\n+\n+Inconclusive:\n+\n+“I couldn’t fully confirm it; the process is running, but no recent success log was available.”\n+\n+“Status is inconclusive: the endpoint responded, but the expected artifact was not present.”\n+\n+Failure:\n+\n+“Not confirmed: the diagnostic command failed with a connection timeout.”\n+\n+“Not working yet: the latest log shows authentication errors.”\n+\n+## Rules\n+\n+- Do not mark a status-confirmation task complete without checking at least one available evidence source.\n+- The evidence source must be relevant to the exact claim, not just to a neighboring capability or tool path.\n+- Do not claim success merely because no error is visible.\n+- Do not treat tool listing, backend/provider discovery, or session metadata as confirmation that a specific named tool executes successfully unless that exact execution claim was tested.\n+- When asked whether a particular tool is available, connected, or working, validate that specific tool directly when possible; otherwise report that the claim is unverified.\n+- Do not over-verify if the task only needs a simple confirmation; one relevant check is usually enough.\n+- Keep the final answer concise, especially when the user asked for a simple status confirmation.\n+- If tools are unavailable, say so explicitly and avoid pretending verification occurred.\n+\n+## Minimal Verification Examples\n+\n+For a service:\n+\n+1. Run a health/status command or check recent logs.\n+2. Reply: “Verified: the health check reports the service is running.”\n+\n+For a file or recording:\n+\n+1. 
List or inspect the expected artifact path.\n+2. Reply: “Verified: the recording file exists and has a recent timestamp.”\n+\n+For a relay, integration, or connection:\n+\n+1. Check logs, events, or a diagnostic command showing recent activity.\n+2. Reply: “Confirmed: recent logs show the relay connected and forwarded traffic.”\n+\n+For a background job:\n+\n+1. Check job status, exit code, or output artifact.\n+2. Reply: “Verified: the job completed successfully with exit code 0.”\n+\n+For a named tool or tool-connectivity claim:\n+\n+1. Verify the specifically named tool directly, ideally with a safe invocation or direct inspection of that tool's availability/status.\n+2. Do not rely only on provider listing, session lookup, or presence of other tools as a proxy.\n+3. Reply: “Verified: the named tool responded to a read-only check.” or “Unverified: I could inspect the provider, but I could not directly confirm the named tool.”\n", + "content_snapshot": { + ".upload_meta.json": "{\n \"origin\": \"fixed\",\n \"parent_skill_ids\": [\n \"verify-before-confirming__v0_109e009a\"\n ],\n \"change_summary\": \"Added claim-matching verification guidance so confirmations must validate the exact named tool or report the claim as unverified.\",\n \"created_by\": \"openspace\",\n \"tags\": []\n}\n", + "SKILL.md": "---\nname: verify-before-confirming\ndescription: Require a minimal evidence check before asserting that a status or success condition is true.\n---\n\n# Verify Before Confirming\n\nUse this workflow when a task asks for status confirmation, success confirmation, or a short “is it working?” answer. Do not assert success based only on expectation, prior context, or a single unverified statement. 
First perform at least one lightweight verification step, then answer briefly with the verified result.\n\n## When to Use\n\nApply this skill when the user asks you to confirm whether something is:\n\n- working\n- complete\n- running\n- connected\n- deployed\n- fixed\n- available\n- healthy\n- successful\n\nExamples:\n\n- “Confirm the relay is working.”\n- “Is the service up?”\n- “Check whether the job completed.”\n- “Verify the recording was saved.”\n- “Make sure the integration is active.”\n\n## Workflow\n\n1. Identify the claim being confirmed.\n - Example: “The relay is working.”\n - Example: “The build completed successfully.”\n - Example: “The recording exists.”\n\n2. Choose the smallest available verification method.\n Prefer the least invasive check that can provide real evidence, such as:\n - reading recent logs\n - checking a status endpoint\n - listing expected output files\n - inspecting a recording/artifact\n - running a diagnostic command\n - checking a process, job, or service status\n - verifying timestamps, exit codes, or health checks\n The verification must match the exact claim being confirmed. Do not substitute a nearby or weaker check that only shows general availability, routing, or partial system access.\n\n - If the claim names a specific tool, service, artifact, integration, or endpoint, verify that named thing directly when possible.\n - Do not treat generic introspection, provider discovery, or session visibility as proof that a specific tool invocation works unless the claim is specifically about introspection or discovery.\n - For tool-availability or connectivity claims, prefer directly invoking or inspecting the named tool itself with a safe read-only or no-op check when available.\n - If the specific thing cannot be inspected or invoked, say the claim remains unverified rather than inferring success from related signals.\n\n3. 
Run or inspect the verification source before responding.\n Do not skip this step unless no verification source or tool access exists.\n\n4. Interpret the evidence conservatively.\n - If the check passes, say what was verified.\n - If the check is inconclusive, say it is inconclusive and mention what was checked.\n - If the check fails, state the failure rather than asserting success.\n - If the evidence only confirms a related capability but not the exact claim, explicitly say that the exact claim was not verified.\n\n5. Final answer: one short sentence when the user requested only confirmation.\n Include the verified evidence in compressed form.\n\n## Response Patterns\n\nVerified success:\n\n“Verified: the service health check returned OK.”\n\n“Confirmed: the latest log shows the relay connected successfully.”\n\n“Verified: the expected recording file exists and was updated today.”\n\nInconclusive:\n\n“I couldn’t fully confirm it; the process is running, but no recent success log was available.”\n\n“Status is inconclusive: the endpoint responded, but the expected artifact was not present.”\n\nFailure:\n\n“Not confirmed: the diagnostic command failed with a connection timeout.”\n\n“Not working yet: the latest log shows authentication errors.”\n\n## Rules\n\n- Do not mark a status-confirmation task complete without checking at least one available evidence source.\n- The evidence source must be relevant to the exact claim, not just to a neighboring capability or tool path.\n- Do not claim success merely because no error is visible.\n- Do not treat tool listing, backend/provider discovery, or session metadata as confirmation that a specific named tool executes successfully unless that exact execution claim was tested.\n- When asked whether a particular tool is available, connected, or working, validate that specific tool directly when possible; otherwise report that the claim is unverified.\n- Do not over-verify if the task only needs a simple confirmation; one 
relevant check is usually enough.\n- Keep the final answer concise, especially when the user asked for a simple status confirmation.\n- If tools are unavailable, say so explicitly and avoid pretending verification occurred.\n\n## Minimal Verification Examples\n\nFor a service:\n\n1. Run a health/status command or check recent logs.\n2. Reply: “Verified: the health check reports the service is running.”\n\nFor a file or recording:\n\n1. List or inspect the expected artifact path.\n2. Reply: “Verified: the recording file exists and has a recent timestamp.”\n\nFor a relay, integration, or connection:\n\n1. Check logs, events, or a diagnostic command showing recent activity.\n2. Reply: “Confirmed: recent logs show the relay connected and forwarded traffic.”\n\nFor a background job:\n\n1. Check job status, exit code, or output artifact.\n2. Reply: “Verified: the job completed successfully with exit code 0.”\n\nFor a named tool or tool-connectivity claim:\n\n1. Verify the specifically named tool directly, ideally with a safe invocation or direct inspection of that tool's availability/status.\n2. Do not rely only on provider listing, session lookup, or presence of other tools as a proxy.\n3. 
Reply: “Verified: the named tool responded to a read-only check.” or “Unverified: I could inspect the provider, but I could not directly confirm the named tool.”\n" + }, + "created_at": "2026-04-12T02:38:19.518081", + "created_by": "" + }, + "tool_dependencies": [], + "critical_tools": [], + "total_selections": 0, + "total_applied": 0, + "total_completions": 0, + "total_fallbacks": 0, + "recent_analyses": [], + "first_seen": "2026-04-12T02:38:19.518083", + "last_updated": "2026-04-12T02:38:19.518084" + } +] \ No newline at end of file diff --git a/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref_fix/config.json b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref_fix/config.json new file mode 100644 index 0000000..79e4c8f --- /dev/null +++ b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref_fix/config.json @@ -0,0 +1,23 @@ +{ + "clawwork_root": "/tmp/openspace-bench-noref-JwyV4l", + "gdpval_path": null, + "model": "gpt-5.4", + "max_iterations": 20, + "backend_scope": [ + "shell" + ], + "use_clawwork_productivity": false, + "run_name": "codex_callrate_smoke_noref_fix", + "max_tasks": 2, + "per_occupation": null, + "sectors": null, + "occupations": null, + "task_ids": null, + "record_call_details": true, + "llm_kwargs": { + "api_key": "REDACTED", + "api_base": "http://127.0.0.1:8080/v1" + }, + "enable_evaluation": false, + "concurrency": 1 +} diff --git a/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref_fix/phase1_results.jsonl b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref_fix/phase1_results.jsonl new file mode 100644 index 0000000..2aa1076 --- /dev/null +++ b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref_fix/phase1_results.jsonl @@ -0,0 +1,2 @@ +{"task_id": "0112fc9b-c3b2-4084-8993-5a4abb1f54f1", "phase": "phase1", "occupation": "Nurse Practitioners", "sector": "Health Care and Social Assistance", 
"task_value_usd": 0.0, "status": "success", "tokens": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "llm_calls": 0, "cost_usd": 0.0, "wall_time_sec": 93.54, "agent_prompt_tokens": 0, "agent_completion_tokens": 0, "agent_total_tokens": 0, "agent_llm_calls": 0, "call_details": []}, "execution": {"iterations": 1, "tool_calls": 0, "time_sec": 93.54}, "skills": {"before": 3, "after": 4, "new_this_task": 1, "evolved": [{"name": "clinical-soap-note-from-narrative-enhanced", "origin": "derived", "path": "/Users/admin/PycharmProjects/openspace/openspace/skills/clinical-soap-note-from-narrative-enhanced/SKILL.md"}], "used": ["clinical-soap-note-from-narrative__v0_9ff620af"]}, "evaluation": {"has_evaluation": false}, "timestamp": "2026-04-12T02:50:48.493407"} +{"task_id": "02314fc6-a24e-42f4-a8cd-362cae0f0ec1", "phase": "phase1", "occupation": "General and Operations Managers", "sector": "Retail Trade", "task_value_usd": 0.0, "status": "success", "tokens": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "llm_calls": 0, "cost_usd": 0.0, "wall_time_sec": 269.94, "agent_prompt_tokens": 0, "agent_completion_tokens": 0, "agent_total_tokens": 0, "agent_llm_calls": 0, "call_details": []}, "execution": {"iterations": 10, "tool_calls": 10, "time_sec": 269.94}, "skills": {"before": 4, "after": 6, "new_this_task": 2, "evolved": [{"name": "adaptive-document-output", "origin": "captured", "path": "/Users/admin/PycharmProjects/openspace/openspace/skills/adaptive-document-output/SKILL.md"}, {"name": "verify-build-artifacts", "origin": "captured", "path": "/Users/admin/PycharmProjects/openspace/openspace/skills/verify-build-artifacts/SKILL.md"}], "used": []}, "evaluation": {"has_evaluation": false}, "timestamp": "2026-04-12T02:55:18.445041"} diff --git a/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref_fix/skills_snapshot.json b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref_fix/skills_snapshot.json new 
file mode 100644 index 0000000..a609ab2 --- /dev/null +++ b/context/local-machine/admin-macos/gdpval_bench/codex_callrate_smoke_noref_fix/skills_snapshot.json @@ -0,0 +1,232 @@ +[ + { + "skill_id": "clinical-soap-note-from-narrative__v0_9ff620af", + "name": "clinical-soap-note-from-narrative", + "description": "Convert a narrative clinical encounter into a structured SOAP note by separating subjective and objective data, generating a problem-focused assessment, and adding an appropriate safety-conscious plan when red flags or neurologic abnormalities are present.", + "path": "/Users/admin/PycharmProjects/openspace/openspace/skills/clinical-soap-note-from-narrative/SKILL.md", + "is_active": true, + "category": "workflow", + "tags": [], + "visibility": "private", + "creator_id": "", + "lineage": { + "origin": "imported", + "generation": 0, + "parent_skill_ids": [], + "source_task_id": null, + "change_summary": "", + "content_diff": "--- a/SKILL.md\n+++ b/SKILL.md\n@@ -0,0 +1,207 @@\n+---\n+name: clinical-soap-note-from-narrative\n+description: Convert a narrative clinical encounter into a structured SOAP note by separating subjective and objective data, generating a problem-focused assessment, and adding an appropriate safety-conscious plan when red flags or neurologic abnormalities are present.\n+---\n+\n+# Clinical SOAP Note From Narrative\n+\n+Use this skill when you are given a free-text clinical encounter, visit summary, or narrative case description and need to produce a structured SOAP note.\n+\n+This workflow is designed to:\n+- preserve the distinction between patient-reported history and clinician-observed findings\n+- avoid inventing exam findings, diagnostics, or treatment details\n+- produce a concise, problem-focused assessment\n+- add an appropriately cautious plan when red-flag symptoms or abnormal neurologic findings are described\n+\n+## Core principles\n+\n+1. Do not add facts that were not provided.\n+2. 
Keep subjective and objective information separated.\n+3. Preserve uncertainty when the source is uncertain.\n+4. Make the assessment clinically coherent but problem-focused.\n+5. Escalate the plan when the narrative includes red flags, worsening symptoms, or abnormal neurologic findings.\n+6. If the source lacks information, omit it or mark it as not provided rather than fabricating content.\n+\n+## Step-by-step workflow\n+\n+### 1) Read the encounter and extract source facts\n+\n+Identify and group all explicit facts from the narrative into:\n+- symptoms and history\n+- timing and progression\n+- relevant negatives\n+- exam findings\n+- prior treatments or self-care\n+- clinician impression or likely diagnosis if stated\n+- follow-up or return precautions if stated\n+\n+Before drafting, mentally separate:\n+- Subjective = what the patient reports, describes, denies, or experiences\n+- Objective = observed exam findings, measured values, test results, and clinician-documented physical findings\n+\n+### 2) Build the Subjective section\n+\n+Include only history elements supported by the narrative:\n+- chief concern\n+- onset, duration, course\n+- associated symptoms\n+- aggravating/alleviating factors\n+- pertinent negatives\n+- relevant prior history or attempted treatments, if provided\n+\n+Keep it concise and clinical. 
Rewrite into standard medical style without changing meaning.\n+\n+Example transformation:\n+- Narrative: \"He says the headache started 3 days ago, is getting worse, and now he has tingling in his left arm.\"\n+- Subjective: \"Reports 3 days of worsening headache with associated left arm tingling.\"\n+\n+### 3) Build the Objective section\n+\n+Include only documented objective data:\n+- vital signs, if given\n+- physical exam findings\n+- neurologic findings\n+- imaging/lab results, if given\n+\n+Important:\n+- Do not infer a normal exam if none was described.\n+- Do not add routine headings with normal findings unless explicitly provided.\n+- If the exam is sparse, keep the Objective sparse.\n+\n+Good:\n+- \"Neuro: Decreased sensation in left upper extremity.\"\n+Avoid:\n+- \"CN II-XII intact\" unless the narrative explicitly states this.\n+\n+### 4) Write a problem-focused Assessment\n+\n+Synthesize the likely clinical issue using only the available facts.\n+\n+The assessment should:\n+- summarize the main problem in 1-3 sentences\n+- mention key supporting features\n+- note important uncertainty when present\n+- avoid overcommitting to a diagnosis not established in the source\n+\n+Preferred style:\n+- \"Presentation is concerning for...\"\n+- \"Symptoms are most consistent with...\"\n+- \"Given X and Y, cannot exclude...\"\n+\n+If the narrative includes neurologic deficits, red-flag symptoms, or concerning progression, the assessment should explicitly reflect concern rather than minimizing it.\n+\n+Example:\n+- \"Worsening headache with unilateral sensory symptoms is concerning for a neurologic process and warrants urgent evaluation; benign primary headache cannot be assumed from the available information.\"\n+\n+### 5) Write the Plan with safety escalation when warranted\n+\n+Base the plan on what is provided, but structure it clearly.\n+\n+Possible plan components:\n+- symptomatic treatment already recommended or used\n+- monitoring or follow-up\n+- 
diagnostic evaluation if stated or clearly warranted by the narrative\n+- return precautions / emergency precautions\n+\n+When red flags are present, include a safety-conscious plan such as:\n+- urgent ED evaluation\n+- same-day in-person neurologic assessment\n+- immediate imaging/workup if the narrative supports that urgency\n+- explicit return precautions for worsening weakness, numbness, severe headache, speech changes, vision changes, gait instability, altered mental status, bowel/bladder dysfunction, fever, or other relevant warning signs\n+\n+Do not understate urgency if the input includes:\n+- focal neurologic deficits\n+- new weakness or numbness\n+- severe or worsening headache with neuro symptoms\n+- altered mental status\n+- seizure\n+- saddle anesthesia or bowel/bladder changes\n+- rapidly progressive symptoms\n+- other clearly dangerous features\n+\n+### 6) Final quality check\n+\n+Before finalizing, verify:\n+- no invented vitals, exam findings, or test results\n+- subjective and objective are not mixed\n+- assessment reflects the actual level of concern\n+- plan contains clear escalation when red flags are present\n+- note is concise and clinically readable\n+\n+## SOAP note template\n+\n+Use this structure:\n+\n+Subjective:\n+- [Chief concern and history in concise prose or bullets]\n+\n+Objective:\n+- [Only provided exam findings, vitals, tests]\n+\n+Assessment:\n+- [Focused synthesis of the main clinical issue, including uncertainty or concern]\n+\n+Plan:\n+- [Treatment / next steps / follow-up / precautions]\n+\n+## Safety trigger guide\n+\n+Escalate the plan when the narrative includes any of the following:\n+- focal neurologic deficit\n+- progressive neurologic symptoms\n+- new unilateral weakness or numbness\n+- severe or worsening headache with neurologic symptoms\n+- syncope, seizure, confusion, or altered mental status\n+- bowel/bladder dysfunction with spine symptoms\n+- rapidly worsening pain plus neurologic changes\n+- other explicitly 
concerning clinician findings\n+\n+Suggested plan language:\n+- \"Given the presence of red-flag symptoms/abnormal neurologic findings, urgent emergency department evaluation is recommended.\"\n+- \"Patient was advised to seek immediate care for any worsening weakness, numbness, severe headache, vision change, speech difficulty, gait instability, or other new neurologic symptoms.\"\n+\n+## Style guidance\n+\n+- Be concise.\n+- Use neutral clinical language.\n+- Prefer \"reports,\" \"endorses,\" \"denies,\" \"exam notable for,\" \"concerning for.\"\n+- If details are missing, do not compensate by guessing.\n+- If the source already includes a likely diagnosis or plan, preserve it faithfully in cleaner SOAP format.\n+\n+## Example\n+\n+Input narrative:\n+\"Patient reports low back pain for 1 week after lifting a box. Over the last 2 days, pain radiates down the right leg with numbness in the right foot. Exam shows decreased sensation over the dorsum of the right foot and positive straight-leg raise on the right. No bowel or bladder symptoms mentioned.\"\n+\n+Output:\n+Subjective:\n+- Reports 1 week of low back pain after lifting injury, with 2 days of radiation down the right leg and associated numbness in the right foot.\n+\n+Objective:\n+- Exam notable for decreased sensation over the dorsum of the right foot.\n+- Positive straight-leg raise on the right.\n+\n+Assessment:\n+- Acute low back pain with right-sided radicular symptoms and objective sensory deficit, concerning for lumbar radiculopathy. 
Given the documented neurologic abnormality, symptoms warrant prompt in-person evaluation and close monitoring for progression.\n+\n+Plan:\n+- Recommend prompt clinical follow-up/in-person evaluation.\n+- Advise immediate emergency evaluation for new or worsening weakness, progressive numbness, saddle anesthesia, bowel or bladder dysfunction, fever, or uncontrolled pain.\n+\n+## Common mistakes to avoid\n+\n+- Moving patient-reported symptoms into Objective\n+- Adding normal findings that were never stated\n+- Writing an assessment that ignores documented red flags\n+- Producing a routine conservative plan despite focal neurologic deficits\n+- Inventing medications, dosages, imaging, or follow-up intervals\n+- Overly broad differentials that distract from the main problem\n+\n+## Minimal decision rule\n+\n+If the case is routine and non-concerning:\n+- produce a standard SOAP note faithful to the narrative\n+\n+If the case includes red flags or abnormal neurologic findings:\n+- make the assessment explicitly cautious\n+- make the plan explicitly safety-oriented and escalated", + "content_snapshot": { + "SKILL.md": "---\nname: clinical-soap-note-from-narrative\ndescription: Convert a narrative clinical encounter into a structured SOAP note by separating subjective and objective data, generating a problem-focused assessment, and adding an appropriate safety-conscious plan when red flags or neurologic abnormalities are present.\n---\n\n# Clinical SOAP Note From Narrative\n\nUse this skill when you are given a free-text clinical encounter, visit summary, or narrative case description and need to produce a structured SOAP note.\n\nThis workflow is designed to:\n- preserve the distinction between patient-reported history and clinician-observed findings\n- avoid inventing exam findings, diagnostics, or treatment details\n- produce a concise, problem-focused assessment\n- add an appropriately cautious plan when red-flag symptoms or abnormal neurologic findings are 
described\n\n## Core principles\n\n1. Do not add facts that were not provided.\n2. Keep subjective and objective information separated.\n3. Preserve uncertainty when the source is uncertain.\n4. Make the assessment clinically coherent but problem-focused.\n5. Escalate the plan when the narrative includes red flags, worsening symptoms, or abnormal neurologic findings.\n6. If the source lacks information, omit it or mark it as not provided rather than fabricating content.\n\n## Step-by-step workflow\n\n### 1) Read the encounter and extract source facts\n\nIdentify and group all explicit facts from the narrative into:\n- symptoms and history\n- timing and progression\n- relevant negatives\n- exam findings\n- prior treatments or self-care\n- clinician impression or likely diagnosis if stated\n- follow-up or return precautions if stated\n\nBefore drafting, mentally separate:\n- Subjective = what the patient reports, describes, denies, or experiences\n- Objective = observed exam findings, measured values, test results, and clinician-documented physical findings\n\n### 2) Build the Subjective section\n\nInclude only history elements supported by the narrative:\n- chief concern\n- onset, duration, course\n- associated symptoms\n- aggravating/alleviating factors\n- pertinent negatives\n- relevant prior history or attempted treatments, if provided\n\nKeep it concise and clinical. 
Rewrite into standard medical style without changing meaning.\n\nExample transformation:\n- Narrative: \"He says the headache started 3 days ago, is getting worse, and now he has tingling in his left arm.\"\n- Subjective: \"Reports 3 days of worsening headache with associated left arm tingling.\"\n\n### 3) Build the Objective section\n\nInclude only documented objective data:\n- vital signs, if given\n- physical exam findings\n- neurologic findings\n- imaging/lab results, if given\n\nImportant:\n- Do not infer a normal exam if none was described.\n- Do not add routine headings with normal findings unless explicitly provided.\n- If the exam is sparse, keep the Objective sparse.\n\nGood:\n- \"Neuro: Decreased sensation in left upper extremity.\"\nAvoid:\n- \"CN II-XII intact\" unless the narrative explicitly states this.\n\n### 4) Write a problem-focused Assessment\n\nSynthesize the likely clinical issue using only the available facts.\n\nThe assessment should:\n- summarize the main problem in 1-3 sentences\n- mention key supporting features\n- note important uncertainty when present\n- avoid overcommitting to a diagnosis not established in the source\n\nPreferred style:\n- \"Presentation is concerning for...\"\n- \"Symptoms are most consistent with...\"\n- \"Given X and Y, cannot exclude...\"\n\nIf the narrative includes neurologic deficits, red-flag symptoms, or concerning progression, the assessment should explicitly reflect concern rather than minimizing it.\n\nExample:\n- \"Worsening headache with unilateral sensory symptoms is concerning for a neurologic process and warrants urgent evaluation; benign primary headache cannot be assumed from the available information.\"\n\n### 5) Write the Plan with safety escalation when warranted\n\nBase the plan on what is provided, but structure it clearly.\n\nPossible plan components:\n- symptomatic treatment already recommended or used\n- monitoring or follow-up\n- diagnostic evaluation if stated or clearly warranted by the 
narrative\n- return precautions / emergency precautions\n\nWhen red flags are present, include a safety-conscious plan such as:\n- urgent ED evaluation\n- same-day in-person neurologic assessment\n- immediate imaging/workup if the narrative supports that urgency\n- explicit return precautions for worsening weakness, numbness, severe headache, speech changes, vision changes, gait instability, altered mental status, bowel/bladder dysfunction, fever, or other relevant warning signs\n\nDo not understate urgency if the input includes:\n- focal neurologic deficits\n- new weakness or numbness\n- severe or worsening headache with neuro symptoms\n- altered mental status\n- seizure\n- saddle anesthesia or bowel/bladder changes\n- rapidly progressive symptoms\n- other clearly dangerous features\n\n### 6) Final quality check\n\nBefore finalizing, verify:\n- no invented vitals, exam findings, or test results\n- subjective and objective are not mixed\n- assessment reflects the actual level of concern\n- plan contains clear escalation when red flags are present\n- note is concise and clinically readable\n\n## SOAP note template\n\nUse this structure:\n\nSubjective:\n- [Chief concern and history in concise prose or bullets]\n\nObjective:\n- [Only provided exam findings, vitals, tests]\n\nAssessment:\n- [Focused synthesis of the main clinical issue, including uncertainty or concern]\n\nPlan:\n- [Treatment / next steps / follow-up / precautions]\n\n## Safety trigger guide\n\nEscalate the plan when the narrative includes any of the following:\n- focal neurologic deficit\n- progressive neurologic symptoms\n- new unilateral weakness or numbness\n- severe or worsening headache with neurologic symptoms\n- syncope, seizure, confusion, or altered mental status\n- bowel/bladder dysfunction with spine symptoms\n- rapidly worsening pain plus neurologic changes\n- other explicitly concerning clinician findings\n\nSuggested plan language:\n- \"Given the presence of red-flag symptoms/abnormal 
neurologic findings, urgent emergency department evaluation is recommended.\"\n- \"Patient was advised to seek immediate care for any worsening weakness, numbness, severe headache, vision change, speech difficulty, gait instability, or other new neurologic symptoms.\"\n\n## Style guidance\n\n- Be concise.\n- Use neutral clinical language.\n- Prefer \"reports,\" \"endorses,\" \"denies,\" \"exam notable for,\" \"concerning for.\"\n- If details are missing, do not compensate by guessing.\n- If the source already includes a likely diagnosis or plan, preserve it faithfully in cleaner SOAP format.\n\n## Example\n\nInput narrative:\n\"Patient reports low back pain for 1 week after lifting a box. Over the last 2 days, pain radiates down the right leg with numbness in the right foot. Exam shows decreased sensation over the dorsum of the right foot and positive straight-leg raise on the right. No bowel or bladder symptoms mentioned.\"\n\nOutput:\nSubjective:\n- Reports 1 week of low back pain after lifting injury, with 2 days of radiation down the right leg and associated numbness in the right foot.\n\nObjective:\n- Exam notable for decreased sensation over the dorsum of the right foot.\n- Positive straight-leg raise on the right.\n\nAssessment:\n- Acute low back pain with right-sided radicular symptoms and objective sensory deficit, concerning for lumbar radiculopathy. 
Given the documented neurologic abnormality, symptoms warrant prompt in-person evaluation and close monitoring for progression.\n\nPlan:\n- Recommend prompt clinical follow-up/in-person evaluation.\n- Advise immediate emergency evaluation for new or worsening weakness, progressive numbness, saddle anesthesia, bowel or bladder dysfunction, fever, or uncontrolled pain.\n\n## Common mistakes to avoid\n\n- Moving patient-reported symptoms into Objective\n- Adding normal findings that were never stated\n- Writing an assessment that ignores documented red flags\n- Producing a routine conservative plan despite focal neurologic deficits\n- Inventing medications, dosages, imaging, or follow-up intervals\n- Overly broad differentials that distract from the main problem\n\n## Minimal decision rule\n\nIf the case is routine and non-concerning:\n- produce a standard SOAP note faithful to the narrative\n\nIf the case includes red flags or abnormal neurologic findings:\n- make the assessment explicitly cautious\n- make the plan explicitly safety-oriented and escalated" + }, + "created_at": "2026-04-12T02:49:14.949441", + "created_by": "" + }, + "tool_dependencies": [], + "critical_tools": [], + "total_selections": 1, + "total_applied": 1, + "total_completions": 1, + "total_fallbacks": 0, + "recent_analyses": [ + { + "task_id": "0112fc9b-c3b2-4084-8993-5a4abb1f54f1_phase1", + "timestamp": "2026-04-12T02:49:41.395922", + "task_completed": true, + "execution_note": "The agent completed the task in a single iteration by producing a structured SOAP note from the narrative visit summary. 
The note largely preserved subjective versus objective data and appropriately escalated the plan because the case included post-traumatic headache with a documented mild coordination deficit, although it added a few plan details not explicitly stated in the source.", + "tool_issues": [], + "skill_judgments": [ + { + "skill_id": "clinical-soap-note-from-narrative__v0_9ff620af", + "skill_applied": true, + "note": "The agent effectively followed the skill's core workflow: it converted the narrative into Subjective/Objective sections without inventing exam findings, synthesized a focused assessment, and recognized the abnormal neurologic finding as a safety trigger warranting urgent escalation. Minor deviation: the plan included extra management specifics such as cognitive rest, no driving, and informing a parent/guardian, which were clinically reasonable but not directly provided in the source." + } + ], + "evolution_suggestions": [ + { + "type": "derived", + "target_skills": [ + "clinical-soap-note-from-narrative__v0_9ff620af" + ], + "target_skill": "clinical-soap-note-from-narrative__v0_9ff620af", + "category": "workflow", + "direction": "Create an enhanced version that explicitly handles adolescent head injury/concussion encounters by distinguishing when it is appropriate to add standard safety counseling beyond the source narrative versus when to remain strictly extractive. This would help preserve the skill's non-fabrication rule while still supporting clinically expected anticipatory guidance in red-flag trauma cases." 
+ } + ], + "analyzed_by": "", + "analyzed_at": "2026-04-12T02:49:41.395922" + } + ], + "first_seen": "2026-04-12T02:49:14.949447", + "last_updated": "2026-04-12T02:49:41.397517" + }, + { + "skill_id": "single-retry-transient-failures__v0_cb599f5d", + "name": "single-retry-transient-failures", + "description": "Perform one minimal retry for likely transient upstream or infrastructure errors before deeper investigation, and record whether the failure is intermittent or reproducible.", + "path": "/Users/admin/PycharmProjects/openspace/openspace/skills/single-retry-transient-failures/SKILL.md", + "is_active": true, + "category": "workflow", + "tags": [], + "visibility": "private", + "creator_id": "", + "lineage": { + "origin": "imported", + "generation": 0, + "parent_skill_ids": [], + "source_task_id": null, + "change_summary": "", + "content_diff": "--- a/.upload_meta.json\n+++ b/.upload_meta.json\n@@ -0,0 +1,7 @@\n+{\n+ \"origin\": \"captured\",\n+ \"parent_skill_ids\": [],\n+ \"change_summary\": \"Captured a workflow for handling likely transient upstream failures by doing one minimal retry first and explicitly recording whether the issue is intermittent or reproducible.\",\n+ \"created_by\": \"gpt-5.4\",\n+ \"tags\": []\n+}\n\n--- a/SKILL.md\n+++ b/SKILL.md\n@@ -0,0 +1,228 @@\n+---\n+name: single-retry-transient-failures\n+description: Perform one minimal retry for likely transient upstream or infrastructure errors before deeper investigation, and record whether the failure is intermittent or reproducible.\n+---\n+\n+# Single Retry for Transient Failures\n+\n+## Purpose\n+\n+Use this workflow when a task fails with a likely transient infrastructure problem, such as:\n+\n+- HTTP 5xx from an upstream service\n+- gateway/proxy failures like 502, 503, 504\n+- temporary connection resets or timeouts\n+- rate-limit edge cases that may clear immediately\n+- brief platform instability not obviously caused by local changes\n+\n+The goal is to quickly distinguish:\n+\n+- 
transient platform instability, vs.\n+- a reproducible product or code issue\n+\n+This avoids wasting time debugging or changing code before confirming the problem persists.\n+\n+## When to use this\n+\n+Apply this skill when all of the following are true:\n+\n+1. The failure appears external or infrastructural.\n+2. There is no strong evidence yet that a recent code/config change caused it.\n+3. Retrying once is low-risk and does not create harmful side effects.\n+4. You can re-run the smallest possible failing step.\n+\n+Do not use this as an excuse for repeated blind retries.\n+\n+## Core rule\n+\n+Before deeper investigation, perform exactly one minimal retry of the failing operation.\n+\n+- Minimal retry = rerun only the smallest step needed to test whether the issue persists.\n+- One retry only = enough to test intermittency without masking a real defect.\n+\n+After the retry, explicitly record the outcome.\n+\n+## Procedure\n+\n+### 1) Confirm the error looks transient\n+\n+Treat the issue as potentially transient if signals include:\n+\n+- HTTP 502/503/504\n+- upstream service unavailable\n+- network timeout during dependency call\n+- temporary DNS/TLS/proxy/connectivity failure\n+- infrastructure-generated error page\n+- job failure with no local stack trace pointing to your code\n+\n+If the error clearly points to deterministic local logic, skip this skill and debug normally.\n+\n+### 2) Check retry safety\n+\n+Before retrying, verify:\n+\n+- rerunning will not duplicate destructive side effects\n+- rerunning will not create bad state\n+- if side effects exist, there is idempotency protection or safe rollback\n+- credentials, environment, and inputs remain unchanged\n+\n+If retry is unsafe, do not retry blindly. 
Instead document why and investigate more carefully.\n+\n+### 3) Perform one minimal retry\n+\n+Retry only the failing step, not the entire workflow unless necessary.\n+\n+Examples:\n+\n+- rerun one API request, not the whole deployment\n+- rerun one test shard, not the whole test suite\n+- rerun one build/download step, not all CI jobs\n+- repeat the same command with the same inputs\n+\n+Keep the retry minimal:\n+\n+- same inputs\n+- same environment\n+- no code changes\n+- no config changes\n+- no workaround added yet\n+\n+This preserves the diagnostic value of the retry.\n+\n+### 4) Record both attempts clearly\n+\n+Capture:\n+\n+- original failure timestamp\n+- original error summary\n+- whether the retry used identical inputs\n+- retry result: passed or failed\n+- if failed again, whether the error is materially the same\n+\n+Use short, explicit wording so future follow-up can tell what happened.\n+\n+Example status notes:\n+\n+- \"Initial run failed with upstream 502. Performed one identical retry; retry succeeded. Treat as transient platform issue unless it recurs.\"\n+- \"Initial run failed with 503. One minimal retry failed with same 503. Issue appears reproducible; continue investigation.\"\n+- \"Initial timeout retried once and succeeded. 
No code changes made; likely intermittent dependency instability.\"\n+\n+### 5) Branch based on the retry result\n+\n+#### If the retry succeeds\n+\n+Conclude that the failure was likely intermittent.\n+\n+Then:\n+\n+- avoid unnecessary code changes\n+- note that the issue is not currently reproducible\n+- proceed with the task if appropriate\n+- flag possible platform instability if relevant\n+- monitor for recurrence\n+\n+Do not over-investigate unless:\n+- the task requires high confidence in reliability, or\n+- the same transient pattern is recurring often\n+\n+#### If the retry fails again\n+\n+Treat the issue as likely reproducible.\n+\n+Then:\n+\n+- begin deeper investigation\n+- inspect logs, traces, and recent changes\n+- isolate the failing subsystem\n+- consider fixes, mitigations, or escalations\n+\n+The single retry has now served its purpose: it ruled out a one-off blip.\n+\n+## Decision table\n+\n+| Situation | Action |\n+|---|---|\n+| Likely upstream/infrastructure 5xx, safe to retry | Retry once minimally |\n+| Retry succeeds | Record as likely transient; avoid unnecessary changes |\n+| Retry fails similarly | Treat as reproducible; investigate deeply |\n+| Retry unsafe due to side effects | Do not retry blindly; document and investigate safely |\n+| Error clearly caused by deterministic local logic | Skip this workflow |\n+\n+## Recommended phrasing for reports\n+\n+Use concise language that preserves diagnostic value.\n+\n+### Success on retry\n+\n+- \"Observed likely transient upstream failure; one minimal retry succeeded with no changes.\"\n+- \"Issue appears intermittent rather than reproducible.\"\n+- \"No product/code change made because retry cleared the failure.\"\n+\n+### Failure on retry\n+\n+- \"Performed one minimal retry to test intermittency; failure reproduced.\"\n+- \"Issue is not a one-off transient; continuing root-cause investigation.\"\n+\n+### Retry not attempted\n+\n+- \"Likely transient error observed, but retry was not safe 
due to non-idempotent side effects.\"\n+\n+## Guardrails\n+\n+### Do\n+\n+- retry once\n+- retry the smallest possible unit\n+- keep inputs unchanged\n+- record what happened\n+- use the retry result to guide next steps\n+\n+### Do not\n+\n+- keep retrying until success\n+- modify code before confirming reproducibility\n+- rerun a large workflow when a smaller reproduction is available\n+- hide the fact that the first failure occurred\n+- assume success on retry means the system is healthy\n+\n+## Example workflow\n+\n+### Example 1: Upstream 502 during a task\n+\n+1. A command fails with HTTP 502 from a dependency.\n+2. No local code changes explain it.\n+3. Rerun the same command once with the same inputs.\n+4. If it succeeds, record:\n+ - \"Initial 502 appears transient; single retry succeeded.\"\n+5. Continue without code changes.\n+\n+### Example 2: Timeout during CI dependency fetch\n+\n+1. Dependency download times out.\n+2. Retry only the fetch step once.\n+3. If it times out again, record:\n+ - \"Single retry reproduced timeout; investigating network/dependency issue.\"\n+4. 
Move into deeper diagnosis.\n+\n+## Why this works\n+\n+A single minimal retry is a fast diagnostic tool:\n+\n+- low effort\n+- low risk when safe\n+- preserves signal\n+- prevents premature debugging\n+- separates intermittent infrastructure instability from true reproducible defects\n+\n+This is especially useful in automated environments, CI systems, remote execution, and tasks that depend on external services.\n+\n+## Completion checklist\n+\n+Before finishing, confirm:\n+\n+- [ ] The error was plausibly transient\n+- [ ] Retry was safe\n+- [ ] Only one minimal retry was performed\n+- [ ] No code/config changes were made before the retry\n+- [ ] The retry outcome was explicitly documented\n+- [ ] Next actions were chosen based on whether the issue reproduced", + "content_snapshot": { + ".upload_meta.json": "{\n \"origin\": \"captured\",\n \"parent_skill_ids\": [],\n \"change_summary\": \"Captured a workflow for handling likely transient upstream failures by doing one minimal retry first and explicitly recording whether the issue is intermittent or reproducible.\",\n \"created_by\": \"gpt-5.4\",\n \"tags\": []\n}\n", + "SKILL.md": "---\nname: single-retry-transient-failures\ndescription: Perform one minimal retry for likely transient upstream or infrastructure errors before deeper investigation, and record whether the failure is intermittent or reproducible.\n---\n\n# Single Retry for Transient Failures\n\n## Purpose\n\nUse this workflow when a task fails with a likely transient infrastructure problem, such as:\n\n- HTTP 5xx from an upstream service\n- gateway/proxy failures like 502, 503, 504\n- temporary connection resets or timeouts\n- rate-limit edge cases that may clear immediately\n- brief platform instability not obviously caused by local changes\n\nThe goal is to quickly distinguish:\n\n- transient platform instability, vs.\n- a reproducible product or code issue\n\nThis avoids wasting time debugging or changing code before confirming the problem 
persists.\n\n## When to use this\n\nApply this skill when all of the following are true:\n\n1. The failure appears external or infrastructural.\n2. There is no strong evidence yet that a recent code/config change caused it.\n3. Retrying once is low-risk and does not create harmful side effects.\n4. You can re-run the smallest possible failing step.\n\nDo not use this as an excuse for repeated blind retries.\n\n## Core rule\n\nBefore deeper investigation, perform exactly one minimal retry of the failing operation.\n\n- Minimal retry = rerun only the smallest step needed to test whether the issue persists.\n- One retry only = enough to test intermittency without masking a real defect.\n\nAfter the retry, explicitly record the outcome.\n\n## Procedure\n\n### 1) Confirm the error looks transient\n\nTreat the issue as potentially transient if signals include:\n\n- HTTP 502/503/504\n- upstream service unavailable\n- network timeout during dependency call\n- temporary DNS/TLS/proxy/connectivity failure\n- infrastructure-generated error page\n- job failure with no local stack trace pointing to your code\n\nIf the error clearly points to deterministic local logic, skip this skill and debug normally.\n\n### 2) Check retry safety\n\nBefore retrying, verify:\n\n- rerunning will not duplicate destructive side effects\n- rerunning will not create bad state\n- if side effects exist, there is idempotency protection or safe rollback\n- credentials, environment, and inputs remain unchanged\n\nIf retry is unsafe, do not retry blindly. 
Instead document why and investigate more carefully.\n\n### 3) Perform one minimal retry\n\nRetry only the failing step, not the entire workflow unless necessary.\n\nExamples:\n\n- rerun one API request, not the whole deployment\n- rerun one test shard, not the whole test suite\n- rerun one build/download step, not all CI jobs\n- repeat the same command with the same inputs\n\nKeep the retry minimal:\n\n- same inputs\n- same environment\n- no code changes\n- no config changes\n- no workaround added yet\n\nThis preserves the diagnostic value of the retry.\n\n### 4) Record both attempts clearly\n\nCapture:\n\n- original failure timestamp\n- original error summary\n- whether the retry used identical inputs\n- retry result: passed or failed\n- if failed again, whether the error is materially the same\n\nUse short, explicit wording so future follow-up can tell what happened.\n\nExample status notes:\n\n- \"Initial run failed with upstream 502. Performed one identical retry; retry succeeded. Treat as transient platform issue unless it recurs.\"\n- \"Initial run failed with 503. One minimal retry failed with same 503. Issue appears reproducible; continue investigation.\"\n- \"Initial timeout retried once and succeeded. 
No code changes made; likely intermittent dependency instability.\"\n\n### 5) Branch based on the retry result\n\n#### If the retry succeeds\n\nConclude that the failure was likely intermittent.\n\nThen:\n\n- avoid unnecessary code changes\n- note that the issue is not currently reproducible\n- proceed with the task if appropriate\n- flag possible platform instability if relevant\n- monitor for recurrence\n\nDo not over-investigate unless:\n- the task requires high confidence in reliability, or\n- the same transient pattern is recurring often\n\n#### If the retry fails again\n\nTreat the issue as likely reproducible.\n\nThen:\n\n- begin deeper investigation\n- inspect logs, traces, and recent changes\n- isolate the failing subsystem\n- consider fixes, mitigations, or escalations\n\nThe single retry has now served its purpose: it ruled out a one-off blip.\n\n## Decision table\n\n| Situation | Action |\n|---|---|\n| Likely upstream/infrastructure 5xx, safe to retry | Retry once minimally |\n| Retry succeeds | Record as likely transient; avoid unnecessary changes |\n| Retry fails similarly | Treat as reproducible; investigate deeply |\n| Retry unsafe due to side effects | Do not retry blindly; document and investigate safely |\n| Error clearly caused by deterministic local logic | Skip this workflow |\n\n## Recommended phrasing for reports\n\nUse concise language that preserves diagnostic value.\n\n### Success on retry\n\n- \"Observed likely transient upstream failure; one minimal retry succeeded with no changes.\"\n- \"Issue appears intermittent rather than reproducible.\"\n- \"No product/code change made because retry cleared the failure.\"\n\n### Failure on retry\n\n- \"Performed one minimal retry to test intermittency; failure reproduced.\"\n- \"Issue is not a one-off transient; continuing root-cause investigation.\"\n\n### Retry not attempted\n\n- \"Likely transient error observed, but retry was not safe due to non-idempotent side effects.\"\n\n## 
Guardrails\n\n### Do\n\n- retry once\n- retry the smallest possible unit\n- keep inputs unchanged\n- record what happened\n- use the retry result to guide next steps\n\n### Do not\n\n- keep retrying until success\n- modify code before confirming reproducibility\n- rerun a large workflow when a smaller reproduction is available\n- hide the fact that the first failure occurred\n- assume success on retry means the system is healthy\n\n## Example workflow\n\n### Example 1: Upstream 502 during a task\n\n1. A command fails with HTTP 502 from a dependency.\n2. No local code changes explain it.\n3. Rerun the same command once with the same inputs.\n4. If it succeeds, record:\n - \"Initial 502 appears transient; single retry succeeded.\"\n5. Continue without code changes.\n\n### Example 2: Timeout during CI dependency fetch\n\n1. Dependency download times out.\n2. Retry only the fetch step once.\n3. If it times out again, record:\n - \"Single retry reproduced timeout; investigating network/dependency issue.\"\n4. 
Move into deeper diagnosis.\n\n## Why this works\n\nA single minimal retry is a fast diagnostic tool:\n\n- low effort\n- low risk when safe\n- preserves signal\n- prevents premature debugging\n- separates intermittent infrastructure instability from true reproducible defects\n\nThis is especially useful in automated environments, CI systems, remote execution, and tasks that depend on external services.\n\n## Completion checklist\n\nBefore finishing, confirm:\n\n- [ ] The error was plausibly transient\n- [ ] Retry was safe\n- [ ] Only one minimal retry was performed\n- [ ] No code/config changes were made before the retry\n- [ ] The retry outcome was explicitly documented\n- [ ] Next actions were chosen based on whether the issue reproduced" + }, + "created_at": "2026-04-12T02:49:14.950105", + "created_by": "" + }, + "tool_dependencies": [], + "critical_tools": [], + "total_selections": 0, + "total_applied": 0, + "total_completions": 0, + "total_fallbacks": 0, + "recent_analyses": [], + "first_seen": "2026-04-12T02:49:14.950108", + "last_updated": "2026-04-12T02:49:14.950108" + }, + { + "skill_id": "verify-before-confirming__v1_acf0d1bc", + "name": "verify-before-confirming", + "description": "Require a minimal evidence check before asserting that a status or success condition is true.", + "path": "/Users/admin/PycharmProjects/openspace/openspace/skills/verify-before-confirming/SKILL.md", + "is_active": true, + "category": "workflow", + "tags": [], + "visibility": "private", + "creator_id": "", + "lineage": { + "origin": "imported", + "generation": 0, + "parent_skill_ids": [], + "source_task_id": null, + "change_summary": "", + "content_diff": "--- a/.upload_meta.json\n+++ b/.upload_meta.json\n@@ -0,0 +1,9 @@\n+{\n+ \"origin\": \"fixed\",\n+ \"parent_skill_ids\": [\n+ \"verify-before-confirming__v0_109e009a\"\n+ ],\n+ \"change_summary\": \"Added claim-matching verification guidance so confirmations must validate the exact named tool or report the claim as 
unverified.\",\n+ \"created_by\": \"openspace\",\n+ \"tags\": []\n+}\n\n--- a/SKILL.md\n+++ b/SKILL.md\n@@ -0,0 +1,126 @@\n+---\n+name: verify-before-confirming\n+description: Require a minimal evidence check before asserting that a status or success condition is true.\n+---\n+\n+# Verify Before Confirming\n+\n+Use this workflow when a task asks for status confirmation, success confirmation, or a short “is it working?” answer. Do not assert success based only on expectation, prior context, or a single unverified statement. First perform at least one lightweight verification step, then answer briefly with the verified result.\n+\n+## When to Use\n+\n+Apply this skill when the user asks you to confirm whether something is:\n+\n+- working\n+- complete\n+- running\n+- connected\n+- deployed\n+- fixed\n+- available\n+- healthy\n+- successful\n+\n+Examples:\n+\n+- “Confirm the relay is working.”\n+- “Is the service up?”\n+- “Check whether the job completed.”\n+- “Verify the recording was saved.”\n+- “Make sure the integration is active.”\n+\n+## Workflow\n+\n+1. Identify the claim being confirmed.\n+ - Example: “The relay is working.”\n+ - Example: “The build completed successfully.”\n+ - Example: “The recording exists.”\n+\n+2. Choose the smallest available verification method.\n+ Prefer the least invasive check that can provide real evidence, such as:\n+ - reading recent logs\n+ - checking a status endpoint\n+ - listing expected output files\n+ - inspecting a recording/artifact\n+ - running a diagnostic command\n+ - checking a process, job, or service status\n+ - verifying timestamps, exit codes, or health checks\n+ The verification must match the exact claim being confirmed. 
Do not substitute a nearby or weaker check that only shows general availability, routing, or partial system access.\n+\n+ - If the claim names a specific tool, service, artifact, integration, or endpoint, verify that named thing directly when possible.\n+ - Do not treat generic introspection, provider discovery, or session visibility as proof that a specific tool invocation works unless the claim is specifically about introspection or discovery.\n+ - For tool-availability or connectivity claims, prefer directly invoking or inspecting the named tool itself with a safe read-only or no-op check when available.\n+ - If the specific thing cannot be inspected or invoked, say the claim remains unverified rather than inferring success from related signals.\n+\n+3. Run or inspect the verification source before responding.\n+ Do not skip this step unless no verification source or tool access exists.\n+\n+4. Interpret the evidence conservatively.\n+ - If the check passes, say what was verified.\n+ - If the check is inconclusive, say it is inconclusive and mention what was checked.\n+ - If the check fails, state the failure rather than asserting success.\n+ - If the evidence only confirms a related capability but not the exact claim, explicitly say that the exact claim was not verified.\n+\n+5. 
Final answer: one short sentence when the user requested only confirmation.\n+ Include the verified evidence in compressed form.\n+\n+## Response Patterns\n+\n+Verified success:\n+\n+“Verified: the service health check returned OK.”\n+\n+“Confirmed: the latest log shows the relay connected successfully.”\n+\n+“Verified: the expected recording file exists and was updated today.”\n+\n+Inconclusive:\n+\n+“I couldn’t fully confirm it; the process is running, but no recent success log was available.”\n+\n+“Status is inconclusive: the endpoint responded, but the expected artifact was not present.”\n+\n+Failure:\n+\n+“Not confirmed: the diagnostic command failed with a connection timeout.”\n+\n+“Not working yet: the latest log shows authentication errors.”\n+\n+## Rules\n+\n+- Do not mark a status-confirmation task complete without checking at least one available evidence source.\n+- The evidence source must be relevant to the exact claim, not just to a neighboring capability or tool path.\n+- Do not claim success merely because no error is visible.\n+- Do not treat tool listing, backend/provider discovery, or session metadata as confirmation that a specific named tool executes successfully unless that exact execution claim was tested.\n+- When asked whether a particular tool is available, connected, or working, validate that specific tool directly when possible; otherwise report that the claim is unverified.\n+- Do not over-verify if the task only needs a simple confirmation; one relevant check is usually enough.\n+- Keep the final answer concise, especially when the user asked for a simple status confirmation.\n+- If tools are unavailable, say so explicitly and avoid pretending verification occurred.\n+\n+## Minimal Verification Examples\n+\n+For a service:\n+\n+1. Run a health/status command or check recent logs.\n+2. Reply: “Verified: the health check reports the service is running.”\n+\n+For a file or recording:\n+\n+1. 
List or inspect the expected artifact path.\n+2. Reply: “Verified: the recording file exists and has a recent timestamp.”\n+\n+For a relay, integration, or connection:\n+\n+1. Check logs, events, or a diagnostic command showing recent activity.\n+2. Reply: “Confirmed: recent logs show the relay connected and forwarded traffic.”\n+\n+For a background job:\n+\n+1. Check job status, exit code, or output artifact.\n+2. Reply: “Verified: the job completed successfully with exit code 0.”\n+\n+For a named tool or tool-connectivity claim:\n+\n+1. Verify the specifically named tool directly, ideally with a safe invocation or direct inspection of that tool's availability/status.\n+2. Do not rely only on provider listing, session lookup, or presence of other tools as a proxy.\n+3. Reply: “Verified: the named tool responded to a read-only check.” or “Unverified: I could inspect the provider, but I could not directly confirm the named tool.”\n", + "content_snapshot": { + ".upload_meta.json": "{\n \"origin\": \"fixed\",\n \"parent_skill_ids\": [\n \"verify-before-confirming__v0_109e009a\"\n ],\n \"change_summary\": \"Added claim-matching verification guidance so confirmations must validate the exact named tool or report the claim as unverified.\",\n \"created_by\": \"openspace\",\n \"tags\": []\n}\n", + "SKILL.md": "---\nname: verify-before-confirming\ndescription: Require a minimal evidence check before asserting that a status or success condition is true.\n---\n\n# Verify Before Confirming\n\nUse this workflow when a task asks for status confirmation, success confirmation, or a short “is it working?” answer. Do not assert success based only on expectation, prior context, or a single unverified statement. 
First perform at least one lightweight verification step, then answer briefly with the verified result.\n\n## When to Use\n\nApply this skill when the user asks you to confirm whether something is:\n\n- working\n- complete\n- running\n- connected\n- deployed\n- fixed\n- available\n- healthy\n- successful\n\nExamples:\n\n- “Confirm the relay is working.”\n- “Is the service up?”\n- “Check whether the job completed.”\n- “Verify the recording was saved.”\n- “Make sure the integration is active.”\n\n## Workflow\n\n1. Identify the claim being confirmed.\n - Example: “The relay is working.”\n - Example: “The build completed successfully.”\n - Example: “The recording exists.”\n\n2. Choose the smallest available verification method.\n Prefer the least invasive check that can provide real evidence, such as:\n - reading recent logs\n - checking a status endpoint\n - listing expected output files\n - inspecting a recording/artifact\n - running a diagnostic command\n - checking a process, job, or service status\n - verifying timestamps, exit codes, or health checks\n The verification must match the exact claim being confirmed. Do not substitute a nearby or weaker check that only shows general availability, routing, or partial system access.\n\n - If the claim names a specific tool, service, artifact, integration, or endpoint, verify that named thing directly when possible.\n - Do not treat generic introspection, provider discovery, or session visibility as proof that a specific tool invocation works unless the claim is specifically about introspection or discovery.\n - For tool-availability or connectivity claims, prefer directly invoking or inspecting the named tool itself with a safe read-only or no-op check when available.\n - If the specific thing cannot be inspected or invoked, say the claim remains unverified rather than inferring success from related signals.\n\n3. 
Run or inspect the verification source before responding.\n Do not skip this step unless no verification source or tool access exists.\n\n4. Interpret the evidence conservatively.\n - If the check passes, say what was verified.\n - If the check is inconclusive, say it is inconclusive and mention what was checked.\n - If the check fails, state the failure rather than asserting success.\n - If the evidence only confirms a related capability but not the exact claim, explicitly say that the exact claim was not verified.\n\n5. Final answer: one short sentence when the user requested only confirmation.\n Include the verified evidence in compressed form.\n\n## Response Patterns\n\nVerified success:\n\n“Verified: the service health check returned OK.”\n\n“Confirmed: the latest log shows the relay connected successfully.”\n\n“Verified: the expected recording file exists and was updated today.”\n\nInconclusive:\n\n“I couldn’t fully confirm it; the process is running, but no recent success log was available.”\n\n“Status is inconclusive: the endpoint responded, but the expected artifact was not present.”\n\nFailure:\n\n“Not confirmed: the diagnostic command failed with a connection timeout.”\n\n“Not working yet: the latest log shows authentication errors.”\n\n## Rules\n\n- Do not mark a status-confirmation task complete without checking at least one available evidence source.\n- The evidence source must be relevant to the exact claim, not just to a neighboring capability or tool path.\n- Do not claim success merely because no error is visible.\n- Do not treat tool listing, backend/provider discovery, or session metadata as confirmation that a specific named tool executes successfully unless that exact execution claim was tested.\n- When asked whether a particular tool is available, connected, or working, validate that specific tool directly when possible; otherwise report that the claim is unverified.\n- Do not over-verify if the task only needs a simple confirmation; one 
relevant check is usually enough.\n- Keep the final answer concise, especially when the user asked for a simple status confirmation.\n- If tools are unavailable, say so explicitly and avoid pretending verification occurred.\n\n## Minimal Verification Examples\n\nFor a service:\n\n1. Run a health/status command or check recent logs.\n2. Reply: “Verified: the health check reports the service is running.”\n\nFor a file or recording:\n\n1. List or inspect the expected artifact path.\n2. Reply: “Verified: the recording file exists and has a recent timestamp.”\n\nFor a relay, integration, or connection:\n\n1. Check logs, events, or a diagnostic command showing recent activity.\n2. Reply: “Confirmed: recent logs show the relay connected and forwarded traffic.”\n\nFor a background job:\n\n1. Check job status, exit code, or output artifact.\n2. Reply: “Verified: the job completed successfully with exit code 0.”\n\nFor a named tool or tool-connectivity claim:\n\n1. Verify the specifically named tool directly, ideally with a safe invocation or direct inspection of that tool's availability/status.\n2. Do not rely only on provider listing, session lookup, or presence of other tools as a proxy.\n3. 
Reply: “Verified: the named tool responded to a read-only check.” or “Unverified: I could inspect the provider, but I could not directly confirm the named tool.”\n" + }, + "created_at": "2026-04-12T02:49:14.950636", + "created_by": "" + }, + "tool_dependencies": [], + "critical_tools": [], + "total_selections": 0, + "total_applied": 0, + "total_completions": 0, + "total_fallbacks": 0, + "recent_analyses": [], + "first_seen": "2026-04-12T02:49:14.950638", + "last_updated": "2026-04-12T02:49:14.950638" + }, + { + "skill_id": "clinical-soap-note-from-narrative-enhanced__v0_9bf80fad", + "name": "clinical-soap-note-from-narrative-enhanced", + "description": "Convert a narrative clinical encounter into a structured SOAP note by separating subjective and objective data, generating a problem-focused assessment, and adding an appropriate safety-conscious plan when red flags or neurologic abnormalities are present.", + "path": "/Users/admin/PycharmProjects/openspace/openspace/skills/clinical-soap-note-from-narrative-enhanced/SKILL.md", + "is_active": true, + "category": "workflow", + "tags": [], + "visibility": "private", + "creator_id": "", + "lineage": { + "origin": "derived", + "generation": 1, + "parent_skill_ids": [ + "clinical-soap-note-from-narrative__v0_9ff620af" + ], + "source_task_id": "0112fc9b-c3b2-4084-8993-5a4abb1f54f1_phase1", + "change_summary": "Rewrote the skill to add explicit decision rules for adolescent head injury/concussion cases, distinguishing strictly extractive SOAP generation from limited standard safety counseling when trauma red flags justify anticipatory guidance.", + "content_diff": "--- a/SKILL.md\n+++ b/SKILL.md\n@@ -1,132 +1,255 @@\n ---\n-name: clinical-soap-note-from-narrative\n-description: Convert a narrative clinical encounter into a structured SOAP note by separating subjective and objective data, generating a problem-focused assessment, and adding an appropriate safety-conscious plan when red flags or neurologic abnormalities are 
present.\n+name: concussion-soap-safety\n+description: Convert clinical narratives into structured SOAP notes while preserving strict source fidelity, with explicit rules for adolescent head injury/concussion cases on when standard safety counseling may be added versus when the note must remain purely extractive.\n ---\n \n-# Clinical SOAP Note From Narrative\n-\n-Use this skill when you are given a free-text clinical encounter, visit summary, or narrative case description and need to produce a structured SOAP note.\n-\n-This workflow is designed to:\n+# Concussion-Sensitive SOAP Note From Narrative\n+\n+Use this skill when you are given a free-text clinical encounter, visit summary, urgent care note, sports injury narrative, or case description and need to produce a structured SOAP note.\n+\n+This skill is designed to:\n - preserve the distinction between patient-reported history and clinician-observed findings\n - avoid inventing exam findings, diagnostics, or treatment details\n - produce a concise, problem-focused assessment\n-- add an appropriately cautious plan when red-flag symptoms or abnormal neurologic findings are described\n+- escalate the plan when red flags, trauma warning signs, or abnormal neurologic findings are present\n+- explicitly handle adolescent head injury/concussion encounters without violating the non-fabrication rule\n+- distinguish when limited standard safety counseling is appropriate to add versus when the note should remain strictly extractive\n+\n+## Scope\n+\n+This skill works for general outpatient, urgent care, sports medicine, and episodic clinical narratives. It is especially useful for:\n+- headaches and neurologic complaints\n+- musculoskeletal pain with neurologic symptoms\n+- post-traumatic symptom narratives\n+- adolescent head injury/concussion encounters\n+- cases where the plan must reflect safety risk without inventing unsupported management details\n \n ## Core principles\n \n 1. 
Do not add facts that were not provided.\n 2. Keep subjective and objective information separated.\n 3. Preserve uncertainty when the source is uncertain.\n-4. Make the assessment clinically coherent but problem-focused.\n-5. Escalate the plan when the narrative includes red flags, worsening symptoms, or abnormal neurologic findings.\n+4. Make the assessment clinically coherent but tightly grounded in the source.\n+5. Escalate the plan when the narrative includes red flags, worsening symptoms, trauma warning signs, or abnormal neurologic findings.\n 6. If the source lacks information, omit it or mark it as not provided rather than fabricating content.\n+7. In concussion/head injury cases, distinguish between:\n+ - extractive content taken directly from the source\n+ - standard safety counseling that may be added only in narrowly defined situations\n+8. Never add detailed treatment regimens, return-to-play stages, school accommodations, medication advice, driving restrictions, supervision arrangements, or disposition details unless the source states them or the narrative clearly requires immediate emergency escalation.\n+\n+## Decision mode: extractive vs limited safety augmentation\n+\n+Before writing the note, determine which mode applies.\n+\n+### Mode 1: Strictly extractive\n+Use this by default.\n+\n+Choose strictly extractive when:\n+- the case is routine and stable\n+- there are no red flags or abnormal neurologic findings\n+- the narrative does not involve head trauma/concussion\n+- the narrative involves minor head injury symptoms but no concerning features and no clinician-implied need for escalation\n+- adding counseling would go beyond what is needed to safely reflect the source\n+\n+In this mode:\n+- the SOAP note should only restate or reorganize provided information\n+- do not add standard concussion counseling just because it is commonly expected\n+- do not add “brain rest,” “return-to-play,” “no sports,” “no screens,” “inform parent/guardian,” or 
similar advice unless stated in the source\n+\n+### Mode 2: Limited safety augmentation\n+Use this only when the narrative includes head injury/concussion features plus safety concern.\n+\n+Choose limited safety augmentation when:\n+- the encounter involves head trauma or suspected concussion, especially in an adolescent or sports context\n+AND\n+- the narrative contains red flags, worsening symptoms, abnormal neurologic findings, concerning progression, or clinician-documented concern that warrants urgent evaluation or explicit danger counseling\n+\n+Examples of triggers:\n+- worsening headache after head injury\n+- vomiting, seizure, confusion, somnolence, or altered mental status\n+- focal neurologic deficit\n+- abnormal coordination, gait, balance, or mental status findings\n+- loss of consciousness if described as concerning or accompanied by persistent symptoms\n+- severe mechanism plus ongoing symptoms\n+- deteriorating course after injury\n+- clinician recommendation for ED transfer, urgent imaging, or same-day escalation\n+\n+In this mode, you may add only:\n+- a concise recommendation for urgent or emergency evaluation when clearly warranted by the narrative\n+- explicit return/emergency precautions for neurologic deterioration or worsening head injury symptoms\n+\n+Do not add broader concussion management advice unless the source already includes it.\n+\n+## What standard counseling may be added, and what may not\n+\n+### Permitted added counseling in limited safety augmentation mode\n+Only when trauma/concussion red flags or abnormal neurologic findings are present, you may add concise safety language such as:\n+- “Given the presence of concerning post-traumatic symptoms/abnormal neurologic findings, urgent emergency department evaluation is recommended.”\n+- “Advise immediate care for worsening headache, repeated vomiting, confusion, seizure, weakness, numbness, vision change, speech difficulty, gait instability, increasing drowsiness, or other new 
neurologic symptoms.”\n+\n+This is allowed because it reflects high-level safety escalation rather than inventing detailed management.\n+\n+### Not permitted unless explicitly supported by the source\n+Do not add:\n+- cognitive rest instructions\n+- school absence or academic accommodations\n+- return-to-play or sports restriction timelines\n+- no driving or no screen-time instructions\n+- parent/guardian notification instructions\n+- observation frequency at home\n+- medication recommendations or dosing\n+- imaging orders not stated or clearly implied by an urgent transfer recommendation\n+- specialist referrals not stated\n+- definitive concussion diagnosis if the source does not support it\n \n ## Step-by-step workflow\n \n ### 1) Read the encounter and extract source facts\n \n-Identify and group all explicit facts from the narrative into:\n+Identify and group all explicit facts into:\n - symptoms and history\n - timing and progression\n+- mechanism of injury, if any\n - relevant negatives\n - exam findings\n+- neurologic findings\n - prior treatments or self-care\n - clinician impression or likely diagnosis if stated\n-- follow-up or return precautions if stated\n-\n-Before drafting, mentally separate:\n+- follow-up or precautions if stated\n+\n+Mentally separate:\n - Subjective = what the patient reports, describes, denies, or experiences\n-- Objective = observed exam findings, measured values, test results, and clinician-documented physical findings\n-\n-### 2) Build the Subjective section\n+- Objective = observed exam findings, measured values, test results, and clinician-documented findings\n+\n+### 2) Screen for safety triggers\n+\n+Check for any of the following:\n+- focal neurologic deficit\n+- progressive neurologic symptoms\n+- new unilateral weakness or numbness\n+- severe or worsening headache\n+- head trauma with ongoing or worsening symptoms\n+- abnormal coordination, balance, gait, or mental status\n+- syncope, seizure, confusion, or altered 
mental status\n+- repeated vomiting\n+- bowel/bladder dysfunction with spine symptoms\n+- rapidly worsening pain plus neurologic change\n+- other explicitly concerning clinician findings\n+\n+If present, the assessment must reflect concern and the plan must be escalated.\n+\n+### 3) Identify whether this is a head injury/concussion-type encounter\n+\n+Look for:\n+- blow to head, fall, collision, sports injury, assault, or other head trauma\n+- post-traumatic headache, dizziness, nausea, photophobia, concentration difficulty, imbalance, or confusion\n+- explicit mention of concussion concern\n+- adolescent/school/sports context suggesting a youth concussion presentation\n+\n+Then decide:\n+- no head injury pattern -> use general SOAP workflow\n+- head injury pattern without red flags -> stay strictly extractive unless the source itself includes counseling\n+- head injury pattern with red flags/abnormal neuro findings -> use limited safety augmentation\n+\n+### 4) Build the Subjective section\n \n Include only history elements supported by the narrative:\n - chief concern\n - onset, duration, course\n+- mechanism of injury if reported\n - associated symptoms\n - aggravating/alleviating factors\n - pertinent negatives\n - relevant prior history or attempted treatments, if provided\n \n-Keep it concise and clinical. 
Rewrite into standard medical style without changing meaning.\n-\n-Example transformation:\n-- Narrative: \"He says the headache started 3 days ago, is getting worse, and now he has tingling in his left arm.\"\n-- Subjective: \"Reports 3 days of worsening headache with associated left arm tingling.\"\n-\n-### 3) Build the Objective section\n+Keep it concise and clinical.\n+\n+Examples:\n+- “Reports headache since striking head during basketball yesterday, with associated dizziness and nausea.”\n+- “Reports 3 days of worsening headache with associated left arm tingling.”\n+\n+Do not move patient-reported symptoms into Objective.\n+\n+### 5) Build the Objective section\n \n Include only documented objective data:\n - vital signs, if given\n - physical exam findings\n - neurologic findings\n-- imaging/lab results, if given\n+- test results, if given\n \n Important:\n-- Do not infer a normal exam if none was described.\n-- Do not add routine headings with normal findings unless explicitly provided.\n-- If the exam is sparse, keep the Objective sparse.\n+- do not infer a normal exam if none was described\n+- do not add routine normal headings unless explicitly provided\n+- if the exam is sparse, keep the Objective sparse\n \n Good:\n-- \"Neuro: Decreased sensation in left upper extremity.\"\n+- “Neuro: Mild difficulty with tandem gait.”\n+- “Coordination exam notable for slight imbalance.”\n+- “Decreased sensation in left upper extremity.”\n+\n Avoid:\n-- \"CN II-XII intact\" unless the narrative explicitly states this.\n-\n-### 4) Write a problem-focused Assessment\n-\n-Synthesize the likely clinical issue using only the available facts.\n+- “PERRLA, EOMI, CN II-XII intact” unless explicitly documented\n+\n+### 6) Write a focused Assessment\n \n The assessment should:\n-- summarize the main problem in 1-3 sentences\n-- mention key supporting features\n-- note important uncertainty when present\n+- summarize the main clinical issue in 1-3 sentences\n+- cite the key 
supporting features\n+- preserve uncertainty when appropriate\n - avoid overcommitting to a diagnosis not established in the source\n \n-Preferred style:\n-- \"Presentation is concerning for...\"\n-- \"Symptoms are most consistent with...\"\n-- \"Given X and Y, cannot exclude...\"\n-\n-If the narrative includes neurologic deficits, red-flag symptoms, or concerning progression, the assessment should explicitly reflect concern rather than minimizing it.\n+Preferred language:\n+- “Symptoms are most consistent with...”\n+- “Presentation is concerning for...”\n+- “Given X and Y, cannot exclude...”\n+\n+For head trauma cases:\n+- if symptoms suggest concussion but the source is not definitive, say “post-traumatic symptoms concerning for concussion” rather than stating a confirmed diagnosis\n+- if abnormal neurologic findings are present, explicitly note that benign/self-limited injury cannot be assumed from the available information\n \n Example:\n-- \"Worsening headache with unilateral sensory symptoms is concerning for a neurologic process and warrants urgent evaluation; benign primary headache cannot be assumed from the available information.\"\n-\n-### 5) Write the Plan with safety escalation when warranted\n+- “Post-traumatic headache with documented coordination abnormality is concerning for concussion or other acute neurologic process after head injury and warrants urgent evaluation; uncomplicated minor head injury cannot be assumed from the available information.”\n+\n+### 7) Write the Plan\n \n Base the plan on what is provided, but structure it clearly.\n \n-Possible plan components:\n+Possible components:\n - symptomatic treatment already recommended or used\n - monitoring or follow-up\n-- diagnostic evaluation if stated or clearly warranted by the narrative\n-- return precautions / emergency precautions\n-\n-When red flags are present, include a safety-conscious plan such as:\n-- urgent ED evaluation\n-- same-day in-person neurologic assessment\n-- immediate 
imaging/workup if the narrative supports that urgency\n-- explicit return precautions for worsening weakness, numbness, severe headache, speech changes, vision changes, gait instability, altered mental status, bowel/bladder dysfunction, fever, or other relevant warning signs\n-\n-Do not understate urgency if the input includes:\n-- focal neurologic deficits\n-- new weakness or numbness\n-- severe or worsening headache with neuro symptoms\n-- altered mental status\n-- seizure\n-- saddle anesthesia or bowel/bladder changes\n-- rapidly progressive symptoms\n-- other clearly dangerous features\n-\n-### 6) Final quality check\n+- diagnostic evaluation if stated\n+- return precautions or emergency precautions\n+\n+#### For routine non-concerning cases\n+- remain extractive\n+- preserve stated follow-up or conservative care\n+- do not embellish the plan\n+\n+#### For red-flag or abnormal neurologic cases\n+- explicitly escalate urgency\n+- include urgent ED evaluation or prompt in-person neurologic assessment when supported by the narrative\n+- include clear return precautions for worsening neurologic or trauma symptoms\n+\n+#### For adolescent head injury/concussion cases\n+Apply the mode rules:\n+\n+If stable and non-red-flag:\n+- remain extractive unless the source already includes concussion counseling\n+\n+If red-flag or abnormal neurologic:\n+- you may add brief standard emergency precautions\n+- do not add broad concussion management instructions beyond those precautions\n+\n+### 8) Final quality check\n \n Before finalizing, verify:\n-- no invented vitals, exam findings, or test results\n+- no invented vitals, exam findings, tests, or treatments\n - subjective and objective are not mixed\n-- assessment reflects the actual level of concern\n-- plan contains clear escalation when red flags are present\n-- note is concise and clinically readable\n+- the assessment matches the actual level of concern\n+- the plan is not falsely detailed\n+- if safety counseling was 
added, it is limited to high-level emergency/return precautions justified by red flags or abnormal neurologic findings\n+- no unsupported concussion-specific restrictions or school/sports instructions were inserted\n \n ## SOAP note template\n \n@@ -151,27 +274,58 @@\n - progressive neurologic symptoms\n - new unilateral weakness or numbness\n - severe or worsening headache with neurologic symptoms\n+- post-traumatic neurologic abnormality\n - syncope, seizure, confusion, or altered mental status\n+- repeated vomiting after head injury\n - bowel/bladder dysfunction with spine symptoms\n - rapidly worsening pain plus neurologic changes\n - other explicitly concerning clinician findings\n \n-Suggested plan language:\n-- \"Given the presence of red-flag symptoms/abnormal neurologic findings, urgent emergency department evaluation is recommended.\"\n-- \"Patient was advised to seek immediate care for any worsening weakness, numbness, severe headache, vision change, speech difficulty, gait instability, or other new neurologic symptoms.\"\n-\n-## Style guidance\n-\n-- Be concise.\n-- Use neutral clinical language.\n-- Prefer \"reports,\" \"endorses,\" \"denies,\" \"exam notable for,\" \"concerning for.\"\n-- If details are missing, do not compensate by guessing.\n-- If the source already includes a likely diagnosis or plan, preserve it faithfully in cleaner SOAP format.\n-\n-## Example\n-\n+Suggested escalated plan language:\n+- “Given the presence of red-flag symptoms/abnormal neurologic findings, urgent emergency department evaluation is recommended.”\n+- “Patient was advised to seek immediate care for worsening weakness, numbness, severe headache, repeated vomiting, vision change, speech difficulty, gait instability, confusion, seizure, increasing drowsiness, or other new neurologic symptoms.”\n+\n+## Adolescent head injury/concussion guidance\n+\n+Use this section when the narrative suggests a youth concussion-type encounter.\n+\n+### If the narrative is mild and 
uncomplicated\n+Examples:\n+- headache after a sports collision\n+- dizziness after a minor head bump\n+- possible concussion discussed, but no red flags or objective neurologic deficits\n+\n+Then:\n+- keep the note extractive\n+- preserve any provided diagnosis, plan, and counseling\n+- do not independently add standard concussion handouts or anticipatory guidance\n+\n+### If the narrative includes danger features\n+Examples:\n+- worsening headache after injury\n+- abnormal balance or coordination\n+- confusion or unusual behavior\n+- vomiting\n+- focal weakness/numbness\n+- significant exam abnormality\n+- clinician concern requiring urgent escalation\n+\n+Then:\n+- make the assessment explicitly cautious\n+- include urgent evaluation/emergency precautions\n+- keep any added counseling narrow, safety-oriented, and high level\n+\n+### Practical rule\n+Ask:\n+1. Am I adding a fact? If yes, do not add it.\n+2. Am I adding a detailed management instruction not stated in the source? If yes, do not add it.\n+3. Am I adding only generic emergency precautions because the narrative shows red flags or abnormal neurologic findings? If yes, that is acceptable.\n+\n+## Examples\n+\n+### Example 1: Routine non-traumatic case\n Input narrative:\n-\"Patient reports low back pain for 1 week after lifting a box. Over the last 2 days, pain radiates down the right leg with numbness in the right foot. Exam shows decreased sensation over the dorsum of the right foot and positive straight-leg raise on the right. No bowel or bladder symptoms mentioned.\"\n+“Patient reports low back pain for 1 week after lifting a box. Over the last 2 days, pain radiates down the right leg with numbness in the right foot. Exam shows decreased sensation over the dorsum of the right foot and positive straight-leg raise on the right. 
No bowel or bladder symptoms mentioned.”\n \n Output:\n Subjective:\n@@ -188,20 +342,80 @@\n - Recommend prompt clinical follow-up/in-person evaluation.\n - Advise immediate emergency evaluation for new or worsening weakness, progressive numbness, saddle anesthesia, bowel or bladder dysfunction, fever, or uncontrolled pain.\n \n+### Example 2: Adolescent head injury with red flags\n+Input narrative:\n+“Teen hit his head during soccer 2 days ago. Since then he has had worsening headache and nausea. On exam he is alert but has mild difficulty with tandem gait. Provider is concerned and recommends further evaluation.”\n+\n+Output:\n+Subjective:\n+- Reports head injury during soccer 2 days ago followed by worsening headache and nausea.\n+\n+Objective:\n+- Alert on exam.\n+- Mild difficulty with tandem gait.\n+\n+Assessment:\n+- Post-traumatic headache and nausea with documented gait abnormality after head injury are concerning for concussion or another acute neurologic process. Given the abnormal neurologic finding and worsening symptoms, uncomplicated minor head injury cannot be assumed.\n+\n+Plan:\n+- Recommend urgent further evaluation.\n+- Advise immediate emergency care for worsening headache, repeated vomiting, confusion, seizure, weakness, numbness, vision change, speech difficulty, gait worsening, increasing drowsiness, or other new neurologic symptoms.\n+\n+Note why this is acceptable:\n+- The urgent evaluation is supported by the source.\n+- The added precautions are narrow safety counseling tied to red flags.\n+- The note does not add cognitive rest, return-to-play, school restrictions, or family supervision instructions.\n+\n+### Example 3: Adolescent head injury without red flags\n+Input narrative:\n+“15-year-old struck head lightly on cabinet yesterday. Today has mild headache. Neuro exam normal. 
Clinician suspects mild concussion and recommends rest and pediatric follow-up.”\n+\n+Output:\n+Subjective:\n+- Reports mild headache beginning after striking head on a cabinet yesterday.\n+\n+Objective:\n+- Neurologic exam normal.\n+\n+Assessment:\n+- Mild post-traumatic headache; clinician suspects mild concussion.\n+\n+Plan:\n+- Rest and pediatric follow-up as recommended in the source.\n+\n+Note why this is correct:\n+- The plan remains extractive.\n+- No extra return-to-play, school, driving, or screen-time advice is added unless present in the source.\n+\n ## Common mistakes to avoid\n \n - Moving patient-reported symptoms into Objective\n - Adding normal findings that were never stated\n-- Writing an assessment that ignores documented red flags\n-- Producing a routine conservative plan despite focal neurologic deficits\n+- Ignoring documented red flags or neurologic abnormalities\n+- Producing a routine conservative plan despite dangerous features\n - Inventing medications, dosages, imaging, or follow-up intervals\n-- Overly broad differentials that distract from the main problem\n+- Adding standard concussion counseling in every adolescent head injury case\n+- Adding return-to-play, school, parent/guardian, or cognitive-rest instructions not supported by the source\n+- Writing a definitive concussion diagnosis when the source only supports concern or suspicion\n \n ## Minimal decision rule\n \n If the case is routine and non-concerning:\n - produce a standard SOAP note faithful to the narrative\n \n-If the case includes red flags or abnormal neurologic findings:\n+If the case includes red flags, post-traumatic warning signs, or abnormal neurologic findings:\n - make the assessment explicitly cautious\n-- make the plan explicitly safety-oriented and escalated+- make the plan explicitly safety-oriented and escalated\n+\n+If the case is an adolescent head injury/concussion encounter:\n+- remain strictly extractive unless the narrative shows clear safety 
concern\n+- only add narrow emergency precautions when red flags or abnormal neurologic findings justify them\n+\n+## Output expectations\n+\n+The final note should be:\n+- concise\n+- clinically readable\n+- faithful to the source\n+- appropriately cautious when danger signs are present\n+- restrained about adding counseling unless the limited safety augmentation rule is met\n", + "content_snapshot": { + "SKILL.md": "---\nname: concussion-soap-safety\ndescription: Convert clinical narratives into structured SOAP notes while preserving strict source fidelity, with explicit rules for adolescent head injury/concussion cases on when standard safety counseling may be added versus when the note must remain purely extractive.\n---\n\n# Concussion-Sensitive SOAP Note From Narrative\n\nUse this skill when you are given a free-text clinical encounter, visit summary, urgent care note, sports injury narrative, or case description and need to produce a structured SOAP note.\n\nThis skill is designed to:\n- preserve the distinction between patient-reported history and clinician-observed findings\n- avoid inventing exam findings, diagnostics, or treatment details\n- produce a concise, problem-focused assessment\n- escalate the plan when red flags, trauma warning signs, or abnormal neurologic findings are present\n- explicitly handle adolescent head injury/concussion encounters without violating the non-fabrication rule\n- distinguish when limited standard safety counseling is appropriate to add versus when the note should remain strictly extractive\n\n## Scope\n\nThis skill works for general outpatient, urgent care, sports medicine, and episodic clinical narratives. 
It is especially useful for:\n- headaches and neurologic complaints\n- musculoskeletal pain with neurologic symptoms\n- post-traumatic symptom narratives\n- adolescent head injury/concussion encounters\n- cases where the plan must reflect safety risk without inventing unsupported management details\n\n## Core principles\n\n1. Do not add facts that were not provided.\n2. Keep subjective and objective information separated.\n3. Preserve uncertainty when the source is uncertain.\n4. Make the assessment clinically coherent but tightly grounded in the source.\n5. Escalate the plan when the narrative includes red flags, worsening symptoms, trauma warning signs, or abnormal neurologic findings.\n6. If the source lacks information, omit it or mark it as not provided rather than fabricating content.\n7. In concussion/head injury cases, distinguish between:\n - extractive content taken directly from the source\n - standard safety counseling that may be added only in narrowly defined situations\n8. 
Never add detailed treatment regimens, return-to-play stages, school accommodations, medication advice, driving restrictions, supervision arrangements, or disposition details unless the source states them or the narrative clearly requires immediate emergency escalation.\n\n## Decision mode: extractive vs limited safety augmentation\n\nBefore writing the note, determine which mode applies.\n\n### Mode 1: Strictly extractive\nUse this by default.\n\nChoose strictly extractive when:\n- the case is routine and stable\n- there are no red flags or abnormal neurologic findings\n- the narrative does not involve head trauma/concussion\n- the narrative involves minor head injury symptoms but no concerning features and no clinician-implied need for escalation\n- adding counseling would go beyond what is needed to safely reflect the source\n\nIn this mode:\n- the SOAP note should only restate or reorganize provided information\n- do not add standard concussion counseling just because it is commonly expected\n- do not add “brain rest,” “return-to-play,” “no sports,” “no screens,” “inform parent/guardian,” or similar advice unless stated in the source\n\n### Mode 2: Limited safety augmentation\nUse this only when the narrative includes head injury/concussion features plus safety concern.\n\nChoose limited safety augmentation when:\n- the encounter involves head trauma or suspected concussion, especially in an adolescent or sports context\nAND\n- the narrative contains red flags, worsening symptoms, abnormal neurologic findings, concerning progression, or clinician-documented concern that warrants urgent evaluation or explicit danger counseling\n\nExamples of triggers:\n- worsening headache after head injury\n- vomiting, seizure, confusion, somnolence, or altered mental status\n- focal neurologic deficit\n- abnormal coordination, gait, balance, or mental status findings\n- loss of consciousness if described as concerning or accompanied by persistent symptoms\n- severe mechanism 
plus ongoing symptoms\n- deteriorating course after injury\n- clinician recommendation for ED transfer, urgent imaging, or same-day escalation\n\nIn this mode, you may add only:\n- a concise recommendation for urgent or emergency evaluation when clearly warranted by the narrative\n- explicit return/emergency precautions for neurologic deterioration or worsening head injury symptoms\n\nDo not add broader concussion management advice unless the source already includes it.\n\n## What standard counseling may be added, and what may not\n\n### Permitted added counseling in limited safety augmentation mode\nOnly when trauma/concussion red flags or abnormal neurologic findings are present, you may add concise safety language such as:\n- “Given the presence of concerning post-traumatic symptoms/abnormal neurologic findings, urgent emergency department evaluation is recommended.”\n- “Advise immediate care for worsening headache, repeated vomiting, confusion, seizure, weakness, numbness, vision change, speech difficulty, gait instability, increasing drowsiness, or other new neurologic symptoms.”\n\nThis is allowed because it reflects high-level safety escalation rather than inventing detailed management.\n\n### Not permitted unless explicitly supported by the source\nDo not add:\n- cognitive rest instructions\n- school absence or academic accommodations\n- return-to-play or sports restriction timelines\n- no driving or no screen-time instructions\n- parent/guardian notification instructions\n- observation frequency at home\n- medication recommendations or dosing\n- imaging orders not stated or clearly implied by an urgent transfer recommendation\n- specialist referrals not stated\n- definitive concussion diagnosis if the source does not support it\n\n## Step-by-step workflow\n\n### 1) Read the encounter and extract source facts\n\nIdentify and group all explicit facts into:\n- symptoms and history\n- timing and progression\n- mechanism of injury, if any\n- relevant 
negatives\n- exam findings\n- neurologic findings\n- prior treatments or self-care\n- clinician impression or likely diagnosis if stated\n- follow-up or precautions if stated\n\nMentally separate:\n- Subjective = what the patient reports, describes, denies, or experiences\n- Objective = observed exam findings, measured values, test results, and clinician-documented findings\n\n### 2) Screen for safety triggers\n\nCheck for any of the following:\n- focal neurologic deficit\n- progressive neurologic symptoms\n- new unilateral weakness or numbness\n- severe or worsening headache\n- head trauma with ongoing or worsening symptoms\n- abnormal coordination, balance, gait, or mental status\n- syncope, seizure, confusion, or altered mental status\n- repeated vomiting\n- bowel/bladder dysfunction with spine symptoms\n- rapidly worsening pain plus neurologic change\n- other explicitly concerning clinician findings\n\nIf present, the assessment must reflect concern and the plan must be escalated.\n\n### 3) Identify whether this is a head injury/concussion-type encounter\n\nLook for:\n- blow to head, fall, collision, sports injury, assault, or other head trauma\n- post-traumatic headache, dizziness, nausea, photophobia, concentration difficulty, imbalance, or confusion\n- explicit mention of concussion concern\n- adolescent/school/sports context suggesting a youth concussion presentation\n\nThen decide:\n- no head injury pattern -> use general SOAP workflow\n- head injury pattern without red flags -> stay strictly extractive unless the source itself includes counseling\n- head injury pattern with red flags/abnormal neuro findings -> use limited safety augmentation\n\n### 4) Build the Subjective section\n\nInclude only history elements supported by the narrative:\n- chief concern\n- onset, duration, course\n- mechanism of injury if reported\n- associated symptoms\n- aggravating/alleviating factors\n- pertinent negatives\n- relevant prior history or attempted treatments, if 
provided\n\nKeep it concise and clinical.\n\nExamples:\n- “Reports headache since striking head during basketball yesterday, with associated dizziness and nausea.”\n- “Reports 3 days of worsening headache with associated left arm tingling.”\n\nDo not move patient-reported symptoms into Objective.\n\n### 5) Build the Objective section\n\nInclude only documented objective data:\n- vital signs, if given\n- physical exam findings\n- neurologic findings\n- test results, if given\n\nImportant:\n- do not infer a normal exam if none was described\n- do not add routine normal headings unless explicitly provided\n- if the exam is sparse, keep the Objective sparse\n\nGood:\n- “Neuro: Mild difficulty with tandem gait.”\n- “Coordination exam notable for slight imbalance.”\n- “Decreased sensation in left upper extremity.”\n\nAvoid:\n- “PERRLA, EOMI, CN II-XII intact” unless explicitly documented\n\n### 6) Write a focused Assessment\n\nThe assessment should:\n- summarize the main clinical issue in 1-3 sentences\n- cite the key supporting features\n- preserve uncertainty when appropriate\n- avoid overcommitting to a diagnosis not established in the source\n\nPreferred language:\n- “Symptoms are most consistent with...”\n- “Presentation is concerning for...”\n- “Given X and Y, cannot exclude...”\n\nFor head trauma cases:\n- if symptoms suggest concussion but the source is not definitive, say “post-traumatic symptoms concerning for concussion” rather than stating a confirmed diagnosis\n- if abnormal neurologic findings are present, explicitly note that benign/self-limited injury cannot be assumed from the available information\n\nExample:\n- “Post-traumatic headache with documented coordination abnormality is concerning for concussion or other acute neurologic process after head injury and warrants urgent evaluation; uncomplicated minor head injury cannot be assumed from the available information.”\n\n### 7) Write the Plan\n\nBase the plan on what is provided, but structure it 
clearly.\n\nPossible components:\n- symptomatic treatment already recommended or used\n- monitoring or follow-up\n- diagnostic evaluation if stated\n- return precautions or emergency precautions\n\n#### For routine non-concerning cases\n- remain extractive\n- preserve stated follow-up or conservative care\n- do not embellish the plan\n\n#### For red-flag or abnormal neurologic cases\n- explicitly escalate urgency\n- include urgent ED evaluation or prompt in-person neurologic assessment when supported by the narrative\n- include clear return precautions for worsening neurologic or trauma symptoms\n\n#### For adolescent head injury/concussion cases\nApply the mode rules:\n\nIf stable and non-red-flag:\n- remain extractive unless the source already includes concussion counseling\n\nIf red-flag or abnormal neurologic:\n- you may add brief standard emergency precautions\n- do not add broad concussion management instructions beyond those precautions\n\n### 8) Final quality check\n\nBefore finalizing, verify:\n- no invented vitals, exam findings, tests, or treatments\n- subjective and objective are not mixed\n- the assessment matches the actual level of concern\n- the plan is not falsely detailed\n- if safety counseling was added, it is limited to high-level emergency/return precautions justified by red flags or abnormal neurologic findings\n- no unsupported concussion-specific restrictions or school/sports instructions were inserted\n\n## SOAP note template\n\nUse this structure:\n\nSubjective:\n- [Chief concern and history in concise prose or bullets]\n\nObjective:\n- [Only provided exam findings, vitals, tests]\n\nAssessment:\n- [Focused synthesis of the main clinical issue, including uncertainty or concern]\n\nPlan:\n- [Treatment / next steps / follow-up / precautions]\n\n## Safety trigger guide\n\nEscalate the plan when the narrative includes any of the following:\n- focal neurologic deficit\n- progressive neurologic symptoms\n- new unilateral weakness or numbness\n- 
severe or worsening headache with neurologic symptoms\n- post-traumatic neurologic abnormality\n- syncope, seizure, confusion, or altered mental status\n- repeated vomiting after head injury\n- bowel/bladder dysfunction with spine symptoms\n- rapidly worsening pain plus neurologic changes\n- other explicitly concerning clinician findings\n\nSuggested escalated plan language:\n- “Given the presence of red-flag symptoms/abnormal neurologic findings, urgent emergency department evaluation is recommended.”\n- “Patient was advised to seek immediate care for worsening weakness, numbness, severe headache, repeated vomiting, vision change, speech difficulty, gait instability, confusion, seizure, increasing drowsiness, or other new neurologic symptoms.”\n\n## Adolescent head injury/concussion guidance\n\nUse this section when the narrative suggests a youth concussion-type encounter.\n\n### If the narrative is mild and uncomplicated\nExamples:\n- headache after a sports collision\n- dizziness after a minor head bump\n- possible concussion discussed, but no red flags or objective neurologic deficits\n\nThen:\n- keep the note extractive\n- preserve any provided diagnosis, plan, and counseling\n- do not independently add standard concussion handouts or anticipatory guidance\n\n### If the narrative includes danger features\nExamples:\n- worsening headache after injury\n- abnormal balance or coordination\n- confusion or unusual behavior\n- vomiting\n- focal weakness/numbness\n- significant exam abnormality\n- clinician concern requiring urgent escalation\n\nThen:\n- make the assessment explicitly cautious\n- include urgent evaluation/emergency precautions\n- keep any added counseling narrow, safety-oriented, and high level\n\n### Practical rule\nAsk:\n1. Am I adding a fact? If yes, do not add it.\n2. Am I adding a detailed management instruction not stated in the source? If yes, do not add it.\n3. 
Am I adding only generic emergency precautions because the narrative shows red flags or abnormal neurologic findings? If yes, that is acceptable.\n\n## Examples\n\n### Example 1: Routine non-traumatic case\nInput narrative:\n“Patient reports low back pain for 1 week after lifting a box. Over the last 2 days, pain radiates down the right leg with numbness in the right foot. Exam shows decreased sensation over the dorsum of the right foot and positive straight-leg raise on the right. No bowel or bladder symptoms mentioned.”\n\nOutput:\nSubjective:\n- Reports 1 week of low back pain after lifting injury, with 2 days of radiation down the right leg and associated numbness in the right foot.\n\nObjective:\n- Exam notable for decreased sensation over the dorsum of the right foot.\n- Positive straight-leg raise on the right.\n\nAssessment:\n- Acute low back pain with right-sided radicular symptoms and objective sensory deficit, concerning for lumbar radiculopathy. Given the documented neurologic abnormality, symptoms warrant prompt in-person evaluation and close monitoring for progression.\n\nPlan:\n- Recommend prompt clinical follow-up/in-person evaluation.\n- Advise immediate emergency evaluation for new or worsening weakness, progressive numbness, saddle anesthesia, bowel or bladder dysfunction, fever, or uncontrolled pain.\n\n### Example 2: Adolescent head injury with red flags\nInput narrative:\n“Teen hit his head during soccer 2 days ago. Since then he has had worsening headache and nausea. On exam he is alert but has mild difficulty with tandem gait. Provider is concerned and recommends further evaluation.”\n\nOutput:\nSubjective:\n- Reports head injury during soccer 2 days ago followed by worsening headache and nausea.\n\nObjective:\n- Alert on exam.\n- Mild difficulty with tandem gait.\n\nAssessment:\n- Post-traumatic headache and nausea with documented gait abnormality after head injury are concerning for concussion or another acute neurologic process. 
Given the abnormal neurologic finding and worsening symptoms, uncomplicated minor head injury cannot be assumed.\n\nPlan:\n- Recommend urgent further evaluation.\n- Advise immediate emergency care for worsening headache, repeated vomiting, confusion, seizure, weakness, numbness, vision change, speech difficulty, gait worsening, increasing drowsiness, or other new neurologic symptoms.\n\nNote why this is acceptable:\n- The urgent evaluation is supported by the source.\n- The added precautions are narrow safety counseling tied to red flags.\n- The note does not add cognitive rest, return-to-play, school restrictions, or family supervision instructions.\n\n### Example 3: Adolescent head injury without red flags\nInput narrative:\n“15-year-old struck head lightly on cabinet yesterday. Today has mild headache. Neuro exam normal. Clinician suspects mild concussion and recommends rest and pediatric follow-up.”\n\nOutput:\nSubjective:\n- Reports mild headache beginning after striking head on a cabinet yesterday.\n\nObjective:\n- Neurologic exam normal.\n\nAssessment:\n- Mild post-traumatic headache; clinician suspects mild concussion.\n\nPlan:\n- Rest and pediatric follow-up as recommended in the source.\n\nNote why this is correct:\n- The plan remains extractive.\n- No extra return-to-play, school, driving, or screen-time advice is added unless present in the source.\n\n## Common mistakes to avoid\n\n- Moving patient-reported symptoms into Objective\n- Adding normal findings that were never stated\n- Ignoring documented red flags or neurologic abnormalities\n- Producing a routine conservative plan despite dangerous features\n- Inventing medications, dosages, imaging, or follow-up intervals\n- Adding standard concussion counseling in every adolescent head injury case\n- Adding return-to-play, school, parent/guardian, or cognitive-rest instructions not supported by the source\n- Writing a definitive concussion diagnosis when the source only supports concern or 
suspicion\n\n## Minimal decision rule\n\nIf the case is routine and non-concerning:\n- produce a standard SOAP note faithful to the narrative\n\nIf the case includes red flags, post-traumatic warning signs, or abnormal neurologic findings:\n- make the assessment explicitly cautious\n- make the plan explicitly safety-oriented and escalated\n\nIf the case is an adolescent head injury/concussion encounter:\n- remain strictly extractive unless the narrative shows clear safety concern\n- only add narrow emergency precautions when red flags or abnormal neurologic findings justify them\n\n## Output expectations\n\nThe final note should be:\n- concise\n- clinically readable\n- faithful to the source\n- appropriately cautious when danger signs are present\n- restrained about adding counseling unless the limited safety augmentation rule is met\n" + }, + "created_at": "2026-04-12T02:50:48.486763", + "created_by": "gpt-5.4" + }, + "tool_dependencies": [], + "critical_tools": [], + "total_selections": 0, + "total_applied": 0, + "total_completions": 0, + "total_fallbacks": 0, + "recent_analyses": [], + "first_seen": "2026-04-12T02:50:48.486771", + "last_updated": "2026-04-12T02:50:48.486771" + }, + { + "skill_id": "verify-build-artifacts__v0_0d98babd", + "name": "verify-build-artifacts", + "description": "Verify that expected output files were actually created and usable before declaring a build successful, especially when commands return nonzero exit codes or emit warning-heavy logs.", + "path": "/Users/admin/PycharmProjects/openspace/openspace/skills/verify-build-artifacts/SKILL.md", + "is_active": true, + "category": "tool_guide", + "tags": [], + "visibility": "private", + "creator_id": "", + "lineage": { + "origin": "captured", + "generation": 0, + "parent_skill_ids": [], + "source_task_id": "02314fc6-a24e-42f4-a8cd-362cae0f0ec1_phase1", + "change_summary": "Captured a reusable skill for confirming output artifacts exist and are usable after build tools emit errors, 
warnings, or ambiguous logs.", + "content_diff": "--- a/SKILL.md\n+++ b/SKILL.md\n@@ -0,0 +1,245 @@\n+---\n+name: verify-build-artifacts\n+description: Verify that expected output files were actually created and usable before declaring a build successful, especially when commands return nonzero exit codes or emit warning-heavy logs.\n+---\n+\n+# Verify Build Artifacts\n+\n+Use this skill when a build, conversion, or rendering command produces ambiguous results — for example:\n+\n+- the command exits nonzero\n+- the logs contain warnings or partial failures\n+- the tool may still emit usable outputs despite errors\n+- multiple fallback commands were attempted\n+- success cannot be inferred from logs alone\n+\n+This is especially important for artifact-producing tools such as LaTeX, pandoc, compilers, renderers, bundlers, and document converters.\n+\n+## Core principle\n+\n+Never declare completion based only on:\n+- a command having run\n+- partial progress in logs\n+- assumptions about tool behavior\n+- \"it usually produces the file\"\n+\n+Instead, verify the expected artifact directly.\n+\n+## When to apply this\n+\n+Apply artifact verification whenever:\n+\n+1. The command exit code is nonzero.\n+2. The command output includes errors, warnings, or missing-resource messages.\n+3. The tool is known to sometimes produce output even on failure.\n+4. You used fallback strategies and need to know which one actually worked.\n+5. The final answer depends on a specific file existing.\n+\n+Examples:\n+- `pdflatex`, `xelatex`, `lualatex`\n+- `pandoc`\n+- document generation pipelines\n+- code compilation producing binaries or bundles\n+- image/video rendering\n+- archive/export commands\n+\n+## Procedure\n+\n+### 1. 
Identify the expected artifact\n+\n+Before or immediately after running the build, name the exact file you need.\n+\n+Examples:\n+- `report.pdf`\n+- `dist/app.js`\n+- `build/output.zip`\n+\n+If there are several acceptable outputs, list them explicitly and decide which counts as success.\n+\n+## 2. Run the build command and capture its result\n+\n+Record:\n+- exit status\n+- stdout/stderr or log file\n+- output path(s)\n+\n+Do not treat warnings or a nonzero exit code as automatic proof of total failure or success.\n+\n+## 3. Check whether the artifact exists\n+\n+After the command finishes, inspect the filesystem directly.\n+\n+Shell example:\n+[ -f report.pdf ] && echo \"exists\" || echo \"missing\"\n+\n+For a stronger check:\n+ls -l report.pdf\n+\n+Useful questions:\n+- Does the file exist?\n+- Is it in the expected location?\n+- Is it non-empty?\n+- Was it modified recently?\n+\n+## 4. Perform a basic usability check\n+\n+Existence alone is sometimes insufficient. Confirm the artifact is not obviously broken.\n+\n+Examples:\n+- check file size is greater than zero\n+- inspect MIME/file type\n+- open metadata if available\n+- ensure the file is newer than the source or build start time\n+\n+Examples:\n+file report.pdf\n+stat report.pdf\n+\n+For PDFs specifically:\n+- verify the file exists\n+- ensure it is non-empty\n+- optionally inspect with `pdfinfo` if available\n+\n+Example:\n+pdfinfo report.pdf >/dev/null 2>&1 && echo \"readable PDF\"\n+\n+## 5. 
Reconcile exit code vs artifact state\n+\n+Use this decision logic:\n+\n+### Case A: Exit code success, artifact present\n+Treat as success.\n+\n+### Case B: Exit code failure, artifact present and usable\n+Treat as partial/qualified success only if the requested deliverable exists and appears usable.\n+In your final report:\n+- say the command reported errors or warnings\n+- say the artifact was nevertheless generated\n+- mention any limitations if known\n+\n+### Case C: Exit code success, artifact missing\n+Treat as failure. Do not claim completion.\n+\n+### Case D: Exit code failure, artifact missing\n+Treat as failure. Investigate or use fallback steps.\n+\n+## 6. If multiple build attempts were used, verify after each meaningful attempt\n+\n+When trying fallbacks, do not assume the last command worked just because it looked better.\n+\n+Example workflow:\n+1. run `pdflatex`\n+2. verify `report.pdf`\n+3. if missing, run `xelatex`\n+4. verify again\n+5. if still missing, run `pandoc`\n+6. verify again\n+\n+Track which command actually produced the artifact.\n+\n+## 7. 
Base your final response on verified state, not hopeful interpretation\n+\n+Before declaring success, state one of:\n+- artifact verified at `<path>`\n+- artifact not found\n+- artifact found but validity uncertain\n+- artifact generated with warnings/errors\n+\n+This prevents false completion claims.\n+\n+## Recommended reporting template\n+\n+Use wording like:\n+\n+- \"The build command exited with errors, but `output.pdf` was generated and is present at the expected path.\"\n+- \"The logs showed warnings; I verified that `dist/app.js` exists and is non-empty.\"\n+- \"I attempted multiple build paths, but no final artifact was found, so I cannot claim completion.\"\n+- \"The command completed, but the expected artifact was missing; treating this as unsuccessful.\"\n+\n+## Minimal shell patterns\n+\n+### Simple existence check\n+build_command\n+if [ -f output.pdf ]; then\n+ echo \"Artifact verified: output.pdf\"\n+else\n+ echo \"Artifact missing: output.pdf\"\n+fi\n+\n+### Existence and non-empty check\n+build_command\n+if [ -s output.pdf ]; then\n+ echo \"Artifact verified and non-empty\"\n+else\n+ echo \"Artifact missing or empty\"\n+fi\n+\n+### Preserve exit code while still checking artifact\n+build_command\n+status=$?\n+\n+if [ -s output.pdf ]; then\n+ echo \"Artifact exists\"\n+else\n+ echo \"Artifact missing\"\n+fi\n+\n+exit $status\n+\n+### Qualified success logic\n+build_command\n+status=$?\n+\n+if [ -s output.pdf ]; then\n+ echo \"Build returned status $status, but artifact exists\"\n+else\n+ echo \"Build failed and artifact is missing\"\n+fi\n+\n+## Tool-specific notes\n+\n+### LaTeX\n+LaTeX commands may:\n+- return nonzero because of warnings or unresolved references\n+- emit a PDF despite incomplete logs\n+- require multiple passes\n+\n+Always verify the PDF directly after each run:\n+- `pdflatex`\n+- `xelatex`\n+- `lualatex`\n+\n+If bibliography or references are involved, rerun as needed, but do not declare success unless the final PDF exists.\n+\n+### 
pandoc\n+`pandoc` may fail because of a missing PDF engine, template issue, or embedded resource problem.\n+Even so, verify whether the requested output file was created.\n+\n+For PDF targets, check:\n+- target file exists\n+- file is non-empty\n+- optional readability via `pdfinfo` or `file`\n+\n+## Anti-patterns to avoid\n+\n+Do not:\n+- claim success because \"the command mostly worked\"\n+- assume logs mentioning output imply the file exists\n+- stop at the first promising warning-free line\n+- ignore the possibility of partial output\n+- omit artifact verification after fallback attempts\n+\n+## Compact checklist\n+\n+Before finishing, confirm:\n+\n+- What exact artifact was required?\n+- Did I check the filesystem directly?\n+- Is the file non-empty?\n+- If the command failed, did I still verify whether output exists?\n+- If multiple attempts were made, do I know which one produced the artifact?\n+- Does my final response match the verified artifact state?\n+\n+## Success standard\n+\n+A task involving build output is complete only when the expected artifact has been verified at the filesystem level, not merely inferred from command execution or logs.", + "content_snapshot": { + "SKILL.md": "---\nname: verify-build-artifacts\ndescription: Verify that expected output files were actually created and usable before declaring a build successful, especially when commands return nonzero exit codes or emit warning-heavy logs.\n---\n\n# Verify Build Artifacts\n\nUse this skill when a build, conversion, or rendering command produces ambiguous results — for example:\n\n- the command exits nonzero\n- the logs contain warnings or partial failures\n- the tool may still emit usable outputs despite errors\n- multiple fallback commands were attempted\n- success cannot be inferred from logs alone\n\nThis is especially important for artifact-producing tools such as LaTeX, pandoc, compilers, renderers, bundlers, and document converters.\n\n## Core principle\n\nNever declare 
completion based only on:\n- a command having run\n- partial progress in logs\n- assumptions about tool behavior\n- \"it usually produces the file\"\n\nInstead, verify the expected artifact directly.\n\n## When to apply this\n\nApply artifact verification whenever:\n\n1. The command exit code is nonzero.\n2. The command output includes errors, warnings, or missing-resource messages.\n3. The tool is known to sometimes produce output even on failure.\n4. You used fallback strategies and need to know which one actually worked.\n5. The final answer depends on a specific file existing.\n\nExamples:\n- `pdflatex`, `xelatex`, `lualatex`\n- `pandoc`\n- document generation pipelines\n- code compilation producing binaries or bundles\n- image/video rendering\n- archive/export commands\n\n## Procedure\n\n### 1. Identify the expected artifact\n\nBefore or immediately after running the build, name the exact file you need.\n\nExamples:\n- `report.pdf`\n- `dist/app.js`\n- `build/output.zip`\n\nIf there are several acceptable outputs, list them explicitly and decide which counts as success.\n\n## 2. Run the build command and capture its result\n\nRecord:\n- exit status\n- stdout/stderr or log file\n- output path(s)\n\nDo not treat warnings or a nonzero exit code as automatic proof of total failure or success.\n\n## 3. Check whether the artifact exists\n\nAfter the command finishes, inspect the filesystem directly.\n\nShell example:\n[ -f report.pdf ] && echo \"exists\" || echo \"missing\"\n\nFor a stronger check:\nls -l report.pdf\n\nUseful questions:\n- Does the file exist?\n- Is it in the expected location?\n- Is it non-empty?\n- Was it modified recently?\n\n## 4. Perform a basic usability check\n\nExistence alone is sometimes insufficient. 
Confirm the artifact is not obviously broken.\n\nExamples:\n- check file size is greater than zero\n- inspect MIME/file type\n- open metadata if available\n- ensure the file is newer than the source or build start time\n\nExamples:\nfile report.pdf\nstat report.pdf\n\nFor PDFs specifically:\n- verify the file exists\n- ensure it is non-empty\n- optionally inspect with `pdfinfo` if available\n\nExample:\npdfinfo report.pdf >/dev/null 2>&1 && echo \"readable PDF\"\n\n## 5. Reconcile exit code vs artifact state\n\nUse this decision logic:\n\n### Case A: Exit code success, artifact present\nTreat as success.\n\n### Case B: Exit code failure, artifact present and usable\nTreat as partial/qualified success only if the requested deliverable exists and appears usable.\nIn your final report:\n- say the command reported errors or warnings\n- say the artifact was nevertheless generated\n- mention any limitations if known\n\n### Case C: Exit code success, artifact missing\nTreat as failure. Do not claim completion.\n\n### Case D: Exit code failure, artifact missing\nTreat as failure. Investigate or use fallback steps.\n\n## 6. If multiple build attempts were used, verify after each meaningful attempt\n\nWhen trying fallbacks, do not assume the last command worked just because it looked better.\n\nExample workflow:\n1. run `pdflatex`\n2. verify `report.pdf`\n3. if missing, run `xelatex`\n4. verify again\n5. if still missing, run `pandoc`\n6. verify again\n\nTrack which command actually produced the artifact.\n\n## 7. 
Base your final response on verified state, not hopeful interpretation\n\nBefore declaring success, state one of:\n- artifact verified at `<path>`\n- artifact not found\n- artifact found but validity uncertain\n- artifact generated with warnings/errors\n\nThis prevents false completion claims.\n\n## Recommended reporting template\n\nUse wording like:\n\n- \"The build command exited with errors, but `output.pdf` was generated and is present at the expected path.\"\n- \"The logs showed warnings; I verified that `dist/app.js` exists and is non-empty.\"\n- \"I attempted multiple build paths, but no final artifact was found, so I cannot claim completion.\"\n- \"The command completed, but the expected artifact was missing; treating this as unsuccessful.\"\n\n## Minimal shell patterns\n\n### Simple existence check\nbuild_command\nif [ -f output.pdf ]; then\n echo \"Artifact verified: output.pdf\"\nelse\n echo \"Artifact missing: output.pdf\"\nfi\n\n### Existence and non-empty check\nbuild_command\nif [ -s output.pdf ]; then\n echo \"Artifact verified and non-empty\"\nelse\n echo \"Artifact missing or empty\"\nfi\n\n### Preserve exit code while still checking artifact\nbuild_command\nstatus=$?\n\nif [ -s output.pdf ]; then\n echo \"Artifact exists\"\nelse\n echo \"Artifact missing\"\nfi\n\nexit $status\n\n### Qualified success logic\nbuild_command\nstatus=$?\n\nif [ -s output.pdf ]; then\n echo \"Build returned status $status, but artifact exists\"\nelse\n echo \"Build failed and artifact is missing\"\nfi\n\n## Tool-specific notes\n\n### LaTeX\nLaTeX commands may:\n- return nonzero because of warnings or unresolved references\n- emit a PDF despite incomplete logs\n- require multiple passes\n\nAlways verify the PDF directly after each run:\n- `pdflatex`\n- `xelatex`\n- `lualatex`\n\nIf bibliography or references are involved, rerun as needed, but do not declare success unless the final PDF exists.\n\n### pandoc\n`pandoc` may fail because of a missing PDF engine, template issue, 
or embedded resource problem.\nEven so, verify whether the requested output file was created.\n\nFor PDF targets, check:\n- target file exists\n- file is non-empty\n- optional readability via `pdfinfo` or `file`\n\n## Anti-patterns to avoid\n\nDo not:\n- claim success because \"the command mostly worked\"\n- assume logs mentioning output imply the file exists\n- stop at the first promising warning-free line\n- ignore the possibility of partial output\n- omit artifact verification after fallback attempts\n\n## Compact checklist\n\nBefore finishing, confirm:\n\n- What exact artifact was required?\n- Did I check the filesystem directly?\n- Is the file non-empty?\n- If the command failed, did I still verify whether output exists?\n- If multiple attempts were made, do I know which one produced the artifact?\n- Does my final response match the verified artifact state?\n\n## Success standard\n\nA task involving build output is complete only when the expected artifact has been verified at the filesystem level, not merely inferred from command execution or logs." 
+ }, + "created_at": "2026-04-12T02:55:10.940721", + "created_by": "gpt-5.4" + }, + "tool_dependencies": [], + "critical_tools": [], + "total_selections": 0, + "total_applied": 0, + "total_completions": 0, + "total_fallbacks": 0, + "recent_analyses": [], + "first_seen": "2026-04-12T02:55:10.940769", + "last_updated": "2026-04-12T02:55:10.940771" + }, + { + "skill_id": "adaptive-document-output__v0_1a22c8b2", + "name": "adaptive-document-output", + "description": "Generate documents reliably by probing available conversion tools first, then selecting PDF, HTML, or source output based on what is actually installed.", + "path": "/Users/admin/PycharmProjects/openspace/openspace/skills/adaptive-document-output/SKILL.md", + "is_active": true, + "category": "workflow", + "tags": [], + "visibility": "private", + "creator_id": "", + "lineage": { + "origin": "captured", + "generation": 0, + "parent_skill_ids": [], + "source_task_id": "02314fc6-a24e-42f4-a8cd-362cae0f0ec1_phase1", + "change_summary": "Added a workflow skill for resilient document generation that detects installed converters first and chooses the best available output path instead of assuming PDF tooling exists.", + "content_diff": "--- a/SKILL.md\n+++ b/SKILL.md\n@@ -0,0 +1,272 @@\n+---\n+name: adaptive-document-output\n+description: Generate documents reliably by probing available conversion tools first, then selecting PDF, HTML, or source output based on what is actually installed.\n+---\n+\n+# Adaptive Document Output\n+\n+This skill defines a fallback workflow for document generation when the desired final format may require tools that are not installed. It is especially useful for agents that need to produce reports, resumes, invoices, or other formatted deliverables in constrained environments.\n+\n+The key principle is: do not assume a PDF engine exists. 
Probe the environment first, then choose the best achievable output format and conversion path.\n+\n+## When to use this skill\n+\n+Use this workflow when:\n+- You need to generate a formatted document, often ideally as PDF\n+- The runtime environment may or may not have document converters installed\n+- Multiple conversion tools might be available\n+- You want graceful degradation instead of repeated failing commands\n+\n+Typical cases:\n+- Markdown to PDF/HTML\n+- HTML to PDF\n+- LaTeX to PDF\n+- Templated report generation\n+- CI or sandboxed agent environments\n+\n+## Supported tool probes\n+\n+Check for these tools in this order unless task constraints suggest otherwise:\n+1. `pandoc`\n+2. `weasyprint`\n+3. `xelatex`\n+4. `pdflatex`\n+5. `wkhtmltopdf`\n+\n+These tools enable different output strategies:\n+- `pandoc`: versatile conversion hub; may still require a PDF engine for direct PDF output\n+- `weasyprint`: strong HTML/CSS to PDF path\n+- `xelatex`: robust Unicode-aware LaTeX to PDF\n+- `pdflatex`: LaTeX to PDF when Unicode/font needs are simpler\n+- `wkhtmltopdf`: HTML to PDF fallback\n+\n+## Workflow\n+\n+### 1. Clarify acceptable outputs\n+\n+Before generating files, determine the acceptable fallback order.\n+\n+Recommended preference:\n+1. PDF\n+2. HTML\n+3. Markdown or LaTeX source\n+\n+If the task explicitly requires PDF and no PDF path exists, still generate the best intermediate source and explain the limitation.\n+\n+### 2. Probe installed tools first\n+\n+Never start by issuing a guessed conversion command. Check what is actually present.\n+\n+Example shell probe:\n+#!/usr/bin/env bash\n+for cmd in pandoc weasyprint xelatex pdflatex wkhtmltopdf; do\n+ if command -v \"$cmd\" >/dev/null 2>&1; then\n+ echo \"$cmd: yes\"\n+ else\n+ echo \"$cmd: no\"\n+ fi\n+done\n+\n+Or compact form:\n+available() { command -v \"$1\" >/dev/null 2>&1; }\n+\n+### 3. 
Choose a generation strategy from available tools\n+\n+Use the following decision logic.\n+\n+#### Strategy A: `pandoc` plus a working PDF engine\n+Use when:\n+- `pandoc` is installed\n+- and either `xelatex` or `pdflatex` is installed\n+\n+Preferred command:\n+- Prefer `xelatex` over `pdflatex` for Unicode, modern fonts, and fewer encoding issues\n+\n+Examples:\n+pandoc input.md -o output.pdf --pdf-engine=xelatex\n+pandoc input.md -o output.pdf --pdf-engine=pdflatex\n+\n+Use this path when the source is Markdown and you want direct PDF generation.\n+\n+#### Strategy B: HTML intermediate plus `weasyprint`\n+Use when:\n+- `pandoc` is installed and `weasyprint` is installed\n+- or you can generate HTML by another means and `weasyprint` is installed\n+\n+Examples:\n+pandoc input.md -o output.html\n+weasyprint output.html output.pdf\n+\n+This is often the best fallback when direct pandoc-to-PDF fails due to missing TeX engines.\n+\n+#### Strategy C: HTML intermediate plus `wkhtmltopdf`\n+Use when:\n+- `wkhtmltopdf` is installed\n+- and you can produce HTML\n+\n+Examples:\n+pandoc input.md -o output.html\n+wkhtmltopdf output.html output.pdf\n+\n+Use this when `weasyprint` is unavailable.\n+\n+#### Strategy D: Native LaTeX compilation\n+Use when:\n+- you already have `.tex`\n+- and `xelatex` or `pdflatex` is installed\n+\n+Examples:\n+xelatex -interaction=nonstopmode -halt-on-error doc.tex\n+pdflatex -interaction=nonstopmode -halt-on-error doc.tex\n+\n+Prefer `xelatex` for multilingual text and font flexibility.\n+\n+#### Strategy E: No PDF path available\n+Use when:\n+- no viable PDF engine or converter exists\n+\n+Then:\n+- generate HTML if possible\n+- otherwise generate Markdown or LaTeX source\n+- clearly report that PDF could not be produced because no supported PDF toolchain is installed\n+\n+Examples:\n+pandoc input.md -o output.html\n+cp input.md deliverable.md\n+\n+## Recommended decision tree\n+\n+Follow this exact order unless the task has stronger 
constraints:\n+\n+1. Probe tools.\n+2. If `pandoc` and `xelatex` exist, generate PDF via pandoc + xelatex.\n+3. Else if `pandoc` and `pdflatex` exist, generate PDF via pandoc + pdflatex.\n+4. Else if `pandoc` and `weasyprint` exist, generate HTML then PDF via weasyprint.\n+5. Else if `pandoc` and `wkhtmltopdf` exist, generate HTML then PDF via wkhtmltopdf.\n+6. Else if source is LaTeX and `xelatex` exists, compile directly.\n+7. Else if source is LaTeX and `pdflatex` exists, compile directly.\n+8. Else if `pandoc` exists, emit HTML or another portable intermediate.\n+9. Else emit the highest-value source format you can generate directly.\n+\n+## Practical command patterns\n+\n+### Markdown input, adaptive output\n+#!/usr/bin/env bash\n+set -euo pipefail\n+\n+infile=\"${1:-input.md}\"\n+base=\"${2:-output}\"\n+\n+have() { command -v \"$1\" >/dev/null 2>&1; }\n+\n+if have pandoc && have xelatex; then\n+ pandoc \"$infile\" -o \"${base}.pdf\" --pdf-engine=xelatex\n+ echo \"Produced ${base}.pdf via pandoc + xelatex\"\n+elif have pandoc && have pdflatex; then\n+ pandoc \"$infile\" -o \"${base}.pdf\" --pdf-engine=pdflatex\n+ echo \"Produced ${base}.pdf via pandoc + pdflatex\"\n+elif have pandoc && have weasyprint; then\n+ pandoc \"$infile\" -o \"${base}.html\"\n+ weasyprint \"${base}.html\" \"${base}.pdf\"\n+ echo \"Produced ${base}.pdf via HTML + weasyprint\"\n+elif have pandoc && have wkhtmltopdf; then\n+ pandoc \"$infile\" -o \"${base}.html\"\n+ wkhtmltopdf \"${base}.html\" \"${base}.pdf\"\n+ echo \"Produced ${base}.pdf via HTML + wkhtmltopdf\"\n+elif have pandoc; then\n+ pandoc \"$infile\" -o \"${base}.html\"\n+ echo \"No PDF toolchain found; produced ${base}.html instead\"\n+else\n+ cp \"$infile\" \"${base}.md\"\n+ echo \"No converter found; copied source to ${base}.md\"\n+fi\n+\n+### LaTeX input, adaptive output\n+#!/usr/bin/env bash\n+set -euo pipefail\n+\n+texfile=\"${1:-doc.tex}\"\n+have() { command -v \"$1\" >/dev/null 2>&1; }\n+\n+if have xelatex; then\n+ 
xelatex -interaction=nonstopmode -halt-on-error \"$texfile\"\n+elif have pdflatex; then\n+ pdflatex -interaction=nonstopmode -halt-on-error \"$texfile\"\n+else\n+ echo \"No LaTeX PDF engine installed; cannot compile PDF\" >&2\n+ exit 1\n+fi\n+\n+## Error-handling guidance\n+\n+### If direct PDF conversion fails\n+Do not loop on the same failing assumption. Instead:\n+1. Inspect whether the failure is due to a missing engine\n+2. Switch to another supported path\n+3. Preserve useful intermediate artifacts such as `.html` or `.tex`\n+\n+Example:\n+- If `pandoc -o output.pdf` fails because no TeX engine is installed, retry with:\n+ - `pandoc input.md -o output.html`\n+ - then use `weasyprint` or `wkhtmltopdf` if available\n+\n+### If fonts or Unicode break LaTeX output\n+Prefer `xelatex` over `pdflatex`.\n+\n+### If no converter exists\n+Still provide a useful deliverable:\n+- polished Markdown\n+- standalone HTML\n+- LaTeX source\n+- a concise explanation of what blocked PDF generation\n+\n+## Output policy for autonomous agents\n+\n+When reporting results:\n+- State which tools were detected\n+- State which path was selected\n+- Name the produced files\n+- If PDF was not produced, explicitly say why and identify the fallback artifact\n+\n+Good example:\n+- Detected: `pandoc`, `weasyprint`\n+- Selected path: Markdown → HTML via pandoc, then HTML → PDF via weasyprint\n+- Produced: `report.html`, `report.pdf`\n+\n+Good fallback example:\n+- Detected: `pandoc` only\n+- No PDF engine available\n+- Produced: `report.html` as the best available deliverable\n+\n+## Heuristics\n+\n+- Prefer successful output over ideal-but-unavailable output\n+- Prefer deterministic detection over trial-and-error guesses\n+- Prefer preserving intermediates when switching strategies\n+- Prefer `xelatex` over `pdflatex` when both exist\n+- Prefer HTML as a universal intermediate when PDF engines are missing\n+\n+## Anti-patterns to avoid\n+\n+Avoid:\n+- Assuming `pandoc -o file.pdf` will work 
everywhere\n+- Assuming TeX is installed because `pandoc` is installed\n+- Repeating failed PDF commands without probing environment\n+- Discarding useful `.html` or `.tex` intermediates\n+- Claiming a PDF was produced without verifying the file exists\n+\n+## Minimal verification checklist\n+\n+Before declaring success:\n+- Confirm the selected converter command exited successfully\n+- Confirm the expected output file exists\n+- If PDF was requested but unavailable, confirm fallback file exists\n+- Report the exact fallback taken\n+\n+## Summary\n+\n+This workflow turns document generation into an adaptive process:\n+- probe tools first\n+- choose the best available route\n+- fall back cleanly\n+- always leave behind the highest-value output the environment supports", + "content_snapshot": { + "SKILL.md": "---\nname: adaptive-document-output\ndescription: Generate documents reliably by probing available conversion tools first, then selecting PDF, HTML, or source output based on what is actually installed.\n---\n\n# Adaptive Document Output\n\nThis skill defines a fallback workflow for document generation when the desired final format may require tools that are not installed. It is especially useful for agents that need to produce reports, resumes, invoices, or other formatted deliverables in constrained environments.\n\nThe key principle is: do not assume a PDF engine exists. 
Probe the environment first, then choose the best achievable output format and conversion path.\n\n## When to use this skill\n\nUse this workflow when:\n- You need to generate a formatted document, often ideally as PDF\n- The runtime environment may or may not have document converters installed\n- Multiple conversion tools might be available\n- You want graceful degradation instead of repeated failing commands\n\nTypical cases:\n- Markdown to PDF/HTML\n- HTML to PDF\n- LaTeX to PDF\n- Templated report generation\n- CI or sandboxed agent environments\n\n## Supported tool probes\n\nCheck for these tools in this order unless task constraints suggest otherwise:\n1. `pandoc`\n2. `weasyprint`\n3. `xelatex`\n4. `pdflatex`\n5. `wkhtmltopdf`\n\nThese tools enable different output strategies:\n- `pandoc`: versatile conversion hub; may still require a PDF engine for direct PDF output\n- `weasyprint`: strong HTML/CSS to PDF path\n- `xelatex`: robust Unicode-aware LaTeX to PDF\n- `pdflatex`: LaTeX to PDF when Unicode/font needs are simpler\n- `wkhtmltopdf`: HTML to PDF fallback\n\n## Workflow\n\n### 1. Clarify acceptable outputs\n\nBefore generating files, determine the acceptable fallback order.\n\nRecommended preference:\n1. PDF\n2. HTML\n3. Markdown or LaTeX source\n\nIf the task explicitly requires PDF and no PDF path exists, still generate the best intermediate source and explain the limitation.\n\n### 2. Probe installed tools first\n\nNever start by issuing a guessed conversion command. Check what is actually present.\n\nExample shell probe:\n#!/usr/bin/env bash\nfor cmd in pandoc weasyprint xelatex pdflatex wkhtmltopdf; do\n if command -v \"$cmd\" >/dev/null 2>&1; then\n echo \"$cmd: yes\"\n else\n echo \"$cmd: no\"\n fi\ndone\n\nOr compact form:\navailable() { command -v \"$1\" >/dev/null 2>&1; }\n\n### 3. 
Choose a generation strategy from available tools\n\nUse the following decision logic.\n\n#### Strategy A: `pandoc` plus a working PDF engine\nUse when:\n- `pandoc` is installed\n- and either `xelatex` or `pdflatex` is installed\n\nPreferred command:\n- Prefer `xelatex` over `pdflatex` for Unicode, modern fonts, and fewer encoding issues\n\nExamples:\npandoc input.md -o output.pdf --pdf-engine=xelatex\npandoc input.md -o output.pdf --pdf-engine=pdflatex\n\nUse this path when the source is Markdown and you want direct PDF generation.\n\n#### Strategy B: HTML intermediate plus `weasyprint`\nUse when:\n- `pandoc` is installed and `weasyprint` is installed\n- or you can generate HTML by another means and `weasyprint` is installed\n\nExamples:\npandoc input.md -o output.html\nweasyprint output.html output.pdf\n\nThis is often the best fallback when direct pandoc-to-PDF fails due to missing TeX engines.\n\n#### Strategy C: HTML intermediate plus `wkhtmltopdf`\nUse when:\n- `wkhtmltopdf` is installed\n- and you can produce HTML\n\nExamples:\npandoc input.md -o output.html\nwkhtmltopdf output.html output.pdf\n\nUse this when `weasyprint` is unavailable.\n\n#### Strategy D: Native LaTeX compilation\nUse when:\n- you already have `.tex`\n- and `xelatex` or `pdflatex` is installed\n\nExamples:\nxelatex -interaction=nonstopmode -halt-on-error doc.tex\npdflatex -interaction=nonstopmode -halt-on-error doc.tex\n\nPrefer `xelatex` for multilingual text and font flexibility.\n\n#### Strategy E: No PDF path available\nUse when:\n- no viable PDF engine or converter exists\n\nThen:\n- generate HTML if possible\n- otherwise generate Markdown or LaTeX source\n- clearly report that PDF could not be produced because no supported PDF toolchain is installed\n\nExamples:\npandoc input.md -o output.html\ncp input.md deliverable.md\n\n## Recommended decision tree\n\nFollow this exact order unless the task has stronger constraints:\n\n1. Probe tools.\n2. 
If `pandoc` and `xelatex` exist, generate PDF via pandoc + xelatex.\n3. Else if `pandoc` and `pdflatex` exist, generate PDF via pandoc + pdflatex.\n4. Else if `pandoc` and `weasyprint` exist, generate HTML then PDF via weasyprint.\n5. Else if `pandoc` and `wkhtmltopdf` exist, generate HTML then PDF via wkhtmltopdf.\n6. Else if source is LaTeX and `xelatex` exists, compile directly.\n7. Else if source is LaTeX and `pdflatex` exists, compile directly.\n8. Else if `pandoc` exists, emit HTML or another portable intermediate.\n9. Else emit the highest-value source format you can generate directly.\n\n## Practical command patterns\n\n### Markdown input, adaptive output\n#!/usr/bin/env bash\nset -euo pipefail\n\ninfile=\"${1:-input.md}\"\nbase=\"${2:-output}\"\n\nhave() { command -v \"$1\" >/dev/null 2>&1; }\n\nif have pandoc && have xelatex; then\n pandoc \"$infile\" -o \"${base}.pdf\" --pdf-engine=xelatex\n echo \"Produced ${base}.pdf via pandoc + xelatex\"\nelif have pandoc && have pdflatex; then\n pandoc \"$infile\" -o \"${base}.pdf\" --pdf-engine=pdflatex\n echo \"Produced ${base}.pdf via pandoc + pdflatex\"\nelif have pandoc && have weasyprint; then\n pandoc \"$infile\" -o \"${base}.html\"\n weasyprint \"${base}.html\" \"${base}.pdf\"\n echo \"Produced ${base}.pdf via HTML + weasyprint\"\nelif have pandoc && have wkhtmltopdf; then\n pandoc \"$infile\" -o \"${base}.html\"\n wkhtmltopdf \"${base}.html\" \"${base}.pdf\"\n echo \"Produced ${base}.pdf via HTML + wkhtmltopdf\"\nelif have pandoc; then\n pandoc \"$infile\" -o \"${base}.html\"\n echo \"No PDF toolchain found; produced ${base}.html instead\"\nelse\n cp \"$infile\" \"${base}.md\"\n echo \"No converter found; copied source to ${base}.md\"\nfi\n\n### LaTeX input, adaptive output\n#!/usr/bin/env bash\nset -euo pipefail\n\ntexfile=\"${1:-doc.tex}\"\nhave() { command -v \"$1\" >/dev/null 2>&1; }\n\nif have xelatex; then\n xelatex -interaction=nonstopmode -halt-on-error \"$texfile\"\nelif have pdflatex; then\n 
pdflatex -interaction=nonstopmode -halt-on-error \"$texfile\"\nelse\n echo \"No LaTeX PDF engine installed; cannot compile PDF\" >&2\n exit 1\nfi\n\n## Error-handling guidance\n\n### If direct PDF conversion fails\nDo not loop on the same failing assumption. Instead:\n1. Inspect whether the failure is due to a missing engine\n2. Switch to another supported path\n3. Preserve useful intermediate artifacts such as `.html` or `.tex`\n\nExample:\n- If `pandoc -o output.pdf` fails because no TeX engine is installed, retry with:\n - `pandoc input.md -o output.html`\n - then use `weasyprint` or `wkhtmltopdf` if available\n\n### If fonts or Unicode break LaTeX output\nPrefer `xelatex` over `pdflatex`.\n\n### If no converter exists\nStill provide a useful deliverable:\n- polished Markdown\n- standalone HTML\n- LaTeX source\n- a concise explanation of what blocked PDF generation\n\n## Output policy for autonomous agents\n\nWhen reporting results:\n- State which tools were detected\n- State which path was selected\n- Name the produced files\n- If PDF was not produced, explicitly say why and identify the fallback artifact\n\nGood example:\n- Detected: `pandoc`, `weasyprint`\n- Selected path: Markdown → HTML via pandoc, then HTML → PDF via weasyprint\n- Produced: `report.html`, `report.pdf`\n\nGood fallback example:\n- Detected: `pandoc` only\n- No PDF engine available\n- Produced: `report.html` as the best available deliverable\n\n## Heuristics\n\n- Prefer successful output over ideal-but-unavailable output\n- Prefer deterministic detection over trial-and-error guesses\n- Prefer preserving intermediates when switching strategies\n- Prefer `xelatex` over `pdflatex` when both exist\n- Prefer HTML as a universal intermediate when PDF engines are missing\n\n## Anti-patterns to avoid\n\nAvoid:\n- Assuming `pandoc -o file.pdf` will work everywhere\n- Assuming TeX is installed because `pandoc` is installed\n- Repeating failed PDF commands without probing environment\n- Discarding 
useful `.html` or `.tex` intermediates\n- Claiming a PDF was produced without verifying the file exists\n\n## Minimal verification checklist\n\nBefore declaring success:\n- Confirm the selected converter command exited successfully\n- Confirm the expected output file exists\n- If PDF was requested but unavailable, confirm fallback file exists\n- Report the exact fallback taken\n\n## Summary\n\nThis workflow turns document generation into an adaptive process:\n- probe tools first\n- choose the best available route\n- fall back cleanly\n- always leave behind the highest-value output the environment supports" + }, + "created_at": "2026-04-12T02:55:18.433237", + "created_by": "gpt-5.4" + }, + "tool_dependencies": [], + "critical_tools": [], + "total_selections": 0, + "total_applied": 0, + "total_completions": 0, + "total_fallbacks": 0, + "recent_analyses": [], + "first_seen": "2026-04-12T02:55:18.433253", + "last_updated": "2026-04-12T02:55:18.433254" + } +] \ No newline at end of file diff --git a/context/local-machine/admin-macos/mcp/repo-local.mcp.json b/context/local-machine/admin-macos/mcp/repo-local.mcp.json new file mode 100644 index 0000000..664da91 --- /dev/null +++ b/context/local-machine/admin-macos/mcp/repo-local.mcp.json @@ -0,0 +1,23 @@ +{ + "mcpServers": { + "openspace_evolution": { + "type": "stdio", + "command": "/Users/admin/PycharmProjects/openspace/.venv/bin/python", + "args": [ + "-m", + "openspace.evolution_mcp_server", + "--transport", + "stdio" + ], + "env": { + "OPENSPACE_WORKSPACE": "/Users/admin/PycharmProjects/openspace", + "OPENSPACE_HOST_SKILL_DIRS": "/Users/admin/.claude/projects/openspace/skills,/Users/admin/.claude/skills", + "OPENSPACE_MODEL": "gpt-5.4", + "OPENSPACE_LLM_API_BASE": "http://127.0.0.1:8080/v1", + "OPENSPACE_LLM_OPENAI_STREAM_COMPAT": "true", + "OPENSPACE_ENABLE_RECORDING": "false", + "OPENSPACE_BACKEND_SCOPE": "shell,system" + } + } + } +} diff --git a/context/project-skills/openspace/README.md 
b/context/project-skills/openspace/README.md new file mode 100644 index 0000000..247a81a --- /dev/null +++ b/context/project-skills/openspace/README.md @@ -0,0 +1,8 @@ +## OpenSpace Project Skills + +This directory mirrors project-scoped skills that are intentionally installed +outside the repository under `~/.codex/projects/openspace/skills/`. + +Use it to keep reusable project-routing and workflow skills versioned in Git so +they can be reviewed, migrated to other machines, and reinstalled into the +project bucket when needed. diff --git a/context/project-skills/openspace/validate-skill-routing/.skill_id b/context/project-skills/openspace/validate-skill-routing/.skill_id new file mode 100644 index 0000000..df71493 --- /dev/null +++ b/context/project-skills/openspace/validate-skill-routing/.skill_id @@ -0,0 +1 @@ +validate-skill-routing__imp_6944a359 diff --git a/context/project-skills/openspace/validate-skill-routing/.upload_meta.json b/context/project-skills/openspace/validate-skill-routing/.upload_meta.json new file mode 100644 index 0000000..ca8769f --- /dev/null +++ b/context/project-skills/openspace/validate-skill-routing/.upload_meta.json @@ -0,0 +1,7 @@ +{ + "origin": "captured", + "parent_skill_ids": [], + "change_summary": "Captured a workflow for validating OpenSpace skill routing by checking project/default/global bucket behavior, inspecting installed launcher/config drift, and applying minimal repair steps when router sanity checks degrade.", + "created_by": "gpt-5.4", + "tags": [] +} diff --git a/context/project-skills/openspace/validate-skill-routing/SKILL.md b/context/project-skills/openspace/validate-skill-routing/SKILL.md new file mode 100644 index 0000000..6a4c60d --- /dev/null +++ b/context/project-skills/openspace/validate-skill-routing/SKILL.md @@ -0,0 +1,130 @@ +--- +name: validate-skill-routing +description: Diagnose and repair OpenSpace project skill routing in this repo, including skill routing checks, router sanity check, bucket fallback 
validation, 自动路由, 项目 skill 路由, 路由自检, bucket 回退, 检查自动路由是否生效, and wrapper/index/config drift affecting routing. +--- + +# Validate Skill Routing + +Use this workflow when the task is about verifying or fixing OpenSpace routing behavior for this repository. + +Keep the scope narrow: + +- validate repo root to project-bucket mapping +- validate project/default/global candidate bucket order +- interpret router output +- distinguish task-wording mismatch from a real routing bug +- inspect wrapper-derived `OPENSPACE_WORKSPACE` +- inspect `OPENSPACE_HOST_SKILL_DIRS` coverage +- detect project skill index drift +- decide whether stale process state requires restart + +Do not broaden into generic MCP health or leak cleanup unless the routing checks point there. + +## Workflow + +### 1. Confirm repo root and expected bucket + +Derive the canonical repo root with git, not only the current working directory. + +Use the repo basename as the expected bucket name. + +For this repository, healthy routing means the bucket resolves to `openspace`. + +### 2. Inspect the active routing path + +Inspect the active installed launcher and config path, not only repo templates. + +Check: + +- `~/.codex/bin/openspace-global-mcp` +- `~/.codex/AGENTS.md` +- `~/.codex/config.toml` + +Validate: + +- launcher canonicalizes workspace to git root when possible +- launcher exports `OPENSPACE_WORKSPACE` from that canonical root +- launcher includes the project skill dir before generic global skills +- the configured MCP entry still points at the launcher you expect + +Do not conclude repo-tracked scripts are broken until the installed launchers under `~/.codex` have been checked. + +### 3. Run the router with three probes + +Use `/Users/admin/.codex/tools/route_codex_skills_via_openspace.py --cwd --task "" --json`. + +Run these exact probes: + +1. Project-bucket probe + - `修复 provider rollout 验证流程,确认本地 wrapper 和 repo tracked 配置一致` + - expected: an `openspace` project skill hit + +2. 
Default-bucket probe + - `重启 MCP 之后做一次健康检查,确认进程祖先和 live tool probe 都恢复正常` + - expected: a `default` bucket hit + +3. Routing-self-check probe + - `检查 openspace 项目 skill 路由、router sanity check、验证 project/default/global bucket fallback` + - expected: `validate-skill-routing` hit after this skill exists + +If useful, rerun from a nested subdirectory inside the repo to confirm the bucket still resolves to `openspace`. + +### 4. Compare router outputs + +Compare these fields across probes: + +- `bucket` +- `candidate_dirs` +- `prefilter_ranked` +- `selected` +- `selection_record` + +Treat the route as healthy only when: + +- the project probe hits the intended `openspace` project bucket +- the default probe still hits the intended `default` bucket +- the routing-self-check probe no longer falls back generically + +### 5. Localize the failure in this order + +If routing is wrong, diagnose in this order: + +1. task wording or frontmatter mismatch +2. missing or weak project skill description +3. stale `SKILL_INDEX.md` +4. wrapper or config drift +5. stale long-lived process state + +A global fallback on a vague meta-task is not automatically a bug. + +Prefer diagnosing wording and metadata mismatch before blaming the router. + +### 6. Apply the minimal repair sequence + +Apply the smallest fix that explains the failure: + +1. strengthen or add project-skill trigger terms in frontmatter description +2. regenerate the openspace project skill index +3. reinstall global wrappers only if launcher behavior changed +4. restart Codex or daemons only if env or wrapper changes require a fresh process state + +Do not restart long-lived processes just because a probe was vague and matched a global skill. + +### 7. Re-run and record before/after + +Re-run the same probes after any repair. 
+ +Record: + +- the probe strings used +- selected bucket before and after +- whether the installed launcher/config changed +- whether index regeneration was required +- whether restart was required + +## Guardrails + +- Validate actual installed launchers before diagnosing repo-tracked launcher bugs. +- Treat vague routing-self-check prompts as ambiguous until probe evidence shows a real regression. +- Prefer precise trigger wording when validating a project skill. +- Keep default-bucket behavior intact while improving project-skill coverage. diff --git a/docs/codex-desktop-sidecar-evolution.md b/docs/codex-desktop-sidecar-evolution.md new file mode 100644 index 0000000..382f785 --- /dev/null +++ b/docs/codex-desktop-sidecar-evolution.md @@ -0,0 +1,246 @@ +# Codex Desktop Sidecar Evolution Integration + +## Goal + +This integration keeps the normal Codex Desktop workflow unchanged while moving OpenSpace skill capture and self-evolution onto a separate provider-backed sidecar path. + +The target user experience is: + +- Main coding still happens in Codex Desktop with the user's normal subscription login. +- OpenSpace does not take over the main task loop. +- Sidecar evolution can be invoked explicitly after a task and spend provider API tokens instead of the main Codex Desktop session. + +## Short Answer: Was this mainly an API-level dual routing change? + +No. + +The final effect does **not** come from a simple in-process "dual route" inside one OpenSpace runtime where: + +- coding uses Codex Desktop subscription auth, and +- evolution uses a provider API + +That approach is not viable because Codex Desktop subscription login is not exposed to the Python process as a reusable API credential. 
+ +Instead, the final implementation uses **process-level split routing**: + +- the main coding session remains in Codex Desktop +- self-evolution runs through an isolated OpenSpace sidecar with its own provider-backed MCP server + +API compatibility work was still necessary, but it is only one part of the solution. + +## Embedding Split Routing + +The sidecar now also supports a separate skill-embedding route from the main LLM. + +Recommended setup: + +```bash +OPENSPACE_MODEL=gpt-5.4 +OPENSPACE_LLM_API_KEY=sk-xxx +OPENSPACE_LLM_API_BASE=http://127.0.0.1:8080/v1 + +OPENSPACE_SKILL_EMBEDDING_BACKEND=local +OPENSPACE_SKILL_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 +``` + +If you want a dedicated remote endpoint for skill embeddings instead of local +fastembed, set: + +```bash +OPENSPACE_SKILL_EMBEDDING_BACKEND=remote +OPENSPACE_SKILL_EMBEDDING_API_KEY=sk-embed-xxx +OPENSPACE_SKILL_EMBEDDING_API_BASE=https://example.com/v1 +OPENSPACE_SKILL_EMBEDDING_MODEL=openai/text-embedding-3-small +``` + +## What Was Implemented + +### 1. OpenAI-compatible provider bridge for OpenSpace + +File: + +- `openspace/llm/client.py` + +Why it was needed: + +- The third-party relay worked with Codex's `/responses` path. +- OpenSpace uses LiteLLM / OpenAI-style chat completion flows. +- The relay was not reliable enough for OpenSpace's normal streaming path. + +What changed: + +- Added an OpenAI-compatible streaming fallback that talks directly to `/chat/completions`. +- Reconstructed streamed text, reasoning content, and tool calls into the shape OpenSpace already expects. +- Enabled this path through `OPENSPACE_LLM_OPENAI_STREAM_COMPAT`. + +Effect: + +- OpenSpace can use the relay provider for evolution workloads. + +### 2. Evolution-only MCP sidecar + +File: + +- `openspace/evolution_mcp_server.py` + +Why it was needed: + +- The user wanted OpenSpace to handle only post-task evolution and skill capture. +- The main coding loop had to stay outside OpenSpace. 
+ +What changed: + +- Added a separate MCP server exposing only `evolve_from_context`. +- This server builds context from the current workspace, conversation summary, and git diff. +- It captures reusable skills without becoming the main task executor. + +Effect: + +- OpenSpace now has a narrow sidecar role instead of replacing the host coding agent. + +### 3. Sidecar-capable skill engine without full task recording + +File: + +- `openspace/tool_layer.py` + +Why it was needed: + +- The original skill evolution path assumed a fuller OpenSpace task/recording pipeline. +- The new sidecar path needed to create skills without enabling the normal OpenSpace recording flow. + +What changed: + +- Added `enable_skill_engine_without_recording`. +- Kept execution analysis tied to recording. +- Allowed skill evolution and skill store initialization in sidecar mode without enabling full task recordings. + +Effect: + +- Sidecar capture can work independently without creating full OpenSpace task sessions. + +### 4. Isolated Desktop launcher overlay + +File: + +- `scripts/codex-desktop-evolution` + +Why it was needed: + +- The main Codex Desktop session had to keep the user's normal login and defaults. +- The sidecar config had to be added without polluting `~/.codex`. + +What changed: + +- Created an overlay `CODEX_HOME` at `~/.codex-openspace-desktop`. +- Copied the primary Desktop auth and config base into the overlay. +- Added only one extra MCP server: `openspace_evolution`. +- Scrubbed `OPENSPACE_*` variables before launching the main Codex process. +- Avoided inheriting arbitrary shell state or leaking sidecar credentials into the main coding session. + +Effect: + +- Main Codex Desktop remains normal. +- The sidecar is available only in the isolated overlay profile. + +### 5. Agent instruction trigger for sidecar capture + +File: + +- `AGENTS.md` + +Why it was needed: + +- The sidecar should be callable naturally from the Desktop workflow. 
+- The user should not need to restate the full MCP call every time. + +What changed: + +- Added a repo-level instruction that maps phrases like `sidecar 自进化一下` to `openspace_evolution.evolve_from_context`. +- Limited the default behavior to: + - no code changes + - no main-task takeover + - at most one high-reuse skill by default + +Effect: + +- The sidecar behaves like a narrow post-task tool integrated into the normal Desktop workflow. + +## Other Supporting Changes + +### MCP stdout flush fix + +File: + +- `openspace/mcp_server.py` + +What changed: + +- Avoided a final stdout flush crash when the MCP stdio transport closes before Python exit. + +### Missing dependency for MCP backend + +Files: + +- `pyproject.toml` +- `requirements.txt` + +What changed: + +- Added `websockets>=15.0.0` +- Added `openspace-evolution-mcp` as a console entrypoint + +### Frontend dependency refresh + +File: + +- `frontend/package-lock.json` + +What changed: + +- Updated `lodash-es` +- Updated `vite` + +This was a maintenance fix and is not part of the sidecar architecture itself. + +## Architecture Summary + +The final architecture is: + +1. Codex Desktop remains the main coding agent. +2. Codex Desktop keeps using the user's normal subscription login. +3. A separate overlay profile adds an `openspace_evolution` MCP server. +4. That MCP server runs OpenSpace with provider-backed credentials. +5. OpenSpace uses the provider only for post-task evolution and skill capture. + +This means the practical "dual routing" exists at the workflow/process boundary, not as a single shared in-process auth router. 
+ +## Usage + +Launch the Desktop profile that includes the sidecar: + +```bash +cd /Users/admin/PycharmProjects/openspace +./scripts/codex-desktop-evolution app +``` + +Inside that Desktop session, trigger sidecar capture with: + +```text +sidecar 自进化一下 +``` + +or the longer explicit form: + +```text +对当前这轮工作做一次 sidecar 自进化。不要改代码,不要接管任务。请调用 openspace_evolution.evolve_from_context,基于当前对话、git diff 和关键改动,自动提炼 task/summary,最多生成 1 个高复用 skill,并告诉我 skill 名称、路径、为什么值得保留。 +``` + +## Result + +The implemented effect is: + +- normal Codex Desktop coding stays unchanged +- OpenSpace self-evolution is available on demand +- provider token spend is isolated to the sidecar path +- the sidecar does not silently take over the main workflow diff --git a/docs/current-routing-flow.md b/docs/current-routing-flow.md new file mode 100644 index 0000000..630c00e --- /dev/null +++ b/docs/current-routing-flow.md @@ -0,0 +1,81 @@ +# Current Routing Flow + +This document records the current OpenSpace routing setup for this local environment. + +## Effective Split Routing + +- Main LLM: + - model: `gpt-5.4` + - API base: `http://127.0.0.1:8080/v1` + - source: `OPENSPACE_LLM_*` +- Skill embeddings: + - backend: `local` + - model: `BAAI/bge-small-en-v1.5` + - source: `OPENSPACE_SKILL_EMBEDDING_*` + +This means: + +- normal OpenSpace generation and tool-calling still use the OpenAI-compatible provider path +- skill-router semantic re-rank does not depend on remote `/v1/embeddings` +- Codex Desktop main session remains isolated from the sidecar/provider env + +## Flow 1: OpenSpace CLI + +```mermaid +flowchart LR + A["User runs ./scripts/openspace.sh"] --> B["Load openspace/.env"] + B --> C["Set OPENSPACE_LLM_*"] + B --> D["Set OPENSPACE_SKILL_EMBEDDING_*"] + C --> E["LLM client"] + D --> F["SkillRanker"] + E --> G["sub2api / local OpenAI-compatible gateway
http://127.0.0.1:8080/v1"] + F --> H["fastembed local model
BAAI/bge-small-en-v1.5"] + G --> I["GroundingAgent execution"] + H --> J["BM25 + vector prefilter"] + J --> I +``` + +## Flow 2: Codex Desktop With OpenSpace Sidecar + +```mermaid +flowchart LR + A["User runs ./scripts/codex-desktop-evolution app"] --> B["Create isolated CODEX_HOME overlay"] + B --> C["Main Codex Desktop session"] + B --> D["openspace_evolution MCP sidecar"] + C --> E["Normal Codex subscription/API workflow"] + D --> F["OpenSpace evolution server"] + F --> G["OPENSPACE_LLM_* -> gpt-5.4 via http://127.0.0.1:8080/v1"] + F --> H["OPENSPACE_SKILL_EMBEDDING_* -> local fastembed"] + G --> I["Evolution / skill capture"] + H --> I +``` + +## Flow 3: Skill Routing Internals + +```mermaid +flowchart LR + A["Task text"] --> B["Early abstain check"] + B --> C["BM25 rough rank"] + C --> D["Local embedding re-rank"] + D --> E["Top candidate skills"] + E --> F["Optional LLM selection"] + F --> G["Injected / selected skills"] +``` + +## Key Config Inputs + +- `OPENSPACE_LLM_API_KEY` +- `OPENSPACE_LLM_API_BASE` +- `OPENSPACE_LLM_OPENAI_STREAM_COMPAT` +- `OPENSPACE_SKILL_EMBEDDING_BACKEND` +- `OPENSPACE_SKILL_EMBEDDING_MODEL` + +## Operational Notes + +- If the provider does not expose `/v1/embeddings`, the main LLM path still works. +- With the current setup, skill embeddings stay local, so router prefilter remains available. +- If needed later, skill embeddings can be moved to a separate remote endpoint by setting: + - `OPENSPACE_SKILL_EMBEDDING_BACKEND=remote` + - `OPENSPACE_SKILL_EMBEDDING_API_KEY` + - `OPENSPACE_SKILL_EMBEDDING_API_BASE` + - `OPENSPACE_SKILL_EMBEDDING_MODEL` diff --git a/docs/global-codex-integration.md b/docs/global-codex-integration.md new file mode 100644 index 0000000..776b3ae --- /dev/null +++ b/docs/global-codex-integration.md @@ -0,0 +1,139 @@ +# Global Codex Integration + +This document records the machine-wide Codex integration used in the local customized OpenSpace setup. 
+ +## What Is Global vs Repo-Tracked + +### Repo-tracked + +These are versioned in this repository: + +- OpenSpace runtime changes +- split routing for main LLM vs skill embeddings +- project launchers under `scripts/` +- integration docs under `docs/` +- installer for global MCP wrappers: + - `scripts/install-global-codex-openspace` + +### Local-only + +These live under `~/.codex` on the local machine and are intentionally not committed directly: + +- `~/.codex/config.toml` +- `~/.codex/AGENTS.md` +- `~/.codex/bin/openspace-global-mcp` +- `~/.codex/bin/openspace-evolution-global-mcp` + +These files are machine-specific because they may contain: + +- absolute local paths +- user-specific Codex settings +- local MCP wiring +- secrets or provider-specific credentials + +## Why `.mcp.json` Was Not Committed + +The repo-local `.mcp.json` is also intentionally excluded from GitHub because it is a local override with: + +- absolute paths into this machine +- host-agent-specific skill directories +- local workflow assumptions + +That file is useful for local experimentation, but it is not the stable source of truth for the global Codex integration. + +## Canonical Global Setup + +The intended machine-wide setup is: + +1. Global Codex config points MCP servers to: + - `~/.codex/bin/openspace-global-mcp` + - `~/.codex/bin/openspace-evolution-global-mcp` +2. Those wrapper scripts: + - detect the current project directory + - normalize it to the git repo root when possible + - set `OPENSPACE_WORKSPACE` + - route project skills to `~/.codex/projects/<repo>/skills` + - include common global skills from `~/.codex/skills` + - call the shared `stdio` proxy entrypoint + - default `openspace` to `OPENSPACE_MCP_PROXY_MODE=daemon` + - default `openspace_evolution` to `OPENSPACE_MCP_PROXY_MODE=daemon` + - place per-instance daemon state under `OPENSPACE_MCP_DAEMON_STATE_DIR` unless an override is already set +3. 
Global `~/.codex/AGENTS.md` tells Codex: + - to prefer project skill routing + - to auto-run sidecar evolution for non-trivial repo work + - to treat missing `git init` as a repo bootstrap issue + +## Daemon / Proxy V1 + +The global and local launchers keep the same wrapper names and the same MCP config shape, but they now sit in front of a shared-daemon topology: + +- Codex still talks to stdio wrapper scripts. +- The wrapper scripts keep the existing command names but route into `openspace.mcp_proxy`. +- Both main and evolution now default to `OPENSPACE_MCP_PROXY_MODE=daemon`. +- The proxy path resolves or starts a per-instance daemon using `OPENSPACE_MCP_DAEMON_STATE_DIR`. +- The daemon owns the long-lived OpenSpace engine and serves it over localhost transport. + +This keeps the external Codex contract stable while reducing the number of overlapping OpenSpace engine processes. + +### Fallbacks + +The proxy surface supports two internal overrides: + +- `OPENSPACE_MCP_PROXY_MODE=direct` restores the old direct stdio behavior for debugging or rollback. +- `OPENSPACE_MCP_DAEMON_STATE_DIR=/custom/path` moves daemon state to a different local directory. + +The repo-local `scripts/codex-openspace` helper writes the same daemon defaults into the generated profile so local and global setups stay aligned. + +### Daemon State Metadata + +Each per-key daemon writes a JSON record under `OPENSPACE_MCP_DAEMON_STATE_DIR` named like: + +- `main-.json` +- `evolution-.json` + +For the main daemon path, the record now distinguishes two lifecycle phases: + +- `ready=true`: the daemon is reachable and `list_tools` has succeeded. +- `warmed=true`: background prewarm has completed, so the local embedding backend and candidate cache are ready. 
+ +Useful timestamps: + +- `started_at`: child process spawn time +- `ready_at`: first confirmed MCP-ready time +- `warmed_at`: prewarm completion time + +This makes it possible to tell the difference between: + +- daemon is up but still warming +- daemon is fully warmed and ready for low-latency calls + +## Reinstalling the Global Wrappers + +Use: + +```bash +cd /path/to/OpenSpace +./scripts/install-global-codex-openspace +``` + +This script recreates: + +- `~/.codex/bin/openspace-global-mcp` +- `~/.codex/bin/openspace-evolution-global-mcp` + +It does **not** overwrite your `~/.codex/config.toml` or `~/.codex/AGENTS.md`. + +## Practical Outcome + +With the global integration in place: + +- opening a new project in Codex does not require per-project `.mcp.json` +- OpenSpace MCP is available globally +- OpenSpace evolution MCP is available globally +- repo-scoped skill routing and sidecar evolution use the current project automatically + +## Related Docs + +- `docs/current-routing-flow.md` +- `docs/release-note-local-customization.md` +- `docs/codex-desktop-sidecar-evolution.md` diff --git a/docs/release-note-local-customization.md b/docs/release-note-local-customization.md new file mode 100644 index 0000000..b622908 --- /dev/null +++ b/docs/release-note-local-customization.md @@ -0,0 +1,156 @@ +# Release Note: Local Split-Routing Customization + +This note explains what the local customization changed, why it was needed, and why embeddings looked "broken first, then fixed later". 
+ +## Summary + +The local customization solved a routing mismatch between: + +- the main OpenSpace LLM path +- the skill-router embedding path +- the Codex Desktop sidecar path + +The important result is: + +- OpenSpace main inference can continue using the local OpenAI-compatible gateway and `gpt-5.4` +- skill-router embeddings no longer depend on that gateway exposing `/v1/embeddings` +- Codex Desktop sidecar evolution stays isolated from the user's main Desktop session + +## What Was Wrong Before + +The main provider path was already usable for normal OpenSpace LLM calls: + +- `/v1/chat/completions` +- `/v1/responses`-compatible workflows + +But the skill router has a separate semantic re-rank stage that tries to call: + +- `POST /v1/embeddings` + +The current local gateway worked for the main LLM path, but did not provide a working embeddings endpoint for this use case. + +So the real state before the fix was: + +- main OpenSpace task execution: usable +- skill-router semantic embeddings: degraded or unavailable +- Desktop sidecar: usable for LLM tasks, but still inherited the embedding weakness + +## Why It Looked Like "Embeddings Started Working" + +Embeddings did **not** start working on the same remote provider endpoint. + +What changed was the routing. + +Originally: + +- main LLM and skill embeddings were effectively expected to succeed through the same OpenAI-compatible path + +After the fix: + +- main LLM stayed on the provider path +- skill embeddings were routed to a different backend + +For the current local setup, that different backend is: + +- `fastembed` +- model: `BAAI/bge-small-en-v1.5` + +So the correct explanation is: + +- the remote embeddings path was not fixed +- the system was changed so it no longer needed that remote embeddings path + +## What Was Implemented + +### 1. 
Split routing for skill embeddings + +Skill embedding generation now supports: + +- `OPENSPACE_SKILL_EMBEDDING_BACKEND=local` +- `OPENSPACE_SKILL_EMBEDDING_BACKEND=remote` +- `OPENSPACE_SKILL_EMBEDDING_BACKEND=auto` + +This was implemented in: + +- `openspace/cloud/embedding.py` +- `openspace/skill_engine/skill_ranker.py` + +Effect: + +- the main LLM provider and the skill embedding backend are now decoupled + +### 2. Local embedding support + +Added `fastembed` as a project dependency and enabled local skill embeddings. + +Effect: + +- the skill router can still do semantic vector re-rank even when the LLM provider does not expose `/v1/embeddings` + +### 3. Launcher propagation + +Updated launchers so the new embedding settings are passed through consistently: + +- `scripts/openspace.sh` +- `scripts/codex-openspace` +- `scripts/codex-desktop-evolution` + +Effect: + +- CLI, isolated Codex profile, and Desktop sidecar all use the same split-routing model + +### 4. Sidecar isolation kept intact + +The Desktop sidecar still runs in an isolated overlay profile and does not overwrite the main Codex Desktop environment. + +Effect: + +- the user can keep their normal Desktop workflow +- OpenSpace evolution still uses the provider-backed sidecar path +- embedding config does not leak back into the main Desktop session + +## Current Effective Routing + +### Main LLM path + +- model: `gpt-5.4` +- API base: `http://127.0.0.1:8080/v1` +- source: `OPENSPACE_LLM_*` + +### Skill-router embedding path + +- backend: `local` +- model: `BAAI/bge-small-en-v1.5` +- source: `OPENSPACE_SKILL_EMBEDDING_*` + +### Desktop sidecar + +- still isolated through `scripts/codex-desktop-evolution` +- uses the same split routing internally + +## Practical Result + +This local customization solved three concrete problems: + +1. OpenSpace no longer treats "main LLM works" and "embeddings work" as the same thing. +2. 
Skill routing quality is preserved even when the LLM provider has no usable `/v1/embeddings`. +3. Codex Desktop sidecar evolution keeps using provider tokens without polluting the main Desktop environment. + +## Short Version + +Before: + +- one path was effectively assumed to do both LLM and embeddings +- the provider could handle the LLM part +- the embedding part was weak or unavailable + +Now: + +- LLM still goes through the provider +- embeddings go through local fastembed +- sidecar remains isolated + +That is why it first looked broken and later looked fixed. + +It was not a provider repair. +It was a routing repair. diff --git a/frontend/package-lock.json b/frontend/package-lock.json index b4630ac..e2c75a4 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -2512,9 +2512,9 @@ "license": "MIT" }, "node_modules/lodash-es": { - "version": "4.17.23", - "resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.23.tgz", - "integrity": "sha512-kVI48u3PZr38HdYz98UmfPnXl2DXrpdctLrFLCd3kOx1xUkOmpFPx7gCWWM5MPkL/fD8zb+Ph0QzjGFs4+hHWg==", + "version": "4.18.1", + "resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.18.1.tgz", + "integrity": "sha512-J8xewKD/Gk22OZbhpOVSwcs60zhd95ESDwezOFuA3/099925PdHJ7OFHNTGtajL3AlZkykD32HykiMo+BIBI8A==", "license": "MIT" }, "node_modules/loose-envify": { @@ -3455,9 +3455,9 @@ "license": "MIT" }, "node_modules/vite": { - "version": "6.4.1", - "resolved": "https://registry.npmjs.org/vite/-/vite-6.4.1.tgz", - "integrity": "sha512-+Oxm7q9hDoLMyJOYfUYBuHQo+dkAloi33apOPP56pzj+vsdJDzr+j1NISE5pyaAuKL4A3UD34qd0lx5+kfKp2g==", + "version": "6.4.2", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.4.2.tgz", + "integrity": "sha512-2N/55r4JDJ4gdrCvGgINMy+HH3iRpNIz8K6SFwVsA+JbQScLiC+clmAxBgwiSPgcG9U15QmvqCGWzMbqda5zGQ==", "dev": true, "license": "MIT", "dependencies": { diff --git a/gdpval_bench/run_benchmark.py b/gdpval_bench/run_benchmark.py index a3790eb..49d031c 100644 --- 
a/gdpval_bench/run_benchmark.py +++ b/gdpval_bench/run_benchmark.py @@ -388,6 +388,7 @@ def _make_config(cfg: Dict, phase: str, worker_id: int = 0): return OpenSpaceConfig( llm_model=cfg["model"], + llm_kwargs=cfg.get("llm_kwargs", {}), workspace_dir=str(rd / "workspace" / phase), recording_log_dir=rec_dir, recording_backends=cfg.get("backend_scope", ["shell", "web"]), @@ -1164,6 +1165,16 @@ async def main(args: argparse.Namespace) -> None: if getattr(args, "use_clawwork_productivity", False): cfg["use_clawwork_productivity"] = True + # Match CLI / MCP entrypoints: resolve provider credentials once and pass + # them into OpenSpaceConfig.llm_kwargs so benchmark runs work with custom + # OPENSPACE_LLM_* endpoints too. + from openspace.host_detection import build_llm_kwargs, load_runtime_env + + load_runtime_env() + resolved_model, llm_kwargs = build_llm_kwargs(cfg.get("model", "")) + cfg["model"] = resolved_model + cfg["llm_kwargs"] = llm_kwargs + # When using ClawWork productivity tools, ensure livebench is importable # before OpenSpace.initialize() (ShellSession loads productivity_tools which imports livebench) if cfg.get("use_clawwork_productivity"): diff --git a/openspace/.env.example b/openspace/.env.example index d8e9465..b9aba77 100644 --- a/openspace/.env.example +++ b/openspace/.env.example @@ -38,6 +38,18 @@ OPENROUTER_API_KEY= # OPENSPACE_LLM_API_KEY=sk-xxx # OPENSPACE_LLM_API_BASE=https://openrouter.ai/api/v1 +# --- Recommended split routing for OpenSpace itself --- +# Keep the main LLM on your OpenAI-compatible provider, +# but route skill embeddings separately. 
+# +# Example: LLM via sub2api / local gateway, skill embeddings via local fastembed +# +# OPENSPACE_MODEL=gpt-5.4 +# OPENSPACE_LLM_API_KEY=sk-xxx +# OPENSPACE_LLM_API_BASE=http://127.0.0.1:8080/v1 +# OPENSPACE_SKILL_EMBEDDING_BACKEND=local +# OPENSPACE_SKILL_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 + # ── OpenSpace Cloud (optional) ────────────────────────────── # Register at https://open-space.cloud to get your key. # Enables cloud skill search & upload; local features work without it. @@ -51,9 +63,30 @@ OPENSPACE_API_KEY=sk_xxxxxxxxxxxxxxxx # Optional backup key for rate limit fallback: # ANTHROPIC_API_KEY_BACKUP= -# ── Embedding (optional) ──────────────────────────────────── -# For remote embedding API instead of local model. -# If not set, OpenSpace uses a local embedding model (BAAI/bge-small-en-v1.5). +# ── Skill Embedding (optional, router-only) ───────────────── +# Controls the skill-router semantic re-rank path independently +# from the main LLM provider. +# +# OPENSPACE_SKILL_EMBEDDING_BACKEND=auto +# - auto → prefer explicit remote embedding config, then legacy OpenAI-compatible env, then local fastembed +# - local → force local fastembed model +# - remote → force remote OpenAI-compatible /embeddings endpoint +# +# OPENSPACE_SKILL_EMBEDDING_BACKEND=local +# OPENSPACE_SKILL_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 +# +# Or use a dedicated remote embedding endpoint: +# OPENSPACE_SKILL_EMBEDDING_BACKEND=remote +# OPENSPACE_SKILL_EMBEDDING_API_KEY=sk-xxx +# OPENSPACE_SKILL_EMBEDDING_API_BASE=https://example.com/v1 +# OPENSPACE_SKILL_EMBEDDING_MODEL=openai/text-embedding-3-small + +# ── Tool / Generic Embedding (optional) ───────────────────── +# Used by tool search's semantic retrieval. Can also act as a fallback +# remote embedding endpoint for the skill router when the dedicated +# OPENSPACE_SKILL_EMBEDDING_* vars are not set. +# +# If not set, tool search uses a local embedding model (BAAI/bge-small-en-v1.5). 
# EMBEDDING_BASE_URL= # EMBEDDING_API_KEY= # EMBEDDING_MODEL=openai/text-embedding-3-small @@ -68,4 +101,4 @@ OPENSPACE_API_KEY=sk_xxxxxxxxxxxxxxxx # LOCAL_SERVER_URL=http://127.0.0.1:5000 # ---- Debug (Optional) ---- -# OPENSPACE_DEBUG=true \ No newline at end of file +# OPENSPACE_DEBUG=true diff --git a/openspace/cloud/embedding.py b/openspace/cloud/embedding.py index 6606936..277e851 100644 --- a/openspace/cloud/embedding.py +++ b/openspace/cloud/embedding.py @@ -1,4 +1,13 @@ -"""Embedding generation via OpenAI-compatible API.""" +"""Embedding generation for skill routing. + +Supports a dedicated skill-embedding path that can be routed +independently from the main LLM: + +- ``OPENSPACE_SKILL_EMBEDDING_BACKEND=local`` → local fastembed model +- ``OPENSPACE_SKILL_EMBEDDING_BACKEND=remote`` → dedicated OpenAI-compatible endpoint +- ``OPENSPACE_SKILL_EMBEDDING_BACKEND=auto`` → prefer dedicated/generic remote config, + then fall back to legacy OpenAI-compatible env vars, then local fastembed +""" from __future__ import annotations @@ -11,26 +20,66 @@ logger = logging.getLogger("openspace.cloud") -# Constants (duplicated here to avoid top-level import of skill_ranker) -SKILL_EMBEDDING_MODEL = "openai/text-embedding-3-small" +# Defaults +SKILL_REMOTE_EMBEDDING_MODEL = "openai/text-embedding-3-small" +SKILL_LOCAL_EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5" SKILL_EMBEDDING_MAX_CHARS = 12_000 SKILL_EMBEDDING_DIMENSIONS = 1536 _OPENROUTER_BASE = "https://openrouter.ai/api/v1" _OPENAI_BASE = "https://api.openai.com/v1" +_VALID_BACKENDS = {"auto", "local", "remote"} +_LOCAL_EMBEDDER = None +_LOCAL_EMBEDDER_MODEL = None +_EMBEDDING_WARMUP_TEXT = "openspace skill embedding warmup" -def resolve_embedding_api() -> Tuple[Optional[str], str]: - """Resolve API key and base URL for embedding requests. 
+def resolve_skill_embedding_backend() -> str: + """Resolve skill-embedding backend mode.""" + value = os.environ.get("OPENSPACE_SKILL_EMBEDDING_BACKEND", "auto").strip().lower() + if value in _VALID_BACKENDS: + return value + return "auto" - Priority: - 1. ``OPENROUTER_API_KEY`` → OpenRouter base URL - 2. ``OPENAI_API_KEY`` + ``OPENAI_BASE_URL`` (default ``api.openai.com``) - 3. host-agent config (nanobot / openclaw) - Returns: - ``(api_key, base_url)`` — *api_key* may be ``None`` when no key is found. - """ +def resolve_skill_embedding_model(backend: Optional[str] = None) -> str: + """Resolve the model name for skill embeddings.""" + backend = backend or resolve_skill_embedding_backend() + explicit = os.environ.get("OPENSPACE_SKILL_EMBEDDING_MODEL", "").strip() + if explicit: + return explicit + if backend == "local": + return SKILL_LOCAL_EMBEDDING_MODEL + if backend == "auto": + remote_key, _ = _resolve_remote_embedding_api() + if not remote_key: + return SKILL_LOCAL_EMBEDDING_MODEL + return SKILL_REMOTE_EMBEDDING_MODEL + + +def using_local_skill_embeddings(backend: Optional[str] = None) -> bool: + """Return whether skill embeddings resolve to the local fastembed path.""" + backend = backend or resolve_skill_embedding_backend() + if backend == "local": + return True + if backend == "remote": + return False + remote_key, _ = _resolve_remote_embedding_api() + return not bool(remote_key) + + +def _resolve_remote_embedding_api() -> Tuple[Optional[str], str]: + """Resolve remote embedding credentials/base URL for skill routing.""" + dedicated_key = os.environ.get("OPENSPACE_SKILL_EMBEDDING_API_KEY") + dedicated_base = os.environ.get("OPENSPACE_SKILL_EMBEDDING_API_BASE") + if dedicated_key and dedicated_base: + return dedicated_key, dedicated_base.rstrip("/") + + generic_key = os.environ.get("EMBEDDING_API_KEY") + generic_base = os.environ.get("EMBEDDING_BASE_URL") + if generic_key and generic_base: + return generic_key, generic_base.rstrip("/") + or_key = 
os.environ.get("OPENROUTER_API_KEY") if or_key: return or_key, _OPENROUTER_BASE @@ -42,6 +91,7 @@ def resolve_embedding_api() -> Tuple[Optional[str], str]: try: from openspace.host_detection import get_openai_api_key + host_key = get_openai_api_key() if host_key: base = os.environ.get("OPENAI_BASE_URL", _OPENAI_BASE).rstrip("/") @@ -52,6 +102,22 @@ def resolve_embedding_api() -> Tuple[Optional[str], str]: return None, _OPENAI_BASE +def resolve_embedding_api() -> Tuple[Optional[str], str]: + """Resolve API key and base URL for remote embedding requests. + + Priority: + 1. ``OPENSPACE_SKILL_EMBEDDING_API_*`` dedicated skill-router endpoint + 2. ``EMBEDDING_*`` generic embedding endpoint + 3. ``OPENROUTER_API_KEY`` → OpenRouter base URL + 4. ``OPENAI_API_KEY`` + ``OPENAI_BASE_URL`` (default ``api.openai.com``) + 5. host-agent config (nanobot / openclaw) + + Returns: + ``(api_key, base_url)`` — *api_key* may be ``None`` when no key is found. + """ + return _resolve_remote_embedding_api() + + def cosine_similarity(a: List[float], b: List[float]) -> float: """Compute cosine similarity between two vectors.""" if len(a) != len(b) or not a: @@ -81,33 +147,98 @@ def build_skill_embedding_text( return raw[:max_chars] +def _load_local_embedder(model_name: str): + """Load and cache the local embedding model.""" + global _LOCAL_EMBEDDER, _LOCAL_EMBEDDER_MODEL + + if _LOCAL_EMBEDDER is not None and _LOCAL_EMBEDDER_MODEL == model_name: + return _LOCAL_EMBEDDER + + try: + from fastembed import TextEmbedding + except ImportError: + logger.warning( + "Local skill embeddings requested but fastembed is not installed. " + "Install it with `pip install fastembed`." 
+ ) + return None + + try: + logger.info("Loading local skill embedding model: %s", model_name) + _LOCAL_EMBEDDER = TextEmbedding(model_name=model_name) + _LOCAL_EMBEDDER_MODEL = model_name + return _LOCAL_EMBEDDER + except Exception as exc: + logger.warning("Failed to load local skill embedding model %s: %s", model_name, exc) + return None + + +def _generate_local_embedding(text: str, model_name: str) -> Optional[List[float]]: + embedder = _load_local_embedder(model_name) + if embedder is None: + return None + + try: + vector = next(iter(embedder.embed([text]))) + if hasattr(vector, "tolist"): + return vector.tolist() + return list(vector) + except Exception as exc: + logger.warning("Local skill embedding generation failed: %s", exc) + return None + + +def prewarm_local_skill_embedding_backend() -> bool: + """Warm the local skill embedding backend when local routing is active. + + Returns True when the local backend is active and the embedder produced + a warmup embedding, False otherwise. + """ + backend = resolve_skill_embedding_backend() + if not using_local_skill_embeddings(backend): + return False + + model_name = resolve_skill_embedding_model(backend) + vector = _generate_local_embedding(_EMBEDDING_WARMUP_TEXT, model_name) + return vector is not None + + def generate_embedding(text: str, api_key: Optional[str] = None) -> Optional[List[float]]: - """Generate embedding using OpenAI-compatible API. + """Generate skill embedding using the configured local/remote backend. When *api_key* is ``None``, credentials are resolved automatically via - :func:`resolve_embedding_api` (``OPENROUTER_API_KEY`` → ``OPENAI_API_KEY`` - → host-agent config). + :func:`resolve_embedding_api`. - This is a **synchronous** call (uses urllib). In async contexts, - wrap with ``asyncio.to_thread()``. + Local mode uses ``fastembed``. + Remote mode uses an OpenAI-compatible ``/embeddings`` endpoint. Args: text: The text to embed. - api_key: Explicit API key. 
When provided, base URL is still resolved - from environment (``OPENROUTER_API_KEY`` presence determines - the endpoint). + api_key: Explicit API key for remote mode. Returns: Embedding vector, or None on failure. """ + backend = resolve_skill_embedding_backend() + model_name = resolve_skill_embedding_model(backend) + + if backend == "local": + return _generate_local_embedding(text, model_name) + resolved_key, base_url = resolve_embedding_api() if api_key is None: api_key = resolved_key + if not api_key: - return None + if backend == "remote": + logger.warning( + "Remote skill embeddings requested but no embedding API key/base was resolved." + ) + return None + return _generate_local_embedding(text, SKILL_LOCAL_EMBEDDING_MODEL) body = json.dumps({ - "model": SKILL_EMBEDDING_MODEL, + "model": model_name, "input": text, }).encode("utf-8") @@ -125,5 +256,7 @@ def generate_embedding(text: str, api_key: Optional[str] = None) -> Optional[Lis data = json.loads(resp.read().decode("utf-8")) return data.get("data", [{}])[0].get("embedding") except Exception as e: - logger.warning("Embedding generation failed: %s", e) + logger.warning("Remote skill embedding generation failed: %s", e) + if backend == "auto": + return _generate_local_embedding(text, SKILL_LOCAL_EMBEDDING_MODEL) return None diff --git a/openspace/cloud/search.py b/openspace/cloud/search.py index bd25436..046aa81 100644 --- a/openspace/cloud/search.py +++ b/openspace/cloud/search.py @@ -158,6 +158,9 @@ def _score_phase( ) -> List[Dict[str, Any]]: """Compute hybrid score = vector_score + lexical_boost.""" from openspace.cloud.embedding import cosine_similarity + from openspace.skill_engine.skill_ranker import SkillCandidate, SkillRanker + + ranker: Optional[SkillRanker] = None scored = [] for candidate in candidates: @@ -170,6 +173,32 @@ def _score_phase( ranking_signal_score = 0.0 if query_embedding: candidate_embedding = candidate.get("_embedding") + if ( + candidate_embedding is None + and 
candidate.get("source") == "openspace-local" + and candidate.get("_embedding_text") + ): + if ranker is None: + ranker = SkillRanker(enable_cache=True) + + cached = ranker.get_cached_embedding(candidate.get("skill_id", "")) + if cached: + candidate_embedding = cached + else: + skill_candidate = SkillCandidate( + skill_id=candidate.get("skill_id", ""), + name=candidate_name, + description=candidate.get("description", ""), + body="", + metadata=candidate, + ) + skill_candidate.embedding_text = candidate.get("_embedding_text", "") + ranker.prime_candidates([skill_candidate]) + candidate_embedding = skill_candidate.embedding + + if candidate_embedding: + candidate["_embedding"] = candidate_embedding + if candidate_embedding and isinstance(candidate_embedding, list): vector_score = cosine_similarity(query_embedding, candidate_embedding) ranking_signal_score = vector_score @@ -423,14 +452,6 @@ async def hybrid_search_skills( query_embedding: Optional[List[float]] = None try: query_embedding = await asyncio.to_thread(generate_embedding, normalized_query) - if query_embedding: - for candidate in candidates: - if not candidate.get("_embedding") and candidate.get("_embedding_text"): - candidate_embedding = await asyncio.to_thread( - generate_embedding, candidate["_embedding_text"], - ) - if candidate_embedding: - candidate["_embedding"] = candidate_embedding except Exception: pass diff --git a/openspace/codex_session_scenarios.py b/openspace/codex_session_scenarios.py new file mode 100644 index 0000000..9d9ff00 --- /dev/null +++ b/openspace/codex_session_scenarios.py @@ -0,0 +1,627 @@ +from __future__ import annotations + +import argparse +import json +import os +import re +import shutil +import signal +import subprocess +import tempfile +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + + +OPENING_PREFIX = "先做一次 OpenSpace 预检,再开始当前任务。" +SESSION_STATUS_RE = re.compile(r"OpenSpace session:\s*(\S+)") +MACHINE_STATUS_RE = 
re.compile(r"OpenSpace machine:\s*(\S+)") +FALLBACK_LINE = "当前线程不依赖 OpenSpace 自动沉淀;我会先按本地文档、脚本或手动收尾路径继续。" + + +@dataclass(frozen=True) +class OpeningStatus: + session_status: str + machine_status: str + fallback_present: bool + text: str + + +@dataclass(frozen=True) +class ExecOutput: + thread_id: str | None + events: list[dict[str, Any]] + agent_messages: list[str] + opening: OpeningStatus | None + + +@dataclass(frozen=True) +class SessionFamily: + thread_ids: set[str] + session_files: set[Path] + + +@dataclass(frozen=True) +class RunResult: + label: str + exit_code: int + thread_id: str | None + opening: dict[str, Any] | None + agent_messages: list[str] + mcp_tool_calls: list[dict[str, Any]] + session_files: list[str] + thread_ids: list[str] + stdout_path: str + stderr_path: str + + +@dataclass(frozen=True) +class ScenarioResult: + name: str + ok: bool + details: dict[str, Any] + + +def _iter_json_events(output: str) -> list[dict[str, Any]]: + events: list[dict[str, Any]] = [] + for raw_line in output.splitlines(): + line = raw_line.strip() + if not line.startswith("{"): + continue + try: + payload = json.loads(line) + except json.JSONDecodeError: + continue + if isinstance(payload, dict): + events.append(payload) + return events + + +def _extract_agent_message(event: dict[str, Any]) -> str | None: + if event.get("type") != "item.completed": + return None + item = event.get("item") + if not isinstance(item, dict): + return None + if item.get("type") != "agent_message": + return None + text = item.get("text") + return text if isinstance(text, str) else None + + +def _extract_mcp_tool_call(event: dict[str, Any]) -> dict[str, Any] | None: + if event.get("type") != "item.completed": + return None + item = event.get("item") + if not isinstance(item, dict): + return None + if item.get("type") != "mcp_tool_call": + return None + return { + "server": item.get("server"), + "tool": item.get("tool"), + "status": item.get("status"), + "error": item.get("error"), + } + + +def 
_parse_opening(text: str) -> OpeningStatus | None: + if not text.startswith(OPENING_PREFIX): + return None + session_match = SESSION_STATUS_RE.search(text) + machine_match = MACHINE_STATUS_RE.search(text) + if session_match is None or machine_match is None: + return None + return OpeningStatus( + session_status=session_match.group(1), + machine_status=machine_match.group(1), + fallback_present=FALLBACK_LINE in text, + text=text, + ) + + +def parse_exec_output(output: str) -> ExecOutput: + events = _iter_json_events(output) + thread_id: str | None = None + agent_messages: list[str] = [] + opening: OpeningStatus | None = None + + for event in events: + if event.get("type") == "thread.started" and thread_id is None: + candidate = event.get("thread_id") + if isinstance(candidate, str): + thread_id = candidate + agent_message = _extract_agent_message(event) + if agent_message is not None: + agent_messages.append(agent_message) + if opening is None: + opening = _parse_opening(agent_message) + + return ExecOutput( + thread_id=thread_id, + events=events, + agent_messages=agent_messages, + opening=opening, + ) + + +def _read_session_meta(path: Path) -> dict[str, Any] | None: + try: + first_line = path.read_text(encoding="utf-8").splitlines()[0] + except Exception: + return None + try: + payload = json.loads(first_line) + except json.JSONDecodeError: + return None + if payload.get("type") != "session_meta": + return None + session_payload = payload.get("payload") + return session_payload if isinstance(session_payload, dict) else None + + +def collect_session_family(thread_id: str, sessions_root: Path) -> SessionFamily: + metas: list[tuple[Path, dict[str, Any]]] = [] + for path in sessions_root.glob("**/*.jsonl"): + meta = _read_session_meta(path) + if meta is not None: + metas.append((path, meta)) + + thread_ids: set[str] = {thread_id} + session_files: set[Path] = set() + changed = True + + while changed: + changed = False + for path, meta in metas: + current_id = 
meta.get("id") + if isinstance(current_id, str) and current_id in thread_ids: + if path not in session_files: + session_files.add(path) + changed = True + + source = meta.get("source") + if not isinstance(source, dict): + continue + subagent = source.get("subagent") + if not isinstance(subagent, dict): + continue + thread_spawn = subagent.get("thread_spawn") + if not isinstance(thread_spawn, dict): + continue + parent_thread_id = thread_spawn.get("parent_thread_id") + if ( + isinstance(parent_thread_id, str) + and parent_thread_id in thread_ids + and isinstance(current_id, str) + and current_id not in thread_ids + ): + thread_ids.add(current_id) + session_files.add(path) + changed = True + + return SessionFamily(thread_ids=thread_ids, session_files=session_files) + + +def cleanup_session_artifacts( + *, + thread_ids: set[str], + session_files: set[Path], + session_index_path: Path, +) -> None: + for path in session_files: + path.unlink(missing_ok=True) + + if not session_index_path.exists(): + return + + kept_lines: list[str] = [] + for line in session_index_path.read_text(encoding="utf-8").splitlines(): + stripped = line.strip() + if not stripped: + continue + try: + payload = json.loads(stripped) + except json.JSONDecodeError: + kept_lines.append(line) + continue + + payload_thread_id = payload.get("thread_id") + payload_session_file = payload.get("session_file") + if payload_thread_id in thread_ids: + continue + if isinstance(payload_session_file, str) and Path(payload_session_file) in session_files: + continue + kept_lines.append(line) + + new_content = "\n".join(kept_lines) + if new_content: + new_content += "\n" + session_index_path.write_text(new_content, encoding="utf-8") + + +def snapshot_daemons(state_dir: Path, workspace: str) -> dict[str, dict[str, Any]]: + snapshot: dict[str, dict[str, Any]] = {} + if not state_dir.exists(): + return snapshot + + for path in state_dir.glob("*.json"): + try: + payload = json.loads(path.read_text(encoding="utf-8")) + 
except Exception: + continue + if payload.get("workspace") != workspace: + continue + server_kind = payload.get("server_kind") + if not isinstance(server_kind, str): + continue + snapshot[server_kind] = payload + return snapshot + + +def kill_daemons_in_state_dir(state_dir: Path) -> None: + if not state_dir.exists(): + return + for path in state_dir.glob("*.json"): + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + pid = payload.get("pid") + if not isinstance(pid, int): + continue + try: + os.kill(pid, signal.SIGTERM) + except ProcessLookupError: + pass + except PermissionError: + pass + else: + deadline = time.time() + 2.0 + while time.time() < deadline: + try: + os.kill(pid, 0) + except ProcessLookupError: + break + time.sleep(0.05) + else: + try: + os.kill(pid, signal.SIGKILL) + except (ProcessLookupError, PermissionError, AttributeError): + pass + shutil.rmtree(state_dir, ignore_errors=True) + + +def run_exec_session( + *, + label: str, + prompt: str, + cwd: Path, + extra_configs: list[str] | None = None, + timeout_seconds: int = 180, + codex_binary: str = "codex", + sessions_root: Path | None = None, + session_index_path: Path | None = None, + output_dir: Path | None = None, +) -> RunResult: + sessions_root = sessions_root or (Path.home() / ".codex" / "sessions") + session_index_path = session_index_path or (Path.home() / ".codex" / "session_index.jsonl") + output_dir = output_dir or Path(tempfile.mkdtemp(prefix="openspace-codex-session-")) + output_dir.mkdir(parents=True, exist_ok=True) + stdout_path = output_dir / f"{label}.stdout.log" + stderr_path = output_dir / f"{label}.stderr.log" + + command = [ + codex_binary, + "exec", + "--json", + "--skip-git-repo-check", + "--dangerously-bypass-approvals-and-sandbox", + "-C", + str(cwd), + ] + for item in extra_configs or []: + command.extend(["-c", item]) + command.append(prompt) + + completed = subprocess.run( + command, + capture_output=True, + text=True, + 
errors="replace", + timeout=timeout_seconds, + check=False, + ) + stdout_path.write_text(completed.stdout, encoding="utf-8") + stderr_path.write_text(completed.stderr, encoding="utf-8") + + parsed = parse_exec_output(completed.stdout) + session_family = ( + collect_session_family(parsed.thread_id, sessions_root) + if parsed.thread_id + else SessionFamily(thread_ids=set(), session_files=set()) + ) + mcp_tool_calls = [ + call + for event in parsed.events + if (call := _extract_mcp_tool_call(event)) is not None + ] + + return RunResult( + label=label, + exit_code=completed.returncode, + thread_id=parsed.thread_id, + opening=asdict(parsed.opening) if parsed.opening else None, + agent_messages=parsed.agent_messages, + mcp_tool_calls=mcp_tool_calls, + session_files=sorted(str(path) for path in session_family.session_files), + thread_ids=sorted(session_family.thread_ids), + stdout_path=str(stdout_path), + stderr_path=str(stderr_path), + ) + + +def _session_env_overrides(state_dir: Path, workspace: Path) -> list[str]: + return [ + f'mcp_servers.openspace.env.OPENSPACE_MCP_DAEMON_STATE_DIR="{state_dir}"', + f'mcp_servers.openspace_evolution.env.OPENSPACE_MCP_DAEMON_STATE_DIR="{state_dir}"', + 'mcp_servers.openspace.env.OPENSPACE_MCP_PROXY_MODE="daemon"', + 'mcp_servers.openspace_evolution.env.OPENSPACE_MCP_PROXY_MODE="daemon"', + f'mcp_servers.openspace.env.OPENSPACE_WORKSPACE="{workspace}"', + f'mcp_servers.openspace_evolution.env.OPENSPACE_WORKSPACE="{workspace}"', + ] + + +def _assert(condition: bool, message: str, errors: list[str]) -> None: + if not condition: + errors.append(message) + + +def _scenario_cold_start(base_dir: Path, cwd: Path) -> ScenarioResult: + state_dir = base_dir / "cold-start-state" + run = run_exec_session( + label="cold-start", + prompt=( + "这是 OpenSpace 冷启动预检测试。严格按 TMP 的 AGENTS.md 做开场预检," + "然后只输出一句“cold-start done”。不要修改任何文件,也不要使用子代理。" + ), + cwd=cwd, + extra_configs=_session_env_overrides(state_dir, cwd), + output_dir=base_dir / 
"cold-start-output", + ) + + errors: list[str] = [] + opening = run.opening or {} + _assert(run.exit_code == 0, f"cold-start exit_code={run.exit_code}", errors) + _assert(opening.get("session_status") == "ready", f"cold-start session_status={opening.get('session_status')}", errors) + _assert(opening.get("machine_status") == "ready", f"cold-start machine_status={opening.get('machine_status')}", errors) + _assert(opening.get("fallback_present") is False, "cold-start unexpectedly printed fallback line", errors) + _assert(run.agent_messages[-1:] == ["cold-start done"], f"cold-start final message={run.agent_messages[-1:]}", errors) + + return ScenarioResult( + name="cold-start-preflight", + ok=not errors, + details={"errors": errors, "run": asdict(run), "state_dirs": [str(state_dir)]}, + ) + + +def _scenario_warm_reuse(base_dir: Path, cwd: Path) -> ScenarioResult: + state_dir = base_dir / "warm-reuse-state" + extra_configs = _session_env_overrides(state_dir, cwd) + prompt = ( + "这是 OpenSpace warm-session reuse 测试。先完成 TMP 的开场预检," + "然后调用 openspace 的 search_skills 工具,参数用 query='OpenSpace MCP 健康检查'、" + "source='local'、limit=1、auto_import=false。最后只输出一句“warm-session done”。" + "不要修改任何文件,也不要使用子代理。" + ) + + first = run_exec_session( + label="warm-reuse-first", + prompt=prompt, + cwd=cwd, + extra_configs=extra_configs, + output_dir=base_dir / "warm-reuse-output", + ) + first_snapshot = snapshot_daemons(state_dir, str(cwd)) + + second = run_exec_session( + label="warm-reuse-second", + prompt=prompt, + cwd=cwd, + extra_configs=extra_configs, + output_dir=base_dir / "warm-reuse-output", + ) + second_snapshot = snapshot_daemons(state_dir, str(cwd)) + + errors: list[str] = [] + for label, run in (("first", first), ("second", second)): + opening = run.opening or {} + _assert(run.exit_code == 0, f"warm-reuse {label} exit_code={run.exit_code}", errors) + _assert(opening.get("session_status") == "ready", f"warm-reuse {label} session_status={opening.get('session_status')}", errors) + 
_assert(any(call.get("server") == "openspace" and call.get("tool") == "search_skills" for call in run.mcp_tool_calls), f"warm-reuse {label} missing openspace.search_skills call", errors) + _assert(run.agent_messages[-1:] == ["warm-session done"], f"warm-reuse {label} final message={run.agent_messages[-1:]}", errors) + + _assert("main" in first_snapshot, "warm-reuse first run did not create main daemon metadata", errors) + _assert("main" in second_snapshot, "warm-reuse second run did not create main daemon metadata", errors) + if "main" in first_snapshot and "main" in second_snapshot: + _assert( + first_snapshot["main"].get("pid") == second_snapshot["main"].get("pid"), + f"warm-reuse main pid changed: {first_snapshot['main'].get('pid')} -> {second_snapshot['main'].get('pid')}", + errors, + ) + _assert( + first_snapshot["main"].get("port") == second_snapshot["main"].get("port"), + f"warm-reuse main port changed: {first_snapshot['main'].get('port')} -> {second_snapshot['main'].get('port')}", + errors, + ) + + return ScenarioResult( + name="warm-session-reuse", + ok=not errors, + details={ + "errors": errors, + "first": asdict(first), + "second": asdict(second), + "first_snapshot": first_snapshot, + "second_snapshot": second_snapshot, + "state_dirs": [str(state_dir)], + }, + ) + + +def _scenario_unhealthy_fallback(base_dir: Path, cwd: Path) -> ScenarioResult: + state_dir = base_dir / "unhealthy-state" + missing_command = base_dir / "missing-openspace-command" + run = run_exec_session( + label="unhealthy-fallback", + prompt=( + "这是 OpenSpace unhealthy-session fallback 测试。严格按 TMP 的 AGENTS.md 做开场预检," + "然后只输出一句“unhealthy-session done”。不要修改任何文件,也不要使用子代理。" + ), + cwd=cwd, + extra_configs=_session_env_overrides(state_dir, cwd) + + [ + f'mcp_servers.openspace.command="{missing_command}"', + f'mcp_servers.openspace_evolution.command="{missing_command}"', + ], + output_dir=base_dir / "unhealthy-output", + ) + + errors: list[str] = [] + opening = run.opening or {} + 
_assert(run.exit_code == 0, f"unhealthy fallback exit_code={run.exit_code}", errors) + _assert( + opening.get("session_status") in {"exposed-but-unhealthy", "unknown"}, + f"unhealthy fallback session_status={opening.get('session_status')}", + errors, + ) + _assert(opening.get("machine_status") == "ready", f"unhealthy fallback machine_status={opening.get('machine_status')}", errors) + _assert(opening.get("fallback_present") is True, "unhealthy fallback missing fallback line", errors) + _assert(run.agent_messages[-1:] == ["unhealthy-session done"], f"unhealthy fallback final message={run.agent_messages[-1:]}", errors) + + return ScenarioResult( + name="unhealthy-session-fallback", + ok=not errors, + details={"errors": errors, "run": asdict(run), "state_dirs": [str(state_dir)]}, + ) + + +def _scenario_agent_team(base_dir: Path, cwd: Path) -> ScenarioResult: + state_dir = base_dir / "agent-team-state" + run = run_exec_session( + label="agent-team", + prompt=( + "这是 OpenSpace agent-team 测试。先完成 TMP 的开场预检,然后使用 agent team," + "至少启动两个只读子代理,分别检查 TMP 仓库里的 AGENTS.md 和 " + "scripts/check_openspace_mcp_preflight.py 与 OpenSpace 预检相关的内容。" + "父线程最后只输出一句“agent-team done”。不要修改任何文件。" + ), + cwd=cwd, + extra_configs=_session_env_overrides(state_dir, cwd), + timeout_seconds=240, + output_dir=base_dir / "agent-team-output", + ) + + errors: list[str] = [] + opening = run.opening or {} + _assert(run.exit_code == 0, f"agent-team exit_code={run.exit_code}", errors) + _assert(opening.get("session_status") == "ready", f"agent-team session_status={opening.get('session_status')}", errors) + _assert(len(run.thread_ids) >= 3, f"agent-team expected parent + >=2 child threads, got {run.thread_ids}", errors) + _assert(run.agent_messages[-1:] == ["agent-team done"], f"agent-team final message={run.agent_messages[-1:]}", errors) + + return ScenarioResult( + name="agent-team-split", + ok=not errors, + details={"errors": errors, "run": asdict(run), "state_dirs": [str(state_dir)]}, + ) + + +def 
_cleanup_run_artifacts(result: ScenarioResult, session_index_path: Path) -> None: + details = result.details + candidate_runs: list[dict[str, Any]] = [] + if "run" in details and isinstance(details["run"], dict): + candidate_runs.append(details["run"]) + for key in ("first", "second"): + if key in details and isinstance(details[key], dict): + candidate_runs.append(details[key]) + + session_files: set[Path] = set() + thread_ids: set[str] = set() + for run in candidate_runs: + for item in run.get("session_files", []): + session_files.add(Path(item)) + for item in run.get("thread_ids", []): + thread_ids.add(item) + + cleanup_session_artifacts( + thread_ids=thread_ids, + session_files=session_files, + session_index_path=session_index_path, + ) + + +def run_scenarios(*, cwd: Path, cleanup: bool = True) -> dict[str, Any]: + base_dir = Path(tempfile.mkdtemp(prefix="openspace-codex-scenarios-")) + session_index_path = Path.home() / ".codex" / "session_index.jsonl" + started_at = time.time() + + scenario_steps = [ + ("cold-start-preflight", _scenario_cold_start), + ("warm-session-reuse", _scenario_warm_reuse), + ("unhealthy-session-fallback", _scenario_unhealthy_fallback), + ("agent-team-split", _scenario_agent_team), + ] + results: list[ScenarioResult] = [] + + try: + for scenario_name, scenario_fn in scenario_steps: + try: + results.append(scenario_fn(base_dir, cwd)) + except Exception as exc: # noqa: BLE001 + results.append( + ScenarioResult( + name=scenario_name, + ok=False, + details={"errors": [f"{type(exc).__name__}: {exc}"]}, + ) + ) + finally: + if cleanup: + for result in results: + _cleanup_run_artifacts(result, session_index_path) + for state_dir in base_dir.glob("*-state"): + kill_daemons_in_state_dir(state_dir) + shutil.rmtree(base_dir, ignore_errors=True) + + return { + "cwd": str(cwd), + "started_at": started_at, + "cleanup": cleanup, + "all_ok": all(item.ok for item in results), + "results": [asdict(item) for item in results], + } + + +def main() -> int: + 
parser = argparse.ArgumentParser(description="Run real Codex OpenSpace session scenarios") + parser.add_argument( + "--cwd", + type=Path, + default=Path("/Users/admin/PycharmProjects/TMP"), + help="Working directory for real session scenarios", + ) + parser.add_argument( + "--keep-artifacts", + action="store_true", + help="Keep generated session files, daemon state, and command logs", + ) + args = parser.parse_args() + + summary = run_scenarios(cwd=args.cwd.resolve(), cleanup=not args.keep_artifacts) + print(json.dumps(summary, ensure_ascii=False, indent=2)) + return 0 if summary["all_ok"] else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/openspace/config/README.md b/openspace/config/README.md index f2e0875..cd0222e 100644 --- a/openspace/config/README.md +++ b/openspace/config/README.md @@ -35,6 +35,13 @@ Set via `.env`, MCP config `env` block, or system environment. | `OPENSPACE_LLM_API_BASE` | LLM API base URL | — | | `OPENSPACE_LLM_EXTRA_HEADERS` | Extra LLM headers (JSON) | — | | `OPENSPACE_LLM_CONFIG` | Arbitrary litellm kwargs (JSON) | — | +| `OPENSPACE_SKILL_EMBEDDING_BACKEND` | Skill-router embedding backend: `auto`, `local`, or `remote` | `auto` | +| `OPENSPACE_SKILL_EMBEDDING_MODEL` | Skill-router embedding model | `BAAI/bge-small-en-v1.5` in local mode, `openai/text-embedding-3-small` in remote mode | +| `OPENSPACE_SKILL_EMBEDDING_API_KEY` | Dedicated remote embedding API key for skill routing | — | +| `OPENSPACE_SKILL_EMBEDDING_API_BASE` | Dedicated remote embedding API base for skill routing | — | +| `EMBEDDING_API_KEY` | Generic embedding API key (tool search, optional skill-router fallback) | — | +| `EMBEDDING_BASE_URL` | Generic embedding API base URL | — | +| `EMBEDDING_MODEL` | Generic embedding model for tool search | `BAAI/bge-small-en-v1.5` | | `OPENSPACE_API_KEY` | Cloud API key ([open-space.cloud](https://open-space.cloud)) | — | | `OPENSPACE_MAX_ITERATIONS` | Max agent iterations per task | `20` | | 
`OPENSPACE_BACKEND_SCOPE` | Enabled backends (comma-separated) | `shell,gui,mcp,web,system` | @@ -47,6 +54,29 @@ Set via `.env`, MCP config `env` block, or system environment. | `OPENSPACE_ENABLE_RECORDING` | Record execution traces | `true` | | `OPENSPACE_LOG_LEVEL` | Log level | `INFO` | +### Split-routing example + +Keep the main LLM on an OpenAI-compatible provider, but force the +skill-router embedding path to stay local: + +```bash +OPENSPACE_MODEL=gpt-5.4 +OPENSPACE_LLM_API_KEY=sk-xxx +OPENSPACE_LLM_API_BASE=http://127.0.0.1:8080/v1 + +OPENSPACE_SKILL_EMBEDDING_BACKEND=local +OPENSPACE_SKILL_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 +``` + +Or send skill embeddings to a separate endpoint: + +```bash +OPENSPACE_SKILL_EMBEDDING_BACKEND=remote +OPENSPACE_SKILL_EMBEDDING_API_KEY=sk-embed-xxx +OPENSPACE_SKILL_EMBEDDING_API_BASE=https://example.com/v1 +OPENSPACE_SKILL_EMBEDDING_MODEL=openai/text-embedding-3-small +``` + ## 3. MCP Servers (`config_mcp.json`) Register external MCP servers that OpenSpace connects to as a **client** (e.g. GitHub, Slack, databases): diff --git a/openspace/evolution_mcp_server.py b/openspace/evolution_mcp_server.py new file mode 100644 index 0000000..2587f04 --- /dev/null +++ b/openspace/evolution_mcp_server.py @@ -0,0 +1,759 @@ +"""OpenSpace evolution-only MCP server. + +This sidecar is designed for host-agent workflows where the main coding is +handled elsewhere (for example Codex Desktop with subscription auth), while +OpenSpace is only used to capture reusable skills via a separate provider. 
+""" + +from __future__ import annotations + +import asyncio +import inspect +import json +import logging +import os +import subprocess +import signal +import sys +import threading +import time +import uuid +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional + +from openspace.mcp_stdio import maybe_redirect_stderr_to_file +from openspace.mcp_tool_registration import register_evolution_tools + +class _MCPSafeStdout: + """Stdout wrapper: binary (.buffer) -> real stdout, text (.write) -> stderr.""" + + def __init__(self, real_stdout, stderr): + self._real = real_stdout + self._stderr = stderr + + @property + def buffer(self): + return self._real.buffer + + def fileno(self): + return self._real.fileno() + + def write(self, s): + return self._stderr.write(s) + + def writelines(self, lines): + return self._stderr.writelines(lines) + + def flush(self): + self._stderr.flush() + try: + self._real.flush() + except ValueError: + pass + + def isatty(self): + return self._stderr.isatty() + + @property + def encoding(self): + return self._stderr.encoding + + @property + def errors(self): + return self._stderr.errors + + @property + def closed(self): + return self._stderr.closed + + def readable(self): + return False + + def writable(self): + return True + + def seekable(self): + return False + + def __getattr__(self, name): + return getattr(self._stderr, name) + + +_LOG_DIR = Path(__file__).resolve().parent.parent / "logs" +_LOG_DIR.mkdir(parents=True, exist_ok=True) + +_real_stdout = sys.stdout +maybe_redirect_stderr_to_file(_LOG_DIR, "evolution_mcp_stderr.log") + +sys.stdout = _MCPSafeStdout(_real_stdout, sys.stderr) + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + handlers=[logging.FileHandler(_LOG_DIR / "evolution_mcp_server.log")], +) +logger = logging.getLogger("openspace.evolution_mcp_server") + +from mcp.server.fastmcp import FastMCP + 
+_fastmcp_kwargs: dict = {} +try: + if "description" in inspect.signature(FastMCP.__init__).parameters: + _fastmcp_kwargs["description"] = ( + "OpenSpace evolution sidecar: capture reusable skills from host-agent work." + ) +except (TypeError, ValueError): + pass + +mcp = FastMCP("OpenSpace Evolution", **_fastmcp_kwargs) + +_openspace_instance = None +_openspace_lock = asyncio.Lock() +_UPLOAD_META_FILENAME = ".upload_meta.json" +_idle_watchdog_started = False +_activity_lock = threading.Lock() +_active_request_count = 0 +_last_activity_at = time.monotonic() +_shutdown_started = False +_shutdown_lock = threading.Lock() + + +def _json_ok(data: Any) -> str: + return json.dumps(data, ensure_ascii=False, indent=2) + + +def _json_error(error: Any, **extra) -> str: + return json.dumps({"error": str(error), **extra}, ensure_ascii=False) + + +def _mark_request_start() -> None: + global _active_request_count, _last_activity_at + with _activity_lock: + _active_request_count += 1 + _last_activity_at = time.monotonic() + from openspace.shared_mcp_runtime import update_current_daemon_status + + update_current_daemon_status("evolution", touch=True, active_delta=1) + + +def _mark_request_end() -> None: + global _active_request_count, _last_activity_at + with _activity_lock: + _active_request_count = max(0, _active_request_count - 1) + _last_activity_at = time.monotonic() + from openspace.shared_mcp_runtime import update_current_daemon_status + + update_current_daemon_status("evolution", touch=True, active_delta=-1) + + +def _shutdown_worker(reason: str) -> None: + logger.info("Shutting down OpenSpace evolution daemon: %s", reason) + instance = _openspace_instance + if instance is not None and instance.is_initialized(): + try: + asyncio.run(asyncio.wait_for(instance.cleanup(), timeout=10.0)) + except Exception as exc: + logger.warning("OpenSpace evolution cleanup during shutdown failed: %s", exc) + logging.shutdown() + os._exit(0) + + +def _begin_shutdown(reason: str) -> None: + 
global _shutdown_started + with _shutdown_lock: + if _shutdown_started: + return + _shutdown_started = True + + threading.Thread( + target=_shutdown_worker, + args=(reason,), + name="openspace-evolution-shutdown", + daemon=True, + ).start() + + +def _install_signal_handlers() -> None: + def _handle(signum, _frame) -> None: + try: + signame = signal.Signals(signum).name + except Exception: + signame = str(signum) + _begin_shutdown(f"signal {signame}") + + for signum in (signal.SIGTERM, signal.SIGINT): + try: + signal.signal(signum, _handle) + except Exception: + continue + + +def _idle_watchdog_loop(idle_timeout_seconds: int) -> None: + check_interval = max(1, min(max(idle_timeout_seconds // 3, 1), 60)) + logger.info("Evolution MCP idle watchdog enabled: timeout=%ss", idle_timeout_seconds) + while True: + time.sleep(check_interval) + with _activity_lock: + active = _active_request_count + idle_for = time.monotonic() - _last_activity_at + if active == 0 and idle_for >= idle_timeout_seconds: + logger.info( + "Evolution MCP idle watchdog exiting process after %.1fs idle with no active requests", + idle_for, + ) + _begin_shutdown(f"idle timeout after {idle_for:.1f}s") + return + + +def _maybe_start_idle_watchdog() -> None: + global _idle_watchdog_started + if _idle_watchdog_started: + return + + timeout_raw = os.environ.get("OPENSPACE_EVOLUTION_MCP_IDLE_TIMEOUT_SECONDS", "").strip() + if not timeout_raw: + timeout_raw = os.environ.get("OPENSPACE_MCP_IDLE_TIMEOUT_SECONDS", "").strip() + if timeout_raw: + try: + idle_timeout_seconds = int(timeout_raw) + except ValueError: + logger.warning( + "Invalid evolution MCP idle timeout value=%r " + "(from OPENSPACE_EVOLUTION_MCP_IDLE_TIMEOUT_SECONDS or OPENSPACE_MCP_IDLE_TIMEOUT_SECONDS)", + timeout_raw, + ) + return + else: + idle_timeout_seconds = 900 + + if idle_timeout_seconds <= 0: + return + + watchdog = threading.Thread( + target=_idle_watchdog_loop, + args=(idle_timeout_seconds,), + 
name="openspace-evolution-mcp-idle-watchdog", + daemon=True, + ) + watchdog.start() + _idle_watchdog_started = True + + +async def _get_openspace(): + global _openspace_instance + if _openspace_instance is not None and _openspace_instance.is_initialized(): + return _openspace_instance + + async with _openspace_lock: + if _openspace_instance is not None and _openspace_instance.is_initialized(): + return _openspace_instance + + logger.info("Initializing OpenSpace evolution engine ...") + from openspace.host_detection import ( + build_grounding_config_path, + build_llm_kwargs, + load_runtime_env, + ) + from openspace.tool_layer import OpenSpace, OpenSpaceConfig + + load_runtime_env() + + env_model = os.environ.get("OPENSPACE_MODEL", "") + workspace = os.environ.get("OPENSPACE_WORKSPACE") + enable_rec = os.environ.get("OPENSPACE_ENABLE_RECORDING", "false").lower() in ( + "true", + "1", + "yes", + ) + backend_scope_raw = os.environ.get("OPENSPACE_BACKEND_SCOPE", "shell,system") + backend_scope = [ + b.strip() for b in backend_scope_raw.split(",") if b.strip() + ] or None + + config_path = build_grounding_config_path() + model, llm_kwargs = build_llm_kwargs(env_model) + + config = OpenSpaceConfig( + llm_model=model, + llm_kwargs=llm_kwargs, + workspace_dir=workspace, + grounding_max_iterations=1, + enable_recording=enable_rec, + enable_skill_engine_without_recording=True, + recording_backends=["shell"] if enable_rec else None, + backend_scope=backend_scope, + grounding_config_path=config_path, + ) + + _openspace_instance = OpenSpace(config=config) + await _openspace_instance.initialize() + logger.info("OpenSpace evolution engine ready (model=%s).", model) + return _openspace_instance + + +def _write_upload_meta(skill_dir: Path, info: Dict[str, Any]) -> None: + meta = { + "origin": info.get("origin", "captured"), + "parent_skill_ids": info.get("parent_skill_ids", []), + "change_summary": info.get("change_summary", ""), + "created_by": info.get("created_by", "openspace"), 
+ "tags": info.get("tags", []), + } + (skill_dir / _UPLOAD_META_FILENAME).write_text( + json.dumps(meta, ensure_ascii=False, indent=2) + "\n", + encoding="utf-8", + ) + + +def _extract_json_object(text: str) -> Dict[str, Any]: + raw = (text or "").strip() + if raw.startswith("```"): + raw = raw.strip("`") + parts = raw.split("\n", 1) + raw = parts[1] if len(parts) == 2 else raw + if raw.endswith("```"): + raw = raw[:-3].rstrip() + + try: + data = json.loads(raw) + if isinstance(data, dict): + return data + except json.JSONDecodeError: + pass + + start = raw.find("{") + end = raw.rfind("}") + if start >= 0 and end > start: + data = json.loads(raw[start : end + 1]) + if isinstance(data, dict): + return data + + raise ValueError("LLM did not return a valid JSON object") + + +def _run_git(args: List[str], cwd: Path) -> str: + try: + completed = subprocess.run( + ["git", *args], + cwd=str(cwd), + text=True, + capture_output=True, + check=False, + ) + except Exception as exc: + logger.debug("git %s failed: %s", " ".join(args), exc) + return "" + + if completed.returncode != 0: + return "" + return completed.stdout.strip() + + +def _truncate(text: str, limit: int) -> str: + if len(text) <= limit: + return text + return text[: limit - 32].rstrip() + "\n...[truncated]..." 
+ + +def _normalize_file_paths( + workspace: Path, + file_paths: Optional[Iterable[str]], +) -> List[Path]: + normalized: List[Path] = [] + for raw in file_paths or []: + if not raw: + continue + path = Path(raw) + if not path.is_absolute(): + path = workspace / path + normalized.append(path.resolve()) + return normalized + + +def _build_repo_context( + workspace: Path, + file_paths: List[Path], +) -> str: + sections: List[str] = [] + + if (workspace / ".git").exists(): + status = _run_git(["status", "--short"], workspace) + if status: + sections.append("## Git status\n" + _truncate(status, 4_000)) + + diff_stat = _run_git(["diff", "--stat"], workspace) + if diff_stat: + sections.append("## Git diff stat\n" + _truncate(diff_stat, 4_000)) + + staged_stat = _run_git(["diff", "--cached", "--stat"], workspace) + if staged_stat: + sections.append("## Git staged diff stat\n" + _truncate(staged_stat, 4_000)) + + if file_paths: + rel_paths = [] + for path in file_paths: + try: + rel_paths.append(str(path.relative_to(workspace))) + except ValueError: + rel_paths.append(str(path)) + scoped_diff = _run_git( + ["diff", "--unified=1", "--", *rel_paths], + workspace, + ) + if scoped_diff: + sections.append("## Focused diff\n" + _truncate(scoped_diff, 12_000)) + + if file_paths: + lines = ["## Mentioned files"] + for path in file_paths: + lines.append(f"- {path}") + sections.append("\n".join(lines)) + + return "\n\n".join(sections) if sections else "(no repository context available)" + + +def _existing_skill_names(registry) -> List[str]: + names = [] + for meta in registry.list_skills(): + names.append(meta.name) + return sorted(set(names)) + + +def _build_planning_prompt( + *, + task: str, + summary: str, + workspace: Path, + repo_context: str, + existing_skills: List[str], + max_skills: int, +) -> str: + skill_list = "\n".join(f"- {name}" for name in existing_skills[:200]) or "(none)" + return f"""You are deciding which reusable OpenSpace skills should be captured from a 
completed coding task. + +The main coding work was already completed by a host agent. Your job is ONLY to identify reusable patterns worth turning into new skills. + +Task: +{task} + +Execution summary: +{summary} + +Workspace: +{workspace} + +Repository context: +{repo_context} + +Existing local skill names: +{skill_list} + +Return exactly one JSON object with this shape: +{{ + "suggestions": [ + {{ + "category": "workflow", + "direction": "1-2 sentences describing the reusable pattern to capture." + }} + ] +}} + +Rules: +- Suggest at most {max_skills} skills. +- Only suggest skills that are reusable across future tasks. +- Categories must be one of: "tool_guide", "workflow", "reference". +- Do not suggest trivial one-step actions. +- Do not restate repo-specific one-off details as a reusable skill. +- Avoid duplicating an existing skill unless the new capability is clearly distinct. +- If nothing is worth capturing, return {{"suggestions": []}}. +""" + + +async def _plan_suggestions( + *, + openspace, + task: str, + summary: str, + workspace: Path, + repo_context: str, + max_skills: int, +) -> List[Dict[str, str]]: + registry = openspace._skill_registry + if not registry: + return [] + + logger.info( + "Planning evolution captures for task=%r (max_skills=%d)", + task[:120], + max_skills, + ) + prompt = _build_planning_prompt( + task=task, + summary=summary, + workspace=workspace, + repo_context=repo_context, + existing_skills=_existing_skill_names(registry), + max_skills=max_skills, + ) + + response = await openspace._llm_client.complete( + messages=prompt, + execute_tools=False, + model=openspace.config.llm_model, + ) + data = _extract_json_object(response["message"]["content"]) + raw_suggestions = data.get("suggestions", []) + if not isinstance(raw_suggestions, list): + raise ValueError("suggestions must be a list") + + deduped: List[Dict[str, str]] = [] + seen: set[tuple[str, str]] = set() + for item in raw_suggestions: + if not isinstance(item, dict): + 
continue + category = str(item.get("category", "")).strip() + direction = str(item.get("direction", "")).strip() + if category not in {"tool_guide", "workflow", "reference"} or not direction: + continue + key = (category, direction.lower()) + if key in seen: + continue + seen.add(key) + deduped.append({"category": category, "direction": direction}) + if len(deduped) >= max_skills: + break + logger.info("Planned %d capture suggestion(s)", len(deduped)) + return deduped + + +async def _prepend_output_dir(openspace, output_dir: Path) -> None: + output_dir.mkdir(parents=True, exist_ok=True) + registry = openspace._skill_registry + if not registry: + return + + if output_dir not in registry._skill_dirs: + registry._skill_dirs.insert(0, output_dir) + + skill_store = openspace._skill_store + metas = registry.discover_from_dirs([output_dir]) + if metas and skill_store: + await skill_store.sync_from_registry(metas) + + +async def _register_extra_skill_dirs(openspace, dirs: List[Path]) -> None: + registry = openspace._skill_registry + skill_store = openspace._skill_store + if not registry: + return + + metas = registry.discover_from_dirs(dirs) + if metas and skill_store: + await skill_store.sync_from_registry(metas) + + +async def _evolve_from_context_impl( + task: str, + summary: str, + workspace_dir: str | None = None, + file_paths: list[str] | None = None, + max_skills: int = 3, + skill_dirs: list[str] | None = None, + output_dir: str | None = None, +) -> str: + """Capture reusable skills from a completed host-agent task. + + Use this when the main task was already handled by another agent + (for example Codex Desktop) and OpenSpace should only spend provider + tokens on post-task skill capture. + + Args: + task: Short description of the completed task. + summary: What changed, what was learned, and what seems reusable. + workspace_dir: Repository/workspace path. Defaults to OPENSPACE_WORKSPACE. + file_paths: Optional files worth emphasizing when planning captures. 
+ max_skills: Maximum number of new skills to capture. + skill_dirs: Optional additional skill directories to register first. + output_dir: Override directory for new skills. Defaults to the first + OPENSPACE_HOST_SKILL_DIRS entry. + """ + _mark_request_start() + try: + if not task.strip(): + return _json_error("task is required", status="error") + if not summary.strip(): + return _json_error("summary is required", status="error") + + openspace = await _get_openspace() + if not openspace._skill_evolver or not openspace._skill_registry: + return _json_error("Skill evolution is not enabled", status="error") + + workspace = Path(workspace_dir or openspace.config.workspace_dir or os.getcwd()).resolve() + normalized_paths = _normalize_file_paths(workspace, file_paths) + + if skill_dirs: + extra_dirs = [Path(p).expanduser().resolve() for p in skill_dirs if p] + if extra_dirs: + await _register_extra_skill_dirs(openspace, extra_dirs) + + if output_dir: + await _prepend_output_dir(openspace, Path(output_dir).expanduser().resolve()) + + repo_context = _build_repo_context(workspace, normalized_paths) + suggestions = await _plan_suggestions( + openspace=openspace, + task=task, + summary=summary, + workspace=workspace, + repo_context=repo_context, + max_skills=max(0, min(max_skills, 8)), + ) + + if not suggestions: + return _json_ok( + { + "status": "success", + "task": task, + "workspace_dir": str(workspace), + "suggestion_count": 0, + "created_skills": [], + "message": "No reusable skill captures were suggested.", + } + ) + + from openspace.skill_engine import EvolutionContext, EvolutionTrigger + from openspace.skill_engine.types import ( + EvolutionSuggestion, + EvolutionType, + ExecutionAnalysis, + SkillCategory, + ) + + evolver = openspace._skill_evolver + task_id = f"sidecar_{uuid.uuid4().hex[:12]}" + now = datetime.now() + analysis = ExecutionAnalysis( + task_id=task_id, + timestamp=now, + task_completed=True, + execution_note=_truncate(summary, 1_500), + 
analyzed_by=openspace.config.llm_model, + analyzed_at=now, + ) + + created_skills: List[Dict[str, Any]] = [] + skipped: List[Dict[str, str]] = [] + for suggestion in suggestions: + logger.info( + "Capturing skill (%s): %s", + suggestion["category"], + suggestion["direction"][:180], + ) + ctx = EvolutionContext( + trigger=EvolutionTrigger.ANALYSIS, + suggestion=EvolutionSuggestion( + evolution_type=EvolutionType.CAPTURED, + target_skill_ids=[], + category=SkillCategory(suggestion["category"]), + direction=suggestion["direction"], + ), + source_task_id=task_id, + recent_analyses=[analysis], + available_tools=[], + ) + new_record = await evolver.evolve(ctx) + if not new_record: + logger.info("Capture skipped by evolver") + skipped.append(suggestion) + continue + + skill_dir = Path(new_record.path).parent if new_record.path else None + if skill_dir: + _write_upload_meta( + skill_dir, + { + "origin": new_record.lineage.origin.value, + "parent_skill_ids": new_record.lineage.parent_skill_ids, + "change_summary": new_record.lineage.change_summary, + "created_by": new_record.lineage.created_by or "openspace", + "tags": new_record.tags, + }, + ) + + created_skills.append( + { + "name": new_record.name, + "skill_id": new_record.skill_id, + "skill_dir": str(skill_dir) if skill_dir else "", + "path": new_record.path, + "category": suggestion["category"], + "direction": suggestion["direction"], + "upload_ready": bool(skill_dir), + } + ) + + return _json_ok( + { + "status": "success", + "task": task, + "workspace_dir": str(workspace), + "suggestion_count": len(suggestions), + "created_skills": created_skills, + "skipped_suggestions": skipped, + } + ) + except Exception as e: + logger.error("evolve_from_context failed: %s", e, exc_info=True) + return _json_error(e, status="error") + finally: + _mark_request_end() + + +class _DirectEvolutionToolImplementation: + async def evolve_from_context( + self, + task: str, + summary: str, + workspace_dir: str | None = None, + file_paths: 
list[str] | None = None, + max_skills: int = 3, + skill_dirs: list[str] | None = None, + output_dir: str | None = None, + ) -> str: + return await _evolve_from_context_impl( + task=task, + summary=summary, + workspace_dir=workspace_dir, + file_paths=file_paths, + max_skills=max_skills, + skill_dirs=skill_dirs, + output_dir=output_dir, + ) + + +register_evolution_tools(mcp, _DirectEvolutionToolImplementation()) + + +def run_mcp_server() -> None: + import argparse + + parser = argparse.ArgumentParser(description="OpenSpace Evolution MCP Server") + parser.add_argument( + "--transport", + choices=["stdio", "sse", "streamable-http"], + default="stdio", + ) + parser.add_argument("--port", type=int, default=8080) + args = parser.parse_args() + + if args.transport == "stdio" or os.environ.get("OPENSPACE_MCP_DAEMON") == "1": + _install_signal_handlers() + _maybe_start_idle_watchdog() + + mcp.settings.port = args.port + mcp.run(transport=args.transport) + + +if __name__ == "__main__": + run_mcp_server() diff --git a/openspace/llm/client.py b/openspace/llm/client.py index 611cf87..2a23928 100644 --- a/openspace/llm/client.py +++ b/openspace/llm/client.py @@ -1,8 +1,12 @@ import litellm import json import asyncio +import os import time +from types import SimpleNamespace from typing import List, Sequence, Union, Dict, Optional + +import httpx from openai.types.chat import ChatCompletionToolParam from openspace.grounding.core.types import ToolSchema, ToolResult, ToolStatus @@ -20,6 +24,218 @@ logger = Logger.get_logger(__name__) +def _is_truthy(value: object) -> bool: + return str(value).strip().lower() in {"1", "true", "yes", "on"} + + +def _should_use_openai_stream_compat(litellm_kwargs: Optional[Dict] = None) -> bool: + if litellm_kwargs and _is_truthy(litellm_kwargs.get("openai_stream_compat")): + return True + return _is_truthy(os.environ.get("OPENSPACE_LLM_OPENAI_STREAM_COMPAT", "")) + + +def _build_stream_response( + *, + content: str, + reasoning_content: Optional[str], 
+ tool_calls: List[Dict[str, object]], +): + response_tool_calls = [] + for tool_call in tool_calls: + response_tool_calls.append( + SimpleNamespace( + id=tool_call.get("id"), + type=tool_call.get("type", "function"), + function=SimpleNamespace( + name=tool_call.get("function", {}).get("name", ""), + arguments=tool_call.get("function", {}).get("arguments", ""), + ), + ) + ) + + message = SimpleNamespace( + content=content, + reasoning_content=reasoning_content, + tool_calls=response_tool_calls or None, + ) + return SimpleNamespace(choices=[SimpleNamespace(message=message)]) + + +async def _openai_compat_stream_completion( + *, + model: str, + messages: List[Dict], + timeout: float, + litellm_kwargs: Optional[Dict] = None, + tools: Optional[List[ChatCompletionToolParam]] = None, + tool_choice: Optional[str] = None, + reasoning_effort: Optional[str] = None, +): + kwargs = dict(litellm_kwargs or {}) + api_key = kwargs.pop("api_key", None) + api_base = kwargs.pop("api_base", None) + extra_headers = kwargs.pop("extra_headers", None) or {} + kwargs.pop("openai_stream_compat", None) + + if not api_key or not api_base: + raise ValueError( + "OpenAI stream compatibility mode requires api_key and api_base." 
+ ) + + payload = { + "model": model, + "messages": messages, + "stream": True, + } + if tools: + payload["tools"] = tools + if tool_choice is not None: + payload["tool_choice"] = tool_choice + if reasoning_effort: + payload["reasoning_effort"] = reasoning_effort + + for key in ( + "temperature", + "top_p", + "presence_penalty", + "frequency_penalty", + "max_tokens", + "max_completion_tokens", + "parallel_tool_calls", + "response_format", + ): + if key in kwargs and kwargs[key] is not None: + payload[key] = kwargs[key] + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + headers.update(extra_headers) + + text_parts: List[str] = [] + reasoning_parts: List[str] = [] + streamed_tool_calls: Dict[int, Dict[str, object]] = {} + + async with httpx.AsyncClient(timeout=timeout) as client: + async with client.stream( + "POST", + f"{api_base.rstrip('/')}/chat/completions", + headers=headers, + json=payload, + ) as response: + if response.status_code >= 400: + body = await response.aread() + raise RuntimeError( + f"OpenAI-compat stream request failed: {response.status_code} " + f"{body.decode(errors='replace')}" + ) + + async for line in response.aiter_lines(): + if not line or not line.startswith("data: "): + continue + data = line[6:].strip() + if not data or data == "[DONE]": + break + + chunk = json.loads(data) + choices = chunk.get("choices") or [] + if not choices: + continue + + delta = choices[0].get("delta") or {} + content_delta = delta.get("content") + if content_delta: + text_parts.append(content_delta) + + reasoning_delta = delta.get("reasoning_content") + if reasoning_delta: + reasoning_parts.append(reasoning_delta) + + for tool_delta in delta.get("tool_calls") or []: + idx = tool_delta.get("index", 0) + state = streamed_tool_calls.setdefault( + idx, + { + "id": None, + "type": "function", + "function": {"name": "", "arguments": ""}, + }, + ) + if tool_delta.get("id"): + state["id"] = tool_delta["id"] + if 
tool_delta.get("type"): + state["type"] = tool_delta["type"] + fn = tool_delta.get("function") or {} + if fn.get("name"): + state["function"]["name"] += fn["name"] + if fn.get("arguments"): + state["function"]["arguments"] += fn["arguments"] + + return _build_stream_response( + content="".join(text_parts), + reasoning_content="".join(reasoning_parts) or None, + tool_calls=[streamed_tool_calls[i] for i in sorted(streamed_tool_calls)], + ) + + +def _classify_retryable_error(error_text: str) -> tuple[bool, bool, bool]: + is_rate_limit = any( + keyword in error_text + for keyword in ['rate limit', 'rate_limit', 'too many requests', '429'] + ) + is_overloaded = any( + keyword in error_text + for keyword in ['overloaded', '500', '502', '503', '504', 'internal server error', 'service unavailable'] + ) + is_connection_error = any( + keyword in error_text + for keyword in ['cannot connect', 'connection refused', 'connection reset', + 'connectionerror', 'timeout', 'name resolution', + 'temporary failure', 'network unreachable'] + ) + return is_rate_limit, is_overloaded, is_connection_error + + +def _retry_backoff(error_text: str, attempt: int) -> tuple[str, int]: + is_rate_limit, _is_overloaded, is_connection_error = _classify_retryable_error(error_text) + if is_rate_limit: + return "Rate limit", 60 + (attempt * 30) + if is_connection_error: + return "Connection", min(10 * (2 ** attempt), 60) + return "Server overload", min(5 * (2 ** attempt), 60) + + +def _should_retry_completion_error(exc: Exception) -> bool: + error_text = str(exc).lower() + return any( + keyword in error_text + for keyword in [ + 'rate limit', 'rate_limit', 'too many requests', '429', + 'overloaded', '500', '502', '503', '504', 'internal server error', + 'service unavailable', 'cannot connect', 'connection refused', + 'connection reset', 'connectionerror', 'timeout', 'name resolution', + 'temporary failure', 'network unreachable' + ] + ) + + +def _should_use_nonstream_compat_only(completion_kwargs: 
Optional[Dict] = None) -> bool: + return False + + +async def _execute_openai_compat_completion(completion_kwargs: Dict, timeout: float): + return await _openai_compat_stream_completion( + model=completion_kwargs["model"], + messages=completion_kwargs["messages"], + timeout=timeout, + litellm_kwargs=completion_kwargs, + tools=completion_kwargs.get("tools"), + tool_choice=completion_kwargs.get("tool_choice"), + reasoning_effort=completion_kwargs.get("reasoning_effort"), + ) + + def _sanitize_schema(params: Dict) -> Dict: """Sanitize tool parameter schema to comply with Claude API requirements. @@ -263,15 +479,26 @@ async def _summarize_tool_result( Concise summary:""" _extra = litellm_kwargs or {} - response = await asyncio.wait_for( - litellm.acompletion( - model=model, - messages=[{"role": "user", "content": prompt}], - timeout=timeout, - **_extra, - ), - timeout=timeout + 5 - ) + if _should_use_openai_stream_compat(_extra): + response = await asyncio.wait_for( + _openai_compat_stream_completion( + model=model, + messages=[{"role": "user", "content": prompt}], + timeout=timeout, + litellm_kwargs=_extra, + ), + timeout=timeout + 5, + ) + else: + response = await asyncio.wait_for( + litellm.acompletion( + model=model, + messages=[{"role": "user", "content": prompt}], + timeout=timeout, + **_extra, + ), + timeout=timeout + 5 + ) summary = response.choices[0].message.content.strip() result = f"[SUMMARY of {len(content):,} chars]\n{summary}" @@ -552,10 +779,16 @@ async def _call_with_retry(self, **completion_kwargs): for attempt in range(self.max_retries): try: # Add timeout to the completion call - response = await asyncio.wait_for( - litellm.acompletion(**completion_kwargs), - timeout=self.timeout - ) + if _should_use_openai_stream_compat(completion_kwargs): + response = await asyncio.wait_for( + _execute_openai_compat_completion(completion_kwargs, self.timeout), + timeout=self.timeout, + ) + else: + response = await asyncio.wait_for( + 
litellm.acompletion(**completion_kwargs), + timeout=self.timeout + ) return response except asyncio.TimeoutError: self._logger.error( @@ -573,35 +806,10 @@ async def _call_with_retry(self, **completion_kwargs): last_exception = e error_str = str(e).lower() - # Check if it's a retryable error - is_rate_limit = any( - keyword in error_str - for keyword in ['rate limit', 'rate_limit', 'too many requests', '429'] - ) - - is_overloaded = any( - keyword in error_str - for keyword in ['overloaded', '500', '502', '503', '504', 'internal server error', 'service unavailable'] - ) - - is_connection_error = any( - keyword in error_str - for keyword in ['cannot connect', 'connection refused', 'connection reset', - 'connectionerror', 'timeout', 'name resolution', - 'temporary failure', 'network unreachable'] - ) - - if attempt < self.max_retries - 1 and (is_rate_limit or is_overloaded or is_connection_error): - if is_rate_limit: - backoff_delay = 60 + (attempt * 30) # 60s, 90s, 120s - error_type = "Rate limit" - elif is_connection_error: - backoff_delay = min(10 * (2 ** attempt), 60) # 10s, 20s, 40s, max 60s - error_type = "Connection" - else: - backoff_delay = min(5 * (2 ** attempt), 60) # 5s, 10s, 20s, max 60s - error_type = "Server overload" - + is_rate_limit, is_overloaded, is_connection_error = _classify_retryable_error(error_str) + + if attempt < self.max_retries - 1 and _should_retry_completion_error(e): + error_type, backoff_delay = _retry_backoff(error_str, attempt) self._logger.warning( f"{error_type} error (attempt {attempt + 1}/{self.max_retries}), " f"waiting {backoff_delay}s before retry..." 
diff --git a/openspace/mcp_proxy.py b/openspace/mcp_proxy.py new file mode 100644 index 0000000..814d317 --- /dev/null +++ b/openspace/mcp_proxy.py @@ -0,0 +1,255 @@ +from __future__ import annotations + +import asyncio +import argparse +import inspect +import json +import os +from pathlib import Path +from typing import Any + +from mcp.server.fastmcp import FastMCP +from mcp.types import TextContent + +from openspace.mcp_stdio import maybe_redirect_stderr_to_file +from openspace.grounding.backends.mcp.client import MCPClient +from openspace.mcp_tool_registration import ( + register_evolution_tools, + register_main_tools, +) +from openspace.shared_mcp_runtime import ServerKind, ensure_daemon + + +_LOG_DIR = Path(__file__).resolve().parent.parent / "logs" +maybe_redirect_stderr_to_file(_LOG_DIR, "mcp_proxy_stderr.log") + + +def _proxy_mode_for(server_kind: ServerKind) -> str: + raw = os.environ.get("OPENSPACE_MCP_PROXY_MODE", "").strip().lower() + if raw in {"daemon", "direct"}: + return raw + return "daemon" + + +def _json_error(error: Any, **extra: Any) -> str: + return json.dumps({"error": str(error), **extra}, ensure_ascii=False) + + +def _extract_text_payload(result: Any) -> str: + text_parts: list[str] = [] + for item in getattr(result, "content", []): + if isinstance(item, TextContent): + text_parts.append(item.text) + continue + text = getattr(item, "text", None) + if text is not None: + text_parts.append(text) + if not text_parts: + raise RuntimeError("Remote MCP tool returned no text payload") + return "\n".join(text_parts) + + +class _RemoteProxyBase: + def __init__(self, server_kind: ServerKind): + self._server_kind = server_kind + + async def _call_remote_tool_once(self, tool_name: str, args: dict[str, Any]) -> str: + record = await ensure_daemon(self._server_kind) + client = MCPClient( + config={"mcpServers": {"daemon": {"url": record.url}}}, + timeout=10.0, + sse_read_timeout=60 * 60.0, + check_dependencies=False, + ) + try: + session = await 
client.create_session("daemon", auto_initialize=True) + if session is None: + raise RuntimeError("Failed to create daemon MCP session") + result = await session.connector.call_tool(tool_name, args) + return _extract_text_payload(result) + finally: + await client.close_all_sessions() + + def _call_remote_tool_blocking(self, tool_name: str, args: dict[str, Any]) -> str: + return asyncio.run(self._call_remote_tool_once(tool_name, args)) + + async def _call_remote_tool(self, tool_name: str, args: dict[str, Any]) -> str: + for attempt in range(2): + try: + return await asyncio.to_thread( + self._call_remote_tool_blocking, + tool_name, + args, + ) + except Exception as exc: + if attempt == 1: + return _json_error(exc, status="error") + return _json_error("Unreachable proxy retry path", status="error") + + +class _MainProxyImplementation(_RemoteProxyBase): + def __init__(self): + super().__init__("main") + + async def execute_task( + self, + task: str, + workspace_dir: str | None = None, + max_iterations: int | None = None, + skill_dirs: list[str] | None = None, + search_scope: str = "all", + ) -> str: + return await self._call_remote_tool( + "execute_task", + { + "task": task, + "workspace_dir": workspace_dir, + "max_iterations": max_iterations, + "skill_dirs": skill_dirs, + "search_scope": search_scope, + }, + ) + + async def search_skills( + self, + query: str, + source: str = "all", + limit: int = 20, + auto_import: bool = True, + ) -> str: + return await self._call_remote_tool( + "search_skills", + { + "query": query, + "source": source, + "limit": limit, + "auto_import": auto_import, + }, + ) + + async def fix_skill( + self, + skill_dir: str, + direction: str, + ) -> str: + return await self._call_remote_tool( + "fix_skill", + { + "skill_dir": skill_dir, + "direction": direction, + }, + ) + + async def upload_skill( + self, + skill_dir: str, + visibility: str = "public", + origin: str | None = None, + parent_skill_ids: list[str] | None = None, + tags: list[str] | 
None = None, + created_by: str | None = None, + change_summary: str | None = None, + ) -> str: + return await self._call_remote_tool( + "upload_skill", + { + "skill_dir": skill_dir, + "visibility": visibility, + "origin": origin, + "parent_skill_ids": parent_skill_ids, + "tags": tags, + "created_by": created_by, + "change_summary": change_summary, + }, + ) + + +class _EvolutionProxyImplementation(_RemoteProxyBase): + def __init__(self): + super().__init__("evolution") + + async def evolve_from_context( + self, + task: str, + summary: str, + workspace_dir: str | None = None, + file_paths: list[str] | None = None, + max_skills: int = 3, + skill_dirs: list[str] | None = None, + output_dir: str | None = None, + ) -> str: + return await self._call_remote_tool( + "evolve_from_context", + { + "task": task, + "summary": summary, + "workspace_dir": workspace_dir, + "file_paths": file_paths, + "max_skills": max_skills, + "skill_dirs": skill_dirs, + "output_dir": output_dir, + }, + ) + + +def _build_fastmcp(server_kind: ServerKind) -> FastMCP: + kwargs: dict[str, Any] = {} + if "description" in inspect.signature(FastMCP.__init__).parameters: + if server_kind == "main": + kwargs["description"] = ( + "OpenSpace: Unite the Agents. Evolve the Mind. Rebuild the World." + ) + else: + kwargs["description"] = ( + "OpenSpace evolution sidecar: capture reusable skills from host-agent work." 
+ ) + name = "OpenSpace" if server_kind == "main" else "OpenSpace Evolution" + return FastMCP(name, **kwargs) + + +def _run_proxy(server_kind: ServerKind) -> None: + if _proxy_mode_for(server_kind) == "direct": + if server_kind == "main": + from openspace.mcp_server import run_mcp_server + else: + from openspace.evolution_mcp_server import run_mcp_server + run_mcp_server() + return + + parser = argparse.ArgumentParser(description="OpenSpace MCP proxy") + parser.add_argument("--transport", choices=["stdio"], default="stdio") + parser.parse_args() + + mcp = _build_fastmcp(server_kind) + if server_kind == "main": + register_main_tools(mcp, _MainProxyImplementation()) + else: + register_evolution_tools(mcp, _EvolutionProxyImplementation()) + mcp.run(transport="stdio") + + +def run_main_mcp_proxy() -> None: + _run_proxy("main") + + +def run_evolution_mcp_proxy() -> None: + _run_proxy("evolution") + + +def main() -> None: + parser = argparse.ArgumentParser(description="OpenSpace MCP proxy") + parser.add_argument("--kind", choices=["main", "evolution"], required=True) + parser.add_argument("--transport", choices=["stdio"], default="stdio") + args = parser.parse_args() + + # Rebuild argv for the generic runner so direct fallback can reuse legacy entrypoints. 
+ transport = args.transport + os.environ.setdefault("OPENSPACE_MCP_PROXY_MODE", _proxy_mode_for(args.kind)) + import sys + + sys.argv = [sys.argv[0], "--transport", transport] + _run_proxy(args.kind) + + +if __name__ == "__main__": + main() diff --git a/openspace/mcp_server.py b/openspace/mcp_server.py index ba4ca31..06b1a02 100644 --- a/openspace/mcp_server.py +++ b/openspace/mcp_server.py @@ -21,10 +21,16 @@ import json import logging import os +import signal import sys +import threading +import time from pathlib import Path from typing import Any, Dict, List, Optional +from openspace.mcp_stdio import maybe_redirect_stderr_to_file +from openspace.mcp_tool_registration import register_main_tools +from openspace.shared_mcp_runtime import update_current_daemon_status class _MCPSafeStdout: """Stdout wrapper: binary (.buffer) → real stdout, text (.write) → stderr.""" @@ -48,7 +54,11 @@ def writelines(self, lines): def flush(self): self._stderr.flush() - self._real.flush() + try: + self._real.flush() + except ValueError: + # The MCP stdio transport may close stdout before Python's final flush. + pass def isatty(self): return self._stderr.isatty() @@ -81,17 +91,7 @@ def __getattr__(self, name): _LOG_DIR.mkdir(parents=True, exist_ok=True) _real_stdout = sys.stdout - -# Windows pipe buffers are small. When using stdio MCP transport, -# the parent process only reads stdout for MCP messages and does NOT -# drain stderr. Heavy log/print output during execute_task fills the stderr -# pipe buffer, blocking this process on write() → deadlock → timeout. -# Redirect stderr to a log file on Windows to prevent this. 
-if os.name == "nt": - _stderr_file = open( - _LOG_DIR / "mcp_stderr.log", "a", encoding="utf-8", buffering=1 - ) - sys.stderr = _stderr_file +maybe_redirect_stderr_to_file(_LOG_DIR, "mcp_stderr.log") sys.stdout = _MCPSafeStdout(_real_stdout, sys.stderr) @@ -118,6 +118,14 @@ def __getattr__(self, name): _openspace_instance = None _openspace_lock = asyncio.Lock() _standalone_store = None +_idle_watchdog_started = False +_activity_lock = threading.Lock() +_active_request_count = 0 +_last_activity_at = time.monotonic() +_embedding_prewarm_started = False +_embedding_prewarm_lock = threading.Lock() +_shutdown_started = False +_shutdown_lock = threading.Lock() # Internal state: tracks bot skill directories already registered this session. _registered_skill_dirs: set = set() @@ -258,6 +266,94 @@ def _get_local_skill_registry(): return registry +def _prewarm_main_daemon_skill_embeddings() -> None: + """Warm local skill embeddings and cache local candidate vectors. + + Runs in a background thread for the main daemon path so the first user + request is less likely to pay the full fastembed/model cold-start cost. 
+ """ + try: + from openspace.cloud.embedding import ( + prewarm_local_skill_embedding_backend, + using_local_skill_embeddings, + ) + from openspace.cloud.search import build_local_candidates + from openspace.skill_engine.skill_ranker import SkillCandidate, SkillRanker + + if not using_local_skill_embeddings(): + logger.info("Skipping main daemon embedding prewarm: remote skill embeddings active") + update_current_daemon_status("main", warmed=True) + return + + if not prewarm_local_skill_embedding_backend(): + logger.info("Main daemon embedding prewarm did not initialize a local embedder") + update_current_daemon_status( + "main", + warmed=False, + warmup_error="local embedder did not initialize", + ) + return + + registry = _get_local_skill_registry() + if not registry: + logger.info("Skipping main daemon embedding cache prewarm: no local skill registry") + update_current_daemon_status("main", warmed=True) + return + + candidates = build_local_candidates(registry.list_skills(), store=None) + if not candidates: + logger.info("Skipping main daemon embedding cache prewarm: no local candidates") + update_current_daemon_status("main", warmed=True) + return + + ranker = SkillRanker(enable_cache=True) + skill_candidates: list[SkillCandidate] = [] + for candidate in candidates: + skill_candidate = SkillCandidate( + skill_id=candidate.get("skill_id", ""), + name=candidate.get("name", ""), + description=candidate.get("description", ""), + body="", + metadata=candidate, + ) + skill_candidate.embedding_text = candidate.get("_embedding_text", "") + skill_candidates.append(skill_candidate) + + warmed = ranker.prime_candidates(skill_candidates) + logger.info( + "Main daemon skill embedding prewarm complete: %s/%s local candidates ready", + warmed, + len(skill_candidates), + ) + update_current_daemon_status("main", warmed=True, warmup_error=None) + except Exception as exc: + logger.warning("Main daemon embedding prewarm failed: %s", exc) + update_current_daemon_status("main", 
warmed=False, warmup_error=str(exc)) + + +def _maybe_start_main_daemon_embedding_prewarm() -> None: + global _embedding_prewarm_started + + if os.environ.get("OPENSPACE_MCP_DAEMON") != "1": + return + if os.environ.get("OPENSPACE_MCP_DISABLE_EMBEDDING_PREWARM", "").strip().lower() in { + "1", + "true", + "yes", + }: + return + + with _embedding_prewarm_lock: + if _embedding_prewarm_started: + return + threading.Thread( + target=_prewarm_main_daemon_skill_embeddings, + name="openspace-main-embedding-prewarm", + daemon=True, + ).start() + _embedding_prewarm_started = True + + def _get_cloud_client(): """Get a OpenSpaceClient instance (raises CloudError if not configured).""" from openspace.cloud.auth import get_openspace_auth @@ -525,9 +621,111 @@ def _json_error(error: Any, **extra) -> str: return json.dumps({"error": str(error), **extra}, ensure_ascii=False) -# MCP Tools (4 tools) -@mcp.tool() -async def execute_task( +def _mark_request_start() -> None: + global _active_request_count, _last_activity_at + with _activity_lock: + _active_request_count += 1 + _last_activity_at = time.monotonic() + update_current_daemon_status("main", touch=True, active_delta=1) + + +def _mark_request_end() -> None: + global _active_request_count, _last_activity_at + with _activity_lock: + _active_request_count = max(0, _active_request_count - 1) + _last_activity_at = time.monotonic() + update_current_daemon_status("main", touch=True, active_delta=-1) + + +def _shutdown_worker(reason: str) -> None: + logger.info("Shutting down OpenSpace MCP daemon: %s", reason) + instance = _openspace_instance + if instance is not None and instance.is_initialized(): + try: + asyncio.run(asyncio.wait_for(instance.cleanup(), timeout=10.0)) + except Exception as exc: + logger.warning("OpenSpace MCP cleanup during shutdown failed: %s", exc) + logging.shutdown() + os._exit(0) + + +def _begin_shutdown(reason: str) -> None: + global _shutdown_started + with _shutdown_lock: + if _shutdown_started: + return + 
_shutdown_started = True + + threading.Thread( + target=_shutdown_worker, + args=(reason,), + name="openspace-mcp-shutdown", + daemon=True, + ).start() + + +def _install_signal_handlers() -> None: + def _handle(signum, _frame) -> None: + try: + signame = signal.Signals(signum).name + except Exception: + signame = str(signum) + _begin_shutdown(f"signal {signame}") + + for signum in (signal.SIGTERM, signal.SIGINT): + try: + signal.signal(signum, _handle) + except Exception: + continue + + +def _idle_watchdog_loop(idle_timeout_seconds: int) -> None: + check_interval = max(1, min(max(idle_timeout_seconds // 3, 1), 60)) + logger.info("MCP idle watchdog enabled: timeout=%ss", idle_timeout_seconds) + while True: + time.sleep(check_interval) + with _activity_lock: + active = _active_request_count + idle_for = time.monotonic() - _last_activity_at + if active == 0 and idle_for >= idle_timeout_seconds: + logger.info( + "MCP idle watchdog exiting process after %.1fs idle with no active requests", + idle_for, + ) + _begin_shutdown(f"idle timeout after {idle_for:.1f}s") + return + + +def _maybe_start_idle_watchdog() -> None: + global _idle_watchdog_started + if _idle_watchdog_started: + return + + timeout_raw = os.environ.get("OPENSPACE_MCP_IDLE_TIMEOUT_SECONDS", "").strip() + if timeout_raw: + try: + idle_timeout_seconds = int(timeout_raw) + except ValueError: + logger.warning("Invalid OPENSPACE_MCP_IDLE_TIMEOUT_SECONDS=%r", timeout_raw) + return + else: + idle_timeout_seconds = 900 + + if idle_timeout_seconds <= 0: + return + + watchdog = threading.Thread( + target=_idle_watchdog_loop, + args=(idle_timeout_seconds,), + name="openspace-mcp-idle-watchdog", + daemon=True, + ) + watchdog.start() + _idle_watchdog_started = True + + +# MCP tool implementations +async def _execute_task_impl( task: str, workspace_dir: str | None = None, max_iterations: int | None = None, @@ -562,6 +760,7 @@ async def execute_task( if no API key is configured. 
"local" — local SkillRegistry only (fast, no cloud). """ + _mark_request_start() try: openspace = await _get_openspace() @@ -603,10 +802,11 @@ async def execute_task( except Exception as e: logger.error(f"execute_task failed: {e}", exc_info=True) return _json_error(e, status="error") + finally: + _mark_request_end() -@mcp.tool() -async def search_skills( +async def _search_skills_impl( query: str, source: str = "all", limit: int = 20, @@ -632,6 +832,7 @@ async def search_skills( limit: Maximum results to return (default: 20). auto_import: Auto-download top public cloud skills (default: True). """ + _mark_request_start() try: from openspace.cloud.search import hybrid_search_skills @@ -706,10 +907,11 @@ async def search_skills( except Exception as e: logger.error(f"search_skills failed: {e}", exc_info=True) return _json_error(e) + finally: + _mark_request_end() -@mcp.tool() -async def fix_skill( +async def _fix_skill_impl( skill_dir: str, direction: str, ) -> str: @@ -737,6 +939,7 @@ async def fix_skill( e.g. "The API endpoint changed from v1 to v2" or "Add retry logic for HTTP 429 rate limit errors". """ + _mark_request_start() try: from openspace.skill_engine.types import EvolutionSuggestion, EvolutionType from openspace.skill_engine.evolver import EvolutionContext, EvolutionTrigger @@ -824,10 +1027,11 @@ async def fix_skill( except Exception as e: logger.error(f"fix_skill failed: {e}", exc_info=True) return _json_error(e, status="error") + finally: + _mark_request_end() -@mcp.tool() -async def upload_skill( +async def _upload_skill_impl( skill_dir: str, visibility: str = "public", origin: str | None = None, @@ -864,6 +1068,7 @@ async def upload_skill( created_by: Override creator. Default: from .upload_meta.json. change_summary: Override summary. Default: from .upload_meta.json. 
""" + _mark_request_start() try: skill_path = Path(skill_dir) if not (skill_path / "SKILL.md").exists(): @@ -895,20 +1100,93 @@ async def upload_skill( except Exception as e: logger.error(f"upload_skill failed: {e}", exc_info=True) return _json_error(e, status="error") + finally: + _mark_request_end() + + +class _DirectMainToolImplementation: + async def execute_task( + self, + task: str, + workspace_dir: str | None = None, + max_iterations: int | None = None, + skill_dirs: list[str] | None = None, + search_scope: str = "all", + ) -> str: + return await _execute_task_impl( + task=task, + workspace_dir=workspace_dir, + max_iterations=max_iterations, + skill_dirs=skill_dirs, + search_scope=search_scope, + ) + + async def search_skills( + self, + query: str, + source: str = "all", + limit: int = 20, + auto_import: bool = True, + ) -> str: + return await _search_skills_impl( + query=query, + source=source, + limit=limit, + auto_import=auto_import, + ) + + async def fix_skill( + self, + skill_dir: str, + direction: str, + ) -> str: + return await _fix_skill_impl(skill_dir=skill_dir, direction=direction) + + async def upload_skill( + self, + skill_dir: str, + visibility: str = "public", + origin: str | None = None, + parent_skill_ids: list[str] | None = None, + tags: list[str] | None = None, + created_by: str | None = None, + change_summary: str | None = None, + ) -> str: + return await _upload_skill_impl( + skill_dir=skill_dir, + visibility=visibility, + origin=origin, + parent_skill_ids=parent_skill_ids, + tags=tags, + created_by=created_by, + change_summary=change_summary, + ) + + +register_main_tools(mcp, _DirectMainToolImplementation()) + def run_mcp_server() -> None: """Console-script entry point for ``openspace-mcp``.""" import argparse parser = argparse.ArgumentParser(description="OpenSpace MCP Server") - parser.add_argument("--transport", choices=["stdio", "sse"], default="stdio") + parser.add_argument( + "--transport", + choices=["stdio", "sse", 
"streamable-http"], + default="stdio", + ) parser.add_argument("--port", type=int, default=8080) args = parser.parse_args() - if args.transport == "sse": - mcp.run(transport="sse", sse_params={"port": args.port}) - else: - mcp.run(transport="stdio") + if args.transport == "stdio" or os.environ.get("OPENSPACE_MCP_DAEMON") == "1": + _install_signal_handlers() + _maybe_start_idle_watchdog() + if args.transport == "streamable-http": + _maybe_start_main_daemon_embedding_prewarm() + + mcp.settings.port = args.port + mcp.run(transport=args.transport) if __name__ == "__main__": diff --git a/openspace/mcp_stdio.py b/openspace/mcp_stdio.py new file mode 100644 index 0000000..f7a4ac5 --- /dev/null +++ b/openspace/mcp_stdio.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import os +import sys +from pathlib import Path +from typing import TextIO + + +_STDERR_CAPTURE_HANDLE: TextIO | None = None + + +def maybe_redirect_stderr_to_file(log_dir: Path, filename: str) -> TextIO | None: + """Redirect stderr to a log file when running as a non-interactive MCP child. + + Codex and similar MCP hosts typically do not surface or continuously drain + child-process stderr. Leaving verbose transport logs attached to a pipe can + back up the buffer and stall stdio tool calls. For interactive terminals we + keep stderr unchanged so local debugging still behaves normally. 
+ """ + global _STDERR_CAPTURE_HANDLE + + if os.environ.get("OPENSPACE_MCP_CAPTURE_STDERR", "").strip().lower() in { + "0", + "false", + "no", + }: + return None + + if _STDERR_CAPTURE_HANDLE is not None: + return _STDERR_CAPTURE_HANDLE + + try: + if sys.stderr is not None and sys.stderr.isatty(): + return None + except Exception: + pass + + log_dir.mkdir(parents=True, exist_ok=True) + handle = (log_dir / filename).open("a", encoding="utf-8", buffering=1) + sys.stderr = handle + _STDERR_CAPTURE_HANDLE = handle + return handle diff --git a/openspace/mcp_tool_registration.py b/openspace/mcp_tool_registration.py new file mode 100644 index 0000000..be9af0b --- /dev/null +++ b/openspace/mcp_tool_registration.py @@ -0,0 +1,254 @@ +from __future__ import annotations + +from typing import Protocol + +from mcp.server.fastmcp import FastMCP + + +class MainMCPToolImplementation(Protocol): + async def execute_task( + self, + task: str, + workspace_dir: str | None = None, + max_iterations: int | None = None, + skill_dirs: list[str] | None = None, + search_scope: str = "all", + ) -> str: ... + + async def search_skills( + self, + query: str, + source: str = "all", + limit: int = 20, + auto_import: bool = True, + ) -> str: ... + + async def fix_skill( + self, + skill_dir: str, + direction: str, + ) -> str: ... + + async def upload_skill( + self, + skill_dir: str, + visibility: str = "public", + origin: str | None = None, + parent_skill_ids: list[str] | None = None, + tags: list[str] | None = None, + created_by: str | None = None, + change_summary: str | None = None, + ) -> str: ... + + +class EvolutionMCPToolImplementation(Protocol): + async def evolve_from_context( + self, + task: str, + summary: str, + workspace_dir: str | None = None, + file_paths: list[str] | None = None, + max_skills: int = 3, + skill_dirs: list[str] | None = None, + output_dir: str | None = None, + ) -> str: ... 
+ + +def register_main_tools(mcp: FastMCP, impl: MainMCPToolImplementation) -> None: + @mcp.tool() + async def execute_task( + task: str, + workspace_dir: str | None = None, + max_iterations: int | None = None, + skill_dirs: list[str] | None = None, + search_scope: str = "all", + ) -> str: + """Execute a task with OpenSpace's full grounding engine. + + OpenSpace will: + 1. Auto-register bot skills from skill_dirs (if provided) + 2. Search for relevant skills (scope controls local vs cloud+local) + 3. Attempt skill-guided execution → fallback to pure tools + 4. Auto-analyze → auto-evolve (FIX/DERIVED/CAPTURED) if needed + + If skills are auto-evolved, the response includes ``evolved_skills`` + with ``upload_ready: true``. Call ``upload_skill`` with just the + ``skill_dir`` + ``visibility`` to upload — metadata is pre-saved. + + Note: This call blocks until the task completes (may take minutes). + Set MCP client tool-call timeout ≥ 600 seconds. + + Args: + task: The task instruction (natural language). + workspace_dir: Working directory. Defaults to OPENSPACE_WORKSPACE env. + max_iterations: Max agent iterations (default: 20). + skill_dirs: Bot's skill directories to auto-register so OpenSpace + can select and track them. Directories are re-scanned + on every call to discover skills created since the last + invocation. + search_scope: Skill search scope before execution. + "all" (default) — local + cloud; falls back to local + if no API key is configured. + "local" — local SkillRegistry only (fast, no cloud). + """ + return await impl.execute_task( + task=task, + workspace_dir=workspace_dir, + max_iterations=max_iterations, + skill_dirs=skill_dirs, + search_scope=search_scope, + ) + + @mcp.tool() + async def search_skills( + query: str, + source: str = "all", + limit: int = 20, + auto_import: bool = True, + ) -> str: + """Search skills across local registry and cloud community. + + Standalone search for browsing / discovery. 
Use this when the bot + wants to find available skills, then decide whether to handle the + task locally or delegate to ``execute_task``. + + **Scope difference from execute_task**: + - ``search_skills`` returns results to the bot for decision-making. + - ``execute_task``'s internal search feeds directly into execution + (the bot never sees the search results). + + Uses hybrid ranking: BM25 → embedding re-rank → lexical boost. + Embedding requires OPENAI_API_KEY; falls back to lexical-only without it. + + Args: + query: Search query text (natural language or keywords). + source: "all" (cloud + local), "local", or "cloud". Default: "all". + limit: Maximum results to return (default: 20). + auto_import: Auto-download top public cloud skills (default: True). + """ + return await impl.search_skills( + query=query, + source=source, + limit=limit, + auto_import=auto_import, + ) + + @mcp.tool() + async def fix_skill( + skill_dir: str, + direction: str, + ) -> str: + """Manually fix a broken skill. + + This is the **only** manual evolution entry point. DERIVED and + CAPTURED evolutions are triggered automatically by ``execute_task`` + (they need a task to run). Use ``fix_skill`` when: + + - A skill's instructions are wrong or outdated + - The bot knows exactly which skill is broken and what to fix + - Auto-evolution inside ``execute_task`` didn't catch the issue + + The skill does NOT need to be pre-registered in OpenSpace — + provide the skill directory path and OpenSpace will register it + automatically before fixing. + + After fixing, the new skill is saved locally and ``.upload_meta.json`` + is pre-written. Call ``upload_skill`` with just ``skill_dir`` + + ``visibility`` to upload. + + Args: + skill_dir: Path to the broken skill directory (must contain SKILL.md). + direction: What's broken and how to fix it. Be specific: + e.g. "The API endpoint changed from v1 to v2" or + "Add retry logic for HTTP 429 rate limit errors". 
+ """ + return await impl.fix_skill(skill_dir=skill_dir, direction=direction) + + @mcp.tool() + async def upload_skill( + skill_dir: str, + visibility: str = "public", + origin: str | None = None, + parent_skill_ids: list[str] | None = None, + tags: list[str] | None = None, + created_by: str | None = None, + change_summary: str | None = None, + ) -> str: + """Upload a local skill to the cloud. + + For evolved skills (from ``execute_task`` or ``fix_skill``), most + metadata is **pre-saved** in ``.upload_meta.json``. The bot only + needs to provide: + + - ``skill_dir`` — path to the skill directory + - ``visibility`` — "public" or "private" + + All other parameters are optional overrides. If omitted, pre-saved + values are used. If no pre-saved values exist, sensible defaults + are applied. + + **origin + parent_skill_ids constraints** (enforced by cloud): + - imported / captured → parent_skill_ids must be empty + - derived → at least 1 parent + - fixed → exactly 1 parent + + Args: + skill_dir: Path to skill directory (must contain SKILL.md). + visibility: "public" or "private". This is the one thing the + bot MUST decide. + origin: Override origin. Default: from .upload_meta.json or "imported". + parent_skill_ids: Override parents. Default: from .upload_meta.json. + tags: Override tags. Default: from .upload_meta.json. + created_by: Override creator. Default: from .upload_meta.json. + change_summary: Override summary. Default: from .upload_meta.json. 
+ """ + return await impl.upload_skill( + skill_dir=skill_dir, + visibility=visibility, + origin=origin, + parent_skill_ids=parent_skill_ids, + tags=tags, + created_by=created_by, + change_summary=change_summary, + ) + + +def register_evolution_tools( + mcp: FastMCP, + impl: EvolutionMCPToolImplementation, +) -> None: + @mcp.tool() + async def evolve_from_context( + task: str, + summary: str, + workspace_dir: str | None = None, + file_paths: list[str] | None = None, + max_skills: int = 3, + skill_dirs: list[str] | None = None, + output_dir: str | None = None, + ) -> str: + """Capture reusable skills from a completed host-agent task. + + Use this when the main task was already handled by another agent + (for example Codex Desktop) and OpenSpace should only spend provider + tokens on post-task skill capture. + + Args: + task: Short description of the completed task. + summary: What changed, what was learned, and what seems reusable. + workspace_dir: Repository/workspace path. Defaults to OPENSPACE_WORKSPACE. + file_paths: Optional files worth emphasizing when planning captures. + max_skills: Maximum number of new skills to capture. + skill_dirs: Optional additional skill directories to register first. + output_dir: Override directory for new skills. Defaults to the first + OPENSPACE_HOST_SKILL_DIRS entry. 
+ """ + return await impl.evolve_from_context( + task=task, + summary=summary, + workspace_dir=workspace_dir, + file_paths=file_paths, + max_skills=max_skills, + skill_dirs=skill_dirs, + output_dir=output_dir, + ) diff --git a/openspace/shared_mcp_runtime.py b/openspace/shared_mcp_runtime.py new file mode 100644 index 0000000..971b6b2 --- /dev/null +++ b/openspace/shared_mcp_runtime.py @@ -0,0 +1,619 @@ +from __future__ import annotations + +import asyncio +import contextlib +import hashlib +import json +import os +import signal +import socket +import subprocess +import sys +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Literal + +from openspace.config.loader import get_agent_config +from openspace.grounding.backends.mcp.client import MCPClient +from openspace.host_detection import ( + build_grounding_config_path, + build_llm_kwargs, + load_runtime_env, +) +from openspace.utils.logging import Logger + +logger = Logger.get_logger(__name__) + +ServerKind = Literal["main", "evolution"] + +_REPO_ROOT = Path(__file__).resolve().parent.parent +_EXPECTED_TOOL_NAMES: dict[ServerKind, tuple[str, ...]] = { + "main": ("execute_task", "search_skills", "fix_skill", "upload_skill"), + "evolution": ("evolve_from_context",), +} +_SERVER_MODULES: dict[ServerKind, str] = { + "main": "openspace.mcp_server", + "evolution": "openspace.evolution_mcp_server", +} + + +@dataclass(frozen=True) +class MCPDaemonIdentity: + server_kind: ServerKind + workspace: str + resolved_model: str + llm_kwargs_fingerprint: str + backend_scope: tuple[str, ...] + host_skill_dirs: tuple[str, ...] 
+ grounding_config_fingerprint: str + instance_key: str + state_dir: str + + @property + def metadata_path(self) -> Path: + return Path(self.state_dir) / f"{self.server_kind}-{self.instance_key}.json" + + @property + def lock_path(self) -> Path: + return Path(self.state_dir) / f"{self.server_kind}-{self.instance_key}.lock" + + @property + def log_path(self) -> Path: + return Path(self.state_dir) / f"{self.server_kind}-{self.instance_key}.log" + + +@dataclass(frozen=True) +class MCPDaemonRecord: + server_kind: ServerKind + instance_key: str + pid: int + port: int + workspace: str + resolved_model: str + llm_kwargs_fingerprint: str + backend_scope: list[str] + host_skill_dirs: list[str] + grounding_config_fingerprint: str + started_at: float + log_path: str + ready: bool = False + warmed: bool = False + ready_at: float | None = None + warmed_at: float | None = None + warmup_error: str | None = None + last_used_at: float | None = None + active_requests: int = 0 + + @property + def url(self) -> str: + return f"http://127.0.0.1:{self.port}/mcp" + + +class _FileLock: + def __init__(self, path: Path): + self._path = path + self._handle = None + + def __enter__(self): + self._path.parent.mkdir(parents=True, exist_ok=True) + self._handle = self._path.open("a+", encoding="utf-8") + if os.name == "nt": + import msvcrt + + while True: + try: + msvcrt.locking(self._handle.fileno(), msvcrt.LK_LOCK, 1) + break + except OSError: + time.sleep(0.1) + else: + import fcntl + + fcntl.flock(self._handle.fileno(), fcntl.LOCK_EX) + return self + + def __exit__(self, exc_type, exc, tb): + if not self._handle: + return + try: + if os.name == "nt": + import msvcrt + + self._handle.seek(0) + msvcrt.locking(self._handle.fileno(), msvcrt.LK_UNLCK, 1) + else: + import fcntl + + fcntl.flock(self._handle.fileno(), fcntl.LOCK_UN) + finally: + self._handle.close() + self._handle = None + + +def _default_state_dir() -> Path: + override = os.environ.get("OPENSPACE_MCP_DAEMON_STATE_DIR", "").strip() + 
if override: + return Path(override).expanduser().resolve() + + if sys.platform == "darwin": + base = Path.home() / "Library" / "Application Support" + elif os.name == "nt": + base = Path(os.environ.get("LOCALAPPDATA", Path.home() / "AppData" / "Local")) + else: + base = Path(os.environ.get("XDG_STATE_HOME", Path.home() / ".local" / "state")) + return (base / "openspace" / "mcp-daemons").resolve() + + +def _canonical_workspace() -> Path: + workspace = Path(os.environ.get("OPENSPACE_WORKSPACE") or os.getcwd()).expanduser() + workspace = workspace.resolve() + try: + proc = subprocess.run( + ["git", "-C", str(workspace), "rev-parse", "--show-toplevel"], + check=False, + capture_output=True, + text=True, + ) + if proc.returncode == 0 and proc.stdout.strip(): + return Path(proc.stdout.strip()).resolve() + except Exception: + pass + return workspace + + +def _effective_backend_scope(server_kind: ServerKind) -> list[str]: + raw = os.environ.get("OPENSPACE_BACKEND_SCOPE", "").strip() + if raw: + parts = [part.strip().lower() for part in raw.split(",") if part.strip()] + return sorted(dict.fromkeys(parts)) + + if server_kind == "evolution": + return ["shell", "system"] + + agent_cfg = get_agent_config("GroundingAgent") or {} + parts = agent_cfg.get("backend_scope") or ["gui", "shell", "mcp", "web", "system"] + return sorted(dict.fromkeys(str(part).strip().lower() for part in parts if str(part).strip())) + + +def _effective_host_skill_dirs() -> list[str]: + raw = os.environ.get("OPENSPACE_HOST_SKILL_DIRS", "").strip() + if not raw: + return [] + + normalized: list[str] = [] + for item in raw.split(","): + item = item.strip() + if not item: + continue + resolved = str(Path(item).expanduser().resolve()) + if resolved not in normalized: + normalized.append(resolved) + return normalized + + +def _fingerprint_payload(payload: Any) -> str: + encoded = json.dumps(payload, sort_keys=True, ensure_ascii=False, separators=(",", ":")).encode("utf-8") + return 
hashlib.sha256(encoded).hexdigest() + + +def _grounding_config_fingerprint() -> str: + config_path = build_grounding_config_path() + if not config_path: + return "none" + + path = Path(config_path) + if path.is_file(): + return hashlib.sha256(path.read_bytes()).hexdigest() + return hashlib.sha256(str(path).encode("utf-8")).hexdigest() + + +def compute_daemon_identity(server_kind: ServerKind) -> MCPDaemonIdentity: + load_runtime_env() + + workspace = _canonical_workspace() + env_model = os.environ.get("OPENSPACE_MODEL", "") + resolved_model, llm_kwargs = build_llm_kwargs(env_model) + backend_scope = _effective_backend_scope(server_kind) + host_skill_dirs = _effective_host_skill_dirs() + grounding_config_fingerprint = _grounding_config_fingerprint() + llm_kwargs_fingerprint = _fingerprint_payload(llm_kwargs) + + key_payload = { + "server_kind": server_kind, + "workspace": str(workspace), + "resolved_model": resolved_model, + "llm_kwargs_fingerprint": llm_kwargs_fingerprint, + "backend_scope": backend_scope, + "host_skill_dirs": host_skill_dirs, + "grounding_config_fingerprint": grounding_config_fingerprint, + } + + return MCPDaemonIdentity( + server_kind=server_kind, + workspace=str(workspace), + resolved_model=resolved_model, + llm_kwargs_fingerprint=llm_kwargs_fingerprint, + backend_scope=tuple(backend_scope), + host_skill_dirs=tuple(host_skill_dirs), + grounding_config_fingerprint=grounding_config_fingerprint, + instance_key=_fingerprint_payload(key_payload)[:32], + state_dir=str(_default_state_dir()), + ) + + +def _read_record(path: Path) -> MCPDaemonRecord | None: + if not path.is_file(): + return None + try: + return MCPDaemonRecord(**json.loads(path.read_text(encoding="utf-8"))) + except Exception as exc: + logger.warning("Failed to read daemon metadata %s: %s", path, exc) + return None + + +def _write_record(path: Path, record: MCPDaemonRecord) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp_path = path.with_suffix(path.suffix + ".tmp") + 
tmp_path.write_text( + json.dumps(asdict(record), ensure_ascii=False, indent=2) + "\n", + encoding="utf-8", + ) + tmp_path.replace(path) + + +def _record_last_used_at(record: MCPDaemonRecord) -> float: + return record.last_used_at or record.warmed_at or record.ready_at or record.started_at + + +def _max_daemons_per_kind() -> int: + raw = os.environ.get("OPENSPACE_MCP_MAX_DAEMONS_PER_KIND", "").strip() + if not raw: + return 8 + try: + return int(raw) + except ValueError: + logger.warning("Invalid OPENSPACE_MCP_MAX_DAEMONS_PER_KIND=%r", raw) + return 8 + + +def _unlink_record_artifacts(record: MCPDaemonRecord, state_dir: str) -> None: + metadata_path, lock_path = _metadata_paths(record.server_kind, record.instance_key, state_dir) + with contextlib.suppress(FileNotFoundError): + metadata_path.unlink() + with contextlib.suppress(FileNotFoundError): + lock_path.unlink() + + +def _collect_records(state_dir: str, server_kind: ServerKind) -> list[MCPDaemonRecord]: + state_path = Path(state_dir) + if not state_path.is_dir(): + return [] + + records: list[MCPDaemonRecord] = [] + for metadata_path in sorted(state_path.glob(f"{server_kind}-*.json")): + record = _read_record(metadata_path) + if record is not None: + records.append(record) + return records + + +def _reap_state_dir_records( + state_dir: str, + server_kind: ServerKind, + *, + keep_instance_key: str | None = None, +) -> None: + max_records = _max_daemons_per_kind() + live_records: list[MCPDaemonRecord] = [] + + for record in _collect_records(state_dir, server_kind): + if not _pid_exists(record.pid) or not _pid_matches_server(record): + _unlink_record_artifacts(record, state_dir) + continue + live_records.append(record) + + if max_records <= 0 or len(live_records) <= max_records: + return + + remaining = len(live_records) + for record in sorted(live_records, key=_record_last_used_at): + if remaining <= max_records: + break + if keep_instance_key and record.instance_key == keep_instance_key: + continue + if 
record.active_requests > 0: + continue + + logger.info( + "Reaping %s daemon pid=%s instance_key=%s last_used_at=%.3f to keep fleet <= %s", + record.server_kind, + record.pid, + record.instance_key, + _record_last_used_at(record), + max_records, + ) + _terminate_record_process(record) + _unlink_record_artifacts(record, state_dir) + remaining -= 1 + + +def _metadata_paths( + server_kind: ServerKind, + instance_key: str, + state_dir: str, +) -> tuple[Path, Path]: + state_path = Path(state_dir) + return ( + state_path / f"{server_kind}-{instance_key}.json", + state_path / f"{server_kind}-{instance_key}.lock", + ) + + +def update_current_daemon_status( + server_kind: ServerKind, + *, + ready: bool | None = None, + warmed: bool | None = None, + warmup_error: str | None = None, + touch: bool = False, + active_delta: int = 0, +) -> MCPDaemonRecord | None: + instance_key = os.environ.get("OPENSPACE_MCP_INSTANCE_KEY", "").strip() + state_dir = os.environ.get("OPENSPACE_MCP_DAEMON_STATE_DIR", "").strip() + if not instance_key or not state_dir: + return None + + metadata_path, lock_path = _metadata_paths(server_kind, instance_key, state_dir) + with _FileLock(lock_path): + record = _read_record(metadata_path) + if record is None: + return None + now = time.time() + + updates: dict[str, Any] = {} + if ready is not None: + updates["ready"] = ready + if ready and record.ready_at is None: + updates["ready_at"] = now + if warmed is not None: + updates["warmed"] = warmed + if warmed and record.warmed_at is None: + updates["warmed_at"] = now + if warmup_error is not None: + updates["warmup_error"] = warmup_error + + if active_delta: + updates["active_requests"] = max(0, record.active_requests + active_delta) + touch = True + if touch: + updates["last_used_at"] = now + + if not updates: + return record + + updated = MCPDaemonRecord( + **{ + **asdict(record), + **updates, + } + ) + _write_record(metadata_path, updated) + return updated + + +def _pid_exists(pid: int) -> bool: + if pid 
<= 0: + return False + try: + os.kill(pid, 0) + return True + except OSError: + return False + + +def _expected_process_marker(server_kind: ServerKind) -> str: + return _SERVER_MODULES[server_kind] + + +def _pid_matches_server(record: MCPDaemonRecord) -> bool: + if os.name == "nt": + return _pid_exists(record.pid) + try: + proc = subprocess.run( + ["ps", "-o", "command=", "-p", str(record.pid)], + check=False, + capture_output=True, + text=True, + ) + except Exception: + return False + command = proc.stdout.strip() + return bool(command) and _expected_process_marker(record.server_kind) in command + + +def _terminate_record_process(record: MCPDaemonRecord) -> None: + if not _pid_exists(record.pid) or not _pid_matches_server(record): + return + + with contextlib.suppress(Exception): + os.kill(record.pid, signal.SIGTERM) + deadline = time.monotonic() + 3.0 + while time.monotonic() < deadline: + if not _pid_exists(record.pid): + return + time.sleep(0.1) + with contextlib.suppress(Exception): + os.kill(record.pid, signal.SIGKILL) + + +def _pick_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("127.0.0.1", 0)) + sock.listen(1) + return int(sock.getsockname()[1]) + + +def _spawn_daemon(identity: MCPDaemonIdentity, port: int) -> MCPDaemonRecord: + env = os.environ.copy() + env["OPENSPACE_MCP_DAEMON"] = "1" + env["OPENSPACE_MCP_INSTANCE_KEY"] = identity.instance_key + env["OPENSPACE_MCP_DAEMON_STATE_DIR"] = identity.state_dir + env["OPENSPACE_WORKSPACE"] = identity.workspace + env["OPENSPACE_MODEL"] = identity.resolved_model + env["OPENSPACE_BACKEND_SCOPE"] = ",".join(identity.backend_scope) + if identity.host_skill_dirs: + env["OPENSPACE_HOST_SKILL_DIRS"] = ",".join(identity.host_skill_dirs) + else: + env.pop("OPENSPACE_HOST_SKILL_DIRS", None) + + log_path = identity.log_path + log_path.parent.mkdir(parents=True, exist_ok=True) + log_handle = log_path.open("ab") + + popen_kwargs: dict[str, Any] = { + "cwd": 
async def _probe_record(record: MCPDaemonRecord) -> bool:
    """Connect to the daemon's streamable-HTTP endpoint and verify its tool set.

    Returns True only when the advertised tools exactly match what the
    daemon kind is expected to expose; any connection or protocol failure
    is treated as "not ready".
    """
    client = MCPClient(
        config={"mcpServers": {"daemon": {"url": record.url}}},
        timeout=5.0,
        sse_read_timeout=15.0,
        max_retries=1,
        retry_interval=0.1,
        check_dependencies=False,
    )
    try:
        session = await client.create_session("daemon", auto_initialize=True)
        if session is None:
            return False
        tools = await session.list_tools()
        advertised = {tool.name for tool in tools}
        return advertised == set(_EXPECTED_TOOL_NAMES[record.server_kind])
    except Exception:
        return False
    finally:
        # Probe sessions are throwaway; never let cleanup errors escape.
        with contextlib.suppress(Exception):
            await client.close_all_sessions()


async def _wait_until_ready(record: MCPDaemonRecord, timeout_seconds: float = 15.0) -> bool:
    """Poll *record* every 250ms until it probes healthy or the timeout lapses."""
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        if _pid_exists(record.pid) and await _probe_record(record):
            return True
        await asyncio.sleep(0.25)
    return False
async def ensure_daemon(server_kind: ServerKind) -> MCPDaemonRecord:
    """Return a ready, probed daemon record for *server_kind*, spawning if needed.

    Under the per-identity file lock this:
      1. Reaps stale/excess daemons in the state directory.
      2. Reuses an existing live daemon that answers the tool probe,
         refreshing its ready/warmed/last-used metadata.
      3. Otherwise terminates any stale process and spawns a fresh daemon,
         retrying up to 3 times before raising.

    Raises:
        RuntimeError: when no spawned daemon becomes ready.

    NOTE(review): the probe/spawn awaits happen while the file lock is held,
    so concurrent callers for the same identity serialize here — confirm this
    is acceptable for callers on the same event loop.
    """
    identity = compute_daemon_identity(server_kind)
    identity.metadata_path.parent.mkdir(parents=True, exist_ok=True)

    with _FileLock(identity.lock_path):
        _reap_state_dir_records(
            identity.state_dir,
            server_kind,
            keep_instance_key=identity.instance_key,
        )

        existing = _read_record(identity.metadata_path)
        if existing and _pid_exists(existing.pid) and await _probe_record(existing):
            now = time.time()
            updates: dict[str, Any] = {"last_used_at": now}
            # Non-main (evolution) daemons count as warmed once they answer
            # the probe; main daemons are warmed separately.
            if not existing.ready or (server_kind != "main" and not existing.warmed):
                warmed = existing.warmed or server_kind != "main"
                updates.update(
                    ready=True,
                    ready_at=existing.ready_at or now,
                    warmed=warmed,
                    warmed_at=existing.warmed_at or (now if warmed else None),
                )
            # Fix: the previous version returned ``refreshed or existing``,
            # where ``refreshed`` was always truthy — the ``or existing`` was
            # dead code; the two near-identical refresh branches are merged.
            refreshed = MCPDaemonRecord(**{**asdict(existing), **updates})
            _write_record(identity.metadata_path, refreshed)
            return refreshed

        if existing:
            # Metadata exists but the process is dead or unresponsive.
            _terminate_record_process(existing)
            _unlink_record_artifacts(existing, identity.state_dir)

        last_error: Exception | None = None
        for _ in range(3):
            record = _spawn_daemon(identity, _pick_free_port())
            _write_record(identity.metadata_path, record)
            if await _wait_until_ready(record):
                now = time.time()
                ready_record = MCPDaemonRecord(
                    **{
                        **asdict(record),
                        "ready": True,
                        "ready_at": now,
                        "warmed": server_kind != "main",
                        "warmed_at": now if server_kind != "main" else None,
                        "last_used_at": now,
                    }
                )
                _write_record(identity.metadata_path, ready_record)
                return ready_record

            last_error = RuntimeError(
                f"Daemon for key={identity.instance_key} did not become ready"
            )
            _terminate_record_process(record)
            _unlink_record_artifacts(record, identity.state_dir)

        raise last_error or RuntimeError("Failed to start daemon")
daemon") diff --git a/openspace/skill_engine/evolver.py b/openspace/skill_engine/evolver.py index 818d8bf..e8ff6fa 100644 --- a/openspace/skill_engine/evolver.py +++ b/openspace/skill_engine/evolver.py @@ -43,6 +43,7 @@ create_skill, fix_skill, derive_skill, + parse_multi_file_full, SKILL_FILENAME, ) from .skill_utils import ( @@ -71,6 +72,7 @@ _SKILL_CONTENT_MAX_CHARS = 12_000 # Max chars of SKILL.md in evolution prompt _MAX_SKILL_NAME_LENGTH = 50 # Max chars for a skill name (directory name) +_MULTI_FILE_FULL_HEADER_RE = re.compile(r"(?m)^\*\*\*\s*File:\s*") def _sanitize_skill_name(name: str) -> str: @@ -95,6 +97,88 @@ def _sanitize_skill_name(name: str) -> str: truncated = truncated[:last_hyphen] return truncated.strip("-") + +def _extract_generated_frontmatter_field(content: str, field_name: str) -> Optional[str]: + """Extract frontmatter fields from single-file or multi-file FULL outputs.""" + direct = _extract_frontmatter_field(content, field_name) + if direct: + return direct + + if _MULTI_FILE_FULL_HEADER_RE.search(content): + try: + skill_md = parse_multi_file_full(content).get(SKILL_FILENAME, "") + except Exception: + skill_md = "" + if skill_md: + nested = _extract_frontmatter_field(skill_md, field_name) + if nested: + return nested + + # Fall back to the top of the generated text to tolerate minor format drift. 
+ head = "\n".join(content.splitlines()[:40]) + match = re.search( + rf"(?im)^\s*{re.escape(field_name)}\s*:\s*(.+?)\s*$", + head, + ) + if not match: + return None + + value = match.group(1).strip().strip("\"'") + return value or None + + +def _serialize_multi_file_full(files: Dict[str, str]) -> str: + """Serialize parsed FULL files back into the ``*** Begin Files`` format.""" + ordered_paths = [SKILL_FILENAME] + ordered_paths.extend(path for path in sorted(files) if path != SKILL_FILENAME) + + lines = ["*** Begin Files"] + for path in ordered_paths: + if path not in files: + continue + lines.append(f"*** File: {path}") + body = files[path].rstrip("\n") + if body: + lines.append(body) + lines.append("*** End Files") + return "\n".join(lines).strip() + "\n" + + +def _set_generated_frontmatter_field(content: str, field_name: str, value: str) -> str: + """Inject or update frontmatter fields in single-file or multi-file FULL outputs.""" + if _MULTI_FILE_FULL_HEADER_RE.search(content): + files = parse_multi_file_full(content) + files[SKILL_FILENAME] = _set_frontmatter_field( + files.get(SKILL_FILENAME, ""), + field_name, + value, + ) + return _serialize_multi_file_full(files) + + return _set_frontmatter_field(content, field_name, value) + + +def _fallback_captured_skill_name( + direction: str, + category: Optional[SkillCategory], +) -> str: + """Generate a deterministic fallback skill name from the capture direction.""" + hint = (direction or "").strip() + hint = re.split(r"[:.;\n]", hint, maxsplit=1)[0].strip() + hint = re.sub(r"(?i)^capture\s+", "", hint) + hint = re.sub(r"(?i)^a\s+reusable\s+", "", hint) + hint = re.sub(r"(?i)^reusable\s+", "", hint) + hint = re.sub(r"(?i)^(workflow|reference|tool[- ]guide)\s+for\s+", "", hint) + hint = re.sub(r"(?i)^(adding|creating|capturing|building)\s+", "", hint) + hint = re.sub(r"(?i)^(a|an|the)\s+", "", hint) + + fallback = _sanitize_skill_name(hint) + if fallback: + return fallback + + prefix = (category.value if category 
else "captured-skill").replace("_", "-") + return f"{prefix}-{uuid.uuid4().hex[:6]}" + _ANALYSIS_CONTEXT_MAX = 5 # Max recent analyses to include in prompt _ANALYSIS_NOTE_MAX_CHARS = 500 # Per-analysis note truncation @@ -937,15 +1021,26 @@ async def _evolve_captured(self, ctx: EvolutionContext) -> Optional[SkillRecord] new_content, change_summary = _extract_change_summary(new_content) # Extract name/description from the generated content - new_name = _extract_frontmatter_field(new_content, "name") - new_desc = _extract_frontmatter_field(new_content, "description") + new_name = _extract_generated_frontmatter_field(new_content, "name") + new_desc = _extract_generated_frontmatter_field(new_content, "description") if not new_name: - logger.warning("CAPTURED: LLM did not produce a valid skill name") - return None + new_name = _fallback_captured_skill_name( + ctx.suggestion.direction, + ctx.suggestion.category, + ) + logger.warning( + "CAPTURED: LLM did not produce a valid skill name; using fallback '%s'", + new_name, + ) # Sanitize name (enforce length limit + valid chars) new_name = _sanitize_skill_name(new_name) - new_content = _set_frontmatter_field(new_content, "name", new_name) + if not new_name: + new_name = _fallback_captured_skill_name( + ctx.suggestion.direction, + ctx.suggestion.category, + ) + new_content = _set_generated_frontmatter_field(new_content, "name", new_name) # Create new skill directory via create_skill (handles multi-file FULL) skill_dirs = self._registry._skill_dirs @@ -1528,4 +1623,4 @@ def _diagnose_skill_health( f"better error handling or alternative approaches." 
) - return None, "" \ No newline at end of file + return None, "" diff --git a/openspace/skill_engine/skill_ranker.py b/openspace/skill_engine/skill_ranker.py index 503eda5..25d2b61 100644 --- a/openspace/skill_engine/skill_ranker.py +++ b/openspace/skill_engine/skill_ranker.py @@ -7,7 +7,9 @@ Embedding strategy: - Text = ``name + description + SKILL.md body`` (consistent with MCP ``search_skills`` and the clawhub cloud platform) - - Model: ``qwen/qwen3-embedding-8b`` via OpenRouter API + - Backend is configurable via ``OPENSPACE_SKILL_EMBEDDING_BACKEND`` + and can use either a local fastembed model or a remote + OpenAI-compatible embedding endpoint - Embeddings are cached in-memory keyed by ``skill_id`` and optionally persisted to a pickle file for cross-session reuse @@ -32,8 +34,6 @@ logger = Logger.get_logger(__name__) -# Embedding model — must match clawhub platform for vector-space compatibility -SKILL_EMBEDDING_MODEL = "openai/text-embedding-3-small" SKILL_EMBEDDING_MAX_CHARS = 12_000 # Pre-filter threshold: when local skills exceed this count, BM25 pre-filter @@ -44,7 +44,7 @@ BM25_CANDIDATES_MULTIPLIER = 3 # top_k * 3 # Cache version — increment when format changes -_CACHE_VERSION = 1 +_CACHE_VERSION = 2 @dataclass @@ -172,6 +172,43 @@ def get_or_compute_embedding( self._save_cache() return emb + def get_cached_embedding(self, skill_id: str) -> Optional[List[float]]: + """Return a cached embedding without computing a new one.""" + return self._embedding_cache.get(skill_id) + + def prime_candidates(self, candidates: List[SkillCandidate]) -> int: + """Populate embeddings for candidates, saving cache once at the end. + + Returns the number of candidates that ended with an embedding, whether + loaded from cache or computed during this call. 
+ """ + warmed = 0 + cache_changed = False + + for candidate in candidates: + if candidate.embedding: + warmed += 1 + continue + + cached = self._embedding_cache.get(candidate.skill_id) + if cached: + candidate.embedding = cached + warmed += 1 + continue + + text = self._build_embedding_text(candidate) + emb = self._generate_embedding(text) + if emb: + candidate.embedding = emb + self._embedding_cache[candidate.skill_id] = emb + warmed += 1 + cache_changed = True + + if cache_changed: + self._save_cache() + + return warmed + def invalidate_cache(self, skill_id: str) -> None: """Remove a skill's cached embedding (e.g. after evolution).""" self._embedding_cache.pop(skill_id, None) @@ -238,13 +275,6 @@ def _bm25_rank( return ranked[:top_k] - @staticmethod - def _get_openai_api_key() -> Optional[str]: - """Resolve OpenAI-compatible API key for embedding requests.""" - from openspace.cloud.embedding import resolve_embedding_api - api_key, _ = resolve_embedding_api() - return api_key - @staticmethod def _build_embedding_text(candidate: SkillCandidate) -> str: """Build text for embedding, consistent with MCP search_skills.""" @@ -264,12 +294,8 @@ def _embedding_rank( top_k: int, ) -> List[SkillCandidate]: """Rank candidates using embedding cosine similarity.""" - api_key = self._get_openai_api_key() - if not api_key: - return [] - # Generate query embedding - query_emb = self._generate_embedding(query, api_key=api_key) + query_emb = self._generate_embedding(query) if not query_emb: return [] @@ -281,7 +307,7 @@ def _embedding_rank( c.embedding = cached else: text = self._build_embedding_text(c) - emb = self._generate_embedding(text, api_key=api_key) + emb = self._generate_embedding(text) if emb: c.embedding = emb self._embedding_cache[c.skill_id] = emb @@ -305,53 +331,20 @@ def _generate_embedding( text: str, api_key: Optional[str] = None, ) -> Optional[List[float]]: - """Generate embedding via OpenAI-compatible API (text-embedding-3-small). 
+ """Generate embedding via the configured skill-embedding backend. - Delegates credential / base-URL resolution to - :func:`openspace.cloud.embedding.resolve_embedding_api`. + Delegates backend/model resolution to :mod:`openspace.cloud.embedding`. """ - from openspace.cloud.embedding import resolve_embedding_api - - resolved_key, base_url = resolve_embedding_api() - if not api_key: - api_key = resolved_key - if not api_key: - return None - - import urllib.request - - body = json.dumps({ - "model": SKILL_EMBEDDING_MODEL, - "input": text, - }).encode("utf-8") - - req = urllib.request.Request( - f"{base_url}/embeddings", - data=body, - headers={ - "Content-Type": "application/json", - "Authorization": f"Bearer {api_key}", - }, - method="POST", - ) - import time - last_err = None - for attempt in range(3): - try: - with urllib.request.urlopen(req, timeout=15) as resp: - data = json.loads(resp.read().decode("utf-8")) - return data.get("data", [{}])[0].get("embedding") - except Exception as e: - last_err = e - if attempt < 2: - delay = 2 * (attempt + 1) - logger.debug("Embedding request failed (attempt %d/3), retrying in %ds: %s", attempt + 1, delay, e) - time.sleep(delay) - logger.warning("Skill embedding generation failed after 3 attempts: %s", last_err) - return None + from openspace.cloud.embedding import generate_embedding + + return generate_embedding(text, api_key=api_key) def _cache_file(self) -> Path: - return self._cache_dir / f"skill_embeddings_v{_CACHE_VERSION}.pkl" + from openspace.cloud.embedding import resolve_skill_embedding_model + + model_name = resolve_skill_embedding_model() + safe_model_name = re.sub(r"[^a-zA-Z0-9_.-]+", "_", model_name) + return self._cache_dir / f"skill_embeddings_{safe_model_name}_v{_CACHE_VERSION}.pkl" def _load_cache(self) -> None: """Load embedding cache from disk.""" @@ -376,7 +369,7 @@ def _save_cache(self) -> None: self._cache_dir.mkdir(parents=True, exist_ok=True) data = { "version": _CACHE_VERSION, - "model": 
SKILL_EMBEDDING_MODEL, + "model": self._cache_file().stem, "last_updated": datetime.now().isoformat(), "embeddings": self._embedding_cache, } @@ -412,4 +405,3 @@ def build_skill_embedding_text( if len(raw) <= max_chars: return raw return raw[:max_chars] - diff --git a/openspace/tool_layer.py b/openspace/tool_layer.py index c2eaca9..1a902fc 100644 --- a/openspace/tool_layer.py +++ b/openspace/tool_layer.py @@ -58,6 +58,7 @@ class OpenSpaceConfig: enable_screenshot: bool = False enable_video: bool = False enable_conversation_log: bool = True # Save LLM conversations to conversations.jsonl + enable_skill_engine_without_recording: bool = False # Allow sidecar evolution without full task recording # Skill Evolution evolution_max_concurrent: int = 3 # Max parallel evolutions per trigger @@ -242,8 +243,13 @@ async def initialize(self) -> None: logger.info(f"✓ Skills: {len(skills)} discovered") self._grounding_agent.set_skill_registry(self._skill_registry) - # Initialize ExecutionAnalyzer (requires recording + skills) - if self.config.enable_recording and self._skill_registry: + # Initialize the skill engine whenever skills are available. + # Execution analysis still requires recordings, but skill capture + # can run without them (for host-agent sidecar workflows). + if self._skill_registry and ( + self.config.enable_recording + or self.config.enable_skill_engine_without_recording + ): try: skill_store = SkillStore() self._skill_store = skill_store # Expose for MCP server reuse @@ -257,19 +263,6 @@ async def initialize(self) -> None: # Bridge: pass quality_manager so analysis can feed back # LLM-identified tool issues to the tool quality system. 
- quality_mgr = ( - self._grounding_client.quality_manager - if self._grounding_client else None - ) - self._execution_analyzer = ExecutionAnalyzer( - store=skill_store, - llm_client=self._llm_client, - model=self.config.execution_analyzer_model, - skill_registry=self._skill_registry, - quality_manager=quality_mgr, - ) - logger.info("✓ Execution analysis enabled") - # Share store with GroundingAgent so retrieve_skill # can access quality metrics for LLM selection. self._grounding_agent._skill_store = skill_store @@ -287,8 +280,22 @@ async def initialize(self) -> None: f"✓ Skill evolution enabled " f"(concurrent={self.config.evolution_max_concurrent})" ) + + if self.config.enable_recording: + quality_mgr = ( + self._grounding_client.quality_manager + if self._grounding_client else None + ) + self._execution_analyzer = ExecutionAnalyzer( + store=skill_store, + llm_client=self._llm_client, + model=self.config.execution_analyzer_model, + skill_registry=self._skill_registry, + quality_manager=quality_mgr, + ) + logger.info("✓ Execution analysis enabled") except Exception as e: - logger.warning(f"Execution analyzer init failed (non-fatal): {e}") + logger.warning(f"Skill engine init failed (non-fatal): {e}") self._initialized = True logger.info("="*60) @@ -350,6 +357,7 @@ async def execute( self._task_done.clear() self._last_evolved_skills = [] # Reset per-execution tracking start_time = asyncio.get_event_loop().time() + execution_time = 0.0 # Use external task_id if provided, otherwise generate one if task_id is None: task_id = f"task_{uuid.uuid4().hex[:12]}" @@ -548,6 +556,9 @@ async def execute( } finally: + if execution_time <= 0: + execution_time = asyncio.get_event_loop().time() - start_time + recording_dir = None if self._recording_manager and self._recording_manager.recording_status: recording_dir = self._recording_manager.trajectory_dir diff --git a/openspace/utils/logging.py b/openspace/utils/logging.py index 7382374..f164b7e 100644 --- 
a/openspace/utils/logging.py +++ b/openspace/utils/logging.py @@ -212,7 +212,9 @@ def configure( # Console Handler if log_to_console: - ch = logging.StreamHandler(sys.stdout) + # Logs should never share stdout with structured program output + # such as MCP stdio frames. + ch = logging.StreamHandler(sys.stderr) ch.setLevel(resolved_level) ch.setFormatter(console_formatter) target_logger.addHandler(ch) @@ -229,7 +231,8 @@ def configure( # Record log file location if not cls._configured: - print(f"Log file enabled: {actual_log_file}") + sys.stderr.write(f"Log file enabled: {actual_log_file}\n") + sys.stderr.flush() cls._configured = True @@ -293,7 +296,7 @@ def reset_configuration(cls) -> None: @staticmethod def _stdout_supports_color() -> bool: - return sys.stdout.isatty() and not os.getenv("NO_COLOR") + return sys.stderr.isatty() and not os.getenv("NO_COLOR") @classmethod def _resolve_level(cls, level: Optional[int]) -> int: diff --git a/pyproject.toml b/pyproject.toml index 06c199c..507d3cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,10 @@ dependencies = [ "litellm>=1.70.0,<1.82.7", # pinned to avoid PYSEC-2026-2 supply-chain compromise (1.82.7/1.82.8 were malicious) "python-dotenv>=1.0.0", "openai>=1.0.0", + "fastembed>=0.8.0", "jsonschema>=4.25.0", "mcp>=1.0.0", + "websockets>=15.0.0", "anthropic>=0.71.0", "pillow>=12.0.0", "numpy>=1.24.0", @@ -69,6 +71,7 @@ Repository = "https://github.com/HKUDS/OpenSpace" openspace = "openspace.__main__:run_main" openspace-server = "openspace.local_server.main:main" openspace-mcp = "openspace.mcp_server:run_mcp_server" +openspace-evolution-mcp = "openspace.evolution_mcp_server:run_mcp_server" openspace-download-skill = "openspace.cloud.cli.download_skill:main" openspace-upload-skill = "openspace.cloud.cli.upload_skill:main" openspace-dashboard = "openspace.dashboard_server:main" diff --git a/requirements.txt b/requirements.txt index 6371308..abe1aa6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 
+2,10 @@ litellm>=1.70.0,<1.82.7 # pinned to avoid PYSEC-2026-2 supply-chain compromise (1.82.7/1.82.8 were malicious) python-dotenv>=1.0.0 openai>=1.0.0 +fastembed>=0.8.0 jsonschema>=4.25.0 mcp>=1.0.0 +websockets>=15.0.0 anthropic>=0.71.0 pillow>=12.0.0 numpy>=1.24.0 diff --git a/scripts/cleanup_openspace_daemons.py b/scripts/cleanup_openspace_daemons.py new file mode 100755 index 0000000..94d3709 --- /dev/null +++ b/scripts/cleanup_openspace_daemons.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import os +import signal +import subprocess +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Iterable + + +PROCESS_MARKERS = ( + "openspace.mcp_server", + "openspace.evolution_mcp_server", +) +STATE_FILE_PREFIXES = ("main-", "evolution-") +STATE_FILE_SUFFIXES = (".json", ".lock", ".log") + + +@dataclass +class ManagedProcess: + pid: int + command: str + source: str + state_dir: str | None = None + record_file: str | None = None + + +def default_state_dirs() -> list[Path]: + env_override = os.environ.get("OPENSPACE_MCP_DAEMON_STATE_DIR", "").strip() + candidates = [ + Path(env_override).expanduser().resolve() if env_override else None, + Path.home() / ".codex" / "state" / "openspace", + Path.home() / ".codex-openspace" / "state" / "openspace", + Path.home() / "Library" / "Application Support" / "openspace" / "mcp-daemons", + ] + + result: list[Path] = [] + seen: set[str] = set() + for path in candidates: + if path is None: + continue + key = str(path) + if key in seen: + continue + seen.add(key) + result.append(path) + return result + + +def is_target_command(command: str) -> bool: + return any(marker in command for marker in PROCESS_MARKERS) + + +def pid_exists(pid: int) -> bool: + if pid <= 0: + return False + try: + os.kill(pid, 0) + return True + except OSError: + return False + + +def command_for_pid(pid: int) -> str: + if pid <= 0: + return "" + 
proc = subprocess.run( + ["ps", "-o", "command=", "-p", str(pid)], + check=False, + capture_output=True, + text=True, + ) + return proc.stdout.strip() + + +def collect_metadata_processes(state_dir: Path) -> tuple[list[ManagedProcess], list[dict[str, object]], list[str]]: + managed: list[ManagedProcess] = [] + metadata_rows: list[dict[str, object]] = [] + warnings: list[str] = [] + + if not state_dir.is_dir(): + return managed, metadata_rows, warnings + + for metadata_path in sorted(state_dir.glob("*.json")): + try: + payload = json.loads(metadata_path.read_text(encoding="utf-8")) + except Exception as exc: # noqa: BLE001 + warnings.append(f"failed to read {metadata_path}: {exc}") + continue + + pid = int(payload.get("pid") or 0) + command = command_for_pid(pid) if pid_exists(pid) else "" + live_target = bool(command and is_target_command(command)) + metadata_rows.append( + { + "state_dir": str(state_dir), + "record_file": metadata_path.name, + "pid": pid, + "port": payload.get("port"), + "workspace": payload.get("workspace"), + "server_kind": payload.get("server_kind"), + "live_target": live_target, + "command": command, + } + ) + if live_target: + managed.append( + ManagedProcess( + pid=pid, + command=command, + source="metadata", + state_dir=str(state_dir), + record_file=metadata_path.name, + ) + ) + + return managed, metadata_rows, warnings + + +def collect_orphan_processes(excluded_pids: Iterable[int]) -> list[ManagedProcess]: + excluded = set(excluded_pids) + proc = subprocess.run( + ["ps", "-axo", "pid=,command="], + check=False, + capture_output=True, + text=True, + ) + + results: list[ManagedProcess] = [] + for line in proc.stdout.splitlines(): + stripped = line.strip() + if not stripped: + continue + try: + pid_text, command = stripped.split(None, 1) + except ValueError: + continue + pid = int(pid_text) + if pid in excluded: + continue + if not is_target_command(command): + continue + results.append(ManagedProcess(pid=pid, command=command, 
source="orphan-scan")) + return results + + +def terminate_process(pid: int, timeout_seconds: float) -> str: + if not pid_exists(pid): + return "already-exited" + + os.kill(pid, signal.SIGTERM) + deadline = time.monotonic() + timeout_seconds + while time.monotonic() < deadline: + if not pid_exists(pid): + return "terminated" + time.sleep(0.1) + + if pid_exists(pid): + os.kill(pid, signal.SIGKILL) + deadline = time.monotonic() + 1.0 + while time.monotonic() < deadline: + if not pid_exists(pid): + return "killed" + time.sleep(0.05) + return "kill-sent" + + +def removable_state_files(state_dir: Path, keep_logs: bool) -> list[Path]: + if not state_dir.is_dir(): + return [] + + removable: list[Path] = [] + for path in sorted(state_dir.iterdir()): + if not path.is_file(): + continue + if not path.name.startswith(STATE_FILE_PREFIXES): + continue + if not path.name.endswith(STATE_FILE_SUFFIXES): + continue + if keep_logs and path.suffix == ".log": + continue + removable.append(path) + return removable + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Clean OpenSpace MCP daemon processes and state files") + parser.add_argument( + "--state-dir", + action="append", + default=[], + help="Additional state directory to clean. 
Can be passed multiple times.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be terminated/removed without changing anything.", + ) + parser.add_argument( + "--keep-logs", + action="store_true", + help="Keep *.log files in state dirs while removing json/lock artifacts.", + ) + parser.add_argument( + "--timeout-seconds", + type=float, + default=3.0, + help="How long to wait after SIGTERM before SIGKILL.", + ) + parser.add_argument( + "--json", + action="store_true", + help="Print the final report as JSON.", + ) + return parser + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + + state_dirs = default_state_dirs() + for raw in args.state_dir: + state_dirs.append(Path(raw).expanduser().resolve()) + + deduped_state_dirs: list[Path] = [] + seen_state_dirs: set[str] = set() + for path in state_dirs: + key = str(path) + if key in seen_state_dirs: + continue + seen_state_dirs.add(key) + deduped_state_dirs.append(path) + + metadata_processes: list[ManagedProcess] = [] + metadata_rows: list[dict[str, object]] = [] + warnings: list[str] = [] + for state_dir in deduped_state_dirs: + managed, rows, row_warnings = collect_metadata_processes(state_dir) + metadata_processes.extend(managed) + metadata_rows.extend(rows) + warnings.extend(row_warnings) + + orphan_processes = collect_orphan_processes(proc.pid for proc in metadata_processes) + + unique_processes: dict[int, ManagedProcess] = {} + for proc in [*metadata_processes, *orphan_processes]: + unique_processes.setdefault(proc.pid, proc) + + process_actions: list[dict[str, object]] = [] + for proc in sorted(unique_processes.values(), key=lambda item: item.pid): + action = "would-terminate" if args.dry_run else terminate_process(proc.pid, args.timeout_seconds) + process_actions.append( + { + **asdict(proc), + "action": action, + } + ) + + file_actions: list[dict[str, object]] = [] + for state_dir in deduped_state_dirs: + for path in 
removable_state_files(state_dir, keep_logs=args.keep_logs): + action = "would-remove" + if not args.dry_run: + path.unlink(missing_ok=True) + action = "removed" + file_actions.append( + { + "state_dir": str(state_dir), + "path": str(path), + "action": action, + } + ) + + report = { + "state_dirs": [str(path) for path in deduped_state_dirs], + "metadata_records": metadata_rows, + "process_actions": process_actions, + "file_actions": file_actions, + "warnings": warnings, + "dry_run": args.dry_run, + } + + if args.json: + print(json.dumps(report, ensure_ascii=False, indent=2)) + return 0 + + print("OpenSpace daemon cleanup") + print(f"dry run: {'yes' if args.dry_run else 'no'}") + print("state dirs:") + for path in report["state_dirs"]: + print(f"- {path}") + + print("\nprocesses:") + if process_actions: + for item in process_actions: + print( + f"- pid={item['pid']} source={item['source']} action={item['action']} " + f"record={item.get('record_file') or '-'}" + ) + else: + print("- none") + + print("\nfiles:") + if file_actions: + for item in file_actions: + print(f"- {item['action']}: {item['path']}") + else: + print("- none") + + if warnings: + print("\nwarnings:") + for warning in warnings: + print(f"- {warning}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/codex-desktop-evolution b/scripts/codex-desktop-evolution new file mode 100755 index 0000000..6ff8dfe --- /dev/null +++ b/scripts/codex-desktop-evolution @@ -0,0 +1,196 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "$SCRIPT_DIR/.." && pwd)" +ENV_FILE="$REPO_ROOT/openspace/.env" +PRIMARY_CODEX_HOME="${PRIMARY_CODEX_HOME:-$HOME/.codex}" +PROFILE_HOME="${OPENSPACE_CODEX_HOME:-$HOME/.codex-openspace-desktop}" +PROJECT_NAME="$(basename "$REPO_ROOT")" +PROJECT_SKILL_DIR="$PROFILE_HOME/projects/$PROJECT_NAME/skills" +REPO_PYTHON="$REPO_ROOT/.venv/bin/python" + +if [[ ! 
-f "$PRIMARY_CODEX_HOME/config.toml" ]]; then + echo "Missing $PRIMARY_CODEX_HOME/config.toml" >&2 + exit 1 +fi + +if [[ ! -f "$PRIMARY_CODEX_HOME/auth.json" ]]; then + echo "Missing $PRIMARY_CODEX_HOME/auth.json" >&2 + exit 1 +fi + +read_env_value() { + local key="$1" + if [[ ! -f "$ENV_FILE" ]]; then + return 0 + fi + + python3 - <<'PY' "$ENV_FILE" "$key" +from pathlib import Path +import sys + +env_path = Path(sys.argv[1]) +target = sys.argv[2] + +for line in env_path.read_text(encoding="utf-8").splitlines(): + stripped = line.strip() + if not stripped or stripped.startswith("#") or "=" not in stripped: + continue + key, value = stripped.split("=", 1) + if key.strip() != target: + continue + value = value.strip().strip('"').strip("'") + print(value, end="") + break +PY +} + +OPENSPACE_MODEL="${OPENSPACE_MODEL:-$(read_env_value OPENSPACE_MODEL)}" +OPENSPACE_MODEL="${OPENSPACE_MODEL:-gpt-5.4}" +OPENSPACE_LLM_API_KEY="${OPENSPACE_LLM_API_KEY:-$(read_env_value OPENSPACE_LLM_API_KEY)}" +OPENSPACE_LLM_API_BASE="${OPENSPACE_LLM_API_BASE:-$(read_env_value OPENSPACE_LLM_API_BASE)}" +OPENSPACE_LLM_API_BASE="${OPENSPACE_LLM_API_BASE:-https://codexapi.space/v1}" +OPENSPACE_LLM_OPENAI_STREAM_COMPAT="${OPENSPACE_LLM_OPENAI_STREAM_COMPAT:-$(read_env_value OPENSPACE_LLM_OPENAI_STREAM_COMPAT)}" +OPENSPACE_LLM_OPENAI_STREAM_COMPAT="${OPENSPACE_LLM_OPENAI_STREAM_COMPAT:-true}" +OPENSPACE_SKILL_EMBEDDING_BACKEND="${OPENSPACE_SKILL_EMBEDDING_BACKEND:-$(read_env_value OPENSPACE_SKILL_EMBEDDING_BACKEND)}" +OPENSPACE_SKILL_EMBEDDING_BACKEND="${OPENSPACE_SKILL_EMBEDDING_BACKEND:-local}" +OPENSPACE_SKILL_EMBEDDING_MODEL="${OPENSPACE_SKILL_EMBEDDING_MODEL:-$(read_env_value OPENSPACE_SKILL_EMBEDDING_MODEL)}" +OPENSPACE_SKILL_EMBEDDING_MODEL="${OPENSPACE_SKILL_EMBEDDING_MODEL:-BAAI/bge-small-en-v1.5}" +OPENSPACE_SKILL_EMBEDDING_API_KEY="${OPENSPACE_SKILL_EMBEDDING_API_KEY:-$(read_env_value OPENSPACE_SKILL_EMBEDDING_API_KEY)}" 
+OPENSPACE_SKILL_EMBEDDING_API_BASE="${OPENSPACE_SKILL_EMBEDDING_API_BASE:-$(read_env_value OPENSPACE_SKILL_EMBEDDING_API_BASE)}" +EMBEDDING_API_KEY="${EMBEDDING_API_KEY:-$(read_env_value EMBEDDING_API_KEY)}" +EMBEDDING_BASE_URL="${EMBEDDING_BASE_URL:-$(read_env_value EMBEDDING_BASE_URL)}" +EMBEDDING_MODEL="${EMBEDDING_MODEL:-$(read_env_value EMBEDDING_MODEL)}" + +sync_profile_dir() { + local name="$1" + local src="$PRIMARY_CODEX_HOME/$name" + local dst="$PROFILE_HOME/$name" + + if [[ ! -d "$src" ]]; then + return + fi + + mkdir -p "$dst" + rsync -a --delete "$src/" "$dst/" +} + +bootstrap_profile_home() { + mkdir -p "$PROFILE_HOME" + mkdir -p "$PROJECT_SKILL_DIR" + mkdir -p "$PROFILE_HOME/skills" + + sync_profile_dir "plugins" + sync_profile_dir "skills" + sync_profile_dir "pua" + + cp "$PRIMARY_CODEX_HOME/auth.json" "$PROFILE_HOME/auth.json" + + python3 - <<'PY' "$PRIMARY_CODEX_HOME/config.toml" "$PROFILE_HOME/config.toml" "$REPO_ROOT" "$REPO_PYTHON" "$PROJECT_SKILL_DIR" "$PROFILE_HOME/skills" "$OPENSPACE_MODEL" "$OPENSPACE_LLM_API_KEY" "$OPENSPACE_LLM_API_BASE" "$OPENSPACE_LLM_OPENAI_STREAM_COMPAT" "$OPENSPACE_SKILL_EMBEDDING_BACKEND" "$OPENSPACE_SKILL_EMBEDDING_MODEL" "$OPENSPACE_SKILL_EMBEDDING_API_KEY" "$OPENSPACE_SKILL_EMBEDDING_API_BASE" "$EMBEDDING_API_KEY" "$EMBEDDING_BASE_URL" "$EMBEDDING_MODEL" +from pathlib import Path +import os +import sys + +src = Path(sys.argv[1]) +dst = Path(sys.argv[2]) +repo_root = sys.argv[3] +python_cmd = sys.argv[4] +project_skill_dir = sys.argv[5] +profile_skill_dir = sys.argv[6] +model = sys.argv[7] +api_key = sys.argv[8] +api_base = sys.argv[9] +stream_compat = sys.argv[10] +skill_embedding_backend = sys.argv[11] +skill_embedding_model = sys.argv[12] +skill_embedding_api_key = sys.argv[13] +skill_embedding_api_base = sys.argv[14] +embedding_api_key = sys.argv[15] +embedding_base_url = sys.argv[16] +embedding_model = sys.argv[17] + +def strip_tables(text: str, table_names: set[str]) -> str: + kept = [] + skipping = False + 
for line in text.splitlines(): + stripped = line.strip() + header = stripped.split("#", 1)[0].rstrip() + if header.startswith("[") and header.endswith("]"): + skipping = header in table_names or header.startswith("[mcp_servers.openspace_evolution.") + if skipping: + continue + if skipping: + continue + kept.append(line) + return "\n".join(kept).rstrip() + "\n" + +base = strip_tables( + src.read_text(encoding="utf-8"), + { + "[mcp_servers.openspace_evolution]", + "[mcp_servers.openspace_evolution.env]", + }, +) + +project_marker = f'[projects."{repo_root}"]' +if project_marker not in base: + base += f'\n{project_marker}\ntrust_level = "trusted"\n' + +if api_key and Path(python_cmd).is_file() and os.access(python_cmd, os.X_OK): + base += f''' +[mcp_servers.openspace_evolution] +command = "{python_cmd}" +args = ["-m", "openspace.evolution_mcp_server", "--transport", "stdio"] + +[mcp_servers.openspace_evolution.env] +OPENSPACE_WORKSPACE = "{repo_root}" +OPENSPACE_HOST_SKILL_DIRS = "{project_skill_dir},{profile_skill_dir}" +OPENSPACE_MODEL = "{model}" +OPENSPACE_LLM_API_KEY = "{api_key}" +OPENSPACE_LLM_API_BASE = "{api_base}" +OPENSPACE_LLM_OPENAI_STREAM_COMPAT = "{stream_compat}" +OPENSPACE_SKILL_EMBEDDING_BACKEND = "{skill_embedding_backend}" +OPENSPACE_SKILL_EMBEDDING_MODEL = "{skill_embedding_model}" +OPENSPACE_SKILL_EMBEDDING_API_KEY = "{skill_embedding_api_key}" +OPENSPACE_SKILL_EMBEDDING_API_BASE = "{skill_embedding_api_base}" +EMBEDDING_API_KEY = "{embedding_api_key}" +EMBEDDING_BASE_URL = "{embedding_base_url}" +EMBEDDING_MODEL = "{embedding_model}" +OPENSPACE_ENABLE_RECORDING = "false" +OPENSPACE_BACKEND_SCOPE = "shell,system" +''' + +dst.write_text(base, encoding="utf-8") +PY +} + +bootstrap_profile_home + +clear_openspace_env() { + local var + for var in ${!OPENSPACE_@}; do + unset "$var" + done + unset EMBEDDING_API_KEY + unset EMBEDDING_BASE_URL + unset EMBEDDING_MODEL +} + +if [[ -z "$OPENSPACE_LLM_API_KEY" ]]; then + echo "Warning: OpenSpace evolution 
sidecar is disabled because no provider key was found in $ENV_FILE or the current environment." >&2 +elif [[ ! -x "$REPO_PYTHON" ]]; then + echo "Warning: OpenSpace evolution sidecar is disabled because $REPO_PYTHON is missing." >&2 +fi + +clear_openspace_env + +if [[ "${1:-}" == "app" ]]; then + shift + if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + exec env CODEX_HOME="$PROFILE_HOME" codex app "$@" + fi + exec env CODEX_HOME="$PROFILE_HOME" codex app "$@" "$REPO_ROOT" +fi + +exec env CODEX_HOME="$PROFILE_HOME" codex -C "$REPO_ROOT" "$@" diff --git a/scripts/codex-openspace b/scripts/codex-openspace new file mode 100755 index 0000000..cb7bccf --- /dev/null +++ b/scripts/codex-openspace @@ -0,0 +1,169 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "$SCRIPT_DIR/.." && pwd)" +ENV_FILE="$REPO_ROOT/openspace/.env" +PRIMARY_CODEX_HOME="${PRIMARY_CODEX_HOME:-$HOME/.codex}" +PROFILE_HOME="${CODEX_HOME:-$HOME/.codex-openspace}" + +if [[ ! -f "$ENV_FILE" ]]; then + echo "Missing $ENV_FILE" >&2 + exit 1 +fi + +set -a +source "$ENV_FILE" +set +a + +if [[ -z "${OPENSPACE_LLM_API_KEY:-}" ]]; then + echo "OPENSPACE_LLM_API_KEY is missing in $ENV_FILE" >&2 + exit 1 +fi + +OPENSPACE_MODEL="${OPENSPACE_MODEL:-gpt-5.4}" +OPENSPACE_LLM_API_BASE="${OPENSPACE_LLM_API_BASE:-https://codexapi.space/v1}" +OPENSPACE_LLM_OPENAI_STREAM_COMPAT="${OPENSPACE_LLM_OPENAI_STREAM_COMPAT:-true}" +OPENSPACE_SKILL_EMBEDDING_BACKEND="${OPENSPACE_SKILL_EMBEDDING_BACKEND:-local}" +OPENSPACE_SKILL_EMBEDDING_MODEL="${OPENSPACE_SKILL_EMBEDDING_MODEL:-BAAI/bge-small-en-v1.5}" + +sync_profile_dir() { + local name="$1" + local src="$PRIMARY_CODEX_HOME/$name" + local dst="$PROFILE_HOME/$name" + + if [[ ! 
-d "$src" ]]; then + return + fi + + if [[ -L "$dst" ]]; then + rm -f "$dst" + fi + + mkdir -p "$dst" + rsync -a --delete "$src/" "$dst/" +} + +bootstrap_profile_home() { + mkdir -p "$PROFILE_HOME" + mkdir -p "$PROFILE_HOME/state/openspace" + + sync_profile_dir "plugins" + sync_profile_dir "skills" + sync_profile_dir "pua" + + cat > "$PROFILE_HOME/auth.json" < "$PROFILE_HOME/config.toml" <&2 + exit 1 +fi + +mkdir -p "$BIN_DIR" + +cat > "$BIN_DIR/openspace-global-mcp" <&2 + exit 1 +fi + +workspace="\${OPENSPACE_WORKSPACE:-\$PWD}" +if git_root="\$(git -C "\$workspace" rev-parse --show-toplevel 2>/dev/null)"; then + workspace="\$git_root" +fi + +project_name="\$(basename "\$workspace")" +if [[ -z "\$project_name" || "\$project_name" == "/" || "\$project_name" == "." ]]; then + project_name="default" +fi + +project_skill_dir="\${HOME}/.codex/projects/\${project_name}/skills" +mkdir -p "\$project_skill_dir" "\${HOME}/.codex/skills" + +export OPENSPACE_WORKSPACE="\$workspace" +export OPENSPACE_HOST_SKILL_DIRS="\${OPENSPACE_HOST_SKILL_DIRS:-\${project_skill_dir},\${HOME}/.codex/skills}" +export OPENSPACE_MCP_PROXY_MODE="\${OPENSPACE_MCP_PROXY_MODE:-daemon}" +export OPENSPACE_MCP_DAEMON_STATE_DIR="\${OPENSPACE_MCP_DAEMON_STATE_DIR:-\${CODEX_HOME}/state/openspace}" +mkdir -p "\$OPENSPACE_MCP_DAEMON_STATE_DIR" + +exec "\$REPO_PYTHON" -m openspace.mcp_proxy --kind main --transport stdio +EOF + +cat > "$BIN_DIR/openspace-evolution-global-mcp" <&2 + exit 1 +fi + +workspace="\${OPENSPACE_WORKSPACE:-\$PWD}" +if git_root="\$(git -C "\$workspace" rev-parse --show-toplevel 2>/dev/null)"; then + workspace="\$git_root" +fi + +export OPENSPACE_WORKSPACE="\$workspace" + +project_name="\$(basename "\$OPENSPACE_WORKSPACE")" +if [[ -z "\$project_name" || "\$project_name" == "/" || "\$project_name" == "." 
]]; then + project_name="default" +fi + +project_skill_dir="\${HOME}/.codex/projects/\${project_name}/skills" +mkdir -p "\$project_skill_dir" "\${HOME}/.codex/skills" + +export OPENSPACE_HOST_SKILL_DIRS="\${OPENSPACE_HOST_SKILL_DIRS:-\${project_skill_dir},\${HOME}/.codex/skills}" +export OPENSPACE_MCP_PROXY_MODE="\${OPENSPACE_MCP_PROXY_MODE:-daemon}" +export OPENSPACE_MCP_DAEMON_STATE_DIR="\${OPENSPACE_MCP_DAEMON_STATE_DIR:-\${CODEX_HOME}/state/openspace}" +mkdir -p "\$OPENSPACE_MCP_DAEMON_STATE_DIR" + +exec "\$REPO_PYTHON" -m openspace.mcp_proxy --kind evolution --transport stdio +EOF + +chmod +x "$BIN_DIR/openspace-global-mcp" "$BIN_DIR/openspace-evolution-global-mcp" + +cat <&2 + exit 1 +fi + +export OPENSPACE_MODEL="${OPENSPACE_MODEL:-gpt-5.4}" +export OPENSPACE_LLM_API_KEY="$api_key" +export OPENSPACE_LLM_API_BASE="${OPENSPACE_LLM_API_BASE:-http://127.0.0.1:8080/v1}" +export OPENSPACE_LLM_OPENAI_STREAM_COMPAT="${OPENSPACE_LLM_OPENAI_STREAM_COMPAT:-true}" +export OPENSPACE_SKILL_EMBEDDING_BACKEND="${OPENSPACE_SKILL_EMBEDDING_BACKEND:-local}" +export OPENSPACE_SKILL_EMBEDDING_MODEL="${OPENSPACE_SKILL_EMBEDDING_MODEL:-BAAI/bge-small-en-v1.5}" +export OPENSPACE_HOST_SKILL_DIRS="${OPENSPACE_HOST_SKILL_DIRS:-$ALT_HOME/skills}" +export OPENSPACE_WORKSPACE="${OPENSPACE_WORKSPACE:-$REPO_ROOT}" + +exec "$REPO_ROOT/.venv/bin/openspace" "$@" diff --git a/scripts/test_codex_openspace_sessions.py b/scripts/test_codex_openspace_sessions.py new file mode 100644 index 0000000..b10103e --- /dev/null +++ b/scripts/test_codex_openspace_sessions.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 +from pathlib import Path +import sys + +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + +from openspace.codex_session_scenarios import main + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..e7042aa --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,74 @@ +from __future__ import 
annotations + +import sys +from types import ModuleType +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +try: + import aiohttp # noqa: F401 +except ModuleNotFoundError: + aiohttp_stub = ModuleType("aiohttp") + + class _ClientTimeout: + def __init__(self, *, total=None): + self.total = total + + class _ClientSession: + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + async def close(self): + return None + + class _TCPConnector: + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + class _ClientResponse: + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + class _ClientResponseError(Exception): + def __init__(self, *args, status=None, message="", **kwargs): + super().__init__(message) + self.status = status + self.message = message + + aiohttp_stub.ClientTimeout = _ClientTimeout + aiohttp_stub.ClientSession = _ClientSession + aiohttp_stub.TCPConnector = _TCPConnector + aiohttp_stub.ClientResponse = _ClientResponse + aiohttp_stub.ClientResponseError = _ClientResponseError + sys.modules["aiohttp"] = aiohttp_stub + +try: + import yarl # noqa: F401 +except ModuleNotFoundError: + yarl_stub = ModuleType("yarl") + + class _URL(str): + def __new__(cls, value="", *args, **kwargs): + return str.__new__(cls, value) + + def with_path(self, value): + return type(self)(value) + + def join(self, other): + return type(self)(f"{self.rstrip('/')}/{str(other).lstrip('/')}") + + yarl_stub.URL = _URL + sys.modules["yarl"] = yarl_stub diff --git a/tests/test_codex_session_scenarios.py b/tests/test_codex_session_scenarios.py new file mode 100644 index 0000000..2929579 --- /dev/null +++ b/tests/test_codex_session_scenarios.py @@ -0,0 +1,147 @@ +from __future__ import annotations + +import json 
+from pathlib import Path + +from openspace.codex_session_scenarios import ( + cleanup_session_artifacts, + collect_session_family, + parse_exec_output, + snapshot_daemons, +) + + +def _write_session(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + json.dumps({"type": "session_meta", "payload": payload}, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + + +def test_parse_exec_output_extracts_thread_and_opening_status() -> None: + output = """ +2026-04-13T00:00:00Z WARN unrelated noise +{"type":"thread.started","thread_id":"thread-parent"} +{"type":"item.completed","item":{"id":"item_1","type":"agent_message","text":"先做一次 OpenSpace 预检,再开始当前任务。\\n\\nOpenSpace session: ready\\nOpenSpace machine: ready"}} +{"type":"item.completed","item":{"id":"item_2","type":"agent_message","text":"cold-start done"}} +""".strip() + + result = parse_exec_output(output) + + assert result.thread_id == "thread-parent" + assert result.opening is not None + assert result.opening.session_status == "ready" + assert result.opening.machine_status == "ready" + assert result.opening.fallback_present is False + assert result.agent_messages[-1] == "cold-start done" + + +def test_collect_session_family_discovers_descendants(tmp_path: Path) -> None: + sessions_root = tmp_path / "sessions" + parent = sessions_root / "2026/04/13/parent.jsonl" + child = sessions_root / "2026/04/13/child.jsonl" + grandchild = sessions_root / "2026/04/13/grandchild.jsonl" + unrelated = sessions_root / "2026/04/13/unrelated.jsonl" + + _write_session(parent, {"id": "parent-thread"}) + _write_session( + child, + { + "id": "child-thread", + "source": { + "subagent": { + "thread_spawn": { + "parent_thread_id": "parent-thread", + } + } + }, + }, + ) + _write_session( + grandchild, + { + "id": "grandchild-thread", + "source": { + "subagent": { + "thread_spawn": { + "parent_thread_id": "child-thread", + } + } + }, + }, + ) + _write_session(unrelated, {"id": 
"other-thread"}) + + family = collect_session_family("parent-thread", sessions_root) + + assert family.thread_ids == {"parent-thread", "child-thread", "grandchild-thread"} + assert family.session_files == {parent, child, grandchild} + + +def test_cleanup_session_artifacts_removes_files_and_index_rows(tmp_path: Path) -> None: + sessions_root = tmp_path / "sessions" + session_file = sessions_root / "2026/04/13/parent.jsonl" + _write_session(session_file, {"id": "parent-thread"}) + + session_index = tmp_path / "session_index.jsonl" + session_index.write_text( + "\n".join( + [ + json.dumps({"thread_id": "parent-thread", "session_file": str(session_file)}), + json.dumps({"thread_id": "keep-thread", "session_file": "/tmp/keep.jsonl"}), + ] + ) + + "\n", + encoding="utf-8", + ) + + cleanup_session_artifacts( + thread_ids={"parent-thread"}, + session_files={session_file}, + session_index_path=session_index, + ) + + assert not session_file.exists() + remaining = session_index.read_text(encoding="utf-8") + assert "parent-thread" not in remaining + assert "keep-thread" in remaining + + +def test_snapshot_daemons_filters_by_workspace(tmp_path: Path) -> None: + state_dir = tmp_path / "state" + state_dir.mkdir() + matching = state_dir / "main-match.json" + other = state_dir / "main-other.json" + + matching.write_text( + json.dumps( + { + "server_kind": "main", + "instance_key": "match", + "pid": 111, + "port": 9001, + "workspace": "/tmp/workspace-a", + "ready": True, + } + ), + encoding="utf-8", + ) + other.write_text( + json.dumps( + { + "server_kind": "main", + "instance_key": "other", + "pid": 222, + "port": 9002, + "workspace": "/tmp/workspace-b", + "ready": True, + } + ), + encoding="utf-8", + ) + + snapshot = snapshot_daemons(state_dir, "/tmp/workspace-a") + + assert set(snapshot) == {"main"} + assert snapshot["main"]["pid"] == 111 diff --git a/tests/test_embedding_cache_optimization.py b/tests/test_embedding_cache_optimization.py new file mode 100644 index 
0000000..4dd19e2 --- /dev/null +++ b/tests/test_embedding_cache_optimization.py @@ -0,0 +1,163 @@ +from __future__ import annotations + +import sys +from pathlib import Path +from types import ModuleType + +from openspace.cloud import embedding +from openspace.cloud.search import SkillSearchEngine +from openspace.skill_engine.skill_ranker import SkillCandidate, SkillRanker + + +class _DummyTextEmbedding: + instances: list["_DummyTextEmbedding"] = [] + + def __init__(self, model_name: str): + self.model_name = model_name + self.embed_inputs: list[list[str]] = [] + type(self).instances.append(self) + + def embed(self, texts): + batch = list(texts) + self.embed_inputs.append(batch) + for text in batch: + yield [float(len(text)), float(len(self.model_name))] + + +def _install_fastembed_stub(monkeypatch) -> None: + module = ModuleType("fastembed") + module.TextEmbedding = _DummyTextEmbedding + monkeypatch.setitem(sys.modules, "fastembed", module) + + +def _reset_embedding_state(monkeypatch) -> None: + monkeypatch.setattr(embedding, "_LOCAL_EMBEDDER", None, raising=False) + monkeypatch.setattr(embedding, "_LOCAL_EMBEDDER_MODEL", None, raising=False) + _DummyTextEmbedding.instances.clear() + + +def test_load_local_embedder_reuses_same_model_instance(monkeypatch) -> None: + _install_fastembed_stub(monkeypatch) + _reset_embedding_state(monkeypatch) + + first = embedding._load_local_embedder("unit-model") + second = embedding._load_local_embedder("unit-model") + third = embedding._load_local_embedder("other-model") + + assert first is second + assert third is not first + assert [instance.model_name for instance in _DummyTextEmbedding.instances] == [ + "unit-model", + "other-model", + ] + + +def test_generate_embedding_reuses_prewarmed_local_embedder(monkeypatch) -> None: + _install_fastembed_stub(monkeypatch) + _reset_embedding_state(monkeypatch) + monkeypatch.setenv("OPENSPACE_SKILL_EMBEDDING_BACKEND", "local") + monkeypatch.setenv("OPENSPACE_SKILL_EMBEDDING_MODEL", 
"unit-model") + + first = embedding.generate_embedding("alpha") + second = embedding.generate_embedding("beta") + + assert first == [5.0, 10.0] + assert second == [4.0, 10.0] + assert len(_DummyTextEmbedding.instances) == 1 + assert _DummyTextEmbedding.instances[0].embed_inputs == [["alpha"], ["beta"]] + + +def test_skill_ranker_reuses_persisted_embedding_cache_between_instances( + monkeypatch, + tmp_path, +) -> None: + calls: list[str] = [] + + monkeypatch.setattr( + "openspace.cloud.embedding.resolve_skill_embedding_model", + lambda backend=None: "unit-model", + ) + + def fake_generate_embedding(text: str, api_key=None): + calls.append(text) + return [float(len(text)), 1.0] + + monkeypatch.setattr( + SkillRanker, + "_generate_embedding", + staticmethod(fake_generate_embedding), + ) + + first_ranker = SkillRanker(cache_dir=tmp_path, enable_cache=True) + candidate = SkillCandidate( + skill_id="skill-1", + name="alpha", + description="beta", + body="gamma", + ) + first_ranker.hybrid_rank("query text", [candidate], top_k=1) + + cache_file = tmp_path / "skill_embeddings_unit-model_v2.pkl" + assert cache_file.exists() + assert calls == [ + "query text", + embedding.build_skill_embedding_text("alpha", "beta", "gamma"), + ] + + calls.clear() + + second_ranker = SkillRanker(cache_dir=tmp_path, enable_cache=True) + assert "skill-1" in second_ranker._embedding_cache + + second_candidate = SkillCandidate( + skill_id="skill-1", + name="alpha", + description="beta", + body="gamma", + ) + second_ranker.hybrid_rank("query text", [second_candidate], top_k=1) + + assert calls == ["query text"] + + +def test_skill_search_engine_uses_ranker_cache_for_local_candidates(monkeypatch) -> None: + events: list[tuple[str, str]] = [] + + class _DummyRanker: + def __init__(self, enable_cache: bool = True): + self.enable_cache = enable_cache + + def get_cached_embedding(self, skill_id: str): + events.append(("cached", skill_id)) + return [0.5, 0.5] + + def prime_candidates(self, candidates): + 
events.append(("prime", candidates[0].skill_id)) + return 1 + + monkeypatch.setattr( + "openspace.skill_engine.skill_ranker.SkillRanker", + _DummyRanker, + ) + monkeypatch.setattr( + "openspace.cloud.embedding.cosine_similarity", + lambda a, b: 0.75, + ) + + engine = SkillSearchEngine() + scored = engine._score_phase( + candidates=[ + { + "skill_id": "skill-local", + "name": "Local Skill", + "description": "demo", + "source": "openspace-local", + "_embedding_text": "Local Skill\ndemo", + } + ], + query_tokens=["local"], + query_embedding=[1.0, 1.0], + ) + + assert events == [("cached", "skill-local")] + assert scored[0]["vector_score"] == 0.75 diff --git a/tests/test_evolver_captured_names.py b/tests/test_evolver_captured_names.py new file mode 100644 index 0000000..a511911 --- /dev/null +++ b/tests/test_evolver_captured_names.py @@ -0,0 +1,79 @@ +from openspace.skill_engine.evolver import ( + _extract_generated_frontmatter_field, + _fallback_captured_skill_name, + _set_generated_frontmatter_field, +) + + +def test_extract_generated_frontmatter_field_from_single_file() -> None: + content = """--- +name: safe-file-write +description: Write files with validation. +--- + +# Safe File Write +""" + + assert _extract_generated_frontmatter_field(content, "name") == "safe-file-write" + assert ( + _extract_generated_frontmatter_field(content, "description") + == "Write files with validation." + ) + + +def test_extract_generated_frontmatter_field_from_multi_file_full() -> None: + content = """*** Begin Files +*** File: SKILL.md +--- +name: local-acceptance-entry +description: Run a repo's backend and frontend together for review. 
+--- + +# Local Acceptance Entry +*** File: examples/start.sh +#!/usr/bin/env bash +echo start +*** End Files +""" + + assert ( + _extract_generated_frontmatter_field(content, "name") + == "local-acceptance-entry" + ) + assert ( + _extract_generated_frontmatter_field(content, "description") + == "Run a repo's backend and frontend together for review." + ) + + +def test_set_generated_frontmatter_field_updates_skill_md_in_multi_file_full() -> None: + content = """*** Begin Files +*** File: SKILL.md +--- +description: Run a repo's backend and frontend together for review. +--- + +# Local Acceptance Entry +*** File: examples/start.sh +#!/usr/bin/env bash +echo start +*** End Files +""" + + updated = _set_generated_frontmatter_field(content, "name", "local-acceptance-entry") + + assert "*** File: SKILL.md" in updated + assert "name: local-acceptance-entry" in updated + assert "*** File: examples/start.sh" in updated + + +def test_fallback_captured_skill_name_uses_direction_signal() -> None: + direction = ( + "Capture a reusable workflow for adding a local acceptance entry in " + "split backend/frontend repos: create one canonical foreground acceptance script." 
+ ) + + fallback = _fallback_captured_skill_name(direction, None) + + assert fallback.startswith("local-acceptance-entry-in-split-backend") + assert len(fallback) <= 50 diff --git a/tests/test_logging_stdout_safety.py b/tests/test_logging_stdout_safety.py new file mode 100644 index 0000000..1f028cf --- /dev/null +++ b/tests/test_logging_stdout_safety.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import io +import logging +from pathlib import Path + +from openspace.utils.logging import Logger + + +def test_log_file_enable_announcement_avoids_stdout(monkeypatch, tmp_path: Path) -> None: + stdout = io.StringIO() + stderr = io.StringIO() + + monkeypatch.setattr("sys.stdout", stdout) + monkeypatch.setattr("sys.stderr", stderr) + + Logger.reset_configuration() + Logger.configure( + level=logging.INFO, + log_to_console=False, + log_to_file=str(tmp_path / "unit.log"), + attach_to_root=True, + ) + + assert stdout.getvalue() == "" + assert "Log file enabled:" in stderr.getvalue() + + Logger.reset_configuration() + + +def test_console_logging_avoids_stdout(monkeypatch, tmp_path: Path) -> None: + stdout = io.StringIO() + stderr = io.StringIO() + + monkeypatch.setattr("sys.stdout", stdout) + monkeypatch.setattr("sys.stderr", stderr) + + Logger.reset_configuration() + Logger.configure( + level=logging.INFO, + log_to_console=True, + log_to_file=str(tmp_path / "unit.log"), + attach_to_root=True, + ) + Logger.get_logger("openspace.test").info("console log should stay off stdout") + + assert stdout.getvalue() == "" + assert "console log should stay off stdout" in stderr.getvalue() + + Logger.reset_configuration() diff --git a/tests/test_mcp_entrypoints.py b/tests/test_mcp_entrypoints.py new file mode 100644 index 0000000..ba03e4a --- /dev/null +++ b/tests/test_mcp_entrypoints.py @@ -0,0 +1,119 @@ +from __future__ import annotations + +import argparse +import importlib +from types import SimpleNamespace + +import pytest + + +ENTRYPOINT_MODULES = [ + 
"openspace.mcp_server", + "openspace.evolution_mcp_server", +] + + +@pytest.mark.parametrize("module_name", ENTRYPOINT_MODULES) +def test_stdio_entrypoint_uses_stdio_transport(module_name, monkeypatch) -> None: + module = importlib.import_module(module_name) + calls: list[tuple[tuple[object, ...], dict[str, object]]] = [] + watchdog_calls: list[bool] = [] + + monkeypatch.setattr( + argparse.ArgumentParser, + "parse_args", + lambda self: SimpleNamespace(transport="stdio", port=9123), + ) + monkeypatch.setattr( + module.mcp, + "run", + lambda *args, **kwargs: calls.append((args, kwargs)), + ) + monkeypatch.setattr( + module, + "_maybe_start_idle_watchdog", + lambda: watchdog_calls.append(True), + ) + monkeypatch.setattr( + module, + "_install_signal_handlers", + lambda: None, + ) + + module.run_mcp_server() + + assert watchdog_calls == [True] + assert calls == [((), {"transport": "stdio"})] + assert module.mcp.settings.port == 9123 + + +@pytest.mark.parametrize("module_name", ENTRYPOINT_MODULES) +def test_sse_entrypoint_does_not_forward_sse_params(module_name, monkeypatch) -> None: + module = importlib.import_module(module_name) + calls: list[tuple[tuple[object, ...], dict[str, object]]] = [] + watchdog_calls: list[bool] = [] + + monkeypatch.setattr( + argparse.ArgumentParser, + "parse_args", + lambda self: SimpleNamespace(transport="sse", port=9123), + ) + monkeypatch.setattr( + module.mcp, + "run", + lambda *args, **kwargs: calls.append((args, kwargs)), + ) + monkeypatch.setattr( + module, + "_maybe_start_idle_watchdog", + lambda: watchdog_calls.append(True), + ) + monkeypatch.setattr( + module, + "_install_signal_handlers", + lambda: None, + ) + + module.run_mcp_server() + + assert watchdog_calls == [] + assert calls == [((), {"transport": "sse"})] + assert module.mcp.settings.port == 9123 + + +@pytest.mark.parametrize("module_name", ENTRYPOINT_MODULES) +def test_streamable_http_entrypoint_uses_watchdog_for_daemon( + module_name, + monkeypatch, +) -> None: + 
module = importlib.import_module(module_name) + calls: list[tuple[tuple[object, ...], dict[str, object]]] = [] + watchdog_calls: list[bool] = [] + + monkeypatch.setattr( + argparse.ArgumentParser, + "parse_args", + lambda self: SimpleNamespace(transport="streamable-http", port=9234), + ) + monkeypatch.setattr( + module.mcp, + "run", + lambda *args, **kwargs: calls.append((args, kwargs)), + ) + monkeypatch.setattr( + module, + "_maybe_start_idle_watchdog", + lambda: watchdog_calls.append(True), + ) + monkeypatch.setattr( + module, + "_install_signal_handlers", + lambda: None, + ) + monkeypatch.setenv("OPENSPACE_MCP_DAEMON", "1") + + module.run_mcp_server() + + assert watchdog_calls == [True] + assert calls == [((), {"transport": "streamable-http"})] + assert module.mcp.settings.port == 9234 diff --git a/tests/test_mcp_http_connector_transport.py b/tests/test_mcp_http_connector_transport.py new file mode 100644 index 0000000..04c6735 --- /dev/null +++ b/tests/test_mcp_http_connector_transport.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +import importlib.util +import logging +import sys +from pathlib import Path +from types import ModuleType + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parents[1] +BASE_FILE = REPO_ROOT / "openspace/grounding/backends/mcp/transport/connectors/base.py" +CORE_TM_BASE_FILE = REPO_ROOT / "openspace/grounding/core/transport/task_managers/base.py" +HTTP_FILE = REPO_ROOT / "openspace/grounding/backends/mcp/transport/connectors/http.py" + + +class _DummyStreamableHttpConnectionManager: + instances: list["_DummyStreamableHttpConnectionManager"] = [] + + def __init__(self, url, headers, timeout, read_timeout): + self.url = url + self.headers = headers + self.timeout = timeout + self.read_timeout = read_timeout + self.started = False + self.stopped = False + _DummyStreamableHttpConnectionManager.instances.append(self) + + async def start(self, timeout=None): + self.started = True + self.timeout_used = timeout + return 
"read-stream", "write-stream" + + def get_streams(self): + return ("read-stream", "write-stream") + + async def stop(self): + self.stopped = True + + +class _ForbiddenSseConnectionManager: + def __init__(self, *args, **kwargs): + raise AssertionError( + "SSE fallback should not be constructed when streamable HTTP succeeds" + ) + + +class _DummyClientSession: + def __init__(self, read_stream, write_stream, sampling_callback=None): + self.read_stream = read_stream + self.write_stream = write_stream + self.sampling_callback = sampling_callback + self.entered = False + self.initialized = False + self.tools_listed = False + self.exited = False + + async def __aenter__(self): + self.entered = True + return self + + async def initialize(self): + self.initialized = True + + async def list_tools(self): + self.tools_listed = True + return [] + + async def __aexit__(self, exc_type, exc, tb): + self.exited = True + + +def _install_package_stub( + monkeypatch, + module_name: str, + **attributes, +) -> ModuleType: + module = ModuleType(module_name) + module.__path__ = [] # mark as package + for key, value in attributes.items(): + setattr(module, key, value) + monkeypatch.setitem(sys.modules, module_name, module) + return module + + +class _BaseConnectorStub: + @classmethod + def __class_getitem__(cls, item): + return cls + + def __init__(self, connection_manager): + self._connection_manager = connection_manager + self._connection = None + self._connected = False + + async def _cleanup_on_connect_failure(self): + if self._connection_manager and hasattr(self._connection_manager, "stop"): + maybe_awaitable = self._connection_manager.stop() + if hasattr(maybe_awaitable, "__await__"): + await maybe_awaitable + self._connection = None + + async def _after_disconnect(self): + return None + + +class _BaseConnectionManagerStub: + @classmethod + def __class_getitem__(cls, item): + return cls + + +class _AsyncContextConnectionManagerStub(_BaseConnectionManagerStub): + pass + + +class 
_PlaceholderConnectionManagerStub: + def __init__(self, *args, **kwargs): + self._connection = None + + async def start(self, timeout=None): + return self._connection + + async def stop(self, timeout=5.0): + return None + + def get_streams(self): + return self._connection + + +def _load_module(module_name: str, file_path: Path) -> ModuleType: + spec = importlib.util.spec_from_file_location(module_name, file_path) + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +def _load_http_module(monkeypatch) -> ModuleType: + # Stub the package layers so we can load the target file without + # importing the broader MCP package tree and its optional deps. + _install_package_stub( + monkeypatch, + "openspace.grounding.core.transport.connectors", + BaseConnector=_BaseConnectorStub, + ) + _install_package_stub( + monkeypatch, + "openspace.grounding.core.transport.task_managers", + BaseConnectionManager=_BaseConnectionManagerStub, + AsyncContextConnectionManager=_AsyncContextConnectionManagerStub, + PlaceholderConnectionManager=_PlaceholderConnectionManagerStub, + ) + _install_package_stub( + monkeypatch, + "openspace.utils.logging", + Logger=type( + "Logger", + (), + {"get_logger": staticmethod(logging.getLogger)}, + ), + ) + _install_package_stub( + monkeypatch, + "openspace.grounding.backends.mcp.transport.task_managers", + SseConnectionManager=type("SseConnectionManager", (), {}), + StreamableHttpConnectionManager=type( + "StreamableHttpConnectionManager", (), {} + ), + ) + _install_package_stub( + monkeypatch, + "openspace.grounding.backends.mcp.transport.connectors", + ) + _install_package_stub( + monkeypatch, + "openspace.grounding.backends.mcp.transport", + ) + _install_package_stub( + monkeypatch, + "openspace.grounding.backends.mcp", + ) + + _load_module( + "openspace.grounding.backends.mcp.transport.connectors.base", + BASE_FILE, 
+ ) + _load_module( + "openspace.grounding.core.transport.task_managers.base", + CORE_TM_BASE_FILE, + ) + return _load_module( + "openspace.grounding.backends.mcp.transport.connectors.http", + HTTP_FILE, + ) + + +@pytest.mark.asyncio +async def test_http_connector_prefers_streamable_http(monkeypatch) -> None: + http_module = _load_http_module(monkeypatch) + + monkeypatch.setattr( + http_module, + "StreamableHttpConnectionManager", + _DummyStreamableHttpConnectionManager, + ) + monkeypatch.setattr( + http_module, + "SseConnectionManager", + _ForbiddenSseConnectionManager, + ) + monkeypatch.setattr(http_module, "ClientSession", _DummyClientSession) + + connector = http_module.HttpConnector("http://127.0.0.1:8123/mcp") + + await connector.connect() + + assert connector.transport_type == "streamable HTTP" + assert isinstance( + connector._connection_manager, _DummyStreamableHttpConnectionManager + ) + assert connector._connection == ("read-stream", "write-stream") + assert connector.client_session.entered is True + assert connector.client_session.initialized is True + assert connector.client_session.tools_listed is True + + client_session = connector.client_session + await connector.disconnect() + + assert client_session.exited is True + assert connector._connected is False + assert connector._connection is None + assert connector._connection_manager.stopped is True diff --git a/tests/test_mcp_proxy_runtime.py b/tests/test_mcp_proxy_runtime.py new file mode 100644 index 0000000..210622e --- /dev/null +++ b/tests/test_mcp_proxy_runtime.py @@ -0,0 +1,181 @@ +from __future__ import annotations + +import asyncio +from pathlib import Path + +from openspace import mcp_proxy +from openspace import shared_mcp_runtime +from openspace.mcp_tool_registration import register_main_tools + + +def test_proxy_mode_defaults_follow_split_rollout(monkeypatch) -> None: + monkeypatch.delenv("OPENSPACE_MCP_PROXY_MODE", raising=False) + + assert mcp_proxy._proxy_mode_for("main") == "daemon" + 
assert mcp_proxy._proxy_mode_for("evolution") == "daemon" + + monkeypatch.setenv("OPENSPACE_MCP_PROXY_MODE", "daemon") + assert mcp_proxy._proxy_mode_for("main") == "daemon" + assert mcp_proxy._proxy_mode_for("evolution") == "daemon" + + +def test_proxy_registration_is_lazy(monkeypatch) -> None: + async def _fail_if_called(server_kind): + raise AssertionError(f"ensure_daemon should not run during tool registration ({server_kind})") + + monkeypatch.setattr(mcp_proxy, "ensure_daemon", _fail_if_called) + + mcp = mcp_proxy._build_fastmcp("main") + register_main_tools(mcp, mcp_proxy._MainProxyImplementation()) + + assert {tool.name for tool in mcp._tool_manager.list_tools()} == { + "execute_task", + "search_skills", + "fix_skill", + "upload_skill", + } + + +def test_compute_daemon_identity_normalizes_repo_workspace(monkeypatch) -> None: + repo_root = Path(__file__).resolve().parents[1] + nested_path = repo_root / "openspace" + + monkeypatch.setattr( + shared_mcp_runtime, + "build_llm_kwargs", + lambda model: ("resolved-model", {"api_base": "http://unit.test/v1"}), + ) + monkeypatch.setattr(shared_mcp_runtime, "build_grounding_config_path", lambda: None) + monkeypatch.setattr( + shared_mcp_runtime, + "get_agent_config", + lambda name: {"backend_scope": ["mcp", "shell"]}, + ) + monkeypatch.delenv("OPENSPACE_BACKEND_SCOPE", raising=False) + monkeypatch.delenv("OPENSPACE_HOST_SKILL_DIRS", raising=False) + + monkeypatch.setenv("OPENSPACE_WORKSPACE", str(repo_root)) + root_identity = shared_mcp_runtime.compute_daemon_identity("main") + + monkeypatch.setenv("OPENSPACE_WORKSPACE", str(nested_path)) + nested_identity = shared_mcp_runtime.compute_daemon_identity("main") + + assert root_identity.workspace == str(repo_root.resolve()) + assert nested_identity.workspace == str(repo_root.resolve()) + assert root_identity.instance_key == nested_identity.instance_key + + +def test_compute_daemon_identity_changes_when_skill_dirs_change(monkeypatch, tmp_path) -> None: + 
monkeypatch.setattr( + shared_mcp_runtime, + "build_llm_kwargs", + lambda model: ("resolved-model", {"api_base": "http://unit.test/v1"}), + ) + monkeypatch.setattr(shared_mcp_runtime, "build_grounding_config_path", lambda: None) + monkeypatch.setattr( + shared_mcp_runtime, + "get_agent_config", + lambda name: {"backend_scope": ["shell", "mcp"]}, + ) + monkeypatch.setenv("OPENSPACE_WORKSPACE", str(tmp_path)) + monkeypatch.delenv("OPENSPACE_BACKEND_SCOPE", raising=False) + + first = tmp_path / "skills-a" + second = tmp_path / "skills-b" + first.mkdir() + second.mkdir() + + monkeypatch.setenv("OPENSPACE_HOST_SKILL_DIRS", str(first)) + first_identity = shared_mcp_runtime.compute_daemon_identity("main") + + monkeypatch.setenv("OPENSPACE_HOST_SKILL_DIRS", f"{first},{second}") + second_identity = shared_mcp_runtime.compute_daemon_identity("main") + + assert first_identity.host_skill_dirs == (str(first.resolve()),) + assert second_identity.host_skill_dirs == ( + str(first.resolve()), + str(second.resolve()), + ) + assert first_identity.instance_key != second_identity.instance_key + + +async def _ready_probe(record): + return True + + +def test_ensure_daemon_marks_main_ready_but_not_warmed(monkeypatch, tmp_path) -> None: + identity = shared_mcp_runtime.MCPDaemonIdentity( + server_kind="main", + workspace=str(tmp_path), + resolved_model="model", + llm_kwargs_fingerprint="llm", + backend_scope=("shell",), + host_skill_dirs=(str(tmp_path),), + grounding_config_fingerprint="cfg", + instance_key="main-key", + state_dir=str(tmp_path), + ) + + monkeypatch.setattr(shared_mcp_runtime, "compute_daemon_identity", lambda kind: identity) + monkeypatch.setattr(shared_mcp_runtime, "_pick_free_port", lambda: 12345) + monkeypatch.setattr(shared_mcp_runtime, "_spawn_daemon", lambda ident, port: shared_mcp_runtime.MCPDaemonRecord( + server_kind="main", + instance_key=ident.instance_key, + pid=4321, + port=port, + workspace=ident.workspace, + resolved_model=ident.resolved_model, + 
llm_kwargs_fingerprint=ident.llm_kwargs_fingerprint, + backend_scope=list(ident.backend_scope), + host_skill_dirs=list(ident.host_skill_dirs), + grounding_config_fingerprint=ident.grounding_config_fingerprint, + started_at=1.0, + log_path=str(identity.log_path), + )) + monkeypatch.setattr(shared_mcp_runtime, "_wait_until_ready", _ready_probe) + monkeypatch.setattr(shared_mcp_runtime, "_pid_exists", lambda pid: True) + + record = asyncio.run(shared_mcp_runtime.ensure_daemon("main")) + + assert record.ready is True + assert record.warmed is False + assert record.ready_at is not None + assert record.warmed_at is None + + +def test_update_current_daemon_status_marks_warmed(monkeypatch, tmp_path) -> None: + metadata_path = tmp_path / "main-key.json" + lock_path = tmp_path / "main-key.lock" + record = shared_mcp_runtime.MCPDaemonRecord( + server_kind="main", + instance_key="key", + pid=4321, + port=12345, + workspace=str(tmp_path), + resolved_model="model", + llm_kwargs_fingerprint="llm", + backend_scope=["shell"], + host_skill_dirs=[str(tmp_path)], + grounding_config_fingerprint="cfg", + started_at=1.0, + log_path=str(tmp_path / "main-key.log"), + ready=True, + warmed=False, + ready_at=2.0, + ) + shared_mcp_runtime._write_record(metadata_path, record) + lock_path.touch() + + monkeypatch.setenv("OPENSPACE_MCP_INSTANCE_KEY", "key") + monkeypatch.setenv("OPENSPACE_MCP_DAEMON_STATE_DIR", str(tmp_path)) + + updated = shared_mcp_runtime.update_current_daemon_status( + "main", + warmed=True, + warmup_error=None, + ) + + assert updated is not None + assert updated.ready is True + assert updated.warmed is True + assert updated.warmed_at is not None diff --git a/tests/test_mcp_stdio.py b/tests/test_mcp_stdio.py new file mode 100644 index 0000000..fecd473 --- /dev/null +++ b/tests/test_mcp_stdio.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +import io +from pathlib import Path + +from openspace import mcp_stdio + + +class _FakeStream(io.StringIO): + def __init__(self, 
*, tty: bool) -> None: + super().__init__() + self._tty = tty + + def isatty(self) -> bool: + return self._tty + + +def _reset_capture() -> None: + handle = mcp_stdio._STDERR_CAPTURE_HANDLE + if handle is not None: + handle.close() + mcp_stdio._STDERR_CAPTURE_HANDLE = None + + +def test_redirects_stderr_for_non_interactive_sessions(monkeypatch, tmp_path: Path) -> None: + _reset_capture() + monkeypatch.setattr("sys.stderr", _FakeStream(tty=False)) + + handle = mcp_stdio.maybe_redirect_stderr_to_file(tmp_path, "stderr.log") + + assert handle is not None + assert Path(handle.name) == tmp_path / "stderr.log" + assert Path(handle.name).exists() + assert handle is mcp_stdio._STDERR_CAPTURE_HANDLE + + _reset_capture() + + +def test_keeps_stderr_for_interactive_sessions(monkeypatch, tmp_path: Path) -> None: + _reset_capture() + original = _FakeStream(tty=True) + monkeypatch.setattr("sys.stderr", original) + + handle = mcp_stdio.maybe_redirect_stderr_to_file(tmp_path, "stderr.log") + + assert handle is None + assert mcp_stdio._STDERR_CAPTURE_HANDLE is None + assert original is not None diff --git a/tests/test_shared_mcp_runtime_metadata.py b/tests/test_shared_mcp_runtime_metadata.py new file mode 100644 index 0000000..0e0405d --- /dev/null +++ b/tests/test_shared_mcp_runtime_metadata.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +import threading +import time +from pathlib import Path + +from openspace import shared_mcp_runtime + + +def _build_identity(state_dir: Path) -> shared_mcp_runtime.MCPDaemonIdentity: + return shared_mcp_runtime.MCPDaemonIdentity( + server_kind="main", + workspace="/Users/admin/PycharmProjects/openspace", + resolved_model="unit-model", + llm_kwargs_fingerprint="llm-fingerprint", + backend_scope=("shell", "mcp"), + host_skill_dirs=("/tmp/unit-skills",), + grounding_config_fingerprint="grounding-fingerprint", + instance_key="unit-instance-key", + state_dir=str(state_dir), + ) + + +def _build_record(identity: 
shared_mcp_runtime.MCPDaemonIdentity) -> shared_mcp_runtime.MCPDaemonRecord: + return shared_mcp_runtime.MCPDaemonRecord( + server_kind=identity.server_kind, + instance_key=identity.instance_key, + pid=4242, + port=56789, + workspace=identity.workspace, + resolved_model=identity.resolved_model, + llm_kwargs_fingerprint=identity.llm_kwargs_fingerprint, + backend_scope=list(identity.backend_scope), + host_skill_dirs=list(identity.host_skill_dirs), + grounding_config_fingerprint=identity.grounding_config_fingerprint, + started_at=100.0, + log_path=str(Path(identity.state_dir) / "main-unit-instance-key.log"), + ready=False, + warmed=False, + ) + + +def test_daemon_metadata_round_trip_includes_ready_and_warmed(tmp_path) -> None: + identity = _build_identity(tmp_path) + record = _build_record(identity) + metadata_path = identity.metadata_path + metadata_path.parent.mkdir(parents=True, exist_ok=True) + shared_mcp_runtime._write_record(metadata_path, record) + + assert metadata_path.is_file() + + initial = shared_mcp_runtime._read_record(metadata_path) + assert initial is not None + assert initial.ready is False + assert initial.warmed is False + + assert initial.server_kind == "main" + assert initial.instance_key == identity.instance_key + + +def test_update_current_daemon_status_marks_ready_then_warmed_for_main_daemon(monkeypatch, tmp_path) -> None: + identity = _build_identity(tmp_path) + metadata_path = identity.metadata_path + metadata_path.parent.mkdir(parents=True, exist_ok=True) + shared_mcp_runtime._write_record(metadata_path, _build_record(identity)) + + monkeypatch.setenv("OPENSPACE_MCP_INSTANCE_KEY", identity.instance_key) + monkeypatch.setenv("OPENSPACE_MCP_DAEMON_STATE_DIR", identity.state_dir) + + monkeypatch.setattr(shared_mcp_runtime.time, "time", lambda: 101.0) + ready_record = shared_mcp_runtime.update_current_daemon_status("main", ready=True) + assert ready_record is not None + assert ready_record.ready is True + assert ready_record.warmed is False + 
assert ready_record.ready_at == 101.0 + assert ready_record.warmed_at is None + + monkeypatch.setattr(shared_mcp_runtime.time, "time", lambda: 107.5) + warmed_record = shared_mcp_runtime.update_current_daemon_status("main", warmed=True) + assert warmed_record is not None + assert warmed_record.ready is True + assert warmed_record.warmed is True + assert warmed_record.ready_at == 101.0 + assert warmed_record.warmed_at == 107.5 + + reloaded = shared_mcp_runtime._read_record(metadata_path) + assert reloaded is not None + assert reloaded.ready is True + assert reloaded.warmed is True + assert reloaded.ready_at == 101.0 + assert reloaded.warmed_at == 107.5 + + +def test_spawn_daemon_exports_metadata_env_for_background_updates(monkeypatch, tmp_path) -> None: + identity = _build_identity(tmp_path) + captured: dict[str, object] = {} + + class _FakeProcess: + def __init__(self, argv, **kwargs): + captured["argv"] = argv + captured["env"] = kwargs["env"] + self.pid = 9898 + + monkeypatch.setattr(shared_mcp_runtime.subprocess, "Popen", _FakeProcess) + + record = shared_mcp_runtime._spawn_daemon(identity, 45678) + + env = captured["env"] + assert isinstance(env, dict) + assert env["OPENSPACE_MCP_DAEMON"] == "1" + assert env["OPENSPACE_MCP_INSTANCE_KEY"] == identity.instance_key + assert env["OPENSPACE_MCP_DAEMON_STATE_DIR"] == identity.state_dir + assert record.instance_key == identity.instance_key + assert record.port == 45678 + + +def test_update_current_daemon_status_timestamps_after_lock_wait(monkeypatch, tmp_path) -> None: + identity = _build_identity(tmp_path) + metadata_path = identity.metadata_path + metadata_path.parent.mkdir(parents=True, exist_ok=True) + shared_mcp_runtime._write_record(metadata_path, _build_record(identity)) + + monkeypatch.setenv("OPENSPACE_MCP_INSTANCE_KEY", identity.instance_key) + monkeypatch.setenv("OPENSPACE_MCP_DAEMON_STATE_DIR", identity.state_dir) + + current_time = {"value": 101.0} + monkeypatch.setattr(shared_mcp_runtime.time, "time", 
lambda: current_time["value"]) + + result_holder: dict[str, shared_mcp_runtime.MCPDaemonRecord | None] = {} + + with shared_mcp_runtime._FileLock(identity.lock_path): + worker = threading.Thread( + target=lambda: result_holder.setdefault( + "record", + shared_mcp_runtime.update_current_daemon_status("main", warmed=True), + ), + daemon=True, + ) + worker.start() + time.sleep(0.1) + current_time["value"] = 107.5 + + worker.join(timeout=2.0) + + updated = result_holder.get("record") + assert updated is not None + assert updated.warmed is True + assert updated.warmed_at == 107.5 + + +def test_update_current_daemon_status_tracks_last_used_and_active_requests(monkeypatch, tmp_path) -> None: + identity = _build_identity(tmp_path) + metadata_path = identity.metadata_path + metadata_path.parent.mkdir(parents=True, exist_ok=True) + shared_mcp_runtime._write_record(metadata_path, _build_record(identity)) + + monkeypatch.setenv("OPENSPACE_MCP_INSTANCE_KEY", identity.instance_key) + monkeypatch.setenv("OPENSPACE_MCP_DAEMON_STATE_DIR", identity.state_dir) + + monkeypatch.setattr(shared_mcp_runtime.time, "time", lambda: 111.0) + started = shared_mcp_runtime.update_current_daemon_status( + "main", + touch=True, + active_delta=1, + ) + assert started is not None + assert started.active_requests == 1 + assert started.last_used_at == 111.0 + + monkeypatch.setattr(shared_mcp_runtime.time, "time", lambda: 114.5) + finished = shared_mcp_runtime.update_current_daemon_status( + "main", + touch=True, + active_delta=-1, + ) + assert finished is not None + assert finished.active_requests == 0 + assert finished.last_used_at == 114.5 + + +def test_reap_state_dir_records_limits_live_daemons_per_kind(monkeypatch, tmp_path) -> None: + records = [] + for index, last_used_at in enumerate((10.0, 20.0, 30.0), start=1): + identity = shared_mcp_runtime.MCPDaemonIdentity( + server_kind="main", + workspace=f"/tmp/workspace-{index}", + resolved_model="unit-model", + llm_kwargs_fingerprint=f"llm-{index}", 
+ backend_scope=("shell",), + host_skill_dirs=(f"/tmp/skills-{index}",), + grounding_config_fingerprint=f"cfg-{index}", + instance_key=f"key-{index}", + state_dir=str(tmp_path), + ) + record = shared_mcp_runtime.MCPDaemonRecord( + server_kind="main", + instance_key=identity.instance_key, + pid=4000 + index, + port=56000 + index, + workspace=identity.workspace, + resolved_model=identity.resolved_model, + llm_kwargs_fingerprint=identity.llm_kwargs_fingerprint, + backend_scope=list(identity.backend_scope), + host_skill_dirs=list(identity.host_skill_dirs), + grounding_config_fingerprint=identity.grounding_config_fingerprint, + started_at=1.0, + log_path=str(tmp_path / f"{identity.instance_key}.log"), + ready=True, + warmed=True, + last_used_at=last_used_at, + active_requests=0, + ) + shared_mcp_runtime._write_record(identity.metadata_path, record) + records.append(record) + + reaped: list[str] = [] + monkeypatch.setattr(shared_mcp_runtime, "_max_daemons_per_kind", lambda: 2) + monkeypatch.setattr(shared_mcp_runtime, "_pid_exists", lambda pid: True) + monkeypatch.setattr(shared_mcp_runtime, "_pid_matches_server", lambda record: True) + monkeypatch.setattr( + shared_mcp_runtime, + "_terminate_record_process", + lambda record: reaped.append(record.instance_key), + ) + + shared_mcp_runtime._reap_state_dir_records( + str(tmp_path), + "main", + keep_instance_key="key-3", + ) + + assert reaped == ["key-1"] + assert not (tmp_path / "main-key-1.json").exists() + assert (tmp_path / "main-key-2.json").exists() + assert (tmp_path / "main-key-3.json").exists()