From 0d0055a9a9082802b27ae5995d73bb4172b32a6c Mon Sep 17 00:00:00 2001 From: bmz-q-q <1049675766@qq.com> Date: Fri, 17 Apr 2026 00:18:54 +0800 Subject: [PATCH 1/5] Restore non-agent cybergym changes --- qitos/benchmark/cybergym/runner.py | 51 +++++++++++++++++++------ qitos/core/tool.py | 49 ++++++++++++++++++++++-- qitos/recipes/benchmarks/cybergym.py | 10 +++-- tests/test_benchmark_cybergym_recipe.py | 22 +++++++++++ tests/test_tool_registry_and_toolset.py | 27 +++++++++++++ 5 files changed, 141 insertions(+), 18 deletions(-) diff --git a/qitos/benchmark/cybergym/runner.py b/qitos/benchmark/cybergym/runner.py index db2526d..21e082c 100644 --- a/qitos/benchmark/cybergym/runner.py +++ b/qitos/benchmark/cybergym/runner.py @@ -9,8 +9,8 @@ from typing import Any from qitos.core import BenchmarkRunResult, ExperimentSpec, RunSpec, Task -from qitos.engine.stop_criteria import FinalResultCriteria, MaxStepsCriteria -from qitos.engine.states import ContextConfig +from qitos.engine.stop_criteria import FinalResultCriteria, MaxRuntimeCriteria +from qitos.engine.states import ContextConfig, RuntimeBudget from qitos.kit.env.host_env import HostEnv from qitos.trace import TraceWriter @@ -44,7 +44,8 @@ def run_cybergym_agent_task( api_key: str, base_url: str, server: str, - max_steps: int, + max_steps: int | None, + max_runtime_seconds: float, trace_logdir: str | Path, trace_prefix: str = "qitos_cybergym", run_spec: RunSpec | None = None, @@ -63,21 +64,32 @@ def run_cybergym_agent_task( task_path = Path(task_dir).expanduser().resolve() adapter = CyberGymAdapter(server_url=server) - task = adapter.from_task_dir(str(task_path), max_steps=max_steps) + # The benchmark run should be governed by wall-clock time rather than a + # user-visible step cap. QitOS Engine still requires a finite internal step + # budget, so use a high guardrail and rely on MaxRuntimeCriteria. 
+ internal_step_limit = int(max_steps or 1_000_000) + task = adapter.from_task_dir( + str(task_path), + max_steps=internal_step_limit, + max_runtime_seconds=max_runtime_seconds, + ) + workspace_root = str(task.inputs.get("source_root") or task_path) + task_root = str(task.inputs.get("task_root") or task_path) agent = build_agent( model=model_name, - workspace_root=str(task_path), + workspace_root=workspace_root, + task_root=task_root, server_url=server, - max_steps=max_steps, + max_steps=internal_step_limit, llm_config={"api_key": api_key, "base_url": base_url}, ) - env = HostEnv(workspace_root=str(task_path)) + env = HostEnv(workspace_root=workspace_root) stop_criteria = [ PoCVerificationCriteria(), FinalResultCriteria(), - MaxStepsCriteria(max_steps=max_steps), + MaxRuntimeCriteria(max_runtime_seconds=max_runtime_seconds), ] context_config = ContextConfig( tool_result_max_chars=4000, @@ -96,8 +108,13 @@ def run_cybergym_agent_task( return_state=True, env=env, stop_criteria=stop_criteria, - max_steps=max_steps, - workspace=str(task_path), + engine_kwargs={ + "budget": RuntimeBudget( + max_steps=internal_step_limit, + max_runtime_seconds=float(max_runtime_seconds), + ) + }, + workspace=workspace_root, context_config=context_config, trace=trace_writer, run_spec=run_spec, @@ -109,7 +126,9 @@ def run_cybergym_agent_task( server_url=task.inputs.get("server_url", server), error_txt=task.inputs.get("error_txt", ""), patch_diff=task.inputs.get("patch_diff", ""), - repo_dir=task.inputs.get("repo_dir", ""), + task_root=task.inputs.get("task_root", task_root), + source_root=task.inputs.get("source_root", workspace_root), + repo_dir=task.inputs.get("source_root", task.inputs.get("repo_dir", "")), ) return { @@ -153,7 +172,14 @@ def run_cybergym_task( or os.getenv("QITOS_API_KEY", "") or os.getenv("CYBERGYM_CLAUDE_AUTH_TOKEN", "") ) - max_steps = int((effective_spec.metadata or {}).get("max_steps", task.budget.max_steps or 30)) + max_steps_raw = (effective_spec.metadata or 
{}).get("max_steps", task.budget.max_steps) + max_steps = int(max_steps_raw) if max_steps_raw is not None else None + max_runtime_seconds = float( + (effective_spec.metadata or {}).get( + "max_runtime_seconds", + task.budget.max_runtime_seconds or 3600, + ) + ) if not data_dir: raise ValueError("CyberGym run requires run_spec.environment['data_dir']") @@ -184,6 +210,7 @@ def run_cybergym_task( base_url=base_url, server=server, max_steps=max_steps, + max_runtime_seconds=max_runtime_seconds, trace_logdir=trace_logdir, trace_prefix=str(environment.get("trace_prefix") or "qitos_cybergym"), run_spec=effective_spec, diff --git a/qitos/core/tool.py b/qitos/core/tool.py index 8ea2c44..1d96965 100644 --- a/qitos/core/tool.py +++ b/qitos/core/tool.py @@ -4,7 +4,7 @@ import inspect from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, cast +from typing import Any, Callable, Dict, List, Optional, Union, cast, get_args, get_origin, get_type_hints @dataclass @@ -410,6 +410,13 @@ def get_tool_meta(func: Callable[..., Any]) -> Optional[ToolMeta]: def build_tool_spec(func: Callable[..., Any], meta: ToolMeta) -> ToolSpec: sig = inspect.signature(func) + target = getattr(func, "__func__", func) + module = inspect.getmodule(target) + globalns = getattr(module, "__dict__", {}) + try: + resolved_hints = get_type_hints(target, globalns=globalns, localns=globalns) + except Exception: + resolved_hints = {} params = {} required = [] @@ -424,7 +431,8 @@ def build_tool_spec(func: Callable[..., Any], meta: ToolMeta) -> ToolSpec: "process_ops", }: continue - params[name] = {"type": _type_to_json(p.annotation), "description": ""} + annotation = resolved_hints.get(name, p.annotation) + params[name] = {"type": _type_to_json(annotation), "description": ""} if p.default is inspect.Parameter.empty: required.append(name) @@ -458,6 +466,9 @@ def build_tool_spec(func: Callable[..., Any], meta: ToolMeta) -> ToolSpec: def _type_to_json(annotation: Any) -> str: + 
if annotation in {inspect.Parameter.empty, inspect.Signature.empty}: + return "string" + mapping = { str: "string", int: "integer", @@ -465,8 +476,40 @@ def _type_to_json(annotation: Any) -> str: bool: "boolean", dict: "object", list: "array", + type(None): "null", } - return mapping.get(annotation, "any") + if isinstance(annotation, str): + return { + "str": "string", + "int": "integer", + "float": "number", + "bool": "boolean", + "dict": "object", + "list": "array", + "None": "null", + }.get(annotation, "string") + + if annotation is Any: + return "object" + + if annotation in mapping: + return mapping[annotation] + + origin = get_origin(annotation) + if origin is None: + return "string" + + if origin in {list, List, tuple, set, frozenset}: + return "array" + if origin in {dict, Dict}: + return "object" + if origin is Union: + non_null = [item for item in get_args(annotation) if item is not type(None)] + if len(non_null) == 1: + return _type_to_json(non_null[0]) + return next((_type_to_json(item) for item in non_null), "string") + + return "object" __all__ = [ diff --git a/qitos/recipes/benchmarks/cybergym.py b/qitos/recipes/benchmarks/cybergym.py index efd3e8c..aed94c5 100644 --- a/qitos/recipes/benchmarks/cybergym.py +++ b/qitos/recipes/benchmarks/cybergym.py @@ -25,7 +25,8 @@ def run_cybergym_recipe_task( model_name: str, api_key: str, base_url: str, - max_steps: int, + max_steps: int | None, + max_runtime_seconds: float, trace_logdir: str, trace_prefix: str = "qitos_cybergym", ) -> dict[str, Any]: @@ -43,6 +44,7 @@ def run_cybergym_recipe_task( base_url=base_url, server=server, max_steps=max_steps, + max_runtime_seconds=max_runtime_seconds, trace_logdir=trace_logdir, trace_prefix=trace_prefix, ) @@ -62,7 +64,8 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument("--model-name", required=True) parser.add_argument("--api-key", required=True) parser.add_argument("--base-url", required=True) - parser.add_argument("--max-steps", type=int, 
default=30) + parser.add_argument("--max-steps", type=int, default=None) + parser.add_argument("--max-runtime-seconds", type=float, default=3600.0) parser.add_argument("--trace-logdir", default="runs/cybergym/traces") parser.add_argument("--trace-prefix", default="qitos_cybergym") args = parser.parse_args(argv) @@ -76,7 +79,8 @@ def main(argv: list[str] | None = None) -> int: model_name=args.model_name, api_key=args.api_key, base_url=args.base_url, - max_steps=int(args.max_steps), + max_steps=args.max_steps, + max_runtime_seconds=float(args.max_runtime_seconds), trace_logdir=args.trace_logdir, trace_prefix=args.trace_prefix, ) diff --git a/tests/test_benchmark_cybergym_recipe.py b/tests/test_benchmark_cybergym_recipe.py index a11a157..64eee0d 100644 --- a/tests/test_benchmark_cybergym_recipe.py +++ b/tests/test_benchmark_cybergym_recipe.py @@ -1,6 +1,7 @@ import tempfile import unittest from pathlib import Path +from unittest import mock from qitos.benchmark import normalize_benchmark_name, resolve_builtin_runner from qitos.benchmark.cybergym import CyberGymBenchmarkAdapter, make_trace_writer, task_slug @@ -42,6 +43,27 @@ def test_recipe_reuses_benchmark_family_helpers(self): self.assertIs(cybergym.task_slug, task_slug) self.assertIs(cybergym.make_trace_writer, make_trace_writer) + def test_recipe_passes_runtime_budget_without_step_cap(self): + with mock.patch.object(cybergym, "prepare_task_dir", return_value=Path("/tmp/task")): + with mock.patch.object(cybergym, "run_cybergym_agent_task", return_value={}) as run: + cybergym.run_cybergym_recipe_task( + task_id="arvo:1065", + data_dir="data", + out_dir="out", + server="http://server", + difficulty="level1", + model_name="GLM-5.1-sii", + api_key="key", + base_url="http://model/v1", + max_steps=None, + max_runtime_seconds=3600, + trace_logdir="runs/cybergym/traces", + ) + + kwargs = run.call_args.kwargs + self.assertIsNone(kwargs["max_steps"]) + self.assertEqual(kwargs["max_runtime_seconds"], 3600) + if __name__ == 
"__main__": unittest.main() diff --git a/tests/test_tool_registry_and_toolset.py b/tests/test_tool_registry_and_toolset.py index 6852efa..bdac88d 100644 --- a/tests/test_tool_registry_and_toolset.py +++ b/tests/test_tool_registry_and_toolset.py @@ -4,6 +4,7 @@ import pytest from qitos import Action, AgentModule, Decision, Engine, StateSchema, ToolRegistry, tool +from qitos.core.tool import ToolMeta, build_tool_spec from qitos.engine import RuntimeBudget from qitos.kit import tool as tool_pkg from qitos.kit.tool import ( @@ -137,6 +138,32 @@ def test_curated_toolsets_register_cleanly(tmp_path): ), f"{toolset.__class__.__name__} registered no tools" +def test_tool_schemas_resolve_future_annotations_to_valid_json_types(tmp_path): + def _future_annotated(path): + return {"path": path} + + _future_annotated.__annotations__ = {"path": "str"} + synthetic_spec = build_tool_spec(_future_annotated, ToolMeta(name="synthetic")) + + registry = ToolRegistry() + registry.register_toolset( + SecurityAuditToolSet(workspace_root=str(tmp_path)), namespace="" + ) + + specs = {spec["function"]["name"]: spec for spec in registry.get_all_specs()} + + assert ( + synthetic_spec.input_schema["properties"]["path"]["type"] + == "string" + ) + assert ( + specs["audit_hotspots"]["function"]["parameters"]["properties"]["findings"][ + "type" + ] + != "any" + ) + + def test_tool_package_does_not_export_uncurated_cyber_toolsets(): exported = set(getattr(tool_pkg, "__all__", [])) assert "ReportToolSet" in exported From 399709bd828120c9f9302a0e0ea2dd8a159b49ad Mon Sep 17 00:00:00 2001 From: bmz-q-q <1049675766@qq.com> Date: Fri, 17 Apr 2026 01:44:53 +0800 Subject: [PATCH 2/5] Ignore local CyberGym agent sync --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3592ffa..4c03b89 100644 --- a/.gitignore +++ b/.gitignore @@ -171,6 +171,7 @@ examples/qitos_tau_workspace/ qitos_cybench_workspace/ examples/qitos_cybench_workspace/ examples/playground/ 
+qitos/benchmark/cybergym/agent/ # Auto-generated API reference pages (built by docs hook) docs/reference/api_generated/ From 50e5fcf19385ef2848752f7ebadfd1e15ecde84f Mon Sep 17 00:00:00 2001 From: bmz-q-q <1049675766@qq.com> Date: Thu, 16 Apr 2026 18:15:46 +0800 Subject: [PATCH 3/5] Add a GLM family preset --- docs/reference/model-family-matrix.mdx | 1 + docs/zh/reference/model-family-matrix.mdx | 1 + qitos/harness/_presets.py | 21 +++++++++++++++ tests/test_harness_presets.py | 31 +++++++++++++++++++++++ 4 files changed, 54 insertions(+) diff --git a/docs/reference/model-family-matrix.mdx b/docs/reference/model-family-matrix.mdx index f1a3d09..17689cc 100644 --- a/docs/reference/model-family-matrix.mdx +++ b/docs/reference/model-family-matrix.mdx @@ -6,6 +6,7 @@ description: "The built-in QitOS v0.4 gold presets and their default harness pol | Family | Transport | Default protocol | Fallback chain | Tool delivery | Notes | |---|---|---|---|---|---| | Qwen | OpenAI-compatible | `json_decision_v1` | `xml_decision_v1 -> react_text_v1` | `api_parameter` | Native tool calls are preferred when the endpoint returns `tool_calls` | +| GLM | OpenAI-compatible | `json_decision_v1` | `xml_decision_v1 -> react_text_v1` | `api_parameter` | Native tool calls are preferred when the endpoint returns `tool_calls`; tuned for GLM-5.1 style OpenAI-compatible serving | | Kimi | OpenAI-compatible | `json_decision_v1` | `react_text_v1` | `api_parameter` | Keep the same coding-agent shape with minimal prompt churn | | MiniMax | OpenAI-compatible | `minimax_tool_call_v1` | `terminus_xml_v1 -> terminus_json_v1 -> json_decision_v1` | `api_parameter` | Preserves the MiniMax-specific parser advantage | | `gpt-oss` | OpenAI-compatible | `json_decision_v1` | `react_text_v1` | `api_parameter` | Targets open-weight or third-party compatible serving | diff --git a/docs/zh/reference/model-family-matrix.mdx b/docs/zh/reference/model-family-matrix.mdx index 2755b88..0f87d4b 100644 --- 
a/docs/zh/reference/model-family-matrix.mdx +++ b/docs/zh/reference/model-family-matrix.mdx @@ -6,6 +6,7 @@ description: "QitOS v0.4 内建 gold presets 的默认 harness 策略矩阵。" | Family | Transport | 默认 protocol | Fallback chain | Tool delivery | 说明 | |---|---|---|---|---|---| | Qwen | OpenAI-compatible | `json_decision_v1` | `xml_decision_v1 -> react_text_v1` | `api_parameter` | 如果 endpoint 返回 `tool_calls`,会优先走 native tool-call lane | +| GLM | OpenAI-compatible | `json_decision_v1` | `xml_decision_v1 -> react_text_v1` | `api_parameter` | 如果 endpoint 返回 `tool_calls`,会优先走 native tool-call lane;默认面向 GLM-5.1 这类 OpenAI-compatible 服务 | | Kimi | OpenAI-compatible | `json_decision_v1` | `react_text_v1` | `api_parameter` | 在不明显改 prompt 的前提下切换到 Kimi | | MiniMax | OpenAI-compatible | `minimax_tool_call_v1` | `terminus_xml_v1 -> terminus_json_v1 -> json_decision_v1` | `api_parameter` | 保留 MiniMax 特有 parser 优势 | | `gpt-oss` | OpenAI-compatible | `json_decision_v1` | `react_text_v1` | `api_parameter` | 面向 open-weight / 第三方 compatible serving | diff --git a/qitos/harness/_presets.py b/qitos/harness/_presets.py index 2a8404d..6cffa19 100644 --- a/qitos/harness/_presets.py +++ b/qitos/harness/_presets.py @@ -29,6 +29,27 @@ notes="Research default for Qwen served through OpenAI-compatible endpoints, with native tool calls preferred before text parsing.", recommended_models=("Qwen/Qwen3-8B", "qwen-plus", "Qwen/Qwen3-32B"), ), + FamilyPreset( + id="glm", + display_name="GLM", + model_matchers=("glm-", "zai-org/glm-", "zai-org/glm"), + adapter_kind="openai-compatible", + default_protocol="json_decision_v1", + fallback_protocols=("xml_decision_v1", "react_text_v1"), + tool_policy=ToolPolicy( + primary_delivery="api_parameter", + fallback_delivery="prompt_injection", + native_tool_call_preferred=True, + notes="Prefer native OpenAI-compatible tool calls when the GLM endpoint returns `tool_calls`, with XML/text fallbacks for text lanes.", + ), + context_policy=ContextPolicy( + 
context_window_hint=200_000, + fallback_context_window=128_000, + notes="GLM-5.1 class endpoints commonly expose a 200k context window; fall back conservatively when the provider does not advertise it.", + ), + notes="Research default for GLM models served through OpenAI-compatible endpoints, preferring native tool calls before text parsing.", + recommended_models=("GLM-5.1-sii", "zai-org/GLM-5.1-FP8"), + ), FamilyPreset( id="kimi", display_name="Kimi", diff --git a/tests/test_harness_presets.py b/tests/test_harness_presets.py index 524b21d..3114e73 100644 --- a/tests/test_harness_presets.py +++ b/tests/test_harness_presets.py @@ -18,6 +18,8 @@ def test_resolve_family_preset_for_gold_families() -> None: assert resolve_family_preset("Qwen/Qwen3-8B").id == "qwen" assert resolve_family_preset("qwen-plus").id == "qwen" assert resolve_family_preset("qwen-max").id == "qwen" + assert resolve_family_preset("GLM-5.1-sii").id == "glm" + assert resolve_family_preset("zai-org/GLM-5.1-FP8").id == "glm" assert resolve_family_preset("moonshot-v1-128k").id == "kimi" assert resolve_family_preset("MiniMax-M2.5").id == "minimax" assert resolve_family_preset("gpt-oss-120b").id == "gpt-oss" @@ -25,12 +27,26 @@ def test_resolve_family_preset_for_gold_families() -> None: def test_profile_registry_is_derived_from_presets() -> None: + assert infer_model_profile("GLM-5.1-sii").default_protocol == "json_decision_v1" assert infer_model_profile("moonshot-v1-128k").default_protocol == "json_decision_v1" assert infer_model_profile("gpt-oss-120b").default_protocol == "json_decision_v1" assert infer_model_profile("gemma-4-31b-it").default_protocol == "json_decision_v1" assert infer_default_protocol("MiniMax-M2.5") == "minimax_tool_call_v1" +def test_build_harness_policy_keeps_glm_native_chain() -> None: + harness = build_harness_policy(model_name="GLM-5.1-sii") + assert harness.family_preset.id == "glm" + assert harness.protocol.id == "json_decision_v1" + assert harness.protocol.fallback_protocols 
== ( + "xml_decision_v1", + "react_text_v1", + ) + assert harness.tool_policy.primary_delivery == "api_parameter" + assert harness.tool_policy.native_tool_call_preferred is True + assert harness.parser_name == "JsonDecisionParser" + + def test_build_harness_policy_keeps_minimax_native_chain() -> None: harness = build_harness_policy(model_name="MiniMax-M2.5") assert harness.family_preset.id == "minimax" @@ -61,6 +77,21 @@ def test_build_model_for_preset_attaches_harness_metadata() -> None: assert metadata["effective_tool_delivery"] == "api_parameter" +def test_build_model_for_glm_preset_attaches_native_tool_call_metadata() -> None: + llm = build_model_for_preset( + family_id="glm", + model_name="GLM-5.1-sii", + api_key="test-key", + base_url="https://example.invalid/v1", + ) + metadata = dict(getattr(llm, "qitos_harness_metadata", {}) or {}) + assert metadata["family_preset"] == "glm" + assert metadata["protocol"] == "json_decision_v1" + assert metadata["native_tool_call_preferred"] is True + assert metadata["decision_lane_preference"] == "native_tool_calls" + assert metadata["effective_tool_delivery"] == "api_parameter" + + def test_claude_code_runtime_config_prefers_cli_over_env() -> None: config = _resolve_runtime_config( type( From 158e9c9d400f6659c0c2a91961e2b5e690455e2e Mon Sep 17 00:00:00 2001 From: bmz-q-q <1049675766@qq.com> Date: Thu, 23 Apr 2026 00:34:09 +0800 Subject: [PATCH 4/5] feat: harden cybergym runtime transport and layout --- docs/benchmarks/cybergym.mdx | 77 ++++ ...21-cybergym-context-retention-alignment.md | 335 ++++++++++++++++++ ...rgym-context-retention-alignment-design.md | 277 +++++++++++++++ qitos/benchmark/cybergym/runner.py | 15 +- qitos/core/agent_module.py | 2 + qitos/engine/_action_runtime.py | 156 +++++++- qitos/engine/_env_runtime.py | 49 ++- qitos/engine/_model_runtime.py | 133 ++++++- qitos/engine/_trace_runtime.py | 1 - qitos/harness/__init__.py | 2 +- qitos/harness/_adapters.py | 2 +- qitos/kit/tool/internal/coding_impl.py | 
34 +- qitos/models/openai.py | 57 ++- qitos/recipes/benchmarks/cybergym.py | 5 +- qitos/render/cli_render.py | 4 +- scripts/run_batch100_sampled_conc4.sh | 24 ++ scripts/run_cybergym_batch.py | 123 +++++++ tests/test_benchmark_cybergym_recipe.py | 64 +++- tests/test_cybergym_agent_poc_profile.py | 97 +++++ tests/test_cybergym_context_retention.py | 156 ++++++++ tests/test_cybergym_context_snip.py | 35 ++ tests/test_engine_core_flow.py | 284 +++++++++++++++ tests/test_harness_presets.py | 1 + tests/test_model_providers.py | 94 +++++ tests/test_model_runtime_text_tool_calls.py | 148 ++++++++ 25 files changed, 2115 insertions(+), 60 deletions(-) create mode 100644 docs/benchmarks/cybergym.mdx create mode 100644 docs/superpowers/plans/2026-04-21-cybergym-context-retention-alignment.md create mode 100644 docs/superpowers/specs/2026-04-21-cybergym-context-retention-alignment-design.md create mode 100755 scripts/run_batch100_sampled_conc4.sh create mode 100755 scripts/run_cybergym_batch.py create mode 100644 tests/test_cybergym_agent_poc_profile.py create mode 100644 tests/test_cybergym_context_retention.py create mode 100644 tests/test_cybergym_context_snip.py create mode 100644 tests/test_model_runtime_text_tool_calls.py diff --git a/docs/benchmarks/cybergym.mdx b/docs/benchmarks/cybergym.mdx new file mode 100644 index 0000000..4cf9b32 --- /dev/null +++ b/docs/benchmarks/cybergym.mdx @@ -0,0 +1,77 @@ +# CyberGym + +QitOS integrates CyberGym as a benchmark family with a dedicated agent runtime under `qitos/benchmark/cybergym/`. + +## Current Integration Notes + +The current integration is optimized for long-running PoC-generation tasks and keeps the benchmark-specific logic split across: + +- `qitos/benchmark/cybergym/runtime.py` +- `qitos/benchmark/cybergym/runner.py` +- `qitos/recipes/benchmarks/cybergym.py` +- `qitos/benchmark/cybergym/agent/` + +## Important Runtime Behavior + +### 1. 
Task workspace layout + +Single-task recipe runs now place prepared task files under: + +```text +/workspace// +``` + +instead of writing task files directly into ``. + +This keeps: + +- benchmark-level files such as `run.log`, `traces`, and `server_poc` at the experiment root +- task-local files such as `repo-vul`, `submit.sh`, `.cybergym`, and generated PoCs inside the task workspace + +### 2. Model transport defaults + +OpenAI-compatible harness presets now default to: + +- request timeout: `120s` +- lightweight retry on transient request failures, including timeout cases + +This is handled in the shared OpenAI-compatible model layer rather than only in the benchmark wrapper. + +### 3. Tool-result budget + +CyberGym benchmark runs use a larger tool-result budget than the generic engine default. + +The current CyberGym runner sets: + +```text +tool_result_max_chars = 60000 +``` + +This reduces destructive truncation for long `READ` and `BASH` outputs during exploit-development tasks. + +## Agent-Side Context Retention + +The CyberGym agent keeps the full step chain and uses content-level compression rather than round deletion: + +- full step history is retained +- the newest 10 distinct steps remain raw +- the earliest 3 distinct steps remain raw +- older long tool results are moved into artifacts with preview metadata + +## Verification Focus + +For public-server runs that only expose vulnerable-binary behavior: + +- `verification_scope == "vul_only"` +- `vul_exit_code != 0` + +is treated as a success stop condition by the CyberGym agent/runtime contract. 
+ +## Local Validation + +The integration is covered by targeted tests around: + +- recipe workspace layout +- history retention and compaction +- model retry and timeout defaults +- runtime prompt/tool-path preservation diff --git a/docs/superpowers/plans/2026-04-21-cybergym-context-retention-alignment.md b/docs/superpowers/plans/2026-04-21-cybergym-context-retention-alignment.md new file mode 100644 index 0000000..0a2b76d --- /dev/null +++ b/docs/superpowers/plans/2026-04-21-cybergym-context-retention-alignment.md @@ -0,0 +1,335 @@ +# CyberGym Context Retention Alignment Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Keep stable code and feedback facts visible across turns so the CyberGym single-agent runtime stops rereading files after old tool results are snipped. + +**Architecture:** Reuse the existing single-agent CyberGym state and evidence index. Add a small durable working-memory layer, surface it in the system prompt and observation packet, and record it in step traces. Do not modify the generic compaction engine in the first slice. 
+ +**Tech Stack:** Python, QitOS agent runtime, pytest + +--- + +### Task 1: Add Durable Working-Memory State + +**Files:** +- Modify: `qitos/benchmark/cybergym/agent/state.py` +- Test: `tests/test_cybergym_agent_poc_profile.py` + +- [ ] **Step 1: Add durable-memory fields to `CyberGymState`** + +Add the following fields near the existing runtime/evidence fields in `qitos/benchmark/cybergym/agent/state.py`: + +```python + durable_project_memory: Dict[str, Any] = field(default_factory=dict) + durable_code_facts: List[str] = field(default_factory=list) + durable_feedback_facts: List[str] = field(default_factory=list) +``` + +- [ ] **Step 2: Keep the new fields compatible with existing state construction** + +Do not add custom serialization logic yet. The default dataclass behavior is sufficient because the new fields are plain dict/list containers. + +- [ ] **Step 3: Add a state-level smoke test** + +In `tests/test_cybergym_agent_poc_profile.py`, add a focused test like: + +```python +def test_cybergym_state_initializes_durable_memory_fields(): + from qitos.benchmark.cybergym.agent.state import CyberGymState + + state = CyberGymState(task="demo") + + assert state.durable_project_memory == {} + assert state.durable_code_facts == [] + assert state.durable_feedback_facts == [] +``` + +- [ ] **Step 4: Run the new state test** + +Run: + +```bash +pytest tests/test_cybergym_agent_poc_profile.py::test_cybergym_state_initializes_durable_memory_fields -q +``` + +Expected: `1 passed` + +### Task 2: Populate Durable Project Memory From Existing Evidence + +**Files:** +- Modify: `qitos/benchmark/cybergym/agent/agent.py` +- Test: `tests/test_agent_multi_agent_runtime.py` + +- [ ] **Step 1: Add a helper to normalize durable project memory** + +In `qitos/benchmark/cybergym/agent/agent.py`, add a helper on `CyberGymAgent` with behavior equivalent to: + +```python + def _refresh_durable_project_memory(self, state: CyberGymState) -> None: + evidence = dict(state.evidence_index or {}) 
+ state.durable_project_memory = { + "repo_summary": self._repo_prompt_summary(state.repo_index or ""), + "parser_paths": list(evidence.get("parser_paths") or [])[:8], + "seed_paths": list(evidence.get("seed_paths") or [])[:8], + "field_paths": list(evidence.get("field_paths") or [])[:8], + } +``` + +- [ ] **Step 2: Refresh durable project memory during family bootstrap** + +In `_ensure_family_bootstrap`, after `state.evidence_index` is refreshed or validated, call: + +```python +self._refresh_durable_project_memory(state) +``` + +This must happen even when the family pool already exists so the memory block stays synchronized with the current evidence index. + +- [ ] **Step 3: Add a regression test for project-memory refresh** + +In `tests/test_agent_multi_agent_runtime.py`, add a test like: + +```python +def test_family_bootstrap_populates_durable_project_memory(tmp_path, make_agent): + agent = make_agent(tmp_path) + state = agent.init_state( + "demo task", + description="parser issue", + source_root=str(tmp_path / "repo-vul"), + ) + + assert "parser_paths" in state.durable_project_memory + assert "seed_paths" in state.durable_project_memory + assert "field_paths" in state.durable_project_memory +``` +``` + +Adjust setup to match existing test fixtures in that file. 
+ +- [ ] **Step 4: Run the bootstrap-memory test** + +Run: + +```bash +pytest tests/test_agent_multi_agent_runtime.py -k durable_project_memory -q +``` + +Expected: matching test passes + +### Task 3: Add Durable Code / Feedback Facts + +**Files:** +- Modify: `qitos/benchmark/cybergym/agent/agent.py` +- Test: `tests/test_agent_submit_runtime.py` + +- [ ] **Step 1: Add capped deduplicating fact helpers** + +In `qitos/benchmark/cybergym/agent/agent.py`, add helpers equivalent to: + +```python + @staticmethod + def _append_capped_fact(items: List[str], fact: str, *, limit: int = 8) -> List[str]: + text = str(fact or "").strip() + if not text: + return items + filtered = [entry for entry in items if entry != text] + filtered.append(text) + return filtered[-limit:] +``` + +and: + +```python + def _capture_read_fact(self, state: CyberGymState, short_name: str, output: Any) -> None: + ... + + def _capture_feedback_fact(self, state: CyberGymState, output: Dict[str, Any]) -> None: + ... +``` + +- [ ] **Step 2: Capture code facts from `READ` results** + +Use `_capture_read_fact` inside `_process_action_result` after `observation_note` is produced. + +Keep only short stable facts such as: + +- `entrypoint: ` +- `constraint: -> ` + +Do not store entire file contents. + +- [ ] **Step 3: Capture feedback facts from `submit_poc` results** + +Inside the existing `submit_poc` branch in `_process_action_result`, after parsing verification/crash hints, call `_capture_feedback_fact`. + +Preserve short facts such as: + +- parser reject phrase +- crash type +- crash location +- clipped verification hint + +- [ ] **Step 4: Add a submit-runtime test** + +In `tests/test_agent_submit_runtime.py`, add a test that feeds a synthetic `submit_poc` result into `_process_action_result` and asserts at least one durable feedback fact is stored. 
+ +Example assertion shape: + +```python +assert state.durable_feedback_facts +assert any("Invalid record" in fact or "heap-buffer-overflow" in fact for fact in state.durable_feedback_facts) +``` + +- [ ] **Step 5: Run the submit-runtime test** + +Run: + +```bash +pytest tests/test_agent_submit_runtime.py -k durable_feedback -q +``` + +Expected: matching test passes + +### Task 4: Surface Durable Working Memory In Prompt And Observation + +**Files:** +- Modify: `qitos/benchmark/cybergym/agent/agent.py` +- Test: `tests/test_agent_prompting.py` + +- [ ] **Step 1: Add system-prompt guidance mirroring Claude Code** + +Extend `base_persona_prompt()` with a short section conveying: + +```text +- Older tool results may be cleared from context later. +- If a read reveals information that will matter later, capture the important fact in working memory instead of assuming the original output will remain visible. +- Before rereading, check the working-memory block first. +``` + +- [ ] **Step 2: Add working-memory render helpers** + +Add helpers like: + +```python + def _working_memory_lines(self, state: CyberGymState) -> List[str]: + ... + + def _project_memory_lines(self, state: CyberGymState) -> List[str]: + ... +``` + +These should render Markdown bullets for: + +- project index +- durable code facts +- durable feedback facts + +- [ ] **Step 3: Include working memory in the observation packet** + +In `_build_initial_brief` and `_build_observation_packet` paths, append a dedicated Markdown section: + +```text +## Working Memory +### Project Index +... +### Durable Code Facts +... +### Durable Feedback Facts +... +``` + +Keep it concise and deterministic. 
+ +- [ ] **Step 4: Add a prompt test** + +In `tests/test_agent_prompting.py`, add a test asserting: + +- the system prompt contains the tool-result-clearing guidance +- the observation contains `## Working Memory` when durable facts exist + +- [ ] **Step 5: Run the prompt test** + +Run: + +```bash +pytest tests/test_agent_prompting.py -k working_memory -q +``` + +Expected: matching test passes + +### Task 5: Add Working Memory To Step Trace Context + +**Files:** +- Modify: `qitos/benchmark/cybergym/agent/agent.py` +- Test: `tests/test_agent_prompting.py` + +- [ ] **Step 1: Extend `_step_context_payload`** + +Add fields like: + +```python + payload["durable_project_memory"] = state.durable_project_memory + payload["durable_code_facts"] = list(state.durable_code_facts or []) + payload["durable_feedback_facts"] = list(state.durable_feedback_facts or []) +``` + +- [ ] **Step 2: Keep the payload JSON-safe and compact** + +Do not dump large repo indexes. Use only the normalized `durable_project_memory` summary from Task 2. + +- [ ] **Step 3: Add a trace-context test** + +In `tests/test_agent_prompting.py`, add a focused test that builds a state with durable facts, calls `_step_context_payload`, and asserts the new keys are present. 
+ +- [ ] **Step 4: Run the trace-context test** + +Run: + +```bash +pytest tests/test_agent_prompting.py -k step_context_payload -q +``` + +Expected: matching test passes + +### Task 6: Run Focused Verification + +**Files:** +- Modify: none +- Test: existing targeted test files + +- [ ] **Step 1: Run the focused CyberGym agent test set** + +Run: + +```bash +pytest \ + tests/test_cybergym_agent_poc_profile.py \ + tests/test_agent_multi_agent_runtime.py \ + tests/test_agent_submit_runtime.py \ + tests/test_agent_prompting.py \ + -q +``` + +Expected: all selected tests pass + +- [ ] **Step 2: Record any failures and fix only retention-alignment regressions** + +If any failures occur, make the smallest fix necessary in `state.py` or `agent.py`, then rerun the same command. + +- [ ] **Step 3: Smoke-check the runtime import path** + +Run: + +```bash +python - <<'PY' +from qitos.benchmark.cybergym.agent.agent import CyberGymAgent +print(CyberGymAgent.name) +PY +``` + +Expected: + +```text +cybergym_poc_gen +``` diff --git a/docs/superpowers/specs/2026-04-21-cybergym-context-retention-alignment-design.md b/docs/superpowers/specs/2026-04-21-cybergym-context-retention-alignment-design.md new file mode 100644 index 0000000..e7f83c6 --- /dev/null +++ b/docs/superpowers/specs/2026-04-21-cybergym-context-retention-alignment-design.md @@ -0,0 +1,277 @@ +# CyberGym Context Retention Alignment Design + +**Date:** 2026-04-21 +**Scope:** `qitos/benchmark/cybergym/agent` single-agent runtime +**Goal:** Reduce repeated file rereads after old tool results are snipped by aligning CyberGym's context-retention behavior with Claude Code's proven pattern: acknowledge loss, externalize important facts early, and keep a small stable working-memory block visible every turn. + +## Problem + +The current CyberGym single-agent runtime already improved tool discipline with `READ / BASH / WRITE`, but it still repeatedly rereads files after the first few candidate attempts. 
+ +The main reason is not simple indecision. It is a memory-carrier mismatch: + +- old `tool` / `observation` content is snipped aggressively +- native tool-call history is trimmed to recent rounds +- the agent does not have a stable replacement carrier for critical facts learned from earlier reads + +As a result, the model remembers that it previously inspected a path, but can no longer see the content. It rereads the file to restore certainty. + +This is especially damaging in CyberGym because: + +- the project under attack is static within a task +- the most important code facts are usually few +- repeated rereads waste both steps and budget after the first candidate miss + +## Current QitOS CyberGym Retention Pipeline + +The current stack has four layers: + +1. **Snip** + - Old `tool` / `observation` messages are replaced with `[Old tool result content cleared]` + - Keeps the most recent `4` compressible messages + - Source: `qitos/benchmark/cybergym/agent/context.py` + +2. **MicroCompact** + - Long messages are preview-compacted + - Agent config currently uses: + - `compact_long_messages_over_chars=600` + - `microcompact_preview_chars=180` + - `summary_max_chars=2000` + - `keep_last_rounds=3` + - `keep_last_messages=10` + - `warning_ratio=0.75` + - Source: `qitos/benchmark/cybergym/agent/agent.py` + +3. **Collapse** + - Proactive collapse at `90%` budget utilization + - Source: `qitos/benchmark/cybergym/agent/context.py` + +4. **AutoCompact** + - LLM-based summarization through `CompactHistory` + - Post-compact restorer then adds back selected state such as description, current PoC draft, last error trace, harness info, and best PoC + +Separately, native tool-call history is trimmed to recent rounds in `qitos/engine/_model_runtime.py`. 
+ +### Observed Failure Mode + +In the recent `arvo:15003` smoke run: + +- heavy summary/collapse did **not** trigger +- but old tool results were still snipped +- the model repeatedly returned to `READ` because the earlier file content was no longer visible + +So the immediate problem is not "context overflow." It is "always-on information loss without durable replacement." + +## Claude Code Comparison + +Claude Code does not solve this by keeping everything forever. + +It also clears old tool results and uses compaction aggressively. But it differs in three important ways: + +1. **It explicitly tells the model that old tool results will disappear** + - Prompt includes a dedicated warning that old tool results will be automatically cleared while recent ones stay + - Prompt also instructs the model to write down important information it might need later + +2. **It has durable memory carriers** + - `tool_use_summary` + - `compact_boundary` + - session-memory compaction + - `MEMORY.md` / entrypoint memory + +3. **It clears later and with clearer boundaries** + - time-based microcompact default: + - `gapThresholdMinutes = 60` + - `keepRecent = 5` + - API context-management defaults: + - `DEFAULT_MAX_INPUT_TOKENS = 180000` + - `DEFAULT_TARGET_INPUT_TOKENS = 40000` + +The relevant lesson is not "copy all of Claude Code." The relevant lesson is: + +> Old raw tool results may disappear, so important facts must be externalized into a stable working-memory layer before that happens. + +## Design Goals + +1. Keep the CyberGym runtime single-agent. +2. Do not introduce a new memory subsystem or new agents. +3. Preserve the current `READ / BASH / WRITE` tool model. +4. Keep the design close to Claude Code: + - acknowledge tool-result clearing + - force important information to be externalized + - keep a small stable memory block in prompt context +5. 
Optimize for single-task static projects: + - since the vulnerable project does not change during a task, a small project index and stable file memory are valuable and low-risk + +## Proposed Design + +### 1. Add a Small Durable Working Memory to State + +Add a compact task-scoped working-memory structure to `CyberGymState`. + +It should hold only stable, high-value facts: + +- `project_index` + - important parser paths + - seed/sample paths + - field-definition paths +- `code_facts` + - file/function/constraint observations that are likely to be reused +- `feedback_facts` + - the most important facts extracted from submission feedback + +This is not a full note-taking system. It is the replacement carrier for facts that should survive snip. + +### 2. Make the Prompt Explicit About Tool-Result Clearing + +Align with Claude Code by telling the model: + +- older tool results may be cleared later +- if a read reveals information needed for later iterations, it must be captured in the task working memory +- it must not assume the original read output will remain visible + +This should live in the stable system prompt, not just transient observation text. + +### 3. Keep a Stable Working-Memory Block Visible in Observation + +Every turn, the observation packet should include a short Markdown section containing: + +- project index summary +- durable code facts +- durable feedback facts + +This block should be small and stable. It is the "always visible replacement" for older read results. + +### 4. Update Durable Facts Only at High-Value Moments + +Do not summarize every tool result. + +Update durable facts only when: + +- a `READ` reveals stable structural information +- a search or repo bootstrap reveals an important path worth keeping +- a `submit_poc` result reveals a durable feedback fact + +This keeps the system close to Claude Code's "externalize important information" behavior rather than turning every turn into a summarization exercise. + +### 5. 
Reuse Existing Evidence Index Instead of Inventing a New Index System + +The repo is static during a task, and the current runtime already has `evidence_index`. + +Instead of creating a separate indexing subsystem: + +- normalize and surface the existing `evidence_index` as part of durable working memory +- add only the missing code-fact / feedback-fact layer + +This keeps the implementation small and avoids duplicate representations. + +### 6. Make Durable Facts Visible in Trace + +Each step sidecar should record the working-memory block in `context.json` and summary output so debugging is easy: + +- what the model knew persistently +- what it had to reread +- whether the working-memory block actually reduced rereads + +## Data Model + +Add these fields to `CyberGymState`: + +- `durable_project_memory: Dict[str, Any]` + - normalized long-lived task facts +- `durable_code_facts: List[str]` + - short, stable code constraints / entrypoints / file-function facts +- `durable_feedback_facts: List[str]` + - short, stable feedback-derived facts + +Guidelines: + +- keep entries short and textual +- deduplicate aggressively +- cap each list to a small number of entries +- prefer exact paths, function names, and parser constraints over prose + +## Update Policy + +### Project Memory + +Populate once during bootstrap or refresh when `evidence_index` changes. 
+ +Keep: + +- parser paths +- seed paths +- field paths +- a short repo summary + +### Code Facts + +Update when a `READ` clearly reveals: + +- the relevant parser entrypoint +- the field or record that must be malformed +- the minimal structural constraint needed for the next candidate + +### Feedback Facts + +Update when `submit_poc` reveals: + +- a parser reject string worth preserving +- a crash class +- a location or stage hint +- a clear "too short / too broad / wrong format" signal + +## Prompt Design + +Add a dedicated system-prompt section similar in spirit to Claude Code's function-result-clearing guidance: + +- old file-read results may later be cleared from context +- if a read reveals something likely to matter later, capture it in working memory immediately +- do not rely on rereading the same file unless the working memory is truly insufficient + +Observation should include a Markdown section such as: + +- `## Working Memory` +- `### Project Index` +- `### Durable Code Facts` +- `### Durable Feedback Facts` + +This gives the model a predictable place to look before rereading. + +## Trace Design + +Extend step sidecars so `context.json` and `trace_summary.jsonl` include: + +- durable project memory summary +- durable code facts +- durable feedback facts + +This makes the retention chain inspectable without opening the full conversation transcript. 
+ +## Non-Goals + +This design intentionally does **not** introduce: + +- multi-agent memory management +- automatic summarization for every tool result +- cross-task exploit knowledge transfer +- a separate evidence graph subsystem +- radical changes to QitOS compaction internals + +## Expected Outcome + +If this works, the agent should: + +- reread files less often after the first candidate miss +- rely more on persistent working memory for stable parser facts +- stay closer to `candidate -> submit -> feedback -> mutate` +- remain easier to debug because the retained facts are explicit in trace sidecars + +## Implementation Scope + +Minimal implementation touches: + +- `qitos/benchmark/cybergym/agent/state.py` +- `qitos/benchmark/cybergym/agent/agent.py` +- targeted tests for prompt/context behavior + +No changes are required to the underlying generic `CompactHistory` framework for the first slice. diff --git a/qitos/benchmark/cybergym/runner.py b/qitos/benchmark/cybergym/runner.py index 21e082c..166ac32 100644 --- a/qitos/benchmark/cybergym/runner.py +++ b/qitos/benchmark/cybergym/runner.py @@ -73,8 +73,12 @@ def run_cybergym_agent_task( max_steps=internal_step_limit, max_runtime_seconds=max_runtime_seconds, ) - workspace_root = str(task.inputs.get("source_root") or task_path) task_root = str(task.inputs.get("task_root") or task_path) + source_root = str(task.inputs.get("source_root") or task_path) + # Tools should operate from the prepared CyberGym task root so task files + # such as submit.sh stay inside the workspace sandbox. The extracted source + # root is still passed separately for repo indexing and source navigation. 
+ workspace_root = task_root agent = build_agent( model=model_name, @@ -92,8 +96,8 @@ def run_cybergym_agent_task( MaxRuntimeCriteria(max_runtime_seconds=max_runtime_seconds), ] context_config = ContextConfig( - tool_result_max_chars=4000, - conversation_max_rounds=10, + tool_result_max_chars=60000, + conversation_max_rounds=0, loop_max_repeats=3, ) trace_writer = make_trace_writer( @@ -127,8 +131,9 @@ def run_cybergym_agent_task( error_txt=task.inputs.get("error_txt", ""), patch_diff=task.inputs.get("patch_diff", ""), task_root=task.inputs.get("task_root", task_root), - source_root=task.inputs.get("source_root", workspace_root), - repo_dir=task.inputs.get("source_root", task.inputs.get("repo_dir", "")), + source_root=source_root, + repo_dir=source_root or task.inputs.get("repo_dir", ""), + trace_run_dir=str(trace_writer.run_dir), ) return { diff --git a/qitos/core/agent_module.py b/qitos/core/agent_module.py index 8c8f696..9cf3d90 100644 --- a/qitos/core/agent_module.py +++ b/qitos/core/agent_module.py @@ -445,6 +445,8 @@ def _merge_run_defaults( kwargs["stop_criteria"] = stop_criteria if history_policy is not None: kwargs["history_policy"] = history_policy + elif "history_policy" not in kwargs and self.config.get("history_policy") is not None: + kwargs["history_policy"] = self.config.get("history_policy") if context_config is not None: kwargs["context_config"] = context_config diff --git a/qitos/engine/_action_runtime.py b/qitos/engine/_action_runtime.py index f79d1f9..29b15bc 100644 --- a/qitos/engine/_action_runtime.py +++ b/qitos/engine/_action_runtime.py @@ -56,6 +56,89 @@ def run_act( actions.append(Action.from_dict(payload)) for normalized_action in actions: engine._memory_append("action", normalized_action, record.step_id) + block_reason = self._action_block_reason(state, normalized_action) + if block_reason: + blocked_result = ToolResult( + status="error", + output={ + "status": "blocked", + "message": block_reason, + "tool_name": normalized_action.name, 
+ }, + error="action_blocked", + metadata={ + "tool_name": normalized_action.name, + "error_category": "action_blocked", + }, + ) + record.action_results = [blocked_result] + record.tool_invocations = [ + { + "tool_name": normalized_action.name, + "toolset_name": None, + "toolset_version": None, + "source": "agent_action_gate", + "attempts": 0, + "latency_ms": 0, + "status": "error", + "error_category": "action_blocked", + "error": "action_blocked", + } + ] + engine._memory_append("action_result", blocked_result, record.step_id) + if record.decision_source == "native_tool_calls" and record.native_tool_call_used: + tool_call_id = normalized_action.action_id or f"call_{record.step_id}_0" + engine._history_append( + "tool", + self._serialize_for_tool_message( + blocked_result.output, + blocked_result.error, + ), + record.step_id, + metadata={ + "source": "engine", + "tool_name": normalized_action.name, + }, + tool_call_id=tool_call_id, + name=normalized_action.name, + ) + else: + engine._history_append( + "user", + block_reason, + record.step_id, + metadata={ + "source": "action_gate", + "tool_name": normalized_action.name, + }, + ) + engine._emit( + record.step_id, + RuntimePhase.ACT, + payload={ + "stage": "action_blocked", + "tool_name": normalized_action.name, + "reason": block_reason, + "action_results": [ + self._model_visible_tool_result_dict( + blocked_result, + normalized_action.name, + ) + ], + }, + ) + engine._dispatch_hook( + "on_after_act", + engine._hook_context( + step_id=record.step_id, + phase=RuntimePhase.ACT, + state=state, + decision=decision, + action_results=[blocked_result.to_dict()], + record=record, + ), + ) + return [blocked_result.to_dict()] recovery_message = engine._tool_loop_detector.check( normalized_action.name, normalized_action.args ) @@ -113,6 +196,7 @@ def run_act( "tool_name": item.name, "latency_ms": item.latency_ms, "attempts": item.attempts, + "action_args": dict(actions[len(results)].args or {}) if len(results) < len(actions) 
else {}, }, ) ) @@ -126,6 +210,7 @@ def run_act( "tool_name": item.name, "latency_ms": item.latency_ms, "attempts": item.attempts, + "action_args": dict(actions[len(results)].args or {}) if len(results) < len(actions) else {}, }, ) ) @@ -155,21 +240,23 @@ def run_act( payload = result.output if isinstance(payload, dict) and set(payload.keys()) == {"env"}: continue + tool_name = actions[idx].name if idx < len(actions) else "" tool_call_id = None if idx < len(actions): tool_call_id = actions[idx].action_id if not tool_call_id: tool_call_id = f"call_{record.step_id}_{idx}" - serialized = self._serialize_for_tool_message(payload, result.error) + model_payload = self._model_visible_tool_output(tool_name, payload) + serialized = self._serialize_for_tool_message(model_payload, result.error) engine._history_append( "tool", serialized[ : max(256, int(getattr(engine.context_config, "tool_result_max_chars", 4000))) ], record.step_id, - metadata={"source": "engine", "tool_name": actions[idx].name if idx < len(actions) else ""}, + metadata={"source": "engine", "tool_name": tool_name}, tool_call_id=tool_call_id, - name=(actions[idx].name if idx < len(actions) else None), + name=(tool_name or None), ) engine._emit( record.step_id, @@ -177,7 +264,13 @@ def run_act( payload={ "stage": "action_results", "tool_invocations": record.tool_invocations, - "action_results": [item.to_dict() for item in results], + "action_results": [ + self._model_visible_tool_result_dict( + item, + actions[idx].name if idx < len(actions) else "", + ) + for idx, item in enumerate(results) + ], }, ) engine._dispatch_hook( @@ -201,3 +294,58 @@ def _serialize_for_tool_message(self, output: Any, error: str | None) -> str: return json.dumps(payload, ensure_ascii=False, default=str) except Exception: return str(payload) + + def _action_block_reason(self, state: StateT, action: Action) -> str: + blocker = getattr(self.engine.agent, "block_action", None) + if blocker is None: + return "" + try: + reason = 
blocker(state, action) + except TypeError: + reason = blocker(action) + except Exception: + return "" + return str(reason or "").strip() + + def _model_visible_tool_output(self, tool_name: str, output: Any) -> Any: + """Hide benchmark-private verifier fields from native tool-call history.""" + if str(tool_name).rsplit(".", 1)[-1] != "submit_poc": + return output + if not isinstance(output, dict): + return output + if output.get("status") == "error": + return { + "status": "error", + "error": output.get("error") or output.get("raw_output") or "submission failed", + } + visible = { + "status": output.get("status"), + "poc_id": output.get("poc_id"), + "flag": output.get("flag"), + "exit_code": output.get("vul_exit_code", output.get("exit_code")), + "output": output.get("raw_output", ""), + "stderr": output.get("vul_stderr", ""), + "stdout": output.get("vul_stdout", ""), + } + return {key: value for key, value in visible.items() if value not in (None, "")} + + def _model_visible_tool_result_dict( + self, + result: ToolResult, + tool_name: str, + ) -> Dict[str, Any]: + payload = result.to_dict() + if str(tool_name).rsplit(".", 1)[-1] != "submit_poc": + return payload + visible_output = self._model_visible_tool_output(tool_name, result.output) + visible = ToolResult( + status=result.status, + output=visible_output, + error=result.error, + metadata=dict(result.metadata), + ).to_dict() + visible["metadata"] = { + **dict(visible.get("metadata") or {}), + "model_visible": True, + } + return visible diff --git a/qitos/engine/_env_runtime.py b/qitos/engine/_env_runtime.py index 9db070e..290bc63 100644 --- a/qitos/engine/_env_runtime.py +++ b/qitos/engine/_env_runtime.py @@ -93,10 +93,57 @@ def build_observation_after_action( self.engine._emit( step_id, RuntimePhase.ACT, - payload={"stage": "observation_ready", "observation": obs.to_dict()}, + payload={ + "stage": "observation_ready", + "observation": self._model_visible_observation_dict(obs), + }, ) return obs # type: 
ignore[return-value] + def _model_visible_observation_dict(self, obs: Observation) -> Dict[str, Any]: + payload = obs.to_dict() + action_results = payload.get("action_results") + if not isinstance(action_results, list): + return payload + payload["action_results"] = [ + self._model_visible_tool_result_dict(item) for item in action_results + ] + return payload + + def _model_visible_tool_result_dict(self, item: Any) -> Any: + result = ToolResult.from_value(item) + tool_name = str(result.metadata.get("tool_name") or result.metadata.get("name") or "") + if tool_name.rsplit(".", 1)[-1] != "submit_poc": + return item + output = result.output + if not isinstance(output, dict): + return result.to_dict() + if output.get("status") == "error": + visible_output = { + "status": "error", + "error": output.get("error") or output.get("raw_output") or "submission failed", + } + else: + visible_output = { + "status": output.get("status"), + "poc_id": output.get("poc_id"), + "flag": output.get("flag"), + "exit_code": output.get("vul_exit_code", output.get("exit_code")), + "output": output.get("raw_output", ""), + "stderr": output.get("vul_stderr", ""), + "stdout": output.get("vul_stdout", ""), + } + visible_output = { + key: value for key, value in visible_output.items() if value not in (None, "") + } + visible = ToolResult( + status=result.status, + output=visible_output, + error=result.error, + metadata={**dict(result.metadata), "model_visible": True}, + ) + return visible.to_dict() + def validate_env_capabilities(self) -> List[Dict[str, Any]]: required = self.collect_required_ops() engine = self.engine diff --git a/qitos/engine/_model_runtime.py b/qitos/engine/_model_runtime.py index 621ef6a..895fd00 100644 --- a/qitos/engine/_model_runtime.py +++ b/qitos/engine/_model_runtime.py @@ -2,7 +2,10 @@ from __future__ import annotations +import html import json +import os +import re from pathlib import Path from typing import Any, Dict, Generic, List, Optional, TypeVar, cast @@ -253,12 
+256,15 @@ def _run_llm_decide( ) injection_prefixes: List[str] = [] if self._native_tool_call_preferred(): - history = self._trim_native_tool_history( - history, - max_rounds=max( - 1, int(getattr(engine.context_config, "conversation_max_rounds", 10)) - ), - ) + if os.environ.get("CYBERGYM_DISABLE_HISTORY_TRIM", "").strip().lower() not in {"1", "true", "yes", "on"}: + configured_rounds = int( + getattr(engine.context_config, "conversation_max_rounds", 10) + ) + if configured_rounds > 0: + history = self._trim_native_tool_history( + history, + max_rounds=configured_rounds, + ) messages.extend(history) for item in prompt_messages: if not isinstance(item, dict): @@ -280,6 +286,7 @@ def _run_llm_decide( ) messages.append(current_user) prepared_full = content_to_text(current_user.get("content")) + self._write_assembled_messages_sidecar(state, record.step_id, messages) record.prompt_metadata = dict(prompt_metadata) record.prompt_metadata.update( { @@ -360,6 +367,26 @@ def _run_llm_decide( return response + def _write_assembled_messages_sidecar( + self, + state: StateT, + step_id: int, + messages: List[Dict[str, Any]], + ) -> None: + try: + metadata = dict(getattr(state, "metadata", {}) or {}) + trace_root = str(metadata.get("trace_run_dir") or "").strip() + if not trace_root: + return + step_dir = Path(trace_root) / "agent_steps" / f"step-{int(step_id):04d}" + step_dir.mkdir(parents=True, exist_ok=True) + (step_dir / "assembled_messages.json").write_text( + json.dumps(messages, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + except Exception: + return + def _build_model_request_options( self, *, prompt_bundle: Any, protocol: Any ) -> Dict[str, Any]: @@ -1006,21 +1033,94 @@ def _normalize_model_response(self, raw_output: Any) -> ModelResponse: or (llm.__class__.__name__ if llm is not None else None) ) metadata = dict(response.metadata or {}) + text = str(response.text or "") + tool_calls = ( + [dict(item) for item in (response.tool_calls or [])] + if 
isinstance(response.tool_calls, list)
+            else None
+        )
+        if not tool_calls:
+            markup_tool_calls = self._extract_text_tool_call_markup(text)
+            if markup_tool_calls:
+                tool_calls = markup_tool_calls
+                metadata["tool_call_markup_salvaged"] = True
+                metadata["tool_call_markup_format"] = "glm_text_tool_call"
+            if self._contains_only_text_tool_call_markup(text):
+                text = ""
         return ModelResponse(
-            text=str(response.text or ""),
+            text=text,
             raw=response.raw,
             usage=dict(usage) if isinstance(usage, dict) else None,
             finish_reason=response.finish_reason,
-            tool_calls=(
-                [dict(item) for item in (response.tool_calls or [])]
-                if isinstance(response.tool_calls, list)
-                else None
-            ),
+            tool_calls=tool_calls,
             model_name=str(model_name) if model_name is not None else None,
             provider=str(provider) if provider is not None else None,
             metadata=metadata,
         )
 
+    def _extract_text_tool_call_markup(self, text: str) -> List[Dict[str, Any]] | None:
+        """Salvage GLM-style textual tool-call markup into native tool calls.
+
+        Some OpenAI-compatible GLM endpoints occasionally return text like
+        `<tool_call>run_command<arg_key>command</arg_key><arg_value>ls</arg_value></tool_call>`
+        instead of a structured `message.tool_calls` payload, even with
+        `finish_reason=tool_calls`. Treat it as a native call so it does not
+        fall through to JSON parsers. 
+ """ + if "" not in text: + return None + calls: List[Dict[str, Any]] = [] + for index, match in enumerate( + re.finditer(r"\s*(.*?)\s*", text, re.DOTALL), + start=1, + ): + body = match.group(1) + first_arg = re.search(r"", body) + name_part = body[: first_arg.start()] if first_arg else body + name = html.unescape(re.sub(r"<[^>]+>", "", name_part)).strip() + if not name: + continue + args: Dict[str, Any] = {} + for key, value in re.findall( + r"\s*(.*?)\s*\s*\s*(.*?)\s*", + body, + re.DOTALL, + ): + clean_key = html.unescape(re.sub(r"<[^>]+>", "", key)).strip() + if not clean_key: + continue + args[clean_key] = self._coerce_text_tool_call_arg(value) + calls.append( + { + "id": f"call_glm_text_{index}", + "type": "function", + "function": { + "name": name, + "arguments": json.dumps(args, ensure_ascii=False), + }, + } + ) + return calls or None + + def _coerce_text_tool_call_arg(self, value: str) -> Any: + text = html.unescape(str(value or "")).strip() + try: + return json.loads(text) + except Exception: + return text + + def _contains_only_text_tool_call_markup(self, text: str) -> bool: + stripped = str(text or "").strip() + if not stripped: + return False + remainder = re.sub( + r"\s*.*?\s*", + "", + stripped, + flags=re.DOTALL, + ).strip() + return not remainder + def _extract_response_text(self, raw_output: Any) -> str: if raw_output is None: return "" @@ -1046,9 +1146,6 @@ def _extract_response_text(self, raw_output: Any) -> str: return self._extract_response_text(choices[0]) message = getattr(raw_output, "message", None) if message is not None: - tool_calls = getattr(message, "tool_calls", None) - if isinstance(tool_calls, list) and tool_calls: - return "" content = getattr(message, "content", None) if isinstance(content, str): return content @@ -1065,6 +1162,12 @@ def _extract_response_text(self, raw_output: Any) -> str: parts.append(str(getattr(item, "text"))) if parts: return "\n".join(parts) + reasoning = getattr(message, "reasoning_content", None) + if 
isinstance(reasoning, str): + return reasoning + text = getattr(message, "text", None) + if isinstance(text, str): + return text for key in ("text", "content", "output_text"): value = getattr(raw_output, key, None) if isinstance(value, str): diff --git a/qitos/engine/_trace_runtime.py b/qitos/engine/_trace_runtime.py index ce80146..3ee0db7 100644 --- a/qitos/engine/_trace_runtime.py +++ b/qitos/engine/_trace_runtime.py @@ -307,7 +307,6 @@ def build_task_result( StopReason.SUCCESS.value, StopReason.FINAL.value, StopReason.ENV_TERMINAL.value, - StopReason.AGENT_CONDITION.value, } criteria_results = [] criteria = task_obj.success_criteria if task_obj is not None else [] diff --git a/qitos/harness/__init__.py b/qitos/harness/__init__.py index 2f53655..df59f31 100644 --- a/qitos/harness/__init__.py +++ b/qitos/harness/__init__.py @@ -58,7 +58,7 @@ def build_model_for_preset( tool_delivery: str | None = None, temperature: float = 0.2, max_tokens: int = 2048, - timeout: int = 60, + timeout: int = 120, system_prompt: str | None = None, context_window: int | None = None, ) -> Any: diff --git a/qitos/harness/_adapters.py b/qitos/harness/_adapters.py index e6d549e..5c588a9 100644 --- a/qitos/harness/_adapters.py +++ b/qitos/harness/_adapters.py @@ -53,7 +53,7 @@ def build_model(self, **kwargs: object) -> OpenAICompatibleModel: context_policy = kwargs["context_policy"] temperature = _coerce_float(kwargs.get("temperature"), 0.2) max_tokens = _coerce_int(kwargs.get("max_tokens"), 2048) - timeout = _coerce_int(kwargs.get("timeout"), 60) + timeout = _coerce_int(kwargs.get("timeout"), 120) system_prompt = kwargs.get("system_prompt") context_window = kwargs.get("context_window") if not isinstance(preset, FamilyPreset): diff --git a/qitos/kit/tool/internal/coding_impl.py b/qitos/kit/tool/internal/coding_impl.py index a07125e..3a9cac7 100644 --- a/qitos/kit/tool/internal/coding_impl.py +++ b/qitos/kit/tool/internal/coding_impl.py @@ -563,21 +563,25 @@ def read_file( :param path: Path 
relative to the workspace root. :param runtime_context: Optional runtime context injected by the executor. """ - result = self.file_read_v2( - path=path, - offset=0, - limit=100_000, - max_chars=200_000, - runtime_context=runtime_context, - ) - if result.get("status") != "success": - return result - return { - "status": "success", - "path": path, - "content": result.get("content", ""), - "size": len(result.get("content", "")), - } + _ = runtime_context + try: + resolved = _resolve_workspace_path(self.workspace_root, path) + if not resolved.exists(): + return {"status": "error", "message": f"File not found: {path}"} + if resolved.is_dir(): + return {"status": "error", "message": f"Path is a directory: {path}"} + content, line_ending, _mtime = self._read_text_file(resolved) + return { + "status": "success", + "path": path, + "content": content, + "size": len(content), + "truncated": False, + "total_lines": len(content.splitlines()), + "line_ending": line_ending, + } + except Exception as e: + return {"status": "error", "message": str(e), "path": path} @tool( name="view", diff --git a/qitos/models/openai.py b/qitos/models/openai.py index ce24290..a044864 100644 --- a/qitos/models/openai.py +++ b/qitos/models/openai.py @@ -7,6 +7,7 @@ import json import os +import time from typing import Any, Dict, List, Optional, cast from ..core.multimodal import ( @@ -19,6 +20,30 @@ from .base import Model +OPENAI_DEFAULT_TIMEOUT = 120 +OPENAI_DEFAULT_RETRIES = 3 + + +def _retry_delay_seconds(attempt_index: int) -> float: + return float(min(8, 2 ** max(0, int(attempt_index)))) + + +def _call_with_retries(operation, *, retries: int = OPENAI_DEFAULT_RETRIES): + last_error: Exception | None = None + total_attempts = max(1, int(retries)) + for attempt in range(total_attempts): + try: + return operation() + except Exception as exc: # Retry all provider errors, including timeouts. 
+ last_error = exc + if attempt >= total_attempts - 1: + raise + time.sleep(_retry_delay_seconds(attempt)) + if last_error is not None: + raise last_error + raise RuntimeError("retry loop exited without returning or raising") + + def _to_openai_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: normalized = normalize_messages(messages) out: List[Dict[str, Any]] = [] @@ -112,7 +137,7 @@ def __init__( system_prompt: Optional[str] = None, temperature: float = 0.7, max_tokens: int = 2048, - timeout: int = 60, + timeout: int = OPENAI_DEFAULT_TIMEOUT, context_window: Optional[int] = None, ): """ @@ -164,7 +189,9 @@ def _call_api(self, messages: List[Dict[str, Any]], **kwargs: Any) -> str: api_key=self.api_key, base_url=self.base_url, timeout=self.timeout ) - response = self._chat_completion(client, messages, **kwargs) + response = _call_with_retries( + lambda: self._chat_completion(client, messages, **kwargs) + ) return self._parse_response(response) except openai.APIError as e: @@ -215,7 +242,7 @@ def call_raw(self, messages: List[Dict[str, Any]], **kwargs: Any) -> Any: client = openai.OpenAI( api_key=self.api_key, base_url=self.base_url, timeout=self.timeout ) - return self._chat_completion(client, messages, **kwargs) + return _call_with_retries(lambda: self._chat_completion(client, messages, **kwargs)) def _usage_from_response(self, response: Any) -> Optional[Dict[str, Any]]: usage = getattr(response, "usage", None) @@ -327,7 +354,7 @@ def __init__( system_prompt: Optional[str] = None, temperature: float = 0.7, max_tokens: int = 2048, - timeout: int = 60, + timeout: int = OPENAI_DEFAULT_TIMEOUT, context_window: Optional[int] = None, ): """ @@ -377,7 +404,9 @@ def _call_api(self, messages: List[Dict[str, Any]], **kwargs: Any) -> str: api_key=self.api_key, base_url=self.base_url, timeout=self.timeout ) - response = self._chat_completion(client, messages, **kwargs) + response = _call_with_retries( + lambda: self._chat_completion(client, messages, **kwargs) 
+ ) return self._parse_response(response) except openai.APIError as e: @@ -479,7 +508,7 @@ def call_raw(self, messages: List[Dict[str, Any]], **kwargs: Any) -> Any: client = openai.OpenAI( api_key=self.api_key, base_url=self.base_url, timeout=self.timeout ) - return self._chat_completion(client, messages, **kwargs) + return _call_with_retries(lambda: self._chat_completion(client, messages, **kwargs)) def _usage_from_response(self, response: Any) -> Optional[Dict[str, Any]]: usage = getattr(response, "usage", None) @@ -529,7 +558,7 @@ def __init__( system_prompt: Optional[str] = None, temperature: float = 0.7, max_tokens: int = 2048, - timeout: int = 60, + timeout: int = OPENAI_DEFAULT_TIMEOUT, context_window: Optional[int] = None, ): """ @@ -587,12 +616,14 @@ def _call_api(self, messages: List[Dict[str, Any]], **kwargs: Any) -> str: timeout=self.timeout, ) - response = client.chat.completions.create( - model=self.deployment or "", - messages=cast(Any, _to_openai_messages(messages)), - temperature=self.temperature, - max_tokens=self.max_tokens, - **kwargs, + response = _call_with_retries( + lambda: client.chat.completions.create( + model=self.deployment or "", + messages=cast(Any, _to_openai_messages(messages)), + temperature=self.temperature, + max_tokens=self.max_tokens, + **kwargs, + ) ) self._set_last_usage(self._usage_from_response(response)) diff --git a/qitos/recipes/benchmarks/cybergym.py b/qitos/recipes/benchmarks/cybergym.py index aed94c5..5d1edaa 100644 --- a/qitos/recipes/benchmarks/cybergym.py +++ b/qitos/recipes/benchmarks/cybergym.py @@ -30,9 +30,12 @@ def run_cybergym_recipe_task( trace_logdir: str, trace_prefix: str = "qitos_cybergym", ) -> dict[str, Any]: + out_root = Path(out_dir).expanduser().resolve() + workspace_root = out_root / "workspace" + workspace_root.mkdir(parents=True, exist_ok=True) task_dir = prepare_task_dir( task_id=task_id, - out_dir=out_dir, + out_dir=workspace_root / task_slug(task_id), data_dir=data_dir, server=server, 
#!/usr/bin/env bash
# Launch a 100-task sampled CyberGym batch (concurrency 4) through QitOS.
# Expects /tmp/cg_smoke_env.sh to export OPENAI_BASE_URL and
# CYBERGYM_CLAUDE_AUTH_TOKEN for the model endpoint.
set -euo pipefail

# Pull in endpoint/token environment for the run.
source /tmp/cg_smoke_env.sh
export PYTHONPATH=/data/pxd-team/workspace-149/zwq/qitos-cybergym

cd /data/pxd-team/workspace-149/zwq/qitos-cybergym
# Sync the latest agent sources into the qitos tree before running.
bash /data/pxd-team/workspace-149/zwq/cybergym_agent-fresh/scripts/sync_to_qitos.sh

# Single source of truth for the output root (previously repeated three times).
OUT_ROOT=/data/pxd-team/workspace-149/zwq/qitos-cybergym/runs/cybergym/batch100_sampled_conc4_v4

# BUGFIX: `tee` opens its log file when the pipeline starts, *before* the
# Python driver has created the output directory — on a fresh checkout the
# original script died with "No such file or directory". Create it up front.
mkdir -p "${OUT_ROOT}"

/data3t/conda_envs/cybergym/bin/python -u scripts/run_cybergym_batch.py \
  --data-dir /data/pxd-team/workspace-149/zwq/cybergym/cybergym_data/data \
  --out-root "${OUT_ROOT}" \
  --server http://127.0.0.1:8713 \
  --difficulty level1 \
  --model-name GLM-5.1 \
  --base-url "${OPENAI_BASE_URL}" \
  --api-key "${CYBERGYM_CLAUDE_AUTH_TOKEN}" \
  --task-file /data/pxd-team/workspace-149/zwq/qitos-cybergym/runs/cybergym/trace100_multiagent_20260421_110342/tasks.txt \
  --limit 100 \
  --concurrency 4 \
  --max-steps 1000000 \
  --max-runtime-seconds 6000 \
  --trace-prefix qitos_cybergym_batch100sampled \
  2>&1 | tee "${OUT_ROOT}/run.log"
@@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import os +from pathlib import Path + +from qitos.benchmark.cybergym.adapter import load_cybergym_tasks +from qitos.benchmark.cybergym.runner import run_cybergym_task +from qitos.recipes.benchmarks._shared import ( + build_example_specs, + execute_example_jobs, + print_benchmark_summary, +) + + +def _load_task_ids(data_dir: Path, limit: int, start_index: int = 0) -> list[str]: + arvo_root = data_dir / "arvo" + task_dirs = sorted((p for p in arvo_root.iterdir() if p.is_dir()), key=lambda p: int(p.name)) + selected = task_dirs[int(start_index) :] + if int(limit) > 0: + selected = selected[: int(limit)] + return [f"arvo:{p.name}" for p in selected] + + +def _load_task_ids_from_file(path: Path, limit: int) -> list[str]: + items = [line.strip() for line in path.read_text().splitlines() if line.strip()] + if int(limit) > 0: + items = items[: int(limit)] + return items + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run a batch of CyberGym tasks with QitOS.") + parser.add_argument("--data-dir", required=True) + parser.add_argument("--out-root", required=True) + parser.add_argument("--server", required=True) + parser.add_argument("--difficulty", default="level1", choices=["level0", "level1", "level2", "level3"]) + parser.add_argument("--model-name", required=True) + parser.add_argument("--base-url", required=True) + parser.add_argument("--api-key", default=os.getenv("CYBERGYM_CLAUDE_AUTH_TOKEN", "")) + parser.add_argument("--limit", type=int, default=100) + parser.add_argument("--start-index", type=int, default=0) + parser.add_argument("--task-file", default="") + parser.add_argument("--concurrency", type=int, default=4) + parser.add_argument("--max-steps", type=int, default=1_000_000) + parser.add_argument("--max-runtime-seconds", type=float, default=180.0) + parser.add_argument("--trace-prefix", default="qitos_cybergym_batch") + 
parser.add_argument("--output-jsonl", default="") + parser.add_argument("--resume", action="store_true") + args = parser.parse_args() + + if not str(args.api_key).strip(): + raise SystemExit("api key is required") + + out_root = Path(args.out_root).expanduser().resolve() + traces = out_root / "traces" + workspace = out_root / "workspace" + traces.mkdir(parents=True, exist_ok=True) + workspace.mkdir(parents=True, exist_ok=True) + + data_dir = Path(args.data_dir).expanduser().resolve() + task_file = str(args.task_file).strip() + if task_file: + task_ids = _load_task_ids_from_file(Path(task_file).expanduser().resolve(), limit=int(args.limit)) + else: + task_ids = _load_task_ids(data_dir, limit=int(args.limit), start_index=int(args.start_index)) + tasks = load_cybergym_tasks(task_ids=task_ids, difficulty=args.difficulty) + jobs = [{"task": task, "job_key": task.id} for task in tasks] + + run_spec, experiment_spec = build_example_specs( + benchmark="cybergym", + split=args.difficulty, + model_name=str(args.model_name), + trace_logdir=str(traces), + parser_name="JsonDecisionParser", + toolset_name="cybergym_agent", + limit=len(jobs), + workspace=str(workspace), + metadata={ + "recipe": "cybergym_agent_batch", + "max_steps": int(args.max_steps), + "max_runtime_seconds": float(args.max_runtime_seconds), + }, + ) + run_spec.environment = dict(run_spec.environment or {}) + run_spec.environment.update( + { + "data_dir": str(data_dir), + "server": str(args.server), + "base_url": str(args.base_url), + "api_key": str(args.api_key), + "trace_logdir": str(traces), + "workspace": str(workspace), + "trace_prefix": str(args.trace_prefix), + } + ) + output_path = ( + Path(args.output_jsonl).expanduser().resolve() + if str(args.output_jsonl).strip() + else out_root / f"cybergym_{args.difficulty}_first{len(jobs)}_conc{int(args.concurrency)}.jsonl" + ) + + rows = execute_example_jobs( + jobs=jobs, + runner=lambda **kwargs: run_cybergym_task( + task=kwargs["task"], + 
run_spec=kwargs["run_spec"], + experiment_spec=kwargs["experiment_spec"], + ), + output_path=output_path, + run_spec=run_spec, + experiment_spec=experiment_spec, + concurrency=max(1, int(args.concurrency)), + resume=bool(args.resume), + ) + print_benchmark_summary(rows) + print(f"OUTPUT_JSONL={output_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_benchmark_cybergym_recipe.py b/tests/test_benchmark_cybergym_recipe.py index 64eee0d..a109ef4 100644 --- a/tests/test_benchmark_cybergym_recipe.py +++ b/tests/test_benchmark_cybergym_recipe.py @@ -1,10 +1,12 @@ import tempfile import unittest from pathlib import Path +from types import SimpleNamespace from unittest import mock from qitos.benchmark import normalize_benchmark_name, resolve_builtin_runner from qitos.benchmark.cybergym import CyberGymBenchmarkAdapter, make_trace_writer, task_slug +import qitos.benchmark.cybergym.runner as cybergym_runner from qitos.recipes.benchmarks import cybergym @@ -44,7 +46,7 @@ def test_recipe_reuses_benchmark_family_helpers(self): self.assertIs(cybergym.make_trace_writer, make_trace_writer) def test_recipe_passes_runtime_budget_without_step_cap(self): - with mock.patch.object(cybergym, "prepare_task_dir", return_value=Path("/tmp/task")): + with mock.patch.object(cybergym, "prepare_task_dir", return_value=Path("/tmp/out/workspace/arvo_1065")): with mock.patch.object(cybergym, "run_cybergym_agent_task", return_value={}) as run: cybergym.run_cybergym_recipe_task( task_id="arvo:1065", @@ -63,6 +65,66 @@ def test_recipe_passes_runtime_budget_without_step_cap(self): kwargs = run.call_args.kwargs self.assertIsNone(kwargs["max_steps"]) self.assertEqual(kwargs["max_runtime_seconds"], 3600) + self.assertEqual(str(kwargs["task_dir"]), "/tmp/out/workspace/arvo_1065") + + def test_runner_uses_task_root_workspace_and_keeps_source_root_context(self): + with tempfile.TemporaryDirectory() as tmpdir: + task_root = Path(tmpdir).resolve() + source_root 
= task_root / "repo-vul" / "project" + source_root.mkdir(parents=True) + + fake_task = SimpleNamespace( + id="arvo:1065", + inputs={ + "task_id": "arvo:1065", + "agent_id": "agent", + "checksum": "checksum", + "server_url": "http://server", + "source_root": str(source_root), + "repo_dir": str(task_root / "repo-vul"), + "task_root": str(task_root), + "description": "desc", + "error_txt": "", + "patch_diff": "", + }, + ) + fake_agent = mock.Mock() + fake_agent.run.return_value = SimpleNamespace( + state=SimpleNamespace(stop_reason="final", final_result="ok"), + step_count=1, + task_result=None, + ) + + with mock.patch( + "qitos.benchmark.cybergym.agent.adapter.CyberGymAdapter" + ) as adapter_cls, mock.patch( + "qitos.benchmark.cybergym.agent.cli.build_agent", + return_value=fake_agent, + ) as build_agent, mock.patch( + "qitos.benchmark.cybergym.agent.stop_criteria.PoCVerificationCriteria", + return_value=object(), + ), mock.patch.object(cybergym_runner, "HostEnv") as host_env: + adapter_cls.return_value.from_task_dir.return_value = fake_task + cybergym_runner.run_cybergym_agent_task( + task_dir=str(task_root), + model_name="GLM-5.1", + api_key="key", + base_url="http://model/v1", + server="http://server", + max_steps=None, + max_runtime_seconds=3600, + trace_logdir=str(task_root / "traces"), + ) + + build_kwargs = build_agent.call_args.kwargs + self.assertEqual(build_kwargs["workspace_root"], str(task_root)) + self.assertEqual(build_kwargs["task_root"], str(task_root)) + host_env.assert_called_once_with(workspace_root=str(task_root)) + run_kwargs = fake_agent.run.call_args.kwargs + self.assertGreaterEqual(run_kwargs["context_config"].tool_result_max_chars, 50000) + self.assertEqual(run_kwargs["workspace"], str(task_root)) + self.assertEqual(run_kwargs["source_root"], str(source_root)) + self.assertEqual(run_kwargs["repo_dir"], str(source_root)) if __name__ == "__main__": diff --git a/tests/test_cybergym_agent_poc_profile.py b/tests/test_cybergym_agent_poc_profile.py 
"""Tests for the CyberGym PoC-generation profile, adapter, and agent builder."""

from pathlib import Path

from qitos.core.tool_registry import ToolRegistry


def test_poc_gen_profile_detects_and_registers_submit_tool(tmp_path: Path) -> None:
    """detect_profile('poc_gen') yields a PocGenProfile that wires up submit_poc."""
    # A submit.sh in the task dir is how real CyberGym tasks expose submission;
    # the profile is expected to surface its content in the harness info.
    submit = tmp_path / "submit.sh"
    submit.write_text(
        "#!/bin/bash\n"
        'curl -X POST http://127.0.0.1:8698/submit-vul -F "file=@${1}"\n',
        encoding="utf-8",
    )

    # Imported lazily so collection does not fail if the agent package is absent.
    from qitos.benchmark.cybergym.agent.profiles import PocGenProfile, detect_profile
    from qitos.benchmark.cybergym.agent.state import SecurityState

    profile = detect_profile(
        "CyberGym task",
        task_profile="poc_gen",
        server_url="http://127.0.0.1:8698",
    )
    assert isinstance(profile, PocGenProfile)

    state = SecurityState(task="CyberGym task", workspace_root=str(tmp_path))
    profile.init_state(
        state,
        description="A crash occurs when parsing a truncated file.",
        task_id="arvo:15003",
        agent_id="agent-x",
        checksum="checksum-x",
        server_url="http://127.0.0.1:8698",
        repo_dir=str(tmp_path),
    )

    registry = ToolRegistry(auto_short_aliases=True)
    profile.register_tools(
        registry,
        workspace_root=str(tmp_path),
        shell_timeout=60,
        server_url="http://127.0.0.1:8698",
    )

    # State is populated from the kwargs and the profile picked one of the
    # known PoC strategies; submit.sh made it into the harness briefing.
    assert state.task_profile == "poc_gen"
    assert state.task_id == "arvo:15003"
    assert state.poc_strategy in {"text", "binary_python", "corpus_mutate", "hex"}
    assert "submit.sh content:" in state.harness_info
    assert "submit_poc" in registry.list_tools()


def test_cybergym_adapter_accepts_qitos_runner_keyword_args(tmp_path: Path) -> None:
    """from_task_dir accepts runner kwargs and never leaks fix-side wording to the model."""
    # Minimal on-disk task layout: description, README, submit.sh, vulnerable repo.
    (tmp_path / "description.txt").write_text(
        "A crash occurs when parsing a truncated file.\n",
        encoding="utf-8",
    )
    (tmp_path / "README.md").write_text("README\n", encoding="utf-8")
    (tmp_path / "submit.sh").write_text(
        "#!/bin/bash\n"
        'curl -X POST http://127.0.0.1:8698/submit-vul -F "file=@${1}"\n',
        encoding="utf-8",
    )
    repo_dir = tmp_path / "repo-vul"
    repo_dir.mkdir()
    (repo_dir / "sample.c").write_text("int main(void) { return 0; }\n", encoding="utf-8")

    from qitos.benchmark.cybergym.agent.adapter import CyberGymAdapter

    adapter = CyberGymAdapter(server_url="http://127.0.0.1:8698")
    # max_steps / max_runtime_seconds mirror what qitos runner passes through.
    task = adapter.from_task_dir(
        str(tmp_path),
        task_id="arvo:15003",
        max_steps=7,
        max_runtime_seconds=120,
    )

    assert task.id == "arvo:15003"
    assert task.inputs["task_root"] == str(tmp_path.resolve())
    assert task.inputs["source_root"] == str(repo_dir.resolve())
    # The model-visible objective must not hint at the patched/fixed binary.
    model_visible_task_text = "\n".join([task.objective, *task.success_criteria])
    assert "fix_exit" not in model_visible_task_text
    assert "patched" not in model_visible_task_text.lower()
    assert "fixed" not in model_visible_task_text.lower()


def test_build_agent_accepts_task_root_keyword(monkeypatch, tmp_path: Path) -> None:
    """build_agent takes the new task_root kwarg and resolves workspace_root."""
    from qitos.benchmark.cybergym.agent import cli

    # Stub out LLM construction so no network/credentials are needed.
    monkeypatch.setattr(cli, "_create_llm", lambda model, llm_config=None: object())

    agent = cli.build_agent(
        model="GLM-5.1",
        workspace_root=str(tmp_path),
        task_root=str(tmp_path),
        server_url="http://127.0.0.1:8698",
        llm_config={"api_key": "x", "base_url": "y"},
    )

    assert agent.workspace_root == str(tmp_path.resolve())
"struct record_header { int len; int off; };\n", + encoding="utf-8", + ) + (repo / "samples" / "seed.omf").write_bytes(b"OMF") + return repo + + +def _make_agent(tmp_path: Path): + from qitos.benchmark.cybergym.agent.agent import CyberGymAgent + + (tmp_path / "submit.sh").write_text("#!/bin/bash\n", encoding="utf-8") + return CyberGymAgent( + llm=object(), + workspace_root=str(tmp_path), + task_root=str(tmp_path), + server_url="http://127.0.0.1:8698", + ) + + +def test_cybergym_state_initializes_durable_memory_fields() -> None: + from qitos.benchmark.cybergym.agent.state import CyberGymState + + state = CyberGymState(task="demo") + + assert state.durable_project_memory == {} + assert state.durable_code_facts == [] + assert state.durable_feedback_facts == [] + + +def test_init_state_populates_durable_project_memory(tmp_path: Path) -> None: + repo = _make_repo(tmp_path) + agent = _make_agent(tmp_path) + + state = agent.init_state( + "demo task", + description="Parser bug in a truncated OMF record", + source_root=str(repo), + ) + + memory = state.durable_project_memory + assert "parser_decode.c" in " ".join(memory.get("parser_paths", [])) + assert "seed.omf" in " ".join(memory.get("seed_paths", [])) + assert "parser_fields.h" in " ".join(memory.get("field_paths", [])) + + +def test_read_result_populates_durable_code_facts(tmp_path: Path) -> None: + repo = _make_repo(tmp_path) + agent = _make_agent(tmp_path) + state = agent.init_state( + "demo task", + description="Parser bug in a truncated OMF record", + source_root=str(repo), + ) + + result = ToolResult( + output={ + "path": "src/parser_decode.c", + "content": "if (len < 3) return -1;\nreturn buf[2];\n", + }, + metadata={"name": "READ"}, + ) + + agent._process_action_result(state, result) + + assert state.durable_code_facts + assert any("src/parser_decode.c" in fact for fact in state.durable_code_facts) + + +def test_submit_feedback_populates_durable_feedback_facts(tmp_path: Path) -> None: + repo = 
_make_repo(tmp_path) + agent = _make_agent(tmp_path) + state = agent.init_state( + "demo task", + description="Parser bug in a truncated OMF record", + source_root=str(repo), + ) + poc = tmp_path / "poc.bin" + poc.write_bytes(b"abc") + state.poc_path = str(poc) + + result = ToolResult( + output={ + "exit_code": 0, + "vul_exit_code": 0, + "verification_scope": "vul_only", + "raw_output": "Invalid record (too short)\n", + }, + metadata={"name": "submit_poc"}, + ) + + agent._process_action_result(state, result) + + assert state.durable_feedback_facts + assert any("Invalid record" in fact or "no_trigger" in fact for fact in state.durable_feedback_facts) + + +def test_prompt_and_trace_payload_include_working_memory(tmp_path: Path) -> None: + repo = _make_repo(tmp_path) + agent = _make_agent(tmp_path) + state = agent.init_state( + "demo task", + description="Parser bug in a truncated OMF record", + source_root=str(repo), + ) + state.durable_code_facts = ["parser_path: src/parser_decode.c -> if (len < 3) return -1;"] + state.durable_feedback_facts = ["feedback_hint: Invalid record (too short)"] + + system_prompt = agent.build_system_prompt(state) + observation = agent.prepare(state) + payload = agent._step_context_payload(state) + + assert "Older tool results may later be cleared from context." 
in system_prompt + assert ( + "When working with tool results, write down any important information you might need later in your response" + in system_prompt + ) + assert "## Stable Task Facts" in system_prompt + assert "Working Directory (cwd)" in system_prompt + assert "## Working Memory" not in observation + assert "### Project Index" not in observation + assert payload["durable_project_memory"] + assert payload["durable_code_facts"] + assert payload["durable_feedback_facts"] + + +def test_find_pipeline_with_head_is_not_treated_as_file_browsing(tmp_path: Path) -> None: + agent = _make_agent(tmp_path) + + assert ( + agent._bash_is_file_browse_command( + 'find repo-vul -type f -name "*.c" | xargs grep -l -i "omf" 2>/dev/null | head -30' + ) + is False + ) + assert agent._bash_is_file_browse_command("head README.md") is True diff --git a/tests/test_cybergym_context_snip.py b/tests/test_cybergym_context_snip.py new file mode 100644 index 0000000..89d171e --- /dev/null +++ b/tests/test_cybergym_context_snip.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +from pathlib import Path + +from qitos.benchmark.cybergym.agent.context import SnipCompactor +from qitos.core.history import HistoryMessage +from qitos.core.state import StateSchema + + +def test_snip_compactor_persists_old_tool_results_with_preview(tmp_path: Path) -> None: + state = StateSchema(task="demo") + state.metadata["trace_run_dir"] = str(tmp_path / "trace") + + older = "HEAD line\n" + ("A" * 600) + "\nTAIL line" + recent = "recent tool output" + messages = [ + HistoryMessage(role="tool", content=older, step_id=1, metadata={"source": "engine"}), + HistoryMessage(role="assistant", content="thinking", step_id=1), + HistoryMessage(role="tool", content=recent, step_id=2, metadata={"source": "engine"}), + ] + + result = SnipCompactor(keep_recent=1).snip(messages, state=state) + + assert result[0].metadata.get("snipped") is True + assert result[0].metadata.get("snip_saved_path") + assert "saved_path:" 
def test_agent_condition_stop_is_not_automatic_success():
    """An agent stopping via should_stop() must not be scored as a success.

    The engine should record stop_reason == "agent_condition", leave
    final_result unset, and mark every task criterion as failed.
    """

    class StopAgent(DemoAgent):
        # Agent that takes exactly one action and then asks to stop.
        def init_state(self, task: str, **kwargs: Any) -> DemoState:
            _ = kwargs
            return DemoState(task=task, max_steps=3)

        def decide(self, state: DemoState, observation: dict[str, Any]) -> Decision[Action]:
            _ = observation
            return Decision.act(
                actions=[Action(name="add", args={"a": 1, "b": 1})],
                rationale="take one action then stop",
            )

        def reduce(
            self,
            state: DemoState,
            observation: dict[str, Any],
            decision: Decision[Action],
        ) -> DemoState:
            _ = observation, decision
            return state

        def should_stop(self, state: DemoState) -> bool:
            # Unconditional stop: triggers the "agent_condition" path.
            _ = state
            return True

    result = Engine(agent=StopAgent(), budget=RuntimeBudget(max_steps=3)).run("compute")
    assert result.state.stop_reason == "agent_condition"
    # No Decision.final() was ever emitted, so there is no final result ...
    assert result.state.final_result is None
    # ... and the task result exists but reports failure on every criterion.
    assert result.task_result is not None
    assert result.task_result.success is False
    assert all(item.passed is False for item in result.task_result.criteria)
test_engine_sanitizes_submit_poc_native_tool_history_without_mutating_result(): + seen_messages: list[list[dict[str, Any]]] = [] + + class _SubmitModel: + model = "GLM-5.1" + provider = "openai-compatible" + + def __init__(self): + self.qitos_harness_metadata = { + "family_preset": "glm", + "tool_policy": { + "primary_delivery": "api_parameter", + "fallback_delivery": "prompt_injection", + "native_tool_call_preferred": True, + }, + } + self.calls = 0 + + def call_raw(self, messages): + self.calls += 1 + seen_messages.append(list(messages)) + if self.calls == 1: + return { + "choices": [ + { + "message": { + "content": "", + "tool_calls": [ + { + "id": "call_submit", + "type": "function", + "function": { + "name": "submit_poc", + "arguments": "{}", + }, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "model": "GLM-5.1", + } + return { + "choices": [ + { + "message": {"content": "Final Answer: done"}, + "finish_reason": "stop", + } + ], + "model": "GLM-5.1", + } + + class _SubmitAgent(DemoAgent): + def __init__(self): + super().__init__() + self.llm = _SubmitModel() + + @tool(name="submit_poc") + def submit_poc() -> dict[str, Any]: + return { + "status": "success", + "vul_exit_code": 0, + "fix_exit_code": 0, + "poc_id": "p1", + "flag": None, + "raw_output": "wrong number of function inputs", + "verification_scope": "full", + "vul_stderr": "target stderr", + "fix_stderr": "hidden stderr", + "vul_stdout": "target stdout", + "fix_stdout": "hidden stdout", + } + + self.tool_registry.register(submit_poc) + + def decide(self, state: DemoState, observation: dict[str, Any]): + _ = observation + return None + + def reduce( + self, + state: DemoState, + observation: dict[str, Any], + decision: Decision[Action], + ) -> DemoState: + _ = observation + _ = decision + return state + + result = Engine(agent=_SubmitAgent(), budget=RuntimeBudget(max_steps=3)).run("compute") + + assert result.records[0].action_results[0].output["fix_exit_code"] == 0 + assert 
def test_engine_agent_can_block_disallowed_actions_before_execution():
    """block_action() vetoes a native tool call before the tool runs.

    The blocked tool must never execute; the engine records an error result
    with category "action_blocked" carrying the agent's veto message.
    """
    # Flipped to True only if the (supposedly blocked) tool body runs.
    executed = {"value": False}

    class _RawResponseModel:
        # Fake model that always emits one native tool call to "blocked_tool".
        model = "qwen-plus"
        provider = "openai-compatible"

        def __init__(self):
            # Harness metadata steers the engine onto the native tool-call lane.
            self.qitos_harness_metadata = {
                "family_preset": "qwen",
                "tool_policy": {
                    "primary_delivery": "api_parameter",
                    "fallback_delivery": "prompt_injection",
                    "native_tool_call_preferred": True,
                },
            }

        def call_raw(self, messages):
            _ = messages
            return {
                "choices": [
                    {
                        "message": {
                            "content": None,
                            "tool_calls": [
                                {
                                    "id": "call_blocked",
                                    "type": "function",
                                    "function": {
                                        "name": "blocked_tool",
                                        "arguments": "{}",
                                    },
                                }
                            ],
                        },
                        "finish_reason": "tool_calls",
                    }
                ],
                "model": "qwen-plus",
            }

    class _BlockAgent(DemoAgent):
        def __init__(self):
            super().__init__()
            self.llm = _RawResponseModel()

            @tool(name="blocked_tool")
            def blocked_tool() -> str:
                # Side effect proves (by its absence) the veto happened first.
                executed["value"] = True
                return "should not run"

            self.tool_registry.register(blocked_tool)

        def decide(self, state: DemoState, observation: dict[str, Any]):
            _ = observation
            if state.current_step > 0:
                return Decision.final("done")
            # None defers the first step to the model's native tool calls.
            return None

        def block_action(self, state: DemoState, action: Action) -> str | None:
            # Returning a string vetoes the action; the string is the reason.
            _ = state
            if action.name == "blocked_tool":
                return "blocked for this state"
            return None

    result = Engine(agent=_BlockAgent(), budget=RuntimeBudget(max_steps=3)).run("compute")

    assert executed["value"] is False
    first_result = result.records[0].action_results[0]
    assert first_result.status == "error"
    assert first_result.error == "action_blocked"
    assert first_result.metadata["error_category"] == "action_blocked"
    assert "blocked for this state" in str(first_result.output)
+ assert record.actions[0].args == {"a": 20, "b": 22} + assert record.model_response["tool_calls"][0]["function"]["name"] == "add" + + def test_engine_native_tool_call_lane_falls_back_to_parser_on_bad_arguments(): class _BadArgsModel: model = "qwen-plus" diff --git a/tests/test_harness_presets.py b/tests/test_harness_presets.py index 3114e73..3b5bd27 100644 --- a/tests/test_harness_presets.py +++ b/tests/test_harness_presets.py @@ -75,6 +75,7 @@ def test_build_model_for_preset_attaches_harness_metadata() -> None: assert metadata["native_tool_call_preferred"] is True assert metadata["decision_lane_preference"] == "native_tool_calls" assert metadata["effective_tool_delivery"] == "api_parameter" + assert llm.timeout == 120 def test_build_model_for_glm_preset_attaches_native_tool_call_metadata() -> None: diff --git a/tests/test_model_providers.py b/tests/test_model_providers.py index 638c763..0c730b5 100644 --- a/tests/test_model_providers.py +++ b/tests/test_model_providers.py @@ -2,6 +2,7 @@ import base64 import sys +from types import ModuleType from types import SimpleNamespace from qitos.models import ( @@ -290,6 +291,99 @@ def __init__(self, **kwargs): assert image_block["image_url"]["url"].startswith("data:image/png;base64,") +def test_openai_compatible_model_retries_and_uses_120s_timeout(monkeypatch) -> None: + captured = {"attempts": 0, "client_kwargs": None} + + class _TransientError(Exception): + pass + + class _FakeCompletions: + def create(self, **kwargs): + captured["attempts"] += 1 + if captured["attempts"] < 3: + raise _TransientError("request time out") + return SimpleNamespace( + choices=[ + SimpleNamespace( + message=SimpleNamespace( + content="Final Answer: retried ok", tool_calls=None + ) + ) + ], + usage=SimpleNamespace( + prompt_tokens=9, completion_tokens=4, total_tokens=13 + ), + ) + + class _FakeClient: + def __init__(self, **kwargs): + captured["client_kwargs"] = kwargs + self.chat = SimpleNamespace(completions=_FakeCompletions()) + + 
def test_openai_compatible_model_call_raw_retries_on_transient_errors(monkeypatch) -> None:
    """call_raw() retries transient API errors and returns the raw response.

    The fake client fails twice with an APIError subclass, then succeeds;
    the retry wrapper should absorb both failures (3 attempts total).
    """
    captured = {"attempts": 0}

    class _TransientError(Exception):
        # Installed as the fake module's APIError so retries treat it as transient.
        pass

    class _FakeCompletions:
        def create(self, **kwargs):
            captured["attempts"] += 1
            # Fail the first two calls; succeed on the third.
            if captured["attempts"] < 3:
                raise _TransientError("request time out")
            return SimpleNamespace(
                choices=[
                    SimpleNamespace(
                        message=SimpleNamespace(
                            content="Final Answer: raw retried ok", tool_calls=None
                        )
                    )
                ],
                usage=SimpleNamespace(
                    prompt_tokens=7, completion_tokens=3, total_tokens=10
                ),
            )

    class _FakeClient:
        def __init__(self, **kwargs):
            self.chat = SimpleNamespace(completions=_FakeCompletions())

    # Replace the real openai SDK with an in-memory stand-in for this test.
    fake_openai = ModuleType("openai")
    fake_openai.OpenAI = lambda **kwargs: _FakeClient(**kwargs)
    fake_openai.APIError = _TransientError
    monkeypatch.setitem(sys.modules, "openai", fake_openai)
    # Skip real backoff sleeps so the test stays fast.
    monkeypatch.setattr("qitos.models.openai.time.sleep", lambda _: None)

    llm = OpenAICompatibleModel(
        model="gpt-4.1-mini",
        api_key="test-key",
        base_url="https://example.test/v1",
    )
    response = llm.call_raw([{"role": "user", "content": "Retry raw please"}])

    assert captured["attempts"] == 3
    assert response.choices[0].message.content == "Final Answer: raw retried ok"
"anthropic") monkeypatch.setenv("ANTHROPIC_API_KEY", "anthropic-env") diff --git a/tests/test_model_runtime_text_tool_calls.py b/tests/test_model_runtime_text_tool_calls.py new file mode 100644 index 0000000..677f082 --- /dev/null +++ b/tests/test_model_runtime_text_tool_calls.py @@ -0,0 +1,148 @@ +from types import SimpleNamespace + +from qitos import Action, AgentModule, Decision, Engine, ToolRegistry, tool +from qitos.core.history import History, HistoryMessage +from qitos.core.state import StateSchema +from qitos.engine import RuntimeBudget +from qitos.kit.parser import ReActTextParser + + +class _HistoryCapture(History): + def __init__(self): + self.messages: list[HistoryMessage] = [] + + def append(self, message: HistoryMessage) -> None: + self.messages.append(message) + + def retrieve(self, query=None, state=None, observation=None): + _ = query, state, observation + return list(self.messages) + + def summarize(self, max_items: int = 5) -> str: + _ = max_items + return "" + + def evict(self) -> int: + return 0 + + def reset(self, run_id=None) -> None: + _ = run_id + self.messages = [] + + +class _State(StateSchema): + pass + + +class _ToolCallAgent(AgentModule[_State, dict, Action]): + def __init__(self, llm): + registry = ToolRegistry() + + @tool(name="add") + def add(a: int, b: int) -> int: + return a + b + + registry.register(add) + super().__init__(tool_registry=registry, llm=llm) + self.model_parser = ReActTextParser() + self.history = _HistoryCapture() + + def init_state(self, task: str, **kwargs): + _ = kwargs + return _State(task=task, max_steps=2) + + def build_system_prompt(self, state: _State): + _ = state + return "System prompt" + + def prepare(self, state: _State) -> str: + _ = state + return "solve" + + def decide(self, state: _State, observation: dict): + _ = observation + if state.current_step > 0: + return Decision.final("done") + return None + + def reduce(self, state: _State, observation: dict, decision: Decision[Action]): + _ = 
observation, decision + return state + + +def test_extract_response_text_preserves_object_message_content_when_tool_calls_exist(): + engine = Engine(agent=_ToolCallAgent(llm=None), budget=RuntimeBudget(max_steps=1)) + runtime = engine._model_runtime + raw = SimpleNamespace( + message=SimpleNamespace( + content="Conclusion: likely 1-byte trigger. Next: write and submit.", + tool_calls=[ + { + "id": "call_1", + "type": "function", + "function": {"name": "add", "arguments": '{"a": 20, "b": 22}'}, + } + ], + ) + ) + + text = runtime._extract_response_text(raw) + + assert text == "Conclusion: likely 1-byte trigger. Next: write and submit." + + +def test_extract_response_text_uses_reasoning_content_when_content_is_empty(): + engine = Engine(agent=_ToolCallAgent(llm=None), budget=RuntimeBudget(max_steps=1)) + runtime = engine._model_runtime + raw = SimpleNamespace( + message=SimpleNamespace( + content=None, + reasoning_content="Conclusion: the checksum logic is the trigger. Next: write a candidate.", + tool_calls=[ + { + "id": "call_1", + "type": "function", + "function": {"name": "add", "arguments": '{"a": 20, "b": 22}'}, + } + ], + ) + ) + + text = runtime._extract_response_text(raw) + + assert text == "Conclusion: the checksum logic is the trigger. Next: write a candidate." + + +def test_native_tool_call_history_keeps_assistant_text_and_tool_calls(): + class _ObjectResponseModel: + model = "demo-model" + qitos_harness_metadata = { + "tool_policy": {"native_tool_call_preferred": True} + } + + def __call__(self, messages): + _ = messages + return SimpleNamespace( + message=SimpleNamespace( + content="Conclusion: likely 1-byte trigger. 
Next: use add.", + tool_calls=[ + { + "id": "call_1", + "type": "function", + "function": {"name": "add", "arguments": '{"a": 20, "b": 22}'}, + } + ], + ), + finish_reason="tool_calls", + ) + + agent = _ToolCallAgent(llm=_ObjectResponseModel()) + result = Engine(agent=agent, budget=RuntimeBudget(max_steps=2)).run("compute") + + assert result.state.final_result == "done" + assistant_messages = [m for m in agent.history.messages if m.role == "assistant"] + assert assistant_messages + first = assistant_messages[0] + assert first.content == "Conclusion: likely 1-byte trigger. Next: use add." + assert first.tool_calls + assert first.tool_calls[0]["function"]["name"] == "add" From 2cca37997c03a52d1e2cfddf4f3cfb270a1d7c27 Mon Sep 17 00:00:00 2001 From: bmz-q-q <1049675766@qq.com> Date: Wed, 29 Apr 2026 18:41:57 +0800 Subject: [PATCH 5/5] chore: checkpoint cybergym qitos updates --- qitos/benchmark/cybergym/_imports.py | 84 ++++ qitos/benchmark/cybergym/runtime.py | 2 + qitos/core/errors.py | 21 +- qitos/engine/action_executor.py | 105 ++++- qitos/engine/engine.py | 6 +- qitos/kit/history/compact_history.py | 5 +- qitos/kit/tool/internal/coding_impl.py | 34 +- qitos/models/openai.py | 100 +++++ scripts/cybergym_run_report.py | 385 ++++++++++++++++++ scripts/cybergym_success_rate.py | 82 ++++ scripts/run_batch100_sampled_conc4.sh | 3 +- scripts/run_batch100_sampled_conc4_v7.sh | 25 ++ scripts/run_batch100_sampled_conc4_v8.sh | 25 ++ scripts/run_batch100_strategy_memory_tmux.sh | 18 + scripts/run_failed_maxtok32k_tmux.sh | 274 +++++++++++++ .../start_batch100_sampled_conc4_v7_server.sh | 24 ++ .../start_batch100_sampled_conc4_v8_server.sh | 19 + tests/test_advanced_tools_and_executor.py | 157 ++++++- tests/test_benchmark_cybergym_recipe.py | 37 ++ tests/test_cybergym_context_retention.py | 6 +- tests/test_cybergym_context_snip.py | 20 +- tests/test_cybergym_parallel_tools_prompt.py | 26 ++ tests/test_cybergym_run_report.py | 102 +++++ 
tests/test_cybergym_success_rate_script.py | 54 +++ tests/test_glm_tokenizer_count.py | 55 +++ tests/test_predefined_atomic_tools.py | 4 +- tests/test_runtime_recovery.py | 28 ++ 27 files changed, 1667 insertions(+), 34 deletions(-) create mode 100644 qitos/benchmark/cybergym/_imports.py create mode 100755 scripts/cybergym_run_report.py create mode 100755 scripts/cybergym_success_rate.py create mode 100755 scripts/run_batch100_sampled_conc4_v7.sh create mode 100755 scripts/run_batch100_sampled_conc4_v8.sh create mode 100755 scripts/run_batch100_strategy_memory_tmux.sh create mode 100755 scripts/run_failed_maxtok32k_tmux.sh create mode 100755 scripts/start_batch100_sampled_conc4_v7_server.sh create mode 100755 scripts/start_batch100_sampled_conc4_v8_server.sh create mode 100644 tests/test_cybergym_parallel_tools_prompt.py create mode 100644 tests/test_cybergym_run_report.py create mode 100644 tests/test_cybergym_success_rate_script.py create mode 100644 tests/test_glm_tokenizer_count.py create mode 100644 tests/test_runtime_recovery.py diff --git a/qitos/benchmark/cybergym/_imports.py b/qitos/benchmark/cybergym/_imports.py new file mode 100644 index 0000000..7436fa9 --- /dev/null +++ b/qitos/benchmark/cybergym/_imports.py @@ -0,0 +1,84 @@ +"""Helpers for importing the local CyberGym source tree.""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path + + +_CYBERGYM_ENV_VARS = ( + "CYBERGYM_SOURCE_ROOT", + "CYBERGYM_REPO_ROOT", +) + + +def _marker_path(root: Path) -> Path: + return root / "src" / "cybergym" / "task" / "README.template" + + +def resolve_cybergym_source_root() -> Path: + candidates: list[Path] = [] + for env_name in _CYBERGYM_ENV_VARS: + raw = str(os.getenv(env_name) or "").strip() + if raw: + candidates.append(Path(raw).expanduser().resolve()) + + workspace_dir = Path(__file__).resolve().parents[4] + candidates.append((workspace_dir / "cybergym").resolve()) + + seen: set[Path] = set() + for candidate in candidates: 
+ if candidate in seen: + continue + seen.add(candidate) + if _marker_path(candidate).exists(): + return candidate + + searched = ", ".join(str(path) for path in candidates) or "" + raise FileNotFoundError( + "Unable to locate the CyberGym source tree with src/cybergym/task/README.template. " + f"Searched: {searched}" + ) + + +def ensure_cybergym_source_importable() -> Path: + source_root = resolve_cybergym_source_root() + src_dir = str((source_root / "src").resolve()) + + def _is_stale_cybergym_path(entry: object) -> bool: + text = str(entry or "") + return text.endswith("/cybergym/src") and text != src_dir + + sys.path[:] = [ + entry + for entry in sys.path + if str(entry or "") != src_dir and not _is_stale_cybergym_path(entry) + ] + sys.path.insert(0, src_dir) + + stale_modules: list[str] = [] + for name, module in list(sys.modules.items()): + if name != "cybergym" and not name.startswith("cybergym."): + continue + module_file = getattr(module, "__file__", None) + if not module_file: + continue + try: + module_path = Path(str(module_file)).resolve() + except Exception: + stale_modules.append(name) + continue + if not str(module_path).startswith(src_dir): + stale_modules.append(name) + + for name in stale_modules: + sys.modules.pop(name, None) + + return source_root + + +__all__ = [ + "ensure_cybergym_source_importable", + "resolve_cybergym_source_root", +] diff --git a/qitos/benchmark/cybergym/runtime.py b/qitos/benchmark/cybergym/runtime.py index 6904f35..67ad3ec 100644 --- a/qitos/benchmark/cybergym/runtime.py +++ b/qitos/benchmark/cybergym/runtime.py @@ -7,6 +7,7 @@ from qitos.core import ExperimentSpec, RunSpec, Task from ..contracts import BenchmarkRuntimeHook, PreparedBenchmarkTask +from ._imports import ensure_cybergym_source_importable def prepare_task_dir( @@ -17,6 +18,7 @@ def prepare_task_dir( server: str, difficulty: str, ) -> Path: + ensure_cybergym_source_importable() from cybergym.task.gen_task import generate_task from cybergym.task.types import 
TaskConfig, TaskDifficulty diff --git a/qitos/core/errors.py b/qitos/core/errors.py index 8ec3b63..f331489 100644 --- a/qitos/core/errors.py +++ b/qitos/core/errors.py @@ -83,11 +83,22 @@ def classify_exception(exc: Exception, phase: str, step_id: int) -> RuntimeError return exc.info msg = str(exc).lower() - - if isinstance(exc, (TimeoutError, ConnectionError)) and phase.lower() in { - "decide", - "propose", - }: + phase_name = phase.lower() + + if phase_name in {"decide", "propose"} and ( + isinstance(exc, (TimeoutError, ConnectionError)) + or any( + marker in msg + for marker in ( + "timeout", + "timed out", + "stream timeout", + "read timeout", + "connection error", + "api connection", + ) + ) + ): return RuntimeErrorInfo( category=ErrorCategory.MODEL, message=str(exc), diff --git a/qitos/engine/action_executor.py b/qitos/engine/action_executor.py index 13d19df..81500bd 100644 --- a/qitos/engine/action_executor.py +++ b/qitos/engine/action_executor.py @@ -3,6 +3,8 @@ from __future__ import annotations import time +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path from typing import Any, Dict, List, Optional, Sequence from ..core.action import Action, ActionExecutionPolicy, ActionResult, ActionStatus @@ -28,11 +30,50 @@ def execute( self, actions: Sequence[Action], env: Optional[Env] = None, state: Any = None ) -> List[ActionResult]: if self.policy.mode == "parallel": - raise NotImplementedError( - "ActionExecutionPolicy.mode='parallel' is not implemented in the canonical executor" - ) + return self._execute_parallel(actions, env=env, state=state) return [self._execute_one(action, env=env, state=state) for action in actions] + def _execute_parallel( + self, actions: Sequence[Action], env: Optional[Env] = None, state: Any = None + ) -> List[ActionResult]: + results: List[ActionResult] = [] + pending_batch: List[Action] = [] + + def _flush_batch() -> None: + nonlocal pending_batch + if not pending_batch: + return + max_workers = min( + 
max(1, int(self.policy.max_concurrency)), + len(pending_batch), + ) + with ThreadPoolExecutor(max_workers=max_workers) as pool: + futures = [ + pool.submit(self._execute_one, action, env=env, state=state) + for action in pending_batch + ] + results.extend(future.result() for future in futures) + pending_batch = [] + + for action in actions: + if self._can_execute_in_parallel(action): + pending_batch.append(action) + continue + _flush_batch() + results.append(self._execute_one(action, env=env, state=state)) + + _flush_batch() + return results + + def _can_execute_in_parallel(self, action: Action) -> bool: + tool = self._resolve_tool(action.name) + if tool is None: + return False + spec = getattr(tool, "spec", None) + if spec is None: + return False + return bool(getattr(spec, "read_only", False) and getattr(spec, "concurrency_safe", False)) + def _execute_one( self, action: Action, env: Optional[Env] = None, state: Any = None ) -> ActionResult: @@ -46,6 +87,27 @@ def _execute_one( attempts += 1 try: tool = self._resolve_tool(action.name) + guard_message = self._candidate_submit_ready_guard(action.name, state) + if guard_message: + return self._finish_result( + action=action, + status=ActionStatus.ERROR, + start=start, + attempts=attempts, + tool_meta=tool_meta, + output={ + "status": "error", + "message": guard_message, + "error_category": "candidate_submit_ready_guard", + "tool": action.name, + }, + error=guard_message, + extra_metadata={ + "error_category": "candidate_submit_ready_guard", + "progress_count": len(runtime_context["progress_events"]), + "artifacts": list(runtime_context["artifacts"]), + }, + ) validation = self._validate(tool, action.args, runtime_context) if not validation.valid: return self._finish_result( @@ -278,6 +340,43 @@ def _call_tool( "Unsupported tool registry. Expected object with call() or get()." 
) + def _candidate_submit_ready_guard(self, name: str, state: Any) -> str: + if name == "submit_poc": + return "" + if not bool(getattr(state, "candidate_ready_for_submit", False)): + return "" + poc_path = str(getattr(state, "poc_path", "") or "").strip() + if not poc_path: + return "" + if self._candidate_ready_file_missing(state, poc_path): + return "" + return ( + "Candidate is ready for submission. Call submit_poc now; " + f"{name} is blocked until the ready candidate is submitted." + ) + + @staticmethod + def _candidate_ready_file_missing(state: Any, poc_path: str) -> bool: + path = Path(poc_path) + candidates: List[Path] = [] + if path.is_absolute(): + candidates.append(path) + else: + workspace_root = str(getattr(state, "workspace_root", "") or "").strip() + if workspace_root: + candidates.append(Path(workspace_root) / path) + candidates.append(path) + + saw_checkable_path = False + for candidate in candidates: + try: + saw_checkable_path = True + if candidate.is_file(): + return False + except OSError: + continue + return saw_checkable_path + def _normalize_output(self, tool: Optional[BaseTool], output: Any) -> Any: if tool is None: return output diff --git a/qitos/engine/engine.py b/qitos/engine/engine.py index 1b5367a..cb0287d 100644 --- a/qitos/engine/engine.py +++ b/qitos/engine/engine.py @@ -8,6 +8,7 @@ from uuid import uuid4 from ..core.agent_module import AgentModule +from ..core.action import ActionExecutionPolicy from ..core.decision import Decision from ..core.errors import ErrorCategory, StopReason from ..core.env import Env, EnvObservation, EnvStepResult @@ -270,7 +271,10 @@ def __init__( self.stop_criteria = list(stop_criteria) self.executor = ( - ActionExecutor(tool_registry=self.tool_registry) + ActionExecutor( + tool_registry=self.tool_registry, + policy=ActionExecutionPolicy(mode="parallel", max_concurrency=4), + ) if self.tool_registry is not None else None ) diff --git a/qitos/kit/history/compact_history.py 
b/qitos/kit/history/compact_history.py index 7201a98..6b0ee47 100644 --- a/qitos/kit/history/compact_history.py +++ b/qitos/kit/history/compact_history.py @@ -96,12 +96,13 @@ def _compact_message(self, message: HistoryMessage) -> HistoryMessage: newline_count = text.count("\n") blob_kind = self._infer_blob_kind(message, text) compacted = ( - f"[Compacted {blob_kind} from step {message.step_id}; " - f"original_chars={len(text)}; original_lines={newline_count + 1}]\n" + f"[compact:start step={message.step_id} kind={blob_kind} " + f"original_chars={len(text)} original_lines={newline_count + 1}]\n" f"{head}" ) if tail and tail != head: compacted += f"\n...\n{tail}" + compacted += "\n[compact:end]" metadata = dict(message.metadata) metadata.update( diff --git a/qitos/kit/tool/internal/coding_impl.py b/qitos/kit/tool/internal/coding_impl.py index 3a9cac7..a8c6759 100644 --- a/qitos/kit/tool/internal/coding_impl.py +++ b/qitos/kit/tool/internal/coding_impl.py @@ -49,6 +49,22 @@ def _truncate_text(text: str, max_chars: int) -> tuple[str, bool]: return truncate_text(text, max_chars) +def _select_line_chunk( + lines: List[str], start: int, max_lines: int, max_chars: int +) -> tuple[List[str], bool]: + end = min(len(lines), start + max_lines) + chunk: List[str] = [] + char_count = 0 + enforce_chars = max_chars > 0 + for line in lines[start:end]: + char_count += len(line) + (1 if chunk else 0) + chunk.append(line) + if enforce_chars and char_count >= max_chars: + break + truncated = bool(enforce_chars and start + len(chunk) < end) + return chunk, truncated + + def _build_diff(old_content: str, new_content: str, path: str) -> str: return build_diff(old_content, new_content, path) @@ -508,12 +524,13 @@ def file_read_v2( runtime_context: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """ - Read one workspace file with line metadata. + Read one workspace file as a bounded whole-line text chunk. :param path: Path relative to the workspace root. 
:param offset: Zero-based starting line offset. :param limit: Maximum number of lines to return. - :param max_chars: Maximum number of characters to return. + :param max_chars: Soft maximum characters; the returned chunk stops at a line + boundary just after reaching this value. :param runtime_context: Optional runtime context injected by the executor. """ _ = runtime_context @@ -527,22 +544,17 @@ def file_read_v2( lines = content.splitlines() start = max(0, int(offset)) size = max(1, int(limit)) - chunk = lines[start : start + size] + chunk, truncated = _select_line_chunk(lines, start, size, int(max_chars)) chunk_text = "\n".join(chunk) - chunk_text, truncated = _truncate_text(chunk_text, int(max_chars)) return { "status": "success", "path": str(path), "content": chunk_text, "line_ending": line_ending, "offset": start, - "limit": size, + "limit": len(chunk), "total_lines": len(lines), - "lines": [ - {"line": start + index + 1, "text": text} - for index, text in enumerate(chunk) - ], - "has_more": start + size < len(lines), + "has_more": start + len(chunk) < len(lines), "truncated": truncated, } except Exception as e: @@ -1201,8 +1213,8 @@ def read_file_range( "limit": result.get("limit", limit), "total_lines": result.get("total_lines", 0), "content": result.get("content", ""), - "lines": result.get("lines", []), "has_more": result.get("has_more", False), + "truncated": result.get("truncated", False), } @tool( diff --git a/qitos/models/openai.py b/qitos/models/openai.py index a044864..d276e12 100644 --- a/qitos/models/openai.py +++ b/qitos/models/openai.py @@ -8,9 +8,12 @@ import json import os import time +from functools import lru_cache +from pathlib import Path from typing import Any, Dict, List, Optional, cast from ..core.multimodal import ( + content_to_text, ensure_data_url, file_to_data_url, has_nontext_content, @@ -22,6 +25,7 @@ OPENAI_DEFAULT_TIMEOUT = 120 OPENAI_DEFAULT_RETRIES = 3 +GLM_TOKENIZER_ENV_VARS = ("QITOS_GLM_TOKENIZER_PATH", 
"GLM_TOKENIZER_PATH") def _retry_delay_seconds(attempt_index: int) -> float: @@ -110,6 +114,65 @@ def _to_openai_content_blocks(content: List[Any]) -> List[Dict[str, Any]]: return blocks +def _is_glm_model_name(model: str) -> bool: + normalized = str(model or "").strip().lower() + return normalized.startswith("glm-") or normalized.startswith("zai-org/glm-") + + +def _glm_tokenizer_path() -> Optional[str]: + for name in GLM_TOKENIZER_ENV_VARS: + value = os.getenv(name, "").strip() + if value and Path(value).exists(): + return value + return None + + +@lru_cache(maxsize=4) +def _load_glm_tokenizer(path: str) -> Any: + from transformers import AutoTokenizer + + return AutoTokenizer.from_pretrained( + path, + trust_remote_code=True, + local_files_only=True, + ) + + +def _tokenizer_count_result(value: Any) -> Optional[int]: + if isinstance(value, int): + return int(value) + if isinstance(value, list): + return len(value) + getter = getattr(value, "get", None) + if callable(getter): + ids = getter("input_ids") + if isinstance(ids, list): + return len(ids) + return None + + +def _normalize_messages_for_tokenizer(payload: List[Any]) -> List[Dict[str, str]]: + messages: List[Dict[str, str]] = [] + for item in payload: + if not isinstance(item, dict): + messages.append({"role": "user", "content": str(item)}) + continue + role = str(item.get("role") or "user").strip() or "user" + content = content_to_text(item.get("content")) + extras: Dict[str, Any] = {} + for key in ("tool_calls", "tool_call_id", "name"): + if key in item and item.get(key) not in (None, "", []): + extras[key] = item.get(key) + if extras: + content = ( + content + + "\n" + + json.dumps(extras, ensure_ascii=False, sort_keys=True) + ).strip() + messages.append({"role": role, "content": content}) + return messages + + class OpenAIModel(Model): """ OpenAI model calling implementation @@ -387,6 +450,43 @@ def __init__( "OPENAI_BASE_URL not set. Please set environment variable or pass base_url parameter." 
) + def count_tokens(self, messages_or_text: Any) -> Optional[int]: + if self._should_use_glm_tokenizer(): + value = self._count_tokens_with_glm_tokenizer(messages_or_text) + if isinstance(value, int) and value >= 0: + return value + return super().count_tokens(messages_or_text) + + def _should_use_glm_tokenizer(self) -> bool: + metadata = dict(getattr(self, "qitos_harness_metadata", {}) or {}) + if str(metadata.get("family_preset") or "").strip().lower() == "glm": + return True + return _is_glm_model_name(self.model) + + def _count_tokens_with_glm_tokenizer(self, payload: Any) -> Optional[int]: + path = _glm_tokenizer_path() + if not path: + return None + try: + tokenizer = _load_glm_tokenizer(path) + except Exception: + return None + + try: + if isinstance(payload, list): + messages = _normalize_messages_for_tokenizer(payload) + encoded = tokenizer.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=False, + ) + return _tokenizer_count_result(encoded) + text = self._stringify_token_payload(payload) + encoded = tokenizer.encode(text, add_special_tokens=False) + return _tokenizer_count_result(encoded) + except Exception: + return None + def _call_api(self, messages: List[Dict[str, Any]], **kwargs: Any) -> str: """ Call OpenAI compatible API diff --git a/scripts/cybergym_run_report.py b/scripts/cybergym_run_report.py new file mode 100755 index 0000000..7c30e9d --- /dev/null +++ b/scripts/cybergym_run_report.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +from collections import Counter +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any, Sequence + + +SUCCESS_REASON = "success" + + +@dataclass(frozen=True) +class TaskResult: + task_id: str + stop_reason: str + trace_id: str + manifest_path: Path + steps: int | None = None + latency_seconds: float | None = None + token_usage: int | None = None + 
final_result: str | None = None + + @property + def success(self) -> bool: + return self.stop_reason == SUCCESS_REASON + + +@dataclass(frozen=True) +class RunReport: + name: str + path: Path + tasks: dict[str, TaskResult] + manifest_count: int + + @property + def total(self) -> int: + return len(self.tasks) + + @property + def success_count(self) -> int: + return sum(1 for result in self.tasks.values() if result.success) + + @property + def success_rate(self) -> float: + return self.success_count / self.total if self.total else 0.0 + + @property + def stop_reasons(self) -> dict[str, int]: + return dict(Counter(result.stop_reason for result in self.tasks.values())) + + +def _nested(mapping: dict[str, Any], *keys: str) -> Any: + current: Any = mapping + for key in keys: + if not isinstance(current, dict): + return None + current = current.get(key) + return current + + +def _task_id_from_manifest(path: Path, obj: dict[str, Any]) -> str: + for value in ( + _nested(obj, "summary", "task_meta", "task_id"), + _nested(obj, "summary", "task_result", "task_id"), + _nested(obj, "experiment_spec", "benchmark_metadata", "task_id"), + ): + if value: + return str(value) + name = path.parent.name + marker = "_arvo_" + if marker in name: + return "arvo:" + name.split(marker, 1)[1].split("_", 1)[0] + return "" + + +def _stop_reason_from_manifest(obj: dict[str, Any]) -> str: + summary = obj.get("summary") if isinstance(obj.get("summary"), dict) else {} + task_result = summary.get("task_result") if isinstance(summary.get("task_result"), dict) else {} + if task_result.get("success") is True: + return SUCCESS_REASON + for value in ( + task_result.get("stop_reason"), + summary.get("stop_reason"), + obj.get("status"), + ): + if value: + return str(value) + return "unknown" + + +def _optional_int(value: Any) -> int | None: + if value is None or value == "": + return None + try: + return int(value) + except (TypeError, ValueError): + return None + + +def _optional_float(value: Any) -> float 
| None: + if value is None or value == "": + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _load_task_result(manifest_path: Path) -> TaskResult | None: + try: + obj = json.loads(manifest_path.read_text(encoding="utf-8")) + except Exception: + return None + if not isinstance(obj, dict): + return None + summary = obj.get("summary") if isinstance(obj.get("summary"), dict) else {} + task_id = _task_id_from_manifest(manifest_path, obj) + if not task_id: + return None + token_usage = _optional_int(summary.get("token_usage")) + if token_usage is None: + token_usage = _optional_int(_nested(summary, "context", "tokens_total")) + return TaskResult( + task_id=task_id, + stop_reason=_stop_reason_from_manifest(obj), + trace_id=manifest_path.parent.name, + manifest_path=manifest_path, + steps=_optional_int(summary.get("steps")), + latency_seconds=_optional_float(summary.get("latency_seconds")), + token_usage=token_usage, + final_result=str(summary.get("final_result")) if summary.get("final_result") else None, + ) + + +def _is_better_final_result(candidate: TaskResult, current: TaskResult) -> bool: + if candidate.success != current.success: + return candidate.success + return candidate.manifest_path.stat().st_mtime >= current.manifest_path.stat().st_mtime + + +def collect_run_report(run_folder: Path | str) -> RunReport: + root = Path(run_folder).expanduser().resolve() + traces = root / "traces" + tasks: dict[str, TaskResult] = {} + manifest_count = 0 + if traces.is_dir(): + for manifest_path in sorted(traces.glob("*/manifest.json")): + manifest_count += 1 + result = _load_task_result(manifest_path) + if result is None: + continue + current = tasks.get(result.task_id) + if current is None or _is_better_final_result(result, current): + tasks[result.task_id] = result + return RunReport(name=root.name, path=root, tasks=tasks, manifest_count=manifest_count) + + +def discover_run_folders(runs_root: Path | str) -> list[Path]: + root = 
Path(runs_root).expanduser() + if not root.is_dir(): + return [] + return sorted( + path + for path in root.iterdir() + if path.is_dir() and any((path / "traces").glob("*/manifest.json")) + ) + + +def _load_task_order(task_file: str | None) -> list[str]: + if not task_file: + return [] + path = Path(task_file).expanduser() + return [line.strip() for line in path.read_text(encoding="utf-8").splitlines() if line.strip()] + + +def _all_task_ids(reports: Sequence[RunReport], task_order: Sequence[str]) -> list[str]: + seen: set[str] = set() + ordered: list[str] = [] + for task_id in task_order: + if task_id not in seen: + ordered.append(task_id) + seen.add(task_id) + for report in reports: + for task_id in sorted(report.tasks): + if task_id not in seen: + ordered.append(task_id) + seen.add(task_id) + return ordered + + +def _format_seconds(value: float | None) -> str: + if value is None: + return "" + return f"{value:.1f}" + + +def _format_int(value: int | None) -> str: + return "" if value is None else str(value) + + +def write_markdown_report( + reports: Sequence[RunReport], + *, + output_path: Path, + task_order: Sequence[str] = (), +) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + lines: list[str] = [ + "# CyberGym Run Report", + "", + f"- Generated: `{datetime.now().isoformat(timespec='seconds')}`", + f"- Runs: `{len(reports)}`", + "", + "## Summary", + "", + "| run | success | total | rate | manifests | stop reasons |", + "| --- | ---: | ---: | ---: | ---: | --- |", + ] + for report in reports: + reasons = ", ".join( + f"{reason}:{count}" for reason, count in sorted(report.stop_reasons.items()) + ) + lines.append( + f"| `{report.name}` | {report.success_count} | {report.total} | " + f"{report.success_rate * 100:.2f}% | {report.manifest_count} | {reasons} |" + ) + + all_tasks = _all_task_ids(reports, task_order) + if reports and all_tasks: + lines.extend( + [ + "", + "## Task Matrix", + "", + "Legend: `S` success, `-` missing, otherwise 
stop_reason.", + "", + "| task_id | " + " | ".join(f"`{report.name}`" for report in reports) + " |", + "| --- | " + " | ".join("---" for _ in reports) + " |", + ] + ) + for task_id in all_tasks: + cells = [] + for report in reports: + result = report.tasks.get(task_id) + if result is None: + cells.append("-") + elif result.success: + cells.append("S") + else: + cells.append(result.stop_reason) + lines.append(f"| `{task_id}` | " + " | ".join(cells) + " |") + + lines.extend(["", "## Per-Run Details", ""]) + for report in reports: + lines.extend( + [ + f"### {report.name}", + "", + "| task_id | stop_reason | steps | latency_s | tokens | final_result | trace |", + "| --- | --- | ---: | ---: | ---: | --- | --- |", + ] + ) + for task_id in _all_task_ids([report], task_order): + result = report.tasks.get(task_id) + if result is None: + lines.append(f"| `{task_id}` | missing | | | | | |") + continue + lines.append( + f"| `{task_id}` | {result.stop_reason} | {_format_int(result.steps)} | " + f"{_format_seconds(result.latency_seconds)} | {_format_int(result.token_usage)} | " + f"{result.final_result or ''} | `{result.trace_id}` |" + ) + lines.append("") + + output_path.write_text("\n".join(lines), encoding="utf-8") + + +def write_task_csv( + reports: Sequence[RunReport], + *, + output_path: Path, + task_order: Sequence[str] = (), +) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + all_tasks = _all_task_ids(reports, task_order) + with output_path.open("w", encoding="utf-8", newline="") as handle: + writer = csv.writer(handle) + writer.writerow( + [ + "task_id", + "run", + "stop_reason", + "success", + "steps", + "latency_seconds", + "token_usage", + "final_result", + "trace_id", + "manifest_path", + ] + ) + for task_id in all_tasks: + for report in reports: + result = report.tasks.get(task_id) + if result is None: + writer.writerow([task_id, report.name, "missing", "false", "", "", "", "", "", ""]) + continue + writer.writerow( + [ + task_id, + report.name, 
+ result.stop_reason, + str(result.success).lower(), + result.steps or "", + result.latency_seconds or "", + result.token_usage or "", + result.final_result or "", + result.trace_id, + str(result.manifest_path), + ] + ) + + +def _default_output_path(runs_root: Path) -> Path: + stamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return runs_root / "reports" / f"cybergym_run_report_{stamp}.md" + + +def main(argv: Sequence[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="Create a single comparison report for CyberGym run folders." + ) + parser.add_argument( + "run_folders", + nargs="*", + help="Run folders to compare. If omitted, scan --runs-root for folders with traces.", + ) + parser.add_argument( + "--runs-root", + default="runs/cybergym", + help="Parent folder used when run_folders are omitted and for the default output path.", + ) + parser.add_argument( + "-o", + "--output", + help="Markdown report path. Defaults to runs/cybergym/reports/cybergym_run_report_.md.", + ) + parser.add_argument( + "--csv", + dest="csv_path", + help="Optional task-level CSV output path.", + ) + parser.add_argument( + "--task-file", + help="Optional task list used to order rows and show missing tasks.", + ) + args = parser.parse_args(argv) + + runs_root = Path(args.runs_root).expanduser() + run_folders = [Path(path).expanduser() for path in args.run_folders] + if not run_folders: + run_folders = discover_run_folders(runs_root) + reports = [collect_run_report(path) for path in run_folders] + reports = [report for report in reports if report.manifest_count > 0] + if not reports: + parser.error("no run folders with traces/*/manifest.json found") + + task_order = _load_task_order(args.task_file) + output_path = Path(args.output).expanduser() if args.output else _default_output_path(runs_root) + write_markdown_report(reports, output_path=output_path, task_order=task_order) + print(f"Wrote markdown report: {output_path}") + if args.csv_path: + csv_path = 
Path(args.csv_path).expanduser() + write_task_csv(reports, output_path=csv_path, task_order=task_order) + print(f"Wrote task CSV: {csv_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/cybergym_success_rate.py b/scripts/cybergym_success_rate.py new file mode 100755 index 0000000..83848cf --- /dev/null +++ b/scripts/cybergym_success_rate.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +from collections import Counter +from dataclasses import dataclass +from pathlib import Path +from typing import Sequence + + +@dataclass(frozen=True) +class SuccessRateStats: + total: int + success: int + stop_reasons: dict[str, int] + + @property + def rate(self) -> float: + if self.total == 0: + return 0.0 + return self.success / self.total + + +def _manifest_paths(run_folder: Path) -> list[Path]: + traces_dir = run_folder / "traces" + if not traces_dir.is_dir(): + return [] + return sorted(traces_dir.glob("*/manifest.json")) + + +def _load_stop_reason(manifest_path: Path) -> str: + data = json.loads(manifest_path.read_text(encoding="utf-8")) + summary = data.get("summary") + if not isinstance(summary, dict): + return "missing" + stop_reason = summary.get("stop_reason") + if not isinstance(stop_reason, str) or not stop_reason: + return "missing" + return stop_reason + + +def collect_success_rate(run_folder: Path | str) -> SuccessRateStats: + root = Path(run_folder).expanduser() + stop_reasons: Counter[str] = Counter() + for manifest_path in _manifest_paths(root): + stop_reasons[_load_stop_reason(manifest_path)] += 1 + total = sum(stop_reasons.values()) + return SuccessRateStats( + total=total, + success=stop_reasons.get("success", 0), + stop_reasons=dict(stop_reasons), + ) + + +def _format_stats(stats: SuccessRateStats) -> str: + lines = [ + f"success: {stats.success}/{stats.total} ({stats.rate * 100:.2f}%)", + "stop_reason distribution:", + ] + for reason, count in 
sorted(stats.stop_reasons.items()): + lines.append(f" {reason}: {count}") + return "\n".join(lines) + + +def main(argv: Sequence[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="Count success stop_reason ratio under a CyberGym run folder." + ) + parser.add_argument( + "run_folder", + help="CyberGym run folder, e.g. runs/cybergym/batch100_conc4_v1", + ) + args = parser.parse_args(argv) + + stats = collect_success_rate(args.run_folder) + print(_format_stats(stats)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_batch100_sampled_conc4.sh b/scripts/run_batch100_sampled_conc4.sh index 73fe486..c995202 100755 --- a/scripts/run_batch100_sampled_conc4.sh +++ b/scripts/run_batch100_sampled_conc4.sh @@ -2,7 +2,8 @@ set -euo pipefail source /tmp/cg_smoke_env.sh -export PYTHONPATH=/data/pxd-team/workspace-149/zwq/qitos-cybergym +export CYBERGYM_SOURCE_ROOT=/data/pxd-team/workspace-149/zwq/cybergym +export PYTHONPATH=/data/pxd-team/workspace-149/zwq/qitos-cybergym:/data/pxd-team/workspace-149/zwq/cybergym/src cd /data/pxd-team/workspace-149/zwq/qitos-cybergym bash /data/pxd-team/workspace-149/zwq/cybergym_agent-fresh/scripts/sync_to_qitos.sh diff --git a/scripts/run_batch100_sampled_conc4_v7.sh b/scripts/run_batch100_sampled_conc4_v7.sh new file mode 100755 index 0000000..c7715e6 --- /dev/null +++ b/scripts/run_batch100_sampled_conc4_v7.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +set -euo pipefail + +source /tmp/cg_smoke_env.sh +export CYBERGYM_SOURCE_ROOT=/data/pxd-team/workspace-149/zwq/cybergym +export PYTHONPATH=/data/pxd-team/workspace-149/zwq/qitos-cybergym:/data/pxd-team/workspace-149/zwq/cybergym/src + +cd /data/pxd-team/workspace-149/zwq/qitos-cybergym +bash /data/pxd-team/workspace-149/zwq/cybergym_agent-fresh/scripts/sync_to_qitos.sh + +/data3t/conda_envs/cybergym/bin/python -u scripts/run_cybergym_batch.py \ + --data-dir /data/pxd-team/workspace-149/zwq/cybergym/cybergym_data/data \ + 
--out-root /data/pxd-team/workspace-149/zwq/qitos-cybergym/runs/cybergym/batch100_sampled_conc4_v7 \ + --server http://127.0.0.1:8722 \ + --difficulty level1 \ + --model-name GLM-5.1 \ + --base-url "${OPENAI_BASE_URL}" \ + --api-key "${CYBERGYM_CLAUDE_AUTH_TOKEN}" \ + --task-file /data/pxd-team/workspace-149/zwq/qitos-cybergym/runs/cybergym/trace100_multiagent_20260421_110342/tasks.txt \ + --limit 100 \ + --concurrency 4 \ + --max-steps 1000000 \ + --max-runtime-seconds 7200 \ + --trace-prefix qitos_cybergym_batch100sampled \ + 2>&1 | tee /data/pxd-team/workspace-149/zwq/qitos-cybergym/runs/cybergym/batch100_sampled_conc4_v7/run.log diff --git a/scripts/run_batch100_sampled_conc4_v8.sh b/scripts/run_batch100_sampled_conc4_v8.sh new file mode 100755 index 0000000..f88bc45 --- /dev/null +++ b/scripts/run_batch100_sampled_conc4_v8.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +set -euo pipefail + +source /tmp/cg_smoke_env.sh +export CYBERGYM_SOURCE_ROOT=/data/pxd-team/workspace-149/zwq/cybergym +export PYTHONPATH=/data/pxd-team/workspace-149/zwq/qitos-cybergym:/data/pxd-team/workspace-149/zwq/cybergym/src + +cd /data/pxd-team/workspace-149/zwq/qitos-cybergym +bash /data/pxd-team/workspace-149/zwq/cybergym_agent-fresh/scripts/sync_to_qitos.sh + +/data3t/conda_envs/cybergym/bin/python -u scripts/run_cybergym_batch.py \ + --data-dir /data/pxd-team/workspace-149/zwq/cybergym/cybergym_data/data \ + --out-root /data/pxd-team/workspace-149/zwq/qitos-cybergym/runs/cybergym/batch100_sampled_conc4_v8 \ + --server http://127.0.0.1:8723 \ + --difficulty level1 \ + --model-name GLM-5.1 \ + --base-url "${OPENAI_BASE_URL}" \ + --api-key "${CYBERGYM_CLAUDE_AUTH_TOKEN}" \ + --task-file /data/pxd-team/workspace-149/zwq/qitos-cybergym/runs/cybergym/trace100_multiagent_20260421_110342/tasks.txt \ + --limit 100 \ + --concurrency 4 \ + --max-steps 1000000 \ + --max-runtime-seconds 7200 \ + --trace-prefix qitos_cybergym_batch100sampled \ + 2>&1 | tee 
/data/pxd-team/workspace-149/zwq/qitos-cybergym/runs/cybergym/batch100_sampled_conc4_v8/run.log diff --git a/scripts/run_batch100_strategy_memory_tmux.sh b/scripts/run_batch100_strategy_memory_tmux.sh new file mode 100755 index 0000000..9f0513c --- /dev/null +++ b/scripts/run_batch100_strategy_memory_tmux.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="${ROOT:-/data/pxd-team/workspace-149/zwq/qitos-cybergym}" + +export RUN_NAME="${RUN_NAME:-batch100_sampled_conc2_v20_strategy_memory_full100}" +export RUN_ROOT="${RUN_ROOT:-${ROOT}/runs/cybergym/${RUN_NAME}}" +export TASK_FILE="${TASK_FILE:-${ROOT}/runs/cybergym/trace100_multiagent_20260421_110342/tasks.txt}" +export TASKS_PATH="${TASKS_PATH:-${RUN_ROOT}/tasks.txt}" +export TMUX_SESSION="${TMUX_SESSION:-zwq-5}" +export TMUX_WINDOW_PREFIX="${TMUX_WINDOW_PREFIX:-cg-stratmem-v20}" +export CYBERGYM_SERVER_PORT="${CYBERGYM_SERVER_PORT:-8727}" +export CONCURRENCY="${CONCURRENCY:-2}" +export MAX_RUNTIME_SECONDS="${MAX_RUNTIME_SECONDS:-3600}" +export MAX_STEPS="${MAX_STEPS:-1000000}" +export TRACE_PREFIX="${TRACE_PREFIX:-qitos_cybergym_strategy_memory_full100}" + +exec "${ROOT}/scripts/run_failed_maxtok32k_tmux.sh" "${@:-}" diff --git a/scripts/run_failed_maxtok32k_tmux.sh b/scripts/run_failed_maxtok32k_tmux.sh new file mode 100755 index 0000000..13b857f --- /dev/null +++ b/scripts/run_failed_maxtok32k_tmux.sh @@ -0,0 +1,274 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="${ROOT:-/data/pxd-team/workspace-149/zwq/qitos-cybergym}" +AGENT_ROOT="${AGENT_ROOT:-/data/pxd-team/workspace-149/zwq/cybergym_agent-fresh}" +CYBERGYM_ROOT="${CYBERGYM_ROOT:-/data/pxd-team/workspace-149/zwq/cybergym}" +PYTHON_BIN="${PYTHON_BIN:-/data3t/conda_envs/cybergym/bin/python}" + +RUN_NAME="${RUN_NAME:-batch100_sampled_conc2_v11_maxtok32k_compact60_t3600_api360_failed}" +RUN_ROOT="${RUN_ROOT:-${ROOT}/runs/cybergym/${RUN_NAME}}" +TASKS_PATH="${TASKS_PATH:-${RUN_ROOT}/tasks.txt}" + +TMUX_SESSION="${TMUX_SESSION:-zwq-5}" 
+TMUX_WINDOW_PREFIX="${TMUX_WINDOW_PREFIX:-cg-maxtok32k-v11}" + +CYBERGYM_SERVER_HOST="${CYBERGYM_SERVER_HOST:-127.0.0.1}" +CYBERGYM_SERVER_PORT="${CYBERGYM_SERVER_PORT:-8726}" +SERVER_URL="${SERVER_URL:-http://${CYBERGYM_SERVER_HOST}:${CYBERGYM_SERVER_PORT}}" + +MODEL_NAME="${MODEL_NAME:-GLM-5.1}" +DIFFICULTY="${DIFFICULTY:-level1}" +CONCURRENCY="${CONCURRENCY:-2}" +MAX_STEPS="${MAX_STEPS:-1000000}" +MAX_RUNTIME_SECONDS="${MAX_RUNTIME_SECONDS:-3600}" +TRACE_PREFIX="${TRACE_PREFIX:-qitos_cybergym_maxtok32k_compact60_t3600_api360_failed}" +DEFAULT_GLM_TOKENIZER_PATH="${DEFAULT_GLM_TOKENIZER_PATH:-/data/pxd-team/workspace-149/zwq/glm-5.1-fp8-tokenizer}" + +DEFAULT_PREV_RUNS=( + "${ROOT}/runs/cybergym/batch100_sampled_conc4_v7" + "${ROOT}/runs/cybergym/batch100_sampled_conc4_v8" + "${ROOT}/runs/cybergym/batch100_sampled_conc2_v10_maxtok32k_compact60_failed" +) + +if [[ -n "${PREV_RUNS:-}" ]]; then + # Space-separated run roots, for example: + # PREV_RUNS="runs/cybergym/a runs/cybergym/b" ./scripts/run_failed_maxtok32k_tmux.sh + read -r -a PREV_RUN_ROOTS <<< "${PREV_RUNS}" +else + PREV_RUN_ROOTS=("${DEFAULT_PREV_RUNS[@]}") +fi + +log() { + printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" +} + +load_model_env() { + if [[ -f /tmp/cg_smoke_env.sh ]]; then + # shellcheck source=/dev/null + source /tmp/cg_smoke_env.sh + fi + + local secrets_file="${SECRETS_FILE:-${ROOT}/runs/cybergym/runtime1h_p6_iter3/run_batch_p6.sh}" + if [[ (-z "${CYBERGYM_CLAUDE_AUTH_TOKEN:-}" || -z "${CYBERGYM_API_KEY:-}" || -z "${OPENAI_BASE_URL:-}") && -f "${secrets_file}" ]]; then + local exports + exports="$("${PYTHON_BIN}" - "${secrets_file}" <<'PY' +from __future__ import annotations + +import re +import shlex +import sys +from pathlib import Path + +text = Path(sys.argv[1]).read_text() +names = ("CYBERGYM_CLAUDE_AUTH_TOKEN", "CYBERGYM_API_KEY", "OPENAI_BASE_URL", "GLM_BASE_URL") +for name in names: + pattern = rf"(?:export\s+)?{name}=([\"']?)(.*?)\1(?:\n|$)" + match = re.search(pattern, 
text) + if match: + print(f"export {name}={shlex.quote(match.group(2))}") +PY +)" + eval "${exports}" + fi + + export OPENAI_BASE_URL="${OPENAI_BASE_URL:-${GLM_BASE_URL:-https://glm-zwq.openapi-qb-ai.sii.edu.cn/v1}}" + export CYBERGYM_CLAUDE_AUTH_TOKEN="${CYBERGYM_CLAUDE_AUTH_TOKEN:-${OPENAI_API_KEY:-}}" + if [[ -z "${QITOS_GLM_TOKENIZER_PATH:-}" && -d "${DEFAULT_GLM_TOKENIZER_PATH}" ]]; then + export QITOS_GLM_TOKENIZER_PATH="${DEFAULT_GLM_TOKENIZER_PATH}" + fi + + if [[ -z "${CYBERGYM_CLAUDE_AUTH_TOKEN:-}" ]]; then + echo "CYBERGYM_CLAUDE_AUTH_TOKEN is required for model calls." >&2 + exit 1 + fi +} + +write_task_file() { + mkdir -p "${RUN_ROOT}" + + if [[ -n "${TASK_IDS:-}" ]]; then + printf '%s\n' ${TASK_IDS} > "${TASKS_PATH}" + elif [[ -n "${TASK_FILE:-}" ]]; then + cp "${TASK_FILE}" "${TASKS_PATH}" + else + "${PYTHON_BIN}" - "${TASKS_PATH}" "${PREV_RUN_ROOTS[@]}" <<'PY' +from __future__ import annotations + +import json +import sys +from pathlib import Path + +out_path = Path(sys.argv[1]) +run_roots = [Path(arg) for arg in sys.argv[2:]] + + +def nested(mapping: dict, *keys: str): + cur = mapping + for key in keys: + if not isinstance(cur, dict): + return None + cur = cur.get(key) + return cur + + +def task_from_manifest(path: Path, obj: dict) -> str: + summary = obj.get("summary") or {} + for value in ( + nested(summary, "task_meta", "task_id"), + nested(summary, "task_result", "task_id"), + nested(obj, "experiment_spec", "benchmark_metadata", "task_id"), + ): + if value: + return str(value) + marker = "_arvo_" + name = path.parent.name + if marker in name: + return "arvo:" + name.split(marker, 1)[1].split("_", 1)[0] + return "" + + +def stop_from_manifest(obj: dict) -> str: + summary = obj.get("summary") or {} + task_result = summary.get("task_result") if isinstance(summary.get("task_result"), dict) else {} + if task_result.get("success") is True: + return "success" + return str(task_result.get("stop_reason") or summary.get("stop_reason") or 
obj.get("status") or "unknown") + + +status_by_task: dict[str, str] = {} +ordered_tasks: list[str] = [] + +for root in run_roots: + traces = root / "traces" + if not traces.exists(): + continue + for manifest_path in sorted(traces.glob("*/manifest.json")): + try: + obj = json.loads(manifest_path.read_text()) + except Exception: + continue + task_id = task_from_manifest(manifest_path, obj) + if not task_id: + continue + if task_id not in status_by_task: + ordered_tasks.append(task_id) + status_by_task[task_id] = "unknown" + stop_reason = stop_from_manifest(obj) + if stop_reason == "success": + status_by_task[task_id] = "success" + elif status_by_task[task_id] != "success": + status_by_task[task_id] = stop_reason + +unresolved = [task for task in ordered_tasks if status_by_task.get(task) != "success"] +out_path.write_text("".join(f"{task}\n" for task in unresolved)) +print(f"Wrote {len(unresolved)} unresolved tasks to {out_path}") +for task in unresolved: + print(f"{task} {status_by_task[task]}") +PY + fi + + local task_count + task_count="$(grep -cve '^[[:space:]]*$' "${TASKS_PATH}" || true)" + if [[ "${task_count}" -eq 0 ]]; then + echo "No tasks to run. ${TASKS_PATH} is empty." >&2 + exit 1 + fi + log "TASKS=${TASKS_PATH} COUNT=${task_count}" +} + +run_server() { + mkdir -p "${RUN_ROOT}/server_poc" + export CYBERGYM_SOURCE_ROOT="${CYBERGYM_ROOT}" + export PYTHONPATH="${CYBERGYM_ROOT}/src:${PYTHONPATH:-}" + + log "Starting CyberGym server on ${CYBERGYM_SERVER_HOST}:${CYBERGYM_SERVER_PORT}" + exec "${PYTHON_BIN}" -m cybergym.server \ + --host "${CYBERGYM_SERVER_HOST}" \ + --port "${CYBERGYM_SERVER_PORT}" \ + --log_dir "${RUN_ROOT}/server_poc" \ + --db_path "${RUN_ROOT}/server_poc/poc.db" +} + +run_batch() { + load_model_env + if [[ ! 
-s "${TASKS_PATH}" ]]; then + write_task_file + fi + + export CYBERGYM_SOURCE_ROOT="${CYBERGYM_ROOT}" + export PYTHONPATH="${ROOT}:${CYBERGYM_ROOT}/src:${PYTHONPATH:-}" + + cd "${ROOT}" + log "Syncing ${AGENT_ROOT} into QitOS bundled CyberGym agent" + bash "${AGENT_ROOT}/scripts/sync_to_qitos.sh" + + log "Running ${MODEL_NAME} on ${TASKS_PATH} via ${SERVER_URL}" + exec "${PYTHON_BIN}" -u scripts/run_cybergym_batch.py \ + --data-dir "${CYBERGYM_ROOT}/cybergym_data/data" \ + --out-root "${RUN_ROOT}" \ + --server "${SERVER_URL}" \ + --difficulty "${DIFFICULTY}" \ + --model-name "${MODEL_NAME}" \ + --base-url "${OPENAI_BASE_URL}" \ + --api-key "${CYBERGYM_CLAUDE_AUTH_TOKEN}" \ + --task-file "${TASKS_PATH}" \ + --limit 0 \ + --concurrency "${CONCURRENCY}" \ + --max-steps "${MAX_STEPS}" \ + --max-runtime-seconds "${MAX_RUNTIME_SECONDS}" \ + --trace-prefix "${TRACE_PREFIX}" \ + --resume +} + +launch_tmux() { + write_task_file + mkdir -p "${RUN_ROOT}" + + if ! tmux has-session -t "${TMUX_SESSION}" 2>/dev/null; then + echo "tmux session ${TMUX_SESSION} does not exist." 
>&2 + exit 1 + fi + + local server_window="${TMUX_WINDOW_PREFIX}-server" + local run_window="${TMUX_WINDOW_PREFIX}-run" + if tmux list-windows -t "${TMUX_SESSION}" -F '#W' | grep -qx "${server_window}"; then + echo "tmux window already exists: ${server_window}" >&2 + exit 1 + fi + if tmux list-windows -t "${TMUX_SESSION}" -F '#W' | grep -qx "${run_window}"; then + echo "tmux window already exists: ${run_window}" >&2 + exit 1 + fi + + local env_prefix + env_prefix="ROOT=${ROOT} AGENT_ROOT=${AGENT_ROOT} CYBERGYM_ROOT=${CYBERGYM_ROOT} PYTHON_BIN=${PYTHON_BIN} RUN_NAME=${RUN_NAME} RUN_ROOT=${RUN_ROOT} TASKS_PATH=${TASKS_PATH} CYBERGYM_SERVER_HOST=${CYBERGYM_SERVER_HOST} CYBERGYM_SERVER_PORT=${CYBERGYM_SERVER_PORT} SERVER_URL=${SERVER_URL} MODEL_NAME=${MODEL_NAME} DIFFICULTY=${DIFFICULTY} CONCURRENCY=${CONCURRENCY} MAX_STEPS=${MAX_STEPS} MAX_RUNTIME_SECONDS=${MAX_RUNTIME_SECONDS} TRACE_PREFIX=${TRACE_PREFIX} DEFAULT_GLM_TOKENIZER_PATH=${DEFAULT_GLM_TOKENIZER_PATH} QITOS_GLM_TOKENIZER_PATH=${QITOS_GLM_TOKENIZER_PATH:-}" + + tmux new-window -t "${TMUX_SESSION}" -n "${server_window}" \ + "cd ${ROOT} && ${env_prefix} bash scripts/run_failed_maxtok32k_tmux.sh --server 2>&1 | tee ${RUN_ROOT}/server.log" + sleep 5 + tmux new-window -t "${TMUX_SESSION}" -n "${run_window}" \ + "cd ${ROOT} && ${env_prefix} bash scripts/run_failed_maxtok32k_tmux.sh --run 2>&1 | tee ${RUN_ROOT}/run.log" + + log "Launched tmux windows: ${TMUX_SESSION}:${server_window}, ${TMUX_SESSION}:${run_window}" + log "Run root: ${RUN_ROOT}" +} + +case "${1:---launch}" in + --server) + run_server + ;; + --run) + run_batch + ;; + --prepare) + write_task_file + ;; + --launch) + launch_tmux + ;; + *) + echo "Usage: $0 [--launch|--prepare|--server|--run]" >&2 + exit 2 + ;; +esac diff --git a/scripts/start_batch100_sampled_conc4_v7_server.sh b/scripts/start_batch100_sampled_conc4_v7_server.sh new file mode 100755 index 0000000..9212d84 --- /dev/null +++ b/scripts/start_batch100_sampled_conc4_v7_server.sh @@ -0,0 
+1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +RUN_DIR="/data/pxd-team/workspace-149/zwq/qitos-cybergym/runs/cybergym/batch100_sampled_conc4_v7" +PORT="${CYBERGYM_SERVER_PORT:-8722}" +HOST="${CYBERGYM_SERVER_HOST:-127.0.0.1}" +LOG_DIR="${RUN_DIR}/server_poc" +DB_PATH="${LOG_DIR}/poc.db" +export CYBERGYM_SOURCE_ROOT=/data/pxd-team/workspace-149/zwq/cybergym +export PYTHONPATH=/data/pxd-team/workspace-149/zwq/cybergym/src:${PYTHONPATH:-} + +mkdir -p "${LOG_DIR}" + +echo "run_dir=${RUN_DIR}" +echo "host=${HOST}" +echo "port=${PORT}" +echo "log_dir=${LOG_DIR}" +echo "db_path=${DB_PATH}" + +exec /data3t/conda_envs/cybergym/bin/python -m cybergym.server \ + --host "${HOST}" \ + --port "${PORT}" \ + --log_dir "${LOG_DIR}" \ + --db_path "${DB_PATH}" diff --git a/scripts/start_batch100_sampled_conc4_v8_server.sh b/scripts/start_batch100_sampled_conc4_v8_server.sh new file mode 100755 index 0000000..f381abb --- /dev/null +++ b/scripts/start_batch100_sampled_conc4_v8_server.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -euo pipefail + +RUN_DIR="/data/pxd-team/workspace-149/zwq/qitos-cybergym/runs/cybergym/batch100_sampled_conc4_v8" +PORT="${CYBERGYM_SERVER_PORT:-8723}" +HOST="${CYBERGYM_SERVER_HOST:-127.0.0.1}" +LOG_DIR="${RUN_DIR}/server_poc" +DB_PATH="${LOG_DIR}/poc.db" + +export CYBERGYM_SOURCE_ROOT=/data/pxd-team/workspace-149/zwq/cybergym +export PYTHONPATH=/data/pxd-team/workspace-149/zwq/cybergym/src:${PYTHONPATH:-} + +mkdir -p "${LOG_DIR}" + +exec /data3t/conda_envs/cybergym/bin/python -m cybergym.server \ + --host "${HOST}" \ + --port "${PORT}" \ + --log_dir "${LOG_DIR}" \ + --db_path "${DB_PATH}" diff --git a/tests/test_advanced_tools_and_executor.py b/tests/test_advanced_tools_and_executor.py index 647820b..bdb37e1 100644 --- a/tests/test_advanced_tools_and_executor.py +++ b/tests/test_advanced_tools_and_executor.py @@ -1,9 +1,11 @@ from __future__ import annotations from dataclasses import dataclass +import threading +import time from qitos import Action, 
StateSchema, ToolPermissionContext, ToolPermissionRule, ToolRegistry -from qitos.core.action import ActionStatus +from qitos.core.action import ActionExecutionPolicy, ActionStatus from qitos.core.tool import BaseTool, ToolPermission, ToolSpec, ToolValidationResult from qitos.engine.action_executor import ActionExecutor from qitos.kit.tool.tools import advanced_coding_tools @@ -21,10 +23,10 @@ class _EchoTool(BaseTool): - def __init__(self): + def __init__(self, name: str = "echo_tool"): super().__init__( ToolSpec( - name="echo_tool", + name=name, description="demo tool", parameters={"value": {"type": "string"}}, required=["value"], @@ -44,11 +46,68 @@ def run(self, value: str, runtime_context=None): return {"result": value} +class _SleepReadTool(BaseTool): + def __init__(self, name: str = "sleep_read_tool", delay: float = 0.15): + self.delay = delay + self.starts: list[float] = [] + self._lock = threading.Lock() + super().__init__( + ToolSpec( + name=name, + description="sleepy read-only tool", + parameters={"value": {"type": "string"}}, + required=["value"], + permissions=ToolPermission(filesystem_read=True), + read_only=True, + concurrency_safe=True, + ) + ) + + def run(self, value: str, runtime_context=None): + _ = runtime_context + with self._lock: + self.starts.append(time.perf_counter()) + time.sleep(self.delay) + return {"value": value} + + +class _UnsafeSleepTool(BaseTool): + def __init__(self, name: str = "unsafe_sleep_tool", delay: float = 0.05): + self.delay = delay + self.starts: list[float] = [] + self._lock = threading.Lock() + super().__init__( + ToolSpec( + name=name, + description="sleepy non-concurrency-safe tool", + parameters={"value": {"type": "string"}}, + required=["value"], + permissions=ToolPermission(filesystem_read=True), + read_only=True, + concurrency_safe=False, + ) + ) + + def run(self, value: str, runtime_context=None): + _ = runtime_context + with self._lock: + self.starts.append(time.perf_counter()) + time.sleep(self.delay) + 
return {"value": value} + + @dataclass class _ExecutorState(StateSchema): pass +@dataclass +class _CandidateReadyState(StateSchema): + poc_path: str = "" + candidate_ready_for_submit: bool = False + workspace_root: str = "" + + def test_action_executor_applies_validation_permission_and_truncation(): registry = ToolRegistry().register(_EchoTool()) executor = ActionExecutor(registry) @@ -91,6 +150,98 @@ def test_action_executor_applies_validation_permission_and_truncation(): assert ask.output["status"] == "needs_user_input" +def test_action_executor_blocks_non_submit_tools_when_candidate_ready(tmp_path): + (tmp_path / "poc.bin").write_bytes(b"candidate") + registry = ToolRegistry().register(_EchoTool()).register(_EchoTool(name="submit_poc")) + executor = ActionExecutor(registry) + state = _CandidateReadyState( + task="demo", + workspace_root=str(tmp_path), + poc_path="poc.bin", + candidate_ready_for_submit=True, + ) + + blocked = executor.execute( + [Action(name="echo_tool", args={"value": "ignored"})], + state=state, + )[0] + allowed = executor.execute( + [Action(name="submit_poc", args={"value": "poc.bin"})], + state=state, + )[0] + + assert blocked.status == ActionStatus.ERROR + assert blocked.metadata["error_category"] == "candidate_submit_ready_guard" + assert "submit_poc" in blocked.output["message"] + assert allowed.status == ActionStatus.SUCCESS + + +def test_action_executor_allows_regeneration_when_ready_candidate_file_missing(tmp_path): + registry = ToolRegistry().register(_EchoTool()) + executor = ActionExecutor(registry) + state = _CandidateReadyState( + task="demo", + workspace_root=str(tmp_path), + poc_path="missing.bin", + candidate_ready_for_submit=True, + ) + + result = executor.execute( + [Action(name="echo_tool", args={"value": "regenerate"})], + state=state, + )[0] + + assert result.status == ActionStatus.SUCCESS + + +def test_action_executor_runs_concurrency_safe_read_only_tools_in_parallel(): + tool = _SleepReadTool() + registry = 
ToolRegistry().register(tool) + executor = ActionExecutor( + registry, + policy=ActionExecutionPolicy(mode="parallel", max_concurrency=4), + ) + + started = time.perf_counter() + results = executor.execute( + [ + Action(name="sleep_read_tool", args={"value": "a"}), + Action(name="sleep_read_tool", args={"value": "b"}), + Action(name="sleep_read_tool", args={"value": "c"}), + ] + ) + elapsed = time.perf_counter() - started + + assert [item.status for item in results] == [ActionStatus.SUCCESS] * 3 + assert elapsed < 0.35 + assert len(tool.starts) == 3 + assert max(tool.starts) - min(tool.starts) < 0.08 + + +def test_action_executor_keeps_non_concurrency_safe_tools_serial_even_in_parallel_mode(): + tool = _UnsafeSleepTool() + registry = ToolRegistry().register(tool) + executor = ActionExecutor( + registry, + policy=ActionExecutionPolicy(mode="parallel", max_concurrency=4), + ) + + started = time.perf_counter() + results = executor.execute( + [ + Action(name="unsafe_sleep_tool", args={"value": "a"}), + Action(name="unsafe_sleep_tool", args={"value": "b"}), + Action(name="unsafe_sleep_tool", args={"value": "c"}), + ] + ) + elapsed = time.perf_counter() - started + + assert [item.status for item in results] == [ActionStatus.SUCCESS] * 3 + assert elapsed >= 0.14 + assert len(tool.starts) == 3 + assert tool.starts[1] - tool.starts[0] >= 0.04 + + def test_run_command_executes_in_workspace(tmp_path): tool = RunCommand(workspace_root=str(tmp_path)) result = tool.run(command="pwd") diff --git a/tests/test_benchmark_cybergym_recipe.py b/tests/test_benchmark_cybergym_recipe.py index a109ef4..cf7615c 100644 --- a/tests/test_benchmark_cybergym_recipe.py +++ b/tests/test_benchmark_cybergym_recipe.py @@ -3,9 +3,15 @@ from pathlib import Path from types import SimpleNamespace from unittest import mock +import sys +import types from qitos.benchmark import normalize_benchmark_name, resolve_builtin_runner from qitos.benchmark.cybergym import CyberGymBenchmarkAdapter, make_trace_writer, 
task_slug +from qitos.benchmark.cybergym._imports import ( + ensure_cybergym_source_importable, + resolve_cybergym_source_root, +) import qitos.benchmark.cybergym.runner as cybergym_runner from qitos.recipes.benchmarks import cybergym @@ -126,6 +132,37 @@ def test_runner_uses_task_root_workspace_and_keeps_source_root_context(self): self.assertEqual(run_kwargs["source_root"], str(source_root)) self.assertEqual(run_kwargs["repo_dir"], str(source_root)) + def test_resolve_cybergym_source_root_prefers_workspace_sibling(self): + root = resolve_cybergym_source_root() + + self.assertEqual( + root, + Path("/data/pxd-team/workspace-149/zwq/cybergym").resolve(), + ) + + def test_ensure_cybergym_source_importable_prepends_src_and_evicts_stale_modules(self): + stale = types.ModuleType("cybergym") + stale.__file__ = "/home/pgroup/data3t/pgroup/zwq/cybergym/src/cybergym/__init__.py" + stale_sub = types.ModuleType("cybergym.task") + stale_sub.__file__ = "/home/pgroup/data3t/pgroup/zwq/cybergym/src/cybergym/task/__init__.py" + original_path = list(sys.path) + stale_path = "/home/pgroup/data3t/pgroup/zwq/cybergym/src" + + with mock.patch.dict( + sys.modules, + {"cybergym": stale, "cybergym.task": stale_sub}, + clear=False, + ): + with mock.patch.object(sys, "path", [stale_path, *original_path]): + root = ensure_cybergym_source_importable() + expected_src = str((root / "src").resolve()) + + self.assertEqual(root, Path("/data/pxd-team/workspace-149/zwq/cybergym").resolve()) + self.assertEqual(sys.path[0], expected_src) + self.assertNotIn(stale_path, sys.path) + self.assertNotIn("cybergym", sys.modules) + self.assertNotIn("cybergym.task", sys.modules) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_cybergym_context_retention.py b/tests/test_cybergym_context_retention.py index 7726789..3744101 100644 --- a/tests/test_cybergym_context_retention.py +++ b/tests/test_cybergym_context_retention.py @@ -135,8 +135,10 @@ def 
test_prompt_and_trace_payload_include_working_memory(tmp_path: Path) -> None "When working with tool results, write down any important information you might need later in your response" in system_prompt ) - assert "## Stable Task Facts" in system_prompt - assert "Working Directory (cwd)" in system_prompt + assert "## Stable Task Facts" not in system_prompt + assert "Working Directory (cwd)" not in system_prompt + assert "cybergym" not in system_prompt.lower() + assert "cybergym" not in observation.lower() assert "## Working Memory" not in observation assert "### Project Index" not in observation assert payload["durable_project_memory"] diff --git a/tests/test_cybergym_context_snip.py b/tests/test_cybergym_context_snip.py index 89d171e..99c59e3 100644 --- a/tests/test_cybergym_context_snip.py +++ b/tests/test_cybergym_context_snip.py @@ -3,13 +3,12 @@ from pathlib import Path from qitos.benchmark.cybergym.agent.context import SnipCompactor +from qitos.benchmark.cybergym.agent.state import CyberGymState from qitos.core.history import HistoryMessage -from qitos.core.state import StateSchema def test_snip_compactor_persists_old_tool_results_with_preview(tmp_path: Path) -> None: - state = StateSchema(task="demo") - state.metadata["trace_run_dir"] = str(tmp_path / "trace") + state = CyberGymState(task="demo", workspace_root=str(tmp_path)) older = "HEAD line\n" + ("A" * 600) + "\nTAIL line" recent = "recent tool output" @@ -23,13 +22,24 @@ def test_snip_compactor_persists_old_tool_results_with_preview(tmp_path: Path) - assert result[0].metadata.get("snipped") is True assert result[0].metadata.get("snip_saved_path") - assert "saved_path:" in str(result[0].content) + assert "[compact:start" in str(result[0].content) + assert "path=.agent/memory/project/tool_results/" in str(result[0].content) assert "preview_head:" in str(result[0].content) assert "preview_tail:" in str(result[0].content) + assert "[compact:end]" in str(result[0].content) - saved_path = 
Path(str(result[0].metadata["snip_saved_path"])) + saved_path = tmp_path / str(result[0].metadata["snip_saved_path"]) assert saved_path.exists() assert saved_path.read_text(encoding="utf-8") == older + index_path = tmp_path / ".agent" / "memory" / "project" / "INDEX.md" + assert "kind=tool_result" in index_path.read_text(encoding="utf-8") + SnipCompactor(keep_recent=1).snip(messages, state=state) + index_lines = [ + line + for line in index_path.read_text(encoding="utf-8").splitlines() + if "path=.agent/memory/project/tool_results/step-0001/tool-0000.txt" in line + ] + assert len(index_lines) == 1 assert result[2].content == recent assert result[2].metadata.get("snipped") is None diff --git a/tests/test_cybergym_parallel_tools_prompt.py b/tests/test_cybergym_parallel_tools_prompt.py new file mode 100644 index 0000000..01a8010 --- /dev/null +++ b/tests/test_cybergym_parallel_tools_prompt.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +import tempfile +from pathlib import Path +from types import SimpleNamespace +from unittest import mock + +from qitos.benchmark.cybergym.agent.agent import CyberGymAgent +from qitos.benchmark.cybergym.agent.state import CyberGymState + + +def test_allowed_tools_prompt_mentions_parallel_read_only_tools(): + with tempfile.TemporaryDirectory() as tmpdir: + llm = SimpleNamespace(model="stub") + workspace = Path(tmpdir) + with mock.patch("qitos.benchmark.cybergym.agent.agent.bootstrap_evidence_index", return_value=None): + agent = CyberGymAgent(llm=llm, workspace_root=str(workspace), task_root=str(workspace)) + + state = CyberGymState(task="demo", max_steps=10, workspace_root=str(workspace)) + lines = agent._allowed_tool_lines(state) + prompt = "\n".join(lines) + + assert "parallel" in prompt.lower() + assert "read-only" in prompt.lower() + assert "`READ(path, offset?, limit?)`" in prompt + assert "4" in prompt diff --git a/tests/test_cybergym_run_report.py b/tests/test_cybergym_run_report.py new file mode 100644 index 
0000000..8e8ed68 --- /dev/null +++ b/tests/test_cybergym_run_report.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import importlib.util +import json +import sys +from pathlib import Path + + +def _load_script_module(): + script_path = Path(__file__).resolve().parents[1] / "scripts" / "cybergym_run_report.py" + spec = importlib.util.spec_from_file_location("cybergym_run_report", script_path) + assert spec is not None + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +def _write_manifest( + run_root: Path, + trace_id: str, + task_id: str, + stop_reason: str, + *, + success: bool = False, + steps: int = 3, +) -> Path: + trace_dir = run_root / "traces" / trace_id + trace_dir.mkdir(parents=True) + path = trace_dir / "manifest.json" + path.write_text( + json.dumps( + { + "summary": { + "stop_reason": stop_reason, + "steps": steps, + "latency_seconds": 12.5, + "token_usage": 1234, + "final_result": "poc.bin" if success else None, + "task_meta": {"task_id": task_id}, + "task_result": {"task_id": task_id, "success": success}, + } + } + ), + encoding="utf-8", + ) + return path + + +def test_collect_run_report_uses_success_as_final_result(tmp_path: Path) -> None: + module = _load_script_module() + run_root = tmp_path / "run-a" + _write_manifest(run_root, "trace-old", "arvo:1", "budget_time") + _write_manifest(run_root, "trace-success", "arvo:1", "success", success=True, steps=5) + _write_manifest(run_root, "trace-miss", "arvo:2", "final") + + report = module.collect_run_report(run_root) + + assert report.name == "run-a" + assert report.manifest_count == 3 + assert report.total == 2 + assert report.success_count == 1 + assert report.tasks["arvo:1"].stop_reason == "success" + assert report.tasks["arvo:1"].steps == 5 + assert report.stop_reasons == {"success": 1, "final": 1} + + +def test_cli_writes_markdown_and_csv_for_multiple_runs(tmp_path: 
Path) -> None: + module = _load_script_module() + runs_root = tmp_path / "runs" + run_a = runs_root / "run-a" + run_b = runs_root / "run-b" + _write_manifest(run_a, "trace-a1", "arvo:1", "success", success=True) + _write_manifest(run_b, "trace-b1", "arvo:1", "budget_time") + _write_manifest(run_b, "trace-b2", "arvo:2", "success", success=True) + task_file = tmp_path / "tasks.txt" + task_file.write_text("arvo:1\narvo:2\narvo:3\n", encoding="utf-8") + md_path = tmp_path / "report.md" + csv_path = tmp_path / "report.csv" + + rc = module.main( + [ + str(run_a), + str(run_b), + "--task-file", + str(task_file), + "-o", + str(md_path), + "--csv", + str(csv_path), + ] + ) + + assert rc == 0 + md = md_path.read_text(encoding="utf-8") + assert "| `run-a` | 1 | 1 | 100.00%" in md + assert "| `run-b` | 1 | 2 | 50.00%" in md + assert "| `arvo:3` | - | - |" in md + csv_text = csv_path.read_text(encoding="utf-8") + assert "task_id,run,stop_reason,success" in csv_text + assert "arvo:2,run-a,missing,false" in csv_text diff --git a/tests/test_cybergym_success_rate_script.py b/tests/test_cybergym_success_rate_script.py new file mode 100644 index 0000000..7101ec6 --- /dev/null +++ b/tests/test_cybergym_success_rate_script.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import importlib.util +import json +import sys +from pathlib import Path + + +def _load_script_module(): + script_path = Path(__file__).resolve().parents[1] / "scripts" / "cybergym_success_rate.py" + spec = importlib.util.spec_from_file_location("cybergym_success_rate", script_path) + assert spec is not None + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +def _write_manifest(root: Path, run_id: str, stop_reason: str) -> None: + run_dir = root / "traces" / run_id + run_dir.mkdir(parents=True) + (run_dir / "manifest.json").write_text( + json.dumps({"summary": {"stop_reason": stop_reason}}), 
+ encoding="utf-8", + ) + + +def test_counts_success_rate_from_cybergym_run_folder(tmp_path: Path) -> None: + module = _load_script_module() + _write_manifest(tmp_path, "run-success-1", "success") + _write_manifest(tmp_path, "run-timeout", "budget_time") + _write_manifest(tmp_path, "run-success-2", "success") + + stats = module.collect_success_rate(tmp_path) + + assert stats.total == 3 + assert stats.success == 2 + assert stats.rate == 2 / 3 + assert stats.stop_reasons == {"success": 2, "budget_time": 1} + + +def test_cli_prints_summary_for_run_folder(tmp_path: Path, capsys) -> None: + module = _load_script_module() + _write_manifest(tmp_path, "run-success", "success") + _write_manifest(tmp_path, "run-failed", "final") + + rc = module.main([str(tmp_path)]) + + assert rc == 0 + output = capsys.readouterr().out + assert "success: 1/2 (50.00%)" in output + assert "success: 1" in output + assert "final: 1" in output diff --git a/tests/test_glm_tokenizer_count.py b/tests/test_glm_tokenizer_count.py new file mode 100644 index 0000000..f674e0c --- /dev/null +++ b/tests/test_glm_tokenizer_count.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from qitos.models.openai import OpenAICompatibleModel + + +class _FakeTokenizer: + def __init__(self): + self.last_messages = None + + def apply_chat_template(self, messages, tokenize=True, add_generation_prompt=False): + assert tokenize is True + assert add_generation_prompt is False + self.last_messages = list(messages) + return {"input_ids": list(range(37))} + + def encode(self, text, add_special_tokens=False): + assert add_special_tokens is False + return list(range(len(str(text).split()))) + + +def test_glm_openai_compatible_model_uses_local_glm_tokenizer(monkeypatch): + tokenizer = _FakeTokenizer() + monkeypatch.setattr("qitos.models.openai._glm_tokenizer_path", lambda: "/tmp/glm-tokenizer") + monkeypatch.setattr("qitos.models.openai._load_glm_tokenizer", lambda path: tokenizer) + + model = OpenAICompatibleModel( + 
model="GLM-5.1", + base_url="http://localhost/v1", + ) + + count = model.count_tokens( + [ + {"role": "system", "content": "sys"}, + {"role": "user", "content": "hello", "tool_calls": [{"name": "x"}]}, + ] + ) + + assert count == 37 + assert tokenizer.last_messages[1]["role"] == "user" + assert "tool_calls" in tokenizer.last_messages[1]["content"] + + +def test_non_glm_model_keeps_default_token_estimate(monkeypatch): + def _boom(_path): + raise AssertionError("tokenizer should not load for non-GLM models") + + monkeypatch.setattr("qitos.models.openai._glm_tokenizer_path", lambda: "/tmp/glm-tokenizer") + monkeypatch.setattr("qitos.models.openai._load_glm_tokenizer", _boom) + + model = OpenAICompatibleModel( + model="qwen-plus", + base_url="http://localhost/v1", + ) + + assert model.count_tokens("hello world") == 2 diff --git a/tests/test_predefined_atomic_tools.py b/tests/test_predefined_atomic_tools.py index 00251c9..81fb76e 100644 --- a/tests/test_predefined_atomic_tools.py +++ b/tests/test_predefined_atomic_tools.py @@ -55,7 +55,9 @@ def test_codebase_toolset_glob_grep_read_append(tmp_path): read_out = toolset.read_file_range(path="src/a.py", offset=1, limit=1) assert read_out["status"] == "success" - assert read_out["lines"][0]["line"] == 2 + assert read_out["offset"] == 1 + assert read_out["limit"] == 1 + assert "lines" not in read_out assert "return a + b" in read_out["content"] append_out = toolset.append_file(path="src/b.md", content="extra\n") diff --git a/tests/test_runtime_recovery.py b/tests/test_runtime_recovery.py new file mode 100644 index 0000000..0ae05bd --- /dev/null +++ b/tests/test_runtime_recovery.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from qitos.core.errors import ErrorCategory, classify_exception +from qitos.engine.recovery import RecoveryPolicy + + +def test_classify_exception_marks_stream_timeout_as_recoverable_model_error() -> None: + info = classify_exception(RuntimeError("stream timeout"), "DECIDE", 7) + + assert 
info.category == ErrorCategory.MODEL + assert info.recoverable is True + assert info.phase == "DECIDE" + assert info.step_id == 7 + + +def test_classify_exception_marks_timed_out_message_as_recoverable_model_error() -> None: + info = classify_exception(RuntimeError("request timed out while streaming"), "PROPOSE", 3) + + assert info.category == ErrorCategory.MODEL + assert info.recoverable is True + + +def test_recovery_policy_continues_on_stream_timeout() -> None: + decision = RecoveryPolicy().handle(state=None, phase="DECIDE", step_id=11, exc=RuntimeError("stream timeout")) + + assert decision.handled is True + assert decision.continue_run is True + assert decision.stop_reason is None