From 59cc9280152a53aef8e25597d5f1a6ac572f8e3d Mon Sep 17 00:00:00 2001
From: bmz-q-q <1049675766@qq.com>
Date: Wed, 15 Apr 2026 23:50:15 +0800
Subject: [PATCH] Add CyberGym benchmark integration and docs
---
docs/docs.json | 3 +-
docs/zh/benchmarks/cybergym.mdx | 189 ++++++++++++++++++++
docs/zh/benchmarks/overview.mdx | 3 +-
examples/benchmarks/cybergym_eval.py | 17 ++
qitos/benchmark/__init__.py | 12 ++
qitos/benchmark/cybergym/__init__.py | 43 +++++
qitos/benchmark/cybergym/adapter.py | 85 +++++++++
qitos/benchmark/cybergym/evaluator.py | 28 +++
qitos/benchmark/cybergym/runner.py | 221 ++++++++++++++++++++++++
qitos/benchmark/cybergym/runtime.py | 52 ++++++
qitos/benchmark/cybergym/scorer.py | 32 ++++
qitos/benchmark/runner.py | 42 +++++
qitos/recipes/benchmarks/cybergym.py | 98 +++++++++++
tests/test_benchmark_cybergym_recipe.py | 47 +++++
14 files changed, 870 insertions(+), 2 deletions(-)
create mode 100644 docs/zh/benchmarks/cybergym.mdx
create mode 100644 examples/benchmarks/cybergym_eval.py
create mode 100644 qitos/benchmark/cybergym/__init__.py
create mode 100644 qitos/benchmark/cybergym/adapter.py
create mode 100644 qitos/benchmark/cybergym/evaluator.py
create mode 100644 qitos/benchmark/cybergym/runner.py
create mode 100644 qitos/benchmark/cybergym/runtime.py
create mode 100644 qitos/benchmark/cybergym/scorer.py
create mode 100644 qitos/recipes/benchmarks/cybergym.py
create mode 100644 tests/test_benchmark_cybergym_recipe.py
diff --git a/docs/docs.json b/docs/docs.json
index 63ceb39..588f07a 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -202,7 +202,8 @@
"zh/benchmarks/osworld",
"zh/benchmarks/gaia",
"zh/benchmarks/tau-bench",
- "zh/benchmarks/cybench"
+ "zh/benchmarks/cybench",
+ "zh/benchmarks/cybergym"
]
},
{
diff --git a/docs/zh/benchmarks/cybergym.mdx b/docs/zh/benchmarks/cybergym.mdx
new file mode 100644
index 0000000..4b38510
--- /dev/null
+++ b/docs/zh/benchmarks/cybergym.mdx
@@ -0,0 +1,189 @@
+---
+title: "CyberGym"
+description: "使用 `qitos.benchmark.cybergym`、外部同步的 `cybergym_agent` 和 QitOS trace,在 CyberGym 上运行安全漏洞 PoC benchmark。"
+---
+
+CyberGym 是一个面向漏洞触发与 PoC 生成的安全 benchmark。任务通常会提供漏洞版本代码、描述信息和一个 `submit.sh`,agent 需要构造输入,让 vulnerable 版本触发异常,并最终通过 patched 版本验证。
+
+这次集成把 CyberGym 放进了 QitOS 的正式 benchmark 结构里:
+
+- `qitos.benchmark.cybergym`
+- `qitos.recipes.benchmarks.cybergym`
+- `examples/benchmarks/cybergym_eval.py`
+
+同时把运行产物统一放到 `runs/cybergym/` 下,便于直接用 `qita` 查看轨迹。
+
+注意:这个 PR 不直接提交 `cybergym_agent` 代码。运行前需要维护者手动把 `cybergym_agent` 仓库同步到 `qitos/benchmark/cybergym/agent/`。
+
+## 集成结构
+
+QitOS 侧的主要文件:
+
+- `qitos/benchmark/cybergym/adapter.py`
+- `qitos/benchmark/cybergym/runtime.py`
+- `qitos/benchmark/cybergym/evaluator.py`
+- `qitos/benchmark/cybergym/scorer.py`
+- `qitos/benchmark/cybergym/runner.py`
+- `qitos/recipes/benchmarks/cybergym.py`
+- `examples/benchmarks/cybergym_eval.py`
+
+职责划分:
+
+- `adapter.py`:把 CyberGym 的 `task_id` 转成 QitOS `Task`
+- `runtime.py`:生成 task 目录并附加运行时元信息
+- `runner.py`:调起外部同步进来的 `cybergym_agent`、写 trace、返回标准结果
+- `recipe`:可复现 baseline
+- `example`:最薄的一层命令入口
+
+## 同步 `cybergym_agent`
+
+运行前,需要先把 `cybergym_agent` 仓库同步到 QitOS 里。下面命令在 QitOS 仓库根目录执行:
+
+```bash
+mkdir -p qitos/benchmark/cybergym/agent
+rsync -a \
+ --exclude .git \
+ --exclude __pycache__ \
+ --exclude test_agent.py \
+ ../cybergym_agent/ \
+ qitos/benchmark/cybergym/agent/
+```
+
+如果没有这一步,`qitos.benchmark.cybergym.runner` 会在运行时直接报错,提示先复制 `cybergym_agent`。
+
+## 准备工作
+
+<Steps>
+  <Step title="准备 CyberGym 数据">
+    确保可以访问 `cybergym_data/data`,例如放在 QitOS 同级目录的 `../cybergym/cybergym_data/data`。
+  </Step>
+  <Step title="启动 CyberGym server">
+    需要一台能运行 Docker 镜像的 CyberGym server。示例:
+
+    ```bash
+    cd ../cybergym
+    python -m cybergym.server \
+      --host 127.0.0.1 \
+      --port 8669 \
+      --log_dir ../qitos/runs/cybergym/server_poc \
+      --db_path ../qitos/runs/cybergym/server_poc/poc.db
+    ```
+  </Step>
+  <Step title="配置模型与验证密钥">
+    ```bash
+    export CYBERGYM_CLAUDE_AUTH_TOKEN="your-model-key"
+    export CYBERGYM_API_KEY="your-verify-key"
+    ```
+  </Step>
+</Steps>
+## 单任务运行
+
+从 QitOS 仓库根目录运行:
+
+```bash
+python examples/benchmarks/cybergym_eval.py \
+ --task-id arvo:1065 \
+ --data-dir ../cybergym/cybergym_data/data \
+ --out-dir runs/cybergym/workspace/arvo_1065 \
+ --server http://127.0.0.1:8669 \
+ --difficulty level1 \
+ --model-name GLM-5.1-sii \
+ --api-key "$CYBERGYM_CLAUDE_AUTH_TOKEN" \
+ --base-url https://your-openai-compatible-endpoint/v1 \
+ --max-steps 30 \
+ --trace-logdir runs/cybergym/traces
+```
+
+## 批量跑 100 个任务
+
+假设 `tasks.txt` 每行一个 `task_id`:
+
+```text
+arvo:1065
+arvo:3938
+oss-fuzz:42535201
+...
+```
+
+从 QitOS 仓库根目录顺序跑:
+
+```bash
+export TASKS_FILE=./tasks.txt
+export SERVER=http://your-cybergym-server:8669
+
+while read -r TASK_ID; do
+ [ -z "$TASK_ID" ] && continue
+ SLUG="${TASK_ID/:/_}"
+ echo "===== START $TASK_ID ====="
+ python examples/benchmarks/cybergym_eval.py \
+ --task-id "$TASK_ID" \
+ --data-dir ../cybergym/cybergym_data/data \
+ --out-dir "runs/cybergym/workspace/$SLUG" \
+ --server "$SERVER" \
+ --difficulty level1 \
+ --model-name GLM-5.1-sii \
+ --api-key "$CYBERGYM_CLAUDE_AUTH_TOKEN" \
+ --base-url https://your-openai-compatible-endpoint/v1 \
+ --max-steps 30 \
+ --trace-logdir runs/cybergym/traces
+ echo "===== END $TASK_ID ====="
+done < "$TASKS_FILE" | tee runs/cybergym/run-100.log
+```
+
+小并发时可以自行改成 `xargs -P 2` 或 `xargs -P 4`,但建议先验证模型端和 server 端的稳定性。
+
+## 批量 verify
+
+CyberGym 的 public `/submit-vul` 只返回 vulnerable 侧结果。完整 benchmark 判定还需要 fix 侧 verify。
+
+在 `cybergym` 仓库根目录运行:
+
+```bash
+python scripts/verify_batch_results.py \
+ --logs_dir ../qitos/runs/cybergym/logs \
+ --server http://your-cybergym-server:8669 \
+ --pocdb_path ../qitos/runs/cybergym/server_poc/poc.db \
+ --summary_json ../qitos/runs/cybergym/verify-summary.json
+```
+
+只看当前数据库状态、不真正发 verify:
+
+```bash
+python scripts/verify_batch_results.py \
+ --logs_dir ../qitos/runs/cybergym/logs \
+ --server http://your-cybergym-server:8669 \
+ --pocdb_path ../qitos/runs/cybergym/server_poc/poc.db \
+ --skip_verify
+```
+
+## trace 与产物
+
+运行后主要产物在:
+
+- `runs/cybergym/workspace/`
+- `runs/cybergym/server_poc/`
+- `runs/cybergym/traces/`
+
+查看轨迹:
+
+```bash
+qita board --logdir runs/cybergym/traces
+```
+
+QitOS trace 会写出:
+
+- `manifest.json`
+- `events.jsonl`
+- `steps.jsonl`
+
+## 当前状态
+
+这次集成已经验证了:
+
+- CyberGym task 能转成 QitOS `Task`
+- benchmark family 已注册到 `qitos.benchmark`
+- recipe 和 thin example 能正常调用同一条 runner
+- 真实 smoke 可以生成 task、初始化 `GLM-5.1-sii`、写出 QitOS trace
+
+当前已知限制是模型协议还没有完全对齐。`GLM-5.1-sii` 目前更倾向输出带标记包裹的自由格式内容(而非纯 JSON),而当前 agent 还走 `JsonDecisionParser` 期望纯 JSON,因此在短 smoke 里会停在 parser error。这是下一步要处理的模型协议适配问题,不是 CyberGym benchmark 结构接入问题。
diff --git a/docs/zh/benchmarks/overview.mdx b/docs/zh/benchmarks/overview.mdx
index 9180bc6..86e1752 100644
--- a/docs/zh/benchmarks/overview.mdx
+++ b/docs/zh/benchmarks/overview.mdx
@@ -1,6 +1,6 @@
---
title: "基准测试"
-description: "通过统一的官方 QitOS benchmark 路径运行 desktop-starter、OSWorld、GAIA、Tau-Bench 与 CyBench,并产出统一结果与 trace artifacts。"
+description: "通过统一的官方 QitOS benchmark 路径运行 desktop-starter、OSWorld、GAIA、Tau-Bench、CyBench 与 CyberGym,并产出统一结果与 trace artifacts。"
---
在 QitOS 里,benchmark 不是另一套平行 runtime,而是同一条 agent runtime 叙事的延伸。
@@ -23,6 +23,7 @@ description: "通过统一的官方 QitOS benchmark 路径运行 desktop-starter
| [GAIA](/zh/benchmarks/gaia) | 通用 AI assistant 任务 | Exact match |
| [Tau-Bench](/zh/benchmarks/tau-bench) | Tool-agent-user 交互 | Reward / pass^k |
| [CyBench](/zh/benchmarks/cybench) | CTF 风格安全评测 | Guided subtask score |
+| [CyberGym](/zh/benchmarks/cybergym) | 漏洞 PoC 生成与 differential verify | `vul_exit_code != 0` 且 `fix_exit_code == 0` |
## 官方 benchmark 入口
diff --git a/examples/benchmarks/cybergym_eval.py b/examples/benchmarks/cybergym_eval.py
new file mode 100644
index 0000000..5d2b225
--- /dev/null
+++ b/examples/benchmarks/cybergym_eval.py
@@ -0,0 +1,17 @@
+"""Thin CyberGym benchmark entrypoint backed by the canonical recipe."""
+
+from qitos.recipes.benchmarks.cybergym import (
+ main,
+ run_cybergym_agent_task,
+ run_cybergym_recipe_task,
+)
+
+__all__ = [
+ "main",
+ "run_cybergym_agent_task",
+ "run_cybergym_recipe_task",
+]
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/qitos/benchmark/__init__.py b/qitos/benchmark/__init__.py
index 0e0aada..c1ca29a 100644
--- a/qitos/benchmark/__init__.py
+++ b/qitos/benchmark/__init__.py
@@ -40,6 +40,12 @@
"load_cybench_tasks": (".cybench", "load_cybench_tasks"),
"run_cybench_task": (".cybench", "run_cybench_task"),
"score_cybench_submission": (".cybench", "score_cybench_submission"),
+ "CyberGymBenchmarkAdapter": (".cybergym", "CyberGymBenchmarkAdapter"),
+ "CyberGymEvaluator": (".cybergym", "CyberGymEvaluator"),
+ "CyberGymRuntimeHook": (".cybergym", "CyberGymRuntimeHook"),
+ "CyberGymScorer": (".cybergym", "CyberGymScorer"),
+ "load_cybergym_tasks": (".cybergym", "load_cybergym_tasks"),
+ "run_cybergym_task": (".cybergym", "run_cybergym_task"),
"GaiaAdapter": (".gaia", "GaiaAdapter"),
"GaiaEvaluator": (".gaia", "GaiaEvaluator"),
"GaiaRuntimeHook": (".gaia", "GaiaRuntimeHook"),
@@ -100,6 +106,12 @@ def __getattr__(name: str):
"run_cybench_task",
"score_cybench_submission",
"load_cybench_tasks",
+ "CyberGymBenchmarkAdapter",
+ "CyberGymEvaluator",
+ "CyberGymRuntimeHook",
+ "CyberGymScorer",
+ "load_cybergym_tasks",
+ "run_cybergym_task",
"GaiaAdapter",
"GaiaEvaluator",
"GaiaRuntimeHook",
diff --git a/qitos/benchmark/cybergym/__init__.py b/qitos/benchmark/cybergym/__init__.py
new file mode 100644
index 0000000..126efae
--- /dev/null
+++ b/qitos/benchmark/cybergym/__init__.py
@@ -0,0 +1,43 @@
+"""CyberGym benchmark integration."""
+
+from __future__ import annotations
+
+import importlib
+
+_LAZY_ATTRS = {
+ "CyberGymBenchmarkAdapter": (".adapter", "CyberGymBenchmarkAdapter"),
+ "load_cybergym_tasks": (".adapter", "load_cybergym_tasks"),
+ "task_slug": (".adapter", "task_slug"),
+ "CyberGymEvaluator": (".evaluator", "CyberGymEvaluator"),
+ "CyberGymRuntimeHook": (".runtime", "CyberGymRuntimeHook"),
+ "prepare_task_dir": (".runtime", "prepare_task_dir"),
+ "CyberGymScorer": (".scorer", "CyberGymScorer"),
+ "make_trace_writer": (".runner", "make_trace_writer"),
+ "run_cybergym_agent_task": (".runner", "run_cybergym_agent_task"),
+ "run_cybergym_task": (".runner", "run_cybergym_task"),
+}
+
+
+def __getattr__(name: str):
+ target = _LAZY_ATTRS.get(name)
+ if target is None:
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+ module_name, attr_name = target
+ module = importlib.import_module(module_name, __name__)
+ value = getattr(module, attr_name)
+ globals()[name] = value
+ return value
+
+
+__all__ = [
+ "CyberGymBenchmarkAdapter",
+ "CyberGymEvaluator",
+ "CyberGymRuntimeHook",
+ "CyberGymScorer",
+ "load_cybergym_tasks",
+ "make_trace_writer",
+ "prepare_task_dir",
+ "run_cybergym_agent_task",
+ "run_cybergym_task",
+ "task_slug",
+]
diff --git a/qitos/benchmark/cybergym/adapter.py b/qitos/benchmark/cybergym/adapter.py
new file mode 100644
index 0000000..89484b4
--- /dev/null
+++ b/qitos/benchmark/cybergym/adapter.py
@@ -0,0 +1,85 @@
+"""CyberGym benchmark task adapter."""
+
+from __future__ import annotations
+
+from typing import Any, Iterable, Mapping, Optional
+
+from qitos.core import Task, TaskBudget
+
+from ..base import BenchmarkAdapter, BenchmarkSource
+
+
def task_slug(task_id: str) -> str:
    """Return a filesystem-safe slug for a CyberGym task id (``:`` -> ``_``)."""
    return "_".join(str(task_id).split(":"))
+
+
class CyberGymBenchmarkAdapter(BenchmarkAdapter):
    """Convert CyberGym task ids into QitOS Task objects.

    The adapter is deliberately thin: a CyberGym "record" is just a task id
    plus optional overrides; the heavy task material is generated later by
    ``prepare_task_dir``.
    """

    def __init__(self, *, difficulty: str = "level1") -> None:
        # `split` mirrors the CyberGym difficulty level (level0..level3).
        self.source = BenchmarkSource(name="cybergym", split=str(difficulty))
        self.difficulty = str(difficulty)

    def load_records(
        self,
        *,
        task_ids: Iterable[str],
        limit: Optional[int] = None,
    ) -> list[dict[str, Any]]:
        """Normalize raw task ids into record dicts, skipping blank entries.

        A non-None ``limit`` caps the number of records. The cap is checked
        before appending so ``limit=0`` correctly yields an empty list
        (previously the check ran after the append, so a zero limit still
        produced one record).
        """
        records: list[dict[str, Any]] = []
        for raw_task_id in task_ids:
            if limit is not None and len(records) >= int(limit):
                break
            task_id = str(raw_task_id).strip()
            if not task_id:
                continue
            records.append({"task_id": task_id})
        return records

    def to_task(
        self,
        record: Mapping[str, Any],
        *,
        split: str | None = None,
        idx: int = 0,
    ) -> Task:
        """Build a QitOS Task from one record.

        Raises:
            ValueError: if the record carries neither ``task_id`` nor ``id``.
        """
        task_id = str(record.get("task_id") or record.get("id") or "").strip()
        if not task_id:
            raise ValueError("CyberGym record requires task_id")
        # Precedence: explicit split > record override > adapter default.
        difficulty = str(split or record.get("difficulty") or self.difficulty)
        return Task(
            id=task_id,
            objective=f"Solve CyberGym task {task_id}.",
            inputs={"task_id": task_id, "difficulty": difficulty},
            # Per-record step budget; 30 matches the recipe's CLI default.
            budget=TaskBudget(max_steps=int(record.get("max_steps", 30))),
            metadata={
                "benchmark": "cybergym",
                "split": difficulty,
                "task_index": int(idx),
                "raw_record": dict(record),
            },
        )

    def to_tasks(
        self,
        records: Iterable[Mapping[str, Any]],
        split: str,
        limit: Optional[int] = None,
    ) -> list[Task]:
        """Convert records to Tasks, honoring ``limit`` (including ``limit=0``)."""
        tasks: list[Task] = []
        for idx, record in enumerate(records):
            if limit is not None and len(tasks) >= int(limit):
                break
            tasks.append(self.to_task(record, split=split, idx=idx))
        return tasks
+
+
def load_cybergym_tasks(
    *,
    task_ids: Iterable[str],
    difficulty: str = "level1",
    limit: Optional[int] = None,
) -> list[Task]:
    """Build QitOS tasks for the given CyberGym task ids at one difficulty."""
    builder = CyberGymBenchmarkAdapter(difficulty=difficulty)
    return builder.to_tasks(
        builder.load_records(task_ids=task_ids, limit=limit),
        split=difficulty,
        limit=limit,
    )
diff --git a/qitos/benchmark/cybergym/evaluator.py b/qitos/benchmark/cybergym/evaluator.py
new file mode 100644
index 0000000..8ecb54c
--- /dev/null
+++ b/qitos/benchmark/cybergym/evaluator.py
@@ -0,0 +1,28 @@
+"""CyberGym benchmark evaluator."""
+
+from __future__ import annotations
+
+from typing import Any, Dict
+
+from qitos.core import ExperimentSpec, RunSpec
+
+from ..contracts import BenchmarkEvaluator, PreparedBenchmarkTask
+
+
class CyberGymEvaluator(BenchmarkEvaluator):
    """Surface CyberGym-native execution details for downstream scoring."""

    def evaluate(
        self,
        *,
        prepared: PreparedBenchmarkTask,
        run_spec: RunSpec,
        experiment_spec: ExperimentSpec,
        execution: Any,
    ) -> Dict[str, Any]:
        # Context objects are unused here: the execution payload already
        # carries everything the scorer needs.
        del prepared, run_spec, experiment_spec
        if isinstance(execution, dict):
            details = dict(execution)
        elif hasattr(execution, "to_dict"):
            details = dict(execution.to_dict())
        else:
            details = {"execution": execution}
        return details
diff --git a/qitos/benchmark/cybergym/runner.py b/qitos/benchmark/cybergym/runner.py
new file mode 100644
index 0000000..db2526d
--- /dev/null
+++ b/qitos/benchmark/cybergym/runner.py
@@ -0,0 +1,221 @@
+"""CyberGym benchmark runner."""
+
+from __future__ import annotations
+
+import os
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+from qitos.core import BenchmarkRunResult, ExperimentSpec, RunSpec, Task
+from qitos.engine.stop_criteria import FinalResultCriteria, MaxStepsCriteria
+from qitos.engine.states import ContextConfig
+from qitos.kit.env.host_env import HostEnv
+from qitos.trace import TraceWriter
+
+from .adapter import task_slug
+from .evaluator import CyberGymEvaluator
+from .runtime import CyberGymRuntimeHook, prepare_task_dir
+from .scorer import CyberGymScorer
+
+
def make_trace_writer(
    *,
    trace_logdir: str | Path,
    trace_prefix: str,
    task_id: str,
    model_id: str,
) -> TraceWriter:
    """Create a strict TraceWriter with a timestamped, task-scoped run id."""
    # Microsecond-resolution UTC stamp keeps run ids unique across rapid runs.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
    output_dir = Path(trace_logdir).expanduser().resolve()
    return TraceWriter(
        output_dir=str(output_dir),
        run_id=f"{trace_prefix}_{task_slug(task_id)}_{timestamp}",
        strict_validate=True,
        metadata={"model_id": model_id},
    )
+
+
def run_cybergym_agent_task(
    *,
    task_dir: str | Path,
    model_name: str,
    api_key: str,
    base_url: str,
    server: str,
    max_steps: int,
    trace_logdir: str | Path,
    trace_prefix: str = "qitos_cybergym",
    run_spec: RunSpec | None = None,
    experiment_spec: ExperimentSpec | None = None,
) -> dict[str, Any]:
    """Run one prepared CyberGym task through the externally synced agent.

    Args:
        task_dir: Directory previously generated by ``prepare_task_dir``.
        model_name: Model identifier handed to the agent's LLM client.
        api_key: Credential for the OpenAI-compatible endpoint at ``base_url``.
        base_url: OpenAI-compatible endpoint the agent talks to.
        server: CyberGym server URL used for PoC submission.
        max_steps: Hard step budget for the agent loop.
        trace_logdir: Directory where the QitOS trace run is written.
        trace_prefix: Prefix for the trace run id.
        run_spec: Optional run spec forwarded to ``agent.run``.
        experiment_spec: Optional experiment spec forwarded to ``agent.run``.

    Returns:
        Summary dict: task id/dir, trace run dir, stop reason, final result,
        step count, and the serialized task result (``None`` if absent).

    Raises:
        ModuleNotFoundError: if the ``cybergym_agent`` repository has not been
            copied into ``qitos/benchmark/cybergym/agent/``.
    """
    # The agent package is synced manually (not vendored); turn a bare import
    # failure into an actionable setup message.
    try:
        from .agent.adapter import CyberGymAdapter
        from .agent.cli import build_agent
        from .agent.stop_criteria import PoCVerificationCriteria
    except ModuleNotFoundError as exc:
        raise ModuleNotFoundError(
            "CyberGym agent package is not bundled in QitOS. "
            "Copy the cybergym_agent repository into `qitos/benchmark/cybergym/agent/` "
            "before running the CyberGym benchmark."
        ) from exc

    task_path = Path(task_dir).expanduser().resolve()
    adapter = CyberGymAdapter(server_url=server)
    task = adapter.from_task_dir(str(task_path), max_steps=max_steps)

    agent = build_agent(
        model=model_name,
        workspace_root=str(task_path),
        server_url=server,
        max_steps=max_steps,
        llm_config={"api_key": api_key, "base_url": base_url},
    )

    env = HostEnv(workspace_root=str(task_path))
    # Stop when the PoC verifies, a final result is emitted, or the step
    # budget runs out — whichever comes first.
    stop_criteria = [
        PoCVerificationCriteria(),
        FinalResultCriteria(),
        MaxStepsCriteria(max_steps=max_steps),
    ]
    context_config = ContextConfig(
        tool_result_max_chars=4000,
        conversation_max_rounds=10,
        loop_max_repeats=3,
    )
    trace_writer = make_trace_writer(
        trace_logdir=trace_logdir,
        trace_prefix=trace_prefix,
        task_id=task.id,
        model_id=model_name,
    )

    # The trailing keyword fields mirror the CyberGym task inputs —
    # presumably consumed by the agent's prompt/template layer; confirm
    # against cybergym_agent's `agent.run` signature.
    result = agent.run(
        task=task,
        return_state=True,
        env=env,
        stop_criteria=stop_criteria,
        max_steps=max_steps,
        workspace=str(task_path),
        context_config=context_config,
        trace=trace_writer,
        run_spec=run_spec,
        experiment_spec=experiment_spec,
        description=task.inputs.get("description", ""),
        task_id=task.inputs.get("task_id", ""),
        agent_id=task.inputs.get("agent_id", ""),
        checksum=task.inputs.get("checksum", ""),
        server_url=task.inputs.get("server_url", server),
        error_txt=task.inputs.get("error_txt", ""),
        patch_diff=task.inputs.get("patch_diff", ""),
        repo_dir=task.inputs.get("repo_dir", ""),
    )

    return {
        "task_id": task.id,
        "task_dir": str(task_path),
        "trace_run_dir": str(trace_writer.run_dir),
        "stop_reason": result.state.stop_reason,
        "final_result": result.state.final_result,
        "step_count": result.step_count,
        "task_result": result.task_result.to_dict() if result.task_result is not None else None,
    }
+
+
def run_cybergym_task(
    *, task: Task, run_spec: RunSpec, experiment_spec: ExperimentSpec
) -> BenchmarkRunResult:
    """Benchmark-family entrypoint: prepare the task dir, run the agent,
    evaluate, and score into a normalized ``BenchmarkRunResult``.

    Connection and workspace settings are read from
    ``run_spec.environment`` (``data_dir``, ``server``, ``base_url``,
    optional ``workspace``/``trace_logdir``/``trace_prefix``/``api_key``).

    Raises:
        ValueError: if ``data_dir``, ``server``, ``base_url``, or an API key
            cannot be resolved.
    """
    started = time.time()
    # Normalize the incoming spec and fill CyberGym-specific defaults.
    # NOTE(review): assumes RunSpec.from_value yields a spec safe to mutate
    # here — confirm it copies rather than aliases the caller's object.
    effective_spec = RunSpec.from_value(run_spec)
    effective_spec.benchmark_name = effective_spec.benchmark_name or "cybergym"
    effective_spec.benchmark_split = effective_spec.benchmark_split or str(
        task.inputs.get("difficulty") or "level1"
    )
    effective_spec.toolset_name = effective_spec.toolset_name or "cybergym_agent"
    effective_spec.metadata = {
        **dict(effective_spec.metadata or {}),
        "recipe": "cybergym_agent",
    }

    environment = dict(effective_spec.environment or {})
    task_id = str(task.inputs.get("task_id") or task.id)
    difficulty = str(task.inputs.get("difficulty") or effective_spec.benchmark_split or "level1")
    workspace = Path(str(environment.get("workspace") or "runs/cybergym/workspace"))
    # One sub-directory per task, named by the colon-free slug.
    task_dir = workspace / task_slug(task_id)
    data_dir = str(environment.get("data_dir") or "")
    server = str(environment.get("server") or "")
    base_url = str(environment.get("base_url") or "")
    trace_logdir = str(environment.get("trace_logdir") or "runs/cybergym/traces")
    # API key resolution order: explicit environment entry, then the
    # OPENAI/QITOS/CYBERGYM environment variables, first non-empty wins.
    api_key = str(
        environment.get("api_key")
        or os.getenv("OPENAI_API_KEY", "")
        or os.getenv("QITOS_API_KEY", "")
        or os.getenv("CYBERGYM_CLAUDE_AUTH_TOKEN", "")
    )
    # Step budget: spec metadata override, else the task's budget, else 30.
    max_steps = int((effective_spec.metadata or {}).get("max_steps", task.budget.max_steps or 30))

    # Fail fast on missing required settings before any task generation.
    if not data_dir:
        raise ValueError("CyberGym run requires run_spec.environment['data_dir']")
    if not server:
        raise ValueError("CyberGym run requires run_spec.environment['server']")
    if not base_url:
        raise ValueError("CyberGym run requires run_spec.environment['base_url']")
    if not api_key:
        raise ValueError("CyberGym run requires api_key or OPENAI_API_KEY/QITOS_API_KEY")

    # Materialize the CyberGym task directory (side effect; `task_dir` itself
    # is reused below, so the return value is not needed).
    prepare_task_dir(
        task_id=task_id,
        out_dir=task_dir,
        data_dir=data_dir,
        server=server,
        difficulty=difficulty,
    )

    # Runtime hook attaches environment metadata used later by the scorer.
    prepared = CyberGymRuntimeHook().prepare(
        task=task,
        run_spec=effective_spec,
        experiment_spec=experiment_spec,
    )
    execution = run_cybergym_agent_task(
        task_dir=task_dir,
        model_name=str(effective_spec.model_name or ""),
        api_key=api_key,
        base_url=base_url,
        server=server,
        max_steps=max_steps,
        trace_logdir=trace_logdir,
        trace_prefix=str(environment.get("trace_prefix") or "qitos_cybergym"),
        run_spec=effective_spec,
        experiment_spec=experiment_spec,
    )
    task_result = execution.get("task_result") or {}
    # Normalize the agent execution summary into the shared result schema.
    base_result = BenchmarkRunResult(
        task_id=task_id,
        benchmark="cybergym",
        split=difficulty,
        prediction=execution.get("final_result"),
        success=bool(task_result.get("success", False)),
        stop_reason=str(execution.get("stop_reason") or "unknown"),
        steps=int(execution.get("step_count") or 0),
        latency_seconds=float(time.time() - started),
        token_usage=int((task_result.get("metrics") or {}).get("token_usage", 0)),
        cost=0.0,
        trace_run_dir=str(execution.get("trace_run_dir") or ""),
        run_spec_ref=effective_spec.fingerprint(),
        metadata={"execution": execution},
    )
    evaluation = CyberGymEvaluator().evaluate(
        prepared=prepared,
        run_spec=effective_spec,
        experiment_spec=experiment_spec,
        execution=execution,
    )
    # The scorer merges runtime + evaluation metadata into base_result.
    return CyberGymScorer().score(
        prepared=prepared,
        run_spec=effective_spec,
        experiment_spec=experiment_spec,
        execution=execution,
        evaluation=evaluation,
        base_result=base_result,
    )
diff --git a/qitos/benchmark/cybergym/runtime.py b/qitos/benchmark/cybergym/runtime.py
new file mode 100644
index 0000000..6904f35
--- /dev/null
+++ b/qitos/benchmark/cybergym/runtime.py
@@ -0,0 +1,52 @@
+"""CyberGym benchmark runtime helpers."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from qitos.core import ExperimentSpec, RunSpec, Task
+
+from ..contracts import BenchmarkRuntimeHook, PreparedBenchmarkTask
+
+
def prepare_task_dir(
    *,
    task_id: str,
    out_dir: str | Path,
    data_dir: str | Path,
    server: str,
    difficulty: str,
) -> Path:
    """Materialize a CyberGym task directory via the upstream generator."""
    # Imported lazily: the external `cybergym` package only needs to be
    # installed when tasks are actually generated.
    from cybergym.task.gen_task import generate_task
    from cybergym.task.types import TaskConfig, TaskDifficulty

    target = Path(out_dir).expanduser().resolve()
    target.mkdir(parents=True, exist_ok=True)

    config = TaskConfig(
        task_id=task_id,
        out_dir=target,
        data_dir=Path(data_dir).expanduser().resolve(),
        server=server,
        difficulty=TaskDifficulty(difficulty),
    )
    generate_task(config)
    return target
+
+
class CyberGymRuntimeHook(BenchmarkRuntimeHook):
    """Attach CyberGym runtime metadata to a benchmark task."""

    def prepare(
        self, *, task: Task, run_spec: RunSpec, experiment_spec: ExperimentSpec
    ) -> PreparedBenchmarkTask:
        del experiment_spec  # not needed for metadata extraction
        env = dict(run_spec.environment or {})
        runtime_metadata = {key: env.get(key) for key in ("server", "data_dir", "workspace")}
        # Task input wins over adapter metadata for the difficulty label.
        runtime_metadata["difficulty"] = (
            task.inputs.get("difficulty") or task.metadata.get("split")
        )
        return PreparedBenchmarkTask(task=task, runtime_metadata=runtime_metadata)
diff --git a/qitos/benchmark/cybergym/scorer.py b/qitos/benchmark/cybergym/scorer.py
new file mode 100644
index 0000000..7e378f2
--- /dev/null
+++ b/qitos/benchmark/cybergym/scorer.py
@@ -0,0 +1,32 @@
+"""CyberGym benchmark scorer."""
+
+from __future__ import annotations
+
+from typing import Any, Dict
+
+from qitos.core import BenchmarkRunResult, ExperimentSpec, RunSpec
+
+from ..contracts import BenchmarkScorer, PreparedBenchmarkTask
+
+
class CyberGymScorer(BenchmarkScorer):
    """Keep the normalized result and attach CyberGym-specific metadata."""

    def score(
        self,
        *,
        prepared: PreparedBenchmarkTask,
        run_spec: RunSpec,
        experiment_spec: ExperimentSpec,
        execution: Any,
        evaluation: Dict[str, Any],
        base_result: BenchmarkRunResult,
    ) -> BenchmarkRunResult:
        del run_spec, experiment_spec, execution  # scoring uses metadata only
        merged = dict(base_result.metadata or {})
        merged["benchmark_runtime"] = dict(prepared.runtime_metadata or {})
        merged["cybergym"] = dict(evaluation or {})
        merged["family"] = "cybergym"
        base_result.metadata = merged
        return base_result
diff --git a/qitos/benchmark/runner.py b/qitos/benchmark/runner.py
index 1f90d49..0178654 100644
--- a/qitos/benchmark/runner.py
+++ b/qitos/benchmark/runner.py
@@ -9,6 +9,7 @@
from qitos.core.task import Task
from .cybench import CyBenchAdapter, run_cybench_task
+from .cybergym import CyberGymBenchmarkAdapter, run_cybergym_task
from .desktop import DesktopStarterAdapter, run_desktop_starter_task
from .gaia import GaiaAdapter, run_gaia_task
from .osworld import OSWorldBenchmarkAdapter, run_osworld_task
@@ -25,6 +26,7 @@ def normalize_benchmark_name(value: str) -> str:
"tau-bench": "tau-bench",
"gaia": "gaia",
"cybench": "cybench",
+ "cybergym": "cybergym",
"desktop": "desktop-starter",
"desktop-starter": "desktop-starter",
"osworld-starter": "desktop-starter",
@@ -60,6 +62,24 @@ def load_benchmark_tasks(
limit=limit,
)
return adapter.to_tasks(rows, split=split, limit=limit)
+ if normalized == "cybergym":
+ if root is None:
+ raise ValueError("CyberGym task loading requires root as a task id or task-id file")
+ root_value = str(root)
+ try:
+ from pathlib import Path
+
+ path = Path(root_value)
+ task_ids = (
+ [line.strip() for line in path.read_text(encoding="utf-8").splitlines()]
+ if path.exists()
+ else [root_value]
+ )
+ except OSError:
+ task_ids = [root_value]
+ adapter = CyberGymBenchmarkAdapter(difficulty=split)
+ rows = adapter.load_records(task_ids=task_ids, limit=limit)
+ return adapter.to_tasks(rows, split=split, limit=limit)
if normalized == "desktop-starter":
adapter = DesktopStarterAdapter(dataset_path=root)
rows = adapter.load_records(split=split)
@@ -179,6 +199,28 @@ def runner(*, task: Task, run_spec: RunSpec, experiment_spec: ExperimentSpec):
experiment_spec=experiment_spec,
)
+ return runner
+ if normalized == "cybergym" and lane in {
+ "cybergym_baseline",
+ "cybergym_smoke",
+ "baseline",
+ "smoke",
+ }:
+ def runner(*, task: Task, run_spec: RunSpec, experiment_spec: ExperimentSpec):
+ enriched = RunSpec.from_value(run_spec)
+ if not enriched.prompt_protocol:
+ enriched.prompt_protocol = "cybergym_agent_v1"
+ if not enriched.parser_name:
+ enriched.parser_name = "cybergym_agent"
+ if lane in {"cybergym_smoke", "smoke"}:
+ enriched.metadata = dict(enriched.metadata or {})
+ enriched.metadata["cybergym_smoke"] = True
+ return run_cybergym_task(
+ task=task,
+ run_spec=enriched,
+ experiment_spec=experiment_spec,
+ )
+
return runner
if normalized == "osworld" and lane in {
"osworld_baseline",
diff --git a/qitos/recipes/benchmarks/cybergym.py b/qitos/recipes/benchmarks/cybergym.py
new file mode 100644
index 0000000..efd3e8c
--- /dev/null
+++ b/qitos/recipes/benchmarks/cybergym.py
@@ -0,0 +1,98 @@
+"""Canonical CyberGym recipe for QitOS."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+from qitos.benchmark.cybergym import (
+ make_trace_writer,
+ prepare_task_dir,
+ run_cybergym_agent_task,
+ task_slug,
+)
+
+
def run_cybergym_recipe_task(
    *,
    task_id: str,
    data_dir: str,
    out_dir: str,
    server: str,
    difficulty: str,
    model_name: str,
    api_key: str,
    base_url: str,
    max_steps: int,
    trace_logdir: str,
    trace_prefix: str = "qitos_cybergym",
) -> dict[str, Any]:
    """Generate the CyberGym task directory, then run the agent on it."""
    prepared_dir = prepare_task_dir(
        task_id=task_id,
        out_dir=out_dir,
        data_dir=data_dir,
        server=server,
        difficulty=difficulty,
    )
    agent_kwargs = dict(
        task_dir=prepared_dir,
        model_name=model_name,
        api_key=api_key,
        base_url=base_url,
        server=server,
        max_steps=max_steps,
        trace_logdir=trace_logdir,
        trace_prefix=trace_prefix,
    )
    return run_cybergym_agent_task(**agent_kwargs)
+
+
+def main(argv: list[str] | None = None) -> int:
+ parser = argparse.ArgumentParser(
+ description="Run CyberGym through QitOS with native trace support"
+ )
+ parser.add_argument("--task-id", required=True)
+ parser.add_argument("--data-dir", required=True)
+ parser.add_argument("--out-dir", required=True)
+ parser.add_argument("--server", required=True)
+ parser.add_argument(
+ "--difficulty", default="level1", choices=["level0", "level1", "level2", "level3"]
+ )
+ parser.add_argument("--model-name", required=True)
+ parser.add_argument("--api-key", required=True)
+ parser.add_argument("--base-url", required=True)
+ parser.add_argument("--max-steps", type=int, default=30)
+ parser.add_argument("--trace-logdir", default="runs/cybergym/traces")
+ parser.add_argument("--trace-prefix", default="qitos_cybergym")
+ args = parser.parse_args(argv)
+
+ result = run_cybergym_recipe_task(
+ task_id=args.task_id,
+ data_dir=args.data_dir,
+ out_dir=str(Path(args.out_dir)),
+ server=args.server,
+ difficulty=args.difficulty,
+ model_name=args.model_name,
+ api_key=args.api_key,
+ base_url=args.base_url,
+ max_steps=int(args.max_steps),
+ trace_logdir=args.trace_logdir,
+ trace_prefix=args.trace_prefix,
+ )
+ print(json.dumps(result, ensure_ascii=False, indent=2))
+ return 0
+
+
+__all__ = [
+ "make_trace_writer",
+ "prepare_task_dir",
+ "run_cybergym_agent_task",
+ "run_cybergym_recipe_task",
+ "task_slug",
+ "main",
+]
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/tests/test_benchmark_cybergym_recipe.py b/tests/test_benchmark_cybergym_recipe.py
new file mode 100644
index 0000000..a11a157
--- /dev/null
+++ b/tests/test_benchmark_cybergym_recipe.py
@@ -0,0 +1,47 @@
+import tempfile
+import unittest
+from pathlib import Path
+
+from qitos.benchmark import normalize_benchmark_name, resolve_builtin_runner
+from qitos.benchmark.cybergym import CyberGymBenchmarkAdapter, make_trace_writer, task_slug
+from qitos.recipes.benchmarks import cybergym
+
+
class CybergymRecipeTests(unittest.TestCase):
    """Smoke-level checks for the CyberGym benchmark integration surface."""

    def test_task_slug_replaces_colon(self):
        for raw, expected in (
            ("arvo:1065", "arvo_1065"),
            ("oss-fuzz:42535201", "oss-fuzz_42535201"),
        ):
            self.assertEqual(task_slug(raw), expected)

    def test_make_trace_writer_uses_prefix_and_task_slug(self):
        with tempfile.TemporaryDirectory() as logdir:
            tw = make_trace_writer(
                trace_logdir=logdir,
                trace_prefix="qitos_cybergym",
                task_id="arvo:1065",
                model_id="GLM-5.1-sii",
            )

            self.assertTrue(tw.run_id.startswith("qitos_cybergym_arvo_1065_"))
            self.assertEqual(tw.metadata["model_id"], "GLM-5.1-sii")
            self.assertTrue(Path(tw.run_dir).exists())

    def test_adapter_builds_qitos_task_from_task_id(self):
        built = CyberGymBenchmarkAdapter().to_task(
            {"task_id": "arvo:1065"}, split="level1", idx=0
        )

        self.assertEqual(built.id, "arvo:1065")
        self.assertEqual(built.inputs["difficulty"], "level1")
        self.assertEqual(built.metadata["benchmark"], "cybergym")

    def test_cybergym_is_registered_as_benchmark_family(self):
        self.assertEqual(normalize_benchmark_name("cybergym"), "cybergym")
        runner = resolve_builtin_runner(benchmark="cybergym", strategy="smoke")
        self.assertIsNotNone(runner)

    def test_recipe_reuses_benchmark_family_helpers(self):
        self.assertIs(cybergym.task_slug, task_slug)
        self.assertIs(cybergym.make_trace_writer, make_trace_writer)


if __name__ == "__main__":
    unittest.main()