diff --git a/docs/docs.json b/docs/docs.json
index 63ceb39..588f07a 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -202,7 +202,8 @@
         "zh/benchmarks/osworld",
         "zh/benchmarks/gaia",
         "zh/benchmarks/tau-bench",
-        "zh/benchmarks/cybench"
+        "zh/benchmarks/cybench",
+        "zh/benchmarks/cybergym"
       ]
     },
     {
diff --git a/docs/zh/benchmarks/cybergym.mdx b/docs/zh/benchmarks/cybergym.mdx
new file mode 100644
index 0000000..4b38510
--- /dev/null
+++ b/docs/zh/benchmarks/cybergym.mdx
@@ -0,0 +1,189 @@
+---
+title: "CyberGym"
+description: "Run the CyberGym security-vulnerability PoC benchmark with `qitos.benchmark.cybergym`, the externally synced `cybergym_agent`, and QitOS traces."
+---
+
+CyberGym is a security benchmark focused on vulnerability triggering and PoC generation. A task typically provides the vulnerable version of the code, a description, and a `submit.sh`; the agent must craft an input that makes the vulnerable build crash, and the input is then validated against the patched build.
+
+This integration brings CyberGym into the official QitOS benchmark structure:
+
+- `qitos.benchmark.cybergym`
+- `qitos.recipes.benchmarks.cybergym`
+- `examples/benchmarks/cybergym_eval.py`
+
+Run artifacts are also consolidated under `runs/cybergym/`, so trajectories can be inspected directly with `qita`.
+
+Note: this PR does not ship the `cybergym_agent` code itself. Before running, a maintainer must manually sync the `cybergym_agent` repository into `qitos/benchmark/cybergym/agent/`.
+
+## Integration layout
+
+The main files on the QitOS side:
+
+- `qitos/benchmark/cybergym/adapter.py`
+- `qitos/benchmark/cybergym/runtime.py`
+- `qitos/benchmark/cybergym/evaluator.py`
+- `qitos/benchmark/cybergym/scorer.py`
+- `qitos/benchmark/cybergym/runner.py`
+- `qitos/recipes/benchmarks/cybergym.py`
+- `examples/benchmarks/cybergym_eval.py`
+
+Division of responsibilities:
+
+- `adapter.py`: converts CyberGym `task_id`s into QitOS `Task` objects
+- `runtime.py`: generates the task directory and attaches runtime metadata
+- `runner.py`: invokes the externally synced `cybergym_agent`, writes traces, and returns normalized results
+- `recipe`: the reproducible baseline
+- `example`: the thinnest command-line entrypoint
+
+## Syncing `cybergym_agent`
+
+Before running, sync the `cybergym_agent` repository into QitOS. Execute the following from the QitOS repository root:
+
+```bash
+mkdir -p qitos/benchmark/cybergym/agent
+rsync -a \
+  --exclude .git \
+  --exclude __pycache__ \
+  --exclude test_agent.py \
+  ../cybergym_agent/ \
+  qitos/benchmark/cybergym/agent/
+```
+
+Without this step, `qitos.benchmark.cybergym.runner` fails at runtime with an error telling you to copy `cybergym_agent` first.
+
+## Prerequisites
+
+Make sure `cybergym_data/data` is reachable, for example at `../cybergym/cybergym_data/data` next to the QitOS checkout.
+
+You also need a CyberGym server that can run Docker images. Example:
+
+```bash
+cd ../cybergym
+python -m cybergym.server \
+  --host 127.0.0.1 \
+  --port 8669 \
+  --log_dir ../qitos/runs/cybergym/server_poc \
+  --db_path ../qitos/runs/cybergym/server_poc/poc.db
+```
+
+Finally, export the model and verify keys:
+
+```bash
+export CYBERGYM_CLAUDE_AUTH_TOKEN="your-model-key"
+export CYBERGYM_API_KEY="your-verify-key"
+```
+
+## Running a single task
+
+From the QitOS repository root:
+
+```bash
+python examples/benchmarks/cybergym_eval.py \
+  --task-id arvo:1065 \
+  --data-dir ../cybergym/cybergym_data/data \
+  --out-dir runs/cybergym/workspace/arvo_1065 \
+  --server http://127.0.0.1:8669 \
+  --difficulty level1 \
+  --model-name GLM-5.1-sii \
+  --api-key "$CYBERGYM_CLAUDE_AUTH_TOKEN" \
+  --base-url https://your-openai-compatible-endpoint/v1 \
+  --max-steps 30 \
+  --trace-logdir runs/cybergym/traces
+```
+
+## Running a batch of 100 tasks
+
+Assume `tasks.txt` contains one `task_id` per line:
+
+```text
+arvo:1065
+arvo:3938
+oss-fuzz:42535201
+...
+```
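+Before launching a long batch, it can help to sanity-check the id list through the same adapter this PR adds. This is a minimal sketch, not part of the PR itself; it assumes the `tasks.txt` above and that this branch is installed (`load_cybergym_tasks` comes from `qitos/benchmark/cybergym/adapter.py`):
+
+```python
+from pathlib import Path
+
+from qitos.benchmark.cybergym import load_cybergym_tasks
+
+# Read one task_id per line, skipping blanks, exactly like the batch loop below.
+task_ids = [
+    line.strip()
+    for line in Path("tasks.txt").read_text(encoding="utf-8").splitlines()
+    if line.strip()
+]
+
+tasks = load_cybergym_tasks(task_ids=task_ids, difficulty="level1")
+for task in tasks:
+    # Each QitOS Task carries the raw id and difficulty in its inputs.
+    print(task.id, task.inputs["difficulty"])
+```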
+From the QitOS repository root, run them sequentially:
+
+```bash
+export TASKS_FILE=./tasks.txt
+export SERVER=http://your-cybergym-server:8669
+
+while read -r TASK_ID; do
+  [ -z "$TASK_ID" ] && continue
+  SLUG="${TASK_ID/:/_}"
+  echo "===== START $TASK_ID ====="
+  python examples/benchmarks/cybergym_eval.py \
+    --task-id "$TASK_ID" \
+    --data-dir ../cybergym/cybergym_data/data \
+    --out-dir "runs/cybergym/workspace/$SLUG" \
+    --server "$SERVER" \
+    --difficulty level1 \
+    --model-name GLM-5.1-sii \
+    --api-key "$CYBERGYM_CLAUDE_AUTH_TOKEN" \
+    --base-url https://your-openai-compatible-endpoint/v1 \
+    --max-steps 30 \
+    --trace-logdir runs/cybergym/traces
+  echo "===== END $TASK_ID ====="
+done < "$TASKS_FILE" | tee runs/cybergym/run-100.log
+```
+
+For light parallelism you can switch to `xargs -P 2` or `xargs -P 4`, but verify the stability of the model endpoint and the server first.
+
+## Batch verify
+
+CyberGym's public `/submit-vul` endpoint only returns the vulnerable-side result. The full benchmark verdict also requires a fix-side verify.
+
+From the `cybergym` repository root:
+
+```bash
+python scripts/verify_batch_results.py \
+  --logs_dir ../qitos/runs/cybergym/logs \
+  --server http://your-cybergym-server:8669 \
+  --pocdb_path ../qitos/runs/cybergym/server_poc/poc.db \
+  --summary_json ../qitos/runs/cybergym/verify-summary.json
+```
+
+To inspect the current database state without actually sending verify requests:
+
+```bash
+python scripts/verify_batch_results.py \
+  --logs_dir ../qitos/runs/cybergym/logs \
+  --server http://your-cybergym-server:8669 \
+  --pocdb_path ../qitos/runs/cybergym/server_poc/poc.db \
+  --skip_verify
+```
+
+## Traces and artifacts
+
+After a run, the main artifacts live in:
+
+- `runs/cybergym/workspace/`
+- `runs/cybergym/server_poc/`
+- `runs/cybergym/traces/`
+
+View the trajectories:
+
+```bash
+qita board --logdir runs/cybergym/traces
+```
+
+A QitOS trace writes out:
+
+- `manifest.json`
+- `events.jsonl`
+- `steps.jsonl`
+
+## Current status
+
+This integration has verified that:
+
+- CyberGym tasks convert into QitOS `Task` objects
+- the benchmark family is registered in `qitos.benchmark`
+- the recipe and the thin example drive the same runner
+- a real smoke run can generate a task, initialize `GLM-5.1-sii`, and write a QitOS trace
+
+The known limitation is that the model protocol is not yet fully aligned. `GLM-5.1-sii` currently tends to emit `...`-style content, while the current agent still goes through `JsonDecisionParser`, which expects pure JSON, so short smoke runs stop on a parser error. This is a model-protocol adaptation issue to tackle next, not a problem with the CyberGym benchmark integration itself. A sketch of the kind of tolerant pre-parse step such an adaptation might start from is shown below.
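+For illustration only: `JsonDecisionParser` is the existing parser, while `extract_first_json_object` below is a hypothetical helper, not part of this PR or the agent codebase. It shows one tolerant way to recover a JSON decision from mixed model output:
+
+```python
+import json
+
+
+def extract_first_json_object(text: str):
+    """Hypothetical pre-parse step: find the first parseable JSON object
+    in mixed model output before handing it to a strict JSON parser."""
+    decoder = json.JSONDecoder()
+    idx = text.find("{")
+    while idx != -1:
+        try:
+            obj, _end = decoder.raw_decode(text, idx)
+            return obj
+        except json.JSONDecodeError:
+            # Not valid JSON at this brace; try the next one.
+            idx = text.find("{", idx + 1)
+    return None
+
+
+# Example: prose-wrapped output still yields {"action": "submit"}.
+print(extract_first_json_object('Thinking... {"action": "submit"} done.'))
+```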
"__main__": + raise SystemExit(main()) diff --git a/qitos/benchmark/__init__.py b/qitos/benchmark/__init__.py index 0e0aada..c1ca29a 100644 --- a/qitos/benchmark/__init__.py +++ b/qitos/benchmark/__init__.py @@ -40,6 +40,12 @@ "load_cybench_tasks": (".cybench", "load_cybench_tasks"), "run_cybench_task": (".cybench", "run_cybench_task"), "score_cybench_submission": (".cybench", "score_cybench_submission"), + "CyberGymBenchmarkAdapter": (".cybergym", "CyberGymBenchmarkAdapter"), + "CyberGymEvaluator": (".cybergym", "CyberGymEvaluator"), + "CyberGymRuntimeHook": (".cybergym", "CyberGymRuntimeHook"), + "CyberGymScorer": (".cybergym", "CyberGymScorer"), + "load_cybergym_tasks": (".cybergym", "load_cybergym_tasks"), + "run_cybergym_task": (".cybergym", "run_cybergym_task"), "GaiaAdapter": (".gaia", "GaiaAdapter"), "GaiaEvaluator": (".gaia", "GaiaEvaluator"), "GaiaRuntimeHook": (".gaia", "GaiaRuntimeHook"), @@ -100,6 +106,12 @@ def __getattr__(name: str): "run_cybench_task", "score_cybench_submission", "load_cybench_tasks", + "CyberGymBenchmarkAdapter", + "CyberGymEvaluator", + "CyberGymRuntimeHook", + "CyberGymScorer", + "load_cybergym_tasks", + "run_cybergym_task", "GaiaAdapter", "GaiaEvaluator", "GaiaRuntimeHook", diff --git a/qitos/benchmark/cybergym/__init__.py b/qitos/benchmark/cybergym/__init__.py new file mode 100644 index 0000000..126efae --- /dev/null +++ b/qitos/benchmark/cybergym/__init__.py @@ -0,0 +1,43 @@ +"""CyberGym benchmark integration.""" + +from __future__ import annotations + +import importlib + +_LAZY_ATTRS = { + "CyberGymBenchmarkAdapter": (".adapter", "CyberGymBenchmarkAdapter"), + "load_cybergym_tasks": (".adapter", "load_cybergym_tasks"), + "task_slug": (".adapter", "task_slug"), + "CyberGymEvaluator": (".evaluator", "CyberGymEvaluator"), + "CyberGymRuntimeHook": (".runtime", "CyberGymRuntimeHook"), + "prepare_task_dir": (".runtime", "prepare_task_dir"), + "CyberGymScorer": (".scorer", "CyberGymScorer"), + "make_trace_writer": (".runner", "make_trace_writer"), + "run_cybergym_agent_task": (".runner", "run_cybergym_agent_task"), + "run_cybergym_task": (".runner", "run_cybergym_task"), +} + + +def __getattr__(name: str): + target = _LAZY_ATTRS.get(name) + if target is None: + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + module_name, attr_name = target + module = importlib.import_module(module_name, __name__) + value = getattr(module, attr_name) + globals()[name] = value + return value + + +__all__ = [ + "CyberGymBenchmarkAdapter", + "CyberGymEvaluator", + "CyberGymRuntimeHook", + "CyberGymScorer", + "load_cybergym_tasks", + "make_trace_writer", + "prepare_task_dir", + "run_cybergym_agent_task", + "run_cybergym_task", + "task_slug", +] diff --git a/qitos/benchmark/cybergym/adapter.py b/qitos/benchmark/cybergym/adapter.py new file mode 100644 index 0000000..89484b4 --- /dev/null +++ b/qitos/benchmark/cybergym/adapter.py @@ -0,0 +1,85 @@ +"""CyberGym benchmark task adapter.""" + +from __future__ import annotations + +from typing import Any, Iterable, Mapping, Optional + +from qitos.core import Task, TaskBudget + +from ..base import BenchmarkAdapter, BenchmarkSource + + +def task_slug(task_id: str) -> str: + return str(task_id).replace(":", "_") + + +class CyberGymBenchmarkAdapter(BenchmarkAdapter): + """Convert CyberGym task ids into QitOS Task objects.""" + + def __init__(self, *, difficulty: str = "level1") -> None: + self.source = BenchmarkSource(name="cybergym", split=str(difficulty)) + self.difficulty = str(difficulty) + + def load_records( + 
+        self,
+        *,
+        task_ids: Iterable[str],
+        limit: Optional[int] = None,
+    ) -> list[dict[str, Any]]:
+        records: list[dict[str, Any]] = []
+        for raw_task_id in task_ids:
+            task_id = str(raw_task_id).strip()
+            if not task_id:
+                continue
+            records.append({"task_id": task_id})
+            if limit is not None and len(records) >= int(limit):
+                break
+        return records
+
+    def to_task(
+        self,
+        record: Mapping[str, Any],
+        *,
+        split: str | None = None,
+        idx: int = 0,
+    ) -> Task:
+        task_id = str(record.get("task_id") or record.get("id") or "").strip()
+        if not task_id:
+            raise ValueError("CyberGym record requires task_id")
+        difficulty = str(split or record.get("difficulty") or self.difficulty)
+        return Task(
+            id=task_id,
+            objective=f"Solve CyberGym task {task_id}.",
+            inputs={"task_id": task_id, "difficulty": difficulty},
+            budget=TaskBudget(max_steps=int(record.get("max_steps", 30))),
+            metadata={
+                "benchmark": "cybergym",
+                "split": difficulty,
+                "task_index": int(idx),
+                "raw_record": dict(record),
+            },
+        )
+
+    def to_tasks(
+        self,
+        records: Iterable[Mapping[str, Any]],
+        split: str,
+        limit: Optional[int] = None,
+    ) -> list[Task]:
+        tasks: list[Task] = []
+        for idx, record in enumerate(records):
+            tasks.append(self.to_task(record, split=split, idx=idx))
+            if limit is not None and len(tasks) >= int(limit):
+                break
+        return tasks
+
+
+def load_cybergym_tasks(
+    *,
+    task_ids: Iterable[str],
+    difficulty: str = "level1",
+    limit: Optional[int] = None,
+) -> list[Task]:
+    adapter = CyberGymBenchmarkAdapter(difficulty=difficulty)
+    records = adapter.load_records(task_ids=task_ids, limit=limit)
+    return adapter.to_tasks(records, split=difficulty, limit=limit)
diff --git a/qitos/benchmark/cybergym/evaluator.py b/qitos/benchmark/cybergym/evaluator.py
new file mode 100644
index 0000000..8ecb54c
--- /dev/null
+++ b/qitos/benchmark/cybergym/evaluator.py
@@ -0,0 +1,28 @@
+"""CyberGym benchmark evaluator."""
+
+from __future__ import annotations
+
+from typing import Any, Dict
+
+from qitos.core import ExperimentSpec, RunSpec
+
+from ..contracts import BenchmarkEvaluator, PreparedBenchmarkTask
+
+
+class CyberGymEvaluator(BenchmarkEvaluator):
+    """Return CyberGym-native execution details for scoring."""
+
+    def evaluate(
+        self,
+        *,
+        prepared: PreparedBenchmarkTask,
+        run_spec: RunSpec,
+        experiment_spec: ExperimentSpec,
+        execution: Any,
+    ) -> Dict[str, Any]:
+        _ = (prepared, run_spec, experiment_spec)
+        if isinstance(execution, dict):
+            return dict(execution)
+        if hasattr(execution, "to_dict"):
+            return dict(execution.to_dict())
+        return {"execution": execution}
diff --git a/qitos/benchmark/cybergym/runner.py b/qitos/benchmark/cybergym/runner.py
new file mode 100644
index 0000000..db2526d
--- /dev/null
+++ b/qitos/benchmark/cybergym/runner.py
@@ -0,0 +1,221 @@
+"""CyberGym benchmark runner."""
+
+from __future__ import annotations
+
+import os
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+from qitos.core import BenchmarkRunResult, ExperimentSpec, RunSpec, Task
+from qitos.engine.stop_criteria import FinalResultCriteria, MaxStepsCriteria
+from qitos.engine.states import ContextConfig
+from qitos.kit.env.host_env import HostEnv
+from qitos.trace import TraceWriter
+
+from .adapter import task_slug
+from .evaluator import CyberGymEvaluator
+from .runtime import CyberGymRuntimeHook, prepare_task_dir
+from .scorer import CyberGymScorer
+
+
+def make_trace_writer(
+    *,
+    trace_logdir: str | Path,
+    trace_prefix: str,
+    task_id: str,
+    model_id: str,
+) -> TraceWriter:
+    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
+    run_id = f"{trace_prefix}_{task_slug(task_id)}_{stamp}"
+    return TraceWriter(
+        output_dir=str(Path(trace_logdir).expanduser().resolve()),
+        run_id=run_id,
+        strict_validate=True,
+        metadata={"model_id": model_id},
+    )
+
+
+def run_cybergym_agent_task(
+    *,
+    task_dir: str | Path,
+    model_name: str,
+    api_key: str,
+    base_url: str,
+    server: str,
+    max_steps: int,
+    trace_logdir: str | Path,
+    trace_prefix: str = "qitos_cybergym",
+    run_spec: RunSpec | None = None,
+    experiment_spec: ExperimentSpec | None = None,
+) -> dict[str, Any]:
+    try:
+        from .agent.adapter import CyberGymAdapter
+        from .agent.cli import build_agent
+        from .agent.stop_criteria import PoCVerificationCriteria
+    except ModuleNotFoundError as exc:
+        raise ModuleNotFoundError(
+            "CyberGym agent package is not bundled in QitOS. "
+            "Copy the cybergym_agent repository into `qitos/benchmark/cybergym/agent/` "
+            "before running the CyberGym benchmark."
+        ) from exc
+
+    task_path = Path(task_dir).expanduser().resolve()
+    adapter = CyberGymAdapter(server_url=server)
+    task = adapter.from_task_dir(str(task_path), max_steps=max_steps)
+
+    agent = build_agent(
+        model=model_name,
+        workspace_root=str(task_path),
+        server_url=server,
+        max_steps=max_steps,
+        llm_config={"api_key": api_key, "base_url": base_url},
+    )
+
+    env = HostEnv(workspace_root=str(task_path))
+    stop_criteria = [
+        PoCVerificationCriteria(),
+        FinalResultCriteria(),
+        MaxStepsCriteria(max_steps=max_steps),
+    ]
+    context_config = ContextConfig(
+        tool_result_max_chars=4000,
+        conversation_max_rounds=10,
+        loop_max_repeats=3,
+    )
+    trace_writer = make_trace_writer(
+        trace_logdir=trace_logdir,
+        trace_prefix=trace_prefix,
+        task_id=task.id,
+        model_id=model_name,
+    )
+
+    result = agent.run(
+        task=task,
+        return_state=True,
+        env=env,
+        stop_criteria=stop_criteria,
+        max_steps=max_steps,
+        workspace=str(task_path),
+        context_config=context_config,
+        trace=trace_writer,
+        run_spec=run_spec,
+        experiment_spec=experiment_spec,
+        description=task.inputs.get("description", ""),
+        task_id=task.inputs.get("task_id", ""),
+        agent_id=task.inputs.get("agent_id", ""),
+        checksum=task.inputs.get("checksum", ""),
+        server_url=task.inputs.get("server_url", server),
+        error_txt=task.inputs.get("error_txt", ""),
+        patch_diff=task.inputs.get("patch_diff", ""),
+        repo_dir=task.inputs.get("repo_dir", ""),
+    )
+
+    return {
+        "task_id": task.id,
+        "task_dir": str(task_path),
+        "trace_run_dir": str(trace_writer.run_dir),
+        "stop_reason": result.state.stop_reason,
+        "final_result": result.state.final_result,
+        "step_count": result.step_count,
+        "task_result": result.task_result.to_dict() if result.task_result is not None else None,
+    }
+
+
+def run_cybergym_task(
+    *, task: Task, run_spec: RunSpec, experiment_spec: ExperimentSpec
+) -> BenchmarkRunResult:
+    started = time.time()
+    effective_spec = RunSpec.from_value(run_spec)
+    effective_spec.benchmark_name = effective_spec.benchmark_name or "cybergym"
+    effective_spec.benchmark_split = effective_spec.benchmark_split or str(
+        task.inputs.get("difficulty") or "level1"
+    )
+    effective_spec.toolset_name = effective_spec.toolset_name or "cybergym_agent"
+    effective_spec.metadata = {
+        **dict(effective_spec.metadata or {}),
+        "recipe": "cybergym_agent",
+    }
+
+    environment = dict(effective_spec.environment or {})
+    task_id = str(task.inputs.get("task_id") or task.id)
+    difficulty = str(
+        task.inputs.get("difficulty") or effective_spec.benchmark_split or "level1"
+    )
"level1") + workspace = Path(str(environment.get("workspace") or "runs/cybergym/workspace")) + task_dir = workspace / task_slug(task_id) + data_dir = str(environment.get("data_dir") or "") + server = str(environment.get("server") or "") + base_url = str(environment.get("base_url") or "") + trace_logdir = str(environment.get("trace_logdir") or "runs/cybergym/traces") + api_key = str( + environment.get("api_key") + or os.getenv("OPENAI_API_KEY", "") + or os.getenv("QITOS_API_KEY", "") + or os.getenv("CYBERGYM_CLAUDE_AUTH_TOKEN", "") + ) + max_steps = int((effective_spec.metadata or {}).get("max_steps", task.budget.max_steps or 30)) + + if not data_dir: + raise ValueError("CyberGym run requires run_spec.environment['data_dir']") + if not server: + raise ValueError("CyberGym run requires run_spec.environment['server']") + if not base_url: + raise ValueError("CyberGym run requires run_spec.environment['base_url']") + if not api_key: + raise ValueError("CyberGym run requires api_key or OPENAI_API_KEY/QITOS_API_KEY") + + prepare_task_dir( + task_id=task_id, + out_dir=task_dir, + data_dir=data_dir, + server=server, + difficulty=difficulty, + ) + + prepared = CyberGymRuntimeHook().prepare( + task=task, + run_spec=effective_spec, + experiment_spec=experiment_spec, + ) + execution = run_cybergym_agent_task( + task_dir=task_dir, + model_name=str(effective_spec.model_name or ""), + api_key=api_key, + base_url=base_url, + server=server, + max_steps=max_steps, + trace_logdir=trace_logdir, + trace_prefix=str(environment.get("trace_prefix") or "qitos_cybergym"), + run_spec=effective_spec, + experiment_spec=experiment_spec, + ) + task_result = execution.get("task_result") or {} + base_result = BenchmarkRunResult( + task_id=task_id, + benchmark="cybergym", + split=difficulty, + prediction=execution.get("final_result"), + success=bool(task_result.get("success", False)), + stop_reason=str(execution.get("stop_reason") or "unknown"), + steps=int(execution.get("step_count") or 0), + latency_seconds=float(time.time() - started), + token_usage=int((task_result.get("metrics") or {}).get("token_usage", 0)), + cost=0.0, + trace_run_dir=str(execution.get("trace_run_dir") or ""), + run_spec_ref=effective_spec.fingerprint(), + metadata={"execution": execution}, + ) + evaluation = CyberGymEvaluator().evaluate( + prepared=prepared, + run_spec=effective_spec, + experiment_spec=experiment_spec, + execution=execution, + ) + return CyberGymScorer().score( + prepared=prepared, + run_spec=effective_spec, + experiment_spec=experiment_spec, + execution=execution, + evaluation=evaluation, + base_result=base_result, + ) diff --git a/qitos/benchmark/cybergym/runtime.py b/qitos/benchmark/cybergym/runtime.py new file mode 100644 index 0000000..6904f35 --- /dev/null +++ b/qitos/benchmark/cybergym/runtime.py @@ -0,0 +1,52 @@ +"""CyberGym benchmark runtime helpers.""" + +from __future__ import annotations + +from pathlib import Path + +from qitos.core import ExperimentSpec, RunSpec, Task + +from ..contracts import BenchmarkRuntimeHook, PreparedBenchmarkTask + + +def prepare_task_dir( + *, + task_id: str, + out_dir: str | Path, + data_dir: str | Path, + server: str, + difficulty: str, +) -> Path: + from cybergym.task.gen_task import generate_task + from cybergym.task.types import TaskConfig, TaskDifficulty + + out_path = Path(out_dir).expanduser().resolve() + out_path.mkdir(parents=True, exist_ok=True) + + generate_task( + TaskConfig( + task_id=task_id, + out_dir=out_path, + data_dir=Path(data_dir).expanduser().resolve(), + server=server, 
+            difficulty=TaskDifficulty(difficulty),
+        )
+    )
+    return out_path
+
+
+class CyberGymRuntimeHook(BenchmarkRuntimeHook):
+    """Attach CyberGym runtime metadata to a benchmark task."""
+
+    def prepare(
+        self, *, task: Task, run_spec: RunSpec, experiment_spec: ExperimentSpec
+    ) -> PreparedBenchmarkTask:
+        _ = experiment_spec
+        environment = dict(run_spec.environment or {})
+        metadata = {
+            "server": environment.get("server"),
+            "data_dir": environment.get("data_dir"),
+            "workspace": environment.get("workspace"),
+            "difficulty": task.inputs.get("difficulty") or task.metadata.get("split"),
+        }
+        return PreparedBenchmarkTask(task=task, runtime_metadata=metadata)
diff --git a/qitos/benchmark/cybergym/scorer.py b/qitos/benchmark/cybergym/scorer.py
new file mode 100644
index 0000000..7e378f2
--- /dev/null
+++ b/qitos/benchmark/cybergym/scorer.py
@@ -0,0 +1,32 @@
+"""CyberGym benchmark scorer."""
+
+from __future__ import annotations
+
+from typing import Any, Dict
+
+from qitos.core import BenchmarkRunResult, ExperimentSpec, RunSpec
+
+from ..contracts import BenchmarkScorer, PreparedBenchmarkTask
+
+
+class CyberGymScorer(BenchmarkScorer):
+    """Keep the normalized result and attach CyberGym-specific metadata."""
+
+    def score(
+        self,
+        *,
+        prepared: PreparedBenchmarkTask,
+        run_spec: RunSpec,
+        experiment_spec: ExperimentSpec,
+        execution: Any,
+        evaluation: Dict[str, Any],
+        base_result: BenchmarkRunResult,
+    ) -> BenchmarkRunResult:
+        _ = (run_spec, experiment_spec, execution)
+        base_result.metadata = {
+            **dict(base_result.metadata or {}),
+            "benchmark_runtime": dict(prepared.runtime_metadata or {}),
+            "cybergym": dict(evaluation or {}),
+            "family": "cybergym",
+        }
+        return base_result
diff --git a/qitos/benchmark/runner.py b/qitos/benchmark/runner.py
index 1f90d49..0178654 100644
--- a/qitos/benchmark/runner.py
+++ b/qitos/benchmark/runner.py
@@ -9,6 +9,7 @@
 from qitos.core.task import Task
 
 from .cybench import CyBenchAdapter, run_cybench_task
+from .cybergym import CyberGymBenchmarkAdapter, run_cybergym_task
 from .desktop import DesktopStarterAdapter, run_desktop_starter_task
 from .gaia import GaiaAdapter, run_gaia_task
 from .osworld import OSWorldBenchmarkAdapter, run_osworld_task
@@ -25,6 +26,7 @@ def normalize_benchmark_name(value: str) -> str:
     "tau-bench": "tau-bench",
     "gaia": "gaia",
     "cybench": "cybench",
+    "cybergym": "cybergym",
     "desktop": "desktop-starter",
     "desktop-starter": "desktop-starter",
     "osworld-starter": "desktop-starter",
@@ -60,6 +62,24 @@ def load_benchmark_tasks(
             limit=limit,
         )
         return adapter.to_tasks(rows, split=split, limit=limit)
+    if normalized == "cybergym":
+        if root is None:
+            raise ValueError("CyberGym task loading requires root as a task id or task-id file")
+        root_value = str(root)
+        try:
+            from pathlib import Path
+
+            path = Path(root_value)
+            task_ids = (
+                [line.strip() for line in path.read_text(encoding="utf-8").splitlines()]
+                if path.exists()
+                else [root_value]
+            )
+        except OSError:
+            task_ids = [root_value]
+        adapter = CyberGymBenchmarkAdapter(difficulty=split)
+        rows = adapter.load_records(task_ids=task_ids, limit=limit)
+        return adapter.to_tasks(rows, split=split, limit=limit)
     if normalized == "desktop-starter":
         adapter = DesktopStarterAdapter(dataset_path=root)
         rows = adapter.load_records(split=split)
@@ -179,6 +199,28 @@ def runner(*, task: Task, run_spec: RunSpec, experiment_spec: ExperimentSpec):
                 experiment_spec=experiment_spec,
             )
 
+        return runner
+    if normalized == "cybergym" and lane in {
+        "cybergym_baseline",
+        "cybergym_smoke",
"baseline", + "smoke", + }: + def runner(*, task: Task, run_spec: RunSpec, experiment_spec: ExperimentSpec): + enriched = RunSpec.from_value(run_spec) + if not enriched.prompt_protocol: + enriched.prompt_protocol = "cybergym_agent_v1" + if not enriched.parser_name: + enriched.parser_name = "cybergym_agent" + if lane in {"cybergym_smoke", "smoke"}: + enriched.metadata = dict(enriched.metadata or {}) + enriched.metadata["cybergym_smoke"] = True + return run_cybergym_task( + task=task, + run_spec=enriched, + experiment_spec=experiment_spec, + ) + return runner if normalized == "osworld" and lane in { "osworld_baseline", diff --git a/qitos/recipes/benchmarks/cybergym.py b/qitos/recipes/benchmarks/cybergym.py new file mode 100644 index 0000000..efd3e8c --- /dev/null +++ b/qitos/recipes/benchmarks/cybergym.py @@ -0,0 +1,98 @@ +"""Canonical CyberGym recipe for QitOS.""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +from qitos.benchmark.cybergym import ( + make_trace_writer, + prepare_task_dir, + run_cybergym_agent_task, + task_slug, +) + + +def run_cybergym_recipe_task( + *, + task_id: str, + data_dir: str, + out_dir: str, + server: str, + difficulty: str, + model_name: str, + api_key: str, + base_url: str, + max_steps: int, + trace_logdir: str, + trace_prefix: str = "qitos_cybergym", +) -> dict[str, Any]: + task_dir = prepare_task_dir( + task_id=task_id, + out_dir=out_dir, + data_dir=data_dir, + server=server, + difficulty=difficulty, + ) + return run_cybergym_agent_task( + task_dir=task_dir, + model_name=model_name, + api_key=api_key, + base_url=base_url, + server=server, + max_steps=max_steps, + trace_logdir=trace_logdir, + trace_prefix=trace_prefix, + ) + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="Run CyberGym through QitOS with native trace support" + ) + parser.add_argument("--task-id", required=True) + parser.add_argument("--data-dir", required=True) + parser.add_argument("--out-dir", required=True) + parser.add_argument("--server", required=True) + parser.add_argument( + "--difficulty", default="level1", choices=["level0", "level1", "level2", "level3"] + ) + parser.add_argument("--model-name", required=True) + parser.add_argument("--api-key", required=True) + parser.add_argument("--base-url", required=True) + parser.add_argument("--max-steps", type=int, default=30) + parser.add_argument("--trace-logdir", default="runs/cybergym/traces") + parser.add_argument("--trace-prefix", default="qitos_cybergym") + args = parser.parse_args(argv) + + result = run_cybergym_recipe_task( + task_id=args.task_id, + data_dir=args.data_dir, + out_dir=str(Path(args.out_dir)), + server=args.server, + difficulty=args.difficulty, + model_name=args.model_name, + api_key=args.api_key, + base_url=args.base_url, + max_steps=int(args.max_steps), + trace_logdir=args.trace_logdir, + trace_prefix=args.trace_prefix, + ) + print(json.dumps(result, ensure_ascii=False, indent=2)) + return 0 + + +__all__ = [ + "make_trace_writer", + "prepare_task_dir", + "run_cybergym_agent_task", + "run_cybergym_recipe_task", + "task_slug", + "main", +] + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_benchmark_cybergym_recipe.py b/tests/test_benchmark_cybergym_recipe.py new file mode 100644 index 0000000..a11a157 --- /dev/null +++ b/tests/test_benchmark_cybergym_recipe.py @@ -0,0 +1,47 @@ +import tempfile +import unittest +from pathlib import Path + +from qitos.benchmark 
+from qitos.benchmark.cybergym import CyberGymBenchmarkAdapter, make_trace_writer, task_slug
+from qitos.recipes.benchmarks import cybergym
+
+
+class CybergymRecipeTests(unittest.TestCase):
+    def test_task_slug_replaces_colon(self):
+        self.assertEqual(task_slug("arvo:1065"), "arvo_1065")
+        self.assertEqual(task_slug("oss-fuzz:42535201"), "oss-fuzz_42535201")
+
+    def test_make_trace_writer_uses_prefix_and_task_slug(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            writer = make_trace_writer(
+                trace_logdir=tmpdir,
+                trace_prefix="qitos_cybergym",
+                task_id="arvo:1065",
+                model_id="GLM-5.1-sii",
+            )
+
+            self.assertTrue(writer.run_id.startswith("qitos_cybergym_arvo_1065_"))
+            self.assertEqual(writer.metadata["model_id"], "GLM-5.1-sii")
+            self.assertTrue(Path(writer.run_dir).exists())
+
+    def test_adapter_builds_qitos_task_from_task_id(self):
+        adapter = CyberGymBenchmarkAdapter()
+
+        task = adapter.to_task({"task_id": "arvo:1065"}, split="level1", idx=0)
+
+        self.assertEqual(task.id, "arvo:1065")
+        self.assertEqual(task.inputs["difficulty"], "level1")
+        self.assertEqual(task.metadata["benchmark"], "cybergym")
+
+    def test_cybergym_is_registered_as_benchmark_family(self):
+        self.assertEqual(normalize_benchmark_name("cybergym"), "cybergym")
+        self.assertIsNotNone(resolve_builtin_runner(benchmark="cybergym", strategy="smoke"))
+
+    def test_recipe_reuses_benchmark_family_helpers(self):
+        self.assertIs(cybergym.task_slug, task_slug)
+        self.assertIs(cybergym.make_trace_writer, make_trace_writer)
+
+
+if __name__ == "__main__":
+    unittest.main()
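As a quick check of the registration path covered by the tests above, the same calls can be run directly; a minimal sketch, assuming this branch is installed:

```python
from qitos.benchmark import normalize_benchmark_name, resolve_builtin_runner

# Mirrors test_cybergym_is_registered_as_benchmark_family: the family name
# normalizes to itself and the smoke lane resolves to a runner.
name = normalize_benchmark_name("cybergym")
runner = resolve_builtin_runner(benchmark=name, strategy="smoke")
print(name, runner is not None)  # expected: cybergym True
```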