From 078e87b5f6ae1e1c8d441243ddf3f8be7f9b32d1 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Sat, 25 Apr 2026 07:00:55 -0400 Subject: [PATCH] verifier: mkdir -p /app before pytest so --rootdir validation passes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VERIFIER_ENV pins PYTEST_ADDOPTS=--rootdir=/app for test-node-ID anchoring across SkillsBench / TB-style tasks. Tasks whose Dockerfile sets WORKDIR elsewhere (e.g. /root) and that don't otherwise create /app then trip pytest's startup directory check: ERROR: Directory '/app' not found. Check your '--rootdir' option. The verifier aborts before reaching test_outputs.py, and the trial scores 0 even when the agent produced correct output. Surfaced today by trajectory analysis of the SkillsBench Apr 2026 patch trial: pg-essay-to-audiobook wrote /root/audiobook.mp3 (29:45 MP3, valid ffmpeg output) and the verifier's pytest still aborted on the missing /app rootdir; the same failure shape appears on scheduling-email-assistant. This mirrors the skill_eval Dockerfile pattern (skill_eval.py:325), which already RUNs 'mkdir -p /logs/verifier /logs/agent /logs/artifacts /app /tests'. Add /app to the verifier dir prep step in trial.py so all benchflow eval paths (not just skill_eval) get it. Idempotent: tasks that DO populate /app are unaffected — mkdir -p is a no-op when the directory exists, and we deliberately don't chmod /app (any pre-existing task content stays root-owned). 
--- src/benchflow/trial.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index b255105..8dff593 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -545,9 +545,15 @@ async def soft_verify(self) -> tuple[dict | None, str | None, str | None]: from benchflow._sandbox import _build_cleanup_cmd, _read_hardening_config self._trial_paths.verifier_dir.mkdir(parents=True, exist_ok=True) - # Clean verifier output dir — chmod 777 so non-root verifier processes can write + # Clean verifier output dir — chmod 777 so non-root verifier processes can write. + # Also ensure /app exists: VERIFIER_ENV pins PYTEST_ADDOPTS=--rootdir=/app for + # test-node-ID anchoring, and pytest aborts with "Directory '/app' not found" + # when the task's Dockerfile WORKDIRs elsewhere (e.g. /root). Tasks that DO + # populate /app are unaffected — `mkdir -p` is a no-op when the directory + # already exists, and we don't chmod it (any task content stays root-owned). await self._env.exec( - "rm -rf /logs/verifier && mkdir -p /logs/verifier && chmod 777 /logs/verifier", + "rm -rf /logs/verifier && mkdir -p /logs/verifier /app && " + "chmod 777 /logs/verifier", user="root", timeout_sec=10, ) # Purge agent-injected conftest/sitecustomize/.pth without