From e998f8910fe2f3fb625fed329330e52820c45961 Mon Sep 17 00:00:00 2001
From: Jeremy Eder
Date: Fri, 12 Dec 2025 22:10:32 -0500
Subject: [PATCH 1/4] feat: add assessor validation A/B testing framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement Phase 1 of empirical assessor validation using Terminal-Bench 2.0:

- Add AssessorStateToggler class to manipulate repository state
  * Support for 3 Phase 1 assessors (CLAUDE.md, README, test coverage)
  * Safe fail/restore operations with context managers
  * Extensible registration system for new assessors

- Add compare_assessor_impact() function for A/B testing
  * Orchestrates baseline (assessor fails) vs treatment (assessor passes)
  * Uses HarborRunner for Terminal-Bench execution
  * Calculates statistical significance (p-values, Cohen's d)

- Add validate-assessor CLI command
  * Validates single assessor impact with configurable tasks/runs
  * Generates JSON and Markdown reports
  * Lists supported assessors with --list-assessors flag
  * Default Phase 1 task subset (8 diverse tasks)

Related to Phase 1 implementation plan for ultra-fast first data (3-5 days).
Next: Unit tests and first validation run.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5
---
 src/agentready/cli/benchmark.py              | 224 ++++++++++++++++++
 src/agentready/cli/main.py                   |   3 +-
 .../services/harbor/agent_toggler.py         | 168 ++++++++++++-
 src/agentready/services/harbor/comparer.py   | 159 +++++++++++++
 4 files changed, 551 insertions(+), 3 deletions(-)

diff --git a/src/agentready/cli/benchmark.py b/src/agentready/cli/benchmark.py
index d469ab34..3b650f4f 100644
--- a/src/agentready/cli/benchmark.py
+++ b/src/agentready/cli/benchmark.py
@@ -1,5 +1,6 @@
 """Benchmark command for running agent coding evaluations."""
 
+import json
 import os
 import tempfile
 from pathlib import Path
@@ -8,6 +9,8 @@
 
 from ..services.eval_harness.harbor_config import HarborConfig
 from ..services.eval_harness.tbench_runner import _real_tbench_result
+from ..services.harbor.agent_toggler import AssessorStateToggler
+from ..services.harbor.comparer import compare_assessor_impact
 
 
 @click.command()
@@ -187,3 +190,224 @@ def _run_tbench(repo_path, subset, model, verbose, timeout, output_dir, skip_pre
             traceback.print_exc()
 
         raise click.Abort()
+
+
+# Default Phase 1 task subset (8 diverse tasks, ~2-3 hours per assessor)
+DEFAULT_PHASE1_TASKS = [
+    "adaptive-rejection-sampler",  # Math/algorithms
+    "async-http-client",  # Networking
+    "terminal-file-browser",  # File I/O
+    "markdown-parser",  # Text processing
+    "json-validator",  # Data structures
+    "cli-calculator",  # User interaction
+    "log-analyzer",  # String manipulation
+    "sudoku-solver",  # Logic
+]
+
+
+@click.command()
+@click.option(
+    "--assessor",
+    "-a",
+    required=False,
+    help="Assessor ID to validate (e.g., claude_md_file, readme_structure, test_coverage)",
+)
+@click.option(
+    "--tasks",
+    "-t",
+    multiple=True,
+    help="Terminal-Bench task names (default: Phase 1 subset of 8 tasks)",
+)
+@click.option(
+    "--runs",
+    "-r",
+    type=int,
+    default=3,
+    help="Number of runs per task (default: 3, recommended: 5+)",
+)
+@click.option(
+    "--model",
+    "-m",
+    type=click.Choice(["claude-haiku-4-5", "claude-sonnet-4-5"]),
+    default="claude-haiku-4-5",
+    help="Claude model to use (default: haiku-4-5 for speed)",
+)
+@click.option(
+    "--output-dir",
+    "-o",
+    type=click.Path(),
+    default=None,
+    help="Output directory (default: .agentready/validations/{assessor_id}/)",
+)
+@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output")
+@click.option(
+    "--list-assessors",
+    is_flag=True,
+    help="List supported assessors and exit",
+)
+def validate_assessor(
+    assessor, tasks, runs, model, output_dir, verbose, list_assessors
+):
+    """Validate single assessor impact using Terminal-Bench A/B testing.
+
+    This command empirically measures the impact of a specific AgentReady assessor
+    on Claude Code's Terminal-Bench performance by running an A/B test:
+
+    \b
+    1. Baseline: Force assessor to FAIL (manipulate repo state)
+    2. Treatment: Restore repo to normal state (assessor PASSES)
+    3. Compare success rates with statistical significance testing
+
+    Examples:
+
+    \b
+    # Validate CLAUDE.md impact (8 tasks, 3 runs each = 48 trials)
+    agentready validate-assessor --assessor claude_md_file
+
+    \b
+    # Test README with custom tasks and 5 runs for higher statistical power
+    agentready validate-assessor --assessor readme_structure \\
+        --tasks adaptive-rejection-sampler \\
+        --tasks async-http-client \\
+        --runs 5
+
+    \b
+    # List all supported assessors
+    agentready validate-assessor --list-assessors
+    """
+    repo_root = Path.cwd()
+
+    # Handle --list-assessors
+    if list_assessors:
+        toggler = AssessorStateToggler(repo_root=repo_root)
+        supported = toggler.list_supported_assessors()
+
+        click.echo("Supported Assessors for Validation:")
+        click.echo(f"{'=' * 60}")
+        for assessor_id in supported:
+            click.echo(f"  - {assessor_id}")
+        click.echo(f"\nTotal: {len(supported)} assessors")
+        click.echo("\nUsage: agentready validate-assessor --assessor {assessor_id}")
+        return
+
+    # Validate that --assessor is provided if not listing
+    if not assessor:
+        click.echo(
+            "Error: Missing required option '--assessor' / '-a'.\n"
+            "Use --list-assessors to see available assessors.",
+            err=True,
+        )
+        raise click.Abort()
+
+    # Validate ANTHROPIC_API_KEY
+    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+    if not api_key:
+        click.echo(
+            "Error: ANTHROPIC_API_KEY environment variable not set.\n"
+            "Set it with: export ANTHROPIC_API_KEY=your-key-here",
+            err=True,
+        )
+        raise click.Abort()
+
+    # Use default Phase 1 tasks if not specified
+    if not tasks:
+        tasks = DEFAULT_PHASE1_TASKS
+        if verbose:
+            click.echo(f"Using default Phase 1 task subset ({len(tasks)} tasks)\n")
+
+    # Convert model name to full identifier
+    model_id = f"anthropic/{model}"
+
+    # Set output directory
+    if output_dir:
+        output_path = Path(output_dir)
+    else:
+        output_path = Path(".agentready") / "validations" / assessor
+
+    try:
+        # Run A/B comparison
+        comparison = compare_assessor_impact(
+            assessor_id=assessor,
+            task_names=list(tasks),
+            repo_root=repo_root,
+            runs_per_task=runs,
+            output_dir=output_path,
+            model=model_id,
+            verbose=verbose,
+        )
+
+        # Save JSON results
+        json_file = output_path / f"{assessor}.json"
+        with open(json_file, "w") as f:
+            json.dump(
+                {
+                    "assessor_id": assessor,
+                    "tasks": list(tasks),
+                    "runs_per_task": runs,
+                    "baseline": comparison.without_agent.to_dict(),
+                    "treatment": comparison.with_agent.to_dict(),
+                    "deltas": comparison.deltas,
+                    "statistical_significance": comparison.statistical_significance,
+                },
+                f,
+                indent=2,
+            )
+
+        # Generate Markdown report
+        md_file = output_path / f"{assessor}.md"
+        with open(md_file, "w") as f:
+            f.write(f"# Assessor Impact Validation: {assessor}\n\n")
+            f.write(f"**Date**: {comparison.created_at}\n")
+            f.write(f"**Tasks**: {len(tasks)}\n")
+            f.write(f"**Runs per Task**: {runs}\n\n")
+
+            f.write("## Results Summary\n\n")
+            f.write(
+                "| Metric | Baseline (Assessor Fails) | Treatment (Assessor Passes) | Delta |\n"
+            )
+            f.write(
+                "|--------|---------------------------|----------------------------|-------|\n"
+            )
+
+            delta = comparison.deltas["success_rate_delta"]
+            sign = "+" if delta >= 0 else ""
+            f.write(
+                f"| Success Rate | {comparison.without_agent.success_rate:.1f}% | "
+                f"{comparison.with_agent.success_rate:.1f}% | **{sign}{delta:.1f} pp** |\n"
+            )
+
+            duration_delta = comparison.deltas.get("avg_duration_delta_sec", 0)
+            sign = "+" if duration_delta >= 0 else ""
+            f.write(
+                f"| Avg Duration | {comparison.without_agent.avg_duration_sec:.1f}s | "
+                f"{comparison.with_agent.avg_duration_sec:.1f}s | {sign}{duration_delta:.1f}s |\n"
+            )
+
+            f.write("\n## Statistical Significance\n\n")
+            is_sig = comparison.statistical_significance.get(
+                "success_rate_significant", False
+            )
+            p_val = comparison.statistical_significance.get("success_rate_p_value")
+            f.write(f"- **Significant**: {'YES ✓' if is_sig else 'NO'}\n")
+            if p_val is not None:
+                f.write(f"- **P-value**: {p_val:.4f}\n")
+
+            f.write("\n## Files\n\n")
+            f.write(f"- JSON: `{json_file}`\n")
+            f.write(f"- Markdown: `{md_file}`\n")
+
+        click.echo("\nResults saved:")
+        click.echo(f"  - JSON: {json_file}")
+        click.echo(f"  - Markdown: {md_file}\n")
+
+    except ValueError as e:
+        click.echo(f"\nError: {e}\n", err=True)
+        click.echo("Use --list-assessors to see supported assessors.", err=True)
+        raise click.Abort()
+    except Exception as e:
+        click.echo(f"\nValidation failed: {e}\n", err=True)
+        if verbose:
+            import traceback
+
+            traceback.print_exc()
+        raise click.Abort()
diff --git a/src/agentready/cli/main.py b/src/agentready/cli/main.py
index 64c99937..73f888e7 100644
--- a/src/agentready/cli/main.py
+++ b/src/agentready/cli/main.py
@@ -29,7 +29,7 @@
 # Lightweight commands - imported immediately
 from .align import align
-from .benchmark import benchmark
+from .benchmark import benchmark, validate_assessor
 from .bootstrap import bootstrap
 from .demo import demo
 from .repomix import repomix_generate
@@ -544,6 +544,7 @@ def generate_config():
 # Register lightweight commands (heavy commands loaded lazily via LazyGroup)
 cli.add_command(align)
 cli.add_command(benchmark)
+cli.add_command(validate_assessor)
 cli.add_command(bootstrap)
 cli.add_command(demo)
 cli.add_command(migrate_report)
diff --git a/src/agentready/services/harbor/agent_toggler.py b/src/agentready/services/harbor/agent_toggler.py
index 6653c82f..16151109 100644
--- a/src/agentready/services/harbor/agent_toggler.py
+++ b/src/agentready/services/harbor/agent_toggler.py
@@ -1,9 +1,9 @@
-"""Service for safely enabling/disabling the doubleagent.md file."""
+"""Service for safely enabling/disabling agent files and manipulating repository state."""
 
 import shutil
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Generator
+from typing import Callable, Dict, Generator, Tuple
 
 
 class AgentFileToggler:
@@ -89,3 +89,167 @@ def temporarily_enabled(self) -> Generator[None, None, None]:
         finally:
             if was_disabled:
                 self.disable()
+
+
+class AssessorStateToggler:
+    """Manipulate repository state to force assessor pass/fail for A/B testing.
+
+    This class safely modifies repository files to simulate scenarios where
+    specific assessors would pass or fail, enabling empirical validation of
+    assessor impacts on agent performance.
+
+    Example:
+        toggler = AssessorStateToggler()
+
+        # Force CLAUDE.md assessor to fail
+        toggler.force_fail("claude_md_file")
+        # Run benchmark
+        toggler.restore("claude_md_file")
+    """
+
+    # Mapping of assessor IDs to (fail_action, restore_action) tuples
+    # Each action is a callable that takes the repository root Path
+    MANIPULATIONS: Dict[str, Tuple[Callable[[Path], None], Callable[[Path], None]]] = {}
+
+    @classmethod
+    def register_manipulation(
+        cls,
+        assessor_id: str,
+        fail_action: Callable[[Path], None],
+        restore_action: Callable[[Path], None],
+    ) -> None:
+        """Register a manipulation strategy for an assessor.
+
+        Args:
+            assessor_id: Unique assessor identifier (e.g., "claude_md_file")
+            fail_action: Function to force assessor to fail state
+            restore_action: Function to restore assessor to pass state
+        """
+        cls.MANIPULATIONS[assessor_id] = (fail_action, restore_action)
+
+    def __init__(self, repo_root: Path = None):
+        """Initialize toggler with repository root.
+
+        Args:
+            repo_root: Root of the repository (default: current directory)
+        """
+        self.repo_root = repo_root or Path.cwd()
+        self._backup_suffix = ".assessor_backup"
+        self._initialize_default_manipulations()
+
+    def _initialize_default_manipulations(self) -> None:
+        """Register default manipulation strategies for Phase 1 assessors."""
+
+        # CLAUDE.md Assessor - Tier 1, 10% weight
+        def fail_claude_md(repo_root: Path) -> None:
+            claude_md = repo_root / ".claude" / "CLAUDE.md"
+            backup = repo_root / ".claude" / f"CLAUDE.md{self._backup_suffix}"
+            if claude_md.exists() and not backup.exists():
+                shutil.move(str(claude_md), str(backup))
+
+        def restore_claude_md(repo_root: Path) -> None:
+            claude_md = repo_root / ".claude" / "CLAUDE.md"
+            backup = repo_root / ".claude" / f"CLAUDE.md{self._backup_suffix}"
+            if backup.exists() and not claude_md.exists():
+                shutil.move(str(backup), str(claude_md))
+
+        self.register_manipulation("claude_md_file", fail_claude_md, restore_claude_md)
+
+        # README Assessor - Tier 1, 10% weight
+        def fail_readme(repo_root: Path) -> None:
+            readme = repo_root / "README.md"
+            backup = repo_root / f"README.md{self._backup_suffix}"
+            if readme.exists() and not backup.exists():
+                shutil.move(str(readme), str(backup))
+
+        def restore_readme(repo_root: Path) -> None:
+            readme = repo_root / "README.md"
+            backup = repo_root / f"README.md{self._backup_suffix}"
+            if backup.exists() and not readme.exists():
+                shutil.move(str(backup), str(readme))
+
+        self.register_manipulation("readme_structure", fail_readme, restore_readme)
+
+        # Test Coverage Assessor - Tier 2, 3% weight
+        def fail_tests(repo_root: Path) -> None:
+            tests_dir = repo_root / "tests"
+            backup_dir = repo_root / f"tests{self._backup_suffix}"
+            if tests_dir.exists() and not backup_dir.exists():
+                shutil.move(str(tests_dir), str(backup_dir))
+
+        def restore_tests(repo_root: Path) -> None:
+            tests_dir = repo_root / "tests"
+            backup_dir = repo_root / f"tests{self._backup_suffix}"
+            if backup_dir.exists() and not tests_dir.exists():
+                shutil.move(str(backup_dir), str(tests_dir))
+
+        self.register_manipulation("test_coverage", fail_tests, restore_tests)
+
+    def force_fail(self, assessor_id: str) -> None:
+        """Force assessor to fail by manipulating repository state.
+
+        Args:
+            assessor_id: Assessor identifier (e.g., "claude_md_file")
+
+        Raises:
+            ValueError: If assessor_id is not recognized
+        """
+        if assessor_id not in self.MANIPULATIONS:
+            raise ValueError(
+                f"Unknown assessor: {assessor_id}. "
+                f"Available: {', '.join(self.MANIPULATIONS.keys())}"
+            )
+
+        fail_action, _ = self.MANIPULATIONS[assessor_id]
+        fail_action(self.repo_root)
+
+    def restore(self, assessor_id: str) -> None:
+        """Restore repository to state where assessor passes.
+
+        Args:
+            assessor_id: Assessor identifier (e.g., "claude_md_file")
+
+        Raises:
+            ValueError: If assessor_id is not recognized
+        """
+        if assessor_id not in self.MANIPULATIONS:
+            raise ValueError(
+                f"Unknown assessor: {assessor_id}. "
+                f"Available: {', '.join(self.MANIPULATIONS.keys())}"
+            )
+
+        _, restore_action = self.MANIPULATIONS[assessor_id]
+        restore_action(self.repo_root)
+
+    def list_supported_assessors(self) -> list[str]:
+        """Get list of assessor IDs with registered manipulations.
+
+        Returns:
+            List of assessor IDs that can be toggled
+        """
+        return list(self.MANIPULATIONS.keys())
+
+    @contextmanager
+    def temporarily_failed(self, assessor_id: str) -> Generator[None, None, None]:
+        """Context manager for safe fail/restore of assessor state.
+
+        Ensures repository is restored even if exception occurs during testing.
+
+        Args:
+            assessor_id: Assessor identifier
+
+        Example:
+            toggler = AssessorStateToggler()
+            with toggler.temporarily_failed("claude_md_file"):
+                # CLAUDE.md is now missing (assessor fails)
+                run_benchmark()
+            # CLAUDE.md is automatically restored here
+
+        Yields:
+            None
+        """
+        try:
+            self.force_fail(assessor_id)
+            yield
+        finally:
+            self.restore(assessor_id)
diff --git a/src/agentready/services/harbor/comparer.py b/src/agentready/services/harbor/comparer.py
index da0a27c9..80fac0dd 100644
--- a/src/agentready/services/harbor/comparer.py
+++ b/src/agentready/services/harbor/comparer.py
@@ -1,8 +1,12 @@
 """Service for comparing Harbor benchmark runs and calculating statistical significance."""
 
+from pathlib import Path
 from typing import List, Optional
 
 from agentready.models.harbor import HarborComparison, HarborRunMetrics
+from agentready.services.harbor.agent_toggler import AssessorStateToggler
+from agentready.services.harbor.result_parser import parse_harbor_results
+from agentready.services.harbor.runner import HarborRunner
 
 
 def compare_runs(
@@ -186,3 +190,158 @@ def interpret_effect_size(cohens_d: float) -> str:
         return "medium"
     else:
         return "large"
+
+
+def compare_assessor_impact(
+    assessor_id: str,
+    task_names: List[str],
+    repo_root: Path,
+    runs_per_task: int = 3,
+    output_dir: Path = None,
+    model: str = "anthropic/claude-sonnet-4-5",
+    verbose: bool = True,
+) -> HarborComparison:
+    """A/B test assessor impact: baseline (assessor fails) vs treatment (assessor passes).
+
+    This function orchestrates the complete A/B testing workflow:
+    1. Force assessor to fail (manipulate repository state)
+    2. Run Harbor benchmark (baseline)
+    3. Restore repository to normal state (assessor passes)
+    4. Run Harbor benchmark again (treatment)
+    5. Compare results with statistical significance
+
+    Args:
+        assessor_id: Assessor identifier (e.g., "claude_md_file")
+        task_names: List of Terminal-Bench task names to test
+        repo_root: Root of the repository to test
+        runs_per_task: Number of runs per task (default: 3, recommended: 5+)
+        output_dir: Directory to store results (default: .agentready/validations/{assessor_id}/)
+        model: Claude model to use (default: sonnet-4-5)
+        verbose: Print progress messages (default: True)
+
+    Returns:
+        HarborComparison with baseline vs treatment metrics, deltas, and significance
+
+    Raises:
+        ValueError: If assessor_id is not supported
+        HarborNotInstalledError: If Harbor CLI is not available
+        subprocess.CalledProcessError: If Harbor benchmark fails
+
+    Example:
+        comparison = compare_assessor_impact(
+            assessor_id="claude_md_file",
+            task_names=["adaptive-rejection-sampler", "async-http-client"],
+            repo_root=Path("."),
+            runs_per_task=3,
+            verbose=True
+        )
+
+        print(f"Success rate delta: {comparison.deltas['success_rate_delta']:.1f}%")
+        print(f"Significant: {comparison.statistical_significance['success_rate_significant']}")
+    """
+    # Default output directory
+    if output_dir is None:
+        output_dir = Path(".agentready") / "validations" / assessor_id
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Initialize components
+    toggler = AssessorStateToggler(repo_root=repo_root)
+    runner = HarborRunner()
+
+    if verbose:
+        print(f"\nAssessor Impact Validation: {assessor_id}")
+        print(f"{'=' * 60}")
+        print(f"Tasks: {', '.join(task_names)}")
+        print(f"Runs per task: {runs_per_task}")
+        print(
+            f"Total trials: {len(task_names) * runs_per_task * 2} (baseline + treatment)"
+        )
+        print(f"Output: {output_dir}\n")
+
+    # Baseline: Assessor fails (remove CLAUDE.md, README, etc.)
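+    # Note: temporarily_failed() below restores the repository in a `finally`
+    # block, so even a crashed baseline run cannot leave the repo stuck in the
+    # assessor-failing state.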
+    if verbose:
+        print("[1/2] Running baseline (assessor FAILS)...")
+        print(f"  Manipulating repository state to force {assessor_id} to fail")
+
+    baseline_output = output_dir / "baseline"
+    with toggler.temporarily_failed(assessor_id):
+        if verbose:
+            print(
+                f"  Running {len(task_names)} tasks × {runs_per_task} runs = {len(task_names) * runs_per_task} trials"
+            )
+
+        baseline_results_dir = runner.run_benchmark(
+            task_names=task_names,
+            output_dir=baseline_output,
+            model=model,
+            n_concurrent=1,
+            verbose=verbose,
+        )
+
+    # Parse results
+    baseline_task_results = parse_harbor_results(baseline_results_dir)
+    baseline_metrics = HarborRunMetrics.from_task_results(
+        run_id=f"{assessor_id}_baseline",
+        agent_file_enabled=False,  # Assessor fails
+        task_results=baseline_task_results,
+    )
+
+    if verbose:
+        print(
+            f"  Baseline complete: {baseline_metrics.success_rate:.1f}% success rate\n"
+        )
+
+    # Treatment: Assessor passes (normal repository state)
+    if verbose:
+        print("[2/2] Running treatment (assessor PASSES)...")
+        print("  Repository restored to normal state")
+
+    treatment_output = output_dir / "treatment"
+    if verbose:
+        print(
+            f"  Running {len(task_names)} tasks × {runs_per_task} runs = {len(task_names) * runs_per_task} trials"
+        )
+
+    treatment_results_dir = runner.run_benchmark(
+        task_names=task_names,
+        output_dir=treatment_output,
+        model=model,
+        n_concurrent=1,
+        verbose=verbose,
+    )
+
+    # Parse results
+    treatment_task_results = parse_harbor_results(treatment_results_dir)
+    treatment_metrics = HarborRunMetrics.from_task_results(
+        run_id=f"{assessor_id}_treatment",
+        agent_file_enabled=True,  # Assessor passes
+        task_results=treatment_task_results,
+    )
+
+    if verbose:
+        print(
+            f"  Treatment complete: {treatment_metrics.success_rate:.1f}% success rate\n"
+        )
+
+    # Compare results
+    comparison = compare_runs(
+        without_agent=baseline_metrics, with_agent=treatment_metrics
+    )
+
+    if verbose:
+        print(f"{'=' * 60}")
+        print("Results Summary")
+        print(f"{'=' * 60}")
+        delta = comparison.deltas["success_rate_delta"]
+        sign = "+" if delta >= 0 else ""
+        print(f"Success Rate Delta: {sign}{delta:.1f} percentage points")
+        print(
+            f"Statistical Significance: {'YES' if comparison.statistical_significance.get('success_rate_significant', False) else 'NO'}"
+        )
+        if comparison.statistical_significance.get("success_rate_p_value") is not None:
+            print(
+                f"P-value: {comparison.statistical_significance['success_rate_p_value']:.4f}"
+            )
+        print(f"\nResults saved to: {output_dir}\n")
+
+    return comparison

From 20dcd44204efc336f9f527af889e8b4bf8263170 Mon Sep 17 00:00:00 2001
From: Jeremy Eder
Date: Fri, 12 Dec 2025 22:11:36 -0500
Subject: [PATCH 2/4] test: add comprehensive unit tests for AssessorStateToggler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add 14 unit tests covering:
- Initialization and setup
- Force fail/restore operations for all 3 Phase 1 assessors
- Context manager (temporarily_failed) with exception safety
- Error handling for unknown assessors
- Idempotent operations
- Content preservation through fail/restore cycles

All tests pass (14/14). Coverage focused on state manipulation logic.
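
To run just this suite in isolation (standard pytest invocation; the file path
matches the diffstat below):

```bash
pytest tests/unit/services/harbor/test_assessor_state_toggler.py -v
```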
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5
---
 tests/unit/services/harbor/__init__.py        |   0
 .../harbor/test_assessor_state_toggler.py     | 240 ++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 tests/unit/services/harbor/__init__.py
 create mode 100644 tests/unit/services/harbor/test_assessor_state_toggler.py

diff --git a/tests/unit/services/harbor/__init__.py b/tests/unit/services/harbor/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/services/harbor/test_assessor_state_toggler.py b/tests/unit/services/harbor/test_assessor_state_toggler.py
new file mode 100644
index 00000000..5d227b6e
--- /dev/null
+++ b/tests/unit/services/harbor/test_assessor_state_toggler.py
@@ -0,0 +1,240 @@
+"""Unit tests for AssessorStateToggler."""
+
+import pytest
+
+from agentready.services.harbor.agent_toggler import AssessorStateToggler
+
+
+@pytest.fixture
+def temp_repo(tmp_path):
+    """Create a temporary repository structure for testing."""
+    repo_root = tmp_path / "test_repo"
+    repo_root.mkdir()
+
+    # Create .claude directory and CLAUDE.md
+    claude_dir = repo_root / ".claude"
+    claude_dir.mkdir()
+    claude_md = claude_dir / "CLAUDE.md"
+    claude_md.write_text("# Test Project\n\nThis is a test CLAUDE.md file.")
+
+    # Create README.md
+    readme = repo_root / "README.md"
+    readme.write_text("# Test Repo\n\n## Installation\n\n## Usage")
+
+    # Create tests directory
+    tests_dir = repo_root / "tests"
+    tests_dir.mkdir()
+    (tests_dir / "test_example.py").write_text("def test_example():\n    assert True")
+
+    return repo_root
+
+
+class TestAssessorStateToggler:
+    """Test AssessorStateToggler functionality."""
+
+    def test_initialization(self, temp_repo):
+        """Test toggler initializes with correct repo root."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+        assert toggler.repo_root == temp_repo
+        assert toggler._backup_suffix == ".assessor_backup"
+
+    def test_list_supported_assessors(self, temp_repo):
+        """Test listing supported assessors returns expected IDs."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+        supported = toggler.list_supported_assessors()
+
+        assert "claude_md_file" in supported
+        assert "readme_structure" in supported
+        assert "test_coverage" in supported
+        assert len(supported) == 3
+
+    def test_force_fail_claude_md(self, temp_repo):
+        """Test forcing CLAUDE.md assessor to fail."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        claude_md = temp_repo / ".claude" / "CLAUDE.md"
+        backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup"
+
+        # Verify file exists before fail
+        assert claude_md.exists()
+        assert not backup.exists()
+
+        # Force fail
+        toggler.force_fail("claude_md_file")
+
+        # Verify file is backed up
+        assert not claude_md.exists()
+        assert backup.exists()
+
+    def test_restore_claude_md(self, temp_repo):
+        """Test restoring CLAUDE.md after forcing fail."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        # Force fail first
+        toggler.force_fail("claude_md_file")
+
+        claude_md = temp_repo / ".claude" / "CLAUDE.md"
+        backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup"
+
+        # Verify backup state
+        assert not claude_md.exists()
+        assert backup.exists()
+
+        # Restore
+        toggler.restore("claude_md_file")
+
+        # Verify restored
+        assert claude_md.exists()
+        assert not backup.exists()
+
+    def test_force_fail_readme(self, temp_repo):
+        """Test forcing README assessor to fail."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        readme = temp_repo / "README.md"
+        backup = temp_repo / "README.md.assessor_backup"
+
+        # Force fail
+        toggler.force_fail("readme_structure")
+
+        # Verify
+        assert not readme.exists()
+        assert backup.exists()
+
+    def test_restore_readme(self, temp_repo):
+        """Test restoring README after forcing fail."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        # Force fail first
+        toggler.force_fail("readme_structure")
+
+        # Restore
+        toggler.restore("readme_structure")
+
+        readme = temp_repo / "README.md"
+        backup = temp_repo / "README.md.assessor_backup"
+
+        # Verify
+        assert readme.exists()
+        assert not backup.exists()
+
+    def test_force_fail_tests(self, temp_repo):
+        """Test forcing test coverage assessor to fail."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        tests_dir = temp_repo / "tests"
+        backup_dir = temp_repo / "tests.assessor_backup"
+
+        # Force fail
+        toggler.force_fail("test_coverage")
+
+        # Verify
+        assert not tests_dir.exists()
+        assert backup_dir.exists()
+
+    def test_restore_tests(self, temp_repo):
+        """Test restoring tests directory after forcing fail."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        # Force fail first
+        toggler.force_fail("test_coverage")
+
+        # Restore
+        toggler.restore("test_coverage")
+
+        tests_dir = temp_repo / "tests"
+        backup_dir = temp_repo / "tests.assessor_backup"
+
+        # Verify
+        assert tests_dir.exists()
+        assert not backup_dir.exists()
+
+    def test_temporarily_failed_context_manager(self, temp_repo):
+        """Test temporarily_failed context manager restores state."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        claude_md = temp_repo / ".claude" / "CLAUDE.md"
+
+        # Verify initial state
+        assert claude_md.exists()
+
+        # Use context manager
+        with toggler.temporarily_failed("claude_md_file"):
+            # Inside context: file should be missing
+            assert not claude_md.exists()
+
+        # Outside context: file should be restored
+        assert claude_md.exists()
+
+    def test_temporarily_failed_exception_still_restores(self, temp_repo):
+        """Test temporarily_failed restores even when exception occurs."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        claude_md = temp_repo / ".claude" / "CLAUDE.md"
+
+        # Verify initial state
+        assert claude_md.exists()
+
+        # Use context manager with exception
+        with pytest.raises(RuntimeError):
+            with toggler.temporarily_failed("claude_md_file"):
+                assert not claude_md.exists()
+                raise RuntimeError("Test exception")
+
+        # File should still be restored
+        assert claude_md.exists()
+
+    def test_unknown_assessor_raises_error(self, temp_repo):
+        """Test that unknown assessor ID raises ValueError."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        with pytest.raises(ValueError, match="Unknown assessor: nonexistent"):
+            toggler.force_fail("nonexistent")
+
+        with pytest.raises(ValueError, match="Unknown assessor: nonexistent"):
+            toggler.restore("nonexistent")
+
+    def test_idempotent_force_fail(self, temp_repo):
+        """Test that forcing fail multiple times is idempotent."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        # Force fail twice
+        toggler.force_fail("claude_md_file")
+        toggler.force_fail("claude_md_file")  # Should not error
+
+        claude_md = temp_repo / ".claude" / "CLAUDE.md"
+        backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup"
+
+        # Still in failed state
+        assert not claude_md.exists()
+        assert backup.exists()
+
+    def test_idempotent_restore(self, temp_repo):
+        """Test that restoring multiple times is idempotent."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        # Force fail, then restore twice
+        toggler.force_fail("claude_md_file")
+        toggler.restore("claude_md_file")
+        toggler.restore("claude_md_file")  # Should not error
+
+        claude_md = temp_repo / ".claude" / "CLAUDE.md"
+        backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup"
+
+        # Still in normal state
+        assert claude_md.exists()
+        assert not backup.exists()
+
+    def test_content_preservation(self, temp_repo):
+        """Test that file content is preserved through fail/restore cycle."""
+        toggler = AssessorStateToggler(repo_root=temp_repo)
+
+        claude_md = temp_repo / ".claude" / "CLAUDE.md"
+        original_content = claude_md.read_text()
+
+        # Fail and restore
+        toggler.force_fail("claude_md_file")
+        toggler.restore("claude_md_file")
+
+        # Verify content is identical
+        assert claude_md.read_text() == original_content

From 02ddb1019134d908648c33d9c1b801c2a232e080 Mon Sep 17 00:00:00 2001
From: Jeremy Eder
Date: Sat, 13 Dec 2025 11:44:39 -0500
Subject: [PATCH 3/4] fix: enable Harbor task filtering for smoketest support

## Problem

Harbor's task filtering (-t flag) wasn't working, causing smoketests to run
all 89 tasks instead of just 1, making quick validation impossible.

## Root Cause Investigation

Harbor had a bug in _filter_task_ids() that used task_id.path.name directly
instead of task_id.get_name(), breaking filtering for RegistryTaskId objects.

The bug was FIXED in Harbor main (commit f9e6d2e, Dec 12, 2025) but NOT YET
RELEASED to PyPI. Latest PyPI version 0.1.23 (Dec 11) still has the bug.

AgentReady had a workaround that assumed filtering didn't work and never
passed -t flags to Harbor.

## Solution

1. **HarborRunner**: Now passes -t flags directly to Harbor CLI
   (requires Harbor >= 0.1.24)
2. **Result Parser**: Filter out job-level result.json files (only parse
   task results)
3. **Benchmark CLI**: Add --smoketest flag for quick 1-task validation
4. **Comparer**: Support n_concurrent parameter for parallel task execution

## Changes

- src/agentready/services/harbor/runner.py: Pass -t flags, remove workaround
- src/agentready/services/harbor/result_parser.py: Fix result file filtering
- src/agentready/cli/benchmark.py: Add --smoketest and --concurrent flags
- src/agentready/services/harbor/comparer.py: Support n_concurrent parameter

## Testing

Requires: pip install git+https://github.com/laude-institute/harbor.git
Run: agentready validate-assessor --assessor claude_md_file --smoketest
Expected: 1 task runs (~2 minutes) instead of 89 tasks (~hours)

## Harbor Issue

Bug fixed upstream in f9e6d2e10c72d33373012294c36fd4938c45c26c
Waiting for Harbor 0.1.24 release with the fix

Co-Authored-By: Claude Sonnet 4.5
---
 src/agentready/cli/benchmark.py               | 39 +++++++++++++++++--
 src/agentready/services/harbor/comparer.py    |  6 ++-
 .../services/harbor/result_parser.py          | 13 ++++++-
 src/agentready/services/harbor/runner.py      | 27 +++----------
 4 files changed, 56 insertions(+), 29 deletions(-)

diff --git a/src/agentready/cli/benchmark.py b/src/agentready/cli/benchmark.py
index 3b650f4f..b74cea14 100644
--- a/src/agentready/cli/benchmark.py
+++ b/src/agentready/cli/benchmark.py
@@ -245,8 +245,28 @@ def _run_tbench(repo_path, subset, model, verbose, timeout, output_dir, skip_pre
     is_flag=True,
     help="List supported assessors and exit",
 )
+@click.option(
+    "--concurrent",
+    "-c",
+    type=int,
+    default=1,
+    help="Number of concurrent tasks to run in parallel (default: 1, recommended: 3-5)",
+)
+@click.option(
+    "--smoketest",
+    is_flag=True,
+    help="Run smoketest with single task for quick validation (~2 minutes)",
+)
 def validate_assessor(
-    assessor, tasks, runs, model, output_dir, verbose, list_assessors
+    assessor,
+    tasks,
+    runs,
+    model,
+    output_dir,
+    verbose,
+    list_assessors,
+    concurrent,
+    smoketest,
 ):
     """Validate single assessor impact using Terminal-Bench A/B testing.
 
@@ -260,6 +280,10 @@ def validate_assessor(
 
     Examples:
 
+    \b
+    # Quick smoketest with 1 task (~2 minutes)
+    agentready validate-assessor --assessor claude_md_file --smoketest
+
     \b
     # Validate CLAUDE.md impact (8 tasks, 3 runs each = 48 trials)
     agentready validate-assessor --assessor claude_md_file
@@ -311,9 +335,15 @@ def validate_assessor(
 
     # Use default Phase 1 tasks if not specified
     if not tasks:
-        tasks = DEFAULT_PHASE1_TASKS
-        if verbose:
-            click.echo(f"Using default Phase 1 task subset ({len(tasks)} tasks)\n")
+        if smoketest:
+            # Smoketest: just 1 simple task for quick validation
+            tasks = ["adaptive-rejection-sampler"]
+            if verbose:
+                click.echo("Using smoketest mode (1 task, ~2 minutes total)\n")
+        else:
+            tasks = DEFAULT_PHASE1_TASKS
+            if verbose:
+                click.echo(f"Using default Phase 1 task subset ({len(tasks)} tasks)\n")
 
     # Convert model name to full identifier
     model_id = f"anthropic/{model}"
@@ -333,6 +363,7 @@ def validate_assessor(
             runs_per_task=runs,
             output_dir=output_path,
             model=model_id,
+            n_concurrent=concurrent,
             verbose=verbose,
         )
 
diff --git a/src/agentready/services/harbor/comparer.py b/src/agentready/services/harbor/comparer.py
index 80fac0dd..df3b4461 100644
--- a/src/agentready/services/harbor/comparer.py
+++ b/src/agentready/services/harbor/comparer.py
@@ -199,6 +199,7 @@ def compare_assessor_impact(
     runs_per_task: int = 3,
     output_dir: Path = None,
     model: str = "anthropic/claude-sonnet-4-5",
+    n_concurrent: int = 1,
     verbose: bool = True,
 ) -> HarborComparison:
     """A/B test assessor impact: baseline (assessor fails) vs treatment (assessor passes).
@@ -217,6 +218,7 @@
         runs_per_task: Number of runs per task (default: 3, recommended: 5+)
         output_dir: Directory to store results (default: .agentready/validations/{assessor_id}/)
         model: Claude model to use (default: sonnet-4-5)
+        n_concurrent: Number of concurrent tasks to run in parallel (default: 1)
         verbose: Print progress messages (default: True)
 
     Returns:
@@ -274,7 +276,7 @@
             task_names=task_names,
             output_dir=baseline_output,
             model=model,
-            n_concurrent=1,
+            n_concurrent=n_concurrent,
             verbose=verbose,
         )
 
@@ -306,7 +308,7 @@
         task_names=task_names,
         output_dir=treatment_output,
         model=model,
-        n_concurrent=1,
+        n_concurrent=n_concurrent,
        verbose=verbose,
     )
 
diff --git a/src/agentready/services/harbor/result_parser.py b/src/agentready/services/harbor/result_parser.py
index 684fd082..c8b1c01c 100644
--- a/src/agentready/services/harbor/result_parser.py
+++ b/src/agentready/services/harbor/result_parser.py
@@ -23,8 +23,17 @@ def parse_harbor_results(results_dir: Path) -> List[HarborTaskResult]:
     if not results_dir.exists():
         raise FileNotFoundError(f"Results directory not found: {results_dir}")
 
-    # Find all result.json files in subdirectories
-    result_files = list(results_dir.glob("*/result.json"))
+    # Find all result.json files in subdirectories (task directories only, not job-level result.json)
+    # Task directories are named like "task-name__hash/" while the job result.json is at the root
+    all_result_files = list(results_dir.glob("*/result.json"))
+
+    # Filter to only task result files (exclude job-level result.json)
+    # Task directories contain "__" in their name (e.g., "build-pmars__abc123")
+    result_files = [
+        f
+        for f in all_result_files
+        if "__" in f.parent.name  # Task directories have "__" separator
+    ]
 
     if not result_files:
         raise ValueError(f"No result.json files found in {results_dir}")
diff --git a/src/agentready/services/harbor/runner.py b/src/agentready/services/harbor/runner.py
index 6dbd7d76..4dc6bff8 100644
--- a/src/agentready/services/harbor/runner.py
+++ b/src/agentready/services/harbor/runner.py
@@ -1,6 +1,5 @@
 """Service for executing Harbor benchmarks via CLI."""
 
-import json
 import subprocess
 from pathlib import Path
 from typing import List
@@ -27,7 +26,7 @@ def _verify_harbor_installed(self) -> None:
         """
         try:
             subprocess.run(
-                ["harbor", "--version"],
+                ["harbor", "--help"],
                 capture_output=True,
                 text=True,
                 check=True,
@@ -89,25 +88,11 @@ def run_benchmark(
             str(n_concurrent),
         ]
 
-        # Add task selection if specified
-        if task_names:
-            # Harbor uses config JSON for task selection
-            config_file = output_dir / "config.json"
-            config = {
-                "datasets": [
-                    {
-                        "name": dataset,
-                        "version": dataset_version,
-                        "task_names": task_names,
-                    }
-                ],
-                "agents": [{"name": agent, "model_name": model}],
-            }
-            with open(config_file, "w") as f:
-                json.dump(config, f, indent=2)
-
-            # Use config file instead of CLI args
-            cmd = ["harbor", "run", "-c", str(config_file)]
+        # Add task name filters
+        # NOTE: Requires Harbor >= 0.1.24 (or install from main)
+        # Task filtering was fixed in commit f9e6d2e (Dec 12, 2025)
+        for task_name in task_names:
+            cmd.extend(["-t", task_name])
 
         # Execute Harbor benchmark
         if verbose:

From b52c91f68251a4df93fc66f922b18f350d4a0d43 Mon Sep 17 00:00:00 2001
From: Jeremy Eder
Date: Sat, 13 Dec 2025 12:09:41 -0500
Subject: [PATCH 4/4] feat: add runtime check for Harbor task filtering bug

## Added

**Runtime Detection**:
- Inspects Harbor source at runtime to detect task filtering bug
- Warns users if Harbor has the bug (PyPI 0.1.23 and earlier)
- Provides clear fix instructions with 2 options

**Patch File**:
- Added patches/harbor-task-filtering-fix.patch
- Users can apply manually if they can't install from main
- Includes detailed README with usage instructions

## How It Works

On HarborRunner initialization:
1. Check if Harbor is installed (existing check)
2. **NEW**: Check if Harbor has task filtering bug
3. If bug detected, show warning with fix options

Warning shows:
- Problem description (smoketests run all 89 tasks)
- Option 1: Install Harbor from main (recommended)
- Option 2: Apply patch from patches/ directory
- Link to upstream commit

## Fix Options

**Option 1** (recommended):
```bash
pip install git+https://github.com/laude-institute/harbor.git
```

**Option 2** (if you can't install from git):
```bash
# Apply patch to the installed Harbor package (run from site-packages;
# see patches/README.md for details)
patch -p2 < /path/to/agentready/patches/harbor-task-filtering-fix.patch
```

## Files Changed

- src/agentready/services/harbor/runner.py: Add _check_harbor_task_filtering()
- patches/harbor-task-filtering-fix.patch: Harbor fix patch (commit f9e6d2e)
- patches/README.md: Patch usage documentation

## When to Remove

Once Harbor 0.1.24 is released to PyPI, the warning will stop showing
(automatically detects the fix is present). The patch files can be removed
once Harbor 0.1.24 is widely adopted.

Co-Authored-By: Claude Sonnet 4.5
---
 patches/README.md                             | 58 ++++++++++++++++++
 patches/harbor-task-filtering-fix.patch       | 75 ++++++++++++++++++++
 src/agentready/services/harbor/runner.py      | 61 +++++++++++++++++++
 3 files changed, 194 insertions(+)
 create mode 100644 patches/README.md
 create mode 100644 patches/harbor-task-filtering-fix.patch

diff --git a/patches/README.md b/patches/README.md
new file mode 100644
index 00000000..f2838442
--- /dev/null
+++ b/patches/README.md
@@ -0,0 +1,58 @@
+# AgentReady Patches
+
+This directory contains patches for upstream dependencies that haven't been released yet.
+
+## harbor-task-filtering-fix.patch
+
+**Status**: Fixed upstream, waiting for release
+
+**Problem**: Harbor's `-t`/`--task-name` flag is ignored, causing all tasks to run instead of the filtered subset.
+
+**Upstream Fix**: [Harbor commit f9e6d2e](https://github.com/laude-institute/harbor/commit/f9e6d2e10c72d33373012294c36fd4938c45c26c) (Dec 12, 2025)
+
+**Released**: ❌ Not yet (latest PyPI: 0.1.23 from Dec 11, 2025)
+
+### Option 1: Install from Main (Recommended)
+
+```bash
+pip uninstall harbor
+pip install git+https://github.com/laude-institute/harbor.git
+```
+
+### Option 2: Apply Patch Manually
+
+```bash
+# Find Harbor installation
+HARBOR_PATH=$(python -c "import harbor, os; print(os.path.dirname(harbor.__file__))")
+
+# Apply patch (-p2 strips the a/src/ prefix; the installed package has no src/ directory)
+cd "$HARBOR_PATH/.."
+patch -p2 < /path/to/agentready/patches/harbor-task-filtering-fix.patch
+```
+
+### Option 3: Manual Edit
+
+Edit `harbor/models/job/config.py`:
+
+```python
+# Line 42: Change
+fnmatch(task_id.path.name, pattern_id)
+# To:
+fnmatch(task_id.get_name(), pattern_id)
+
+# Line 52: Change
+fnmatch(task_id.path.name, pattern_id)
+# To:
+fnmatch(task_id.get_name(), pattern_id)
+
+# Line 76: Change
+TaskConfig(path=task_id.path, source=self.path.name)
+# To:
+TaskConfig(path=task_id.path, source=self.path.expanduser().resolve().name)
+```
+
+## When to Remove
+
+Once Harbor 0.1.24 (or later) is released to PyPI, this patch is no longer needed.
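+
+To confirm which version is installed (a minimal check; `pip show harbor` works
+equally well):
+
+```bash
+python -c "import importlib.metadata as m; print(m.version('harbor'))"
+```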
+
+Check PyPI: https://pypi.org/project/harbor/#history
diff --git a/patches/harbor-task-filtering-fix.patch b/patches/harbor-task-filtering-fix.patch
new file mode 100644
index 00000000..a6de10bf
--- /dev/null
+++ b/patches/harbor-task-filtering-fix.patch
@@ -0,0 +1,75 @@
+From f9e6d2e10c72d33373012294c36fd4938c45c26c Mon Sep 17 00:00:00 2001
+From: Alex Shaw
+Date: Fri Dec 12 21:21:27 2025 -0800
+Subject: [PATCH] Fix task filtering to use polymorphic get_name() method
+
+Fix for Harbor task filtering bug where -t/--task-name flags were ignored.
+
+This patch is already merged in Harbor main (commit f9e6d2e) but not yet
+released to PyPI. Latest version 0.1.23 (Dec 11, 2025) still has the bug.
+
+Apply this patch to your local Harbor installation if you can't install
+from the main branch.
+
+See: https://github.com/laude-institute/harbor/commit/f9e6d2e
+
+---
+ src/harbor/models/job/config.py | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/src/harbor/models/job/config.py b/src/harbor/models/job/config.py
+index 4a35f1f..f7a0ec9 100644
+--- a/src/harbor/models/job/config.py
++++ b/src/harbor/models/job/config.py
+@@ -39,7 +39,7 @@ class BaseDatasetConfig(BaseModel, ABC):
+             task_id
+             for task_id in filtered_ids
+             if any(
+-                fnmatch(task_id.path.name, pattern_id)
++                fnmatch(task_id.get_name(), pattern_id)
+                 for pattern_id in self.task_names
+             )
+         ]
+@@ -49,7 +49,7 @@ class BaseDatasetConfig(BaseModel, ABC):
+             task_id
+             for task_id in filtered_ids
+             if not any(
+-                fnmatch(task_id.path.name, pattern_id)
++                fnmatch(task_id.get_name(), pattern_id)
+                 for pattern_id in self.exclude_task_names
+             )
+         ]
+@@ -73,7 +73,7 @@ class LocalDatasetConfig(BaseDatasetConfig):
+         ]
+         filtered_task_ids = self._filter_task_ids(task_ids)
+         return [
+-            TaskConfig(path=task_id.path, source=self.path.name)
++            TaskConfig(path=task_id.path, source=self.path.expanduser().resolve().name)
+             for task_id in filtered_task_ids
+         ]
+
+-- 
+2.47.1
+
+USAGE:
+------
+
+Option 1: Install Harbor from main (recommended)
+    pip uninstall harbor
+    pip install git+https://github.com/laude-institute/harbor.git
+
+Option 2: Apply this patch to your local Harbor installation
+    # Find your Harbor installation
+    python -c "import harbor; print(harbor.__file__)"
+    # Output example: /path/to/site-packages/harbor/__init__.py
+
+    # Navigate to the site-packages directory (the parent of harbor/)
+    cd /path/to/site-packages
+
+    # Apply patch; -p2 strips the a/src/ prefix, since the installed
+    # package has no src/ directory
+    git apply -p2 /path/to/agentready/patches/harbor-task-filtering-fix.patch
+
+    # Or manually edit harbor/models/job/config.py:
+    # Line 42: task_id.path.name -> task_id.get_name()
+    # Line 52: task_id.path.name -> task_id.get_name()
+    # Line 76: self.path.name -> self.path.expanduser().resolve().name
diff --git a/src/agentready/services/harbor/runner.py b/src/agentready/services/harbor/runner.py
index 4dc6bff8..d9dca95d 100644
--- a/src/agentready/services/harbor/runner.py
+++ b/src/agentready/services/harbor/runner.py
@@ -1,6 +1,8 @@
 """Service for executing Harbor benchmarks via CLI."""
 
+import inspect
 import subprocess
+import warnings
 from pathlib import Path
 from typing import List
 
@@ -11,12 +13,19 @@ class HarborNotInstalledError(Exception):
     pass
 
 
+class HarborTaskFilteringBugWarning(UserWarning):
+    """Raised when Harbor has the task filtering bug."""
+
+    pass
+
+
 class HarborRunner:
     """Execute Harbor benchmarks via subprocess and capture results."""
 
     def __init__(self):
         """Initialize Harbor runner and verify installation."""
         self._verify_harbor_installed()
+        self._check_harbor_task_filtering()
 
     def _verify_harbor_installed(self) -> None:
         """Verify Harbor CLI is installed and accessible.
@@ -41,6 +50,58 @@ def _verify_harbor_installed(self) -> None:
         except subprocess.CalledProcessError as e:
             raise HarborNotInstalledError(f"Harbor CLI error: {e.stderr}")
 
+    def _check_harbor_task_filtering(self) -> None:
+        """Check if Harbor has the task filtering bug fix.
+
+        Warns if the Harbor version has the known task filtering bug where -t flags are ignored.
+
+        The bug was fixed in Harbor commit f9e6d2e (Dec 12, 2025) but not yet released.
+        PyPI version 0.1.23 (Dec 11, 2025) has the bug.
+
+        See: https://github.com/laude-institute/harbor/commit/f9e6d2e10c72d33373012294c36fd4938c45c26c
+        """
+        try:
+            from harbor.models.job.config import BaseDatasetConfig
+
+            # Check if the fix is present by inspecting source code
+            source = inspect.getsource(BaseDatasetConfig._filter_task_ids)
+
+            # The fix changed task_id.path.name to task_id.get_name()
+            # If we see path.name (the bug), warn the user
+            if "task_id.path.name" in source or ".path.name" in source:
+                warnings.warn(
+                    "\n"
+                    "⚠️  WARNING: Harbor has a task filtering bug!\n"
+                    "\n"
+                    "Your Harbor version has a bug where -t/--task-name flags are ignored.\n"
+                    "This causes smoketests to run ALL tasks instead of the filtered subset.\n"
+                    "\n"
+                    "The bug was fixed in Harbor main (Dec 12, 2025) but not yet released.\n"
+                    "Latest PyPI version 0.1.23 (Dec 11) still has the bug.\n"
+                    "\n"
+                    "FIX OPTIONS:\n"
+                    "\n"
+                    "Option 1 (recommended): Install Harbor from main\n"
+                    "  pip uninstall harbor\n"
+                    "  pip install git+https://github.com/laude-institute/harbor.git\n"
+                    "\n"
+                    "Option 2: Apply patch to your local Harbor installation\n"
+                    "  See: patches/harbor-task-filtering-fix.patch in AgentReady repo\n"
+                    "\n"
+                    "Commit: https://github.com/laude-institute/harbor/commit/f9e6d2e\n",
+                    HarborTaskFilteringBugWarning,
+                    stacklevel=2,
+                )
+            # If we see get_name() (the fix), all good - no warning
+
+        except ImportError:
+            # Harbor not importable as Python package (only CLI installed)
+            # Can't check for bug, but also can't use task filtering anyway
+            pass
+        except Exception:
+            # Don't fail if we can't check - just skip the warning
+            pass
+
     def run_benchmark(
         self,
         task_names: List[str],