diff --git a/patches/README.md b/patches/README.md
new file mode 100644
index 00000000..f2838442
--- /dev/null
+++ b/patches/README.md
@@ -0,0 +1,58 @@
+# AgentReady Patches
+
+This directory contains patches for upstream dependencies whose fixes have been merged but not yet released.
+
+## harbor-task-filtering-fix.patch
+
+**Status**: Fixed upstream, waiting for release
+
+**Problem**: Harbor's `-t`/`--task-name` flag is ignored, causing all tasks to run instead of the filtered subset.
+
+**Upstream Fix**: [Harbor commit f9e6d2e](https://github.com/laude-institute/harbor/commit/f9e6d2e10c72d33373012294c36fd4938c45c26c) (Dec 12, 2025)
+
+**Released**: ❌ Not yet (latest PyPI: 0.1.23 from Dec 11, 2025)
+
+### Option 1: Install from Main (Recommended)
+
+```bash
+pip uninstall harbor
+pip install git+https://github.com/laude-institute/harbor.git
+```
+
+### Option 2: Apply Patch Manually
+
+```bash
+# Find Harbor installation
+HARBOR_PATH=$(python -c "import harbor, os; print(os.path.dirname(harbor.__file__))")
+
+# Apply patch from site-packages (-p2 strips the a/src/ prefix used in the patch)
+cd "$HARBOR_PATH/.."
+patch -p2 < /path/to/agentready/patches/harbor-task-filtering-fix.patch
+```
+
+### Option 3: Manual Edit
+
+Edit `harbor/models/job/config.py`:
+
+```python
+# Line 42: Change
+fnmatch(task_id.path.name, pattern_id)
+# To:
+fnmatch(task_id.get_name(), pattern_id)
+
+# Line 52: Change
+fnmatch(task_id.path.name, pattern_id)
+# To:
+fnmatch(task_id.get_name(), pattern_id)
+
+# Line 76: Change
+TaskConfig(path=task_id.path, source=self.path.name)
+# To:
+TaskConfig(path=task_id.path, source=self.path.expanduser().resolve().name)
+```
+
+## When to Remove
+
+Once Harbor 0.1.24 (or later) is released to PyPI, this patch is no longer needed.
+
+Check PyPI: https://pypi.org/project/harbor/#history
diff --git a/patches/harbor-task-filtering-fix.patch b/patches/harbor-task-filtering-fix.patch
new file mode 100644
index 00000000..a6de10bf
--- /dev/null
+++ b/patches/harbor-task-filtering-fix.patch
@@ -0,0 +1,75 @@
+From f9e6d2e10c72d33373012294c36fd4938c45c26c Mon Sep 17 00:00:00 2001
+From: Alex Shaw
+Date: Fri Dec 12 21:21:27 2025 -0800
+Subject: [PATCH] Fix task filtering to use polymorphic get_name() method
+
+Fix for Harbor task filtering bug where -t/--task-name flags were ignored.
+
+This patch is already merged in Harbor main (commit f9e6d2e) but not yet
+released to PyPI. Latest version 0.1.23 (Dec 11, 2025) still has the bug.
+
+Apply this patch to your local Harbor installation if you can't install
+from the main branch.
+
+See: https://github.com/laude-institute/harbor/commit/f9e6d2e
+
+---
+ src/harbor/models/job/config.py | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/src/harbor/models/job/config.py b/src/harbor/models/job/config.py
+index 4a35f1f..f7a0ec9 100644
+--- a/src/harbor/models/job/config.py
++++ b/src/harbor/models/job/config.py
+@@ -39,7 +39,7 @@ class BaseDatasetConfig(BaseModel, ABC):
+             task_id
+             for task_id in filtered_ids
+             if any(
+-                fnmatch(task_id.path.name, pattern_id)
++                fnmatch(task_id.get_name(), pattern_id)
+                 for pattern_id in self.task_names
+             )
+         ]
+@@ -49,7 +49,7 @@ class BaseDatasetConfig(BaseModel, ABC):
+             task_id
+             for task_id in filtered_ids
+             if not any(
+-                fnmatch(task_id.path.name, pattern_id)
++                fnmatch(task_id.get_name(), pattern_id)
+                 for pattern_id in self.exclude_task_names
+             )
+         ]
+@@ -73,7 +73,7 @@ class LocalDatasetConfig(BaseDatasetConfig):
+         ]
+         filtered_task_ids = self._filter_task_ids(task_ids)
+         return [
+-            TaskConfig(path=task_id.path, source=self.path.name)
++            TaskConfig(path=task_id.path, source=self.path.expanduser().resolve().name)
+             for task_id in filtered_task_ids
+         ]
+
+--
+2.47.1
+
+USAGE:
+------
+
+Option 1: Install Harbor from main (recommended)
+    pip uninstall harbor
+    pip install git+https://github.com/laude-institute/harbor.git
+
+Option 2: Apply this patch to your local Harbor installation
+    # Find your Harbor installation
+    python -c "import harbor; print(harbor.__file__)"
+    # Output example: /path/to/site-packages/harbor/__init__.py
+
+    # Navigate to the site-packages directory (parent of the harbor package)
+    cd /path/to/site-packages
+
+    # Apply patch (-p2 strips the a/src/ prefix used in this patch)
+    git apply -p2 /path/to/agentready/patches/harbor-task-filtering-fix.patch
+
+    # Or manually edit harbor/models/job/config.py:
+    # Line 42: task_id.path.name -> task_id.get_name()
+    # Line 52: task_id.path.name -> task_id.get_name()
+    # Line 76: self.path.name -> self.path.expanduser().resolve().name
diff --git a/src/agentready/cli/benchmark.py b/src/agentready/cli/benchmark.py
index d469ab34..b74cea14 100644
--- a/src/agentready/cli/benchmark.py
+++ b/src/agentready/cli/benchmark.py
@@ -1,5 +1,6 @@
 """Benchmark command for running agent coding evaluations."""
 
+import json
 import os
 import tempfile
 from pathlib import Path
@@ -8,6 +9,8 @@
 
 from ..services.eval_harness.harbor_config import HarborConfig
 from ..services.eval_harness.tbench_runner import _real_tbench_result
+from ..services.harbor.agent_toggler import AssessorStateToggler
+from ..services.harbor.comparer import compare_assessor_impact
 
 
 @click.command()
@@ -187,3 +190,255 @@ def _run_tbench(repo_path, subset, model, verbose, timeout, output_dir, skip_pre
         traceback.print_exc()
 
         raise click.Abort()
+
+
+# Default Phase 1 task subset (8 diverse tasks, ~2-3 hours per assessor)
+DEFAULT_PHASE1_TASKS = [
+    "adaptive-rejection-sampler",  # Math/algorithms
+    "async-http-client",  # Networking
+    "terminal-file-browser",  # File I/O
+    "markdown-parser",  # Text processing
+    "json-validator",  # Data structures
+    "cli-calculator",  # User interaction
+    "log-analyzer",  # String manipulation
+    "sudoku-solver",  # Logic
+]
+
+
+@click.command()
+@click.option(
+    "--assessor",
+    "-a",
+    required=False,
+    help="Assessor ID to validate (e.g., claude_md_file, readme_structure, test_coverage)",
+)
+@click.option(
+    "--tasks",
+    "-t",
+    multiple=True,
+    help="Terminal-Bench task names (default: Phase 1 subset of 8 tasks)",
+)
+@click.option(
+    "--runs",
+    "-r",
+    type=int,
+    default=3,
+    help="Number of runs per task (default: 3, recommended: 5+)",
+)
+@click.option( + "--model", + "-m", + type=click.Choice(["claude-haiku-4-5", "claude-sonnet-4-5"]), + default="claude-haiku-4-5", + help="Claude model to use (default: haiku-4-5 for speed)", +) +@click.option( + "--output-dir", + "-o", + type=click.Path(), + default=None, + help="Output directory (default: .agentready/validations/{assessor_id}/)", +) +@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output") +@click.option( + "--list-assessors", + is_flag=True, + help="List supported assessors and exit", +) +@click.option( + "--concurrent", + "-c", + type=int, + default=1, + help="Number of concurrent tasks to run in parallel (default: 1, recommended: 3-5)", +) +@click.option( + "--smoketest", + is_flag=True, + help="Run smoketest with single task for quick validation (~2 minutes)", +) +def validate_assessor( + assessor, + tasks, + runs, + model, + output_dir, + verbose, + list_assessors, + concurrent, + smoketest, +): + """Validate single assessor impact using Terminal-Bench A/B testing. + + This command empirically measures the impact of a specific AgentReady assessor + on Claude Code's Terminal-Bench performance by running an A/B test: + + \b + 1. Baseline: Force assessor to FAIL (manipulate repo state) + 2. Treatment: Restore repo to normal state (assessor PASSES) + 3. Compare success rates with statistical significance testing + + Examples: + + \b + # Quick smoketest with 1 task (~2 minutes) + agentready validate-assessor --assessor claude_md_file --smoketest + + \b + # Validate CLAUDE.md impact (8 tasks, 3 runs each = 48 trials) + agentready validate-assessor --assessor claude_md_file + + \b + # Test README with custom tasks and 5 runs for higher statistical power + agentready validate-assessor --assessor readme_structure \\ + --tasks adaptive-rejection-sampler \\ + --tasks async-http-client \\ + --runs 5 + + \b + # List all supported assessors + agentready validate-assessor --list-assessors + """ + repo_root = Path.cwd() + + # Handle --list-assessors + if list_assessors: + toggler = AssessorStateToggler(repo_root=repo_root) + supported = toggler.list_supported_assessors() + + click.echo("Supported Assessors for Validation:") + click.echo(f"{'=' * 60}") + for assessor_id in supported: + click.echo(f" - {assessor_id}") + click.echo(f"\nTotal: {len(supported)} assessors") + click.echo("\nUsage: agentready validate-assessor --assessor {assessor_id}") + return + + # Validate that --assessor is provided if not listing + if not assessor: + click.echo( + "Error: Missing required option '--assessor' / '-a'.\n" + "Use --list-assessors to see available assessors.", + err=True, + ) + raise click.Abort() + + # Validate ANTHROPIC_API_KEY + api_key = os.environ.get("ANTHROPIC_API_KEY", "") + if not api_key: + click.echo( + "Error: ANTHROPIC_API_KEY environment variable not set.\n" + "Set it with: export ANTHROPIC_API_KEY=your-key-here", + err=True, + ) + raise click.Abort() + + # Use default Phase 1 tasks if not specified + if not tasks: + if smoketest: + # Smoketest: just 1 simple task for quick validation + tasks = ["adaptive-rejection-sampler"] + if verbose: + click.echo("Using smoketest mode (1 task, ~2 minutes total)\n") + else: + tasks = DEFAULT_PHASE1_TASKS + if verbose: + click.echo(f"Using default Phase 1 task subset ({len(tasks)} tasks)\n") + + # Convert model name to full identifier + model_id = f"anthropic/{model}" + + # Set output directory + if output_dir: + output_path = Path(output_dir) + else: + output_path = Path(".agentready") / "validations" / assessor + + 
try: + # Run A/B comparison + comparison = compare_assessor_impact( + assessor_id=assessor, + task_names=list(tasks), + repo_root=repo_root, + runs_per_task=runs, + output_dir=output_path, + model=model_id, + n_concurrent=concurrent, + verbose=verbose, + ) + + # Save JSON results + json_file = output_path / f"{assessor}.json" + with open(json_file, "w") as f: + json.dump( + { + "assessor_id": assessor, + "tasks": list(tasks), + "runs_per_task": runs, + "baseline": comparison.without_agent.to_dict(), + "treatment": comparison.with_agent.to_dict(), + "deltas": comparison.deltas, + "statistical_significance": comparison.statistical_significance, + }, + f, + indent=2, + ) + + # Generate Markdown report + md_file = output_path / f"{assessor}.md" + with open(md_file, "w") as f: + f.write(f"# Assessor Impact Validation: {assessor}\n\n") + f.write(f"**Date**: {comparison.created_at}\n") + f.write(f"**Tasks**: {len(tasks)}\n") + f.write(f"**Runs per Task**: {runs}\n\n") + + f.write("## Results Summary\n\n") + f.write( + "| Metric | Baseline (Assessor Fails) | Treatment (Assessor Passes) | Delta |\n" + ) + f.write( + "|--------|---------------------------|----------------------------|-------|\n" + ) + + delta = comparison.deltas["success_rate_delta"] + sign = "+" if delta >= 0 else "" + f.write( + f"| Success Rate | {comparison.without_agent.success_rate:.1f}% | " + f"{comparison.with_agent.success_rate:.1f}% | **{sign}{delta:.1f} pp** |\n" + ) + + duration_delta = comparison.deltas.get("avg_duration_delta_sec", 0) + sign = "+" if duration_delta >= 0 else "" + f.write( + f"| Avg Duration | {comparison.without_agent.avg_duration_sec:.1f}s | " + f"{comparison.with_agent.avg_duration_sec:.1f}s | {sign}{duration_delta:.1f}s |\n" + ) + + f.write("\n## Statistical Significance\n\n") + is_sig = comparison.statistical_significance.get( + "success_rate_significant", False + ) + p_val = comparison.statistical_significance.get("success_rate_p_value") + f.write(f"- **Significant**: {'YES ✓' if is_sig else 'NO'}\n") + if p_val is not None: + f.write(f"- **P-value**: {p_val:.4f}\n") + + f.write("\n## Files\n\n") + f.write(f"- JSON: `{json_file}`\n") + f.write(f"- Markdown: `{md_file}`\n") + + click.echo("\nResults saved:") + click.echo(f" - JSON: {json_file}") + click.echo(f" - Markdown: {md_file}\n") + + except ValueError as e: + click.echo(f"\nError: {e}\n", err=True) + click.echo("Use --list-assessors to see supported assessors.", err=True) + raise click.Abort() + except Exception as e: + click.echo(f"\nValidation failed: {e}\n", err=True) + if verbose: + import traceback + + traceback.print_exc() + raise click.Abort() diff --git a/src/agentready/cli/main.py b/src/agentready/cli/main.py index 64c99937..73f888e7 100644 --- a/src/agentready/cli/main.py +++ b/src/agentready/cli/main.py @@ -29,7 +29,7 @@ # Lightweight commands - imported immediately from .align import align -from .benchmark import benchmark +from .benchmark import benchmark, validate_assessor from .bootstrap import bootstrap from .demo import demo from .repomix import repomix_generate @@ -544,6 +544,7 @@ def generate_config(): # Register lightweight commands (heavy commands loaded lazily via LazyGroup) cli.add_command(align) cli.add_command(benchmark) +cli.add_command(validate_assessor) cli.add_command(bootstrap) cli.add_command(demo) cli.add_command(migrate_report) diff --git a/src/agentready/services/harbor/agent_toggler.py b/src/agentready/services/harbor/agent_toggler.py index 6653c82f..16151109 100644 --- 
a/src/agentready/services/harbor/agent_toggler.py +++ b/src/agentready/services/harbor/agent_toggler.py @@ -1,9 +1,9 @@ -"""Service for safely enabling/disabling the doubleagent.md file.""" +"""Service for safely enabling/disabling agent files and manipulating repository state.""" import shutil from contextlib import contextmanager from pathlib import Path -from typing import Generator +from typing import Callable, Dict, Generator, Tuple class AgentFileToggler: @@ -89,3 +89,167 @@ def temporarily_enabled(self) -> Generator[None, None, None]: finally: if was_disabled: self.disable() + + +class AssessorStateToggler: + """Manipulate repository state to force assessor pass/fail for A/B testing. + + This class safely modifies repository files to simulate scenarios where + specific assessors would pass or fail, enabling empirical validation of + assessor impacts on agent performance. + + Example: + toggler = AssessorStateToggler() + + # Force CLAUDE.md assessor to fail + toggler.force_fail("claude_md_file") + # Run benchmark + toggler.restore("claude_md_file") + """ + + # Mapping of assessor IDs to (fail_action, restore_action) tuples + # Each action is a callable that takes the repository root Path + MANIPULATIONS: Dict[str, Tuple[Callable[[Path], None], Callable[[Path], None]]] = {} + + @classmethod + def register_manipulation( + cls, + assessor_id: str, + fail_action: Callable[[Path], None], + restore_action: Callable[[Path], None], + ) -> None: + """Register a manipulation strategy for an assessor. + + Args: + assessor_id: Unique assessor identifier (e.g., "claude_md_file") + fail_action: Function to force assessor to fail state + restore_action: Function to restore assessor to pass state + """ + cls.MANIPULATIONS[assessor_id] = (fail_action, restore_action) + + def __init__(self, repo_root: Path = None): + """Initialize toggler with repository root. 
+ + Args: + repo_root: Root of the repository (default: current directory) + """ + self.repo_root = repo_root or Path.cwd() + self._backup_suffix = ".assessor_backup" + self._initialize_default_manipulations() + + def _initialize_default_manipulations(self) -> None: + """Register default manipulation strategies for Phase 1 assessors.""" + + # CLAUDE.md Assessor - Tier 1, 10% weight + def fail_claude_md(repo_root: Path) -> None: + claude_md = repo_root / ".claude" / "CLAUDE.md" + backup = repo_root / ".claude" / f"CLAUDE.md{self._backup_suffix}" + if claude_md.exists() and not backup.exists(): + shutil.move(str(claude_md), str(backup)) + + def restore_claude_md(repo_root: Path) -> None: + claude_md = repo_root / ".claude" / "CLAUDE.md" + backup = repo_root / ".claude" / f"CLAUDE.md{self._backup_suffix}" + if backup.exists() and not claude_md.exists(): + shutil.move(str(backup), str(claude_md)) + + self.register_manipulation("claude_md_file", fail_claude_md, restore_claude_md) + + # README Assessor - Tier 1, 10% weight + def fail_readme(repo_root: Path) -> None: + readme = repo_root / "README.md" + backup = repo_root / f"README.md{self._backup_suffix}" + if readme.exists() and not backup.exists(): + shutil.move(str(readme), str(backup)) + + def restore_readme(repo_root: Path) -> None: + readme = repo_root / "README.md" + backup = repo_root / f"README.md{self._backup_suffix}" + if backup.exists() and not readme.exists(): + shutil.move(str(backup), str(readme)) + + self.register_manipulation("readme_structure", fail_readme, restore_readme) + + # Test Coverage Assessor - Tier 2, 3% weight + def fail_tests(repo_root: Path) -> None: + tests_dir = repo_root / "tests" + backup_dir = repo_root / f"tests{self._backup_suffix}" + if tests_dir.exists() and not backup_dir.exists(): + shutil.move(str(tests_dir), str(backup_dir)) + + def restore_tests(repo_root: Path) -> None: + tests_dir = repo_root / "tests" + backup_dir = repo_root / f"tests{self._backup_suffix}" + if backup_dir.exists() and not tests_dir.exists(): + shutil.move(str(backup_dir), str(tests_dir)) + + self.register_manipulation("test_coverage", fail_tests, restore_tests) + + def force_fail(self, assessor_id: str) -> None: + """Force assessor to fail by manipulating repository state. + + Args: + assessor_id: Assessor identifier (e.g., "claude_md_file") + + Raises: + ValueError: If assessor_id is not recognized + """ + if assessor_id not in self.MANIPULATIONS: + raise ValueError( + f"Unknown assessor: {assessor_id}. " + f"Available: {', '.join(self.MANIPULATIONS.keys())}" + ) + + fail_action, _ = self.MANIPULATIONS[assessor_id] + fail_action(self.repo_root) + + def restore(self, assessor_id: str) -> None: + """Restore repository to state where assessor passes. + + Args: + assessor_id: Assessor identifier (e.g., "claude_md_file") + + Raises: + ValueError: If assessor_id is not recognized + """ + if assessor_id not in self.MANIPULATIONS: + raise ValueError( + f"Unknown assessor: {assessor_id}. " + f"Available: {', '.join(self.MANIPULATIONS.keys())}" + ) + + _, restore_action = self.MANIPULATIONS[assessor_id] + restore_action(self.repo_root) + + def list_supported_assessors(self) -> list[str]: + """Get list of assessor IDs with registered manipulations. + + Returns: + List of assessor IDs that can be toggled + """ + return list(self.MANIPULATIONS.keys()) + + @contextmanager + def temporarily_failed(self, assessor_id: str) -> Generator[None, None, None]: + """Context manager for safe fail/restore of assessor state. 
+ + Ensures repository is restored even if exception occurs during testing. + + Args: + assessor_id: Assessor identifier + + Example: + toggler = AssessorStateToggler() + with toggler.temporarily_failed("claude_md_file"): + # CLAUDE.md is now missing (assessor fails) + run_benchmark() + # CLAUDE.md is automatically restored here + + Yields: + None + """ + try: + self.force_fail(assessor_id) + yield + finally: + self.restore(assessor_id) diff --git a/src/agentready/services/harbor/comparer.py b/src/agentready/services/harbor/comparer.py index da0a27c9..df3b4461 100644 --- a/src/agentready/services/harbor/comparer.py +++ b/src/agentready/services/harbor/comparer.py @@ -1,8 +1,12 @@ """Service for comparing Harbor benchmark runs and calculating statistical significance.""" +from pathlib import Path from typing import List, Optional from agentready.models.harbor import HarborComparison, HarborRunMetrics +from agentready.services.harbor.agent_toggler import AssessorStateToggler +from agentready.services.harbor.result_parser import parse_harbor_results +from agentready.services.harbor.runner import HarborRunner def compare_runs( @@ -186,3 +190,160 @@ def interpret_effect_size(cohens_d: float) -> str: return "medium" else: return "large" + + +def compare_assessor_impact( + assessor_id: str, + task_names: List[str], + repo_root: Path, + runs_per_task: int = 3, + output_dir: Path = None, + model: str = "anthropic/claude-sonnet-4-5", + n_concurrent: int = 1, + verbose: bool = True, +) -> HarborComparison: + """A/B test assessor impact: baseline (assessor fails) vs treatment (assessor passes). + + This function orchestrates the complete A/B testing workflow: + 1. Force assessor to fail (manipulate repository state) + 2. Run Harbor benchmark (baseline) + 3. Restore repository to normal state (assessor passes) + 4. Run Harbor benchmark again (treatment) + 5. 
Compare results with statistical significance + + Args: + assessor_id: Assessor identifier (e.g., "claude_md_file") + task_names: List of Terminal-Bench task names to test + repo_root: Root of the repository to test + runs_per_task: Number of runs per task (default: 3, recommended: 5+) + output_dir: Directory to store results (default: .agentready/validations/{assessor_id}/) + model: Claude model to use (default: sonnet-4-5) + n_concurrent: Number of concurrent tasks to run in parallel (default: 1) + verbose: Print progress messages (default: True) + + Returns: + HarborComparison with baseline vs treatment metrics, deltas, and significance + + Raises: + ValueError: If assessor_id is not supported + HarborNotInstalledError: If Harbor CLI is not available + subprocess.CalledProcessError: If Harbor benchmark fails + + Example: + comparison = compare_assessor_impact( + assessor_id="claude_md_file", + task_names=["adaptive-rejection-sampler", "async-http-client"], + repo_root=Path("."), + runs_per_task=3, + verbose=True + ) + + print(f"Success rate delta: {comparison.deltas['success_rate_delta']:.1f}%") + print(f"Significant: {comparison.statistical_significance['success_rate_significant']}") + """ + # Default output directory + if output_dir is None: + output_dir = Path(".agentready") / "validations" / assessor_id + output_dir.mkdir(parents=True, exist_ok=True) + + # Initialize components + toggler = AssessorStateToggler(repo_root=repo_root) + runner = HarborRunner() + + if verbose: + print(f"\nAssessor Impact Validation: {assessor_id}") + print(f"{'=' * 60}") + print(f"Tasks: {', '.join(task_names)}") + print(f"Runs per task: {runs_per_task}") + print( + f"Total trials: {len(task_names) * runs_per_task * 2} (baseline + treatment)" + ) + print(f"Output: {output_dir}\n") + + # Baseline: Assessor fails (remove CLAUDE.md, README, etc.) 
+ if verbose: + print("[1/2] Running baseline (assessor FAILS)...") + print(f" Manipulating repository state to force {assessor_id} to fail") + + baseline_output = output_dir / "baseline" + with toggler.temporarily_failed(assessor_id): + if verbose: + print( + f" Running {len(task_names)} tasks × {runs_per_task} runs = {len(task_names) * runs_per_task} trials" + ) + + baseline_results_dir = runner.run_benchmark( + task_names=task_names, + output_dir=baseline_output, + model=model, + n_concurrent=n_concurrent, + verbose=verbose, + ) + + # Parse results + baseline_task_results = parse_harbor_results(baseline_results_dir) + baseline_metrics = HarborRunMetrics.from_task_results( + run_id=f"{assessor_id}_baseline", + agent_file_enabled=False, # Assessor fails + task_results=baseline_task_results, + ) + + if verbose: + print( + f" Baseline complete: {baseline_metrics.success_rate:.1f}% success rate\n" + ) + + # Treatment: Assessor passes (normal repository state) + if verbose: + print("[2/2] Running treatment (assessor PASSES)...") + print(" Repository restored to normal state") + + treatment_output = output_dir / "treatment" + if verbose: + print( + f" Running {len(task_names)} tasks × {runs_per_task} runs = {len(task_names) * runs_per_task} trials" + ) + + treatment_results_dir = runner.run_benchmark( + task_names=task_names, + output_dir=treatment_output, + model=model, + n_concurrent=n_concurrent, + verbose=verbose, + ) + + # Parse results + treatment_task_results = parse_harbor_results(treatment_results_dir) + treatment_metrics = HarborRunMetrics.from_task_results( + run_id=f"{assessor_id}_treatment", + agent_file_enabled=True, # Assessor passes + task_results=treatment_task_results, + ) + + if verbose: + print( + f" Treatment complete: {treatment_metrics.success_rate:.1f}% success rate\n" + ) + + # Compare results + comparison = compare_runs( + without_agent=baseline_metrics, with_agent=treatment_metrics + ) + + if verbose: + print(f"{'=' * 60}") + print("Results Summary") + print(f"{'=' * 60}") + delta = comparison.deltas["success_rate_delta"] + sign = "+" if delta >= 0 else "" + print(f"Success Rate Delta: {sign}{delta:.1f} percentage points") + print( + f"Statistical Significance: {'YES' if comparison.statistical_significance.get('success_rate_significant', False) else 'NO'}" + ) + if comparison.statistical_significance.get("success_rate_p_value"): + print( + f"P-value: {comparison.statistical_significance['success_rate_p_value']:.4f}" + ) + print(f"\nResults saved to: {output_dir}\n") + + return comparison diff --git a/src/agentready/services/harbor/result_parser.py b/src/agentready/services/harbor/result_parser.py index 684fd082..c8b1c01c 100644 --- a/src/agentready/services/harbor/result_parser.py +++ b/src/agentready/services/harbor/result_parser.py @@ -23,8 +23,17 @@ def parse_harbor_results(results_dir: Path) -> List[HarborTaskResult]: if not results_dir.exists(): raise FileNotFoundError(f"Results directory not found: {results_dir}") - # Find all result.json files in subdirectories - result_files = list(results_dir.glob("*/result.json")) + # Find all result.json files in subdirectories (task directories only, not job-level result.json) + # Task directories are named like "task-name__hash/" while the job result.json is at the root + all_result_files = list(results_dir.glob("*/result.json")) + + # Filter to only task result files (exclude job-level result.json) + # Task directories contain "__" in their name (e.g., "build-pmars__abc123") + result_files = [ + f + for f in 
all_result_files + if "__" in f.parent.name # Task directories have "__" separator + ] if not result_files: raise ValueError(f"No result.json files found in {results_dir}") diff --git a/src/agentready/services/harbor/runner.py b/src/agentready/services/harbor/runner.py index 6dbd7d76..d9dca95d 100644 --- a/src/agentready/services/harbor/runner.py +++ b/src/agentready/services/harbor/runner.py @@ -1,7 +1,8 @@ """Service for executing Harbor benchmarks via CLI.""" -import json +import inspect import subprocess +import warnings from pathlib import Path from typing import List @@ -12,12 +13,19 @@ class HarborNotInstalledError(Exception): pass +class HarborTaskFilteringBugWarning(UserWarning): + """Raised when Harbor has the task filtering bug.""" + + pass + + class HarborRunner: """Execute Harbor benchmarks via subprocess and capture results.""" def __init__(self): """Initialize Harbor runner and verify installation.""" self._verify_harbor_installed() + self._check_harbor_task_filtering() def _verify_harbor_installed(self) -> None: """Verify Harbor CLI is installed and accessible. @@ -27,7 +35,7 @@ def _verify_harbor_installed(self) -> None: """ try: subprocess.run( - ["harbor", "--version"], + ["harbor", "--help"], capture_output=True, text=True, check=True, @@ -42,6 +50,58 @@ def _verify_harbor_installed(self) -> None: except subprocess.CalledProcessError as e: raise HarborNotInstalledError(f"Harbor CLI error: {e.stderr}") + def _check_harbor_task_filtering(self) -> None: + """Check if Harbor has the task filtering bug fix. + + Warns if Harbor version has the known task filtering bug where -t flags are ignored. + + The bug was fixed in Harbor commit f9e6d2e (Dec 12, 2025) but not yet released. + PyPI version 0.1.23 (Dec 11, 2025) has the bug. + + See: https://github.com/laude-institute/harbor/commit/f9e6d2e10c72d33373012294c36fd4938c45c26c + """ + try: + from harbor.models.job.config import BaseDatasetConfig + + # Check if the fix is present by inspecting source code + source = inspect.getsource(BaseDatasetConfig._filter_task_ids) + + # The fix changed task_id.path.name to task_id.get_name() + # If we see path.name (the bug), warn the user + if "task_id.path.name" in source or ".path.name" in source: + warnings.warn( + "\n" + "⚠️ WARNING: Harbor has a task filtering bug!\n" + "\n" + "Your Harbor version has a bug where -t/--task-name flags are ignored.\n" + "This causes smoketests to run ALL tasks instead of the filtered subset.\n" + "\n" + "The bug was fixed in Harbor main (Dec 12, 2025) but not yet released.\n" + "Latest PyPI version 0.1.23 (Dec 11) still has the bug.\n" + "\n" + "FIX OPTIONS:\n" + "\n" + "Option 1 (recommended): Install Harbor from main\n" + " pip uninstall harbor\n" + " pip install git+https://github.com/laude-institute/harbor.git\n" + "\n" + "Option 2: Apply patch to your local Harbor installation\n" + " See: patches/harbor-task-filtering-fix.patch in AgentReady repo\n" + "\n" + "Commit: https://github.com/laude-institute/harbor/commit/f9e6d2e\n", + HarborTaskFilteringBugWarning, + stacklevel=2, + ) + # If we see get_name() (the fix), all good - no warning + + except ImportError: + # Harbor not importable as Python package (only CLI installed) + # Can't check for bug, but also can't use task filtering anyway + pass + except Exception: + # Don't fail if we can't check - just skip the warning + pass + def run_benchmark( self, task_names: List[str], @@ -89,25 +149,11 @@ def run_benchmark( str(n_concurrent), ] - # Add task selection if specified - if task_names: - # Harbor 
uses config JSON for task selection - config_file = output_dir / "config.json" - config = { - "datasets": [ - { - "name": dataset, - "version": dataset_version, - "task_names": task_names, - } - ], - "agents": [{"name": agent, "model_name": model}], - } - with open(config_file, "w") as f: - json.dump(config, f, indent=2) - - # Use config file instead of CLI args - cmd = ["harbor", "run", "-c", str(config_file)] + # Add task name filters + # NOTE: Requires Harbor >= 0.1.24 (or install from main) + # Task filtering was fixed in commit f9e6d2e (Dec 12, 2025) + for task_name in task_names: + cmd.extend(["-t", task_name]) # Execute Harbor benchmark if verbose: diff --git a/tests/unit/services/harbor/__init__.py b/tests/unit/services/harbor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/services/harbor/test_assessor_state_toggler.py b/tests/unit/services/harbor/test_assessor_state_toggler.py new file mode 100644 index 00000000..5d227b6e --- /dev/null +++ b/tests/unit/services/harbor/test_assessor_state_toggler.py @@ -0,0 +1,240 @@ +"""Unit tests for AssessorStateToggler.""" + +import pytest + +from agentready.services.harbor.agent_toggler import AssessorStateToggler + + +@pytest.fixture +def temp_repo(tmp_path): + """Create a temporary repository structure for testing.""" + repo_root = tmp_path / "test_repo" + repo_root.mkdir() + + # Create .claude directory and CLAUDE.md + claude_dir = repo_root / ".claude" + claude_dir.mkdir() + claude_md = claude_dir / "CLAUDE.md" + claude_md.write_text("# Test Project\n\nThis is a test CLAUDE.md file.") + + # Create README.md + readme = repo_root / "README.md" + readme.write_text("# Test Repo\n\n## Installation\n\n## Usage") + + # Create tests directory + tests_dir = repo_root / "tests" + tests_dir.mkdir() + (tests_dir / "test_example.py").write_text("def test_example():\n assert True") + + return repo_root + + +class TestAssessorStateToggler: + """Test AssessorStateToggler functionality.""" + + def test_initialization(self, temp_repo): + """Test toggler initializes with correct repo root.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + assert toggler.repo_root == temp_repo + assert toggler._backup_suffix == ".assessor_backup" + + def test_list_supported_assessors(self, temp_repo): + """Test listing supported assessors returns expected IDs.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + supported = toggler.list_supported_assessors() + + assert "claude_md_file" in supported + assert "readme_structure" in supported + assert "test_coverage" in supported + assert len(supported) == 3 + + def test_force_fail_claude_md(self, temp_repo): + """Test forcing CLAUDE.md assessor to fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup" + + # Verify file exists before fail + assert claude_md.exists() + assert not backup.exists() + + # Force fail + toggler.force_fail("claude_md_file") + + # Verify file is backed up + assert not claude_md.exists() + assert backup.exists() + + def test_restore_claude_md(self, temp_repo): + """Test restoring CLAUDE.md after forcing fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + # Force fail first + toggler.force_fail("claude_md_file") + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup" + + # Verify backup state + assert not claude_md.exists() + assert backup.exists() + + # Restore + 
toggler.restore("claude_md_file") + + # Verify restored + assert claude_md.exists() + assert not backup.exists() + + def test_force_fail_readme(self, temp_repo): + """Test forcing README assessor to fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + readme = temp_repo / "README.md" + backup = temp_repo / "README.md.assessor_backup" + + # Force fail + toggler.force_fail("readme_structure") + + # Verify + assert not readme.exists() + assert backup.exists() + + def test_restore_readme(self, temp_repo): + """Test restoring README after forcing fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + # Force fail first + toggler.force_fail("readme_structure") + + # Restore + toggler.restore("readme_structure") + + readme = temp_repo / "README.md" + backup = temp_repo / "README.md.assessor_backup" + + # Verify + assert readme.exists() + assert not backup.exists() + + def test_force_fail_tests(self, temp_repo): + """Test forcing test coverage assessor to fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + tests_dir = temp_repo / "tests" + backup_dir = temp_repo / "tests.assessor_backup" + + # Force fail + toggler.force_fail("test_coverage") + + # Verify + assert not tests_dir.exists() + assert backup_dir.exists() + + def test_restore_tests(self, temp_repo): + """Test restoring tests directory after forcing fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + # Force fail first + toggler.force_fail("test_coverage") + + # Restore + toggler.restore("test_coverage") + + tests_dir = temp_repo / "tests" + backup_dir = temp_repo / "tests.assessor_backup" + + # Verify + assert tests_dir.exists() + assert not backup_dir.exists() + + def test_temporarily_failed_context_manager(self, temp_repo): + """Test temporarily_failed context manager restores state.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + + # Verify initial state + assert claude_md.exists() + + # Use context manager + with toggler.temporarily_failed("claude_md_file"): + # Inside context: file should be missing + assert not claude_md.exists() + + # Outside context: file should be restored + assert claude_md.exists() + + def test_temporarily_failed_exception_still_restores(self, temp_repo): + """Test temporarily_failed restores even when exception occurs.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + + # Verify initial state + assert claude_md.exists() + + # Use context manager with exception + with pytest.raises(RuntimeError): + with toggler.temporarily_failed("claude_md_file"): + assert not claude_md.exists() + raise RuntimeError("Test exception") + + # File should still be restored + assert claude_md.exists() + + def test_unknown_assessor_raises_error(self, temp_repo): + """Test that unknown assessor ID raises ValueError.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + with pytest.raises(ValueError, match="Unknown assessor: nonexistent"): + toggler.force_fail("nonexistent") + + with pytest.raises(ValueError, match="Unknown assessor: nonexistent"): + toggler.restore("nonexistent") + + def test_idempotent_force_fail(self, temp_repo): + """Test that forcing fail multiple times is idempotent.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + # Force fail twice + toggler.force_fail("claude_md_file") + toggler.force_fail("claude_md_file") # Should not error + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + backup = temp_repo / ".claude" / 
"CLAUDE.md.assessor_backup" + + # Still in failed state + assert not claude_md.exists() + assert backup.exists() + + def test_idempotent_restore(self, temp_repo): + """Test that restoring multiple times is idempotent.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + # Force fail, then restore twice + toggler.force_fail("claude_md_file") + toggler.restore("claude_md_file") + toggler.restore("claude_md_file") # Should not error + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup" + + # Still in normal state + assert claude_md.exists() + assert not backup.exists() + + def test_content_preservation(self, temp_repo): + """Test that file content is preserved through fail/restore cycle.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + original_content = claude_md.read_text() + + # Fail and restore + toggler.force_fail("claude_md_file") + toggler.restore("claude_md_file") + + # Verify content is identical + assert claude_md.read_text() == original_content