diff --git a/patches/README.md b/patches/README.md
new file mode 100644
index 00000000..f2838442
--- /dev/null
+++ b/patches/README.md
@@ -0,0 +1,58 @@
+# AgentReady Patches
+
+This directory contains patches for upstream dependencies whose fixes have been merged but not yet released.
+
+## harbor-task-filtering-fix.patch
+
+**Status**: Fixed upstream, waiting for release
+
+**Problem**: Harbor's `-t`/`--task-name` flag is ignored, causing all tasks to run instead of the filtered subset.
+
+**Upstream Fix**: [Harbor commit f9e6d2e](https://github.com/laude-institute/harbor/commit/f9e6d2e10c72d33373012294c36fd4938c45c26c) (Dec 12, 2025)
+
+**Released**: ❌ Not yet (latest PyPI: 0.1.23 from Dec 11, 2025)
+
+### Option 1: Install from Main (Recommended)
+
+```bash
+pip uninstall harbor
+pip install git+https://github.com/laude-institute/harbor.git
+```
+
+### Option 2: Apply Patch Manually
+
+```bash
+# Find Harbor installation
+HARBOR_PATH=$(python -c "import harbor, os; print(os.path.dirname(harbor.__file__))")
+
+# Apply patch from site-packages (-p2 strips the a/src/ prefix used in the patch)
+cd "$HARBOR_PATH/.."
+patch -p2 < /path/to/agentready/patches/harbor-task-filtering-fix.patch
+```
+
+### Option 3: Manual Edit
+
+Edit `harbor/models/job/config.py`:
+
+```python
+# Line 42: Change
+fnmatch(task_id.path.name, pattern_id)
+# To:
+fnmatch(task_id.get_name(), pattern_id)
+
+# Line 52: Change
+fnmatch(task_id.path.name, pattern_id)
+# To:
+fnmatch(task_id.get_name(), pattern_id)
+
+# Line 76: Change
+TaskConfig(path=task_id.path, source=self.path.name)
+# To:
+TaskConfig(path=task_id.path, source=self.path.expanduser().resolve().name)
+```
+
+## When to Remove
+
+Once Harbor 0.1.24 (or later) is released to PyPI, this patch is no longer needed.
+
+Check PyPI: https://pypi.org/project/harbor/#history
diff --git a/patches/harbor-task-filtering-fix.patch b/patches/harbor-task-filtering-fix.patch
new file mode 100644
index 00000000..a6de10bf
--- /dev/null
+++ b/patches/harbor-task-filtering-fix.patch
@@ -0,0 +1,75 @@
+From f9e6d2e10c72d33373012294c36fd4938c45c26c Mon Sep 17 00:00:00 2001
+From: Alex Shaw
+Date: Fri Dec 12 21:21:27 2025 -0800
+Subject: [PATCH] Fix task filtering to use polymorphic get_name() method
+
+Fix for Harbor task filtering bug where -t/--task-name flags were ignored.
+
+This patch is already merged in Harbor main (commit f9e6d2e) but not yet
+released to PyPI. Latest version 0.1.23 (Dec 11, 2025) still has the bug.
+
+Apply this patch to your local Harbor installation if you can't install
+from the main branch.
+
+See: https://github.com/laude-institute/harbor/commit/f9e6d2e
+
+---
+ src/harbor/models/job/config.py | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/src/harbor/models/job/config.py b/src/harbor/models/job/config.py
+index 4a35f1f..f7a0ec9 100644
+--- a/src/harbor/models/job/config.py
++++ b/src/harbor/models/job/config.py
+@@ -39,7 +39,7 @@ class BaseDatasetConfig(BaseModel, ABC):
+             task_id
+             for task_id in filtered_ids
+             if any(
+-                fnmatch(task_id.path.name, pattern_id)
++                fnmatch(task_id.get_name(), pattern_id)
+                 for pattern_id in self.task_names
+             )
+         ]
+@@ -49,7 +49,7 @@ class BaseDatasetConfig(BaseModel, ABC):
+             task_id
+             for task_id in filtered_ids
+             if not any(
+-                fnmatch(task_id.path.name, pattern_id)
++                fnmatch(task_id.get_name(), pattern_id)
+                 for pattern_id in self.exclude_task_names
+             )
+         ]
+@@ -73,7 +73,7 @@ class LocalDatasetConfig(BaseDatasetConfig):
+         ]
+         filtered_task_ids = self._filter_task_ids(task_ids)
+         return [
+-            TaskConfig(path=task_id.path, source=self.path.name)
++            TaskConfig(path=task_id.path, source=self.path.expanduser().resolve().name)
+             for task_id in filtered_task_ids
+         ]
+
+--
+2.47.1
+
+USAGE:
+------
+
+Option 1: Install Harbor from main (recommended)
+    pip uninstall harbor
+    pip install git+https://github.com/laude-institute/harbor.git
+
+Option 2: Apply this patch to your local Harbor installation
+    # Find your Harbor installation
+    python -c "import harbor; print(harbor.__file__)"
+    # Output example: /path/to/site-packages/harbor/__init__.py
+
+    # Navigate to the site-packages directory (parent of the harbor package)
+    cd /path/to/site-packages
+
+    # Apply patch (-p2 strips the a/src/ prefix used in this patch)
+    git apply -p2 /path/to/agentready/patches/harbor-task-filtering-fix.patch
+
+    # Or manually edit harbor/models/job/config.py:
+    # Line 42: task_id.path.name -> task_id.get_name()
+    # Line 52: task_id.path.name -> task_id.get_name()
+    # Line 76: self.path.name -> self.path.expanduser().resolve().name
diff --git a/src/agentready/cli/benchmark.py b/src/agentready/cli/benchmark.py
index d469ab34..b74cea14 100644
--- a/src/agentready/cli/benchmark.py
+++ b/src/agentready/cli/benchmark.py
@@ -1,5 +1,6 @@
 """Benchmark command for running agent coding evaluations."""
 
+import json
 import os
 import tempfile
 from pathlib import Path
@@ -8,6 +9,8 @@
 
 from ..services.eval_harness.harbor_config import HarborConfig
 from ..services.eval_harness.tbench_runner import _real_tbench_result
+from ..services.harbor.agent_toggler import AssessorStateToggler
+from ..services.harbor.comparer import compare_assessor_impact
 
 
 @click.command()
@@ -187,3 +190,255 @@ def _run_tbench(repo_path, subset, model, verbose, timeout, output_dir, skip_pre
         traceback.print_exc()
 
         raise click.Abort()
+
+
+# Default Phase 1 task subset (8 diverse tasks, ~2-3 hours per assessor)
+DEFAULT_PHASE1_TASKS = [
+    "adaptive-rejection-sampler",  # Math/algorithms
+    "async-http-client",  # Networking
+    "terminal-file-browser",  # File I/O
+    "markdown-parser",  # Text processing
+    "json-validator",  # Data structures
+    "cli-calculator",  # User interaction
+    "log-analyzer",  # String manipulation
+    "sudoku-solver",  # Logic
+]
+
+
+@click.command()
+@click.option(
+    "--assessor",
+    "-a",
+    required=False,
+    help="Assessor ID to validate (e.g., claude_md_file, readme_structure, test_coverage)",
+)
+@click.option(
+    "--tasks",
+    "-t",
+    multiple=True,
+    help="Terminal-Bench task names (default: Phase 1 subset of 8 tasks)",
+)
+@click.option(
+    "--runs",
+    "-r",
+    type=int,
+    default=3,
+    help="Number of runs per task (default: 3, recommended: 5+)",
+)
+@click.option( + "--model", + "-m", + type=click.Choice(["claude-haiku-4-5", "claude-sonnet-4-5"]), + default="claude-haiku-4-5", + help="Claude model to use (default: haiku-4-5 for speed)", +) +@click.option( + "--output-dir", + "-o", + type=click.Path(), + default=None, + help="Output directory (default: .agentready/validations/{assessor_id}/)", +) +@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output") +@click.option( + "--list-assessors", + is_flag=True, + help="List supported assessors and exit", +) +@click.option( + "--concurrent", + "-c", + type=int, + default=1, + help="Number of concurrent tasks to run in parallel (default: 1, recommended: 3-5)", +) +@click.option( + "--smoketest", + is_flag=True, + help="Run smoketest with single task for quick validation (~2 minutes)", +) +def validate_assessor( + assessor, + tasks, + runs, + model, + output_dir, + verbose, + list_assessors, + concurrent, + smoketest, +): + """Validate single assessor impact using Terminal-Bench A/B testing. + + This command empirically measures the impact of a specific AgentReady assessor + on Claude Code's Terminal-Bench performance by running an A/B test: + + \b + 1. Baseline: Force assessor to FAIL (manipulate repo state) + 2. Treatment: Restore repo to normal state (assessor PASSES) + 3. Compare success rates with statistical significance testing + + Examples: + + \b + # Quick smoketest with 1 task (~2 minutes) + agentready validate-assessor --assessor claude_md_file --smoketest + + \b + # Validate CLAUDE.md impact (8 tasks, 3 runs each = 48 trials) + agentready validate-assessor --assessor claude_md_file + + \b + # Test README with custom tasks and 5 runs for higher statistical power + agentready validate-assessor --assessor readme_structure \\ + --tasks adaptive-rejection-sampler \\ + --tasks async-http-client \\ + --runs 5 + + \b + # List all supported assessors + agentready validate-assessor --list-assessors + """ + repo_root = Path.cwd() + + # Handle --list-assessors + if list_assessors: + toggler = AssessorStateToggler(repo_root=repo_root) + supported = toggler.list_supported_assessors() + + click.echo("Supported Assessors for Validation:") + click.echo(f"{'=' * 60}") + for assessor_id in supported: + click.echo(f" - {assessor_id}") + click.echo(f"\nTotal: {len(supported)} assessors") + click.echo("\nUsage: agentready validate-assessor --assessor {assessor_id}") + return + + # Validate that --assessor is provided if not listing + if not assessor: + click.echo( + "Error: Missing required option '--assessor' / '-a'.\n" + "Use --list-assessors to see available assessors.", + err=True, + ) + raise click.Abort() + + # Validate ANTHROPIC_API_KEY + api_key = os.environ.get("ANTHROPIC_API_KEY", "") + if not api_key: + click.echo( + "Error: ANTHROPIC_API_KEY environment variable not set.\n" + "Set it with: export ANTHROPIC_API_KEY=your-key-here", + err=True, + ) + raise click.Abort() + + # Use default Phase 1 tasks if not specified + if not tasks: + if smoketest: + # Smoketest: just 1 simple task for quick validation + tasks = ["adaptive-rejection-sampler"] + if verbose: + click.echo("Using smoketest mode (1 task, ~2 minutes total)\n") + else: + tasks = DEFAULT_PHASE1_TASKS + if verbose: + click.echo(f"Using default Phase 1 task subset ({len(tasks)} tasks)\n") + + # Convert model name to full identifier + model_id = f"anthropic/{model}" + + # Set output directory + if output_dir: + output_path = Path(output_dir) + else: + output_path = Path(".agentready") / "validations" / assessor + + 
try: + # Run A/B comparison + comparison = compare_assessor_impact( + assessor_id=assessor, + task_names=list(tasks), + repo_root=repo_root, + runs_per_task=runs, + output_dir=output_path, + model=model_id, + n_concurrent=concurrent, + verbose=verbose, + ) + + # Save JSON results + json_file = output_path / f"{assessor}.json" + with open(json_file, "w") as f: + json.dump( + { + "assessor_id": assessor, + "tasks": list(tasks), + "runs_per_task": runs, + "baseline": comparison.without_agent.to_dict(), + "treatment": comparison.with_agent.to_dict(), + "deltas": comparison.deltas, + "statistical_significance": comparison.statistical_significance, + }, + f, + indent=2, + ) + + # Generate Markdown report + md_file = output_path / f"{assessor}.md" + with open(md_file, "w") as f: + f.write(f"# Assessor Impact Validation: {assessor}\n\n") + f.write(f"**Date**: {comparison.created_at}\n") + f.write(f"**Tasks**: {len(tasks)}\n") + f.write(f"**Runs per Task**: {runs}\n\n") + + f.write("## Results Summary\n\n") + f.write( + "| Metric | Baseline (Assessor Fails) | Treatment (Assessor Passes) | Delta |\n" + ) + f.write( + "|--------|---------------------------|----------------------------|-------|\n" + ) + + delta = comparison.deltas["success_rate_delta"] + sign = "+" if delta >= 0 else "" + f.write( + f"| Success Rate | {comparison.without_agent.success_rate:.1f}% | " + f"{comparison.with_agent.success_rate:.1f}% | **{sign}{delta:.1f} pp** |\n" + ) + + duration_delta = comparison.deltas.get("avg_duration_delta_sec", 0) + sign = "+" if duration_delta >= 0 else "" + f.write( + f"| Avg Duration | {comparison.without_agent.avg_duration_sec:.1f}s | " + f"{comparison.with_agent.avg_duration_sec:.1f}s | {sign}{duration_delta:.1f}s |\n" + ) + + f.write("\n## Statistical Significance\n\n") + is_sig = comparison.statistical_significance.get( + "success_rate_significant", False + ) + p_val = comparison.statistical_significance.get("success_rate_p_value") + f.write(f"- **Significant**: {'YES ✓' if is_sig else 'NO'}\n") + if p_val is not None: + f.write(f"- **P-value**: {p_val:.4f}\n") + + f.write("\n## Files\n\n") + f.write(f"- JSON: `{json_file}`\n") + f.write(f"- Markdown: `{md_file}`\n") + + click.echo("\nResults saved:") + click.echo(f" - JSON: {json_file}") + click.echo(f" - Markdown: {md_file}\n") + + except ValueError as e: + click.echo(f"\nError: {e}\n", err=True) + click.echo("Use --list-assessors to see supported assessors.", err=True) + raise click.Abort() + except Exception as e: + click.echo(f"\nValidation failed: {e}\n", err=True) + if verbose: + import traceback + + traceback.print_exc() + raise click.Abort() diff --git a/src/agentready/cli/main.py b/src/agentready/cli/main.py index 64c99937..73f888e7 100644 --- a/src/agentready/cli/main.py +++ b/src/agentready/cli/main.py @@ -29,7 +29,7 @@ # Lightweight commands - imported immediately from .align import align -from .benchmark import benchmark +from .benchmark import benchmark, validate_assessor from .bootstrap import bootstrap from .demo import demo from .repomix import repomix_generate @@ -544,6 +544,7 @@ def generate_config(): # Register lightweight commands (heavy commands loaded lazily via LazyGroup) cli.add_command(align) cli.add_command(benchmark) +cli.add_command(validate_assessor) cli.add_command(bootstrap) cli.add_command(demo) cli.add_command(migrate_report) diff --git a/src/agentready/services/harbor/agent_toggler.py b/src/agentready/services/harbor/agent_toggler.py index 6653c82f..16151109 100644 --- 
a/src/agentready/services/harbor/agent_toggler.py +++ b/src/agentready/services/harbor/agent_toggler.py @@ -1,9 +1,9 @@ -"""Service for safely enabling/disabling the doubleagent.md file.""" +"""Service for safely enabling/disabling agent files and manipulating repository state.""" import shutil from contextlib import contextmanager from pathlib import Path -from typing import Generator +from typing import Callable, Dict, Generator, Tuple class AgentFileToggler: @@ -89,3 +89,167 @@ def temporarily_enabled(self) -> Generator[None, None, None]: finally: if was_disabled: self.disable() + + +class AssessorStateToggler: + """Manipulate repository state to force assessor pass/fail for A/B testing. + + This class safely modifies repository files to simulate scenarios where + specific assessors would pass or fail, enabling empirical validation of + assessor impacts on agent performance. + + Example: + toggler = AssessorStateToggler() + + # Force CLAUDE.md assessor to fail + toggler.force_fail("claude_md_file") + # Run benchmark + toggler.restore("claude_md_file") + """ + + # Mapping of assessor IDs to (fail_action, restore_action) tuples + # Each action is a callable that takes the repository root Path + MANIPULATIONS: Dict[str, Tuple[Callable[[Path], None], Callable[[Path], None]]] = {} + + @classmethod + def register_manipulation( + cls, + assessor_id: str, + fail_action: Callable[[Path], None], + restore_action: Callable[[Path], None], + ) -> None: + """Register a manipulation strategy for an assessor. + + Args: + assessor_id: Unique assessor identifier (e.g., "claude_md_file") + fail_action: Function to force assessor to fail state + restore_action: Function to restore assessor to pass state + """ + cls.MANIPULATIONS[assessor_id] = (fail_action, restore_action) + + def __init__(self, repo_root: Path = None): + """Initialize toggler with repository root. 
+ + Args: + repo_root: Root of the repository (default: current directory) + """ + self.repo_root = repo_root or Path.cwd() + self._backup_suffix = ".assessor_backup" + self._initialize_default_manipulations() + + def _initialize_default_manipulations(self) -> None: + """Register default manipulation strategies for Phase 1 assessors.""" + + # CLAUDE.md Assessor - Tier 1, 10% weight + def fail_claude_md(repo_root: Path) -> None: + claude_md = repo_root / ".claude" / "CLAUDE.md" + backup = repo_root / ".claude" / f"CLAUDE.md{self._backup_suffix}" + if claude_md.exists() and not backup.exists(): + shutil.move(str(claude_md), str(backup)) + + def restore_claude_md(repo_root: Path) -> None: + claude_md = repo_root / ".claude" / "CLAUDE.md" + backup = repo_root / ".claude" / f"CLAUDE.md{self._backup_suffix}" + if backup.exists() and not claude_md.exists(): + shutil.move(str(backup), str(claude_md)) + + self.register_manipulation("claude_md_file", fail_claude_md, restore_claude_md) + + # README Assessor - Tier 1, 10% weight + def fail_readme(repo_root: Path) -> None: + readme = repo_root / "README.md" + backup = repo_root / f"README.md{self._backup_suffix}" + if readme.exists() and not backup.exists(): + shutil.move(str(readme), str(backup)) + + def restore_readme(repo_root: Path) -> None: + readme = repo_root / "README.md" + backup = repo_root / f"README.md{self._backup_suffix}" + if backup.exists() and not readme.exists(): + shutil.move(str(backup), str(readme)) + + self.register_manipulation("readme_structure", fail_readme, restore_readme) + + # Test Coverage Assessor - Tier 2, 3% weight + def fail_tests(repo_root: Path) -> None: + tests_dir = repo_root / "tests" + backup_dir = repo_root / f"tests{self._backup_suffix}" + if tests_dir.exists() and not backup_dir.exists(): + shutil.move(str(tests_dir), str(backup_dir)) + + def restore_tests(repo_root: Path) -> None: + tests_dir = repo_root / "tests" + backup_dir = repo_root / f"tests{self._backup_suffix}" + if backup_dir.exists() and not tests_dir.exists(): + shutil.move(str(backup_dir), str(tests_dir)) + + self.register_manipulation("test_coverage", fail_tests, restore_tests) + + def force_fail(self, assessor_id: str) -> None: + """Force assessor to fail by manipulating repository state. + + Args: + assessor_id: Assessor identifier (e.g., "claude_md_file") + + Raises: + ValueError: If assessor_id is not recognized + """ + if assessor_id not in self.MANIPULATIONS: + raise ValueError( + f"Unknown assessor: {assessor_id}. " + f"Available: {', '.join(self.MANIPULATIONS.keys())}" + ) + + fail_action, _ = self.MANIPULATIONS[assessor_id] + fail_action(self.repo_root) + + def restore(self, assessor_id: str) -> None: + """Restore repository to state where assessor passes. + + Args: + assessor_id: Assessor identifier (e.g., "claude_md_file") + + Raises: + ValueError: If assessor_id is not recognized + """ + if assessor_id not in self.MANIPULATIONS: + raise ValueError( + f"Unknown assessor: {assessor_id}. " + f"Available: {', '.join(self.MANIPULATIONS.keys())}" + ) + + _, restore_action = self.MANIPULATIONS[assessor_id] + restore_action(self.repo_root) + + def list_supported_assessors(self) -> list[str]: + """Get list of assessor IDs with registered manipulations. + + Returns: + List of assessor IDs that can be toggled + """ + return list(self.MANIPULATIONS.keys()) + + @contextmanager + def temporarily_failed(self, assessor_id: str) -> Generator[None, None, None]: + """Context manager for safe fail/restore of assessor state. 
+ + Ensures repository is restored even if exception occurs during testing. + + Args: + assessor_id: Assessor identifier + + Example: + toggler = AssessorStateToggler() + with toggler.temporarily_failed("claude_md_file"): + # CLAUDE.md is now missing (assessor fails) + run_benchmark() + # CLAUDE.md is automatically restored here + + Yields: + None + """ + try: + self.force_fail(assessor_id) + yield + finally: + self.restore(assessor_id) diff --git a/src/agentready/services/harbor/comparer.py b/src/agentready/services/harbor/comparer.py index da0a27c9..df3b4461 100644 --- a/src/agentready/services/harbor/comparer.py +++ b/src/agentready/services/harbor/comparer.py @@ -1,8 +1,12 @@ """Service for comparing Harbor benchmark runs and calculating statistical significance.""" +from pathlib import Path from typing import List, Optional from agentready.models.harbor import HarborComparison, HarborRunMetrics +from agentready.services.harbor.agent_toggler import AssessorStateToggler +from agentready.services.harbor.result_parser import parse_harbor_results +from agentready.services.harbor.runner import HarborRunner def compare_runs( @@ -186,3 +190,160 @@ def interpret_effect_size(cohens_d: float) -> str: return "medium" else: return "large" + + +def compare_assessor_impact( + assessor_id: str, + task_names: List[str], + repo_root: Path, + runs_per_task: int = 3, + output_dir: Path = None, + model: str = "anthropic/claude-sonnet-4-5", + n_concurrent: int = 1, + verbose: bool = True, +) -> HarborComparison: + """A/B test assessor impact: baseline (assessor fails) vs treatment (assessor passes). + + This function orchestrates the complete A/B testing workflow: + 1. Force assessor to fail (manipulate repository state) + 2. Run Harbor benchmark (baseline) + 3. Restore repository to normal state (assessor passes) + 4. Run Harbor benchmark again (treatment) + 5. 
Compare results with statistical significance + + Args: + assessor_id: Assessor identifier (e.g., "claude_md_file") + task_names: List of Terminal-Bench task names to test + repo_root: Root of the repository to test + runs_per_task: Number of runs per task (default: 3, recommended: 5+) + output_dir: Directory to store results (default: .agentready/validations/{assessor_id}/) + model: Claude model to use (default: sonnet-4-5) + n_concurrent: Number of concurrent tasks to run in parallel (default: 1) + verbose: Print progress messages (default: True) + + Returns: + HarborComparison with baseline vs treatment metrics, deltas, and significance + + Raises: + ValueError: If assessor_id is not supported + HarborNotInstalledError: If Harbor CLI is not available + subprocess.CalledProcessError: If Harbor benchmark fails + + Example: + comparison = compare_assessor_impact( + assessor_id="claude_md_file", + task_names=["adaptive-rejection-sampler", "async-http-client"], + repo_root=Path("."), + runs_per_task=3, + verbose=True + ) + + print(f"Success rate delta: {comparison.deltas['success_rate_delta']:.1f}%") + print(f"Significant: {comparison.statistical_significance['success_rate_significant']}") + """ + # Default output directory + if output_dir is None: + output_dir = Path(".agentready") / "validations" / assessor_id + output_dir.mkdir(parents=True, exist_ok=True) + + # Initialize components + toggler = AssessorStateToggler(repo_root=repo_root) + runner = HarborRunner() + + if verbose: + print(f"\nAssessor Impact Validation: {assessor_id}") + print(f"{'=' * 60}") + print(f"Tasks: {', '.join(task_names)}") + print(f"Runs per task: {runs_per_task}") + print( + f"Total trials: {len(task_names) * runs_per_task * 2} (baseline + treatment)" + ) + print(f"Output: {output_dir}\n") + + # Baseline: Assessor fails (remove CLAUDE.md, README, etc.) 
+ if verbose: + print("[1/2] Running baseline (assessor FAILS)...") + print(f" Manipulating repository state to force {assessor_id} to fail") + + baseline_output = output_dir / "baseline" + with toggler.temporarily_failed(assessor_id): + if verbose: + print( + f" Running {len(task_names)} tasks × {runs_per_task} runs = {len(task_names) * runs_per_task} trials" + ) + + baseline_results_dir = runner.run_benchmark( + task_names=task_names, + output_dir=baseline_output, + model=model, + n_concurrent=n_concurrent, + verbose=verbose, + ) + + # Parse results + baseline_task_results = parse_harbor_results(baseline_results_dir) + baseline_metrics = HarborRunMetrics.from_task_results( + run_id=f"{assessor_id}_baseline", + agent_file_enabled=False, # Assessor fails + task_results=baseline_task_results, + ) + + if verbose: + print( + f" Baseline complete: {baseline_metrics.success_rate:.1f}% success rate\n" + ) + + # Treatment: Assessor passes (normal repository state) + if verbose: + print("[2/2] Running treatment (assessor PASSES)...") + print(" Repository restored to normal state") + + treatment_output = output_dir / "treatment" + if verbose: + print( + f" Running {len(task_names)} tasks × {runs_per_task} runs = {len(task_names) * runs_per_task} trials" + ) + + treatment_results_dir = runner.run_benchmark( + task_names=task_names, + output_dir=treatment_output, + model=model, + n_concurrent=n_concurrent, + verbose=verbose, + ) + + # Parse results + treatment_task_results = parse_harbor_results(treatment_results_dir) + treatment_metrics = HarborRunMetrics.from_task_results( + run_id=f"{assessor_id}_treatment", + agent_file_enabled=True, # Assessor passes + task_results=treatment_task_results, + ) + + if verbose: + print( + f" Treatment complete: {treatment_metrics.success_rate:.1f}% success rate\n" + ) + + # Compare results + comparison = compare_runs( + without_agent=baseline_metrics, with_agent=treatment_metrics + ) + + if verbose: + print(f"{'=' * 60}") + print("Results Summary") + print(f"{'=' * 60}") + delta = comparison.deltas["success_rate_delta"] + sign = "+" if delta >= 0 else "" + print(f"Success Rate Delta: {sign}{delta:.1f} percentage points") + print( + f"Statistical Significance: {'YES' if comparison.statistical_significance.get('success_rate_significant', False) else 'NO'}" + ) + if comparison.statistical_significance.get("success_rate_p_value"): + print( + f"P-value: {comparison.statistical_significance['success_rate_p_value']:.4f}" + ) + print(f"\nResults saved to: {output_dir}\n") + + return comparison diff --git a/src/agentready/services/harbor/result_parser.py b/src/agentready/services/harbor/result_parser.py index 684fd082..c8b1c01c 100644 --- a/src/agentready/services/harbor/result_parser.py +++ b/src/agentready/services/harbor/result_parser.py @@ -23,8 +23,17 @@ def parse_harbor_results(results_dir: Path) -> List[HarborTaskResult]: if not results_dir.exists(): raise FileNotFoundError(f"Results directory not found: {results_dir}") - # Find all result.json files in subdirectories - result_files = list(results_dir.glob("*/result.json")) + # Find all result.json files in subdirectories (task directories only, not job-level result.json) + # Task directories are named like "task-name__hash/" while the job result.json is at the root + all_result_files = list(results_dir.glob("*/result.json")) + + # Filter to only task result files (exclude job-level result.json) + # Task directories contain "__" in their name (e.g., "build-pmars__abc123") + result_files = [ + f + for f in 
all_result_files + if "__" in f.parent.name # Task directories have "__" separator + ] if not result_files: raise ValueError(f"No result.json files found in {results_dir}") diff --git a/src/agentready/services/harbor/runner.py b/src/agentready/services/harbor/runner.py index 6dbd7d76..d9dca95d 100644 --- a/src/agentready/services/harbor/runner.py +++ b/src/agentready/services/harbor/runner.py @@ -1,7 +1,8 @@ """Service for executing Harbor benchmarks via CLI.""" -import json +import inspect import subprocess +import warnings from pathlib import Path from typing import List @@ -12,12 +13,19 @@ class HarborNotInstalledError(Exception): pass +class HarborTaskFilteringBugWarning(UserWarning): + """Raised when Harbor has the task filtering bug.""" + + pass + + class HarborRunner: """Execute Harbor benchmarks via subprocess and capture results.""" def __init__(self): """Initialize Harbor runner and verify installation.""" self._verify_harbor_installed() + self._check_harbor_task_filtering() def _verify_harbor_installed(self) -> None: """Verify Harbor CLI is installed and accessible. @@ -27,7 +35,7 @@ def _verify_harbor_installed(self) -> None: """ try: subprocess.run( - ["harbor", "--version"], + ["harbor", "--help"], capture_output=True, text=True, check=True, @@ -42,6 +50,58 @@ def _verify_harbor_installed(self) -> None: except subprocess.CalledProcessError as e: raise HarborNotInstalledError(f"Harbor CLI error: {e.stderr}") + def _check_harbor_task_filtering(self) -> None: + """Check if Harbor has the task filtering bug fix. + + Warns if Harbor version has the known task filtering bug where -t flags are ignored. + + The bug was fixed in Harbor commit f9e6d2e (Dec 12, 2025) but not yet released. + PyPI version 0.1.23 (Dec 11, 2025) has the bug. + + See: https://github.com/laude-institute/harbor/commit/f9e6d2e10c72d33373012294c36fd4938c45c26c + """ + try: + from harbor.models.job.config import BaseDatasetConfig + + # Check if the fix is present by inspecting source code + source = inspect.getsource(BaseDatasetConfig._filter_task_ids) + + # The fix changed task_id.path.name to task_id.get_name() + # If we see path.name (the bug), warn the user + if "task_id.path.name" in source or ".path.name" in source: + warnings.warn( + "\n" + "⚠️ WARNING: Harbor has a task filtering bug!\n" + "\n" + "Your Harbor version has a bug where -t/--task-name flags are ignored.\n" + "This causes smoketests to run ALL tasks instead of the filtered subset.\n" + "\n" + "The bug was fixed in Harbor main (Dec 12, 2025) but not yet released.\n" + "Latest PyPI version 0.1.23 (Dec 11) still has the bug.\n" + "\n" + "FIX OPTIONS:\n" + "\n" + "Option 1 (recommended): Install Harbor from main\n" + " pip uninstall harbor\n" + " pip install git+https://github.com/laude-institute/harbor.git\n" + "\n" + "Option 2: Apply patch to your local Harbor installation\n" + " See: patches/harbor-task-filtering-fix.patch in AgentReady repo\n" + "\n" + "Commit: https://github.com/laude-institute/harbor/commit/f9e6d2e\n", + HarborTaskFilteringBugWarning, + stacklevel=2, + ) + # If we see get_name() (the fix), all good - no warning + + except ImportError: + # Harbor not importable as Python package (only CLI installed) + # Can't check for bug, but also can't use task filtering anyway + pass + except Exception: + # Don't fail if we can't check - just skip the warning + pass + def run_benchmark( self, task_names: List[str], @@ -89,25 +149,11 @@ def run_benchmark( str(n_concurrent), ] - # Add task selection if specified - if task_names: - # Harbor 
uses config JSON for task selection - config_file = output_dir / "config.json" - config = { - "datasets": [ - { - "name": dataset, - "version": dataset_version, - "task_names": task_names, - } - ], - "agents": [{"name": agent, "model_name": model}], - } - with open(config_file, "w") as f: - json.dump(config, f, indent=2) - - # Use config file instead of CLI args - cmd = ["harbor", "run", "-c", str(config_file)] + # Add task name filters + # NOTE: Requires Harbor >= 0.1.24 (or install from main) + # Task filtering was fixed in commit f9e6d2e (Dec 12, 2025) + for task_name in task_names: + cmd.extend(["-t", task_name]) # Execute Harbor benchmark if verbose: diff --git a/tests/unit/services/harbor/__init__.py b/tests/unit/services/harbor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/services/harbor/test_assessor_state_toggler.py b/tests/unit/services/harbor/test_assessor_state_toggler.py new file mode 100644 index 00000000..5d227b6e --- /dev/null +++ b/tests/unit/services/harbor/test_assessor_state_toggler.py @@ -0,0 +1,240 @@ +"""Unit tests for AssessorStateToggler.""" + +import pytest + +from agentready.services.harbor.agent_toggler import AssessorStateToggler + + +@pytest.fixture +def temp_repo(tmp_path): + """Create a temporary repository structure for testing.""" + repo_root = tmp_path / "test_repo" + repo_root.mkdir() + + # Create .claude directory and CLAUDE.md + claude_dir = repo_root / ".claude" + claude_dir.mkdir() + claude_md = claude_dir / "CLAUDE.md" + claude_md.write_text("# Test Project\n\nThis is a test CLAUDE.md file.") + + # Create README.md + readme = repo_root / "README.md" + readme.write_text("# Test Repo\n\n## Installation\n\n## Usage") + + # Create tests directory + tests_dir = repo_root / "tests" + tests_dir.mkdir() + (tests_dir / "test_example.py").write_text("def test_example():\n assert True") + + return repo_root + + +class TestAssessorStateToggler: + """Test AssessorStateToggler functionality.""" + + def test_initialization(self, temp_repo): + """Test toggler initializes with correct repo root.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + assert toggler.repo_root == temp_repo + assert toggler._backup_suffix == ".assessor_backup" + + def test_list_supported_assessors(self, temp_repo): + """Test listing supported assessors returns expected IDs.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + supported = toggler.list_supported_assessors() + + assert "claude_md_file" in supported + assert "readme_structure" in supported + assert "test_coverage" in supported + assert len(supported) == 3 + + def test_force_fail_claude_md(self, temp_repo): + """Test forcing CLAUDE.md assessor to fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup" + + # Verify file exists before fail + assert claude_md.exists() + assert not backup.exists() + + # Force fail + toggler.force_fail("claude_md_file") + + # Verify file is backed up + assert not claude_md.exists() + assert backup.exists() + + def test_restore_claude_md(self, temp_repo): + """Test restoring CLAUDE.md after forcing fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + # Force fail first + toggler.force_fail("claude_md_file") + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup" + + # Verify backup state + assert not claude_md.exists() + assert backup.exists() + + # Restore + 
toggler.restore("claude_md_file") + + # Verify restored + assert claude_md.exists() + assert not backup.exists() + + def test_force_fail_readme(self, temp_repo): + """Test forcing README assessor to fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + readme = temp_repo / "README.md" + backup = temp_repo / "README.md.assessor_backup" + + # Force fail + toggler.force_fail("readme_structure") + + # Verify + assert not readme.exists() + assert backup.exists() + + def test_restore_readme(self, temp_repo): + """Test restoring README after forcing fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + # Force fail first + toggler.force_fail("readme_structure") + + # Restore + toggler.restore("readme_structure") + + readme = temp_repo / "README.md" + backup = temp_repo / "README.md.assessor_backup" + + # Verify + assert readme.exists() + assert not backup.exists() + + def test_force_fail_tests(self, temp_repo): + """Test forcing test coverage assessor to fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + tests_dir = temp_repo / "tests" + backup_dir = temp_repo / "tests.assessor_backup" + + # Force fail + toggler.force_fail("test_coverage") + + # Verify + assert not tests_dir.exists() + assert backup_dir.exists() + + def test_restore_tests(self, temp_repo): + """Test restoring tests directory after forcing fail.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + # Force fail first + toggler.force_fail("test_coverage") + + # Restore + toggler.restore("test_coverage") + + tests_dir = temp_repo / "tests" + backup_dir = temp_repo / "tests.assessor_backup" + + # Verify + assert tests_dir.exists() + assert not backup_dir.exists() + + def test_temporarily_failed_context_manager(self, temp_repo): + """Test temporarily_failed context manager restores state.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + + # Verify initial state + assert claude_md.exists() + + # Use context manager + with toggler.temporarily_failed("claude_md_file"): + # Inside context: file should be missing + assert not claude_md.exists() + + # Outside context: file should be restored + assert claude_md.exists() + + def test_temporarily_failed_exception_still_restores(self, temp_repo): + """Test temporarily_failed restores even when exception occurs.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + + # Verify initial state + assert claude_md.exists() + + # Use context manager with exception + with pytest.raises(RuntimeError): + with toggler.temporarily_failed("claude_md_file"): + assert not claude_md.exists() + raise RuntimeError("Test exception") + + # File should still be restored + assert claude_md.exists() + + def test_unknown_assessor_raises_error(self, temp_repo): + """Test that unknown assessor ID raises ValueError.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + with pytest.raises(ValueError, match="Unknown assessor: nonexistent"): + toggler.force_fail("nonexistent") + + with pytest.raises(ValueError, match="Unknown assessor: nonexistent"): + toggler.restore("nonexistent") + + def test_idempotent_force_fail(self, temp_repo): + """Test that forcing fail multiple times is idempotent.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + # Force fail twice + toggler.force_fail("claude_md_file") + toggler.force_fail("claude_md_file") # Should not error + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + backup = temp_repo / ".claude" / 
"CLAUDE.md.assessor_backup" + + # Still in failed state + assert not claude_md.exists() + assert backup.exists() + + def test_idempotent_restore(self, temp_repo): + """Test that restoring multiple times is idempotent.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + # Force fail, then restore twice + toggler.force_fail("claude_md_file") + toggler.restore("claude_md_file") + toggler.restore("claude_md_file") # Should not error + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup" + + # Still in normal state + assert claude_md.exists() + assert not backup.exists() + + def test_content_preservation(self, temp_repo): + """Test that file content is preserved through fail/restore cycle.""" + toggler = AssessorStateToggler(repo_root=temp_repo) + + claude_md = temp_repo / ".claude" / "CLAUDE.md" + original_content = claude_md.read_text() + + # Fail and restore + toggler.force_fail("claude_md_file") + toggler.restore("claude_md_file") + + # Verify content is identical + assert claude_md.read_text() == original_content