From 93d1d16bcac62a6838b3b940f424cf6d514c084f Mon Sep 17 00:00:00 2001 From: DJ Date: Sat, 4 Apr 2026 16:41:00 -0700 Subject: [PATCH 1/4] feat: two-stage evaluation pipeline with prefilter Adds a lightweight Stage 1 pre-filter to BaseEvaluator that classifies interactions as rule-bearing or not before running full extraction. Non-rule interactions with confidence > 0.8 skip Stage 2, targeting 50%+ reduction in LLM calls. Key design decisions: - Prefilter is implemented in BaseEvaluator, so all evaluators get it - _parse_bool() fixes the bool("false")==True bug from PR #20 - Fail-open: prefilter errors pass through to full eval - --skip-prefilter CLI flag to disable Stage 1 - Dashboard shows prefilter skip rate metrics Closes #14 Co-Authored-By: Claude Opus 4.6 (1M context) --- context_scribe/evaluator/__init__.py | 4 +- context_scribe/evaluator/base_evaluator.py | 107 ++++++-- .../evaluator/prefilter_template.md | 24 ++ context_scribe/main.py | 19 +- context_scribe/models/evaluator_models.py | 40 ++- tests/test_prefilter.py | 237 ++++++++++++++++++ 6 files changed, 401 insertions(+), 30 deletions(-) create mode 100644 context_scribe/evaluator/prefilter_template.md create mode 100644 tests/test_prefilter.py diff --git a/context_scribe/evaluator/__init__.py b/context_scribe/evaluator/__init__.py index 7685c4c..969995f 100644 --- a/context_scribe/evaluator/__init__.py +++ b/context_scribe/evaluator/__init__.py @@ -13,7 +13,7 @@ } -def get_evaluator(name: str) -> BaseEvaluator: +def get_evaluator(name: str, **kwargs) -> BaseEvaluator: """Return an evaluator instance by name. Raises ValueError for unknown names.""" cls = EVALUATOR_REGISTRY.get(name) if cls is None: @@ -21,7 +21,7 @@ def get_evaluator(name: str) -> BaseEvaluator: f"Unknown evaluator '{name}'. " f"Available: {', '.join(sorted(EVALUATOR_REGISTRY))}" ) - return cls() + return cls(**kwargs) __all__ = [ diff --git a/context_scribe/evaluator/base_evaluator.py b/context_scribe/evaluator/base_evaluator.py index 5299755..c1c38ea 100644 --- a/context_scribe/evaluator/base_evaluator.py +++ b/context_scribe/evaluator/base_evaluator.py @@ -7,26 +7,95 @@ from typing import Optional from context_scribe.models.interaction import Interaction -from context_scribe.models.evaluator_models import RuleOutput, INTERNAL_SIGNATURE +from context_scribe.models.evaluator_models import ( + RuleOutput, INTERNAL_SIGNATURE, PrefilterResult, PrefilterMetrics, +) logger = logging.getLogger(__name__) +_PREFILTER_TEMPLATE_PATH = Path(__file__).parent / "prefilter_template.md" + + +def _parse_bool(value) -> bool: + """Safely parse a boolean that may arrive as a string from LLM JSON.""" + if isinstance(value, bool): + return value + if isinstance(value, str): + return value.strip().lower() in ("true", "1", "yes") + return bool(value) + + class BaseEvaluator(ABC): - def __init__(self): - # Load the prompt template + def __init__(self, skip_prefilter: bool = False): + self.skip_prefilter = skip_prefilter + self.metrics = PrefilterMetrics() + # Load the prompt templates template_path = Path(__file__).parent / "prompt_template.md" with open(template_path, "r", encoding="utf-8") as f: self.prompt_template = f.read() + with open(_PREFILTER_TEMPLATE_PATH, "r", encoding="utf-8") as f: + self._prefilter_template = f.read() @abstractmethod def _execute_cli(self, prompt: str) -> str: """Executes the specific CLI tool and returns the raw stdout. - + Should raise subprocess.TimeoutExpired if the execution takes too long. 
""" pass + def _pre_evaluate(self, interaction: Interaction) -> Optional[PrefilterResult]: + """Stage 1: Lightweight check to filter non-rule interactions.""" + prompt = self._prefilter_template.format( + internal_signature=INTERNAL_SIGNATURE, + content=interaction.content, + ) + try: + output = self._execute_cli(prompt) + + # Extract response text from JSON wrapper if present + response_text = output + try: + data = json.loads(output) + if isinstance(data, dict): + response_text = data.get("result", data.get("response", output)) + except json.JSONDecodeError: + pass + + response_text = re.sub(r'```(?:json)?\s*', '', str(response_text)).strip() + + # Parse the prefilter JSON response + json_match = re.search(r'\{[^}]*"contains_rule"[^}]*\}', response_text) + if json_match: + pf_data = json.loads(json_match.group(0)) + return PrefilterResult( + contains_rule=_parse_bool(pf_data.get("contains_rule", True)), + confidence=float(pf_data.get("confidence", 0.0)), + ) + + logger.warning("Could not parse prefilter response, passing through to full eval") + return None + + except subprocess.TimeoutExpired: + logger.warning("Prefilter timed out, passing through to full eval") + return None + except Exception as e: + logger.warning("Prefilter error: %s, passing through to full eval", e) + return None + def evaluate_interaction(self, interaction: Interaction, existing_global: str = "", existing_project: str = "") -> Optional[RuleOutput]: + # Stage 1: Pre-filter + if not self.skip_prefilter: + prefilter_result = self._pre_evaluate(interaction) + self.metrics.record_result(prefilter_result) + if prefilter_result and prefilter_result.should_skip_full_eval: + logger.info( + "Prefilter: skipping full eval for %s (confidence=%.2f)", + interaction.project_name, prefilter_result.confidence, + ) + return None + + # Stage 2: Full extraction prompt = self.prompt_template.format( internal_signature=INTERNAL_SIGNATURE, project_name=interaction.project_name, @@ -37,36 +106,29 @@ def evaluate_interaction(self, interaction: Interaction, existing_global: str = try: output = self._execute_cli(prompt) - + # Extract response text response_text = output try: data = json.loads(output) if isinstance(data, dict): - # Handle both gemini ("response") and claude ("result"/"response") formats response_text = data.get("result", data.get("response", output)) except json.JSONDecodeError: pass - # Strip markdown code fences if present (Claude often wraps JSON in ```json ... ```) + # Strip markdown code fences if present response_text = re.sub(r'```(?:json)?\s*', '', str(response_text)).strip() - # Robust JSON extraction: look for substrings that start with { and end with } - # and contain both "scope" and "rules" + # Robust JSON extraction best_rule_data = None - - # Find all { and } positions start_indices = [i for i, char in enumerate(response_text) if char == '{'] end_indices = [i for i, char in enumerate(response_text) if char == '}'] - - # Try progressively smaller substrings starting from the first { and ending at the last } - # until we find a valid JSON object that has our keys. 
+ for start in start_indices: for end in reversed(end_indices): if end > start: try: candidate = response_text[start:end+1] - # Quick check to avoid expensive json.loads on non-candidates if '"scope"' in candidate and '"rules"' in candidate: data = json.loads(candidate) if isinstance(data, dict) and "scope" in data and "rules" in data: @@ -81,16 +143,16 @@ def evaluate_interaction(self, interaction: Interaction, existing_global: str = try: rules_raw = best_rule_data["rules"] desc = best_rule_data.get("description", "Updated rules") - + if isinstance(rules_raw, list): rules_content = "\n".join([str(r) for r in rules_raw]).strip() else: rules_content = str(rules_raw).strip() - + if len(rules_content) > 0: return RuleOutput( - content=rules_content, - scope=str(best_rule_data["scope"]).upper(), + content=rules_content, + scope=str(best_rule_data["scope"]).upper(), description=str(desc) ) except Exception as e: @@ -98,18 +160,17 @@ def evaluate_interaction(self, interaction: Interaction, existing_global: str = if "NO_RULE" in str(response_text): return None - - # Fallback for non-JSON responses (robustness) + + # Fallback for non-JSON responses text_upper = str(response_text).upper() if "PROJECT" in text_upper or "GLOBAL" in text_upper: scope = "PROJECT" if "PROJECT" in text_upper else "GLOBAL" - # Try to find some content if rules are just listed content = str(response_text) return RuleOutput(content=content, scope=scope, description="Extracted via fallback") logger.error(f"Failed to parse rule extraction for {interaction.project_name}") return None - + except subprocess.TimeoutExpired: logger.error(f"Evaluation timed out for {interaction.project_name}") return None diff --git a/context_scribe/evaluator/prefilter_template.md b/context_scribe/evaluator/prefilter_template.md new file mode 100644 index 0000000..d23473b --- /dev/null +++ b/context_scribe/evaluator/prefilter_template.md @@ -0,0 +1,24 @@ +{internal_signature} +You are a lightweight classifier. Your ONLY job is to determine whether the following +user-agent interaction contains a NEW persistent preference, project constraint, or +behavioral rule that should be remembered long-term. + +Examples of rule-bearing interactions: +- "Always use tabs instead of spaces" +- "For this project, use PostgreSQL not MySQL" +- "Never use semicolons in TypeScript" + +Examples of NON-rule interactions: +- "Can you help me fix this bug?" 
+- "Explain how async/await works" +- "Generate a function that sorts a list" + +INTERACTION: +''' +{content} +''' + +Respond with ONLY a JSON object: +{{"contains_rule": true, "confidence": 0.95}} +or +{{"contains_rule": false, "confidence": 0.90}} diff --git a/context_scribe/main.py b/context_scribe/main.py index 801c829..1399fa2 100644 --- a/context_scribe/main.py +++ b/context_scribe/main.py @@ -40,6 +40,8 @@ def __init__(self, tool: str, bank_path: str): self.last_event_time = "N/A" self.update_count = 0 self.history = [] # List of (time, file_path, description) tuples + self.prefilter_passed = 0 + self.prefilter_skipped = 0 def add_history(self, file_path: str, description: str): self.update_count += 1 @@ -99,9 +101,13 @@ def generate_layout(self) -> Layout: # Footer stats = Table.grid(expand=True) stats.add_column(justify="left") + stats.add_column(justify="center") stats.add_column(justify="right") + total_processed = self.prefilter_passed + self.prefilter_skipped + skip_rate = (self.prefilter_skipped / total_processed * 100) if total_processed > 0 else 0 stats.add_row( Text(f" System: Active", style="green"), + Text(f"Prefilter: {self.prefilter_skipped} skipped / {total_processed} total ({skip_rate:.0f}%)", style="dim"), Text(f"Total Rules Extracted: {self.update_count} ", style="bold green") ) layout["footer"].update(Panel(stats, border_style="dim")) @@ -205,7 +211,7 @@ def _status(msg: str, db, live, debug: bool): live.update(db.generate_layout()) -async def run_daemon(tool: str, bank_path: str, debug: bool = False, evaluator_name: str = "auto") -> bool: +async def run_daemon(tool: str, bank_path: str, debug: bool = False, evaluator_name: str = "auto", skip_prefilter: bool = False) -> bool: if tool == "gemini-cli": bootstrap_global_config() provider = GeminiCliProvider() @@ -221,7 +227,7 @@ async def run_daemon(tool: str, bank_path: str, debug: bool = False, evaluator_n if evaluator_name == "auto": evaluator_name = _detect_evaluator(tool) - evaluator = get_evaluator(evaluator_name) + evaluator = get_evaluator(evaluator_name, skip_prefilter=skip_prefilter) mcp_client = MemoryBankClient(bank_path=bank_path) try: @@ -255,6 +261,10 @@ async def _loop(live=None): _status(f"🧠 Thinking: Extracting rules for {interaction.project_name}...", db, live, debug) rule_output = await loop.run_in_executor(None, evaluator.evaluate_interaction, interaction, existing_global, existing_project) + # Sync prefilter metrics to dashboard + db.prefilter_passed = evaluator.metrics.prefilter_passed + db.prefilter_skipped = evaluator.metrics.prefilter_skipped + if rule_output: dest_proj = "global" if rule_output.scope == "GLOBAL" else interaction.project_name dest_file = "global_rules.md" if rule_output.scope == "GLOBAL" else "rules.md" @@ -301,12 +311,13 @@ async def _loop(live=None): @click.option('--bank-path', default='~/.memory-bank', help='Path to your Memory Bank root') @click.option('--evaluator', 'evaluator_name', default='auto', type=click.Choice(['auto'] + sorted(EVALUATOR_REGISTRY)), help='Evaluator LLM to use (default: auto-detect)') @click.option('--debug', is_flag=True, default=False, help='Stream plain debug logs instead of dashboard UI') -def cli(tool, bank_path, evaluator_name, debug): +@click.option('--skip-prefilter', is_flag=True, default=False, help='Disable Stage 1 prefilter (send all interactions to full eval)') +def cli(tool, bank_path, evaluator_name, debug, skip_prefilter): """Context-Scribe: Persistent Secretary Daemon""" if debug: logging.basicConfig(level=logging.DEBUG, 
                            format='%(asctime)s [%(levelname)s] %(name)s: %(message)s')
     try:
-        asyncio.run(run_daemon(tool, bank_path, debug=debug, evaluator_name=evaluator_name))
+        asyncio.run(run_daemon(tool, bank_path, debug=debug, evaluator_name=evaluator_name, skip_prefilter=skip_prefilter))
     except KeyboardInterrupt:
         pass
diff --git a/context_scribe/models/evaluator_models.py b/context_scribe/models/evaluator_models.py
index ffc4da7..14979e1 100644
--- a/context_scribe/models/evaluator_models.py
+++ b/context_scribe/models/evaluator_models.py
@@ -1,10 +1,48 @@
 from dataclasses import dataclass
+from typing import Optional
 
 INTERNAL_SIGNATURE = "--- CONTEXT-SCRIBE-INTERNAL-EVALUATION ---"
 INTERNAL_SIGNATURE_UPPER = INTERNAL_SIGNATURE.upper()
 
+
 @dataclass
 class RuleOutput:
     content: str
     scope: str # "GLOBAL" or "PROJECT"
-    description: str # Concise summary of what changed
+    description: str # Concise summary of what changed
+
+
+@dataclass
+class PrefilterResult:
+    """Result of the lightweight pre-evaluation stage."""
+    contains_rule: bool
+    confidence: float
+
+    @property
+    def should_skip_full_eval(self) -> bool:
+        """Skip full evaluation if confident there's no rule."""
+        return not self.contains_rule and self.confidence > 0.8
+
+
+@dataclass
+class PrefilterMetrics:
+    """Tracks prefilter pipeline statistics."""
+    prefilter_passed: int = 0
+    prefilter_skipped: int = 0
+    prefilter_errors: int = 0
+
+    @property
+    def skip_rate(self) -> float:
+        total = self.prefilter_passed + self.prefilter_skipped
+        if total == 0:
+            return 0.0
+        return self.prefilter_skipped / total
+
+    def record_result(self, result: Optional["PrefilterResult"]) -> None:
+        if result is None:
+            self.prefilter_errors += 1
+            self.prefilter_passed += 1  # On error, pass through to full eval
+        elif result.should_skip_full_eval:
+            self.prefilter_skipped += 1
+        else:
+            self.prefilter_passed += 1
diff --git a/tests/test_prefilter.py b/tests/test_prefilter.py
new file mode 100644
index 0000000..8bf5f1e
--- /dev/null
+++ b/tests/test_prefilter.py
@@ -0,0 +1,237 @@
+import json
+from datetime import datetime
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+from context_scribe.models.evaluator_models import PrefilterResult, PrefilterMetrics
+from context_scribe.models.interaction import Interaction
+from context_scribe.evaluator.base_evaluator import _parse_bool
+
+
+def _make_interaction(content="Can you help me fix this bug?"):
+    return Interaction(
+        timestamp=datetime.now(),
+        role="user",
+        content=content,
+        project_name="test-project",
+        metadata={},
+    )
+
+
+# --- PrefilterResult tests ---
+
+def test_prefilter_result_should_skip_no_rule_high_confidence():
+    r = PrefilterResult(contains_rule=False, confidence=0.95)
+    assert r.should_skip_full_eval is True
+
+
+def test_prefilter_result_should_not_skip_no_rule_low_confidence():
+    r = PrefilterResult(contains_rule=False, confidence=0.5)
+    assert r.should_skip_full_eval is False
+
+
+def test_prefilter_result_should_not_skip_has_rule():
+    r = PrefilterResult(contains_rule=True, confidence=0.95)
+    assert r.should_skip_full_eval is False
+
+
+def test_prefilter_result_boundary_confidence():
+    r = PrefilterResult(contains_rule=False, confidence=0.8)
+    assert r.should_skip_full_eval is False  # must be > 0.8, not >=
+
+
+# --- PrefilterMetrics tests ---
+
+def test_metrics_record_skip():
+    m = PrefilterMetrics()
+    m.record_result(PrefilterResult(contains_rule=False, confidence=0.95))
+    assert m.prefilter_skipped == 1
+    assert m.prefilter_passed == 0
+    assert m.skip_rate == 1.0
+
+
+def test_metrics_record_pass():
+    m = PrefilterMetrics()
+    m.record_result(PrefilterResult(contains_rule=True, confidence=0.9))
+    assert m.prefilter_passed == 1
+    assert m.prefilter_skipped == 0
+    assert m.skip_rate == 0.0
+
+
+def test_metrics_record_error():
+    m = PrefilterMetrics()
+    m.record_result(None)
+    assert m.prefilter_errors == 1
+    assert m.prefilter_passed == 1  # errors pass through
+
+
+def test_metrics_skip_rate_empty():
+    m = PrefilterMetrics()
+    assert m.skip_rate == 0.0
+
+
+# --- _parse_bool tests ---
+
+def test_parse_bool_true_values():
+    assert _parse_bool(True) is True
+    assert _parse_bool("true") is True
+    assert _parse_bool("True") is True
+    assert _parse_bool("1") is True
+    assert _parse_bool("yes") is True
+
+
+def test_parse_bool_false_values():
+    assert _parse_bool(False) is False
+    assert _parse_bool("false") is False
+    assert _parse_bool("False") is False
+    assert _parse_bool("0") is False
+    assert _parse_bool("no") is False
+    assert _parse_bool("") is False
+
+
+# --- BaseEvaluator prefilter integration ---
+
+def test_prefilter_skips_non_rule_interaction():
+    """Evaluator skips full eval when prefilter says no rule with high confidence."""
+    from context_scribe.evaluator.gemini_cli_llm import GeminiCliEvaluator
+
+    with patch.object(GeminiCliEvaluator, '__init__', lambda self, **kw: None):
+        evaluator = GeminiCliEvaluator.__new__(GeminiCliEvaluator)
+        # Manually init the base class fields
+        from context_scribe.evaluator.base_evaluator import BaseEvaluator
+        from pathlib import Path
+        evaluator.skip_prefilter = False
+        evaluator.metrics = PrefilterMetrics()
+        template_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prompt_template.md"
+        with open(template_path) as f:
+            evaluator.prompt_template = f.read()
+        prefilter_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prefilter_template.md"
+        with open(prefilter_path) as f:
+            evaluator._prefilter_template = f.read()
+
+        prefilter_json = json.dumps({"contains_rule": False, "confidence": 0.95})
+        with patch.object(type(evaluator), '_execute_cli', return_value=prefilter_json):
+            result = evaluator.evaluate_interaction(_make_interaction(), "", "")
+
+        assert result is None
+        assert evaluator.metrics.prefilter_skipped == 1
+
+
+def test_prefilter_passes_rule_interaction():
+    """Evaluator runs full eval when prefilter detects a rule."""
+    from context_scribe.evaluator.gemini_cli_llm import GeminiCliEvaluator
+
+    with patch.object(GeminiCliEvaluator, '__init__', lambda self, **kw: None):
+        evaluator = GeminiCliEvaluator.__new__(GeminiCliEvaluator)
+        evaluator.skip_prefilter = False
+        evaluator.metrics = PrefilterMetrics()
+        from pathlib import Path
+        template_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prompt_template.md"
+        with open(template_path) as f:
+            evaluator.prompt_template = f.read()
+        prefilter_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prefilter_template.md"
+        with open(prefilter_path) as f:
+            evaluator._prefilter_template = f.read()
+
+        prefilter_json = json.dumps({"contains_rule": True, "confidence": 0.9})
+        full_eval_json = json.dumps({
+            "scope": "GLOBAL",
+            "description": "Tab preference",
+            "rules": ["- Use tabs for indentation"],
+        })
+
+        call_count = [0]
+        def mock_execute(prompt):
+            call_count[0] += 1
+            if call_count[0] == 1:
+                return prefilter_json
+            return full_eval_json
+
+        with patch.object(type(evaluator), '_execute_cli', side_effect=mock_execute):
+            result = evaluator.evaluate_interaction(
+                _make_interaction("Always use tabs"), "", ""
+            )
+
+        assert result is not None
+        assert result.scope == "GLOBAL"
+        assert evaluator.metrics.prefilter_passed == 1
+
+
+def test_skip_prefilter_flag_bypasses_stage1():
+    """With skip_prefilter=True, no prefilter call is made."""
+    from context_scribe.evaluator.gemini_cli_llm import GeminiCliEvaluator
+
+    with patch.object(GeminiCliEvaluator, '__init__', lambda self, **kw: None):
+        evaluator = GeminiCliEvaluator.__new__(GeminiCliEvaluator)
+        evaluator.skip_prefilter = True
+        evaluator.metrics = PrefilterMetrics()
+        from pathlib import Path
+        template_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prompt_template.md"
+        with open(template_path) as f:
+            evaluator.prompt_template = f.read()
+        prefilter_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prefilter_template.md"
+        with open(prefilter_path) as f:
+            evaluator._prefilter_template = f.read()
+
+        full_eval_json = json.dumps({
+            "scope": "GLOBAL",
+            "description": "Tab preference",
+            "rules": ["- Use tabs"],
+        })
+
+        with patch.object(type(evaluator), '_execute_cli', return_value=full_eval_json) as mock_cli:
+            result = evaluator.evaluate_interaction(
+                _make_interaction("Always use tabs"), "", ""
+            )
+
+        # Should only be called once (full eval), not twice (prefilter + full eval)
+        assert mock_cli.call_count == 1
+        assert result is not None
+
+
+def test_prefilter_error_passes_through():
+    """On prefilter error, pass through to full eval (fail-open)."""
+    from context_scribe.evaluator.gemini_cli_llm import GeminiCliEvaluator
+    import subprocess
+
+    with patch.object(GeminiCliEvaluator, '__init__', lambda self, **kw: None):
+        evaluator = GeminiCliEvaluator.__new__(GeminiCliEvaluator)
+        evaluator.skip_prefilter = False
+        evaluator.metrics = PrefilterMetrics()
+        from pathlib import Path
+        template_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prompt_template.md"
+        with open(template_path) as f:
+            evaluator.prompt_template = f.read()
+        prefilter_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prefilter_template.md"
+        with open(prefilter_path) as f:
+            evaluator._prefilter_template = f.read()
+
+        full_eval_json = json.dumps({
+            "scope": "GLOBAL",
+            "description": "Tab preference",
+            "rules": ["- Use tabs"],
+        })
+
+        call_count = [0]
+        def mock_execute(prompt):
+            call_count[0] += 1
+            if call_count[0] == 1:
+                raise subprocess.TimeoutExpired("gemini", 30)
+            return full_eval_json
+
+        with patch.object(type(evaluator), '_execute_cli', side_effect=mock_execute):
+            result = evaluator.evaluate_interaction(
+                _make_interaction("Always use tabs"), "", ""
+            )
+
+        assert result is not None
+        assert evaluator.metrics.prefilter_errors == 1
+        assert evaluator.metrics.prefilter_passed == 1  # error counts as pass-through
+
+
+def test_parse_bool_handles_string_false_from_llm():
+    """Regression: bool('false') == True, but _parse_bool('false') == False."""
+    assert _parse_bool("false") is False
+    # This was the original bug in PR #20
+    assert bool("false") is True  # Python's built-in behavior (the bug)

From adaa5af9901a7a88c2cb4172e33b287c7b906114 Mon Sep 17 00:00:00 2001
From: DJ
Date: Sat, 4 Apr 2026 16:44:52 -0700
Subject: [PATCH 2/4] fix: pass **kwargs through subclass evaluator constructors

All evaluator subclasses now accept and forward **kwargs to
BaseEvaluator.__init__(), allowing skip_prefilter and future params to
propagate correctly via get_evaluator().
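
A sketch of the propagation chain (illustrative only, not part of the
diff; the concrete registry name passed as name will vary):

    # factory -> subclass -> base class
    evaluator = get_evaluator(name, skip_prefilter=True)
    # e.g. GeminiCliEvaluator(skip_prefilter=True), whose
    # super().__init__(**kwargs) reaches BaseEvaluator.__init__,
    # so the flag lands on the instance:
    assert evaluator.skip_prefilter is True
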
Co-Authored-By: Claude Opus 4.6 (1M context) --- context_scribe/evaluator/claude_llm.py | 4 ++-- context_scribe/evaluator/copilot_llm.py | 4 ++-- context_scribe/evaluator/gemini_cli_llm.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/context_scribe/evaluator/claude_llm.py b/context_scribe/evaluator/claude_llm.py index a6fe20d..157a1a1 100644 --- a/context_scribe/evaluator/claude_llm.py +++ b/context_scribe/evaluator/claude_llm.py @@ -8,8 +8,8 @@ class ClaudeEvaluator(BaseEvaluator): """Evaluator that uses Claude Code CLI for headless rule extraction.""" - def __init__(self): - super().__init__() + def __init__(self, **kwargs): + super().__init__(**kwargs) try: subprocess.run(["claude", "--version"], capture_output=True, check=True) except (subprocess.CalledProcessError, FileNotFoundError): diff --git a/context_scribe/evaluator/copilot_llm.py b/context_scribe/evaluator/copilot_llm.py index 4cf613d..bd5f97a 100644 --- a/context_scribe/evaluator/copilot_llm.py +++ b/context_scribe/evaluator/copilot_llm.py @@ -11,8 +11,8 @@ class CopilotEvaluator(BaseEvaluator): """Evaluator that uses the GitHub Copilot CLI for headless rule extraction.""" - def __init__(self): - super().__init__() + def __init__(self, **kwargs): + super().__init__(**kwargs) self._cli_path = shutil.which("copilot") if not self._cli_path: logger.warning( diff --git a/context_scribe/evaluator/gemini_cli_llm.py b/context_scribe/evaluator/gemini_cli_llm.py index ec07d23..a5f3892 100644 --- a/context_scribe/evaluator/gemini_cli_llm.py +++ b/context_scribe/evaluator/gemini_cli_llm.py @@ -7,8 +7,8 @@ class GeminiCliEvaluator(BaseEvaluator): """Evaluator that uses Gemini CLI for headless rule extraction.""" - def __init__(self): - super().__init__() + def __init__(self, **kwargs): + super().__init__(**kwargs) try: subprocess.run(["gemini", "--version"], capture_output=True, check=True) except (subprocess.CalledProcessError, FileNotFoundError): From eada07ec871ab208ed4d75facc9509a83758fc88 Mon Sep 17 00:00:00 2001 From: DJ Date: Sun, 5 Apr 2026 10:28:37 -0700 Subject: [PATCH 3/4] fix: resolve test failures from prefilter integration - test_daemons.py: patch get_evaluator instead of removed direct imports - test_gemini_cli_llm.py: use skip_prefilter=True in CLI flag test (prefilter adds a second subprocess.run call) - main.py: guard prefilter metrics sync with isinstance check to prevent TypeError when evaluator is mocked Found via full test suite run with Python 3.12 + mcp installed. 
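
Why the isinstance guard works (sketch, not part of the diff): an
attribute looked up on a bare MagicMock is itself a MagicMock, so it is
truthy but fails the int check, and the dashboard sync is skipped
instead of storing a mock object:

    from unittest.mock import MagicMock

    metrics = MagicMock()
    bool(metrics.prefilter_passed)             # True (mocks are truthy)
    isinstance(metrics.prefilter_passed, int)  # False -> sync skipped
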
Co-Authored-By: Claude Opus 4.6 (1M context) --- context_scribe/main.py | 6 ++++-- tests/test_daemons.py | 2 +- tests/test_gemini_cli_llm.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/context_scribe/main.py b/context_scribe/main.py index 1399fa2..ce668df 100644 --- a/context_scribe/main.py +++ b/context_scribe/main.py @@ -262,8 +262,10 @@ async def _loop(live=None): rule_output = await loop.run_in_executor(None, evaluator.evaluate_interaction, interaction, existing_global, existing_project) # Sync prefilter metrics to dashboard - db.prefilter_passed = evaluator.metrics.prefilter_passed - db.prefilter_skipped = evaluator.metrics.prefilter_skipped + metrics = getattr(evaluator, 'metrics', None) + if metrics and isinstance(getattr(metrics, 'prefilter_passed', None), int): + db.prefilter_passed = metrics.prefilter_passed + db.prefilter_skipped = metrics.prefilter_skipped if rule_output: dest_proj = "global" if rule_output.scope == "GLOBAL" else interaction.project_name diff --git a/tests/test_daemons.py b/tests/test_daemons.py index c42de2e..126c35a 100644 --- a/tests/test_daemons.py +++ b/tests/test_daemons.py @@ -13,7 +13,7 @@ async def test_run_daemon_tools(tool, provider_class, evaluator_class, bootstrap """Test the daemon run loop for all supported tools.""" with patch(f"context_scribe.main.{provider_class}", return_value=daemon_mocks.provider): - with patch(f"context_scribe.main.{evaluator_class}", return_value=daemon_mocks.evaluator): + with patch("context_scribe.main.get_evaluator", return_value=daemon_mocks.evaluator): with patch("context_scribe.main.MemoryBankClient", return_value=daemon_mocks.mcp): with patch(f"context_scribe.main.{bootstrap_func}"): # Mock Live to avoid rich rendering logic completely diff --git a/tests/test_gemini_cli_llm.py b/tests/test_gemini_cli_llm.py index 73b9ca8..3c0916c 100644 --- a/tests/test_gemini_cli_llm.py +++ b/tests/test_gemini_cli_llm.py @@ -52,7 +52,7 @@ def test_evaluator_timeout_handling(): def test_evaluator_cli_invocation_flags(): """Verify that the gemini CLI is called with the correct flags.""" - evaluator = GeminiCliEvaluator() + evaluator = GeminiCliEvaluator(skip_prefilter=True) mock_interaction = Interaction(timestamp=None, role="user", content="Test prompt", project_name="test") with patch("subprocess.run") as mock_run: From 987bb4693058055ba6e91a2a8e5f75b9b2c9b10a Mon Sep 17 00:00:00 2001 From: DJ Date: Sun, 5 Apr 2026 10:58:14 -0700 Subject: [PATCH 4/4] fix: fail-open _parse_bool and use importlib.resources for templates Address two Copilot review comments: 1. _parse_bool() now returns None for unrecognised/null values instead of defaulting to False. _pre_evaluate() treats None as pass-through to full evaluation, ensuring malformed LLM output doesn't silently skip rule extraction (fail-open behaviour). 2. Template loading in BaseEvaluator.__init__() now uses importlib.resources.files() instead of __file__-relative Path reads, so templates are accessible in packaged installs (wheel/zip). 
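
Resulting _parse_bool contract (sketch of the three-way behaviour):

    _parse_bool("yes")     # -> True
    _parse_bool("false")   # -> False (bool("false") would be True)
    _parse_bool("maybe")   # -> None: unrecognised, pass through
    _parse_bool(None)      # -> None: _pre_evaluate falls back to full eval
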
Co-Authored-By: Claude Opus 4.6 (1M context) --- context_scribe/evaluator/base_evaluator.py | 47 ++++++++++++++++------ tests/test_prefilter.py | 9 ++++- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/context_scribe/evaluator/base_evaluator.py b/context_scribe/evaluator/base_evaluator.py index c1c38ea..34396ef 100644 --- a/context_scribe/evaluator/base_evaluator.py +++ b/context_scribe/evaluator/base_evaluator.py @@ -1,9 +1,9 @@ +import importlib.resources import json import logging import re import subprocess from abc import ABC, abstractmethod -from pathlib import Path from typing import Optional from context_scribe.models.interaction import Interaction @@ -13,28 +13,42 @@ logger = logging.getLogger(__name__) -_PREFILTER_TEMPLATE_PATH = Path(__file__).parent / "prefilter_template.md" +def _parse_bool(value) -> Optional[bool]: + """Safely parse a boolean that may arrive as a string from LLM JSON. -def _parse_bool(value) -> bool: - """Safely parse a boolean that may arrive as a string from LLM JSON.""" + Returns ``None`` for unrecognised or null values so the caller can + fall back to full evaluation (fail-open behaviour). + """ if isinstance(value, bool): return value if isinstance(value, str): - return value.strip().lower() in ("true", "1", "yes") - return bool(value) + normalised = value.strip().lower() + if normalised in ("true", "1", "yes"): + return True + if normalised in ("false", "0", "no"): + return False + return None # unrecognised string → pass through + return None # None / other types → pass through + + +def _load_package_template(filename: str) -> str: + """Load a template file from this package using importlib.resources.""" + return ( + importlib.resources.files("context_scribe.evaluator") + .joinpath(filename) + .read_text(encoding="utf-8") + ) class BaseEvaluator(ABC): def __init__(self, skip_prefilter: bool = False): self.skip_prefilter = skip_prefilter self.metrics = PrefilterMetrics() - # Load the prompt templates - template_path = Path(__file__).parent / "prompt_template.md" - with open(template_path, "r", encoding="utf-8") as f: - self.prompt_template = f.read() - with open(_PREFILTER_TEMPLATE_PATH, "r", encoding="utf-8") as f: - self._prefilter_template = f.read() + # Load the prompt templates via importlib.resources (works in + # packaged installs such as wheels / zip imports). 
+ self.prompt_template = _load_package_template("prompt_template.md") + self._prefilter_template = _load_package_template("prefilter_template.md") @abstractmethod def _execute_cli(self, prompt: str) -> str: @@ -68,8 +82,15 @@ def _pre_evaluate(self, interaction: Interaction) -> Optional[PrefilterResult]: json_match = re.search(r'\{[^}]*"contains_rule"[^}]*\}', response_text) if json_match: pf_data = json.loads(json_match.group(0)) + parsed = _parse_bool(pf_data.get("contains_rule", True)) + if parsed is None: + logger.warning( + "Unrecognised contains_rule value %r, passing through to full eval", + pf_data.get("contains_rule"), + ) + return None return PrefilterResult( - contains_rule=_parse_bool(pf_data.get("contains_rule", True)), + contains_rule=parsed, confidence=float(pf_data.get("confidence", 0.0)), ) diff --git a/tests/test_prefilter.py b/tests/test_prefilter.py index 8bf5f1e..fa1f19a 100644 --- a/tests/test_prefilter.py +++ b/tests/test_prefilter.py @@ -87,7 +87,14 @@ def test_parse_bool_false_values(): assert _parse_bool("False") is False assert _parse_bool("0") is False assert _parse_bool("no") is False - assert _parse_bool("") is False + + +def test_parse_bool_unknown_values_return_none(): + """Unrecognised or null values return None (fail-open).""" + assert _parse_bool("") is None + assert _parse_bool(None) is None + assert _parse_bool("maybe") is None + assert _parse_bool(42) is None # --- BaseEvaluator prefilter integration ---