From 93d1d16bcac62a6838b3b940f424cf6d514c084f Mon Sep 17 00:00:00 2001 From: DJ Date: Sat, 4 Apr 2026 16:41:00 -0700 Subject: [PATCH 1/4] feat: two-stage evaluation pipeline with prefilter Adds a lightweight Stage 1 pre-filter to BaseEvaluator that classifies interactions as rule-bearing or not before running full extraction. Non-rule interactions with confidence > 0.8 skip Stage 2, targeting 50%+ reduction in LLM calls. Key design decisions: - Prefilter is implemented in BaseEvaluator, so all evaluators get it - _parse_bool() fixes the bool("false")==True bug from PR #20 - Fail-open: prefilter errors pass through to full eval - --skip-prefilter CLI flag to disable Stage 1 - Dashboard shows prefilter skip rate metrics Closes #14 Co-Authored-By: Claude Opus 4.6 (1M context) --- context_scribe/evaluator/__init__.py | 4 +- context_scribe/evaluator/base_evaluator.py | 107 ++++++-- .../evaluator/prefilter_template.md | 24 ++ context_scribe/main.py | 19 +- context_scribe/models/evaluator_models.py | 40 ++- tests/test_prefilter.py | 237 ++++++++++++++++++ 6 files changed, 401 insertions(+), 30 deletions(-) create mode 100644 context_scribe/evaluator/prefilter_template.md create mode 100644 tests/test_prefilter.py diff --git a/context_scribe/evaluator/__init__.py b/context_scribe/evaluator/__init__.py index 7685c4c..969995f 100644 --- a/context_scribe/evaluator/__init__.py +++ b/context_scribe/evaluator/__init__.py @@ -13,7 +13,7 @@ } -def get_evaluator(name: str) -> BaseEvaluator: +def get_evaluator(name: str, **kwargs) -> BaseEvaluator: """Return an evaluator instance by name. Raises ValueError for unknown names.""" cls = EVALUATOR_REGISTRY.get(name) if cls is None: @@ -21,7 +21,7 @@ def get_evaluator(name: str) -> BaseEvaluator: f"Unknown evaluator '{name}'. " f"Available: {', '.join(sorted(EVALUATOR_REGISTRY))}" ) - return cls() + return cls(**kwargs) __all__ = [ diff --git a/context_scribe/evaluator/base_evaluator.py b/context_scribe/evaluator/base_evaluator.py index 5299755..c1c38ea 100644 --- a/context_scribe/evaluator/base_evaluator.py +++ b/context_scribe/evaluator/base_evaluator.py @@ -7,26 +7,95 @@ from typing import Optional from context_scribe.models.interaction import Interaction -from context_scribe.models.evaluator_models import RuleOutput, INTERNAL_SIGNATURE +from context_scribe.models.evaluator_models import ( + RuleOutput, INTERNAL_SIGNATURE, PrefilterResult, PrefilterMetrics, +) logger = logging.getLogger(__name__) +_PREFILTER_TEMPLATE_PATH = Path(__file__).parent / "prefilter_template.md" + + +def _parse_bool(value) -> bool: + """Safely parse a boolean that may arrive as a string from LLM JSON.""" + if isinstance(value, bool): + return value + if isinstance(value, str): + return value.strip().lower() in ("true", "1", "yes") + return bool(value) + + class BaseEvaluator(ABC): - def __init__(self): - # Load the prompt template + def __init__(self, skip_prefilter: bool = False): + self.skip_prefilter = skip_prefilter + self.metrics = PrefilterMetrics() + # Load the prompt templates template_path = Path(__file__).parent / "prompt_template.md" with open(template_path, "r", encoding="utf-8") as f: self.prompt_template = f.read() + with open(_PREFILTER_TEMPLATE_PATH, "r", encoding="utf-8") as f: + self._prefilter_template = f.read() @abstractmethod def _execute_cli(self, prompt: str) -> str: """Executes the specific CLI tool and returns the raw stdout. - + Should raise subprocess.TimeoutExpired if the execution takes too long. 
""" pass + def _pre_evaluate(self, interaction: Interaction) -> Optional[PrefilterResult]: + """Stage 1: Lightweight check to filter non-rule interactions.""" + prompt = self._prefilter_template.format( + internal_signature=INTERNAL_SIGNATURE, + content=interaction.content, + ) + try: + output = self._execute_cli(prompt) + + # Extract response text from JSON wrapper if present + response_text = output + try: + data = json.loads(output) + if isinstance(data, dict): + response_text = data.get("result", data.get("response", output)) + except json.JSONDecodeError: + pass + + response_text = re.sub(r'```(?:json)?\s*', '', str(response_text)).strip() + + # Parse the prefilter JSON response + json_match = re.search(r'\{[^}]*"contains_rule"[^}]*\}', response_text) + if json_match: + pf_data = json.loads(json_match.group(0)) + return PrefilterResult( + contains_rule=_parse_bool(pf_data.get("contains_rule", True)), + confidence=float(pf_data.get("confidence", 0.0)), + ) + + logger.warning("Could not parse prefilter response, passing through to full eval") + return None + + except subprocess.TimeoutExpired: + logger.warning("Prefilter timed out, passing through to full eval") + return None + except Exception as e: + logger.warning("Prefilter error: %s, passing through to full eval", e) + return None + def evaluate_interaction(self, interaction: Interaction, existing_global: str = "", existing_project: str = "") -> Optional[RuleOutput]: + # Stage 1: Pre-filter + if not self.skip_prefilter: + prefilter_result = self._pre_evaluate(interaction) + self.metrics.record_result(prefilter_result) + if prefilter_result and prefilter_result.should_skip_full_eval: + logger.info( + "Prefilter: skipping full eval for %s (confidence=%.2f)", + interaction.project_name, prefilter_result.confidence, + ) + return None + + # Stage 2: Full extraction prompt = self.prompt_template.format( internal_signature=INTERNAL_SIGNATURE, project_name=interaction.project_name, @@ -37,36 +106,29 @@ def evaluate_interaction(self, interaction: Interaction, existing_global: str = try: output = self._execute_cli(prompt) - + # Extract response text response_text = output try: data = json.loads(output) if isinstance(data, dict): - # Handle both gemini ("response") and claude ("result"/"response") formats response_text = data.get("result", data.get("response", output)) except json.JSONDecodeError: pass - # Strip markdown code fences if present (Claude often wraps JSON in ```json ... ```) + # Strip markdown code fences if present response_text = re.sub(r'```(?:json)?\s*', '', str(response_text)).strip() - # Robust JSON extraction: look for substrings that start with { and end with } - # and contain both "scope" and "rules" + # Robust JSON extraction best_rule_data = None - - # Find all { and } positions start_indices = [i for i, char in enumerate(response_text) if char == '{'] end_indices = [i for i, char in enumerate(response_text) if char == '}'] - - # Try progressively smaller substrings starting from the first { and ending at the last } - # until we find a valid JSON object that has our keys. 
+ for start in start_indices: for end in reversed(end_indices): if end > start: try: candidate = response_text[start:end+1] - # Quick check to avoid expensive json.loads on non-candidates if '"scope"' in candidate and '"rules"' in candidate: data = json.loads(candidate) if isinstance(data, dict) and "scope" in data and "rules" in data: @@ -81,16 +143,16 @@ def evaluate_interaction(self, interaction: Interaction, existing_global: str = try: rules_raw = best_rule_data["rules"] desc = best_rule_data.get("description", "Updated rules") - + if isinstance(rules_raw, list): rules_content = "\n".join([str(r) for r in rules_raw]).strip() else: rules_content = str(rules_raw).strip() - + if len(rules_content) > 0: return RuleOutput( - content=rules_content, - scope=str(best_rule_data["scope"]).upper(), + content=rules_content, + scope=str(best_rule_data["scope"]).upper(), description=str(desc) ) except Exception as e: @@ -98,18 +160,17 @@ def evaluate_interaction(self, interaction: Interaction, existing_global: str = if "NO_RULE" in str(response_text): return None - - # Fallback for non-JSON responses (robustness) + + # Fallback for non-JSON responses text_upper = str(response_text).upper() if "PROJECT" in text_upper or "GLOBAL" in text_upper: scope = "PROJECT" if "PROJECT" in text_upper else "GLOBAL" - # Try to find some content if rules are just listed content = str(response_text) return RuleOutput(content=content, scope=scope, description="Extracted via fallback") logger.error(f"Failed to parse rule extraction for {interaction.project_name}") return None - + except subprocess.TimeoutExpired: logger.error(f"Evaluation timed out for {interaction.project_name}") return None diff --git a/context_scribe/evaluator/prefilter_template.md b/context_scribe/evaluator/prefilter_template.md new file mode 100644 index 0000000..d23473b --- /dev/null +++ b/context_scribe/evaluator/prefilter_template.md @@ -0,0 +1,24 @@ +{internal_signature} +You are a lightweight classifier. Your ONLY job is to determine whether the following +user-agent interaction contains a NEW persistent preference, project constraint, or +behavioral rule that should be remembered long-term. + +Examples of rule-bearing interactions: +- "Always use tabs instead of spaces" +- "For this project, use PostgreSQL not MySQL" +- "Never use semicolons in TypeScript" + +Examples of NON-rule interactions: +- "Can you help me fix this bug?" 
+- "Explain how async/await works" +- "Generate a function that sorts a list" + +INTERACTION: +''' +{content} +''' + +Respond with ONLY a JSON object: +{{"contains_rule": true, "confidence": 0.95}} +or +{{"contains_rule": false, "confidence": 0.90}} diff --git a/context_scribe/main.py b/context_scribe/main.py index 801c829..1399fa2 100644 --- a/context_scribe/main.py +++ b/context_scribe/main.py @@ -40,6 +40,8 @@ def __init__(self, tool: str, bank_path: str): self.last_event_time = "N/A" self.update_count = 0 self.history = [] # List of (time, file_path, description) tuples + self.prefilter_passed = 0 + self.prefilter_skipped = 0 def add_history(self, file_path: str, description: str): self.update_count += 1 @@ -99,9 +101,13 @@ def generate_layout(self) -> Layout: # Footer stats = Table.grid(expand=True) stats.add_column(justify="left") + stats.add_column(justify="center") stats.add_column(justify="right") + total_processed = self.prefilter_passed + self.prefilter_skipped + skip_rate = (self.prefilter_skipped / total_processed * 100) if total_processed > 0 else 0 stats.add_row( Text(f" System: Active", style="green"), + Text(f"Prefilter: {self.prefilter_skipped} skipped / {total_processed} total ({skip_rate:.0f}%)", style="dim"), Text(f"Total Rules Extracted: {self.update_count} ", style="bold green") ) layout["footer"].update(Panel(stats, border_style="dim")) @@ -205,7 +211,7 @@ def _status(msg: str, db, live, debug: bool): live.update(db.generate_layout()) -async def run_daemon(tool: str, bank_path: str, debug: bool = False, evaluator_name: str = "auto") -> bool: +async def run_daemon(tool: str, bank_path: str, debug: bool = False, evaluator_name: str = "auto", skip_prefilter: bool = False) -> bool: if tool == "gemini-cli": bootstrap_global_config() provider = GeminiCliProvider() @@ -221,7 +227,7 @@ async def run_daemon(tool: str, bank_path: str, debug: bool = False, evaluator_n if evaluator_name == "auto": evaluator_name = _detect_evaluator(tool) - evaluator = get_evaluator(evaluator_name) + evaluator = get_evaluator(evaluator_name, skip_prefilter=skip_prefilter) mcp_client = MemoryBankClient(bank_path=bank_path) try: @@ -255,6 +261,10 @@ async def _loop(live=None): _status(f"🧠 Thinking: Extracting rules for {interaction.project_name}...", db, live, debug) rule_output = await loop.run_in_executor(None, evaluator.evaluate_interaction, interaction, existing_global, existing_project) + # Sync prefilter metrics to dashboard + db.prefilter_passed = evaluator.metrics.prefilter_passed + db.prefilter_skipped = evaluator.metrics.prefilter_skipped + if rule_output: dest_proj = "global" if rule_output.scope == "GLOBAL" else interaction.project_name dest_file = "global_rules.md" if rule_output.scope == "GLOBAL" else "rules.md" @@ -301,12 +311,13 @@ async def _loop(live=None): @click.option('--bank-path', default='~/.memory-bank', help='Path to your Memory Bank root') @click.option('--evaluator', 'evaluator_name', default='auto', type=click.Choice(['auto'] + sorted(EVALUATOR_REGISTRY)), help='Evaluator LLM to use (default: auto-detect)') @click.option('--debug', is_flag=True, default=False, help='Stream plain debug logs instead of dashboard UI') -def cli(tool, bank_path, evaluator_name, debug): +@click.option('--skip-prefilter', is_flag=True, default=False, help='Disable Stage 1 prefilter (send all interactions to full eval)') +def cli(tool, bank_path, evaluator_name, debug, skip_prefilter): """Context-Scribe: Persistent Secretary Daemon""" if debug: logging.basicConfig(level=logging.DEBUG, 
                            format='%(asctime)s [%(levelname)s] %(name)s: %(message)s')
     try:
-        asyncio.run(run_daemon(tool, bank_path, debug=debug, evaluator_name=evaluator_name))
+        asyncio.run(run_daemon(tool, bank_path, debug=debug, evaluator_name=evaluator_name, skip_prefilter=skip_prefilter))
     except KeyboardInterrupt:
         pass
diff --git a/context_scribe/models/evaluator_models.py b/context_scribe/models/evaluator_models.py
index ffc4da7..14979e1 100644
--- a/context_scribe/models/evaluator_models.py
+++ b/context_scribe/models/evaluator_models.py
@@ -1,10 +1,48 @@
 from dataclasses import dataclass
+from typing import Optional
 
 INTERNAL_SIGNATURE = "--- CONTEXT-SCRIBE-INTERNAL-EVALUATION ---"
 INTERNAL_SIGNATURE_UPPER = INTERNAL_SIGNATURE.upper()
 
+
 @dataclass
 class RuleOutput:
     content: str
     scope: str # "GLOBAL" or "PROJECT"
-    description: str # Concise summary of what changed
+    description: str # Concise summary of what changed
+
+
+@dataclass
+class PrefilterResult:
+    """Result of the lightweight pre-evaluation stage."""
+    contains_rule: bool
+    confidence: float
+
+    @property
+    def should_skip_full_eval(self) -> bool:
+        """Skip full evaluation if confident there's no rule."""
+        return not self.contains_rule and self.confidence > 0.8
+
+
+@dataclass
+class PrefilterMetrics:
+    """Tracks prefilter pipeline statistics."""
+    prefilter_passed: int = 0
+    prefilter_skipped: int = 0
+    prefilter_errors: int = 0
+
+    @property
+    def skip_rate(self) -> float:
+        total = self.prefilter_passed + self.prefilter_skipped
+        if total == 0:
+            return 0.0
+        return self.prefilter_skipped / total
+
+    def record_result(self, result: Optional["PrefilterResult"]) -> None:
+        if result is None:
+            self.prefilter_errors += 1
+            self.prefilter_passed += 1  # On error, pass through to full eval
+        elif result.should_skip_full_eval:
+            self.prefilter_skipped += 1
+        else:
+            self.prefilter_passed += 1
diff --git a/tests/test_prefilter.py b/tests/test_prefilter.py
new file mode 100644
index 0000000..8bf5f1e
--- /dev/null
+++ b/tests/test_prefilter.py
@@ -0,0 +1,237 @@
+import json
+from datetime import datetime
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+from context_scribe.models.evaluator_models import PrefilterResult, PrefilterMetrics
+from context_scribe.models.interaction import Interaction
+from context_scribe.evaluator.base_evaluator import _parse_bool
+
+
+def _make_interaction(content="Can you help me fix this bug?"):
+    return Interaction(
+        timestamp=datetime.now(),
+        role="user",
+        content=content,
+        project_name="test-project",
+        metadata={},
+    )
+
+
+# --- PrefilterResult tests ---
+
+def test_prefilter_result_should_skip_no_rule_high_confidence():
+    r = PrefilterResult(contains_rule=False, confidence=0.95)
+    assert r.should_skip_full_eval is True
+
+
+def test_prefilter_result_should_not_skip_no_rule_low_confidence():
+    r = PrefilterResult(contains_rule=False, confidence=0.5)
+    assert r.should_skip_full_eval is False
+
+
+def test_prefilter_result_should_not_skip_has_rule():
+    r = PrefilterResult(contains_rule=True, confidence=0.95)
+    assert r.should_skip_full_eval is False
+
+
+def test_prefilter_result_boundary_confidence():
+    r = PrefilterResult(contains_rule=False, confidence=0.8)
+    assert r.should_skip_full_eval is False  # must be > 0.8, not >=
+
+
+# --- PrefilterMetrics tests ---
+
+def test_metrics_record_skip():
+    m = PrefilterMetrics()
+    m.record_result(PrefilterResult(contains_rule=False, confidence=0.95))
+    assert m.prefilter_skipped == 1
+    assert m.prefilter_passed == 0
+    assert m.skip_rate == 1.0
+
+
+def test_metrics_record_pass():
+    m = PrefilterMetrics()
+    m.record_result(PrefilterResult(contains_rule=True, confidence=0.9))
+    assert m.prefilter_passed == 1
+    assert m.prefilter_skipped == 0
+    assert m.skip_rate == 0.0
+
+
+def test_metrics_record_error():
+    m = PrefilterMetrics()
+    m.record_result(None)
+    assert m.prefilter_errors == 1
+    assert m.prefilter_passed == 1  # errors pass through
+
+
+def test_metrics_skip_rate_empty():
+    m = PrefilterMetrics()
+    assert m.skip_rate == 0.0
+
+
+# --- _parse_bool tests ---
+
+def test_parse_bool_true_values():
+    assert _parse_bool(True) is True
+    assert _parse_bool("true") is True
+    assert _parse_bool("True") is True
+    assert _parse_bool("1") is True
+    assert _parse_bool("yes") is True
+
+
+def test_parse_bool_false_values():
+    assert _parse_bool(False) is False
+    assert _parse_bool("false") is False
+    assert _parse_bool("False") is False
+    assert _parse_bool("0") is False
+    assert _parse_bool("no") is False
+    assert _parse_bool("") is False
+
+
+# --- BaseEvaluator prefilter integration ---
+
+def test_prefilter_skips_non_rule_interaction():
+    """Evaluator skips full eval when prefilter says no rule with high confidence."""
+    from context_scribe.evaluator.gemini_cli_llm import GeminiCliEvaluator
+
+    with patch.object(GeminiCliEvaluator, '__init__', lambda self, **kw: None):
+        evaluator = GeminiCliEvaluator.__new__(GeminiCliEvaluator)
+        # Manually init the base class fields
+        from context_scribe.evaluator.base_evaluator import BaseEvaluator
+        from pathlib import Path
+        evaluator.skip_prefilter = False
+        evaluator.metrics = PrefilterMetrics()
+        template_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prompt_template.md"
+        with open(template_path) as f:
+            evaluator.prompt_template = f.read()
+        prefilter_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prefilter_template.md"
+        with open(prefilter_path) as f:
+            evaluator._prefilter_template = f.read()
+
+        prefilter_json = json.dumps({"contains_rule": False, "confidence": 0.95})
+        with patch.object(type(evaluator), '_execute_cli', return_value=prefilter_json):
+            result = evaluator.evaluate_interaction(_make_interaction(), "", "")
+
+        assert result is None
+        assert evaluator.metrics.prefilter_skipped == 1
+
+
+def test_prefilter_passes_rule_interaction():
+    """Evaluator runs full eval when prefilter detects a rule."""
+    from context_scribe.evaluator.gemini_cli_llm import GeminiCliEvaluator
+
+    with patch.object(GeminiCliEvaluator, '__init__', lambda self, **kw: None):
+        evaluator = GeminiCliEvaluator.__new__(GeminiCliEvaluator)
+        evaluator.skip_prefilter = False
+        evaluator.metrics = PrefilterMetrics()
+        from pathlib import Path
+        template_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prompt_template.md"
+        with open(template_path) as f:
+            evaluator.prompt_template = f.read()
+        prefilter_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prefilter_template.md"
+        with open(prefilter_path) as f:
+            evaluator._prefilter_template = f.read()
+
+        prefilter_json = json.dumps({"contains_rule": True, "confidence": 0.9})
+        full_eval_json = json.dumps({
+            "scope": "GLOBAL",
+            "description": "Tab preference",
+            "rules": ["- Use tabs for indentation"],
+        })
+
+        call_count = [0]
+        def mock_execute(prompt):
+            call_count[0] += 1
+            if call_count[0] == 1:
+                return prefilter_json
+            return full_eval_json
+
+        with patch.object(type(evaluator), '_execute_cli', side_effect=mock_execute):
+            result = evaluator.evaluate_interaction(
+                _make_interaction("Always use tabs"), "", ""
+            )
+
+        assert result is not None
+        assert result.scope == "GLOBAL"
+        assert evaluator.metrics.prefilter_passed == 1
+
+
+def test_skip_prefilter_flag_bypasses_stage1():
+    """With skip_prefilter=True, no prefilter call is made."""
+    from context_scribe.evaluator.gemini_cli_llm import GeminiCliEvaluator
+
+    with patch.object(GeminiCliEvaluator, '__init__', lambda self, **kw: None):
+        evaluator = GeminiCliEvaluator.__new__(GeminiCliEvaluator)
+        evaluator.skip_prefilter = True
+        evaluator.metrics = PrefilterMetrics()
+        from pathlib import Path
+        template_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prompt_template.md"
+        with open(template_path) as f:
+            evaluator.prompt_template = f.read()
+        prefilter_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prefilter_template.md"
+        with open(prefilter_path) as f:
+            evaluator._prefilter_template = f.read()
+
+        full_eval_json = json.dumps({
+            "scope": "GLOBAL",
+            "description": "Tab preference",
+            "rules": ["- Use tabs"],
+        })
+
+        with patch.object(type(evaluator), '_execute_cli', return_value=full_eval_json) as mock_cli:
+            result = evaluator.evaluate_interaction(
+                _make_interaction("Always use tabs"), "", ""
+            )
+
+        # Should only be called once (full eval), not twice (prefilter + full eval)
+        assert mock_cli.call_count == 1
+        assert result is not None
+
+
+def test_prefilter_error_passes_through():
+    """On prefilter error, pass through to full eval (fail-open)."""
+    from context_scribe.evaluator.gemini_cli_llm import GeminiCliEvaluator
+    import subprocess
+
+    with patch.object(GeminiCliEvaluator, '__init__', lambda self, **kw: None):
+        evaluator = GeminiCliEvaluator.__new__(GeminiCliEvaluator)
+        evaluator.skip_prefilter = False
+        evaluator.metrics = PrefilterMetrics()
+        from pathlib import Path
+        template_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prompt_template.md"
+        with open(template_path) as f:
+            evaluator.prompt_template = f.read()
+        prefilter_path = Path(__file__).parent.parent / "context_scribe" / "evaluator" / "prefilter_template.md"
+        with open(prefilter_path) as f:
+            evaluator._prefilter_template = f.read()
+
+        full_eval_json = json.dumps({
+            "scope": "GLOBAL",
+            "description": "Tab preference",
+            "rules": ["- Use tabs"],
+        })
+
+        call_count = [0]
+        def mock_execute(prompt):
+            call_count[0] += 1
+            if call_count[0] == 1:
+                raise subprocess.TimeoutExpired("gemini", 30)
+            return full_eval_json
+
+        with patch.object(type(evaluator), '_execute_cli', side_effect=mock_execute):
+            result = evaluator.evaluate_interaction(
+                _make_interaction("Always use tabs"), "", ""
+            )
+
+        assert result is not None
+        assert evaluator.metrics.prefilter_errors == 1
+        assert evaluator.metrics.prefilter_passed == 1  # error counts as pass-through
+
+
+def test_parse_bool_handles_string_false_from_llm():
+    """Regression: bool('false') == True, but _parse_bool('false') == False."""
+    assert _parse_bool("false") is False
+    # This was the original bug in PR #20
+    assert bool("false") is True  # Python's built-in behavior (the bug)

From adaa5af9901a7a88c2cb4172e33b287c7b906114 Mon Sep 17 00:00:00 2001
From: DJ
Date: Sat, 4 Apr 2026 16:44:52 -0700
Subject: [PATCH 2/4] fix: pass **kwargs through subclass evaluator constructors

All evaluator subclasses now accept and forward **kwargs to
BaseEvaluator.__init__(), allowing skip_prefilter and future params to
propagate correctly via get_evaluator().
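
A sketch of the propagation chain (illustrative only, not part of the
diff; the concrete registry name passed as name will vary):

    # factory -> subclass -> base class
    evaluator = get_evaluator(name, skip_prefilter=True)
    # e.g. GeminiCliEvaluator(skip_prefilter=True), whose
    # super().__init__(**kwargs) reaches BaseEvaluator.__init__,
    # so the flag lands on the instance:
    assert evaluator.skip_prefilter is True
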
Co-Authored-By: Claude Opus 4.6 (1M context) --- context_scribe/evaluator/claude_llm.py | 4 ++-- context_scribe/evaluator/copilot_llm.py | 4 ++-- context_scribe/evaluator/gemini_cli_llm.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/context_scribe/evaluator/claude_llm.py b/context_scribe/evaluator/claude_llm.py index a6fe20d..157a1a1 100644 --- a/context_scribe/evaluator/claude_llm.py +++ b/context_scribe/evaluator/claude_llm.py @@ -8,8 +8,8 @@ class ClaudeEvaluator(BaseEvaluator): """Evaluator that uses Claude Code CLI for headless rule extraction.""" - def __init__(self): - super().__init__() + def __init__(self, **kwargs): + super().__init__(**kwargs) try: subprocess.run(["claude", "--version"], capture_output=True, check=True) except (subprocess.CalledProcessError, FileNotFoundError): diff --git a/context_scribe/evaluator/copilot_llm.py b/context_scribe/evaluator/copilot_llm.py index 4cf613d..bd5f97a 100644 --- a/context_scribe/evaluator/copilot_llm.py +++ b/context_scribe/evaluator/copilot_llm.py @@ -11,8 +11,8 @@ class CopilotEvaluator(BaseEvaluator): """Evaluator that uses the GitHub Copilot CLI for headless rule extraction.""" - def __init__(self): - super().__init__() + def __init__(self, **kwargs): + super().__init__(**kwargs) self._cli_path = shutil.which("copilot") if not self._cli_path: logger.warning( diff --git a/context_scribe/evaluator/gemini_cli_llm.py b/context_scribe/evaluator/gemini_cli_llm.py index ec07d23..a5f3892 100644 --- a/context_scribe/evaluator/gemini_cli_llm.py +++ b/context_scribe/evaluator/gemini_cli_llm.py @@ -7,8 +7,8 @@ class GeminiCliEvaluator(BaseEvaluator): """Evaluator that uses Gemini CLI for headless rule extraction.""" - def __init__(self): - super().__init__() + def __init__(self, **kwargs): + super().__init__(**kwargs) try: subprocess.run(["gemini", "--version"], capture_output=True, check=True) except (subprocess.CalledProcessError, FileNotFoundError): From eada07ec871ab208ed4d75facc9509a83758fc88 Mon Sep 17 00:00:00 2001 From: DJ Date: Sun, 5 Apr 2026 10:28:37 -0700 Subject: [PATCH 3/4] fix: resolve test failures from prefilter integration - test_daemons.py: patch get_evaluator instead of removed direct imports - test_gemini_cli_llm.py: use skip_prefilter=True in CLI flag test (prefilter adds a second subprocess.run call) - main.py: guard prefilter metrics sync with isinstance check to prevent TypeError when evaluator is mocked Found via full test suite run with Python 3.12 + mcp installed. 
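
Why the isinstance guard works (sketch, not part of the diff): an
attribute looked up on a bare MagicMock is itself a MagicMock, so it is
truthy but fails the int check, and the dashboard sync is skipped
instead of storing a mock object:

    from unittest.mock import MagicMock

    metrics = MagicMock()
    bool(metrics.prefilter_passed)             # True (mocks are truthy)
    isinstance(metrics.prefilter_passed, int)  # False -> sync skipped
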
Co-Authored-By: Claude Opus 4.6 (1M context) --- context_scribe/main.py | 6 ++++-- tests/test_daemons.py | 2 +- tests/test_gemini_cli_llm.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/context_scribe/main.py b/context_scribe/main.py index 1399fa2..ce668df 100644 --- a/context_scribe/main.py +++ b/context_scribe/main.py @@ -262,8 +262,10 @@ async def _loop(live=None): rule_output = await loop.run_in_executor(None, evaluator.evaluate_interaction, interaction, existing_global, existing_project) # Sync prefilter metrics to dashboard - db.prefilter_passed = evaluator.metrics.prefilter_passed - db.prefilter_skipped = evaluator.metrics.prefilter_skipped + metrics = getattr(evaluator, 'metrics', None) + if metrics and isinstance(getattr(metrics, 'prefilter_passed', None), int): + db.prefilter_passed = metrics.prefilter_passed + db.prefilter_skipped = metrics.prefilter_skipped if rule_output: dest_proj = "global" if rule_output.scope == "GLOBAL" else interaction.project_name diff --git a/tests/test_daemons.py b/tests/test_daemons.py index c42de2e..126c35a 100644 --- a/tests/test_daemons.py +++ b/tests/test_daemons.py @@ -13,7 +13,7 @@ async def test_run_daemon_tools(tool, provider_class, evaluator_class, bootstrap """Test the daemon run loop for all supported tools.""" with patch(f"context_scribe.main.{provider_class}", return_value=daemon_mocks.provider): - with patch(f"context_scribe.main.{evaluator_class}", return_value=daemon_mocks.evaluator): + with patch("context_scribe.main.get_evaluator", return_value=daemon_mocks.evaluator): with patch("context_scribe.main.MemoryBankClient", return_value=daemon_mocks.mcp): with patch(f"context_scribe.main.{bootstrap_func}"): # Mock Live to avoid rich rendering logic completely diff --git a/tests/test_gemini_cli_llm.py b/tests/test_gemini_cli_llm.py index 73b9ca8..3c0916c 100644 --- a/tests/test_gemini_cli_llm.py +++ b/tests/test_gemini_cli_llm.py @@ -52,7 +52,7 @@ def test_evaluator_timeout_handling(): def test_evaluator_cli_invocation_flags(): """Verify that the gemini CLI is called with the correct flags.""" - evaluator = GeminiCliEvaluator() + evaluator = GeminiCliEvaluator(skip_prefilter=True) mock_interaction = Interaction(timestamp=None, role="user", content="Test prompt", project_name="test") with patch("subprocess.run") as mock_run: From 987bb4693058055ba6e91a2a8e5f75b9b2c9b10a Mon Sep 17 00:00:00 2001 From: DJ Date: Sun, 5 Apr 2026 10:58:14 -0700 Subject: [PATCH 4/4] fix: fail-open _parse_bool and use importlib.resources for templates Address two Copilot review comments: 1. _parse_bool() now returns None for unrecognised/null values instead of defaulting to False. _pre_evaluate() treats None as pass-through to full evaluation, ensuring malformed LLM output doesn't silently skip rule extraction (fail-open behaviour). 2. Template loading in BaseEvaluator.__init__() now uses importlib.resources.files() instead of __file__-relative Path reads, so templates are accessible in packaged installs (wheel/zip). 
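
Resulting _parse_bool contract (sketch of the three-way behaviour):

    _parse_bool("yes")     # -> True
    _parse_bool("false")   # -> False (bool("false") would be True)
    _parse_bool("maybe")   # -> None: unrecognised, pass through
    _parse_bool(None)      # -> None: _pre_evaluate falls back to full eval
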
Co-Authored-By: Claude Opus 4.6 (1M context) --- context_scribe/evaluator/base_evaluator.py | 47 ++++++++++++++++------ tests/test_prefilter.py | 9 ++++- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/context_scribe/evaluator/base_evaluator.py b/context_scribe/evaluator/base_evaluator.py index c1c38ea..34396ef 100644 --- a/context_scribe/evaluator/base_evaluator.py +++ b/context_scribe/evaluator/base_evaluator.py @@ -1,9 +1,9 @@ +import importlib.resources import json import logging import re import subprocess from abc import ABC, abstractmethod -from pathlib import Path from typing import Optional from context_scribe.models.interaction import Interaction @@ -13,28 +13,42 @@ logger = logging.getLogger(__name__) -_PREFILTER_TEMPLATE_PATH = Path(__file__).parent / "prefilter_template.md" +def _parse_bool(value) -> Optional[bool]: + """Safely parse a boolean that may arrive as a string from LLM JSON. -def _parse_bool(value) -> bool: - """Safely parse a boolean that may arrive as a string from LLM JSON.""" + Returns ``None`` for unrecognised or null values so the caller can + fall back to full evaluation (fail-open behaviour). + """ if isinstance(value, bool): return value if isinstance(value, str): - return value.strip().lower() in ("true", "1", "yes") - return bool(value) + normalised = value.strip().lower() + if normalised in ("true", "1", "yes"): + return True + if normalised in ("false", "0", "no"): + return False + return None # unrecognised string → pass through + return None # None / other types → pass through + + +def _load_package_template(filename: str) -> str: + """Load a template file from this package using importlib.resources.""" + return ( + importlib.resources.files("context_scribe.evaluator") + .joinpath(filename) + .read_text(encoding="utf-8") + ) class BaseEvaluator(ABC): def __init__(self, skip_prefilter: bool = False): self.skip_prefilter = skip_prefilter self.metrics = PrefilterMetrics() - # Load the prompt templates - template_path = Path(__file__).parent / "prompt_template.md" - with open(template_path, "r", encoding="utf-8") as f: - self.prompt_template = f.read() - with open(_PREFILTER_TEMPLATE_PATH, "r", encoding="utf-8") as f: - self._prefilter_template = f.read() + # Load the prompt templates via importlib.resources (works in + # packaged installs such as wheels / zip imports). 
+ self.prompt_template = _load_package_template("prompt_template.md") + self._prefilter_template = _load_package_template("prefilter_template.md") @abstractmethod def _execute_cli(self, prompt: str) -> str: @@ -68,8 +82,15 @@ def _pre_evaluate(self, interaction: Interaction) -> Optional[PrefilterResult]: json_match = re.search(r'\{[^}]*"contains_rule"[^}]*\}', response_text) if json_match: pf_data = json.loads(json_match.group(0)) + parsed = _parse_bool(pf_data.get("contains_rule", True)) + if parsed is None: + logger.warning( + "Unrecognised contains_rule value %r, passing through to full eval", + pf_data.get("contains_rule"), + ) + return None return PrefilterResult( - contains_rule=_parse_bool(pf_data.get("contains_rule", True)), + contains_rule=parsed, confidence=float(pf_data.get("confidence", 0.0)), ) diff --git a/tests/test_prefilter.py b/tests/test_prefilter.py index 8bf5f1e..fa1f19a 100644 --- a/tests/test_prefilter.py +++ b/tests/test_prefilter.py @@ -87,7 +87,14 @@ def test_parse_bool_false_values(): assert _parse_bool("False") is False assert _parse_bool("0") is False assert _parse_bool("no") is False - assert _parse_bool("") is False + + +def test_parse_bool_unknown_values_return_none(): + """Unrecognised or null values return None (fail-open).""" + assert _parse_bool("") is None + assert _parse_bool(None) is None + assert _parse_bool("maybe") is None + assert _parse_bool(42) is None # --- BaseEvaluator prefilter integration ---