From e0981cbe30537ae7e8878c88a2a6274be43cffdb Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 01:22:13 -0700 Subject: [PATCH 01/13] =?UTF-8?q?feat(evaluator):=20complete=20Langfuse=20?= =?UTF-8?q?observability=20pipeline=20=E2=80=94=20observation-level=20eval?= =?UTF-8?q?uation,=20automated=20scheduling,=20retry=20logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add observation-level evaluation path to runner (EV-01 to EV-04 score individual spans by event_type name filtering) - Fix pagination: cursor-based for observations.get_many(), page-based for trace.list() per V3 SDK - Create all 12 evaluator YAML + prompt files with correct filter alignment against actual emit_trace_event() event_types - Add evaluator-scheduler Docker container (croniter-based cron daemon) in docker-compose.langfuse.yml under langfuse profile - Add exponential backoff retry logic for transient provider errors (500, 502, 503, 429) with configurable max_retries - Make create_score_configs.py truly idempotent with pre-check and --cleanup-duplicates flag - Sanitize all log injection vectors in monitoring/main.py (inline sanitize_log_input at every call site for CodeQL compliance) - Add evaluator files to installer copy paths (both fresh and update) - Add croniter>=2.0.0,<3.0.0 dependency Resolves: TD-280, TD-281, TD-282, TD-283, TD-287, TD-288, TD-100, BUG-217 Co-Authored-By: Claude Opus 4.6 (1M context) --- docker/.env.example | 9 + docker/Dockerfile.evaluator-scheduler | 28 + docker/docker-compose.langfuse.yml | 56 ++ evaluator_config.yaml | 7 +- evaluators/ev01_retrieval_relevance.yaml | 17 +- evaluators/ev01_retrieval_relevance_prompt.md | 2 +- evaluators/ev02_injection_value.yaml | 10 +- evaluators/ev02_injection_value_prompt.md | 2 +- evaluators/ev03_capture_completeness.yaml | 9 +- .../ev03_capture_completeness_prompt.md | 2 +- evaluators/ev04_classification_accuracy.yaml | 9 +- 
.../ev04_classification_accuracy_prompt.md | 2 +- evaluators/ev05_bootstrap_quality.yaml | 12 +- evaluators/ev06_session_coherence.yaml | 9 +- monitoring/main.py | 20 +- pyproject.toml | 2 + requirements.txt | 3 + scripts/create_score_configs.py | 137 ++- scripts/install.sh | 23 + scripts/memory/evaluator_scheduler.py | 261 ++++++ src/memory/evaluator/provider.py | 120 ++- src/memory/evaluator/runner.py | 476 ++++++++-- tests/test_create_score_configs.py | 294 +++++++ tests/test_evaluator_runner.py | 825 +++++++++++++++++- tests/test_monitoring_api.py | 222 +++++ tests/unit/test_evaluator_retry.py | 222 +++++ tests/unit/test_evaluator_scheduler.py | 401 +++++++++ 27 files changed, 2982 insertions(+), 198 deletions(-) create mode 100644 docker/Dockerfile.evaluator-scheduler create mode 100644 scripts/memory/evaluator_scheduler.py create mode 100644 tests/test_create_score_configs.py create mode 100644 tests/test_monitoring_api.py create mode 100644 tests/unit/test_evaluator_retry.py create mode 100644 tests/unit/test_evaluator_scheduler.py diff --git a/docker/.env.example b/docker/.env.example index 9faec64f..59d2c9f3 100755 --- a/docker/.env.example +++ b/docker/.env.example @@ -58,6 +58,7 @@ AI_MEMORY_LOG_LEVEL=INFO # AI Memory Installation Directory (for volume mounts) # Must match the project installation path +# REQUIRED for the langfuse compose profile (evaluator-scheduler .audit volume mount) AI_MEMORY_INSTALL_DIR=/path/to/your/project # ============================================ @@ -422,6 +423,14 @@ LANGFUSE_FLUSH_INTERVAL=5 LANGFUSE_TRACE_HOOKS=true LANGFUSE_TRACE_SESSIONS=true +# ============================================================================= +# EVALUATOR SCHEDULER (v2.2.3B, S-16.5, DEC-110) +# ============================================================================= +# Custom LLM provider for evaluator-scheduler (provider=custom in evaluator_config.yaml) +# Leave blank if using ollama/openrouter/anthropic/openai (configured in 
evaluator_config.yaml) +EVALUATOR_BASE_URL= +EVALUATOR_API_KEY= + # ============================================================================= # OLLAMA CLOUD API (v2.2.3, PLAN-012) # ============================================================================= diff --git a/docker/Dockerfile.evaluator-scheduler b/docker/Dockerfile.evaluator-scheduler new file mode 100644 index 00000000..3082a3ad --- /dev/null +++ b/docker/Dockerfile.evaluator-scheduler @@ -0,0 +1,28 @@ +FROM python:3.12-slim + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements and install Python dependencies (includes croniter) +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Create non-root user (security hardening 2025) +RUN useradd -u 1001 -g users -s /usr/sbin/nologin evaluator && \ + chown -R evaluator:users /app + +# Set Python path to include src directory where memory module lives +# (src/ is volume-mounted read-only at runtime: ../src:/app/src:ro) +ENV PYTHONPATH=/app/src + +# Switch to non-root user +USER evaluator + +# Entry point: evaluator scheduler daemon +CMD ["python", "scripts/memory/evaluator_scheduler.py"] diff --git a/docker/docker-compose.langfuse.yml b/docker/docker-compose.langfuse.yml index b402f523..e942cb0b 100644 --- a/docker/docker-compose.langfuse.yml +++ b/docker/docker-compose.langfuse.yml @@ -285,6 +285,62 @@ services: networks: - ai-memory_default + # ─── Evaluator Scheduler (AI Memory component) ──────────────────────────── + # Runs LLM-as-Judge evaluation pipeline on a cron schedule. + # Schedule configured via evaluator_config.yaml (schedule.cron, lookback_hours). + # Requires Langfuse to be healthy so it can post scores via create_score(). 
+ evaluator-scheduler: + build: + context: ../ + dockerfile: docker/Dockerfile.evaluator-scheduler + container_name: ${AI_MEMORY_CONTAINER_PREFIX:-ai-memory}-evaluator-scheduler + profiles: ["langfuse"] + volumes: + - ../src:/app/src:ro + - ../scripts:/app/scripts:ro + - ../evaluators:/app/evaluators:ro + - ../evaluator_config.yaml:/app/evaluator_config.yaml:ro + - ${AI_MEMORY_INSTALL_DIR:-.}/.audit:/app/.audit + environment: + - LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY:?Run langfuse_setup.sh first to generate API keys} + - LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY:?Run langfuse_setup.sh first to generate API keys} + # Use internal service name for container-to-container communication + - LANGFUSE_BASE_URL=http://langfuse-web:3000 + # Optional LLM provider API keys (read from env; not all are required) + - OLLAMA_API_KEY=${OLLAMA_API_KEY:-} + - OPENROUTER_API_KEY=${OPENROUTER_API_KEY:-} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + # Optional custom LLM provider (provider=custom in evaluator_config.yaml) + - EVALUATOR_BASE_URL=${EVALUATOR_BASE_URL:-} + - EVALUATOR_API_KEY=${EVALUATOR_API_KEY:-} + # NOTE: EVALUATOR_PROVIDER and EVALUATOR_MODEL are NOT set here — + # they are read from evaluator_config.yaml (evaluator_model section) + - PYTHONPATH=/app/src + - AI_MEMORY_LOG_LEVEL=${AI_MEMORY_LOG_LEVEL:-INFO} + depends_on: + langfuse-web: + condition: service_healthy + restart: unless-stopped + read_only: true + tmpfs: + - /tmp + security_opt: + - no-new-privileges:true + cap_drop: + - ALL + networks: + - ai-memory_default + healthcheck: + # File-based heartbeat — scheduler touches health file on startup and after each + # successful evaluation run. Check: file exists AND modified within last 90000s (25h). + # 25h window accommodates a daily cron + restart without false failure. 
+ test: ["CMD-SHELL", "python3 -c \"import os,sys,time; f='/tmp/evaluator-scheduler.health'; sys.exit(0 if os.path.exists(f) and time.time()-os.path.getmtime(f)<90000 else 1)\""] + interval: 60s + timeout: 5s + retries: 3 + start_period: 90000s + # ─── Network ───────────────────────────────────────────────────────────────── # Join the existing ai-memory stack network so Langfuse services can communicate # with the classifier-worker, embedding service, and other AI Memory components. diff --git a/evaluator_config.yaml b/evaluator_config.yaml index 6f1b741b..2a788bc1 100644 --- a/evaluator_config.yaml +++ b/evaluator_config.yaml @@ -16,6 +16,7 @@ evaluator_model: model_name: llama3.2:8b # Model identifier for the selected provider temperature: 0.0 # Deterministic evaluation (do not change) max_tokens: 4096 # 4096 to accommodate thinking models (reasoning + output) + max_retries: 3 # Retry on 500/502/503/429 and network errors # base_url: Override Ollama endpoint URL. # Local (default): http://localhost:11434/v1 @@ -43,9 +44,9 @@ schedule: lookback_hours: 24 # Evaluate last 24 hours of traces # --------------------------------------------------------------------------- -# Evaluation targets per evaluator -# observation = score individual spans (faster, more granular) -# trace = score whole trace (for session-level metrics) +# DEPRECATED: evaluation_targets — use per-evaluator target: field instead. +# Each evaluator YAML now has target: "observation" or target: "trace". +# This section is no longer read by EvaluatorRunner (S-16.3). 
# --------------------------------------------------------------------------- evaluation_targets: ev01_retrieval_relevance: observation # Per-retrieval span (high volume) diff --git a/evaluators/ev01_retrieval_relevance.yaml b/evaluators/ev01_retrieval_relevance.yaml index 1c0bbf7f..a78ffd70 100644 --- a/evaluators/ev01_retrieval_relevance.yaml +++ b/evaluators/ev01_retrieval_relevance.yaml @@ -1,6 +1,9 @@ # Evaluator: EV-01 Retrieval Relevance # Scores how relevant retrieved memory is to the trigger context. # High-volume path — 5% sampling to stay within judge budget. +# +# PATH B (Langfuse V3): Filter by observation name (event_type). +# Tags are trace-level only in V3; use event_types for observation filtering. id: EV-01 name: retrieval_relevance @@ -9,7 +12,19 @@ target: observation sampling_rate: 0.05 filter: - tags: [search, retrieval, best_practices] + # All event_types that represent memory retrieval operations across hook scripts + # Derived from grep of emit_trace_event() calls in src/memory/ and .claude/hooks/ + event_types: + - "pattern_retrieval" # first_edit_trigger.py — code pattern lookup + - "convention_retrieval" # new_file_trigger.py — convention lookup + - "error_retrieval" # error_detection.py — error fix lookup + - "context_retrieval" # context_injection_tier2.py, session_start.py + - "best_practices_retrieval" # best_practices_retrieval.py + - "memory_retrieval_session_summaries" # session_start.py + - "memory_retrieval_decisions" # session_start.py + - "memory_retrieval_sessions" # session_start.py + - "bootstrap_retrieval" # injection.py — bootstrap context retrieval + - "search_query" # search.py — main hybrid search execution prompt_file: ev01_retrieval_relevance_prompt.md diff --git a/evaluators/ev01_retrieval_relevance_prompt.md b/evaluators/ev01_retrieval_relevance_prompt.md index 6bda0b5d..966d42d4 100644 --- a/evaluators/ev01_retrieval_relevance_prompt.md +++ b/evaluators/ev01_retrieval_relevance_prompt.md @@ -5,7 +5,7 @@ Your task 
is to assess how relevant a retrieved memory is to the trigger context ## Data to Evaluate -Analyze the trace data provided in the **## Trace to Evaluate** section below. +Analyze the observation data provided in the **## Observation to Evaluate** section below. - **Input**: The trigger context that caused the memory retrieval (e.g., the user's query or the event that fired the retrieval trigger). - **Output**: The retrieved memory content that was returned. diff --git a/evaluators/ev02_injection_value.yaml b/evaluators/ev02_injection_value.yaml index c07e191d..351eea94 100644 --- a/evaluators/ev02_injection_value.yaml +++ b/evaluators/ev02_injection_value.yaml @@ -1,6 +1,9 @@ # Evaluator: EV-02 Injection Value # Scores whether injected context was valuable (true) or noise (false). # High-volume path — 5% sampling to stay within judge budget. +# +# PATH B (Langfuse V3): Filter by observation name (event_type). +# Tags are trace-level only in V3; use event_types for observation filtering. 
id: EV-02 name: injection_value @@ -9,7 +12,12 @@ target: observation sampling_rate: 0.05 filter: - tags: [injection, tier2, compact] + # event_types that represent the final injected context delivered to the agent + # context_retrieval is the primary event from context_injection_tier2.py + # error_fix_injection is from error_detection.py (injection into error context) + event_types: + - "context_retrieval" # context_injection_tier2.py, session_start.py — tier-2 injections + - "error_fix_injection" # error_detection.py — error fix context injection prompt_file: ev02_injection_value_prompt.md diff --git a/evaluators/ev02_injection_value_prompt.md b/evaluators/ev02_injection_value_prompt.md index 505dec65..964126db 100644 --- a/evaluators/ev02_injection_value_prompt.md +++ b/evaluators/ev02_injection_value_prompt.md @@ -5,7 +5,7 @@ Your task is to determine whether context injected into a user's prompt window a ## Data to Evaluate -Analyze the trace data provided in the **## Trace to Evaluate** section below. +Analyze the observation data provided in the **## Observation to Evaluate** section below. - **Input**: The user's prompt or request that triggered the injection. - **Output**: The injected context that was added to the user's prompt window. diff --git a/evaluators/ev03_capture_completeness.yaml b/evaluators/ev03_capture_completeness.yaml index c500afff..74f4448e 100644 --- a/evaluators/ev03_capture_completeness.yaml +++ b/evaluators/ev03_capture_completeness.yaml @@ -1,6 +1,9 @@ # Evaluator: EV-03 Capture Completeness # Scores whether hook capture events collected all expected fields. # High-volume path — 5% sampling to stay within judge budget. +# +# PATH B (Langfuse V3): Filter by observation name (event_type). +# Tags are trace-level only in V3; use event_types for observation filtering. 
id: EV-03 name: capture_completeness @@ -9,7 +12,11 @@ target: observation sampling_rate: 0.05 filter: - tags: [capture, user_prompt, agent_response, session_summary, store] + # 1_capture is the primary event_type for capture operations across all hook scripts: + # user_prompt_capture.py, post_tool_capture.py, agent_response_capture.py, + # error_pattern_capture.py + event_types: + - "1_capture" prompt_file: ev03_capture_completeness_prompt.md diff --git a/evaluators/ev03_capture_completeness_prompt.md b/evaluators/ev03_capture_completeness_prompt.md index d4fac089..0d6dee28 100644 --- a/evaluators/ev03_capture_completeness_prompt.md +++ b/evaluators/ev03_capture_completeness_prompt.md @@ -5,7 +5,7 @@ Your task is to determine whether a hook capture event collected all expected fi ## Data to Evaluate -Analyze the trace data provided in the **## Trace to Evaluate** section below. +Analyze the observation data provided in the **## Observation to Evaluate** section below. - **Input**: The captured content and hook event data (event type, captured fields, and metadata). - **Output**: Any processing result or confirmation from the capture system. diff --git a/evaluators/ev04_classification_accuracy.yaml b/evaluators/ev04_classification_accuracy.yaml index 887e8f69..be193f53 100644 --- a/evaluators/ev04_classification_accuracy.yaml +++ b/evaluators/ev04_classification_accuracy.yaml @@ -1,6 +1,10 @@ # Evaluator: EV-04 Classification Accuracy # Assesses whether the LLM classifier assigned the correct type to content. # 10% sampling — classification is medium volume. +# +# PATH B (Langfuse V3): Filter by observation name (event_type). +# Tags are trace-level only in V3; use event_types for observation filtering. +# CRITICAL: score_type is CATEGORICAL (NOT BOOLEAN). 
id: EV-04 name: classification_accuracy @@ -14,7 +18,10 @@ categories: - incorrect filter: - tags: [classification, queue, worker] + # 9_classify is the event_type for LLM classification observations: + # classification_worker.py, process_classification_queue.py + event_types: + - "9_classify" prompt_file: ev04_classification_accuracy_prompt.md diff --git a/evaluators/ev04_classification_accuracy_prompt.md b/evaluators/ev04_classification_accuracy_prompt.md index d37da832..5d9a9219 100644 --- a/evaluators/ev04_classification_accuracy_prompt.md +++ b/evaluators/ev04_classification_accuracy_prompt.md @@ -5,7 +5,7 @@ Your task is to assess whether the LLM classifier correctly assigned a memory ty ## Data to Evaluate -Analyze the trace data provided in the **## Trace to Evaluate** section below. +Analyze the observation data provided in the **## Observation to Evaluate** section below. - **Input**: The original captured content that was submitted to the classifier. - **Output**: The classifier's decision including assigned memory type, collection, and reasoning. diff --git a/evaluators/ev05_bootstrap_quality.yaml b/evaluators/ev05_bootstrap_quality.yaml index 8b801c75..7d742540 100644 --- a/evaluators/ev05_bootstrap_quality.yaml +++ b/evaluators/ev05_bootstrap_quality.yaml @@ -1,6 +1,13 @@ # Evaluator: EV-05 Bootstrap Quality # Scores how well cross-session bootstrap context serves the new session. # 100% sampling — one bootstrap event per session (low volume). +# +# PATH B (Langfuse V3): Filter by trace-level tag. +# Rationale: EV-05 targets trace-level data (full session context), so tags is +# the correct filter mechanism — matching the pattern EV-06 uses. event_types +# applies to observation names and will never match trace-level items, making +# it a dead filter. session_trace tags are set by langfuse_stop_hook.py on +# all full-session traces, so this correctly captures bootstrap sessions. 
id: EV-05 name: bootstrap_quality @@ -9,7 +16,10 @@ target: trace sampling_rate: 1.0 filter: - tags: [injection, tier1, bootstrap, skill] + # Trace-level tags filter — valid for trace-targeted evaluators in V3. + # session_trace tag is set on full-session traces by session hooks. + tags: + - "session_trace" prompt_file: ev05_bootstrap_quality_prompt.md diff --git a/evaluators/ev06_session_coherence.yaml b/evaluators/ev06_session_coherence.yaml index 734da20c..feaaa113 100644 --- a/evaluators/ev06_session_coherence.yaml +++ b/evaluators/ev06_session_coherence.yaml @@ -1,6 +1,10 @@ # Evaluator: EV-06 Session Coherence # Scores overall memory system behaviour across a full session. # 100% sampling — one session summary per session (low volume). +# +# PATH B NOTE: EV-06 correctly uses trace-level tags filter. +# This evaluator targets full session TRACES (not individual observations). +# Trace-level tags ARE supported in Langfuse V3 — this filter is correct. id: EV-06 name: session_coherence @@ -9,7 +13,10 @@ target: trace sampling_rate: 1.0 filter: - tags: [capture, session_summary, injection, tier1, bootstrap] + # Trace-level tags filter — valid for trace-targeted evaluators in V3. + # session_trace tag is set on full-session traces by session hooks. 
+ tags: + - "session_trace" prompt_file: ev06_session_coherence_prompt.md diff --git a/monitoring/main.py b/monitoring/main.py index a0d6d6e2..d792efe8 100755 --- a/monitoring/main.py +++ b/monitoring/main.py @@ -229,25 +229,25 @@ async def update_metrics_periodically(): logger.warning( "pushgateway_push_failed", extra={ - "collection": collection_name, - "error": str(push_error), + "collection": sanitize_log_input(collection_name), + "error": sanitize_log_input(str(push_error)), }, ) logger.debug( "metrics_updated", extra={ - "collection": collection_name, + "collection": sanitize_log_input(collection_name), "total_points": stats.total_points, }, ) except Exception as e: logger.warning( "metrics_update_failed", - extra={"collection": collection_name, "error": str(e)}, + extra={"collection": sanitize_log_input(collection_name), "error": sanitize_log_input(str(e))}, ) except Exception as e: - logger.error("metrics_updater_error", extra={"error": str(e)}) + logger.error("metrics_updater_error", extra={"error": sanitize_log_input(str(e))}) await asyncio.sleep(60) # Update every 60 seconds @@ -364,7 +364,7 @@ async def health(): except Exception as e: logger.warning( "stats_check_failed", - extra={"collection": collection_name, "error": str(e)}, + extra={"collection": sanitize_log_input(collection_name), "error": sanitize_log_input(str(e))}, ) # Determine status based on warnings @@ -390,7 +390,7 @@ async def health(): warnings=all_warnings, ) except Exception as e: - logger.error("health_check_failed", extra={"error": str(e)}) + logger.error("health_check_failed", extra={"error": sanitize_log_input(str(e))}) return JSONResponse( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={ @@ -426,7 +426,7 @@ async def readiness(): logger.info("readiness_check_passed", extra={"qdrant_available": True}) return {"status": "ready", "qdrant_available": True} except Exception as e: - logger.warning("readiness_check_failed", extra={"error": str(e)}) + 
logger.warning("readiness_check_failed", extra={"error": sanitize_log_input(str(e))}) return JSONResponse( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"status": "not_ready", "qdrant_available": False}, @@ -481,7 +481,7 @@ async def get_memory(memory_id: str, collection: str = "code-patterns"): except UnexpectedResponse as e: logger.error( "qdrant_error", - extra={"error": str(e), "memory_id": sanitize_log_input(memory_id)}, + extra={"error": sanitize_log_input(str(e)), "memory_id": sanitize_log_input(memory_id)}, ) raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Qdrant unavailable" @@ -584,7 +584,7 @@ async def search_memories(request: SearchRequest): logger.error( "search_failed", extra={ - "error": str(e), + "error": sanitize_log_input(str(e)), "collection": sanitize_log_input(request.collection), }, ) diff --git a/pyproject.toml b/pyproject.toml index 855e2032..cece53b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,8 @@ dependencies = [ "numpy>=1.26.0,<3.0.0", # Security scanning - Layer 3 NER-based PII detection (SPEC-009) "spacy>=3.8.0,<4.0.0", + # Cron schedule parsing for evaluator-scheduler daemon (S-16.5, DEC-110) + "croniter>=2.0.0,<3.0.0", # LLM Observability — Langfuse tracing (SPEC-020, PLAN-008) "langfuse>=3.0,<4.0.0", # Auto-instrumentation for Anthropic SDK calls (SPEC-020 §6) diff --git a/requirements.txt b/requirements.txt index 15d290ca..3900ff19 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,6 +48,9 @@ detect-secrets>=1.4.0 # en_core_web_sm model downloaded in install.sh post-install step spacy>=3.8.0,<4.0.0 +# Cron schedule parsing for evaluator-scheduler daemon (S-16.5, DEC-110) +croniter>=2.0.0,<3.0.0 + # LLM Observability — Langfuse tracing (SPEC-020, PLAN-008) langfuse>=3.0,<4.0.0 diff --git a/scripts/create_score_configs.py b/scripts/create_score_configs.py index 42046541..5e5732f9 100644 --- a/scripts/create_score_configs.py +++ b/scripts/create_score_configs.py @@ 
-1,11 +1,11 @@ #!/usr/bin/env python3 # LANGFUSE: V3 SDK ONLY. See LANGFUSE-INTEGRATION-SPEC.md # FORBIDDEN: Langfuse() constructor, start_span(), start_generation(), langfuse_context -# REQUIRED: get_client(), api.score_configs.create(), flush() +# REQUIRED: get_client(), api.score_configs.create(), api.score_configs.list(), flush() """Idempotent Score Config setup in Langfuse. Creates 6 Score Configs that enforce validation schemas on evaluation scores. -Safe to run multiple times — existing configs are left unchanged. +Safe to run multiple times — existing configs are skipped (pre-checked via list API). Score Configs created: NUMERIC: retrieval_relevance (0-1), bootstrap_quality (0-1), session_coherence (0-1) @@ -14,6 +14,7 @@ Usage: python scripts/create_score_configs.py + python scripts/create_score_configs.py --cleanup-duplicates Requires env vars: LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_BASE_URL @@ -21,9 +22,75 @@ """ import sys +import argparse + + +def _fetch_existing_configs(langfuse) -> dict[str, list]: + """Fetch all existing score configs from Langfuse, keyed by name. + + Returns a dict of {name: [config, ...]} to detect duplicates. + Uses cursor-based pagination to retrieve all pages. 
+ """ + existing: dict[str, list] = {} + page = 1 + while True: + try: + response = langfuse.api.score_configs.list(page=page, limit=100) + except Exception as exc: + print(f" [WARN] Could not list existing score configs: {exc}") + break + + items = getattr(response, "data", None) or [] + if not items: + break + + for cfg in items: + name = getattr(cfg, "name", None) + if name: + existing.setdefault(name, []).append(cfg) + + # Check if there are more pages + meta = getattr(response, "meta", None) + total_pages = getattr(meta, "total_pages", 1) if meta else 1 + if page >= total_pages: + break + page += 1 + + return existing + + +def _cleanup_duplicates(langfuse, existing: dict[str, list]) -> None: + """Delete duplicate score configs, keeping the oldest one (first created).""" + for name, configs in existing.items(): + if len(configs) <= 1: + continue + # Sort by created_at ascending, keep first, delete the rest + try: + sorted_configs = sorted(configs, key=lambda c: getattr(c, "created_at", "9999-99-99")) + except Exception: + sorted_configs = configs + + to_delete = sorted_configs[1:] + for cfg in to_delete: + cfg_id = getattr(cfg, "id", None) + if not cfg_id: + continue + try: + langfuse.api.score_configs.delete(cfg_id) + print(f" [DEL] Removed duplicate config '{name}' id={cfg_id}") + except Exception as exc: + print(f" [WARN] Could not delete duplicate '{name}' id={cfg_id}: {exc}") def main() -> int: + parser = argparse.ArgumentParser(description="Create Langfuse score configs idempotently") + parser.add_argument( + "--cleanup-duplicates", + action="store_true", + help="Delete duplicate score configs (keep oldest), then recreate missing ones", + ) + args = parser.parse_args() + try: from langfuse import ( get_client, # V3 singleton — NEVER use Langfuse() constructor @@ -36,7 +103,20 @@ def main() -> int: langfuse = get_client() - print("Creating Score Configs in Langfuse...") + print("Fetching existing Score Configs from Langfuse...") + existing = 
_fetch_existing_configs(langfuse) + if existing: + print(f" Found {sum(len(v) for v in existing.values())} existing config(s): {list(existing.keys())}") + else: + print(" No existing configs found.") + + if args.cleanup_duplicates: + print("\nCleaning up duplicate Score Configs...") + _cleanup_duplicates(langfuse, existing) + # Re-fetch after cleanup + existing = _fetch_existing_configs(langfuse) + + print("\nCreating Score Configs in Langfuse...") # --- NUMERIC scores (0.0 - 1.0) --- numeric_names = [ @@ -45,6 +125,9 @@ def main() -> int: "session_coherence", ] for name in numeric_names: + if name in existing: + print(f" [SKIP] NUMERIC: {name} (already exists)") + continue try: langfuse.api.score_configs.create( request=CreateScoreConfigRequest( @@ -57,12 +140,14 @@ def main() -> int: ) print(f" [OK] NUMERIC: {name} (0.0 - 1.0)") except Exception as exc: - # Idempotent: log but don't fail if config already exists - print(f" [SKIP] {name}: {exc}") + print(f" [ERROR] {name}: {exc}") # --- BOOLEAN scores --- boolean_names = ["injection_value", "capture_completeness"] for name in boolean_names: + if name in existing: + print(f" [SKIP] BOOLEAN: {name} (already exists)") + continue try: langfuse.api.score_configs.create( request=CreateScoreConfigRequest( @@ -73,27 +158,31 @@ def main() -> int: ) print(f" [OK] BOOLEAN: {name}") except Exception as exc: - print(f" [SKIP] {name}: {exc}") + print(f" [ERROR] {name}: {exc}") - # --- CATEGORICAL score (PM #190 addition) --- - try: - langfuse.api.score_configs.create( - request=CreateScoreConfigRequest( - name="classification_accuracy", - dataType=ScoreConfigDataType.CATEGORICAL, - categories=[ - ConfigCategory(label="correct", value=1.0), - ConfigCategory(label="partially_correct", value=0.5), - ConfigCategory(label="incorrect", value=0.0), - ], - description="LLM-as-judge classification accuracy (PLAN-012)", + # --- CATEGORICAL score --- + cat_name = "classification_accuracy" + if cat_name in existing: + print(f" [SKIP] 
CATEGORICAL: {cat_name} (already exists)") + else: + try: + langfuse.api.score_configs.create( + request=CreateScoreConfigRequest( + name=cat_name, + dataType=ScoreConfigDataType.CATEGORICAL, + categories=[ + ConfigCategory(label="correct", value=1.0), + ConfigCategory(label="partially_correct", value=0.5), + ConfigCategory(label="incorrect", value=0.0), + ], + description="LLM-as-judge classification accuracy (PLAN-012)", + ) ) - ) - print( - " [OK] CATEGORICAL: classification_accuracy (correct|partially_correct|incorrect)" - ) - except Exception as exc: - print(f" [SKIP] classification_accuracy: {exc}") + print( + " [OK] CATEGORICAL: classification_accuracy (correct|partially_correct|incorrect)" + ) + except Exception as exc: + print(f" [ERROR] {cat_name}: {exc}") # Flush all buffered data before exit (V3 requirement for short-lived scripts) langfuse.flush() diff --git a/scripts/install.sh b/scripts/install.sh index 93b28d3f..9f5f83aa 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1037,6 +1037,18 @@ update_shared_scripts() { log_debug "Synced $docker_count Docker files to INSTALL_DIR" fi + # Sync evaluator config and definitions (S-16.5, DEC-110) + # Required by evaluator-scheduler container at runtime + if [[ -f "$SCRIPT_DIR/../evaluator_config.yaml" ]]; then + cp "$SCRIPT_DIR/../evaluator_config.yaml" "$INSTALL_DIR/evaluator_config.yaml" || log_warning "Failed to copy evaluator_config.yaml" + log_debug "Updated evaluator_config.yaml" + fi + if [[ -d "$SCRIPT_DIR/../evaluators" ]]; then + mkdir -p "$INSTALL_DIR/evaluators" + cp -r "$SCRIPT_DIR/../evaluators/"* "$INSTALL_DIR/evaluators/" 2>/dev/null || true + log_debug "Updated evaluators/ directory" + fi + # Update templates (new templates added in updates must reach installed dir) if [[ -d "$SCRIPT_DIR/../templates" ]]; then mkdir -p "$INSTALL_DIR/templates" @@ -1576,6 +1588,17 @@ copy_files() { cp -r "$SOURCE_DIR/templates/"* "$INSTALL_DIR/templates/" fi + # Copy evaluator config and definitions 
(S-16.5, DEC-110) + # Required by evaluator-scheduler container at runtime (../evaluator_config.yaml, ../evaluators/) + log_debug "Copying evaluator configuration..." + if [[ -f "$SOURCE_DIR/evaluator_config.yaml" ]]; then + cp "$SOURCE_DIR/evaluator_config.yaml" "$INSTALL_DIR/evaluator_config.yaml" || log_warning "Failed to copy evaluator_config.yaml" + fi + if [[ -d "$SOURCE_DIR/evaluators" ]]; then + mkdir -p "$INSTALL_DIR/evaluators" + cp -r "$SOURCE_DIR/evaluators/"* "$INSTALL_DIR/evaluators/" 2>/dev/null || log_warning "Failed to copy evaluators directory" + fi + # Copy CHANGELOG for reference (TD-170) if [[ -f "$SOURCE_DIR/CHANGELOG.md" ]]; then cp "$SOURCE_DIR/CHANGELOG.md" "$INSTALL_DIR/" || log_warning "Failed to copy CHANGELOG.md" diff --git a/scripts/memory/evaluator_scheduler.py b/scripts/memory/evaluator_scheduler.py new file mode 100644 index 00000000..10250105 --- /dev/null +++ b/scripts/memory/evaluator_scheduler.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +"""Evaluator scheduler daemon. + +Runs LLM-as-Judge evaluation pipeline on a cron schedule defined in +evaluator_config.yaml. + +Architecture: +- Synchronous daemon with croniter for schedule parsing (DEC-110) +- Health check via /tmp/evaluator-scheduler.health timestamp file +- Graceful shutdown via atexit + SIGTERM handler +- Placed in docker-compose.langfuse.yml under langfuse profile + +Usage: + python scripts/memory/evaluator_scheduler.py + +Reference: +- DEC-110: Standalone evaluator-scheduler container design +- PLAN-012 Phase 2: LLM-as-Judge evaluation pipeline +""" + +# LANGFUSE: V3 SDK ONLY. 
See LANGFUSE-INTEGRATION-SPEC.md + +import atexit +import logging +import os +import signal +import sys +import time +from datetime import datetime, timedelta, timezone +from pathlib import Path + +# Setup Python path for imports +INSTALL_DIR = os.environ.get( + "AI_MEMORY_INSTALL_DIR", os.path.expanduser("~/.ai-memory") +) +sys.path.insert(0, os.path.join(INSTALL_DIR, "src")) + +# Configuration +CONFIG_PATH = os.environ.get("EVALUATOR_CONFIG_PATH", "/app/evaluator_config.yaml") +HEALTH_FILE = Path("/tmp/evaluator-scheduler.health") + +# Setup logging +logging.basicConfig( + level=os.environ.get("AI_MEMORY_LOG_LEVEL", "INFO"), + format="%(asctime)s %(name)s %(levelname)s %(message)s", +) +logger = logging.getLogger("ai_memory.evaluator.scheduler") + +# Graceful shutdown flag (set by SIGTERM handler) +_shutdown_requested = False + + +# ============================================================================= +# HEALTH CHECK +# ============================================================================= + + +def _touch_health_file() -> None: + """Write health check marker for Docker healthcheck (graceful degradation).""" + try: + HEALTH_FILE.touch() + logger.debug("health_file_updated: %s", HEALTH_FILE) + except Exception as exc: + logger.warning("health_file_update_failed: %s", exc) + + +# ============================================================================= +# SIGNAL HANDLING + LANGFUSE SHUTDOWN +# ============================================================================= + + +def _handle_sigterm(signum, frame) -> None: # noqa: ARG001 + """SIGTERM handler — request graceful shutdown.""" + global _shutdown_requested + logger.info("sigterm_received — requesting graceful shutdown") + _shutdown_requested = True + + +def _register_langfuse_shutdown() -> None: + """Register atexit handler to flush and shutdown Langfuse V3 client on exit.""" + + def _langfuse_shutdown() -> None: + try: + from langfuse import get_client # noqa: PLC0415 + + client = 
get_client() + if client: + client.flush() + client.shutdown() # V3 spec: NO arguments — shutdown(timeout=x) raises TypeError + except Exception: + pass # Never fail on process exit + + atexit.register(_langfuse_shutdown) + + +# ============================================================================= +# CONFIG LOADING +# ============================================================================= + + +def load_schedule_config(config_path: str) -> dict: + """Load and return the schedule section from evaluator_config.yaml. + + Args: + config_path: Path to evaluator_config.yaml + + Returns: + Schedule config dict with keys: enabled, cron, lookback_hours + """ + import yaml # noqa: PLC0415 + + with open(config_path) as f: + config = yaml.safe_load(f) + return config.get("schedule", {}) + + +# ============================================================================= +# SLEEP HELPER +# ============================================================================= + + +def _interruptible_sleep(seconds: float, chunk: float = 5.0) -> None: + """Sleep for `seconds`, waking every `chunk` seconds to check shutdown flag. + + Args: + seconds: Total sleep duration in seconds + chunk: Maximum sleep chunk size in seconds (default: 5.0) + """ + remaining = seconds + while remaining > 0 and not _shutdown_requested: + sleep_for = min(remaining, chunk) + time.sleep(sleep_for) + remaining -= sleep_for + + +# ============================================================================= +# SCHEDULER LOOP +# ============================================================================= + + +def run_scheduler(config_path: str = CONFIG_PATH) -> None: + """Main scheduler loop. + + Reads schedule config, sleeps until next cron tick, then invokes + EvaluatorRunner.run(). Errors in evaluation are logged and the + scheduler continues to the next cycle — it never crashes on eval failure. 
+ + Args: + config_path: Path to evaluator_config.yaml + """ + global _shutdown_requested + + logger.info("evaluator_scheduler_starting, config=%s", config_path) + + # Write health file on startup (Docker healthcheck passes immediately — like + # classifier worker BUG-045 fix: don't wait until first run completes) + _touch_health_file() + + while not _shutdown_requested: + # Reload schedule config each cycle to support live config changes + try: + schedule = load_schedule_config(config_path) + except Exception as exc: + logger.error("config_load_failed: %s — retrying in 60s", exc) + _interruptible_sleep(60) + continue + + if not schedule.get("enabled", False): + logger.info("schedule_disabled — sleeping 300s before recheck") + _interruptible_sleep(300) + continue + + cron_expr = schedule.get("cron", "0 5 * * *") + lookback_hours = int(schedule.get("lookback_hours", 24)) + + # Calculate next run time using croniter + from croniter import croniter # noqa: PLC0415 + + now = datetime.now(tz=timezone.utc) + try: + cron = croniter(cron_expr, now) + next_run: datetime = cron.get_next(datetime) + except (ValueError, KeyError) as exc: + logger.error("invalid_cron_expression: %s — retrying in 60s", exc) + _interruptible_sleep(60) + continue + + sleep_seconds = (next_run - now).total_seconds() + logger.info( + "next_evaluation_scheduled: cron=%s next_run_utc=%s sleep_seconds=%.0f", + cron_expr, + next_run.isoformat(), + sleep_seconds, + ) + + # Sleep until next cron tick, waking periodically to check shutdown + _interruptible_sleep(sleep_seconds) + + if _shutdown_requested: + break + + # Re-check enabled — config may have changed while sleeping + try: + schedule = load_schedule_config(config_path) + except Exception as exc: + logger.error("config_reload_failed before run: %s — skipping cycle", exc) + continue + + if not schedule.get("enabled", False): + logger.info("schedule_disabled at run time — skipping cycle") + continue + + # Re-read lookback_hours from reloaded config 
(avoid stale value from first load) + lookback_hours = int(schedule.get("lookback_hours", 24)) + + # Run evaluation with lookback window + since = datetime.now(tz=timezone.utc) - timedelta(hours=lookback_hours) + logger.info( + "evaluation_starting: since=%s lookback_hours=%d", + since.isoformat(), + lookback_hours, + ) + + try: + from memory.evaluator import EvaluatorRunner # noqa: PLC0415 + + runner = EvaluatorRunner(config_path) + summary = runner.run(since=since) + logger.info("evaluation_complete: %s", summary) + # Touch health file only after successful run (not on failure) + _touch_health_file() + except Exception as exc: + logger.error( + "evaluation_failed: %s — continuing to next cycle", + exc, + exc_info=True, + ) + # Do NOT update health file on failure + # Continue scheduling — do not crash daemon on evaluation errors + + +# ============================================================================= +# ENTRY POINT +# ============================================================================= + + +def main() -> None: + """Entry point.""" + signal.signal(signal.SIGTERM, _handle_sigterm) + _register_langfuse_shutdown() + + try: + run_scheduler() + except KeyboardInterrupt: + logger.info("keyboard_interrupt_received") + finally: + logger.info("evaluator_scheduler_stopped") + + +if __name__ == "__main__": + main() diff --git a/src/memory/evaluator/provider.py b/src/memory/evaluator/provider.py index 4a99b0e9..326bcc47 100644 --- a/src/memory/evaluator/provider.py +++ b/src/memory/evaluator/provider.py @@ -11,6 +11,8 @@ import json import logging import os +import random +import time from dataclasses import dataclass, field from typing import Any, Literal @@ -35,6 +37,7 @@ class EvaluatorConfig: max_tokens: int = ( 4096 # Must accommodate thinking tokens + output for reasoning models ) + max_retries: int = 3 _client: Any = field(default=None, init=False, repr=False) @classmethod @@ -51,6 +54,7 @@ def from_yaml(cls, path: str) -> "EvaluatorConfig": 
base_url=model_cfg.get("base_url"), temperature=model_cfg.get("temperature", 0.0), max_tokens=model_cfg.get("max_tokens", 4096), + max_retries=model_cfg.get("max_retries", 3), ) def get_client(self): @@ -63,7 +67,7 @@ def get_client(self): openai.OpenAI for ollama/openrouter/openai/custom providers anthropic.Anthropic for the anthropic provider (native SDK) """ - if self._client: + if self._client is not None: return self._client if self.provider == "ollama": @@ -129,38 +133,100 @@ def get_client(self): def evaluate(self, prompt: str) -> dict: """Send prompt to LLM and return parsed evaluation result. + Retries on transient HTTP errors (500, 502, 503, 429) and network + errors up to self.max_retries times with exponential backoff + jitter. + Args: prompt: The evaluation prompt (with trace input/output embedded) Returns: dict with keys: "score" (float|bool|str) and "reasoning" (str) """ - client = self.get_client() - - if self.provider == "anthropic": - # Native Anthropic SDK uses client.messages.create() - response = client.messages.create( - model=self.model_name, - max_tokens=self.max_tokens, - temperature=self.temperature, - messages=[{"role": "user", "content": prompt}], - ) - content = response.content[0].text - else: - # All other providers use OpenAI-compatible chat.completions.create() - response = client.chat.completions.create( - model=self.model_name, - temperature=self.temperature, - max_tokens=self.max_tokens, - messages=[{"role": "user", "content": prompt}], - ) - msg = response.choices[0].message - content = msg.content - # Thinking models (e.g., Qwen3) may put output in reasoning field - if not content and hasattr(msg, "reasoning") and msg.reasoning: - content = msg.reasoning - - return _parse_evaluation_response(content) + _RETRYABLE_STATUS_CODES = {500, 502, 503, 429} + _BASE_DELAY = 1.0 + last_exc: Exception | None = None + + for attempt in range(self.max_retries + 1): + try: + client = self.get_client() + + if self.provider == "anthropic": + # 
Native Anthropic SDK uses client.messages.create() + response = client.messages.create( + model=self.model_name, + max_tokens=self.max_tokens, + temperature=self.temperature, + messages=[{"role": "user", "content": prompt}], + ) + content = response.content[0].text + else: + # All other providers use OpenAI-compatible chat.completions.create() + response = client.chat.completions.create( + model=self.model_name, + temperature=self.temperature, + max_tokens=self.max_tokens, + messages=[{"role": "user", "content": prompt}], + ) + msg = response.choices[0].message + content = msg.content + # Thinking models (e.g., Qwen3) may put output in reasoning field + if not content and hasattr(msg, "reasoning") and msg.reasoning: + content = msg.reasoning + + return _parse_evaluation_response(content) + + except (ConnectionError, TimeoutError, OSError) as exc: + last_exc = exc + if attempt >= self.max_retries: + break + delay = _BASE_DELAY * (2**attempt) + delay += random.uniform(0, 0.5 * delay) + logger.warning( + "Retry %d/%d: network error (%s), waiting %.2fs", + attempt + 1, + self.max_retries, + type(exc).__name__, + delay, + ) + time.sleep(delay) + + except Exception as exc: + last_exc = exc + status_code = getattr(exc, "status_code", None) + if status_code not in _RETRYABLE_STATUS_CODES: + raise + + if attempt >= self.max_retries: + break + + # For 429, honour Retry-After header if present + delay = _BASE_DELAY * (2**attempt) + if status_code == 429: + try: + retry_after = exc.response.headers.get("Retry-After") + if retry_after is not None: + delay = float(retry_after) + except (AttributeError, TypeError, ValueError): + pass # Retry-After may be an HTTP-date or malformed — fall back to backoff + + delay += random.uniform(0, 0.5 * delay) + logger.warning( + "Retry %d/%d: HTTP %s, waiting %.2fs", + attempt + 1, + self.max_retries, + status_code, + delay, + ) + time.sleep(delay) + + # All retries exhausted — reset client to force fresh connection next call + self._client = None + logger.error( + "Evaluator failed after %d retries. 
Last error: %s", + self.max_retries, + last_exc, + ) + raise last_exc def _parse_evaluation_response(content: str) -> dict: diff --git a/src/memory/evaluator/runner.py b/src/memory/evaluator/runner.py index 847d9a84..2bad2497 100644 --- a/src/memory/evaluator/runner.py +++ b/src/memory/evaluator/runner.py @@ -1,18 +1,26 @@ # LANGFUSE: V3 SDK ONLY. See LANGFUSE-INTEGRATION-SPEC.md # FORBIDDEN: Langfuse() constructor, start_span(), start_generation(), langfuse_context # REQUIRED: get_client(), create_score(), flush() -"""Evaluation runner — fetches traces, evaluates, attaches scores. +"""Evaluation runner — fetches traces and observations, evaluates, attaches scores. Core pipeline: 1. Load evaluator config + evaluator definitions - 2. Fetch traces using page-based pagination (V3 SDK) - 3. Filter traces by evaluator criteria (event_types, tags) - 4. Sample traces per evaluator's sampling_rate - 5. Evaluate each trace via configurable LLM judge - 6. Attach scores to Langfuse via create_score() (idempotent) - 7. Log each evaluation to audit log + 2. Per evaluator, read target: "trace" or "observation" (per-evaluator YAML field) + 3. Trace path (EV-05, EV-06): page-based pagination via trace.list() + — uses from_timestamp/to_timestamp/page/meta.total_pages (V3 trace API) + 4. Observation path (EV-01–EV-04): cursor-based pagination via observations.get_many() + — uses from_start_time/to_start_time/cursor/meta.next_cursor (V3 observation API, BUG-217) + 5. Filter observations by name (event_type) — server-side via name= parameter (Path B) + 6. Sample traces/observations per evaluator's sampling_rate + 7. Evaluate each trace/observation via configurable LLM judge + 8. Attach scores to Langfuse via create_score() (idempotent via score_id=) + 9. Log each evaluation to audit log + +Note: evaluation_targets in evaluator_config.yaml is DEPRECATED. + Use the per-evaluator target: field in each evaluator YAML instead. 
PLAN-012 Phase 2 — Section 5.4 +S-16.3: Observation-level evaluation + cursor-based observation pagination (BUG-217) """ import hashlib @@ -41,7 +49,7 @@ def get_client(): # type: ignore[misc] class EvaluatorRunner: - """Core evaluation pipeline for LLM-as-Judge scoring of Langfuse traces.""" + """Core evaluation pipeline for LLM-as-Judge scoring of Langfuse traces and observations.""" def __init__(self, config_path: str): """Load evaluator config and prepare runner. @@ -157,23 +165,64 @@ def _build_prompt(self, evaluator: dict, trace: Any) -> str: f"**Metadata**: {trace_metadata}" ) + def _build_observation_prompt(self, evaluator: dict, observation: Any) -> str: + """Build evaluation prompt from template + observation data. + + Do NOT pass observation objects to _build_prompt() — observation schema + differs from trace schema (e.g. no tags, name is event_type). + + Args: + evaluator: Evaluator definition dict + observation: Langfuse ObservationV2 object + + Returns: + Formatted evaluation prompt + """ + template = self._load_prompt(evaluator) + + obs_name = str(getattr(observation, "name", "") or "") + obs_input = str(getattr(observation, "input", "") or "")[:TRACE_CONTENT_MAX] + obs_output = str(getattr(observation, "output", "") or "")[:TRACE_CONTENT_MAX] + obs_metadata = str(getattr(observation, "metadata", {}) or {})[:TRACE_CONTENT_MAX] + + return ( + f"{template}\n\n" + f"## Observation to Evaluate\n\n" + f"**Event Type**: {obs_name}\n\n" + f"**Input**: {obs_input}\n\n" + f"**Output**: {obs_output}\n\n" + f"**Metadata**: {obs_metadata}" + ) + def _make_score_id( - self, trace_id: str, evaluator_name: str, since: datetime + self, + trace_id: str, + evaluator_name: str, + since: datetime, + observation_id: str | None = None, ) -> str: """Generate deterministic score ID for idempotency. - Uses MD5 of f"{trace_id}:{evaluator_name}:{since.isoformat()}" - so re-running the same evaluation window produces the same score ID. 
+ Includes observation_id in the hash when present — without it, multiple + observations in the same trace would produce IDENTICAL score_ids (BUG-S163). + + Seed format: + - With observation: f"{trace_id}:{observation_id}:{evaluator_name}:{since.isoformat()}" + - Without observation: f"{trace_id}:{evaluator_name}:{since.isoformat()}" Args: trace_id: Langfuse trace ID evaluator_name: Evaluator name (e.g., "retrieval_relevance") since: Start of evaluation window + observation_id: Optional Langfuse observation ID (for observation-level scores) Returns: 32-character hex MD5 string """ - seed = f"{trace_id}:{evaluator_name}:{since.isoformat()}" + if observation_id: + seed = f"{trace_id}:{observation_id}:{evaluator_name}:{since.isoformat()}" + else: + seed = f"{trace_id}:{evaluator_name}:{since.isoformat()}" return hashlib.md5(seed.encode()).hexdigest() def _append_audit_log(self, entry: dict) -> None: @@ -182,108 +231,272 @@ def _append_audit_log(self, entry: dict) -> None: with open(self.audit_log_path, "a") as f: f.write(json.dumps(entry) + "\n") - def run( + def _run_observation_evaluator( self, - evaluator_id: str | None = None, - *, + langfuse: Any, + evaluator: dict, + ev_name: str, since: datetime, - until: datetime | None = None, - dry_run: bool = False, - batch_size: int = 10, - ) -> dict: - """Run evaluations for all (or one) evaluator. + until: datetime, + dry_run: bool, + batch_size: int, + ) -> tuple[int, int, int, int]: + """Run observation-level evaluation for a single evaluator. + + Fetches observations via observations.get_many() with cursor-based pagination + (BUG-217). Filters by observation name (event_type) server-side via name= + parameter (Path B — tags are trace-level only in V3). 
Args: - evaluator_id: Optional evaluator ID to run (e.g., "EV-01") - since: Start of evaluation window (required — CLI provides this) - until: End of evaluation window (default: now) - dry_run: If True, evaluate but do not save scores to Langfuse - batch_size: Number of traces to fetch per page + langfuse: Langfuse client from get_client() + evaluator: Evaluator definition dict + ev_name: Evaluator name string + since: Start of evaluation window + until: End of evaluation window + dry_run: If True, evaluate but do not save scores + batch_size: Number of observations per cursor page Returns: - Summary dict with counts: fetched, sampled, evaluated, scored + Tuple of (fetched, sampled, evaluated, scored) """ - # V3 singleton — NEVER use Langfuse() constructor directly - langfuse = get_client() - - if until is None: - until = datetime.now(tz=timezone.utc) + ev_filter = evaluator.get("filter", {}) + sampling_rate = float(evaluator.get("sampling_rate", 1.0)) + score_type = evaluator.get("score_type", "NUMERIC") + event_types = ev_filter.get("event_types", []) - evaluators = self._load_evaluators(evaluator_id) - if not evaluators: - logger.warning( - "No evaluators found (dir=%s, id=%s)", self.evaluators_dir, evaluator_id - ) - return {"fetched": 0, "sampled": 0, "evaluated": 0, "scored": 0} - - total_fetched = 0 - total_sampled = 0 - total_evaluated = 0 - total_scored = 0 - - try: - for evaluator in evaluators: - ev_name = evaluator.get("name", evaluator.get("id", "unknown")) - sampling_rate = float(evaluator.get("sampling_rate", 1.0)) - ev_filter = evaluator.get("filter", {}) + # If no event_types defined, fetch all observations (no name filter). + # Otherwise iterate once per event_type using server-side name= filter. 
+ name_filters: list[str | None] = event_types if event_types else [None] - print(f"\n--- Running {evaluator.get('id', '?')}: {ev_name} ---") + fetched = sampled = evaluated = scored = 0 - page = 1 # V3 uses page-based pagination (1-indexed) + try: # R2-F3: catch fetch-level errors so one evaluator doesn't kill the whole run + for name_filter in name_filters: + cursor: str | None = None while True: - traces_response = langfuse.api.trace.list( - from_timestamp=since, - to_timestamp=until, - page=page, + # Cursor-based pagination — V3 observations API (BUG-217) + obs_response = langfuse.api.observations.get_many( + name=name_filter, + from_start_time=since, + to_start_time=until, + cursor=cursor, limit=batch_size, ) - traces = traces_response.data or [] - total_fetched += len(traces) - - for trace in traces: - # Filter first, then sample from matching traces only (UF-4) - if not self._matches_filter(trace, ev_filter): - continue + observations = obs_response.data or [] + fetched += len(observations) + for obs in observations: # Apply sampling if random.random() > sampling_rate: continue - total_sampled += 1 - # Skip traces with no output - if getattr(trace, "output", None) is None: + obs_id = obs.id + trace_id = getattr(obs, "trace_id", None) or "" + + # R2-F5: skip observations with no trace_id + if not trace_id: + logger.warning( + "Observation %s has no trace_id — skipping", obs_id + ) continue - # Build prompt and evaluate — isolated per trace (UF-3) + # R2-F7: check output BEFORE incrementing sampled + if getattr(obs, "output", None) is None: + continue + + # Note: 'sampled' counts observations that passed random gate + trace_id + output guards — not raw sampling rate + sampled += 1 + try: - prompt = self._build_prompt(evaluator, trace) + prompt = self._build_observation_prompt(evaluator, obs) result = self.evaluator_config.evaluate(prompt) if result.get("score") is None: logger.warning( - "Evaluator returned null score for trace %s", - trace.id, + "Evaluator 
returned null score for observation %s", obs_id ) continue - total_evaluated += 1 + evaluated += 1 - # Attach score to Langfuse (idempotent via score_id) + # Attach score (idempotent via score_id) if not dry_run: - score_id = self._make_score_id(trace.id, ev_name, since) + score_id = self._make_score_id( + trace_id, ev_name, since, observation_id=obs_id + ) + # CATEGORICAL scores use string value; NUMERIC/BOOLEAN use float + score_value = ( + str(result["score"]) + if score_type == "CATEGORICAL" + else result["score"] + ) + # R1-F3: validate categorical value against defined categories + if score_type == "CATEGORICAL": + valid_categories = evaluator.get("categories", []) + if valid_categories and str(result["score"]) not in valid_categories: + logger.warning( + "Observation %s: categorical value %r not in " + "categories %s — skipping", + obs_id, result["score"], valid_categories, + ) + continue langfuse.create_score( score_id=score_id, - trace_id=trace.id, + trace_id=trace_id, + observation_id=obs_id, name=ev_name, - value=result["score"], - data_type=evaluator.get("score_type", "NUMERIC"), + value=score_value, + data_type=score_type, comment=str(result.get("reasoning", ""))[ :TRACE_CONTENT_MAX ], ) - total_scored += 1 + scored += 1 + + # R2-F2: audit log only written when not dry_run + self._append_audit_log( + { + "timestamp": datetime.now(tz=timezone.utc).isoformat(), + "evaluator_id": evaluator.get("id"), + "evaluator_name": ev_name, + "trace_id": trace_id, + "observation_id": obs_id, + "score": result["score"], + "reasoning": str(result.get("reasoning", ""))[:500], + "dry_run": dry_run, + } + ) + + reasoning_preview = str(result.get("reasoning", ""))[:80] + print( + f" [{ev_name}] obs={obs_id[:8]}... trace={trace_id[:8]}... 
" + f"score={result['score']} | {reasoning_preview}" + ) + except Exception as exc: + logger.error( + "Error evaluating observation %s with %s: %s", + obs_id, + ev_name, + exc, + ) + continue + + # Cursor-based stop condition — no next cursor means last page + next_cursor = getattr(obs_response.meta, "next_cursor", None) # R2-F1 + if not next_cursor or not observations: + break + cursor = next_cursor + + except Exception as exc: + logger.error("Evaluator %s: error fetching observations: %s", ev_name, exc) - # Audit log + return fetched, sampled, evaluated, scored + + def _run_trace_evaluator( + self, + langfuse: Any, + evaluator: dict, + ev_name: str, + since: datetime, + until: datetime, + dry_run: bool, + batch_size: int, + ) -> tuple[int, int, int, int]: + """Run trace-level evaluation for a single evaluator. + + Fetches traces via trace.list() with page-based pagination. + Uses from_timestamp/to_timestamp/page/meta.total_pages (V3 trace API). + + Args: + langfuse: Langfuse client from get_client() + evaluator: Evaluator definition dict + ev_name: Evaluator name string + since: Start of evaluation window + until: End of evaluation window + dry_run: If True, evaluate but do not save scores + batch_size: Number of traces per page + + Returns: + Tuple of (fetched, sampled, evaluated, scored) + """ + ev_filter = evaluator.get("filter", {}) + sampling_rate = float(evaluator.get("sampling_rate", 1.0)) + score_type = evaluator.get("score_type", "NUMERIC") + + fetched = sampled = evaluated = scored = 0 + page = 1 # V3 trace.list() is page-based (1-indexed) + + try: # R2-F3: catch fetch-level errors so one evaluator doesn't kill the whole run + while True: + traces_response = langfuse.api.trace.list( + from_timestamp=since, + to_timestamp=until, + page=page, + limit=batch_size, + ) + traces = traces_response.data or [] + fetched += len(traces) + + for trace in traces: + # Filter first, then sample from matching traces only (UF-4) + if not self._matches_filter(trace, 
ev_filter): + continue + + # Apply sampling + if random.random() > sampling_rate: + continue + + # R2-F7: check output BEFORE incrementing sampled + if getattr(trace, "output", None) is None: + continue + + sampled += 1 + + # Build prompt and evaluate — isolated per trace (UF-3) + try: + prompt = self._build_prompt(evaluator, trace) + result = self.evaluator_config.evaluate(prompt) + if result.get("score") is None: + logger.warning( + "Evaluator returned null score for trace %s", + trace.id, + ) + continue + + evaluated += 1 + + # Attach score to Langfuse (idempotent via score_id) + if not dry_run: + score_id = self._make_score_id(trace.id, ev_name, since) + # CATEGORICAL scores use string value; NUMERIC/BOOLEAN use float + score_value = ( + str(result["score"]) + if score_type == "CATEGORICAL" + else result["score"] + ) + # R1-F3: validate categorical value against defined categories + if score_type == "CATEGORICAL": + valid_categories = evaluator.get("categories", []) + if valid_categories and str(result["score"]) not in valid_categories: + logger.warning( + "Trace %s: categorical value %r not in " + "categories %s — skipping", + trace.id, result["score"], valid_categories, + ) + continue + langfuse.create_score( + score_id=score_id, + trace_id=trace.id, + name=ev_name, + value=score_value, + data_type=score_type, + comment=str(result.get("reasoning", ""))[ + :TRACE_CONTENT_MAX + ], + ) + scored += 1 + + # R2-F2: audit log only written when not dry_run self._append_audit_log( { "timestamp": datetime.now( @@ -298,28 +511,99 @@ def run( } ) - reasoning_preview = str(result.get("reasoning", ""))[:80] - print( - f" [{ev_name}] trace={trace.id[:8]}... " - f"score={result['score']} | {reasoning_preview}" - ) - except Exception as exc: - logger.error( - "Error evaluating trace %s with %s: %s", - trace.id, - ev_name, - exc, - ) - continue + reasoning_preview = str(result.get("reasoning", ""))[:80] + print( + f" [{ev_name}] trace={trace.id[:8]}... 
" + f"score={result['score']} | {reasoning_preview}" + ) + except Exception as exc: + logger.error( + "Error evaluating trace %s with %s: %s", + trace.id, + ev_name, + exc, + ) + continue + + # Page-based pagination — advance or stop (V3 trace.list() API) + meta = getattr(traces_response, "meta", None) + total_pages = getattr(meta, "total_pages", 1) if meta else 1 + if page >= total_pages or not traces: + break + page += 1 + + except Exception as exc: + logger.error("Evaluator %s: error fetching traces: %s", ev_name, exc) + + return fetched, sampled, evaluated, scored - # Page-based pagination — advance or stop - meta = getattr(traces_response, "meta", None) - total_pages = getattr(meta, "total_pages", 1) if meta else 1 - if page >= total_pages or not traces: - break - page += 1 + def run( + self, + evaluator_id: str | None = None, + *, + since: datetime, + until: datetime | None = None, + dry_run: bool = False, + batch_size: int = 10, + ) -> dict: + """Run evaluations for all (or one) evaluator. + + Routes each evaluator to the correct path based on the per-evaluator + YAML target: field ("trace" or "observation"). The global evaluation_targets + section in evaluator_config.yaml is DEPRECATED in favour of per-evaluator target:. 
+ + Args: + evaluator_id: Optional evaluator ID to run (e.g., "EV-01") + since: Start of evaluation window (required — CLI provides this) + until: End of evaluation window (default: now) + dry_run: If True, evaluate but do not save scores to Langfuse + batch_size: Number of traces/observations to fetch per page/cursor + + Returns: + Summary dict with counts: fetched, sampled, evaluated, scored + """ + # V3 singleton — NEVER use Langfuse() constructor directly + langfuse = get_client() + + if until is None: + until = datetime.now(tz=timezone.utc) + + evaluators = self._load_evaluators(evaluator_id) + if not evaluators: + logger.warning( + "No evaluators found (dir=%s, id=%s)", self.evaluators_dir, evaluator_id + ) + return {"fetched": 0, "sampled": 0, "evaluated": 0, "scored": 0} + + total_fetched = 0 + total_sampled = 0 + total_evaluated = 0 + total_scored = 0 + + try: + for evaluator in evaluators: + ev_name = evaluator.get("name", evaluator.get("id", "unknown")) + # Per-evaluator target field is source of truth (DEPRECATED: global evaluation_targets) + target = evaluator.get("target", "trace") + + print(f"\n--- Running {evaluator.get('id', '?')}: {ev_name} (target={target}) ---") + + if target == "observation": + f, s, e, sc = self._run_observation_evaluator( + langfuse, evaluator, ev_name, since, until, dry_run, batch_size + ) + else: + # Default: trace-level evaluation (EV-05, EV-06) + f, s, e, sc = self._run_trace_evaluator( + langfuse, evaluator, ev_name, since, until, dry_run, batch_size + ) + + total_fetched += f + total_sampled += s + total_evaluated += e + total_scored += sc - print(f" Evaluated: {total_evaluated} | Scored: {total_scored}") + print(f" Evaluated: {e} | Scored: {sc}") finally: # Flush all buffered scores to Langfuse — runs even if an error occurs (UF-5) diff --git a/tests/test_create_score_configs.py b/tests/test_create_score_configs.py new file mode 100644 index 00000000..899c614a --- /dev/null +++ b/tests/test_create_score_configs.py @@ 
-0,0 +1,294 @@ +"""Tests for scripts/create_score_configs.py idempotency and dedup logic. + +Validates: + - Pre-check via list API skips configs that already exist + - New configs are created when absent + - --cleanup-duplicates removes extra copies, keeps oldest + - flush() is called before exit (V3 requirement) + +PLAN-012 Phase 2 — Section 5.6 (S-16.2) +""" + +import importlib +import os +import sys +import types +from contextlib import contextmanager +from unittest.mock import MagicMock + +import pytest + +# --------------------------------------------------------------------------- +# Path setup +# --------------------------------------------------------------------------- + +SCRIPTS_DIR = os.path.join(os.path.dirname(__file__), "..", "scripts") +sys.path.insert(0, os.path.abspath(SCRIPTS_DIR)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +ALL_CONFIG_NAMES = [ + "retrieval_relevance", + "bootstrap_quality", + "session_coherence", + "injection_value", + "capture_completeness", + "classification_accuracy", +] + + +def _make_config(name: str, cfg_id: str = "abc123", created_at: str = "2026-01-01T00:00:00Z"): + cfg = MagicMock() + cfg.name = name + cfg.id = cfg_id + cfg.created_at = created_at + return cfg + + +def _make_list_response(items: list, total_pages: int = 1): + response = MagicMock() + response.data = items + meta = MagicMock() + meta.total_pages = total_pages + response.meta = meta + return response + + +def _build_langfuse_modules(mock_client): + """Build the fake langfuse module tree needed for create_score_configs imports.""" + fake_langfuse = types.ModuleType("langfuse") + fake_langfuse.get_client = MagicMock(return_value=mock_client) + + fake_commons = types.ModuleType("langfuse.api.resources.commons.types") + fake_commons.ConfigCategory = MagicMock(side_effect=lambda **kw: kw) + fake_commons.ScoreConfigDataType = 
MagicMock() + fake_commons.ScoreConfigDataType.NUMERIC = "NUMERIC" + fake_commons.ScoreConfigDataType.BOOLEAN = "BOOLEAN" + fake_commons.ScoreConfigDataType.CATEGORICAL = "CATEGORICAL" + + fake_sc_types = types.ModuleType("langfuse.api.resources.score_configs.types") + fake_sc_types.CreateScoreConfigRequest = MagicMock(side_effect=lambda **kw: kw) + + modules = { + "langfuse": fake_langfuse, + "langfuse.api": MagicMock(), + "langfuse.api.resources": MagicMock(), + "langfuse.api.resources.commons": MagicMock(), + "langfuse.api.resources.commons.types": fake_commons, + "langfuse.api.resources.score_configs": MagicMock(), + "langfuse.api.resources.score_configs.types": fake_sc_types, + } + return modules + + +@contextmanager +def _patched_module(mock_client): + """Context manager: inject fake langfuse, import module, yield (mod, client).""" + sys.modules.pop("create_score_configs", None) + fake_modules = _build_langfuse_modules(mock_client) + + original = {k: sys.modules.get(k) for k in fake_modules} + sys.modules.update(fake_modules) + try: + mod = importlib.import_module("create_score_configs") + yield mod, mock_client + finally: + sys.modules.pop("create_score_configs", None) + for k, v in original.items(): + if v is None: + sys.modules.pop(k, None) + else: + sys.modules[k] = v + + +def _make_client_with_existing(existing_names: list[str], list_side_effect=None): + """Build a mock client where existing_names are already in Langfuse.""" + client = MagicMock() + configs = [_make_config(n) for n in existing_names] + if list_side_effect is not None: + client.api.score_configs.list.side_effect = list_side_effect + else: + client.api.score_configs.list.return_value = _make_list_response(configs) + client.api.score_configs.create.return_value = MagicMock() + client.api.score_configs.delete.return_value = None + client.flush.return_value = None + return client + + +# --------------------------------------------------------------------------- +# Tests: 
_fetch_existing_configs +# --------------------------------------------------------------------------- + + +class TestFetchExistingConfigs: + def test_returns_empty_when_no_configs(self): + client = _make_client_with_existing([]) + with _patched_module(client) as (mod, c): + result = mod._fetch_existing_configs(c) + assert result == {} + + def test_returns_configs_by_name(self): + client = _make_client_with_existing(["retrieval_relevance", "injection_value"]) + with _patched_module(client) as (mod, c): + result = mod._fetch_existing_configs(c) + assert "retrieval_relevance" in result + assert "injection_value" in result + + def test_detects_duplicates(self): + configs = [ + _make_config("retrieval_relevance", "id-1"), + _make_config("retrieval_relevance", "id-2"), + ] + client = MagicMock() + client.api.score_configs.list.return_value = _make_list_response(configs) + with _patched_module(client) as (mod, c): + result = mod._fetch_existing_configs(c) + assert len(result["retrieval_relevance"]) == 2 + + def test_handles_list_api_error_gracefully(self): + client = MagicMock() + client.api.score_configs.list.side_effect = RuntimeError("network error") + with _patched_module(client) as (mod, c): + result = mod._fetch_existing_configs(c) + assert result == {} + + +# --------------------------------------------------------------------------- +# Tests: _cleanup_duplicates +# --------------------------------------------------------------------------- + + +class TestCleanupDuplicates: + def test_skips_non_duplicates(self): + client = MagicMock() + with _patched_module(client) as (mod, c): + existing = {"retrieval_relevance": [_make_config("retrieval_relevance", "id-1")]} + mod._cleanup_duplicates(c, existing) + client.api.score_configs.delete.assert_not_called() + + def test_deletes_extra_copies_keeps_oldest(self): + client = MagicMock() + older = _make_config("retrieval_relevance", "id-old", "2026-01-01T00:00:00Z") + newer = _make_config("retrieval_relevance", "id-new", 
"2026-02-01T00:00:00Z") + with _patched_module(client) as (mod, c): + existing = {"retrieval_relevance": [older, newer]} + mod._cleanup_duplicates(c, existing) + # Newer (id-new) should be deleted; oldest (id-old) kept + client.api.score_configs.delete.assert_called_once_with("id-new") + + def test_handles_delete_error_gracefully(self): + client = MagicMock() + client.api.score_configs.delete.side_effect = RuntimeError("delete failed") + older = _make_config("retrieval_relevance", "id-old", "2026-01-01T00:00:00Z") + newer = _make_config("retrieval_relevance", "id-new", "2026-02-01T00:00:00Z") + with _patched_module(client) as (mod, c): + existing = {"retrieval_relevance": [older, newer]} + # Should not raise + mod._cleanup_duplicates(c, existing) + + def test_missing_created_at_sorts_last_and_gets_deleted(self): + """Config without created_at should sort last (treated as newest) and be deleted.""" + client = MagicMock() + client.api.score_configs.delete.return_value = None + # Config with a real date — should be kept as the "oldest" + with_date = _make_config("retrieval_relevance", "id-with-date", "2026-01-01T00:00:00Z") + # Config with no created_at attribute — fallback "9999-99-99" sorts last, gets deleted + no_date = MagicMock() + no_date.name = "retrieval_relevance" + no_date.id = "id-no-date" + del no_date.created_at # ensure getattr fallback is triggered + with _patched_module(client) as (mod, c): + existing = {"retrieval_relevance": [with_date, no_date]} + mod._cleanup_duplicates(c, existing) + # The one without created_at should be deleted; the dated one kept + c.api.score_configs.delete.assert_called_once_with("id-no-date") + + +# --------------------------------------------------------------------------- +# Tests: main() idempotency +# --------------------------------------------------------------------------- + + +def _run_main(client, argv=None): + """Run main() with injected fake langfuse and given argv.""" + if argv is None: + argv = 
["create_score_configs.py"] + orig_argv = sys.argv + sys.argv = argv + try: + with _patched_module(client) as (mod, c): + rc = mod.main() + return rc, c + finally: + sys.argv = orig_argv + + +class TestMainIdempotency: + def test_skips_all_when_all_exist(self): + client = _make_client_with_existing(ALL_CONFIG_NAMES) + rc, c = _run_main(client) + assert rc == 0 + c.api.score_configs.create.assert_not_called() + c.flush.assert_called_once() + + def test_creates_all_when_none_exist(self): + client = _make_client_with_existing([]) + rc, c = _run_main(client) + assert rc == 0 + # 6 configs: 3 NUMERIC + 2 BOOLEAN + 1 CATEGORICAL + assert c.api.score_configs.create.call_count == 6 + c.flush.assert_called_once() + + def test_creates_only_missing_configs(self): + existing = ["retrieval_relevance", "injection_value", "classification_accuracy"] + client = _make_client_with_existing(existing) + rc, c = _run_main(client) + assert rc == 0 + # 3 missing: bootstrap_quality, session_coherence, capture_completeness + assert c.api.score_configs.create.call_count == 3 + c.flush.assert_called_once() + + def test_flush_always_called_on_success(self): + client = _make_client_with_existing([]) + rc, c = _run_main(client) + assert rc == 0 + c.flush.assert_called_once() + + def test_cleanup_duplicates_flag_triggers_dedup(self): + older = _make_config("retrieval_relevance", "id-old", "2026-01-01T00:00:00Z") + newer = _make_config("retrieval_relevance", "id-new", "2026-02-01T00:00:00Z") + other_existing = [_make_config(n) for n in ALL_CONFIG_NAMES if n != "retrieval_relevance"] + + # After cleanup, deduped list returned + deduped = [_make_config("retrieval_relevance", "id-old")] + other_existing + + client = MagicMock() + client.api.score_configs.list.side_effect = [ + _make_list_response([older, newer] + other_existing), # initial fetch + _make_list_response(deduped), # post-cleanup fetch + ] + client.api.score_configs.create.return_value = MagicMock() + 
client.api.score_configs.delete.return_value = None + client.flush.return_value = None + + rc, c = _run_main(client, argv=["create_score_configs.py", "--cleanup-duplicates"]) + + assert rc == 0 + # Duplicate deleted + c.api.score_configs.delete.assert_called_once_with("id-new") + # All 6 configs present after cleanup — nothing to create + c.api.score_configs.create.assert_not_called() + c.flush.assert_called_once() + + def test_idempotent_on_repeated_runs(self): + """Second run with all configs present creates nothing.""" + client = _make_client_with_existing(ALL_CONFIG_NAMES) + # Run twice + rc1, _ = _run_main(client) + rc2, c = _run_main(client) + assert rc1 == 0 + assert rc2 == 0 + c.api.score_configs.create.assert_not_called() diff --git a/tests/test_evaluator_runner.py b/tests/test_evaluator_runner.py index 8c20477d..3cfd63d5 100644 --- a/tests/test_evaluator_runner.py +++ b/tests/test_evaluator_runner.py @@ -4,7 +4,17 @@ Tests trace filtering, sampling, score attachment, and pagination using mocked Langfuse client. 
+Covers: +- Trace-level evaluation path (EV-05, EV-06 style) +- Observation-level evaluation path (EV-01–EV-04 style) +- Cursor-based pagination for observations (BUG-217) +- Page-based pagination for traces +- CATEGORICAL score type (EV-04) +- observation_id in _make_score_id to prevent collisions (BUG-S163) +- Per-evaluator target: routing + PLAN-012 Phase 2 — AC-8 +S-16.3 """ import hashlib @@ -30,7 +40,7 @@ def minimal_config(tmp_path) -> Path: provider: ollama model_name: llama3.2:8b temperature: 0.0 - max_tokens: 512 + max_tokens: 4096 evaluators_dir: {evaluators_dir} audit: log_file: {audit_log} @@ -49,6 +59,89 @@ def runner(minimal_config) -> EvaluatorRunner: return EvaluatorRunner(config_path=str(minimal_config)) +@pytest.fixture() +def trace_evaluator_yaml(tmp_path, minimal_config) -> dict: + """Create a trace-level evaluator YAML (EV-05 style).""" + runner = EvaluatorRunner(config_path=str(minimal_config)) + ev_yaml = { + "id": "EV-05", + "name": "bootstrap_quality", + "score_type": "NUMERIC", + "target": "trace", + "filter": {"event_types": ["session_bootstrap"]}, + "sampling_rate": 1.0, + "prompt_file": "ev05_prompt.md", + } + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, exist_ok=True) + + ev_file = ev_dir / "ev05_bootstrap_quality.yaml" + import yaml + + ev_file.write_text(yaml.dump(ev_yaml)) + + prompt_file = ev_dir / "ev05_prompt.md" + prompt_file.write_text("Evaluate this bootstrap session.") + + return ev_yaml + + +@pytest.fixture() +def observation_evaluator_yaml(tmp_path, minimal_config) -> dict: + """Create an observation-level evaluator YAML (EV-01 style).""" + runner = EvaluatorRunner(config_path=str(minimal_config)) + ev_yaml = { + "id": "EV-01", + "name": "retrieval_relevance", + "score_type": "NUMERIC", + "target": "observation", + "filter": {"event_types": ["search_query", "context_retrieval"]}, + "sampling_rate": 1.0, + "prompt_file": "ev01_prompt.md", + } + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, 
exist_ok=True) + + ev_file = ev_dir / "ev01_retrieval_relevance.yaml" + import yaml + + ev_file.write_text(yaml.dump(ev_yaml)) + + prompt_file = ev_dir / "ev01_prompt.md" + prompt_file.write_text("Evaluate this retrieval.") + + return ev_yaml + + +@pytest.fixture() +def categorical_evaluator_yaml(tmp_path, minimal_config) -> dict: + """Create a CATEGORICAL observation-level evaluator (EV-04 style).""" + runner = EvaluatorRunner(config_path=str(minimal_config)) + ev_yaml = { + "id": "EV-04", + "name": "classification_accuracy", + "score_type": "CATEGORICAL", + "target": "observation", + "categories": ["correct", "partially_correct", "incorrect"], + "filter": {"event_types": ["9_classify"]}, + "sampling_rate": 1.0, + "prompt_file": "ev04_prompt.md", + } + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, exist_ok=True) + + ev_file = ev_dir / "ev04_classification_accuracy.yaml" + import yaml + + ev_file.write_text(yaml.dump(ev_yaml)) + + prompt_file = ev_dir / "ev04_prompt.md" + prompt_file.write_text("Evaluate this classification.") + + return ev_yaml + + +# For backward compat — existing tests use evaluator_yaml for a trace evaluator @pytest.fixture() def evaluator_yaml(tmp_path, minimal_config) -> dict: """Create a single evaluator YAML in the evaluators directory.""" @@ -57,7 +150,7 @@ def evaluator_yaml(tmp_path, minimal_config) -> dict: "id": "EV-01", "name": "retrieval_relevance", "score_type": "NUMERIC", - "target": "observation", + "target": "trace", "filter": {"tags": ["retrieval"]}, "sampling_rate": 1.0, "prompt_file": "ev01_prompt.md", @@ -94,7 +187,24 @@ def make_mock_trace( return trace +def make_mock_observation( + obs_id: str = "obs001", + trace_id: str = "trace001", + name: str = "search_query", + output: str | None = "retrieved context", +): + obs = MagicMock() + obs.id = obs_id + obs.trace_id = trace_id + obs.name = name + obs.input = "query text" + obs.output = output + obs.metadata = {} + return obs + + def 
make_paginated_response(traces: list, total_pages=1): + """Page-based response for trace.list().""" response = MagicMock() response.data = traces response.meta = MagicMock() @@ -102,6 +212,15 @@ def make_paginated_response(traces: list, total_pages=1): return response +def make_observation_response(observations: list, next_cursor: str | None = None): + """Cursor-based response for observations.get_many().""" + response = MagicMock() + response.data = observations + response.meta = MagicMock() + response.meta.next_cursor = next_cursor # R2-F1: V3 API uses next_cursor not cursor + return response + + # --------------------------------------------------------------------------- # Tests: _matches_filter # --------------------------------------------------------------------------- @@ -178,7 +297,7 @@ def test_returns_hex_string(self, runner): assert len(score_id) == 32 int(score_id, 16) # Must be valid hex - def test_matches_expected_md5(self, runner): + def test_matches_expected_md5_without_observation_id(self, runner): since = datetime(2026, 3, 13, 0, 0, 0, tzinfo=timezone.utc) expected = hashlib.md5( f"trace123:retrieval_relevance:{since.isoformat()}".encode() @@ -187,20 +306,51 @@ def test_matches_expected_md5(self, runner): runner._make_score_id("trace123", "retrieval_relevance", since) == expected ) + def test_observation_id_changes_score_id(self, runner): + """Two observations from the same trace must get different score IDs (BUG-S163).""" + since = datetime(2026, 3, 13, tzinfo=timezone.utc) + id_without = runner._make_score_id("trace123", "retrieval_relevance", since) + id_with = runner._make_score_id( + "trace123", "retrieval_relevance", since, observation_id="obs001" + ) + assert id_without != id_with + + def test_different_observation_ids_produce_different_scores(self, runner): + """Multiple observations in same trace must get unique score IDs.""" + since = datetime(2026, 3, 13, tzinfo=timezone.utc) + id1 = runner._make_score_id( + "trace123", 
"retrieval_relevance", since, observation_id="obs001" + ) + id2 = runner._make_score_id( + "trace123", "retrieval_relevance", since, observation_id="obs002" + ) + assert id1 != id2 + + def test_observation_id_md5_includes_observation_in_seed(self, runner): + since = datetime(2026, 3, 13, 0, 0, 0, tzinfo=timezone.utc) + expected = hashlib.md5( + f"trace123:obs001:retrieval_relevance:{since.isoformat()}".encode() + ).hexdigest() + assert ( + runner._make_score_id( + "trace123", "retrieval_relevance", since, observation_id="obs001" + ) + == expected + ) + # --------------------------------------------------------------------------- -# Tests: run() — mocked Langfuse client +# Tests: run() — trace-level path (existing behaviour, no regression) # --------------------------------------------------------------------------- -class TestRunnerRun: +class TestRunnerRunTracePath: def test_run_with_no_evaluators_returns_zero_summary(self, runner): """When evaluators_dir is empty, run() returns all-zero summary.""" mock_langfuse = MagicMock() since = datetime(2026, 3, 12, tzinfo=timezone.utc) with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): - # evaluators_dir is empty (created by fixture but no YAMLs) result = runner.run(since=since) assert result == {"fetched": 0, "sampled": 0, "evaluated": 0, "scored": 0} @@ -232,6 +382,8 @@ def test_run_attaches_score_for_matching_trace(self, runner, evaluator_yaml): assert call_kwargs["trace_id"] == "trace001" assert call_kwargs["name"] == "retrieval_relevance" assert call_kwargs["value"] == 0.9 + # Trace-level score must NOT have observation_id + assert call_kwargs.get("observation_id") is None def test_run_dry_run_does_not_call_create_score(self, runner, evaluator_yaml): """dry_run=True should evaluate but NOT call create_score().""" @@ -270,16 +422,15 @@ def test_run_skips_trace_with_no_output(self, runner, evaluator_yaml): result = runner.run(since=since) assert result["evaluated"] == 0 + assert 
result["sampled"] == 0 # R2-F7: sampled not incremented for no-output traces mock_evaluator_config.evaluate.assert_not_called() - def test_run_uses_page_pagination(self, runner, evaluator_yaml): - """Should advance page number when more pages are available.""" + def test_run_uses_page_pagination_for_trace_path(self, runner, evaluator_yaml): + """Trace path must advance page number when more pages are available.""" trace1 = make_mock_trace(trace_id="trace_p1", tags=["retrieval"]) trace2 = make_mock_trace(trace_id="trace_p2", tags=["retrieval"]) mock_langfuse = MagicMock() - # First call returns trace1 with total_pages=2 - # Second call returns trace2 with total_pages=2 (page 2 of 2, stops) mock_langfuse.api.trace.list.side_effect = [ make_paginated_response([trace1], total_pages=2), make_paginated_response([trace2], total_pages=2), @@ -317,31 +468,32 @@ def test_run_calls_flush_after_all_evaluations(self, runner, evaluator_yaml): mock_langfuse.flush.assert_called_once() def test_run_filters_by_evaluator_id(self, runner, tmp_path): - """--evaluator EV-01 should only load EV-01 definition.""" + """--evaluator EV-05 should only load EV-05 definition.""" ev_dir = runner.evaluators_dir ev_dir.mkdir(parents=True, exist_ok=True) import yaml - # Create two evaluators - ev01 = { - "id": "EV-01", - "name": "retrieval_relevance", + ev05 = { + "id": "EV-05", + "name": "bootstrap_quality", "score_type": "NUMERIC", + "target": "trace", "filter": {}, "sampling_rate": 1.0, "prompt_file": "prompt.md", } - ev02 = { - "id": "EV-02", - "name": "injection_value", - "score_type": "BOOLEAN", + ev06 = { + "id": "EV-06", + "name": "session_coherence", + "score_type": "NUMERIC", + "target": "trace", "filter": {}, "sampling_rate": 1.0, "prompt_file": "prompt.md", } - (ev_dir / "ev01.yaml").write_text(yaml.dump(ev01)) - (ev_dir / "ev02.yaml").write_text(yaml.dump(ev02)) + (ev_dir / "ev05.yaml").write_text(yaml.dump(ev05)) + (ev_dir / "ev06.yaml").write_text(yaml.dump(ev06)) (ev_dir / 
"prompt.md").write_text("Evaluate this.") mock_langfuse = MagicMock() @@ -350,9 +502,9 @@ def test_run_filters_by_evaluator_id(self, runner, tmp_path): since = datetime(2026, 3, 12, tzinfo=timezone.utc) with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): - runner.run(evaluator_id="EV-01", since=since) + runner.run(evaluator_id="EV-05", since=since) - # api.trace.list should have been called once (for EV-01 only) + # trace.list should have been called once (for EV-05 only) assert mock_langfuse.api.trace.list.call_count == 1 def test_run_appends_to_audit_log(self, runner, evaluator_yaml, tmp_path): @@ -389,9 +541,7 @@ def test_run_uses_get_client_not_constructor(self, runner, evaluator_yaml): import memory.evaluator.runner as runner_module source = inspect.getsource(runner_module) - # Must import/use get_client assert "get_client" in source - # Must NOT instantiate Langfuse() directly (excluding comments and strings) non_comment_lines = [ line for line in source.splitlines() if not line.strip().startswith("#") ] @@ -400,7 +550,411 @@ def test_run_uses_get_client_not_constructor(self, runner, evaluator_yaml): # --------------------------------------------------------------------------- -# Tests: _load_prompt +# Tests: run() — observation-level path (S-16.3) +# --------------------------------------------------------------------------- + + +class TestRunnerRunObservationPath: + def test_observation_path_calls_observations_get_many( + self, runner, observation_evaluator_yaml + ): + """Observation target must call observations.get_many(), not trace.list().""" + obs = make_mock_observation(obs_id="obs001", trace_id="trace001") + mock_langfuse = MagicMock() + # Two event_types → two API calls (one per name_filter) + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.85, "reasoning": "Good"} + + since = 
datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + assert mock_langfuse.api.observations.get_many.call_count == 2 + mock_langfuse.api.trace.list.assert_not_called() + + def test_observation_score_includes_observation_id( + self, runner, observation_evaluator_yaml + ): + """create_score() for observation target must include observation_id.""" + obs = make_mock_observation(obs_id="obs_xyz", trace_id="trace_abc") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.9, "reasoning": ""} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + assert mock_langfuse.create_score.call_count >= 1 + call_kwargs = mock_langfuse.create_score.call_args.kwargs + assert call_kwargs["observation_id"] == "obs_xyz" + assert call_kwargs["trace_id"] == "trace_abc" + + def test_observation_path_skips_no_output(self, runner, observation_evaluator_yaml): + """Observations with output=None must be skipped.""" + obs = make_mock_observation(obs_id="obs_noout", output=None) + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + result = runner.run(since=since) + + assert result["evaluated"] == 0 + assert result["sampled"] == 0 # R2-F7: 
sampled not incremented for no-output obs + mock_evaluator_config.evaluate.assert_not_called() + + def test_observation_path_dry_run_no_score( + self, runner, observation_evaluator_yaml + ): + """dry_run=True on observation path must not call create_score().""" + obs = make_mock_observation() + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.7, "reasoning": ""} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + result = runner.run(since=since, dry_run=True) + + assert result["evaluated"] >= 1 + assert result["scored"] == 0 + mock_langfuse.create_score.assert_not_called() + + def test_observation_path_appends_audit_log_with_observation_id( + self, runner, observation_evaluator_yaml + ): + """Observation audit log entries must include observation_id.""" + obs = make_mock_observation(obs_id="obs_audit", trace_id="trace_audit") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.6, "reasoning": "ok"} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + lines = runner.audit_log_path.read_text().strip().splitlines() + # At least one audit entry + assert len(lines) >= 1 + entry = json.loads(lines[0]) + assert entry.get("observation_id") == "obs_audit" + assert entry.get("trace_id") == "trace_audit" + + +# 
--------------------------------------------------------------------------- +# Tests: Cursor-based pagination for observations (BUG-217) +# --------------------------------------------------------------------------- + + +class TestCursorPaginationObservations: + def test_cursor_pagination_fetches_all_pages( + self, runner, observation_evaluator_yaml + ): + """observations.get_many() must be called with next_cursor on subsequent pages.""" + obs1 = make_mock_observation(obs_id="obs_p1", trace_id="t1") + obs2 = make_mock_observation(obs_id="obs_p2", trace_id="t2") + + mock_langfuse = MagicMock() + # First call returns obs1 + next_cursor; second call returns obs2 + no cursor + mock_langfuse.api.observations.get_many.side_effect = [ + make_observation_response([obs1], next_cursor="cursor_abc"), + make_observation_response([obs2], next_cursor=None), + # Additional responses for the second event_type + make_observation_response([obs1], next_cursor="cursor_abc"), + make_observation_response([obs2], next_cursor=None), + ] + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.8, "reasoning": ""} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + result = runner.run(since=since) + + # Each event_type does 2 pages → 4 total calls for 2 event_types + assert mock_langfuse.api.observations.get_many.call_count == 4 + # Second call for first event_type must pass cursor + second_call_kwargs = mock_langfuse.api.observations.get_many.call_args_list[ + 1 + ].kwargs + assert second_call_kwargs.get("cursor") == "cursor_abc" + + def test_cursor_pagination_stops_when_no_next_cursor( + self, runner, observation_evaluator_yaml + ): + """Must stop fetching when meta.cursor is None.""" + obs = make_mock_observation(obs_id="obs_single") + + mock_langfuse = MagicMock() + # No next cursor → 
single page per event_type + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs], next_cursor=None) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.7, "reasoning": ""} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + # 2 event_types, 1 page each → 2 calls total + event_types = observation_evaluator_yaml["filter"]["event_types"] + assert mock_langfuse.api.observations.get_many.call_count == len(event_types) + + def test_observation_get_many_passes_name_filter( + self, runner, observation_evaluator_yaml + ): + """Each event_type must be passed as name= to observations.get_many().""" + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([]) + ) + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): + runner.run(since=since) + + called_names = [ + call.kwargs.get("name") + for call in mock_langfuse.api.observations.get_many.call_args_list + ] + event_types = observation_evaluator_yaml["filter"]["event_types"] + for et in event_types: + assert et in called_names, f"event_type {et!r} not passed as name= filter" + + def test_observation_get_many_passes_time_range( + self, runner, observation_evaluator_yaml + ): + """Must pass from_start_time/to_start_time to observations.get_many().""" + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([]) + ) + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + until = datetime(2026, 3, 13, tzinfo=timezone.utc) + + with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): + runner.run(since=since, until=until) + + 
first_call = mock_langfuse.api.observations.get_many.call_args_list[0].kwargs + assert first_call.get("from_start_time") == since + assert first_call.get("to_start_time") == until + + +# --------------------------------------------------------------------------- +# Tests: evaluation_target routing (target: field) +# --------------------------------------------------------------------------- + + +class TestEvaluationTargetRouting: + def test_trace_target_routes_to_trace_list(self, runner, trace_evaluator_yaml): + """Evaluator with target: trace must use trace.list().""" + mock_langfuse = MagicMock() + mock_langfuse.api.trace.list.return_value = make_paginated_response([]) + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): + runner.run(since=since) + + # R2-F8: use call_count instead of assert_called() for stronger assertion + assert mock_langfuse.api.trace.list.call_count == 1 + mock_langfuse.api.observations.get_many.assert_not_called() + + def test_observation_target_routes_to_observations_get_many( + self, runner, observation_evaluator_yaml + ): + """Evaluator with target: observation must use observations.get_many().""" + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([]) + ) + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): + runner.run(since=since) + + # R2-F8: 2 event_types in fixture → 2 calls to get_many + event_types = observation_evaluator_yaml["filter"]["event_types"] + assert mock_langfuse.api.observations.get_many.call_count == len(event_types) + mock_langfuse.api.trace.list.assert_not_called() + + def test_missing_target_defaults_to_trace(self, runner, tmp_path): + """Evaluator with no target: field defaults to trace path.""" + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, exist_ok=True) + + import 
yaml + + # No target: field + ev = { + "id": "EV-XX", + "name": "test_eval", + "score_type": "NUMERIC", + "filter": {}, + "sampling_rate": 1.0, + "prompt_file": "prompt.md", + } + (ev_dir / "evxx.yaml").write_text(yaml.dump(ev)) + (ev_dir / "prompt.md").write_text("Evaluate.") + + mock_langfuse = MagicMock() + mock_langfuse.api.trace.list.return_value = make_paginated_response([]) + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): + runner.run(since=since) + + assert mock_langfuse.api.trace.list.call_count == 1 + mock_langfuse.api.observations.get_many.assert_not_called() + + +# --------------------------------------------------------------------------- +# Tests: CATEGORICAL score type (EV-04) +# --------------------------------------------------------------------------- + + +class TestCategoricalScoreType: + def test_categorical_score_passes_string_value( + self, runner, categorical_evaluator_yaml + ): + """CATEGORICAL evaluators must pass string value to create_score().""" + obs = make_mock_observation(obs_id="obs_cat", trace_id="trace_cat") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = { + "score": "correct", + "reasoning": "Correct classification", + } + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + mock_langfuse.create_score.assert_called_once() + call_kwargs = mock_langfuse.create_score.call_args.kwargs + assert call_kwargs["value"] == "correct" + assert isinstance(call_kwargs["value"], str) + assert call_kwargs["data_type"] == "CATEGORICAL" + assert call_kwargs["observation_id"] == "obs_cat" + + def 
test_categorical_score_partially_correct( + self, runner, categorical_evaluator_yaml + ): + """CATEGORICAL 'partially_correct' must be passed as string.""" + obs = make_mock_observation(obs_id="obs_partial") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = { + "score": "partially_correct", + "reasoning": "Partial match", + } + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + call_kwargs = mock_langfuse.create_score.call_args.kwargs + assert call_kwargs["value"] == "partially_correct" + + def test_numeric_score_passes_float_value(self, runner, observation_evaluator_yaml): + """NUMERIC evaluators must pass float value (not string).""" + obs = make_mock_observation() + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = { + "score": 0.85, + "reasoning": "Relevant", + } + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + call_kwargs = mock_langfuse.create_score.call_args.kwargs + assert call_kwargs["value"] == 0.85 + assert isinstance(call_kwargs["value"], float) + + +# --------------------------------------------------------------------------- +# Tests: error isolation # --------------------------------------------------------------------------- @@ -430,13 +984,48 @@ def test_single_trace_error_does_not_kill_run(self, runner, evaluator_yaml): ): result = 
runner.run(since=since) - # Second trace must still be evaluated assert result["evaluated"] == 1 assert result["scored"] == 1 - # Error must be logged for the first trace + mock_logger.error.assert_called_once() + + def test_single_observation_error_does_not_kill_run( + self, runner, observation_evaluator_yaml + ): + """An exception on one observation must not prevent subsequent evaluations.""" + obs1 = make_mock_observation(obs_id="obs_err") + obs2 = make_mock_observation(obs_id="obs_ok") + + mock_langfuse = MagicMock() + # Both obs in same response for first event_type, empty for second + mock_langfuse.api.observations.get_many.side_effect = [ + make_observation_response([obs1, obs2]), + make_observation_response([]), + ] + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.side_effect = [ + Exception("judge crashed"), + {"score": 0.7, "reasoning": "OK"}, + ] + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + patch("memory.evaluator.runner.logger") as mock_logger, + ): + result = runner.run(since=since) + + assert result["evaluated"] == 1 mock_logger.error.assert_called_once() +# --------------------------------------------------------------------------- +# Tests: _load_prompt +# --------------------------------------------------------------------------- + + class TestLoadPrompt: def test_load_prompt_uses_actual_file_not_fallback(self, runner): """_load_prompt() should return the file contents, not the fallback stub.""" @@ -451,3 +1040,183 @@ def test_load_prompt_uses_actual_file_not_fallback(self, runner): result = runner._load_prompt(evaluator) assert result == prompt_content + + +# --------------------------------------------------------------------------- +# Tests: _build_observation_prompt +# --------------------------------------------------------------------------- + + +class 
TestBuildObservationPrompt: + def test_build_observation_prompt_includes_event_type(self, runner): + """_build_observation_prompt must include the observation name (event_type).""" + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, exist_ok=True) + (ev_dir / "prompt.md").write_text("Evaluate the following.") + + evaluator = {"prompt_file": "prompt.md"} + obs = make_mock_observation(name="search_query") + + result = runner._build_observation_prompt(evaluator, obs) + + assert "search_query" in result + assert "Observation to Evaluate" in result + + def test_build_observation_prompt_different_from_trace_prompt(self, runner): + """_build_observation_prompt must not produce 'Trace to Evaluate' header.""" + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, exist_ok=True) + (ev_dir / "prompt.md").write_text("Evaluate.") + + evaluator = {"prompt_file": "prompt.md"} + obs = make_mock_observation() + + result = runner._build_observation_prompt(evaluator, obs) + assert "Trace to Evaluate" not in result + assert "Observation to Evaluate" in result + + +# --------------------------------------------------------------------------- +# Tests: R2-F2 — dry_run does not write audit log +# --------------------------------------------------------------------------- + + +class TestDryRunAuditLog: + def test_dry_run_does_not_write_audit_log_trace(self, runner, evaluator_yaml): + """dry_run=True must NOT write audit log for trace path.""" + trace = make_mock_trace(trace_id="trace_dry", tags=["retrieval"]) + mock_langfuse = MagicMock() + mock_langfuse.api.trace.list.return_value = make_paginated_response([trace]) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.7, "reasoning": "ok"} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since, 
dry_run=True) + + assert not runner.audit_log_path.exists() + + def test_dry_run_does_not_write_audit_log_observation( + self, runner, observation_evaluator_yaml + ): + """dry_run=True must NOT write audit log for observation path.""" + obs = make_mock_observation(obs_id="obs_dry", trace_id="trace_dry") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.6, "reasoning": "ok"} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since, dry_run=True) + + assert not runner.audit_log_path.exists() + + +# --------------------------------------------------------------------------- +# Tests: R2-F5 — observation with no trace_id is skipped +# --------------------------------------------------------------------------- + + +class TestObservationNoTraceId: + def test_observation_with_no_trace_id_is_skipped( + self, runner, observation_evaluator_yaml + ): + """Observation with empty trace_id must be skipped with a warning.""" + obs = make_mock_observation(obs_id="obs_notrace", trace_id="") + obs.trace_id = "" + + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + patch("memory.evaluator.runner.logger") as mock_logger, + ): + result = runner.run(since=since) + + assert result["evaluated"] == 0 + assert result["scored"] == 0 + mock_evaluator_config.evaluate.assert_not_called() + mock_logger.warning.assert_called() 
+ + +# --------------------------------------------------------------------------- +# Tests: R1-F3 — invalid categorical value is skipped +# --------------------------------------------------------------------------- + + +class TestCategoricalValidation: + def test_invalid_categorical_value_skipped( + self, runner, categorical_evaluator_yaml + ): + """Categorical score not in categories list must be skipped with warning.""" + obs = make_mock_observation(obs_id="obs_badcat", trace_id="trace_badcat") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + # "unknown_label" is not in categories: ["correct", "partially_correct", "incorrect"] + mock_evaluator_config.evaluate.return_value = { + "score": "unknown_label", + "reasoning": "hallucinated category", + } + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + patch("memory.evaluator.runner.logger") as mock_logger, + ): + result = runner.run(since=since) + + # evaluated is incremented before validation, scored is not + assert result["scored"] == 0 + mock_langfuse.create_score.assert_not_called() + mock_logger.warning.assert_called() + + def test_valid_categorical_value_is_scored(self, runner, categorical_evaluator_yaml): + """Categorical score within categories list must be scored normally.""" + obs = make_mock_observation(obs_id="obs_goodcat", trace_id="trace_goodcat") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = { + "score": "correct", + "reasoning": "valid", + } + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", 
return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + result = runner.run(since=since) + + assert result["scored"] == 1 + mock_langfuse.create_score.assert_called() diff --git a/tests/test_monitoring_api.py b/tests/test_monitoring_api.py new file mode 100644 index 00000000..9b8d044a --- /dev/null +++ b/tests/test_monitoring_api.py @@ -0,0 +1,222 @@ +"""Unit tests for monitoring API log injection sanitization (S-16.6). + +Tests verify that: +- sanitize_log_input() strips log injection payloads (newlines, etc.) +- memory_id and collection path params are sanitized before logging +- API endpoint responses are unchanged for valid inputs + +NOTE: monitoring/main.py requires FastAPI/Qdrant deps that live only in the +monitoring Docker container. We mock those heavy deps at the sys.modules level +before importing main so these tests run in the project's normal venv. +""" + +import sys +from pathlib import Path +from unittest.mock import MagicMock, AsyncMock + +import pytest + + +# --------------------------------------------------------------------------- +# Stub out heavy monitoring-only deps so we can import monitoring/main.py +# without a FastAPI/Qdrant installation. 
+# --------------------------------------------------------------------------- +def _make_fastapi_stub(): + fastapi_mod = MagicMock() + + class _App: + def __init__(self, **kw): + pass + + def get(self, *a, **kw): + return lambda f: f + + def post(self, *a, **kw): + return lambda f: f + + def mount(self, *a, **kw): + pass + + fastapi_mod.FastAPI = _App + fastapi_mod.HTTPException = Exception + fastapi_mod.status.HTTP_503_SERVICE_UNAVAILABLE = 503 + fastapi_mod.status.HTTP_404_NOT_FOUND = 404 + return fastapi_mod + + +_STUBS = { + "fastapi": _make_fastapi_stub(), + "fastapi.responses": MagicMock(), + "prometheus_client": MagicMock(), + "pydantic": MagicMock(), + "qdrant_client": MagicMock(), + "qdrant_client.http": MagicMock(), + "qdrant_client.http.exceptions": MagicMock(), + # Stub memory package to prevent real imports (src/ is in pythonpath) + "memory": MagicMock(), + "memory.metrics": MagicMock(), + "memory.metrics_push": MagicMock(), + "memory.stats": MagicMock(), + "memory.warnings": MagicMock(), +} + +# Patch sys.modules BEFORE importing main +for _mod, _stub in _STUBS.items(): + if _mod not in sys.modules: + sys.modules[_mod] = _stub + +# Add monitoring dir to path +_monitoring_path = str(Path(__file__).parent.parent / "monitoring") +if _monitoring_path not in sys.path: + sys.path.insert(0, _monitoring_path) + +# Now import the function under test +import importlib +import types + +# Use importlib to import main with patched sys.modules already in place +if "main" in sys.modules: + del sys.modules["main"] + +import main as _monitoring_main # noqa: E402 (monitoring/main.py) +sanitize_log_input = _monitoring_main.sanitize_log_input + + +# --------------------------------------------------------------------------- +# Tests for sanitize_log_input +# --------------------------------------------------------------------------- +class TestSanitizeLogInput: + """Unit tests for the sanitize_log_input helper in monitoring/main.py.""" + + def 
test_newline_stripped(self): + """Newlines in user input must be escaped to prevent log injection.""" + result = sanitize_log_input("valid-id\nINJECTED: fake log entry") + assert "\n" not in result + + def test_carriage_return_stripped(self): + """Carriage returns must be escaped.""" + result = sanitize_log_input("valid-id\rINJECTED") + assert "\r" not in result + + def test_tab_escaped(self): + """Tabs must be escaped.""" + result = sanitize_log_input("id\twith\ttabs") + assert "\t" not in result + + def test_null_byte_stripped(self): + """Null bytes must be stripped.""" + result = sanitize_log_input("id\x00null") + assert "\x00" not in result + + def test_valid_uuid_unchanged(self): + """Valid UUID-style IDs should pass through cleanly.""" + memory_id = "550e8400-e29b-41d4-a716-446655440000" + result = sanitize_log_input(memory_id) + assert "550e8400" in result + assert "446655440000" in result + + def test_valid_collection_names_pass_through(self): + """Valid collection names should pass through cleanly.""" + for collection in ("code-patterns", "conventions", "discussions"): + result = sanitize_log_input(collection) + assert collection in result + + def test_truncation_at_default_max_length(self): + """Oversized input should be truncated to 200 chars.""" + result = sanitize_log_input("x" * 500) + assert len(result) <= 200 + + def test_custom_max_length_respected(self): + """Custom max_length should be respected.""" + result = sanitize_log_input("a" * 50, max_length=10) + assert len(result) <= 10 + + def test_non_string_coerced(self): + """Non-string values should be coerced to string safely.""" + result = sanitize_log_input(12345) + assert "12345" in result + + def test_multiline_injection_in_collection(self): + """Collection param with embedded newline must not produce raw newline.""" + result = sanitize_log_input("code-patterns\nINJECTED: severity=CRITICAL") + assert "\n" not in result + + def test_unicode_control_chars_stripped(self): + """Unicode control 
characters should be stripped.""" + payload = "id\u0000\u001f\u007f" + result = sanitize_log_input(payload) + for char in payload: + if not char.isprintable(): + assert char not in result + + def test_empty_string(self): + """Empty string input should return empty string.""" + assert sanitize_log_input("") == "" + + def test_exception_str_sanitized(self): + """Exception messages with injection payloads must be sanitized.""" + try: + raise ValueError("error\nINJECTED: severity=critical") + except ValueError as e: + result = sanitize_log_input(str(e)) + assert "\n" not in result + assert "error" in result + + def test_none_input(self): + assert sanitize_log_input(None) == "None" + + def test_multiline_memory_id_no_raw_newline(self): + """memory_id injection payload must not produce raw newline in sanitized output.""" + result = sanitize_log_input("evil-id\nINJECTED") + assert "\n" not in result + + def test_repr_escaping_preserves_printable_content(self): + """repr()-based escaping should keep the meaningful part of the value.""" + value = "abc123" + result = sanitize_log_input(value) + assert "abc123" in result + + +# --------------------------------------------------------------------------- +# Verify call-site sanitization is inline (not via intermediate variable) +# --------------------------------------------------------------------------- +class TestCallSiteSanitization: + """Verify that log call sites in monitoring/main.py sanitize inline. + + CodeQL requires sanitization to appear inline at the log call site rather + than through intermediate variables. We inspect the source to confirm + the pattern is correct. 
+ """ + + def _read_main_source(self) -> str: + main_path = Path(__file__).parent.parent / "monitoring" / "main.py" + return main_path.read_text() + + def test_str_e_always_wrapped_in_sanitize(self): + """Every str(e) used in a logger call must be wrapped with sanitize_log_input.""" + import re + source = self._read_main_source() + + # More precise: look for "error": str(e) that is NOT inside sanitize_log_input(...) + # We check that sanitize_log_input( appears immediately before str( + for match in re.finditer(r'"error":\s*(str\([^)]+\))', source): + str_pos = match.start(1) + sanitize_prefix = "sanitize_log_input(" + prefix = source[max(0, str_pos - len(sanitize_prefix)):str_pos] + assert prefix.endswith(sanitize_prefix), ( + f"Unsanitized str() found at log call site: {match.group(0)}" + ) + + def test_collection_name_in_extra_always_sanitized(self): + """Every collection_name used in logger extra dicts must be sanitized inline.""" + import re + source = self._read_main_source() + + # Find bare "collection": collection_name (not wrapped in sanitize_log_input) + for match in re.finditer(r'"collection":\s*(collection_name|request\.collection)', source): + value = match.group(1) + context_start = max(0, match.start() - 60) + context = source[context_start:match.end()] + assert "sanitize_log_input" in context, ( + f"Unsanitized collection_name found at log call site: ...{context}..." + ) diff --git a/tests/unit/test_evaluator_retry.py b/tests/unit/test_evaluator_retry.py new file mode 100644 index 00000000..dec2d951 --- /dev/null +++ b/tests/unit/test_evaluator_retry.py @@ -0,0 +1,222 @@ +# LANGFUSE: V3 SDK ONLY. 
See LANGFUSE-INTEGRATION-SPEC.md +"""Unit tests for S-16.4: Exponential backoff retry in EvaluatorConfig.evaluate().""" + +from unittest.mock import MagicMock, Mock, patch + +import pytest + +from memory.evaluator.provider import EvaluatorConfig + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_config(provider="ollama", max_retries=3) -> EvaluatorConfig: + cfg = EvaluatorConfig(provider=provider, max_retries=max_retries) + return cfg + + +def _http_error(status_code: int, retry_after: str | None = None): + """Create a mock exception that looks like an HTTP error.""" + exc = Exception(f"HTTP {status_code}") + exc.status_code = status_code + if retry_after is not None: + response = Mock() + response.headers = {"Retry-After": retry_after} + exc.response = response + return exc + + +def _good_openai_response(text: str = '{"score": 0.9, "reasoning": "ok"}'): + msg = Mock() + msg.content = text + msg.reasoning = None + choice = Mock() + choice.message = msg + resp = Mock() + resp.choices = [choice] + return resp + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + +class TestEvaluatorRetry: + + def test_success_on_first_attempt(self): + """Should succeed immediately with no retries.""" + cfg = _make_config(max_retries=3) + mock_client = MagicMock() + mock_client.chat.completions.create.return_value = _good_openai_response() + cfg._client = mock_client + + result = cfg.evaluate("test prompt") + + assert result["score"] == 0.9 + assert mock_client.chat.completions.create.call_count == 1 + + def test_retry_success_after_transient_500(self): + """Should retry on 500 and succeed on second attempt.""" + cfg = _make_config(max_retries=3) + mock_client = MagicMock() + call_count = 0 + + def side_effect(**kwargs): + nonlocal 
call_count + call_count += 1 + if call_count == 1: + raise _http_error(500) + return _good_openai_response() + + mock_client.chat.completions.create.side_effect = side_effect + cfg._client = mock_client + + with patch("memory.evaluator.provider.time.sleep"): + result = cfg.evaluate("test prompt") + + assert result["score"] == 0.9 + assert call_count == 2 + + def test_exponential_backoff_timing(self): + """Sleep durations should grow with attempt number (base * 2^n).""" + cfg = _make_config(max_retries=3) + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = _http_error(500) + cfg._client = mock_client + + sleep_calls = [] + + with patch("memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append): + with patch("memory.evaluator.provider.random.uniform", return_value=0.0): + with pytest.raises(Exception): + cfg.evaluate("test prompt") + + # With max_retries=3 we sleep 3 times (attempts 0,1,2) before final failure + assert len(sleep_calls) == 3 + # Base delays: 1*(2^0)=1, 1*(2^1)=2, 1*(2^2)=4 + assert sleep_calls[0] == pytest.approx(1.0) + assert sleep_calls[1] == pytest.approx(2.0) + assert sleep_calls[2] == pytest.approx(4.0) + + def test_max_retries_exhaustion_raises(self): + """Should raise the last exception after all retries are exhausted.""" + cfg = _make_config(max_retries=2) + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = _http_error(503) + cfg._client = mock_client + + with patch("memory.evaluator.provider.time.sleep"): + with pytest.raises(Exception, match="HTTP 503"): + cfg.evaluate("test prompt") + + assert mock_client.chat.completions.create.call_count == 3 # initial + 2 retries + + def test_non_retryable_400_fails_immediately(self): + """400 errors should not be retried.""" + cfg = _make_config(max_retries=3) + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = _http_error(400) + cfg._client = mock_client + + with pytest.raises(Exception, match="HTTP 400"): 
+ cfg.evaluate("test prompt") + + assert mock_client.chat.completions.create.call_count == 1 + + def test_429_with_retry_after_header(self): + """On 429, Retry-After header should override calculated backoff delay.""" + cfg = _make_config(max_retries=3) + mock_client = MagicMock() + call_count = 0 + + def side_effect(**kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + raise _http_error(429, retry_after="5") + return _good_openai_response() + + mock_client.chat.completions.create.side_effect = side_effect + cfg._client = mock_client + + sleep_calls = [] + + with patch("memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append): + with patch("memory.evaluator.provider.random.uniform", return_value=0.0): + result = cfg.evaluate("test prompt") + + assert result["score"] == 0.9 + # First sleep should use Retry-After=5 as base (plus zero jitter) + assert sleep_calls[0] == pytest.approx(5.0) + + def test_retry_on_connection_error(self): + """ConnectionError should trigger retry.""" + cfg = _make_config(max_retries=2) + mock_client = MagicMock() + call_count = 0 + + def side_effect(**kwargs): + nonlocal call_count + call_count += 1 + if call_count < 3: + raise ConnectionError("connection refused") + return _good_openai_response() + + mock_client.chat.completions.create.side_effect = side_effect + cfg._client = mock_client + + with patch("memory.evaluator.provider.time.sleep"): + result = cfg.evaluate("test prompt") + + assert result["score"] == 0.9 + assert call_count == 3 + + def test_retry_on_timeout_error(self): + """TimeoutError should trigger retry.""" + cfg = _make_config(max_retries=2) + mock_client = MagicMock() + call_count = 0 + + def side_effect(**kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + raise TimeoutError("timed out") + return _good_openai_response() + + mock_client.chat.completions.create.side_effect = side_effect + cfg._client = mock_client + + with 
patch("memory.evaluator.provider.time.sleep"): + result = cfg.evaluate("test prompt") + + assert result["score"] == 0.9 + assert call_count == 2 + + def test_client_reset_after_exhaustion(self): + """_client should be set to None after all retries are exhausted.""" + cfg = _make_config(max_retries=1) + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = _http_error(500) + cfg._client = mock_client + + with patch("memory.evaluator.provider.time.sleep"): + with pytest.raises(Exception): + cfg.evaluate("test prompt") + + assert cfg._client is None + + def test_client_reset_after_connection_error_exhaustion(self): + """_client should be set to None after ConnectionError retries are exhausted.""" + cfg = _make_config(max_retries=2) + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = ConnectionError("refused") + cfg._client = mock_client + + with patch("memory.evaluator.provider.time.sleep"): + with pytest.raises(ConnectionError): + cfg.evaluate("test prompt") + + assert cfg._client is None diff --git a/tests/unit/test_evaluator_scheduler.py b/tests/unit/test_evaluator_scheduler.py new file mode 100644 index 00000000..938f910b --- /dev/null +++ b/tests/unit/test_evaluator_scheduler.py @@ -0,0 +1,401 @@ +# LANGFUSE: V3 SDK ONLY. See LANGFUSE-INTEGRATION-SPEC.md +"""Unit tests for S-16.5: Evaluator scheduler daemon. 
+ +Tests: +- Cron schedule parsing via croniter +- lookback_hours calculation +- schedule.enabled=False skips evaluation +- Evaluation failure doesn't crash scheduler +- Health file written after successful run +""" + +import importlib.util +import sys +import time +from datetime import datetime, timedelta, timezone +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +# --------------------------------------------------------------------------- +# croniter availability guard +# --------------------------------------------------------------------------- +try: + import croniter as _croniter_pkg # noqa: F401 + + CRONITER_AVAILABLE = True +except ImportError: + CRONITER_AVAILABLE = False + +pytestmark_croniter = pytest.mark.skipif( + not CRONITER_AVAILABLE, reason="croniter not installed — add croniter>=2.0.0 to requirements.txt" +) + + +def _make_croniter_mock(next_run_dt: datetime): + """Build a minimal croniter sys.modules mock returning a fixed next_run datetime.""" + mock_instance = MagicMock() + mock_instance.get_next.return_value = next_run_dt + mock_cls = MagicMock(return_value=mock_instance) + mock_module = MagicMock() + mock_module.croniter = mock_cls + return mock_module, mock_cls, mock_instance + + +# --------------------------------------------------------------------------- +# Module loader helper +# --------------------------------------------------------------------------- + +_SCHEDULER_PATH = ( + Path(__file__).parent.parent.parent / "scripts/memory/evaluator_scheduler.py" +) + + +def _load_scheduler(): + """Load evaluator_scheduler module via importlib (not on sys.path as a package).""" + spec = importlib.util.spec_from_file_location("evaluator_scheduler", _SCHEDULER_PATH) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +# Load once for all tests +_mod = _load_scheduler() + + +# --------------------------------------------------------------------------- +# Test: 
load_schedule_config +# --------------------------------------------------------------------------- + + +class TestLoadScheduleConfig: + def test_parses_schedule_section(self, tmp_path): + """load_schedule_config returns the schedule dict from YAML.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: true\n" + " cron: '0 5 * * *'\n" + " lookback_hours: 24\n" + ) + result = _mod.load_schedule_config(str(config)) + assert result["enabled"] is True + assert result["cron"] == "0 5 * * *" + assert result["lookback_hours"] == 24 + + def test_returns_empty_dict_when_schedule_missing(self, tmp_path): + """Returns {} when schedule key is absent.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text("evaluator_model:\n provider: ollama\n") + result = _mod.load_schedule_config(str(config)) + assert result == {} + + def test_parses_disabled_schedule(self, tmp_path): + """enabled: false is returned correctly.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: false\n" + " cron: '0 3 * * *'\n" + " lookback_hours: 48\n" + ) + result = _mod.load_schedule_config(str(config)) + assert result["enabled"] is False + assert result["lookback_hours"] == 48 + + +# --------------------------------------------------------------------------- +# Test: cron schedule parsing + next run +# --------------------------------------------------------------------------- + + +class TestCronScheduleParsing: + @pytestmark_croniter + def test_croniter_returns_future_datetime(self): + """croniter.get_next() returns a datetime after 'now'.""" + from croniter import croniter # noqa: PLC0415 + + now = datetime(2026, 3, 14, 4, 0, 0, tzinfo=timezone.utc) + cron = croniter("0 5 * * *", now) + next_run = cron.get_next(datetime) + + # Should be 05:00 UTC on the same day + assert next_run > now + assert next_run.hour == 5 + assert next_run.minute == 0 + + @pytestmark_croniter + def 
test_croniter_sleep_seconds_positive(self): + """sleep_seconds should always be positive (next run is in the future).""" + from croniter import croniter # noqa: PLC0415 + + now = datetime(2026, 3, 14, 4, 59, 59, tzinfo=timezone.utc) + cron = croniter("0 5 * * *", now) + next_run = cron.get_next(datetime) + sleep_seconds = (next_run - now).total_seconds() + + assert sleep_seconds > 0 + assert sleep_seconds <= 60 # Less than 1 minute until 05:00 + + def test_lookback_hours_calculation(self): + """since = now - timedelta(hours=lookback_hours).""" + lookback_hours = 24 + now = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) + since = now - timedelta(hours=lookback_hours) + + expected = datetime(2026, 3, 13, 5, 0, 0, tzinfo=timezone.utc) + assert since == expected + + def test_lookback_hours_48(self): + """lookback_hours=48 looks back 2 days.""" + lookback_hours = 48 + now = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) + since = now - timedelta(hours=lookback_hours) + + expected = datetime(2026, 3, 12, 5, 0, 0, tzinfo=timezone.utc) + assert since == expected + + +# --------------------------------------------------------------------------- +# Test: schedule.enabled=False skips evaluation +# --------------------------------------------------------------------------- + + +class TestScheduleDisabled: + def test_enabled_false_does_not_call_runner(self, tmp_path): + """When schedule.enabled=False, EvaluatorRunner.run() is never called.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: false\n" + " cron: '0 5 * * *'\n" + " lookback_hours: 24\n" + ) + + # Reload module with fresh globals to avoid state bleed + mod = _load_scheduler() + mod._shutdown_requested = False + + call_count = 0 + + def fake_interruptible_sleep(seconds, chunk=5.0): + # After one sleep (schedule disabled), set shutdown to stop the loop + mod._shutdown_requested = True + + with patch.object(mod, "_interruptible_sleep", 
side_effect=fake_interruptible_sleep): + with patch.object(mod, "load_schedule_config", return_value={"enabled": False}): + with patch( + "memory.evaluator.EvaluatorRunner" + ) as mock_runner_cls: + mod.run_scheduler(str(config)) + mock_runner_cls.assert_not_called() + + +# --------------------------------------------------------------------------- +# Test: evaluation failure doesn't crash scheduler +# --------------------------------------------------------------------------- + + +class TestEvaluationFailure: + def test_evaluation_error_continues_to_next_cycle(self, tmp_path): + """If EvaluatorRunner.run() raises, the scheduler logs and continues.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: true\n" + " cron: '0 5 * * *'\n" + " lookback_hours: 24\n" + ) + + mod = _load_scheduler() + mod._shutdown_requested = False + + sleep_call_count = [0] + + def fake_interruptible_sleep(seconds, chunk=5.0): + sleep_call_count[0] += 1 + # Shut down after the second sleep (after one failed eval cycle) + if sleep_call_count[0] >= 2: + mod._shutdown_requested = True + + mock_runner = MagicMock() + mock_runner.run.side_effect = RuntimeError("LLM provider unavailable") + + _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) + mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) + + with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): + with patch.object(mod, "_interruptible_sleep", side_effect=fake_interruptible_sleep): + with patch.object( + mod, + "load_schedule_config", + return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, + ): + with patch.object(mod, "_touch_health_file") as mock_touch: + with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): + # Should NOT raise — errors are caught and logged + mod.run_scheduler(str(config)) + + # Runner was called at least once + mock_runner.run.assert_called() + # Health file should have been touched on startup only (not on 
failure) + assert mock_touch.call_count == 1 + + def test_evaluation_failure_does_not_update_health_file(self, tmp_path): + """Health file is NOT touched after a failed evaluation.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: true\n" + " cron: '* * * * *'\n" + " lookback_hours: 1\n" + ) + + mod = _load_scheduler() + mod._shutdown_requested = False + + sleep_calls = [0] + startup_touch_done = [False] + touch_after_startup = [0] + + def fake_sleep(seconds, chunk=5.0): + sleep_calls[0] += 1 + if sleep_calls[0] >= 2: + mod._shutdown_requested = True + + def fake_touch(): + if startup_touch_done[0]: + touch_after_startup[0] += 1 + startup_touch_done[0] = True + + mock_runner = MagicMock() + mock_runner.run.side_effect = Exception("eval error") + + _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) + mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) + + with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): + with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): + with patch.object( + mod, + "load_schedule_config", + return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, + ): + with patch.object(mod, "_touch_health_file", side_effect=fake_touch): + with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): + mod.run_scheduler(str(config)) + + # Health file should NOT be touched after the failed evaluation + assert touch_after_startup[0] == 0 + + +# --------------------------------------------------------------------------- +# Test: health file written after successful run +# --------------------------------------------------------------------------- + + +class TestHealthFile: + def test_health_file_touched_on_startup(self, tmp_path): + """Health file is touched on daemon startup before any evaluation.""" + mod = _load_scheduler() + mod._shutdown_requested = False + + touch_calls = [] + + def fake_touch(): + touch_calls.append("touch") + + 
def fake_sleep(seconds, chunk=5.0): + # Immediately shut down after first sleep + mod._shutdown_requested = True + + with patch.object(mod, "_touch_health_file", side_effect=fake_touch): + with patch.object( + mod, + "load_schedule_config", + return_value={"enabled": False}, + ): + with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): + mod.run_scheduler(str(tmp_path / "evaluator_config.yaml")) + + # At minimum, touched on startup + assert len(touch_calls) >= 1 + + def test_health_file_touched_after_successful_run(self, tmp_path): + """Health file is touched after a successful evaluation run.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: true\n" + " cron: '* * * * *'\n" + " lookback_hours: 1\n" + ) + + mod = _load_scheduler() + mod._shutdown_requested = False + + touch_calls = [0] + sleep_calls = [0] + + def fake_touch(): + touch_calls[0] += 1 + + def fake_sleep(seconds, chunk=5.0): + sleep_calls[0] += 1 + # Shut down after second sleep (after one successful eval cycle) + if sleep_calls[0] >= 2: + mod._shutdown_requested = True + + mock_runner = MagicMock() + mock_runner.run.return_value = { + "fetched": 10, + "sampled": 5, + "evaluated": 5, + "scored": 5, + } + + _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) + mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) + + with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): + with patch.object(mod, "_touch_health_file", side_effect=fake_touch): + with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): + with patch.object( + mod, + "load_schedule_config", + return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, + ): + with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): + mod.run_scheduler(str(config)) + + # Touched on startup + once after successful run = at least 2 calls + assert touch_calls[0] >= 2 + mock_runner.run.assert_called_once() + + def 
test_health_file_actual_touch(self, tmp_path): + """_touch_health_file() creates the file on disk.""" + health_file = tmp_path / "evaluator-scheduler.health" + + mod = _load_scheduler() + original_health = mod.HEALTH_FILE + mod.HEALTH_FILE = health_file + + try: + assert not health_file.exists() + mod._touch_health_file() + assert health_file.exists() + finally: + mod.HEALTH_FILE = original_health + + def test_health_file_touch_graceful_on_error(self, tmp_path): + """_touch_health_file() does not raise even if file path is unwritable.""" + mod = _load_scheduler() + original_health = mod.HEALTH_FILE + mod.HEALTH_FILE = Path("/nonexistent_dir/evaluator-scheduler.health") + + try: + # Should not raise + mod._touch_health_file() + finally: + mod.HEALTH_FILE = original_health From 92f7902ed2a9dc4d5b6a7361e3b17ec5780f5ad8 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 01:34:51 -0700 Subject: [PATCH 02/13] style: fix lint errors (ruff, black, isort) in evaluator test files Co-Authored-By: Claude Opus 4.6 (1M context) --- monitoring/main.py | 26 ++++-- scripts/create_score_configs.py | 18 +++- scripts/memory/evaluator_scheduler.py | 10 +- src/memory/evaluator/runner.py | 47 +++++++--- tests/test_create_score_configs.py | 28 ++++-- tests/test_evaluator_runner.py | 21 +++-- tests/test_monitoring_api.py | 32 +++---- tests/unit/test_evaluator_retry.py | 55 +++++++---- tests/unit/test_evaluator_scheduler.py | 124 +++++++++++++++---------- 9 files changed, 231 insertions(+), 130 deletions(-) diff --git a/monitoring/main.py b/monitoring/main.py index d792efe8..64ddc287 100755 --- a/monitoring/main.py +++ b/monitoring/main.py @@ -11,9 +11,8 @@ import asyncio import logging import os -from typing import Any, Dict, Optional - import warnings +from typing import Any, Dict, Optional from fastapi import FastAPI, HTTPException, status from fastapi.responses import JSONResponse, Response @@ -244,10 +243,15 @@ async def update_metrics_periodically(): except Exception 
as e: logger.warning( "metrics_update_failed", - extra={"collection": sanitize_log_input(collection_name), "error": sanitize_log_input(str(e))}, + extra={ + "collection": sanitize_log_input(collection_name), + "error": sanitize_log_input(str(e)), + }, ) except Exception as e: - logger.error("metrics_updater_error", extra={"error": sanitize_log_input(str(e))}) + logger.error( + "metrics_updater_error", extra={"error": sanitize_log_input(str(e))} + ) await asyncio.sleep(60) # Update every 60 seconds @@ -364,7 +368,10 @@ async def health(): except Exception as e: logger.warning( "stats_check_failed", - extra={"collection": sanitize_log_input(collection_name), "error": sanitize_log_input(str(e))}, + extra={ + "collection": sanitize_log_input(collection_name), + "error": sanitize_log_input(str(e)), + }, ) # Determine status based on warnings @@ -426,7 +433,9 @@ async def readiness(): logger.info("readiness_check_passed", extra={"qdrant_available": True}) return {"status": "ready", "qdrant_available": True} except Exception as e: - logger.warning("readiness_check_failed", extra={"error": sanitize_log_input(str(e))}) + logger.warning( + "readiness_check_failed", extra={"error": sanitize_log_input(str(e))} + ) return JSONResponse( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"status": "not_ready", "qdrant_available": False}, @@ -481,7 +490,10 @@ async def get_memory(memory_id: str, collection: str = "code-patterns"): except UnexpectedResponse as e: logger.error( "qdrant_error", - extra={"error": sanitize_log_input(str(e)), "memory_id": sanitize_log_input(memory_id)}, + extra={ + "error": sanitize_log_input(str(e)), + "memory_id": sanitize_log_input(memory_id), + }, ) raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Qdrant unavailable" diff --git a/scripts/create_score_configs.py b/scripts/create_score_configs.py index 5e5732f9..06cfc2e6 100644 --- a/scripts/create_score_configs.py +++ b/scripts/create_score_configs.py @@ -21,8 
+21,8 @@ PLAN-012 Phase 2 — Section 5.6 """ -import sys import argparse +import sys def _fetch_existing_configs(langfuse) -> dict[str, list]: @@ -66,7 +66,9 @@ def _cleanup_duplicates(langfuse, existing: dict[str, list]) -> None: continue # Sort by created_at ascending, keep first, delete the rest try: - sorted_configs = sorted(configs, key=lambda c: getattr(c, "created_at", "9999-99-99")) + sorted_configs = sorted( + configs, key=lambda c: getattr(c, "created_at", "9999-99-99") + ) except Exception: sorted_configs = configs @@ -79,11 +81,15 @@ def _cleanup_duplicates(langfuse, existing: dict[str, list]) -> None: langfuse.api.score_configs.delete(cfg_id) print(f" [DEL] Removed duplicate config '{name}' id={cfg_id}") except Exception as exc: - print(f" [WARN] Could not delete duplicate '{name}' id={cfg_id}: {exc}") + print( + f" [WARN] Could not delete duplicate '{name}' id={cfg_id}: {exc}" + ) def main() -> int: - parser = argparse.ArgumentParser(description="Create Langfuse score configs idempotently") + parser = argparse.ArgumentParser( + description="Create Langfuse score configs idempotently" + ) parser.add_argument( "--cleanup-duplicates", action="store_true", @@ -106,7 +112,9 @@ def main() -> int: print("Fetching existing Score Configs from Langfuse...") existing = _fetch_existing_configs(langfuse) if existing: - print(f" Found {sum(len(v) for v in existing.values())} existing config(s): {list(existing.keys())}") + print( + f" Found {sum(len(v) for v in existing.values())} existing config(s): {list(existing.keys())}" + ) else: print(" No existing configs found.") diff --git a/scripts/memory/evaluator_scheduler.py b/scripts/memory/evaluator_scheduler.py index 10250105..182152c8 100644 --- a/scripts/memory/evaluator_scheduler.py +++ b/scripts/memory/evaluator_scheduler.py @@ -69,7 +69,7 @@ def _touch_health_file() -> None: # ============================================================================= -def _handle_sigterm(signum, frame) -> None: # noqa: ARG001 
+def _handle_sigterm(signum, frame) -> None: """SIGTERM handler — request graceful shutdown.""" global _shutdown_requested logger.info("sigterm_received — requesting graceful shutdown") @@ -81,7 +81,7 @@ def _register_langfuse_shutdown() -> None: def _langfuse_shutdown() -> None: try: - from langfuse import get_client # noqa: PLC0415 + from langfuse import get_client client = get_client() if client: @@ -107,7 +107,7 @@ def load_schedule_config(config_path: str) -> dict: Returns: Schedule config dict with keys: enabled, cron, lookback_hours """ - import yaml # noqa: PLC0415 + import yaml with open(config_path) as f: config = yaml.safe_load(f) @@ -174,7 +174,7 @@ def run_scheduler(config_path: str = CONFIG_PATH) -> None: lookback_hours = int(schedule.get("lookback_hours", 24)) # Calculate next run time using croniter - from croniter import croniter # noqa: PLC0415 + from croniter import croniter now = datetime.now(tz=timezone.utc) try: @@ -222,7 +222,7 @@ def run_scheduler(config_path: str = CONFIG_PATH) -> None: ) try: - from memory.evaluator import EvaluatorRunner # noqa: PLC0415 + from memory.evaluator import EvaluatorRunner runner = EvaluatorRunner(config_path) summary = runner.run(since=since) diff --git a/src/memory/evaluator/runner.py b/src/memory/evaluator/runner.py index 2bad2497..a370ce29 100644 --- a/src/memory/evaluator/runner.py +++ b/src/memory/evaluator/runner.py @@ -7,9 +7,9 @@ 1. Load evaluator config + evaluator definitions 2. Per evaluator, read target: "trace" or "observation" (per-evaluator YAML field) 3. Trace path (EV-05, EV-06): page-based pagination via trace.list() - — uses from_timestamp/to_timestamp/page/meta.total_pages (V3 trace API) - 4. Observation path (EV-01–EV-04): cursor-based pagination via observations.get_many() - — uses from_start_time/to_start_time/cursor/meta.next_cursor (V3 observation API, BUG-217) + - uses from_timestamp/to_timestamp/page/meta.total_pages (V3 trace API) + 4. 
Observation path (EV-01-EV-04): cursor-based pagination via observations.get_many() + - uses from_start_time/to_start_time/cursor/meta.next_cursor (V3 observation API, BUG-217) 5. Filter observations by name (event_type) — server-side via name= parameter (Path B) 6. Sample traces/observations per evaluator's sampling_rate 7. Evaluate each trace/observation via configurable LLM judge @@ -183,7 +183,9 @@ def _build_observation_prompt(self, evaluator: dict, observation: Any) -> str: obs_name = str(getattr(observation, "name", "") or "") obs_input = str(getattr(observation, "input", "") or "")[:TRACE_CONTENT_MAX] obs_output = str(getattr(observation, "output", "") or "")[:TRACE_CONTENT_MAX] - obs_metadata = str(getattr(observation, "metadata", {}) or {})[:TRACE_CONTENT_MAX] + obs_metadata = str(getattr(observation, "metadata", {}) or {})[ + :TRACE_CONTENT_MAX + ] return ( f"{template}\n\n" @@ -313,7 +315,8 @@ def _run_observation_evaluator( result = self.evaluator_config.evaluate(prompt) if result.get("score") is None: logger.warning( - "Evaluator returned null score for observation %s", obs_id + "Evaluator returned null score for observation %s", + obs_id, ) continue @@ -333,11 +336,16 @@ def _run_observation_evaluator( # R1-F3: validate categorical value against defined categories if score_type == "CATEGORICAL": valid_categories = evaluator.get("categories", []) - if valid_categories and str(result["score"]) not in valid_categories: + if ( + valid_categories + and str(result["score"]) not in valid_categories + ): logger.warning( "Observation %s: categorical value %r not in " "categories %s — skipping", - obs_id, result["score"], valid_categories, + obs_id, + result["score"], + valid_categories, ) continue langfuse.create_score( @@ -356,13 +364,17 @@ def _run_observation_evaluator( # R2-F2: audit log only written when not dry_run self._append_audit_log( { - "timestamp": datetime.now(tz=timezone.utc).isoformat(), + "timestamp": datetime.now( + tz=timezone.utc + 
).isoformat(), "evaluator_id": evaluator.get("id"), "evaluator_name": ev_name, "trace_id": trace_id, "observation_id": obs_id, "score": result["score"], - "reasoning": str(result.get("reasoning", ""))[:500], + "reasoning": str(result.get("reasoning", ""))[ + :500 + ], "dry_run": dry_run, } ) @@ -382,7 +394,9 @@ def _run_observation_evaluator( continue # Cursor-based stop condition — no next cursor means last page - next_cursor = getattr(obs_response.meta, "next_cursor", None) # R2-F1 + next_cursor = getattr( + obs_response.meta, "next_cursor", None + ) # R2-F1 if not next_cursor or not observations: break cursor = next_cursor @@ -477,11 +491,16 @@ def _run_trace_evaluator( # R1-F3: validate categorical value against defined categories if score_type == "CATEGORICAL": valid_categories = evaluator.get("categories", []) - if valid_categories and str(result["score"]) not in valid_categories: + if ( + valid_categories + and str(result["score"]) not in valid_categories + ): logger.warning( "Trace %s: categorical value %r not in " "categories %s — skipping", - trace.id, result["score"], valid_categories, + trace.id, + result["score"], + valid_categories, ) continue langfuse.create_score( @@ -586,7 +605,9 @@ def run( # Per-evaluator target field is source of truth (DEPRECATED: global evaluation_targets) target = evaluator.get("target", "trace") - print(f"\n--- Running {evaluator.get('id', '?')}: {ev_name} (target={target}) ---") + print( + f"\n--- Running {evaluator.get('id', '?')}: {ev_name} (target={target}) ---" + ) if target == "observation": f, s, e, sc = self._run_observation_evaluator( diff --git a/tests/test_create_score_configs.py b/tests/test_create_score_configs.py index 899c614a..3d506506 100644 --- a/tests/test_create_score_configs.py +++ b/tests/test_create_score_configs.py @@ -16,8 +16,6 @@ from contextlib import contextmanager from unittest.mock import MagicMock -import pytest - # --------------------------------------------------------------------------- # 
Path setup # --------------------------------------------------------------------------- @@ -40,7 +38,9 @@ ] -def _make_config(name: str, cfg_id: str = "abc123", created_at: str = "2026-01-01T00:00:00Z"): +def _make_config( + name: str, cfg_id: str = "abc123", created_at: str = "2026-01-01T00:00:00Z" +): cfg = MagicMock() cfg.name = name cfg.id = cfg_id @@ -165,7 +165,9 @@ class TestCleanupDuplicates: def test_skips_non_duplicates(self): client = MagicMock() with _patched_module(client) as (mod, c): - existing = {"retrieval_relevance": [_make_config("retrieval_relevance", "id-1")]} + existing = { + "retrieval_relevance": [_make_config("retrieval_relevance", "id-1")] + } mod._cleanup_duplicates(c, existing) client.api.score_configs.delete.assert_not_called() @@ -194,7 +196,9 @@ def test_missing_created_at_sorts_last_and_gets_deleted(self): client = MagicMock() client.api.score_configs.delete.return_value = None # Config with a real date — should be kept as the "oldest" - with_date = _make_config("retrieval_relevance", "id-with-date", "2026-01-01T00:00:00Z") + with_date = _make_config( + "retrieval_relevance", "id-with-date", "2026-01-01T00:00:00Z" + ) # Config with no created_at attribute — fallback "9999-99-99" sorts last, gets deleted no_date = MagicMock() no_date.name = "retrieval_relevance" @@ -260,21 +264,25 @@ def test_flush_always_called_on_success(self): def test_cleanup_duplicates_flag_triggers_dedup(self): older = _make_config("retrieval_relevance", "id-old", "2026-01-01T00:00:00Z") newer = _make_config("retrieval_relevance", "id-new", "2026-02-01T00:00:00Z") - other_existing = [_make_config(n) for n in ALL_CONFIG_NAMES if n != "retrieval_relevance"] + other_existing = [ + _make_config(n) for n in ALL_CONFIG_NAMES if n != "retrieval_relevance" + ] # After cleanup, deduped list returned - deduped = [_make_config("retrieval_relevance", "id-old")] + other_existing + deduped = [_make_config("retrieval_relevance", "id-old"), *other_existing] client = 
MagicMock() client.api.score_configs.list.side_effect = [ - _make_list_response([older, newer] + other_existing), # initial fetch - _make_list_response(deduped), # post-cleanup fetch + _make_list_response([older, newer, *other_existing]), # initial fetch + _make_list_response(deduped), # post-cleanup fetch ] client.api.score_configs.create.return_value = MagicMock() client.api.score_configs.delete.return_value = None client.flush.return_value = None - rc, c = _run_main(client, argv=["create_score_configs.py", "--cleanup-duplicates"]) + rc, c = _run_main( + client, argv=["create_score_configs.py", "--cleanup-duplicates"] + ) assert rc == 0 # Duplicate deleted diff --git a/tests/test_evaluator_runner.py b/tests/test_evaluator_runner.py index 3cfd63d5..ec90fa02 100644 --- a/tests/test_evaluator_runner.py +++ b/tests/test_evaluator_runner.py @@ -6,7 +6,7 @@ Covers: - Trace-level evaluation path (EV-05, EV-06 style) -- Observation-level evaluation path (EV-01–EV-04 style) +- Observation-level evaluation path (EV-01-EV-04 style) - Cursor-based pagination for observations (BUG-217) - Page-based pagination for traces - CATEGORICAL score type (EV-04) @@ -422,7 +422,9 @@ def test_run_skips_trace_with_no_output(self, runner, evaluator_yaml): result = runner.run(since=since) assert result["evaluated"] == 0 - assert result["sampled"] == 0 # R2-F7: sampled not incremented for no-output traces + assert ( + result["sampled"] == 0 + ) # R2-F7: sampled not incremented for no-output traces mock_evaluator_config.evaluate.assert_not_called() def test_run_uses_page_pagination_for_trace_path(self, runner, evaluator_yaml): @@ -567,7 +569,10 @@ def test_observation_path_calls_observations_get_many( ) mock_evaluator_config = MagicMock() - mock_evaluator_config.evaluate.return_value = {"score": 0.85, "reasoning": "Good"} + mock_evaluator_config.evaluate.return_value = { + "score": 0.85, + "reasoning": "Good", + } since = datetime(2026, 3, 12, tzinfo=timezone.utc) @@ -624,7 +629,9 @@ def 
test_observation_path_skips_no_output(self, runner, observation_evaluator_ya result = runner.run(since=since) assert result["evaluated"] == 0 - assert result["sampled"] == 0 # R2-F7: sampled not incremented for no-output obs + assert ( + result["sampled"] == 0 + ) # R2-F7: sampled not incremented for no-output obs mock_evaluator_config.evaluate.assert_not_called() def test_observation_path_dry_run_no_score( @@ -713,7 +720,7 @@ def test_cursor_pagination_fetches_all_pages( patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), patch.object(runner, "evaluator_config", mock_evaluator_config), ): - result = runner.run(since=since) + _result = runner.run(since=since) # Each event_type does 2 pages → 4 total calls for 2 event_types assert mock_langfuse.api.observations.get_many.call_count == 4 @@ -1196,7 +1203,9 @@ def test_invalid_categorical_value_skipped( mock_langfuse.create_score.assert_not_called() mock_logger.warning.assert_called() - def test_valid_categorical_value_is_scored(self, runner, categorical_evaluator_yaml): + def test_valid_categorical_value_is_scored( + self, runner, categorical_evaluator_yaml + ): """Categorical score within categories list must be scored normally.""" obs = make_mock_observation(obs_id="obs_goodcat", trace_id="trace_goodcat") mock_langfuse = MagicMock() diff --git a/tests/test_monitoring_api.py b/tests/test_monitoring_api.py index 9b8d044a..9bb88819 100644 --- a/tests/test_monitoring_api.py +++ b/tests/test_monitoring_api.py @@ -12,9 +12,7 @@ import sys from pathlib import Path -from unittest.mock import MagicMock, AsyncMock - -import pytest +from unittest.mock import MagicMock # --------------------------------------------------------------------------- @@ -71,14 +69,12 @@ def mount(self, *a, **kw): sys.path.insert(0, _monitoring_path) # Now import the function under test -import importlib -import types - # Use importlib to import main with patched sys.modules already in place if "main" in sys.modules: del 
sys.modules["main"] import main as _monitoring_main # noqa: E402 (monitoring/main.py) + sanitize_log_input = _monitoring_main.sanitize_log_input @@ -195,6 +191,7 @@ def _read_main_source(self) -> str: def test_str_e_always_wrapped_in_sanitize(self): """Every str(e) used in a logger call must be wrapped with sanitize_log_input.""" import re + source = self._read_main_source() # More precise: look for "error": str(e) that is NOT inside sanitize_log_input(...) @@ -202,21 +199,24 @@ def test_str_e_always_wrapped_in_sanitize(self): for match in re.finditer(r'"error":\s*(str\([^)]+\))', source): str_pos = match.start(1) sanitize_prefix = "sanitize_log_input(" - prefix = source[max(0, str_pos - len(sanitize_prefix)):str_pos] - assert prefix.endswith(sanitize_prefix), ( - f"Unsanitized str() found at log call site: {match.group(0)}" - ) + prefix = source[max(0, str_pos - len(sanitize_prefix)) : str_pos] + assert prefix.endswith( + sanitize_prefix + ), f"Unsanitized str() found at log call site: {match.group(0)}" def test_collection_name_in_extra_always_sanitized(self): """Every collection_name used in logger extra dicts must be sanitized inline.""" import re + source = self._read_main_source() # Find bare "collection": collection_name (not wrapped in sanitize_log_input) - for match in re.finditer(r'"collection":\s*(collection_name|request\.collection)', source): - value = match.group(1) + for match in re.finditer( + r'"collection":\s*(collection_name|request\.collection)', source + ): + _value = match.group(1) context_start = max(0, match.start() - 60) - context = source[context_start:match.end()] - assert "sanitize_log_input" in context, ( - f"Unsanitized collection_name found at log call site: ...{context}..." - ) + context = source[context_start : match.end()] + assert ( + "sanitize_log_input" in context + ), f"Unsanitized collection_name found at log call site: ...{context}..." 
diff --git a/tests/unit/test_evaluator_retry.py b/tests/unit/test_evaluator_retry.py index dec2d951..5a478701 100644 --- a/tests/unit/test_evaluator_retry.py +++ b/tests/unit/test_evaluator_retry.py @@ -7,11 +7,11 @@ from memory.evaluator.provider import EvaluatorConfig - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- + def _make_config(provider="ollama", max_retries=3) -> EvaluatorConfig: cfg = EvaluatorConfig(provider=provider, max_retries=max_retries) return cfg @@ -19,7 +19,7 @@ def _make_config(provider="ollama", max_retries=3) -> EvaluatorConfig: def _http_error(status_code: int, retry_after: str | None = None): """Create a mock exception that looks like an HTTP error.""" - exc = Exception(f"HTTP {status_code}") + exc = RuntimeError(f"HTTP {status_code}") exc.status_code = status_code if retry_after is not None: response = Mock() @@ -43,6 +43,7 @@ def _good_openai_response(text: str = '{"score": 0.9, "reasoning": "ok"}'): # Tests # --------------------------------------------------------------------------- + class TestEvaluatorRetry: def test_success_on_first_attempt(self): @@ -88,10 +89,14 @@ def test_exponential_backoff_timing(self): sleep_calls = [] - with patch("memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append): - with patch("memory.evaluator.provider.random.uniform", return_value=0.0): - with pytest.raises(Exception): - cfg.evaluate("test prompt") + with ( + patch( + "memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append + ), + patch("memory.evaluator.provider.random.uniform", return_value=0.0), + pytest.raises(RuntimeError), + ): + cfg.evaluate("test prompt") # With max_retries=3 we sleep 3 times (attempts 0,1,2) before final failure assert len(sleep_calls) == 3 @@ -107,11 +112,15 @@ def test_max_retries_exhaustion_raises(self): mock_client.chat.completions.create.side_effect = _http_error(503) 
cfg._client = mock_client - with patch("memory.evaluator.provider.time.sleep"): - with pytest.raises(Exception, match="HTTP 503"): - cfg.evaluate("test prompt") + with ( + patch("memory.evaluator.provider.time.sleep"), + pytest.raises(Exception, match="HTTP 503"), + ): + cfg.evaluate("test prompt") - assert mock_client.chat.completions.create.call_count == 3 # initial + 2 retries + assert ( + mock_client.chat.completions.create.call_count == 3 + ) # initial + 2 retries def test_non_retryable_400_fails_immediately(self): """400 errors should not be retried.""" @@ -143,9 +152,13 @@ def side_effect(**kwargs): sleep_calls = [] - with patch("memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append): - with patch("memory.evaluator.provider.random.uniform", return_value=0.0): - result = cfg.evaluate("test prompt") + with ( + patch( + "memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append + ), + patch("memory.evaluator.provider.random.uniform", return_value=0.0), + ): + result = cfg.evaluate("test prompt") assert result["score"] == 0.9 # First sleep should use Retry-After=5 as base (plus zero jitter) @@ -202,9 +215,11 @@ def test_client_reset_after_exhaustion(self): mock_client.chat.completions.create.side_effect = _http_error(500) cfg._client = mock_client - with patch("memory.evaluator.provider.time.sleep"): - with pytest.raises(Exception): - cfg.evaluate("test prompt") + with ( + patch("memory.evaluator.provider.time.sleep"), + pytest.raises(RuntimeError), + ): + cfg.evaluate("test prompt") assert cfg._client is None @@ -215,8 +230,10 @@ def test_client_reset_after_connection_error_exhaustion(self): mock_client.chat.completions.create.side_effect = ConnectionError("refused") cfg._client = mock_client - with patch("memory.evaluator.provider.time.sleep"): - with pytest.raises(ConnectionError): - cfg.evaluate("test prompt") + with ( + patch("memory.evaluator.provider.time.sleep"), + pytest.raises(ConnectionError), + ): + cfg.evaluate("test 
prompt") assert cfg._client is None diff --git a/tests/unit/test_evaluator_scheduler.py b/tests/unit/test_evaluator_scheduler.py index 938f910b..16fb69c1 100644 --- a/tests/unit/test_evaluator_scheduler.py +++ b/tests/unit/test_evaluator_scheduler.py @@ -11,7 +11,6 @@ import importlib.util import sys -import time from datetime import datetime, timedelta, timezone from pathlib import Path from unittest.mock import MagicMock, patch @@ -29,7 +28,8 @@ CRONITER_AVAILABLE = False pytestmark_croniter = pytest.mark.skipif( - not CRONITER_AVAILABLE, reason="croniter not installed — add croniter>=2.0.0 to requirements.txt" + not CRONITER_AVAILABLE, + reason="croniter not installed — add croniter>=2.0.0 to requirements.txt", ) @@ -54,7 +54,9 @@ def _make_croniter_mock(next_run_dt: datetime): def _load_scheduler(): """Load evaluator_scheduler module via importlib (not on sys.path as a package).""" - spec = importlib.util.spec_from_file_location("evaluator_scheduler", _SCHEDULER_PATH) + spec = importlib.util.spec_from_file_location( + "evaluator_scheduler", _SCHEDULER_PATH + ) mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) return mod @@ -114,7 +116,7 @@ class TestCronScheduleParsing: @pytestmark_croniter def test_croniter_returns_future_datetime(self): """croniter.get_next() returns a datetime after 'now'.""" - from croniter import croniter # noqa: PLC0415 + from croniter import croniter now = datetime(2026, 3, 14, 4, 0, 0, tzinfo=timezone.utc) cron = croniter("0 5 * * *", now) @@ -128,7 +130,7 @@ def test_croniter_returns_future_datetime(self): @pytestmark_croniter def test_croniter_sleep_seconds_positive(self): """sleep_seconds should always be positive (next run is in the future).""" - from croniter import croniter # noqa: PLC0415 + from croniter import croniter now = datetime(2026, 3, 14, 4, 59, 59, tzinfo=timezone.utc) cron = croniter("0 5 * * *", now) @@ -177,19 +179,21 @@ def test_enabled_false_does_not_call_runner(self, tmp_path): mod = 
_load_scheduler() mod._shutdown_requested = False - call_count = 0 + _call_count = 0 def fake_interruptible_sleep(seconds, chunk=5.0): # After one sleep (schedule disabled), set shutdown to stop the loop mod._shutdown_requested = True - with patch.object(mod, "_interruptible_sleep", side_effect=fake_interruptible_sleep): - with patch.object(mod, "load_schedule_config", return_value={"enabled": False}): - with patch( - "memory.evaluator.EvaluatorRunner" - ) as mock_runner_cls: - mod.run_scheduler(str(config)) - mock_runner_cls.assert_not_called() + with ( + patch.object( + mod, "_interruptible_sleep", side_effect=fake_interruptible_sleep + ), + patch.object(mod, "load_schedule_config", return_value={"enabled": False}), + patch("memory.evaluator.EvaluatorRunner") as mock_runner_cls, + ): + mod.run_scheduler(str(config)) + mock_runner_cls.assert_not_called() # --------------------------------------------------------------------------- @@ -225,17 +229,25 @@ def fake_interruptible_sleep(seconds, chunk=5.0): _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) - with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): - with patch.object(mod, "_interruptible_sleep", side_effect=fake_interruptible_sleep): - with patch.object( - mod, - "load_schedule_config", - return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, - ): - with patch.object(mod, "_touch_health_file") as mock_touch: - with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): - # Should NOT raise — errors are caught and logged - mod.run_scheduler(str(config)) + with ( + patch.dict(sys.modules, {"croniter": mock_croniter_mod}), + patch.object( + mod, "_interruptible_sleep", side_effect=fake_interruptible_sleep + ), + patch.object( + mod, + "load_schedule_config", + return_value={ + "enabled": True, + "cron": "* * * * *", + "lookback_hours": 1, + }, + ), + patch.object(mod, "_touch_health_file") as 
mock_touch, + patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner), + ): + # Should NOT raise — errors are caught and logged + mod.run_scheduler(str(config)) # Runner was called at least once mock_runner.run.assert_called() @@ -275,16 +287,22 @@ def fake_touch(): _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) - with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): - with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): - with patch.object( - mod, - "load_schedule_config", - return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, - ): - with patch.object(mod, "_touch_health_file", side_effect=fake_touch): - with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): - mod.run_scheduler(str(config)) + with ( + patch.dict(sys.modules, {"croniter": mock_croniter_mod}), + patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep), + patch.object( + mod, + "load_schedule_config", + return_value={ + "enabled": True, + "cron": "* * * * *", + "lookback_hours": 1, + }, + ), + patch.object(mod, "_touch_health_file", side_effect=fake_touch), + patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner), + ): + mod.run_scheduler(str(config)) # Health file should NOT be touched after the failed evaluation assert touch_after_startup[0] == 0 @@ -310,14 +328,16 @@ def fake_sleep(seconds, chunk=5.0): # Immediately shut down after first sleep mod._shutdown_requested = True - with patch.object(mod, "_touch_health_file", side_effect=fake_touch): - with patch.object( + with ( + patch.object(mod, "_touch_health_file", side_effect=fake_touch), + patch.object( mod, "load_schedule_config", return_value={"enabled": False}, - ): - with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): - mod.run_scheduler(str(tmp_path / "evaluator_config.yaml")) + ), + patch.object(mod, "_interruptible_sleep", 
side_effect=fake_sleep), + ): + mod.run_scheduler(str(tmp_path / "evaluator_config.yaml")) # At minimum, touched on startup assert len(touch_calls) >= 1 @@ -358,16 +378,22 @@ def fake_sleep(seconds, chunk=5.0): _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) - with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): - with patch.object(mod, "_touch_health_file", side_effect=fake_touch): - with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): - with patch.object( - mod, - "load_schedule_config", - return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, - ): - with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): - mod.run_scheduler(str(config)) + with ( + patch.dict(sys.modules, {"croniter": mock_croniter_mod}), + patch.object(mod, "_touch_health_file", side_effect=fake_touch), + patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep), + patch.object( + mod, + "load_schedule_config", + return_value={ + "enabled": True, + "cron": "* * * * *", + "lookback_hours": 1, + }, + ), + patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner), + ): + mod.run_scheduler(str(config)) # Touched on startup + once after successful run = at least 2 calls assert touch_calls[0] >= 2 From 9e20ca1aa04de965562ea22db7fd77a67ea4a02d Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 02:05:51 -0700 Subject: [PATCH 03/13] fix(installer): always copy requirements.txt and pyproject.toml on update Option 1 (add-project) and copy_files() both skipped requirements.txt if it already existed, preventing new dependencies like croniter from reaching Docker builds. Now always overwrites both files. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/install.sh | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index 9f5f83aa..6a36e337 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1049,6 +1049,17 @@ update_shared_scripts() { log_debug "Updated evaluators/ directory" fi + # Sync requirements.txt and pyproject.toml (needed by Docker builds) + # Always overwrite — new dependencies (e.g. croniter) must reach containers + if [[ -f "$SCRIPT_DIR/../requirements.txt" ]]; then + cp "$SCRIPT_DIR/../requirements.txt" "$INSTALL_DIR/requirements.txt" || log_warning "Failed to copy requirements.txt" + log_debug "Updated requirements.txt" + fi + if [[ -f "$SCRIPT_DIR/../pyproject.toml" ]]; then + cp "$SCRIPT_DIR/../pyproject.toml" "$INSTALL_DIR/pyproject.toml" || log_warning "Failed to copy pyproject.toml" + log_debug "Updated pyproject.toml" + fi + # Update templates (new templates added in updates must reach installed dir) if [[ -d "$SCRIPT_DIR/../templates" ]]; then mkdir -p "$INSTALL_DIR/templates" @@ -1334,12 +1345,13 @@ install_python_dependencies() { return 0 fi - # Copy pyproject.toml and requirements.txt to install directory if not already there - if [[ ! -f "$INSTALL_DIR/pyproject.toml" ]]; then + # Copy pyproject.toml and requirements.txt to install directory + # Always overwrite — updated deps (e.g. croniter) must reach Docker builds + if [[ -f "$source_dir/pyproject.toml" ]]; then cp "$source_dir/pyproject.toml" "$INSTALL_DIR/" log_debug "Copied pyproject.toml to $INSTALL_DIR" fi - if [[ -f "$source_dir/requirements.txt" && ! 
-f "$INSTALL_DIR/requirements.txt" ]]; then + if [[ -f "$source_dir/requirements.txt" ]]; then cp "$source_dir/requirements.txt" "$INSTALL_DIR/" log_debug "Copied requirements.txt to $INSTALL_DIR" fi From 7c936640d29271dabfc43aaff33b1d3c476eb583 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 02:14:17 -0700 Subject: [PATCH 04/13] fix(score-configs): use api.score_configs.get() not .list() for V3 SDK V3 SDK ScoreConfigsClient exposes get(page=, limit=) not list(). Also fixes test mocks to match. Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/create_score_configs.py | 4 ++-- tests/test_create_score_configs.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/create_score_configs.py b/scripts/create_score_configs.py index 06cfc2e6..446907f9 100644 --- a/scripts/create_score_configs.py +++ b/scripts/create_score_configs.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # LANGFUSE: V3 SDK ONLY. See LANGFUSE-INTEGRATION-SPEC.md # FORBIDDEN: Langfuse() constructor, start_span(), start_generation(), langfuse_context -# REQUIRED: get_client(), api.score_configs.create(), api.score_configs.list(), flush() +# REQUIRED: get_client(), api.score_configs.create(), api.score_configs.get(), flush() """Idempotent Score Config setup in Langfuse. Creates 6 Score Configs that enforce validation schemas on evaluation scores. 
@@ -35,7 +35,7 @@ def _fetch_existing_configs(langfuse) -> dict[str, list]: page = 1 while True: try: - response = langfuse.api.score_configs.list(page=page, limit=100) + response = langfuse.api.score_configs.get(page=page, limit=100) except Exception as exc: print(f" [WARN] Could not list existing score configs: {exc}") break diff --git a/tests/test_create_score_configs.py b/tests/test_create_score_configs.py index 3d506506..c15002ec 100644 --- a/tests/test_create_score_configs.py +++ b/tests/test_create_score_configs.py @@ -109,9 +109,9 @@ def _make_client_with_existing(existing_names: list[str], list_side_effect=None) client = MagicMock() configs = [_make_config(n) for n in existing_names] if list_side_effect is not None: - client.api.score_configs.list.side_effect = list_side_effect + client.api.score_configs.get.side_effect = list_side_effect else: - client.api.score_configs.list.return_value = _make_list_response(configs) + client.api.score_configs.get.return_value = _make_list_response(configs) client.api.score_configs.create.return_value = MagicMock() client.api.score_configs.delete.return_value = None client.flush.return_value = None @@ -143,14 +143,14 @@ def test_detects_duplicates(self): _make_config("retrieval_relevance", "id-2"), ] client = MagicMock() - client.api.score_configs.list.return_value = _make_list_response(configs) + client.api.score_configs.get.return_value = _make_list_response(configs) with _patched_module(client) as (mod, c): result = mod._fetch_existing_configs(c) assert len(result["retrieval_relevance"]) == 2 def test_handles_list_api_error_gracefully(self): client = MagicMock() - client.api.score_configs.list.side_effect = RuntimeError("network error") + client.api.score_configs.get.side_effect = RuntimeError("network error") with _patched_module(client) as (mod, c): result = mod._fetch_existing_configs(c) assert result == {} @@ -272,7 +272,7 @@ def test_cleanup_duplicates_flag_triggers_dedup(self): deduped = 
[_make_config("retrieval_relevance", "id-old"), *other_existing] client = MagicMock() - client.api.score_configs.list.side_effect = [ + client.api.score_configs.get.side_effect = [ _make_list_response([older, newer, *other_existing]), # initial fetch _make_list_response(deduped), # post-cleanup fetch ] From e0246c91a3971d15a98e9a313f9cce7cc515c9d6 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 02:21:44 -0700 Subject: [PATCH 05/13] fix(score-configs): archive duplicates instead of delete (V3 SDK has no delete) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Langfuse V3 API returns 405 on DELETE for score configs. Uses update(isArchived=True) instead — archived configs hidden in UI. Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/create_score_configs.py | 23 ++++++++++++++++------- tests/test_create_score_configs.py | 26 +++++++++++++++----------- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/scripts/create_score_configs.py b/scripts/create_score_configs.py index 446907f9..2d4fb516 100644 --- a/scripts/create_score_configs.py +++ b/scripts/create_score_configs.py @@ -60,11 +60,17 @@ def _fetch_existing_configs(langfuse) -> dict[str, list]: def _cleanup_duplicates(langfuse, existing: dict[str, list]) -> None: - """Delete duplicate score configs, keeping the oldest one (first created).""" + """Archive duplicate score configs, keeping the oldest one (first created). + + Langfuse API does not support DELETE on score configs (405). + Uses update(isArchived=True) instead — archived configs are hidden in UI. 
+ """ + from langfuse.api.resources.score_configs.types import UpdateScoreConfigRequest + for name, configs in existing.items(): if len(configs) <= 1: continue - # Sort by created_at ascending, keep first, delete the rest + # Sort by created_at ascending, keep first, archive the rest try: sorted_configs = sorted( configs, key=lambda c: getattr(c, "created_at", "9999-99-99") @@ -72,17 +78,20 @@ def _cleanup_duplicates(langfuse, existing: dict[str, list]) -> None: except Exception: sorted_configs = configs - to_delete = sorted_configs[1:] - for cfg in to_delete: + to_archive = sorted_configs[1:] + for cfg in to_archive: cfg_id = getattr(cfg, "id", None) if not cfg_id: continue try: - langfuse.api.score_configs.delete(cfg_id) - print(f" [DEL] Removed duplicate config '{name}' id={cfg_id}") + langfuse.api.score_configs.update( + cfg_id, + request=UpdateScoreConfigRequest(isArchived=True), + ) + print(f" [ARCHIVED] Duplicate config '{name}' id={cfg_id}") except Exception as exc: print( - f" [WARN] Could not delete duplicate '{name}' id={cfg_id}: {exc}" + f" [WARN] Could not archive duplicate '{name}' id={cfg_id}: {exc}" ) diff --git a/tests/test_create_score_configs.py b/tests/test_create_score_configs.py index c15002ec..f2e8686d 100644 --- a/tests/test_create_score_configs.py +++ b/tests/test_create_score_configs.py @@ -71,6 +71,7 @@ def _build_langfuse_modules(mock_client): fake_sc_types = types.ModuleType("langfuse.api.resources.score_configs.types") fake_sc_types.CreateScoreConfigRequest = MagicMock(side_effect=lambda **kw: kw) + fake_sc_types.UpdateScoreConfigRequest = MagicMock(side_effect=lambda **kw: kw) modules = { "langfuse": fake_langfuse, @@ -113,7 +114,7 @@ def _make_client_with_existing(existing_names: list[str], list_side_effect=None) else: client.api.score_configs.get.return_value = _make_list_response(configs) client.api.score_configs.create.return_value = MagicMock() - client.api.score_configs.delete.return_value = None + 
client.api.score_configs.update.return_value = None client.flush.return_value = None return client @@ -169,7 +170,7 @@ def test_skips_non_duplicates(self): "retrieval_relevance": [_make_config("retrieval_relevance", "id-1")] } mod._cleanup_duplicates(c, existing) - client.api.score_configs.delete.assert_not_called() + client.api.score_configs.update.assert_not_called() def test_deletes_extra_copies_keeps_oldest(self): client = MagicMock() @@ -178,12 +179,13 @@ def test_deletes_extra_copies_keeps_oldest(self): with _patched_module(client) as (mod, c): existing = {"retrieval_relevance": [older, newer]} mod._cleanup_duplicates(c, existing) - # Newer (id-new) should be deleted; oldest (id-old) kept - client.api.score_configs.delete.assert_called_once_with("id-new") + # Newer (id-new) should be archived; oldest (id-old) kept + client.api.score_configs.update.assert_called_once() + assert client.api.score_configs.update.call_args[0][0] == "id-new" def test_handles_delete_error_gracefully(self): client = MagicMock() - client.api.score_configs.delete.side_effect = RuntimeError("delete failed") + client.api.score_configs.update.side_effect = RuntimeError("delete failed") older = _make_config("retrieval_relevance", "id-old", "2026-01-01T00:00:00Z") newer = _make_config("retrieval_relevance", "id-new", "2026-02-01T00:00:00Z") with _patched_module(client) as (mod, c): @@ -194,7 +196,7 @@ def test_handles_delete_error_gracefully(self): def test_missing_created_at_sorts_last_and_gets_deleted(self): """Config without created_at should sort last (treated as newest) and be deleted.""" client = MagicMock() - client.api.score_configs.delete.return_value = None + client.api.score_configs.update.return_value = None # Config with a real date — should be kept as the "oldest" with_date = _make_config( "retrieval_relevance", "id-with-date", "2026-01-01T00:00:00Z" @@ -207,8 +209,9 @@ def test_missing_created_at_sorts_last_and_gets_deleted(self): with _patched_module(client) as (mod, c): 
existing = {"retrieval_relevance": [with_date, no_date]} mod._cleanup_duplicates(c, existing) - # The one without created_at should be deleted; the dated one kept - c.api.score_configs.delete.assert_called_once_with("id-no-date") + # The one without created_at should be archived; the dated one kept + c.api.score_configs.update.assert_called_once() + assert c.api.score_configs.update.call_args[0][0] == "id-no-date" # --------------------------------------------------------------------------- @@ -277,7 +280,7 @@ def test_cleanup_duplicates_flag_triggers_dedup(self): _make_list_response(deduped), # post-cleanup fetch ] client.api.score_configs.create.return_value = MagicMock() - client.api.score_configs.delete.return_value = None + client.api.score_configs.update.return_value = None client.flush.return_value = None rc, c = _run_main( @@ -285,8 +288,9 @@ def test_cleanup_duplicates_flag_triggers_dedup(self): ) assert rc == 0 - # Duplicate deleted - c.api.score_configs.delete.assert_called_once_with("id-new") + # Duplicate archived + c.api.score_configs.update.assert_called_once() + assert c.api.score_configs.update.call_args[0][0] == "id-new" # All 6 configs present after cleanup — nothing to create c.api.score_configs.create.assert_not_called() c.flush.assert_called_once() From ab4fd34dd15c1701ade15304e02d8f96026ae1e4 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 02:35:07 -0700 Subject: [PATCH 06/13] fix(runner): use page-based pagination for observations.get_many() V3 SDK observations.get_many() uses page=/total_pages, not cursor. Both trace.list() and observations.get_many() are page-based in V3. Fixed runner and all test mocks to match actual SDK signatures. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/memory/evaluator/runner.py | 25 ++++++++++++------------- tests/test_evaluator_runner.py | 34 +++++++++++++++++----------------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/src/memory/evaluator/runner.py b/src/memory/evaluator/runner.py index a370ce29..8e85f7be 100644 --- a/src/memory/evaluator/runner.py +++ b/src/memory/evaluator/runner.py @@ -8,8 +8,8 @@ 2. Per evaluator, read target: "trace" or "observation" (per-evaluator YAML field) 3. Trace path (EV-05, EV-06): page-based pagination via trace.list() - uses from_timestamp/to_timestamp/page/meta.total_pages (V3 trace API) - 4. Observation path (EV-01-EV-04): cursor-based pagination via observations.get_many() - - uses from_start_time/to_start_time/cursor/meta.next_cursor (V3 observation API, BUG-217) + 4. Observation path (EV-01-EV-04): page-based pagination via observations.get_many() + - uses from_start_time/to_start_time/page/meta.total_pages (V3 observation API) 5. Filter observations by name (event_type) — server-side via name= parameter (Path B) 6. Sample traces/observations per evaluator's sampling_rate 7. Evaluate each trace/observation via configurable LLM judge @@ -20,7 +20,7 @@ Use the per-evaluator target: field in each evaluator YAML instead. PLAN-012 Phase 2 — Section 5.4 -S-16.3: Observation-level evaluation + cursor-based observation pagination (BUG-217) +S-16.3: Observation-level evaluation + page-based observation pagination """ import hashlib @@ -245,7 +245,7 @@ def _run_observation_evaluator( ) -> tuple[int, int, int, int]: """Run observation-level evaluation for a single evaluator. - Fetches observations via observations.get_many() with cursor-based pagination + Fetches observations via observations.get_many() with page-based pagination (BUG-217). Filters by observation name (event_type) server-side via name= parameter (Path B — tags are trace-level only in V3). 
@@ -274,15 +274,15 @@ def _run_observation_evaluator( try: # R2-F3: catch fetch-level errors so one evaluator doesn't kill the whole run for name_filter in name_filters: - cursor: str | None = None + page = 1 while True: - # Cursor-based pagination — V3 observations API (BUG-217) + # Page-based pagination — V3 observations API obs_response = langfuse.api.observations.get_many( name=name_filter, from_start_time=since, to_start_time=until, - cursor=cursor, + page=page, limit=batch_size, ) observations = obs_response.data or [] @@ -393,13 +393,12 @@ def _run_observation_evaluator( ) continue - # Cursor-based stop condition — no next cursor means last page - next_cursor = getattr( - obs_response.meta, "next_cursor", None - ) # R2-F1 - if not next_cursor or not observations: + # Page-based stop condition + meta = getattr(obs_response, "meta", None) + total_pages = getattr(meta, "total_pages", 1) if meta else 1 + if page >= total_pages or not observations: break - cursor = next_cursor + page += 1 except Exception as exc: logger.error("Evaluator %s: error fetching observations: %s", ev_name, exc) diff --git a/tests/test_evaluator_runner.py b/tests/test_evaluator_runner.py index ec90fa02..d992f178 100644 --- a/tests/test_evaluator_runner.py +++ b/tests/test_evaluator_runner.py @@ -212,12 +212,12 @@ def make_paginated_response(traces: list, total_pages=1): return response -def make_observation_response(observations: list, next_cursor: str | None = None): - """Cursor-based response for observations.get_many().""" +def make_observation_response(observations: list, total_pages: int = 1): + """Page-based response for observations.get_many().""" response = MagicMock() response.data = observations response.meta = MagicMock() - response.meta.next_cursor = next_cursor # R2-F1: V3 API uses next_cursor not cursor + response.meta.total_pages = total_pages return response @@ -693,22 +693,22 @@ def test_observation_path_appends_audit_log_with_observation_id( # 
--------------------------------------------------------------------------- -class TestCursorPaginationObservations: - def test_cursor_pagination_fetches_all_pages( +class TestPagePaginationObservations: + def test_page_pagination_fetches_all_pages( self, runner, observation_evaluator_yaml ): - """observations.get_many() must be called with next_cursor on subsequent pages.""" + """observations.get_many() must paginate via page= with total_pages.""" obs1 = make_mock_observation(obs_id="obs_p1", trace_id="t1") obs2 = make_mock_observation(obs_id="obs_p2", trace_id="t2") mock_langfuse = MagicMock() - # First call returns obs1 + next_cursor; second call returns obs2 + no cursor + # First call returns obs1 with total_pages=2; second returns obs2 with total_pages=2 mock_langfuse.api.observations.get_many.side_effect = [ - make_observation_response([obs1], next_cursor="cursor_abc"), - make_observation_response([obs2], next_cursor=None), + make_observation_response([obs1], total_pages=2), + make_observation_response([obs2], total_pages=2), # Additional responses for the second event_type - make_observation_response([obs1], next_cursor="cursor_abc"), - make_observation_response([obs2], next_cursor=None), + make_observation_response([obs1], total_pages=2), + make_observation_response([obs2], total_pages=2), ] mock_evaluator_config = MagicMock() @@ -724,22 +724,22 @@ def test_cursor_pagination_fetches_all_pages( # Each event_type does 2 pages → 4 total calls for 2 event_types assert mock_langfuse.api.observations.get_many.call_count == 4 - # Second call for first event_type must pass cursor + # Second call for first event_type must pass page=2 second_call_kwargs = mock_langfuse.api.observations.get_many.call_args_list[ 1 ].kwargs - assert second_call_kwargs.get("cursor") == "cursor_abc" + assert second_call_kwargs.get("page") == 2 - def test_cursor_pagination_stops_when_no_next_cursor( + def test_page_pagination_stops_on_single_page( self, runner, observation_evaluator_yaml 
): - """Must stop fetching when meta.cursor is None.""" + """Must stop fetching when total_pages == 1.""" obs = make_mock_observation(obs_id="obs_single") mock_langfuse = MagicMock() - # No next cursor → single page per event_type + # Single page per event_type mock_langfuse.api.observations.get_many.return_value = ( - make_observation_response([obs], next_cursor=None) + make_observation_response([obs], total_pages=1) ) mock_evaluator_config = MagicMock() From c766e9d856d14d2232977f3ab319d5a470d7c950 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:08:24 -0700 Subject: [PATCH 07/13] fix(installer+provider): import user .env on Option 1 updates, auto-detect Ollama cloud - Add import_user_env() call to update_shared_scripts() (Option 1 path) so credentials like OLLAMA_API_KEY are imported on updates, not just fresh installs - Auto-detect Ollama cloud vs local: if OLLAMA_API_KEY env var is set and no explicit base_url configured, use https://api.ollama.com/v1 instead of localhost Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/install.sh | 3 +++ src/memory/evaluator/provider.py | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index 6a36e337..f85d7472 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1067,6 +1067,9 @@ update_shared_scripts() { log_debug "Templates updated" fi + # Re-import user .env customizations (new env vars from updates, e.g. 
OLLAMA_API_KEY) + import_user_env + if [[ $updated_count -gt 0 || $hooks_count -gt 0 || $docker_count -gt 0 ]]; then log_success "Updated $updated_count shared scripts, $hooks_count hook scripts, $docker_count docker files" if [[ $archived_count -gt 0 ]]; then diff --git a/src/memory/evaluator/provider.py b/src/memory/evaluator/provider.py index 326bcc47..16330ae9 100644 --- a/src/memory/evaluator/provider.py +++ b/src/memory/evaluator/provider.py @@ -73,11 +73,19 @@ def get_client(self): if self.provider == "ollama": from openai import OpenAI + api_key = os.environ.get("OLLAMA_API_KEY", "") + # Auto-detect cloud vs local: if API key is set and no explicit + # base_url, use Ollama cloud. Otherwise default to local. + if self.base_url: + base_url = self.base_url + elif api_key: + base_url = "https://api.ollama.com/v1" + else: + base_url = "http://localhost:11434/v1" + client = OpenAI( - base_url=self.base_url or "http://localhost:11434/v1", - api_key=os.environ.get( - "OLLAMA_API_KEY", "ollama" - ), # Cloud: set OLLAMA_API_KEY; local: ignored + base_url=base_url, + api_key=api_key or "ollama", # Local Ollama ignores key ) elif self.provider == "openrouter": From 840d88c4781e44af5890cfc1ab4e6d9c0fddca74 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:10:24 -0700 Subject: [PATCH 08/13] fix(installer): resolve SOURCE_DIR unbound variable in Option 1 path import_user_env() used SOURCE_DIR which is only set during full install. Fall back to SCRIPT_DIR parent for Option 1 (add-project) updates. Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/install.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/install.sh b/scripts/install.sh index f85d7472..1f34aceb 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1642,7 +1642,9 @@ copy_files() { # are present when credential generation checks run. 
import_user_env() { local docker_env="$INSTALL_DIR/docker/.env" - local user_env="$SOURCE_DIR/.env" + # SOURCE_DIR is set during full install; fall back to SCRIPT_DIR parent for Option 1 + local source_root="${SOURCE_DIR:-$(cd "$SCRIPT_DIR/.." && pwd)}" + local user_env="$source_root/.env" # Only import if user's .env exists and docker/.env exists if [[ ! -f "$user_env" ]] || [[ ! -f "$docker_env" ]]; then From 9b1a61ef8b546154e9eab0ef6604a1ae33aca3eb Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:17:42 -0700 Subject: [PATCH 09/13] fix(provider): correct Ollama cloud URL to ollama.com/v1 api.ollama.com returns 401; ollama.com/v1 is the correct OpenAI-compat endpoint. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/memory/evaluator/provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/memory/evaluator/provider.py b/src/memory/evaluator/provider.py index 16330ae9..e03f2580 100644 --- a/src/memory/evaluator/provider.py +++ b/src/memory/evaluator/provider.py @@ -79,7 +79,7 @@ def get_client(self): if self.base_url: base_url = self.base_url elif api_key: - base_url = "https://api.ollama.com/v1" + base_url = "https://ollama.com/v1" else: base_url = "http://localhost:11434/v1" From 4306cad4897415988fe0dcc52229803a9b0e4267 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:35:40 -0700 Subject: [PATCH 10/13] config(evaluator): change default model to gemma3:4b (Ollama cloud compatible) llama3.2:8b is not available on Ollama cloud. gemma3:4b is small, fast, and suitable for LLM-as-judge evaluation tasks. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- evaluator_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluator_config.yaml b/evaluator_config.yaml index 2a788bc1..0d22835b 100644 --- a/evaluator_config.yaml +++ b/evaluator_config.yaml @@ -13,7 +13,7 @@ # --------------------------------------------------------------------------- evaluator_model: provider: ollama # ollama | openrouter | anthropic | openai | custom - model_name: llama3.2:8b # Model identifier for the selected provider + model_name: gemma3:4b # Model identifier for the selected provider temperature: 0.0 # Deterministic evaluation (do not change) max_tokens: 4096 # 4096 to accommodate thinking models (reasoning + output) max_retries: 3 # Retry on 500/502/503/429 and network errors From 780c4c7235b909b3e791ab9e0fb376d02c0538bd Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:39:34 -0700 Subject: [PATCH 11/13] docs: update changelog and Langfuse integration docs for v2.2.3 - Changelog: complete v2.2.3 entry with upgrade instructions including scheduler build, score config setup, and provider configuration - Langfuse docs: add LLM-as-Judge evaluation pipeline section with evaluator table, config reference, provider auto-detection, and manual evaluation commands. Add evaluator-scheduler to Docker services Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 70 +++++++++++++++++++++++++----------- docs/LANGFUSE-INTEGRATION.md | 67 ++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 296acb60..d3788a2e 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,27 +5,37 @@ All notable changes to AI Memory Module will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [2.2.3] - 2026-03-14 +## [2.2.3] - 2026-03-15 -Langfuse observability improvements and LLM-as-judge evaluation pipeline: semantic tags on all 108 trace events, per-search trace visibility in the compact injection path, and a complete multi-provider evaluation engine with 6 evaluators, 5 golden datasets (123 items), and a CI quality gate. +Complete Langfuse observability pipeline: observation-level evaluation for all 6 evaluators, automated scheduling, exponential backoff retry, and security hardening. ### Added -- **Semantic tags on all trace events**: All 108 `emit_trace_event` calls include canonical dual-element tags (e.g., `["search", "retrieval"]`) for structured Langfuse dashboard filtering -- **Per-search trace visibility in compact injection path**: Each search call within compact context injection emits its own trace, enabling per-query latency and result visibility in that path -- **LLM-as-judge evaluation engine**: Multi-provider evaluator (`src/memory/evaluator/`) supporting Ollama (default, local, free), OpenRouter, Anthropic, OpenAI, and custom OpenAI-compatible endpoints -- **6 evaluator definitions**: Retrieval relevance (EV-01), injection value (EV-02), capture completeness (EV-03), classification accuracy (EV-04), bootstrap quality (EV-05), and session coherence (EV-06) — each with YAML config and LLM judge prompt template -- **5 golden datasets** (123 items): DS-01 Retrieval (25 items), DS-02 Error Pattern Match (12 items), DS-03 Bootstrap Round-Trip (8 items), DS-04 Keyword Trigger Routing (68 items), DS-05 Chunking Quality (10 items) — for repeatable regression benchmarking -- **Regression test suite**: `tests/test_regression.py` runs Langfuse experiments against golden datasets with configurable quality thresholds (`@pytest.mark.regression`) -- **CI quality gate**: `.github/workflows/regression-tests.yml` blocks PRs on score regression when `src/memory/**` or hook scripts change -- **Evaluator YAML configuration**: `evaluator_config.yaml` 
— zero secrets in config; all credentials supplied via environment variables with inline documentation +- **Observation-level evaluation**: Runner scores individual Langfuse observations (spans) for EV-01 to EV-04, not just whole traces. Enables per-retrieval, per-injection, per-capture quality scoring +- **Evaluator-scheduler container**: Automated daily evaluations via `evaluator-scheduler` Docker service with `croniter`-based scheduling, health checks, graceful shutdown, and live config reload +- **Exponential backoff retry**: Provider retries on HTTP 500/502/503/429 and network errors (ConnectionError, TimeoutError) with configurable `max_retries` (default: 3) and jitter +- **12 evaluator files on disk**: 6 YAML configs + 6 prompt templates materialized from PLAN-012 spec. Filters aligned to actual `emit_trace_event()` event_types via codebase audit +- **Score config idempotency**: `create_score_configs.py` pre-checks existing configs via `.get()` API; `--cleanup-duplicates` archives extras via `update(isArchived=True)` +- **Ollama cloud auto-detection**: Provider automatically uses `https://ollama.com/v1` when `OLLAMA_API_KEY` env var is set (no manual `base_url` config needed) +- **Installer copies evaluator files**: Both fresh install and Option 1 update paths copy `evaluator_config.yaml`, `evaluators/`, `requirements.txt`, and `pyproject.toml` +- **Installer imports .env on Option 1**: `import_user_env()` now runs during add-project updates, not just fresh installs — ensures credentials like `OLLAMA_API_KEY` reach the installed `.env` ### Changed -- **`detect-secrets` moved to dev extras**: Removed from default `requirements.txt`; added to `requirements-dev.txt` to reduce production dependency footprint -- **GitHub Actions versions pinned**: All CI workflow action steps use pinned versions for reproducible builds +- **Default evaluator model**: `gemma3:4b` (Ollama cloud compatible) replaces `llama3.2:8b` (not available on cloud) +- **Observation 
filtering**: Path B — evaluators filter observations by `name` (event_type) instead of tags. Langfuse V3 does not support observation-level tags; trace-level tags remain for trace filtering +- **Pagination**: Both `trace.list()` and `observations.get_many()` use page-based pagination per V3 SDK (`page=`, `total_pages`) + +### Fixed +- **Log injection sanitization**: All `str(e)` in `monitoring/main.py` log statements wrapped with `sanitize_log_input()` inline at call sites (CodeQL `py/log-injection` compliance) +- **CATEGORICAL score handling**: EV-04 passes string values (`"correct"`, `"partially_correct"`, `"incorrect"`) with validation against allowed categories before submission +- **Score ID collision**: `_make_score_id()` includes `observation_id` in hash seed — prevents silent overwrites when multiple observations share a trace +- **Installer `SOURCE_DIR` unbound**: `import_user_env()` falls back to `SCRIPT_DIR/..` in Option 1 path + +### Security +- **7 CodeQL HIGH findings resolved**: `monitoring/main.py` log injection vectors sanitized at every call site with AST-verified test coverage ### Upgrade Instructions -1. **Update code and reinstall**: +1. **Pull and run installer**: ```bash cd /path/to/your/ai-memory-clone git pull origin main @@ -33,13 +43,33 @@ Langfuse observability improvements and LLM-as-judge evaluation pipeline: semant # Select Option 1 (Add project to existing installation) ``` -2. **Evaluator setup** (optional): - - Default: Ollama (local, free, no API key needed) - - Configure provider in `evaluator_config.yaml` - - Set API keys via environment variables (see config comments) - - Run: `python scripts/create_score_configs.py` (one-time Langfuse setup) - - Run: `python scripts/create_datasets.py` (one-time golden dataset creation) - - Run: `python scripts/run_evaluations.py --config evaluator_config.yaml` +2. 
**Build and start the evaluator-scheduler container**: + ```bash + cd ~/.ai-memory/docker + unset QDRANT_API_KEY + docker compose -f docker-compose.yml -f docker-compose.langfuse.yml build evaluator-scheduler + docker compose -f docker-compose.yml -f docker-compose.langfuse.yml --profile langfuse up -d evaluator-scheduler + ``` + +3. **Create score configs** (one-time, idempotent): + ```bash + cd /path/to/your/ai-memory-clone + source .venv/bin/activate + cd ~/.ai-memory + set -a && source docker/.env && set +a && unset QDRANT_API_KEY + python scripts/create_score_configs.py + ``` + +4. **Configure evaluator provider** (optional — defaults to Ollama): + - **Ollama cloud**: Set `OLLAMA_API_KEY` in your `.env` (auto-detects cloud endpoint) + - **Local Ollama**: No config needed (default `http://localhost:11434/v1`) + - **Other providers**: Edit `evaluator_config.yaml` `provider:` field + - Model: Edit `evaluator_config.yaml` `model_name:` (default: `gemma3:4b`) + +5. **Run evaluations manually** (optional — scheduler runs daily at 05:00 UTC): + ```bash + python scripts/run_evaluations.py --config evaluator_config.yaml + ``` --- diff --git a/docs/LANGFUSE-INTEGRATION.md b/docs/LANGFUSE-INTEGRATION.md index c922bec9..88267762 100644 --- a/docs/LANGFUSE-INTEGRATION.md +++ b/docs/LANGFUSE-INTEGRATION.md @@ -201,6 +201,73 @@ All Langfuse services use the `langfuse` profile and join the existing `ai-memor | `langfuse-redis` | `redis:7` | Job queue and ephemeral cache | | `langfuse-minio` | `cgr.dev/chainguard/minio` | S3-compatible blob storage for event upload | | `trace-flush-worker` | Custom (Dockerfile.worker) | Reads trace buffer, flushes to Langfuse | +| `evaluator-scheduler` | Custom (Dockerfile.evaluator-scheduler) | Cron-based LLM-as-judge evaluation runner | + +--- + +## LLM-as-Judge Evaluation Pipeline + +The evaluation pipeline scores memory operations using an LLM judge. 
Six evaluators run automatically via the `evaluator-scheduler` container (daily at 05:00 UTC by default). + +### Evaluators + +| ID | Name | Target | Score Type | What it measures | +|----|------|--------|------------|------------------| +| EV-01 | `retrieval_relevance` | observation | NUMERIC (0-1) | Was the retrieved memory relevant to the trigger? | +| EV-02 | `injection_value` | observation | BOOLEAN | Did the injected context add value vs noise? | +| EV-03 | `capture_completeness` | observation | BOOLEAN | Did the capture preserve all important information? | +| EV-04 | `classification_accuracy` | observation | CATEGORICAL | Was the memory type classification correct? | +| EV-05 | `bootstrap_quality` | trace | NUMERIC (0-1) | Was the cross-session bootstrap context useful? | +| EV-06 | `session_coherence` | trace | NUMERIC (0-1) | Were memory operations during the session coherent? | + +### How It Works + +1. **Scheduler** wakes at cron time, reads `evaluator_config.yaml` +2. **Runner** fetches observations/traces from Langfuse matching each evaluator's `filter.event_types` +3. **Sampling** applies per-evaluator sampling rate (5-100%) +4. **LLM judge** evaluates each sampled item using the evaluator's prompt template +5. 
**Scores** are attached back to the observation/trace in Langfuse via `create_score()` + +### Evaluator Configuration + +All config in `evaluator_config.yaml` — zero secrets (API keys via environment variables): + +```yaml +evaluator_model: + provider: ollama # ollama | openrouter | anthropic | openai | custom + model_name: gemma3:4b # Ollama cloud model + temperature: 0.0 + max_tokens: 4096 + max_retries: 3 # Retry on 500/502/503/429 + network errors + +schedule: + enabled: true + cron: "0 5 * * *" # Daily at 05:00 UTC + lookback_hours: 24 +``` + +### Provider Auto-Detection + +- **Ollama cloud**: Set `OLLAMA_API_KEY` env var → auto-uses `https://ollama.com/v1` +- **Local Ollama**: No key set → uses `http://localhost:11434/v1` +- **Other providers**: Set `provider:` in config + corresponding env var + +### Score Config Setup + +Run once to create score validation schemas in Langfuse: + +```bash +python scripts/create_score_configs.py # Create configs (idempotent) +python scripts/create_score_configs.py --cleanup-duplicates # Archive duplicates +``` + +### Manual Evaluation + +```bash +python scripts/run_evaluations.py --config evaluator_config.yaml # Full run +python scripts/run_evaluations.py --config evaluator_config.yaml --dry-run # Preview only +python scripts/run_evaluations.py --config evaluator_config.yaml --evaluator EV-01 # Single evaluator +``` --- From 9375d308edc80144214f736aab9dedea524b5068 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:46:27 -0700 Subject: [PATCH 12/13] fix(evaluator): update default model to gemma3:4b in provider + tests Matches evaluator_config.yaml change. Updates dataclass default and test assertions from llama3.2:8b to gemma3:4b. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/memory/evaluator/provider.py | 4 ++-- tests/test_evaluator_config.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/memory/evaluator/provider.py b/src/memory/evaluator/provider.py index e03f2580..3739579c 100644 --- a/src/memory/evaluator/provider.py +++ b/src/memory/evaluator/provider.py @@ -31,7 +31,7 @@ class EvaluatorConfig: provider: Literal["ollama", "openrouter", "anthropic", "openai", "custom"] = ( "ollama" ) - model_name: str = "llama3.2:8b" + model_name: str = "gemma3:4b" base_url: str | None = None temperature: float = 0.0 # Deterministic for evaluation max_tokens: int = ( @@ -50,7 +50,7 @@ def from_yaml(cls, path: str) -> "EvaluatorConfig": model_cfg = data.get("evaluator_model", {}) return cls( provider=model_cfg.get("provider", "ollama"), - model_name=model_cfg.get("model_name", "llama3.2:8b"), + model_name=model_cfg.get("model_name", "gemma3:4b"), base_url=model_cfg.get("base_url"), temperature=model_cfg.get("temperature", 0.0), max_tokens=model_cfg.get("max_tokens", 4096), diff --git a/tests/test_evaluator_config.py b/tests/test_evaluator_config.py index bd11e68b..f20f84b7 100644 --- a/tests/test_evaluator_config.py +++ b/tests/test_evaluator_config.py @@ -26,7 +26,7 @@ def full_config_yaml(tmp_path) -> Path: config = { "evaluator_model": { "provider": "ollama", - "model_name": "llama3.2:8b", + "model_name": "gemma3:4b", "temperature": 0.0, "max_tokens": 512, }, @@ -74,7 +74,7 @@ def test_loads_provider(self, full_config_yaml): def test_loads_model_name(self, full_config_yaml): config = EvaluatorConfig.from_yaml(str(full_config_yaml)) - assert config.model_name == "llama3.2:8b" + assert config.model_name == "gemma3:4b" def test_loads_temperature(self, full_config_yaml): config = EvaluatorConfig.from_yaml(str(full_config_yaml)) @@ -112,7 +112,7 @@ def test_model_defaults_to_llama(self, tmp_path): config_file = tmp_path / "minimal.yaml" 
config_file.write_text("evaluator_model: {}\n") config = EvaluatorConfig.from_yaml(str(config_file)) - assert config.model_name == "llama3.2:8b" + assert config.model_name == "gemma3:4b" def test_temperature_defaults_to_zero(self, tmp_path): config_file = tmp_path / "minimal.yaml" @@ -142,7 +142,7 @@ def test_default_provider_is_ollama(self, repo_config): def test_default_model_is_llama(self, repo_config): config = EvaluatorConfig.from_yaml(str(repo_config)) - assert config.model_name == "llama3.2:8b" + assert config.model_name == "gemma3:4b" def test_no_secrets_in_config_file(self, repo_config): content = repo_config.read_text() @@ -268,7 +268,7 @@ def test_filters_by_evaluator_id(self, tmp_path, full_config_yaml): def test_returns_empty_list_when_dir_missing(self, tmp_path): config_content = { - "evaluator_model": {"provider": "ollama", "model_name": "llama3.2:8b"}, + "evaluator_model": {"provider": "ollama", "model_name": "gemma3:4b"}, "evaluators_dir": str(tmp_path / "nonexistent_dir"), "audit": {"log_file": str(tmp_path / "eval.jsonl")}, } From 4e6b6a92f5a43509d317e44654dbcb62c7d1baba Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:53:23 -0700 Subject: [PATCH 13/13] fix(tests): update remaining llama3.2:8b references to gemma3:4b test_evaluator_provider.py and test_evaluator_runner.py fixture still had hardcoded llama3.2:8b model name assertions. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_evaluator_provider.py | 6 +++--- tests/test_evaluator_runner.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_evaluator_provider.py b/tests/test_evaluator_provider.py index a561e070..75bd1eee 100644 --- a/tests/test_evaluator_provider.py +++ b/tests/test_evaluator_provider.py @@ -24,7 +24,7 @@ def test_default_provider_is_ollama(self): def test_default_model_is_llama(self): config = EvaluatorConfig() - assert config.model_name == "llama3.2:8b" + assert config.model_name == "gemma3:4b" def test_default_temperature_zero(self): config = EvaluatorConfig() @@ -69,7 +69,7 @@ def test_defaults_when_fields_missing(self, tmp_path): config = EvaluatorConfig.from_yaml(str(config_file)) assert config.provider == "ollama" - assert config.model_name == "llama3.2:8b" + assert config.model_name == "gemma3:4b" assert config.temperature == 0.0 def test_empty_evaluator_model_uses_defaults(self, tmp_path): @@ -87,7 +87,7 @@ class TestOllamaProvider: """Test Ollama provider client creation.""" def test_ollama_creates_openai_client(self): - config = EvaluatorConfig(provider="ollama", model_name="llama3.2:8b") + config = EvaluatorConfig(provider="ollama", model_name="gemma3:4b") mock_openai_cls = MagicMock() with patch.dict("sys.modules", {"openai": MagicMock(OpenAI=mock_openai_cls)}): diff --git a/tests/test_evaluator_runner.py b/tests/test_evaluator_runner.py index d992f178..2c5823f5 100644 --- a/tests/test_evaluator_runner.py +++ b/tests/test_evaluator_runner.py @@ -38,7 +38,7 @@ def minimal_config(tmp_path) -> Path: config_content = """ evaluator_model: provider: ollama - model_name: llama3.2:8b + model_name: gemma3:4b temperature: 0.0 max_tokens: 4096 evaluators_dir: {evaluators_dir}