From e0981cbe30537ae7e8878c88a2a6274be43cffdb Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 01:22:13 -0700 Subject: [PATCH 01/13] =?UTF-8?q?feat(evaluator):=20complete=20Langfuse=20?= =?UTF-8?q?observability=20pipeline=20=E2=80=94=20observation-level=20eval?= =?UTF-8?q?uation,=20automated=20scheduling,=20retry=20logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add observation-level evaluation path to runner (EV-01 to EV-04 score individual spans by event_type name filtering) - Fix pagination: cursor-based for observations.get_many(), page-based for trace.list() per V3 SDK - Create all 12 evaluator YAML + prompt files with correct filter alignment against actual emit_trace_event() event_types - Add evaluator-scheduler Docker container (croniter-based cron daemon) in docker-compose.langfuse.yml under langfuse profile - Add exponential backoff retry logic for transient provider errors (500, 502, 503, 429) with configurable max_retries - Make create_score_configs.py truly idempotent with pre-check and --cleanup-duplicates flag - Sanitize all log injection vectors in monitoring/main.py (inline sanitize_log_input at every call site for CodeQL compliance) - Add evaluator files to installer copy paths (both fresh and update) - Add croniter>=2.0.0,<3.0.0 dependency Resolves: TD-280, TD-281, TD-282, TD-283, TD-287, TD-288, TD-100, BUG-217 Co-Authored-By: Claude Opus 4.6 (1M context) --- docker/.env.example | 9 + docker/Dockerfile.evaluator-scheduler | 28 + docker/docker-compose.langfuse.yml | 56 ++ evaluator_config.yaml | 7 +- evaluators/ev01_retrieval_relevance.yaml | 17 +- evaluators/ev01_retrieval_relevance_prompt.md | 2 +- evaluators/ev02_injection_value.yaml | 10 +- evaluators/ev02_injection_value_prompt.md | 2 +- evaluators/ev03_capture_completeness.yaml | 9 +- .../ev03_capture_completeness_prompt.md | 2 +- evaluators/ev04_classification_accuracy.yaml | 9 +- 
.../ev04_classification_accuracy_prompt.md | 2 +- evaluators/ev05_bootstrap_quality.yaml | 12 +- evaluators/ev06_session_coherence.yaml | 9 +- monitoring/main.py | 20 +- pyproject.toml | 2 + requirements.txt | 3 + scripts/create_score_configs.py | 137 ++- scripts/install.sh | 23 + scripts/memory/evaluator_scheduler.py | 261 ++++++ src/memory/evaluator/provider.py | 120 ++- src/memory/evaluator/runner.py | 476 ++++++++-- tests/test_create_score_configs.py | 294 +++++++ tests/test_evaluator_runner.py | 825 +++++++++++++++++- tests/test_monitoring_api.py | 222 +++++ tests/unit/test_evaluator_retry.py | 222 +++++ tests/unit/test_evaluator_scheduler.py | 401 +++++++++ 27 files changed, 2982 insertions(+), 198 deletions(-) create mode 100644 docker/Dockerfile.evaluator-scheduler create mode 100644 scripts/memory/evaluator_scheduler.py create mode 100644 tests/test_create_score_configs.py create mode 100644 tests/test_monitoring_api.py create mode 100644 tests/unit/test_evaluator_retry.py create mode 100644 tests/unit/test_evaluator_scheduler.py diff --git a/docker/.env.example b/docker/.env.example index 9faec64f..59d2c9f3 100755 --- a/docker/.env.example +++ b/docker/.env.example @@ -58,6 +58,7 @@ AI_MEMORY_LOG_LEVEL=INFO # AI Memory Installation Directory (for volume mounts) # Must match the project installation path +# REQUIRED for the langfuse compose profile (evaluator-scheduler .audit volume mount) AI_MEMORY_INSTALL_DIR=/path/to/your/project # ============================================ @@ -422,6 +423,14 @@ LANGFUSE_FLUSH_INTERVAL=5 LANGFUSE_TRACE_HOOKS=true LANGFUSE_TRACE_SESSIONS=true +# ============================================================================= +# EVALUATOR SCHEDULER (v2.2.3B, S-16.5, DEC-110) +# ============================================================================= +# Custom LLM provider for evaluator-scheduler (provider=custom in evaluator_config.yaml) +# Leave blank if using ollama/openrouter/anthropic/openai (configured in 
evaluator_config.yaml) +EVALUATOR_BASE_URL= +EVALUATOR_API_KEY= + # ============================================================================= # OLLAMA CLOUD API (v2.2.3, PLAN-012) # ============================================================================= diff --git a/docker/Dockerfile.evaluator-scheduler b/docker/Dockerfile.evaluator-scheduler new file mode 100644 index 00000000..3082a3ad --- /dev/null +++ b/docker/Dockerfile.evaluator-scheduler @@ -0,0 +1,28 @@ +FROM python:3.12-slim + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements and install Python dependencies (includes croniter) +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Create non-root user (security hardening 2025) +RUN useradd -u 1001 -g users -s /usr/sbin/nologin evaluator && \ + chown -R evaluator:users /app + +# Set Python path to include src directory where memory module lives +# (src/ is volume-mounted read-only at runtime: ../src:/app/src:ro) +ENV PYTHONPATH=/app/src + +# Switch to non-root user +USER evaluator + +# Entry point: evaluator scheduler daemon +CMD ["python", "scripts/memory/evaluator_scheduler.py"] diff --git a/docker/docker-compose.langfuse.yml b/docker/docker-compose.langfuse.yml index b402f523..e942cb0b 100644 --- a/docker/docker-compose.langfuse.yml +++ b/docker/docker-compose.langfuse.yml @@ -285,6 +285,62 @@ services: networks: - ai-memory_default + # ─── Evaluator Scheduler (AI Memory component) ──────────────────────────── + # Runs LLM-as-Judge evaluation pipeline on a cron schedule. + # Schedule configured via evaluator_config.yaml (schedule.cron, lookback_hours). + # Requires Langfuse to be healthy so it can post scores via create_score(). 
+ evaluator-scheduler: + build: + context: ../ + dockerfile: docker/Dockerfile.evaluator-scheduler + container_name: ${AI_MEMORY_CONTAINER_PREFIX:-ai-memory}-evaluator-scheduler + profiles: ["langfuse"] + volumes: + - ../src:/app/src:ro + - ../scripts:/app/scripts:ro + - ../evaluators:/app/evaluators:ro + - ../evaluator_config.yaml:/app/evaluator_config.yaml:ro + - ${AI_MEMORY_INSTALL_DIR:-.}/.audit:/app/.audit + environment: + - LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY:?Run langfuse_setup.sh first to generate API keys} + - LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY:?Run langfuse_setup.sh first to generate API keys} + # Use internal service name for container-to-container communication + - LANGFUSE_BASE_URL=http://langfuse-web:3000 + # Optional LLM provider API keys (read from env; not all are required) + - OLLAMA_API_KEY=${OLLAMA_API_KEY:-} + - OPENROUTER_API_KEY=${OPENROUTER_API_KEY:-} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + # Optional custom LLM provider (provider=custom in evaluator_config.yaml) + - EVALUATOR_BASE_URL=${EVALUATOR_BASE_URL:-} + - EVALUATOR_API_KEY=${EVALUATOR_API_KEY:-} + # NOTE: EVALUATOR_PROVIDER and EVALUATOR_MODEL are NOT set here — + # they are read from evaluator_config.yaml (evaluator_model section) + - PYTHONPATH=/app/src + - AI_MEMORY_LOG_LEVEL=${AI_MEMORY_LOG_LEVEL:-INFO} + depends_on: + langfuse-web: + condition: service_healthy + restart: unless-stopped + read_only: true + tmpfs: + - /tmp + security_opt: + - no-new-privileges:true + cap_drop: + - ALL + networks: + - ai-memory_default + healthcheck: + # File-based heartbeat — scheduler touches health file on startup and after each + # successful evaluation run. Check: file exists AND modified within last 90000s (25h). + # 25h window accommodates a daily cron + restart without false failure. 
+ test: ["CMD-SHELL", "python3 -c \"import os,sys,time; f='/tmp/evaluator-scheduler.health'; sys.exit(0 if os.path.exists(f) and time.time()-os.path.getmtime(f)<90000 else 1)\""] + interval: 60s + timeout: 5s + retries: 3 + start_period: 90000s + # ─── Network ───────────────────────────────────────────────────────────────── # Join the existing ai-memory stack network so Langfuse services can communicate # with the classifier-worker, embedding service, and other AI Memory components. diff --git a/evaluator_config.yaml b/evaluator_config.yaml index 6f1b741b..2a788bc1 100644 --- a/evaluator_config.yaml +++ b/evaluator_config.yaml @@ -16,6 +16,7 @@ evaluator_model: model_name: llama3.2:8b # Model identifier for the selected provider temperature: 0.0 # Deterministic evaluation (do not change) max_tokens: 4096 # 4096 to accommodate thinking models (reasoning + output) + max_retries: 3 # Retry on 500/502/503/429 and network errors # base_url: Override Ollama endpoint URL. # Local (default): http://localhost:11434/v1 @@ -43,9 +44,9 @@ schedule: lookback_hours: 24 # Evaluate last 24 hours of traces # --------------------------------------------------------------------------- -# Evaluation targets per evaluator -# observation = score individual spans (faster, more granular) -# trace = score whole trace (for session-level metrics) +# DEPRECATED: evaluation_targets — use per-evaluator target: field instead. +# Each evaluator YAML now has target: "observation" or target: "trace". +# This section is no longer read by EvaluatorRunner (S-16.3). 
# --------------------------------------------------------------------------- evaluation_targets: ev01_retrieval_relevance: observation # Per-retrieval span (high volume) diff --git a/evaluators/ev01_retrieval_relevance.yaml b/evaluators/ev01_retrieval_relevance.yaml index 1c0bbf7f..a78ffd70 100644 --- a/evaluators/ev01_retrieval_relevance.yaml +++ b/evaluators/ev01_retrieval_relevance.yaml @@ -1,6 +1,9 @@ # Evaluator: EV-01 Retrieval Relevance # Scores how relevant retrieved memory is to the trigger context. # High-volume path — 5% sampling to stay within judge budget. +# +# PATH B (Langfuse V3): Filter by observation name (event_type). +# Tags are trace-level only in V3; use event_types for observation filtering. id: EV-01 name: retrieval_relevance @@ -9,7 +12,19 @@ target: observation sampling_rate: 0.05 filter: - tags: [search, retrieval, best_practices] + # All event_types that represent memory retrieval operations across hook scripts + # Derived from grep of emit_trace_event() calls in src/memory/ and .claude/hooks/ + event_types: + - "pattern_retrieval" # first_edit_trigger.py — code pattern lookup + - "convention_retrieval" # new_file_trigger.py — convention lookup + - "error_retrieval" # error_detection.py — error fix lookup + - "context_retrieval" # context_injection_tier2.py, session_start.py + - "best_practices_retrieval" # best_practices_retrieval.py + - "memory_retrieval_session_summaries" # session_start.py + - "memory_retrieval_decisions" # session_start.py + - "memory_retrieval_sessions" # session_start.py + - "bootstrap_retrieval" # injection.py — bootstrap context retrieval + - "search_query" # search.py — main hybrid search execution prompt_file: ev01_retrieval_relevance_prompt.md diff --git a/evaluators/ev01_retrieval_relevance_prompt.md b/evaluators/ev01_retrieval_relevance_prompt.md index 6bda0b5d..966d42d4 100644 --- a/evaluators/ev01_retrieval_relevance_prompt.md +++ b/evaluators/ev01_retrieval_relevance_prompt.md @@ -5,7 +5,7 @@ Your task 
is to assess how relevant a retrieved memory is to the trigger context ## Data to Evaluate -Analyze the trace data provided in the **## Trace to Evaluate** section below. +Analyze the observation data provided in the **## Observation to Evaluate** section below. - **Input**: The trigger context that caused the memory retrieval (e.g., the user's query or the event that fired the retrieval trigger). - **Output**: The retrieved memory content that was returned. diff --git a/evaluators/ev02_injection_value.yaml b/evaluators/ev02_injection_value.yaml index c07e191d..351eea94 100644 --- a/evaluators/ev02_injection_value.yaml +++ b/evaluators/ev02_injection_value.yaml @@ -1,6 +1,9 @@ # Evaluator: EV-02 Injection Value # Scores whether injected context was valuable (true) or noise (false). # High-volume path — 5% sampling to stay within judge budget. +# +# PATH B (Langfuse V3): Filter by observation name (event_type). +# Tags are trace-level only in V3; use event_types for observation filtering. 
id: EV-02 name: injection_value @@ -9,7 +12,12 @@ target: observation sampling_rate: 0.05 filter: - tags: [injection, tier2, compact] + # event_types that represent the final injected context delivered to the agent + # context_retrieval is the primary event from context_injection_tier2.py + # error_fix_injection is from error_detection.py (injection into error context) + event_types: + - "context_retrieval" # context_injection_tier2.py, session_start.py — tier-2 injections + - "error_fix_injection" # error_detection.py — error fix context injection prompt_file: ev02_injection_value_prompt.md diff --git a/evaluators/ev02_injection_value_prompt.md b/evaluators/ev02_injection_value_prompt.md index 505dec65..964126db 100644 --- a/evaluators/ev02_injection_value_prompt.md +++ b/evaluators/ev02_injection_value_prompt.md @@ -5,7 +5,7 @@ Your task is to determine whether context injected into a user's prompt window a ## Data to Evaluate -Analyze the trace data provided in the **## Trace to Evaluate** section below. +Analyze the observation data provided in the **## Observation to Evaluate** section below. - **Input**: The user's prompt or request that triggered the injection. - **Output**: The injected context that was added to the user's prompt window. diff --git a/evaluators/ev03_capture_completeness.yaml b/evaluators/ev03_capture_completeness.yaml index c500afff..74f4448e 100644 --- a/evaluators/ev03_capture_completeness.yaml +++ b/evaluators/ev03_capture_completeness.yaml @@ -1,6 +1,9 @@ # Evaluator: EV-03 Capture Completeness # Scores whether hook capture events collected all expected fields. # High-volume path — 5% sampling to stay within judge budget. +# +# PATH B (Langfuse V3): Filter by observation name (event_type). +# Tags are trace-level only in V3; use event_types for observation filtering. 
id: EV-03 name: capture_completeness @@ -9,7 +12,11 @@ target: observation sampling_rate: 0.05 filter: - tags: [capture, user_prompt, agent_response, session_summary, store] + # 1_capture is the primary event_type for capture operations across all hook scripts: + # user_prompt_capture.py, post_tool_capture.py, agent_response_capture.py, + # error_pattern_capture.py + event_types: + - "1_capture" prompt_file: ev03_capture_completeness_prompt.md diff --git a/evaluators/ev03_capture_completeness_prompt.md b/evaluators/ev03_capture_completeness_prompt.md index d4fac089..0d6dee28 100644 --- a/evaluators/ev03_capture_completeness_prompt.md +++ b/evaluators/ev03_capture_completeness_prompt.md @@ -5,7 +5,7 @@ Your task is to determine whether a hook capture event collected all expected fi ## Data to Evaluate -Analyze the trace data provided in the **## Trace to Evaluate** section below. +Analyze the observation data provided in the **## Observation to Evaluate** section below. - **Input**: The captured content and hook event data (event type, captured fields, and metadata). - **Output**: Any processing result or confirmation from the capture system. diff --git a/evaluators/ev04_classification_accuracy.yaml b/evaluators/ev04_classification_accuracy.yaml index 887e8f69..be193f53 100644 --- a/evaluators/ev04_classification_accuracy.yaml +++ b/evaluators/ev04_classification_accuracy.yaml @@ -1,6 +1,10 @@ # Evaluator: EV-04 Classification Accuracy # Assesses whether the LLM classifier assigned the correct type to content. # 10% sampling — classification is medium volume. +# +# PATH B (Langfuse V3): Filter by observation name (event_type). +# Tags are trace-level only in V3; use event_types for observation filtering. +# CRITICAL: score_type is CATEGORICAL (NOT BOOLEAN). 
id: EV-04 name: classification_accuracy @@ -14,7 +18,10 @@ categories: - incorrect filter: - tags: [classification, queue, worker] + # 9_classify is the event_type for LLM classification observations: + # classification_worker.py, process_classification_queue.py + event_types: + - "9_classify" prompt_file: ev04_classification_accuracy_prompt.md diff --git a/evaluators/ev04_classification_accuracy_prompt.md b/evaluators/ev04_classification_accuracy_prompt.md index d37da832..5d9a9219 100644 --- a/evaluators/ev04_classification_accuracy_prompt.md +++ b/evaluators/ev04_classification_accuracy_prompt.md @@ -5,7 +5,7 @@ Your task is to assess whether the LLM classifier correctly assigned a memory ty ## Data to Evaluate -Analyze the trace data provided in the **## Trace to Evaluate** section below. +Analyze the observation data provided in the **## Observation to Evaluate** section below. - **Input**: The original captured content that was submitted to the classifier. - **Output**: The classifier's decision including assigned memory type, collection, and reasoning. diff --git a/evaluators/ev05_bootstrap_quality.yaml b/evaluators/ev05_bootstrap_quality.yaml index 8b801c75..7d742540 100644 --- a/evaluators/ev05_bootstrap_quality.yaml +++ b/evaluators/ev05_bootstrap_quality.yaml @@ -1,6 +1,13 @@ # Evaluator: EV-05 Bootstrap Quality # Scores how well cross-session bootstrap context serves the new session. # 100% sampling — one bootstrap event per session (low volume). +# +# PATH B (Langfuse V3): Filter by trace-level tag. +# Rationale: EV-05 targets trace-level data (full session context), so tags is +# the correct filter mechanism — matching the pattern EV-06 uses. event_types +# applies to observation names and will never match trace-level items, making +# it a dead filter. session_trace tags are set by langfuse_stop_hook.py on +# all full-session traces, so this correctly captures bootstrap sessions. 
id: EV-05 name: bootstrap_quality @@ -9,7 +16,10 @@ target: trace sampling_rate: 1.0 filter: - tags: [injection, tier1, bootstrap, skill] + # Trace-level tags filter — valid for trace-targeted evaluators in V3. + # session_trace tag is set on full-session traces by session hooks. + tags: + - "session_trace" prompt_file: ev05_bootstrap_quality_prompt.md diff --git a/evaluators/ev06_session_coherence.yaml b/evaluators/ev06_session_coherence.yaml index 734da20c..feaaa113 100644 --- a/evaluators/ev06_session_coherence.yaml +++ b/evaluators/ev06_session_coherence.yaml @@ -1,6 +1,10 @@ # Evaluator: EV-06 Session Coherence # Scores overall memory system behaviour across a full session. # 100% sampling — one session summary per session (low volume). +# +# PATH B NOTE: EV-06 correctly uses trace-level tags filter. +# This evaluator targets full session TRACES (not individual observations). +# Trace-level tags ARE supported in Langfuse V3 — this filter is correct. id: EV-06 name: session_coherence @@ -9,7 +13,10 @@ target: trace sampling_rate: 1.0 filter: - tags: [capture, session_summary, injection, tier1, bootstrap] + # Trace-level tags filter — valid for trace-targeted evaluators in V3. + # session_trace tag is set on full-session traces by session hooks. 
+ tags: + - "session_trace" prompt_file: ev06_session_coherence_prompt.md diff --git a/monitoring/main.py b/monitoring/main.py index a0d6d6e2..d792efe8 100755 --- a/monitoring/main.py +++ b/monitoring/main.py @@ -229,25 +229,25 @@ async def update_metrics_periodically(): logger.warning( "pushgateway_push_failed", extra={ - "collection": collection_name, - "error": str(push_error), + "collection": sanitize_log_input(collection_name), + "error": sanitize_log_input(str(push_error)), }, ) logger.debug( "metrics_updated", extra={ - "collection": collection_name, + "collection": sanitize_log_input(collection_name), "total_points": stats.total_points, }, ) except Exception as e: logger.warning( "metrics_update_failed", - extra={"collection": collection_name, "error": str(e)}, + extra={"collection": sanitize_log_input(collection_name), "error": sanitize_log_input(str(e))}, ) except Exception as e: - logger.error("metrics_updater_error", extra={"error": str(e)}) + logger.error("metrics_updater_error", extra={"error": sanitize_log_input(str(e))}) await asyncio.sleep(60) # Update every 60 seconds @@ -364,7 +364,7 @@ async def health(): except Exception as e: logger.warning( "stats_check_failed", - extra={"collection": collection_name, "error": str(e)}, + extra={"collection": sanitize_log_input(collection_name), "error": sanitize_log_input(str(e))}, ) # Determine status based on warnings @@ -390,7 +390,7 @@ async def health(): warnings=all_warnings, ) except Exception as e: - logger.error("health_check_failed", extra={"error": str(e)}) + logger.error("health_check_failed", extra={"error": sanitize_log_input(str(e))}) return JSONResponse( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={ @@ -426,7 +426,7 @@ async def readiness(): logger.info("readiness_check_passed", extra={"qdrant_available": True}) return {"status": "ready", "qdrant_available": True} except Exception as e: - logger.warning("readiness_check_failed", extra={"error": str(e)}) + 
logger.warning("readiness_check_failed", extra={"error": sanitize_log_input(str(e))}) return JSONResponse( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"status": "not_ready", "qdrant_available": False}, @@ -481,7 +481,7 @@ async def get_memory(memory_id: str, collection: str = "code-patterns"): except UnexpectedResponse as e: logger.error( "qdrant_error", - extra={"error": str(e), "memory_id": sanitize_log_input(memory_id)}, + extra={"error": sanitize_log_input(str(e)), "memory_id": sanitize_log_input(memory_id)}, ) raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Qdrant unavailable" @@ -584,7 +584,7 @@ async def search_memories(request: SearchRequest): logger.error( "search_failed", extra={ - "error": str(e), + "error": sanitize_log_input(str(e)), "collection": sanitize_log_input(request.collection), }, ) diff --git a/pyproject.toml b/pyproject.toml index 855e2032..cece53b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,8 @@ dependencies = [ "numpy>=1.26.0,<3.0.0", # Security scanning - Layer 3 NER-based PII detection (SPEC-009) "spacy>=3.8.0,<4.0.0", + # Cron schedule parsing for evaluator-scheduler daemon (S-16.5, DEC-110) + "croniter>=2.0.0,<3.0.0", # LLM Observability — Langfuse tracing (SPEC-020, PLAN-008) "langfuse>=3.0,<4.0.0", # Auto-instrumentation for Anthropic SDK calls (SPEC-020 §6) diff --git a/requirements.txt b/requirements.txt index 15d290ca..3900ff19 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,6 +48,9 @@ detect-secrets>=1.4.0 # en_core_web_sm model downloaded in install.sh post-install step spacy>=3.8.0,<4.0.0 +# Cron schedule parsing for evaluator-scheduler daemon (S-16.5, DEC-110) +croniter>=2.0.0,<3.0.0 + # LLM Observability — Langfuse tracing (SPEC-020, PLAN-008) langfuse>=3.0,<4.0.0 diff --git a/scripts/create_score_configs.py b/scripts/create_score_configs.py index 42046541..5e5732f9 100644 --- a/scripts/create_score_configs.py +++ b/scripts/create_score_configs.py @@ 
-1,11 +1,11 @@ #!/usr/bin/env python3 # LANGFUSE: V3 SDK ONLY. See LANGFUSE-INTEGRATION-SPEC.md # FORBIDDEN: Langfuse() constructor, start_span(), start_generation(), langfuse_context -# REQUIRED: get_client(), api.score_configs.create(), flush() +# REQUIRED: get_client(), api.score_configs.create(), api.score_configs.list(), flush() """Idempotent Score Config setup in Langfuse. Creates 6 Score Configs that enforce validation schemas on evaluation scores. -Safe to run multiple times — existing configs are left unchanged. +Safe to run multiple times — existing configs are skipped (pre-checked via list API). Score Configs created: NUMERIC: retrieval_relevance (0-1), bootstrap_quality (0-1), session_coherence (0-1) @@ -14,6 +14,7 @@ Usage: python scripts/create_score_configs.py + python scripts/create_score_configs.py --cleanup-duplicates Requires env vars: LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_BASE_URL @@ -21,9 +22,75 @@ """ import sys +import argparse + + +def _fetch_existing_configs(langfuse) -> dict[str, list]: + """Fetch all existing score configs from Langfuse, keyed by name. + + Returns a dict of {name: [config, ...]} to detect duplicates. + Uses cursor-based pagination to retrieve all pages. 
+ """ + existing: dict[str, list] = {} + page = 1 + while True: + try: + response = langfuse.api.score_configs.list(page=page, limit=100) + except Exception as exc: + print(f" [WARN] Could not list existing score configs: {exc}") + break + + items = getattr(response, "data", None) or [] + if not items: + break + + for cfg in items: + name = getattr(cfg, "name", None) + if name: + existing.setdefault(name, []).append(cfg) + + # Check if there are more pages + meta = getattr(response, "meta", None) + total_pages = getattr(meta, "total_pages", 1) if meta else 1 + if page >= total_pages: + break + page += 1 + + return existing + + +def _cleanup_duplicates(langfuse, existing: dict[str, list]) -> None: + """Delete duplicate score configs, keeping the oldest one (first created).""" + for name, configs in existing.items(): + if len(configs) <= 1: + continue + # Sort by created_at ascending, keep first, delete the rest + try: + sorted_configs = sorted(configs, key=lambda c: getattr(c, "created_at", "9999-99-99")) + except Exception: + sorted_configs = configs + + to_delete = sorted_configs[1:] + for cfg in to_delete: + cfg_id = getattr(cfg, "id", None) + if not cfg_id: + continue + try: + langfuse.api.score_configs.delete(cfg_id) + print(f" [DEL] Removed duplicate config '{name}' id={cfg_id}") + except Exception as exc: + print(f" [WARN] Could not delete duplicate '{name}' id={cfg_id}: {exc}") def main() -> int: + parser = argparse.ArgumentParser(description="Create Langfuse score configs idempotently") + parser.add_argument( + "--cleanup-duplicates", + action="store_true", + help="Delete duplicate score configs (keep oldest), then recreate missing ones", + ) + args = parser.parse_args() + try: from langfuse import ( get_client, # V3 singleton — NEVER use Langfuse() constructor @@ -36,7 +103,20 @@ def main() -> int: langfuse = get_client() - print("Creating Score Configs in Langfuse...") + print("Fetching existing Score Configs from Langfuse...") + existing = 
_fetch_existing_configs(langfuse) + if existing: + print(f" Found {sum(len(v) for v in existing.values())} existing config(s): {list(existing.keys())}") + else: + print(" No existing configs found.") + + if args.cleanup_duplicates: + print("\nCleaning up duplicate Score Configs...") + _cleanup_duplicates(langfuse, existing) + # Re-fetch after cleanup + existing = _fetch_existing_configs(langfuse) + + print("\nCreating Score Configs in Langfuse...") # --- NUMERIC scores (0.0 - 1.0) --- numeric_names = [ @@ -45,6 +125,9 @@ def main() -> int: "session_coherence", ] for name in numeric_names: + if name in existing: + print(f" [SKIP] NUMERIC: {name} (already exists)") + continue try: langfuse.api.score_configs.create( request=CreateScoreConfigRequest( @@ -57,12 +140,14 @@ def main() -> int: ) print(f" [OK] NUMERIC: {name} (0.0 - 1.0)") except Exception as exc: - # Idempotent: log but don't fail if config already exists - print(f" [SKIP] {name}: {exc}") + print(f" [ERROR] {name}: {exc}") # --- BOOLEAN scores --- boolean_names = ["injection_value", "capture_completeness"] for name in boolean_names: + if name in existing: + print(f" [SKIP] BOOLEAN: {name} (already exists)") + continue try: langfuse.api.score_configs.create( request=CreateScoreConfigRequest( @@ -73,27 +158,31 @@ def main() -> int: ) print(f" [OK] BOOLEAN: {name}") except Exception as exc: - print(f" [SKIP] {name}: {exc}") + print(f" [ERROR] {name}: {exc}") - # --- CATEGORICAL score (PM #190 addition) --- - try: - langfuse.api.score_configs.create( - request=CreateScoreConfigRequest( - name="classification_accuracy", - dataType=ScoreConfigDataType.CATEGORICAL, - categories=[ - ConfigCategory(label="correct", value=1.0), - ConfigCategory(label="partially_correct", value=0.5), - ConfigCategory(label="incorrect", value=0.0), - ], - description="LLM-as-judge classification accuracy (PLAN-012)", + # --- CATEGORICAL score --- + cat_name = "classification_accuracy" + if cat_name in existing: + print(f" [SKIP] 
CATEGORICAL: {cat_name} (already exists)") + else: + try: + langfuse.api.score_configs.create( + request=CreateScoreConfigRequest( + name=cat_name, + dataType=ScoreConfigDataType.CATEGORICAL, + categories=[ + ConfigCategory(label="correct", value=1.0), + ConfigCategory(label="partially_correct", value=0.5), + ConfigCategory(label="incorrect", value=0.0), + ], + description="LLM-as-judge classification accuracy (PLAN-012)", + ) ) - ) - print( - " [OK] CATEGORICAL: classification_accuracy (correct|partially_correct|incorrect)" - ) - except Exception as exc: - print(f" [SKIP] classification_accuracy: {exc}") + print( + " [OK] CATEGORICAL: classification_accuracy (correct|partially_correct|incorrect)" + ) + except Exception as exc: + print(f" [ERROR] {cat_name}: {exc}") # Flush all buffered data before exit (V3 requirement for short-lived scripts) langfuse.flush() diff --git a/scripts/install.sh b/scripts/install.sh index 93b28d3f..9f5f83aa 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1037,6 +1037,18 @@ update_shared_scripts() { log_debug "Synced $docker_count Docker files to INSTALL_DIR" fi + # Sync evaluator config and definitions (S-16.5, DEC-110) + # Required by evaluator-scheduler container at runtime + if [[ -f "$SCRIPT_DIR/../evaluator_config.yaml" ]]; then + cp "$SCRIPT_DIR/../evaluator_config.yaml" "$INSTALL_DIR/evaluator_config.yaml" || log_warning "Failed to copy evaluator_config.yaml" + log_debug "Updated evaluator_config.yaml" + fi + if [[ -d "$SCRIPT_DIR/../evaluators" ]]; then + mkdir -p "$INSTALL_DIR/evaluators" + cp -r "$SCRIPT_DIR/../evaluators/"* "$INSTALL_DIR/evaluators/" 2>/dev/null || true + log_debug "Updated evaluators/ directory" + fi + # Update templates (new templates added in updates must reach installed dir) if [[ -d "$SCRIPT_DIR/../templates" ]]; then mkdir -p "$INSTALL_DIR/templates" @@ -1576,6 +1588,17 @@ copy_files() { cp -r "$SOURCE_DIR/templates/"* "$INSTALL_DIR/templates/" fi + # Copy evaluator config and definitions 
(S-16.5, DEC-110) + # Required by evaluator-scheduler container at runtime (../evaluator_config.yaml, ../evaluators/) + log_debug "Copying evaluator configuration..." + if [[ -f "$SOURCE_DIR/evaluator_config.yaml" ]]; then + cp "$SOURCE_DIR/evaluator_config.yaml" "$INSTALL_DIR/evaluator_config.yaml" || log_warning "Failed to copy evaluator_config.yaml" + fi + if [[ -d "$SOURCE_DIR/evaluators" ]]; then + mkdir -p "$INSTALL_DIR/evaluators" + cp -r "$SOURCE_DIR/evaluators/"* "$INSTALL_DIR/evaluators/" 2>/dev/null || log_warning "Failed to copy evaluators directory" + fi + # Copy CHANGELOG for reference (TD-170) if [[ -f "$SOURCE_DIR/CHANGELOG.md" ]]; then cp "$SOURCE_DIR/CHANGELOG.md" "$INSTALL_DIR/" || log_warning "Failed to copy CHANGELOG.md" diff --git a/scripts/memory/evaluator_scheduler.py b/scripts/memory/evaluator_scheduler.py new file mode 100644 index 00000000..10250105 --- /dev/null +++ b/scripts/memory/evaluator_scheduler.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +"""Evaluator scheduler daemon. + +Runs LLM-as-Judge evaluation pipeline on a cron schedule defined in +evaluator_config.yaml. + +Architecture: +- Synchronous daemon with croniter for schedule parsing (DEC-110) +- Health check via /tmp/evaluator-scheduler.health timestamp file +- Graceful shutdown via atexit + SIGTERM handler +- Placed in docker-compose.langfuse.yml under langfuse profile + +Usage: + python scripts/memory/evaluator_scheduler.py + +Reference: +- DEC-110: Standalone evaluator-scheduler container design +- PLAN-012 Phase 2: LLM-as-Judge evaluation pipeline +""" + +# LANGFUSE: V3 SDK ONLY. 
See LANGFUSE-INTEGRATION-SPEC.md + +import atexit +import logging +import os +import signal +import sys +import time +from datetime import datetime, timedelta, timezone +from pathlib import Path + +# Setup Python path for imports +INSTALL_DIR = os.environ.get( + "AI_MEMORY_INSTALL_DIR", os.path.expanduser("~/.ai-memory") +) +sys.path.insert(0, os.path.join(INSTALL_DIR, "src")) + +# Configuration +CONFIG_PATH = os.environ.get("EVALUATOR_CONFIG_PATH", "/app/evaluator_config.yaml") +HEALTH_FILE = Path("/tmp/evaluator-scheduler.health") + +# Setup logging +logging.basicConfig( + level=os.environ.get("AI_MEMORY_LOG_LEVEL", "INFO"), + format="%(asctime)s %(name)s %(levelname)s %(message)s", +) +logger = logging.getLogger("ai_memory.evaluator.scheduler") + +# Graceful shutdown flag (set by SIGTERM handler) +_shutdown_requested = False + + +# ============================================================================= +# HEALTH CHECK +# ============================================================================= + + +def _touch_health_file() -> None: + """Write health check marker for Docker healthcheck (graceful degradation).""" + try: + HEALTH_FILE.touch() + logger.debug("health_file_updated: %s", HEALTH_FILE) + except Exception as exc: + logger.warning("health_file_update_failed: %s", exc) + + +# ============================================================================= +# SIGNAL HANDLING + LANGFUSE SHUTDOWN +# ============================================================================= + + +def _handle_sigterm(signum, frame) -> None: # noqa: ARG001 + """SIGTERM handler — request graceful shutdown.""" + global _shutdown_requested + logger.info("sigterm_received — requesting graceful shutdown") + _shutdown_requested = True + + +def _register_langfuse_shutdown() -> None: + """Register atexit handler to flush and shutdown Langfuse V3 client on exit.""" + + def _langfuse_shutdown() -> None: + try: + from langfuse import get_client # noqa: PLC0415 + + client = 
get_client() + if client: + client.flush() + client.shutdown() # V3 spec: NO arguments — shutdown(timeout=x) raises TypeError + except Exception: + pass # Never fail on process exit + + atexit.register(_langfuse_shutdown) + + +# ============================================================================= +# CONFIG LOADING +# ============================================================================= + + +def load_schedule_config(config_path: str) -> dict: + """Load and return the schedule section from evaluator_config.yaml. + + Args: + config_path: Path to evaluator_config.yaml + + Returns: + Schedule config dict with keys: enabled, cron, lookback_hours + """ + import yaml # noqa: PLC0415 + + with open(config_path) as f: + config = yaml.safe_load(f) + return config.get("schedule", {}) + + +# ============================================================================= +# SLEEP HELPER +# ============================================================================= + + +def _interruptible_sleep(seconds: float, chunk: float = 5.0) -> None: + """Sleep for `seconds`, waking every `chunk` seconds to check shutdown flag. + + Args: + seconds: Total sleep duration in seconds + chunk: Maximum sleep chunk size in seconds (default: 5.0) + """ + remaining = seconds + while remaining > 0 and not _shutdown_requested: + sleep_for = min(remaining, chunk) + time.sleep(sleep_for) + remaining -= sleep_for + + +# ============================================================================= +# SCHEDULER LOOP +# ============================================================================= + + +def run_scheduler(config_path: str = CONFIG_PATH) -> None: + """Main scheduler loop. + + Reads schedule config, sleeps until next cron tick, then invokes + EvaluatorRunner.run(). Errors in evaluation are logged and the + scheduler continues to the next cycle — it never crashes on eval failure. 
+ + Args: + config_path: Path to evaluator_config.yaml + """ + global _shutdown_requested + + logger.info("evaluator_scheduler_starting, config=%s", config_path) + + # Write health file on startup (Docker healthcheck passes immediately — like + # classifier worker BUG-045 fix: don't wait until first run completes) + _touch_health_file() + + while not _shutdown_requested: + # Reload schedule config each cycle to support live config changes + try: + schedule = load_schedule_config(config_path) + except Exception as exc: + logger.error("config_load_failed: %s — retrying in 60s", exc) + _interruptible_sleep(60) + continue + + if not schedule.get("enabled", False): + logger.info("schedule_disabled — sleeping 300s before recheck") + _interruptible_sleep(300) + continue + + cron_expr = schedule.get("cron", "0 5 * * *") + lookback_hours = int(schedule.get("lookback_hours", 24)) + + # Calculate next run time using croniter + from croniter import croniter # noqa: PLC0415 + + now = datetime.now(tz=timezone.utc) + try: + cron = croniter(cron_expr, now) + next_run: datetime = cron.get_next(datetime) + except (ValueError, KeyError) as exc: + logger.error("invalid_cron_expression: %s — retrying in 60s", exc) + _interruptible_sleep(60) + continue + + sleep_seconds = (next_run - now).total_seconds() + logger.info( + "next_evaluation_scheduled: cron=%s next_run_utc=%s sleep_seconds=%.0f", + cron_expr, + next_run.isoformat(), + sleep_seconds, + ) + + # Sleep until next cron tick, waking periodically to check shutdown + _interruptible_sleep(sleep_seconds) + + if _shutdown_requested: + break + + # Re-check enabled — config may have changed while sleeping + try: + schedule = load_schedule_config(config_path) + except Exception as exc: + logger.error("config_reload_failed before run: %s — skipping cycle", exc) + continue + + if not schedule.get("enabled", False): + logger.info("schedule_disabled at run time — skipping cycle") + continue + + # Re-read lookback_hours from reloaded config 
(avoid stale value from first load) + lookback_hours = int(schedule.get("lookback_hours", 24)) + + # Run evaluation with lookback window + since = datetime.now(tz=timezone.utc) - timedelta(hours=lookback_hours) + logger.info( + "evaluation_starting: since=%s lookback_hours=%d", + since.isoformat(), + lookback_hours, + ) + + try: + from memory.evaluator import EvaluatorRunner # noqa: PLC0415 + + runner = EvaluatorRunner(config_path) + summary = runner.run(since=since) + logger.info("evaluation_complete: %s", summary) + # Touch health file only after successful run (not on failure) + _touch_health_file() + except Exception as exc: + logger.error( + "evaluation_failed: %s — continuing to next cycle", + exc, + exc_info=True, + ) + # Do NOT update health file on failure + # Continue scheduling — do not crash daemon on evaluation errors + + +# ============================================================================= +# ENTRY POINT +# ============================================================================= + + +def main() -> None: + """Entry point.""" + signal.signal(signal.SIGTERM, _handle_sigterm) + _register_langfuse_shutdown() + + try: + run_scheduler() + except KeyboardInterrupt: + logger.info("keyboard_interrupt_received") + finally: + logger.info("evaluator_scheduler_stopped") + + +if __name__ == "__main__": + main() diff --git a/src/memory/evaluator/provider.py b/src/memory/evaluator/provider.py index 4a99b0e9..326bcc47 100644 --- a/src/memory/evaluator/provider.py +++ b/src/memory/evaluator/provider.py @@ -11,6 +11,8 @@ import json import logging import os +import random +import time from dataclasses import dataclass, field from typing import Any, Literal @@ -35,6 +37,7 @@ class EvaluatorConfig: max_tokens: int = ( 4096 # Must accommodate thinking tokens + output for reasoning models ) + max_retries: int = 3 _client: Any = field(default=None, init=False, repr=False) @classmethod @@ -51,6 +54,7 @@ def from_yaml(cls, path: str) -> "EvaluatorConfig": 
base_url=model_cfg.get("base_url"), temperature=model_cfg.get("temperature", 0.0), max_tokens=model_cfg.get("max_tokens", 4096), + max_retries=model_cfg.get("max_retries", 3), ) def get_client(self): @@ -63,7 +67,7 @@ def get_client(self): openai.OpenAI for ollama/openrouter/openai/custom providers anthropic.Anthropic for the anthropic provider (native SDK) """ - if self._client: + if self._client is not None: return self._client if self.provider == "ollama": @@ -129,38 +133,100 @@ def get_client(self): def evaluate(self, prompt: str) -> dict: """Send prompt to LLM and return parsed evaluation result. + Retries on transient HTTP errors (500, 502, 503, 429) and network + errors up to self.max_retries times with exponential backoff + jitter. + Args: prompt: The evaluation prompt (with trace input/output embedded) Returns: dict with keys: "score" (float|bool|str) and "reasoning" (str) """ - client = self.get_client() - - if self.provider == "anthropic": - # Native Anthropic SDK uses client.messages.create() - response = client.messages.create( - model=self.model_name, - max_tokens=self.max_tokens, - temperature=self.temperature, - messages=[{"role": "user", "content": prompt}], - ) - content = response.content[0].text - else: - # All other providers use OpenAI-compatible chat.completions.create() - response = client.chat.completions.create( - model=self.model_name, - temperature=self.temperature, - max_tokens=self.max_tokens, - messages=[{"role": "user", "content": prompt}], - ) - msg = response.choices[0].message - content = msg.content - # Thinking models (e.g., Qwen3) may put output in reasoning field - if not content and hasattr(msg, "reasoning") and msg.reasoning: - content = msg.reasoning - - return _parse_evaluation_response(content) + _RETRYABLE_STATUS_CODES = {500, 502, 503, 429} + _BASE_DELAY = 1.0 + last_exc: Exception | None = None + + for attempt in range(self.max_retries + 1): + try: + client = self.get_client() + + if self.provider == "anthropic": + # 
Native Anthropic SDK uses client.messages.create() + response = client.messages.create( + model=self.model_name, + max_tokens=self.max_tokens, + temperature=self.temperature, + messages=[{"role": "user", "content": prompt}], + ) + content = response.content[0].text + else: + # All other providers use OpenAI-compatible chat.completions.create() + response = client.chat.completions.create( + model=self.model_name, + temperature=self.temperature, + max_tokens=self.max_tokens, + messages=[{"role": "user", "content": prompt}], + ) + msg = response.choices[0].message + content = msg.content + # Thinking models (e.g., Qwen3) may put output in reasoning field + if not content and hasattr(msg, "reasoning") and msg.reasoning: + content = msg.reasoning + + return _parse_evaluation_response(content) + + except (ConnectionError, TimeoutError, OSError) as exc: + last_exc = exc + if attempt >= self.max_retries: + break + delay = _BASE_DELAY * (2**attempt) + delay += random.uniform(0, 0.5 * delay) + logger.warning( + "Retry %d/%d: network error (%s), waiting %.2fs", + attempt + 1, + self.max_retries, + type(exc).__name__, + delay, + ) + time.sleep(delay) + + except Exception as exc: + last_exc = exc + status_code = getattr(exc, "status_code", None) + if status_code not in _RETRYABLE_STATUS_CODES: + raise + + if attempt >= self.max_retries: + break + + # For 429, honour Retry-After header if present + delay = _BASE_DELAY * (2**attempt) + if status_code == 429: + try: + retry_after = exc.response.headers.get("Retry-After") + if retry_after is not None: + delay = float(retry_after) + except (AttributeError, TypeError, ValueError): + pass # Retry-After may be an HTTP-date or malformed — fall back to backoff + + delay += random.uniform(0, 0.5 * delay) + logger.warning( + "Retry %d/%d: HTTP %s, waiting %.2fs", + attempt + 1, + self.max_retries, + status_code, + delay, + ) + time.sleep(delay) + + # All retries exhausted — reset client to force fresh connection next call + self._client = None + logger.error( + "Evaluator failed after %d retries. 
Last error: %s", + self.max_retries, + last_exc, + ) + raise last_exc def _parse_evaluation_response(content: str) -> dict: diff --git a/src/memory/evaluator/runner.py b/src/memory/evaluator/runner.py index 847d9a84..2bad2497 100644 --- a/src/memory/evaluator/runner.py +++ b/src/memory/evaluator/runner.py @@ -1,18 +1,26 @@ # LANGFUSE: V3 SDK ONLY. See LANGFUSE-INTEGRATION-SPEC.md # FORBIDDEN: Langfuse() constructor, start_span(), start_generation(), langfuse_context # REQUIRED: get_client(), create_score(), flush() -"""Evaluation runner — fetches traces, evaluates, attaches scores. +"""Evaluation runner — fetches traces and observations, evaluates, attaches scores. Core pipeline: 1. Load evaluator config + evaluator definitions - 2. Fetch traces using page-based pagination (V3 SDK) - 3. Filter traces by evaluator criteria (event_types, tags) - 4. Sample traces per evaluator's sampling_rate - 5. Evaluate each trace via configurable LLM judge - 6. Attach scores to Langfuse via create_score() (idempotent) - 7. Log each evaluation to audit log + 2. Per evaluator, read target: "trace" or "observation" (per-evaluator YAML field) + 3. Trace path (EV-05, EV-06): page-based pagination via trace.list() + — uses from_timestamp/to_timestamp/page/meta.total_pages (V3 trace API) + 4. Observation path (EV-01–EV-04): cursor-based pagination via observations.get_many() + — uses from_start_time/to_start_time/cursor/meta.next_cursor (V3 observation API, BUG-217) + 5. Filter observations by name (event_type) — server-side via name= parameter (Path B) + 6. Sample traces/observations per evaluator's sampling_rate + 7. Evaluate each trace/observation via configurable LLM judge + 8. Attach scores to Langfuse via create_score() (idempotent via score_id=) + 9. Log each evaluation to audit log + +Note: evaluation_targets in evaluator_config.yaml is DEPRECATED. + Use the per-evaluator target: field in each evaluator YAML instead. 
PLAN-012 Phase 2 — Section 5.4 +S-16.3: Observation-level evaluation + cursor-based observation pagination (BUG-217) """ import hashlib @@ -41,7 +49,7 @@ def get_client(): # type: ignore[misc] class EvaluatorRunner: - """Core evaluation pipeline for LLM-as-Judge scoring of Langfuse traces.""" + """Core evaluation pipeline for LLM-as-Judge scoring of Langfuse traces and observations.""" def __init__(self, config_path: str): """Load evaluator config and prepare runner. @@ -157,23 +165,64 @@ def _build_prompt(self, evaluator: dict, trace: Any) -> str: f"**Metadata**: {trace_metadata}" ) + def _build_observation_prompt(self, evaluator: dict, observation: Any) -> str: + """Build evaluation prompt from template + observation data. + + Do NOT pass observation objects to _build_prompt() — observation schema + differs from trace schema (e.g. no tags, name is event_type). + + Args: + evaluator: Evaluator definition dict + observation: Langfuse ObservationV2 object + + Returns: + Formatted evaluation prompt + """ + template = self._load_prompt(evaluator) + + obs_name = str(getattr(observation, "name", "") or "") + obs_input = str(getattr(observation, "input", "") or "")[:TRACE_CONTENT_MAX] + obs_output = str(getattr(observation, "output", "") or "")[:TRACE_CONTENT_MAX] + obs_metadata = str(getattr(observation, "metadata", {}) or {})[:TRACE_CONTENT_MAX] + + return ( + f"{template}\n\n" + f"## Observation to Evaluate\n\n" + f"**Event Type**: {obs_name}\n\n" + f"**Input**: {obs_input}\n\n" + f"**Output**: {obs_output}\n\n" + f"**Metadata**: {obs_metadata}" + ) + def _make_score_id( - self, trace_id: str, evaluator_name: str, since: datetime + self, + trace_id: str, + evaluator_name: str, + since: datetime, + observation_id: str | None = None, ) -> str: """Generate deterministic score ID for idempotency. - Uses MD5 of f"{trace_id}:{evaluator_name}:{since.isoformat()}" - so re-running the same evaluation window produces the same score ID. 
+ Includes observation_id in the hash when present — without it, multiple + observations in the same trace would produce IDENTICAL score_ids (BUG-S163). + + Seed format: + - With observation: f"{trace_id}:{observation_id}:{evaluator_name}:{since.isoformat()}" + - Without observation: f"{trace_id}:{evaluator_name}:{since.isoformat()}" Args: trace_id: Langfuse trace ID evaluator_name: Evaluator name (e.g., "retrieval_relevance") since: Start of evaluation window + observation_id: Optional Langfuse observation ID (for observation-level scores) Returns: 32-character hex MD5 string """ - seed = f"{trace_id}:{evaluator_name}:{since.isoformat()}" + if observation_id: + seed = f"{trace_id}:{observation_id}:{evaluator_name}:{since.isoformat()}" + else: + seed = f"{trace_id}:{evaluator_name}:{since.isoformat()}" return hashlib.md5(seed.encode()).hexdigest() def _append_audit_log(self, entry: dict) -> None: @@ -182,108 +231,272 @@ def _append_audit_log(self, entry: dict) -> None: with open(self.audit_log_path, "a") as f: f.write(json.dumps(entry) + "\n") - def run( + def _run_observation_evaluator( self, - evaluator_id: str | None = None, - *, + langfuse: Any, + evaluator: dict, + ev_name: str, since: datetime, - until: datetime | None = None, - dry_run: bool = False, - batch_size: int = 10, - ) -> dict: - """Run evaluations for all (or one) evaluator. + until: datetime, + dry_run: bool, + batch_size: int, + ) -> tuple[int, int, int, int]: + """Run observation-level evaluation for a single evaluator. + + Fetches observations via observations.get_many() with cursor-based pagination + (BUG-217). Filters by observation name (event_type) server-side via name= + parameter (Path B — tags are trace-level only in V3). 
Args: - evaluator_id: Optional evaluator ID to run (e.g., "EV-01") - since: Start of evaluation window (required — CLI provides this) - until: End of evaluation window (default: now) - dry_run: If True, evaluate but do not save scores to Langfuse - batch_size: Number of traces to fetch per page + langfuse: Langfuse client from get_client() + evaluator: Evaluator definition dict + ev_name: Evaluator name string + since: Start of evaluation window + until: End of evaluation window + dry_run: If True, evaluate but do not save scores + batch_size: Number of observations per cursor page Returns: - Summary dict with counts: fetched, sampled, evaluated, scored + Tuple of (fetched, sampled, evaluated, scored) """ - # V3 singleton — NEVER use Langfuse() constructor directly - langfuse = get_client() - - if until is None: - until = datetime.now(tz=timezone.utc) + ev_filter = evaluator.get("filter", {}) + sampling_rate = float(evaluator.get("sampling_rate", 1.0)) + score_type = evaluator.get("score_type", "NUMERIC") + event_types = ev_filter.get("event_types", []) - evaluators = self._load_evaluators(evaluator_id) - if not evaluators: - logger.warning( - "No evaluators found (dir=%s, id=%s)", self.evaluators_dir, evaluator_id - ) - return {"fetched": 0, "sampled": 0, "evaluated": 0, "scored": 0} - - total_fetched = 0 - total_sampled = 0 - total_evaluated = 0 - total_scored = 0 - - try: - for evaluator in evaluators: - ev_name = evaluator.get("name", evaluator.get("id", "unknown")) - sampling_rate = float(evaluator.get("sampling_rate", 1.0)) - ev_filter = evaluator.get("filter", {}) + # If no event_types defined, fetch all observations (no name filter). + # Otherwise iterate once per event_type using server-side name= filter. 
+ name_filters: list[str | None] = event_types if event_types else [None] - print(f"\n--- Running {evaluator.get('id', '?')}: {ev_name} ---") + fetched = sampled = evaluated = scored = 0 - page = 1 # V3 uses page-based pagination (1-indexed) + try: # R2-F3: catch fetch-level errors so one evaluator doesn't kill the whole run + for name_filter in name_filters: + cursor: str | None = None while True: - traces_response = langfuse.api.trace.list( - from_timestamp=since, - to_timestamp=until, - page=page, + # Cursor-based pagination — V3 observations API (BUG-217) + obs_response = langfuse.api.observations.get_many( + name=name_filter, + from_start_time=since, + to_start_time=until, + cursor=cursor, limit=batch_size, ) - traces = traces_response.data or [] - total_fetched += len(traces) - - for trace in traces: - # Filter first, then sample from matching traces only (UF-4) - if not self._matches_filter(trace, ev_filter): - continue + observations = obs_response.data or [] + fetched += len(observations) + for obs in observations: # Apply sampling if random.random() > sampling_rate: continue - total_sampled += 1 - # Skip traces with no output - if getattr(trace, "output", None) is None: + obs_id = obs.id + trace_id = getattr(obs, "trace_id", None) or "" + + # R2-F5: skip observations with no trace_id + if not trace_id: + logger.warning( + "Observation %s has no trace_id — skipping", obs_id + ) continue - # Build prompt and evaluate — isolated per trace (UF-3) + # R2-F7: check output BEFORE incrementing sampled + if getattr(obs, "output", None) is None: + continue + + # Note: 'sampled' counts observations that passed random gate + trace_id + output guards — not raw sampling rate + sampled += 1 + try: - prompt = self._build_prompt(evaluator, trace) + prompt = self._build_observation_prompt(evaluator, obs) result = self.evaluator_config.evaluate(prompt) if result.get("score") is None: logger.warning( - "Evaluator returned null score for trace %s", - trace.id, + "Evaluator 
returned null score for observation %s", obs_id ) continue - total_evaluated += 1 + evaluated += 1 - # Attach score to Langfuse (idempotent via score_id) + # Attach score (idempotent via score_id) if not dry_run: - score_id = self._make_score_id(trace.id, ev_name, since) + score_id = self._make_score_id( + trace_id, ev_name, since, observation_id=obs_id + ) + # CATEGORICAL scores use string value; NUMERIC/BOOLEAN use float + score_value = ( + str(result["score"]) + if score_type == "CATEGORICAL" + else result["score"] + ) + # R1-F3: validate categorical value against defined categories + if score_type == "CATEGORICAL": + valid_categories = evaluator.get("categories", []) + if valid_categories and str(result["score"]) not in valid_categories: + logger.warning( + "Observation %s: categorical value %r not in " + "categories %s — skipping", + obs_id, result["score"], valid_categories, + ) + continue langfuse.create_score( score_id=score_id, - trace_id=trace.id, + trace_id=trace_id, + observation_id=obs_id, name=ev_name, - value=result["score"], - data_type=evaluator.get("score_type", "NUMERIC"), + value=score_value, + data_type=score_type, comment=str(result.get("reasoning", ""))[ :TRACE_CONTENT_MAX ], ) - total_scored += 1 + scored += 1 + + # R2-F2: audit log only written when not dry_run + self._append_audit_log( + { + "timestamp": datetime.now(tz=timezone.utc).isoformat(), + "evaluator_id": evaluator.get("id"), + "evaluator_name": ev_name, + "trace_id": trace_id, + "observation_id": obs_id, + "score": result["score"], + "reasoning": str(result.get("reasoning", ""))[:500], + "dry_run": dry_run, + } + ) + + reasoning_preview = str(result.get("reasoning", ""))[:80] + print( + f" [{ev_name}] obs={obs_id[:8]}... trace={trace_id[:8]}... 
" + f"score={result['score']} | {reasoning_preview}" + ) + except Exception as exc: + logger.error( + "Error evaluating observation %s with %s: %s", + obs_id, + ev_name, + exc, + ) + continue + + # Cursor-based stop condition — no next cursor means last page + next_cursor = getattr(obs_response.meta, "next_cursor", None) # R2-F1 + if not next_cursor or not observations: + break + cursor = next_cursor + + except Exception as exc: + logger.error("Evaluator %s: error fetching observations: %s", ev_name, exc) - # Audit log + return fetched, sampled, evaluated, scored + + def _run_trace_evaluator( + self, + langfuse: Any, + evaluator: dict, + ev_name: str, + since: datetime, + until: datetime, + dry_run: bool, + batch_size: int, + ) -> tuple[int, int, int, int]: + """Run trace-level evaluation for a single evaluator. + + Fetches traces via trace.list() with page-based pagination. + Uses from_timestamp/to_timestamp/page/meta.total_pages (V3 trace API). + + Args: + langfuse: Langfuse client from get_client() + evaluator: Evaluator definition dict + ev_name: Evaluator name string + since: Start of evaluation window + until: End of evaluation window + dry_run: If True, evaluate but do not save scores + batch_size: Number of traces per page + + Returns: + Tuple of (fetched, sampled, evaluated, scored) + """ + ev_filter = evaluator.get("filter", {}) + sampling_rate = float(evaluator.get("sampling_rate", 1.0)) + score_type = evaluator.get("score_type", "NUMERIC") + + fetched = sampled = evaluated = scored = 0 + page = 1 # V3 trace.list() is page-based (1-indexed) + + try: # R2-F3: catch fetch-level errors so one evaluator doesn't kill the whole run + while True: + traces_response = langfuse.api.trace.list( + from_timestamp=since, + to_timestamp=until, + page=page, + limit=batch_size, + ) + traces = traces_response.data or [] + fetched += len(traces) + + for trace in traces: + # Filter first, then sample from matching traces only (UF-4) + if not self._matches_filter(trace, 
ev_filter): + continue + + # Apply sampling + if random.random() > sampling_rate: + continue + + # R2-F7: check output BEFORE incrementing sampled + if getattr(trace, "output", None) is None: + continue + + sampled += 1 + + # Build prompt and evaluate — isolated per trace (UF-3) + try: + prompt = self._build_prompt(evaluator, trace) + result = self.evaluator_config.evaluate(prompt) + if result.get("score") is None: + logger.warning( + "Evaluator returned null score for trace %s", + trace.id, + ) + continue + + evaluated += 1 + + # Attach score to Langfuse (idempotent via score_id) + if not dry_run: + score_id = self._make_score_id(trace.id, ev_name, since) + # CATEGORICAL scores use string value; NUMERIC/BOOLEAN use float + score_value = ( + str(result["score"]) + if score_type == "CATEGORICAL" + else result["score"] + ) + # R1-F3: validate categorical value against defined categories + if score_type == "CATEGORICAL": + valid_categories = evaluator.get("categories", []) + if valid_categories and str(result["score"]) not in valid_categories: + logger.warning( + "Trace %s: categorical value %r not in " + "categories %s — skipping", + trace.id, result["score"], valid_categories, + ) + continue + langfuse.create_score( + score_id=score_id, + trace_id=trace.id, + name=ev_name, + value=score_value, + data_type=score_type, + comment=str(result.get("reasoning", ""))[ + :TRACE_CONTENT_MAX + ], + ) + scored += 1 + + # R2-F2: audit log only written when not dry_run self._append_audit_log( { "timestamp": datetime.now( @@ -298,28 +511,99 @@ def run( } ) - reasoning_preview = str(result.get("reasoning", ""))[:80] - print( - f" [{ev_name}] trace={trace.id[:8]}... " - f"score={result['score']} | {reasoning_preview}" - ) - except Exception as exc: - logger.error( - "Error evaluating trace %s with %s: %s", - trace.id, - ev_name, - exc, - ) - continue + reasoning_preview = str(result.get("reasoning", ""))[:80] + print( + f" [{ev_name}] trace={trace.id[:8]}... 
" + f"score={result['score']} | {reasoning_preview}" + ) + except Exception as exc: + logger.error( + "Error evaluating trace %s with %s: %s", + trace.id, + ev_name, + exc, + ) + continue + + # Page-based pagination — advance or stop (V3 trace.list() API) + meta = getattr(traces_response, "meta", None) + total_pages = getattr(meta, "total_pages", 1) if meta else 1 + if page >= total_pages or not traces: + break + page += 1 + + except Exception as exc: + logger.error("Evaluator %s: error fetching traces: %s", ev_name, exc) + + return fetched, sampled, evaluated, scored - # Page-based pagination — advance or stop - meta = getattr(traces_response, "meta", None) - total_pages = getattr(meta, "total_pages", 1) if meta else 1 - if page >= total_pages or not traces: - break - page += 1 + def run( + self, + evaluator_id: str | None = None, + *, + since: datetime, + until: datetime | None = None, + dry_run: bool = False, + batch_size: int = 10, + ) -> dict: + """Run evaluations for all (or one) evaluator. + + Routes each evaluator to the correct path based on the per-evaluator + YAML target: field ("trace" or "observation"). The global evaluation_targets + section in evaluator_config.yaml is DEPRECATED in favour of per-evaluator target:. 
+ + Args: + evaluator_id: Optional evaluator ID to run (e.g., "EV-01") + since: Start of evaluation window (required — CLI provides this) + until: End of evaluation window (default: now) + dry_run: If True, evaluate but do not save scores to Langfuse + batch_size: Number of traces/observations to fetch per page/cursor + + Returns: + Summary dict with counts: fetched, sampled, evaluated, scored + """ + # V3 singleton — NEVER use Langfuse() constructor directly + langfuse = get_client() + + if until is None: + until = datetime.now(tz=timezone.utc) + + evaluators = self._load_evaluators(evaluator_id) + if not evaluators: + logger.warning( + "No evaluators found (dir=%s, id=%s)", self.evaluators_dir, evaluator_id + ) + return {"fetched": 0, "sampled": 0, "evaluated": 0, "scored": 0} + + total_fetched = 0 + total_sampled = 0 + total_evaluated = 0 + total_scored = 0 + + try: + for evaluator in evaluators: + ev_name = evaluator.get("name", evaluator.get("id", "unknown")) + # Per-evaluator target field is source of truth (DEPRECATED: global evaluation_targets) + target = evaluator.get("target", "trace") + + print(f"\n--- Running {evaluator.get('id', '?')}: {ev_name} (target={target}) ---") + + if target == "observation": + f, s, e, sc = self._run_observation_evaluator( + langfuse, evaluator, ev_name, since, until, dry_run, batch_size + ) + else: + # Default: trace-level evaluation (EV-05, EV-06) + f, s, e, sc = self._run_trace_evaluator( + langfuse, evaluator, ev_name, since, until, dry_run, batch_size + ) + + total_fetched += f + total_sampled += s + total_evaluated += e + total_scored += sc - print(f" Evaluated: {total_evaluated} | Scored: {total_scored}") + print(f" Evaluated: {e} | Scored: {sc}") finally: # Flush all buffered scores to Langfuse — runs even if an error occurs (UF-5) diff --git a/tests/test_create_score_configs.py b/tests/test_create_score_configs.py new file mode 100644 index 00000000..899c614a --- /dev/null +++ b/tests/test_create_score_configs.py @@ 
-0,0 +1,294 @@ +"""Tests for scripts/create_score_configs.py idempotency and dedup logic. + +Validates: + - Pre-check via list API skips configs that already exist + - New configs are created when absent + - --cleanup-duplicates removes extra copies, keeps oldest + - flush() is called before exit (V3 requirement) + +PLAN-012 Phase 2 — Section 5.6 (S-16.2) +""" + +import importlib +import os +import sys +import types +from contextlib import contextmanager +from unittest.mock import MagicMock + +import pytest + +# --------------------------------------------------------------------------- +# Path setup +# --------------------------------------------------------------------------- + +SCRIPTS_DIR = os.path.join(os.path.dirname(__file__), "..", "scripts") +sys.path.insert(0, os.path.abspath(SCRIPTS_DIR)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +ALL_CONFIG_NAMES = [ + "retrieval_relevance", + "bootstrap_quality", + "session_coherence", + "injection_value", + "capture_completeness", + "classification_accuracy", +] + + +def _make_config(name: str, cfg_id: str = "abc123", created_at: str = "2026-01-01T00:00:00Z"): + cfg = MagicMock() + cfg.name = name + cfg.id = cfg_id + cfg.created_at = created_at + return cfg + + +def _make_list_response(items: list, total_pages: int = 1): + response = MagicMock() + response.data = items + meta = MagicMock() + meta.total_pages = total_pages + response.meta = meta + return response + + +def _build_langfuse_modules(mock_client): + """Build the fake langfuse module tree needed for create_score_configs imports.""" + fake_langfuse = types.ModuleType("langfuse") + fake_langfuse.get_client = MagicMock(return_value=mock_client) + + fake_commons = types.ModuleType("langfuse.api.resources.commons.types") + fake_commons.ConfigCategory = MagicMock(side_effect=lambda **kw: kw) + fake_commons.ScoreConfigDataType = 
MagicMock() + fake_commons.ScoreConfigDataType.NUMERIC = "NUMERIC" + fake_commons.ScoreConfigDataType.BOOLEAN = "BOOLEAN" + fake_commons.ScoreConfigDataType.CATEGORICAL = "CATEGORICAL" + + fake_sc_types = types.ModuleType("langfuse.api.resources.score_configs.types") + fake_sc_types.CreateScoreConfigRequest = MagicMock(side_effect=lambda **kw: kw) + + modules = { + "langfuse": fake_langfuse, + "langfuse.api": MagicMock(), + "langfuse.api.resources": MagicMock(), + "langfuse.api.resources.commons": MagicMock(), + "langfuse.api.resources.commons.types": fake_commons, + "langfuse.api.resources.score_configs": MagicMock(), + "langfuse.api.resources.score_configs.types": fake_sc_types, + } + return modules + + +@contextmanager +def _patched_module(mock_client): + """Context manager: inject fake langfuse, import module, yield (mod, client).""" + sys.modules.pop("create_score_configs", None) + fake_modules = _build_langfuse_modules(mock_client) + + original = {k: sys.modules.get(k) for k in fake_modules} + sys.modules.update(fake_modules) + try: + mod = importlib.import_module("create_score_configs") + yield mod, mock_client + finally: + sys.modules.pop("create_score_configs", None) + for k, v in original.items(): + if v is None: + sys.modules.pop(k, None) + else: + sys.modules[k] = v + + +def _make_client_with_existing(existing_names: list[str], list_side_effect=None): + """Build a mock client where existing_names are already in Langfuse.""" + client = MagicMock() + configs = [_make_config(n) for n in existing_names] + if list_side_effect is not None: + client.api.score_configs.list.side_effect = list_side_effect + else: + client.api.score_configs.list.return_value = _make_list_response(configs) + client.api.score_configs.create.return_value = MagicMock() + client.api.score_configs.delete.return_value = None + client.flush.return_value = None + return client + + +# --------------------------------------------------------------------------- +# Tests: 
_fetch_existing_configs +# --------------------------------------------------------------------------- + + +class TestFetchExistingConfigs: + def test_returns_empty_when_no_configs(self): + client = _make_client_with_existing([]) + with _patched_module(client) as (mod, c): + result = mod._fetch_existing_configs(c) + assert result == {} + + def test_returns_configs_by_name(self): + client = _make_client_with_existing(["retrieval_relevance", "injection_value"]) + with _patched_module(client) as (mod, c): + result = mod._fetch_existing_configs(c) + assert "retrieval_relevance" in result + assert "injection_value" in result + + def test_detects_duplicates(self): + configs = [ + _make_config("retrieval_relevance", "id-1"), + _make_config("retrieval_relevance", "id-2"), + ] + client = MagicMock() + client.api.score_configs.list.return_value = _make_list_response(configs) + with _patched_module(client) as (mod, c): + result = mod._fetch_existing_configs(c) + assert len(result["retrieval_relevance"]) == 2 + + def test_handles_list_api_error_gracefully(self): + client = MagicMock() + client.api.score_configs.list.side_effect = RuntimeError("network error") + with _patched_module(client) as (mod, c): + result = mod._fetch_existing_configs(c) + assert result == {} + + +# --------------------------------------------------------------------------- +# Tests: _cleanup_duplicates +# --------------------------------------------------------------------------- + + +class TestCleanupDuplicates: + def test_skips_non_duplicates(self): + client = MagicMock() + with _patched_module(client) as (mod, c): + existing = {"retrieval_relevance": [_make_config("retrieval_relevance", "id-1")]} + mod._cleanup_duplicates(c, existing) + client.api.score_configs.delete.assert_not_called() + + def test_deletes_extra_copies_keeps_oldest(self): + client = MagicMock() + older = _make_config("retrieval_relevance", "id-old", "2026-01-01T00:00:00Z") + newer = _make_config("retrieval_relevance", "id-new", 
"2026-02-01T00:00:00Z") + with _patched_module(client) as (mod, c): + existing = {"retrieval_relevance": [older, newer]} + mod._cleanup_duplicates(c, existing) + # Newer (id-new) should be deleted; oldest (id-old) kept + client.api.score_configs.delete.assert_called_once_with("id-new") + + def test_handles_delete_error_gracefully(self): + client = MagicMock() + client.api.score_configs.delete.side_effect = RuntimeError("delete failed") + older = _make_config("retrieval_relevance", "id-old", "2026-01-01T00:00:00Z") + newer = _make_config("retrieval_relevance", "id-new", "2026-02-01T00:00:00Z") + with _patched_module(client) as (mod, c): + existing = {"retrieval_relevance": [older, newer]} + # Should not raise + mod._cleanup_duplicates(c, existing) + + def test_missing_created_at_sorts_last_and_gets_deleted(self): + """Config without created_at should sort last (treated as newest) and be deleted.""" + client = MagicMock() + client.api.score_configs.delete.return_value = None + # Config with a real date — should be kept as the "oldest" + with_date = _make_config("retrieval_relevance", "id-with-date", "2026-01-01T00:00:00Z") + # Config with no created_at attribute — fallback "9999-99-99" sorts last, gets deleted + no_date = MagicMock() + no_date.name = "retrieval_relevance" + no_date.id = "id-no-date" + del no_date.created_at # ensure getattr fallback is triggered + with _patched_module(client) as (mod, c): + existing = {"retrieval_relevance": [with_date, no_date]} + mod._cleanup_duplicates(c, existing) + # The one without created_at should be deleted; the dated one kept + c.api.score_configs.delete.assert_called_once_with("id-no-date") + + +# --------------------------------------------------------------------------- +# Tests: main() idempotency +# --------------------------------------------------------------------------- + + +def _run_main(client, argv=None): + """Run main() with injected fake langfuse and given argv.""" + if argv is None: + argv = 
["create_score_configs.py"] + orig_argv = sys.argv + sys.argv = argv + try: + with _patched_module(client) as (mod, c): + rc = mod.main() + return rc, c + finally: + sys.argv = orig_argv + + +class TestMainIdempotency: + def test_skips_all_when_all_exist(self): + client = _make_client_with_existing(ALL_CONFIG_NAMES) + rc, c = _run_main(client) + assert rc == 0 + c.api.score_configs.create.assert_not_called() + c.flush.assert_called_once() + + def test_creates_all_when_none_exist(self): + client = _make_client_with_existing([]) + rc, c = _run_main(client) + assert rc == 0 + # 6 configs: 3 NUMERIC + 2 BOOLEAN + 1 CATEGORICAL + assert c.api.score_configs.create.call_count == 6 + c.flush.assert_called_once() + + def test_creates_only_missing_configs(self): + existing = ["retrieval_relevance", "injection_value", "classification_accuracy"] + client = _make_client_with_existing(existing) + rc, c = _run_main(client) + assert rc == 0 + # 3 missing: bootstrap_quality, session_coherence, capture_completeness + assert c.api.score_configs.create.call_count == 3 + c.flush.assert_called_once() + + def test_flush_always_called_on_success(self): + client = _make_client_with_existing([]) + rc, c = _run_main(client) + assert rc == 0 + c.flush.assert_called_once() + + def test_cleanup_duplicates_flag_triggers_dedup(self): + older = _make_config("retrieval_relevance", "id-old", "2026-01-01T00:00:00Z") + newer = _make_config("retrieval_relevance", "id-new", "2026-02-01T00:00:00Z") + other_existing = [_make_config(n) for n in ALL_CONFIG_NAMES if n != "retrieval_relevance"] + + # After cleanup, deduped list returned + deduped = [_make_config("retrieval_relevance", "id-old")] + other_existing + + client = MagicMock() + client.api.score_configs.list.side_effect = [ + _make_list_response([older, newer] + other_existing), # initial fetch + _make_list_response(deduped), # post-cleanup fetch + ] + client.api.score_configs.create.return_value = MagicMock() + 
client.api.score_configs.delete.return_value = None + client.flush.return_value = None + + rc, c = _run_main(client, argv=["create_score_configs.py", "--cleanup-duplicates"]) + + assert rc == 0 + # Duplicate deleted + c.api.score_configs.delete.assert_called_once_with("id-new") + # All 6 configs present after cleanup — nothing to create + c.api.score_configs.create.assert_not_called() + c.flush.assert_called_once() + + def test_idempotent_on_repeated_runs(self): + """Second run with all configs present creates nothing.""" + client = _make_client_with_existing(ALL_CONFIG_NAMES) + # Run twice + rc1, _ = _run_main(client) + rc2, c = _run_main(client) + assert rc1 == 0 + assert rc2 == 0 + c.api.score_configs.create.assert_not_called() diff --git a/tests/test_evaluator_runner.py b/tests/test_evaluator_runner.py index 8c20477d..3cfd63d5 100644 --- a/tests/test_evaluator_runner.py +++ b/tests/test_evaluator_runner.py @@ -4,7 +4,17 @@ Tests trace filtering, sampling, score attachment, and pagination using mocked Langfuse client. 
+Covers: +- Trace-level evaluation path (EV-05, EV-06 style) +- Observation-level evaluation path (EV-01–EV-04 style) +- Cursor-based pagination for observations (BUG-217) +- Page-based pagination for traces +- CATEGORICAL score type (EV-04) +- observation_id in _make_score_id to prevent collisions (BUG-S163) +- Per-evaluator target: routing + PLAN-012 Phase 2 — AC-8 +S-16.3 """ import hashlib @@ -30,7 +40,7 @@ def minimal_config(tmp_path) -> Path: provider: ollama model_name: llama3.2:8b temperature: 0.0 - max_tokens: 512 + max_tokens: 4096 evaluators_dir: {evaluators_dir} audit: log_file: {audit_log} @@ -49,6 +59,89 @@ def runner(minimal_config) -> EvaluatorRunner: return EvaluatorRunner(config_path=str(minimal_config)) +@pytest.fixture() +def trace_evaluator_yaml(tmp_path, minimal_config) -> dict: + """Create a trace-level evaluator YAML (EV-05 style).""" + runner = EvaluatorRunner(config_path=str(minimal_config)) + ev_yaml = { + "id": "EV-05", + "name": "bootstrap_quality", + "score_type": "NUMERIC", + "target": "trace", + "filter": {"event_types": ["session_bootstrap"]}, + "sampling_rate": 1.0, + "prompt_file": "ev05_prompt.md", + } + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, exist_ok=True) + + ev_file = ev_dir / "ev05_bootstrap_quality.yaml" + import yaml + + ev_file.write_text(yaml.dump(ev_yaml)) + + prompt_file = ev_dir / "ev05_prompt.md" + prompt_file.write_text("Evaluate this bootstrap session.") + + return ev_yaml + + +@pytest.fixture() +def observation_evaluator_yaml(tmp_path, minimal_config) -> dict: + """Create an observation-level evaluator YAML (EV-01 style).""" + runner = EvaluatorRunner(config_path=str(minimal_config)) + ev_yaml = { + "id": "EV-01", + "name": "retrieval_relevance", + "score_type": "NUMERIC", + "target": "observation", + "filter": {"event_types": ["search_query", "context_retrieval"]}, + "sampling_rate": 1.0, + "prompt_file": "ev01_prompt.md", + } + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, 
exist_ok=True) + + ev_file = ev_dir / "ev01_retrieval_relevance.yaml" + import yaml + + ev_file.write_text(yaml.dump(ev_yaml)) + + prompt_file = ev_dir / "ev01_prompt.md" + prompt_file.write_text("Evaluate this retrieval.") + + return ev_yaml + + +@pytest.fixture() +def categorical_evaluator_yaml(tmp_path, minimal_config) -> dict: + """Create a CATEGORICAL observation-level evaluator (EV-04 style).""" + runner = EvaluatorRunner(config_path=str(minimal_config)) + ev_yaml = { + "id": "EV-04", + "name": "classification_accuracy", + "score_type": "CATEGORICAL", + "target": "observation", + "categories": ["correct", "partially_correct", "incorrect"], + "filter": {"event_types": ["9_classify"]}, + "sampling_rate": 1.0, + "prompt_file": "ev04_prompt.md", + } + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, exist_ok=True) + + ev_file = ev_dir / "ev04_classification_accuracy.yaml" + import yaml + + ev_file.write_text(yaml.dump(ev_yaml)) + + prompt_file = ev_dir / "ev04_prompt.md" + prompt_file.write_text("Evaluate this classification.") + + return ev_yaml + + +# For backward compat — existing tests use evaluator_yaml for a trace evaluator @pytest.fixture() def evaluator_yaml(tmp_path, minimal_config) -> dict: """Create a single evaluator YAML in the evaluators directory.""" @@ -57,7 +150,7 @@ def evaluator_yaml(tmp_path, minimal_config) -> dict: "id": "EV-01", "name": "retrieval_relevance", "score_type": "NUMERIC", - "target": "observation", + "target": "trace", "filter": {"tags": ["retrieval"]}, "sampling_rate": 1.0, "prompt_file": "ev01_prompt.md", @@ -94,7 +187,24 @@ def make_mock_trace( return trace +def make_mock_observation( + obs_id: str = "obs001", + trace_id: str = "trace001", + name: str = "search_query", + output: str | None = "retrieved context", +): + obs = MagicMock() + obs.id = obs_id + obs.trace_id = trace_id + obs.name = name + obs.input = "query text" + obs.output = output + obs.metadata = {} + return obs + + def 
make_paginated_response(traces: list, total_pages=1): + """Page-based response for trace.list().""" response = MagicMock() response.data = traces response.meta = MagicMock() @@ -102,6 +212,15 @@ def make_paginated_response(traces: list, total_pages=1): return response +def make_observation_response(observations: list, next_cursor: str | None = None): + """Cursor-based response for observations.get_many().""" + response = MagicMock() + response.data = observations + response.meta = MagicMock() + response.meta.next_cursor = next_cursor # R2-F1: V3 API uses next_cursor not cursor + return response + + # --------------------------------------------------------------------------- # Tests: _matches_filter # --------------------------------------------------------------------------- @@ -178,7 +297,7 @@ def test_returns_hex_string(self, runner): assert len(score_id) == 32 int(score_id, 16) # Must be valid hex - def test_matches_expected_md5(self, runner): + def test_matches_expected_md5_without_observation_id(self, runner): since = datetime(2026, 3, 13, 0, 0, 0, tzinfo=timezone.utc) expected = hashlib.md5( f"trace123:retrieval_relevance:{since.isoformat()}".encode() @@ -187,20 +306,51 @@ def test_matches_expected_md5(self, runner): runner._make_score_id("trace123", "retrieval_relevance", since) == expected ) + def test_observation_id_changes_score_id(self, runner): + """Two observations from the same trace must get different score IDs (BUG-S163).""" + since = datetime(2026, 3, 13, tzinfo=timezone.utc) + id_without = runner._make_score_id("trace123", "retrieval_relevance", since) + id_with = runner._make_score_id( + "trace123", "retrieval_relevance", since, observation_id="obs001" + ) + assert id_without != id_with + + def test_different_observation_ids_produce_different_scores(self, runner): + """Multiple observations in same trace must get unique score IDs.""" + since = datetime(2026, 3, 13, tzinfo=timezone.utc) + id1 = runner._make_score_id( + "trace123", 
"retrieval_relevance", since, observation_id="obs001" + ) + id2 = runner._make_score_id( + "trace123", "retrieval_relevance", since, observation_id="obs002" + ) + assert id1 != id2 + + def test_observation_id_md5_includes_observation_in_seed(self, runner): + since = datetime(2026, 3, 13, 0, 0, 0, tzinfo=timezone.utc) + expected = hashlib.md5( + f"trace123:obs001:retrieval_relevance:{since.isoformat()}".encode() + ).hexdigest() + assert ( + runner._make_score_id( + "trace123", "retrieval_relevance", since, observation_id="obs001" + ) + == expected + ) + # --------------------------------------------------------------------------- -# Tests: run() — mocked Langfuse client +# Tests: run() — trace-level path (existing behaviour, no regression) # --------------------------------------------------------------------------- -class TestRunnerRun: +class TestRunnerRunTracePath: def test_run_with_no_evaluators_returns_zero_summary(self, runner): """When evaluators_dir is empty, run() returns all-zero summary.""" mock_langfuse = MagicMock() since = datetime(2026, 3, 12, tzinfo=timezone.utc) with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): - # evaluators_dir is empty (created by fixture but no YAMLs) result = runner.run(since=since) assert result == {"fetched": 0, "sampled": 0, "evaluated": 0, "scored": 0} @@ -232,6 +382,8 @@ def test_run_attaches_score_for_matching_trace(self, runner, evaluator_yaml): assert call_kwargs["trace_id"] == "trace001" assert call_kwargs["name"] == "retrieval_relevance" assert call_kwargs["value"] == 0.9 + # Trace-level score must NOT have observation_id + assert call_kwargs.get("observation_id") is None def test_run_dry_run_does_not_call_create_score(self, runner, evaluator_yaml): """dry_run=True should evaluate but NOT call create_score().""" @@ -270,16 +422,15 @@ def test_run_skips_trace_with_no_output(self, runner, evaluator_yaml): result = runner.run(since=since) assert result["evaluated"] == 0 + assert 
result["sampled"] == 0 # R2-F7: sampled not incremented for no-output traces mock_evaluator_config.evaluate.assert_not_called() - def test_run_uses_page_pagination(self, runner, evaluator_yaml): - """Should advance page number when more pages are available.""" + def test_run_uses_page_pagination_for_trace_path(self, runner, evaluator_yaml): + """Trace path must advance page number when more pages are available.""" trace1 = make_mock_trace(trace_id="trace_p1", tags=["retrieval"]) trace2 = make_mock_trace(trace_id="trace_p2", tags=["retrieval"]) mock_langfuse = MagicMock() - # First call returns trace1 with total_pages=2 - # Second call returns trace2 with total_pages=2 (page 2 of 2, stops) mock_langfuse.api.trace.list.side_effect = [ make_paginated_response([trace1], total_pages=2), make_paginated_response([trace2], total_pages=2), @@ -317,31 +468,32 @@ def test_run_calls_flush_after_all_evaluations(self, runner, evaluator_yaml): mock_langfuse.flush.assert_called_once() def test_run_filters_by_evaluator_id(self, runner, tmp_path): - """--evaluator EV-01 should only load EV-01 definition.""" + """--evaluator EV-05 should only load EV-05 definition.""" ev_dir = runner.evaluators_dir ev_dir.mkdir(parents=True, exist_ok=True) import yaml - # Create two evaluators - ev01 = { - "id": "EV-01", - "name": "retrieval_relevance", + ev05 = { + "id": "EV-05", + "name": "bootstrap_quality", "score_type": "NUMERIC", + "target": "trace", "filter": {}, "sampling_rate": 1.0, "prompt_file": "prompt.md", } - ev02 = { - "id": "EV-02", - "name": "injection_value", - "score_type": "BOOLEAN", + ev06 = { + "id": "EV-06", + "name": "session_coherence", + "score_type": "NUMERIC", + "target": "trace", "filter": {}, "sampling_rate": 1.0, "prompt_file": "prompt.md", } - (ev_dir / "ev01.yaml").write_text(yaml.dump(ev01)) - (ev_dir / "ev02.yaml").write_text(yaml.dump(ev02)) + (ev_dir / "ev05.yaml").write_text(yaml.dump(ev05)) + (ev_dir / "ev06.yaml").write_text(yaml.dump(ev06)) (ev_dir / 
"prompt.md").write_text("Evaluate this.") mock_langfuse = MagicMock() @@ -350,9 +502,9 @@ def test_run_filters_by_evaluator_id(self, runner, tmp_path): since = datetime(2026, 3, 12, tzinfo=timezone.utc) with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): - runner.run(evaluator_id="EV-01", since=since) + runner.run(evaluator_id="EV-05", since=since) - # api.trace.list should have been called once (for EV-01 only) + # trace.list should have been called once (for EV-05 only) assert mock_langfuse.api.trace.list.call_count == 1 def test_run_appends_to_audit_log(self, runner, evaluator_yaml, tmp_path): @@ -389,9 +541,7 @@ def test_run_uses_get_client_not_constructor(self, runner, evaluator_yaml): import memory.evaluator.runner as runner_module source = inspect.getsource(runner_module) - # Must import/use get_client assert "get_client" in source - # Must NOT instantiate Langfuse() directly (excluding comments and strings) non_comment_lines = [ line for line in source.splitlines() if not line.strip().startswith("#") ] @@ -400,7 +550,411 @@ def test_run_uses_get_client_not_constructor(self, runner, evaluator_yaml): # --------------------------------------------------------------------------- -# Tests: _load_prompt +# Tests: run() — observation-level path (S-16.3) +# --------------------------------------------------------------------------- + + +class TestRunnerRunObservationPath: + def test_observation_path_calls_observations_get_many( + self, runner, observation_evaluator_yaml + ): + """Observation target must call observations.get_many(), not trace.list().""" + obs = make_mock_observation(obs_id="obs001", trace_id="trace001") + mock_langfuse = MagicMock() + # Two event_types → two API calls (one per name_filter) + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.85, "reasoning": "Good"} + + since = 
datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + assert mock_langfuse.api.observations.get_many.call_count == 2 + mock_langfuse.api.trace.list.assert_not_called() + + def test_observation_score_includes_observation_id( + self, runner, observation_evaluator_yaml + ): + """create_score() for observation target must include observation_id.""" + obs = make_mock_observation(obs_id="obs_xyz", trace_id="trace_abc") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.9, "reasoning": ""} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + assert mock_langfuse.create_score.call_count >= 1 + call_kwargs = mock_langfuse.create_score.call_args.kwargs + assert call_kwargs["observation_id"] == "obs_xyz" + assert call_kwargs["trace_id"] == "trace_abc" + + def test_observation_path_skips_no_output(self, runner, observation_evaluator_yaml): + """Observations with output=None must be skipped.""" + obs = make_mock_observation(obs_id="obs_noout", output=None) + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + result = runner.run(since=since) + + assert result["evaluated"] == 0 + assert result["sampled"] == 0 # R2-F7: 
sampled not incremented for no-output obs + mock_evaluator_config.evaluate.assert_not_called() + + def test_observation_path_dry_run_no_score( + self, runner, observation_evaluator_yaml + ): + """dry_run=True on observation path must not call create_score().""" + obs = make_mock_observation() + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.7, "reasoning": ""} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + result = runner.run(since=since, dry_run=True) + + assert result["evaluated"] >= 1 + assert result["scored"] == 0 + mock_langfuse.create_score.assert_not_called() + + def test_observation_path_appends_audit_log_with_observation_id( + self, runner, observation_evaluator_yaml + ): + """Observation audit log entries must include observation_id.""" + obs = make_mock_observation(obs_id="obs_audit", trace_id="trace_audit") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.6, "reasoning": "ok"} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + lines = runner.audit_log_path.read_text().strip().splitlines() + # At least one audit entry + assert len(lines) >= 1 + entry = json.loads(lines[0]) + assert entry.get("observation_id") == "obs_audit" + assert entry.get("trace_id") == "trace_audit" + + +# 
--------------------------------------------------------------------------- +# Tests: Cursor-based pagination for observations (BUG-217) +# --------------------------------------------------------------------------- + + +class TestCursorPaginationObservations: + def test_cursor_pagination_fetches_all_pages( + self, runner, observation_evaluator_yaml + ): + """observations.get_many() must be called with next_cursor on subsequent pages.""" + obs1 = make_mock_observation(obs_id="obs_p1", trace_id="t1") + obs2 = make_mock_observation(obs_id="obs_p2", trace_id="t2") + + mock_langfuse = MagicMock() + # First call returns obs1 + next_cursor; second call returns obs2 + no cursor + mock_langfuse.api.observations.get_many.side_effect = [ + make_observation_response([obs1], next_cursor="cursor_abc"), + make_observation_response([obs2], next_cursor=None), + # Additional responses for the second event_type + make_observation_response([obs1], next_cursor="cursor_abc"), + make_observation_response([obs2], next_cursor=None), + ] + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.8, "reasoning": ""} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + result = runner.run(since=since) + + # Each event_type does 2 pages → 4 total calls for 2 event_types + assert mock_langfuse.api.observations.get_many.call_count == 4 + # Second call for first event_type must pass cursor + second_call_kwargs = mock_langfuse.api.observations.get_many.call_args_list[ + 1 + ].kwargs + assert second_call_kwargs.get("cursor") == "cursor_abc" + + def test_cursor_pagination_stops_when_no_next_cursor( + self, runner, observation_evaluator_yaml + ): + """Must stop fetching when meta.cursor is None.""" + obs = make_mock_observation(obs_id="obs_single") + + mock_langfuse = MagicMock() + # No next cursor → 
single page per event_type + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs], next_cursor=None) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.7, "reasoning": ""} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + # 2 event_types, 1 page each → 2 calls total + event_types = observation_evaluator_yaml["filter"]["event_types"] + assert mock_langfuse.api.observations.get_many.call_count == len(event_types) + + def test_observation_get_many_passes_name_filter( + self, runner, observation_evaluator_yaml + ): + """Each event_type must be passed as name= to observations.get_many().""" + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([]) + ) + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): + runner.run(since=since) + + called_names = [ + call.kwargs.get("name") + for call in mock_langfuse.api.observations.get_many.call_args_list + ] + event_types = observation_evaluator_yaml["filter"]["event_types"] + for et in event_types: + assert et in called_names, f"event_type {et!r} not passed as name= filter" + + def test_observation_get_many_passes_time_range( + self, runner, observation_evaluator_yaml + ): + """Must pass from_start_time/to_start_time to observations.get_many().""" + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([]) + ) + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + until = datetime(2026, 3, 13, tzinfo=timezone.utc) + + with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): + runner.run(since=since, until=until) + + 
first_call = mock_langfuse.api.observations.get_many.call_args_list[0].kwargs + assert first_call.get("from_start_time") == since + assert first_call.get("to_start_time") == until + + +# --------------------------------------------------------------------------- +# Tests: evaluation_target routing (target: field) +# --------------------------------------------------------------------------- + + +class TestEvaluationTargetRouting: + def test_trace_target_routes_to_trace_list(self, runner, trace_evaluator_yaml): + """Evaluator with target: trace must use trace.list().""" + mock_langfuse = MagicMock() + mock_langfuse.api.trace.list.return_value = make_paginated_response([]) + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): + runner.run(since=since) + + # R2-F8: use call_count instead of assert_called() for stronger assertion + assert mock_langfuse.api.trace.list.call_count == 1 + mock_langfuse.api.observations.get_many.assert_not_called() + + def test_observation_target_routes_to_observations_get_many( + self, runner, observation_evaluator_yaml + ): + """Evaluator with target: observation must use observations.get_many().""" + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([]) + ) + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): + runner.run(since=since) + + # R2-F8: 2 event_types in fixture → 2 calls to get_many + event_types = observation_evaluator_yaml["filter"]["event_types"] + assert mock_langfuse.api.observations.get_many.call_count == len(event_types) + mock_langfuse.api.trace.list.assert_not_called() + + def test_missing_target_defaults_to_trace(self, runner, tmp_path): + """Evaluator with no target: field defaults to trace path.""" + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, exist_ok=True) + + import 
yaml + + # No target: field + ev = { + "id": "EV-XX", + "name": "test_eval", + "score_type": "NUMERIC", + "filter": {}, + "sampling_rate": 1.0, + "prompt_file": "prompt.md", + } + (ev_dir / "evxx.yaml").write_text(yaml.dump(ev)) + (ev_dir / "prompt.md").write_text("Evaluate.") + + mock_langfuse = MagicMock() + mock_langfuse.api.trace.list.return_value = make_paginated_response([]) + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with patch("memory.evaluator.runner.get_client", return_value=mock_langfuse): + runner.run(since=since) + + assert mock_langfuse.api.trace.list.call_count == 1 + mock_langfuse.api.observations.get_many.assert_not_called() + + +# --------------------------------------------------------------------------- +# Tests: CATEGORICAL score type (EV-04) +# --------------------------------------------------------------------------- + + +class TestCategoricalScoreType: + def test_categorical_score_passes_string_value( + self, runner, categorical_evaluator_yaml + ): + """CATEGORICAL evaluators must pass string value to create_score().""" + obs = make_mock_observation(obs_id="obs_cat", trace_id="trace_cat") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = { + "score": "correct", + "reasoning": "Correct classification", + } + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + mock_langfuse.create_score.assert_called_once() + call_kwargs = mock_langfuse.create_score.call_args.kwargs + assert call_kwargs["value"] == "correct" + assert isinstance(call_kwargs["value"], str) + assert call_kwargs["data_type"] == "CATEGORICAL" + assert call_kwargs["observation_id"] == "obs_cat" + + def 
test_categorical_score_partially_correct( + self, runner, categorical_evaluator_yaml + ): + """CATEGORICAL 'partially_correct' must be passed as string.""" + obs = make_mock_observation(obs_id="obs_partial") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = { + "score": "partially_correct", + "reasoning": "Partial match", + } + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + call_kwargs = mock_langfuse.create_score.call_args.kwargs + assert call_kwargs["value"] == "partially_correct" + + def test_numeric_score_passes_float_value(self, runner, observation_evaluator_yaml): + """NUMERIC evaluators must pass float value (not string).""" + obs = make_mock_observation() + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = { + "score": 0.85, + "reasoning": "Relevant", + } + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since) + + call_kwargs = mock_langfuse.create_score.call_args.kwargs + assert call_kwargs["value"] == 0.85 + assert isinstance(call_kwargs["value"], float) + + +# --------------------------------------------------------------------------- +# Tests: error isolation # --------------------------------------------------------------------------- @@ -430,13 +984,48 @@ def test_single_trace_error_does_not_kill_run(self, runner, evaluator_yaml): ): result = 
runner.run(since=since) - # Second trace must still be evaluated assert result["evaluated"] == 1 assert result["scored"] == 1 - # Error must be logged for the first trace + mock_logger.error.assert_called_once() + + def test_single_observation_error_does_not_kill_run( + self, runner, observation_evaluator_yaml + ): + """An exception on one observation must not prevent subsequent evaluations.""" + obs1 = make_mock_observation(obs_id="obs_err") + obs2 = make_mock_observation(obs_id="obs_ok") + + mock_langfuse = MagicMock() + # Both obs in same response for first event_type, empty for second + mock_langfuse.api.observations.get_many.side_effect = [ + make_observation_response([obs1, obs2]), + make_observation_response([]), + ] + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.side_effect = [ + Exception("judge crashed"), + {"score": 0.7, "reasoning": "OK"}, + ] + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + patch("memory.evaluator.runner.logger") as mock_logger, + ): + result = runner.run(since=since) + + assert result["evaluated"] == 1 mock_logger.error.assert_called_once() +# --------------------------------------------------------------------------- +# Tests: _load_prompt +# --------------------------------------------------------------------------- + + class TestLoadPrompt: def test_load_prompt_uses_actual_file_not_fallback(self, runner): """_load_prompt() should return the file contents, not the fallback stub.""" @@ -451,3 +1040,183 @@ def test_load_prompt_uses_actual_file_not_fallback(self, runner): result = runner._load_prompt(evaluator) assert result == prompt_content + + +# --------------------------------------------------------------------------- +# Tests: _build_observation_prompt +# --------------------------------------------------------------------------- + + +class 
TestBuildObservationPrompt: + def test_build_observation_prompt_includes_event_type(self, runner): + """_build_observation_prompt must include the observation name (event_type).""" + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, exist_ok=True) + (ev_dir / "prompt.md").write_text("Evaluate the following.") + + evaluator = {"prompt_file": "prompt.md"} + obs = make_mock_observation(name="search_query") + + result = runner._build_observation_prompt(evaluator, obs) + + assert "search_query" in result + assert "Observation to Evaluate" in result + + def test_build_observation_prompt_different_from_trace_prompt(self, runner): + """_build_observation_prompt must not produce 'Trace to Evaluate' header.""" + ev_dir = runner.evaluators_dir + ev_dir.mkdir(parents=True, exist_ok=True) + (ev_dir / "prompt.md").write_text("Evaluate.") + + evaluator = {"prompt_file": "prompt.md"} + obs = make_mock_observation() + + result = runner._build_observation_prompt(evaluator, obs) + assert "Trace to Evaluate" not in result + assert "Observation to Evaluate" in result + + +# --------------------------------------------------------------------------- +# Tests: R2-F2 — dry_run does not write audit log +# --------------------------------------------------------------------------- + + +class TestDryRunAuditLog: + def test_dry_run_does_not_write_audit_log_trace(self, runner, evaluator_yaml): + """dry_run=True must NOT write audit log for trace path.""" + trace = make_mock_trace(trace_id="trace_dry", tags=["retrieval"]) + mock_langfuse = MagicMock() + mock_langfuse.api.trace.list.return_value = make_paginated_response([trace]) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.7, "reasoning": "ok"} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since, 
dry_run=True) + + assert not runner.audit_log_path.exists() + + def test_dry_run_does_not_write_audit_log_observation( + self, runner, observation_evaluator_yaml + ): + """dry_run=True must NOT write audit log for observation path.""" + obs = make_mock_observation(obs_id="obs_dry", trace_id="trace_dry") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = {"score": 0.6, "reasoning": "ok"} + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + runner.run(since=since, dry_run=True) + + assert not runner.audit_log_path.exists() + + +# --------------------------------------------------------------------------- +# Tests: R2-F5 — observation with no trace_id is skipped +# --------------------------------------------------------------------------- + + +class TestObservationNoTraceId: + def test_observation_with_no_trace_id_is_skipped( + self, runner, observation_evaluator_yaml + ): + """Observation with empty trace_id must be skipped with a warning.""" + obs = make_mock_observation(obs_id="obs_notrace", trace_id="") + obs.trace_id = "" + + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + patch("memory.evaluator.runner.logger") as mock_logger, + ): + result = runner.run(since=since) + + assert result["evaluated"] == 0 + assert result["scored"] == 0 + mock_evaluator_config.evaluate.assert_not_called() + mock_logger.warning.assert_called() 
+ + +# --------------------------------------------------------------------------- +# Tests: R1-F3 — invalid categorical value is skipped +# --------------------------------------------------------------------------- + + +class TestCategoricalValidation: + def test_invalid_categorical_value_skipped( + self, runner, categorical_evaluator_yaml + ): + """Categorical score not in categories list must be skipped with warning.""" + obs = make_mock_observation(obs_id="obs_badcat", trace_id="trace_badcat") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + # "unknown_label" is not in categories: ["correct", "partially_correct", "incorrect"] + mock_evaluator_config.evaluate.return_value = { + "score": "unknown_label", + "reasoning": "hallucinated category", + } + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + patch("memory.evaluator.runner.logger") as mock_logger, + ): + result = runner.run(since=since) + + # evaluated is incremented before validation, scored is not + assert result["scored"] == 0 + mock_langfuse.create_score.assert_not_called() + mock_logger.warning.assert_called() + + def test_valid_categorical_value_is_scored(self, runner, categorical_evaluator_yaml): + """Categorical score within categories list must be scored normally.""" + obs = make_mock_observation(obs_id="obs_goodcat", trace_id="trace_goodcat") + mock_langfuse = MagicMock() + mock_langfuse.api.observations.get_many.return_value = ( + make_observation_response([obs]) + ) + + mock_evaluator_config = MagicMock() + mock_evaluator_config.evaluate.return_value = { + "score": "correct", + "reasoning": "valid", + } + + since = datetime(2026, 3, 12, tzinfo=timezone.utc) + + with ( + patch("memory.evaluator.runner.get_client", 
return_value=mock_langfuse), + patch.object(runner, "evaluator_config", mock_evaluator_config), + ): + result = runner.run(since=since) + + assert result["scored"] == 1 + mock_langfuse.create_score.assert_called() diff --git a/tests/test_monitoring_api.py b/tests/test_monitoring_api.py new file mode 100644 index 00000000..9b8d044a --- /dev/null +++ b/tests/test_monitoring_api.py @@ -0,0 +1,222 @@ +"""Unit tests for monitoring API log injection sanitization (S-16.6). + +Tests verify that: +- sanitize_log_input() strips log injection payloads (newlines, etc.) +- memory_id and collection path params are sanitized before logging +- API endpoint responses are unchanged for valid inputs + +NOTE: monitoring/main.py requires FastAPI/Qdrant deps that live only in the +monitoring Docker container. We mock those heavy deps at the sys.modules level +before importing main so these tests run in the project's normal venv. +""" + +import sys +from pathlib import Path +from unittest.mock import MagicMock, AsyncMock + +import pytest + + +# --------------------------------------------------------------------------- +# Stub out heavy monitoring-only deps so we can import monitoring/main.py +# without a FastAPI/Qdrant installation. 
+# --------------------------------------------------------------------------- +def _make_fastapi_stub(): + fastapi_mod = MagicMock() + + class _App: + def __init__(self, **kw): + pass + + def get(self, *a, **kw): + return lambda f: f + + def post(self, *a, **kw): + return lambda f: f + + def mount(self, *a, **kw): + pass + + fastapi_mod.FastAPI = _App + fastapi_mod.HTTPException = Exception + fastapi_mod.status.HTTP_503_SERVICE_UNAVAILABLE = 503 + fastapi_mod.status.HTTP_404_NOT_FOUND = 404 + return fastapi_mod + + +_STUBS = { + "fastapi": _make_fastapi_stub(), + "fastapi.responses": MagicMock(), + "prometheus_client": MagicMock(), + "pydantic": MagicMock(), + "qdrant_client": MagicMock(), + "qdrant_client.http": MagicMock(), + "qdrant_client.http.exceptions": MagicMock(), + # Stub memory package to prevent real imports (src/ is in pythonpath) + "memory": MagicMock(), + "memory.metrics": MagicMock(), + "memory.metrics_push": MagicMock(), + "memory.stats": MagicMock(), + "memory.warnings": MagicMock(), +} + +# Patch sys.modules BEFORE importing main +for _mod, _stub in _STUBS.items(): + if _mod not in sys.modules: + sys.modules[_mod] = _stub + +# Add monitoring dir to path +_monitoring_path = str(Path(__file__).parent.parent / "monitoring") +if _monitoring_path not in sys.path: + sys.path.insert(0, _monitoring_path) + +# Now import the function under test +import importlib +import types + +# Use importlib to import main with patched sys.modules already in place +if "main" in sys.modules: + del sys.modules["main"] + +import main as _monitoring_main # noqa: E402 (monitoring/main.py) +sanitize_log_input = _monitoring_main.sanitize_log_input + + +# --------------------------------------------------------------------------- +# Tests for sanitize_log_input +# --------------------------------------------------------------------------- +class TestSanitizeLogInput: + """Unit tests for the sanitize_log_input helper in monitoring/main.py.""" + + def 
test_newline_stripped(self): + """Newlines in user input must be escaped to prevent log injection.""" + result = sanitize_log_input("valid-id\nINJECTED: fake log entry") + assert "\n" not in result + + def test_carriage_return_stripped(self): + """Carriage returns must be escaped.""" + result = sanitize_log_input("valid-id\rINJECTED") + assert "\r" not in result + + def test_tab_escaped(self): + """Tabs must be escaped.""" + result = sanitize_log_input("id\twith\ttabs") + assert "\t" not in result + + def test_null_byte_stripped(self): + """Null bytes must be stripped.""" + result = sanitize_log_input("id\x00null") + assert "\x00" not in result + + def test_valid_uuid_unchanged(self): + """Valid UUID-style IDs should pass through cleanly.""" + memory_id = "550e8400-e29b-41d4-a716-446655440000" + result = sanitize_log_input(memory_id) + assert "550e8400" in result + assert "446655440000" in result + + def test_valid_collection_names_pass_through(self): + """Valid collection names should pass through cleanly.""" + for collection in ("code-patterns", "conventions", "discussions"): + result = sanitize_log_input(collection) + assert collection in result + + def test_truncation_at_default_max_length(self): + """Oversized input should be truncated to 200 chars.""" + result = sanitize_log_input("x" * 500) + assert len(result) <= 200 + + def test_custom_max_length_respected(self): + """Custom max_length should be respected.""" + result = sanitize_log_input("a" * 50, max_length=10) + assert len(result) <= 10 + + def test_non_string_coerced(self): + """Non-string values should be coerced to string safely.""" + result = sanitize_log_input(12345) + assert "12345" in result + + def test_multiline_injection_in_collection(self): + """Collection param with embedded newline must not produce raw newline.""" + result = sanitize_log_input("code-patterns\nINJECTED: severity=CRITICAL") + assert "\n" not in result + + def test_unicode_control_chars_stripped(self): + """Unicode control 
characters should be stripped.""" + payload = "id\u0000\u001f\u007f" + result = sanitize_log_input(payload) + for char in payload: + if not char.isprintable(): + assert char not in result + + def test_empty_string(self): + """Empty string input should return empty string.""" + assert sanitize_log_input("") == "" + + def test_exception_str_sanitized(self): + """Exception messages with injection payloads must be sanitized.""" + try: + raise ValueError("error\nINJECTED: severity=critical") + except ValueError as e: + result = sanitize_log_input(str(e)) + assert "\n" not in result + assert "error" in result + + def test_none_input(self): + assert sanitize_log_input(None) == "None" + + def test_multiline_memory_id_no_raw_newline(self): + """memory_id injection payload must not produce raw newline in sanitized output.""" + result = sanitize_log_input("evil-id\nINJECTED") + assert "\n" not in result + + def test_repr_escaping_preserves_printable_content(self): + """repr()-based escaping should keep the meaningful part of the value.""" + value = "abc123" + result = sanitize_log_input(value) + assert "abc123" in result + + +# --------------------------------------------------------------------------- +# Verify call-site sanitization is inline (not via intermediate variable) +# --------------------------------------------------------------------------- +class TestCallSiteSanitization: + """Verify that log call sites in monitoring/main.py sanitize inline. + + CodeQL requires sanitization to appear inline at the log call site rather + than through intermediate variables. We inspect the source to confirm + the pattern is correct. 
+ """ + + def _read_main_source(self) -> str: + main_path = Path(__file__).parent.parent / "monitoring" / "main.py" + return main_path.read_text() + + def test_str_e_always_wrapped_in_sanitize(self): + """Every str(e) used in a logger call must be wrapped with sanitize_log_input.""" + import re + source = self._read_main_source() + + # More precise: look for "error": str(e) that is NOT inside sanitize_log_input(...) + # We check that sanitize_log_input( appears immediately before str( + for match in re.finditer(r'"error":\s*(str\([^)]+\))', source): + str_pos = match.start(1) + sanitize_prefix = "sanitize_log_input(" + prefix = source[max(0, str_pos - len(sanitize_prefix)):str_pos] + assert prefix.endswith(sanitize_prefix), ( + f"Unsanitized str() found at log call site: {match.group(0)}" + ) + + def test_collection_name_in_extra_always_sanitized(self): + """Every collection_name used in logger extra dicts must be sanitized inline.""" + import re + source = self._read_main_source() + + # Find bare "collection": collection_name (not wrapped in sanitize_log_input) + for match in re.finditer(r'"collection":\s*(collection_name|request\.collection)', source): + value = match.group(1) + context_start = max(0, match.start() - 60) + context = source[context_start:match.end()] + assert "sanitize_log_input" in context, ( + f"Unsanitized collection_name found at log call site: ...{context}..." + ) diff --git a/tests/unit/test_evaluator_retry.py b/tests/unit/test_evaluator_retry.py new file mode 100644 index 00000000..dec2d951 --- /dev/null +++ b/tests/unit/test_evaluator_retry.py @@ -0,0 +1,222 @@ +# LANGFUSE: V3 SDK ONLY. 
See LANGFUSE-INTEGRATION-SPEC.md +"""Unit tests for S-16.4: Exponential backoff retry in EvaluatorConfig.evaluate().""" + +from unittest.mock import MagicMock, Mock, patch + +import pytest + +from memory.evaluator.provider import EvaluatorConfig + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_config(provider="ollama", max_retries=3) -> EvaluatorConfig: + cfg = EvaluatorConfig(provider=provider, max_retries=max_retries) + return cfg + + +def _http_error(status_code: int, retry_after: str | None = None): + """Create a mock exception that looks like an HTTP error.""" + exc = Exception(f"HTTP {status_code}") + exc.status_code = status_code + if retry_after is not None: + response = Mock() + response.headers = {"Retry-After": retry_after} + exc.response = response + return exc + + +def _good_openai_response(text: str = '{"score": 0.9, "reasoning": "ok"}'): + msg = Mock() + msg.content = text + msg.reasoning = None + choice = Mock() + choice.message = msg + resp = Mock() + resp.choices = [choice] + return resp + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + +class TestEvaluatorRetry: + + def test_success_on_first_attempt(self): + """Should succeed immediately with no retries.""" + cfg = _make_config(max_retries=3) + mock_client = MagicMock() + mock_client.chat.completions.create.return_value = _good_openai_response() + cfg._client = mock_client + + result = cfg.evaluate("test prompt") + + assert result["score"] == 0.9 + assert mock_client.chat.completions.create.call_count == 1 + + def test_retry_success_after_transient_500(self): + """Should retry on 500 and succeed on second attempt.""" + cfg = _make_config(max_retries=3) + mock_client = MagicMock() + call_count = 0 + + def side_effect(**kwargs): + nonlocal 
call_count + call_count += 1 + if call_count == 1: + raise _http_error(500) + return _good_openai_response() + + mock_client.chat.completions.create.side_effect = side_effect + cfg._client = mock_client + + with patch("memory.evaluator.provider.time.sleep"): + result = cfg.evaluate("test prompt") + + assert result["score"] == 0.9 + assert call_count == 2 + + def test_exponential_backoff_timing(self): + """Sleep durations should grow with attempt number (base * 2^n).""" + cfg = _make_config(max_retries=3) + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = _http_error(500) + cfg._client = mock_client + + sleep_calls = [] + + with patch("memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append): + with patch("memory.evaluator.provider.random.uniform", return_value=0.0): + with pytest.raises(Exception): + cfg.evaluate("test prompt") + + # With max_retries=3 we sleep 3 times (attempts 0,1,2) before final failure + assert len(sleep_calls) == 3 + # Base delays: 1*(2^0)=1, 1*(2^1)=2, 1*(2^2)=4 + assert sleep_calls[0] == pytest.approx(1.0) + assert sleep_calls[1] == pytest.approx(2.0) + assert sleep_calls[2] == pytest.approx(4.0) + + def test_max_retries_exhaustion_raises(self): + """Should raise the last exception after all retries are exhausted.""" + cfg = _make_config(max_retries=2) + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = _http_error(503) + cfg._client = mock_client + + with patch("memory.evaluator.provider.time.sleep"): + with pytest.raises(Exception, match="HTTP 503"): + cfg.evaluate("test prompt") + + assert mock_client.chat.completions.create.call_count == 3 # initial + 2 retries + + def test_non_retryable_400_fails_immediately(self): + """400 errors should not be retried.""" + cfg = _make_config(max_retries=3) + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = _http_error(400) + cfg._client = mock_client + + with pytest.raises(Exception, match="HTTP 400"): 
+ cfg.evaluate("test prompt") + + assert mock_client.chat.completions.create.call_count == 1 + + def test_429_with_retry_after_header(self): + """On 429, Retry-After header should override calculated backoff delay.""" + cfg = _make_config(max_retries=3) + mock_client = MagicMock() + call_count = 0 + + def side_effect(**kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + raise _http_error(429, retry_after="5") + return _good_openai_response() + + mock_client.chat.completions.create.side_effect = side_effect + cfg._client = mock_client + + sleep_calls = [] + + with patch("memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append): + with patch("memory.evaluator.provider.random.uniform", return_value=0.0): + result = cfg.evaluate("test prompt") + + assert result["score"] == 0.9 + # First sleep should use Retry-After=5 as base (plus zero jitter) + assert sleep_calls[0] == pytest.approx(5.0) + + def test_retry_on_connection_error(self): + """ConnectionError should trigger retry.""" + cfg = _make_config(max_retries=2) + mock_client = MagicMock() + call_count = 0 + + def side_effect(**kwargs): + nonlocal call_count + call_count += 1 + if call_count < 3: + raise ConnectionError("connection refused") + return _good_openai_response() + + mock_client.chat.completions.create.side_effect = side_effect + cfg._client = mock_client + + with patch("memory.evaluator.provider.time.sleep"): + result = cfg.evaluate("test prompt") + + assert result["score"] == 0.9 + assert call_count == 3 + + def test_retry_on_timeout_error(self): + """TimeoutError should trigger retry.""" + cfg = _make_config(max_retries=2) + mock_client = MagicMock() + call_count = 0 + + def side_effect(**kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + raise TimeoutError("timed out") + return _good_openai_response() + + mock_client.chat.completions.create.side_effect = side_effect + cfg._client = mock_client + + with 
patch("memory.evaluator.provider.time.sleep"): + result = cfg.evaluate("test prompt") + + assert result["score"] == 0.9 + assert call_count == 2 + + def test_client_reset_after_exhaustion(self): + """_client should be set to None after all retries are exhausted.""" + cfg = _make_config(max_retries=1) + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = _http_error(500) + cfg._client = mock_client + + with patch("memory.evaluator.provider.time.sleep"): + with pytest.raises(Exception): + cfg.evaluate("test prompt") + + assert cfg._client is None + + def test_client_reset_after_connection_error_exhaustion(self): + """_client should be set to None after ConnectionError retries are exhausted.""" + cfg = _make_config(max_retries=2) + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = ConnectionError("refused") + cfg._client = mock_client + + with patch("memory.evaluator.provider.time.sleep"): + with pytest.raises(ConnectionError): + cfg.evaluate("test prompt") + + assert cfg._client is None diff --git a/tests/unit/test_evaluator_scheduler.py b/tests/unit/test_evaluator_scheduler.py new file mode 100644 index 00000000..938f910b --- /dev/null +++ b/tests/unit/test_evaluator_scheduler.py @@ -0,0 +1,401 @@ +# LANGFUSE: V3 SDK ONLY. See LANGFUSE-INTEGRATION-SPEC.md +"""Unit tests for S-16.5: Evaluator scheduler daemon. 
+ +Tests: +- Cron schedule parsing via croniter +- lookback_hours calculation +- schedule.enabled=False skips evaluation +- Evaluation failure doesn't crash scheduler +- Health file written after successful run +""" + +import importlib.util +import sys +import time +from datetime import datetime, timedelta, timezone +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +# --------------------------------------------------------------------------- +# croniter availability guard +# --------------------------------------------------------------------------- +try: + import croniter as _croniter_pkg # noqa: F401 + + CRONITER_AVAILABLE = True +except ImportError: + CRONITER_AVAILABLE = False + +pytestmark_croniter = pytest.mark.skipif( + not CRONITER_AVAILABLE, reason="croniter not installed — add croniter>=2.0.0 to requirements.txt" +) + + +def _make_croniter_mock(next_run_dt: datetime): + """Build a minimal croniter sys.modules mock returning a fixed next_run datetime.""" + mock_instance = MagicMock() + mock_instance.get_next.return_value = next_run_dt + mock_cls = MagicMock(return_value=mock_instance) + mock_module = MagicMock() + mock_module.croniter = mock_cls + return mock_module, mock_cls, mock_instance + + +# --------------------------------------------------------------------------- +# Module loader helper +# --------------------------------------------------------------------------- + +_SCHEDULER_PATH = ( + Path(__file__).parent.parent.parent / "scripts/memory/evaluator_scheduler.py" +) + + +def _load_scheduler(): + """Load evaluator_scheduler module via importlib (not on sys.path as a package).""" + spec = importlib.util.spec_from_file_location("evaluator_scheduler", _SCHEDULER_PATH) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +# Load once for all tests +_mod = _load_scheduler() + + +# --------------------------------------------------------------------------- +# Test: 
load_schedule_config +# --------------------------------------------------------------------------- + + +class TestLoadScheduleConfig: + def test_parses_schedule_section(self, tmp_path): + """load_schedule_config returns the schedule dict from YAML.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: true\n" + " cron: '0 5 * * *'\n" + " lookback_hours: 24\n" + ) + result = _mod.load_schedule_config(str(config)) + assert result["enabled"] is True + assert result["cron"] == "0 5 * * *" + assert result["lookback_hours"] == 24 + + def test_returns_empty_dict_when_schedule_missing(self, tmp_path): + """Returns {} when schedule key is absent.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text("evaluator_model:\n provider: ollama\n") + result = _mod.load_schedule_config(str(config)) + assert result == {} + + def test_parses_disabled_schedule(self, tmp_path): + """enabled: false is returned correctly.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: false\n" + " cron: '0 3 * * *'\n" + " lookback_hours: 48\n" + ) + result = _mod.load_schedule_config(str(config)) + assert result["enabled"] is False + assert result["lookback_hours"] == 48 + + +# --------------------------------------------------------------------------- +# Test: cron schedule parsing + next run +# --------------------------------------------------------------------------- + + +class TestCronScheduleParsing: + @pytestmark_croniter + def test_croniter_returns_future_datetime(self): + """croniter.get_next() returns a datetime after 'now'.""" + from croniter import croniter # noqa: PLC0415 + + now = datetime(2026, 3, 14, 4, 0, 0, tzinfo=timezone.utc) + cron = croniter("0 5 * * *", now) + next_run = cron.get_next(datetime) + + # Should be 05:00 UTC on the same day + assert next_run > now + assert next_run.hour == 5 + assert next_run.minute == 0 + + @pytestmark_croniter + def 
test_croniter_sleep_seconds_positive(self): + """sleep_seconds should always be positive (next run is in the future).""" + from croniter import croniter # noqa: PLC0415 + + now = datetime(2026, 3, 14, 4, 59, 59, tzinfo=timezone.utc) + cron = croniter("0 5 * * *", now) + next_run = cron.get_next(datetime) + sleep_seconds = (next_run - now).total_seconds() + + assert sleep_seconds > 0 + assert sleep_seconds <= 60 # Less than 1 minute until 05:00 + + def test_lookback_hours_calculation(self): + """since = now - timedelta(hours=lookback_hours).""" + lookback_hours = 24 + now = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) + since = now - timedelta(hours=lookback_hours) + + expected = datetime(2026, 3, 13, 5, 0, 0, tzinfo=timezone.utc) + assert since == expected + + def test_lookback_hours_48(self): + """lookback_hours=48 looks back 2 days.""" + lookback_hours = 48 + now = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) + since = now - timedelta(hours=lookback_hours) + + expected = datetime(2026, 3, 12, 5, 0, 0, tzinfo=timezone.utc) + assert since == expected + + +# --------------------------------------------------------------------------- +# Test: schedule.enabled=False skips evaluation +# --------------------------------------------------------------------------- + + +class TestScheduleDisabled: + def test_enabled_false_does_not_call_runner(self, tmp_path): + """When schedule.enabled=False, EvaluatorRunner.run() is never called.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: false\n" + " cron: '0 5 * * *'\n" + " lookback_hours: 24\n" + ) + + # Reload module with fresh globals to avoid state bleed + mod = _load_scheduler() + mod._shutdown_requested = False + + call_count = 0 + + def fake_interruptible_sleep(seconds, chunk=5.0): + # After one sleep (schedule disabled), set shutdown to stop the loop + mod._shutdown_requested = True + + with patch.object(mod, "_interruptible_sleep", 
side_effect=fake_interruptible_sleep): + with patch.object(mod, "load_schedule_config", return_value={"enabled": False}): + with patch( + "memory.evaluator.EvaluatorRunner" + ) as mock_runner_cls: + mod.run_scheduler(str(config)) + mock_runner_cls.assert_not_called() + + +# --------------------------------------------------------------------------- +# Test: evaluation failure doesn't crash scheduler +# --------------------------------------------------------------------------- + + +class TestEvaluationFailure: + def test_evaluation_error_continues_to_next_cycle(self, tmp_path): + """If EvaluatorRunner.run() raises, the scheduler logs and continues.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: true\n" + " cron: '0 5 * * *'\n" + " lookback_hours: 24\n" + ) + + mod = _load_scheduler() + mod._shutdown_requested = False + + sleep_call_count = [0] + + def fake_interruptible_sleep(seconds, chunk=5.0): + sleep_call_count[0] += 1 + # Shut down after the second sleep (after one failed eval cycle) + if sleep_call_count[0] >= 2: + mod._shutdown_requested = True + + mock_runner = MagicMock() + mock_runner.run.side_effect = RuntimeError("LLM provider unavailable") + + _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) + mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) + + with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): + with patch.object(mod, "_interruptible_sleep", side_effect=fake_interruptible_sleep): + with patch.object( + mod, + "load_schedule_config", + return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, + ): + with patch.object(mod, "_touch_health_file") as mock_touch: + with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): + # Should NOT raise — errors are caught and logged + mod.run_scheduler(str(config)) + + # Runner was called at least once + mock_runner.run.assert_called() + # Health file should have been touched on startup only (not on 
failure) + assert mock_touch.call_count == 1 + + def test_evaluation_failure_does_not_update_health_file(self, tmp_path): + """Health file is NOT touched after a failed evaluation.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: true\n" + " cron: '* * * * *'\n" + " lookback_hours: 1\n" + ) + + mod = _load_scheduler() + mod._shutdown_requested = False + + sleep_calls = [0] + startup_touch_done = [False] + touch_after_startup = [0] + + def fake_sleep(seconds, chunk=5.0): + sleep_calls[0] += 1 + if sleep_calls[0] >= 2: + mod._shutdown_requested = True + + def fake_touch(): + if startup_touch_done[0]: + touch_after_startup[0] += 1 + startup_touch_done[0] = True + + mock_runner = MagicMock() + mock_runner.run.side_effect = Exception("eval error") + + _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) + mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) + + with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): + with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): + with patch.object( + mod, + "load_schedule_config", + return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, + ): + with patch.object(mod, "_touch_health_file", side_effect=fake_touch): + with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): + mod.run_scheduler(str(config)) + + # Health file should NOT be touched after the failed evaluation + assert touch_after_startup[0] == 0 + + +# --------------------------------------------------------------------------- +# Test: health file written after successful run +# --------------------------------------------------------------------------- + + +class TestHealthFile: + def test_health_file_touched_on_startup(self, tmp_path): + """Health file is touched on daemon startup before any evaluation.""" + mod = _load_scheduler() + mod._shutdown_requested = False + + touch_calls = [] + + def fake_touch(): + touch_calls.append("touch") + + 
def fake_sleep(seconds, chunk=5.0): + # Immediately shut down after first sleep + mod._shutdown_requested = True + + with patch.object(mod, "_touch_health_file", side_effect=fake_touch): + with patch.object( + mod, + "load_schedule_config", + return_value={"enabled": False}, + ): + with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): + mod.run_scheduler(str(tmp_path / "evaluator_config.yaml")) + + # At minimum, touched on startup + assert len(touch_calls) >= 1 + + def test_health_file_touched_after_successful_run(self, tmp_path): + """Health file is touched after a successful evaluation run.""" + config = tmp_path / "evaluator_config.yaml" + config.write_text( + "schedule:\n" + " enabled: true\n" + " cron: '* * * * *'\n" + " lookback_hours: 1\n" + ) + + mod = _load_scheduler() + mod._shutdown_requested = False + + touch_calls = [0] + sleep_calls = [0] + + def fake_touch(): + touch_calls[0] += 1 + + def fake_sleep(seconds, chunk=5.0): + sleep_calls[0] += 1 + # Shut down after second sleep (after one successful eval cycle) + if sleep_calls[0] >= 2: + mod._shutdown_requested = True + + mock_runner = MagicMock() + mock_runner.run.return_value = { + "fetched": 10, + "sampled": 5, + "evaluated": 5, + "scored": 5, + } + + _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) + mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) + + with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): + with patch.object(mod, "_touch_health_file", side_effect=fake_touch): + with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): + with patch.object( + mod, + "load_schedule_config", + return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, + ): + with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): + mod.run_scheduler(str(config)) + + # Touched on startup + once after successful run = at least 2 calls + assert touch_calls[0] >= 2 + mock_runner.run.assert_called_once() + + def 
test_health_file_actual_touch(self, tmp_path): + """_touch_health_file() creates the file on disk.""" + health_file = tmp_path / "evaluator-scheduler.health" + + mod = _load_scheduler() + original_health = mod.HEALTH_FILE + mod.HEALTH_FILE = health_file + + try: + assert not health_file.exists() + mod._touch_health_file() + assert health_file.exists() + finally: + mod.HEALTH_FILE = original_health + + def test_health_file_touch_graceful_on_error(self, tmp_path): + """_touch_health_file() does not raise even if file path is unwritable.""" + mod = _load_scheduler() + original_health = mod.HEALTH_FILE + mod.HEALTH_FILE = Path("/nonexistent_dir/evaluator-scheduler.health") + + try: + # Should not raise + mod._touch_health_file() + finally: + mod.HEALTH_FILE = original_health From 92f7902ed2a9dc4d5b6a7361e3b17ec5780f5ad8 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 01:34:51 -0700 Subject: [PATCH 02/13] style: fix lint errors (ruff, black, isort) in evaluator test files Co-Authored-By: Claude Opus 4.6 (1M context) --- monitoring/main.py | 26 ++++-- scripts/create_score_configs.py | 18 +++- scripts/memory/evaluator_scheduler.py | 10 +- src/memory/evaluator/runner.py | 47 +++++++--- tests/test_create_score_configs.py | 28 ++++-- tests/test_evaluator_runner.py | 21 +++-- tests/test_monitoring_api.py | 32 +++---- tests/unit/test_evaluator_retry.py | 55 +++++++---- tests/unit/test_evaluator_scheduler.py | 124 +++++++++++++++---------- 9 files changed, 231 insertions(+), 130 deletions(-) diff --git a/monitoring/main.py b/monitoring/main.py index d792efe8..64ddc287 100755 --- a/monitoring/main.py +++ b/monitoring/main.py @@ -11,9 +11,8 @@ import asyncio import logging import os -from typing import Any, Dict, Optional - import warnings +from typing import Any, Dict, Optional from fastapi import FastAPI, HTTPException, status from fastapi.responses import JSONResponse, Response @@ -244,10 +243,15 @@ async def update_metrics_periodically(): except Exception 
as e: logger.warning( "metrics_update_failed", - extra={"collection": sanitize_log_input(collection_name), "error": sanitize_log_input(str(e))}, + extra={ + "collection": sanitize_log_input(collection_name), + "error": sanitize_log_input(str(e)), + }, ) except Exception as e: - logger.error("metrics_updater_error", extra={"error": sanitize_log_input(str(e))}) + logger.error( + "metrics_updater_error", extra={"error": sanitize_log_input(str(e))} + ) await asyncio.sleep(60) # Update every 60 seconds @@ -364,7 +368,10 @@ async def health(): except Exception as e: logger.warning( "stats_check_failed", - extra={"collection": sanitize_log_input(collection_name), "error": sanitize_log_input(str(e))}, + extra={ + "collection": sanitize_log_input(collection_name), + "error": sanitize_log_input(str(e)), + }, ) # Determine status based on warnings @@ -426,7 +433,9 @@ async def readiness(): logger.info("readiness_check_passed", extra={"qdrant_available": True}) return {"status": "ready", "qdrant_available": True} except Exception as e: - logger.warning("readiness_check_failed", extra={"error": sanitize_log_input(str(e))}) + logger.warning( + "readiness_check_failed", extra={"error": sanitize_log_input(str(e))} + ) return JSONResponse( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"status": "not_ready", "qdrant_available": False}, @@ -481,7 +490,10 @@ async def get_memory(memory_id: str, collection: str = "code-patterns"): except UnexpectedResponse as e: logger.error( "qdrant_error", - extra={"error": sanitize_log_input(str(e)), "memory_id": sanitize_log_input(memory_id)}, + extra={ + "error": sanitize_log_input(str(e)), + "memory_id": sanitize_log_input(memory_id), + }, ) raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Qdrant unavailable" diff --git a/scripts/create_score_configs.py b/scripts/create_score_configs.py index 5e5732f9..06cfc2e6 100644 --- a/scripts/create_score_configs.py +++ b/scripts/create_score_configs.py @@ -21,8 
+21,8 @@ PLAN-012 Phase 2 — Section 5.6 """ -import sys import argparse +import sys def _fetch_existing_configs(langfuse) -> dict[str, list]: @@ -66,7 +66,9 @@ def _cleanup_duplicates(langfuse, existing: dict[str, list]) -> None: continue # Sort by created_at ascending, keep first, delete the rest try: - sorted_configs = sorted(configs, key=lambda c: getattr(c, "created_at", "9999-99-99")) + sorted_configs = sorted( + configs, key=lambda c: getattr(c, "created_at", "9999-99-99") + ) except Exception: sorted_configs = configs @@ -79,11 +81,15 @@ def _cleanup_duplicates(langfuse, existing: dict[str, list]) -> None: langfuse.api.score_configs.delete(cfg_id) print(f" [DEL] Removed duplicate config '{name}' id={cfg_id}") except Exception as exc: - print(f" [WARN] Could not delete duplicate '{name}' id={cfg_id}: {exc}") + print( + f" [WARN] Could not delete duplicate '{name}' id={cfg_id}: {exc}" + ) def main() -> int: - parser = argparse.ArgumentParser(description="Create Langfuse score configs idempotently") + parser = argparse.ArgumentParser( + description="Create Langfuse score configs idempotently" + ) parser.add_argument( "--cleanup-duplicates", action="store_true", @@ -106,7 +112,9 @@ def main() -> int: print("Fetching existing Score Configs from Langfuse...") existing = _fetch_existing_configs(langfuse) if existing: - print(f" Found {sum(len(v) for v in existing.values())} existing config(s): {list(existing.keys())}") + print( + f" Found {sum(len(v) for v in existing.values())} existing config(s): {list(existing.keys())}" + ) else: print(" No existing configs found.") diff --git a/scripts/memory/evaluator_scheduler.py b/scripts/memory/evaluator_scheduler.py index 10250105..182152c8 100644 --- a/scripts/memory/evaluator_scheduler.py +++ b/scripts/memory/evaluator_scheduler.py @@ -69,7 +69,7 @@ def _touch_health_file() -> None: # ============================================================================= -def _handle_sigterm(signum, frame) -> None: # noqa: ARG001 
+def _handle_sigterm(signum, frame) -> None: """SIGTERM handler — request graceful shutdown.""" global _shutdown_requested logger.info("sigterm_received — requesting graceful shutdown") @@ -81,7 +81,7 @@ def _register_langfuse_shutdown() -> None: def _langfuse_shutdown() -> None: try: - from langfuse import get_client # noqa: PLC0415 + from langfuse import get_client client = get_client() if client: @@ -107,7 +107,7 @@ def load_schedule_config(config_path: str) -> dict: Returns: Schedule config dict with keys: enabled, cron, lookback_hours """ - import yaml # noqa: PLC0415 + import yaml with open(config_path) as f: config = yaml.safe_load(f) @@ -174,7 +174,7 @@ def run_scheduler(config_path: str = CONFIG_PATH) -> None: lookback_hours = int(schedule.get("lookback_hours", 24)) # Calculate next run time using croniter - from croniter import croniter # noqa: PLC0415 + from croniter import croniter now = datetime.now(tz=timezone.utc) try: @@ -222,7 +222,7 @@ def run_scheduler(config_path: str = CONFIG_PATH) -> None: ) try: - from memory.evaluator import EvaluatorRunner # noqa: PLC0415 + from memory.evaluator import EvaluatorRunner runner = EvaluatorRunner(config_path) summary = runner.run(since=since) diff --git a/src/memory/evaluator/runner.py b/src/memory/evaluator/runner.py index 2bad2497..a370ce29 100644 --- a/src/memory/evaluator/runner.py +++ b/src/memory/evaluator/runner.py @@ -7,9 +7,9 @@ 1. Load evaluator config + evaluator definitions 2. Per evaluator, read target: "trace" or "observation" (per-evaluator YAML field) 3. Trace path (EV-05, EV-06): page-based pagination via trace.list() - — uses from_timestamp/to_timestamp/page/meta.total_pages (V3 trace API) - 4. Observation path (EV-01–EV-04): cursor-based pagination via observations.get_many() - — uses from_start_time/to_start_time/cursor/meta.next_cursor (V3 observation API, BUG-217) + - uses from_timestamp/to_timestamp/page/meta.total_pages (V3 trace API) + 4. 
Observation path (EV-01-EV-04): cursor-based pagination via observations.get_many() + - uses from_start_time/to_start_time/cursor/meta.next_cursor (V3 observation API, BUG-217) 5. Filter observations by name (event_type) — server-side via name= parameter (Path B) 6. Sample traces/observations per evaluator's sampling_rate 7. Evaluate each trace/observation via configurable LLM judge @@ -183,7 +183,9 @@ def _build_observation_prompt(self, evaluator: dict, observation: Any) -> str: obs_name = str(getattr(observation, "name", "") or "") obs_input = str(getattr(observation, "input", "") or "")[:TRACE_CONTENT_MAX] obs_output = str(getattr(observation, "output", "") or "")[:TRACE_CONTENT_MAX] - obs_metadata = str(getattr(observation, "metadata", {}) or {})[:TRACE_CONTENT_MAX] + obs_metadata = str(getattr(observation, "metadata", {}) or {})[ + :TRACE_CONTENT_MAX + ] return ( f"{template}\n\n" @@ -313,7 +315,8 @@ def _run_observation_evaluator( result = self.evaluator_config.evaluate(prompt) if result.get("score") is None: logger.warning( - "Evaluator returned null score for observation %s", obs_id + "Evaluator returned null score for observation %s", + obs_id, ) continue @@ -333,11 +336,16 @@ def _run_observation_evaluator( # R1-F3: validate categorical value against defined categories if score_type == "CATEGORICAL": valid_categories = evaluator.get("categories", []) - if valid_categories and str(result["score"]) not in valid_categories: + if ( + valid_categories + and str(result["score"]) not in valid_categories + ): logger.warning( "Observation %s: categorical value %r not in " "categories %s — skipping", - obs_id, result["score"], valid_categories, + obs_id, + result["score"], + valid_categories, ) continue langfuse.create_score( @@ -356,13 +364,17 @@ def _run_observation_evaluator( # R2-F2: audit log only written when not dry_run self._append_audit_log( { - "timestamp": datetime.now(tz=timezone.utc).isoformat(), + "timestamp": datetime.now( + tz=timezone.utc + 
).isoformat(), "evaluator_id": evaluator.get("id"), "evaluator_name": ev_name, "trace_id": trace_id, "observation_id": obs_id, "score": result["score"], - "reasoning": str(result.get("reasoning", ""))[:500], + "reasoning": str(result.get("reasoning", ""))[ + :500 + ], "dry_run": dry_run, } ) @@ -382,7 +394,9 @@ def _run_observation_evaluator( continue # Cursor-based stop condition — no next cursor means last page - next_cursor = getattr(obs_response.meta, "next_cursor", None) # R2-F1 + next_cursor = getattr( + obs_response.meta, "next_cursor", None + ) # R2-F1 if not next_cursor or not observations: break cursor = next_cursor @@ -477,11 +491,16 @@ def _run_trace_evaluator( # R1-F3: validate categorical value against defined categories if score_type == "CATEGORICAL": valid_categories = evaluator.get("categories", []) - if valid_categories and str(result["score"]) not in valid_categories: + if ( + valid_categories + and str(result["score"]) not in valid_categories + ): logger.warning( "Trace %s: categorical value %r not in " "categories %s — skipping", - trace.id, result["score"], valid_categories, + trace.id, + result["score"], + valid_categories, ) continue langfuse.create_score( @@ -586,7 +605,9 @@ def run( # Per-evaluator target field is source of truth (DEPRECATED: global evaluation_targets) target = evaluator.get("target", "trace") - print(f"\n--- Running {evaluator.get('id', '?')}: {ev_name} (target={target}) ---") + print( + f"\n--- Running {evaluator.get('id', '?')}: {ev_name} (target={target}) ---" + ) if target == "observation": f, s, e, sc = self._run_observation_evaluator( diff --git a/tests/test_create_score_configs.py b/tests/test_create_score_configs.py index 899c614a..3d506506 100644 --- a/tests/test_create_score_configs.py +++ b/tests/test_create_score_configs.py @@ -16,8 +16,6 @@ from contextlib import contextmanager from unittest.mock import MagicMock -import pytest - # --------------------------------------------------------------------------- # 
Path setup # --------------------------------------------------------------------------- @@ -40,7 +38,9 @@ ] -def _make_config(name: str, cfg_id: str = "abc123", created_at: str = "2026-01-01T00:00:00Z"): +def _make_config( + name: str, cfg_id: str = "abc123", created_at: str = "2026-01-01T00:00:00Z" +): cfg = MagicMock() cfg.name = name cfg.id = cfg_id @@ -165,7 +165,9 @@ class TestCleanupDuplicates: def test_skips_non_duplicates(self): client = MagicMock() with _patched_module(client) as (mod, c): - existing = {"retrieval_relevance": [_make_config("retrieval_relevance", "id-1")]} + existing = { + "retrieval_relevance": [_make_config("retrieval_relevance", "id-1")] + } mod._cleanup_duplicates(c, existing) client.api.score_configs.delete.assert_not_called() @@ -194,7 +196,9 @@ def test_missing_created_at_sorts_last_and_gets_deleted(self): client = MagicMock() client.api.score_configs.delete.return_value = None # Config with a real date — should be kept as the "oldest" - with_date = _make_config("retrieval_relevance", "id-with-date", "2026-01-01T00:00:00Z") + with_date = _make_config( + "retrieval_relevance", "id-with-date", "2026-01-01T00:00:00Z" + ) # Config with no created_at attribute — fallback "9999-99-99" sorts last, gets deleted no_date = MagicMock() no_date.name = "retrieval_relevance" @@ -260,21 +264,25 @@ def test_flush_always_called_on_success(self): def test_cleanup_duplicates_flag_triggers_dedup(self): older = _make_config("retrieval_relevance", "id-old", "2026-01-01T00:00:00Z") newer = _make_config("retrieval_relevance", "id-new", "2026-02-01T00:00:00Z") - other_existing = [_make_config(n) for n in ALL_CONFIG_NAMES if n != "retrieval_relevance"] + other_existing = [ + _make_config(n) for n in ALL_CONFIG_NAMES if n != "retrieval_relevance" + ] # After cleanup, deduped list returned - deduped = [_make_config("retrieval_relevance", "id-old")] + other_existing + deduped = [_make_config("retrieval_relevance", "id-old"), *other_existing] client = 
MagicMock() client.api.score_configs.list.side_effect = [ - _make_list_response([older, newer] + other_existing), # initial fetch - _make_list_response(deduped), # post-cleanup fetch + _make_list_response([older, newer, *other_existing]), # initial fetch + _make_list_response(deduped), # post-cleanup fetch ] client.api.score_configs.create.return_value = MagicMock() client.api.score_configs.delete.return_value = None client.flush.return_value = None - rc, c = _run_main(client, argv=["create_score_configs.py", "--cleanup-duplicates"]) + rc, c = _run_main( + client, argv=["create_score_configs.py", "--cleanup-duplicates"] + ) assert rc == 0 # Duplicate deleted diff --git a/tests/test_evaluator_runner.py b/tests/test_evaluator_runner.py index 3cfd63d5..ec90fa02 100644 --- a/tests/test_evaluator_runner.py +++ b/tests/test_evaluator_runner.py @@ -6,7 +6,7 @@ Covers: - Trace-level evaluation path (EV-05, EV-06 style) -- Observation-level evaluation path (EV-01–EV-04 style) +- Observation-level evaluation path (EV-01-EV-04 style) - Cursor-based pagination for observations (BUG-217) - Page-based pagination for traces - CATEGORICAL score type (EV-04) @@ -422,7 +422,9 @@ def test_run_skips_trace_with_no_output(self, runner, evaluator_yaml): result = runner.run(since=since) assert result["evaluated"] == 0 - assert result["sampled"] == 0 # R2-F7: sampled not incremented for no-output traces + assert ( + result["sampled"] == 0 + ) # R2-F7: sampled not incremented for no-output traces mock_evaluator_config.evaluate.assert_not_called() def test_run_uses_page_pagination_for_trace_path(self, runner, evaluator_yaml): @@ -567,7 +569,10 @@ def test_observation_path_calls_observations_get_many( ) mock_evaluator_config = MagicMock() - mock_evaluator_config.evaluate.return_value = {"score": 0.85, "reasoning": "Good"} + mock_evaluator_config.evaluate.return_value = { + "score": 0.85, + "reasoning": "Good", + } since = datetime(2026, 3, 12, tzinfo=timezone.utc) @@ -624,7 +629,9 @@ def 
test_observation_path_skips_no_output(self, runner, observation_evaluator_ya result = runner.run(since=since) assert result["evaluated"] == 0 - assert result["sampled"] == 0 # R2-F7: sampled not incremented for no-output obs + assert ( + result["sampled"] == 0 + ) # R2-F7: sampled not incremented for no-output obs mock_evaluator_config.evaluate.assert_not_called() def test_observation_path_dry_run_no_score( @@ -713,7 +720,7 @@ def test_cursor_pagination_fetches_all_pages( patch("memory.evaluator.runner.get_client", return_value=mock_langfuse), patch.object(runner, "evaluator_config", mock_evaluator_config), ): - result = runner.run(since=since) + _result = runner.run(since=since) # Each event_type does 2 pages → 4 total calls for 2 event_types assert mock_langfuse.api.observations.get_many.call_count == 4 @@ -1196,7 +1203,9 @@ def test_invalid_categorical_value_skipped( mock_langfuse.create_score.assert_not_called() mock_logger.warning.assert_called() - def test_valid_categorical_value_is_scored(self, runner, categorical_evaluator_yaml): + def test_valid_categorical_value_is_scored( + self, runner, categorical_evaluator_yaml + ): """Categorical score within categories list must be scored normally.""" obs = make_mock_observation(obs_id="obs_goodcat", trace_id="trace_goodcat") mock_langfuse = MagicMock() diff --git a/tests/test_monitoring_api.py b/tests/test_monitoring_api.py index 9b8d044a..9bb88819 100644 --- a/tests/test_monitoring_api.py +++ b/tests/test_monitoring_api.py @@ -12,9 +12,7 @@ import sys from pathlib import Path -from unittest.mock import MagicMock, AsyncMock - -import pytest +from unittest.mock import MagicMock # --------------------------------------------------------------------------- @@ -71,14 +69,12 @@ def mount(self, *a, **kw): sys.path.insert(0, _monitoring_path) # Now import the function under test -import importlib -import types - # Use importlib to import main with patched sys.modules already in place if "main" in sys.modules: del 
sys.modules["main"] import main as _monitoring_main # noqa: E402 (monitoring/main.py) + sanitize_log_input = _monitoring_main.sanitize_log_input @@ -195,6 +191,7 @@ def _read_main_source(self) -> str: def test_str_e_always_wrapped_in_sanitize(self): """Every str(e) used in a logger call must be wrapped with sanitize_log_input.""" import re + source = self._read_main_source() # More precise: look for "error": str(e) that is NOT inside sanitize_log_input(...) @@ -202,21 +199,24 @@ def test_str_e_always_wrapped_in_sanitize(self): for match in re.finditer(r'"error":\s*(str\([^)]+\))', source): str_pos = match.start(1) sanitize_prefix = "sanitize_log_input(" - prefix = source[max(0, str_pos - len(sanitize_prefix)):str_pos] - assert prefix.endswith(sanitize_prefix), ( - f"Unsanitized str() found at log call site: {match.group(0)}" - ) + prefix = source[max(0, str_pos - len(sanitize_prefix)) : str_pos] + assert prefix.endswith( + sanitize_prefix + ), f"Unsanitized str() found at log call site: {match.group(0)}" def test_collection_name_in_extra_always_sanitized(self): """Every collection_name used in logger extra dicts must be sanitized inline.""" import re + source = self._read_main_source() # Find bare "collection": collection_name (not wrapped in sanitize_log_input) - for match in re.finditer(r'"collection":\s*(collection_name|request\.collection)', source): - value = match.group(1) + for match in re.finditer( + r'"collection":\s*(collection_name|request\.collection)', source + ): + _value = match.group(1) context_start = max(0, match.start() - 60) - context = source[context_start:match.end()] - assert "sanitize_log_input" in context, ( - f"Unsanitized collection_name found at log call site: ...{context}..." - ) + context = source[context_start : match.end()] + assert ( + "sanitize_log_input" in context + ), f"Unsanitized collection_name found at log call site: ...{context}..." 
diff --git a/tests/unit/test_evaluator_retry.py b/tests/unit/test_evaluator_retry.py index dec2d951..5a478701 100644 --- a/tests/unit/test_evaluator_retry.py +++ b/tests/unit/test_evaluator_retry.py @@ -7,11 +7,11 @@ from memory.evaluator.provider import EvaluatorConfig - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- + def _make_config(provider="ollama", max_retries=3) -> EvaluatorConfig: cfg = EvaluatorConfig(provider=provider, max_retries=max_retries) return cfg @@ -19,7 +19,7 @@ def _make_config(provider="ollama", max_retries=3) -> EvaluatorConfig: def _http_error(status_code: int, retry_after: str | None = None): """Create a mock exception that looks like an HTTP error.""" - exc = Exception(f"HTTP {status_code}") + exc = RuntimeError(f"HTTP {status_code}") exc.status_code = status_code if retry_after is not None: response = Mock() @@ -43,6 +43,7 @@ def _good_openai_response(text: str = '{"score": 0.9, "reasoning": "ok"}'): # Tests # --------------------------------------------------------------------------- + class TestEvaluatorRetry: def test_success_on_first_attempt(self): @@ -88,10 +89,14 @@ def test_exponential_backoff_timing(self): sleep_calls = [] - with patch("memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append): - with patch("memory.evaluator.provider.random.uniform", return_value=0.0): - with pytest.raises(Exception): - cfg.evaluate("test prompt") + with ( + patch( + "memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append + ), + patch("memory.evaluator.provider.random.uniform", return_value=0.0), + pytest.raises(RuntimeError), + ): + cfg.evaluate("test prompt") # With max_retries=3 we sleep 3 times (attempts 0,1,2) before final failure assert len(sleep_calls) == 3 @@ -107,11 +112,15 @@ def test_max_retries_exhaustion_raises(self): mock_client.chat.completions.create.side_effect = _http_error(503) 
cfg._client = mock_client - with patch("memory.evaluator.provider.time.sleep"): - with pytest.raises(Exception, match="HTTP 503"): - cfg.evaluate("test prompt") + with ( + patch("memory.evaluator.provider.time.sleep"), + pytest.raises(Exception, match="HTTP 503"), + ): + cfg.evaluate("test prompt") - assert mock_client.chat.completions.create.call_count == 3 # initial + 2 retries + assert ( + mock_client.chat.completions.create.call_count == 3 + ) # initial + 2 retries def test_non_retryable_400_fails_immediately(self): """400 errors should not be retried.""" @@ -143,9 +152,13 @@ def side_effect(**kwargs): sleep_calls = [] - with patch("memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append): - with patch("memory.evaluator.provider.random.uniform", return_value=0.0): - result = cfg.evaluate("test prompt") + with ( + patch( + "memory.evaluator.provider.time.sleep", side_effect=sleep_calls.append + ), + patch("memory.evaluator.provider.random.uniform", return_value=0.0), + ): + result = cfg.evaluate("test prompt") assert result["score"] == 0.9 # First sleep should use Retry-After=5 as base (plus zero jitter) @@ -202,9 +215,11 @@ def test_client_reset_after_exhaustion(self): mock_client.chat.completions.create.side_effect = _http_error(500) cfg._client = mock_client - with patch("memory.evaluator.provider.time.sleep"): - with pytest.raises(Exception): - cfg.evaluate("test prompt") + with ( + patch("memory.evaluator.provider.time.sleep"), + pytest.raises(RuntimeError), + ): + cfg.evaluate("test prompt") assert cfg._client is None @@ -215,8 +230,10 @@ def test_client_reset_after_connection_error_exhaustion(self): mock_client.chat.completions.create.side_effect = ConnectionError("refused") cfg._client = mock_client - with patch("memory.evaluator.provider.time.sleep"): - with pytest.raises(ConnectionError): - cfg.evaluate("test prompt") + with ( + patch("memory.evaluator.provider.time.sleep"), + pytest.raises(ConnectionError), + ): + cfg.evaluate("test 
prompt") assert cfg._client is None diff --git a/tests/unit/test_evaluator_scheduler.py b/tests/unit/test_evaluator_scheduler.py index 938f910b..16fb69c1 100644 --- a/tests/unit/test_evaluator_scheduler.py +++ b/tests/unit/test_evaluator_scheduler.py @@ -11,7 +11,6 @@ import importlib.util import sys -import time from datetime import datetime, timedelta, timezone from pathlib import Path from unittest.mock import MagicMock, patch @@ -29,7 +28,8 @@ CRONITER_AVAILABLE = False pytestmark_croniter = pytest.mark.skipif( - not CRONITER_AVAILABLE, reason="croniter not installed — add croniter>=2.0.0 to requirements.txt" + not CRONITER_AVAILABLE, + reason="croniter not installed — add croniter>=2.0.0 to requirements.txt", ) @@ -54,7 +54,9 @@ def _make_croniter_mock(next_run_dt: datetime): def _load_scheduler(): """Load evaluator_scheduler module via importlib (not on sys.path as a package).""" - spec = importlib.util.spec_from_file_location("evaluator_scheduler", _SCHEDULER_PATH) + spec = importlib.util.spec_from_file_location( + "evaluator_scheduler", _SCHEDULER_PATH + ) mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) return mod @@ -114,7 +116,7 @@ class TestCronScheduleParsing: @pytestmark_croniter def test_croniter_returns_future_datetime(self): """croniter.get_next() returns a datetime after 'now'.""" - from croniter import croniter # noqa: PLC0415 + from croniter import croniter now = datetime(2026, 3, 14, 4, 0, 0, tzinfo=timezone.utc) cron = croniter("0 5 * * *", now) @@ -128,7 +130,7 @@ def test_croniter_returns_future_datetime(self): @pytestmark_croniter def test_croniter_sleep_seconds_positive(self): """sleep_seconds should always be positive (next run is in the future).""" - from croniter import croniter # noqa: PLC0415 + from croniter import croniter now = datetime(2026, 3, 14, 4, 59, 59, tzinfo=timezone.utc) cron = croniter("0 5 * * *", now) @@ -177,19 +179,21 @@ def test_enabled_false_does_not_call_runner(self, tmp_path): mod = 
_load_scheduler() mod._shutdown_requested = False - call_count = 0 + _call_count = 0 def fake_interruptible_sleep(seconds, chunk=5.0): # After one sleep (schedule disabled), set shutdown to stop the loop mod._shutdown_requested = True - with patch.object(mod, "_interruptible_sleep", side_effect=fake_interruptible_sleep): - with patch.object(mod, "load_schedule_config", return_value={"enabled": False}): - with patch( - "memory.evaluator.EvaluatorRunner" - ) as mock_runner_cls: - mod.run_scheduler(str(config)) - mock_runner_cls.assert_not_called() + with ( + patch.object( + mod, "_interruptible_sleep", side_effect=fake_interruptible_sleep + ), + patch.object(mod, "load_schedule_config", return_value={"enabled": False}), + patch("memory.evaluator.EvaluatorRunner") as mock_runner_cls, + ): + mod.run_scheduler(str(config)) + mock_runner_cls.assert_not_called() # --------------------------------------------------------------------------- @@ -225,17 +229,25 @@ def fake_interruptible_sleep(seconds, chunk=5.0): _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) - with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): - with patch.object(mod, "_interruptible_sleep", side_effect=fake_interruptible_sleep): - with patch.object( - mod, - "load_schedule_config", - return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, - ): - with patch.object(mod, "_touch_health_file") as mock_touch: - with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): - # Should NOT raise — errors are caught and logged - mod.run_scheduler(str(config)) + with ( + patch.dict(sys.modules, {"croniter": mock_croniter_mod}), + patch.object( + mod, "_interruptible_sleep", side_effect=fake_interruptible_sleep + ), + patch.object( + mod, + "load_schedule_config", + return_value={ + "enabled": True, + "cron": "* * * * *", + "lookback_hours": 1, + }, + ), + patch.object(mod, "_touch_health_file") as 
mock_touch, + patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner), + ): + # Should NOT raise — errors are caught and logged + mod.run_scheduler(str(config)) # Runner was called at least once mock_runner.run.assert_called() @@ -275,16 +287,22 @@ def fake_touch(): _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) - with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): - with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): - with patch.object( - mod, - "load_schedule_config", - return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, - ): - with patch.object(mod, "_touch_health_file", side_effect=fake_touch): - with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): - mod.run_scheduler(str(config)) + with ( + patch.dict(sys.modules, {"croniter": mock_croniter_mod}), + patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep), + patch.object( + mod, + "load_schedule_config", + return_value={ + "enabled": True, + "cron": "* * * * *", + "lookback_hours": 1, + }, + ), + patch.object(mod, "_touch_health_file", side_effect=fake_touch), + patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner), + ): + mod.run_scheduler(str(config)) # Health file should NOT be touched after the failed evaluation assert touch_after_startup[0] == 0 @@ -310,14 +328,16 @@ def fake_sleep(seconds, chunk=5.0): # Immediately shut down after first sleep mod._shutdown_requested = True - with patch.object(mod, "_touch_health_file", side_effect=fake_touch): - with patch.object( + with ( + patch.object(mod, "_touch_health_file", side_effect=fake_touch), + patch.object( mod, "load_schedule_config", return_value={"enabled": False}, - ): - with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): - mod.run_scheduler(str(tmp_path / "evaluator_config.yaml")) + ), + patch.object(mod, "_interruptible_sleep", 
side_effect=fake_sleep), + ): + mod.run_scheduler(str(tmp_path / "evaluator_config.yaml")) # At minimum, touched on startup assert len(touch_calls) >= 1 @@ -358,16 +378,22 @@ def fake_sleep(seconds, chunk=5.0): _next_run = datetime(2026, 3, 14, 5, 0, 0, tzinfo=timezone.utc) mock_croniter_mod, _, _ = _make_croniter_mock(_next_run) - with patch.dict(sys.modules, {"croniter": mock_croniter_mod}): - with patch.object(mod, "_touch_health_file", side_effect=fake_touch): - with patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep): - with patch.object( - mod, - "load_schedule_config", - return_value={"enabled": True, "cron": "* * * * *", "lookback_hours": 1}, - ): - with patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner): - mod.run_scheduler(str(config)) + with ( + patch.dict(sys.modules, {"croniter": mock_croniter_mod}), + patch.object(mod, "_touch_health_file", side_effect=fake_touch), + patch.object(mod, "_interruptible_sleep", side_effect=fake_sleep), + patch.object( + mod, + "load_schedule_config", + return_value={ + "enabled": True, + "cron": "* * * * *", + "lookback_hours": 1, + }, + ), + patch("memory.evaluator.EvaluatorRunner", return_value=mock_runner), + ): + mod.run_scheduler(str(config)) # Touched on startup + once after successful run = at least 2 calls assert touch_calls[0] >= 2 From 9e20ca1aa04de965562ea22db7fd77a67ea4a02d Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 02:05:51 -0700 Subject: [PATCH 03/13] fix(installer): always copy requirements.txt and pyproject.toml on update Option 1 (add-project) and copy_files() both skipped requirements.txt if it already existed, preventing new dependencies like croniter from reaching Docker builds. Now always overwrites both files. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/install.sh | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index 9f5f83aa..6a36e337 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1049,6 +1049,17 @@ update_shared_scripts() { log_debug "Updated evaluators/ directory" fi + # Sync requirements.txt and pyproject.toml (needed by Docker builds) + # Always overwrite — new dependencies (e.g. croniter) must reach containers + if [[ -f "$SCRIPT_DIR/../requirements.txt" ]]; then + cp "$SCRIPT_DIR/../requirements.txt" "$INSTALL_DIR/requirements.txt" || log_warning "Failed to copy requirements.txt" + log_debug "Updated requirements.txt" + fi + if [[ -f "$SCRIPT_DIR/../pyproject.toml" ]]; then + cp "$SCRIPT_DIR/../pyproject.toml" "$INSTALL_DIR/pyproject.toml" || log_warning "Failed to copy pyproject.toml" + log_debug "Updated pyproject.toml" + fi + # Update templates (new templates added in updates must reach installed dir) if [[ -d "$SCRIPT_DIR/../templates" ]]; then mkdir -p "$INSTALL_DIR/templates" @@ -1334,12 +1345,13 @@ install_python_dependencies() { return 0 fi - # Copy pyproject.toml and requirements.txt to install directory if not already there - if [[ ! -f "$INSTALL_DIR/pyproject.toml" ]]; then + # Copy pyproject.toml and requirements.txt to install directory + # Always overwrite — updated deps (e.g. croniter) must reach Docker builds + if [[ -f "$source_dir/pyproject.toml" ]]; then cp "$source_dir/pyproject.toml" "$INSTALL_DIR/" log_debug "Copied pyproject.toml to $INSTALL_DIR" fi - if [[ -f "$source_dir/requirements.txt" && ! 
-f "$INSTALL_DIR/requirements.txt" ]]; then + if [[ -f "$source_dir/requirements.txt" ]]; then cp "$source_dir/requirements.txt" "$INSTALL_DIR/" log_debug "Copied requirements.txt to $INSTALL_DIR" fi From 7c936640d29271dabfc43aaff33b1d3c476eb583 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 02:14:17 -0700 Subject: [PATCH 04/13] fix(score-configs): use api.score_configs.get() not .list() for V3 SDK V3 SDK ScoreConfigsClient exposes get(page=, limit=) not list(). Also fixes test mocks to match. Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/create_score_configs.py | 4 ++-- tests/test_create_score_configs.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/create_score_configs.py b/scripts/create_score_configs.py index 06cfc2e6..446907f9 100644 --- a/scripts/create_score_configs.py +++ b/scripts/create_score_configs.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # LANGFUSE: V3 SDK ONLY. See LANGFUSE-INTEGRATION-SPEC.md # FORBIDDEN: Langfuse() constructor, start_span(), start_generation(), langfuse_context -# REQUIRED: get_client(), api.score_configs.create(), api.score_configs.list(), flush() +# REQUIRED: get_client(), api.score_configs.create(), api.score_configs.get(), flush() """Idempotent Score Config setup in Langfuse. Creates 6 Score Configs that enforce validation schemas on evaluation scores. 
@@ -35,7 +35,7 @@ def _fetch_existing_configs(langfuse) -> dict[str, list]: page = 1 while True: try: - response = langfuse.api.score_configs.list(page=page, limit=100) + response = langfuse.api.score_configs.get(page=page, limit=100) except Exception as exc: print(f" [WARN] Could not list existing score configs: {exc}") break diff --git a/tests/test_create_score_configs.py b/tests/test_create_score_configs.py index 3d506506..c15002ec 100644 --- a/tests/test_create_score_configs.py +++ b/tests/test_create_score_configs.py @@ -109,9 +109,9 @@ def _make_client_with_existing(existing_names: list[str], list_side_effect=None) client = MagicMock() configs = [_make_config(n) for n in existing_names] if list_side_effect is not None: - client.api.score_configs.list.side_effect = list_side_effect + client.api.score_configs.get.side_effect = list_side_effect else: - client.api.score_configs.list.return_value = _make_list_response(configs) + client.api.score_configs.get.return_value = _make_list_response(configs) client.api.score_configs.create.return_value = MagicMock() client.api.score_configs.delete.return_value = None client.flush.return_value = None @@ -143,14 +143,14 @@ def test_detects_duplicates(self): _make_config("retrieval_relevance", "id-2"), ] client = MagicMock() - client.api.score_configs.list.return_value = _make_list_response(configs) + client.api.score_configs.get.return_value = _make_list_response(configs) with _patched_module(client) as (mod, c): result = mod._fetch_existing_configs(c) assert len(result["retrieval_relevance"]) == 2 def test_handles_list_api_error_gracefully(self): client = MagicMock() - client.api.score_configs.list.side_effect = RuntimeError("network error") + client.api.score_configs.get.side_effect = RuntimeError("network error") with _patched_module(client) as (mod, c): result = mod._fetch_existing_configs(c) assert result == {} @@ -272,7 +272,7 @@ def test_cleanup_duplicates_flag_triggers_dedup(self): deduped = 
[_make_config("retrieval_relevance", "id-old"), *other_existing] client = MagicMock() - client.api.score_configs.list.side_effect = [ + client.api.score_configs.get.side_effect = [ _make_list_response([older, newer, *other_existing]), # initial fetch _make_list_response(deduped), # post-cleanup fetch ] From e0246c91a3971d15a98e9a313f9cce7cc515c9d6 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 02:21:44 -0700 Subject: [PATCH 05/13] fix(score-configs): archive duplicates instead of delete (V3 SDK has no delete) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Langfuse V3 API returns 405 on DELETE for score configs. Uses update(isArchived=True) instead — archived configs hidden in UI. Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/create_score_configs.py | 23 ++++++++++++++++------- tests/test_create_score_configs.py | 26 +++++++++++++++----------- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/scripts/create_score_configs.py b/scripts/create_score_configs.py index 446907f9..2d4fb516 100644 --- a/scripts/create_score_configs.py +++ b/scripts/create_score_configs.py @@ -60,11 +60,17 @@ def _fetch_existing_configs(langfuse) -> dict[str, list]: def _cleanup_duplicates(langfuse, existing: dict[str, list]) -> None: - """Delete duplicate score configs, keeping the oldest one (first created).""" + """Archive duplicate score configs, keeping the oldest one (first created). + + Langfuse API does not support DELETE on score configs (405). + Uses update(isArchived=True) instead — archived configs are hidden in UI. 
+ """ + from langfuse.api.resources.score_configs.types import UpdateScoreConfigRequest + for name, configs in existing.items(): if len(configs) <= 1: continue - # Sort by created_at ascending, keep first, delete the rest + # Sort by created_at ascending, keep first, archive the rest try: sorted_configs = sorted( configs, key=lambda c: getattr(c, "created_at", "9999-99-99") @@ -72,17 +78,20 @@ def _cleanup_duplicates(langfuse, existing: dict[str, list]) -> None: except Exception: sorted_configs = configs - to_delete = sorted_configs[1:] - for cfg in to_delete: + to_archive = sorted_configs[1:] + for cfg in to_archive: cfg_id = getattr(cfg, "id", None) if not cfg_id: continue try: - langfuse.api.score_configs.delete(cfg_id) - print(f" [DEL] Removed duplicate config '{name}' id={cfg_id}") + langfuse.api.score_configs.update( + cfg_id, + request=UpdateScoreConfigRequest(isArchived=True), + ) + print(f" [ARCHIVED] Duplicate config '{name}' id={cfg_id}") except Exception as exc: print( - f" [WARN] Could not delete duplicate '{name}' id={cfg_id}: {exc}" + f" [WARN] Could not archive duplicate '{name}' id={cfg_id}: {exc}" ) diff --git a/tests/test_create_score_configs.py b/tests/test_create_score_configs.py index c15002ec..f2e8686d 100644 --- a/tests/test_create_score_configs.py +++ b/tests/test_create_score_configs.py @@ -71,6 +71,7 @@ def _build_langfuse_modules(mock_client): fake_sc_types = types.ModuleType("langfuse.api.resources.score_configs.types") fake_sc_types.CreateScoreConfigRequest = MagicMock(side_effect=lambda **kw: kw) + fake_sc_types.UpdateScoreConfigRequest = MagicMock(side_effect=lambda **kw: kw) modules = { "langfuse": fake_langfuse, @@ -113,7 +114,7 @@ def _make_client_with_existing(existing_names: list[str], list_side_effect=None) else: client.api.score_configs.get.return_value = _make_list_response(configs) client.api.score_configs.create.return_value = MagicMock() - client.api.score_configs.delete.return_value = None + 
client.api.score_configs.update.return_value = None client.flush.return_value = None return client @@ -169,7 +170,7 @@ def test_skips_non_duplicates(self): "retrieval_relevance": [_make_config("retrieval_relevance", "id-1")] } mod._cleanup_duplicates(c, existing) - client.api.score_configs.delete.assert_not_called() + client.api.score_configs.update.assert_not_called() def test_deletes_extra_copies_keeps_oldest(self): client = MagicMock() @@ -178,12 +179,13 @@ def test_deletes_extra_copies_keeps_oldest(self): with _patched_module(client) as (mod, c): existing = {"retrieval_relevance": [older, newer]} mod._cleanup_duplicates(c, existing) - # Newer (id-new) should be deleted; oldest (id-old) kept - client.api.score_configs.delete.assert_called_once_with("id-new") + # Newer (id-new) should be archived; oldest (id-old) kept + client.api.score_configs.update.assert_called_once() + assert client.api.score_configs.update.call_args[0][0] == "id-new" def test_handles_delete_error_gracefully(self): client = MagicMock() - client.api.score_configs.delete.side_effect = RuntimeError("delete failed") + client.api.score_configs.update.side_effect = RuntimeError("delete failed") older = _make_config("retrieval_relevance", "id-old", "2026-01-01T00:00:00Z") newer = _make_config("retrieval_relevance", "id-new", "2026-02-01T00:00:00Z") with _patched_module(client) as (mod, c): @@ -194,7 +196,7 @@ def test_handles_delete_error_gracefully(self): def test_missing_created_at_sorts_last_and_gets_deleted(self): """Config without created_at should sort last (treated as newest) and be deleted.""" client = MagicMock() - client.api.score_configs.delete.return_value = None + client.api.score_configs.update.return_value = None # Config with a real date — should be kept as the "oldest" with_date = _make_config( "retrieval_relevance", "id-with-date", "2026-01-01T00:00:00Z" @@ -207,8 +209,9 @@ def test_missing_created_at_sorts_last_and_gets_deleted(self): with _patched_module(client) as (mod, c): 
existing = {"retrieval_relevance": [with_date, no_date]} mod._cleanup_duplicates(c, existing) - # The one without created_at should be deleted; the dated one kept - c.api.score_configs.delete.assert_called_once_with("id-no-date") + # The one without created_at should be archived; the dated one kept + c.api.score_configs.update.assert_called_once() + assert c.api.score_configs.update.call_args[0][0] == "id-no-date" # --------------------------------------------------------------------------- @@ -277,7 +280,7 @@ def test_cleanup_duplicates_flag_triggers_dedup(self): _make_list_response(deduped), # post-cleanup fetch ] client.api.score_configs.create.return_value = MagicMock() - client.api.score_configs.delete.return_value = None + client.api.score_configs.update.return_value = None client.flush.return_value = None rc, c = _run_main( @@ -285,8 +288,9 @@ def test_cleanup_duplicates_flag_triggers_dedup(self): ) assert rc == 0 - # Duplicate deleted - c.api.score_configs.delete.assert_called_once_with("id-new") + # Duplicate archived + c.api.score_configs.update.assert_called_once() + assert c.api.score_configs.update.call_args[0][0] == "id-new" # All 6 configs present after cleanup — nothing to create c.api.score_configs.create.assert_not_called() c.flush.assert_called_once() From ab4fd34dd15c1701ade15304e02d8f96026ae1e4 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 02:35:07 -0700 Subject: [PATCH 06/13] fix(runner): use page-based pagination for observations.get_many() V3 SDK observations.get_many() uses page=/total_pages, not cursor. Both trace.list() and observations.get_many() are page-based in V3. Fixed runner and all test mocks to match actual SDK signatures. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/memory/evaluator/runner.py | 25 ++++++++++++------------- tests/test_evaluator_runner.py | 34 +++++++++++++++++----------------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/src/memory/evaluator/runner.py b/src/memory/evaluator/runner.py index a370ce29..8e85f7be 100644 --- a/src/memory/evaluator/runner.py +++ b/src/memory/evaluator/runner.py @@ -8,8 +8,8 @@ 2. Per evaluator, read target: "trace" or "observation" (per-evaluator YAML field) 3. Trace path (EV-05, EV-06): page-based pagination via trace.list() - uses from_timestamp/to_timestamp/page/meta.total_pages (V3 trace API) - 4. Observation path (EV-01-EV-04): cursor-based pagination via observations.get_many() - - uses from_start_time/to_start_time/cursor/meta.next_cursor (V3 observation API, BUG-217) + 4. Observation path (EV-01-EV-04): page-based pagination via observations.get_many() + - uses from_start_time/to_start_time/page/meta.total_pages (V3 observation API) 5. Filter observations by name (event_type) — server-side via name= parameter (Path B) 6. Sample traces/observations per evaluator's sampling_rate 7. Evaluate each trace/observation via configurable LLM judge @@ -20,7 +20,7 @@ Use the per-evaluator target: field in each evaluator YAML instead. PLAN-012 Phase 2 — Section 5.4 -S-16.3: Observation-level evaluation + cursor-based observation pagination (BUG-217) +S-16.3: Observation-level evaluation + page-based observation pagination """ import hashlib @@ -245,7 +245,7 @@ def _run_observation_evaluator( ) -> tuple[int, int, int, int]: """Run observation-level evaluation for a single evaluator. - Fetches observations via observations.get_many() with cursor-based pagination + Fetches observations via observations.get_many() with page-based pagination (BUG-217). Filters by observation name (event_type) server-side via name= parameter (Path B — tags are trace-level only in V3). 
@@ -274,15 +274,15 @@ def _run_observation_evaluator( try: # R2-F3: catch fetch-level errors so one evaluator doesn't kill the whole run for name_filter in name_filters: - cursor: str | None = None + page = 1 while True: - # Cursor-based pagination — V3 observations API (BUG-217) + # Page-based pagination — V3 observations API obs_response = langfuse.api.observations.get_many( name=name_filter, from_start_time=since, to_start_time=until, - cursor=cursor, + page=page, limit=batch_size, ) observations = obs_response.data or [] @@ -393,13 +393,12 @@ def _run_observation_evaluator( ) continue - # Cursor-based stop condition — no next cursor means last page - next_cursor = getattr( - obs_response.meta, "next_cursor", None - ) # R2-F1 - if not next_cursor or not observations: + # Page-based stop condition + meta = getattr(obs_response, "meta", None) + total_pages = getattr(meta, "total_pages", 1) if meta else 1 + if page >= total_pages or not observations: break - cursor = next_cursor + page += 1 except Exception as exc: logger.error("Evaluator %s: error fetching observations: %s", ev_name, exc) diff --git a/tests/test_evaluator_runner.py b/tests/test_evaluator_runner.py index ec90fa02..d992f178 100644 --- a/tests/test_evaluator_runner.py +++ b/tests/test_evaluator_runner.py @@ -212,12 +212,12 @@ def make_paginated_response(traces: list, total_pages=1): return response -def make_observation_response(observations: list, next_cursor: str | None = None): - """Cursor-based response for observations.get_many().""" +def make_observation_response(observations: list, total_pages: int = 1): + """Page-based response for observations.get_many().""" response = MagicMock() response.data = observations response.meta = MagicMock() - response.meta.next_cursor = next_cursor # R2-F1: V3 API uses next_cursor not cursor + response.meta.total_pages = total_pages return response @@ -693,22 +693,22 @@ def test_observation_path_appends_audit_log_with_observation_id( # 
--------------------------------------------------------------------------- -class TestCursorPaginationObservations: - def test_cursor_pagination_fetches_all_pages( +class TestPagePaginationObservations: + def test_page_pagination_fetches_all_pages( self, runner, observation_evaluator_yaml ): - """observations.get_many() must be called with next_cursor on subsequent pages.""" + """observations.get_many() must paginate via page= with total_pages.""" obs1 = make_mock_observation(obs_id="obs_p1", trace_id="t1") obs2 = make_mock_observation(obs_id="obs_p2", trace_id="t2") mock_langfuse = MagicMock() - # First call returns obs1 + next_cursor; second call returns obs2 + no cursor + # First call returns obs1 with total_pages=2; second returns obs2 with total_pages=2 mock_langfuse.api.observations.get_many.side_effect = [ - make_observation_response([obs1], next_cursor="cursor_abc"), - make_observation_response([obs2], next_cursor=None), + make_observation_response([obs1], total_pages=2), + make_observation_response([obs2], total_pages=2), # Additional responses for the second event_type - make_observation_response([obs1], next_cursor="cursor_abc"), - make_observation_response([obs2], next_cursor=None), + make_observation_response([obs1], total_pages=2), + make_observation_response([obs2], total_pages=2), ] mock_evaluator_config = MagicMock() @@ -724,22 +724,22 @@ def test_cursor_pagination_fetches_all_pages( # Each event_type does 2 pages → 4 total calls for 2 event_types assert mock_langfuse.api.observations.get_many.call_count == 4 - # Second call for first event_type must pass cursor + # Second call for first event_type must pass page=2 second_call_kwargs = mock_langfuse.api.observations.get_many.call_args_list[ 1 ].kwargs - assert second_call_kwargs.get("cursor") == "cursor_abc" + assert second_call_kwargs.get("page") == 2 - def test_cursor_pagination_stops_when_no_next_cursor( + def test_page_pagination_stops_on_single_page( self, runner, observation_evaluator_yaml 
): - """Must stop fetching when meta.cursor is None.""" + """Must stop fetching when total_pages == 1.""" obs = make_mock_observation(obs_id="obs_single") mock_langfuse = MagicMock() - # No next cursor → single page per event_type + # Single page per event_type mock_langfuse.api.observations.get_many.return_value = ( - make_observation_response([obs], next_cursor=None) + make_observation_response([obs], total_pages=1) ) mock_evaluator_config = MagicMock() From c766e9d856d14d2232977f3ab319d5a470d7c950 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:08:24 -0700 Subject: [PATCH 07/13] fix(installer+provider): import user .env on Option 1 updates, auto-detect Ollama cloud - Add import_user_env() call to update_shared_scripts() (Option 1 path) so credentials like OLLAMA_API_KEY are imported on updates, not just fresh installs - Auto-detect Ollama cloud vs local: if OLLAMA_API_KEY env var is set and no explicit base_url configured, use https://api.ollama.com/v1 instead of localhost Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/install.sh | 3 +++ src/memory/evaluator/provider.py | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index 6a36e337..f85d7472 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1067,6 +1067,9 @@ update_shared_scripts() { log_debug "Templates updated" fi + # Re-import user .env customizations (new env vars from updates, e.g. 
OLLAMA_API_KEY) + import_user_env + if [[ $updated_count -gt 0 || $hooks_count -gt 0 || $docker_count -gt 0 ]]; then log_success "Updated $updated_count shared scripts, $hooks_count hook scripts, $docker_count docker files" if [[ $archived_count -gt 0 ]]; then diff --git a/src/memory/evaluator/provider.py b/src/memory/evaluator/provider.py index 326bcc47..16330ae9 100644 --- a/src/memory/evaluator/provider.py +++ b/src/memory/evaluator/provider.py @@ -73,11 +73,19 @@ def get_client(self): if self.provider == "ollama": from openai import OpenAI + api_key = os.environ.get("OLLAMA_API_KEY", "") + # Auto-detect cloud vs local: if API key is set and no explicit + # base_url, use Ollama cloud. Otherwise default to local. + if self.base_url: + base_url = self.base_url + elif api_key: + base_url = "https://api.ollama.com/v1" + else: + base_url = "http://localhost:11434/v1" + client = OpenAI( - base_url=self.base_url or "http://localhost:11434/v1", - api_key=os.environ.get( - "OLLAMA_API_KEY", "ollama" - ), # Cloud: set OLLAMA_API_KEY; local: ignored + base_url=base_url, + api_key=api_key or "ollama", # Local Ollama ignores key ) elif self.provider == "openrouter": From 840d88c4781e44af5890cfc1ab4e6d9c0fddca74 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:10:24 -0700 Subject: [PATCH 08/13] fix(installer): resolve SOURCE_DIR unbound variable in Option 1 path import_user_env() used SOURCE_DIR which is only set during full install. Fall back to SCRIPT_DIR parent for Option 1 (add-project) updates. Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/install.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/install.sh b/scripts/install.sh index f85d7472..1f34aceb 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1642,7 +1642,9 @@ copy_files() { # are present when credential generation checks run. 
import_user_env() { local docker_env="$INSTALL_DIR/docker/.env" - local user_env="$SOURCE_DIR/.env" + # SOURCE_DIR is set during full install; fall back to SCRIPT_DIR parent for Option 1 + local source_root="${SOURCE_DIR:-$(cd "$SCRIPT_DIR/.." && pwd)}" + local user_env="$source_root/.env" # Only import if user's .env exists and docker/.env exists if [[ ! -f "$user_env" ]] || [[ ! -f "$docker_env" ]]; then From 9b1a61ef8b546154e9eab0ef6604a1ae33aca3eb Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:17:42 -0700 Subject: [PATCH 09/13] fix(provider): correct Ollama cloud URL to ollama.com/v1 api.ollama.com returns 401; ollama.com/v1 is the correct OpenAI-compat endpoint. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/memory/evaluator/provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/memory/evaluator/provider.py b/src/memory/evaluator/provider.py index 16330ae9..e03f2580 100644 --- a/src/memory/evaluator/provider.py +++ b/src/memory/evaluator/provider.py @@ -79,7 +79,7 @@ def get_client(self): if self.base_url: base_url = self.base_url elif api_key: - base_url = "https://api.ollama.com/v1" + base_url = "https://ollama.com/v1" else: base_url = "http://localhost:11434/v1" From 4306cad4897415988fe0dcc52229803a9b0e4267 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:35:40 -0700 Subject: [PATCH 10/13] config(evaluator): change default model to gemma3:4b (Ollama cloud compatible) llama3.2:8b is not available on Ollama cloud. gemma3:4b is small, fast, and suitable for LLM-as-judge evaluation tasks. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- evaluator_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluator_config.yaml b/evaluator_config.yaml index 2a788bc1..0d22835b 100644 --- a/evaluator_config.yaml +++ b/evaluator_config.yaml @@ -13,7 +13,7 @@ # --------------------------------------------------------------------------- evaluator_model: provider: ollama # ollama | openrouter | anthropic | openai | custom - model_name: llama3.2:8b # Model identifier for the selected provider + model_name: gemma3:4b # Model identifier for the selected provider temperature: 0.0 # Deterministic evaluation (do not change) max_tokens: 4096 # 4096 to accommodate thinking models (reasoning + output) max_retries: 3 # Retry on 500/502/503/429 and network errors From 780c4c7235b909b3e791ab9e0fb376d02c0538bd Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:39:34 -0700 Subject: [PATCH 11/13] docs: update changelog and Langfuse integration docs for v2.2.3 - Changelog: complete v2.2.3 entry with upgrade instructions including scheduler build, score config setup, and provider configuration - Langfuse docs: add LLM-as-Judge evaluation pipeline section with evaluator table, config reference, provider auto-detection, and manual evaluation commands. Add evaluator-scheduler to Docker services Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 70 +++++++++++++++++++++++++----------- docs/LANGFUSE-INTEGRATION.md | 67 ++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 296acb60..d3788a2e 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,27 +5,37 @@ All notable changes to AI Memory Module will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [2.2.3] - 2026-03-14 +## [2.2.3] - 2026-03-15 -Langfuse observability improvements and LLM-as-judge evaluation pipeline: semantic tags on all 108 trace events, per-search trace visibility in the compact injection path, and a complete multi-provider evaluation engine with 6 evaluators, 5 golden datasets (123 items), and a CI quality gate. +Complete Langfuse observability pipeline: observation-level evaluation for all 6 evaluators, automated scheduling, exponential backoff retry, and security hardening. ### Added -- **Semantic tags on all trace events**: All 108 `emit_trace_event` calls include canonical dual-element tags (e.g., `["search", "retrieval"]`) for structured Langfuse dashboard filtering -- **Per-search trace visibility in compact injection path**: Each search call within compact context injection emits its own trace, enabling per-query latency and result visibility in that path -- **LLM-as-judge evaluation engine**: Multi-provider evaluator (`src/memory/evaluator/`) supporting Ollama (default, local, free), OpenRouter, Anthropic, OpenAI, and custom OpenAI-compatible endpoints -- **6 evaluator definitions**: Retrieval relevance (EV-01), injection value (EV-02), capture completeness (EV-03), classification accuracy (EV-04), bootstrap quality (EV-05), and session coherence (EV-06) — each with YAML config and LLM judge prompt template -- **5 golden datasets** (123 items): DS-01 Retrieval (25 items), DS-02 Error Pattern Match (12 items), DS-03 Bootstrap Round-Trip (8 items), DS-04 Keyword Trigger Routing (68 items), DS-05 Chunking Quality (10 items) — for repeatable regression benchmarking -- **Regression test suite**: `tests/test_regression.py` runs Langfuse experiments against golden datasets with configurable quality thresholds (`@pytest.mark.regression`) -- **CI quality gate**: `.github/workflows/regression-tests.yml` blocks PRs on score regression when `src/memory/**` or hook scripts change -- **Evaluator YAML configuration**: `evaluator_config.yaml` 
— zero secrets in config; all credentials supplied via environment variables with inline documentation +- **Observation-level evaluation**: Runner scores individual Langfuse observations (spans) for EV-01 to EV-04, not just whole traces. Enables per-retrieval, per-injection, per-capture quality scoring +- **Evaluator-scheduler container**: Automated daily evaluations via `evaluator-scheduler` Docker service with `croniter`-based scheduling, health checks, graceful shutdown, and live config reload +- **Exponential backoff retry**: Provider retries on HTTP 500/502/503/429 and network errors (ConnectionError, TimeoutError) with configurable `max_retries` (default: 3) and jitter +- **12 evaluator files on disk**: 6 YAML configs + 6 prompt templates materialized from PLAN-012 spec. Filters aligned to actual `emit_trace_event()` event_types via codebase audit +- **Score config idempotency**: `create_score_configs.py` pre-checks existing configs via `.get()` API; `--cleanup-duplicates` archives extras via `update(isArchived=True)` +- **Ollama cloud auto-detection**: Provider automatically uses `https://ollama.com/v1` when `OLLAMA_API_KEY` env var is set (no manual `base_url` config needed) +- **Installer copies evaluator files**: Both fresh install and Option 1 update paths copy `evaluator_config.yaml`, `evaluators/`, `requirements.txt`, and `pyproject.toml` +- **Installer imports .env on Option 1**: `import_user_env()` now runs during add-project updates, not just fresh installs — ensures credentials like `OLLAMA_API_KEY` reach the installed `.env` ### Changed -- **`detect-secrets` moved to dev extras**: Removed from default `requirements.txt`; added to `requirements-dev.txt` to reduce production dependency footprint -- **GitHub Actions versions pinned**: All CI workflow action steps use pinned versions for reproducible builds +- **Default evaluator model**: `gemma3:4b` (Ollama cloud compatible) replaces `llama3.2:8b` (not available on cloud) +- **Observation 
filtering**: Path B — evaluators filter observations by `name` (event_type) instead of tags. Langfuse V3 does not support observation-level tags; trace-level tags remain for trace filtering +- **Pagination**: Both `trace.list()` and `observations.get_many()` use page-based pagination per V3 SDK (`page=`, `total_pages`) + +### Fixed +- **Log injection sanitization**: All `str(e)` in `monitoring/main.py` log statements wrapped with `sanitize_log_input()` inline at call sites (CodeQL `py/log-injection` compliance) +- **CATEGORICAL score handling**: EV-04 passes string values (`"correct"`, `"partially_correct"`, `"incorrect"`) with validation against allowed categories before submission +- **Score ID collision**: `_make_score_id()` includes `observation_id` in hash seed — prevents silent overwrites when multiple observations share a trace +- **Installer `SOURCE_DIR` unbound**: `import_user_env()` falls back to `SCRIPT_DIR/..` in Option 1 path + +### Security +- **7 CodeQL HIGH findings resolved**: `monitoring/main.py` log injection vectors sanitized at every call site with AST-verified test coverage ### Upgrade Instructions -1. **Update code and reinstall**: +1. **Pull and run installer**: ```bash cd /path/to/your/ai-memory-clone git pull origin main @@ -33,13 +43,33 @@ Langfuse observability improvements and LLM-as-judge evaluation pipeline: semant # Select Option 1 (Add project to existing installation) ``` -2. **Evaluator setup** (optional): - - Default: Ollama (local, free, no API key needed) - - Configure provider in `evaluator_config.yaml` - - Set API keys via environment variables (see config comments) - - Run: `python scripts/create_score_configs.py` (one-time Langfuse setup) - - Run: `python scripts/create_datasets.py` (one-time golden dataset creation) - - Run: `python scripts/run_evaluations.py --config evaluator_config.yaml` +2. 
**Build and start the evaluator-scheduler container**: + ```bash + cd ~/.ai-memory/docker + unset QDRANT_API_KEY + docker compose -f docker-compose.yml -f docker-compose.langfuse.yml build evaluator-scheduler + docker compose -f docker-compose.yml -f docker-compose.langfuse.yml --profile langfuse up -d evaluator-scheduler + ``` + +3. **Create score configs** (one-time, idempotent): + ```bash + cd /path/to/your/ai-memory-clone + source .venv/bin/activate + cd ~/.ai-memory + set -a && source docker/.env && set +a && unset QDRANT_API_KEY + python scripts/create_score_configs.py + ``` + +4. **Configure evaluator provider** (optional — defaults to Ollama): + - **Ollama cloud**: Set `OLLAMA_API_KEY` in your `.env` (auto-detects cloud endpoint) + - **Local Ollama**: No config needed (default `http://localhost:11434/v1`) + - **Other providers**: Edit `evaluator_config.yaml` `provider:` field + - Model: Edit `evaluator_config.yaml` `model_name:` (default: `gemma3:4b`) + +5. **Run evaluations manually** (optional — scheduler runs daily at 05:00 UTC): + ```bash + python scripts/run_evaluations.py --config evaluator_config.yaml + ``` --- diff --git a/docs/LANGFUSE-INTEGRATION.md b/docs/LANGFUSE-INTEGRATION.md index c922bec9..88267762 100644 --- a/docs/LANGFUSE-INTEGRATION.md +++ b/docs/LANGFUSE-INTEGRATION.md @@ -201,6 +201,73 @@ All Langfuse services use the `langfuse` profile and join the existing `ai-memor | `langfuse-redis` | `redis:7` | Job queue and ephemeral cache | | `langfuse-minio` | `cgr.dev/chainguard/minio` | S3-compatible blob storage for event upload | | `trace-flush-worker` | Custom (Dockerfile.worker) | Reads trace buffer, flushes to Langfuse | +| `evaluator-scheduler` | Custom (Dockerfile.evaluator-scheduler) | Cron-based LLM-as-judge evaluation runner | + +--- + +## LLM-as-Judge Evaluation Pipeline + +The evaluation pipeline scores memory operations using an LLM judge. 
Six evaluators run automatically via the `evaluator-scheduler` container (daily at 05:00 UTC by default). + +### Evaluators + +| ID | Name | Target | Score Type | What it measures | +|----|------|--------|------------|------------------| +| EV-01 | `retrieval_relevance` | observation | NUMERIC (0-1) | Was the retrieved memory relevant to the trigger? | +| EV-02 | `injection_value` | observation | BOOLEAN | Did the injected context add value vs noise? | +| EV-03 | `capture_completeness` | observation | BOOLEAN | Did the capture preserve all important information? | +| EV-04 | `classification_accuracy` | observation | CATEGORICAL | Was the memory type classification correct? | +| EV-05 | `bootstrap_quality` | trace | NUMERIC (0-1) | Was the cross-session bootstrap context useful? | +| EV-06 | `session_coherence` | trace | NUMERIC (0-1) | Were memory operations during the session coherent? | + +### How It Works + +1. **Scheduler** wakes at cron time, reads `evaluator_config.yaml` +2. **Runner** fetches observations/traces from Langfuse matching each evaluator's `filter.event_types` +3. **Sampling** applies per-evaluator sampling rate (5-100%) +4. **LLM judge** evaluates each sampled item using the evaluator's prompt template +5. 
**Scores** are attached back to the observation/trace in Langfuse via `create_score()` + +### Evaluator Configuration + +All config in `evaluator_config.yaml` — zero secrets (API keys via environment variables): + +```yaml +evaluator_model: + provider: ollama # ollama | openrouter | anthropic | openai | custom + model_name: gemma3:4b # Ollama cloud model + temperature: 0.0 + max_tokens: 4096 + max_retries: 3 # Retry on 500/502/503/429 + network errors + +schedule: + enabled: true + cron: "0 5 * * *" # Daily at 05:00 UTC + lookback_hours: 24 +``` + +### Provider Auto-Detection + +- **Ollama cloud**: Set `OLLAMA_API_KEY` env var → auto-uses `https://ollama.com/v1` +- **Local Ollama**: No key set → uses `http://localhost:11434/v1` +- **Other providers**: Set `provider:` in config + corresponding env var + +### Score Config Setup + +Run once to create score validation schemas in Langfuse: + +```bash +python scripts/create_score_configs.py # Create configs (idempotent) +python scripts/create_score_configs.py --cleanup-duplicates # Archive duplicates +``` + +### Manual Evaluation + +```bash +python scripts/run_evaluations.py --config evaluator_config.yaml # Full run +python scripts/run_evaluations.py --config evaluator_config.yaml --dry-run # Preview only +python scripts/run_evaluations.py --config evaluator_config.yaml --evaluator EV-01 # Single evaluator +``` --- From 9375d308edc80144214f736aab9dedea524b5068 Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:46:27 -0700 Subject: [PATCH 12/13] fix(evaluator): update default model to gemma3:4b in provider + tests Matches evaluator_config.yaml change. Updates dataclass default and test assertions from llama3.2:8b to gemma3:4b. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/memory/evaluator/provider.py | 4 ++-- tests/test_evaluator_config.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/memory/evaluator/provider.py b/src/memory/evaluator/provider.py index e03f2580..3739579c 100644 --- a/src/memory/evaluator/provider.py +++ b/src/memory/evaluator/provider.py @@ -31,7 +31,7 @@ class EvaluatorConfig: provider: Literal["ollama", "openrouter", "anthropic", "openai", "custom"] = ( "ollama" ) - model_name: str = "llama3.2:8b" + model_name: str = "gemma3:4b" base_url: str | None = None temperature: float = 0.0 # Deterministic for evaluation max_tokens: int = ( @@ -50,7 +50,7 @@ def from_yaml(cls, path: str) -> "EvaluatorConfig": model_cfg = data.get("evaluator_model", {}) return cls( provider=model_cfg.get("provider", "ollama"), - model_name=model_cfg.get("model_name", "llama3.2:8b"), + model_name=model_cfg.get("model_name", "gemma3:4b"), base_url=model_cfg.get("base_url"), temperature=model_cfg.get("temperature", 0.0), max_tokens=model_cfg.get("max_tokens", 4096), diff --git a/tests/test_evaluator_config.py b/tests/test_evaluator_config.py index bd11e68b..f20f84b7 100644 --- a/tests/test_evaluator_config.py +++ b/tests/test_evaluator_config.py @@ -26,7 +26,7 @@ def full_config_yaml(tmp_path) -> Path: config = { "evaluator_model": { "provider": "ollama", - "model_name": "llama3.2:8b", + "model_name": "gemma3:4b", "temperature": 0.0, "max_tokens": 512, }, @@ -74,7 +74,7 @@ def test_loads_provider(self, full_config_yaml): def test_loads_model_name(self, full_config_yaml): config = EvaluatorConfig.from_yaml(str(full_config_yaml)) - assert config.model_name == "llama3.2:8b" + assert config.model_name == "gemma3:4b" def test_loads_temperature(self, full_config_yaml): config = EvaluatorConfig.from_yaml(str(full_config_yaml)) @@ -112,7 +112,7 @@ def test_model_defaults_to_llama(self, tmp_path): config_file = tmp_path / "minimal.yaml" 
config_file.write_text("evaluator_model: {}\n") config = EvaluatorConfig.from_yaml(str(config_file)) - assert config.model_name == "llama3.2:8b" + assert config.model_name == "gemma3:4b" def test_temperature_defaults_to_zero(self, tmp_path): config_file = tmp_path / "minimal.yaml" @@ -142,7 +142,7 @@ def test_default_provider_is_ollama(self, repo_config): def test_default_model_is_llama(self, repo_config): config = EvaluatorConfig.from_yaml(str(repo_config)) - assert config.model_name == "llama3.2:8b" + assert config.model_name == "gemma3:4b" def test_no_secrets_in_config_file(self, repo_config): content = repo_config.read_text() @@ -268,7 +268,7 @@ def test_filters_by_evaluator_id(self, tmp_path, full_config_yaml): def test_returns_empty_list_when_dir_missing(self, tmp_path): config_content = { - "evaluator_model": {"provider": "ollama", "model_name": "llama3.2:8b"}, + "evaluator_model": {"provider": "ollama", "model_name": "gemma3:4b"}, "evaluators_dir": str(tmp_path / "nonexistent_dir"), "audit": {"log_file": str(tmp_path / "eval.jsonl")}, } From 4e6b6a92f5a43509d317e44654dbcb62c7d1baba Mon Sep 17 00:00:00 2001 From: WB Solutions Date: Sun, 15 Mar 2026 03:53:23 -0700 Subject: [PATCH 13/13] fix(tests): update remaining llama3.2:8b references to gemma3:4b test_evaluator_provider.py and test_evaluator_runner.py fixture still had hardcoded llama3.2:8b model name assertions. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_evaluator_provider.py | 6 +++--- tests/test_evaluator_runner.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_evaluator_provider.py b/tests/test_evaluator_provider.py index a561e070..75bd1eee 100644 --- a/tests/test_evaluator_provider.py +++ b/tests/test_evaluator_provider.py @@ -24,7 +24,7 @@ def test_default_provider_is_ollama(self): def test_default_model_is_llama(self): config = EvaluatorConfig() - assert config.model_name == "llama3.2:8b" + assert config.model_name == "gemma3:4b" def test_default_temperature_zero(self): config = EvaluatorConfig() @@ -69,7 +69,7 @@ def test_defaults_when_fields_missing(self, tmp_path): config = EvaluatorConfig.from_yaml(str(config_file)) assert config.provider == "ollama" - assert config.model_name == "llama3.2:8b" + assert config.model_name == "gemma3:4b" assert config.temperature == 0.0 def test_empty_evaluator_model_uses_defaults(self, tmp_path): @@ -87,7 +87,7 @@ class TestOllamaProvider: """Test Ollama provider client creation.""" def test_ollama_creates_openai_client(self): - config = EvaluatorConfig(provider="ollama", model_name="llama3.2:8b") + config = EvaluatorConfig(provider="ollama", model_name="gemma3:4b") mock_openai_cls = MagicMock() with patch.dict("sys.modules", {"openai": MagicMock(OpenAI=mock_openai_cls)}): diff --git a/tests/test_evaluator_runner.py b/tests/test_evaluator_runner.py index d992f178..2c5823f5 100644 --- a/tests/test_evaluator_runner.py +++ b/tests/test_evaluator_runner.py @@ -38,7 +38,7 @@ def minimal_config(tmp_path) -> Path: config_content = """ evaluator_model: provider: ollama - model_name: llama3.2:8b + model_name: gemma3:4b temperature: 0.0 max_tokens: 4096 evaluators_dir: {evaluators_dir}