3 changes: 3 additions & 0 deletions README.md
@@ -491,6 +491,9 @@ export AZURE_API_BASE="https://your-resource.openai.azure.com/"
export API_KEY="your-api-endpoint-key"
```

#### Optional: Langfuse
After a run, you can send one trace with per-metric scores to [Langfuse](https://langfuse.com/). Install `lightspeed-evaluation[langfuse]`, set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` (and `LANGFUSE_HOST` if not using the default cloud), then use `lightspeed-eval --langfuse` or set `LIGHTSPEED_USE_LANGFUSE=1`. From Python, pass `on_complete=build_langfuse_on_complete_callback()` (from `lightspeed_evaluation.integrations.langfuse_reporter`) to `evaluate()`.
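
A minimal programmatic sketch (assuming `config` is a pre-built `SystemConfig` and `data` is a `list[EvaluationData]`; the Langfuse environment variables above must be set):

```python
from lightspeed_evaluation.api import evaluate
from lightspeed_evaluation.integrations.langfuse_reporter import (
    build_langfuse_on_complete_callback,
)

# config and data are assumed to already exist (SystemConfig, list[EvaluationData]).
results = evaluate(
    config,
    data,
    evaluation_data_path="eval_data.yaml",  # hypothetical path, used for run naming
    on_complete=build_langfuse_on_complete_callback(),
)
```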

## 📈 Output & Visualization

### Generated Reports
10 changes: 10 additions & 0 deletions pyproject.toml
@@ -52,8 +52,18 @@ nlp-metrics = [
"rapidfuzz>=3.0.0,<=3.14.3", # Required for semantic_similarity_distance
]

# Optional Langfuse reporting (on_complete / CLI --langfuse). Uses the v2 SDK.
# pip install 'lightspeed-evaluation[langfuse]'
# or
# uv sync --extra langfuse
langfuse = [
"langfuse>=2.0.0,<3.0.0",
]

[dependency-groups]
dev = [
# Matches [project.optional-dependencies] langfuse — for typecheck/tests.
"langfuse>=2.0.0,<3.0.0",
"bandit>=1.7.0,<=1.9.2",
"black==25.1.0",
"mypy>=1.15.0,<=1.17.1",
16 changes: 14 additions & 2 deletions requirements-all-extras.txt
@@ -20,6 +20,7 @@ annotated-types==0.7.0
anyio==4.13.0
# via
# httpx
# langfuse
# openai
appdirs==1.4.4
# via ragas
@@ -29,7 +30,9 @@ attrs==26.1.0
# jsonschema
# referencing
backoff==2.2.1
# via posthog
# via
# langfuse
# posthog
certifi==2026.2.25
# via
# httpcore
@@ -134,6 +137,7 @@ httpcore==1.0.9
httpx==0.28.1
# via
# huggingface-hub
# langfuse
# langgraph-sdk
# langsmith
# lightspeed-evaluation
@@ -152,6 +156,7 @@ idna==3.11
# via
# anyio
# httpx
# langfuse
# requests
# yarl
importlib-metadata==8.7.1
@@ -215,6 +220,8 @@ langchain-openai==1.1.12
# via ragas
langchain-text-splitters==1.1.1
# via langchain-classic
langfuse==2.60.10
# via lightspeed-evaluation
langgraph==1.1.6
# via langchain
langgraph-checkpoint==4.0.1
@@ -305,11 +312,12 @@ orjson==3.11.8
# langsmith
ormsgpack==1.12.2
# via langgraph-checkpoint
packaging==26.0
packaging==24.2
# via
# datasets
# huggingface-hub
# langchain-core
# langfuse
# langsmith
# marshmallow
# matplotlib
@@ -365,6 +373,7 @@ pydantic==2.11.7
# langchain-classic
# langchain-core
# langchain-google-genai
# langfuse
# langgraph
# langsmith
# lightspeed-evaluation
@@ -448,6 +457,7 @@ requests==2.33.1
# instructor
# langchain-classic
# langchain-community
# langfuse
# langsmith
# posthog
# requests-toolbelt
@@ -587,6 +597,8 @@ uuid-utils==0.14.1
# langsmith
wheel==0.46.3
# via deepeval
wrapt==1.17.3
# via langfuse
xxhash==3.6.0
# via
# datasets
2 changes: 1 addition & 1 deletion requirements-local-embeddings.txt
@@ -293,7 +293,7 @@ orjson==3.11.8
# langsmith
ormsgpack==1.12.2
# via langgraph-checkpoint
packaging==26.0
packaging==24.2
# via
# datasets
# huggingface-hub
2 changes: 1 addition & 1 deletion requirements-nlp-metrics.txt
@@ -291,7 +291,7 @@ orjson==3.11.8
# langsmith
ormsgpack==1.12.2
# via langgraph-checkpoint
packaging==26.0
packaging==24.2
# via
# datasets
# huggingface-hub
2 changes: 1 addition & 1 deletion requirements.txt
@@ -279,7 +279,7 @@ orjson==3.11.8
# langsmith
ormsgpack==1.12.2
# via langgraph-checkpoint
packaging==26.0
packaging==24.2
# via
# datasets
# huggingface-hub
5 changes: 5 additions & 0 deletions src/lightspeed_evaluation/__init__.py
@@ -26,6 +26,7 @@
APIConfig,
EvaluationData,
EvaluationResult,
EvaluationRunContext,
LLMConfig,
LoggingConfig,
TurnData,
@@ -80,6 +81,10 @@
"EvaluationData": ("lightspeed_evaluation.core.models", "EvaluationData"),
"TurnData": ("lightspeed_evaluation.core.models", "TurnData"),
"EvaluationResult": ("lightspeed_evaluation.core.models", "EvaluationResult"),
"EvaluationRunContext": (
"lightspeed_evaluation.core.models",
"EvaluationRunContext",
),
"EvaluationSummary": (
"lightspeed_evaluation.core.models.summary",
"EvaluationSummary",
94 changes: 84 additions & 10 deletions src/lightspeed_evaluation/api.py
@@ -23,11 +23,13 @@
print(summary.by_metric)
"""

from collections.abc import Callable
from typing import Optional

from lightspeed_evaluation.core.models import (
EvaluationData,
EvaluationResult,
EvaluationRunContext,
SystemConfig,
TurnData,
)
@@ -36,10 +38,15 @@
from lightspeed_evaluation.pipeline.evaluation import EvaluationPipeline


def evaluate(
def evaluate( # pylint: disable=too-many-arguments
config: SystemConfig,
data: list[EvaluationData],
output_dir: Optional[str] = None,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> list[EvaluationResult]:
"""Run evaluation on the provided data using the given configuration.

@@ -51,6 +58,12 @@ def evaluate(
config: A pre-built SystemConfig instance.
data: List of EvaluationData conversations to evaluate.
output_dir: Optional override for the output directory.
evaluation_data_path: Optional path to the evaluation data file, used
for run naming and in :class:`EvaluationRunContext` (e.g. Langfuse).
on_complete: Optional callback after a successful run; receives results
and an :class:`EvaluationRunContext`. See
:mod:`lightspeed_evaluation.integrations.langfuse_reporter` for
a Langfuse helper. Failures in the callback do not fail the run.

Returns:
List of EvaluationResult objects (one per metric per turn/conversation).
@@ -61,16 +74,25 @@ def evaluate(
loader = ConfigLoader.from_config(config)
pipeline = EvaluationPipeline(loader, output_dir)
try:
return pipeline.run_evaluation(data)
return pipeline.run_evaluation(
data,
original_data_path=evaluation_data_path,
on_complete=on_complete,
)
finally:
pipeline.close()


def evaluate_with_summary(
def evaluate_with_summary( # pylint: disable=too-many-arguments
config: SystemConfig,
data: list[EvaluationData],
output_dir: Optional[str] = None,
compute_confidence_intervals: bool = False,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> EvaluationSummary:
"""Run evaluation and return structured results with computed statistics.

@@ -84,22 +106,35 @@ def evaluate_with_summary(
output_dir: Optional override for the output directory.
compute_confidence_intervals: Whether to compute bootstrap confidence
intervals. Default False.
evaluation_data_path: Same as for :func:`evaluate`.
on_complete: Same as for :func:`evaluate`.

Returns:
EvaluationSummary with results and computed statistics.
"""
results = evaluate(config, data, output_dir=output_dir)
results = evaluate(
config,
data,
output_dir=output_dir,
evaluation_data_path=evaluation_data_path,
on_complete=on_complete,
)
return EvaluationSummary.from_results(
results,
evaluation_data=data if data else None,
compute_confidence_intervals=compute_confidence_intervals,
)


def evaluate_conversation(
def evaluate_conversation( # pylint: disable=too-many-arguments
config: SystemConfig,
data: EvaluationData,
output_dir: Optional[str] = None,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> list[EvaluationResult]:
"""Evaluate a single conversation group.

@@ -109,18 +144,31 @@ def evaluate_conversation(
config: A pre-built SystemConfig instance.
data: A single EvaluationData conversation to evaluate.
output_dir: Optional override for the output directory.
evaluation_data_path: Same as for :func:`evaluate`.
on_complete: Same as for :func:`evaluate`.

Returns:
List of EvaluationResult objects.
"""
return evaluate(config, [data], output_dir=output_dir)
return evaluate(
config,
[data],
output_dir=output_dir,
evaluation_data_path=evaluation_data_path,
on_complete=on_complete,
)


def evaluate_conversation_with_summary(
def evaluate_conversation_with_summary( # pylint: disable=too-many-arguments
config: SystemConfig,
data: EvaluationData,
output_dir: Optional[str] = None,
compute_confidence_intervals: bool = False,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> EvaluationSummary:
"""Evaluate a single conversation and return structured results.

@@ -132,6 +180,8 @@ def evaluate_conversation_with_summary(
output_dir: Optional override for the output directory.
compute_confidence_intervals: Whether to compute bootstrap confidence
intervals. Default False.
evaluation_data_path: Same as for :func:`evaluate`.
on_complete: Same as for :func:`evaluate`.

Returns:
EvaluationSummary with results and computed statistics.
@@ -141,15 +191,22 @@
[data],
output_dir=output_dir,
compute_confidence_intervals=compute_confidence_intervals,
evaluation_data_path=evaluation_data_path,
on_complete=on_complete,
)


def evaluate_turn(
def evaluate_turn( # pylint: disable=too-many-arguments
config: SystemConfig,
turn: TurnData,
metrics: Optional[list[str]] = None,
conversation_group_id: str = "programmatic_eval",
output_dir: Optional[str] = None,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> list[EvaluationResult]:
"""Evaluate a single turn.

@@ -163,6 +220,8 @@ def evaluate_turn(
metrics: Optional list of metric identifiers to override turn_metrics.
conversation_group_id: Conversation group ID for the wrapper.
output_dir: Optional override for the output directory.
evaluation_data_path: Same as for :func:`evaluate`.
on_complete: Same as for :func:`evaluate`.

Returns:
List of EvaluationResult objects.
@@ -174,15 +233,26 @@ def evaluate_turn(
conversation_group_id=conversation_group_id,
turns=[turn],
)
return evaluate(config, [data], output_dir=output_dir)
return evaluate(
config,
[data],
output_dir=output_dir,
evaluation_data_path=evaluation_data_path,
on_complete=on_complete,
)


def evaluate_turn_with_summary(
def evaluate_turn_with_summary( # pylint: disable=too-many-arguments
config: SystemConfig,
turn: TurnData,
metrics: Optional[list[str]] = None,
conversation_group_id: str = "programmatic_eval",
output_dir: Optional[str] = None,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> EvaluationSummary:
"""Evaluate a single turn and return structured results.

@@ -194,6 +264,8 @@ def evaluate_turn_with_summary(
metrics: Optional list of metric identifiers to override turn_metrics.
conversation_group_id: Conversation group ID for the wrapper.
output_dir: Optional override for the output directory.
evaluation_data_path: Same as for :func:`evaluate`.
on_complete: Same as for :func:`evaluate`.

Returns:
EvaluationSummary with results and computed statistics.
@@ -210,4 +282,6 @@ def evaluate_turn_with_summary(
[data],
output_dir=output_dir,
compute_confidence_intervals=False,
evaluation_data_path=evaluation_data_path,
on_complete=on_complete,
)
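
For reference, a minimal sketch of a custom `on_complete` callback matching the `Callable[[list[EvaluationResult], EvaluationRunContext], None]` contract above (a hypothetical example, not part of this diff):

```python
from lightspeed_evaluation.core.models import EvaluationResult, EvaluationRunContext


def report_run(results: list[EvaluationResult], ctx: EvaluationRunContext) -> None:
    """Hypothetical callback invoked after a successful run.

    Per the evaluate() docstring, exceptions raised here do not fail the run.
    """
    print(f"Evaluated {len(results)} metric results; run context: {ctx!r}")
```

Pass it as `evaluate(config, data, on_complete=report_run)`.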