3 changes: 3 additions & 0 deletions README.md
@@ -491,6 +491,9 @@ export AZURE_API_BASE="https://your-resource.openai.azure.com/"
export API_KEY="your-api-endpoint-key"
```

#### Optional: Langfuse
After a run, you can send one trace with per-metric scores to [Langfuse](https://langfuse.com/). Install `lightspeed-evaluation[langfuse]`, set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` (and `LANGFUSE_HOST` if not using the default cloud), then use `lightspeed-eval --langfuse` or set `LIGHTSPEED_USE_LANGFUSE=1`. From Python, pass `on_complete=build_langfuse_on_complete_callback()` (from `lightspeed_evaluation.integrations.langfuse_reporter`) to `evaluate()`.
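
A minimal programmatic sketch (assuming `config` is a pre-built `SystemConfig` and `data` is a `list[EvaluationData]`; the Langfuse environment variables above must be set):

```python
from lightspeed_evaluation.api import evaluate
from lightspeed_evaluation.integrations.langfuse_reporter import (
    build_langfuse_on_complete_callback,
)

# config and data are assumed to already exist (SystemConfig, list[EvaluationData]).
results = evaluate(
    config,
    data,
    evaluation_data_path="eval_data.yaml",  # hypothetical path, used for run naming
    on_complete=build_langfuse_on_complete_callback(),
)
```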

## 📈 Output & Visualization

### Generated Reports
10 changes: 10 additions & 0 deletions pyproject.toml
@@ -52,8 +52,18 @@ nlp-metrics = [
"rapidfuzz>=3.0.0,<=3.14.3", # Required for semantic_similarity_distance
]

# Optional Langfuse reporting (on_complete / CLI --langfuse). Uses the v2 SDK.
# pip install 'lightspeed-evaluation[langfuse]'
# or
# uv sync --extra langfuse
langfuse = [
"langfuse>=2.0.0,<3.0.0",
]

[dependency-groups]
dev = [
# Matches [project.optional-dependencies] langfuse — for typecheck/tests.
"langfuse>=2.0.0,<3.0.0",
"bandit>=1.7.0,<=1.9.2",
"black==25.1.0",
"mypy>=1.15.0,<=1.17.1",
16 changes: 14 additions & 2 deletions requirements-all-extras.txt
@@ -20,6 +20,7 @@ annotated-types==0.7.0
anyio==4.13.0
# via
# httpx
# langfuse
# openai
appdirs==1.4.4
# via ragas
@@ -29,7 +30,9 @@ attrs==26.1.0
# jsonschema
# referencing
backoff==2.2.1
# via posthog
# via
# langfuse
# posthog
certifi==2026.2.25
# via
# httpcore
@@ -134,6 +137,7 @@ httpcore==1.0.9
httpx==0.28.1
# via
# huggingface-hub
# langfuse
# langgraph-sdk
# langsmith
# lightspeed-evaluation
@@ -152,6 +156,7 @@ idna==3.11
# via
# anyio
# httpx
# langfuse
# requests
# yarl
importlib-metadata==8.7.1
@@ -215,6 +220,8 @@ langchain-openai==1.1.12
# via ragas
langchain-text-splitters==1.1.1
# via langchain-classic
langfuse==2.60.10
# via lightspeed-evaluation
langgraph==1.1.6
# via langchain
langgraph-checkpoint==4.0.1
@@ -305,11 +312,12 @@ orjson==3.11.8
# langsmith
ormsgpack==1.12.2
# via langgraph-checkpoint
packaging==26.0
packaging==24.2
# via
# datasets
# huggingface-hub
# langchain-core
# langfuse
# langsmith
# marshmallow
# matplotlib
@@ -365,6 +373,7 @@ pydantic==2.11.7
# langchain-classic
# langchain-core
# langchain-google-genai
# langfuse
# langgraph
# langsmith
# lightspeed-evaluation
@@ -448,6 +457,7 @@ requests==2.33.1
# instructor
# langchain-classic
# langchain-community
# langfuse
# langsmith
# posthog
# requests-toolbelt
@@ -587,6 +597,8 @@ uuid-utils==0.14.1
# langsmith
wheel==0.46.3
# via deepeval
wrapt==1.17.3
# via langfuse
xxhash==3.6.0
# via
# datasets
2 changes: 1 addition & 1 deletion requirements-local-embeddings.txt
@@ -293,7 +293,7 @@ orjson==3.11.8
# langsmith
ormsgpack==1.12.2
# via langgraph-checkpoint
packaging==26.0
packaging==24.2
# via
# datasets
# huggingface-hub
2 changes: 1 addition & 1 deletion requirements-nlp-metrics.txt
@@ -291,7 +291,7 @@ orjson==3.11.8
# langsmith
ormsgpack==1.12.2
# via langgraph-checkpoint
packaging==26.0
packaging==24.2
# via
# datasets
# huggingface-hub
2 changes: 1 addition & 1 deletion requirements.txt
@@ -279,7 +279,7 @@ orjson==3.11.8
# langsmith
ormsgpack==1.12.2
# via langgraph-checkpoint
packaging==26.0
packaging==24.2
# via
# datasets
# huggingface-hub
5 changes: 5 additions & 0 deletions src/lightspeed_evaluation/__init__.py
@@ -26,6 +26,7 @@
APIConfig,
EvaluationData,
EvaluationResult,
EvaluationRunContext,
LLMConfig,
LoggingConfig,
TurnData,
@@ -80,6 +81,10 @@
"EvaluationData": ("lightspeed_evaluation.core.models", "EvaluationData"),
"TurnData": ("lightspeed_evaluation.core.models", "TurnData"),
"EvaluationResult": ("lightspeed_evaluation.core.models", "EvaluationResult"),
"EvaluationRunContext": (
"lightspeed_evaluation.core.models",
"EvaluationRunContext",
),
"EvaluationSummary": (
"lightspeed_evaluation.core.models.summary",
"EvaluationSummary",
94 changes: 84 additions & 10 deletions src/lightspeed_evaluation/api.py
@@ -23,11 +23,13 @@
print(summary.by_metric)
"""

from collections.abc import Callable
from typing import Optional

from lightspeed_evaluation.core.models import (
EvaluationData,
EvaluationResult,
EvaluationRunContext,
SystemConfig,
TurnData,
)
@@ -36,10 +38,15 @@
from lightspeed_evaluation.pipeline.evaluation import EvaluationPipeline


def evaluate(
def evaluate( # pylint: disable=too-many-arguments
config: SystemConfig,
data: list[EvaluationData],
output_dir: Optional[str] = None,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> list[EvaluationResult]:
"""Run evaluation on the provided data using the given configuration.

@@ -51,6 +58,12 @@ def evaluate(
config: A pre-built SystemConfig instance.
data: List of EvaluationData conversations to evaluate.
output_dir: Optional override for the output directory.
evaluation_data_path: Optional path to the evaluation data file, used
for run naming and in :class:`EvaluationRunContext` (e.g. Langfuse).
on_complete: Optional callback after a successful run; receives results
and an :class:`EvaluationRunContext`. See
:mod:`lightspeed_evaluation.integrations.langfuse_reporter` for
a Langfuse helper. Failures in the callback do not fail the run.

Returns:
List of EvaluationResult objects (one per metric per turn/conversation).
@@ -61,16 +74,25 @@ def evaluate(
loader = ConfigLoader.from_config(config)
pipeline = EvaluationPipeline(loader, output_dir)
try:
return pipeline.run_evaluation(data)
return pipeline.run_evaluation(
data,
original_data_path=evaluation_data_path,
on_complete=on_complete,
)
finally:
pipeline.close()


def evaluate_with_summary(
def evaluate_with_summary( # pylint: disable=too-many-arguments
config: SystemConfig,
data: list[EvaluationData],
output_dir: Optional[str] = None,
compute_confidence_intervals: bool = False,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> EvaluationSummary:
"""Run evaluation and return structured results with computed statistics.

@@ -84,22 +106,35 @@ def evaluate_with_summary(
output_dir: Optional override for the output directory.
compute_confidence_intervals: Whether to compute bootstrap confidence
intervals. Default False.
evaluation_data_path: Same as for :func:`evaluate`.
on_complete: Same as for :func:`evaluate`.

Returns:
EvaluationSummary with results and computed statistics.
"""
results = evaluate(config, data, output_dir=output_dir)
results = evaluate(
config,
data,
output_dir=output_dir,
evaluation_data_path=evaluation_data_path,
on_complete=on_complete,
)
return EvaluationSummary.from_results(
results,
evaluation_data=data if data else None,
compute_confidence_intervals=compute_confidence_intervals,
)


def evaluate_conversation(
def evaluate_conversation( # pylint: disable=too-many-arguments
config: SystemConfig,
data: EvaluationData,
output_dir: Optional[str] = None,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> list[EvaluationResult]:
"""Evaluate a single conversation group.

@@ -109,18 +144,31 @@ def evaluate_conversation(
config: A pre-built SystemConfig instance.
data: A single EvaluationData conversation to evaluate.
output_dir: Optional override for the output directory.
evaluation_data_path: Same as for :func:`evaluate`.
on_complete: Same as for :func:`evaluate`.

Returns:
List of EvaluationResult objects.
"""
return evaluate(config, [data], output_dir=output_dir)
return evaluate(
config,
[data],
output_dir=output_dir,
evaluation_data_path=evaluation_data_path,
on_complete=on_complete,
)


def evaluate_conversation_with_summary(
def evaluate_conversation_with_summary( # pylint: disable=too-many-arguments
config: SystemConfig,
data: EvaluationData,
output_dir: Optional[str] = None,
compute_confidence_intervals: bool = False,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> EvaluationSummary:
"""Evaluate a single conversation and return structured results.

@@ -132,6 +180,8 @@ def evaluate_conversation_with_summary(
output_dir: Optional override for the output directory.
compute_confidence_intervals: Whether to compute bootstrap confidence
intervals. Default False.
evaluation_data_path: Same as for :func:`evaluate`.
on_complete: Same as for :func:`evaluate`.

Returns:
EvaluationSummary with results and computed statistics.
@@ -141,15 +191,22 @@
[data],
output_dir=output_dir,
compute_confidence_intervals=compute_confidence_intervals,
evaluation_data_path=evaluation_data_path,
on_complete=on_complete,
)


def evaluate_turn(
def evaluate_turn( # pylint: disable=too-many-arguments
config: SystemConfig,
turn: TurnData,
metrics: Optional[list[str]] = None,
conversation_group_id: str = "programmatic_eval",
output_dir: Optional[str] = None,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> list[EvaluationResult]:
"""Evaluate a single turn.

@@ -163,6 +220,8 @@ def evaluate_turn(
metrics: Optional list of metric identifiers to override turn_metrics.
conversation_group_id: Conversation group ID for the wrapper.
output_dir: Optional override for the output directory.
evaluation_data_path: Same as for :func:`evaluate`.
on_complete: Same as for :func:`evaluate`.

Returns:
List of EvaluationResult objects.
@@ -174,15 +233,26 @@ def evaluate_turn(
conversation_group_id=conversation_group_id,
turns=[turn],
)
return evaluate(config, [data], output_dir=output_dir)
return evaluate(
config,
[data],
output_dir=output_dir,
evaluation_data_path=evaluation_data_path,
on_complete=on_complete,
)


def evaluate_turn_with_summary(
def evaluate_turn_with_summary( # pylint: disable=too-many-arguments
config: SystemConfig,
turn: TurnData,
metrics: Optional[list[str]] = None,
conversation_group_id: str = "programmatic_eval",
output_dir: Optional[str] = None,
*,
evaluation_data_path: Optional[str] = None,
on_complete: Optional[
Callable[[list[EvaluationResult], EvaluationRunContext], None]
] = None,
) -> EvaluationSummary:
"""Evaluate a single turn and return structured results.

@@ -194,6 +264,8 @@ def evaluate_turn_with_summary(
metrics: Optional list of metric identifiers to override turn_metrics.
conversation_group_id: Conversation group ID for the wrapper.
output_dir: Optional override for the output directory.
evaluation_data_path: Same as for :func:`evaluate`.
on_complete: Same as for :func:`evaluate`.

Returns:
EvaluationSummary with results and computed statistics.
@@ -210,4 +282,6 @@ def evaluate_turn_with_summary(
[data],
output_dir=output_dir,
compute_confidence_intervals=False,
evaluation_data_path=evaluation_data_path,
on_complete=on_complete,
)
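
For reference, a minimal sketch of a custom `on_complete` callback matching the `Callable[[list[EvaluationResult], EvaluationRunContext], None]` contract above (a hypothetical example, not part of this diff):

```python
from lightspeed_evaluation.core.models import EvaluationResult, EvaluationRunContext


def report_run(results: list[EvaluationResult], ctx: EvaluationRunContext) -> None:
    """Hypothetical callback invoked after a successful run.

    Per the evaluate() docstring, exceptions raised here do not fail the run.
    """
    print(f"Evaluated {len(results)} metric results; run context: {ctx!r}")
```

Pass it as `evaluate(config, data, on_complete=report_run)`.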