122 changes: 75 additions & 47 deletions validmind/datasets/llm/agent_dataset.py
@@ -9,7 +9,7 @@
and enables the use of all DeepEval tests and metrics within the ValidMind library.
"""

from typing import Any, Dict, List, Optional
from typing import Any, Callable, Dict, List, Optional

import pandas as pd

@@ -21,9 +21,8 @@
# Optional DeepEval imports with graceful fallback
try:
from deepeval import evaluate
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase, ToolCall
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase

DEEPEVAL_AVAILABLE = True
except ImportError:
@@ -74,21 +73,21 @@ class LLMAgentDataset(VMDataset):

def __init__(
self,
input_id: str = None,
test_cases: Optional[List] = None,
goldens: Optional[List] = None,
input_id: Optional[str] = None,
test_cases: Optional[List[Any]] = None,
goldens: Optional[List[Any]] = None,
deepeval_dataset: Optional[Any] = None,
**kwargs,
):
**kwargs: Any,
) -> None:
"""
Initialize LLMAgentDataset.

Args:
input_id: Identifier for the dataset
test_cases: List of DeepEval LLMTestCase objects
goldens: List of DeepEval Golden objects
deepeval_dataset: DeepEval EvaluationDataset instance
**kwargs: Additional arguments passed to VMDataset
input_id (Optional[str]): Identifier for the dataset.
test_cases (Optional[List[LLMTestCase]]): List of DeepEval LLMTestCase objects.
goldens (Optional[List[Golden]]): List of DeepEval Golden objects.
deepeval_dataset (Optional[EvaluationDataset]): DeepEval EvaluationDataset instance.
**kwargs (Any): Additional arguments passed to `VMDataset`.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError(
@@ -122,7 +121,11 @@ def __init__(
)

def _convert_to_dataframe(self) -> pd.DataFrame:
"""Convert DeepEval test cases/goldens to pandas DataFrame."""
"""Convert DeepEval test cases/goldens to pandas DataFrame.

Returns:
pandas.DataFrame: Tabular representation of test cases and goldens.
"""
data = []

# Process test cases
@@ -188,13 +191,27 @@ def _convert_to_dataframe(self) -> pd.DataFrame:
return pd.DataFrame(data)

def _serialize_list_field(self, field: Optional[List[str]]) -> str:
"""Serialize list field to string for DataFrame storage."""
"""Serialize list field to string for DataFrame storage.

Args:
field (Optional[List[str]]): List of strings to serialize.

Returns:
str: Pipe-delimited string.
"""
if field is None:
return ""
return "|".join(str(item) for item in field)

def _serialize_tools_field(self, tools: Optional[List]) -> str:
"""Serialize tools list to string for DataFrame storage."""
"""Serialize tools list to string for DataFrame storage.

Args:
tools (Optional[List]): List of tool objects or names.

Returns:
str: Pipe-delimited string of tool names.
"""
if tools is None:
return ""
tool_strs = []
@@ -206,59 +223,66 @@ def _serialize_tools_field(self, tools: Optional[List]) -> str:
return "|".join(tool_strs)

def _deserialize_list_field(self, field_str: str) -> List[str]:
"""Deserialize string back to list."""
"""Deserialize string back to list.

Args:
field_str (str): Pipe-delimited string.

Returns:
List[str]: List of string tokens.
"""
if not field_str:
return []
return field_str.split("|")

@classmethod
def from_test_cases(
cls, test_cases: List, input_id: str = "llm_agent_dataset", **kwargs
cls, test_cases: List[Any], input_id: str = "llm_agent_dataset", **kwargs: Any
) -> "LLMAgentDataset":
"""
Create LLMAgentDataset from DeepEval test cases.

Args:
test_cases: List of DeepEval LLMTestCase objects
input_id: Dataset identifier
**kwargs: Additional arguments
test_cases (List[LLMTestCase]): List of DeepEval LLMTestCase objects.
input_id (str): Dataset identifier.
**kwargs (Any): Additional arguments passed through to constructor.

Returns:
LLMAgentDataset instance
LLMAgentDataset: New dataset instance.
"""
return cls(input_id=input_id, test_cases=test_cases, **kwargs)

@classmethod
def from_goldens(
cls, goldens: List, input_id: str = "llm_agent_dataset", **kwargs
cls, goldens: List[Any], input_id: str = "llm_agent_dataset", **kwargs: Any
) -> "LLMAgentDataset":
"""
Create LLMAgentDataset from DeepEval goldens.

Args:
goldens: List of DeepEval Golden objects
input_id: Dataset identifier
**kwargs: Additional arguments
goldens (List[Golden]): List of DeepEval Golden objects.
input_id (str): Dataset identifier.
**kwargs (Any): Additional arguments passed through to constructor.

Returns:
LLMAgentDataset instance
LLMAgentDataset: New dataset instance.
"""
return cls(input_id=input_id, goldens=goldens, **kwargs)

@classmethod
def from_deepeval_dataset(
cls, deepeval_dataset, input_id: str = "llm_agent_dataset", **kwargs
cls, deepeval_dataset: Any, input_id: str = "llm_agent_dataset", **kwargs: Any
) -> "LLMAgentDataset":
"""
Create LLMAgentDataset from DeepEval EvaluationDataset.

Args:
deepeval_dataset: DeepEval EvaluationDataset instance
input_id: Dataset identifier
**kwargs: Additional arguments
deepeval_dataset (EvaluationDataset): DeepEval EvaluationDataset instance.
input_id (str): Dataset identifier.
**kwargs (Any): Additional arguments passed through to constructor.

Returns:
LLMAgentDataset instance
LLMAgentDataset: New dataset instance.
"""
return cls(
input_id=input_id,
@@ -268,12 +292,12 @@ def from_deepeval_dataset(
**kwargs,
)

def add_test_case(self, test_case) -> None:
def add_test_case(self, test_case: Any) -> None:
"""
Add a DeepEval test case to the dataset.

Args:
test_case: DeepEval LLMTestCase instance
test_case (LLMTestCase): DeepEval LLMTestCase instance.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required to add test cases")
@@ -284,12 +308,12 @@ def add_test_case(self, test_case) -> None:
self._df = df
self.columns = df.columns.tolist()

def add_golden(self, golden) -> None:
def add_golden(self, golden: Any) -> None:
"""
Add a DeepEval golden to the dataset.

Args:
golden: DeepEval Golden instance
golden (Golden): DeepEval Golden instance.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required to add goldens")
@@ -300,12 +324,14 @@ def add_golden(self, golden) -> None:
self._df = df
self.columns = df.columns.tolist()

def convert_goldens_to_test_cases(self, llm_app_function) -> None:
def convert_goldens_to_test_cases(
self, llm_app_function: Callable[[str], Any]
) -> None:
"""
Convert goldens to test cases by generating actual outputs.

Args:
llm_app_function: Function that takes input and returns LLM output
llm_app_function (Callable[[str], Any]): Function that takes input and returns LLM output.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required for conversion")
@@ -337,16 +363,18 @@ def convert_goldens_to_test_cases(self, llm_app_function) -> None:
self._df = df
self.columns = df.columns.tolist()

def evaluate_with_deepeval(self, metrics: List, **kwargs) -> Dict[str, Any]:
def evaluate_with_deepeval(
self, metrics: List[Any], **kwargs: Any
) -> Dict[str, Any]:
"""
Evaluate the dataset using DeepEval metrics.

Args:
metrics: List of DeepEval metric instances
**kwargs: Additional arguments passed to deepeval.evaluate()
metrics (List[Any]): List of DeepEval metric instances.
**kwargs (Any): Additional arguments passed to `deepeval.evaluate()`.

Returns:
Evaluation results dictionary
Dict[str, Any]: Evaluation results dictionary.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required for evaluation")
@@ -367,12 +395,12 @@ def evaluate_with_deepeval(self, metrics: List, **kwargs) -> Dict[str, Any]:
logger.error(f"DeepEval evaluation failed: {e}")
raise

def get_deepeval_dataset(self):
def get_deepeval_dataset(self) -> Any:
"""
Get or create a DeepEval EvaluationDataset instance.

Returns:
DeepEval EvaluationDataset instance
EvaluationDataset: DeepEval EvaluationDataset instance.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required to get dataset")
@@ -388,12 +416,12 @@ def to_deepeval_test_cases(self) -> List:

return self.deepeval_dataset

def to_deepeval_test_cases(self) -> List:
def to_deepeval_test_cases(self) -> List[Any]:
"""
Convert dataset rows back to DeepEval test cases.

Returns:
List of DeepEval LLMTestCase objects
List[LLMTestCase]: List of DeepEval LLMTestCase objects.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required for conversion")
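For reviewers, a minimal usage sketch of the entry points this diff touches (`from_test_cases` and `evaluate_with_deepeval`). It assumes the optional `deepeval` package is installed and that a judge model is configured for the chosen metric; the metric and example data are illustrative and not part of the PR.

```python
# Hedged sketch, not a test from this PR: exercises LLMAgentDataset.from_test_cases
# and evaluate_with_deepeval as declared in agent_dataset.py above.
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

from validmind.datasets.llm.agent_dataset import LLMAgentDataset

# A couple of DeepEval test cases for the agent under evaluation.
test_cases = [
    LLMTestCase(
        input="What is the capital of France?",
        actual_output="Paris is the capital of France.",
        expected_output="Paris",
    ),
]

# Wrap them so the tabular view and the DeepEval bridge are both available.
agent_dataset = LLMAgentDataset.from_test_cases(
    test_cases=test_cases,
    input_id="demo_llm_agent_dataset",
)

# Run DeepEval metrics through the wrapper; per the docstring, results come
# back as a dictionary.
results = agent_dataset.evaluate_with_deepeval(metrics=[AnswerRelevancyMetric()])
print(results)
```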
3 changes: 1 addition & 2 deletions validmind/scorer/classification/AbsoluteError.py
@@ -14,7 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def AbsoluteError(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
def AbsoluteError(model: VMModel, dataset: VMDataset) -> List[float]:
"""Calculates the absolute error per row for a classification model.

For classification tasks, this computes the absolute difference between
@@ -25,7 +25,6 @@ def AbsoluteError(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
Args:
model: The classification model to evaluate
dataset: The dataset containing true labels and predictions
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row absolute errors as a list of float values
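The per-row semantics the docstring describes can be illustrated with a small NumPy sketch; this is not the scorer's implementation, only the arithmetic it documents, shown for binary labels.

```python
import numpy as np

# Illustration only: per-row absolute error |y_true - y_pred| for a binary classifier.
y_true = np.array([1, 0, 1, 1])
y_pred = np.array([1, 1, 1, 0])

absolute_errors = np.abs(y_true - y_pred).astype(float).tolist()
print(absolute_errors)  # [0.0, 1.0, 0.0, 1.0]
```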
3 changes: 1 addition & 2 deletions validmind/scorer/classification/BrierScore.py
@@ -14,7 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def BrierScore(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
def BrierScore(model: VMModel, dataset: VMDataset) -> List[float]:
"""Calculates the Brier score per row for a classification model.

The Brier score is a proper score function that measures the accuracy of
@@ -25,7 +25,6 @@ def BrierScore(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
Args:
model: The classification model to evaluate
dataset: The dataset containing true labels and predicted probabilities
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row Brier scores as a list of float values
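As a reference for what the docstring describes, the per-row Brier score for binary classification is the squared gap between the predicted probability and the true label; the sketch below is illustrative, not the library code.

```python
import numpy as np

# Illustration only: per-row Brier score (p - y)^2, where p is the predicted
# probability of the positive class.
y_true = np.array([1, 0, 1, 1])
y_prob = np.array([0.9, 0.2, 0.6, 0.4])

brier_per_row = ((y_prob - y_true) ** 2).tolist()
print(brier_per_row)  # approximately [0.01, 0.04, 0.16, 0.36]
```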
3 changes: 1 addition & 2 deletions validmind/scorer/classification/CalibrationError.py
@@ -15,7 +15,7 @@
@tasks("classification")
@tags("classification")
def CalibrationError(
model: VMModel, dataset: VMDataset, n_bins: int = 10, **kwargs
model: VMModel, dataset: VMDataset, n_bins: int = 10
) -> List[float]:
"""Calculates the calibration error per row for a classification model.

@@ -28,7 +28,6 @@ def CalibrationError(
model: The classification model to evaluate
dataset: The dataset containing true labels and predicted probabilities
n_bins: Number of bins for probability calibration, defaults to 10
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row calibration errors as a list of float values
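One common way to read "calibration error per row" with `n_bins` probability bins is to assign each row the accuracy-versus-confidence gap of the bin its predicted probability falls into. The sketch below shows that interpretation only; the scorer's actual binning may differ.

```python
import numpy as np

# Illustration only: each row receives the |accuracy - mean confidence| gap of the
# probability bin it falls into. This mirrors the docstring's description, not the
# library's exact implementation.
def per_row_calibration_error(y_true, y_prob, n_bins=10):
    bin_ids = np.minimum((y_prob * n_bins).astype(int), n_bins - 1)
    errors = np.zeros_like(y_prob, dtype=float)
    for b in range(n_bins):
        mask = bin_ids == b
        if mask.any():
            errors[mask] = abs(y_true[mask].mean() - y_prob[mask].mean())
    return errors.tolist()

y_true = np.array([1, 0, 1, 1, 0, 1])
y_prob = np.array([0.95, 0.10, 0.80, 0.55, 0.40, 0.70])
print(per_row_calibration_error(y_true, y_prob, n_bins=10))
```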
3 changes: 1 addition & 2 deletions validmind/scorer/classification/ClassBalance.py
@@ -14,7 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def ClassBalance(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
def ClassBalance(model: VMModel, dataset: VMDataset) -> List[float]:
"""Calculates the class balance score per row for a classification model.

For each prediction, this returns how balanced the predicted class is in the
@@ -25,7 +25,6 @@ def ClassBalance(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
Args:
model: The classification model to evaluate
dataset: The dataset containing true labels and predictions
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row class balance scores as a list of float values
3 changes: 1 addition & 2 deletions validmind/scorer/classification/Confidence.py
@@ -14,7 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def Confidence(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
def Confidence(model: VMModel, dataset: VMDataset) -> List[float]:
"""Calculates the prediction confidence per row for a classification model.

For binary classification, confidence is calculated as the maximum probability
@@ -24,7 +24,6 @@ def Confidence(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
Args:
model: The classification model to evaluate
dataset: The dataset containing true labels and predicted probabilities
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row confidence scores as a list of float values
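The "maximum probability" notion the docstring refers to reduces to a one-liner over the predicted probability matrix; shown here purely as an illustration of the definition, not the scorer's code.

```python
import numpy as np

# Illustration only: per-row confidence as the maximum class probability.
proba = np.array([[0.9, 0.1], [0.4, 0.6], [0.55, 0.45]])
confidence = proba.max(axis=1).tolist()
print(confidence)  # [0.9, 0.6, 0.55]
```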
3 changes: 1 addition & 2 deletions validmind/scorer/classification/Correctness.py
@@ -14,7 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def Correctness(model: VMModel, dataset: VMDataset, **kwargs) -> List[int]:
def Correctness(model: VMModel, dataset: VMDataset) -> List[int]:
"""Calculates the correctness per row for a classification model.

For classification tasks, this returns 1 for correctly classified rows
@@ -24,7 +24,6 @@ def Correctness(model: VMModel, dataset: VMDataset, **kwargs) -> List[int]:
Args:
model: The classification model to evaluate
dataset: The dataset containing true labels and predictions
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[int]: Per-row correctness as a list of 1s and 0s
5 changes: 1 addition & 4 deletions validmind/scorer/classification/LogLoss.py
@@ -14,9 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def LogLoss(
model: VMModel, dataset: VMDataset, eps: float = 1e-15, **kwargs
) -> List[float]:
def LogLoss(model: VMModel, dataset: VMDataset, eps: float = 1e-15) -> List[float]:
"""Calculates the logarithmic loss per row for a classification model.

Log loss measures the performance of a classification model where the prediction
@@ -27,7 +25,6 @@ def LogLoss(
model: The classification model to evaluate
dataset: The dataset containing true labels and predicted probabilities
eps: Small value to avoid log(0), defaults to 1e-15
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row log loss values as a list of float values
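For reference, the per-row binary log loss that the docstring and the retained `eps` parameter describe looks like the sketch below; this is illustrative, not the scorer's code, with `eps` clipping probabilities away from 0 and 1 before taking logs.

```python
import numpy as np

# Illustration only: per-row binary log loss -[y*log(p) + (1-y)*log(1-p)],
# with probabilities clipped to [eps, 1 - eps] to avoid log(0).
def per_row_log_loss(y_true, y_prob, eps=1e-15):
    p = np.clip(y_prob, eps, 1 - eps)
    return (-(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))).tolist()

y_true = np.array([1, 0, 1])
y_prob = np.array([0.9, 0.2, 0.0])  # the 0.0 is clipped to eps before the log
print(per_row_log_loss(y_true, y_prob))
```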