122 changes: 75 additions & 47 deletions validmind/datasets/llm/agent_dataset.py
@@ -9,7 +9,7 @@
and enables the use of all DeepEval tests and metrics within the ValidMind library.
"""

from typing import Any, Dict, List, Optional
from typing import Any, Callable, Dict, List, Optional

import pandas as pd

@@ -21,9 +21,8 @@
# Optional DeepEval imports with graceful fallback
try:
from deepeval import evaluate
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase, ToolCall
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase

DEEPEVAL_AVAILABLE = True
except ImportError:
@@ -74,21 +73,21 @@ class LLMAgentDataset(VMDataset):

def __init__(
self,
input_id: str = None,
test_cases: Optional[List] = None,
goldens: Optional[List] = None,
input_id: Optional[str] = None,
test_cases: Optional[List[Any]] = None,
goldens: Optional[List[Any]] = None,
deepeval_dataset: Optional[Any] = None,
**kwargs,
):
**kwargs: Any,
) -> None:
"""
Initialize LLMAgentDataset.

Args:
input_id: Identifier for the dataset
test_cases: List of DeepEval LLMTestCase objects
goldens: List of DeepEval Golden objects
deepeval_dataset: DeepEval EvaluationDataset instance
**kwargs: Additional arguments passed to VMDataset
input_id (Optional[str]): Identifier for the dataset.
test_cases (Optional[List[LLMTestCase]]): List of DeepEval LLMTestCase objects.
goldens (Optional[List[Golden]]): List of DeepEval Golden objects.
deepeval_dataset (Optional[EvaluationDataset]): DeepEval EvaluationDataset instance.
**kwargs (Any): Additional arguments passed to `VMDataset`.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError(
@@ -122,7 +121,11 @@ def __init__(
)

def _convert_to_dataframe(self) -> pd.DataFrame:
"""Convert DeepEval test cases/goldens to pandas DataFrame."""
"""Convert DeepEval test cases/goldens to pandas DataFrame.

Returns:
pandas.DataFrame: Tabular representation of test cases and goldens.
"""
data = []

# Process test cases
@@ -188,13 +191,27 @@ def _convert_to_dataframe(self) -> pd.DataFrame:
return pd.DataFrame(data)

def _serialize_list_field(self, field: Optional[List[str]]) -> str:
"""Serialize list field to string for DataFrame storage."""
"""Serialize list field to string for DataFrame storage.

Args:
field (Optional[List[str]]): List of strings to serialize.

Returns:
str: Pipe-delimited string.
"""
if field is None:
return ""
return "|".join(str(item) for item in field)

def _serialize_tools_field(self, tools: Optional[List]) -> str:
"""Serialize tools list to string for DataFrame storage."""
"""Serialize tools list to string for DataFrame storage.

Args:
tools (Optional[List]): List of tool objects or names.

Returns:
str: Pipe-delimited string of tool names.
"""
if tools is None:
return ""
tool_strs = []
@@ -206,59 +223,66 @@ def _serialize_tools_field(self, tools: Optional[List]) -> str:
return "|".join(tool_strs)

def _deserialize_list_field(self, field_str: str) -> List[str]:
"""Deserialize string back to list."""
"""Deserialize string back to list.

Args:
field_str (str): Pipe-delimited string.

Returns:
List[str]: List of string tokens.
"""
if not field_str:
return []
return field_str.split("|")

@classmethod
def from_test_cases(
cls, test_cases: List, input_id: str = "llm_agent_dataset", **kwargs
cls, test_cases: List[Any], input_id: str = "llm_agent_dataset", **kwargs: Any
) -> "LLMAgentDataset":
"""
Create LLMAgentDataset from DeepEval test cases.

Args:
test_cases: List of DeepEval LLMTestCase objects
input_id: Dataset identifier
**kwargs: Additional arguments
test_cases (List[LLMTestCase]): List of DeepEval LLMTestCase objects.
input_id (str): Dataset identifier.
**kwargs (Any): Additional arguments passed through to constructor.

Returns:
LLMAgentDataset instance
LLMAgentDataset: New dataset instance.
"""
return cls(input_id=input_id, test_cases=test_cases, **kwargs)

@classmethod
def from_goldens(
cls, goldens: List, input_id: str = "llm_agent_dataset", **kwargs
cls, goldens: List[Any], input_id: str = "llm_agent_dataset", **kwargs: Any
) -> "LLMAgentDataset":
"""
Create LLMAgentDataset from DeepEval goldens.

Args:
goldens: List of DeepEval Golden objects
input_id: Dataset identifier
**kwargs: Additional arguments
goldens (List[Golden]): List of DeepEval Golden objects.
input_id (str): Dataset identifier.
**kwargs (Any): Additional arguments passed through to constructor.

Returns:
LLMAgentDataset instance
LLMAgentDataset: New dataset instance.
"""
return cls(input_id=input_id, goldens=goldens, **kwargs)

@classmethod
def from_deepeval_dataset(
cls, deepeval_dataset, input_id: str = "llm_agent_dataset", **kwargs
cls, deepeval_dataset: Any, input_id: str = "llm_agent_dataset", **kwargs: Any
) -> "LLMAgentDataset":
"""
Create LLMAgentDataset from DeepEval EvaluationDataset.

Args:
deepeval_dataset: DeepEval EvaluationDataset instance
input_id: Dataset identifier
**kwargs: Additional arguments
deepeval_dataset (EvaluationDataset): DeepEval EvaluationDataset instance.
input_id (str): Dataset identifier.
**kwargs (Any): Additional arguments passed through to constructor.

Returns:
LLMAgentDataset instance
LLMAgentDataset: New dataset instance.
"""
return cls(
input_id=input_id,
@@ -268,12 +292,12 @@ def from_deepeval_dataset(
**kwargs,
)

def add_test_case(self, test_case) -> None:
def add_test_case(self, test_case: Any) -> None:
"""
Add a DeepEval test case to the dataset.

Args:
test_case: DeepEval LLMTestCase instance
test_case (LLMTestCase): DeepEval LLMTestCase instance.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required to add test cases")
@@ -284,12 +308,12 @@ def add_test_case(self, test_case) -> None:
self._df = df
self.columns = df.columns.tolist()

def add_golden(self, golden) -> None:
def add_golden(self, golden: Any) -> None:
"""
Add a DeepEval golden to the dataset.

Args:
golden: DeepEval Golden instance
golden (Golden): DeepEval Golden instance.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required to add goldens")
@@ -300,12 +324,14 @@ def add_golden(self, golden) -> None:
self._df = df
self.columns = df.columns.tolist()

def convert_goldens_to_test_cases(self, llm_app_function) -> None:
def convert_goldens_to_test_cases(
self, llm_app_function: Callable[[str], Any]
) -> None:
"""
Convert goldens to test cases by generating actual outputs.

Args:
llm_app_function: Function that takes input and returns LLM output
llm_app_function (Callable[[str], Any]): Function that takes input and returns LLM output.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required for conversion")
@@ -337,16 +363,18 @@ def convert_goldens_to_test_cases(self, llm_app_function) -> None:
self._df = df
self.columns = df.columns.tolist()

def evaluate_with_deepeval(self, metrics: List, **kwargs) -> Dict[str, Any]:
def evaluate_with_deepeval(
self, metrics: List[Any], **kwargs: Any
) -> Dict[str, Any]:
"""
Evaluate the dataset using DeepEval metrics.

Args:
metrics: List of DeepEval metric instances
**kwargs: Additional arguments passed to deepeval.evaluate()
metrics (List[Any]): List of DeepEval metric instances.
**kwargs (Any): Additional arguments passed to `deepeval.evaluate()`.

Returns:
Evaluation results dictionary
Dict[str, Any]: Evaluation results dictionary.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required for evaluation")
@@ -367,12 +395,12 @@ def evaluate_with_deepeval(self, metrics: List, **kwargs) -> Dict[str, Any]:
logger.error(f"DeepEval evaluation failed: {e}")
raise

def get_deepeval_dataset(self):
def get_deepeval_dataset(self) -> Any:
"""
Get or create a DeepEval EvaluationDataset instance.

Returns:
DeepEval EvaluationDataset instance
EvaluationDataset: DeepEval EvaluationDataset instance.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required to get dataset")
@@ -388,12 +416,12 @@ def to_deepeval_test_cases(self) -> List:

return self.deepeval_dataset

def to_deepeval_test_cases(self) -> List:
def to_deepeval_test_cases(self) -> List[Any]:
"""
Convert dataset rows back to DeepEval test cases.

Returns:
List of DeepEval LLMTestCase objects
List[LLMTestCase]: List of DeepEval LLMTestCase objects.
"""
if not DEEPEVAL_AVAILABLE:
raise ImportError("DeepEval is required for conversion")
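For reviewers, a minimal usage sketch of the entry points this diff touches (`from_test_cases` and `evaluate_with_deepeval`). It assumes the optional `deepeval` package is installed and that a judge model is configured for the chosen metric; the metric and example data are illustrative and not part of the PR.

```python
# Hedged sketch, not a test from this PR: exercises LLMAgentDataset.from_test_cases
# and evaluate_with_deepeval as declared in agent_dataset.py above.
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

from validmind.datasets.llm.agent_dataset import LLMAgentDataset

# A couple of DeepEval test cases for the agent under evaluation.
test_cases = [
    LLMTestCase(
        input="What is the capital of France?",
        actual_output="Paris is the capital of France.",
        expected_output="Paris",
    ),
]

# Wrap them so the tabular view and the DeepEval bridge are both available.
agent_dataset = LLMAgentDataset.from_test_cases(
    test_cases=test_cases,
    input_id="demo_llm_agent_dataset",
)

# Run DeepEval metrics through the wrapper; per the docstring, results come
# back as a dictionary.
results = agent_dataset.evaluate_with_deepeval(metrics=[AnswerRelevancyMetric()])
print(results)
```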
3 changes: 1 addition & 2 deletions validmind/scorer/classification/AbsoluteError.py
@@ -14,7 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def AbsoluteError(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
def AbsoluteError(model: VMModel, dataset: VMDataset) -> List[float]:
"""Calculates the absolute error per row for a classification model.

For classification tasks, this computes the absolute difference between
@@ -25,7 +25,6 @@ def AbsoluteError(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
Args:
model: The classification model to evaluate
dataset: The dataset containing true labels and predictions
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row absolute errors as a list of float values
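The per-row semantics the docstring describes can be illustrated with a small NumPy sketch; this is not the scorer's implementation, only the arithmetic it documents, shown for binary labels.

```python
import numpy as np

# Illustration only: per-row absolute error |y_true - y_pred| for a binary classifier.
y_true = np.array([1, 0, 1, 1])
y_pred = np.array([1, 1, 1, 0])

absolute_errors = np.abs(y_true - y_pred).astype(float).tolist()
print(absolute_errors)  # [0.0, 1.0, 0.0, 1.0]
```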
3 changes: 1 addition & 2 deletions validmind/scorer/classification/BrierScore.py
@@ -14,7 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def BrierScore(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
def BrierScore(model: VMModel, dataset: VMDataset) -> List[float]:
"""Calculates the Brier score per row for a classification model.

The Brier score is a proper score function that measures the accuracy of
@@ -25,7 +25,6 @@ def BrierScore(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
Args:
model: The classification model to evaluate
dataset: The dataset containing true labels and predicted probabilities
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row Brier scores as a list of float values
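As a reference for what the docstring describes, the per-row Brier score for binary classification is the squared gap between the predicted probability and the true label; the sketch below is illustrative, not the library code.

```python
import numpy as np

# Illustration only: per-row Brier score (p - y)^2, where p is the predicted
# probability of the positive class.
y_true = np.array([1, 0, 1, 1])
y_prob = np.array([0.9, 0.2, 0.6, 0.4])

brier_per_row = ((y_prob - y_true) ** 2).tolist()
print(brier_per_row)  # approximately [0.01, 0.04, 0.16, 0.36]
```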
3 changes: 1 addition & 2 deletions validmind/scorer/classification/CalibrationError.py
@@ -15,7 +15,7 @@
@tasks("classification")
@tags("classification")
def CalibrationError(
model: VMModel, dataset: VMDataset, n_bins: int = 10, **kwargs
model: VMModel, dataset: VMDataset, n_bins: int = 10
) -> List[float]:
"""Calculates the calibration error per row for a classification model.

@@ -28,7 +28,6 @@ def CalibrationError(
model: The classification model to evaluate
dataset: The dataset containing true labels and predicted probabilities
n_bins: Number of bins for probability calibration, defaults to 10
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row calibration errors as a list of float values
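One common way to read "calibration error per row" with `n_bins` probability bins is to assign each row the accuracy-versus-confidence gap of the bin its predicted probability falls into. The sketch below shows that interpretation only; the scorer's actual binning may differ.

```python
import numpy as np

# Illustration only: each row receives the |accuracy - mean confidence| gap of the
# probability bin it falls into. This mirrors the docstring's description, not the
# library's exact implementation.
def per_row_calibration_error(y_true, y_prob, n_bins=10):
    bin_ids = np.minimum((y_prob * n_bins).astype(int), n_bins - 1)
    errors = np.zeros_like(y_prob, dtype=float)
    for b in range(n_bins):
        mask = bin_ids == b
        if mask.any():
            errors[mask] = abs(y_true[mask].mean() - y_prob[mask].mean())
    return errors.tolist()

y_true = np.array([1, 0, 1, 1, 0, 1])
y_prob = np.array([0.95, 0.10, 0.80, 0.55, 0.40, 0.70])
print(per_row_calibration_error(y_true, y_prob, n_bins=10))
```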
3 changes: 1 addition & 2 deletions validmind/scorer/classification/ClassBalance.py
@@ -14,7 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def ClassBalance(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
def ClassBalance(model: VMModel, dataset: VMDataset) -> List[float]:
"""Calculates the class balance score per row for a classification model.

For each prediction, this returns how balanced the predicted class is in the
@@ -25,7 +25,6 @@ def ClassBalance(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
Args:
model: The classification model to evaluate
dataset: The dataset containing true labels and predictions
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row class balance scores as a list of float values
3 changes: 1 addition & 2 deletions validmind/scorer/classification/Confidence.py
@@ -14,7 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def Confidence(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
def Confidence(model: VMModel, dataset: VMDataset) -> List[float]:
"""Calculates the prediction confidence per row for a classification model.

For binary classification, confidence is calculated as the maximum probability
@@ -24,7 +24,6 @@ def Confidence(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
Args:
model: The classification model to evaluate
dataset: The dataset containing true labels and predicted probabilities
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row confidence scores as a list of float values
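The "maximum probability" notion the docstring refers to reduces to a one-liner over the predicted probability matrix; shown here purely as an illustration of the definition, not the scorer's code.

```python
import numpy as np

# Illustration only: per-row confidence as the maximum class probability.
proba = np.array([[0.9, 0.1], [0.4, 0.6], [0.55, 0.45]])
confidence = proba.max(axis=1).tolist()
print(confidence)  # [0.9, 0.6, 0.55]
```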
3 changes: 1 addition & 2 deletions validmind/scorer/classification/Correctness.py
@@ -14,7 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def Correctness(model: VMModel, dataset: VMDataset, **kwargs) -> List[int]:
def Correctness(model: VMModel, dataset: VMDataset) -> List[int]:
"""Calculates the correctness per row for a classification model.

For classification tasks, this returns 1 for correctly classified rows
@@ -24,7 +24,6 @@ def Correctness(model: VMModel, dataset: VMDataset, **kwargs) -> List[int]:
Args:
model: The classification model to evaluate
dataset: The dataset containing true labels and predictions
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[int]: Per-row correctness as a list of 1s and 0s
5 changes: 1 addition & 4 deletions validmind/scorer/classification/LogLoss.py
@@ -14,9 +14,7 @@
@scorer()
@tasks("classification")
@tags("classification")
def LogLoss(
model: VMModel, dataset: VMDataset, eps: float = 1e-15, **kwargs
) -> List[float]:
def LogLoss(model: VMModel, dataset: VMDataset, eps: float = 1e-15) -> List[float]:
"""Calculates the logarithmic loss per row for a classification model.

Log loss measures the performance of a classification model where the prediction
@@ -27,7 +25,6 @@ def LogLoss(
model: The classification model to evaluate
dataset: The dataset containing true labels and predicted probabilities
eps: Small value to avoid log(0), defaults to 1e-15
**kwargs: Additional parameters (unused for compatibility)

Returns:
List[float]: Per-row log loss values as a list of float values
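For reference, the per-row binary log loss that the docstring and the retained `eps` parameter describe looks like the sketch below; this is illustrative, not the scorer's code, with `eps` clipping probabilities away from 0 and 1 before taking logs.

```python
import numpy as np

# Illustration only: per-row binary log loss -[y*log(p) + (1-y)*log(1-p)],
# with probabilities clipped to [eps, 1 - eps] to avoid log(0).
def per_row_log_loss(y_true, y_prob, eps=1e-15):
    p = np.clip(y_prob, eps, 1 - eps)
    return (-(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))).tolist()

y_true = np.array([1, 0, 1])
y_prob = np.array([0.9, 0.2, 0.0])  # the 0.0 is clipped to eps before the log
print(per_row_log_loss(y_true, y_prob))
```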