From 383e266b1ca2e39139a8fb0481da119f6b69eca3 Mon Sep 17 00:00:00 2001
From: Anil Sorathiya
Date: Wed, 15 Oct 2025 15:40:31 +0100
Subject: [PATCH 1/3] fix warning

---
 validmind/datasets/llm/agent_dataset.py | 122 +++++++++++++++---------
 1 file changed, 75 insertions(+), 47 deletions(-)

diff --git a/validmind/datasets/llm/agent_dataset.py b/validmind/datasets/llm/agent_dataset.py
index c6dbba5ca..5441c80de 100644
--- a/validmind/datasets/llm/agent_dataset.py
+++ b/validmind/datasets/llm/agent_dataset.py
@@ -9,7 +9,7 @@
 and enables the use of all DeepEval tests and metrics within the ValidMind library.
 """

-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional

 import pandas as pd

@@ -21,9 +21,8 @@
 # Optional DeepEval imports with graceful fallback
 try:
     from deepeval import evaluate
-    from deepeval.dataset import EvaluationDataset, Golden
-    from deepeval.metrics import BaseMetric
-    from deepeval.test_case import LLMTestCase, ToolCall
+    from deepeval.dataset import EvaluationDataset
+    from deepeval.test_case import LLMTestCase

     DEEPEVAL_AVAILABLE = True
 except ImportError:
@@ -74,21 +73,21 @@ class LLMAgentDataset(VMDataset):

     def __init__(
         self,
-        input_id: str = None,
-        test_cases: Optional[List] = None,
-        goldens: Optional[List] = None,
+        input_id: Optional[str] = None,
+        test_cases: Optional[List[Any]] = None,
+        goldens: Optional[List[Any]] = None,
         deepeval_dataset: Optional[Any] = None,
-        **kwargs,
-    ):
+        **kwargs: Any,
+    ) -> None:
         """
         Initialize LLMAgentDataset.

         Args:
-            input_id: Identifier for the dataset
-            test_cases: List of DeepEval LLMTestCase objects
-            goldens: List of DeepEval Golden objects
-            deepeval_dataset: DeepEval EvaluationDataset instance
-            **kwargs: Additional arguments passed to VMDataset
+            input_id (Optional[str]): Identifier for the dataset.
+            test_cases (Optional[List[LLMTestCase]]): List of DeepEval LLMTestCase objects.
+            goldens (Optional[List[Golden]]): List of DeepEval Golden objects.
+            deepeval_dataset (Optional[EvaluationDataset]): DeepEval EvaluationDataset instance.
+            **kwargs (Any): Additional arguments passed to `VMDataset`.
         """
         if not DEEPEVAL_AVAILABLE:
             raise ImportError(
@@ -122,7 +121,11 @@ def __init__(
         )

     def _convert_to_dataframe(self) -> pd.DataFrame:
-        """Convert DeepEval test cases/goldens to pandas DataFrame."""
+        """Convert DeepEval test cases/goldens to pandas DataFrame.
+
+        Returns:
+            pandas.DataFrame: Tabular representation of test cases and goldens.
+        """
         data = []

         # Process test cases
@@ -188,13 +191,27 @@ def _convert_to_dataframe(self) -> pd.DataFrame:
         return pd.DataFrame(data)

     def _serialize_list_field(self, field: Optional[List[str]]) -> str:
-        """Serialize list field to string for DataFrame storage."""
+        """Serialize list field to string for DataFrame storage.
+
+        Args:
+            field (Optional[List[str]]): List of strings to serialize.
+
+        Returns:
+            str: Pipe-delimited string.
+        """
         if field is None:
             return ""
         return "|".join(str(item) for item in field)

     def _serialize_tools_field(self, tools: Optional[List]) -> str:
-        """Serialize tools list to string for DataFrame storage."""
+        """Serialize tools list to string for DataFrame storage.
+
+        Args:
+            tools (Optional[List]): List of tool objects or names.
+
+        Returns:
+            str: Pipe-delimited string of tool names.
+ """ if tools is None: return "" tool_strs = [] @@ -206,59 +223,66 @@ def _serialize_tools_field(self, tools: Optional[List]) -> str: return "|".join(tool_strs) def _deserialize_list_field(self, field_str: str) -> List[str]: - """Deserialize string back to list.""" + """Deserialize string back to list. + + Args: + field_str (str): Pipe-delimited string. + + Returns: + List[str]: List of string tokens. + """ if not field_str: return [] return field_str.split("|") @classmethod def from_test_cases( - cls, test_cases: List, input_id: str = "llm_agent_dataset", **kwargs + cls, test_cases: List[Any], input_id: str = "llm_agent_dataset", **kwargs: Any ) -> "LLMAgentDataset": """ Create LLMAgentDataset from DeepEval test cases. Args: - test_cases: List of DeepEval LLMTestCase objects - input_id: Dataset identifier - **kwargs: Additional arguments + test_cases (List[LLMTestCase]): List of DeepEval LLMTestCase objects. + input_id (str): Dataset identifier. + **kwargs (Any): Additional arguments passed through to constructor. Returns: - LLMAgentDataset instance + LLMAgentDataset: New dataset instance. """ return cls(input_id=input_id, test_cases=test_cases, **kwargs) @classmethod def from_goldens( - cls, goldens: List, input_id: str = "llm_agent_dataset", **kwargs + cls, goldens: List[Any], input_id: str = "llm_agent_dataset", **kwargs: Any ) -> "LLMAgentDataset": """ Create LLMAgentDataset from DeepEval goldens. Args: - goldens: List of DeepEval Golden objects - input_id: Dataset identifier - **kwargs: Additional arguments + goldens (List[Golden]): List of DeepEval Golden objects. + input_id (str): Dataset identifier. + **kwargs (Any): Additional arguments passed through to constructor. Returns: - LLMAgentDataset instance + LLMAgentDataset: New dataset instance. """ return cls(input_id=input_id, goldens=goldens, **kwargs) @classmethod def from_deepeval_dataset( - cls, deepeval_dataset, input_id: str = "llm_agent_dataset", **kwargs + cls, deepeval_dataset: Any, input_id: str = "llm_agent_dataset", **kwargs: Any ) -> "LLMAgentDataset": """ Create LLMAgentDataset from DeepEval EvaluationDataset. Args: - deepeval_dataset: DeepEval EvaluationDataset instance - input_id: Dataset identifier - **kwargs: Additional arguments + deepeval_dataset (EvaluationDataset): DeepEval EvaluationDataset instance. + input_id (str): Dataset identifier. + **kwargs (Any): Additional arguments passed through to constructor. Returns: - LLMAgentDataset instance + LLMAgentDataset: New dataset instance. """ return cls( input_id=input_id, @@ -268,12 +292,12 @@ def from_deepeval_dataset( **kwargs, ) - def add_test_case(self, test_case) -> None: + def add_test_case(self, test_case: Any) -> None: """ Add a DeepEval test case to the dataset. Args: - test_case: DeepEval LLMTestCase instance + test_case (LLMTestCase): DeepEval LLMTestCase instance. """ if not DEEPEVAL_AVAILABLE: raise ImportError("DeepEval is required to add test cases") @@ -284,12 +308,12 @@ def add_test_case(self, test_case) -> None: self._df = df self.columns = df.columns.tolist() - def add_golden(self, golden) -> None: + def add_golden(self, golden: Any) -> None: """ Add a DeepEval golden to the dataset. Args: - golden: DeepEval Golden instance + golden (Golden): DeepEval Golden instance. 
""" if not DEEPEVAL_AVAILABLE: raise ImportError("DeepEval is required to add goldens") @@ -300,12 +324,14 @@ def add_golden(self, golden) -> None: self._df = df self.columns = df.columns.tolist() - def convert_goldens_to_test_cases(self, llm_app_function) -> None: + def convert_goldens_to_test_cases( + self, llm_app_function: Callable[[str], Any] + ) -> None: """ Convert goldens to test cases by generating actual outputs. Args: - llm_app_function: Function that takes input and returns LLM output + llm_app_function (Callable[[str], Any]): Function that takes input and returns LLM output. """ if not DEEPEVAL_AVAILABLE: raise ImportError("DeepEval is required for conversion") @@ -337,16 +363,18 @@ def convert_goldens_to_test_cases(self, llm_app_function) -> None: self._df = df self.columns = df.columns.tolist() - def evaluate_with_deepeval(self, metrics: List, **kwargs) -> Dict[str, Any]: + def evaluate_with_deepeval( + self, metrics: List[Any], **kwargs: Any + ) -> Dict[str, Any]: """ Evaluate the dataset using DeepEval metrics. Args: - metrics: List of DeepEval metric instances - **kwargs: Additional arguments passed to deepeval.evaluate() + metrics (List[Any]): List of DeepEval metric instances. + **kwargs (Any): Additional arguments passed to `deepeval.evaluate()`. Returns: - Evaluation results dictionary + Dict[str, Any]: Evaluation results dictionary. """ if not DEEPEVAL_AVAILABLE: raise ImportError("DeepEval is required for evaluation") @@ -367,12 +395,12 @@ def evaluate_with_deepeval(self, metrics: List, **kwargs) -> Dict[str, Any]: logger.error(f"DeepEval evaluation failed: {e}") raise - def get_deepeval_dataset(self): + def get_deepeval_dataset(self) -> Any: """ Get or create a DeepEval EvaluationDataset instance. Returns: - DeepEval EvaluationDataset instance + EvaluationDataset: DeepEval EvaluationDataset instance. """ if not DEEPEVAL_AVAILABLE: raise ImportError("DeepEval is required to get dataset") @@ -388,12 +416,12 @@ def get_deepeval_dataset(self): return self.deepeval_dataset - def to_deepeval_test_cases(self) -> List: + def to_deepeval_test_cases(self) -> List[Any]: """ Convert dataset rows back to DeepEval test cases. Returns: - List of DeepEval LLMTestCase objects + List[LLMTestCase]: List of DeepEval LLMTestCase objects. 
""" if not DEEPEVAL_AVAILABLE: raise ImportError("DeepEval is required for conversion") From 0d5321e4109d525c3b83a76f56fd6c23fba99c87 Mon Sep 17 00:00:00 2001 From: Anil Sorathiya Date: Thu, 16 Oct 2025 15:11:52 +0100 Subject: [PATCH 2/3] update annotations --- validmind/scorer/classification/AbsoluteError.py | 2 +- validmind/scorer/classification/BrierScore.py | 2 +- validmind/scorer/classification/CalibrationError.py | 2 +- validmind/scorer/classification/ClassBalance.py | 2 +- validmind/scorer/classification/Confidence.py | 2 +- validmind/scorer/classification/Correctness.py | 2 +- validmind/scorer/classification/LogLoss.py | 4 +--- validmind/scorer/classification/OutlierScore.py | 2 +- validmind/scorer/classification/ProbabilityError.py | 2 +- validmind/scorer/classification/Uncertainty.py | 2 +- validmind/vm_models/result/result.py | 4 ---- 11 files changed, 10 insertions(+), 16 deletions(-) diff --git a/validmind/scorer/classification/AbsoluteError.py b/validmind/scorer/classification/AbsoluteError.py index 8c31c8b52..6832c681e 100644 --- a/validmind/scorer/classification/AbsoluteError.py +++ b/validmind/scorer/classification/AbsoluteError.py @@ -14,7 +14,7 @@ @scorer() @tasks("classification") @tags("classification") -def AbsoluteError(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]: +def AbsoluteError(model: VMModel, dataset: VMDataset) -> List[float]: """Calculates the absolute error per row for a classification model. For classification tasks, this computes the absolute difference between diff --git a/validmind/scorer/classification/BrierScore.py b/validmind/scorer/classification/BrierScore.py index d383f87c0..8658ba211 100644 --- a/validmind/scorer/classification/BrierScore.py +++ b/validmind/scorer/classification/BrierScore.py @@ -14,7 +14,7 @@ @scorer() @tasks("classification") @tags("classification") -def BrierScore(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]: +def BrierScore(model: VMModel, dataset: VMDataset) -> List[float]: """Calculates the Brier score per row for a classification model. The Brier score is a proper score function that measures the accuracy of diff --git a/validmind/scorer/classification/CalibrationError.py b/validmind/scorer/classification/CalibrationError.py index 411bf63b9..01cf1eada 100644 --- a/validmind/scorer/classification/CalibrationError.py +++ b/validmind/scorer/classification/CalibrationError.py @@ -15,7 +15,7 @@ @tasks("classification") @tags("classification") def CalibrationError( - model: VMModel, dataset: VMDataset, n_bins: int = 10, **kwargs + model: VMModel, dataset: VMDataset, n_bins: int = 10 ) -> List[float]: """Calculates the calibration error per row for a classification model. diff --git a/validmind/scorer/classification/ClassBalance.py b/validmind/scorer/classification/ClassBalance.py index 4058e79b2..4b89cf86e 100644 --- a/validmind/scorer/classification/ClassBalance.py +++ b/validmind/scorer/classification/ClassBalance.py @@ -14,7 +14,7 @@ @scorer() @tasks("classification") @tags("classification") -def ClassBalance(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]: +def ClassBalance(model: VMModel, dataset: VMDataset) -> List[float]: """Calculates the class balance score per row for a classification model. 

     For each prediction, this returns how balanced the predicted class is in the
diff --git a/validmind/scorer/classification/Confidence.py b/validmind/scorer/classification/Confidence.py
index e54ef9f94..e2ba08f6b 100644
--- a/validmind/scorer/classification/Confidence.py
+++ b/validmind/scorer/classification/Confidence.py
@@ -14,7 +14,7 @@
 @scorer()
 @tasks("classification")
 @tags("classification")
-def Confidence(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
+def Confidence(model: VMModel, dataset: VMDataset) -> List[float]:
     """Calculates the prediction confidence per row for a classification model.

     For binary classification, confidence is calculated as the maximum probability
diff --git a/validmind/scorer/classification/Correctness.py b/validmind/scorer/classification/Correctness.py
index b969007a7..e4d6e1baf 100644
--- a/validmind/scorer/classification/Correctness.py
+++ b/validmind/scorer/classification/Correctness.py
@@ -14,7 +14,7 @@
 @scorer()
 @tasks("classification")
 @tags("classification")
-def Correctness(model: VMModel, dataset: VMDataset, **kwargs) -> List[int]:
+def Correctness(model: VMModel, dataset: VMDataset) -> List[int]:
     """Calculates the correctness per row for a classification model.

     For classification tasks, this returns 1 for correctly classified rows
diff --git a/validmind/scorer/classification/LogLoss.py b/validmind/scorer/classification/LogLoss.py
index 8347e9423..6d8defa31 100644
--- a/validmind/scorer/classification/LogLoss.py
+++ b/validmind/scorer/classification/LogLoss.py
@@ -14,9 +14,7 @@
 @scorer()
 @tasks("classification")
 @tags("classification")
-def LogLoss(
-    model: VMModel, dataset: VMDataset, eps: float = 1e-15, **kwargs
-) -> List[float]:
+def LogLoss(model: VMModel, dataset: VMDataset, eps: float = 1e-15) -> List[float]:
     """Calculates the logarithmic loss per row for a classification model.

     Log loss measures the performance of a classification model where the prediction
diff --git a/validmind/scorer/classification/OutlierScore.py b/validmind/scorer/classification/OutlierScore.py
index 14685ad57..8a45c8c3b 100644
--- a/validmind/scorer/classification/OutlierScore.py
+++ b/validmind/scorer/classification/OutlierScore.py
@@ -17,7 +17,7 @@
 @tasks("classification")
 @tags("classification", "outlier", "anomaly")
 def OutlierScore(
-    dataset: VMDataset, contamination: float = 0.1, **kwargs
+    dataset: VMDataset, contamination: float = 0.1
 ) -> List[Dict[str, Any]]:
     """Calculates outlier scores and isolation paths for a classification model.

diff --git a/validmind/scorer/classification/ProbabilityError.py b/validmind/scorer/classification/ProbabilityError.py
index a32a7b9a6..b2451c19f 100644
--- a/validmind/scorer/classification/ProbabilityError.py
+++ b/validmind/scorer/classification/ProbabilityError.py
@@ -14,7 +14,7 @@
 @scorer()
 @tasks("classification")
 @tags("classification")
-def ProbabilityError(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
+def ProbabilityError(model: VMModel, dataset: VMDataset) -> List[float]:
     """Calculates the probability error per row for a classification model.

     For binary classification tasks, this computes the absolute difference between
diff --git a/validmind/scorer/classification/Uncertainty.py b/validmind/scorer/classification/Uncertainty.py
index 9bbceba6a..4b3bd5b18 100644
--- a/validmind/scorer/classification/Uncertainty.py
+++ b/validmind/scorer/classification/Uncertainty.py
@@ -14,7 +14,7 @@
 @scorer()
 @tasks("classification")
 @tags("classification")
-def Uncertainty(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
+def Uncertainty(model: VMModel, dataset: VMDataset) -> List[float]:
     """Calculates the prediction uncertainty per row for a classification model.

     Uncertainty is measured using the entropy of the predicted probability distribution.
diff --git a/validmind/vm_models/result/result.py b/validmind/vm_models/result/result.py
index 4b4ee82dd..81ee46849 100644
--- a/validmind/vm_models/result/result.py
+++ b/validmind/vm_models/result/result.py
@@ -789,11 +789,7 @@ def log(
         """Log the result to ValidMind.

         Args:
-            section_id (str): The section ID within the model document to insert the
-                test result.
             content_id (str): The content ID to log the result to.
-            position (int): The position (index) within the section to insert the test
-                result.
         """
         # Check description text for PII when available
         if self.description:

From cde778f84e9621fd8ca9b65ddce41231f41dba53 Mon Sep 17 00:00:00 2001
From: Anil Sorathiya
Date: Fri, 17 Oct 2025 11:43:21 +0100
Subject: [PATCH 3/3] update doc string

---
 validmind/scorer/classification/AbsoluteError.py | 1 -
 validmind/scorer/classification/BrierScore.py | 1 -
 validmind/scorer/classification/CalibrationError.py | 1 -
 validmind/scorer/classification/ClassBalance.py | 1 -
 validmind/scorer/classification/Confidence.py | 1 -
 validmind/scorer/classification/Correctness.py | 1 -
 validmind/scorer/classification/LogLoss.py | 1 -
 validmind/scorer/classification/OutlierScore.py | 1 -
 validmind/scorer/classification/ProbabilityError.py | 1 -
 validmind/scorer/classification/Uncertainty.py | 1 -
 10 files changed, 10 deletions(-)

diff --git a/validmind/scorer/classification/AbsoluteError.py b/validmind/scorer/classification/AbsoluteError.py
index 6832c681e..1773a6ad1 100644
--- a/validmind/scorer/classification/AbsoluteError.py
+++ b/validmind/scorer/classification/AbsoluteError.py
@@ -25,7 +25,6 @@ def AbsoluteError(model: VMModel, dataset: VMDataset) -> List[float]:
     Args:
         model: The classification model to evaluate
         dataset: The dataset containing true labels and predictions
-        **kwargs: Additional parameters (unused for compatibility)

     Returns:
         List[float]: Per-row absolute errors as a list of float values
diff --git a/validmind/scorer/classification/BrierScore.py b/validmind/scorer/classification/BrierScore.py
index 8658ba211..bb55013ef 100644
--- a/validmind/scorer/classification/BrierScore.py
+++ b/validmind/scorer/classification/BrierScore.py
@@ -25,7 +25,6 @@ def BrierScore(model: VMModel, dataset: VMDataset) -> List[float]:
     Args:
         model: The classification model to evaluate
         dataset: The dataset containing true labels and predicted probabilities
-        **kwargs: Additional parameters (unused for compatibility)

     Returns:
         List[float]: Per-row Brier scores as a list of float values
diff --git a/validmind/scorer/classification/CalibrationError.py b/validmind/scorer/classification/CalibrationError.py
index 01cf1eada..e2963e4c9 100644
--- a/validmind/scorer/classification/CalibrationError.py
+++ b/validmind/scorer/classification/CalibrationError.py
@@ -28,7 +28,6 @@ def CalibrationError(
     Args:
         model: The classification model to evaluate
         dataset: The dataset containing true labels and predicted probabilities
         n_bins: Number of bins for probability calibration, defaults to 10
-        **kwargs: Additional parameters (unused for compatibility)

     Returns:
         List[float]: Per-row calibration errors as a list of float values
diff --git a/validmind/scorer/classification/ClassBalance.py b/validmind/scorer/classification/ClassBalance.py
index 4b89cf86e..a5f0febaf 100644
--- a/validmind/scorer/classification/ClassBalance.py
+++ b/validmind/scorer/classification/ClassBalance.py
@@ -25,7 +25,6 @@ def ClassBalance(model: VMModel, dataset: VMDataset) -> List[float]:
     Args:
         model: The classification model to evaluate
         dataset: The dataset containing true labels and predictions
-        **kwargs: Additional parameters (unused for compatibility)

     Returns:
         List[float]: Per-row class balance scores as a list of float values
diff --git a/validmind/scorer/classification/Confidence.py b/validmind/scorer/classification/Confidence.py
index e2ba08f6b..5722fa614 100644
--- a/validmind/scorer/classification/Confidence.py
+++ b/validmind/scorer/classification/Confidence.py
@@ -24,7 +24,6 @@ def Confidence(model: VMModel, dataset: VMDataset) -> List[float]:
     Args:
         model: The classification model to evaluate
         dataset: The dataset containing true labels and predicted probabilities
-        **kwargs: Additional parameters (unused for compatibility)

     Returns:
         List[float]: Per-row confidence scores as a list of float values
diff --git a/validmind/scorer/classification/Correctness.py b/validmind/scorer/classification/Correctness.py
index e4d6e1baf..5c9a50876 100644
--- a/validmind/scorer/classification/Correctness.py
+++ b/validmind/scorer/classification/Correctness.py
@@ -24,7 +24,6 @@ def Correctness(model: VMModel, dataset: VMDataset) -> List[int]:
     Args:
         model: The classification model to evaluate
         dataset: The dataset containing true labels and predictions
-        **kwargs: Additional parameters (unused for compatibility)

     Returns:
         List[int]: Per-row correctness as a list of 1s and 0s
diff --git a/validmind/scorer/classification/LogLoss.py b/validmind/scorer/classification/LogLoss.py
index 6d8defa31..80811d101 100644
--- a/validmind/scorer/classification/LogLoss.py
+++ b/validmind/scorer/classification/LogLoss.py
@@ -25,7 +25,6 @@ def LogLoss(model: VMModel, dataset: VMDataset, eps: float = 1e-15) -> List[floa
         model: The classification model to evaluate
         dataset: The dataset containing true labels and predicted probabilities
         eps: Small value to avoid log(0), defaults to 1e-15
-        **kwargs: Additional parameters (unused for compatibility)

     Returns:
         List[float]: Per-row log loss values as a list of float values
diff --git a/validmind/scorer/classification/OutlierScore.py b/validmind/scorer/classification/OutlierScore.py
index 8a45c8c3b..2bac59813 100644
--- a/validmind/scorer/classification/OutlierScore.py
+++ b/validmind/scorer/classification/OutlierScore.py
@@ -29,7 +29,6 @@ def OutlierScore(
     Args:
         dataset: The dataset containing feature data
         contamination: Expected proportion of outliers, defaults to 0.1
-        **kwargs: Additional parameters (unused for compatibility)

     Returns:
         List[Dict[str, Any]]: Per-row outlier metrics as a list of dictionaries.
diff --git a/validmind/scorer/classification/ProbabilityError.py b/validmind/scorer/classification/ProbabilityError.py
index b2451c19f..6a918a93f 100644
--- a/validmind/scorer/classification/ProbabilityError.py
+++ b/validmind/scorer/classification/ProbabilityError.py
@@ -25,7 +25,6 @@ def ProbabilityError(model: VMModel, dataset: VMDataset) -> List[float]:
     Args:
         model: The classification model to evaluate
         dataset: The dataset containing true labels and predicted probabilities
-        **kwargs: Additional parameters (unused for compatibility)

     Returns:
         List[float]: Per-row probability errors as a list of float values
diff --git a/validmind/scorer/classification/Uncertainty.py b/validmind/scorer/classification/Uncertainty.py
index 4b3bd5b18..c0f3ebe5f 100644
--- a/validmind/scorer/classification/Uncertainty.py
+++ b/validmind/scorer/classification/Uncertainty.py
@@ -24,7 +24,6 @@ def Uncertainty(model: VMModel, dataset: VMDataset) -> List[float]:
     Args:
         model: The classification model to evaluate
         dataset: The dataset containing true labels and predicted probabilities
-        **kwargs: Additional parameters (unused for compatibility)

     Returns:
         List[float]: Per-row uncertainty scores as a list of float values
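
A minimal usage sketch of the APIs touched by this series (illustrative only; it assumes DeepEval is installed, and the example values, variable names, and surrounding setup are not taken from the diffs):

# Sketch: build an LLMAgentDataset from DeepEval test cases (see PATCH 1/3).
from deepeval.test_case import LLMTestCase

from validmind.datasets.llm.agent_dataset import LLMAgentDataset

cases = [
    LLMTestCase(
        input="What is the capital of France?",  # illustrative prompt
        actual_output="Paris",                   # illustrative agent output
        expected_output="Paris",
    )
]
dataset = LLMAgentDataset.from_test_cases(cases, input_id="demo_llm_agent_dataset")

# After PATCH 2/3, the classification scorers no longer accept **kwargs, so callers
# pass only the parameters each scorer declares, e.g. Correctness(model, dataset)
# or LogLoss(model, dataset, eps=1e-15), where model and dataset are the usual
# ValidMind VMModel and VMDataset objects.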