6,832 changes: 3,748 additions & 3,084 deletions notebooks/how_to/explore_tests.ipynb

Large diffs are not rendered by default.

127 changes: 126 additions & 1 deletion scripts/bulk_ai_test_updates.py
@@ -264,6 +264,98 @@ def ExampleConfusionMatrix(model: VMModel, dataset: VMDataset):
DO NOT CHANGE ANYTHING OTHER THAN ADDING THE NEW RAW DATA MECHANISM... I.E. DO NOT REMOVE ANYTHING FROM THE RETURN TUPLE OR THE RETURN VALUE (if it is a single object)
"""

add_return_type_prompt = """
You are an expert Python engineer and data scientist with broad experience across many domains.
ValidMind is a company that provides a Python SDK for building and running tests for the purposes of model risk management.
ValidMind's SDK offers a library of "test" functions that are run with our test harness against many types of models and datasets.

Your task is to analyze the test function and add appropriate return type annotations to the function signature.

CRITICAL: DO NOT CHANGE ANYTHING IN THE CODE EXCEPT:
1. Adding the return type annotation to the function signature
2. Adding any necessary import statements WITH THE EXISTING IMPORTS (do not add imports elsewhere)

EXTREMELY IMPORTANT: ALWAYS PRESERVE COPYRIGHT AND LICENSE INFORMATION AT THE TOP OF THE FILE!
You must include any copyright, license, and SPDX identifier lines from the original file!

ValidMind test functions return either a single object or a tuple of objects.
These objects are turned into a test result report by the test harness.
They can return any number of the following types of objects:
- Tables (pd.DataFrame or List[Dict[str, Any]])
- Figures (matplotlib.figure.Figure, plotly.graph_objects.Figure (go.Figure), or List of these)
- Values (scalar values like float, int, str, or container types like List, Dict)
- Pass/Fail (bool value indicating whether the test passed or failed)
- Raw Data (RawData object containing intermediate data)

Common imports that might be needed in the return type annotation:
- from typing import Any, Dict, List, Tuple, Union, Optional
- import plotly.graph_objects as go
- import matplotlib.figure
- import pandas as pd
- from validmind import RawData

You should inspect the return statement(s) in the function to determine what the function actually returns.
Then, add the appropriate return type annotation to the function signature.

If the function already has a return type annotation, don't change it - in this case, return the original code without any changes.

Examples:

1. For a function that returns a single figure:
```python
def PlotHistogram(dataset: VMDataset):
# ... code ...
return fig
```
Should become:
```python
def PlotHistogram(dataset: VMDataset) -> go.Figure:
# ... code ...
return fig
```

2. For a function that returns multiple objects in a tuple:
```python
def ClassImbalance(dataset: VMDataset):
# ... code ...
return stats, fig, passed
```
Should become:
```python
def ClassImbalance(dataset: VMDataset) -> Tuple[Dict[str, Any], go.Figure, bool]:
# ... code ...
return stats, fig, passed
```

3. For a function that builds a list of figures and returns it as a tuple:
```python
def MultiplePlots(dataset: VMDataset):
# ... code ...
returns = []
returns.append(fig1)
returns.append(fig2)
returns.append(RawData(...))
return tuple(returns)
```
Should become:
```python
def MultiplePlots(dataset: VMDataset) -> Tuple[go.Figure, go.Figure, RawData]:
# ... code ...
returns = []
returns.append(fig1)
returns.append(fig2)
returns.append(RawData(...))
return tuple(returns)
```

Return only the updated code and nothing else.
Do not wrap the code in backticks, simply return valid Python code.
Only add the correct imports if they are not already present in the file, and place them with the existing imports.
DO NOT modify the function body in any way - the only changes should be to the function signature and possibly adding imports.
NEVER REMOVE COPYRIGHT NOTICES OR LICENSE INFORMATION!
If the function already has a return type annotation, return the original code without any changes.
""".strip()

custom_prompt_system = """
You are an expert Python engineer and data scientist with broad experience across many domains.
ValidMind is a company that provides a Python SDK for building and running tests for the purposes of model risk management.
@@ -394,6 +486,31 @@ def add_raw_data_to_test(path):
f.write(updated_file_contents)


def add_return_type_to_test(path):
"""Add return type annotation to a test function"""
# get file contents from path
click.echo(f"> {path}")
with open(path, "r") as f:
file_contents = f.read()

response = client.chat.completions.create(
model=OPENAI_GPT_MODEL,
messages=[
{"role": "system", "content": add_return_type_prompt},
{"role": "user", "content": f"```python\n{file_contents}```"},
],
)

updated_file_contents = response.choices[0].message.content
# remove starting "```python" and ending "```"
updated_file_contents = (
updated_file_contents.lstrip("```python").rstrip("```").strip()
)

with open(path, "w") as f:
f.write(updated_file_contents)


def custom_prompt(path, user_prompt):
"""Custom prompt for a test file"""
# get file contents from path
Expand Down Expand Up @@ -461,7 +578,13 @@ def _is_test_file(path):
@click.option(
"--action",
type=click.Choice(
["add_description", "add_raw_data", "custom_prompt", "custom_review"]
[
"add_description",
"add_raw_data",
"add_return_type",
"custom_prompt",
"custom_review",
]
),
required=True,
)
@@ -494,6 +617,8 @@ def main(action, path, model):
func = add_description_to_test
elif action == "add_raw_data":
func = add_raw_data_to_test
elif action == "add_return_type":
func = add_return_type_to_test
elif action == "custom_prompt":
if not USER_PROMPT:
user_prompt = input("Enter your prompt: ")
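A note on the fence stripping in `add_return_type_to_test` above: `str.lstrip` and `str.rstrip` treat their argument as a set of characters rather than a literal prefix or suffix, so `lstrip("```python")` can over-trim a response that does not actually start with a fence. A minimal prefix-aware sketch (the helper name is hypothetical and not part of this PR):

```python
def strip_code_fences(text: str) -> str:
    """Remove a leading ```python (or bare ```) fence and a trailing ``` fence, if present."""
    text = text.strip()
    if text.startswith("```python"):
        text = text[len("```python"):]
    elif text.startswith("```"):
        text = text[len("```"):]
    if text.endswith("```"):
        text = text[: -len("```")]
    return text.strip()
```

The cleaned response would then be written back the same way as today, e.g. `f.write(strip_code_fences(response.choices[0].message.content))`.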
4 changes: 3 additions & 1 deletion validmind/tests/data_validation/ACFandPACFPlot.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Tuple

import pandas as pd
import plotly.graph_objects as go
from statsmodels.tsa.stattools import acf, pacf
@@ -12,7 +14,7 @@

@tags("time_series_data", "forecasting", "statistical_test", "visualization")
@tasks("regression")
def ACFandPACFPlot(dataset: VMDataset):
def ACFandPACFPlot(dataset: VMDataset) -> Tuple[go.Figure, RawData]:
"""
Analyzes time series data using Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots to
reveal trends and correlations.
4 changes: 3 additions & 1 deletion validmind/tests/data_validation/ADF.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Dict

import pandas as pd
from statsmodels.tsa.stattools import adfuller

@@ -16,7 +18,7 @@
"time_series_data", "statsmodels", "forecasting", "statistical_test", "stationarity"
)
@tasks("regression")
def ADF(dataset: VMDataset):
def ADF(dataset: VMDataset) -> Dict[str, pd.DataFrame]:
"""
Assesses the stationarity of a time series dataset using the Augmented Dickey-Fuller (ADF) test.

4 changes: 3 additions & 1 deletion validmind/tests/data_validation/AutoAR.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Dict

import pandas as pd
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import adfuller
@@ -15,7 +17,7 @@

@tags("time_series_data", "statsmodels", "forecasting", "statistical_test")
@tasks("regression")
def AutoAR(dataset: VMDataset, max_ar_order: int = 3):
def AutoAR(dataset: VMDataset, max_ar_order: int = 3) -> Dict[str, pd.DataFrame]:
"""
Automatically identifies the optimal Autoregressive (AR) order for a time series using BIC and AIC criteria.

6 changes: 5 additions & 1 deletion validmind/tests/data_validation/AutoMA.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Dict, Tuple

import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
@@ -15,7 +17,9 @@

@tags("time_series_data", "statsmodels", "forecasting", "statistical_test")
@tasks("regression")
def AutoMA(dataset: VMDataset, max_ma_order: int = 3):
def AutoMA(
dataset: VMDataset, max_ma_order: int = 3
) -> Tuple[Dict[str, pd.DataFrame], RawData]:
"""
Automatically selects the optimal Moving Average (MA) order for each variable in a time series dataset based on
minimal BIC and AIC values.
6 changes: 5 additions & 1 deletion validmind/tests/data_validation/AutoStationarity.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Dict

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller
@@ -12,7 +14,9 @@

@tags("time_series_data", "statsmodels", "forecasting", "statistical_test")
@tasks("regression")
def AutoStationarity(dataset: VMDataset, max_order: int = 5, threshold: float = 0.05):
def AutoStationarity(
dataset: VMDataset, max_order: int = 5, threshold: float = 0.05
) -> Dict[str, pd.DataFrame]:
"""
Automates Augmented Dickey-Fuller test to assess stationarity across multiple time series in a DataFrame.

4 changes: 3 additions & 1 deletion validmind/tests/data_validation/BivariateScatterPlots.py
@@ -3,15 +3,17 @@
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import itertools
from typing import Tuple

import plotly.express as px
import plotly.graph_objects as go

from validmind import RawData, tags, tasks


@tags("tabular_data", "numerical_data", "visualization")
@tasks("classification")
def BivariateScatterPlots(dataset):
def BivariateScatterPlots(dataset) -> Tuple[go.Figure, RawData]:
"""
Generates bivariate scatterplots to visually inspect relationships between pairs of numerical predictor variables
in machine learning classification tasks.
5 changes: 4 additions & 1 deletion validmind/tests/data_validation/BoxPierce.py
@@ -2,6 +2,9 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial


from typing import Tuple

import pandas as pd
from statsmodels.stats.diagnostic import acorr_ljungbox

@@ -10,7 +13,7 @@

@tasks("regression")
@tags("time_series_data", "forecasting", "statistical_test", "statsmodels")
def BoxPierce(dataset):
def BoxPierce(dataset) -> Tuple[pd.DataFrame, RawData]:
"""
Detects autocorrelation in time-series data through the Box-Pierce test to validate model performance.

2 changes: 1 addition & 1 deletion validmind/tests/data_validation/ChiSquaredFeaturesTable.py
@@ -12,7 +12,7 @@

@tags("tabular_data", "categorical_data", "statistical_test")
@tasks("classification")
def ChiSquaredFeaturesTable(dataset, p_threshold=0.05):
def ChiSquaredFeaturesTable(dataset, p_threshold=0.05) -> pd.DataFrame:
"""
Assesses the statistical association between categorical features and a target variable using the Chi-Squared test.

2 changes: 1 addition & 1 deletion validmind/tests/data_validation/ClassImbalance.py
@@ -20,7 +20,7 @@
@tasks("classification")
def ClassImbalance(
dataset: VMDataset, min_percent_threshold: int = 10
) -> Tuple[Dict[str, Any], go.Figure, bool]:
) -> Tuple[Dict[str, Any], go.Figure, bool, RawData]:
"""
Evaluates and quantifies class distribution imbalance in a dataset used by a machine learning model.

5 changes: 4 additions & 1 deletion validmind/tests/data_validation/DatasetDescription.py
@@ -4,6 +4,7 @@

import re
from collections import Counter
from typing import Any, Dict, List, Tuple

import numpy as np

@@ -142,7 +143,9 @@ def describe_column(df, column):

@tags("tabular_data", "time_series_data", "text_data")
@tasks("classification", "regression", "text_classification", "text_summarization")
def DatasetDescription(dataset: VMDataset):
def DatasetDescription(
dataset: VMDataset,
) -> Tuple[Dict[str, List[Dict[str, Any]]], RawData]:
"""
Provides comprehensive analysis and statistical summaries of each column in a machine learning model's dataset.

5 changes: 3 additions & 2 deletions validmind/tests/data_validation/DatasetSplit.py
@@ -2,7 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import List

from typing import Any, Dict, List, Tuple

from validmind import RawData, tags, tasks
from validmind.vm_models import VMDataset
@@ -17,7 +18,7 @@

@tags("tabular_data", "time_series_data", "text_data")
@tasks("classification", "regression", "text_classification", "text_summarization")
def DatasetSplit(datasets: List[VMDataset]):
def DatasetSplit(datasets: List[VMDataset]) -> Tuple[List[Dict[str, Any]], RawData]:
"""
Evaluates and visualizes the distribution proportions among training, testing, and validation datasets of an ML
model.
4 changes: 3 additions & 1 deletion validmind/tests/data_validation/DescriptiveStatistics.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Any, Dict

import pandas as pd

from validmind import tags, tasks
@@ -46,7 +48,7 @@ def get_summary_statistics_categorical(df, categorical_fields):

@tags("tabular_data", "time_series_data", "data_quality")
@tasks("classification", "regression")
def DescriptiveStatistics(dataset: VMDataset):
def DescriptiveStatistics(dataset: VMDataset) -> Dict[str, Any]:
"""
Performs a detailed descriptive statistical analysis of both numerical and categorical data within a model's
dataset.
4 changes: 3 additions & 1 deletion validmind/tests/data_validation/DickeyFullerGLS.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Any, Dict, Tuple

import pandas as pd
from arch.unitroot import DFGLS
from numpy.linalg import LinAlgError
@@ -16,7 +18,7 @@

@tags("time_series_data", "forecasting", "unit_root_test")
@tasks("regression")
def DickeyFullerGLS(dataset: VMDataset):
def DickeyFullerGLS(dataset: VMDataset) -> Tuple[Dict[str, Any], RawData]:
"""
Assesses stationarity in time series data using the Dickey-Fuller GLS test to determine the order of integration.

4 changes: 3 additions & 1 deletion validmind/tests/data_validation/Duplicates.py
@@ -2,14 +2,16 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Any, Dict, Tuple

import pandas as pd

from validmind import tags, tasks


@tags("tabular_data", "data_quality", "text_data")
@tasks("classification", "regression")
def Duplicates(dataset, min_threshold=1):
def Duplicates(dataset, min_threshold=1) -> Tuple[Dict[str, Any], bool]:
"""
Tests dataset for duplicate entries, ensuring model reliability via data quality verification.
