6,832 changes: 3,748 additions & 3,084 deletions notebooks/how_to/explore_tests.ipynb

Large diffs are not rendered by default.

127 changes: 126 additions & 1 deletion scripts/bulk_ai_test_updates.py
@@ -264,6 +264,98 @@ def ExampleConfusionMatrix(model: VMModel, dataset: VMDataset):
DO NOT CHANGE ANYTHING OTHER THAN ADDING THE NEW RAW DATA MECHANISM... I.E. DO NOT REMOVE ANYTHING FROM THE RETURN TUPLE OR THE RETURN VALUE (if it is a single object)
"""

add_return_type_prompt = """
You are an expert Python engineer and data scientist with broad experience across many domains.
ValidMind is a company that provides a Python SDK for building and running tests for the purposes of model risk management.
ValidMind's SDK offers a library of "test" functions that are run with our test harness against many types of models and datasets.

Your task is to analyze the test function and add appropriate return type annotations to the function signature.

CRITICAL: DO NOT CHANGE ANYTHING IN THE CODE EXCEPT:
1. Adding the return type annotation to the function signature
2. Adding any necessary import statements WITH THE EXISTING IMPORTS (do not add imports elsewhere)

EXTREMELY IMPORTANT: ALWAYS PRESERVE COPYRIGHT AND LICENSE INFORMATION AT THE TOP OF THE FILE!
You must include any copyright, license, and SPDX identifier lines from the original file!

ValidMind test functions return either a single object or a tuple of objects.
These objects are turned into a test result report by the test harness.
They can return any number of the following types of objects:
- Tables (pd.DataFrame or List[Dict[str, Any]])
- Figures (matplotlib.figure.Figure, plotly.graph_objects.Figure (go.Figure), or List of these)
- Values (scalar values like float, int, str, or container types like List, Dict)
- Pass/Fail (bool value indicating whether the test passed or failed)
- Raw Data (RawData object containing intermediate data)

Common imports that might be needed in the return type annotation:
- from typing import Any, Dict, List, Tuple, Union, Optional
- import plotly.graph_objects as go
- import matplotlib.figure
- import pandas as pd
- from validmind import RawData

You should inspect the return statement(s) in the function to determine what the function actually returns.
Then, add the appropriate return type annotation to the function signature.

If the function already has a return type annotation, don't change it - in this case, return the original code without any changes.

Examples:

1. For a function that returns a single figure:
```python
def PlotHistogram(dataset: VMDataset):
# ... code ...
return fig
```
Should become:
```python
def PlotHistogram(dataset: VMDataset) -> go.Figure:
# ... code ...
return fig
```

2. For a function that returns multiple objects in a tuple:
```python
def ClassImbalance(dataset: VMDataset):
# ... code ...
return stats, fig, passed
```
Should become:
```python
def ClassImbalance(dataset: VMDataset) -> Tuple[Dict[str, Any], go.Figure, bool]:
# ... code ...
return stats, fig, passed
```

3. For a function that builds a list of figures and returns it as a tuple:
```python
def MultiplePlots(dataset: VMDataset):
# ... code ...
returns = []
returns.append(fig1)
returns.append(fig2)
returns.append(RawData(...))
return tuple(returns)
```
Should become:
```python
def MultiplePlots(dataset: VMDataset) -> Tuple[go.Figure, go.Figure, RawData]:
# ... code ...
returns = []
returns.append(fig1)
returns.append(fig2)
returns.append(RawData(...))
return tuple(returns)
```

Return only the updated code and nothing else.
Do not wrap the code in backticks, simply return valid Python code.
Only add the correct imports if they are not already present in the file, and place them with the existing imports.
DO NOT modify the function body in any way - the only changes should be to the function signature and possibly adding imports.
NEVER REMOVE COPYRIGHT NOTICES OR LICENSE INFORMATION!
If the function already has a return type annotation, return the original code without any changes.
""".strip()

custom_prompt_system = """
You are an expert Python engineer and data scientist with broad experience across many domains.
ValidMind is a company that provides a Python SDK for building and running tests for the purposes of model risk management.
@@ -394,6 +486,31 @@ def add_raw_data_to_test(path):
f.write(updated_file_contents)


def add_return_type_to_test(path):
"""Add return type annotation to a test function"""
# get file contents from path
click.echo(f"> {path}")
with open(path, "r") as f:
file_contents = f.read()

response = client.chat.completions.create(
model=OPENAI_GPT_MODEL,
messages=[
{"role": "system", "content": add_return_type_prompt},
{"role": "user", "content": f"```python\n{file_contents}```"},
],
)

updated_file_contents = response.choices[0].message.content
# remove starting "```python" and ending "```"
updated_file_contents = (
updated_file_contents.lstrip("```python").rstrip("```").strip()
)

with open(path, "w") as f:
f.write(updated_file_contents)


def custom_prompt(path, user_prompt):
"""Custom prompt for a test file"""
# get file contents from path
Expand Down Expand Up @@ -461,7 +578,13 @@ def _is_test_file(path):
@click.option(
"--action",
type=click.Choice(
["add_description", "add_raw_data", "custom_prompt", "custom_review"]
[
"add_description",
"add_raw_data",
"add_return_type",
"custom_prompt",
"custom_review",
]
),
required=True,
)
@@ -494,6 +617,8 @@ def main(action, path, model):
func = add_description_to_test
elif action == "add_raw_data":
func = add_raw_data_to_test
elif action == "add_return_type":
func = add_return_type_to_test
elif action == "custom_prompt":
if not USER_PROMPT:
user_prompt = input("Enter your prompt: ")
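A note on the fence stripping in `add_return_type_to_test` above: `str.lstrip` and `str.rstrip` treat their argument as a set of characters rather than a literal prefix or suffix, so `lstrip("```python")` can over-trim a response that does not actually start with a fence. A minimal prefix-aware sketch (the helper name is hypothetical and not part of this PR):

```python
def strip_code_fences(text: str) -> str:
    """Remove a leading ```python (or bare ```) fence and a trailing ``` fence, if present."""
    text = text.strip()
    if text.startswith("```python"):
        text = text[len("```python"):]
    elif text.startswith("```"):
        text = text[len("```"):]
    if text.endswith("```"):
        text = text[: -len("```")]
    return text.strip()
```

The cleaned response would then be written back the same way as today, e.g. `f.write(strip_code_fences(response.choices[0].message.content))`.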
4 changes: 3 additions & 1 deletion validmind/tests/data_validation/ACFandPACFPlot.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Tuple

import pandas as pd
import plotly.graph_objects as go
from statsmodels.tsa.stattools import acf, pacf
@@ -12,7 +14,7 @@

@tags("time_series_data", "forecasting", "statistical_test", "visualization")
@tasks("regression")
def ACFandPACFPlot(dataset: VMDataset):
def ACFandPACFPlot(dataset: VMDataset) -> Tuple[go.Figure, RawData]:
"""
Analyzes time series data using Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots to
reveal trends and correlations.
4 changes: 3 additions & 1 deletion validmind/tests/data_validation/ADF.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Dict

import pandas as pd
from statsmodels.tsa.stattools import adfuller

@@ -16,7 +18,7 @@
"time_series_data", "statsmodels", "forecasting", "statistical_test", "stationarity"
)
@tasks("regression")
def ADF(dataset: VMDataset):
def ADF(dataset: VMDataset) -> Dict[str, pd.DataFrame]:
"""
Assesses the stationarity of a time series dataset using the Augmented Dickey-Fuller (ADF) test.

4 changes: 3 additions & 1 deletion validmind/tests/data_validation/AutoAR.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Dict

import pandas as pd
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import adfuller
@@ -15,7 +17,7 @@

@tags("time_series_data", "statsmodels", "forecasting", "statistical_test")
@tasks("regression")
def AutoAR(dataset: VMDataset, max_ar_order: int = 3):
def AutoAR(dataset: VMDataset, max_ar_order: int = 3) -> Dict[str, pd.DataFrame]:
"""
Automatically identifies the optimal Autoregressive (AR) order for a time series using BIC and AIC criteria.

6 changes: 5 additions & 1 deletion validmind/tests/data_validation/AutoMA.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Dict, Tuple

import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
@@ -15,7 +17,9 @@

@tags("time_series_data", "statsmodels", "forecasting", "statistical_test")
@tasks("regression")
def AutoMA(dataset: VMDataset, max_ma_order: int = 3):
def AutoMA(
dataset: VMDataset, max_ma_order: int = 3
) -> Tuple[Dict[str, pd.DataFrame], RawData]:
"""
Automatically selects the optimal Moving Average (MA) order for each variable in a time series dataset based on
minimal BIC and AIC values.
6 changes: 5 additions & 1 deletion validmind/tests/data_validation/AutoStationarity.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Dict

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller
@@ -12,7 +14,9 @@

@tags("time_series_data", "statsmodels", "forecasting", "statistical_test")
@tasks("regression")
def AutoStationarity(dataset: VMDataset, max_order: int = 5, threshold: float = 0.05):
def AutoStationarity(
dataset: VMDataset, max_order: int = 5, threshold: float = 0.05
) -> Dict[str, pd.DataFrame]:
"""
Automates Augmented Dickey-Fuller test to assess stationarity across multiple time series in a DataFrame.

4 changes: 3 additions & 1 deletion validmind/tests/data_validation/BivariateScatterPlots.py
@@ -3,15 +3,17 @@
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import itertools
from typing import Tuple

import plotly.express as px
import plotly.graph_objects as go

from validmind import RawData, tags, tasks


@tags("tabular_data", "numerical_data", "visualization")
@tasks("classification")
def BivariateScatterPlots(dataset):
def BivariateScatterPlots(dataset) -> Tuple[go.Figure, RawData]:
"""
Generates bivariate scatterplots to visually inspect relationships between pairs of numerical predictor variables
in machine learning classification tasks.
5 changes: 4 additions & 1 deletion validmind/tests/data_validation/BoxPierce.py
@@ -2,6 +2,9 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial


from typing import Tuple

import pandas as pd
from statsmodels.stats.diagnostic import acorr_ljungbox

@@ -10,7 +13,7 @@

@tasks("regression")
@tags("time_series_data", "forecasting", "statistical_test", "statsmodels")
def BoxPierce(dataset):
def BoxPierce(dataset) -> Tuple[pd.DataFrame, RawData]:
"""
Detects autocorrelation in time-series data through the Box-Pierce test to validate model performance.

2 changes: 1 addition & 1 deletion validmind/tests/data_validation/ChiSquaredFeaturesTable.py
@@ -12,7 +12,7 @@

@tags("tabular_data", "categorical_data", "statistical_test")
@tasks("classification")
def ChiSquaredFeaturesTable(dataset, p_threshold=0.05):
def ChiSquaredFeaturesTable(dataset, p_threshold=0.05) -> pd.DataFrame:
"""
Assesses the statistical association between categorical features and a target variable using the Chi-Squared test.

2 changes: 1 addition & 1 deletion validmind/tests/data_validation/ClassImbalance.py
@@ -20,7 +20,7 @@
@tasks("classification")
def ClassImbalance(
dataset: VMDataset, min_percent_threshold: int = 10
) -> Tuple[Dict[str, Any], go.Figure, bool]:
) -> Tuple[Dict[str, Any], go.Figure, bool, RawData]:
"""
Evaluates and quantifies class distribution imbalance in a dataset used by a machine learning model.

5 changes: 4 additions & 1 deletion validmind/tests/data_validation/DatasetDescription.py
@@ -4,6 +4,7 @@

import re
from collections import Counter
from typing import Any, Dict, List, Tuple

import numpy as np

@@ -142,7 +143,9 @@ def describe_column(df, column):

@tags("tabular_data", "time_series_data", "text_data")
@tasks("classification", "regression", "text_classification", "text_summarization")
def DatasetDescription(dataset: VMDataset):
def DatasetDescription(
dataset: VMDataset,
) -> Tuple[Dict[str, List[Dict[str, Any]]], RawData]:
"""
Provides comprehensive analysis and statistical summaries of each column in a machine learning model's dataset.

5 changes: 3 additions & 2 deletions validmind/tests/data_validation/DatasetSplit.py
@@ -2,7 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import List

from typing import Any, Dict, List, Tuple

from validmind import RawData, tags, tasks
from validmind.vm_models import VMDataset
@@ -17,7 +18,7 @@

@tags("tabular_data", "time_series_data", "text_data")
@tasks("classification", "regression", "text_classification", "text_summarization")
def DatasetSplit(datasets: List[VMDataset]):
def DatasetSplit(datasets: List[VMDataset]) -> Tuple[List[Dict[str, Any]], RawData]:
"""
Evaluates and visualizes the distribution proportions among training, testing, and validation datasets of an ML
model.
4 changes: 3 additions & 1 deletion validmind/tests/data_validation/DescriptiveStatistics.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Any, Dict

import pandas as pd

from validmind import tags, tasks
@@ -46,7 +48,7 @@ def get_summary_statistics_categorical(df, categorical_fields):

@tags("tabular_data", "time_series_data", "data_quality")
@tasks("classification", "regression")
def DescriptiveStatistics(dataset: VMDataset):
def DescriptiveStatistics(dataset: VMDataset) -> Dict[str, Any]:
"""
Performs a detailed descriptive statistical analysis of both numerical and categorical data within a model's
dataset.
4 changes: 3 additions & 1 deletion validmind/tests/data_validation/DickeyFullerGLS.py
@@ -2,6 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Any, Dict, Tuple

import pandas as pd
from arch.unitroot import DFGLS
from numpy.linalg import LinAlgError
@@ -16,7 +18,7 @@

@tags("time_series_data", "forecasting", "unit_root_test")
@tasks("regression")
def DickeyFullerGLS(dataset: VMDataset):
def DickeyFullerGLS(dataset: VMDataset) -> Tuple[Dict[str, Any], RawData]:
"""
Assesses stationarity in time series data using the Dickey-Fuller GLS test to determine the order of integration.

4 changes: 3 additions & 1 deletion validmind/tests/data_validation/Duplicates.py
@@ -2,14 +2,16 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Any, Dict, Tuple

import pandas as pd

from validmind import tags, tasks


@tags("tabular_data", "data_quality", "text_data")
@tasks("classification", "regression")
def Duplicates(dataset, min_threshold=1):
def Duplicates(dataset, min_threshold=1) -> Tuple[Dict[str, Any], bool]:
"""
Tests dataset for duplicate entries, ensuring model reliability via data quality verification.
