From 0fdbb45dfe73a3467a82083d446a06ec3a45d61d Mon Sep 17 00:00:00 2001 From: Anil Sorathiya Date: Fri, 20 Jun 2025 13:49:28 +0100 Subject: [PATCH 1/3] update diagnosis tests --- .../sklearn/OverfitDiagnosis.py | 17 ++++++++++++++++- .../sklearn/WeakspotsDiagnosis.py | 13 +++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py b/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py index 0ef87f5f2..cb045c943 100644 --- a/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +++ b/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py @@ -220,6 +220,21 @@ def OverfitDiagnosis( - May not capture more subtle forms of overfitting that do not exceed the threshold. - Assumes that the binning of features adequately represents the data segments. """ + + feature_columns = datasets[0].feature_columns + numeric_and_categorical_columns = ( + datasets[0].feature_columns_numeric + datasets[0].feature_columns_categorical + ) + + feature_columns = [ + col for col in feature_columns if col in numeric_and_categorical_columns + ] + + if not feature_columns: + raise ValueError( + "No valid numeric or categorical columns found in features_columns" + ) + is_classification = bool(datasets[0].probability_column(model)) if not metric: @@ -246,7 +261,7 @@ def OverfitDiagnosis( figures = [] results_headers = ["slice", "shape", "feature", metric] - for feature_column in datasets[0].feature_columns: + for feature_column in feature_columns: bins = 10 if feature_column in datasets[0].feature_columns_categorical: bins = len(train_df[feature_column].unique()) diff --git a/validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py b/validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py index f8f0b6667..591dccedb 100644 --- a/validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +++ b/validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py @@ -211,6 +211,19 @@ def WeakspotsDiagnosis( improvement. """ feature_columns = features_columns or datasets[0].feature_columns + numeric_and_categorical_columns = ( + datasets[0].feature_columns_numeric + datasets[0].feature_columns_categorical + ) + + feature_columns = [ + col for col in feature_columns if col in numeric_and_categorical_columns + ] + + if not feature_columns: + raise ValueError( + "No valid numeric or categorical columns found in features_columns" + ) + if not all(col in datasets[0].feature_columns for col in feature_columns): raise ValueError( "Column(s) provided in features_columns do not exist in the dataset" From 4800d9dcbf3d23ad6dd1bba08acbe58133a176fb Mon Sep 17 00:00:00 2001 From: Anil Sorathiya Date: Fri, 20 Jun 2025 14:06:39 +0100 Subject: [PATCH 2/3] update diagnosis tests --- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py b/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py index cb045c943..d6497bb0e 100644 --- a/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +++ b/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py @@ -221,13 +221,12 @@ def OverfitDiagnosis( - Assumes that the binning of features adequately represents the data segments. """ - feature_columns = datasets[0].feature_columns numeric_and_categorical_columns = ( datasets[0].feature_columns_numeric + datasets[0].feature_columns_categorical ) feature_columns = [ - col for col in feature_columns if col in numeric_and_categorical_columns + col for col in datasets[0].feature_columns if col in numeric_and_categorical_columns ] if not feature_columns: From d632c14b4a902aae3d77cd0c121dfa0795fcfd65 Mon Sep 17 00:00:00 2001 From: Anil Sorathiya Date: Mon, 23 Jun 2025 11:15:37 +0100 Subject: [PATCH 3/3] remove unnecessary condition --- .../tests/model_validation/sklearn/OverfitDiagnosis.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py b/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py index d6497bb0e..9994efd82 100644 --- a/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +++ b/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py @@ -221,15 +221,11 @@ def OverfitDiagnosis( - Assumes that the binning of features adequately represents the data segments. """ - numeric_and_categorical_columns = ( + numeric_and_categorical_feature_columns = ( datasets[0].feature_columns_numeric + datasets[0].feature_columns_categorical ) - feature_columns = [ - col for col in datasets[0].feature_columns if col in numeric_and_categorical_columns - ] - - if not feature_columns: + if not numeric_and_categorical_feature_columns: raise ValueError( "No valid numeric or categorical columns found in features_columns" ) @@ -260,7 +256,7 @@ def OverfitDiagnosis( figures = [] results_headers = ["slice", "shape", "feature", metric] - for feature_column in feature_columns: + for feature_column in numeric_and_categorical_feature_columns: bins = 10 if feature_column in datasets[0].feature_columns_categorical: bins = len(train_df[feature_column].unique())