From 0ce31acf5d0e59483c1987fe7585aa6b064a9874 Mon Sep 17 00:00:00 2001 From: Andres Rodriguez Date: Fri, 25 Apr 2025 12:56:14 -0700 Subject: [PATCH 1/3] Improve performance of feature_columns_numeric and feature_columns_categorical --- validmind/vm_models/dataset/dataset.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/validmind/vm_models/dataset/dataset.py b/validmind/vm_models/dataset/dataset.py index f0a9571d1..3b04bcf16 100644 --- a/validmind/vm_models/dataset/dataset.py +++ b/validmind/vm_models/dataset/dataset.py @@ -133,16 +133,19 @@ def _set_feature_columns(self, feature_columns=None): excluded = [self.target_column, *self.extra_columns.flatten()] self.feature_columns = [col for col in self.columns if col not in excluded] - self.feature_columns_numeric = ( - self._df[self.feature_columns] - .select_dtypes(include=[np.number]) - .columns.tolist() - ) - self.feature_columns_categorical = ( - self._df[self.feature_columns] - .select_dtypes(include=[object, pd.Categorical]) - .columns.tolist() - ) + # Get dtypes without loading data into memory + feature_dtypes = self._df[self.feature_columns].dtypes + + self.feature_columns_numeric = feature_dtypes[ + feature_dtypes.apply(lambda x: pd.api.types.is_numeric_dtype(x)) + ].index.tolist() + + self.feature_columns_categorical = feature_dtypes[ + feature_dtypes.apply( + lambda x: pd.api.types.is_categorical_dtype(x) + or pd.api.types.is_object_dtype(x) + ) + ].index.tolist() def _add_column(self, column_name, column_values): column_values = np.array(column_values) @@ -560,6 +563,7 @@ def __init__( index = None if isinstance(raw_dataset.index, pd.Index): + print("Index is a pandas Index") index = raw_dataset.index.values self.index = index @@ -585,6 +589,7 @@ def __init__( "and you won't modify the source data." ) + print("Setting feature columns...") self._set_feature_columns(feature_columns) if model: From e16d837c7b6840c00ff43556a619406c146fdb22 Mon Sep 17 00:00:00 2001 From: Andres Rodriguez Date: Fri, 25 Apr 2025 12:59:55 -0700 Subject: [PATCH 2/3] Remove debugging lines --- validmind/vm_models/dataset/dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/validmind/vm_models/dataset/dataset.py b/validmind/vm_models/dataset/dataset.py index 3b04bcf16..d40c1d692 100644 --- a/validmind/vm_models/dataset/dataset.py +++ b/validmind/vm_models/dataset/dataset.py @@ -563,7 +563,6 @@ def __init__( index = None if isinstance(raw_dataset.index, pd.Index): - print("Index is a pandas Index") index = raw_dataset.index.values self.index = index @@ -589,7 +588,6 @@ def __init__( "and you won't modify the source data." ) - print("Setting feature columns...") self._set_feature_columns(feature_columns) if model: From 07623754e81fc7b798a66be65d98f4f16d1fe722 Mon Sep 17 00:00:00 2001 From: Andres Rodriguez Date: Fri, 25 Apr 2025 14:41:09 -0700 Subject: [PATCH 3/3] 2.8.22 --- pyproject.toml | 2 +- validmind/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e3375290f..32728854d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ description = "ValidMind Library" license = "Commercial License" name = "validmind" readme = "README.pypi.md" -version = "2.8.21" +version = "2.8.22" [tool.poetry.dependencies] aiohttp = {extras = ["speedups"], version = "*"} diff --git a/validmind/__version__.py b/validmind/__version__.py index cfdf41d3a..7217d14b6 100644 --- a/validmind/__version__.py +++ b/validmind/__version__.py @@ -1 +1 @@ -__version__ = "2.8.21" +__version__ = "2.8.22"