From 84b9b553db1bbeb0a41dda5e915180328ac23a23 Mon Sep 17 00:00:00 2001 From: Juan Date: Wed, 23 Apr 2025 13:51:14 +0200 Subject: [PATCH 1/4] Fix dtype preservation in DataFrameDataset by bypassing NumPy conversion through the VMDataset --- validmind/vm_models/dataset/dataset.py | 40 ++++++++++++++++---------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/validmind/vm_models/dataset/dataset.py b/validmind/vm_models/dataset/dataset.py index e953dece7..c52ea709a 100644 --- a/validmind/vm_models/dataset/dataset.py +++ b/validmind/vm_models/dataset/dataset.py @@ -524,7 +524,7 @@ def __init__( date_time_index: bool = False, ): """ - Initializes a DataFrameDataset instance. + Initializes a DataFrameDataset instance, preserving original pandas dtypes. Args: raw_dataset (pd.DataFrame): The raw dataset as a pandas DataFrame. @@ -537,24 +537,34 @@ def __init__( target_class_labels (dict, optional): The class labels for the target columns. Defaults to None. date_time_index (bool, optional): Whether to use date-time index. Defaults to False. 
""" + + VMInput.__init__(self) + + self.input_id = input_id + self._raw_dataset = raw_dataset.values + index = None if isinstance(raw_dataset.index, pd.Index): index = raw_dataset.index.values + self.index = index - super().__init__( - raw_dataset=raw_dataset.values, - input_id=input_id, - model=model, - index_name=raw_dataset.index.name, - index=index, - columns=raw_dataset.columns.to_list(), - target_column=target_column, - extra_columns=extra_columns, - feature_columns=feature_columns, - text_column=text_column, - target_class_labels=target_class_labels, - date_time_index=date_time_index, - ) + # Store the DataFrame directly + self._df = raw_dataset.copy() + + if date_time_index: + self._df = convert_index_to_datetime(self._df) + + self.columns = raw_dataset.columns.tolist() + self.column_aliases = {} + self.target_column = target_column + self.text_column = text_column + self.target_class_labels = target_class_labels + self.extra_columns = ExtraColumns.from_dict(extra_columns) + + self._set_feature_columns(feature_columns) + + if model: + self.assign_predictions(model) class PolarsDataset(VMDataset): From 738bbbca6677e4c34ae91f7a0b761d33c908b84e Mon Sep 17 00:00:00 2001 From: Juan Date: Thu, 24 Apr 2025 12:30:49 +0200 Subject: [PATCH 2/4] Add option to not copy data in init_dataset for DataFrameDataset --- validmind/client.py | 3 +++ validmind/vm_models/dataset/dataset.py | 32 ++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/validmind/client.py b/validmind/client.py index fe0517085..7f6d227c9 100644 --- a/validmind/client.py +++ b/validmind/client.py @@ -61,6 +61,7 @@ def init_dataset( class_labels: Optional[Dict[str, Any]] = None, type: Optional[str] = None, input_id: Optional[str] = None, + copy_data: bool = True, __log: bool = True, ) -> VMDataset: """ @@ -92,6 +93,7 @@ def init_dataset( this will be set to `dataset` but if you are passing this dataset as a test input using some other key than `dataset`, then you 
should set this to the same key. + copy_data (bool, optional): Whether to copy the data. Defaults to True. __log (bool): Whether to log the input. Defaults to True. Raises: @@ -121,6 +123,7 @@ def init_dataset( extra_columns=extra_columns, target_class_labels=class_labels, date_time_index=date_time_index, + copy_data=copy_data, ) elif isinstance(dataset, pl.DataFrame): vm_dataset = PolarsDataset( diff --git a/validmind/vm_models/dataset/dataset.py b/validmind/vm_models/dataset/dataset.py index c52ea709a..f0a9571d1 100644 --- a/validmind/vm_models/dataset/dataset.py +++ b/validmind/vm_models/dataset/dataset.py @@ -47,6 +47,7 @@ class VMDataset(VMInput): target_class_labels (Dict): The class labels for the target columns. df (pd.DataFrame): The dataset as a pandas DataFrame. extra_columns (Dict): Extra columns to include in the dataset. + copy_data (bool): Whether to copy the data. Defaults to True. """ def __repr__(self): @@ -66,6 +67,7 @@ def __init__( text_column: str = None, extra_columns: dict = None, target_class_labels: dict = None, + copy_data: bool = True, ): """ Initializes a VMDataset instance. @@ -82,6 +84,7 @@ def __init__( feature_columns (str, optional): The feature column names of the dataset. Defaults to None. text_column (str, optional): The text column name of the dataset for nlp tasks. Defaults to None. target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None. + copy_data (bool, optional): Whether to copy the data. Defaults to True. 
""" # initialize input_id self.input_id = input_id @@ -112,6 +115,7 @@ def __init__( self.target_class_labels = target_class_labels self.extra_columns = ExtraColumns.from_dict(extra_columns) self._set_feature_columns(feature_columns) + self._copy_data = copy_data if model: self.assign_predictions(model) @@ -397,8 +401,18 @@ def df(self) -> pd.DataFrame: assert self.target_column not in columns columns.append(self.target_column) - # return a copy to prevent accidental modification - return as_df(self._df[columns]).copy() + # Check if all columns in self._df are requested + all_columns = set(columns) == set(self._df.columns) + + # For copy_data=False and all columns: return exact same DataFrame object + if not self._copy_data and all_columns: + return self._df + # For copy_data=False and subset of columns: return view with shared data + elif not self._copy_data: + return as_df(self._df[columns]) + # For copy_data=True: return independent copy with duplicated data + else: + return as_df(self._df[columns]).copy() @property def x(self) -> np.ndarray: @@ -522,6 +536,7 @@ def __init__( text_column: str = None, target_class_labels: dict = None, date_time_index: bool = False, + copy_data: bool = True, ): """ Initializes a DataFrameDataset instance, preserving original pandas dtypes. @@ -536,12 +551,12 @@ def __init__( text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None. target_class_labels (dict, optional): The class labels for the target columns. Defaults to None. date_time_index (bool, optional): Whether to use date-time index. Defaults to False. + copy_data (bool, optional): Whether to create a copy of the input data. Defaults to True. 
""" VMInput.__init__(self) self.input_id = input_id - self._raw_dataset = raw_dataset.values index = None if isinstance(raw_dataset.index, pd.Index): @@ -549,7 +564,7 @@ def __init__( self.index = index # Store the DataFrame directly - self._df = raw_dataset.copy() + self._df = raw_dataset if date_time_index: self._df = convert_index_to_datetime(self._df) @@ -560,6 +575,15 @@ def __init__( self.text_column = text_column self.target_class_labels = target_class_labels self.extra_columns = ExtraColumns.from_dict(extra_columns) + self._copy_data = copy_data + + # Add warning when copy_data is False + if not copy_data: + logger.warning( + "Dataset initialized with copy_data=False. Changes to the original DataFrame " + "may affect this dataset. Use this option only when memory efficiency is critical " + "and you won't modify the source data." + ) self._set_feature_columns(feature_columns) From 30a5e134514096da3a9a03338c2e7dd16a5f3155 Mon Sep 17 00:00:00 2001 From: Juan Date: Thu, 24 Apr 2025 12:52:48 +0200 Subject: [PATCH 3/4] Add unit test to verify dtype preservation --- tests/test_dataset.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index a2a65760d..e18a90aa4 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -34,8 +34,6 @@ def test_init_dataset_pandas_no_options(self): """ vm_dataset = DataFrameDataset(raw_dataset=self.df) - # Pandas dataframe gets converted to numpy internally and raw_dataset is a numpy array - np.testing.assert_array_equal(vm_dataset._raw_dataset, self.df.values) pd.testing.assert_frame_equal(vm_dataset.df, self.df) def test_init_dataset_pandas_target_column(self): @@ -68,6 +66,27 @@ def test_init_dataset_pandas_feature_columns(self): self.assertEqual(vm_dataset.feature_columns_categorical, []) self.assertEqual(vm_dataset.feature_columns, ["col1"]) + def test_dtype_preserved(self): + """ + Test that dtype is preserved in DataFrameDataset. 
+ """ + + test_df = pd.DataFrame({"col1": pd.Categorical(["x", "y", "z"])}) + + # Verify original data is categorical + self.assertTrue( + pd.api.types.is_categorical_dtype(test_df["col1"]), + "Original DataFrame should have categorical dtype", + ) + + # Verify categorical dtype is preserved + dataset = DataFrameDataset(raw_dataset=test_df, input_id="test_dataset") + + self.assertTrue( + pd.api.types.is_categorical_dtype(dataset.df["col1"]), + "DataFrameDataset should preserve categorical dtype", + ) + def test_assign_predictions_invalid_model(self): """ Test assigning predictions to dataset with an invalid model From ad05ba3ef475e15a7b3e1dc1a09032cd8b018a39 Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 25 Apr 2025 10:38:57 +0200 Subject: [PATCH 4/4] 2.8.21 --- pyproject.toml | 2 +- validmind/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4e996a8f1..e3375290f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ description = "ValidMind Library" license = "Commercial License" name = "validmind" readme = "README.pypi.md" -version = "2.8.20" +version = "2.8.21" [tool.poetry.dependencies] aiohttp = {extras = ["speedups"], version = "*"} diff --git a/validmind/__version__.py b/validmind/__version__.py index ca466009f..cfdf41d3a 100644 --- a/validmind/__version__.py +++ b/validmind/__version__.py @@ -1 +1 @@ -__version__ = "2.8.20" +__version__ = "2.8.21"