diff --git a/pyproject.toml b/pyproject.toml
index 4e996a8f1..e3375290f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ description = "ValidMind Library"
 license = "Commercial License"
 name = "validmind"
 readme = "README.pypi.md"
-version = "2.8.20"
+version = "2.8.21"
 
 [tool.poetry.dependencies]
 aiohttp = {extras = ["speedups"], version = "*"}
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index a2a65760d..e18a90aa4 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -34,8 +34,6 @@ def test_init_dataset_pandas_no_options(self):
         """
         vm_dataset = DataFrameDataset(raw_dataset=self.df)
 
-        # Pandas dataframe gets converted to numpy internally and raw_dataset is a numpy array
-        np.testing.assert_array_equal(vm_dataset._raw_dataset, self.df.values)
         pd.testing.assert_frame_equal(vm_dataset.df, self.df)
 
     def test_init_dataset_pandas_target_column(self):
@@ -68,6 +66,27 @@ def test_init_dataset_pandas_feature_columns(self):
         self.assertEqual(vm_dataset.feature_columns_categorical, [])
         self.assertEqual(vm_dataset.feature_columns, ["col1"])
 
+    def test_dtype_preserved(self):
+        """
+        Test that dtype is preserved in DataFrameDataset.
+        """
+
+        test_df = pd.DataFrame({"col1": pd.Categorical(["x", "y", "z"])})
+
+        # Verify original data is categorical
+        self.assertTrue(
+            pd.api.types.is_categorical_dtype(test_df["col1"]),
+            "Original DataFrame should have categorical dtype",
+        )
+
+        # Verify categorical dtype is preserved
+        dataset = DataFrameDataset(raw_dataset=test_df, input_id="test_dataset")
+
+        self.assertTrue(
+            pd.api.types.is_categorical_dtype(dataset.df["col1"]),
+            "DataFrameDataset should preserve categorical dtype",
+        )
+
     def test_assign_predictions_invalid_model(self):
         """
         Test assigning predictions to dataset with an invalid model
diff --git a/validmind/__version__.py b/validmind/__version__.py
index ca466009f..cfdf41d3a 100644
--- a/validmind/__version__.py
+++ b/validmind/__version__.py
@@ -1 +1 @@
-__version__ = "2.8.20"
+__version__ = "2.8.21"
diff --git a/validmind/client.py b/validmind/client.py
index fe0517085..7f6d227c9 100644
--- a/validmind/client.py
+++ b/validmind/client.py
@@ -61,6 +61,7 @@ def init_dataset(
     class_labels: Optional[Dict[str, Any]] = None,
     type: Optional[str] = None,
    input_id: Optional[str] = None,
+    copy_data: bool = True,
     __log: bool = True,
 ) -> VMDataset:
     """
@@ -92,6 +93,7 @@ def init_dataset(
             this will be set to `dataset` but if you are passing this dataset
             as a test input using some other key than `dataset`, then you
             should set this to the same key.
+        copy_data (bool, optional): Whether to copy the data. Defaults to True.
         __log (bool): Whether to log the input. Defaults to True.
 
     Raises:
@@ -121,6 +123,7 @@ def init_dataset(
             extra_columns=extra_columns,
             target_class_labels=class_labels,
             date_time_index=date_time_index,
+            copy_data=copy_data,
         )
     elif isinstance(dataset, pl.DataFrame):
         vm_dataset = PolarsDataset(
diff --git a/validmind/vm_models/dataset/dataset.py b/validmind/vm_models/dataset/dataset.py
index e953dece7..f0a9571d1 100644
--- a/validmind/vm_models/dataset/dataset.py
+++ b/validmind/vm_models/dataset/dataset.py
@@ -47,6 +47,7 @@ class VMDataset(VMInput):
         target_class_labels (Dict): The class labels for the target columns.
         df (pd.DataFrame): The dataset as a pandas DataFrame.
         extra_columns (Dict): Extra columns to include in the dataset.
+        copy_data (bool): Whether to copy the data. Defaults to True.
     """
 
     def __repr__(self):
@@ -66,6 +67,7 @@ def __init__(
         text_column: str = None,
         extra_columns: dict = None,
         target_class_labels: dict = None,
+        copy_data: bool = True,
     ):
         """
         Initializes a VMDataset instance.
@@ -82,6 +84,7 @@ def __init__(
             feature_columns (str, optional): The feature column names of the dataset. Defaults to None.
             text_column (str, optional): The text column name of the dataset for nlp tasks. Defaults to None.
             target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
+            copy_data (bool, optional): Whether to copy the data. Defaults to True.
         """
         # initialize input_id
         self.input_id = input_id
@@ -112,6 +115,7 @@ def __init__(
         self.target_class_labels = target_class_labels
         self.extra_columns = ExtraColumns.from_dict(extra_columns)
         self._set_feature_columns(feature_columns)
+        self._copy_data = copy_data
 
         if model:
             self.assign_predictions(model)
@@ -397,8 +401,18 @@ def df(self) -> pd.DataFrame:
             assert self.target_column not in columns
             columns.append(self.target_column)
 
-        # return a copy to prevent accidental modification
-        return as_df(self._df[columns]).copy()
+        # Check if all columns in self._df are requested
+        all_columns = set(columns) == set(self._df.columns)
+
+        # For copy_data=False and all columns: return exact same DataFrame object
+        if not self._copy_data and all_columns:
+            return self._df
+        # For copy_data=False and subset of columns: return view with shared data
+        elif not self._copy_data:
+            return as_df(self._df[columns])
+        # For copy_data=True: return independent copy with duplicated data
+        else:
+            return as_df(self._df[columns]).copy()
 
     @property
     def x(self) -> np.ndarray:
@@ -522,9 +536,10 @@ def __init__(
         text_column: str = None,
         target_class_labels: dict = None,
         date_time_index: bool = False,
+        copy_data: bool = True,
     ):
         """
-        Initializes a DataFrameDataset instance.
+        Initializes a DataFrameDataset instance, preserving original pandas dtypes.
 
         Args:
             raw_dataset (pd.DataFrame): The raw dataset as a pandas DataFrame.
@@ -536,25 +551,44 @@ def __init__(
             text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
             target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
             date_time_index (bool, optional): Whether to use date-time index. Defaults to False.
+            copy_data (bool, optional): Whether to create a copy of the input data. Defaults to True.
         """
+
+        VMInput.__init__(self)
+
+        self.input_id = input_id
+
         index = None
         if isinstance(raw_dataset.index, pd.Index):
             index = raw_dataset.index.values
+        self.index = index
 
-        super().__init__(
-            raw_dataset=raw_dataset.values,
-            input_id=input_id,
-            model=model,
-            index_name=raw_dataset.index.name,
-            index=index,
-            columns=raw_dataset.columns.to_list(),
-            target_column=target_column,
-            extra_columns=extra_columns,
-            feature_columns=feature_columns,
-            text_column=text_column,
-            target_class_labels=target_class_labels,
-            date_time_index=date_time_index,
-        )
+        # Store the DataFrame directly
+        self._df = raw_dataset
+
+        if date_time_index:
+            self._df = convert_index_to_datetime(self._df)
+
+        self.columns = raw_dataset.columns.tolist()
+        self.column_aliases = {}
+        self.target_column = target_column
+        self.text_column = text_column
+        self.target_class_labels = target_class_labels
+        self.extra_columns = ExtraColumns.from_dict(extra_columns)
+        self._copy_data = copy_data
+
+        # Add warning when copy_data is False
+        if not copy_data:
+            logger.warning(
+                "Dataset initialized with copy_data=False. Changes to the original DataFrame "
+                "may affect this dataset. Use this option only when memory efficiency is critical "
+                "and you won't modify the source data."
+            )
+
+        self._set_feature_columns(feature_columns)
+
+        if model:
+            self.assign_predictions(model)
 
 
 class PolarsDataset(VMDataset):
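
Usage note: a minimal sketch of the copy_data flag introduced by this patch, driven
through the public init_dataset entry point changed above. The `import validmind as vm`
idiom is the library's standard usage; the DataFrame contents and input_id values are
illustrative only, and the snippet assumes the client has already been set up (vm.init)
per the usual ValidMind workflow.

    import pandas as pd
    import validmind as vm  # assumes vm.init(...) has been called beforehand

    df = pd.DataFrame({
        "col1": pd.Categorical(["x", "y", "z"]),  # categorical dtype is now preserved
        "target": [0, 1, 0],
    })

    # Default (copy_data=True): the dataset holds an independent copy, so
    # mutating df later cannot affect vm_dataset.
    vm_dataset = vm.init_dataset(
        dataset=df, input_id="train_ds", target_column="target"
    )

    # Opt out of copying when memory efficiency matters; DataFrameDataset logs
    # a warning because the dataset now shares memory with the original df.
    vm_dataset_view = vm.init_dataset(
        dataset=df, input_id="train_ds_view", target_column="target", copy_data=False
    )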