Merged
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -10,7 +10,7 @@ description = "ValidMind Library"
license = "Commercial License"
name = "validmind"
readme = "README.pypi.md"
version = "2.8.20"
version = "2.8.21"

[tool.poetry.dependencies]
aiohttp = {extras = ["speedups"], version = "*"}
23 changes: 21 additions & 2 deletions tests/test_dataset.py
@@ -34,8 +34,6 @@ def test_init_dataset_pandas_no_options(self):
"""
vm_dataset = DataFrameDataset(raw_dataset=self.df)

# Pandas dataframe gets converted to numpy internally and raw_dataset is a numpy array
np.testing.assert_array_equal(vm_dataset._raw_dataset, self.df.values)
pd.testing.assert_frame_equal(vm_dataset.df, self.df)

def test_init_dataset_pandas_target_column(self):
@@ -68,6 +66,27 @@ def test_init_dataset_pandas_feature_columns(self):
self.assertEqual(vm_dataset.feature_columns_categorical, [])
self.assertEqual(vm_dataset.feature_columns, ["col1"])

def test_dtype_preserved(self):
"""
Test that dtype is preserved in DataFrameDataset.
"""

test_df = pd.DataFrame({"col1": pd.Categorical(["x", "y", "z"])})

# Verify original data is categorical
self.assertTrue(
pd.api.types.is_categorical_dtype(test_df["col1"]),
"Original DataFrame should have categorical dtype",
)

# Verify categorical dtype is preserved
dataset = DataFrameDataset(raw_dataset=test_df, input_id="test_dataset")

self.assertTrue(
pd.api.types.is_categorical_dtype(dataset.df["col1"]),
"DataFrameDataset should preserve categorical dtype",
)

def test_assign_predictions_invalid_model(self):
"""
Test assigning predictions to dataset with an invalid model
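Note: the removed assertion reflected the old behaviour, where the pandas DataFrame was converted to a NumPy array internally; the new `test_dtype_preserved` test pins down why that mattered. As a plain-pandas illustration (independent of the library), round-tripping a categorical column through `.values` discards its dtype:

```python
import pandas as pd

df = pd.DataFrame({"col1": pd.Categorical(["x", "y", "z"])})
print(df["col1"].dtype)  # category

# Rebuilding the frame from its NumPy representation loses the categorical dtype,
# which is what the old `raw_dataset.values` conversion did internally.
rebuilt = pd.DataFrame(df.values, columns=df.columns)
print(rebuilt["col1"].dtype)  # object
```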
2 changes: 1 addition & 1 deletion validmind/__version__.py
@@ -1 +1 @@
__version__ = "2.8.20"
__version__ = "2.8.21"
3 changes: 3 additions & 0 deletions validmind/client.py
@@ -61,6 +61,7 @@ def init_dataset(
class_labels: Optional[Dict[str, Any]] = None,
type: Optional[str] = None,
input_id: Optional[str] = None,
copy_data: bool = True,
__log: bool = True,
) -> VMDataset:
"""
@@ -92,6 +93,7 @@ def init_dataset(
this will be set to `dataset` but if you are passing this dataset as a
test input using some other key than `dataset`, then you should set
this to the same key.
copy_data (bool, optional): Whether to copy the data. Defaults to True.
__log (bool): Whether to log the input. Defaults to True.

Raises:
@@ -121,6 +123,7 @@
extra_columns=extra_columns,
target_class_labels=class_labels,
date_time_index=date_time_index,
copy_data=copy_data,
)
elif isinstance(dataset, pl.DataFrame):
vm_dataset = PolarsDataset(
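For context, a minimal usage sketch of the new flag — assuming the usual `vm.init_dataset` keyword arguments (`dataset`, `target_column`, `input_id`) and a prior `vm.init()` call; the flag is simply forwarded to `DataFrameDataset`:

```python
import pandas as pd
import validmind as vm

df = pd.DataFrame({"feature": [1.0, 2.0, 3.0], "target": [0, 1, 0]})

# Default: the dataset works on an independent copy of `df`.
train_ds = vm.init_dataset(dataset=df, target_column="target", input_id="train_ds")

# Memory-sensitive path: share the underlying data with `df` instead of copying it.
train_ds_shared = vm.init_dataset(
    dataset=df,
    target_column="target",
    input_id="train_ds_shared",
    copy_data=False,
)
```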
68 changes: 51 additions & 17 deletions validmind/vm_models/dataset/dataset.py
@@ -47,6 +47,7 @@ class VMDataset(VMInput):
target_class_labels (Dict): The class labels for the target columns.
df (pd.DataFrame): The dataset as a pandas DataFrame.
extra_columns (Dict): Extra columns to include in the dataset.
copy_data (bool): Whether to copy the data. Defaults to True.
"""

def __repr__(self):
@@ -66,6 +67,7 @@ def __init__(
text_column: str = None,
extra_columns: dict = None,
target_class_labels: dict = None,
copy_data: bool = True,
):
"""
Initializes a VMDataset instance.
@@ -82,6 +84,7 @@ def __init__(
feature_columns (str, optional): The feature column names of the dataset. Defaults to None.
text_column (str, optional): The text column name of the dataset for nlp tasks. Defaults to None.
target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
copy_data (bool, optional): Whether to copy the data. Defaults to True.
"""
# initialize input_id
self.input_id = input_id
@@ -112,6 +115,7 @@ def __init__(
self.target_class_labels = target_class_labels
self.extra_columns = ExtraColumns.from_dict(extra_columns)
self._set_feature_columns(feature_columns)
self._copy_data = copy_data

if model:
self.assign_predictions(model)
@@ -397,8 +401,18 @@ def df(self) -> pd.DataFrame:
assert self.target_column not in columns
columns.append(self.target_column)

# return a copy to prevent accidental modification
return as_df(self._df[columns]).copy()
# Check if all columns in self._df are requested
all_columns = set(columns) == set(self._df.columns)

# For copy_data=False and all columns: return exact same DataFrame object
if not self._copy_data and all_columns:
return self._df
# For copy_data=False and subset of columns: return view with shared data
elif not self._copy_data:
return as_df(self._df[columns])
# For copy_data=True: return independent copy with duplicated data
else:
return as_df(self._df[columns]).copy()

@property
def x(self) -> np.ndarray:
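The rewritten property now has three return modes: the stored DataFrame itself (`copy_data=False` with every column requested), a column-subset view that shares the underlying data (`copy_data=False`, subset), or an independent copy (`copy_data=True`, the default). A hedged sketch of the observable difference, constructing `DataFrameDataset` directly with toy data (the import path below mirrors this module and is an assumption; in practice the class is normally reached through `vm.init_dataset`):

```python
import pandas as pd
from validmind.vm_models.dataset.dataset import DataFrameDataset

df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "target": [0, 1]})

# copy_data=False with all columns requested: the exact same object comes back,
# so later changes to `df` are visible through the dataset (hence the warning added below).
shared = DataFrameDataset(raw_dataset=df, target_column="target", copy_data=False)
assert shared.df is df

# copy_data=True (default): an independent copy, safe to mutate.
copied = DataFrameDataset(raw_dataset=df, target_column="target")
assert copied.df is not df
```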
@@ -522,9 +536,10 @@ def __init__(
text_column: str = None,
target_class_labels: dict = None,
date_time_index: bool = False,
copy_data: bool = True,
):
"""
Initializes a DataFrameDataset instance.
Initializes a DataFrameDataset instance, preserving original pandas dtypes.

Args:
raw_dataset (pd.DataFrame): The raw dataset as a pandas DataFrame.
@@ -536,25 +551,44 @@
text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
date_time_index (bool, optional): Whether to use date-time index. Defaults to False.
copy_data (bool, optional): Whether to create a copy of the input data. Defaults to True.
"""

VMInput.__init__(self)

self.input_id = input_id

index = None
if isinstance(raw_dataset.index, pd.Index):
index = raw_dataset.index.values
self.index = index

super().__init__(
raw_dataset=raw_dataset.values,
input_id=input_id,
model=model,
index_name=raw_dataset.index.name,
index=index,
columns=raw_dataset.columns.to_list(),
target_column=target_column,
extra_columns=extra_columns,
feature_columns=feature_columns,
text_column=text_column,
target_class_labels=target_class_labels,
date_time_index=date_time_index,
)
# Store the DataFrame directly
self._df = raw_dataset

if date_time_index:
self._df = convert_index_to_datetime(self._df)

self.columns = raw_dataset.columns.tolist()
self.column_aliases = {}
self.target_column = target_column
self.text_column = text_column
self.target_class_labels = target_class_labels
self.extra_columns = ExtraColumns.from_dict(extra_columns)
self._copy_data = copy_data

# Add warning when copy_data is False
if not copy_data:
logger.warning(
"Dataset initialized with copy_data=False. Changes to the original DataFrame "
"may affect this dataset. Use this option only when memory efficiency is critical "
"and you won't modify the source data."
)

self._set_feature_columns(feature_columns)

if model:
self.assign_predictions(model)


class PolarsDataset(VMDataset):