From 84b9b553db1bbeb0a41dda5e915180328ac23a23 Mon Sep 17 00:00:00 2001
From: Juan <juan@validmind.ai>
Date: Wed, 23 Apr 2025 13:51:14 +0200
Subject: [PATCH 1/4] Fix dtype preservation in DataFrameDataset by bypassing
 NumPy conversion through the VMDataset

---
 validmind/vm_models/dataset/dataset.py | 40 ++++++++++++++++----------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/validmind/vm_models/dataset/dataset.py b/validmind/vm_models/dataset/dataset.py
index e953dece7..c52ea709a 100644
--- a/validmind/vm_models/dataset/dataset.py
+++ b/validmind/vm_models/dataset/dataset.py
@@ -524,7 +524,7 @@ def __init__(
         date_time_index: bool = False,
     ):
         """
-        Initializes a DataFrameDataset instance.
+        Initializes a DataFrameDataset instance, preserving original pandas dtypes.
 
         Args:
             raw_dataset (pd.DataFrame): The raw dataset as a pandas DataFrame.
@@ -537,24 +537,34 @@ def __init__(
             target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
             date_time_index (bool, optional): Whether to use date-time index. Defaults to False.
         """
+
+        VMInput.__init__(self)
+
+        self.input_id = input_id
+        self._raw_dataset = raw_dataset.values
+
         index = None
         if isinstance(raw_dataset.index, pd.Index):
             index = raw_dataset.index.values
+        self.index = index
 
-        super().__init__(
-            raw_dataset=raw_dataset.values,
-            input_id=input_id,
-            model=model,
-            index_name=raw_dataset.index.name,
-            index=index,
-            columns=raw_dataset.columns.to_list(),
-            target_column=target_column,
-            extra_columns=extra_columns,
-            feature_columns=feature_columns,
-            text_column=text_column,
-            target_class_labels=target_class_labels,
-            date_time_index=date_time_index,
-        )
+        # Store the DataFrame directly
+        self._df = raw_dataset.copy()
+
+        if date_time_index:
+            self._df = convert_index_to_datetime(self._df)
+
+        self.columns = raw_dataset.columns.tolist()
+        self.column_aliases = {}
+        self.target_column = target_column
+        self.text_column = text_column
+        self.target_class_labels = target_class_labels
+        self.extra_columns = ExtraColumns.from_dict(extra_columns)
+
+        self._set_feature_columns(feature_columns)
+
+        if model:
+            self.assign_predictions(model)
 
 
 class PolarsDataset(VMDataset):

From 738bbbca6677e4c34ae91f7a0b761d33c908b84e Mon Sep 17 00:00:00 2001
From: Juan <juan@validmind.ai>
Date: Thu, 24 Apr 2025 12:30:49 +0200
Subject: [PATCH 2/4] Add option to not copy data in init_dataset for
 DataFrameDataset

---
 validmind/client.py                    |  3 +++
 validmind/vm_models/dataset/dataset.py | 32 ++++++++++++++++++++++----
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/validmind/client.py b/validmind/client.py
index fe0517085..7f6d227c9 100644
--- a/validmind/client.py
+++ b/validmind/client.py
@@ -61,6 +61,7 @@ def init_dataset(
     class_labels: Optional[Dict[str, Any]] = None,
     type: Optional[str] = None,
     input_id: Optional[str] = None,
+    copy_data: bool = True,
     __log: bool = True,
 ) -> VMDataset:
     """
@@ -92,6 +93,7 @@ def init_dataset(
             this will be set to `dataset` but if you are passing this dataset as a
             test input using some other key than `dataset`, then you should set
             this to the same key.
+        copy_data (bool, optional): Whether to copy the data. Defaults to True.
         __log (bool): Whether to log the input. Defaults to True.
 
     Raises:
@@ -121,6 +123,7 @@ def init_dataset(
             extra_columns=extra_columns,
             target_class_labels=class_labels,
             date_time_index=date_time_index,
+            copy_data=copy_data,
         )
     elif isinstance(dataset, pl.DataFrame):
         vm_dataset = PolarsDataset(
diff --git a/validmind/vm_models/dataset/dataset.py b/validmind/vm_models/dataset/dataset.py
index c52ea709a..f0a9571d1 100644
--- a/validmind/vm_models/dataset/dataset.py
+++ b/validmind/vm_models/dataset/dataset.py
@@ -47,6 +47,7 @@ class VMDataset(VMInput):
         target_class_labels (Dict): The class labels for the target columns.
         df (pd.DataFrame): The dataset as a pandas DataFrame.
         extra_columns (Dict): Extra columns to include in the dataset.
+        copy_data (bool): Whether to copy the data. Defaults to True.
     """
 
     def __repr__(self):
@@ -66,6 +67,7 @@ def __init__(
         text_column: str = None,
         extra_columns: dict = None,
         target_class_labels: dict = None,
+        copy_data: bool = True,
     ):
         """
         Initializes a VMDataset instance.
@@ -82,6 +84,7 @@ def __init__(
             feature_columns (str, optional): The feature column names of the dataset. Defaults to None.
             text_column (str, optional): The text column name of the dataset for nlp tasks. Defaults to None.
             target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
+            copy_data (bool, optional): Whether to copy the data. Defaults to True.
         """
         # initialize input_id
         self.input_id = input_id
@@ -112,6 +115,7 @@ def __init__(
         self.target_class_labels = target_class_labels
         self.extra_columns = ExtraColumns.from_dict(extra_columns)
         self._set_feature_columns(feature_columns)
+        self._copy_data = copy_data
 
         if model:
             self.assign_predictions(model)
@@ -397,8 +401,18 @@ def df(self) -> pd.DataFrame:
             assert self.target_column not in columns
             columns.append(self.target_column)
 
-        # return a copy to prevent accidental modification
-        return as_df(self._df[columns]).copy()
+        # Check if all columns in self._df are requested
+        all_columns = set(columns) == set(self._df.columns)
+
+        # For copy_data=False and all columns: return exact same DataFrame object
+        if not self._copy_data and all_columns:
+            return self._df
+        # For copy_data=False and subset of columns: skip the extra .copy() (selection may still copy)
+        elif not self._copy_data:
+            return as_df(self._df[columns])
+        # For copy_data=True: return independent copy with duplicated data
+        else:
+            return as_df(self._df[columns]).copy()
 
     @property
     def x(self) -> np.ndarray:
@@ -522,6 +536,7 @@ def __init__(
         text_column: str = None,
         target_class_labels: dict = None,
         date_time_index: bool = False,
+        copy_data: bool = True,
     ):
         """
         Initializes a DataFrameDataset instance, preserving original pandas dtypes.
@@ -536,12 +551,12 @@ def __init__(
             text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
             target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
             date_time_index (bool, optional): Whether to use date-time index. Defaults to False.
+            copy_data (bool, optional): Whether to create a copy of the input data. Defaults to True.
         """
 
         VMInput.__init__(self)
 
         self.input_id = input_id
-        self._raw_dataset = raw_dataset.values
 
         index = None
         if isinstance(raw_dataset.index, pd.Index):
@@ -549,7 +564,7 @@ def __init__(
         self.index = index
 
         # Store the DataFrame directly
-        self._df = raw_dataset.copy()
+        self._df = raw_dataset
 
         if date_time_index:
             self._df = convert_index_to_datetime(self._df)
@@ -560,6 +575,15 @@ def __init__(
         self.text_column = text_column
         self.target_class_labels = target_class_labels
         self.extra_columns = ExtraColumns.from_dict(extra_columns)
+        self._copy_data = copy_data
+
+        # Add warning when copy_data is False
+        if not copy_data:
+            logger.warning(
+                "Dataset initialized with copy_data=False. Changes to the original DataFrame "
+                "may affect this dataset. Use this option only when memory efficiency is critical "
+                "and you won't modify the source data."
+            )
 
         self._set_feature_columns(feature_columns)
 

From 30a5e134514096da3a9a03338c2e7dd16a5f3155 Mon Sep 17 00:00:00 2001
From: Juan <juan@validmind.ai>
Date: Thu, 24 Apr 2025 12:52:48 +0200
Subject: [PATCH 3/4] Add unit test to verify dtype preservation

---
 tests/test_dataset.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index a2a65760d..e18a90aa4 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -34,8 +34,6 @@ def test_init_dataset_pandas_no_options(self):
         """
         vm_dataset = DataFrameDataset(raw_dataset=self.df)
 
-        # Pandas dataframe gets converted to numpy internally and raw_dataset is a numpy array
-        np.testing.assert_array_equal(vm_dataset._raw_dataset, self.df.values)
         pd.testing.assert_frame_equal(vm_dataset.df, self.df)
 
     def test_init_dataset_pandas_target_column(self):
@@ -68,6 +66,27 @@ def test_init_dataset_pandas_feature_columns(self):
         self.assertEqual(vm_dataset.feature_columns_categorical, [])
         self.assertEqual(vm_dataset.feature_columns, ["col1"])
 
+    def test_dtype_preserved(self):
+        """
+        Test that DataFrameDataset keeps a categorical column's pandas dtype.
+        """
+        test_df = pd.DataFrame({"col1": pd.Categorical(["x", "y", "z"])})
+
+        # Sanity check: the input column is categorical before it is wrapped
+        self.assertIsInstance(
+            test_df["col1"].dtype,
+            pd.CategoricalDtype,
+            "Original DataFrame should have categorical dtype",
+        )
+
+        # The DataFrame exposed by the dataset must report the same dtype
+        dataset = DataFrameDataset(raw_dataset=test_df, input_id="test_dataset")
+        self.assertIsInstance(
+            dataset.df["col1"].dtype,
+            pd.CategoricalDtype,
+            "DataFrameDataset should preserve categorical dtype",
+        )
+
     def test_assign_predictions_invalid_model(self):
         """
         Test assigning predictions to dataset with an invalid model

From ad05ba3ef475e15a7b3e1dc1a09032cd8b018a39 Mon Sep 17 00:00:00 2001
From: Juan <juan@validmind.ai>
Date: Fri, 25 Apr 2025 10:38:57 +0200
Subject: [PATCH 4/4] 2.8.21

---
 pyproject.toml           | 2 +-
 validmind/__version__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4e996a8f1..e3375290f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ description = "ValidMind Library"
 license = "Commercial License"
 name = "validmind"
 readme = "README.pypi.md"
-version = "2.8.20"
+version = "2.8.21"
 
 [tool.poetry.dependencies]
 aiohttp = {extras = ["speedups"], version = "*"}
diff --git a/validmind/__version__.py b/validmind/__version__.py
index ca466009f..cfdf41d3a 100644
--- a/validmind/__version__.py
+++ b/validmind/__version__.py
@@ -1 +1 @@
-__version__ = "2.8.20"
+__version__ = "2.8.21"