Merged
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -10,7 +10,7 @@ description = "ValidMind Library"
license = "Commercial License"
name = "validmind"
readme = "README.pypi.md"
version = "2.8.20"
version = "2.8.21"

[tool.poetry.dependencies]
aiohttp = {extras = ["speedups"], version = "*"}
23 changes: 21 additions & 2 deletions tests/test_dataset.py
@@ -34,8 +34,6 @@ def test_init_dataset_pandas_no_options(self):
"""
vm_dataset = DataFrameDataset(raw_dataset=self.df)

# Pandas dataframe gets converted to numpy internally and raw_dataset is a numpy array
np.testing.assert_array_equal(vm_dataset._raw_dataset, self.df.values)
pd.testing.assert_frame_equal(vm_dataset.df, self.df)

def test_init_dataset_pandas_target_column(self):
@@ -68,6 +66,27 @@ def test_init_dataset_pandas_feature_columns(self):
self.assertEqual(vm_dataset.feature_columns_categorical, [])
self.assertEqual(vm_dataset.feature_columns, ["col1"])

def test_dtype_preserved(self):
"""
Test that dtype is preserved in DataFrameDataset.
"""

test_df = pd.DataFrame({"col1": pd.Categorical(["x", "y", "z"])})

# Verify original data is categorical
self.assertTrue(
pd.api.types.is_categorical_dtype(test_df["col1"]),
"Original DataFrame should have categorical dtype",
)

# Verify categorical dtype is preserved
dataset = DataFrameDataset(raw_dataset=test_df, input_id="test_dataset")

self.assertTrue(
pd.api.types.is_categorical_dtype(dataset.df["col1"]),
"DataFrameDataset should preserve categorical dtype",
)

def test_assign_predictions_invalid_model(self):
"""
Test assigning predictions to dataset with an invalid model
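Note: the removed assertion reflected the old behaviour, where the pandas DataFrame was converted to a NumPy array internally; the new `test_dtype_preserved` test pins down why that mattered. As a plain-pandas illustration (independent of the library), round-tripping a categorical column through `.values` discards its dtype:

```python
import pandas as pd

df = pd.DataFrame({"col1": pd.Categorical(["x", "y", "z"])})
print(df["col1"].dtype)  # category

# Rebuilding the frame from its NumPy representation loses the categorical dtype,
# which is what the old `raw_dataset.values` conversion did internally.
rebuilt = pd.DataFrame(df.values, columns=df.columns)
print(rebuilt["col1"].dtype)  # object
```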
2 changes: 1 addition & 1 deletion validmind/__version__.py
@@ -1 +1 @@
__version__ = "2.8.20"
__version__ = "2.8.21"
3 changes: 3 additions & 0 deletions validmind/client.py
@@ -61,6 +61,7 @@ def init_dataset(
class_labels: Optional[Dict[str, Any]] = None,
type: Optional[str] = None,
input_id: Optional[str] = None,
copy_data: bool = True,
__log: bool = True,
) -> VMDataset:
"""
@@ -92,6 +93,7 @@ def init_dataset(
this will be set to `dataset` but if you are passing this dataset as a
test input using some other key than `dataset`, then you should set
this to the same key.
copy_data (bool, optional): Whether to copy the data. Defaults to True.
__log (bool): Whether to log the input. Defaults to True.

Raises:
@@ -121,6 +123,7 @@
extra_columns=extra_columns,
target_class_labels=class_labels,
date_time_index=date_time_index,
copy_data=copy_data,
)
elif isinstance(dataset, pl.DataFrame):
vm_dataset = PolarsDataset(
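For context, a minimal usage sketch of the new flag — assuming the usual `vm.init_dataset` keyword arguments (`dataset`, `target_column`, `input_id`) and a prior `vm.init()` call; the flag is simply forwarded to `DataFrameDataset`:

```python
import pandas as pd
import validmind as vm

df = pd.DataFrame({"feature": [1.0, 2.0, 3.0], "target": [0, 1, 0]})

# Default: the dataset works on an independent copy of `df`.
train_ds = vm.init_dataset(dataset=df, target_column="target", input_id="train_ds")

# Memory-sensitive path: share the underlying data with `df` instead of copying it.
train_ds_shared = vm.init_dataset(
    dataset=df,
    target_column="target",
    input_id="train_ds_shared",
    copy_data=False,
)
```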
68 changes: 51 additions & 17 deletions validmind/vm_models/dataset/dataset.py
@@ -47,6 +47,7 @@ class VMDataset(VMInput):
target_class_labels (Dict): The class labels for the target columns.
df (pd.DataFrame): The dataset as a pandas DataFrame.
extra_columns (Dict): Extra columns to include in the dataset.
copy_data (bool): Whether to copy the data. Defaults to True.
"""

def __repr__(self):
@@ -66,6 +67,7 @@ def __init__(
text_column: str = None,
extra_columns: dict = None,
target_class_labels: dict = None,
copy_data: bool = True,
):
"""
Initializes a VMDataset instance.
@@ -82,6 +84,7 @@ def __init__(
feature_columns (str, optional): The feature column names of the dataset. Defaults to None.
text_column (str, optional): The text column name of the dataset for nlp tasks. Defaults to None.
target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
copy_data (bool, optional): Whether to copy the data. Defaults to True.
"""
# initialize input_id
self.input_id = input_id
@@ -112,6 +115,7 @@ def __init__(
self.target_class_labels = target_class_labels
self.extra_columns = ExtraColumns.from_dict(extra_columns)
self._set_feature_columns(feature_columns)
self._copy_data = copy_data

if model:
self.assign_predictions(model)
@@ -397,8 +401,18 @@ def df(self) -> pd.DataFrame:
assert self.target_column not in columns
columns.append(self.target_column)

# return a copy to prevent accidental modification
return as_df(self._df[columns]).copy()
# Check if all columns in self._df are requested
all_columns = set(columns) == set(self._df.columns)

# For copy_data=False and all columns: return exact same DataFrame object
if not self._copy_data and all_columns:
return self._df
# For copy_data=False and subset of columns: return view with shared data
elif not self._copy_data:
return as_df(self._df[columns])
# For copy_data=True: return independent copy with duplicated data
else:
return as_df(self._df[columns]).copy()

@property
def x(self) -> np.ndarray:
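The rewritten property now has three return modes: the stored DataFrame itself (`copy_data=False` with every column requested), a column-subset view that shares the underlying data (`copy_data=False`, subset), or an independent copy (`copy_data=True`, the default). A hedged sketch of the observable difference, constructing `DataFrameDataset` directly with toy data (the import path below mirrors this module and is an assumption; in practice the class is normally reached through `vm.init_dataset`):

```python
import pandas as pd
from validmind.vm_models.dataset.dataset import DataFrameDataset

df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "target": [0, 1]})

# copy_data=False with all columns requested: the exact same object comes back,
# so later changes to `df` are visible through the dataset (hence the warning added below).
shared = DataFrameDataset(raw_dataset=df, target_column="target", copy_data=False)
assert shared.df is df

# copy_data=True (default): an independent copy, safe to mutate.
copied = DataFrameDataset(raw_dataset=df, target_column="target")
assert copied.df is not df
```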
@@ -522,9 +536,10 @@ def __init__(
text_column: str = None,
target_class_labels: dict = None,
date_time_index: bool = False,
copy_data: bool = True,
):
"""
Initializes a DataFrameDataset instance.
Initializes a DataFrameDataset instance, preserving original pandas dtypes.

Args:
raw_dataset (pd.DataFrame): The raw dataset as a pandas DataFrame.
@@ -536,25 +551,44 @@
text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
date_time_index (bool, optional): Whether to use date-time index. Defaults to False.
copy_data (bool, optional): Whether to create a copy of the input data. Defaults to True.
"""

VMInput.__init__(self)

self.input_id = input_id

index = None
if isinstance(raw_dataset.index, pd.Index):
index = raw_dataset.index.values
self.index = index

super().__init__(
raw_dataset=raw_dataset.values,
input_id=input_id,
model=model,
index_name=raw_dataset.index.name,
index=index,
columns=raw_dataset.columns.to_list(),
target_column=target_column,
extra_columns=extra_columns,
feature_columns=feature_columns,
text_column=text_column,
target_class_labels=target_class_labels,
date_time_index=date_time_index,
)
# Store the DataFrame directly
self._df = raw_dataset

if date_time_index:
self._df = convert_index_to_datetime(self._df)

self.columns = raw_dataset.columns.tolist()
self.column_aliases = {}
self.target_column = target_column
self.text_column = text_column
self.target_class_labels = target_class_labels
self.extra_columns = ExtraColumns.from_dict(extra_columns)
self._copy_data = copy_data

# Add warning when copy_data is False
if not copy_data:
logger.warning(
"Dataset initialized with copy_data=False. Changes to the original DataFrame "
"may affect this dataset. Use this option only when memory efficiency is critical "
"and you won't modify the source data."
)

self._set_feature_columns(feature_columns)

if model:
self.assign_predictions(model)


class PolarsDataset(VMDataset):